%matplotlib inline import numpy as np import matplotlib.pyplot as plt from scipy import stats # use seaborn plotting defaults import seaborn as sns; sns.set() np.random.seed(2) x = np.concatenate([np.random.normal(0, 2, 2000), np.random.normal(5, 5, 2000), np.random.normal(3, 0.5, 600)]) plt.hist(x, 80, normed=True) plt.xlim(-10, 20); from sklearn.mixture import GMM clf = GMM(4, n_iter=500, random_state=3).fit(x) xpdf = np.linspace(-10, 20, 1000) density = np.exp(clf.score(xpdf)) plt.hist(x, 80, normed=True, alpha=0.5) plt.plot(xpdf, density, '-r') plt.xlim(-10, 20); clf.means_ clf.covars_ clf.weights_ plt.hist(x, 80, normed=True, alpha=0.3) plt.plot(xpdf, density, '-r') for i in range(clf.n_components): pdf = clf.weights_[i] * stats.norm(clf.means_[i, 0], np.sqrt(clf.covars_[i, 0])).pdf(xpdf) plt.fill(xpdf, pdf, facecolor='gray', edgecolor='none', alpha=0.3) plt.xlim(-10, 20); print(clf.bic(x)) print(clf.aic(x)) n_estimators = np.arange(1, 10) clfs = [GMM(n, n_iter=1000).fit(x) for n in n_estimators] bics = [clf.bic(x) for clf in clfs] aics = [clf.aic(x) for clf in clfs] plt.plot(n_estimators, bics, label='BIC') plt.plot(n_estimators, aics, label='AIC') plt.legend(); np.random.seed(0) # Add 20 outliers true_outliers = np.sort(np.random.randint(0, len(x), 20)) y = x.copy() y[true_outliers] += 50 * np.random.randn(20) clf = GMM(4, n_iter=500, random_state=0).fit(y) xpdf = np.linspace(-10, 20, 1000) density_noise = np.exp(clf.score(xpdf)) plt.hist(y, 80, normed=True, alpha=0.5) plt.plot(xpdf, density_noise, '-r') #plt.xlim(-10, 20); log_likelihood = clf.score_samples(y)[0] plt.plot(y, log_likelihood, '.k'); detected_outliers = np.where(log_likelihood < -9)[0] print("true outliers:") print(true_outliers) print("\ndetected outliers:") print(detected_outliers) set(true_outliers) - set(detected_outliers) set(detected_outliers) - set(true_outliers) from sklearn.neighbors import KernelDensity kde = KernelDensity(0.15).fit(x[:, None]) density_kde = np.exp(kde.score_samples(xpdf[:, None])) plt.hist(x, 80, normed=True, alpha=0.5) plt.plot(xpdf, density, '-b', label='GMM') plt.plot(xpdf, density_kde, '-r', label='KDE') plt.xlim(-10, 20) plt.legend();