import numpy as np import scipy as sp import matplotlib.pyplot as plt import pandas as pd import sklearn.datasets as sk_data import sklearn.metrics as metrics from sklearn.cluster import KMeans #import matplotlib as mpl import seaborn as sns %matplotlib inline X, y = sk_data.make_blobs(n_samples=100, centers=3, n_features=30,center_box=(-10.0, 10.0),random_state=0) sns.heatmap(X, xticklabels=False, yticklabels=False, linewidths=0,cbar=False) euclidean_dists = metrics.euclidean_distances(X) sns.heatmap(euclidean_dists, xticklabels=False, yticklabels=False, linewidths=0, square=True,cbar=False) kmeans = KMeans(init='k-means++', n_clusters=3, n_init=10) kmeans.fit_predict(X) centroids = kmeans.cluster_centers_ labels = kmeans.labels_ error = kmeans.inertia_ print "The total error of the clustering is: ", error print '\nCluster labels' print labels print '\n Cluster Centroids' print centroids #print original and cluster data idx = np.argsort(labels) rX = X[idx,:] sns.heatmap( rX,xticklabels=False, yticklabels=False, linewidths=0,cbar=False) #Rearrange so that all same labels are consecutive #print labels #print labels[idx] rearranged_dists = euclidean_dists[idx,:][:,idx] sns.heatmap(rearranged_dists, xticklabels=False, yticklabels=False, linewidths=0, square=True,cbar=False) error = np.zeros(11) error[0] = 0; for k in range(1,11): kmeans = KMeans(init='k-means++', n_clusters=k, n_init=10) kmeans.fit_predict(X) error[k] = kmeans.inertia_ plt.plot(range(1,len(error)),error[1:]) plt.xlabel('Number of clusters') plt.ylabel('Error') def evaluate_clusters(X,max_clusters): error = np.zeros(max_clusters+1) error[0] = 0; for k in range(1,max_clusters+1): kmeans = KMeans(init='k-means++', n_clusters=k, n_init=10) kmeans.fit_predict(X) error[k] = kmeans.inertia_ plt.plot(range(1,len(error)),error[1:]) plt.xlabel('Number of clusters') plt.ylabel('Error') evaluate_clusters(X,10) from sklearn.datasets import fetch_20newsgroups """ categories = [ 'alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space', 'rec.autos', 'rec.sport.baseball' ]""" categories = ['alt.atheism', 'sci.space','rec.sport.baseball'] news_data = fetch_20newsgroups(subset='train', categories=categories) print news_data.target, len(news_data.target) print news_data.target_names from sklearn.feature_extraction.text import TfidfVectorizer vectorizer = TfidfVectorizer(stop_words='english', min_df=4, max_df=0.8) data = vectorizer.fit_transform(news_data.data) print type(data), data.shape fig, ax1 = plt.subplots(1,1,figsize=(15,10)) sns.heatmap(data[1:100,1:200].todense(), xticklabels=False, yticklabels=False, linewidths=0, cbar=False, ax=ax1) print news_data.target print news_data.target_names evaluate_clusters(data, 10) ri_evaluate_clusters(data,10,news_data.target) sc_evaluate_clusters(data,10) k=4 kmeans = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1) kmeans.fit_predict(data) print("Top terms per cluster:") asc_order_centroids = kmeans.cluster_centers_.argsort()#[:, ::-1] order_centroids = asc_order_centroids[:,::-1] terms = vectorizer.get_feature_names() for i in range(k): print "Cluster %d:" % i for ind in order_centroids[i, :10]: print ' %s' % terms[ind] print # Code for setting the style of the notebook from IPython.core.display import HTML def css_styling(): styles = open("../theme/custom.css", "r").read() return HTML(styles) css_styling()