import pandas as pd import numpy as np from sklearn import cluster import matplotlib.pyplot as plt %matplotlib inline from scipy import stats df = pd.read_csv('fruit.csv') # Since this is unsupervised classification, we'll drop the labels df = df.drop(['fruit_id', 'fruit_name'], axis=1) df.sort(['sweetness', 'acidity', 'weight', 'elongatedness'], inplace=True) df.reset_index(drop=True, inplace=True) df.tail(10) columns = ['acidity', 'sweetness'] df[columns].describe() col1 = columns[0] col2 = columns[1] plt.scatter(df[col1], df[col2], s=44, c='#808080', alpha=0.5) plt.xlim(df[col1].min(), df[col1].max()) plt.ylim(df[col2].min(), df[col2].max()) plt.title('before standardization') plt.xlabel(col1) plt.ylabel(col2) plt.show() col1 = 'acidity_normal' col2 = 'sweetness_normal' df['sweetness_normal'] = (df.sweetness - df.sweetness.mean()) / df.sweetness.std() df['acidity_normal'] = (df.acidity - df.acidity.mean()) / df.acidity.std() plt.scatter(df[col1], df[col2], s=44, c='#808080', alpha=0.5) plt.xlim(df[col1].min(), df[col1].max()) plt.ylim(df[col2].min(), df[col2].max()) plt.title('after standardization') plt.xlabel(col1) plt.ylabel(col2) plt.show() data = np.array([list(df[col1]), list(df[col2])]).T # a 179 x 2 array of instances x features data.shape k = 3 #number of clusters start= np.array([[ 1 , 0], [ 2, -1], [ 2, -2] ]) # starting points for the clusters steps = ['Set initial centroids', '1A: assign clusters by proximity', '1B: move centroids to mid-cluster', '2A: re-assign clusters by proximity', '2B: move centroids to mid-cluster', '3A: re-assign clusters by proximity', '3B: move centroids to mid-cluster', 'Final centroids and clusters'] centroids = [] for i, stepname in enumerate(steps): num_iterations = [1, 2, 2, 3, 3, 4, 4, 100][i] kmeans = cluster.KMeans(n_clusters=k, max_iter=num_iterations, init=start, n_init=1) kmeans.fit(data) labels = kmeans.labels_ centroids.append(kmeans.cluster_centers_) plt.xlabel(col1) plt.ylabel(col2) plt.title(stepname) if i == 0: #plot every point in grey plt.plot(data[:,0],data[:,1],'o',markerfacecolor='#808080') for j in range(k): lines = plt.plot(centroids[i][j,0],centroids[i][j,1],'kx') # make the centroid x's bigger plt.setp(lines,ms=15.0) plt.setp(lines,mew=2.0) else: # plot every label in a different color for j in range(k): subset = data[np.where(labels==j)] plt.plot(subset[:,0],subset[:,1],'o') # plot the centroids if i in [1,3,5]: # from previous step lines = plt.plot(centroids[i-1][j,0],centroids[i-1][j,1],'kx') else: lines = plt.plot(centroids[i][j,0],centroids[i][j,1],'kx') # make the centroid x's bigger plt.setp(lines,ms=15.0) plt.setp(lines,mew=2.0) plt.show() for i in range(100): kmeans = cluster.KMeans(n_clusters=3, max_iter=100, init='random', n_init=1) kmeans.fit(data) labels = np.array(kmeans.labels_) if i==0: all_labels = labels else: all_labels = np.vstack((all_labels, labels)) plt.scatter(df[col1], df[col2], s=7, c='#808080', alpha=0.5) plt.scatter(data[10,0], data[10,1], s=72, c='#dddd22') plt.scatter(data[144,0], data[144,1], s=66, c='#dddd22') plt.scatter(data[160,0], data[160,1], s=66, c='#dddd22') plt.xlim(df[col1].min(), df[col1].max()) plt.ylim(df[col2].min(), df[col2].max()) plt.title('chosen characteristic cluster points') plt.xlabel(col1) plt.ylabel(col2) plt.show() regularized_labels = np.zeros(all_labels.shape, dtype=np.int8) disagreements = [] for i in range(all_labels.shape[0]): cluster0 = all_labels[i, 10] cluster1 = all_labels[i, 144] cluster2 = all_labels[i, 160] for j in range(all_labels.shape[1]): if all_labels[i,j] == cluster1: regularized_labels[i,j] = 1 elif all_labels[i,j] == cluster2: regularized_labels[i,j] = 2 for i in range(regularized_labels.shape[1]): disagreements.append(regularized_labels[:,i].std()) # standard deviations plt.scatter(df[col1], df[col2], s=12, c='#808080', alpha=0.5) for i in range(len(disagreements)): if disagreements[i] > 0: plt.scatter(data[i,0], data[i,1], s=72, c='#dd22dd') plt.xlim(df[col1].min(), df[col1].max()) plt.ylim(df[col2].min(), df[col2].max()) plt.title('points in different clusters after 100 runs') plt.xlabel(col1) plt.ylabel(col2) plt.show() # comparison of different values of k def compare_kmeans(k): kmeans = cluster.KMeans(n_clusters=k) kmeans.fit(data) labels = kmeans.labels_ centroids = kmeans.cluster_centers_ plt.xlabel(col1) plt.ylabel(col2) plt.title('k-Means for k = ' + str(k)) for i in range(k): subset = data[np.where(labels==i)] plt.plot(subset[:,0],subset[:,1],'o') # plot the centroids lines = plt.plot(centroids[i,0],centroids[i,1],'kx') # make the centroid x's bigger plt.setp(lines,ms=15.0) plt.setp(lines,mew=2.0) plt.show() compare_kmeans(2) compare_kmeans(3) compare_kmeans(4) compare_kmeans(5) from sklearn.metrics import silhouette_score def kmeans_silhouette(k): kmeans = cluster.KMeans(n_clusters=k) kmeans.fit(data) labels = kmeans.labels_ score = silhouette_score(data, labels) return score silhouettes = [] for i in range(2,10): silhouettes.append(kmeans_silhouette(i)) plt.plot(range(2,10), silhouettes, 'ro-', lw=2) plt.title('Silhouette coefficient plot') plt.xlabel('Number of clusters') plt.ylabel('Silhouette coefficient') plt.ylim(0, 0.6) plt.xlim(1,10)