import pandas as pd
import numpy as np
from sklearn import cluster
import matplotlib.pyplot as plt
%matplotlib inline
from scipy import stats

# Load the fruit measurements from the CSV in the working directory.
df = pd.read_csv('fruit.csv')
# Clustering is unsupervised, so drop the identifier/label columns up front.
df = df.drop(['fruit_id', 'fruit_name'], axis=1)
# DataFrame.sort() was removed from pandas; sort_values() is the supported
# way to order rows by one or more columns.
df.sort_values(['sweetness', 'acidity', 'weight', 'elongatedness'], inplace=True)
# Renumber the rows 0..n-1 so positional indices follow the sorted order.
df.reset_index(drop=True, inplace=True)
# Notebook display: the last ten rows of the sorted frame.
df.tail(10)

# The two raw features examined first; describe() summarises them for display.
columns = ['acidity', 'sweetness']
df[columns].describe()

# Scatter the raw values with the axes clamped to the observed data range.
col1, col2 = columns
x_raw = df[col1]
y_raw = df[col2]
plt.scatter(x_raw, y_raw, s=44, c='#808080', alpha=0.5)
plt.xlim(x_raw.min(), x_raw.max())
plt.ylim(y_raw.min(), y_raw.max())
plt.title('before standardization')
plt.xlabel(col1)
plt.ylabel(col2)
plt.show()

# From here on col1/col2 name the standardized versions of the two features.
col1 = 'acidity_normal'
col2 = 'sweetness_normal'

# Standardize each feature: subtract its mean and divide by its standard
# deviation, storing the result in a new '<feature>_normal' column.
for raw_name in ('sweetness', 'acidity'):
    raw = df[raw_name]
    df[raw_name + '_normal'] = (raw - raw.mean()) / raw.std()

# Same scatter as before, now on the standardized columns.
x_std = df[col1]
y_std = df[col2]
plt.scatter(x_std, y_std, s=44, c='#808080', alpha=0.5)
plt.xlim(x_std.min(), x_std.max())
plt.ylim(y_std.min(), y_std.max())
plt.title('after standardization')
plt.xlabel(col1)
plt.ylabel(col2)
plt.show()

# Feature matrix for clustering: one row per instance, columns (col1, col2).
data = np.vstack((df[col1].values, df[col2].values)).T
data.shape

k = 3  # number of clusters

# Hand-picked starting centroids, one (col1, col2) row per cluster.
start = np.array([[1, 0],
                  [2, -1],
                  [2, -2]])

# Titles of the successive snapshots, in display order.
steps = ['Set initial centroids',
         '1A: assign clusters by proximity',
         '1B: move centroids to mid-cluster',
         '2A: re-assign clusters by proximity',
         '2B: move centroids to mid-cluster',
         '3A: re-assign clusters by proximity',
         '3B: move centroids to mid-cluster',
         'Final centroids and clusters']

# max_iter passed to the fit behind each snapshot (same order as steps).
iteration_caps = [1, 2, 2, 3, 3, 4, 4, 100]

# Snapshots (the "A" steps) that draw the centroids recorded by the
# previous fit rather than the ones from their own fit.
previous_centroid_steps = (1, 3, 5)

# cluster_centers_ of every fit, indexed like steps.
centroids = []


def _draw_centroid(center):
    """Plot one centroid as an enlarged black 'x' marker."""
    marker = plt.plot(center[0], center[1], 'kx')
    plt.setp(marker, ms=15.0)
    plt.setp(marker, mew=2.0)


for i, (stepname, num_iterations) in enumerate(zip(steps, iteration_caps)):

    # Refit from the same fixed starting points with this snapshot's cap.
    kmeans = cluster.KMeans(n_clusters=k, max_iter=num_iterations,
                            init=start, n_init=1)
    kmeans.fit(data)
    labels = kmeans.labels_
    centroids.append(kmeans.cluster_centers_)

    plt.xlabel(col1)
    plt.ylabel(col2)
    plt.title(stepname)

    if i == 0:
        # First snapshot: all points in grey, then the centroids on top.
        plt.plot(data[:, 0], data[:, 1], 'o', markerfacecolor='#808080')
        for center in centroids[i]:
            _draw_centroid(center)
    else:
        # Choose which fit's centroids accompany this snapshot.
        shown = centroids[i - 1] if i in previous_centroid_steps else centroids[i]
        for j in range(k):
            # One plot call per cluster, followed by that cluster's centroid.
            members = data[labels == j]
            plt.plot(members[:, 0], members[:, 1], 'o')
            _draw_centroid(shown[j])

    plt.show()
    

# Fit k-means 100 times from random initial centroids and keep each run's
# label vector so the runs can be compared point by point afterwards.
label_runs = []
for i in range(100):
    kmeans = cluster.KMeans(n_clusters=3, max_iter=100, init='random', n_init=1)
    kmeans.fit(data)
    labels = np.array(kmeans.labels_)
    label_runs.append(labels)

# Stack into a 2-D array: one row per run, one column per data point.
all_labels = np.vstack(label_runs)

# Draw every point small and grey, then highlight the three hand-picked
# reference rows of data (10, 144 and 160) with their marker sizes.
plt.scatter(df[col1], df[col2], s=7, c='#808080', alpha=0.5)
for row, size in ((10, 72), (144, 66), (160, 66)):
    plt.scatter(data[row, 0], data[row, 1], s=size, c='#dddd22')

x_all = df[col1]
y_all = df[col2]
plt.xlim(x_all.min(), x_all.max())
plt.ylim(y_all.min(), y_all.max())
plt.title('chosen characteristic cluster points')
plt.xlabel(col1)
plt.ylabel(col2)
plt.show()

# Relabel every run relative to two reference points so that runs can be
# compared: within each run, points sharing point 144's label become 1,
# points sharing point 160's label become 2, and everything else stays 0.
anchor1 = all_labels[:, 144][:, np.newaxis]  # per-run label of point 144
anchor2 = all_labels[:, 160][:, np.newaxis]  # per-run label of point 160

# Vectorised equivalent of the per-element if/elif: the point-144 test is
# applied first, so it wins if both reference points share a label in a run.
regularized_labels = np.where(
    all_labels == anchor1, 1,
    np.where(all_labels == anchor2, 2, 0)).astype(np.int8)

# Standard deviation of each point's relabelled id across all runs; it is
# 0 exactly when every run gave the point the same relabelled id.
disagreements = list(regularized_labels.std(axis=0))

# Grey background of all points, then a large magenta marker on each point
# whose relabelled cluster id was not identical across the runs.
plt.scatter(df[col1], df[col2], s=12, c='#808080', alpha=0.5)
for idx, spread in enumerate(disagreements):
    if spread > 0:
        plt.scatter(data[idx, 0], data[idx, 1], s=72, c='#dd22dd')

x_all = df[col1]
y_all = df[col2]
plt.xlim(x_all.min(), x_all.max())
plt.ylim(y_all.min(), y_all.max())
plt.title('points in different clusters after 100 runs')
plt.xlabel(col1)
plt.ylabel(col2)
plt.show()

# comparison of different values of k
def compare_kmeans(k):
    """Fit k-means with ``k`` clusters on the module-level ``data`` and plot it.

    Each cluster's points are drawn with a separate plot call and each
    fitted centroid is marked with an enlarged black 'x'. Axis labels come
    from the module-level ``col1`` and ``col2``. The figure is shown and
    nothing is returned.
    """
    model = cluster.KMeans(n_clusters=k)
    model.fit(data)
    assignments = model.labels_

    plt.xlabel(col1)
    plt.ylabel(col2)
    plt.title('k-Means for k = ' + str(k))

    for idx, center in enumerate(model.cluster_centers_):
        # Points assigned to this cluster, followed by its centroid marker.
        members = data[assignments == idx]
        plt.plot(members[:, 0], members[:, 1], 'o')
        marker = plt.plot(center[0], center[1], 'kx')
        plt.setp(marker, ms=15.0, mew=2.0)
    plt.show()
    
# Plot the clustering for k = 2, 3, 4 and 5 in turn.
for n_clusters in (2, 3, 4, 5):
    compare_kmeans(n_clusters)

from sklearn.metrics import silhouette_score

def kmeans_silhouette(k):
    """Return the silhouette score of a ``k``-cluster k-means fit.

    Fits a fresh KMeans model on the module-level ``data`` array and passes
    the resulting labels to ``silhouette_score``.
    """
    model = cluster.KMeans(n_clusters=k)
    model.fit(data)
    return silhouette_score(data, model.labels_)

# Candidate cluster counts, defined once so the scores and the x-axis
# values cannot drift apart.
k_values = range(2, 10)
silhouettes = [kmeans_silhouette(n_clusters) for n_clusters in k_values]

plt.plot(k_values, silhouettes, 'ro-', lw=2)
plt.title('Silhouette coefficient plot')
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette coefficient')
plt.ylim(0, 0.6)
plt.xlim(1,10)
# Every other figure in this script ends with plt.show(); call it here too
# so this plot is displayed the same way as the rest.
plt.show()