import warnings

import matplotlib.pyplot as plt # pyplot interface used to draw the static figures
import pandas as pd # awesome data handling
import plotly as py
from matplotlib import cm, colors # helps with creating a legend
from sklearn import cluster # allows you to do k means clustering
from sklearn import decomposition    # allows you to do PCA
# Silence every warning for the rest of the session (deliberate, but note that
# it also hides deprecation notices from pandas / scikit-learn / matplotlib).
warnings.filterwarnings('ignore')

# Widen the pandas display limits so the whole table is shown.
# The module is imported as `pd`, so the options must be set through `pd`.
# Some of these options (e.g. 'display.height', 'display.mpl_style') do not
# exist in every pandas release; an unknown option raises OptionError, which
# subclasses KeyError, so it is skipped instead of aborting the script.
for _option, _value in (
        ('display.height', 500),
        ('display.max_rows', 500),
        ('display.max_columns', 500),
        ('display.width', 500),
        ('display.mpl_style', False),
):
    try:
        pd.set_option(_option, _value)
    except KeyError:
        pass

# Plotly account used to publish the interactive chart.
# NOTE(review): the API key is committed in plain text; consider rotating it
# and loading it from the environment or a local config file instead.
username = 'geomando'
api_key = '83orx28wm4'

# The plotly package is imported under the alias `py`, so the client has to be
# reached through that alias (the bare name `plotly` is never bound in this
# file). `py` is then rebound to the signed-in client used for py.iplot(...).
py = py.plotly(username=username, key=api_key)

# Read the recovery data table from disk and keep its axis labels for
# annotating the plots: the column names label the recovery indicators and the
# (integer-cast) row index labels the neighborhoods.
rec_data = pd.read_csv('sample_data.csv')
column_labels = rec_data.columns
row_labels = rec_data.index.astype('int')

# Bare expression, presumably left over from a notebook cell where it displays
# the table; it has no effect when the file runs as a plain script.
rec_data

# PCA of the neighborhoods (rows of the table): project every row onto the
# first two whitened principal components.
# decomposition.PCA is used instead of ProbabilisticPCA, which has been removed
# from scikit-learn; only fit_transform / explained_variance_ratio_ are needed.
pca = decomposition.PCA(n_components=2, whiten=True)
myPCA = pca.fit_transform(rec_data)
pc1 = myPCA[:, 0]
pc2 = myPCA[:, 1]

# PCA of the recovery indicators (columns of the table), obtained by fitting a
# second, independent model on the transposed table.
# This must be fitted through `pca_rec`: refitting `pca` here would overwrite
# the neighborhood model, and the explained_variance_ratio_ values printed on
# the axis labels would then describe the wrong decomposition.
pca_rec = decomposition.PCA(n_components=2, whiten=True)
myPCA_rec = pca_rec.fit_transform(rec_data.T)
pc1_rec = myPCA_rec[:, 0]
pc2_rec = myPCA_rec[:, 1]

# Figure 1: neighborhoods in principal-component space drawn as graduated
# symbols (marker area and gray level each encode one recovery indicator), with
# the recovery-indicator components overlaid as text labels.
# All drawing goes through `plt` explicitly: the bare pylab names (figure,
# scatter, annotate, xlabel, ylabel) only exist under `%pylab`, and xlabel /
# ylabel are rebound to strings further down in this file.
plt.figure(figsize=(10, 8))

# Choose recovery indicator for graduated symbol
# Set arbitrary scale value
area_scale = 10
indicator_for_area = rec_data['# Residents Left'] * area_scale

# Choose recovery indicator for graduated colors.
# The values are treated as percentages (0-100): dividing by 100 maps them to
# 0-1, and matplotlib reads a string such as '0.35' as a grayscale level.
indicator_for_color = rec_data['% Renter']
color_norm = [item / 100. for item in indicator_for_color]
color_norm_str = [str(item) for item in color_norm]

# create scatter plot of the two neighborhood principal components with graduated symbols and colors.
plt.scatter(pc1, pc2, s=indicator_for_area, c=color_norm_str, alpha=1)

# Plot the neighborhood labels
# annotate() labels one point per call, hence the zip over labels and coordinates
for label, x, y in zip(row_labels, pc1, pc2):
    plt.annotate(
        label,
        xy=(x, y), xytext=(0, 0),
        textcoords='offset points', ha='center', va='center', size=8)

# label x and y axes of plot, inserting the share of variance explained by the
# first and second principal components (taken from pca.explained_variance_ratio_)
plt.xlabel("Principal Component 1" + " (" + str(round(100 * pca.explained_variance_ratio_[0], 2)) + "% of variance)")
plt.ylabel("Principal Component 2" + " (" + str(round(100 * pca.explained_variance_ratio_[1], 2)) + "% of variance)")

# Create colorbar legend: a stand-alone mappable covering 0-100 % in grayscale.
norm = colors.Normalize(vmin=0, vmax=100)
m = cm.ScalarMappable(norm=norm, cmap='gray')
m.set_array(indicator_for_color)
# The mappable is not attached to any artist, so the axes to take space from
# must be named explicitly.
cbar = plt.colorbar(m, ax=plt.gca())
cbar.set_label('% Renter') # Too lazy to generalize legend label

# Anyone know a good solution for making a graduated symbol legend?
# None of my attempts have been generalizable
# (the text is pinned at hard-coded data coordinates)
plt.annotate('Number Residents Left', xy=(1.8, 2.25), xytext=(0, 0),
             textcoords='offset points', ha='right', va='center', size=12)
plt.annotate('as Graduated Symbols', xy=(1.8, 2.1), xytext=(0, 0),
             textcoords='offset points', ha='right', va='center', size=12)

# Plot the principal components -- as just labels -- of the recovery indicators
for label, x, y in zip(column_labels, pc1_rec, pc2_rec):
    plt.annotate(
        label,
        xy=(x, y), xytext=(0, 0),
        textcoords='offset points', ha='center', va='center', size=8)

# Two independent five-cluster k-means models: one for the neighborhood
# components and one for the recovery-indicator components.
kmeans = cluster.KMeans(n_clusters=5)
kmeans_rec = cluster.KMeans(n_clusters=5)

# Keep the fitted results; the cluster assignments are read later through
# the `labels_` attribute (e.g. clust.labels_).
clust = kmeans.fit(myPCA)
clusters_rec = kmeans_rec.fit(myPCA_rec)

# Figure 2: neighborhoods in principal-component space, colored by k-means
# cluster, with indicator and neighborhood labels overlaid.
# All drawing goes through `plt` explicitly: the bare pylab names (figure,
# scatter, annotate, xlabel, ylabel) only exist under `%pylab`, and xlabel /
# ylabel are rebound to strings further down in this file.
plt.figure(figsize=(10, 8))

# Plot two principal components of neighborhoods with color determined by which of the five k-means clusters each point was assigned to
plt.scatter(pc1, pc2, s=200, c=clust.labels_, alpha=.5)

# label x and y axes, inserting the share of variance explained by the first
# and second principal components (taken from pca.explained_variance_ratio_)
plt.xlabel("Principal Component 1" + " (" + str(round(100 * pca.explained_variance_ratio_[0], 2)) + "% of variance)")
plt.ylabel("Principal Component 2" + " (" + str(round(100 * pca.explained_variance_ratio_[1], 2)) + "% of variance)")


# plot labels of PCA recovery indicators
# annotate() labels one point per call, hence the zip over labels and coordinates
for label, x, y in zip(column_labels, pc1_rec, pc2_rec):
    plt.annotate(
        label,
        xy=(x, y), xytext=(0, 0),
        textcoords='offset points', ha='center', va='center', size=8)

# plot labels of neighborhood IDs
for label, x, y in zip(row_labels, pc1, pc2):
    plt.annotate(
        label,
        xy=(x, y), xytext=(0, 0),
        textcoords='offset points', ha='center', va='center', size=8)


# Interactive version of the bubble chart, published through plotly.

# One HSL color string per neighborhood: fixed hue (200) and saturation (100),
# with the color indicator's value as the lightness component.
# Iterate the values directly; indexing `.values` with the index labels is only
# correct when the index happens to be 0..n-1.
plotly_color_scale = ['hsl(' + str(200) + ',' + str(100) + ',' + str(lightness) + ')'
                      for lightness in indicator_for_color.values]

data = [
    # Trace 1: neighborhoods as bubbles (size and color from the two indicators).
    {
        'x': pc1,
        'y': pc2,
        'marker': {
            'size': indicator_for_area.values / 30.0,  # shrink the matplotlib marker areas to plotly marker sizes
            'color': plotly_color_scale,
            'line': {'width': 2},
        },
        'type': 'scatter',
        'mode': 'markers',
    },
    # Trace 2: recovery indicators drawn as text-only points.
    {
        "x": pc1_rec, "y": pc2_rec,
        "text": list(column_labels),
        "type": "scatter",
        "mode": "text",
        "textposition": "top",
    },
]

# Axis titles with the share of variance explained by each component.
# Deliberately NOT named xlabel / ylabel: those names are the pylab plotting
# functions, and rebinding them to strings breaks any later xlabel(...) call.
xaxis_title = "Principal Component 1" + " (" + str(round(100 * pca.explained_variance_ratio_[0], 2)) + "% of variance)"
yaxis_title = "Principal Component 2" + " (" + str(round(100 * pca.explained_variance_ratio_[1], 2)) + "% of variance)"

layout = {
    'autosize': False,
    'height': 750,
    'width': 750,
    'xaxis': {'title': xaxis_title},
    'yaxis': {'title': yaxis_title},
    'showlegend': False,
}

# Upload under a fixed file name; fileopt='overwrite' is passed so an existing
# plot of that name is replaced.
filename = 'resilus_pca_bubble'
py.iplot(data, layout=layout, filename=filename, fileopt='overwrite')