import array
import compiler
import glob
import os
import pickle
from itertools import cycle
from random import randrange

import Image
# PIL's Image module under a second name: the bare name `Image` is rebound
# by the IPython.display import further down.
import Image as pil_image
import leargist
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import scipy.misc
# Must stay after `import Image`: from here on `Image` is the notebook
# display class used to show PNG files inline.
from IPython.display import Image
from scipy.spatial.distance import pdist, squareform
from sklearn.cluster import MeanShift, estimate_bandwidth

# IPython magic: render matplotlib figures inline in the notebook.
# NOTE: this line is only valid inside IPython/Jupyter, not in a plain Python script.
%matplotlib inline  

# Print the library versions in use, for reproducibility.
print scipy.__version__
# Prints the function object itself; raises AttributeError if this SciPy
# build does not provide scipy.misc.imsave.
print scipy.misc.imsave
print pd.__version__

# Render one sample binary as a grayscale PNG: every byte becomes one pixel
# and the byte stream is wrapped into rows of `width` pixels. Trailing bytes
# that do not fill a complete row are dropped.
filename = 'zeusbin_16db6e2a9998430df9017f5cc6dd41f8.ex0'
width = 256

a = array.array("B")  # uint8 array
# `with` guarantees the handle is closed even if the read raises.
with open('samples/' + filename, 'rb') as f:
    ln = os.path.getsize('samples/' + filename)  # length of file in bytes
    rem = ln % width  # bytes left over after the last complete row
    a.fromfile(f, ln - rem)

# `//` keeps the row count an integer under both Python 2 and Python 3
# division semantics (plain `/` would produce a float on Python 3).
g = np.reshape(a, (len(a) // width, width))
g = np.uint8(g)
scipy.misc.imsave('images/16db6e2a9998430df9017f5cc6dd41f8.png', g)

# Compute the GIST descriptor of the PNG written for the single sample.
# `pil_image` is PIL's Image module. The bare name `Image` cannot be used
# here because the later `from IPython.display import Image` rebinds it to
# the notebook display class, which has no `open`.
im = pil_image.open('images/16db6e2a9998430df9017f5cc6dd41f8.png')
im1 = im.resize((64, 64))  # for faster computation
des = leargist.color_gist(im1)  # 960 values
feature = des[0:320]  # since the image is grayscale, we need only first 320 values
# Double check that we at least have some values
feature[:10]

# Build the feature corpus: every sample matching samples/*.ex0 is rendered
# as a grayscale image (one byte per pixel, `width` pixels per row), resized
# to 64x64 and described by the first 320 values of its GIST descriptor.
corpus_features = []
width = 256
c = ['sample', 'features', 'cluster.2', 'cluster.3', 'cluster.4', 'cluster.e']
# One dict per successfully processed sample. The DataFrame is built once
# after the loop instead of being re-copied by an append for every sample.
rows = []

for sample in glob.glob("samples/*.ex0"):
    a = array.array("B")  # uint8 array
    # `with` guarantees the handle is closed even if the read raises.
    with open(sample, 'rb') as f:
        ln = os.path.getsize(sample)  # length of file in bytes
        rem = ln % width  # bytes left over after the last complete row
        a.fromfile(f, ln - rem)
    # `//` keeps the row count an integer under Python 2 and Python 3 alike.
    g = np.reshape(a, (len(a) // width, width))
    g = np.uint8(g)
    # basename() yields the file name regardless of the path separator used.
    name = os.path.basename(sample)
    try:
        image_path = 'images/' + name + '.png'
        scipy.misc.imsave(image_path, g)
        # `pil_image` is PIL's Image module; the bare name `Image` is rebound
        # to the IPython display class and has no `open`.
        im = pil_image.open(image_path)
        im1 = im.resize((64, 64))  # for faster computation
        des = leargist.color_gist(im1)  # 960 values
        feature = des[0:320]  # since the image is grayscale, we need only first 320 values
        strings = ["%.7f" % number for number in feature]
        # The joined string is the key later used to match rows of the saved
        # feature matrix back to their sample. Cluster columns start at the
        # placeholder 0 until the clustering step fills them in.
        rows.append({'sample': name, 'features': ':'.join(strings),
                     'cluster.2': 0, 'cluster.3': 0, 'cluster.4': 0, 'cluster.e': 0})
        corpus_features.append(feature)
    # Some of them cause errors, maybe I'll eventually take some time to figure out why
    except Exception as e:
        print("[*] ERROR: %s - %s" % (sample, str(e)))

# The DataFrame is super useful for keeping track of samples, and which cluster they
# belong to at specific bandwidth settings. dtype=object lets the cluster
# columns hold the integer placeholder now and string labels later.
dataframe = pd.DataFrame(rows, columns=c, dtype=object)

np.save('corpus_features.npy', corpus_features)

# The data frame is populated, and naturally the cluster specific information hasn't been populated yet
dataframe.head(5)

# Reload the feature matrix saved by the extraction step.
X = np.load('corpus_features.npy')
# Wonder what scikit-learn thinks the bandwidth should be?
ebw = estimate_bandwidth(X)
bandwidths = [0.2, 0.3, 0.4, ebw]
# Fit one MeanShift model per bandwidth. Fitting takes enough time that it's
# worth saving the results: each model is pickled to '<bandwidth>.ms1.p'.
for bw in bandwidths:
    print("Running: %s" % bw)
    ms1 = MeanShift(bandwidth=bw)
    ms1.fit(X)
    # `with` makes sure the pickle is flushed and the handle closed before
    # the file is read back later.
    with open(str(bw) + '.ms1.p', 'wb') as model_file:
        pickle.dump(ms1, model_file)

# Row-matching keys: the same "%.7f" / ':'-joined rendering of each feature
# vector that is stored in the dataframe's 'features' column. X does not
# change between bandwidths, so the keys are built once.
feature_keys = [':'.join(["%.7f" % number for number in row]) for row in X]
# Which dataframe column receives the labels of which bandwidth. Kept as a
# list of pairs (not a dict) so that an estimated bandwidth that happens to
# equal one of the fixed ones still fills both columns.
bandwidth_columns = [(0.2, 'cluster.2'), (0.3, 'cluster.3'),
                     (0.4, 'cluster.4'), (ebw, 'cluster.e')]

# For every bandwidth: reload the fitted model, plot the pairwise distance
# matrix ordered by cluster, plot the clusters on the first two feature
# dimensions, and record each sample's cluster label in the dataframe.
for bw in bandwidths:
    print("Bandwidth : %s" % bw)
    # `with` closes the pickle file as soon as the model is loaded.
    with open(str(bw) + '.ms1.p', 'rb') as model_file:
        ms1 = pickle.load(model_file)
    labels1 = ms1.labels_
    labels1_u = np.unique(labels1)
    nclusters = len(labels1_u)
    l_sort_ind = np.argsort(labels1)

    # Reorder the samples so that members of the same cluster are adjacent;
    # the distance matrix then shows clusters as blocks along the diagonal.
    X_sort = np.asarray(X[l_sort_ind], dtype=np.float64)

    yd_sort = pdist(X_sort, 'euclidean')
    yd_sort_sq = squareform(yd_sort)
    # Normalise by the largest distance so the colour scale spans [0, 1].
    plt.imshow(yd_sort_sq / yd_sort_sq.max())
    plt.colorbar()
    plt.show()

    plt.figure(1)
    plt.clf()
    cluster_centers = ms1.cluster_centers_
    colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
    # Only feature dimensions 0 and 1 are drawn.
    for k, col in zip(range(nclusters), colors):
        my_members = labels1 == k
        cluster_center = cluster_centers[k]
        plt.plot(X[my_members, 0], X[my_members, 1], col + '.')
        plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
                 markeredgecolor='k', markersize=14)
    plt.title('Estimated number of clusters: %d' % nclusters)
    plt.show()
    print("\n\n")

    # Populate cluster information in the dataframe: rows are matched to
    # feature vectors through their formatted feature string.
    for target_bw, column in bandwidth_columns:
        if bw != target_bw:
            continue
        for key, label in zip(feature_keys, labels1):
            dataframe.loc[dataframe['features'] == key, column] = 'cluster_' + str(label)

# Woooo, cluster information is populated!
dataframe.head(10)

# For every bandwidth column: report how many clusters exist and how many of
# them hold one versus several samples, then plot the samples-per-cluster
# curve on a logarithmic y axis.
for cluster in ['cluster.2', 'cluster.3', 'cluster.4', 'cluster.e']:
    counts = dataframe[cluster].value_counts()
    print("%s Clusters in %s" % (len(counts), cluster))
    print("%s Clusters with more than one object" % len(counts[counts > 1]))
    print("%s Clusters with exactly one object" % len(counts[counts == 1]))
    print("\n")
    fig = plt.figure(num=None, figsize=(14, 6), dpi=100, facecolor='w', edgecolor='k')
    ax = counts.plot(kind='line', logy=True)
    # Pin one tick per cluster before labelling. set_xticklabels only renames
    # the ticks that already exist, so without this the names can land on the
    # wrong positions.
    # NOTE(review): assumes pandas draws a non-numeric index at x = 0..n-1.
    ax.set_xticks(range(len(counts)))
    ax.set_xticklabels(counts.index.tolist(), rotation=90)
    ax.set_ylabel('Samples in Cluster ' + cluster)
    ax.set_xlabel('Cluster Name')

def _random_image_name(column, label):
    """Return '<sample>.png' for a randomly chosen sample whose `column` equals `label`.

    Raises ValueError (from randrange) when no sample carries that label.
    """
    members = dataframe[dataframe[column] == label]['sample'].tolist()
    return members[randrange(len(members))] + '.png'


# Two random members of cluster_2 at bandwidth 0.2.
s = "cluster_2"
filename = _random_image_name('cluster.2', s)
print(filename)
Image(filename='images/' + filename)

filename = _random_image_name('cluster.2', s)
print(filename)
Image(filename='images/' + filename)

# Two random members of cluster_2 at the estimated bandwidth.
s = "cluster_2"
filename = _random_image_name('cluster.e', s)
print(filename)
Image(filename='images/' + filename)

filename = _random_image_name('cluster.e', s)
print(filename)
Image(filename='images/' + filename)

# Two random members of cluster_2 at bandwidth 0.4.
s = "cluster_2"
filename = _random_image_name('cluster.4', s)
print(filename)
Image(filename='images/' + filename)

filename = _random_image_name('cluster.4', s)
print(filename)
Image(filename='images/' + filename)