import os import glob import scipy import array import Image import pickle import compiler import numpy as np import pandas as pd import scipy.misc from itertools import cycle import matplotlib.pyplot as plt from IPython.display import Image from random import randrangeimport leargist from scipy.spatial.distance import pdist,squareform from sklearn.cluster import MeanShift, estimate_bandwidth %matplotlib inline print scipy.__version__ print scipy.misc.imsave print pd.__version__ filename = 'zeusbin_16db6e2a9998430df9017f5cc6dd41f8.ex0'; f = open('samples/' + filename,'rb'); ln = os.path.getsize('samples/' + filename); # length of file in bytes width = 256; rem = ln%width; a = array.array("B"); # uint8 array a.fromfile(f,ln-rem); f.close(); g = np.reshape(a,(len(a)/width,width)); g = np.uint8(g); scipy.misc.imsave('images/16db6e2a9998430df9017f5cc6dd41f8.png',g) im = Image.open('images/16db6e2a9998430df9017f5cc6dd41f8.png') im1 = im.resize((64,64)) # for faster computation des = leargist.color_gist(im1) # 960 values feature = des[0:320] # since the image is grayscale, we need only first 320 values # Double check that we at least have some values feature[:10] corpus_features = [] width = 256 c = ['sample', 'features', 'cluster.2', 'cluster.3', 'cluster.4', 'cluster.e'] # The DataFrame is super useful for keeping track of samples, and which cluster they # belong to at specific bandwidth settings. 
# Build the corpus: every samples/*.ex0 file is rendered as a `width`-column
# grayscale PNG, reduced to the first 320 GIST values, and recorded both in
# `corpus_features` (numeric) and in `dataframe` (as a ':'-joined string key).

# The name `Image` is bound to IPython's display class by the time this runs
# (it is imported after the PIL module), so `Image.open` would fail for every
# sample.  Bind the PIL module under its own alias for this stage.
import Image as pil_image

# Rows are collected here and turned into a DataFrame once, after the loop;
# appending to a DataFrame per sample copies the whole frame each time and
# leaves every row with index 0.
rows = []
for sample in glob.glob("samples/*.ex0"):
    ln = os.path.getsize(sample)  # length of file in bytes
    rem = ln % width  # trailing bytes that do not fill a whole image row
    a = array.array("B")  # uint8 array
    # `with` guarantees the handle is closed even if `fromfile` raises.
    with open(sample, 'rb') as f:
        a.fromfile(f, ln - rem)
    g = np.reshape(a, (len(a) // width, width))
    g = np.uint8(g)
    # basename() instead of split('/')[1] so the name does not depend on the
    # path separator glob happens to return.
    name = os.path.basename(sample)
    try:
        image_path = 'images/' + name + '.png'
        scipy.misc.imsave(image_path, g)
        im = pil_image.open(image_path)
        im1 = im.resize((64, 64))  # for faster computation
        des = leargist.color_gist(im1)  # 960 values
        feature = des[0:320]  # since the image is grayscale, we need only first 320 values
        strings = ["%.7f" % number for number in feature]
        rows.append({
            'sample': name,
            'features': ':'.join(strings),
            'cluster.2': 0,
            'cluster.3': 0,
            'cluster.4': 0,
            'cluster.e': 0,
        })
        corpus_features.append(feature)
    # Some of them cause errors, maybe I'll eventually take some time to figure out why
    except Exception as e:
        # Deliberate best-effort: report the sample and carry on with the rest.
        print("[*] ERROR: %s - %s" % (sample, str(e)))

# dtype=object because the cluster.* placeholders are later overwritten with
# string labels ('cluster_<n>').
dataframe = pd.DataFrame(rows, columns=c, dtype=object)

np.save('corpus_features.npy', corpus_features)

# The data frame is populated, and naturally the cluster specific information hasn't been populated yet
dataframe.head(5)

# This takes enough time that it's worth saving the results
X = np.load('corpus_features.npy')

# Wonder what scikit-learn thinks the bandwidth should be?
# Fit MeanShift at three fixed bandwidths plus scikit-learn's estimate, plot
# each result, and write the per-sample labels into `dataframe`.
ebw = estimate_bandwidth(X)
bandwidths = [0.2, 0.3, 0.4, ebw]
# DataFrame column that receives the labels for each entry of `bandwidths`
# (same order); 'cluster.e' is the estimated bandwidth.
label_columns = ['cluster.2', 'cluster.3', 'cluster.4', 'cluster.e']

# Fitting is slow, so each model is pickled and reloaded by the plotting pass.
for bw in bandwidths:
    print("Running: %s" % bw)
    ms1 = MeanShift(bandwidth=bw)
    ms1.fit(X)
    with open(str(bw) + '.ms1.p', 'wb') as model_file:
        pickle.dump(ms1, model_file)

# The string key of each feature row is independent of the bandwidth, so it
# is computed once here.  It uses the same "%.7f" / ':' format as the
# 'features' column so the two can be matched by equality.
feature_keys = [':'.join(["%.7f" % number for number in row]) for row in X]

for bw, label_column in zip(bandwidths, label_columns):
    print("Bandwidth : %s" % bw)
    # Only files written by the loop above are unpickled here.
    with open(str(bw) + '.ms1.p', 'rb') as model_file:
        ms1 = pickle.load(model_file)
    labels1 = ms1.labels_
    labels1_u = np.unique(labels1)
    nclusters = len(labels1_u)

    # Pairwise-distance heat map with samples ordered by cluster label, so
    # members of the same cluster sit next to each other on both axes.
    l_sort_ind = np.argsort(labels1)
    X_sort = X[l_sort_ind].astype(np.float64)
    yd_sort = pdist(X_sort, 'euclidean')
    yd_sort_sq = squareform(yd_sort)
    plt.imshow(yd_sort_sq / yd_sort_sq.max())
    plt.colorbar()
    plt.show()

    # Scatter of the first two feature dimensions, one colour per cluster,
    # with each cluster centre drawn as a large marker.
    plt.figure(1)
    plt.clf()
    cluster_centers = ms1.cluster_centers_
    colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
    for k, col in zip(range(nclusters), colors):
        my_members = labels1 == k
        cluster_center = cluster_centers[k]
        plt.plot(X[my_members, 0], X[my_members, 1], col + '.')
        plt.plot(cluster_center[0], cluster_center[1], 'o',
                 markerfacecolor=col, markeredgecolor='k', markersize=14)
    plt.title('Estimated number of clusters: %d' % nclusters)
    plt.show()
    print("\n\n")

    # Populate cluster information in the dataframe: rows are matched to
    # samples through their feature-string key.
    for key, label in zip(feature_keys, labels1):
        dataframe.loc[dataframe['features'] == key, label_column] = 'cluster_' + str(label)

# Woooo, cluster information is populated!
# Summarise the cluster sizes for every bandwidth column, then show two
# randomly chosen sample images from the cluster labelled "cluster_2" in a few
# of those columns.

# Inside a loop a bare `Image(...)` expression is not rendered by the
# notebook, so the images are shown with an explicit display() call.
from IPython.display import display

dataframe.head(10)

for cluster in ('cluster.2', 'cluster.3', 'cluster.4', 'cluster.e'):
    # Samples per cluster label.
    c = dataframe[cluster].value_counts()
    print("%s Clusters in %s" % (len(c), cluster))
    print("%s Clusters with more than one object" % len(c[c > 1]))
    print("%s Clusters with exactly one object" % len(c[c == 1]))
    print("\n")

    fig = plt.figure(num=None, figsize=(14, 6), dpi=100, facecolor='w', edgecolor='k')
    ax = c.plot(kind='line', logy=True)
    # Pin one tick per cluster before labelling; set_xticklabels() alone
    # assigns the names to whatever tick positions matplotlib picked, which
    # need not line up with the plotted points.
    ax.set_xticks(range(len(c)))
    ax.set_xticklabels(c.index.tolist(), rotation=90)
    ax.set_ylabel('Samples in Cluster ' + cluster)
    ax.set_xlabel('Cluster Name')

s = "cluster_2"
for column in ('cluster.2', 'cluster.e', 'cluster.4'):
    members = dataframe[dataframe[column] == s]['sample'].tolist()
    if not members:
        # randrange(0) would raise ValueError; a bandwidth that produced
        # fewer clusters simply has nothing labelled `s`.
        print("[*] No samples labelled %s in %s" % (s, column))
        continue
    # Two independent random picks per column.
    for _ in range(2):
        filename = members[randrange(len(members))] + '.png'
        print(filename)
        display(Image(filename='images/' + filename))