I came across a blog post at: http://sarvamblog.blogspot.com/2013/04/clustering-malware-corpus.html
I liked the idea, so I wanted to see if I could get the code/ideas/examples in the blog post working. This is my attempt.
The usual suspects:
import os
import glob
import scipy
import array
import Image
import pickle
import compiler
import numpy as np
import pandas as pd
import scipy.misc
from itertools import cycle
import matplotlib.pyplot as plt
from IPython.display import Image
from random import randrange
import leargist
from scipy.spatial.distance import pdist,squareform
from sklearn.cluster import MeanShift, estimate_bandwidth
%matplotlib inline
# Show which library versions are in use, and that scipy.misc.imsave resolves.
print(scipy.__version__)
print(scipy.misc.imsave)
print(pd.__version__)
0.13.3 <function imsave at 0x10cce8c80> 0.13.1
# Turn one sample binary into a 256-pixel-wide grayscale image and compute
# its GIST descriptor.
#
# PIL's Image module is imported here under its own name: the bare name
# `Image` is rebound by `from IPython.display import Image` in the import
# section, and that class has no `open`.
import Image as pil_image

filename = 'zeusbin_16db6e2a9998430df9017f5cc6dd41f8.ex0'
sample_path = 'samples/' + filename
ln = os.path.getsize(sample_path)  # length of file in bytes
width = 256
rem = ln % width  # trailing bytes that do not fill a complete row
a = array.array("B")  # uint8 array
# The context manager closes the file even when the read raises.
with open(sample_path, 'rb') as f:
    a.fromfile(f, ln - rem)
# Floor division keeps the row count an integer on both Python 2 and 3.
g = np.reshape(a, (len(a) // width, width))
g = np.uint8(g)
scipy.misc.imsave('images/16db6e2a9998430df9017f5cc6dd41f8.png', g)
im = pil_image.open('images/16db6e2a9998430df9017f5cc6dd41f8.png')
im1 = im.resize((64, 64))  # for faster computation
des = leargist.color_gist(im1)  # 960 values
feature = des[0:320]  # since the image is grayscale, we need only first 320 values
# Double check that we at least have some values
feature[:10]
array([ 0.10362913, 0.06557233, 0.07774512, 0.08131842, 0.0632457 , 0.07343196, 0.07885037, 0.08080819, 0.09836281, 0.05373001], dtype=float32)
Now that we know it works for one, let's do it for every sample!
# Compute a GIST feature vector for every sample in samples/ and record it
# both in `corpus_features` (for clustering) and in `dataframe` (for
# bookkeeping).
#
# PIL's Image module is imported here under its own name: the bare name
# `Image` is rebound by `from IPython.display import Image` in the import
# section, and that class has no `open`.
import Image as pil_image

corpus_features = []
width = 256
c = ['sample', 'features', 'cluster.2', 'cluster.3', 'cluster.4', 'cluster.e']
# Rows are collected in a list and turned into a DataFrame once at the end;
# appending a one-row frame per sample copies the whole frame each time and
# leaves every row with the same index label.
rows = []
for sample in glob.glob("samples/*.ex0"):
    ln = os.path.getsize(sample)  # length of file in bytes
    rem = ln % width  # trailing bytes that do not fill a complete row
    a = array.array("B")  # uint8 array
    # The context manager closes the file even when the read raises.
    with open(sample, 'rb') as f:
        a.fromfile(f, ln - rem)
    # Floor division keeps the row count an integer on both Python 2 and 3.
    g = np.reshape(a, (len(a) // width, width))
    g = np.uint8(g)
    # basename works whatever path separator glob returned.
    name = os.path.basename(sample)
    try:
        scipy.misc.imsave('images/' + name + '.png', g)
        im = pil_image.open('images/' + name + '.png')
        im1 = im.resize((64, 64))  # for faster computation
        des = leargist.color_gist(im1)  # 960 values
        feature = des[0:320]  # since the image is grayscale, we need only first 320 values
        strings = ["%.7f" % number for number in feature]
        # The joined string is the key later used to match rows of the
        # feature matrix back to their sample.
        rows.append({'sample': name, 'features': ':'.join(strings),
                     'cluster.2': 0, 'cluster.3': 0, 'cluster.4': 0, 'cluster.e': 0})
        corpus_features.append(feature)
    # Some of them cause errors, maybe I'll eventually take some time to figure out why
    except Exception as e:
        print("[*] ERROR: %s - %s" % (sample, str(e)))

# The DataFrame is super useful for keeping track of samples, and which cluster they
# belong to at specific bandwidth settings.
dataframe = pd.DataFrame(rows, columns=c)
[*] ERROR: samples/zeusbin_0097e7ebc48af545d0758c9f2fb6882e.ex0 - tile cannot extend outside image [*] ERROR: samples/zeusbin_0b02e6141fdeb045a5475fc423d2dafe.ex0 - tile cannot extend outside image [*] ERROR: samples/zeusbin_122ce7df059d582dbb0324cb15abeb57.ex0 - tile cannot extend outside image [*] ERROR: samples/zeusbin_144b49b2b46195e9864aa9f317199bfb.ex0 - tile cannot extend outside image [*] ERROR: samples/zeusbin_1486e2c21a58f44ff14c9cef8da386fb.ex0 - tile cannot extend outside image [*] ERROR: samples/zeusbin_17d717c8008b5a874e5d42d5037ba615.ex0 - tile cannot extend outside image [*] ERROR: samples/zeusbin_2279de832112c4aec0972cf688c64216.ex0 - tile cannot extend outside image [*] ERROR: samples/zeusbin_2420fb71d32b61b886177b20b4159a7f.ex0 - tile cannot extend outside image [*] ERROR: samples/zeusbin_39034f600a1efd84038a63ec90c16630.ex0 - tile cannot extend outside image [*] ERROR: samples/zeusbin_433d03f2dfe6a9411b935247e2a76bd8.ex0 - tile cannot extend outside image [*] ERROR: samples/zeusbin_49790b8b6c07e330aa343555cbba048a.ex0 - tile cannot extend outside image [*] ERROR: samples/zeusbin_4de74498b8c78a917af0969d801ae92c.ex0 - tile cannot extend outside image [*] ERROR: samples/zeusbin_52ab4a86cf8402d4ea6e86b45360d081.ex0 - tile cannot extend outside image [*] ERROR: samples/zeusbin_57af12ad1138fd6fc45809adca5c7748.ex0 - tile cannot extend outside image [*] ERROR: samples/zeusbin_62169455fd9b0908eb51191e38217c4c.ex0 - tile cannot extend outside image [*] ERROR: samples/zeusbin_67aea18ba9acf78045decb4da8549aec.ex0 - tile cannot extend outside image [*] ERROR: samples/zeusbin_68b329da9893e34099c7d8ad5cb9c940.ex0 - tile cannot extend outside image [*] ERROR: samples/zeusbin_6b187ac48c93d272bd192a4b8e416905.ex0 - tile cannot extend outside image [*] ERROR: samples/zeusbin_6f15ca7a41c1133b227ba2fcabbf2113.ex0 - tile cannot extend outside image [*] ERROR: samples/zeusbin_72505cb57f3aa49d2ddccb825a690969.ex0 - tile cannot extend outside image [*] ERROR: 
samples/zeusbin_727bf9c7f700689b9af1562f25b85d5c.ex0 - tile cannot extend outside image [*] ERROR: samples/zeusbin_727d75ddd0a387d6309647dd77d9d4f9.ex0 - tile cannot extend outside image [*] ERROR: samples/zeusbin_756447e177fc3cc39912797b7ecb2f92.ex0 - tile cannot extend outside image [*] ERROR: samples/zeusbin_77967ae40bfc7667dd1afc9703b85ec8.ex0 - tile cannot extend outside image [*] ERROR: samples/zeusbin_77be5a8ea34c47eb803459d91a3730fc.ex0 - tile cannot extend outside image [*] ERROR: samples/zeusbin_79462f574b7b022869050089f7d3f0d2.ex0 - tile cannot extend outside image [*] ERROR: samples/zeusbin_7d1b8de4ad528bf546dc025150ff7785.ex0 - tile cannot extend outside image [*] ERROR: samples/zeusbin_81051bcc2cf1bedf378224b0a93e2877.ex0 - tile cannot extend outside image [*] ERROR: samples/zeusbin_83cfcd0e8bd653c8aaf2bdfcd936edb1.ex0 - tile cannot extend outside image [*] ERROR: samples/zeusbin_8dcc9b7410966bb194246bcfb95072e4.ex0 - tile cannot extend outside image [*] ERROR: samples/zeusbin_912e4c3a98b7b77fa8cadb8ed77b9846.ex0 - tile cannot extend outside image [*] ERROR: samples/zeusbin_935b7ed1ecfd63afe8b67a0fee429ac6.ex0 - tile cannot extend outside image [*] ERROR: samples/zeusbin_971a6c8098046f0032caf836167419f3.ex0 - tile cannot extend outside image [*] ERROR: samples/zeusbin_9750ab9b939fada0691388a13e55c156.ex0 - tile cannot extend outside image [*] ERROR: samples/zeusbin_97675eb3f268048604dc5155511a2a4d.ex0 - tile cannot extend outside image [*] ERROR: samples/zeusbin_9e2a7344a95c96ce32f841ad2cc6151f.ex0 - tile cannot extend outside image [*] ERROR: samples/zeusbin_a30aa6318a9e32d092d534ce8cb43da8.ex0 - tile cannot extend outside image [*] ERROR: samples/zeusbin_a666507eb415499a752f7eca3a2ab6e2.ex0 - tile cannot extend outside image [*] ERROR: samples/zeusbin_a8e84618d44a02868c29e1cf7c7a625c.ex0 - tile cannot extend outside image [*] ERROR: samples/zeusbin_a92f523c7fa6d0f127d6b955e1634e16.ex0 - tile cannot extend outside image [*] ERROR: 
samples/zeusbin_a93dfbff5735c7afee4857c2a432a488.ex0 - tile cannot extend outside image [*] ERROR: samples/zeusbin_af6beea4660331d90deb7d87f7344ca9.ex0 - tile cannot extend outside image [*] ERROR: samples/zeusbin_aff3977dbd5a3a8b075d61c99c10c076.ex0 - tile cannot extend outside image [*] ERROR: samples/zeusbin_aff6441dc71c129c94f4bf587cbfcb4c.ex0 - tile cannot extend outside image [*] ERROR: samples/zeusbin_b67dc1cbcb23ccef1a0328f152bf53a5.ex0 - tile cannot extend outside image [*] ERROR: samples/zeusbin_b9a2d3dea532c67e04ece627b060d7e4.ex0 - tile cannot extend outside image [*] ERROR: samples/zeusbin_bd323f8565e03439a61aad6debfed3e4.ex0 - tile cannot extend outside image [*] ERROR: samples/zeusbin_bec7595c4b5377921474293bb6075311.ex0 - tile cannot extend outside image [*] ERROR: samples/zeusbin_bf27ac5c832d929e94066a1320ba3e95.ex0 - tile cannot extend outside image [*] ERROR: samples/zeusbin_c0dc9cf472e1b10fe2bb31b0048a5bce.ex0 - tile cannot extend outside image [*] ERROR: samples/zeusbin_c265cbc422defd00cb2c0e5671cb407c.ex0 - tile cannot extend outside image [*] ERROR: samples/zeusbin_c4b8c4cbbd8c7215b688058cecea8a2e.ex0 - tile cannot extend outside image [*] ERROR: samples/zeusbin_c805266553dde514efd37905ab8750ff.ex0 - tile cannot extend outside image [*] ERROR: samples/zeusbin_c9e0fb04d933a98c6f1ca6a4cd52a0f8.ex0 - tile cannot extend outside image [*] ERROR: samples/zeusbin_d12611d7a32da067d99320ab9b7fa9f7.ex0 - tile cannot extend outside image [*] ERROR: samples/zeusbin_d131caa8469351546093108493ef772d.ex0 - tile cannot extend outside image [*] ERROR: samples/zeusbin_d15305d7a4e34e02489c74a5ef542f36.ex0 - tile cannot extend outside image [*] ERROR: samples/zeusbin_d490a173c8ba38d0390126d02ce773c8.ex0 - tile cannot extend outside image [*] ERROR: samples/zeusbin_e164aabe89fcd8d92040a15f76c995c9.ex0 - tile cannot extend outside image [*] ERROR: samples/zeusbin_e7df872dbc5381f3596ea5470b4fdf35.ex0 - tile cannot extend outside image [*] ERROR: 
samples/zeusbin_e94de39e9a23550f4a8a18e6cf77323b.ex0 - tile cannot extend outside image [*] ERROR: samples/zeusbin_ec774d47af0270d1fe3f9b31ab8eb076.ex0 - tile cannot extend outside image [*] ERROR: samples/zeusbin_ee5d6f9e4625be005dab2b599c3ba968.ex0 - tile cannot extend outside image [*] ERROR: samples/zeusbin_ee7ef9e27a1c47bd2cc4a64888d10150.ex0 - tile cannot extend outside image [*] ERROR: samples/zeusbin_f0e68b81cef4b212106782fe3bf030c8.ex0 - tile cannot extend outside image [*] ERROR: samples/zeusbin_f4131be5d90d7d8ba3c1381a60f73d93.ex0 - tile cannot extend outside image [*] ERROR: samples/zeusbin_f5a8d7b5336a625eadd6cdaad49b2037.ex0 - tile cannot extend outside image [*] ERROR: samples/zeusbin_f65a157487abd9b89ecf9f3e9ca780a6.ex0 - tile cannot extend outside image [*] ERROR: samples/zeusbin_f90e52d0e6aa39ad91c94ffef5502975.ex0 - tile cannot extend outside image [*] ERROR: samples/zeusbin_f9cfa96e45235f629fadac4dd9f0a694.ex0 - tile cannot extend outside image [*] ERROR: samples/zeusbin_fa519bfd49ba82acd616cb12a82f998c.ex0 - tile cannot extend outside image [*] ERROR: samples/zeusbin_fd813555c5b26fc719e782420828fd67.ex0 - tile cannot extend outside image
# Write the collected feature vectors to disk as a single array.
np.save('corpus_features.npy', np.asarray(corpus_features))
# Peek at the populated frame; the cluster columns have not been filled in yet.
dataframe.head(n=5)
sample | features | cluster.2 | cluster.3 | cluster.4 | cluster.e | |
---|---|---|---|---|---|---|
0 | zeusbin_0007d213acc8f79cdca26922a89b13ee.ex0 | 0.1152227:0.0551382:0.0654014:0.0792643:0.1262... | 0 | 0 | 0 | 0 |
0 | zeusbin_000a10bb968d37acaf37adf02c283c69.ex0 | 0.1059370:0.1512601:0.1556943:0.1042381:0.1026... | 0 | 0 | 0 | 0 |
0 | zeusbin_000e19d6fccd980a8b534c4cfb56c6bb.ex0 | 0.0918169:0.1059434:0.1254217:0.1089197:0.0958... | 0 | 0 | 0 | 0 |
0 | zeusbin_001a1d5714890cf7459bd1bff933d66d.ex0 | 0.0812866:0.0770250:0.0823474:0.0812574:0.0678... | 0 | 0 | 0 | 0 |
0 | zeusbin_00240863a67bb6a001bf11aa87a1d76e.ex0 | 0.0550865:0.0731409:0.0671090:0.0631345:0.0553... | 0 | 0 | 0 | 0 |
5 rows × 6 columns
In the blog post, Mean-shift clustering was used. Here's some more information:
# Fit one mean-shift model per bandwidth and pickle each fitted model.
# Load the feature matrix written to 'corpus_features.npy'.
X = np.load('corpus_features.npy')
# Wonder what scikit-learn thinks the bandwidth should be?
ebw = estimate_bandwidth(X)
bandwidths = [0.2, 0.3, 0.4, ebw]
for bw in bandwidths:
    print("Running: %s" % bw)
    ms1 = MeanShift(bandwidth=bw)
    ms1.fit(X)
    # Fitting takes enough time that it's worth saving the result; the
    # context manager makes sure the pickle is flushed and closed.
    with open(str(bw) + '.ms1.p', 'wb') as model_file:
        pickle.dump(ms1, model_file)
Running: 0.2 Running: 0.3 Running: 0.4 Running: 0.637463391528
Understanding (and, obviously, duplicating) the graphs in the original post was the next step. The author used a distance matrix to show the various cluster layouts (by plotting the Euclidean distance between feature vectors). What you're looking for here is the size (and density) of the black squares along the diagonal.
I also tossed in some additional graphs to help visualize the clusters in another way. However, it's not super useful until you get to the estimated bandwidth example (also an addition), due to the higher number of clusters.
# For every bandwidth: reload the fitted model, plot the cluster-sorted
# distance matrix and a 2-D scatter of the clusters, then write each sample's
# cluster name into the matching dataframe column.

# Dataframe column that receives the labels for each bandwidth. Kept as a
# list of pairs (not a dict) so that every entry equal to `bw` is filled,
# exactly as a series of independent `if bw == ...` checks would.
bandwidth_columns = [(0.2, 'cluster.2'), (0.3, 'cluster.3'),
                     (0.4, 'cluster.4'), (ebw, 'cluster.e')]

# Each row of X is matched to its sample through the same "%.7f" / ':'-joined
# string stored in dataframe['features']. The keys do not depend on the
# bandwidth, so they are built once.
feature_keys = [':'.join(["%.7f" % number for number in row]) for row in X]

for bw in bandwidths:
    print("Bandwidth : %s" % bw)
    # NOTE: unpickling can execute arbitrary code; only load model files
    # written by this notebook.
    with open(str(bw) + '.ms1.p', 'rb') as model_file:
        ms1 = pickle.load(model_file)
    labels1 = ms1.labels_
    labels1_u = np.unique(labels1)
    nclusters = len(labels1_u)

    # Reorder the samples so members of one cluster are adjacent; clusters
    # then appear as squares along the diagonal of the distance matrix.
    l_sort_ind = np.argsort(labels1)
    X_sort = np.asarray(X[l_sort_ind], dtype=np.float64)
    yd_sort = pdist(X_sort, 'euclidean')
    yd_sort_sq = squareform(yd_sort)
    plt.imshow(yd_sort_sq / yd_sort_sq.max())  # distances scaled to [0, 1]
    plt.colorbar()
    plt.show()

    # Scatter of the first two feature dimensions, one colour per cluster,
    # with each cluster centre drawn as a large marker.
    plt.figure(1)
    plt.clf()
    cluster_centers = ms1.cluster_centers_
    colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
    for k, col in zip(range(nclusters), colors):
        my_members = labels1 == k
        cluster_center = cluster_centers[k]
        plt.plot(X[my_members, 0], X[my_members, 1], col + '.')
        plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
                 markeredgecolor='k', markersize=14)
    plt.title('Estimated number of clusters: %d' % nclusters)
    plt.show()
    print("\n\n")

    # Populate cluster information in the dataframe
    for target_bw, column in bandwidth_columns:
        if bw != target_bw:
            continue
        # One dictionary lookup per row replaces a full dataframe scan per
        # sample. If two samples share a key, the later one wins.
        label_by_key = dict(zip(feature_keys,
                                ['cluster_' + str(label) for label in labels1]))
        # Rows whose key is not in X keep their current value. Assigning a
        # plain list is positional, so duplicate index labels do not matter.
        dataframe[column] = [label_by_key.get(key, current)
                             for key, current in zip(dataframe['features'],
                                                     dataframe[column])]
Bandwidth : 0.2
Bandwidth : 0.3
Bandwidth : 0.4
Bandwidth : 0.637463391528
# Check that the cluster columns have been filled in.
dataframe.head(n=10)
sample | features | cluster.2 | cluster.3 | cluster.4 | cluster.e | |
---|---|---|---|---|---|---|
0 | zeusbin_0007d213acc8f79cdca26922a89b13ee.ex0 | 0.1152227:0.0551382:0.0654014:0.0792643:0.1262... | cluster_951 | cluster_658 | cluster_366 | cluster_0 |
0 | zeusbin_000a10bb968d37acaf37adf02c283c69.ex0 | 0.1059370:0.1512601:0.1556943:0.1042381:0.1026... | cluster_5119 | cluster_2881 | cluster_0 | cluster_0 |
0 | zeusbin_000e19d6fccd980a8b534c4cfb56c6bb.ex0 | 0.0918169:0.1059434:0.1254217:0.1089197:0.0958... | cluster_3725 | cluster_2133 | cluster_0 | cluster_0 |
0 | zeusbin_001a1d5714890cf7459bd1bff933d66d.ex0 | 0.0812866:0.0770250:0.0823474:0.0812574:0.0678... | cluster_31 | cluster_28 | cluster_20 | cluster_7 |
0 | zeusbin_00240863a67bb6a001bf11aa87a1d76e.ex0 | 0.0550865:0.0731409:0.0671090:0.0631345:0.0553... | cluster_27 | cluster_24 | cluster_18 | cluster_1 |
0 | zeusbin_00316307ccc463f8f33705ca737ee0aa.ex0 | 0.1024716:0.1114129:0.1066840:0.0985467:0.0648... | cluster_21 | cluster_20 | cluster_12 | cluster_0 |
0 | zeusbin_0031f720e72378e3b296bdec306f455f.ex0 | 0.1256922:0.1384959:0.1319066:0.1682533:0.1170... | cluster_997 | cluster_1419 | cluster_53 | cluster_0 |
0 | zeusbin_003638e5e37b0e8632091028f0eb93f7.ex0 | 0.0962100:0.1084216:0.1084990:0.0860506:0.1125... | cluster_4 | cluster_4 | cluster_2 | cluster_0 |
0 | zeusbin_0038fd97d96fb8e2beb339be68fc462d.ex0 | 0.1251350:0.1320120:0.1297445:0.1711554:0.1410... | cluster_1061 | cluster_0 | cluster_0 | cluster_0 |
0 | zeusbin_0051ddcfa4fcb8c019392f84b5b6c4d9.ex0 | 0.1274043:0.1534231:0.1303160:0.1171802:0.1193... | cluster_4540 | cluster_2726 | cluster_0 | cluster_0 |
10 rows × 6 columns
Here's the number of samples in each cluster at each bandwidth setting. At the lower bandwidth setting there are significantly more clusters than at higher settings.
# Cluster-size summary and plot for the 'cluster.2' column.
cluster = 'cluster.2'
c = dataframe[cluster].value_counts()  # number of samples per cluster name
multi_member = c[c > 1]
singletons = c[c == 1]
print("%s Clusters in %s" % (len(c), cluster))
print("%s Clusters with more than one object" % len(multi_member))
print("%s Clusters with exactly one object" % len(singletons))
print("\n")
# Cluster sizes drawn as a line on a log-scaled y axis.
fig = plt.figure(figsize=(14, 6), dpi=100, facecolor='w', edgecolor='k')
ax = c.plot(kind='line', logy=True)
ax.set_xticklabels(c.index.tolist(), rotation=90)
ax.set_ylabel('Samples in Cluster ' + cluster)
ax.set_xlabel('Cluster Name')
5277 Clusters in cluster.2 333 Clusters with more than one object 4944 Clusters with exactly one object
<matplotlib.text.Text at 0x1113c7310>
# Cluster-size summary and plot for the 'cluster.3' column.
cluster = 'cluster.3'
c = dataframe[cluster].value_counts()  # number of samples per cluster name
multi_member = c[c > 1]
singletons = c[c == 1]
print("%s Clusters in %s" % (len(c), cluster))
print("%s Clusters with more than one object" % len(multi_member))
print("%s Clusters with exactly one object" % len(singletons))
print("\n")
# Cluster sizes drawn as a line on a log-scaled y axis.
fig = plt.figure(figsize=(14, 6), dpi=100, facecolor='w', edgecolor='k')
ax = c.plot(kind='line', logy=True)
ax.set_xticklabels(c.index.tolist(), rotation=90)
ax.set_ylabel('Samples in Cluster ' + cluster)
ax.set_xlabel('Cluster Name')
2960 Clusters in cluster.3 434 Clusters with more than one object 2526 Clusters with exactly one object
<matplotlib.text.Text at 0x11033a990>
# Cluster-size summary and plot for the 'cluster.4' column.
cluster = 'cluster.4'
c = dataframe[cluster].value_counts()  # number of samples per cluster name
multi_member = c[c > 1]
singletons = c[c == 1]
print("%s Clusters in %s" % (len(c), cluster))
print("%s Clusters with more than one object" % len(multi_member))
print("%s Clusters with exactly one object" % len(singletons))
print("\n")
# Cluster sizes drawn as a line on a log-scaled y axis.
fig = plt.figure(figsize=(14, 6), dpi=100, facecolor='w', edgecolor='k')
ax = c.plot(kind='line', logy=True)
ax.set_xticklabels(c.index.tolist(), rotation=90)
ax.set_ylabel('Samples in Cluster ' + cluster)
ax.set_xlabel('Cluster Name')
641 Clusters in cluster.4 253 Clusters with more than one object 388 Clusters with exactly one object
<matplotlib.text.Text at 0x12669a8d0>
# Cluster-size summary and plot for the 'cluster.e' column.
cluster = 'cluster.e'
c = dataframe[cluster].value_counts()  # number of samples per cluster name
multi_member = c[c > 1]
singletons = c[c == 1]
print("%s Clusters in %s" % (len(c), cluster))
print("%s Clusters with more than one object" % len(multi_member))
print("%s Clusters with exactly one object" % len(singletons))
print("\n")
# Cluster sizes drawn as a line on a log-scaled y axis.
fig = plt.figure(figsize=(14, 6), dpi=100, facecolor='w', edgecolor='k')
ax = c.plot(kind='line', logy=True)
ax.set_xticklabels(c.index.tolist(), rotation=90)
ax.set_ylabel('Samples in Cluster ' + cluster)
ax.set_xlabel('Cluster Name')
75 Clusters in cluster.e 53 Clusters with more than one object 22 Clusters with exactly one object
<matplotlib.text.Text at 0x11d8ca7d0>
# Display the image of one randomly chosen sample labelled `s` in 'cluster.2'.
s = "cluster_2"
members = dataframe[dataframe['cluster.2'] == s]['sample'].tolist()
filename = members[randrange(len(members))] + '.png'
print(filename)
Image(filename='images/' + filename)
zeusbin_a0e8111382949a286ec7e457d05ba00b.ex0.png
# Display another randomly chosen sample labelled `s` in 'cluster.2'
# (relies on `s` already being set).
members = dataframe[dataframe['cluster.2'] == s]['sample'].tolist()
filename = members[randrange(len(members))] + '.png'
print(filename)
Image(filename='images/' + filename)
zeusbin_81e78ccf8d70a30efb54bfada07f85bc.ex0.png
Above are 2 random images from cluster_2 with a bandwidth of 0.2
Below are 2 random images from cluster_2 with the estimated bandwidth
By comparing images we can visually spot-check to see how well the samples are grouped.
# Display the image of one randomly chosen sample labelled `s` in 'cluster.e'.
s = "cluster_2"
members = dataframe[dataframe['cluster.e'] == s]['sample'].tolist()
filename = members[randrange(len(members))] + '.png'
print(filename)
Image(filename='images/' + filename)
zeusbin_04a52158af1ce913ed5c7e174f406049.ex0.png
# Display another randomly chosen sample labelled `s` in 'cluster.e'
# (relies on `s` already being set).
members = dataframe[dataframe['cluster.e'] == s]['sample'].tolist()
filename = members[randrange(len(members))] + '.png'
print(filename)
Image(filename='images/' + filename)
zeusbin_3dd34aa57d263e06a2c957bd96ebfcc1.ex0.png
Last but not least, here are a couple of samples from cluster_2 with a bandwidth of 0.4.
# Display the image of one randomly chosen sample labelled `s` in 'cluster.4'.
s = "cluster_2"
members = dataframe[dataframe['cluster.4'] == s]['sample'].tolist()
filename = members[randrange(len(members))] + '.png'
print(filename)
Image(filename='images/' + filename)
zeusbin_c6d5155dc9bdcc6e67a17d9615430688.ex0.png
# Display another randomly chosen sample labelled `s` in 'cluster.4'
# (relies on `s` already being set).
members = dataframe[dataframe['cluster.4'] == s]['sample'].tolist()
filename = members[randrange(len(members))] + '.png'
print(filename)
Image(filename='images/' + filename)
zeusbin_a57cfdf45fc8903106b9cb5d5d91b0a2.ex0.png