%matplotlib inline import pandas as pd import numpy as np import matplotlib.pyplot as plt from sklearn.cluster import AgglomerativeClustering from sklearn.cluster import KMeans def is_std_perm(p): return p.startswith('android.permission.') or p.startswith('com.android.') def elide(s, l=20): s = s[:-4] # remove extension s = s[:s.find('_')] # remove _ if len(s) <= l: return s h = l / 2 return s[:h] + '..' + s[-h:] dataset = pd.read_csv('crawled-apks.csv') dataset.fillna(0, inplace=True) # remove some samples dataset.drop(np.random.choice(dataset.index, int(len(dataset) * 0.5), replace=False), inplace=True) dataset.reset_index(drop=True, inplace=True) # remove columns which are completely all zero dataset = dataset.ix[:,(dataset != 0).any(axis=0)] # remove columns with non-standard permissions dataset = dataset[[c for c in dataset.columns if not c.startswith('p_') or is_std_perm(c[2:])]] # length of filenames dataset['_file'].apply(len).describe() # visualize sample data d = dataset.ix[np.random.choice(dataset.index, 50)] d = d.reset_index(drop=True) fig, ax = plt.subplots() fig.set_size_inches(fig.get_size_inches() * (1.5, 1.6)) plt.yticks(d.index, d['_file'].apply(elide), fontsize='small') ax.imshow(d[:50][[c for c in d.columns if c != '_file']], aspect='auto', cmap=plt.cm.gray_r, interpolation='none') # visualize data fig, ax = plt.subplots() fig.set_size_inches(fig.get_size_inches() * (2.5, 15)) plt.yticks(dataset.index, dataset['_file'].apply(elide), fontsize='small') ax.imshow(dataset[[c for c in dataset.columns if c != '_file']], aspect='auto', cmap=plt.cm.gray_r, interpolation='none') # plot frequency of permission & features used by apps plt.plot(dataset.mean()) dataset.mean().order(ascending=False) X = dataset[[c for c in dataset.columns if not c.startswith('_')]] clustering = AgglomerativeClustering(n_clusters=10) clustering.fit(X) datasetC = dataset.copy() datasetC['_label'] = pd.DataFrame(clustering.labels_, index=dataset.index) # reset the indices 
# Put rows of the same cluster next to each other, then renumber them
# 0..n-1 so the index doubles as the y coordinate in the plot below.
datasetC.sort_values('_label', inplace=True)
datasetC.reset_index(drop=True, inplace=True)

fig, ax = plt.subplots()
fig.set_size_inches(fig.get_size_inches() * (2.5, 10))
plt.yticks(datasetC.index, datasetC['_file'].apply(elide), fontsize='small')

# Map of cluster label -> row labels in that cluster; computed once and used
# by both loops below.
groups = datasetC.groupby('_label').groups

# visualize clusters
for label in sorted(groups):
    rows = groups[label]
    # Rows of one cluster are contiguous after the sort above, so the
    # smallest and largest row label bound the cluster's band.
    start, end = min(rows), max(rows)
    # separator line & text label
    ax.axhline(end + 0.5, lw=2, color='blue', alpha=0.4)
    ax.text(.4 * len(datasetC.columns), start + .5 * (end - start),
            '%d' % label, fontsize=30, fontweight='bold', va='center',
            color='blue', alpha=0.3)
ax.imshow(datasetC[[c for c in datasetC.columns if not c.startswith('_')]],
          aspect='auto', cmap=plt.cm.gray_r, interpolation='none')

# list the files that ended up in each cluster
for label in sorted(groups):
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print('Group %d' % label)
    print(datasetC.loc[groups[label], '_file'])
    print('')