# All the imports and some basic level setting with various versions
import IPython
import re
import os
import json
import time
import math
import pylab
import string
import pickle
import struct
import socket
import collections
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

print "IPython version: %s" % IPython.__version__
print "pandas version: %s" % pd.__version__
print "numpy version: %s" % np.__version__

%matplotlib inline

# The AV engines whose results we pull out of each VirusTotal report
engines = ['Symantec', 'Sophos', 'F-Prot', 'Kaspersky', 'McAfee', 'Malwarebytes']

# Flatten a VirusTotal report into a small dict: an overall label, the number
# of positives, and the per-engine detection names
def extract_vtdata(data):
    vt = {}
    if 'scans' in data:
        if data['positives'] > 0:
            vt['label'] = 'malicious'
        else:
            vt['label'] = 'nonmalicious'
        vt['positives'] = data['positives']
        for eng in engines:
            if eng in data['scans']:
                # Check that the 'result' key exists before dereferencing it
                if 'result' not in data['scans'][eng] or data['scans'][eng]['result'] is None:
                    vt[eng] = 'no detection'
                else:
                    vt[eng] = data['scans'][eng]['result']
    else:
        vt['label'] = 'no results'
        for eng in engines:
            vt[eng] = 'no results'
        vt['positives'] = 0
    return vt

# Load every .vtdata file and key the flattened report by sample name
def load_vt_data(file_list):
    features_list = {}
    for filename in file_list:
        with open(filename, 'rb') as f:
            features = extract_vtdata(json.loads(f.read()))
            fname = os.path.split(filename)[1].split('.')[0]
            features_list[fname] = features
    return features_list

import glob
file_list = glob.glob('/Users/user/vt_data/*.vtdata')
vt_data = load_vt_data(file_list)
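
# Not part of the original notebook: a minimal, hypothetical example of the
# report structure extract_vtdata() expects, so the flattening above is easier
# to follow. Field names mirror the VirusTotal v2 report format used by the
# code; the values here are made up.
sample_report = {
    'positives': 2,
    'scans': {
        'Symantec': {'detected': True, 'result': 'OSX.Trojan.Gen'},
        'Sophos': {'detected': False, 'result': None},
    }
}
print extract_vtdata(sample_report)
# -> {'label': 'malicious', 'positives': 2, 'Symantec': 'OSX.Trojan.Gen',
#     'Sophos': 'no detection'}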
# This simply loads up the JSON and flattens it. FAT binaries are broken down
# into a feature vector for each architecture.
def extract_features(filename, data):
    all_features = []
    if not 'error' in data['characteristics']['macho']:
        for i in range(data['characteristics']['macho']['number of architectures']):
            features = {}
            #features['magic'] = int(data['characteristics']['macho']['header'][i]['magic'], 0)
            #features['h_size'] = data['characteristics']['macho']['header'][i]['size']
            #features['h_offset'] = data['characteristics']['macho']['header'][i]['offset']
            for command in data['verbose']['macho']['header'][i]['commands']:
                if command['cmd_name'] in ['LC_SEGMENT', 'LC_SEGMENT_64']:
                    bits = ''
                    if command['cmd_name'] == 'LC_SEGMENT_64':
                        bits = '64'
                    if command['segname'] == '__PAGEZERO':
                        features['lc_segment_' + bits + '_vmaddr'] = command['vmaddr']
                        features['lc_segment_' + bits + '_vmsize'] = command['vmsize']
                        features['lc_segment_' + bits + '_filesize'] = command['filesize']
                        features['lc_segment_' + bits + '_fileoff'] = command['fileoff']
                if command['cmd_name'] == 'LC_VERSION_MIN_MACOSX':
                    features['lc_version_min_macosx_min_version'] = float('.'.join(command['version'].split('.')[:2]))
                if command['cmd_name'] == 'LC_SYMTAB':
                    features['lc_symtab_strsize'] = command['strsize']
                    features['lc_symtab_stroff'] = command['stroff']
                    features['lc_symtab_symoff'] = command['symoff']
                    features['lc_symtab_nsyms'] = command['nsyms']
                if command['cmd_name'] in ['LC_DYLD_INFO_ONLY', 'LC_DYLD_INFO']:
                    features['lc_dyld_info_lazy_bind_size'] = command['lazy_bind_size']
                    features['lc_dyld_info_rebase_size'] = command['rebase_size']
                    features['lc_dyld_info_lazy_bind_off'] = command['lazy_bind_off']
                    features['lc_dyld_info_export_off'] = command['export_off']
                    features['lc_dyld_info_export_size'] = command['export_size']
                    features['lc_dyld_info_bind_off'] = command['bind_off']
                    features['lc_dyld_info_rebase_off'] = command['rebase_off']
                    features['lc_dyld_info_bind_size'] = command['bind_size']
                    features['lc_dyld_info_weak_bind_size'] = command['weak_bind_size']
                    features['lc_dyld_info_weak_bind_off'] = command['weak_bind_off']
                if command['cmd_name'] == 'LC_DYSYMTAB':
                    features['lc_dysymtab_nextdefsym'] = command['nextdefsym']
                    features['lc_dysymtab_extreloff'] = command['extreloff']
                    features['lc_dysymtab_nlocrel'] = command['nlocrel']
                    features['lc_dysymtab_modtaboff'] = command['modtaboff']
                    features['lc_dysymtab_iundefsym'] = command['iundefsym']
                    features['lc_dysymtab_ntoc'] = command['ntoc']
                    features['lc_dysymtab_ilocalsym'] = command['ilocalsym']
                    features['lc_dysymtab_nundefsym'] = command['nundefsym']
                    features['lc_dysymtab_nextrefsyms'] = command['nextrefsyms']
                    features['lc_dysymtab_locreloff'] = command['locreloff']
                    features['lc_dysymtab_nmodtab'] = command['nmodtab']
                    features['lc_dysymtab_nlocalsym'] = command['nlocalsym']
                    features['lc_dysymtab_tocoff'] = command['tocoff']
                    features['lc_dysymtab_extrefsymoff'] = command['extrefsymoff']
                    features['lc_dysymtab_nindirectsyms'] = command['nindirectsyms']
                    features['lc_dysymtab_iextdefsym'] = command['iextdefsym']
                    features['lc_dysymtab_nextrel'] = command['nextrel']
                    features['lc_dysymtab_indirectsymoff'] = command['indirectsymoff']
            features.update(data['verbose']['macho']['header'][i]['command type count'])
            if 'LC_SEGMENT' in features:
                features['number of segments'] = features['LC_SEGMENT']
            else:
                features['number of segments'] = features.get('LC_SEGMENT_64', 0)
            # Strip the leading './' and trailing '.results' from the path
            features['filename'] = filename[2:-8]
            # Remove some more features
            for lc in ['LC_MAIN', 'LC_UNIXTHREAD']:
                if lc in features:
                    features.pop(lc, None)
            # Use a separate variable for the sample name; reassigning filename
            # here would corrupt the path on the next architecture of a FAT binary
            sample_name = os.path.split(filename)[1].split('.')[0]
            if sample_name in vt_data:
                for eng in engines:
                    if eng in vt_data[sample_name]:
                        features[eng] = vt_data[sample_name][eng]
                    else:
                        features[eng] = 'no result'
                features['label'] = vt_data[sample_name]['label']
                features['positives'] = vt_data[sample_name]['positives']
            else:
                for eng in engines:
                    features[eng] = 'no result'
                features['label'] = 'no result'
            all_features.append(features)
    return all_features

# Read each .results file (one JSON document per sample) and build the full
# feature list
def load_files(file_list):
    features_list = []
    for filename in file_list:
        with open(filename, 'rb') as f:
            features = extract_features(filename, json.loads(f.read()))
            features_list.extend(features)
    return features_list

file_list = glob.glob('./*.results')
features = load_files(file_list)
print "Files:", len(file_list)
print "Number of feature vectors:", len(features)

df = pd.DataFrame.from_records(features)
# Missing load-command counts default to 0, everything else to -1
for col in df.columns:
    if col.startswith('LC_'):
        df[col].fillna(0, inplace=True)
df.fillna(-1, inplace=True)
print df.shape
df.head()

# Brief overview of the various things detected by Symantec in this dataset;
# really we just want to verify that we have some data
df[engines].Symantec.value_counts().head(10)

ignore_cols = engines + ['filename', 'label', 'positives']
cols = [x for x in df.columns.tolist() if x not in ignore_cols]
X = df.as_matrix(cols)

from sklearn.preprocessing import scale
X = scale(X)

from sklearn.decomposition import PCA
DDD = PCA(n_components=3).fit_transform(X)
DD = PCA(n_components=2).fit_transform(X)

from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure(figsize=plt.figaspect(.5))
ax = fig.add_subplot(1, 2, 1, projection='3d')
ax.scatter(DDD[:,0], DDD[:,1], DDD[:,2], s=50)
ax.set_title("Raw Data 3D")
ax = fig.add_subplot(1, 2, 2)
ax.scatter(DD[:,0], DD[:,1], s=50)
ax.set_title("Raw Data 2D")
plt.show()

from sklearn.cluster import DBSCAN
X = df.as_matrix(cols)
dbscan = DBSCAN(min_samples=3)
dbscan.fit(X)
labels1 = dbscan.labels_
labels1_u = np.unique(labels1)
nclusters = len(labels1_u)

dbscan_df = df[['label', 'filename', 'positives'] + engines].copy()
dbscan_df['cluster'] = labels1
print "Number of clusters: %d" % nclusters
print "Labeled samples: %s" % dbscan_df[dbscan_df['cluster'] != -1].filename.value_counts().sum()
print "Unlabeled samples: %s" % dbscan_df[dbscan_df['cluster'] == -1].filename.value_counts().sum()
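
# Not in the original notebook: DBSCAN's other knob, eps, is left at the
# scikit-learn default (0.5) above. A common way to eyeball a better value is
# a k-distance plot: sort every point's distance to its k-th nearest neighbor
# and look for the "knee". A minimal sketch, assuming X is the matrix DBSCAN
# was just fit on:
from sklearn.neighbors import NearestNeighbors
knn = NearestNeighbors(n_neighbors=3).fit(X)
distances, _ = knn.kneighbors(X)
plt.plot(np.sort(distances[:, -1]))
plt.ylabel("Distance to 3rd nearest neighbor")
plt.title("k-distance plot for choosing DBSCAN eps")
plt.show()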
dbscan_df.groupby(['cluster', 'label']).count()[['filename']].head(10)

# Flag any cluster that contains both malicious and non-malicious samples
clusters = set()
for name, group in dbscan_df.groupby(['cluster', 'label'])['label']:
    if name[0] in clusters:
        print "%s Cluster has both Malicious and Non-Malicious Samples" % name[0]
    clusters.add(name[0])

sample_cluster = 3
dbscan_df[dbscan_df['cluster'] == sample_cluster][engines]
for eng in engines:
    print "%s - %s" % (eng, len(dbscan_df[dbscan_df['cluster'] == sample_cluster][eng].unique().tolist()))

# This is a ballpark to see what might be a good number of components to
# reduce our original 66 features to
X = df.as_matrix(cols)
X = scale(X)
pca = PCA().fit(X)
n_comp = len([x for x in pca.explained_variance_ if x > 1.0])
print "Number of components w/explained variance > 1: %s" % n_comp

X = df.as_matrix(cols)
X = scale(X)
X = PCA(n_components=n_comp).fit_transform(X)
dbscan = DBSCAN(min_samples=3)
dbscan.fit(X)
labels1 = dbscan.labels_
labels1_u = np.unique(labels1)
nclusters = len(labels1_u)

dbscan_df = df[['label', 'filename', 'positives'] + engines].copy()
dbscan_df['cluster'] = labels1
print "Number of clusters: %d" % nclusters
print "Labeled samples: %s" % dbscan_df[dbscan_df['cluster'] != -1].filename.value_counts().sum()
print "Unlabeled samples: %s" % dbscan_df[dbscan_df['cluster'] == -1].filename.value_counts().sum()

# Show what we're left with (number of samples per cluster)
dbscan_df.cluster.value_counts().head(10)

# Remove unlabeled samples for graphing to make it prettier
df['cluster'] = dbscan_df['cluster']
tempdf = df[df['cluster'] != -1].reset_index(drop=True)
X = tempdf.as_matrix(cols)
X = scale(X)
DDD = PCA(n_components=3).fit_transform(X)
DD = PCA(n_components=2).fit_transform(X)

fig = plt.figure(figsize=plt.figaspect(.5))
ax = fig.add_subplot(2, 2, 1, projection='3d')
ax.scatter(DDD[:,0], DDD[:,1], DDD[:,2], c=tempdf['cluster'], s=50)
ax.set_title("DBSCAN Clusters")
ax = fig.add_subplot(2, 2, 2, projection='3d')
ax.set_xlim(-10, 5)
ax.set_ylim(-10, 15)
ax.set_zlim(-30, 5)
ax.scatter(DDD[:,0], DDD[:,1], DDD[:,2], c=tempdf['cluster'], s=50)
ax.set_title("DBSCAN Clusters (zoomed in)")
ax = fig.add_subplot(2, 2, 3)
ax.scatter(DD[:,0], DD[:,1], c=tempdf['cluster'], s=50)
ax.set_title("DBSCAN Clusters")
ax = fig.add_subplot(2, 2, 4)
ax.set_xlim(-6, 5)
ax.set_ylim(-15, 10)
ax.scatter(DD[:,0], DD[:,1], c=tempdf['cluster'], s=50)
ax.set_title("DBSCAN Clusters (zoomed in)")
plt.show()
#df.drop('cluster', axis=1, inplace=True)

dbscan_df.groupby(['cluster', 'label']).count()[['filename']].head(10)

clusters = set()
print "Total Number of Clusters: %s\n" % (len(dbscan_df['cluster'].unique().tolist()))
for name, group in dbscan_df.groupby(['cluster', 'label'])['label']:
    if name[0] in clusters:
        print "%s Cluster has both Malicious and Non-Malicious Samples" % name[0]
    clusters.add(name[0])

sample_cluster = 0
dbscan_df[dbscan_df['cluster'] == sample_cluster][engines]
for eng in engines:
    print "%s - %s" % (eng, len(dbscan_df[dbscan_df['cluster'] == sample_cluster][eng].unique().tolist()))
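
# Not in the original notebook: a small, hypothetical helper that quantifies
# how mixed each cluster is, rather than eyeballing the groupby output above.
# Assumes a frame with 'cluster' and 'label' columns, like dbscan_df.
def cluster_purity(cluster_df):
    for cluster_id, group in cluster_df.groupby('cluster'):
        counts = group['label'].value_counts()
        purity = 100.0 * counts.iloc[0] / counts.sum()
        print "cluster %2d: %3d samples, %5.1f%% '%s'" % (cluster_id, counts.sum(), purity, counts.index[0])

cluster_purity(dbscan_df)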
import yara_signature

# Gather every sample in one DBSCAN cluster so we can build a YARA signature
# from the features they all share
name = 3
fdf = pd.DataFrame()
for f in dbscan_df[dbscan_df['cluster'] == name].filename.tolist():
    fdf = fdf.append(df[df['filename'] == f], ignore_index=True)

# Choose a sample from the cluster to use as the basis of the sig
# w/the attributes below
filename = fdf.filename.value_counts().index[0]
meta = {"author": "sconzo", "email": "sconzo_at_clicksecurity_dot_com"}
sig = yara_signature.yara_macho_generator.YaraMachoGenerator("/Users/sconzo/macho-yara/macho/" + filename, samplename="Cluster_" + str(name), meta=meta)

lc_cmds = []
lc_symtab = []
lc_dysymtab = []
lc_dyld_info = []
lc_segment = []
lc_segment_64 = []

# Keep only features that are identical (and set) across the whole cluster
for col in fdf.columns:
    if len(fdf[col].unique()) == 1:
        if fdf[col].unique()[0] != 0:
            lower = [s for s in col if s.islower()]
            if fdf[col].unique()[0] > 0 or (len(lower) == len(col)):
                if col.startswith('LC_'):
                    lc_cmds.append(col)
                # Check the 64-bit prefix first so those columns don't also
                # land in the 32-bit segment list
                if col.startswith('lc_segment_64_'):
                    lc_segment_64.append(col)
                elif col.startswith('lc_segment_'):
                    lc_segment.append(col)
                if col.startswith('lc_symtab_'):
                    lc_symtab.append(col)
                if col.startswith('lc_dysymtab_'):
                    lc_dysymtab.append(col)
                if col.startswith('lc_dyld_info_'):
                    lc_dyld_info.append(col)

if len(lc_symtab) > 0:
    lc_cmds = [x for x in lc_cmds if x != 'LC_SYMTAB']
    lc_symtab = set([x[10:] for x in lc_symtab])
    sig.add_symtab(lc_symtab)

if len(lc_dysymtab) > 0:
    lc_cmds = [x for x in lc_cmds if x != 'LC_DYSYMTAB']
    lc_dysymtab = set([x[12:] for x in lc_dysymtab])
    sig.add_dysymtab(lc_dysymtab)

if len(lc_dyld_info) > 0:
    lc_cmds = [x for x in lc_cmds if x != 'LC_DYLD_INFO']
    lc_cmds = [x for x in lc_cmds if x != 'LC_DYLD_INFO_ONLY']
    lc_dyld_info = set([x[13:] for x in lc_dyld_info])
    sig.add_dyld_info(lc_dyld_info)

if len(lc_segment) > 0:
    lc_cmds = [x for x in lc_cmds if x != 'LC_SEGMENT']
    lc_segment = set([x[12:] for x in lc_segment])
    sig.add_segment(lc_segment)

if len(lc_segment_64) > 0:
    lc_cmds = [x for x in lc_cmds if x != 'LC_SEGMENT_64']
    lc_segment_64 = set([x[14:] for x in lc_segment_64])
    sig.add_segment(lc_segment_64)

# Both version-min commands are handled with the same generator call here
if 'LC_VERSION_MIN_IPHONEOS' in lc_cmds:
    lc_cmds = [x for x in lc_cmds if x != 'LC_VERSION_MIN_IPHONEOS']
    sig.add_version_min_macosx()
if 'LC_VERSION_MIN_MACOSX' in lc_cmds:
    lc_cmds = [x for x in lc_cmds if x != 'LC_VERSION_MIN_MACOSX']
    sig.add_version_min_macosx()

[sig.add_lc(x) for x in lc_cmds]
print sig.get_signature()

from sklearn.cluster import KMeans
X = df.as_matrix(cols)
X = scale(X)
# Rule of thumb of k = sqrt(#samples/2), thanks Wikipedia :)
k_clusters = int(math.sqrt(len(X) / 2))
kmeans = KMeans(n_clusters=k_clusters)
kmeans.fit(X)
labels1 = kmeans.labels_
df['cluster'] = labels1
labels1_u = np.unique(labels1)
nclusters = len(labels1_u)

kmeans_df = df[['label', 'filename', 'positives'] + engines].copy()
kmeans_df['cluster'] = labels1
print "Number of clusters: %d" % k_clusters
kmeans_df['cluster'].value_counts().head(10)

X = df.as_matrix(cols)
X = scale(X)
X = PCA(n_components=3).fit_transform(X)
fig = plt.figure(figsize=plt.figaspect(.5))
ax = fig.add_subplot(1, 2, 1, projection='3d')
ax.scatter(X[:,0], X[:,1], X[:,2], c=kmeans_df['cluster'], s=50)
ax.set_title("KMeans Clusters")
ax = fig.add_subplot(1, 2, 2, projection='3d')
ax.set_xlim(-10, 2)
ax.set_ylim(10, 35)
ax.set_zlim(-20, 10)
ax.scatter(X[:,0], X[:,1], X[:,2], c=kmeans_df['cluster'], s=50)
ax.set_title("KMeans Clusters (zoomed in)")
plt.show()

clusters = set()
for name, group in kmeans_df.groupby(['cluster', 'label'])['label']:
    if name[0] in clusters:
        print "%s Cluster has both Malicious and Non-Malicious Samples" % name[0]
    clusters.add(name[0])

sample_cluster = 5
kmeans_df[kmeans_df['cluster'] == sample_cluster][engines].head()
for eng in engines:
    print "%s - %s" % (eng, len(kmeans_df[kmeans_df['cluster'] == sample_cluster][eng].unique().tolist()))
kmeans_df[kmeans_df['cluster'] == sample_cluster]['Symantec'].value_counts()
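
# Not in the original notebook: the sqrt(n/2) rule above is only a starting
# point for k. A quick sanity check is to sweep k and plot the KMeans inertia
# (within-cluster sum of squares), looking for an "elbow". A minimal sketch,
# assuming df and cols are still as defined above:
X_elbow = scale(df.as_matrix(cols))
k_values = range(2, 21)
inertias = [KMeans(n_clusters=k).fit(X_elbow).inertia_ for k in k_values]
plt.plot(k_values, inertias, marker='o')
plt.xlabel("k")
plt.ylabel("Inertia")
plt.title("Elbow check for choosing k")
plt.show()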
X = df.as_matrix(cols)
X = scale(X)
X = PCA(n_components=n_comp).fit_transform(X)
# Rule of thumb of k = sqrt(#samples/2), thanks Wikipedia :)
k_clusters = int(math.sqrt(len(X) / 2))
kmeans = KMeans(n_clusters=k_clusters)
kmeans.fit(X)
labels1 = kmeans.labels_
df['cluster'] = labels1
labels1_u = np.unique(labels1)
nclusters = len(labels1_u)

kmeans_df = df[['label', 'filename', 'positives'] + engines].copy()
kmeans_df['cluster'] = labels1
print "Number of clusters: %d" % nclusters
print
print "Cluster/Sample Layout"
print df.cluster.value_counts().head(10)
print

X = df.as_matrix(cols)
X = scale(X)
X = PCA(n_components=3).fit_transform(X)
fig = plt.figure(figsize=plt.figaspect(.5))
ax = fig.add_subplot(1, 2, 1, projection='3d')
ax.scatter(X[:,0], X[:,1], X[:,2], c=kmeans_df['cluster'], s=50)
ax.set_title("KMeans Clusters")
ax = fig.add_subplot(1, 2, 2, projection='3d')
ax.set_xlim(-10, 2)
ax.set_ylim(15, 30)
ax.set_zlim(-20, 0)
ax.scatter(X[:,0], X[:,1], X[:,2], c=kmeans_df['cluster'], s=50)
ax.set_title("KMeans Clusters (zoomed in)")
plt.show()

clusters = set()
for name, group in kmeans_df.groupby(['cluster', 'label'])['label']:
    if name[0] in clusters:
        print "%s Cluster has both Malicious and Non-Malicious Samples" % name[0]
    clusters.add(name[0])

sample_cluster = 4
kmeans_df[kmeans_df['cluster'] == sample_cluster][engines].head()
for eng in engines:
    print "%s - %s" % (eng, len(kmeans_df[kmeans_df['cluster'] == sample_cluster][eng].unique().tolist()))

from sklearn.cluster import MeanShift, estimate_bandwidth
X = df.as_matrix(cols)
X = scale(X)
ebw = estimate_bandwidth(X)
ms1 = MeanShift(bandwidth=ebw)
ms1.fit(X)
labels1 = ms1.labels_
labels1_u = np.unique(labels1)
nclusters = len(labels1_u)

meanshift_df = df[['label', 'filename', 'positives'] + engines].copy()
meanshift_df['cluster'] = labels1
print "Estimated Bandwidth: %s" % ebw
print "Number of clusters: %d" % nclusters

X = df.as_matrix(cols)
X = scale(X)
X = PCA(n_components=3).fit_transform(X)
fig = plt.figure(figsize=plt.figaspect(.5))
ax = fig.add_subplot(1, 2, 1, projection='3d')
ax.scatter(X[:,0], X[:,1], X[:,2], c=meanshift_df['cluster'], s=50)
ax.set_title("MeanShift Clusters")
ax = fig.add_subplot(1, 2, 2, projection='3d')
ax.set_xlim(-10, 2)
ax.set_ylim(15, 30)
ax.set_zlim(-20, 0)
ax.scatter(X[:,0], X[:,1], X[:,2], c=meanshift_df['cluster'], s=50)
ax.set_title("MeanShift Clusters (zoomed in)")
plt.show()

meanshift_df.cluster.value_counts().head(10)

clusters = set()
for name, group in meanshift_df.groupby(['cluster', 'label'])['label']:
    if name[0] in clusters:
        print "%s Cluster has both Malicious and Non-Malicious Samples" % name[0]
    clusters.add(name[0])

sample_cluster = 2
meanshift_df[meanshift_df['cluster'] == sample_cluster][engines].head()
for eng in engines:
    print "%s - %s" % (eng, len(meanshift_df[meanshift_df['cluster'] == sample_cluster][eng].unique().tolist()))

X = df.as_matrix(cols)
X = scale(X)
X = PCA(n_components=n_comp).fit_transform(X)
ebw = estimate_bandwidth(X)
ms1 = MeanShift(bandwidth=ebw)
ms1.fit(X)
labels1 = ms1.labels_
labels1_u = np.unique(labels1)
nclusters = len(labels1_u)

meanshift_df = df[['label', 'filename', 'positives'] + engines].copy()
meanshift_df['cluster'] = labels1
# Update df's cluster column before printing the layout, so we report the
# MeanShift clusters rather than the stale KMeans ones
df['cluster'] = meanshift_df['cluster']
print "Estimated Bandwidth: %s" % ebw
print "Number of clusters: %d" % nclusters
print
print "Cluster/Sample Layout"
print df.cluster.value_counts().head(10)
print
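
# Not in the original notebook: estimate_bandwidth's quantile parameter
# (default 0.3) strongly affects how many clusters MeanShift finds. A quick,
# hypothetical sweep to see how sensitive the result is, assuming X is the
# scaled, PCA-reduced matrix from the cell above:
for q in [0.1, 0.2, 0.3, 0.4, 0.5]:
    bw = estimate_bandwidth(X, quantile=q)
    ms = MeanShift(bandwidth=bw).fit(X)
    print "quantile %.1f -> bandwidth %.2f, %d clusters" % (q, bw, len(np.unique(ms.labels_)))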
# Once again we can remove, in this case, the largest cluster for a less dense graph
tempdf = df[df['cluster'] != 0].reset_index(drop=True)
X = tempdf.as_matrix(cols)
X = scale(X)
X = PCA(n_components=3).fit_transform(X)
fig = plt.figure(figsize=plt.figaspect(.5))
ax = fig.add_subplot(1, 2, 1, projection='3d')
ax.scatter(X[:,0], X[:,1], X[:,2], c=tempdf['cluster'], s=50)
ax.set_title("MeanShift Clusters")
ax = fig.add_subplot(1, 2, 2, projection='3d')
ax.set_xlim(-10, 2)
ax.set_ylim(15, 30)
ax.set_zlim(-20, 0)
ax.scatter(X[:,0], X[:,1], X[:,2], c=tempdf['cluster'], s=50)
ax.set_title("MeanShift Clusters (zoomed in)")
plt.show()

clusters = set()
for name, group in meanshift_df.groupby(['cluster', 'label'])['label']:
    if name[0] in clusters:
        print "%s Cluster has both Malicious and Non-Malicious Samples" % name[0]
    clusters.add(name[0])

sample_cluster = 2
meanshift_df[meanshift_df['cluster'] == sample_cluster][engines].head()
for eng in engines:
    print "%s - %s" % (eng, len(meanshift_df[meanshift_df['cluster'] == sample_cluster][eng].unique().tolist()))
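
# Not in the original notebook: a small sketch comparing how much the three
# algorithms agree with each other, using the adjusted Rand index (1.0 means
# identical partitions). Assumes dbscan_df, kmeans_df, and meanshift_df still
# hold their 'cluster' columns for the same row order.
from sklearn.metrics import adjusted_rand_score
print "DBSCAN vs KMeans:    %.3f" % adjusted_rand_score(dbscan_df['cluster'], kmeans_df['cluster'])
print "DBSCAN vs MeanShift: %.3f" % adjusted_rand_score(dbscan_df['cluster'], meanshift_df['cluster'])
print "KMeans vs MeanShift: %.3f" % adjusted_rand_score(kmeans_df['cluster'], meanshift_df['cluster'])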