import pandas as pd
print 'pandas version is', pd.__version__
import numpy as np
print 'numpy version is', np.__version__
import sklearn
print 'scikit-learn version is', sklearn.__version__
import matplotlib
print 'matplotlib version is', matplotlib.__version__
import matplotlib.pyplot as plt
%matplotlib inline

plt.rcParams['font.size'] = 18.0
plt.rcParams['figure.figsize'] = 16.0, 5.0

def plot_cm(cm, labels):
    # Compute percentages
    percent = (cm * 100.0) / np.array(np.matrix(cm.sum(axis=1)).T)
    print 'Confusion Matrix Stats'
    for i, label_i in enumerate(labels):
        for j, label_j in enumerate(labels):
            print "%s/%s: %.2f%% (%d/%d)" % (label_i, label_j, (percent[i][j]), cm[i][j], cm[i].sum())

    # Show confusion matrix
    # Thanks to kermit666 from stackoverflow
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.grid(b=False)
    cax = ax.matshow(percent, cmap='coolwarm', vmin=0, vmax=100)
    plt.title('Confusion matrix of the classifier')
    fig.colorbar(cax)
    ax.set_xticklabels([''] + labels)
    ax.set_yticklabels([''] + labels)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.show()

def extract_features(data):
    all_features = []
    if not 'error' in data['characteristics']['macho']:
        for i in range(data['characteristics']['macho']['number of architectures']):
            features = {}
            features['entropy'] = data['metadata']['entropy']
            features['file_size'] = data['metadata']['file_size']
            features['number of architectures'] = data['characteristics']['macho']['number of architectures']
            features['header size'] = data['characteristics']['macho']['header'][i]['size']
            features['cmd count'] = data['characteristics']['macho']['header'][i]['mach header']['ncmds']
            features['cmd size'] = data['characteristics']['macho']['header'][i]['mach header']['sizeofcmds']
            features['cputype'] = data['characteristics']['macho']['header'][i]['mach header']['cputype_string']
            # One-hot encode the mach header flags
            for flag in data['characteristics']['macho']['header'][i]['mach header']['flags']:
                features[flag['name']] = 1
            features['string count'] = data['verbose']['macho']['header'][i]['const strings count']
            if 'const strings' in data['verbose']['macho']['header'][i]:
                features['const strings'] = data['verbose']['macho']['header'][i]['const strings']
            else:
                features['const strings'] = []
            if 'symbol table strings' in data['verbose']['macho']['header'][i]:
                features['symbol table strings'] = data['verbose']['macho']['header'][i]['symbol table strings']
            else:
                features['symbol table strings'] = []
            features['segment names'] = []
            features['section names'] = []
            for command in data['verbose']['macho']['header'][i]['commands']:
                if command['cmd_name'] in ['LC_SEGMENT', 'LC_SEGMENT_64']:
                    features['segment names'].append(command['segname'])
                    if command['segname'] == '__PAGEZERO':
                        features['pz_size'] = command['cmd_size']
                        features['pz_vmsize'] = command['vmsize']
                        features['pz_vmaddr'] = command['vmaddr']
                        features['pz_flags'] = command['flags']
                        features['pz_filesize'] = command['filesize']
                        features['pz_nsects'] = command['nsects']
                        features['pz_fileoff'] = command['fileoff']
                        for flag in command['initprot']:
                            features['pz_initprot_' + flag] = 1
                        for flag in command['maxprot']:
                            features['pz_maxprot_' + flag] = 1
                    if command['segname'] == '__TEXT':
                        features['text_size'] = command['cmd_size']
                        features['text_vmsize'] = command['vmsize']
                        features['text_vmaddr'] = command['vmaddr']
                        features['text_flags'] = command['flags']
                        features['text_filesize'] = command['filesize']
                        features['text_nsects'] = command['nsects']
                        features['text_fileoff'] = command['fileoff']
                        for flag in command['initprot']:
                            features['text_initprot_' + flag] = 1
                        for flag in command['maxprot']:
                            features['text_maxprot_' + flag] = 1
                        features['text_entropy'] = command['entropy']
                        for section in command['sections']:
                            if section['sectname'] == '__text':
                                features['text_section_reloff'] = section['reloff']
                                features['text_section_addr'] = section['addr']
                                features['text_section_align'] = section['align']
                                features['text_section_nreloc'] = section['nreloc']
                                features['text_section_offset'] = section['offset']
                                features['text_section_size'] = section['size']
                                features['text_section_reserved1'] = section['reserved1']
                                features['text_section_reserved2'] = section['reserved2']
                                features['text_section_' + section['flags']['type']] = 1
                                if 'attributes' in section['flags']:
                                    for attr in section['flags']['attributes']:
                                        features['text_section_flag_attribute_' + attr] = 1
                            if section['sectname'] == '__const':
                                features['const_section_reloff'] = section['reloff']
                                features['const_section_addr'] = section['addr']
                                features['const_section_align'] = section['align']
                                features['const_section_nreloc'] = section['nreloc']
                                features['const_section_offset'] = section['offset']
                                features['const_section_size'] = section['size']
                                features['const_section_reserved1'] = section['reserved1']
                                features['const_section_reserved2'] = section['reserved2']
                                features['const_section_' + section['flags']['type']] = 1
                                if 'attributes' in section['flags']:
                                    for attr in section['flags']['attributes']:
                                        features['const_section_flag_attribute_' + attr] = 1
                    if command['segname'] == '__DATA' and command['nsects'] > 0:
                        features['data_size'] = command['cmd_size']
                        features['data_vmsize'] = command['vmsize']
                        features['data_vmaddr'] = command['vmaddr']
                        features['data_flags'] = command['flags']
                        features['data_filesize'] = command['filesize']
                        features['data_nsects'] = command['nsects']
                        features['data_fileoff'] = command['fileoff']
                        for flag in command['initprot']:
                            features['data_initprot_' + flag] = 1
                        for flag in command['maxprot']:
                            features['data_maxprot_' + flag] = 1
                        features['data_entropy'] = command['entropy']
                    #if command['segname'] == '__IMPORT':
                    #    features['import_size'] = command['cmd_size']
                    #    features['import_vmsize'] = command['vmsize']
                    #    features['import_vmaddr'] = command['vmaddr']
                    #    features['import_flags'] = command['flags']
                    #    features['import_filesize'] = command['filesize']
                    #    features['import_nsects'] = command['nsects']
                    #    features['import_fileoff'] = command['fileoff']
                    #    for flag in command['initprot']:
                    #        features['import_initprot_' + flag] = 1
                    #    for flag in command['maxprot']:
                    #        features['import_maxprot_' + flag] = 1
                    #    features['import_entropy'] = command['entropy']
                    if command['segname'] == '__LINKEDIT':
                        features['linkedit_size'] = command['cmd_size']
                        features['linkedit_vmsize'] = command['vmsize']
                        features['linkedit_vmaddr'] = command['vmaddr']
                        features['linkedit_flags'] = command['flags']
                        features['linkedit_filesize'] = command['filesize']
                        features['linkedit_nsects'] = command['nsects']
                        features['linkedit_fileoff'] = command['fileoff']
                        for flag in command['initprot']:
                            features['linkedit_initprot_' + flag] = 1
                        for flag in command['maxprot']:
                            features['linkedit_maxprot_' + flag] = 1
                    if 'sections' in command:
                        for section in command['sections']:
                            features['section names'].append(section['sectname'])
                if command['cmd_name'] == 'LC_SYMTAB':
                    features['strsize'] = command['strsize']
                    features['stroff'] = command['stroff']
                    features['symoff'] = command['symoff']
                    features['nsyms'] = command['nsyms']
                if command['cmd_name'] in ['LC_DYLD_INFO_ONLY', 'LC_DYLD_INFO']:
                    features['lazy_bind_size'] = command['lazy_bind_size']
                    features['rebase_size'] = command['rebase_size']
                    features['weak_bind_size'] = command['weak_bind_size']
                    features['lazy_bind_off'] = command['lazy_bind_off']
                    features['export_off'] = command['export_off']
                    features['export_size'] = command['export_size']
                    features['bind_off'] = command['bind_off']
                    features['rebase_off'] = command['rebase_off']
                    features['bind_size'] = command['bind_size']
                    features['weak_bind_off'] = command['weak_bind_off']
                if command['cmd_name'] == 'LC_DYSYMTAB':
                    features['nextdefsym'] = command['nextdefsym']
                    features['extreloff'] = command['extreloff']
                    features['nlocrel'] = command['nlocrel']
                    features['indirectsymoff'] = command['indirectsymoff']
                    features['modtaboff'] = command['modtaboff']
                    features['iundefsym'] = command['iundefsym']
                    features['ntoc'] = command['ntoc']
                    features['ilocalsym'] = command['ilocalsym']
                    features['nundefsym'] = command['nundefsym']
                    features['nextrefsyms'] = command['nextrefsyms']
                    features['locreloff'] = command['locreloff']
                    features['nmodtab'] = command['nmodtab']
                    features['nlocalsym'] = command['nlocalsym']
                    features['tocoff'] = command['tocoff']
                    features['extrefsymoff'] = command['extrefsymoff']
                    features['nindirectsyms'] = command['nindirectsyms']
                    features['iextdefsym'] = command['iextdefsym']
                    features['nextrel'] = command['nextrel']
            features.update(data['verbose']['macho']['header'][i]['command type count'])
            if 'LC_SEGMENT' in features:
                features['number of segments'] = features['LC_SEGMENT']
            else:
                features['number of segments'] = features['LC_SEGMENT_64']
            if 'entry point' in data['verbose']['macho']['header'][i]:
                for key, value in data['verbose']['macho']['header'][i]['entry point'].iteritems():
                    features['entry point ' + key] = value
            all_features.append(features)
    return all_features

def load_files(file_list):
    import json
    features_list = []
    for filename in file_list:
        with open(filename, 'rb') as f:
            features = extract_features(json.loads(f.read()))
            features_list.extend(features)
    return features_list

# Good files
import glob
good_list = glob.glob('data/good/*.results')
good_features = load_files(good_list)
print "Files:", len(good_list)
print "Number of feature vectors:", len(good_features)

# Bad files
bad_list = glob.glob('data/bad/*.results')
bad_features = load_files(bad_list)
print "Files:", len(bad_list)
print "Number of feature vectors:", len(bad_features)

df_good_orig = pd.DataFrame.from_records(good_features)
df_good_orig['label'] = 'good'
df_good_orig.head()

# Missing LC_/MH_ flag columns mean the flag wasn't present, so fill with 0;
# everything else missing gets a -1 sentinel.
df_good = df_good_orig
for col in df_good.columns:
    if col[0:3] in ['LC_', 'MH_']:
        #print col
        df_good[col].fillna(0, inplace=True)
df_good.fillna(-1, inplace=True)
df_good_orig.head()

df_bad_orig = pd.DataFrame.from_records(bad_features)
df_bad_orig['label'] = 'bad'
df_bad = df_bad_orig
for col in df_bad.columns:
    if col[0:3] in ['LC_', 'MH_']:
        #print col
        df_bad[col].fillna(0, inplace=True)
df_bad.fillna(-1, inplace=True)
df_bad.head()

df_good['cputype'].value_counts().plot(kind='bar')
df_bad['cputype'].value_counts().plot(kind='bar')

df_all_orig = pd.concat([df_bad, df_good], ignore_index=True)
df_all = df_all_orig
for col in df_all.columns:
    if col[0:3] in ['LC_', 'MH_']:
        #print col
        df_all[col].fillna(0, inplace=True)
df_all.fillna(-1, inplace=True)

# Break out by cpu type
cond = df_all['cputype'] == 'x86_64'
df_x64 = df_all[cond]
cond = df_all['cputype'] == 'i386'
df_x86 = df_all[cond]
df_all = pd.concat([df_x64, df_x86], ignore_index=True)

#print df['symbol table strings']
df = df_all.drop(['const strings', 'symbol table strings', 'segment names', 'section names',
                  'entry point section name', 'entry point segment name',
                  'entry point instruction type'], axis=1)
df['cputype'].value_counts().plot(kind='bar')

cond = df['label'] == 'good'
good = df[cond]
bad = df[~cond]
bad['cmd count'].hist(alpha=.5, label='bad', bins=40)
good['cmd count'].hist(alpha=.5, label='good', bins=40)
plt.legend()

df.boxplot('cmd count', 'label')
plt.xlabel('bad vs. good files')
plt.ylabel('Number of Commands')
plt.title('Comparison of Number of Commands')
plt.suptitle('')

df.boxplot(column='cmd size', by='label')
plt.xlabel('bad vs. good files')
plt.ylabel('Command Size')
plt.title('Comparison of Command Sizes')
plt.suptitle("")

# Split the classes up so we can set colors, size, labels
cond = df['label'] == 'good'
good = df[cond]
bad = df[~cond]
plt.scatter(good['cmd count'], good['cmd size'], s=140, c='#aaaaff', label='Good', alpha=.4)
plt.scatter(bad['cmd count'], bad['cmd size'], s=40, c='r', label='Bad', alpha=.5)
plt.legend()
plt.xlabel('Command Count')
plt.ylabel('Command Size')
plt.title('Command Size Vs Command Count')

df.boxplot('entropy', 'label')
plt.xlabel('bad vs. good files')
plt.ylabel('Entropy')
plt.title('Comparison of Entropy')
plt.suptitle('')

df.boxplot('number of segments', 'label')
plt.xlabel('bad vs. good files')
plt.ylabel('Number of Segments')
plt.title('Comparison of Number of Segments')
plt.suptitle('')

cond = df['label'] == 'good'
good = df[cond]
bad = df[~cond]
plt.scatter(good['entropy'], good['number of segments'], s=140, c='#aaaaff', label='Good', alpha=.4)
plt.scatter(bad['entropy'], bad['number of segments'], s=40, c='r', label='Bad', alpha=.5)
plt.legend()
plt.xlabel('Entropy')
plt.ylabel('Number of Segments')
plt.title('Number of Segments Vs Entropy')

# List of feature vectors (scikit learn uses 'X' for the matrix of feature vectors)
X_cmd = df.as_matrix(['cmd count', 'cmd size'])

# Labels (scikit learn uses 'y' for classification labels)
y = np.array(df['label'].tolist())

# Random Forest is a popular ensemble machine learning classifier.
# http://scikit-learn.org/dev/modules/generated/sklearn.ensemble.RandomForestClassifier.html
#
import sklearn.ensemble
import sklearn.cross_validation  # also needed for cross_val_score below
clf_cmd = sklearn.ensemble.RandomForestClassifier(n_estimators=50)

# Now we can use scikit learn's cross validation to assess predictive performance.
scores = sklearn.cross_validation.cross_val_score(clf_cmd, X_cmd, y, cv=10)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
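# The mean accuracy above can hide a lot of fold-to-fold variance, so it's worth
# eyeballing the individual folds as well. A minimal sketch using the `scores`
# array already computed above (sklearn.cross_validation is the deprecated API
# this notebook was written against; newer releases moved these helpers to
# sklearn.model_selection).
for fold, fold_score in enumerate(scores):
    print 'Fold %d accuracy: %0.3f' % (fold, fold_score)
print 'Spread: %0.3f (min) to %0.3f (max)' % (scores.min(), scores.max())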
my_seed = 1022
my_tsize = .2
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_cmd, y, test_size=my_tsize, random_state=my_seed)
clf_cmd.fit(X_train, y_train)
clf_cmd_scores = clf_cmd.score(X_test, y_test)
print("Accuracy: %0.2f" % clf_cmd_scores)

y_pred = clf_cmd.predict(X_test)
from sklearn.metrics import confusion_matrix
labels = ['good', 'bad']
cm = confusion_matrix(y_test, y_pred, labels)
plot_cm(cm, labels)

clf_2 = sklearn.ensemble.RandomForestClassifier(n_estimators=50)
X_2 = df.as_matrix(['number of segments', 'entropy'])
X_train, X_test, y_train, y_test = train_test_split(X_2, y, test_size=my_tsize, random_state=my_seed)
clf_2.fit(X_train, y_train)
y_pred = clf_2.predict(X_test)
from sklearn.metrics import confusion_matrix
labels = ['good', 'bad']
cm = confusion_matrix(y_test, y_pred, labels)
plot_cm(cm, labels)

clf_all = sklearn.ensemble.RandomForestClassifier(n_estimators=50)
no_label = list(df.columns.values)
no_label.remove('label')
no_label.remove('cputype')
X = df.as_matrix(no_label)

# 80/20 Split for predictive test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=my_tsize, random_state=my_seed)
clf_all.fit(X_train, y_train)
y_pred = clf_all.predict(X_test)
cm = confusion_matrix(y_test, y_pred, labels)
plot_cm(cm, labels)

# Feature Selection
# Which features best differentiated the two classes?
# Here we're going to grab the feature_importances from the classifier itself.
importances = zip(no_label, clf_all.feature_importances_)
importances.sort(key=lambda k: k[1], reverse=True)
for im in importances[0:20]:
    print im[0].ljust(30), im[1]

df_good['LC_UNIXTHREAD'].value_counts().plot(kind='bar', label='good')
plt.legend()
df_bad['LC_UNIXTHREAD'].value_counts().plot(kind='bar')

clf_some = sklearn.ensemble.RandomForestClassifier(n_estimators=50)
some = list(df.columns.values)
some.remove('label')
some.remove('cputype')
some.remove('LC_UNIXTHREAD')
some.remove('LC_MAIN')
X_some = df.as_matrix(some)

# 80/20 Split for predictive test
X_train_some, X_test_some, y_train_some, y_test_some = train_test_split(X_some, y, test_size=my_tsize, random_state=my_seed)
clf_some.fit(X_train_some, y_train_some)
y_pred_some = clf_some.predict(X_test_some)
cm = confusion_matrix(y_test_some, y_pred_some, labels)
plot_cm(cm, labels)

importances = zip(some, clf_some.feature_importances_)
importances.sort(key=lambda k: k[1], reverse=True)
for im in importances[0:20]:
    print im[0].ljust(30), im[1]

symbol_strings = []
for strings, label in zip(df_all['symbol table strings'], df_all['label']):
    for symbol in strings:
        symbol_strings.append({'symbol string': symbol, 'label': label})
pd_symbols = pd.DataFrame.from_records(symbol_strings)
pd_symbols.head()
import sys
sys.path.insert(0, '..')
import data_hacking.simple_stats as ss

# Spin up our g_test class
g_test = ss.GTest()

# Here we'd like to see how strongly various strings are associated with being clean or malware.
df_ct, df_cd, df_symbol_stats = g_test.highest_gtest_scores(pd_symbols['symbol string'], pd_symbols['label'],
                                                            N=0, matches=0, min_volume=5)
df_symbol_stats.dropna(inplace=True)
df_symbol_stats.sort('bad_g', ascending=0).head(25)
df_symbol_stats.sort('good_g', ascending=0).head(25)

def tokenize_string(string):
    # Collapse compiler-generated name families into single tokens
    if '___cxx_global_var_init' in string:
        return '___cxx_global_var_init_TOKEN'
    elif '__ZN12_GLOBAL' in string:
        return '__ZN12_GLOBAL_TOKEN'
    else:
        return string

pd_symbols['tokenized string'] = pd_symbols['symbol string'].map(lambda x: tokenize_string(x))
df_ct, df_cd, df_symbol_stats = g_test.highest_gtest_scores(pd_symbols['tokenized string'], pd_symbols['label'],
                                                            N=10, matches=5, min_volume=5)
df_symbol_stats.dropna(inplace=True)
df_symbol_stats.sort('bad_g', ascending=0).head(10)

def g_aggregate(df_stats, sequence, name):
    try:
        g_scores = [df_stats.ix[tokenize_string(item)][name] for item in sequence]
    except KeyError:
        return 0
    return sum(g_scores) / len(g_scores) if g_scores else 0

# Average
df_all['symbol table strings malicious_g'] = \
    df_all['symbol table strings'].map(lambda x: g_aggregate(df_symbol_stats, x, 'bad_g'))
df_all['symbol table strings clean_g'] = \
    df_all['symbol table strings'].map(lambda x: g_aggregate(df_symbol_stats, x, 'good_g'))

segment_names = []
for strings, label in zip(df_all['segment names'], df_all['label']):
    for name in strings:
        segment_names.append({'segment names': name, 'label': label})
pd_segment_names = pd.DataFrame.from_records(segment_names)
pd_segment_names.head()

# Spin up our g_test class
g_test = ss.GTest()
df_ct, df_cd, df_segment_stats = g_test.highest_gtest_scores(pd_segment_names['segment names'], pd_segment_names['label'],
                                                             N=0, matches=0, min_volume=0)
df_segment_stats.dropna(inplace=True)
df_segment_stats.sort('bad_g', ascending=0).head(10)

s = []
for strings, label in zip(df_all['section names'], df_all['label']):
    for name in strings:
        s.append({'section names': name, 'label': label})
pd_section_names = pd.DataFrame.from_records(s)
print pd_section_names.shape
pd_section_names.head()
# Spin up our g_test class
g_test = ss.GTest()

# Here we'd like to see how strongly various section names are associated with being clean or malware.
df_ct, df_cd, df_section_stats = g_test.highest_gtest_scores(pd_section_names['section names'], pd_section_names['label'],
                                                             N=0, matches=0, min_volume=5)
df_section_stats.dropna(inplace=True)
df_section_stats.sort('bad_g', ascending=0).head(10)
df_section_stats.sort('good_g', ascending=0).head(10)

df_all['segment names malicious_g'] = df_all['segment names'].map(lambda x: g_aggregate(df_segment_stats, x, 'bad_g'))
df_all['segment names clean_g'] = df_all['segment names'].map(lambda x: g_aggregate(df_segment_stats, x, 'good_g'))
df_all['section names malicious_g'] = df_all['section names'].map(lambda x: g_aggregate(df_section_stats, x, 'bad_g'))
df_all['section names clean_g'] = df_all['section names'].map(lambda x: g_aggregate(df_section_stats, x, 'good_g'))

g_test = ss.GTest()
df_entrypoint = pd.DataFrame(df_all, columns=['entry point section name', 'entry point segment name',
                                              'entry point instruction type', 'label'])
df_entrypoint = df_entrypoint.replace(0, np.nan)
df_entrypoint.dropna(inplace=True)
df_entrypoint.head()

df_ct, df_cd, df_ep_section_stats = g_test.highest_gtest_scores(df_entrypoint['entry point section name'],
                                                                df_entrypoint['label'], N=0, matches=0, min_volume=5)
df_ct, df_cd, df_ep_segment_stats = g_test.highest_gtest_scores(df_entrypoint['entry point segment name'],
                                                                df_entrypoint['label'], N=0, matches=0, min_volume=5)
df_ct, df_cd, df_ep_instruction_stats = g_test.highest_gtest_scores(df_entrypoint['entry point instruction type'],
                                                                    df_entrypoint['label'], N=0, matches=0, min_volume=5)
df_ep_section_stats.head()
df_ep_segment_stats.head()
df_ep_instruction_stats.dropna(inplace=True)
df_ep_instruction_stats.sort('bad_g', ascending=0).head(20)
df_entrypoint['entry point instruction type'].value_counts().plot(kind='bar')

clf_final = sklearn.ensemble.RandomForestClassifier(n_estimators=50)
final_labels = list(df_all.columns.values)
final_labels.remove('label')
final_labels.remove('cputype')
final_labels.remove('LC_UNIXTHREAD')
final_labels.remove('LC_MAIN')
final_labels.remove('const strings')
final_labels.remove('symbol table strings')
final_labels.remove('segment names')
final_labels.remove('section names')
final_labels.remove('entry point section name')
final_labels.remove('entry point segment name')
final_labels.remove('entry point instruction type')
X_final = df_all.as_matrix(final_labels)
y_final = np.array(df_all['label'].tolist())

scores = sklearn.cross_validation.cross_val_score(clf_final, X_final, y_final, cv=10, n_jobs=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

# 80/20 Split for predictive test
X_train_final, X_test_final, y_train_final, y_test_final = train_test_split(X_final, y_final,
                                                                            test_size=my_tsize, random_state=my_seed)
clf_final.fit(X_train_final, y_train_final)
y_pred_final = clf_final.predict(X_test_final)
cm_final = confusion_matrix(y_test_final, y_pred_final, labels)
plot_cm(cm_final, labels)

# Column 0 of predict_proba corresponds to the first class alphabetically ('bad')
y_probs_final = clf_final.predict_proba(X_test_final)[:, 0]
thres = .8  # This can be set to whatever you'd like
y_pred_final[y_probs_final < thres] = 'good'
y_pred_final[y_probs_final >= thres] = 'bad'
cm = confusion_matrix(y_test_final, y_pred_final, labels)
plot_cm(cm, labels)
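# Rather than eyeballing a single threshold, an ROC curve shows the trade-off at
# every threshold. A minimal sketch with sklearn.metrics.roc_curve; it needs
# binary indicators, so we map 'bad' to 1 (treating 'bad' as the positive class
# is an assumption of this sketch, consistent with the y_probs_final column above).
from sklearn.metrics import roc_curve, auc
y_true_binary = (y_test_final == 'bad').astype(int)
fpr, tpr, thresholds = roc_curve(y_true_binary, y_probs_final)
plt.plot(fpr, tpr, label='ROC curve (AUC = %0.3f)' % auc(fpr, tpr))
plt.plot([0, 1], [0, 1], 'k--')  # chance line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC for the full-feature classifier')
plt.legend(loc='lower right')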
importances = zip(final_labels, clf_final.feature_importances_)
importances.sort(key=lambda k: k[1], reverse=True)
for im in importances[0:20]:
    print im[0].ljust(35), im[1]

clf_most = sklearn.ensemble.RandomForestClassifier(n_estimators=50)
X = df_all.as_matrix([item[0] for item in importances[:20]])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=my_tsize, random_state=my_seed)
clf_most.fit(X_train, y_train)
y_pred = clf_most.predict(X_test)
cm = confusion_matrix(y_test, y_pred, labels)
plot_cm(cm, labels)

y_probs = clf_most.predict_proba(X_test)[:, 0]
thres = .92  # This can be set to whatever you'd like
y_pred[y_probs < thres] = 'good'
y_pred[y_probs >= thres] = 'bad'
cm = confusion_matrix(y_test, y_pred, labels)
plot_cm(cm, labels)

clf_most = sklearn.ensemble.ExtraTreesClassifier(n_estimators=50)
X = df_all.as_matrix([item[0] for item in importances[:20]])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=my_tsize, random_state=my_seed)
clf_most.fit(X_train, y_train)
y_pred = clf_most.predict(X_test)
cm = confusion_matrix(y_test, y_pred, labels)
plot_cm(cm, labels)
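# A single train/test split can be noisy when comparing two classifiers, so as a
# sanity check, cross-validate both on the same top-20 feature matrix. A minimal
# sketch using the same deprecated cross_validation API as above:
for name, clf in [('RandomForest', sklearn.ensemble.RandomForestClassifier(n_estimators=50)),
                  ('ExtraTrees', sklearn.ensemble.ExtraTreesClassifier(n_estimators=50))]:
    cv_scores = sklearn.cross_validation.cross_val_score(clf, X, y, cv=5)
    print '%s: %0.3f (+/- %0.3f)' % (name, cv_scores.mean(), cv_scores.std() * 2)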
df_all.boxplot(column='rebase_off', by='label')
plt.xlabel('bad vs. good files')
plt.ylabel('rebase_off')
plt.title('Comparison of rebase offset')
plt.suptitle("")

df_all.boxplot(column='rebase_off', by='label', sym='')
plt.xlabel('bad vs. good files')
plt.ylabel('rebase_off')
plt.title('Comparison of rebase offset')
plt.suptitle("")

cond = df_all['label'] == 'good'
good_all = df_all[cond]
bad_all = df_all[~cond]
bvc = pd.DataFrame(bad_all['rebase_off'].value_counts(), columns=['count'])
bvc['percentage'] = bvc['count'] / float(bad_all.shape[0])
gvc = pd.DataFrame(good_all['rebase_off'].value_counts(), columns=['count'])
gvc['percentage'] = gvc['count'] / float(good_all.shape[0])
print 'Most common rebase_off values from malicious set'
print bvc.head(5)
print ''
print 'Most common rebase_off values from clean set'
print gvc.head()

bad_all['rebase_off'].hist(alpha=1, label='bad', bins=1000)
good_all['rebase_off'].hist(alpha=1, label='good', bins=1000)
plt.title('Values for rebase offset')
plt.legend()
plt.xlim(-50, 3000000)

bad_all['rebase_off'].hist(alpha=1, label='bad', bins=2000)
good_all['rebase_off'].hist(alpha=1, label='good', bins=2000)
plt.legend()
plt.title('Values for rebase offset')
plt.xlim(-50, 80000)

df_all.boxplot(column='rebase_size', by='label')
plt.xlabel('bad vs. good files')
plt.ylabel('rebase_size')
plt.title('Comparison of rebase size')
plt.suptitle("")

df_all.boxplot(column='rebase_size', by='label', sym='')
plt.xlabel('bad vs. good files')
plt.ylabel('rebase_size')
plt.title('Comparison of rebase size')
plt.suptitle("")

bvc = pd.DataFrame(bad_all['rebase_size'].value_counts(), columns=['count'])
bvc['percentage'] = bvc['count'] / float(bad_all.shape[0])
gvc = pd.DataFrame(good_all['rebase_size'].value_counts(), columns=['count'])
gvc['percentage'] = gvc['count'] / float(good_all.shape[0])
print 'Most common rebase_size values from malicious set'
print bvc.head(5)
print ''
print 'Most common rebase_size values from clean set'
print gvc.head(5)

plt.scatter(good_all['rebase_off'], good_all['rebase_size'], s=140, c='#aaaaff', label='Good', alpha=.4)
plt.scatter(bad_all['rebase_off'], bad_all['rebase_size'], s=40, c='r', label='Bad', alpha=.5)
plt.title('rebase_off vs rebase_size')
plt.xlabel('rebase_off')
plt.ylabel('rebase_size')

plt.scatter(good_all['rebase_off'], good_all['rebase_size'], s=140, c='#aaaaff', label='Good', alpha=.4)
plt.scatter(bad_all['rebase_off'], bad_all['rebase_size'], s=40, c='r', label='Bad', alpha=.5)
plt.xlim(-1000, 150000)
plt.ylim(-100, 1100)
plt.title('rebase_off vs rebase_size')
plt.xlabel('rebase_off')
plt.ylabel('rebase_size')

plt.scatter(good_all['rebase_off'], good_all['rebase_size'], s=140, c='#aaaaff', label='Good', alpha=.4)
plt.scatter(bad_all['rebase_off'], bad_all['rebase_size'], s=40, c='r', label='Bad', alpha=.5)
plt.xlim(-10, 10)
plt.ylim(-10, 10)
plt.title('rebase_off vs rebase_size')
plt.xlabel('rebase_off')
plt.ylabel('rebase_size')

def load_vt_data(vt_list):
    import json
    results = {}
    for filename in vt_list:
        with open(filename, 'rb') as f:
            vt_data = f.read()
            if len(vt_data) == 0:
                continue  # skip empty result files instead of aborting the whole loop
            vt = json.loads(vt_data)
            for engine in vt['scans'].keys():
                if engine not in results:
                    results[engine] = 0
                if vt['scans'][engine]['detected']:
                    results[engine] += 1
    r = []
    for key, value in results.iteritems():
        r.append({'Engine': key, 'Count': value})
    return r

vt_list = glob.glob('data/bad/vt_data/*.vtdata')
num_files = len(vt_list)
vt_results = load_vt_data(vt_list)
vt_df = pd.DataFrame.from_records(vt_results, columns=['Engine', 'Count'])
vt_df['Files Scanned'] = num_files
vt_df['Percentage'] = vt_df['Count'] / float(num_files)
vt_df.sort('Count', ascending=0).head(52)

vt_list = glob.glob('data/good/vt_data/*.vtdata')
num_files = len(vt_list)
vt_results = load_vt_data(vt_list)
vt_df = pd.DataFrame.from_records(vt_results, columns=['Engine', 'Count'])
vt_df['Files Scanned'] = num_files
vt_df['Percentage'] = vt_df['Count'] / float(num_files)
vt_df.sort('Count', ascending=0).head(15)
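# A quick visual of the same VirusTotal coverage table -- here vt_df holds the
# clean set, so any bar is a false positive by that engine. A minimal sketch
# using the old pandas sort/plot APIs this notebook already uses:
vt_df.sort('Percentage', ascending=0).head(15).set_index('Engine')['Percentage'].plot(kind='bar')
plt.title('Per-engine detection rate on the clean set')
plt.ylabel('Fraction of files flagged')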