import binascii
import glob
import json

import pandas as pd
print('pandas version is %s' % pd.__version__)
import numpy as np
print('numpy version is %s' % np.__version__)
import sklearn
print('scikit-learn version is %s' % sklearn.__version__)
import sklearn.ensemble
from sklearn.metrics import confusion_matrix
try:
    from sklearn.model_selection import cross_val_score, train_test_split
except ImportError:
    # Older scikit-learn releases only ship the cross_validation module.
    from sklearn.cross_validation import cross_val_score, train_test_split
import matplotlib
print('matplotlib version is %s' % matplotlib.__version__)
import matplotlib.pyplot as plt

try:
    # Notebook export: equivalent of the `%matplotlib inline` magic.
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    # Not running under IPython; keep matplotlib's default backend.
    pass

plt.rcParams['font.size'] = 18.0
plt.rcParams['figure.figsize'] = 16.0, 5.0


def plot_cm(cm, labels):
    """Print per-cell statistics of a confusion matrix and draw it as a heat map.

    cm     -- square 2-D array of counts; rows are drawn on the 'True' axis and
              columns on the 'Predicted' axis.
    labels -- class names, in the same order as the rows/columns of cm.

    Every cell is shown as a percentage of its row total. A row that sums to
    zero yields NaN percentages (numpy division by zero).
    """
    cm = np.asarray(cm)
    labels = list(labels)

    # Compute percentages: divide every row by its own total.
    percent = (cm * 100.0) / cm.sum(axis=1)[:, np.newaxis]

    print('Confusion Matrix Stats')
    for i, label_i in enumerate(labels):
        for j, label_j in enumerate(labels):
            print("%s/%s: %.2f%% (%d/%d)" % (label_i, label_j, percent[i][j],
                                             cm[i][j], cm[i].sum()))

    # Show confusion matrix
    # Thanks to kermit666 from stackoverflow
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.grid(False)
    cax = ax.matshow(percent, cmap='coolwarm', vmin=0, vmax=100)
    plt.title('')
    fig.colorbar(cax)
    # The leading '' lines the names up with the cells: the first tick sits
    # outside the matrix.
    ax.set_xticklabels([''] + labels)
    ax.set_yticklabels([''] + labels)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.show()


def _is_hex_string(s):
    """Return True when s decodes cleanly as hexadecimal text."""
    try:
        binascii.unhexlify(s)
    except (TypeError, ValueError, binascii.Error):
        return False
    return True


def extract_features(data):
    """Flatten one parsed analysis report into a dict of model features.

    data -- dict decoded from a report file. The keys read here are
            data['metadata'], data['characteristics']['swf'] and
            data['verbose']['swf'].

    Returns a dict holding the file metadata, the SWF header values, one
    `<tag name> = 1` flag per tag type plus the summed 'tag count', and, when
    at least one DoABC/DoABCDefine tag is present, the ActionScript features.

    A missing key is reported on stdout and the features gathered up to that
    point are returned; no KeyError escapes this function.
    """
    features = {}
    try:
        metadata = data['metadata']
        features['sha256'] = metadata['sha256']
        features['size'] = metadata['file_size']
        features['entropy'] = metadata['entropy']

        swf_meta = data['characteristics']['swf']['swf metadata']
        features['version'] = swf_meta['version']
        features['frame count'] = swf_meta['framecount']
        features['frame rate'] = swf_meta['framerate']
        x_min = swf_meta['xmin']
        x_max = swf_meta['xmax']
        y_min = swf_meta['ymin']
        y_max = swf_meta['ymax']
        x_length = x_max - x_min
        y_length = y_max - y_min
        features['swf area'] = x_length * y_length
        features['swf perimeter'] = 2 * (x_length + y_length)

        features['tag count'] = 0
        for tag_info in data['characteristics']['swf']['tag types']:
            features[tag_info['tag name']] = 1
            features['tag count'] += tag_info['count']

        # Accumulate over *all* ABC tags. These lists live outside the loop so
        # that a later tag cannot wipe out what an earlier one contributed.
        bytecode_names = []
        abc_strings = []
        for tag_info in data['verbose']['swf']['tags']:
            if 'DoABCDefine' in tag_info:
                key = 'DoABCDefine'
            elif 'DoABC' in tag_info:
                key = 'DoABC'
            else:
                continue
            try:
                bytecode_names.append(tag_info[key]['bytecodename'])
            except KeyError:
                # Tag carries no name; use a fixed placeholder instead.
                bytecode_names.append('DoABCDefine')
            abc_strings.extend(tag_info[key]['abc']['strings'])

        if bytecode_names:
            # Encode the name of the first ABC tag as a small integer code.
            name_codes = {'': 1, 'DoABCDefine': 2, 'frame1': 3}
            features['first abc bytecode name'] = name_codes.get(bytecode_names[0], 4)
            features['abc bytecode name'] = bytecode_names
            features['bytecode name count'] = len(bytecode_names)
            features['unique bytecode name count'] = len(set(bytecode_names))
            features['abc strings'] = abc_strings
            features['abc string count'] = len(abc_strings)
            # 1 when any string longer than 100 characters is pure hex.
            features['long hex string'] = int(
                any(len(s) > 100 and _is_hex_string(s) for s in abc_strings))
            # NOTE(review): assumed to belong to the ABC-only branch; the
            # nesting could not be confirmed from the collapsed source.
            try:
                features['abc string m/m ratio'] = float(
                    data['verbose']['swf']['SWF String Statistical Analysis']
                        ['ActionScript String Length Mean to Median Ratio'])
            except KeyError:
                features['abc string m/m ratio'] = 0.0
    except KeyError as ke:
        # Look the hash up defensively so reporting cannot raise a second
        # KeyError when the metadata itself is what was missing.
        print('ERROR: %s %s' % (ke, features.get('sha256', '<unknown sha256>')))
    return features


def load_files(file_list):
    """Read every JSON report in file_list and return the list of feature dicts."""
    features_list = []
    for filename in file_list:
        with open(filename, 'rb') as f:
            features_list.append(extract_features(json.load(f)))
    return features_list


def boxplot_by_label(frame, column, ylabel, ylim=None):
    """Draw a boxplot of `column` grouped by the 'label' column of `frame`.

    The automatic titles and x label are blanked; `ylim` is an optional
    (low, high) pair applied to the y axis.
    """
    frame.boxplot(column=column, by='label')
    plt.ylabel(ylabel)
    plt.xlabel('')
    plt.title('')
    plt.suptitle('')
    if ylim is not None:
        plt.ylim(*ylim)


# Good files
good_list = glob.glob('data/clean/*.results')
good_features = load_files(good_list)
print("Files: %d" % len(good_list))

# Bad files
bad_list = glob.glob('data/malicious/*.results')
bad_features = load_files(bad_list)
print("Files: %d" % len(bad_list))

df_good = pd.DataFrame.from_records(good_features)
df_good.fillna(0, inplace=True)
df_good['label'] = 'benign'
df_good.head()

df_bad = pd.DataFrame.from_records(bad_features)
df_bad.fillna(0, inplace=True)
df_bad['label'] = 'malicious'
df_bad.head()

df = pd.concat([df_bad, df_good], ignore_index=True)
# Columns present in only one of the two frames come out of concat as NaN.
df.fillna(0, inplace=True)

df.groupby(['label', 'version'])['version'].count().unstack('label').fillna(0).plot(
    colormap='GnBu', kind='bar', stacked=True, grid=False)

boxplot_by_label(df, 'size', 'File Size', ylim=(0, 200000))
boxplot_by_label(df, 'entropy', 'Entropy')
boxplot_by_label(df, 'frame count', 'Frame Count', ylim=(0, 5000))
df_good['frame count'].value_counts()[0:10]
boxplot_by_label(df, 'frame rate', 'Frame Rate')
boxplot_by_label(df, 'swf area', 'Frame Area', ylim=(0, 750000))
boxplot_by_label(df, 'swf perimeter', 'Frame Perimeter')

# Shared settings for every hold-out split below.
my_seed = 1022
my_tsize = .2

# --- Classifier 1: file metadata / SWF header features only ---
clf_simple = sklearn.ensemble.RandomForestClassifier(n_estimators=50)
simple_features = ['entropy', 'frame count', 'frame rate', 'size', 'swf area',
                   'swf perimeter', 'version']
X = df[simple_features].values
y = np.array(df['label'].tolist())
scores = cross_val_score(clf_simple, X, y, cv=10)
print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))

# 80/20 Split for predictive test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=my_tsize, random_state=my_seed)
clf_simple.fit(X_train, y_train)
y_pred = clf_simple.predict(X_test)
labels = ['benign', 'malicious']
cm = confusion_matrix(y_test, y_pred, labels=labels)
plot_cm(cm, labels)

# Feature Selection
importances = sorted(zip(simple_features, clf_simple.feature_importances_),
                     key=lambda k: k[1], reverse=True)
for idx, im in enumerate(importances[0:10]):
    print('%s %s %s' % ((str(idx + 1) + ':').ljust(4), im[0].ljust(20),
                        round(im[1], 5)))

boxplot_by_label(df, 'tag count', 'Number of Tags', ylim=(0, 400))

# Per-label counts of files with / without a handful of tag types.
for tag_name in ['PlaceObject2', 'DoABC', 'DoABCDefine', 'DefineBitsJPEG2', 'End']:
    p = df.groupby([tag_name, 'label'])[tag_name].count().unstack(tag_name).fillna(0).plot(
        kind='bar', stacked=False, grid=False)
    p.set_xlabel('')

# --- Classifier 2: simple features + tag presence flags + tag count ---
clf_tags = sklearn.ensemble.RandomForestClassifier(n_estimators=50)
tag_names = [
    'CSMTextSettings', 'DebugID', 'DefineBinaryData', 'DefineBits',
    'DefineBitsJPEG2', 'DefineBitsJPEG3', 'DefineBitsLossless',
    'DefineBitsLossless2', 'DefineButton', 'DefineButton2',
    'DefineButtonSound', 'DefineEditText', 'DefineFont', 'DefineFont2',
    'DefineFont3', 'DefineFont4', 'DefineFontAlignZones', 'DefineFontInfo',
    'DefineFontInfo2', 'DefineFontName', 'DefineMorphShape',
    'DefineMorphShape2', 'DefineScalingGrid', 'DefineSceneAndFrameLabelData',
    'DefineShape', 'DefineShape2', 'DefineShape3', 'DefineShape4',
    'DefineSound', 'DefineSprite', 'DefineText', 'DefineText2',
    'DefineVideoStream', 'DoABC', 'DoABCDefine', 'DoAction', 'DoInitAction',
    'EnableDebugger2', 'End', 'ExportAssets', 'FileAttributes', 'FrameLabel',
    'ImportAssets2', 'JPEGTables', 'Metadata', 'PlaceObject', 'PlaceObject2',
    'PlaceObject3', 'ProductInfo', 'Protect', 'RemoveObject2', 'ScriptLimits',
    'SetBackgroundColor', 'ShowFrame', 'SoundStreamBlock', 'SoundStreamHead',
    'SoundStreamHead2', 'SymbolClass', 'Unknown',
]
tag_features = simple_features + tag_names + ['tag count']
X = df[tag_features].values
y = np.array(df['label'].tolist())
scores = cross_val_score(clf_tags, X, y, cv=10)
print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))

# 80/20 Split for predictive test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=my_tsize, random_state=my_seed)
clf_tags.fit(X_train, y_train)
y_pred = clf_tags.predict(X_test)
labels = ['benign', 'malicious']
cm = confusion_matrix(y_test, y_pred, labels=labels)
plot_cm(cm, labels)

importances = sorted(zip(tag_features, clf_tags.feature_importances_),
                     key=lambda k: k[1], reverse=True)
for idx, im in enumerate(importances[0:25]):
    print('%s %s %s' % ((str(idx + 1) + ':').ljust(4), im[0].ljust(40),
                        round(im[1], 5)))

boxplot_by_label(df, 'abc string count', 'Number of ActionScript Strings',
                 ylim=(0, 1000))
boxplot_by_label(df, 'abc string m/m ratio', 'ActionScript Mean/Median Ratio',
                 ylim=(0, 15))

# --- Classifier 3: tag features + ActionScript (ABC) features ---
clf_abc = sklearn.ensemble.RandomForestClassifier(n_estimators=50)
abc_features = tag_features + [
    'abc string count', 'abc string m/m ratio', 'bytecode name count',
    'first abc bytecode name', 'long hex string', 'unique bytecode name count',
]
X = df[abc_features].values
y = np.array(df['label'].tolist())
scores = cross_val_score(clf_abc, X, y, cv=10)
print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))
#### Again, not a real improvement.
import sklearn.ensemble
from sklearn.metrics import confusion_matrix
try:
    from sklearn.model_selection import cross_val_score, train_test_split
except ImportError:
    # Older scikit-learn releases only ship the cross_validation module.
    from sklearn.cross_validation import cross_val_score, train_test_split

# 80/20 Split for predictive test (X, y still hold the ABC feature matrix
# built for clf_abc just above).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=my_tsize, random_state=my_seed)
clf_abc.fit(X_train, y_train)
y_pred = clf_abc.predict(X_test)
labels = ['benign', 'malicious']
cm = confusion_matrix(y_test, y_pred, labels=labels)
plot_cm(cm, labels)

# Top 20 features with a running total of their importance.
importances = sorted(zip(abc_features, clf_abc.feature_importances_),
                     key=lambda k: k[1], reverse=True)
total = 0
for idx, im in enumerate(importances[0:20]):
    total += round(im[1], 5)
    print('%s %s %s %s' % ((str(idx + 1) + ':').ljust(4), im[0].ljust(35),
                           round(im[1], 5), total))

# Train on every labelled sample. abc_features is reused exactly as defined
# for clf_abc earlier in this file.
clf_everything = sklearn.ensemble.RandomForestClassifier(n_estimators=50)
X_all = df[abc_features].values
y_all = np.array(df['label'].tolist())
clf_everything.fit(X_all, y_all)

swf_malware_df = pd.read_hdf('data/swf_malware_df.hd5', 'table')
swf_malware_df['label'] = 'malicious'
swf_malware_df.shape

swf_bigpile_df = pd.read_hdf('data/swf_bigpile_df.hd5', 'table')
swf_bigpile_df['label'] = 'benign'
swf_bigpile_df.shape

# Score the big pile one row at a time and bucket by the probability in
# column 1 of predict_proba (the second entry of clf_everything.classes_).
# Scoring stops at the first row the model cannot handle, so the counts are
# partial in that case.
clean = 0
gray = 0
bad = 0
for x in swf_bigpile_df[abc_features].values:
    try:
        # predict_proba expects a 2-D array: one row, all features.
        score = clf_everything.predict_proba(x.reshape(1, -1))[0, 1]
    except Exception as e:
        print("Sad")
        print(e)
        print(x)
        break
    if score < 0.5:
        clean += 1
    elif score < 0.8:
        gray += 1
    else:
        bad += 1
print(swf_bigpile_df.shape)
print(clean)
print(gray)
print(bad)

# Shuffle the big pile: the first 5000 rows join the training data, the
# remainder is kept aside for scoring later.
swf_random_df = swf_bigpile_df.reindex(np.random.permutation(swf_bigpile_df.index))
swf_random_5k_df = swf_random_df[0:5000]
swf_random_the_rest_df = swf_random_df[5000:]
swf_bigger_df = pd.concat([swf_malware_df, swf_random_5k_df], ignore_index=True)
swf_bigger_df.fillna(0, inplace=True)

clf_5k = sklearn.ensemble.RandomForestClassifier(n_estimators=50)
X = swf_bigger_df[abc_features].values
y = np.array(swf_bigger_df['label'].tolist())
scores = cross_val_score(clf_5k, X, y, cv=10)
print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))

# 80/20 Split for predictive test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=my_tsize, random_state=my_seed)
clf_5k.fit(X_train, y_train)
# Evaluate the model that was just fitted (clf_5k), not the earlier clf_abc.
y_pred = clf_5k.predict(X_test)
labels = ['benign', 'malicious']
cm = confusion_matrix(y_test, y_pred, labels=labels)
plot_cm(cm, labels)

importances = sorted(zip(abc_features, clf_5k.feature_importances_),
                     key=lambda k: k[1], reverse=True)
total = 0
for idx, im in enumerate(importances[0:20]):
    total += round(im[1], 5)
    print('%s %s %s %s' % ((str(idx + 1) + ':').ljust(4), im[0].ljust(35),
                           round(im[1], 5), total))
#### Next we train over all the data again and test on the large corpus of files.
import sklearn.ensemble
from sklearn.metrics import confusion_matrix
try:
    from sklearn.model_selection import cross_val_score, train_test_split
except ImportError:
    # Older scikit-learn releases only ship the cross_validation module.
    from sklearn.cross_validation import cross_val_score, train_test_split


def _count_score_buckets(clf, rows):
    """Score each feature row with clf and return (clean, gray, bad) counts.

    The score is column 1 of clf.predict_proba (the second entry of
    clf.classes_): below 0.5 counts as clean, below 0.8 as gray, anything
    else as bad. Scoring stops at the first row the model cannot handle; the
    error and the row are printed and the partial counts are returned.
    """
    clean = gray = bad = 0
    for x in rows:
        try:
            # predict_proba expects a 2-D array: one row, all features.
            score = clf.predict_proba(x.reshape(1, -1))[0, 1]
        except Exception as e:
            print("Sad")
            print(e)
            print(x)
            break
        if score < 0.5:
            clean += 1
        elif score < 0.8:
            gray += 1
        else:
            bad += 1
    return clean, gray, bad


# Train on the malware plus the 5k benign sample, then score the rest of the
# pile. abc_features is reused exactly as defined for clf_abc earlier in this
# file.
clf_everything_2 = sklearn.ensemble.RandomForestClassifier(n_estimators=50)
X_all_2 = swf_bigger_df[abc_features].values
y_all_2 = np.array(swf_bigger_df['label'].tolist())
clf_everything_2.fit(X_all_2, y_all_2)

clean, gray, bad = _count_score_buckets(
    clf_everything_2, swf_random_the_rest_df[abc_features].values)
# Report the size of the set that was actually scored.
print(swf_random_the_rest_df.shape)
print(clean)
print(gray)
print(bad)

# Restrict the training data to files that contain ActionScript bytecode.
df_abc_only = swf_bigger_df[(swf_bigger_df['DoABC'] == 1) | (swf_bigger_df['DoABCDefine'] == 1)]
df_abc_only.shape

clf_abc_only = sklearn.ensemble.RandomForestClassifier(n_estimators=50)
X = df_abc_only[abc_features].values
y = np.array(df_abc_only['label'].tolist())
scores = cross_val_score(clf_abc_only, X, y, cv=10)
print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))

# 80/20 Split for predictive test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=my_tsize, random_state=my_seed)
clf_abc_only.fit(X_train, y_train)
# Evaluate the model that was just fitted (clf_abc_only), not clf_abc.
y_pred = clf_abc_only.predict(X_test)
labels = ['benign', 'malicious']
cm = confusion_matrix(y_test, y_pred, labels=labels)
plot_cm(cm, labels)

importances = sorted(zip(abc_features, clf_abc_only.feature_importances_),
                     key=lambda k: k[1], reverse=True)
total = 0
for idx, im in enumerate(importances[0:20]):
    total += round(im[1], 5)
    print('%s %s %s %s' % ((str(idx + 1) + ':').ljust(4), im[0].ljust(35),
                           round(im[1], 5), total))

# Same ABC-only restriction for the held-back part of the pile.
swf_abc_only_the_rest_df = swf_random_the_rest_df[
    (swf_random_the_rest_df['DoABC'] == 1) | (swf_random_the_rest_df['DoABCDefine'] == 1)]

clf_everything_abc = sklearn.ensemble.RandomForestClassifier(n_estimators=50)
X_all_3 = df_abc_only[abc_features].values
y_all_3 = np.array(df_abc_only['label'].tolist())
clf_everything_abc.fit(X_all_3, y_all_3)

clean, gray, bad = _count_score_buckets(
    clf_everything_abc, swf_abc_only_the_rest_df[abc_features].values)
print(swf_abc_only_the_rest_df.shape)
print(clean)
print(gray)
print(bad)