import pandas as pd
print 'pandas version is', pd.__version__
import numpy as np
print 'numpy version is', np.__version__
import sklearn
print 'scikit-learn version is', sklearn.__version__
import matplotlib
print 'matplotlib version is', matplotlib.__version__
import matplotlib.pyplot as plt
pandas version is 0.13.0 numpy version is 1.7.1 scikit-learn version is 0.14.1 matplotlib version is 1.4.1
%matplotlib inline
plt.rcParams['font.size'] = 18.0
plt.rcParams['figure.figsize'] = 16.0, 5.0
def plot_cm(cm, labels):
# Compute percentanges
percent = (cm*100.0)/np.array(np.matrix(cm.sum(axis=1)).T)
print 'Confusion Matrix Stats'
for i, label_i in enumerate(labels):
for j, label_j in enumerate(labels):
print "%s/%s: %.2f%% (%d/%d)" % (label_i, label_j, (percent[i][j]), cm[i][j], cm[i].sum())
# Show confusion matrix
# Thanks to kermit666 from stackoverflow
fig = plt.figure()
ax = fig.add_subplot(111)
ax.grid(b=False)
cax = ax.matshow(percent, cmap='coolwarm',vmin=0,vmax=100)
plt.title('')
fig.colorbar(cax)
ax.set_xticklabels([''] + labels)
ax.set_yticklabels([''] + labels)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()
def extract_features(data):
features = {}
try:
features['sha256'] = data['metadata']['sha256']
features['size'] = data['metadata']['file_size']
features['entropy'] = data['metadata']['entropy']
features['version'] = data['characteristics']['swf']['swf metadata']['version']
features['frame count'] = data['characteristics']['swf']['swf metadata']['framecount']
features['frame rate'] = data['characteristics']['swf']['swf metadata']['framerate']
x_min = data['characteristics']['swf']['swf metadata']['xmin']
x_max = data['characteristics']['swf']['swf metadata']['xmax']
y_min = data['characteristics']['swf']['swf metadata']['ymin']
y_max = data['characteristics']['swf']['swf metadata']['ymax']
x_length = x_max - x_min
y_length = y_max - y_min
features['swf area'] = x_length * y_length
features['swf perimeter'] = 2*(x_length+y_length)
features['tag count'] = 0
for tag_info in data['characteristics']['swf']['tag types']:
features[tag_info['tag name']] = 1
features['tag count'] += tag_info['count']
abc_info = {}
for tag_info in data['verbose']['swf']['tags']:
if 'DoABC' in tag_info or 'DoABCDefine' in tag_info:
key = 'DoABC'
if 'DoABCDefine' in tag_info:
key = 'DoABCDefine'
if 'abc bytecodename' not in features:
abc_info['abc bytecodename'] = []
try:
abc_info['abc bytecodename'].append(tag_info[key]['bytecodename'])
except KeyError:
abc_info['abc bytecodename'].append('DoABCDefine')
try:
abc_info['abc flag'] = tag_info[key]['flag']
except KeyError:
abc_info['abc flag'] = 0
if 'abc strings' not in features:
abc_info['abc strings'] = []
if 'abc string count' not in features:
abc_info['abc string count'] = 0
abc_info['abc strings'].extend(tag_info[key]['abc']['strings'])
if abc_info:
if abc_info['abc bytecodename'][0] == '':
features['first abc bytecode name'] = 1
elif abc_info['abc bytecodename'][0] == 'DoABCDefine':
features['first abc bytecode name'] = 2
elif abc_info['abc bytecodename'][0] == 'frame1':
features['first abc bytecode name'] = 3
else:
features['first abc bytecode name'] = 4
features['abc bytecode name'] = abc_info['abc bytecodename']
features['bytecode name count'] = len(abc_info['abc bytecodename'])
features['unique bytecode name count'] = len(set(abc_info['abc bytecodename']))
features['abc strings'] = abc_info['abc strings']
features['abc string count'] = len(features['abc strings'])
features['long hex string'] = 0
for s in features['abc strings']:
if len(s) > 100:
try:
s.decode('hex')
features['long hex string'] = 1
break
except:
pass
try:
features['abc string m/m ratio'] = float(data['verbose']['swf']['SWF String Statistical Analysis']['ActionScript String Length Mean to Median Ratio'])
except KeyError as k:
features['abc string m/m ratio'] = 0.0
except KeyError as ke:
print 'ERROR:', ke, data['metadata']['sha256']
return features
def load_files(file_list):
import json
features_list = []
for filename in file_list:
with open(filename,'rb') as f:
features = extract_features(json.loads(f.read()))
features_list.append(features)
return features_list
# Good files
import glob
good_list = glob.glob('data/clean/*.results')
good_features = load_files(good_list)
print "Files:", len(good_list)
Files: 500
# Bad files
bad_list = glob.glob('data/malicious/*.results')
bad_features = load_files(bad_list)
print "Files:", len(bad_list)
Files: 620
df_good = pd.DataFrame.from_records(good_features)
df_good.fillna(0, inplace=True)
df_good['label'] = 'benign'
df_good.head()
CSMTextSettings | DebugID | DefineBinaryData | DefineBits | DefineBitsJPEG2 | DefineBitsJPEG3 | DefineBitsLossless | DefineBitsLossless2 | DefineButton | DefineButton2 | DefineButtonSound | DefineEditText | DefineFont | DefineFont2 | DefineFont3 | DefineFont4 | DefineFontAlignZones | DefineFontInfo2 | DefineFontName | DefineMorphShape | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | ... |
1 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... |
2 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... |
3 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... |
4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... |
5 rows × 76 columns
df_bad = pd.DataFrame.from_records(bad_features)
df_bad.fillna(0, inplace=True)
df_bad['label'] = 'malicious'
df_bad.head()
CSMTextSettings | DebugID | DefineBinaryData | DefineBits | DefineBitsJPEG2 | DefineBitsJPEG3 | DefineBitsLossless | DefineBitsLossless2 | DefineButton2 | DefineButtonSound | DefineEditText | DefineFont | DefineFont2 | DefineFont3 | DefineFont4 | DefineFontAlignZones | DefineFontInfo | DefineFontName | DefineMorphShape | DefineScalingGrid | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... |
1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... |
2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... |
3 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... |
4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... |
5 rows × 73 columns
df = pd.concat([df_bad, df_good], ignore_index=True)
df.fillna(0, inplace=True)
df.groupby(['label', 'version'])['version'].count().unstack('label').fillna(0).plot(
colormap='GnBu', kind='bar', stacked=True, grid=False)
<matplotlib.axes._subplots.AxesSubplot at 0x112cf7190>
df.boxplot(column='size', by='label')
plt.ylabel('File Size')
plt.xlabel('')
plt.title('')
plt.suptitle('')
plt.ylim(0, 200000)
(0, 200000)
df.boxplot('entropy', 'label')
plt.ylabel('Entropy')
plt.xlabel('')
plt.title('')
plt.suptitle('')
<matplotlib.text.Text at 0x112e1d710>
df.boxplot(column='frame count', by='label')
plt.ylabel('Frame Count')
plt.xlabel('')
plt.title('')
plt.suptitle('')
plt.ylim(0, 5000)
(0, 5000)
df_good['frame count'].value_counts()[0:10]
1 390 2 16 3 3 361 3 25 3 320 2 200 2 95 2 300 2 217 2 dtype: int64
df.boxplot(column='frame rate', by='label')
plt.ylabel('Frame Rate')
plt.xlabel('')
plt.title('')
plt.suptitle('')
<matplotlib.text.Text at 0x112e60090>
df.boxplot('swf area', 'label')
plt.xlabel('')
plt.ylabel('Frame Area')
plt.title('')
plt.suptitle('')
plt.ylim(0, 750000)
(0, 750000)
df.boxplot('swf perimeter', 'label')
plt.xlabel('')
plt.ylabel('Frame Perimeter')
plt.title('')
plt.suptitle('')
<matplotlib.text.Text at 0x112e0c0d0>
my_seed = 1022
my_tsize = .2
import sklearn.ensemble
clf_simple = sklearn.ensemble.RandomForestClassifier(n_estimators=50)
simple_features = ['entropy', 'frame count', 'frame rate', 'size', 'swf area', 'swf perimeter', 'version']
X = df.as_matrix(simple_features)
y = np.array(df['label'].tolist())
scores = sklearn.cross_validation.cross_val_score(clf_simple, X, y, cv=10)
print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))
Accuracy: 0.963 (+/- 0.034)
import sklearn.ensemble
from sklearn.metrics import confusion_matrix
from sklearn.cross_validation import train_test_split
# 80/20 Split for predictive test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf_simple.fit(X_train, y_train)
y_pred = clf_simple.predict(X_test)
labels = ['benign', 'malicious']
cm = confusion_matrix(y_test, y_pred, labels)
plot_cm(cm, labels)
Confusion Matrix Stats benign/benign: 96.04% (97/101) benign/malicious: 3.96% (4/101) malicious/benign: 3.25% (4/123) malicious/malicious: 96.75% (119/123)
# Feature Selection
importances = zip(simple_features, clf_simple.feature_importances_)
importances.sort(key=lambda k:k[1], reverse=True)
for idx, im in enumerate(importances[0:10]):
print (str(idx+1) + ':').ljust(4), im[0].ljust(20), round(im[1], 5)
1: entropy 0.27378 2: swf perimeter 0.15704 3: size 0.14867 4: swf area 0.13079 5: version 0.12864 6: frame rate 0.11256 7: frame count 0.04852
df.boxplot('tag count', 'label')
plt.xlabel('')
plt.ylabel('Number of Tags')
plt.title('')
plt.suptitle('')
plt.ylim(0,400)
(0, 400)
p = df.groupby(['PlaceObject2','label'])['PlaceObject2'].count().unstack('PlaceObject2').fillna(0).plot(
kind='bar', stacked=False, grid=False)
p.set_xlabel('')
p.plot()
[]
p = df.groupby(['DoABC','label'])['DoABC'].count().unstack('DoABC').fillna(0).plot(
kind='bar', stacked=False, grid=False)
p.set_xlabel('')
p.plot()
[]
p = df.groupby(['DoABCDefine','label'])['DoABCDefine'].count().unstack('DoABCDefine').fillna(0).plot(
kind='bar', stacked=False, grid=False)
p.set_xlabel('')
p.plot()
[]
p = df.groupby(['DefineBitsJPEG2','label'])['DefineBitsJPEG2'].count().unstack('DefineBitsJPEG2').fillna(0).plot(
kind='bar', stacked=False, grid=False)
p.set_xlabel('')
p.plot()
[]
p = df.groupby(['End','label'])['End'].count().unstack('End').fillna(0).plot(
kind='bar', stacked=False, grid=False)
p.set_xlabel('')
p.plot()
[]
import sklearn.ensemble
clf_tags = sklearn.ensemble.RandomForestClassifier(n_estimators=50)
tag_features = ['entropy', 'frame count', 'frame rate', 'size', 'swf area', 'swf perimeter', 'version',
'CSMTextSettings', 'DebugID', 'DefineBinaryData', 'DefineBits',
'DefineBitsJPEG2', 'DefineBitsJPEG3', 'DefineBitsLossless',
'DefineBitsLossless2', 'DefineButton', 'DefineButton2',
'DefineButtonSound', 'DefineEditText', 'DefineFont', 'DefineFont2',
'DefineFont3', 'DefineFont4', 'DefineFontAlignZones', 'DefineFontInfo',
'DefineFontInfo2', 'DefineFontName', 'DefineMorphShape',
'DefineMorphShape2', 'DefineScalingGrid', 'DefineSceneAndFrameLabelData',
'DefineShape', 'DefineShape2', 'DefineShape3', 'DefineShape4',
'DefineSound', 'DefineSprite', 'DefineText', 'DefineText2',
'DefineVideoStream', 'DoABC', 'DoABCDefine', 'DoAction', 'DoInitAction',
'EnableDebugger2', 'End', 'ExportAssets', 'FileAttributes', 'FrameLabel',
'ImportAssets2', 'JPEGTables', 'Metadata', 'PlaceObject', 'PlaceObject2',
'PlaceObject3', 'ProductInfo', 'Protect', 'RemoveObject2', 'ScriptLimits',
'SetBackgroundColor', 'ShowFrame', 'SoundStreamBlock', 'SoundStreamHead',
'SoundStreamHead2', 'SymbolClass', 'Unknown', 'tag count']
X = df.as_matrix(tag_features)
y = np.array(df['label'].tolist())
scores = sklearn.cross_validation.cross_val_score(clf_tags, X, y, cv=10)
print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))
Accuracy: 0.971 (+/- 0.029)
import sklearn.ensemble
from sklearn.metrics import confusion_matrix
from sklearn.cross_validation import train_test_split
# 80/20 Split for predictive test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf_tags.fit(X_train, y_train)
y_pred = clf_tags.predict(X_test)
labels = ['benign', 'malicious']
cm = confusion_matrix(y_test, y_pred, labels)
plot_cm(cm, labels)
Confusion Matrix Stats benign/benign: 97.85% (91/93) benign/malicious: 2.15% (2/93) malicious/benign: 4.58% (6/131) malicious/malicious: 95.42% (125/131)
importances = zip(tag_features, clf_tags.feature_importances_)
importances.sort(key=lambda k:k[1], reverse=True)
for idx, im in enumerate(importances[0:25]):
print (str(idx+1) + ':').ljust(4), im[0].ljust(40), round(im[1], 5)
1: PlaceObject2 0.08836 2: swf perimeter 0.08419 3: entropy 0.08106 4: swf area 0.07323 5: tag count 0.06813 6: size 0.06797 7: frame rate 0.06583 8: version 0.05843 9: DefineShape 0.04059 10: DefineShape3 0.03661 11: ProductInfo 0.02833 12: DefineFontName 0.02659 13: Unknown 0.02473 14: DefineFontAlignZones 0.01981 15: frame count 0.01832 16: ScriptLimits 0.01822 17: DefineBitsJPEG2 0.0174 18: DefineFont3 0.01693 19: ExportAssets 0.01529 20: Metadata 0.01521 21: DefineBitsJPEG3 0.01431 22: DefineSprite 0.01369 23: DefineBinaryData 0.00869 24: DebugID 0.00738 25: DoABCDefine 0.00667
df.boxplot('abc string count', 'label')
plt.xlabel('')
plt.ylabel('Number of ActionScript Strings')
plt.title('')
plt.suptitle('')
plt.ylim(0, 1000)
(0, 1000)
df.boxplot('abc string m/m ratio', 'label')
plt.xlabel('')
plt.ylabel('ActionScript Mean/Median Ratio')
plt.title('')
plt.suptitle('')
plt.ylim(0, 15)
(0, 15)
import sklearn.ensemble
clf_abc = sklearn.ensemble.RandomForestClassifier(n_estimators=50)
abc_features = ['entropy', 'frame count', 'frame rate', 'size', 'swf area', 'swf perimeter', 'version',
'CSMTextSettings', 'DebugID', 'DefineBinaryData', 'DefineBits',
'DefineBitsJPEG2', 'DefineBitsJPEG3', 'DefineBitsLossless',
'DefineBitsLossless2', 'DefineButton', 'DefineButton2',
'DefineButtonSound', 'DefineEditText', 'DefineFont', 'DefineFont2',
'DefineFont3', 'DefineFont4', 'DefineFontAlignZones', 'DefineFontInfo',
'DefineFontInfo2', 'DefineFontName', 'DefineMorphShape',
'DefineMorphShape2', 'DefineScalingGrid', 'DefineSceneAndFrameLabelData',
'DefineShape', 'DefineShape2', 'DefineShape3', 'DefineShape4',
'DefineSound', 'DefineSprite', 'DefineText', 'DefineText2',
'DefineVideoStream', 'DoABC', 'DoABCDefine', 'DoAction', 'DoInitAction',
'EnableDebugger2', 'End', 'ExportAssets', 'FileAttributes', 'FrameLabel',
'ImportAssets2', 'JPEGTables', 'Metadata', 'PlaceObject', 'PlaceObject2',
'PlaceObject3', 'ProductInfo', 'Protect', 'RemoveObject2', 'ScriptLimits',
'SetBackgroundColor', 'ShowFrame', 'SoundStreamBlock', 'SoundStreamHead',
'SoundStreamHead2', 'SymbolClass', 'Unknown', 'tag count',
'abc string count', 'abc string m/m ratio',
'bytecode name count', 'first abc bytecode name', 'long hex string',
'unique bytecode name count']
X = df.as_matrix(abc_features)
y = np.array(df['label'].tolist())
scores = sklearn.cross_validation.cross_val_score(clf_abc, X, y, cv=10)
print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))
Accuracy: 0.970 (+/- 0.028)
#### Again, not a real improvement.
import sklearn.ensemble
from sklearn.metrics import confusion_matrix
from sklearn.cross_validation import train_test_split
# 80/20 Split for predictive test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf_abc.fit(X_train, y_train)
y_pred = clf_abc.predict(X_test)
labels = ['benign', 'malicious']
cm = confusion_matrix(y_test, y_pred, labels)
plot_cm(cm, labels)
Confusion Matrix Stats benign/benign: 97.22% (105/108) benign/malicious: 2.78% (3/108) malicious/benign: 1.72% (2/116) malicious/malicious: 98.28% (114/116)
importances = zip(abc_features, clf_abc.feature_importances_)
importances.sort(key=lambda k:k[1], reverse=True)
total = 0
for idx, im in enumerate(importances[0:20]):
total += round(im[1], 5)
print (str(idx+1) + ':').ljust(4), im[0].ljust(35), round(im[1], 5), total
1: abc string count 0.08828 0.08828 2: entropy 0.08444 0.17272 3: swf area 0.06474 0.23746 4: swf perimeter 0.05966 0.29712 5: tag count 0.05951 0.35663 6: PlaceObject2 0.05094 0.40757 7: DefineShape3 0.05022 0.45779 8: size 0.04628 0.50407 9: frame rate 0.04317 0.54724 10: version 0.04204 0.58928 11: abc string m/m ratio 0.03019 0.61947 12: DefineShape 0.02958 0.64905 13: long hex string 0.02897 0.67802 14: DefineSprite 0.02816 0.70618 15: DefineFont3 0.02448 0.73066 16: DefineBitsJPEG2 0.02426 0.75492 17: DefineFontName 0.01949 0.77441 18: ProductInfo 0.01839 0.7928 19: Unknown 0.01636 0.80916 20: first abc bytecode name 0.01585 0.82501
clf_everything = sklearn.ensemble.RandomForestClassifier(n_estimators=50)
abc_features = ['entropy', 'frame count', 'frame rate', 'size', 'swf area', 'swf perimeter', 'version',
'CSMTextSettings', 'DebugID', 'DefineBinaryData', 'DefineBits',
'DefineBitsJPEG2', 'DefineBitsJPEG3', 'DefineBitsLossless',
'DefineBitsLossless2', 'DefineButton', 'DefineButton2',
'DefineButtonSound', 'DefineEditText', 'DefineFont', 'DefineFont2',
'DefineFont3', 'DefineFont4', 'DefineFontAlignZones', 'DefineFontInfo',
'DefineFontInfo2', 'DefineFontName', 'DefineMorphShape',
'DefineMorphShape2', 'DefineScalingGrid', 'DefineSceneAndFrameLabelData',
'DefineShape', 'DefineShape2', 'DefineShape3', 'DefineShape4',
'DefineSound', 'DefineSprite', 'DefineText', 'DefineText2',
'DefineVideoStream', 'DoABC', 'DoABCDefine', 'DoAction', 'DoInitAction',
'EnableDebugger2', 'End', 'ExportAssets', 'FileAttributes', 'FrameLabel',
'ImportAssets2', 'JPEGTables', 'Metadata', 'PlaceObject', 'PlaceObject2',
'PlaceObject3', 'ProductInfo', 'Protect', 'RemoveObject2', 'ScriptLimits',
'SetBackgroundColor', 'ShowFrame', 'SoundStreamBlock', 'SoundStreamHead',
'SoundStreamHead2', 'SymbolClass', 'Unknown', 'tag count',
'abc string count', 'abc string m/m ratio',
'bytecode name count', 'first abc bytecode name', 'long hex string',
'unique bytecode name count']
X_all = df.as_matrix(abc_features)
y_all = np.array(df['label'].tolist())
clf_everything.fit(X_all, y_all)
RandomForestClassifier(bootstrap=True, compute_importances=None, criterion='gini', max_depth=None, max_features='auto', min_density=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50, n_jobs=1, oob_score=False, random_state=None, verbose=0)
swf_malware_df = pd.read_hdf('data/swf_malware_df.hd5', 'table')
swf_malware_df['label'] = 'malicious'
swf_malware_df.shape
/opt/visiblerisk/lib/python2.7/site-packages/pandas/io/pytables.py:520: DeprecationWarning: openFile() is pending deprecation, use open_file() instead. You may use the pt2to3 tool to update your source code. self._handle = tables.openFile(self._path, self._mode, **kwargs) /opt/visiblerisk/lib/python2.7/site-packages/pandas/io/pytables.py:1017: DeprecationWarning: getNode() is pending deprecation, use get_node() instead. You may use the pt2to3 tool to update your source code. return self._handle.getNode(self.root, key)
(621, 89)
swf_bigpile_df = pd.read_hdf('data/swf_bigpile_df.hd5', 'table')
swf_bigpile_df['label'] = 'benign'
swf_bigpile_df.shape
/opt/visiblerisk/lib/python2.7/site-packages/pandas/io/pytables.py:520: DeprecationWarning: openFile() is pending deprecation, use open_file() instead. You may use the pt2to3 tool to update your source code. self._handle = tables.openFile(self._path, self._mode, **kwargs) /opt/visiblerisk/lib/python2.7/site-packages/pandas/io/pytables.py:1017: DeprecationWarning: getNode() is pending deprecation, use get_node() instead. You may use the pt2to3 tool to update your source code. return self._handle.getNode(self.root, key)
(288846, 90)
clean = 0
gray = 0
bad = 0
for x in swf_bigpile_df.as_matrix(abc_features):
try:
score = clf_everything.predict_proba(x)[:,1][0]
if score < 0.5:
clean += 1
elif score < 0.8:
gray += 1
else:
bad += 1
except:
print "Sad"
print x
break
print swf_bigpile_df.shape
print clean
print gray
print bad
(288846, 90) 279673 7215 1958
swf_random_df = swf_bigpile_df.reindex(np.random.permutation(swf_bigpile_df.index))
swf_random_5k_df = swf_random_df[0:5000]
swf_random_the_rest_df = swf_random_df[5000:]
swf_bigger_df = pd.concat([swf_malware_df, swf_random_5k_df], ignore_index=True)
swf_bigger_df.fillna(0, inplace=True)
import sklearn.ensemble
from sklearn.metrics import confusion_matrix
from sklearn.cross_validation import train_test_split
clf_5k = sklearn.ensemble.RandomForestClassifier(n_estimators=50)
abc_features = ['entropy', 'frame count', 'frame rate', 'size', 'swf area', 'swf perimeter', 'version',
'CSMTextSettings', 'DebugID', 'DefineBinaryData', 'DefineBits',
'DefineBitsJPEG2', 'DefineBitsJPEG3', 'DefineBitsLossless',
'DefineBitsLossless2', 'DefineButton', 'DefineButton2',
'DefineButtonSound', 'DefineEditText', 'DefineFont', 'DefineFont2',
'DefineFont3', 'DefineFont4', 'DefineFontAlignZones', 'DefineFontInfo',
'DefineFontInfo2', 'DefineFontName', 'DefineMorphShape',
'DefineMorphShape2', 'DefineScalingGrid', 'DefineSceneAndFrameLabelData',
'DefineShape', 'DefineShape2', 'DefineShape3', 'DefineShape4',
'DefineSound', 'DefineSprite', 'DefineText', 'DefineText2',
'DefineVideoStream', 'DoABC', 'DoABCDefine', 'DoAction', 'DoInitAction',
'EnableDebugger2', 'End', 'ExportAssets', 'FileAttributes', 'FrameLabel',
'ImportAssets2', 'JPEGTables', 'Metadata', 'PlaceObject', 'PlaceObject2',
'PlaceObject3', 'ProductInfo', 'Protect', 'RemoveObject2', 'ScriptLimits',
'SetBackgroundColor', 'ShowFrame', 'SoundStreamBlock', 'SoundStreamHead',
'SoundStreamHead2', 'SymbolClass', 'Unknown', 'tag count',
'abc string count', 'abc string m/m ratio',
'bytecode name count', 'first abc bytecode name', 'long hex string',
'unique bytecode name count']
X = swf_bigger_df.as_matrix(abc_features)
y = np.array(swf_bigger_df['label'].tolist())
scores = sklearn.cross_validation.cross_val_score(clf_5k, X, y, cv=10)
print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))
Accuracy: 0.990 (+/- 0.006)
import sklearn.ensemble
from sklearn.metrics import confusion_matrix
from sklearn.cross_validation import train_test_split
# 80/20 Split for predictive test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf_5k.fit(X_train, y_train)
y_pred = clf_abc.predict(X_test)
labels = ['benign', 'malicious']
cm = confusion_matrix(y_test, y_pred, labels)
plot_cm(cm, labels)
Confusion Matrix Stats benign/benign: 95.01% (953/1003) benign/malicious: 4.99% (50/1003) malicious/benign: 0.00% (0/122) malicious/malicious: 100.00% (122/122)
importances = zip(abc_features, clf_5k.feature_importances_)
importances.sort(key=lambda k:k[1], reverse=True)
total = 0
for idx, im in enumerate(importances[0:20]):
total += round(im[1], 5)
print (str(idx+1) + ':').ljust(4), im[0].ljust(35), round(im[1], 5), total
1: swf perimeter 0.11688 0.11688 2: swf area 0.0828 0.19968 3: entropy 0.08055 0.28023 4: abc string m/m ratio 0.06875 0.34898 5: size 0.05404 0.40302 6: abc string count 0.05321 0.45623 7: frame rate 0.04839 0.50462 8: tag count 0.04693 0.55155 9: frame count 0.04146 0.59301 10: PlaceObject2 0.0363 0.62931 11: Unknown 0.03536 0.66467 12: End 0.03411 0.69878 13: first abc bytecode name 0.02746 0.72624 14: version 0.02738 0.75362 15: DoABCDefine 0.02428 0.7779 16: DefineShape3 0.01646 0.79436 17: bytecode name count 0.01576 0.81012 18: DefineShape 0.01375 0.82387 19: DefineSprite 0.01213 0.836 20: FrameLabel 0.01186 0.84786
#### Next we training over all the data again, and test on the large corpus of files.
clf_everything_2 = sklearn.ensemble.RandomForestClassifier(n_estimators=50)
abc_features = ['entropy', 'frame count', 'frame rate', 'size', 'swf area', 'swf perimeter', 'version',
'CSMTextSettings', 'DebugID', 'DefineBinaryData', 'DefineBits',
'DefineBitsJPEG2', 'DefineBitsJPEG3', 'DefineBitsLossless',
'DefineBitsLossless2', 'DefineButton', 'DefineButton2',
'DefineButtonSound', 'DefineEditText', 'DefineFont', 'DefineFont2',
'DefineFont3', 'DefineFont4', 'DefineFontAlignZones', 'DefineFontInfo',
'DefineFontInfo2', 'DefineFontName', 'DefineMorphShape',
'DefineMorphShape2', 'DefineScalingGrid', 'DefineSceneAndFrameLabelData',
'DefineShape', 'DefineShape2', 'DefineShape3', 'DefineShape4',
'DefineSound', 'DefineSprite', 'DefineText', 'DefineText2',
'DefineVideoStream', 'DoABC', 'DoABCDefine', 'DoAction', 'DoInitAction',
'EnableDebugger2', 'End', 'ExportAssets', 'FileAttributes', 'FrameLabel',
'ImportAssets2', 'JPEGTables', 'Metadata', 'PlaceObject', 'PlaceObject2',
'PlaceObject3', 'ProductInfo', 'Protect', 'RemoveObject2', 'ScriptLimits',
'SetBackgroundColor', 'ShowFrame', 'SoundStreamBlock', 'SoundStreamHead',
'SoundStreamHead2', 'SymbolClass', 'Unknown', 'tag count',
'abc string count', 'abc string m/m ratio',
'bytecode name count', 'first abc bytecode name', 'long hex string',
'unique bytecode name count']
X_all_2 = swf_bigger_df.as_matrix(abc_features)
y_all_2 = np.array(swf_bigger_df['label'].tolist())
clf_everything_2.fit(X_all_2, y_all_2)
RandomForestClassifier(bootstrap=True, compute_importances=None, criterion='gini', max_depth=None, max_features='auto', min_density=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50, n_jobs=1, oob_score=False, random_state=None, verbose=0)
clean = 0
gray = 0
bad = 0
for x in swf_random_the_rest_df.as_matrix(abc_features):
try:
score = clf_everything_2.predict_proba(x)[:,1][0]
if score < 0.5:
clean += 1
elif score < 0.8:
gray += 1
else:
bad += 1
except:
print "Sad"
print x
break
print swf_bigpile_df.shape
print clean
print gray
print bad
(288846, 90) 282662 800 384
df_abc_only = swf_bigger_df[(swf_bigger_df['DoABC'] == 1) | (swf_bigger_df['DoABCDefine'] == 1)]
df_abc_only.shape
(3246, 90)
import sklearn.ensemble
from sklearn.metrics import confusion_matrix
from sklearn.cross_validation import train_test_split
clf_abc_only = sklearn.ensemble.RandomForestClassifier(n_estimators=50)
abc_features = ['entropy', 'frame count', 'frame rate', 'size', 'swf area', 'swf perimeter', 'version',
'CSMTextSettings', 'DebugID', 'DefineBinaryData', 'DefineBits',
'DefineBitsJPEG2', 'DefineBitsJPEG3', 'DefineBitsLossless',
'DefineBitsLossless2', 'DefineButton', 'DefineButton2',
'DefineButtonSound', 'DefineEditText', 'DefineFont', 'DefineFont2',
'DefineFont3', 'DefineFont4', 'DefineFontAlignZones', 'DefineFontInfo',
'DefineFontInfo2', 'DefineFontName', 'DefineMorphShape',
'DefineMorphShape2', 'DefineScalingGrid', 'DefineSceneAndFrameLabelData',
'DefineShape', 'DefineShape2', 'DefineShape3', 'DefineShape4',
'DefineSound', 'DefineSprite', 'DefineText', 'DefineText2',
'DefineVideoStream', 'DoABC', 'DoABCDefine', 'DoAction', 'DoInitAction',
'EnableDebugger2', 'End', 'ExportAssets', 'FileAttributes', 'FrameLabel',
'ImportAssets2', 'JPEGTables', 'Metadata', 'PlaceObject', 'PlaceObject2',
'PlaceObject3', 'ProductInfo', 'Protect', 'RemoveObject2', 'ScriptLimits',
'SetBackgroundColor', 'ShowFrame', 'SoundStreamBlock', 'SoundStreamHead',
'SoundStreamHead2', 'SymbolClass', 'Unknown', 'tag count',
'abc string count', 'abc string m/m ratio',
'bytecode name count', 'first abc bytecode name', 'long hex string',
'unique bytecode name count']
X = df_abc_only.as_matrix(abc_features)
y = np.array(df_abc_only['label'].tolist())
scores = sklearn.cross_validation.cross_val_score(clf_abc_only, X, y, cv=10)
print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))
Accuracy: 0.992 (+/- 0.008)
import sklearn.ensemble
from sklearn.metrics import confusion_matrix
from sklearn.cross_validation import train_test_split
# 80/20 Split for predictive test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf_abc_only.fit(X_train, y_train)
y_pred = clf_abc.predict(X_test)
labels = ['benign', 'malicious']
cm = confusion_matrix(y_test, y_pred, labels)
plot_cm(cm, labels)
Confusion Matrix Stats benign/benign: 96.92% (535/552) benign/malicious: 3.08% (17/552) malicious/benign: 0.00% (0/98) malicious/malicious: 100.00% (98/98)
importances = zip(abc_features, clf_abc.feature_importances_)
importances.sort(key=lambda k:k[1], reverse=True)
total = 0
for idx, im in enumerate(importances[0:20]):
total += round(im[1], 5)
print (str(idx+1) + ':').ljust(4), im[0].ljust(35), round(im[1], 5), total
1: abc string count 0.08828 0.08828 2: entropy 0.08444 0.17272 3: swf area 0.06474 0.23746 4: swf perimeter 0.05966 0.29712 5: tag count 0.05951 0.35663 6: PlaceObject2 0.05094 0.40757 7: DefineShape3 0.05022 0.45779 8: size 0.04628 0.50407 9: frame rate 0.04317 0.54724 10: version 0.04204 0.58928 11: abc string m/m ratio 0.03019 0.61947 12: DefineShape 0.02958 0.64905 13: long hex string 0.02897 0.67802 14: DefineSprite 0.02816 0.70618 15: DefineFont3 0.02448 0.73066 16: DefineBitsJPEG2 0.02426 0.75492 17: DefineFontName 0.01949 0.77441 18: ProductInfo 0.01839 0.7928 19: Unknown 0.01636 0.80916 20: first abc bytecode name 0.01585 0.82501
swf_abc_only_the_rest_df = swf_random_the_rest_df[(swf_random_the_rest_df['DoABC'] == 1) | (swf_random_the_rest_df['DoABCDefine'] == 1)]
clf_everything_abc = sklearn.ensemble.RandomForestClassifier(n_estimators=50)
abc_features = ['entropy', 'frame count', 'frame rate', 'size', 'swf area', 'swf perimeter', 'version',
'CSMTextSettings', 'DebugID', 'DefineBinaryData', 'DefineBits',
'DefineBitsJPEG2', 'DefineBitsJPEG3', 'DefineBitsLossless',
'DefineBitsLossless2', 'DefineButton', 'DefineButton2',
'DefineButtonSound', 'DefineEditText', 'DefineFont', 'DefineFont2',
'DefineFont3', 'DefineFont4', 'DefineFontAlignZones', 'DefineFontInfo',
'DefineFontInfo2', 'DefineFontName', 'DefineMorphShape',
'DefineMorphShape2', 'DefineScalingGrid', 'DefineSceneAndFrameLabelData',
'DefineShape', 'DefineShape2', 'DefineShape3', 'DefineShape4',
'DefineSound', 'DefineSprite', 'DefineText', 'DefineText2',
'DefineVideoStream', 'DoABC', 'DoABCDefine', 'DoAction', 'DoInitAction',
'EnableDebugger2', 'End', 'ExportAssets', 'FileAttributes', 'FrameLabel',
'ImportAssets2', 'JPEGTables', 'Metadata', 'PlaceObject', 'PlaceObject2',
'PlaceObject3', 'ProductInfo', 'Protect', 'RemoveObject2', 'ScriptLimits',
'SetBackgroundColor', 'ShowFrame', 'SoundStreamBlock', 'SoundStreamHead',
'SoundStreamHead2', 'SymbolClass', 'Unknown', 'tag count',
'abc string count', 'abc string m/m ratio',
'bytecode name count', 'first abc bytecode name', 'long hex string',
'unique bytecode name count']
X_all_3 = df_abc_only.as_matrix(abc_features)
y_all_3 = np.array(df_abc_only['label'].tolist())
clf_everything_abc.fit(X_all_3, y_all_3)
RandomForestClassifier(bootstrap=True, compute_importances=None, criterion='gini', max_depth=None, max_features='auto', min_density=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50, n_jobs=1, oob_score=False, random_state=None, verbose=0)
clean = 0
gray = 0
bad = 0
for x in swf_abc_only_the_rest_df.as_matrix(abc_features):
try:
score = clf_everything_abc.predict_proba(x)[:,1][0]
if score < 0.5:
clean += 1
elif score < 0.8:
gray += 1
else:
bad += 1
except Exception as e:
print "Sad"
print e
print x
break
print swf_abc_only_the_rest_df.shape
print clean
print gray
print bad
(152768, 90) 151967 474 327