# SWF Data Analysis In this notebook we're going to explore, understand, and classify shockwave flash files as being 'benign' or 'malicious'. We will explore the data, apply machine learning algorithms to the data, add new features, do more machine learning. Then we will test our classifier on a large amount of files to measure it's effectiveness.

** DISCLAIMER:** This exercise is for illustrative purposes and only uses about 500 samples which is too small for a generalizable model. ### References

http://wwwimages.adobe.com/www.adobe.com/content/dam/Adobe/en/devnet/swf/pdf/swf-file-format-spec.pdf
http://www.adobe.com/content/dam/Adobe/en/devnet/actionscript/articles/avm2overview.pdf

### Python Modules Used:

Pandas: Python Data Analysis Library (http://pandas.pydata.org)
Scikit Learn (http://scikit-learn.org) Scikit-learn: Machine Learning in Python, Pedregosa et al., JMLR 12, pp. 2825-2830, 2011.
Matplotlib: Python 2D plotting library (http://matplotlib.org)

### IPython Notebook for this talk: http://clicksecurity.github.io/data_hacking

Imports and plot defaults¶

In [1]:

import pandas as pd
print 'pandas version is', pd.__version__
import numpy as np
print 'numpy version is', np.__version__
import sklearn
print 'scikit-learn version is', sklearn.__version__
import matplotlib
print 'matplotlib version is', matplotlib.__version__
import matplotlib.pyplot as plt

pandas version is 0.13.0
numpy version is 1.7.1
scikit-learn version is 0.14.1
matplotlib version is 1.4.1

Plotting defaults and helper functions¶

In [2]:

%matplotlib inline
plt.rcParams['font.size'] = 18.0
plt.rcParams['figure.figsize'] = 16.0, 5.0

In [3]:

def plot_cm(cm, labels):
    # Compute percentanges
    percent = (cm*100.0)/np.array(np.matrix(cm.sum(axis=1)).T)
    print 'Confusion Matrix Stats'
    for i, label_i in enumerate(labels):
        for j, label_j in enumerate(labels):
            print "%s/%s: %.2f%% (%d/%d)" % (label_i, label_j, (percent[i][j]), cm[i][j], cm[i].sum())

    # Show confusion matrix
    # Thanks to kermit666 from stackoverflow
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.grid(b=False)
    cax = ax.matshow(percent, cmap='coolwarm',vmin=0,vmax=100)
    plt.title('')
    fig.colorbar(cax)
    ax.set_xticklabels([''] + labels)
    ax.set_yticklabels([''] + labels)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.show()

Function to extract out the features we are interested with from the json blob¶

In [4]:

def extract_features(data):
    features = {}
    try:
        features['sha256'] = data['metadata']['sha256']
        features['size'] = data['metadata']['file_size']
        features['entropy'] = data['metadata']['entropy']
        features['version'] = data['characteristics']['swf']['swf metadata']['version']
        features['frame count'] = data['characteristics']['swf']['swf metadata']['framecount']
        features['frame rate'] = data['characteristics']['swf']['swf metadata']['framerate']
        
        x_min = data['characteristics']['swf']['swf metadata']['xmin']
        x_max = data['characteristics']['swf']['swf metadata']['xmax']
        y_min = data['characteristics']['swf']['swf metadata']['ymin']
        y_max = data['characteristics']['swf']['swf metadata']['ymax']
        x_length = x_max - x_min
        y_length = y_max - y_min
        features['swf area'] = x_length * y_length
        features['swf perimeter'] = 2*(x_length+y_length)
        
        features['tag count'] = 0
        for tag_info in data['characteristics']['swf']['tag types']:
            features[tag_info['tag name']] = 1
            features['tag count'] += tag_info['count']

        abc_info = {}
        for tag_info in data['verbose']['swf']['tags']:
            if 'DoABC' in tag_info or 'DoABCDefine' in tag_info:
                key = 'DoABC'
                if 'DoABCDefine' in tag_info:
                    key = 'DoABCDefine'
                
                if 'abc bytecodename' not in features:
                    abc_info['abc bytecodename'] = []
                try:
                    abc_info['abc bytecodename'].append(tag_info[key]['bytecodename'])
                except KeyError:
                    abc_info['abc bytecodename'].append('DoABCDefine')
                                                        
                try:
                    abc_info['abc flag'] = tag_info[key]['flag']
                except KeyError:
                    abc_info['abc flag'] = 0
                
                if 'abc strings' not in features:
                    abc_info['abc strings'] = []
                if 'abc string count' not in features:
                    abc_info['abc string count'] = 0
                    
                abc_info['abc strings'].extend(tag_info[key]['abc']['strings'])

        if abc_info:
            if abc_info['abc bytecodename'][0] == '':
                features['first abc bytecode name'] = 1
            elif abc_info['abc bytecodename'][0] == 'DoABCDefine':
                features['first abc bytecode name'] = 2
            elif abc_info['abc bytecodename'][0] == 'frame1':
                features['first abc bytecode name'] = 3
            else:
                features['first abc bytecode name'] = 4
                
            features['abc bytecode name'] = abc_info['abc bytecodename']
            features['bytecode name count'] = len(abc_info['abc bytecodename'])
            features['unique bytecode name count'] = len(set(abc_info['abc bytecodename']))
            features['abc strings'] = abc_info['abc strings']
            features['abc string count'] = len(features['abc strings'])
            
            features['long hex string'] = 0
            for s in features['abc strings']:
                if len(s) > 100:
                    try:
                        s.decode('hex')
                        features['long hex string'] = 1
                        break
                    except:
                        pass
            try:
                features['abc string m/m ratio'] = float(data['verbose']['swf']['SWF String Statistical Analysis']['ActionScript String Length Mean to Median Ratio'])
            except KeyError as k:
                features['abc string m/m ratio'] = 0.0         

    except KeyError as ke:
        print 'ERROR:', ke, data['metadata']['sha256']
    return features

In [5]:

def load_files(file_list):
    import json
    features_list = []
    for filename in file_list:
        with open(filename,'rb') as f:
            features = extract_features(json.loads(f.read()))
            features_list.append(features)
    return features_list

In [6]:

# Good files
import glob
good_list = glob.glob('data/clean/*.results')
good_features = load_files(good_list)
print "Files:", len(good_list)

Files: 500

The set of bad files came from Contagio and VirusTotal¶

In [7]:

# Bad files
bad_list = glob.glob('data/malicious/*.results')
bad_features = load_files(bad_list)
print "Files:", len(bad_list)

Files: 620

In [8]:

df_good = pd.DataFrame.from_records(good_features)
df_good.fillna(0, inplace=True)
df_good['label'] = 'benign'
df_good.head()

Out[8]:

	DebugID	DefineBinaryData	DefineBits	DefineBitsJPEG2	DefineBitsJPEG3	DefineButton2	DefineFont2	DefineMorphShape
0	0	0	0	0	0	1	1	1	...
1	0	0	1	1	1	1	0	0	...
2	0	0	0	1	0	0	1	0	...
3	1	1	0	0	0	0	0	0	...
4	0	0	0	0	0	0	0	0	...

5 rows × 76 columns

In [9]:

df_bad = pd.DataFrame.from_records(bad_features)
df_bad.fillna(0, inplace=True)
df_bad['label'] = 'malicious'
df_bad.head()

Out[9]:

	DefineBits
0	0	...
1	0	...
2	0	...
3	1	...
4	0	...

5 rows × 73 columns

Let's explore the data.¶

In [10]:

df = pd.concat([df_bad, df_good], ignore_index=True)
df.fillna(0, inplace=True)

Let's start to explore the data, first with just the version numbers.¶

In [11]:

df.groupby(['label', 'version'])['version'].count().unstack('label').fillna(0).plot(
    colormap='GnBu', kind='bar', stacked=True, grid=False)

Out[11]:

<matplotlib.axes._subplots.AxesSubplot at 0x112cf7190>

Next we see if there are any tendencies from the size of the file.¶

In [12]:

df.boxplot(column='size', by='label')
plt.ylabel('File Size')
plt.xlabel('')
plt.title('')
plt.suptitle('')
plt.ylim(0, 200000)

Out[12]:

(0, 200000)

Next we compare the entropy¶

In [13]:

df.boxplot('entropy', 'label')
plt.ylabel('Entropy')
plt.xlabel('')
plt.title('')
plt.suptitle('')

Out[13]:

<matplotlib.text.Text at 0x112e1d710>

Compare the frame count¶

In [14]:

df.boxplot(column='frame count', by='label')
plt.ylabel('Frame Count')
plt.xlabel('')
plt.title('')
plt.suptitle('')
plt.ylim(0, 5000)

Out[14]:

(0, 5000)

The value for the benign files seems really low, let's dig a little deeper.¶

In [15]:

df_good['frame count'].value_counts()[0:10]

Out[15]:

1      390
2       16
3        3
361      3
25       3
320      2
200      2
95       2
300      2
217      2
dtype: int64

That's kind of interesting, most of the values are 1.¶

Next we compare the frame rate.¶

In [16]:

df.boxplot(column='frame rate', by='label')
plt.ylabel('Frame Rate')
plt.xlabel('')
plt.title('')
plt.suptitle('')

Out[16]:

<matplotlib.text.Text at 0x112e60090>

Compare the area of the frame¶

In [17]:

df.boxplot('swf area', 'label')
plt.xlabel('')
plt.ylabel('Frame Area')
plt.title('')
plt.suptitle('')
plt.ylim(0, 750000)

Out[17]:

(0, 750000)

Next compare the perimeter of the frame.¶

In [18]:

df.boxplot('swf perimeter', 'label')
plt.xlabel('')
plt.ylabel('Frame Perimeter')
plt.title('')
plt.suptitle('')

Out[18]:

<matplotlib.text.Text at 0x112e0c0d0>

Classification¶

First we try classifying binaries based on some of features from the SWF header. The values we explored above. There didn't seem to be a lot of separation, so I'm not expecting great results. Yet.¶

In [19]:

my_seed = 1022
my_tsize = .2

In [20]:

import sklearn.ensemble
clf_simple = sklearn.ensemble.RandomForestClassifier(n_estimators=50)
simple_features = ['entropy', 'frame count', 'frame rate', 'size', 'swf area', 'swf perimeter', 'version']

X = df.as_matrix(simple_features)
y = np.array(df['label'].tolist())

scores = sklearn.cross_validation.cross_val_score(clf_simple, X, y, cv=10)
print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.963 (+/- 0.034)

96.1%? That was better than I thought it would be. Let's break the numbers down a little more.¶

In [21]:

import sklearn.ensemble
from sklearn.metrics import confusion_matrix
from sklearn.cross_validation import train_test_split

# 80/20 Split for predictive test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf_simple.fit(X_train, y_train)
y_pred = clf_simple.predict(X_test)
labels = ['benign', 'malicious']
cm = confusion_matrix(y_test, y_pred, labels)
plot_cm(cm, labels)

Confusion Matrix Stats
benign/benign: 96.04% (97/101)
benign/malicious: 3.96% (4/101)
malicious/benign: 3.25% (4/123)
malicious/malicious: 96.75% (119/123)

Fairly equal on IDing benign and malicious files. Next we will have the classifier tell use which features were most important and how important they were.¶

In [22]:

# Feature Selection
importances = zip(simple_features, clf_simple.feature_importances_)
importances.sort(key=lambda k:k[1], reverse=True)
for idx, im in enumerate(importances[0:10]):
    print (str(idx+1) + ':').ljust(4), im[0].ljust(20), round(im[1], 5)

1:   entropy              0.27378
2:   swf perimeter        0.15704
3:   size                 0.14867
4:   swf area             0.13079
5:   version              0.12864
6:   frame rate           0.11256
7:   frame count          0.04852

Tag Information¶

Now we will add information about the tags present in the SWF file. We do not take into condsideration the number of each tags, just that they exist in the file. We also will take into consideration the number of tags. Again, we start by exploring some of these values.¶

In [23]:

df.boxplot('tag count', 'label')
plt.xlabel('')
plt.ylabel('Number of Tags')
plt.title('')
plt.suptitle('')
plt.ylim(0,400)

Out[23]:

(0, 400)

Next we switch it up to see how many files contain several tags from each file classification.¶

In [24]:

p = df.groupby(['PlaceObject2','label'])['PlaceObject2'].count().unstack('PlaceObject2').fillna(0).plot(
    kind='bar', stacked=False, grid=False)
p.set_xlabel('')
p.plot()

Out[24]:

[]

DoABC tags mean that the file contains ActionScript3¶

In [25]:

p = df.groupby(['DoABC','label'])['DoABC'].count().unstack('DoABC').fillna(0).plot(
    kind='bar', stacked=False, grid=False)
p.set_xlabel('')
p.plot()

Out[25]:

[]

DoABCDefine is a deprecated way to do the same things.¶

In [26]:

p = df.groupby(['DoABCDefine','label'])['DoABCDefine'].count().unstack('DoABCDefine').fillna(0).plot(
    kind='bar', stacked=False, grid=False)
p.set_xlabel('')
p.plot()

Out[26]:

[]

In [27]:

p = df.groupby(['DefineBitsJPEG2','label'])['DefineBitsJPEG2'].count().unstack('DefineBitsJPEG2').fillna(0).plot(
    kind='bar', stacked=False, grid=False)
p.set_xlabel('')
p.plot()

Out[27]:

[]

In [28]:

p = df.groupby(['End','label'])['End'].count().unstack('End').fillna(0).plot(
    kind='bar', stacked=False, grid=False)
p.set_xlabel('')
p.plot()

Out[28]:

[]

Now we add that information to the classifier and check the results again.¶

In [29]:

import sklearn.ensemble
clf_tags = sklearn.ensemble.RandomForestClassifier(n_estimators=50)
tag_features = ['entropy', 'frame count', 'frame rate', 'size', 'swf area', 'swf perimeter', 'version',
                    'CSMTextSettings', 'DebugID', 'DefineBinaryData', 'DefineBits',
                    'DefineBitsJPEG2', 'DefineBitsJPEG3', 'DefineBitsLossless',
                    'DefineBitsLossless2', 'DefineButton', 'DefineButton2',
                    'DefineButtonSound', 'DefineEditText', 'DefineFont', 'DefineFont2',
                    'DefineFont3', 'DefineFont4', 'DefineFontAlignZones', 'DefineFontInfo',
                    'DefineFontInfo2', 'DefineFontName', 'DefineMorphShape',
                    'DefineMorphShape2', 'DefineScalingGrid', 'DefineSceneAndFrameLabelData',
                    'DefineShape', 'DefineShape2', 'DefineShape3', 'DefineShape4',
                    'DefineSound', 'DefineSprite', 'DefineText', 'DefineText2',
                    'DefineVideoStream', 'DoABC', 'DoABCDefine', 'DoAction', 'DoInitAction',
                    'EnableDebugger2', 'End', 'ExportAssets', 'FileAttributes', 'FrameLabel',
                    'ImportAssets2', 'JPEGTables', 'Metadata', 'PlaceObject', 'PlaceObject2',
                    'PlaceObject3', 'ProductInfo', 'Protect', 'RemoveObject2', 'ScriptLimits',
                    'SetBackgroundColor', 'ShowFrame', 'SoundStreamBlock', 'SoundStreamHead',
                    'SoundStreamHead2', 'SymbolClass', 'Unknown', 'tag count']

X = df.as_matrix(tag_features)
y = np.array(df['label'].tolist())

scores = sklearn.cross_validation.cross_val_score(clf_tags, X, y, cv=10)
print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.971 (+/- 0.029)

They didn't really seem to help that much unfortunately. Below, we check out the confusion matrix, and see similar results as before.¶

In [30]:

import sklearn.ensemble
from sklearn.metrics import confusion_matrix
from sklearn.cross_validation import train_test_split

# 80/20 Split for predictive test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf_tags.fit(X_train, y_train)
y_pred = clf_tags.predict(X_test)
labels = ['benign', 'malicious']
cm = confusion_matrix(y_test, y_pred, labels)
plot_cm(cm, labels)

Confusion Matrix Stats
benign/benign: 97.85% (91/93)
benign/malicious: 2.15% (2/93)
malicious/benign: 4.58% (6/131)
malicious/malicious: 95.42% (125/131)

When checking the feature importance, we see that the new features were not really that important.¶

In [31]:

importances = zip(tag_features, clf_tags.feature_importances_)
importances.sort(key=lambda k:k[1], reverse=True)
for idx, im in enumerate(importances[0:25]):
    print (str(idx+1) + ':').ljust(4), im[0].ljust(40), round(im[1], 5)

1:   PlaceObject2                             0.08836
2:   swf perimeter                            0.08419
3:   entropy                                  0.08106
4:   swf area                                 0.07323
5:   tag count                                0.06813
6:   size                                     0.06797
7:   frame rate                               0.06583
8:   version                                  0.05843
9:   DefineShape                              0.04059
10:  DefineShape3                             0.03661
11:  ProductInfo                              0.02833
12:  DefineFontName                           0.02659
13:  Unknown                                  0.02473
14:  DefineFontAlignZones                     0.01981
15:  frame count                              0.01832
16:  ScriptLimits                             0.01822
17:  DefineBitsJPEG2                          0.0174
18:  DefineFont3                              0.01693
19:  ExportAssets                             0.01529
20:  Metadata                                 0.01521
21:  DefineBitsJPEG3                          0.01431
22:  DefineSprite                             0.01369
23:  DefineBinaryData                         0.00869
24:  DebugID                                  0.00738
25:  DoABCDefine                              0.00667

ActionScript¶

Next we will add in some features about ActionScript. First up is the number of strings.¶

In [32]:

df.boxplot('abc string count', 'label')
plt.xlabel('')
plt.ylabel('Number of ActionScript Strings')
plt.title('')
plt.suptitle('')
plt.ylim(0, 1000)

Out[32]:

(0, 1000)

This next feature is taking the length of every string and calculating the mean length and the median length of the string.¶

In [33]:

df.boxplot('abc string m/m ratio', 'label')
plt.xlabel('')
plt.ylabel('ActionScript Mean/Median Ratio')
plt.title('')
plt.suptitle('')
plt.ylim(0, 15)

Out[33]:

(0, 15)

Now let's check out the classifier with these new features.¶

In [34]:

import sklearn.ensemble
clf_abc = sklearn.ensemble.RandomForestClassifier(n_estimators=50)
abc_features = ['entropy', 'frame count', 'frame rate', 'size', 'swf area', 'swf perimeter', 'version',
                    'CSMTextSettings', 'DebugID', 'DefineBinaryData', 'DefineBits',
                    'DefineBitsJPEG2', 'DefineBitsJPEG3', 'DefineBitsLossless',
                    'DefineBitsLossless2', 'DefineButton', 'DefineButton2',
                    'DefineButtonSound', 'DefineEditText', 'DefineFont', 'DefineFont2',
                    'DefineFont3', 'DefineFont4', 'DefineFontAlignZones', 'DefineFontInfo',
                    'DefineFontInfo2', 'DefineFontName', 'DefineMorphShape',
                    'DefineMorphShape2', 'DefineScalingGrid', 'DefineSceneAndFrameLabelData',
                    'DefineShape', 'DefineShape2', 'DefineShape3', 'DefineShape4',
                    'DefineSound', 'DefineSprite', 'DefineText', 'DefineText2',
                    'DefineVideoStream', 'DoABC', 'DoABCDefine', 'DoAction', 'DoInitAction',
                    'EnableDebugger2', 'End', 'ExportAssets', 'FileAttributes', 'FrameLabel',
                    'ImportAssets2', 'JPEGTables', 'Metadata', 'PlaceObject', 'PlaceObject2',
                    'PlaceObject3', 'ProductInfo', 'Protect', 'RemoveObject2', 'ScriptLimits',
                    'SetBackgroundColor', 'ShowFrame', 'SoundStreamBlock', 'SoundStreamHead',
                    'SoundStreamHead2', 'SymbolClass', 'Unknown', 'tag count',                    
                    'abc string count', 'abc string m/m ratio',
                    'bytecode name count', 'first abc bytecode name', 'long hex string',
                    'unique bytecode name count']

X = df.as_matrix(abc_features)
y = np.array(df['label'].tolist())

scores = sklearn.cross_validation.cross_val_score(clf_abc, X, y, cv=10)
print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.970 (+/- 0.028)

In [35]:

#### Again, not a real improvement.

In [36]:

import sklearn.ensemble
from sklearn.metrics import confusion_matrix
from sklearn.cross_validation import train_test_split

# 80/20 Split for predictive test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf_abc.fit(X_train, y_train)
y_pred = clf_abc.predict(X_test)
labels = ['benign', 'malicious']
cm = confusion_matrix(y_test, y_pred, labels)
plot_cm(cm, labels)

Confusion Matrix Stats
benign/benign: 97.22% (105/108)
benign/malicious: 2.78% (3/108)
malicious/benign: 1.72% (2/116)
malicious/malicious: 98.28% (114/116)

We see the number of strings in actionscript is a relatively important feature, but not enough to make a significant improvement in the classifier.¶

In [37]:

importances = zip(abc_features, clf_abc.feature_importances_)
importances.sort(key=lambda k:k[1], reverse=True)
total = 0
for idx, im in enumerate(importances[0:20]):
    total += round(im[1], 5)
    print (str(idx+1) + ':').ljust(4), im[0].ljust(35), round(im[1], 5), total

1:   abc string count                    0.08828 0.08828
2:   entropy                             0.08444 0.17272
3:   swf area                            0.06474 0.23746
4:   swf perimeter                       0.05966 0.29712
5:   tag count                           0.05951 0.35663
6:   PlaceObject2                        0.05094 0.40757
7:   DefineShape3                        0.05022 0.45779
8:   size                                0.04628 0.50407
9:   frame rate                          0.04317 0.54724
10:  version                             0.04204 0.58928
11:  abc string m/m ratio                0.03019 0.61947
12:  DefineShape                         0.02958 0.64905
13:  long hex string                     0.02897 0.67802
14:  DefineSprite                        0.02816 0.70618
15:  DefineFont3                         0.02448 0.73066
16:  DefineBitsJPEG2                     0.02426 0.75492
17:  DefineFontName                      0.01949 0.77441
18:  ProductInfo                         0.01839 0.7928
19:  Unknown                             0.01636 0.80916
20:  first abc bytecode name             0.01585 0.82501

Testing on a large corpus of files¶

In the next couple steps, we train a classifier on all the data, and then test it on approximately 289K files. This data is not labeled, but I would expect them all to be benign. Maybe a few malicious ones, but a small number.¶

In [38]:

clf_everything = sklearn.ensemble.RandomForestClassifier(n_estimators=50)
abc_features = ['entropy', 'frame count', 'frame rate', 'size', 'swf area', 'swf perimeter', 'version',
                    'CSMTextSettings', 'DebugID', 'DefineBinaryData', 'DefineBits',
                    'DefineBitsJPEG2', 'DefineBitsJPEG3', 'DefineBitsLossless',
                    'DefineBitsLossless2', 'DefineButton', 'DefineButton2',
                    'DefineButtonSound', 'DefineEditText', 'DefineFont', 'DefineFont2',
                    'DefineFont3', 'DefineFont4', 'DefineFontAlignZones', 'DefineFontInfo',
                    'DefineFontInfo2', 'DefineFontName', 'DefineMorphShape',
                    'DefineMorphShape2', 'DefineScalingGrid', 'DefineSceneAndFrameLabelData',
                    'DefineShape', 'DefineShape2', 'DefineShape3', 'DefineShape4',
                    'DefineSound', 'DefineSprite', 'DefineText', 'DefineText2',
                    'DefineVideoStream', 'DoABC', 'DoABCDefine', 'DoAction', 'DoInitAction',
                    'EnableDebugger2', 'End', 'ExportAssets', 'FileAttributes', 'FrameLabel',
                    'ImportAssets2', 'JPEGTables', 'Metadata', 'PlaceObject', 'PlaceObject2',
                    'PlaceObject3', 'ProductInfo', 'Protect', 'RemoveObject2', 'ScriptLimits',
                    'SetBackgroundColor', 'ShowFrame', 'SoundStreamBlock', 'SoundStreamHead',
                    'SoundStreamHead2', 'SymbolClass', 'Unknown', 'tag count',                    
                    'abc string count', 'abc string m/m ratio',
                    'bytecode name count', 'first abc bytecode name', 'long hex string',
                    'unique bytecode name count']

X_all = df.as_matrix(abc_features)
y_all = np.array(df['label'].tolist())
clf_everything.fit(X_all, y_all)

Out[38]:

RandomForestClassifier(bootstrap=True, compute_importances=None,
            criterion='gini', max_depth=None, max_features='auto',
            min_density=None, min_samples_leaf=1, min_samples_split=2,
            n_estimators=50, n_jobs=1, oob_score=False, random_state=None,
            verbose=0)

In [39]:

swf_malware_df = pd.read_hdf('data/swf_malware_df.hd5', 'table')
swf_malware_df['label'] = 'malicious'
swf_malware_df.shape

/opt/visiblerisk/lib/python2.7/site-packages/pandas/io/pytables.py:520: DeprecationWarning: openFile() is pending deprecation, use open_file() instead. You may use the pt2to3 tool to update your source code.
  self._handle = tables.openFile(self._path, self._mode, **kwargs)
/opt/visiblerisk/lib/python2.7/site-packages/pandas/io/pytables.py:1017: DeprecationWarning: getNode() is pending deprecation, use get_node() instead. You may use the pt2to3 tool to update your source code.
  return self._handle.getNode(self.root, key)

Out[39]:

(621, 89)

In [40]:

swf_bigpile_df = pd.read_hdf('data/swf_bigpile_df.hd5', 'table')
swf_bigpile_df['label'] = 'benign'
swf_bigpile_df.shape

/opt/visiblerisk/lib/python2.7/site-packages/pandas/io/pytables.py:520: DeprecationWarning: openFile() is pending deprecation, use open_file() instead. You may use the pt2to3 tool to update your source code.
  self._handle = tables.openFile(self._path, self._mode, **kwargs)
/opt/visiblerisk/lib/python2.7/site-packages/pandas/io/pytables.py:1017: DeprecationWarning: getNode() is pending deprecation, use get_node() instead. You may use the pt2to3 tool to update your source code.
  return self._handle.getNode(self.root, key)

Out[40]:

(288846, 90)

One of the features of the classifier is the ability to give a probability that the feature set belongs to a specific class. In the code below, we are asking the classifier to give us the probability the feature set (file) belongs to the malicious class. We interpret these results as anything less the 0.5 is benign, 0.5 - 0.8 is a gray area, and 0.8 and above is malicious.¶

In [41]:

clean = 0
gray = 0
bad = 0
for x in swf_bigpile_df.as_matrix(abc_features):
    try:
        score = clf_everything.predict_proba(x)[:,1][0]
        if score < 0.5:
            clean += 1
        elif score < 0.8:
            gray += 1
        else:
            bad += 1
    except:
        print "Sad"
        print x
        break

print swf_bigpile_df.shape
print clean
print gray
print bad

(288846, 90)
279673
7215
1958

279K of the files were marked as clean. Almost 8K of the files are in the gray area, and almost 2K were classified as malicious. Clearly, this doesn't fit our expectations. But, we only started with a small number of files to train on. Next we will take 5K of the files, and blindly accept them as benign and add them to our training set. Ideally, we'd like to verify this first, but for now, we will just assume the random 5K are actually benign.¶

In [42]:

swf_random_df = swf_bigpile_df.reindex(np.random.permutation(swf_bigpile_df.index))
swf_random_5k_df = swf_random_df[0:5000]
swf_random_the_rest_df = swf_random_df[5000:]

In [43]:

swf_bigger_df = pd.concat([swf_malware_df, swf_random_5k_df], ignore_index=True)
swf_bigger_df.fillna(0, inplace=True)

In [44]:

import sklearn.ensemble
from sklearn.metrics import confusion_matrix
from sklearn.cross_validation import train_test_split

clf_5k = sklearn.ensemble.RandomForestClassifier(n_estimators=50)
abc_features = ['entropy', 'frame count', 'frame rate', 'size', 'swf area', 'swf perimeter', 'version',
                    'CSMTextSettings', 'DebugID', 'DefineBinaryData', 'DefineBits',
                    'DefineBitsJPEG2', 'DefineBitsJPEG3', 'DefineBitsLossless',
                    'DefineBitsLossless2', 'DefineButton', 'DefineButton2',
                    'DefineButtonSound', 'DefineEditText', 'DefineFont', 'DefineFont2',
                    'DefineFont3', 'DefineFont4', 'DefineFontAlignZones', 'DefineFontInfo',
                    'DefineFontInfo2', 'DefineFontName', 'DefineMorphShape',
                    'DefineMorphShape2', 'DefineScalingGrid', 'DefineSceneAndFrameLabelData',
                    'DefineShape', 'DefineShape2', 'DefineShape3', 'DefineShape4',
                    'DefineSound', 'DefineSprite', 'DefineText', 'DefineText2',
                    'DefineVideoStream', 'DoABC', 'DoABCDefine', 'DoAction', 'DoInitAction',
                    'EnableDebugger2', 'End', 'ExportAssets', 'FileAttributes', 'FrameLabel',
                    'ImportAssets2', 'JPEGTables', 'Metadata', 'PlaceObject', 'PlaceObject2',
                    'PlaceObject3', 'ProductInfo', 'Protect', 'RemoveObject2', 'ScriptLimits',
                    'SetBackgroundColor', 'ShowFrame', 'SoundStreamBlock', 'SoundStreamHead',
                    'SoundStreamHead2', 'SymbolClass', 'Unknown', 'tag count',                    
                    'abc string count', 'abc string m/m ratio',
                    'bytecode name count', 'first abc bytecode name', 'long hex string',
                    'unique bytecode name count']

X = swf_bigger_df.as_matrix(abc_features)
y = np.array(swf_bigger_df['label'].tolist())

scores = sklearn.cross_validation.cross_val_score(clf_5k, X, y, cv=10)
print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.990 (+/- 0.006)

You can see we actually get significant improvement in the classifier. The rate is higher and the margin of error is smaller too. And looking at the confusion matrix below shows one classification detection isn't better than the other.¶

In [45]:

import sklearn.ensemble
from sklearn.metrics import confusion_matrix
from sklearn.cross_validation import train_test_split

# 80/20 Split for predictive test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf_5k.fit(X_train, y_train)
y_pred = clf_abc.predict(X_test)
labels = ['benign', 'malicious']
cm = confusion_matrix(y_test, y_pred, labels)
plot_cm(cm, labels)

Confusion Matrix Stats
benign/benign: 95.01% (953/1003)
benign/malicious: 4.99% (50/1003)
malicious/benign: 0.00% (0/122)
malicious/malicious: 100.00% (122/122)

Not surprisingly, the feature importance is very similar to the smaller training set¶

In [46]:

importances = zip(abc_features, clf_5k.feature_importances_)
importances.sort(key=lambda k:k[1], reverse=True)
total = 0
for idx, im in enumerate(importances[0:20]):
    total += round(im[1], 5)
    print (str(idx+1) + ':').ljust(4), im[0].ljust(35), round(im[1], 5), total

1:   swf perimeter                       0.11688 0.11688
2:   swf area                            0.0828 0.19968
3:   entropy                             0.08055 0.28023
4:   abc string m/m ratio                0.06875 0.34898
5:   size                                0.05404 0.40302
6:   abc string count                    0.05321 0.45623
7:   frame rate                          0.04839 0.50462
8:   tag count                           0.04693 0.55155
9:   frame count                         0.04146 0.59301
10:  PlaceObject2                        0.0363 0.62931
11:  Unknown                             0.03536 0.66467
12:  End                                 0.03411 0.69878
13:  first abc bytecode name             0.02746 0.72624
14:  version                             0.02738 0.75362
15:  DoABCDefine                         0.02428 0.7779
16:  DefineShape3                        0.01646 0.79436
17:  bytecode name count                 0.01576 0.81012
18:  DefineShape                         0.01375 0.82387
19:  DefineSprite                        0.01213 0.836
20:  FrameLabel                          0.01186 0.84786

In [47]:

#### Next we training over all the data again, and test on the large corpus of files.

In [48]:

clf_everything_2 = sklearn.ensemble.RandomForestClassifier(n_estimators=50)
abc_features = ['entropy', 'frame count', 'frame rate', 'size', 'swf area', 'swf perimeter', 'version',
                    'CSMTextSettings', 'DebugID', 'DefineBinaryData', 'DefineBits',
                    'DefineBitsJPEG2', 'DefineBitsJPEG3', 'DefineBitsLossless',
                    'DefineBitsLossless2', 'DefineButton', 'DefineButton2',
                    'DefineButtonSound', 'DefineEditText', 'DefineFont', 'DefineFont2',
                    'DefineFont3', 'DefineFont4', 'DefineFontAlignZones', 'DefineFontInfo',
                    'DefineFontInfo2', 'DefineFontName', 'DefineMorphShape',
                    'DefineMorphShape2', 'DefineScalingGrid', 'DefineSceneAndFrameLabelData',
                    'DefineShape', 'DefineShape2', 'DefineShape3', 'DefineShape4',
                    'DefineSound', 'DefineSprite', 'DefineText', 'DefineText2',
                    'DefineVideoStream', 'DoABC', 'DoABCDefine', 'DoAction', 'DoInitAction',
                    'EnableDebugger2', 'End', 'ExportAssets', 'FileAttributes', 'FrameLabel',
                    'ImportAssets2', 'JPEGTables', 'Metadata', 'PlaceObject', 'PlaceObject2',
                    'PlaceObject3', 'ProductInfo', 'Protect', 'RemoveObject2', 'ScriptLimits',
                    'SetBackgroundColor', 'ShowFrame', 'SoundStreamBlock', 'SoundStreamHead',
                    'SoundStreamHead2', 'SymbolClass', 'Unknown', 'tag count',                    
                    'abc string count', 'abc string m/m ratio',
                    'bytecode name count', 'first abc bytecode name', 'long hex string',
                    'unique bytecode name count']

X_all_2 = swf_bigger_df.as_matrix(abc_features)
y_all_2 = np.array(swf_bigger_df['label'].tolist())
clf_everything_2.fit(X_all_2, y_all_2)

Out[48]:

RandomForestClassifier(bootstrap=True, compute_importances=None,
            criterion='gini', max_depth=None, max_features='auto',
            min_density=None, min_samples_leaf=1, min_samples_split=2,
            n_estimators=50, n_jobs=1, oob_score=False, random_state=None,
            verbose=0)

In [49]:

clean = 0
gray = 0
bad = 0
for x in swf_random_the_rest_df.as_matrix(abc_features):
    try:
        score = clf_everything_2.predict_proba(x)[:,1][0]
        if score < 0.5:
            clean += 1
        elif score < 0.8:
            gray += 1
        else:
            bad += 1
    except:
        print "Sad"
        print x
        break

print swf_bigpile_df.shape
print clean
print gray
print bad

(288846, 90)
282662
800
384

You can see that there is a big improvement in the gray and malicious classification. There is still room for improvement, but we are moving closer to a manageable level of detections an analyst could look at.¶

Next we explore the idea that since we added features about actionscript, maybe we should limit the classifier to only train and test on files that contain actionscript since those features will be meaningless when actionscript is not present.¶

In [50]:

df_abc_only = swf_bigger_df[(swf_bigger_df['DoABC'] == 1) | (swf_bigger_df['DoABCDefine'] == 1)]
df_abc_only.shape

Out[50]:

(3246, 90)

In [51]:

import sklearn.ensemble
from sklearn.metrics import confusion_matrix
from sklearn.cross_validation import train_test_split

clf_abc_only = sklearn.ensemble.RandomForestClassifier(n_estimators=50)
abc_features = ['entropy', 'frame count', 'frame rate', 'size', 'swf area', 'swf perimeter', 'version',
                    'CSMTextSettings', 'DebugID', 'DefineBinaryData', 'DefineBits',
                    'DefineBitsJPEG2', 'DefineBitsJPEG3', 'DefineBitsLossless',
                    'DefineBitsLossless2', 'DefineButton', 'DefineButton2',
                    'DefineButtonSound', 'DefineEditText', 'DefineFont', 'DefineFont2',
                    'DefineFont3', 'DefineFont4', 'DefineFontAlignZones', 'DefineFontInfo',
                    'DefineFontInfo2', 'DefineFontName', 'DefineMorphShape',
                    'DefineMorphShape2', 'DefineScalingGrid', 'DefineSceneAndFrameLabelData',
                    'DefineShape', 'DefineShape2', 'DefineShape3', 'DefineShape4',
                    'DefineSound', 'DefineSprite', 'DefineText', 'DefineText2',
                    'DefineVideoStream', 'DoABC', 'DoABCDefine', 'DoAction', 'DoInitAction',
                    'EnableDebugger2', 'End', 'ExportAssets', 'FileAttributes', 'FrameLabel',
                    'ImportAssets2', 'JPEGTables', 'Metadata', 'PlaceObject', 'PlaceObject2',
                    'PlaceObject3', 'ProductInfo', 'Protect', 'RemoveObject2', 'ScriptLimits',
                    'SetBackgroundColor', 'ShowFrame', 'SoundStreamBlock', 'SoundStreamHead',
                    'SoundStreamHead2', 'SymbolClass', 'Unknown', 'tag count',                    
                    'abc string count', 'abc string m/m ratio',
                    'bytecode name count', 'first abc bytecode name', 'long hex string',
                    'unique bytecode name count']

X = df_abc_only.as_matrix(abc_features)
y = np.array(df_abc_only['label'].tolist())

scores = sklearn.cross_validation.cross_val_score(clf_abc_only, X, y, cv=10)
print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.992 (+/- 0.008)

Surprisingly (at least to me), no significant improvement was noticed.¶

In [53]:

import sklearn.ensemble
from sklearn.metrics import confusion_matrix
from sklearn.cross_validation import train_test_split

# 80/20 Split for predictive test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf_abc_only.fit(X_train, y_train)
y_pred = clf_abc.predict(X_test)
labels = ['benign', 'malicious']
cm = confusion_matrix(y_test, y_pred, labels)
plot_cm(cm, labels)

Confusion Matrix Stats
benign/benign: 96.92% (535/552)
benign/malicious: 3.08% (17/552)
malicious/benign: 0.00% (0/98)
malicious/malicious: 100.00% (98/98)

When we check the importance, we see some of the actionscript feature are more prominent, but they don't contain enough information to make a significant improvement to the classifier.¶

In [54]:

importances = zip(abc_features, clf_abc.feature_importances_)
importances.sort(key=lambda k:k[1], reverse=True)
total = 0
for idx, im in enumerate(importances[0:20]):
    total += round(im[1], 5)
    print (str(idx+1) + ':').ljust(4), im[0].ljust(35), round(im[1], 5), total

1:   abc string count                    0.08828 0.08828
2:   entropy                             0.08444 0.17272
3:   swf area                            0.06474 0.23746
4:   swf perimeter                       0.05966 0.29712
5:   tag count                           0.05951 0.35663
6:   PlaceObject2                        0.05094 0.40757
7:   DefineShape3                        0.05022 0.45779
8:   size                                0.04628 0.50407
9:   frame rate                          0.04317 0.54724
10:  version                             0.04204 0.58928
11:  abc string m/m ratio                0.03019 0.61947
12:  DefineShape                         0.02958 0.64905
13:  long hex string                     0.02897 0.67802
14:  DefineSprite                        0.02816 0.70618
15:  DefineFont3                         0.02448 0.73066
16:  DefineBitsJPEG2                     0.02426 0.75492
17:  DefineFontName                      0.01949 0.77441
18:  ProductInfo                         0.01839 0.7928
19:  Unknown                             0.01636 0.80916
20:  first abc bytecode name             0.01585 0.82501

Finally, we test on the large corpus of files, but limit them to just actionscript again.¶

In [55]:

swf_abc_only_the_rest_df = swf_random_the_rest_df[(swf_random_the_rest_df['DoABC'] == 1) | (swf_random_the_rest_df['DoABCDefine'] == 1)]

In [56]:

clf_everything_abc = sklearn.ensemble.RandomForestClassifier(n_estimators=50)
abc_features = ['entropy', 'frame count', 'frame rate', 'size', 'swf area', 'swf perimeter', 'version',
                    'CSMTextSettings', 'DebugID', 'DefineBinaryData', 'DefineBits',
                    'DefineBitsJPEG2', 'DefineBitsJPEG3', 'DefineBitsLossless',
                    'DefineBitsLossless2', 'DefineButton', 'DefineButton2',
                    'DefineButtonSound', 'DefineEditText', 'DefineFont', 'DefineFont2',
                    'DefineFont3', 'DefineFont4', 'DefineFontAlignZones', 'DefineFontInfo',
                    'DefineFontInfo2', 'DefineFontName', 'DefineMorphShape',
                    'DefineMorphShape2', 'DefineScalingGrid', 'DefineSceneAndFrameLabelData',
                    'DefineShape', 'DefineShape2', 'DefineShape3', 'DefineShape4',
                    'DefineSound', 'DefineSprite', 'DefineText', 'DefineText2',
                    'DefineVideoStream', 'DoABC', 'DoABCDefine', 'DoAction', 'DoInitAction',
                    'EnableDebugger2', 'End', 'ExportAssets', 'FileAttributes', 'FrameLabel',
                    'ImportAssets2', 'JPEGTables', 'Metadata', 'PlaceObject', 'PlaceObject2',
                    'PlaceObject3', 'ProductInfo', 'Protect', 'RemoveObject2', 'ScriptLimits',
                    'SetBackgroundColor', 'ShowFrame', 'SoundStreamBlock', 'SoundStreamHead',
                    'SoundStreamHead2', 'SymbolClass', 'Unknown', 'tag count',                    
                    'abc string count', 'abc string m/m ratio',
                    'bytecode name count', 'first abc bytecode name', 'long hex string',
                    'unique bytecode name count']

X_all_3 = df_abc_only.as_matrix(abc_features)
y_all_3 = np.array(df_abc_only['label'].tolist())
clf_everything_abc.fit(X_all_3, y_all_3)

Out[56]:

RandomForestClassifier(bootstrap=True, compute_importances=None,
            criterion='gini', max_depth=None, max_features='auto',
            min_density=None, min_samples_leaf=1, min_samples_split=2,
            n_estimators=50, n_jobs=1, oob_score=False, random_state=None,
            verbose=0)

In [57]:

clean = 0
gray = 0
bad = 0
for x in swf_abc_only_the_rest_df.as_matrix(abc_features):
    try:
        score = clf_everything_abc.predict_proba(x)[:,1][0]
        if score < 0.5:
            clean += 1
        elif score < 0.8:
            gray += 1
        else:
            bad += 1
    except Exception as e:
        print "Sad"
        print e
        print x
        break

print swf_abc_only_the_rest_df.shape
print clean
print gray
print bad

(152768, 90)
151967
474
327

	DebugID	DefineBinaryData	DefineBits	DefineBitsJPEG2	DefineBitsJPEG3	DefineButton2	DefineFont2	DefineMorphShape
0	0	0	0	0	0	1	1	1	...
1	0	0	1	1	1	1	0	0	...
2	0	0	0	1	0	0	1	0	...
3	1	1	0	0	0	0	0	0	...
4	0	0	0	0	0	0	0	0	...

	DefineBits
0	0	...
1	0	...
2	0	...
3	1	...
4	0	...

	DebugID	DefineBinaryData	DefineBits	DefineBitsJPEG2	DefineBitsJPEG3	DefineButton2	DefineFont2	DefineMorphShape
0	0	0	0	0	0	1	1	1	...
1	0	0	1	1	1	1	0	0	...
2	0	0	0	1	0	0	1	0	...
3	1	1	0	0	0	0	0	0	...
4	0	0	0	0	0	0	0	0	...

	DefineBits
0	0	...
1	0	...
2	0	...
3	1	...
4	0	...