# Third-party libraries used throughout; each version is printed right after
# its import so the output records the environment the analysis ran in.
import pandas as pd
print 'pandas version is', pd.__version__
import numpy as np
print 'numpy version is', np.__version__
import sklearn
print 'scikit-learn version is', sklearn.__version__
import matplotlib
print 'matplotlib version is', matplotlib.__version__
import matplotlib.pyplot as plt

# IPython magic (only valid inside IPython/Jupyter): draw figures inline.
%matplotlib inline
# Global matplotlib defaults applied to every figure created below.
plt.rcParams['font.size'] = 18.0
plt.rcParams['figure.figsize'] = 16.0, 5.0

def plot_cm(cm, labels):
    """Print per-cell confusion-matrix statistics and show the matrix as a heat map.

    cm     -- 2-D numpy array of counts; rows are plotted on the 'True' axis and
              columns on the 'Predicted' axis
    labels -- list of class names, in the same order as the rows/columns of cm
    """
    # Row-normalise: each cell becomes a percentage of its row total.
    row_totals = cm.sum(axis=1)[:, np.newaxis]
    percent = (cm * 100.0) / row_totals

    print('Confusion Matrix Stats')
    for i, label_i in enumerate(labels):
        row_total = cm[i].sum()
        for j, label_j in enumerate(labels):
            print("%s/%s: %.2f%% (%d/%d)" % (label_i, label_j, percent[i][j], cm[i][j], row_total))

    # Heat map of the percentages on a fixed 0-100 colour scale.
    # Thanks to kermit666 from stackoverflow
    figure = plt.figure()
    axes = figure.add_subplot(111)
    axes.grid(b=False)
    heat_map = axes.matshow(percent, cmap='coolwarm', vmin=0, vmax=100)
    plt.title('')
    figure.colorbar(heat_map)
    # The leading empty string is a placeholder for the first tick label.
    tick_labels = [''] + labels
    axes.set_xticklabels(tick_labels)
    axes.set_yticklabels(tick_labels)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.show()

def extract_features(data):
    """Flatten one parsed SWF analysis report into a flat feature dictionary.

    data -- the decoded JSON report (nested dicts/lists).  The sections read
            are data['metadata'], data['characteristics']['swf'] and
            data['verbose']['swf'].

    Returns a dict containing:
      * 'sha256', 'size', 'entropy' from the file metadata
      * 'version', 'frame count', 'frame rate', 'swf area', 'swf perimeter'
        from the SWF header
      * one ``<tag name>: 1`` entry per tag type present, plus 'tag count'
      * ActionScript features ('abc strings', 'abc string count',
        'bytecode name count', ...) only when at least one DoABC /
        DoABCDefine tag is present

    A missing key is reported on stdout and the features collected up to that
    point are returned, so the result may be partial or empty.
    """
    import binascii

    features = {}
    try:
        metadata = data['metadata']
        features['sha256'] = metadata['sha256']
        features['size'] = metadata['file_size']
        features['entropy'] = metadata['entropy']

        swf = data['characteristics']['swf']
        swf_meta = swf['swf metadata']
        features['version'] = swf_meta['version']
        features['frame count'] = swf_meta['framecount']
        features['frame rate'] = swf_meta['framerate']

        x_min = swf_meta['xmin']
        x_max = swf_meta['xmax']
        y_min = swf_meta['ymin']
        y_max = swf_meta['ymax']
        x_length = x_max - x_min
        y_length = y_max - y_min
        features['swf area'] = x_length * y_length
        features['swf perimeter'] = 2 * (x_length + y_length)

        # Presence flag per tag type and the total number of tags.
        features['tag count'] = 0
        for tag_info in swf['tag types']:
            features[tag_info['tag name']] = 1
            features['tag count'] += tag_info['count']

        # Accumulate names and strings across ALL ActionScript tags.  (The
        # accumulators used to be re-created for every tag, so only the last
        # tag was ever counted.)
        bytecode_names = []
        abc_strings = []
        for tag_info in data['verbose']['swf']['tags']:
            if 'DoABCDefine' in tag_info:
                key = 'DoABCDefine'
            elif 'DoABC' in tag_info:
                key = 'DoABC'
            else:
                continue
            abc_tag = tag_info[key]
            try:
                bytecode_names.append(abc_tag['bytecodename'])
            except KeyError:
                # Tags without a name are recorded under this placeholder.
                bytecode_names.append('DoABCDefine')
            abc_strings.extend(abc_tag['abc']['strings'])

        if bytecode_names:
            # Categorical code for the first bytecode name: 1 = empty,
            # 2 = placeholder 'DoABCDefine', 3 = 'frame1', 4 = anything else.
            first_name_codes = {'': 1, 'DoABCDefine': 2, 'frame1': 3}
            features['first abc bytecode name'] = first_name_codes.get(bytecode_names[0], 4)

            features['abc bytecode name'] = bytecode_names
            features['bytecode name count'] = len(bytecode_names)
            features['unique bytecode name count'] = len(set(bytecode_names))
            features['abc strings'] = abc_strings
            features['abc string count'] = len(abc_strings)

            # 1 when any string longer than 100 characters is valid hex.
            features['long hex string'] = 0
            for s in abc_strings:
                if len(s) <= 100:
                    continue
                try:
                    binascii.unhexlify(s)
                except (TypeError, ValueError, binascii.Error):
                    # Odd length, non-hex digits or non-ASCII text: not hex.
                    continue
                features['long hex string'] = 1
                break

            try:
                ratio = data['verbose']['swf']['SWF String Statistical Analysis'][
                    'ActionScript String Length Mean to Median Ratio']
                features['abc string m/m ratio'] = float(ratio)
            except KeyError:
                features['abc string m/m ratio'] = 0.0

    except KeyError as ke:
        # Use the already-extracted hash: looking it up in `data` again would
        # raise a second, uncaught KeyError when the hash itself is missing.
        sha256 = features.get('sha256', '<unknown sha256>')
        print('ERROR: %s %s' % (ke, sha256))
    return features

def load_files(file_list):
    """Return the extract_features() result for every JSON report in file_list.

    file_list -- iterable of paths to JSON report files
    The returned list has one feature dict per path, in input order.
    """
    import json

    def _features_from(path):
        # Parse while the file is still open, then hand the dict to the extractor.
        with open(path, 'rb') as handle:
            return extract_features(json.load(handle))

    return [_features_from(path) for path in file_list]

import glob

# Benign samples: one '.results' report per file under data/clean.
good_list = glob.glob('data/clean/*.results')
good_features = load_files(good_list)
print("Files: %d" % len(good_list))

# Malicious samples: same layout under data/malicious.
bad_list = glob.glob('data/malicious/*.results')
bad_features = load_files(bad_list)
print("Files: %d" % len(bad_list))

# One row per report; features absent from a report become 0.
df_good = pd.DataFrame.from_records(good_features).fillna(0)
df_good['label'] = 'benign'
df_good.head()

df_bad = pd.DataFrame.from_records(bad_features).fillna(0)
df_bad['label'] = 'malicious'
df_bad.head()

# Combined table; the second fill covers columns that exist in only one class.
df = pd.concat([df_bad, df_good], ignore_index=True).fillna(0)

# Stacked bar chart: number of samples per SWF version, split by label.
version_counts = df.groupby(['label', 'version'])['version'].count()
version_counts.unstack('label').fillna(0).plot(
    kind='bar', stacked=True, grid=False, colormap='GnBu')

# Box plots of each basic feature grouped by label.  The automatic titles and
# x-axis label that pandas adds are blanked out on every figure.

# File size (y-axis clipped to 0-200000).
df.boxplot(column='size', by='label')
plt.suptitle('')
plt.title('')
plt.xlabel('')
plt.ylabel('File Size')
plt.ylim(0, 200000)

# Entropy.
df.boxplot(column='entropy', by='label')
plt.suptitle('')
plt.title('')
plt.xlabel('')
plt.ylabel('Entropy')

# Frame count (y-axis clipped to 0-5000).
df.boxplot(column='frame count', by='label')
plt.suptitle('')
plt.title('')
plt.xlabel('')
plt.ylabel('Frame Count')
plt.ylim(0, 5000)

# Ten most common frame-count values among the benign files.
df_good['frame count'].value_counts()[0:10]

# Frame rate.
df.boxplot(column='frame rate', by='label')
plt.suptitle('')
plt.title('')
plt.xlabel('')
plt.ylabel('Frame Rate')

# Frame area (y-axis clipped to 0-750000).
df.boxplot(column='swf area', by='label')
plt.suptitle('')
plt.title('')
plt.xlabel('')
plt.ylabel('Frame Area')
plt.ylim(0, 750000)

# Frame perimeter.
df.boxplot(column='swf perimeter', by='label')
plt.suptitle('')
plt.title('')
plt.xlabel('')
plt.ylabel('Frame Perimeter')

# Shared experiment settings: the random seed and the hold-out fraction.
my_seed = 1022
my_tsize = .2

import sklearn.ensemble
# Imported explicitly: `import sklearn` alone does not guarantee that the
# cross_validation submodule is loaded when cross_val_score is called below.
import sklearn.cross_validation
from sklearn.metrics import confusion_matrix
from sklearn.cross_validation import train_test_split

# Random forest over the basic file/header features only.  Seeded so the
# reported scores can be reproduced.
clf_simple = sklearn.ensemble.RandomForestClassifier(n_estimators=50, random_state=my_seed)
simple_features = ['entropy', 'frame count', 'frame rate', 'size', 'swf area', 'swf perimeter', 'version']

X = df.as_matrix(simple_features)
y = np.array(df['label'].tolist())

# 10-fold cross-validated accuracy, reported as mean +/- two standard deviations.
scores = sklearn.cross_validation.cross_val_score(clf_simple, X, y, cv=10)
print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))

# 80/20 Split for predictive test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=my_tsize, random_state=my_seed)
clf_simple.fit(X_train, y_train)
y_pred = clf_simple.predict(X_test)
labels = ['benign', 'malicious']
cm = confusion_matrix(y_test, y_pred, labels=labels)
plot_cm(cm, labels)

# Feature Selection: features ranked by importance, highest first.
importances = sorted(zip(simple_features, clf_simple.feature_importances_),
                     key=lambda k: k[1], reverse=True)
for idx, im in enumerate(importances[0:10]):
    print('%s %s %s' % ((str(idx+1) + ':').ljust(4), im[0].ljust(20), round(im[1], 5)))

# Total number of tags per file, grouped by label (y-axis clipped to 0-400).
df.boxplot(column='tag count', by='label')
plt.suptitle('')
plt.title('')
plt.xlabel('')
plt.ylabel('Number of Tags')
plt.ylim(0, 400)

# For a handful of tag types: side-by-side bars of how many files per label
# have (1) or lack (0) the tag.
for tag_name in ['PlaceObject2', 'DoABC', 'DoABCDefine', 'DefineBitsJPEG2', 'End']:
    presence_counts = df.groupby([tag_name, 'label'])[tag_name].count()
    p = presence_counts.unstack(tag_name).fillna(0).plot(
        kind='bar', stacked=False, grid=False)
    p.set_xlabel('')
    p.plot()

import sklearn.ensemble
import sklearn.cross_validation
from sklearn.metrics import confusion_matrix
from sklearn.cross_validation import train_test_split

# Random forest over the basic features plus one presence flag per SWF tag
# type.  Seeded with the shared my_seed so results are reproducible.
clf_tags = sklearn.ensemble.RandomForestClassifier(n_estimators=50, random_state=my_seed)
tag_features = ['entropy', 'frame count', 'frame rate', 'size', 'swf area', 'swf perimeter', 'version',
                    'CSMTextSettings', 'DebugID', 'DefineBinaryData', 'DefineBits',
                    'DefineBitsJPEG2', 'DefineBitsJPEG3', 'DefineBitsLossless',
                    'DefineBitsLossless2', 'DefineButton', 'DefineButton2',
                    'DefineButtonSound', 'DefineEditText', 'DefineFont', 'DefineFont2',
                    'DefineFont3', 'DefineFont4', 'DefineFontAlignZones', 'DefineFontInfo',
                    'DefineFontInfo2', 'DefineFontName', 'DefineMorphShape',
                    'DefineMorphShape2', 'DefineScalingGrid', 'DefineSceneAndFrameLabelData',
                    'DefineShape', 'DefineShape2', 'DefineShape3', 'DefineShape4',
                    'DefineSound', 'DefineSprite', 'DefineText', 'DefineText2',
                    'DefineVideoStream', 'DoABC', 'DoABCDefine', 'DoAction', 'DoInitAction',
                    'EnableDebugger2', 'End', 'ExportAssets', 'FileAttributes', 'FrameLabel',
                    'ImportAssets2', 'JPEGTables', 'Metadata', 'PlaceObject', 'PlaceObject2',
                    'PlaceObject3', 'ProductInfo', 'Protect', 'RemoveObject2', 'ScriptLimits',
                    'SetBackgroundColor', 'ShowFrame', 'SoundStreamBlock', 'SoundStreamHead',
                    'SoundStreamHead2', 'SymbolClass', 'Unknown', 'tag count']

X = df.as_matrix(tag_features)
y = np.array(df['label'].tolist())

# 10-fold cross-validated accuracy, reported as mean +/- two standard deviations.
scores = sklearn.cross_validation.cross_val_score(clf_tags, X, y, cv=10)
print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))

# 80/20 Split for predictive test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=my_tsize, random_state=my_seed)
clf_tags.fit(X_train, y_train)
y_pred = clf_tags.predict(X_test)
labels = ['benign', 'malicious']
cm = confusion_matrix(y_test, y_pred, labels=labels)
plot_cm(cm, labels)

# Top 25 features by importance, highest first.
importances = sorted(zip(tag_features, clf_tags.feature_importances_),
                     key=lambda k: k[1], reverse=True)
for idx, im in enumerate(importances[0:25]):
    print('%s %s %s' % ((str(idx+1) + ':').ljust(4), im[0].ljust(40), round(im[1], 5)))

# Number of ActionScript strings per file, grouped by label (y-axis clipped to 0-1000).
df.boxplot(column='abc string count', by='label')
plt.suptitle('')
plt.title('')
plt.xlabel('')
plt.ylabel('Number of ActionScript Strings')
plt.ylim(0, 1000)

# Mean-to-median ratio of ActionScript string lengths (y-axis clipped to 0-15).
df.boxplot(column='abc string m/m ratio', by='label')
plt.suptitle('')
plt.title('')
plt.xlabel('')
plt.ylabel('ActionScript Mean/Median Ratio')
plt.ylim(0, 15)

import sklearn.ensemble
import sklearn.cross_validation
from sklearn.metrics import confusion_matrix
from sklearn.cross_validation import train_test_split

# Random forest over the tag-level features plus the ActionScript (ABC)
# features.  Seeded with the shared my_seed so results are reproducible.
clf_abc = sklearn.ensemble.RandomForestClassifier(n_estimators=50, random_state=my_seed)
# Built from tag_features so the two column lists cannot drift apart.
abc_features = tag_features + [
                    'abc string count', 'abc string m/m ratio',
                    'bytecode name count', 'first abc bytecode name', 'long hex string',
                    'unique bytecode name count']

X = df.as_matrix(abc_features)
y = np.array(df['label'].tolist())

# 10-fold cross-validated accuracy, reported as mean +/- two standard deviations.
scores = sklearn.cross_validation.cross_val_score(clf_abc, X, y, cv=10)
print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))

#### Again, not a real improvement.

# 80/20 Split for predictive test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=my_tsize, random_state=my_seed)
clf_abc.fit(X_train, y_train)
y_pred = clf_abc.predict(X_test)
labels = ['benign', 'malicious']
cm = confusion_matrix(y_test, y_pred, labels=labels)
plot_cm(cm, labels)

# Top 20 features by importance, with a running total of the rounded values.
importances = sorted(zip(abc_features, clf_abc.feature_importances_),
                     key=lambda k: k[1], reverse=True)
total = 0
for idx, im in enumerate(importances[0:20]):
    total += round(im[1], 5)
    print('%s %s %s %s' % ((str(idx+1) + ':').ljust(4), im[0].ljust(35), round(im[1], 5), total))

# Final model for the small corpus: trained on every row of df, using the
# abc_features column list declared with the ABC model earlier in this file
# (this section used to re-declare an identical copy).  Seeded with the
# shared my_seed so the fitted forest is reproducible.
clf_everything = sklearn.ensemble.RandomForestClassifier(n_estimators=50, random_state=my_seed)

X_all = df.as_matrix(abc_features)
y_all = np.array(df['label'].tolist())
clf_everything.fit(X_all, y_all)

# Load the two pre-built feature tables from HDF5 ...
swf_malware_df = pd.read_hdf('data/swf_malware_df.hd5', 'table')
swf_bigpile_df = pd.read_hdf('data/swf_bigpile_df.hd5', 'table')

# ... and tag every row of each table with its class label.
swf_malware_df['label'] = 'malicious'
swf_bigpile_df['label'] = 'benign'

# Table dimensions (shown as cell output when run in a notebook).
swf_malware_df.shape
swf_bigpile_df.shape

# Score every row of the big pile with the small-corpus model and bucket the
# rows by the probability in column 1 of predict_proba:
#   < 0.5 -> clean, 0.5 to < 0.8 -> gray, >= 0.8 -> bad.
# NOTE(review): column 1 is presumably the 'malicious' class (scikit-learn
# sorts class labels) -- confirm against clf_everything.classes_.
clean = 0
gray = 0
bad = 0
for x in swf_bigpile_df.as_matrix(abc_features):
    try:
        score = clf_everything.predict_proba(x)[:,1][0]
        if score < 0.5:
            clean += 1
        elif score < 0.8:
            gray += 1
        else:
            bad += 1
    except Exception as e:
        # Stop at the first row that cannot be scored, but report why.  (A
        # bare `except:` would also swallow KeyboardInterrupt and hide the cause.)
        print("Sad")
        print(e)
        print(x)
        break

print(swf_bigpile_df.shape)
print(clean)
print(gray)
print(bad)

# Shuffle the big pile with a seeded generator so the 5000-row sample (and
# therefore the held-out remainder) is the same on every run.
shuffled_index = np.random.RandomState(my_seed).permutation(swf_bigpile_df.index)
swf_random_df = swf_bigpile_df.reindex(shuffled_index)
swf_random_5k_df = swf_random_df[0:5000]
swf_random_the_rest_df = swf_random_df[5000:]

# Larger training corpus: all malware rows plus the 5000 sampled rows, with
# missing feature values set to 0.
swf_bigger_df = pd.concat([swf_malware_df, swf_random_5k_df], ignore_index=True)
swf_bigger_df.fillna(0, inplace=True)

import sklearn.ensemble
import sklearn.cross_validation
from sklearn.metrics import confusion_matrix
from sklearn.cross_validation import train_test_split

# Same feature set as the ABC model (abc_features is declared with that model
# earlier in this file), trained on the larger corpus.  Seeded with the
# shared my_seed so results are reproducible.
clf_5k = sklearn.ensemble.RandomForestClassifier(n_estimators=50, random_state=my_seed)

X = swf_bigger_df.as_matrix(abc_features)
y = np.array(swf_bigger_df['label'].tolist())

# 10-fold cross-validated accuracy, reported as mean +/- two standard deviations.
scores = sklearn.cross_validation.cross_val_score(clf_5k, X, y, cv=10)
print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))

# 80/20 Split for predictive test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=my_tsize, random_state=my_seed)
clf_5k.fit(X_train, y_train)
# Predict with the classifier that was just fitted (this used to call clf_abc,
# a model trained on the small corpus).
y_pred = clf_5k.predict(X_test)
labels = ['benign', 'malicious']
cm = confusion_matrix(y_test, y_pred, labels=labels)
plot_cm(cm, labels)

# Top 20 features by importance, with a running total of the rounded values.
importances = sorted(zip(abc_features, clf_5k.feature_importances_),
                     key=lambda k: k[1], reverse=True)
total = 0
for idx, im in enumerate(importances[0:20]):
    total += round(im[1], 5)
    print('%s %s %s %s' % ((str(idx+1) + ':').ljust(4), im[0].ljust(35), round(im[1], 5), total))

#### Next we train over all of the data again, and test on the large corpus of files.

# Uses the abc_features column list declared with the ABC model earlier in
# this file (this section used to re-declare an identical copy).  Seeded with
# the shared my_seed so the fitted forest is reproducible.
clf_everything_2 = sklearn.ensemble.RandomForestClassifier(n_estimators=50, random_state=my_seed)

X_all_2 = swf_bigger_df.as_matrix(abc_features)
y_all_2 = np.array(swf_bigger_df['label'].tolist())
clf_everything_2.fit(X_all_2, y_all_2)

# Score the big-pile rows that were NOT sampled into the training corpus and
# bucket them by the probability in column 1 of predict_proba:
#   < 0.5 -> clean, 0.5 to < 0.8 -> gray, >= 0.8 -> bad.
# NOTE(review): column 1 is presumably the 'malicious' class (scikit-learn
# sorts class labels) -- confirm against clf_everything_2.classes_.
clean = 0
gray = 0
bad = 0
for x in swf_random_the_rest_df.as_matrix(abc_features):
    try:
        score = clf_everything_2.predict_proba(x)[:,1][0]
        if score < 0.5:
            clean += 1
        elif score < 0.8:
            gray += 1
        else:
            bad += 1
    except Exception as e:
        # Stop at the first row that cannot be scored, but report why.  (A
        # bare `except:` would also swallow KeyboardInterrupt and hide the cause.)
        print("Sad")
        print(e)
        print(x)
        break

# Report the size of the frame that was actually scored (this used to print
# the shape of the full big pile, which includes the 5000 training rows).
print(swf_random_the_rest_df.shape)
print(clean)
print(gray)
print(bad)

# Restrict the larger corpus to files that carry ActionScript bytecode
# (a DoABC or DoABCDefine tag).
df_abc_only = swf_bigger_df[(swf_bigger_df['DoABC'] == 1) | (swf_bigger_df['DoABCDefine'] == 1)]
df_abc_only.shape

import sklearn.ensemble
import sklearn.cross_validation
from sklearn.metrics import confusion_matrix
from sklearn.cross_validation import train_test_split

# Same feature set as the ABC model (abc_features is declared with that model
# earlier in this file).  Seeded with the shared my_seed so results are
# reproducible.
clf_abc_only = sklearn.ensemble.RandomForestClassifier(n_estimators=50, random_state=my_seed)

X = df_abc_only.as_matrix(abc_features)
y = np.array(df_abc_only['label'].tolist())

# 10-fold cross-validated accuracy, reported as mean +/- two standard deviations.
scores = sklearn.cross_validation.cross_val_score(clf_abc_only, X, y, cv=10)
print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))

# 80/20 Split for predictive test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=my_tsize, random_state=my_seed)
clf_abc_only.fit(X_train, y_train)
# Evaluate the classifier that was just fitted (this used to predict with,
# and list the importances of, the unrelated clf_abc model).
y_pred = clf_abc_only.predict(X_test)
labels = ['benign', 'malicious']
cm = confusion_matrix(y_test, y_pred, labels=labels)
plot_cm(cm, labels)

# Top 20 features by importance, with a running total of the rounded values.
importances = sorted(zip(abc_features, clf_abc_only.feature_importances_),
                     key=lambda k: k[1], reverse=True)
total = 0
for idx, im in enumerate(importances[0:20]):
    total += round(im[1], 5)
    print('%s %s %s %s' % ((str(idx+1) + ':').ljust(4), im[0].ljust(35), round(im[1], 5), total))

# Held-out big-pile rows that carry ActionScript bytecode (DoABC or DoABCDefine).
swf_abc_only_the_rest_df = swf_random_the_rest_df[(swf_random_the_rest_df['DoABC'] == 1) | (swf_random_the_rest_df['DoABCDefine'] == 1)]

# Final ABC-only model, trained on every row of df_abc_only.  Uses the
# abc_features column list declared with the ABC model earlier in this file
# (this section used to re-declare an identical copy).  Seeded with the
# shared my_seed so the fitted forest is reproducible.
clf_everything_abc = sklearn.ensemble.RandomForestClassifier(n_estimators=50, random_state=my_seed)

X_all_3 = df_abc_only.as_matrix(abc_features)
y_all_3 = np.array(df_abc_only['label'].tolist())
clf_everything_abc.fit(X_all_3, y_all_3)

# Score the held-out ABC-only rows one at a time and bucket each by the
# probability in column 1 of predict_proba:
#   < 0.5 -> clean, 0.5 to < 0.8 -> gray, >= 0.8 -> bad.
# NOTE(review): column 1 is presumably the 'malicious' class -- confirm
# against clf_everything_abc.classes_.
clean = gray = bad = 0
for x in swf_abc_only_the_rest_df.as_matrix(abc_features):
    try:
        score = clf_everything_abc.predict_proba(x)[0, 1]
        if score < 0.5:
            clean += 1
        elif score < 0.8:
            gray += 1
        else:
            bad += 1
    except Exception as e:
        # Stop at the first row that cannot be scored and show the cause.
        print("Sad")
        print(e)
        print(x)
        break

# Size of the scored frame followed by the three bucket counts.
print(swf_abc_only_the_rest_df.shape)
print(clean)
print(gray)
print(bad)