import pandas as pd
print 'pandas version is', pd.__version__
import numpy as np
print 'numpy version is', np.__version__
import sklearn
print 'scikit-learn version is', sklearn.__version__
import matplotlib
print 'matplotlib version is', matplotlib.__version__
import matplotlib.pyplot as plt
%matplotlib inline

plt.rcParams['font.size'] = 18.0
plt.rcParams['figure.figsize'] = 16.0, 5.0

def plot_cm(cm, labels):
    # Compute percentages
    percent = (cm * 100.0) / np.array(np.matrix(cm.sum(axis=1)).T)
    print 'Confusion Matrix Stats'
    for i, label_i in enumerate(labels):
        for j, label_j in enumerate(labels):
            print "%s/%s: %.2f%% (%d/%d)" % (label_i, label_j, (percent[i][j]), cm[i][j], cm[i].sum())

    # Show confusion matrix
    # Thanks to kermit666 from stackoverflow
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.grid(b=False)
    cax = ax.matshow(percent, cmap='coolwarm', vmin=0, vmax=100)
    plt.title('Confusion matrix of the classifier')
    fig.colorbar(cax)
    ax.set_xticklabels([''] + labels)
    ax.set_yticklabels([''] + labels)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.show()

def extract_features(data):
    all_features = []
    if not 'error' in data['characteristics']['macho']:
        for i in range(data['characteristics']['macho']['number of architectures']):
            features = {}
            features['entropy'] = data['metadata']['entropy']
            features['file_size'] = data['metadata']['file_size']
            features['number of architectures'] = data['characteristics']['macho']['number of architectures']
            features['header size'] = data['characteristics']['macho']['header'][i]['size']
            features['cmd count'] = data['characteristics']['macho']['header'][i]['mach header']['ncmds']
            features['cmd size'] = data['characteristics']['macho']['header'][i]['mach header']['sizeofcmds']
            features['cputype'] = data['characteristics']['macho']['header'][i]['mach header']['cputype_string']
            # One-hot encode the mach header flags
            for flag in data['characteristics']['macho']['header'][i]['mach header']['flags']:
                features[flag['name']] = 1
            features['string count'] = data['verbose']['macho']['header'][i]['const strings count']
            if 'const strings' in data['verbose']['macho']['header'][i]:
                features['const strings'] = data['verbose']['macho']['header'][i]['const strings']
            else:
                features['const strings'] = []
            if 'symbol table strings' in data['verbose']['macho']['header'][i]:
                features['symbol table strings'] = data['verbose']['macho']['header'][i]['symbol table strings']
            else:
                features['symbol table strings'] = []
            features['segment names'] = []
            features['section names'] = []
            for command in data['verbose']['macho']['header'][i]['commands']:
                if command['cmd_name'] in ['LC_SEGMENT', 'LC_SEGMENT_64']:
                    features['segment names'].append(command['segname'])
                    if command['segname'] == '__PAGEZERO':
                        features['pz_size'] = command['cmd_size']
                        features['pz_vmsize'] = command['vmsize']
                        features['pz_vmaddr'] = command['vmaddr']
                        features['pz_flags'] = command['flags']
                        features['pz_filesize'] = command['filesize']
                        features['pz_nsects'] = command['nsects']
                        features['pz_fileoff'] = command['fileoff']
                        for flag in command['initprot']:
                            features['pz_initprot_' + flag] = 1
                        for flag in command['maxprot']:
                            features['pz_maxprot_' + flag] = 1
                    if command['segname'] == '__TEXT':
                        features['text_size'] = command['cmd_size']
                        features['text_vmsize'] = command['vmsize']
                        features['text_vmaddr'] = command['vmaddr']
                        features['text_flags'] = command['flags']
                        features['text_filesize'] = command['filesize']
                        features['text_nsects'] = command['nsects']
                        features['text_fileoff'] = command['fileoff']
                        for flag in command['initprot']:
                            features['text_initprot_' + flag] = 1
                        for flag in command['maxprot']:
                            features['text_maxprot_' + flag] = 1
                        features['text_entropy'] = command['entropy']
                        for section in command['sections']:
                            if section['sectname'] == '__text':
                                features['text_section_reloff'] = section['reloff']
                                features['text_section_addr'] = section['addr']
                                features['text_section_align'] = section['align']
                                features['text_section_nreloc'] = section['nreloc']
                                features['text_section_offset'] = section['offset']
                                features['text_section_size'] = section['size']
                                features['text_section_reserved1'] = section['reserved1']
                                features['text_section_reserved2'] = section['reserved2']
                                features['text_section_' + section['flags']['type']] = 1
                                if 'attributes' in section['flags']:
                                    for attr in section['flags']['attributes']:
                                        features['text_section_flag_attribute_' + attr] = 1
                            if section['sectname'] == '__const':
                                features['const_section_reloff'] = section['reloff']
                                features['const_section_addr'] = section['addr']
                                features['const_section_align'] = section['align']
                                features['const_section_nreloc'] = section['nreloc']
                                features['const_section_offset'] = section['offset']
                                features['const_section_size'] = section['size']
                                features['const_section_reserved1'] = section['reserved1']
                                features['const_section_reserved2'] = section['reserved2']
                                features['const_section_' + section['flags']['type']] = 1
                                if 'attributes' in section['flags']:
                                    for attr in section['flags']['attributes']:
                                        features['const_section_flag_attribute_' + attr] = 1
                    if command['segname'] == '__DATA' and command['nsects'] > 0:
                        features['data_size'] = command['cmd_size']
                        features['data_vmsize'] = command['vmsize']
                        features['data_vmaddr'] = command['vmaddr']
                        features['data_flags'] = command['flags']
                        features['data_filesize'] = command['filesize']
                        features['data_nsects'] = command['nsects']
                        features['data_fileoff'] = command['fileoff']
                        for flag in command['initprot']:
                            features['data_initprot_' + flag] = 1
                        for flag in command['maxprot']:
                            features['data_maxprot_' + flag] = 1
                        features['data_entropy'] = command['entropy']
                    #if command['segname'] == '__IMPORT':
                    #    features['import_size'] = command['cmd_size']
                    #    features['import_vmsize'] = command['vmsize']
                    #    features['import_vmaddr'] = command['vmaddr']
                    #    features['import_flags'] = command['flags']
                    #    features['import_filesize'] = command['filesize']
                    #    features['import_nsects'] = command['nsects']
                    #    features['import_fileoff'] = command['fileoff']
                    #    for flag in command['initprot']:
                    #        features['import_initprot_' + flag] = 1
                    #    for flag in command['maxprot']:
                    #        features['import_maxprot_' + flag] = 1
                    #    features['import_entropy'] = command['entropy']
                    if command['segname'] == '__LINKEDIT':
                        features['linkedit_size'] = command['cmd_size']
                        features['linkedit_vmsize'] = command['vmsize']
                        features['linkedit_vmaddr'] = command['vmaddr']
                        features['linkedit_flags'] = command['flags']
                        features['linkedit_filesize'] = command['filesize']
                        features['linkedit_nsects'] = command['nsects']
                        features['linkedit_fileoff'] = command['fileoff']
                        for flag in command['initprot']:
                            features['linkedit_initprot_' + flag] = 1
                        for flag in command['maxprot']:
                            features['linkedit_maxprot_' + flag] = 1
                    if 'sections' in command:
                        for section in command['sections']:
                            features['section names'].append(section['sectname'])
                if command['cmd_name'] == 'LC_SYMTAB':
                    features['strsize'] = command['strsize']
                    features['stroff'] = command['stroff']
                    features['symoff'] = command['symoff']
                    features['nsyms'] = command['nsyms']
                if command['cmd_name'] in ['LC_DYLD_INFO_ONLY', 'LC_DYLD_INFO']:
                    features['lazy_bind_size'] = command['lazy_bind_size']
                    features['rebase_size'] = command['rebase_size']
                    features['weak_bind_size'] = command['weak_bind_size']
                    features['lazy_bind_off'] = command['lazy_bind_off']
                    features['export_off'] = command['export_off']
                    features['export_size'] = command['export_size']
                    features['bind_off'] = command['bind_off']
                    features['rebase_off'] = command['rebase_off']
                    features['bind_size'] = command['bind_size']
                    features['weak_bind_off'] = command['weak_bind_off']
                if command['cmd_name'] == 'LC_DYSYMTAB':
                    features['nextdefsym'] = command['nextdefsym']
                    features['extreloff'] = command['extreloff']
                    features['nlocrel'] = command['nlocrel']
                    features['indirectsymoff'] = command['indirectsymoff']
                    features['modtaboff'] = command['modtaboff']
                    features['iundefsym'] = command['iundefsym']
                    features['ntoc'] = command['ntoc']
                    features['ilocalsym'] = command['ilocalsym']
                    features['nundefsym'] = command['nundefsym']
                    features['nextrefsyms'] = command['nextrefsyms']
                    features['locreloff'] = command['locreloff']
                    features['nmodtab'] = command['nmodtab']
                    features['nlocalsym'] = command['nlocalsym']
                    features['tocoff'] = command['tocoff']
                    features['extrefsymoff'] = command['extrefsymoff']
                    features['nindirectsyms'] = command['nindirectsyms']
                    features['iextdefsym'] = command['iextdefsym']
                    features['nextrel'] = command['nextrel']
            features.update(data['verbose']['macho']['header'][i]['command type count'])
            if 'LC_SEGMENT' in features:
                features['number of segments'] = features['LC_SEGMENT']
            else:
                features['number of segments'] = features['LC_SEGMENT_64']
            if 'entry point' in data['verbose']['macho']['header'][i]:
                for key, value in data['verbose']['macho']['header'][i]['entry point'].iteritems():
                    features['entry point ' + key] = value
            all_features.append(features)
    return all_features

def load_files(file_list):
    import json
    features_list = []
    for filename in file_list:
        with open(filename, 'rb') as f:
            features = extract_features(json.loads(f.read()))
            features_list.extend(features)
    return features_list

# Good files
import glob
good_list = glob.glob('data/good/*.results')
good_features = load_files(good_list)
print "Files:", len(good_list)
print "Number of feature vectors:", len(good_features)

# Bad files
bad_list = glob.glob('data/bad/*.results')
bad_features = load_files(bad_list)
print "Files:", len(bad_list)
print "Number of feature vectors:", len(bad_features)

df_good_orig = pd.DataFrame.from_records(good_features)
df_good_orig['label'] = 'good'
df_good_orig.head()

# Missing LC_/MH_ flag columns mean the flag wasn't present, so fill with 0;
# everything else missing gets a -1 sentinel.
df_good = df_good_orig
for col in df_good.columns:
    if col[0:3] in ['LC_', 'MH_']:
        #print col
        df_good[col].fillna(0, inplace=True)
df_good.fillna(-1, inplace=True)
df_good_orig.head()

df_bad_orig = pd.DataFrame.from_records(bad_features)
df_bad_orig['label'] = 'bad'
df_bad = df_bad_orig
for col in df_bad.columns:
    if col[0:3] in ['LC_', 'MH_']:
        #print col
        df_bad[col].fillna(0, inplace=True)
df_bad.fillna(-1, inplace=True)
df_bad.head()

df_good['cputype'].value_counts().plot(kind='bar')
df_bad['cputype'].value_counts().plot(kind='bar')

df_all_orig = pd.concat([df_bad, df_good], ignore_index=True)
df_all = df_all_orig
for col in df_all.columns:
    if col[0:3] in ['LC_', 'MH_']:
        #print col
        df_all[col].fillna(0, inplace=True)
df_all.fillna(-1, inplace=True)

# Break out by cpu type
cond = df_all['cputype'] == 'x86_64'
df_x64 = df_all[cond]
cond = df_all['cputype'] == 'i386'
df_x86 = df_all[cond]
df_all = pd.concat([df_x64, df_x86], ignore_index=True)

#print df['symbol table strings']
df = df_all.drop(['const strings', 'symbol table strings', 'segment names', 'section names',
                  'entry point section name', 'entry point segment name',
                  'entry point instruction type'], axis=1)
df['cputype'].value_counts().plot(kind='bar')

cond = df['label'] == 'good'
good = df[cond]
bad = df[~cond]
bad['cmd count'].hist(alpha=.5, label='bad', bins=40)
good['cmd count'].hist(alpha=.5, label='good', bins=40)
plt.legend()

df.boxplot('cmd count', 'label')
plt.xlabel('bad vs. good files')
plt.ylabel('Number of Commands')
plt.title('Comparison of Number of Commands')
plt.suptitle('')

df.boxplot(column='cmd size', by='label')
plt.xlabel('bad vs. good files')
plt.ylabel('Command Size')
plt.title('Comparison of Command Sizes')
plt.suptitle("")

# Split the classes up so we can set colors, size, labels
cond = df['label'] == 'good'
good = df[cond]
bad = df[~cond]
plt.scatter(good['cmd count'], good['cmd size'], s=140, c='#aaaaff', label='Good', alpha=.4)
plt.scatter(bad['cmd count'], bad['cmd size'], s=40, c='r', label='Bad', alpha=.5)
plt.legend()
plt.xlabel('Command Count')
plt.ylabel('Command Size')
plt.title('Command Size Vs Command Count')

df.boxplot('entropy', 'label')
plt.xlabel('bad vs. good files')
plt.ylabel('Entropy')
plt.title('Comparison of Entropy')
plt.suptitle('')

df.boxplot('number of segments', 'label')
plt.xlabel('bad vs. good files')
plt.ylabel('Number of Segments')
plt.title('Comparison of Number of Segments')
plt.suptitle('')

cond = df['label'] == 'good'
good = df[cond]
bad = df[~cond]
plt.scatter(good['entropy'], good['number of segments'], s=140, c='#aaaaff', label='Good', alpha=.4)
plt.scatter(bad['entropy'], bad['number of segments'], s=40, c='r', label='Bad', alpha=.5)
plt.legend()
plt.xlabel('Entropy')
plt.ylabel('Number of Segments')
plt.title('Number of Segments Vs Entropy')

# List of feature vectors (scikit learn uses 'X' for the matrix of feature vectors)
X_cmd = df.as_matrix(['cmd count', 'cmd size'])

# Labels (scikit learn uses 'y' for classification labels)
y = np.array(df['label'].tolist())

# Random Forest is a popular ensemble machine learning classifier.
# http://scikit-learn.org/dev/modules/generated/sklearn.ensemble.RandomForestClassifier.html
#
import sklearn.ensemble
import sklearn.cross_validation  # also needed for cross_val_score below
clf_cmd = sklearn.ensemble.RandomForestClassifier(n_estimators=50)

# Now we can use scikit learn's cross validation to assess predictive performance.
scores = sklearn.cross_validation.cross_val_score(clf_cmd, X_cmd, y, cv=10)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
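# The mean accuracy above can hide a lot of fold-to-fold variance, so it's worth
# eyeballing the individual folds as well. A minimal sketch using the `scores`
# array already computed above (sklearn.cross_validation is the deprecated API
# this notebook was written against; newer releases moved these helpers to
# sklearn.model_selection).
for fold, fold_score in enumerate(scores):
    print 'Fold %d accuracy: %0.3f' % (fold, fold_score)
print 'Spread: %0.3f (min) to %0.3f (max)' % (scores.min(), scores.max())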
my_seed = 1022
my_tsize = .2
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_cmd, y, test_size=my_tsize, random_state=my_seed)
clf_cmd.fit(X_train, y_train)
clf_cmd_scores = clf_cmd.score(X_test, y_test)
print("Accuracy: %0.2f" % clf_cmd_scores)

y_pred = clf_cmd.predict(X_test)
from sklearn.metrics import confusion_matrix
labels = ['good', 'bad']
cm = confusion_matrix(y_test, y_pred, labels)
plot_cm(cm, labels)

clf_2 = sklearn.ensemble.RandomForestClassifier(n_estimators=50)
X_2 = df.as_matrix(['number of segments', 'entropy'])
X_train, X_test, y_train, y_test = train_test_split(X_2, y, test_size=my_tsize, random_state=my_seed)
clf_2.fit(X_train, y_train)
y_pred = clf_2.predict(X_test)
from sklearn.metrics import confusion_matrix
labels = ['good', 'bad']
cm = confusion_matrix(y_test, y_pred, labels)
plot_cm(cm, labels)

clf_all = sklearn.ensemble.RandomForestClassifier(n_estimators=50)
no_label = list(df.columns.values)
no_label.remove('label')
no_label.remove('cputype')
X = df.as_matrix(no_label)

# 80/20 Split for predictive test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=my_tsize, random_state=my_seed)
clf_all.fit(X_train, y_train)
y_pred = clf_all.predict(X_test)
cm = confusion_matrix(y_test, y_pred, labels)
plot_cm(cm, labels)

# Feature Selection
# Which features best differentiated the two classes?
# Here we're going to grab the feature_importances from the classifier itself.
importances = zip(no_label, clf_all.feature_importances_)
importances.sort(key=lambda k: k[1], reverse=True)
for im in importances[0:20]:
    print im[0].ljust(30), im[1]

df_good['LC_UNIXTHREAD'].value_counts().plot(kind='bar', label='good')
plt.legend()
df_bad['LC_UNIXTHREAD'].value_counts().plot(kind='bar')

clf_some = sklearn.ensemble.RandomForestClassifier(n_estimators=50)
some = list(df.columns.values)
some.remove('label')
some.remove('cputype')
some.remove('LC_UNIXTHREAD')
some.remove('LC_MAIN')
X_some = df.as_matrix(some)

# 80/20 Split for predictive test
X_train_some, X_test_some, y_train_some, y_test_some = train_test_split(X_some, y, test_size=my_tsize, random_state=my_seed)
clf_some.fit(X_train_some, y_train_some)
y_pred_some = clf_some.predict(X_test_some)
cm = confusion_matrix(y_test_some, y_pred_some, labels)
plot_cm(cm, labels)

importances = zip(some, clf_some.feature_importances_)
importances.sort(key=lambda k: k[1], reverse=True)
for im in importances[0:20]:
    print im[0].ljust(30), im[1]

symbol_strings = []
for strings, label in zip(df_all['symbol table strings'], df_all['label']):
    for symbol in strings:
        symbol_strings.append({'symbol string': symbol, 'label': label})
pd_symbols = pd.DataFrame.from_records(symbol_strings)
pd_symbols.head()
import sys
sys.path.insert(0, '..')
import data_hacking.simple_stats as ss

# Spin up our g_test class
g_test = ss.GTest()

# Here we'd like to see how strongly various strings are associated with being clean or malware.
df_ct, df_cd, df_symbol_stats = g_test.highest_gtest_scores(pd_symbols['symbol string'], pd_symbols['label'],
                                                            N=0, matches=0, min_volume=5)
df_symbol_stats.dropna(inplace=True)
df_symbol_stats.sort('bad_g', ascending=0).head(25)
df_symbol_stats.sort('good_g', ascending=0).head(25)

def tokenize_string(string):
    # Collapse compiler-generated name families into single tokens
    if '___cxx_global_var_init' in string:
        return '___cxx_global_var_init_TOKEN'
    elif '__ZN12_GLOBAL' in string:
        return '__ZN12_GLOBAL_TOKEN'
    else:
        return string

pd_symbols['tokenized string'] = pd_symbols['symbol string'].map(lambda x: tokenize_string(x))
df_ct, df_cd, df_symbol_stats = g_test.highest_gtest_scores(pd_symbols['tokenized string'], pd_symbols['label'],
                                                            N=10, matches=5, min_volume=5)
df_symbol_stats.dropna(inplace=True)
df_symbol_stats.sort('bad_g', ascending=0).head(10)

def g_aggregate(df_stats, sequence, name):
    try:
        g_scores = [df_stats.ix[tokenize_string(item)][name] for item in sequence]
    except KeyError:
        return 0
    return sum(g_scores) / len(g_scores) if g_scores else 0

# Average
df_all['symbol table strings malicious_g'] = \
    df_all['symbol table strings'].map(lambda x: g_aggregate(df_symbol_stats, x, 'bad_g'))
df_all['symbol table strings clean_g'] = \
    df_all['symbol table strings'].map(lambda x: g_aggregate(df_symbol_stats, x, 'good_g'))

segment_names = []
for strings, label in zip(df_all['segment names'], df_all['label']):
    for name in strings:
        segment_names.append({'segment names': name, 'label': label})
pd_segment_names = pd.DataFrame.from_records(segment_names)
pd_segment_names.head()

# Spin up our g_test class
g_test = ss.GTest()
df_ct, df_cd, df_segment_stats = g_test.highest_gtest_scores(pd_segment_names['segment names'], pd_segment_names['label'],
                                                             N=0, matches=0, min_volume=0)
df_segment_stats.dropna(inplace=True)
df_segment_stats.sort('bad_g', ascending=0).head(10)

s = []
for strings, label in zip(df_all['section names'], df_all['label']):
    for name in strings:
        s.append({'section names': name, 'label': label})
pd_section_names = pd.DataFrame.from_records(s)
print pd_section_names.shape
pd_section_names.head()
# Spin up our g_test class
g_test = ss.GTest()

# Here we'd like to see how strongly various section names are associated with being clean or malware.
df_ct, df_cd, df_section_stats = g_test.highest_gtest_scores(pd_section_names['section names'], pd_section_names['label'],
                                                             N=0, matches=0, min_volume=5)
df_section_stats.dropna(inplace=True)
df_section_stats.sort('bad_g', ascending=0).head(10)
df_section_stats.sort('good_g', ascending=0).head(10)

df_all['segment names malicious_g'] = df_all['segment names'].map(lambda x: g_aggregate(df_segment_stats, x, 'bad_g'))
df_all['segment names clean_g'] = df_all['segment names'].map(lambda x: g_aggregate(df_segment_stats, x, 'good_g'))
df_all['section names malicious_g'] = df_all['section names'].map(lambda x: g_aggregate(df_section_stats, x, 'bad_g'))
df_all['section names clean_g'] = df_all['section names'].map(lambda x: g_aggregate(df_section_stats, x, 'good_g'))

g_test = ss.GTest()
df_entrypoint = pd.DataFrame(df_all, columns=['entry point section name', 'entry point segment name',
                                              'entry point instruction type', 'label'])
df_entrypoint = df_entrypoint.replace(0, np.nan)
df_entrypoint.dropna(inplace=True)
df_entrypoint.head()

df_ct, df_cd, df_ep_section_stats = g_test.highest_gtest_scores(df_entrypoint['entry point section name'],
                                                                df_entrypoint['label'], N=0, matches=0, min_volume=5)
df_ct, df_cd, df_ep_segment_stats = g_test.highest_gtest_scores(df_entrypoint['entry point segment name'],
                                                                df_entrypoint['label'], N=0, matches=0, min_volume=5)
df_ct, df_cd, df_ep_instruction_stats = g_test.highest_gtest_scores(df_entrypoint['entry point instruction type'],
                                                                    df_entrypoint['label'], N=0, matches=0, min_volume=5)
df_ep_section_stats.head()
df_ep_segment_stats.head()
df_ep_instruction_stats.dropna(inplace=True)
df_ep_instruction_stats.sort('bad_g', ascending=0).head(20)
df_entrypoint['entry point instruction type'].value_counts().plot(kind='bar')

clf_final = sklearn.ensemble.RandomForestClassifier(n_estimators=50)
final_labels = list(df_all.columns.values)
final_labels.remove('label')
final_labels.remove('cputype')
final_labels.remove('LC_UNIXTHREAD')
final_labels.remove('LC_MAIN')
final_labels.remove('const strings')
final_labels.remove('symbol table strings')
final_labels.remove('segment names')
final_labels.remove('section names')
final_labels.remove('entry point section name')
final_labels.remove('entry point segment name')
final_labels.remove('entry point instruction type')
X_final = df_all.as_matrix(final_labels)
y_final = np.array(df_all['label'].tolist())

scores = sklearn.cross_validation.cross_val_score(clf_final, X_final, y_final, cv=10, n_jobs=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

# 80/20 Split for predictive test
X_train_final, X_test_final, y_train_final, y_test_final = train_test_split(X_final, y_final,
                                                                            test_size=my_tsize, random_state=my_seed)
clf_final.fit(X_train_final, y_train_final)
y_pred_final = clf_final.predict(X_test_final)
cm_final = confusion_matrix(y_test_final, y_pred_final, labels)
plot_cm(cm_final, labels)

# Column 0 of predict_proba corresponds to the first class alphabetically ('bad')
y_probs_final = clf_final.predict_proba(X_test_final)[:, 0]
thres = .8  # This can be set to whatever you'd like
y_pred_final[y_probs_final < thres] = 'good'
y_pred_final[y_probs_final >= thres] = 'bad'
cm = confusion_matrix(y_test_final, y_pred_final, labels)
plot_cm(cm, labels)
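# Rather than eyeballing a single threshold, an ROC curve shows the trade-off at
# every threshold. A minimal sketch with sklearn.metrics.roc_curve; it needs
# binary indicators, so we map 'bad' to 1 (treating 'bad' as the positive class
# is an assumption of this sketch, consistent with the y_probs_final column above).
from sklearn.metrics import roc_curve, auc
y_true_binary = (y_test_final == 'bad').astype(int)
fpr, tpr, thresholds = roc_curve(y_true_binary, y_probs_final)
plt.plot(fpr, tpr, label='ROC curve (AUC = %0.3f)' % auc(fpr, tpr))
plt.plot([0, 1], [0, 1], 'k--')  # chance line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC for the full-feature classifier')
plt.legend(loc='lower right')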
importances = zip(final_labels, clf_final.feature_importances_)
importances.sort(key=lambda k: k[1], reverse=True)
for im in importances[0:20]:
    print im[0].ljust(35), im[1]

clf_most = sklearn.ensemble.RandomForestClassifier(n_estimators=50)
X = df_all.as_matrix([item[0] for item in importances[:20]])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=my_tsize, random_state=my_seed)
clf_most.fit(X_train, y_train)
y_pred = clf_most.predict(X_test)
cm = confusion_matrix(y_test, y_pred, labels)
plot_cm(cm, labels)

y_probs = clf_most.predict_proba(X_test)[:, 0]
thres = .92  # This can be set to whatever you'd like
y_pred[y_probs < thres] = 'good'
y_pred[y_probs >= thres] = 'bad'
cm = confusion_matrix(y_test, y_pred, labels)
plot_cm(cm, labels)

clf_most = sklearn.ensemble.ExtraTreesClassifier(n_estimators=50)
X = df_all.as_matrix([item[0] for item in importances[:20]])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=my_tsize, random_state=my_seed)
clf_most.fit(X_train, y_train)
y_pred = clf_most.predict(X_test)
cm = confusion_matrix(y_test, y_pred, labels)
plot_cm(cm, labels)
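# A single train/test split can be noisy when comparing two classifiers, so as a
# sanity check, cross-validate both on the same top-20 feature matrix. A minimal
# sketch using the same deprecated cross_validation API as above:
for name, clf in [('RandomForest', sklearn.ensemble.RandomForestClassifier(n_estimators=50)),
                  ('ExtraTrees', sklearn.ensemble.ExtraTreesClassifier(n_estimators=50))]:
    cv_scores = sklearn.cross_validation.cross_val_score(clf, X, y, cv=5)
    print '%s: %0.3f (+/- %0.3f)' % (name, cv_scores.mean(), cv_scores.std() * 2)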
df_all.boxplot(column='rebase_off', by='label')
plt.xlabel('bad vs. good files')
plt.ylabel('rebase_off')
plt.title('Comparison of rebase offset')
plt.suptitle("")

df_all.boxplot(column='rebase_off', by='label', sym='')
plt.xlabel('bad vs. good files')
plt.ylabel('rebase_off')
plt.title('Comparison of rebase offset')
plt.suptitle("")

cond = df_all['label'] == 'good'
good_all = df_all[cond]
bad_all = df_all[~cond]
bvc = pd.DataFrame(bad_all['rebase_off'].value_counts(), columns=['count'])
bvc['percentage'] = bvc['count'] / float(bad_all.shape[0])
gvc = pd.DataFrame(good_all['rebase_off'].value_counts(), columns=['count'])
gvc['percentage'] = gvc['count'] / float(good_all.shape[0])
print 'Most common rebase_off values from malicious set'
print bvc.head(5)
print ''
print 'Most common rebase_off values from clean set'
print gvc.head()

bad_all['rebase_off'].hist(alpha=1, label='bad', bins=1000)
good_all['rebase_off'].hist(alpha=1, label='good', bins=1000)
plt.title('Values for rebase offset')
plt.legend()
plt.xlim(-50, 3000000)

bad_all['rebase_off'].hist(alpha=1, label='bad', bins=2000)
good_all['rebase_off'].hist(alpha=1, label='good', bins=2000)
plt.legend()
plt.title('Values for rebase offset')
plt.xlim(-50, 80000)

df_all.boxplot(column='rebase_size', by='label')
plt.xlabel('bad vs. good files')
plt.ylabel('rebase_size')
plt.title('Comparison of rebase size')
plt.suptitle("")

df_all.boxplot(column='rebase_size', by='label', sym='')
plt.xlabel('bad vs. good files')
plt.ylabel('rebase_size')
plt.title('Comparison of rebase size')
plt.suptitle("")

bvc = pd.DataFrame(bad_all['rebase_size'].value_counts(), columns=['count'])
bvc['percentage'] = bvc['count'] / float(bad_all.shape[0])
gvc = pd.DataFrame(good_all['rebase_size'].value_counts(), columns=['count'])
gvc['percentage'] = gvc['count'] / float(good_all.shape[0])
print 'Most common rebase_size values from malicious set'
print bvc.head(5)
print ''
print 'Most common rebase_size values from clean set'
print gvc.head(5)

plt.scatter(good_all['rebase_off'], good_all['rebase_size'], s=140, c='#aaaaff', label='Good', alpha=.4)
plt.scatter(bad_all['rebase_off'], bad_all['rebase_size'], s=40, c='r', label='Bad', alpha=.5)
plt.title('rebase_off vs rebase_size')
plt.xlabel('rebase_off')
plt.ylabel('rebase_size')

plt.scatter(good_all['rebase_off'], good_all['rebase_size'], s=140, c='#aaaaff', label='Good', alpha=.4)
plt.scatter(bad_all['rebase_off'], bad_all['rebase_size'], s=40, c='r', label='Bad', alpha=.5)
plt.xlim(-1000, 150000)
plt.ylim(-100, 1100)
plt.title('rebase_off vs rebase_size')
plt.xlabel('rebase_off')
plt.ylabel('rebase_size')

plt.scatter(good_all['rebase_off'], good_all['rebase_size'], s=140, c='#aaaaff', label='Good', alpha=.4)
plt.scatter(bad_all['rebase_off'], bad_all['rebase_size'], s=40, c='r', label='Bad', alpha=.5)
plt.xlim(-10, 10)
plt.ylim(-10, 10)
plt.title('rebase_off vs rebase_size')
plt.xlabel('rebase_off')
plt.ylabel('rebase_size')

def load_vt_data(vt_list):
    import json
    results = {}
    for filename in vt_list:
        with open(filename, 'rb') as f:
            vt_data = f.read()
            if len(vt_data) == 0:
                continue  # skip empty result files instead of aborting the whole loop
            vt = json.loads(vt_data)
            for engine in vt['scans'].keys():
                if engine not in results:
                    results[engine] = 0
                if vt['scans'][engine]['detected']:
                    results[engine] += 1
    r = []
    for key, value in results.iteritems():
        r.append({'Engine': key, 'Count': value})
    return r

vt_list = glob.glob('data/bad/vt_data/*.vtdata')
num_files = len(vt_list)
vt_results = load_vt_data(vt_list)
vt_df = pd.DataFrame.from_records(vt_results, columns=['Engine', 'Count'])
vt_df['Files Scanned'] = num_files
vt_df['Percentage'] = vt_df['Count'] / float(num_files)
vt_df.sort('Count', ascending=0).head(52)

vt_list = glob.glob('data/good/vt_data/*.vtdata')
num_files = len(vt_list)
vt_results = load_vt_data(vt_list)
vt_df = pd.DataFrame.from_records(vt_results, columns=['Engine', 'Count'])
vt_df['Files Scanned'] = num_files
vt_df['Percentage'] = vt_df['Count'] / float(num_files)
vt_df.sort('Count', ascending=0).head(15)
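# A quick visual of the same VirusTotal coverage table -- here vt_df holds the
# clean set, so any bar is a false positive by that engine. A minimal sketch
# using the old pandas sort/plot APIs this notebook already uses:
vt_df.sort('Percentage', ascending=0).head(15).set_index('Engine')['Percentage'].plot(kind='bar')
plt.title('Per-engine detection rate on the clean set')
plt.ylabel('Fraction of files flagged')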