import os
import sklearn.feature_extraction
sklearn.__version__
'0.14.1'
import pandas as pd
pd.__version__
'0.13.1'
import numpy as np
np.__version__
'1.8.0'
# Plotting defaults
import matplotlib.pyplot as plt
%matplotlib inline
# Every figure created after this point gets a wide, short canvas and a larger font
plt.rcParams['figure.figsize'] = (16.0, 5.0)
plt.rcParams['font.size'] = 18.0
def plot_cm(cm, labels):
    """Print per-cell statistics for a confusion matrix and plot it as a heat map.

    Args:
        cm: Square 2-D array of counts, indexed ``cm[true][predicted]``
            (rows are drawn on the 'True' axis, columns on 'Predicted').
        labels: Sequence of class names, one per row/column of ``cm``, in
            the same order as the rows/columns.

    Each cell is reported as a percentage of its row total. A row whose
    total is zero is reported as 0% rather than NaN.
    """
    cm = np.asarray(cm)
    labels = list(labels)  # accept tuples / arrays as well as lists

    # Compute percentages: keepdims leaves the row totals as a column vector,
    # so the division broadcasts across each row. Rows that sum to zero keep
    # the 0.0 pre-filled in `out` instead of dividing by zero.
    row_totals = cm.sum(axis=1, keepdims=True)
    percent = np.divide(cm * 100.0, row_totals,
                        out=np.zeros(cm.shape, dtype=float),
                        where=row_totals != 0)

    print('Confusion Matrix Stats')
    for i, label_i in enumerate(labels):
        for j, label_j in enumerate(labels):
            print("%s/%s: %.2f%% (%d/%d)" % (label_i, label_j, percent[i][j], cm[i][j], cm[i].sum()))

    # Show confusion matrix
    # Thanks kermit666 from stackoverflow :)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.grid(False)
    cax = ax.matshow(percent, cmap='coolwarm', vmin=0, vmax=100)
    plt.title('Confusion matrix of the classifier')
    fig.colorbar(cax)

    # Pin exactly one tick per class before labelling. Relying on the
    # automatic tick locator (and padding the labels with '') misaligns the
    # names as soon as the locator chooses a different tick spacing.
    ticks = np.arange(len(labels))
    ax.set_xticks(ticks)
    ax.set_yticks(ticks)
    ax.set_xticklabels(labels)
    ax.set_yticklabels(labels)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.show()
import os, warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
import pe_features
my_extractor = pe_features.PEFileFeatures()
# Feed the raw bytes of a single sample through the extractor to preview
# the features it produces
filename = 'data/bad/0cb9aa6fb9c4aa3afad7a303e21ac0f3'
with open(filename, 'rb') as f:
    features = my_extractor.execute(f.read())
features
{'check_sum': 0, 'compile_date': 1218437803, 'datadir_IMAGE_DIRECTORY_ENTRY_BASERELOC_size': 0, 'datadir_IMAGE_DIRECTORY_ENTRY_EXPORT_size': 0, 'datadir_IMAGE_DIRECTORY_ENTRY_IAT_size': 468, 'datadir_IMAGE_DIRECTORY_ENTRY_IMPORT_size': 100, 'datadir_IMAGE_DIRECTORY_ENTRY_RESOURCE_size': 1048, 'debug_size': 0, 'export_size': 0, 'generated_check_sum': 53913, 'iat_rva': 9256, 'major_version': 0, 'minor_version': 0, 'number_of_bound_import_symbols': 0, 'number_of_bound_imports': 0, 'number_of_export_symbols': 0, 'number_of_import_symbols': 113, 'number_of_imports': 4, 'number_of_rva_and_sizes': 16, 'number_of_sections': 4, 'pe_char': 271, 'pe_dll': 0, 'pe_driver': 0, 'pe_exe': 1, 'pe_i386': 1, 'pe_majorlink': 6, 'pe_minorlink': 0, 'pe_warnings': 0, 'sec_entropy_data': 0.4421475832668401, 'sec_entropy_rdata': 3.2064873564662046, 'sec_entropy_reloc': 0, 'sec_entropy_rsrc': 1.028676764457129, 'sec_entropy_text': 4.852962403013336, 'sec_raw_execsize': 16384, 'sec_rawptr_data': 12288, u'sec_rawptr_rdata': 8192, 'sec_rawptr_rsrc': 16384, 'sec_rawptr_text': 4096, 'sec_rawsize_data': 4096, u'sec_rawsize_rdata': 4096, 'sec_rawsize_rsrc': 4096, 'sec_rawsize_text': 4096, 'sec_va_execsize': 7044, 'sec_vasize_data': 468, u'sec_vasize_rdata': 2182, 'sec_vasize_rsrc': 1048, 'sec_vasize_text': 3346, 'size_code': 4096, 'size_image': 20480, 'size_initdata': 12288, 'size_uninit': 0, 'std_section_names': 1, 'total_size_pe': 20480, 'virtual_address': 4096, 'virtual_size': 3346, 'virtual_size_2': 2182}
# Load up all our files (files come from various places contagio, around the net...)
def load_files(file_list):
    """Return the extractor output for every path in ``file_list``, in order.

    Each file is opened in binary mode and its full contents are handed to
    the module-level ``my_extractor``.
    """
    def _extract(path):
        # Read the whole file as bytes and close it before moving on
        with open(path, 'rb') as handle:
            return my_extractor.execute(handle.read())

    return [_extract(path) for path in file_list]
# Bad (malicious) files
# os.listdir() returns entries in arbitrary order; sort them so the row order
# (and therefore which samples the seeded train/test split selects) does not
# depend on the filesystem.
file_list = [os.path.join('data/bad', child) for child in sorted(os.listdir('data/bad'))]
bad_features = load_files(file_list)
print('Loaded up %d malicious PE Files' % len(bad_features))
Loaded up 50 malicious PE Files
# Good (benign) files
# os.listdir() returns entries in arbitrary order; sort them so the row order
# (and therefore which samples the seeded train/test split selects) does not
# depend on the filesystem.
file_list = [os.path.join('data/good', child) for child in sorted(os.listdir('data/good'))]
good_features = load_files(file_list)
print('Loaded up %d benign PE Files' % len(good_features))
Loaded up 50 benign PE Files
# Putting the features into a pandas dataframe
import pandas as pd
# One row per loaded file; each frame is tagged with its class in a 'label' column
df_good = pd.DataFrame.from_records(good_features)
df_good['label'] = 'good'
df_bad = pd.DataFrame.from_records(bad_features)
df_bad['label'] = 'bad'
# Peek at the first few benign rows
df_good.head()
check_sum | compile_date | datadir_IMAGE_DIRECTORY_ENTRY_BASERELOC_size | datadir_IMAGE_DIRECTORY_ENTRY_EXPORT_size | datadir_IMAGE_DIRECTORY_ENTRY_IAT_size | datadir_IMAGE_DIRECTORY_ENTRY_IMPORT_size | datadir_IMAGE_DIRECTORY_ENTRY_RESOURCE_size | debug_size | export_size | generated_check_sum | iat_rva | major_version | minor_version | number_of_bound_import_symbols | number_of_bound_imports | number_of_export_symbols | number_of_import_symbols | number_of_imports | number_of_rva_and_sizes | number_of_sections | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 97308 | 1383744221 | 3044 | 0 | 592 | 140 | 7368 | 28 | 0 | 97308 | 50424 | 0 | 0 | 0 | 0 | 0 | 142 | 6 | 16 | 5 | ... |
1 | 103233 | 1383102953 | 60 | 0 | 1008 | 60 | 872 | 28 | 0 | 103233 | 53248 | 5 | 1 | 0 | 0 | 0 | 124 | 2 | 16 | 8 | ... |
2 | 26573 | 1386271379 | 360 | 0 | 208 | 100 | 2588 | 28 | 0 | 25971 | 8804 | 0 | 0 | 0 | 0 | 0 | 48 | 4 | 16 | 5 | ... |
3 | 0 | 1373925025 | 12 | 0 | 8 | 83 | 11904 | 28 | 0 | 54015 | 35064 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 16 | 4 | ... |
4 | 50003 | 1378865704 | 360 | 0 | 208 | 100 | 2588 | 28 | 0 | 59485 | 8804 | 0 | 0 | 0 | 0 | 0 | 48 | 4 | 16 | 5 | ... |
5 rows × 108 columns
# Now we're set and we open up a whole new world!
# Gisting and statistics: per-column summary (count, mean, std, min,
# quartiles, max) of the malicious samples
df_bad.describe()
check_sum | compile_date | datadir_IMAGE_DIRECTORY_ENTRY_BASERELOC_size | datadir_IMAGE_DIRECTORY_ENTRY_EXPORT_size | datadir_IMAGE_DIRECTORY_ENTRY_IAT_size | datadir_IMAGE_DIRECTORY_ENTRY_IMPORT_size | datadir_IMAGE_DIRECTORY_ENTRY_RESOURCE_size | debug_size | export_size | generated_check_sum | iat_rva | major_version | minor_version | number_of_bound_import_symbols | number_of_bound_imports | number_of_export_symbols | number_of_import_symbols | number_of_imports | number_of_rva_and_sizes | number_of_sections | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 50.000000 | 5.000000e+01 | 50.000000 | 50.000000 | 50.000000 | 50.000000 | 50.00000 | 50.000000 | 50.000000 | 50.000000 | 50.000000 | 50.000000 | 50.000000 | 50.000000 | 50.000000 | 50.000000 | 50.000000 | 50.000000 | 50 | 50.00000 | ... |
mean | 25235.660000 | 1.035770e+09 | 415.280000 | 14.640000 | 126.720000 | 456.160000 | 9615.64000 | 3.920000 | 14.640000 | 86998.520000 | 43982.640000 | 0.960000 | 0.120000 | 0.140000 | 0.740000 | 0.240000 | 44.560000 | 3.740000 | 16 | 4.32000 | ... |
std | 45704.015095 | 3.202979e+08 | 1061.159532 | 55.908365 | 180.722252 | 1060.814846 | 19062.02003 | 9.814275 | 55.908365 | 30119.209943 | 44546.311213 | 2.137708 | 0.328261 | 0.404566 | 2.028471 | 0.893514 | 46.412595 | 3.445257 | 0 | 1.75476 | ... |
min | 0.000000 | 2.099200e+06 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 26104.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 16 | 1.00000 | ... |
25% | 0.000000 | 9.372855e+08 | 0.000000 | 0.000000 | 0.000000 | 40.000000 | 0.00000 | 0.000000 | 0.000000 | 68094.000000 | 14593.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 9.250000 | 1.000000 | 16 | 3.00000 | ... |
50% | 0.000000 | 1.172916e+09 | 0.000000 | 0.000000 | 44.000000 | 100.000000 | 1152.00000 | 0.000000 | 0.000000 | 82579.000000 | 25835.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 26.000000 | 2.500000 | 16 | 4.00000 | ... |
75% | 36417.000000 | 1.219691e+09 | 14.000000 | 0.000000 | 231.000000 | 186.000000 | 5938.00000 | 0.000000 | 0.000000 | 108406.000000 | 55948.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 70.500000 | 5.000000 | 16 | 5.00000 | ... |
max | 150326.000000 | 1.382647e+09 | 4612.000000 | 313.000000 | 748.000000 | 6234.000000 | 84152.00000 | 28.000000 | 313.000000 | 164776.000000 | 189824.000000 | 10.000000 | 1.000000 | 2.000000 | 8.000000 | 4.000000 | 180.000000 | 18.000000 | 16 | 9.00000 | ... |
8 rows × 199 columns
# Visualization I: overlay the 'check_sum' histograms of both classes
# (semi-transparent so the overlap stays visible)
df_bad['check_sum'].hist(bins=40, alpha=.5, label='bad')
df_good['check_sum'].hist(bins=40, alpha=.5, label='good')
plt.legend()
<matplotlib.legend.Legend at 0x110fd4b10>
# Visualization II: overlay the 'generated_check_sum' histograms of both classes
# (semi-transparent so the overlap stays visible)
df_bad['generated_check_sum'].hist(bins=40, alpha=.5, label='bad')
df_good['generated_check_sum'].hist(bins=40, alpha=.5, label='good')
plt.legend()
<matplotlib.legend.Legend at 0x111e38f50>
# Concatenate the info into a big pile!
# Rows get a fresh 0..N-1 index; any NaN cells (presumably from feature
# columns present in only one of the two frames) are replaced with 0.
df = pd.concat([df_bad, df_good], ignore_index=True).replace(np.nan, 0)
# Boxplots show you the distribution of the data (spread).
# http://en.wikipedia.org/wiki/Box_plot
# One box per class for the number of import symbols
df.boxplot(column='number_of_import_symbols', by='label')
plt.title('Comparision of # Import Symbols')
plt.xlabel('bad vs. good files')
plt.ylabel('# Import Symbols')
# Blank the figure-level title (presumably the automatic "grouped by" heading)
plt.suptitle("")
<matplotlib.text.Text at 0x11089fb10>
# One box per class for the number of PE sections
df.boxplot(column='number_of_sections', by='label')
plt.title('Comparision of Number of Sections')
plt.xlabel('bad vs. good files')
plt.ylabel('Num Sections')
# Blank the figure-level title (presumably the automatic "grouped by" heading)
plt.suptitle("")
<matplotlib.text.Text at 0x1108a3850>
# Split the classes up so we can set colors, size, labels
cond = df['label'] == 'good'
good, bad = df[cond], df[~cond]
# Benign points: large and pale. Malicious points: small and red, drawn
# afterwards so they sit on top.
plt.scatter(good['number_of_import_symbols'], good['number_of_sections'],
            label='Good', c='#aaaaff', s=140, alpha=.4)
plt.scatter(bad['number_of_import_symbols'], bad['number_of_sections'],
            label='Bad', c='r', s=40, alpha=.5)
plt.legend()
plt.xlabel('Import Symbols')
plt.ylabel('Num Sections')
<matplotlib.text.Text at 0x110914f50>
# In preparation for using scikit learn we're just going to use
# some handles that help take us from pandas land to scikit land
# List of feature vectors (scikit learn uses 'X' for the matrix of feature vectors)
# DataFrame.as_matrix() was deprecated and later removed from pandas;
# selecting the columns and reading .values gives the same ndarray on both
# old and new pandas versions.
X = df[['number_of_import_symbols', 'number_of_sections']].values
# Labels (scikit learn uses 'y' for classification labels)
y = np.array(df['label'].tolist())
# Random Forest is a popular ensemble machine learning classifier.
# http://scikit-learn.org/dev/modules/generated/sklearn.ensemble.RandomForestClassifier.html
#
import sklearn.ensemble
# Import the submodule explicitly: `sklearn.cross_validation` is only
# reachable as an attribute of the package if something has already
# imported it, which is not guaranteed at this point.
import sklearn.cross_validation
# compute_importances is no longer passed: since scikit-learn 0.14 the
# argument is deprecated and feature_importances_ is available without it.
clf = sklearn.ensemble.RandomForestClassifier(n_estimators=50)
# Now we can use scikit learn's cross validation to assess predictive performance.
# 5 folds, evaluated with 4 parallel jobs; prints one score per fold.
scores = sklearn.cross_validation.cross_val_score(clf, X, y, cv=5, n_jobs=4)
print(scores)
[ 0.8 0.75 0.7 0.65 0.78947368]
# Typically you train/test on an 80% / 20% split, meaning you train on 80%
# of the data and test against the remaining 20%. In this exercise we have
# so FEW samples (50 good/50 bad) that if we're going to play around with
# predictive performance it's more meaningful to train on 60% of the data
# and test against the remaining 40%.
from sklearn.cross_validation import train_test_split
my_seed = 123
my_tsize = .4  # 40%
# Fixed seed so the same rows are held out on every run
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=my_tsize, random_state=my_seed)
# fit() returns the estimator itself, so training and prediction can be chained
y_pred = clf.fit(X_train, y_train).predict(X_test)
# Now plot the results of the 60/40 split in a confusion matrix
from sklearn.metrics import confusion_matrix
# Fix the row/column order of the matrix: 'good' first, then 'bad'
labels = ['good', 'bad']
cm = confusion_matrix(y_test, y_pred, labels=labels)
plot_cm(cm, labels)
Confusion Matrix Stats good/good: 72.73% (16/22) good/bad: 27.27% (6/22) bad/good: 33.33% (6/18) bad/bad: 66.67% (12/18)
Here we're going to explore some of the ways you can adjust the 'knobs' associated with either the feature input into your ML algorithm or the prediction probability methods that many classes in scikit-learn have.
# Okay now try putting in ALL the features (except the label, which would be cheating :)
no_label = list(df.columns.values)
no_label.remove('label')
# DataFrame.as_matrix() was deprecated and later removed from pandas;
# column selection + .values is the equivalent that works on old and new versions.
X = df[no_label].values
# 60/40 Split for predictive test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=my_tsize, random_state=my_seed)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# Pass labels by keyword so the row/column order is explicit
cm = confusion_matrix(y_test, y_pred, labels=labels)
plot_cm(cm, labels)
Confusion Matrix Stats good/good: 95.45% (21/22) good/bad: 4.55% (1/22) bad/good: 5.56% (1/18) bad/bad: 94.44% (17/18)
# Feature Selection
# Which features best differentiated the two classes?
# Here we're going to grab the feature_importances from the classifier itself,
# you can also use a Chi Squared Test sklearn.feature_selection.SelectKBest(chi2)
# Pair each column name with its importance and rank from most to least important
importances = sorted(zip(no_label, clf.feature_importances_),
                     key=lambda pair: pair[1], reverse=True)
importances[:10]
[('compile_date', 0.087118104042058525), ('pe_majorlink', 0.059725488127989876), (u'sec_rawptr_reloc', 0.059172331241524503), ('debug_size', 0.036744505259105907), (u'sec_vasize_reloc', 0.035061138659616312), ('datadir_IMAGE_DIRECTORY_ENTRY_RESOURCE_size', 0.033765884515823991), (u'sec_rawptr_rdata', 0.033184786235755895), ('datadir_IMAGE_DIRECTORY_ENTRY_IAT_size', 0.03261159140279881), ('pe_char', 0.030081949321901769), ('sec_rawsize_text', 0.028226654611623891)]
# Produce an X matrix with only the most important features
# DataFrame.as_matrix() was deprecated and later removed from pandas;
# column selection + .values is the equivalent that works on old and new versions.
X = df[[item[0] for item in importances[:10]]].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=my_tsize, random_state=my_seed)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# Pass labels by keyword so the row/column order is explicit
cm = confusion_matrix(y_test, y_pred, labels=labels)
plot_cm(cm, labels)
Confusion Matrix Stats good/good: 95.45% (21/22) good/bad: 4.55% (1/22) bad/good: 0.00% (0/18) bad/bad: 100.00% (18/18)
# Compute the prediction probabilities and use them to minimize our false positives
# Note: This is simply a trade off, it means we'll miss a few of the malicious
# ones but typically false alarms are a death blow to any new 'fancy stuff' so
# we definitely want to minimize the false alarms.
# predict_proba columns follow the order of clf.classes_, so look up the
# column for 'bad' instead of assuming it is column 0.
bad_column = list(clf.classes_).index('bad')
y_probs = clf.predict_proba(X_test)[:, bad_column]
thres = .8 # This can be set to whatever you'd like
# Only call a sample 'bad' when the classifier is at least `thres` confident
y_pred = np.where(y_probs >= thres, 'bad', 'good')
cm = confusion_matrix(y_test, y_pred, labels=labels)
plot_cm(cm, labels)
Confusion Matrix Stats good/good: 100.00% (22/22) good/bad: 0.00% (0/22) bad/good: 16.67% (3/18) bad/bad: 83.33% (15/18)