import os
import sklearn.feature_extraction
sklearn.__version__
import pandas as pd
pd.__version__
import numpy as np
np.__version__

# Plotting defaults
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['font.size'] = 18.0
plt.rcParams['figure.figsize'] = 16.0, 5.0

def plot_cm(cm, labels):
    # Compute row percentages (each row sums to 100%)
    percent = cm * 100.0 / cm.sum(axis=1)[:, np.newaxis]
    print 'Confusion Matrix Stats'
    for i, label_i in enumerate(labels):
        for j, label_j in enumerate(labels):
            print "%s/%s: %.2f%% (%d/%d)" % (label_i, label_j, percent[i][j], cm[i][j], cm[i].sum())

    # Show confusion matrix
    # Thanks kermit666 from stackoverflow :)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.grid(b=False)
    cax = ax.matshow(percent, cmap='coolwarm', vmin=0, vmax=100)
    plt.title('Confusion matrix of the classifier')
    fig.colorbar(cax)
    ax.set_xticklabels([''] + labels)
    ax.set_yticklabels([''] + labels)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.show()

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
import pe_features

my_extractor = pe_features.PEFileFeatures()

# Open a PE file and see what features we get
filename = 'data/bad/0cb9aa6fb9c4aa3afad7a303e21ac0f3'
with open(filename, 'rb') as f:
    features = my_extractor.execute(f.read())
features

# Load up all our files (files come from various places: Contagio, around the net, ...)
def load_files(file_list):
    features_list = []
    for filename in file_list:
        with open(filename, 'rb') as f:
            features_list.append(my_extractor.execute(f.read()))
    return features_list

# Bad (malicious) files
file_list = [os.path.join('data/bad', child) for child in os.listdir('data/bad')]
bad_features = load_files(file_list)
print 'Loaded up %d malicious PE files' % len(bad_features)

# Good (benign) files
file_list = [os.path.join('data/good', child) for child in os.listdir('data/good')]
good_features = load_files(file_list)
print 'Loaded up %d benign PE files' % len(good_features)

# Put the features into a pandas dataframe
df_bad = pd.DataFrame.from_records(bad_features)
df_bad['label'] = 'bad'
df_good = pd.DataFrame.from_records(good_features)
df_good['label'] = 'good'
df_good.head()

# Now we're set and we open up a whole new world!
# Gisting and statistics
df_bad.describe()

# Visualization I
df_bad['check_sum'].hist(alpha=.5, label='bad', bins=40)
df_good['check_sum'].hist(alpha=.5, label='good', bins=40)
plt.legend()

# Visualization II
df_bad['generated_check_sum'].hist(alpha=.5, label='bad', bins=40)
df_good['generated_check_sum'].hist(alpha=.5, label='good', bins=40)
plt.legend()

# Concatenate the info into one big pile!
df = pd.concat([df_bad, df_good], ignore_index=True)
df.replace(np.nan, 0, inplace=True)

# Boxplots show you the distribution of the data (spread).
# http://en.wikipedia.org/wiki/Box_plot
# Get some quick summary stats and plot them!
df.boxplot('number_of_import_symbols', 'label')
plt.xlabel('bad vs. good files')
plt.ylabel('# Import Symbols')
plt.title('Comparison of # Import Symbols')
plt.suptitle("")
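# (Aside) If you want the numbers behind the boxplots, a per-class summary is
# one groupby away. A minimal sketch, using the df built above; handy for
# sanity-checking what the plots show:
for feature in ['number_of_import_symbols', 'number_of_sections']:
    print 'Per-class summary stats for %s:' % feature
    print df.groupby('label')[feature].describe()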
# Get some quick summary stats and plot them!
df.boxplot('number_of_sections', 'label')
plt.xlabel('bad vs. good files')
plt.ylabel('Num Sections')
plt.title('Comparison of Number of Sections')
plt.suptitle("")

# Split the classes up so we can set colors, sizes, and labels
cond = df['label'] == 'good'
good = df[cond]
bad = df[~cond]
plt.scatter(good['number_of_import_symbols'], good['number_of_sections'],
            s=140, c='#aaaaff', label='Good', alpha=.4)
plt.scatter(bad['number_of_import_symbols'], bad['number_of_sections'],
            s=40, c='r', label='Bad', alpha=.5)
plt.legend()
plt.xlabel('Import Symbols')
plt.ylabel('Num Sections')

# In preparation for using scikit-learn we're just going to set up
# some handles that take us from pandas land to scikit land.
# Matrix of feature vectors (scikit-learn uses 'X' for this)
X = df.as_matrix(['number_of_import_symbols', 'number_of_sections'])
# Labels (scikit-learn uses 'y' for classification labels)
y = np.array(df['label'].tolist())

# Random Forest is a popular ensemble machine learning classifier.
# http://scikit-learn.org/dev/modules/generated/sklearn.ensemble.RandomForestClassifier.html
# (feature_importances_ is computed automatically; the old compute_importances
#  flag is deprecated and no longer needed)
import sklearn.ensemble
clf = sklearn.ensemble.RandomForestClassifier(n_estimators=50)

# Now we can use scikit-learn's cross validation to assess predictive performance.
import sklearn.cross_validation
scores = sklearn.cross_validation.cross_val_score(clf, X, y, cv=5, n_jobs=4)
print scores

# Typically you train/test on an 80%/20% split, meaning you train on 80%
# of the data and test against the remaining 20%. In the case of this
# exercise we have so FEW samples (50 good/50 bad) that if we're going
# to play around with predictive performance it's more meaningful
# to train on 60% of the data and test against the remaining 40%.
my_seed = 123
my_tsize = .4  # 40%
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=my_tsize, random_state=my_seed)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Now plot the results of the 60/40 split in a confusion matrix
from sklearn.metrics import confusion_matrix
labels = ['good', 'bad']
cm = confusion_matrix(y_test, y_pred, labels)
plot_cm(cm, labels)

# Okay, now try putting in ALL the features (except the label, which would be cheating :)
no_label = list(df.columns.values)
no_label.remove('label')
X = df.as_matrix(no_label)

# 60/40 split for the predictive test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=my_tsize, random_state=my_seed)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
cm = confusion_matrix(y_test, y_pred, labels)
plot_cm(cm, labels)

# Feature Selection
# Which features best differentiated the two classes?
# Here we grab the feature_importances_ from the classifier itself;
# you can also use a Chi Squared Test via sklearn.feature_selection.SelectKBest(chi2)
# (sketched a couple of cells below).
importances = zip(no_label, clf.feature_importances_)
importances.sort(key=lambda k: k[1], reverse=True)
importances[:10]

# Produce an X matrix with only the most important features
X = df.as_matrix([item[0] for item in importances[:10]])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=my_tsize, random_state=my_seed)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
cm = confusion_matrix(y_test, y_pred, labels)
plot_cm(cm, labels)
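# (Aside) A quick sketch of the Chi Squared alternative mentioned above.
# Note: chi2 requires non-negative feature values; that should hold here
# (counts and sizes, with NaNs already replaced by 0), but if any column
# were negative you'd need something like a MinMaxScaler pass first.
from sklearn.feature_selection import SelectKBest, chi2
selector = SelectKBest(chi2, k=10)
selector.fit(df.as_matrix(no_label), y)
kbest = [f for f, keep in zip(no_label, selector.get_support()) if keep]
print 'Top 10 features by chi-squared test:', kbest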
# Compute the prediction probabilities and use them to minimize our false positives.
# Note: this is simply a trade-off; it means we'll miss a few of the malicious
# files, but typically false alarms are a death blow to any new 'fancy stuff', so
# we definitely want to minimize the false alarms.
y_probs = clf.predict_proba(X_test)[:, 0]  # Column 0 is P('bad'): clf.classes_ is sorted, so 'bad' comes first
thres = .8  # This can be set to whatever you'd like
y_pred[y_probs < thres] = 'good'
y_pred[y_probs >= thres] = 'bad'
cm = confusion_matrix(y_test, y_pred, labels)
plot_cm(cm, labels)
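# (Aside) Rather than eyeballing a single threshold, you can sweep it and look
# at the whole false-alarm vs. detection trade-off. A minimal sketch using an
# ROC curve; y_probs is P('bad') from the cell above:
from sklearn.metrics import roc_curve, auc
fpr, tpr, thresholds = roc_curve(y_test, y_probs, pos_label='bad')
plt.plot(fpr, tpr, label='ROC curve (AUC = %.2f)' % auc(fpr, tpr))
plt.xlabel('False Positive Rate (benign files flagged as bad)')
plt.ylabel('True Positive Rate (malicious files caught)')
plt.title('Threshold trade-off for the PE classifier')
plt.legend(loc='lower right')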