import sklearn.feature_extraction
sklearn.__version__
import pandas as pd
pd.__version__

# Plotting defaults
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['font.size'] = 18.0
plt.rcParams['figure.figsize'] = 12.0, 5.0

# A plotting helper method
def plot_it(df, label_x, label_y):
    fig, ax = plt.subplots(subplot_kw={'axisbg': '#EEEEE5'})
    ax.grid(color='grey', linestyle='solid')
    df.T.plot(kind='bar', logx=True, rot=0, ax=ax, colormap='PiYG')
    ax.legend(loc=0, prop={'size': 14})
    plt.subplots_adjust(left=0, right=1, bottom=0, top=1, wspace=0, hspace=0)
    plt.xlabel(label_x)
    plt.ylabel(label_y)
    return ax

# Read in a set of SQL statements from various sources
import os
basedir = 'data'
filelist = os.listdir(basedir)
df_list = []
for file in filelist:
    df = pd.read_csv(os.path.join(basedir, file), sep='|||', names=['raw_sql'], header=None)
    df['type'] = 'legit' if file.split('.')[0] == 'legit' else 'malicious'
    df_list.append(df)
dataframe = pd.concat(df_list, ignore_index=True)
dataframe.dropna(inplace=True)
print dataframe['type'].value_counts()
dataframe.head()

# Use the sqlparse module: sqlparse is a non-validating SQL parser module for Python
# https://github.com/andialbrecht/sqlparse
import sqlparse

def parse_it(raw_sql):
    parsed = sqlparse.parse(unicode(raw_sql, 'utf-8'))
    return [token._get_repr_name() for parse in parsed
            for token in parse.tokens if token._get_repr_name() != 'Whitespace']

dataframe['parsed_sql'] = dataframe['raw_sql'].map(lambda x: parse_it(x))
dataframe.head()

# Looking at individual SQL tokens is 'kinda' interesting, but sequences of tokens and the
# transitions between tokens seem more meaningful, so we also compute N-grams
# for every SQL statement...
def ngrams(lst, N):
    ngrams = []
    for n in xrange(0, N):
        ngrams += zip(*(lst[i:] for i in xrange(n + 1)))
    return [str(tuple) for tuple in ngrams]

dataframe['sequences'] = dataframe['parsed_sql'].map(lambda x: ngrams(x, 3))

# Helper method
def token_expansion(series, types):
    _tokens, _types = zip(*[(token, token_type) for t_list, token_type in zip(series, types)
                            for token in t_list])
    return pd.Series(_tokens), pd.Series(_types)

dataframe['sequences']

# The data hacking repository has a simple stats module we're going to use
import data_hacking.simple_stats as ss

# Spin up our g_test class
g_test = ss.GTest()

# Here we'd like to see how various SQL tokens and transitions are related.
# Is there an association between particular token sequences and malicious SQL statements?
tokens, types = token_expansion(dataframe['sequences'], dataframe['type'])
df_ct, df_cd, df_stats = g_test.highest_gtest_scores(tokens, types, matches=0, N=0)
df_stats.sort('malicious_g', ascending=0).head(10)
# The resulting table shows raw counts, conditional distributions, expected counts, and G-test scores.

# Now plot the head() and the tail() of the dataframe to see who's been naughty or nice
sorted_df = df_stats.sort('malicious_g', ascending=0)
naughty = sorted_df.head(7)
nice = sorted_df.tail(7).sort('malicious_g', ascending=0)
naughty_and_nice = pd.concat([naughty, nice])
ax = plot_it(naughty_and_nice[['malicious_g']], 'SQL Command Types', 'G-Test Scores')
ax.set_xlim(.2, 1.4)

# Documentation for the sqlparse token mapping can be found here:
# https://github.com/andialbrecht/sqlparse/blob/master/sqlparse/keywords.py
# or here:
# https://github.com/andialbrecht/sqlparse/blob/master/sqlparse/lexer.py

# Here we look at examples of the SQL sequences that the G-test has indicated are good
# indicators of SQL injections.
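# As a quick illustrative sketch (not part of the original notebook): this is what the
# ngrams() helper above produces for a small, hypothetical three-token parse, which also
# explains the stringified-tuple form used in the membership tests below.
example_tokens = ['DML', 'Keyword', 'Identifier']   # hypothetical parse_it() output
print ngrams(example_tokens, 3)
# Expected output (uni-, bi-, and tri-grams, each as a string):
# ["('DML',)", "('Keyword',)", "('Identifier',)",
#  "('DML', 'Keyword')", "('Keyword', 'Identifier')",
#  "('DML', 'Keyword', 'Identifier')"]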
dataframe[dataframe['sequences'].map(lambda x: "('Single', 'Identifier')" in x)].head()

dataframe[dataframe['sequences'].map(lambda x: "('Punctuation',)" in x)].head()

# Generate additional feature dimensions for the machine learning to expand its mind into...
# We're basically building up the features that will go into our 'feature vector' for ML
import math
from collections import Counter

def entropy(s):
    p, lns = Counter(s), float(len(s))
    return -sum(count/lns * math.log(count/lns, 2) for count in p.values())

dataframe['length'] = dataframe['parsed_sql'].map(lambda x: len(x))
dataframe['entropy'] = dataframe['raw_sql'].map(lambda x: entropy(x))
dataframe.head()

# For each SQL statement, aggregate the malicious and legit G-test scores as features
import numpy as np

def g_aggregate(sequence, name):
    try:
        g_scores = [df_stats.ix[item][name] for item in sequence]
    except KeyError:
        return 0
    return sum(g_scores)/len(g_scores) if g_scores else 0  # Average

dataframe['malicious_g'] = dataframe['sequences'].map(lambda x: g_aggregate(x, 'malicious_g'))
dataframe['legit_g'] = dataframe['sequences'].map(lambda x: g_aggregate(x, 'legit_g'))
dataframe.head()

# Boxplots show you the distribution (spread) of the data.
# http://en.wikipedia.org/wiki/Box_plot

# Plot the length and entropy of SQL statements
# Fixme Brian: make these pretty
dataframe.boxplot('length', 'type')
plt.ylabel('SQL Statement Length')
dataframe.boxplot('entropy', 'type')
plt.ylabel('SQL Statement Entropy')

# Split the classes up so we can set colors, size, labels
fig, ax = plt.subplots(subplot_kw=dict(axisbg='#EEEEE5'))
ax.grid(color='grey', linestyle='solid')
cond = dataframe['type'] == 'malicious'
evil = dataframe[cond]
legit = dataframe[~cond]
plt.scatter(legit['length'], legit['entropy'], s=140, c='#aaaaff', label='Legit', alpha=.7)
plt.scatter(evil['length'], evil['entropy'], s=40, c='r', label='Injections', alpha=.3)
plt.legend()
plt.xlabel('SQL Statement Length')
plt.ylabel('SQL Statement Entropy')

# Split the classes up so we can set colors, size, labels
fig, ax = plt.subplots(subplot_kw=dict(axisbg='#EEEEE5'))
ax.grid(color='grey', linestyle='solid')
plt.scatter(legit['malicious_g'], legit['legit_g'], s=140, c='#aaaaff', label='Legit', alpha=.7)
plt.scatter(evil['malicious_g'], evil['legit_g'], s=40, c='r', label='Injections', alpha=.3)
plt.legend()
plt.ylabel('Legit SQL G-Test Score')
plt.xlabel('Malicious SQL G-Test Score')

# In preparation for using scikit-learn we set up a couple of handles that
# take us from pandas land to scikit land

# Matrix of feature vectors (scikit-learn uses 'X' for the matrix of feature vectors)
X = dataframe.as_matrix(['length', 'entropy', 'legit_g', 'malicious_g'])

# Labels (scikit-learn uses 'y' for classification labels)
y = np.array(dataframe['type'].tolist())  # Yes, this is weird, but it needs
                                          # to be an np.array of strings

# Random Forest is a popular ensemble machine learning classifier.
# http://scikit-learn.org/dev/modules/generated/sklearn.ensemble.RandomForestClassifier.html
import sklearn.ensemble
clf = sklearn.ensemble.RandomForestClassifier(n_estimators=20)  # Trees in the forest

# Now we can use scikit-learn's cross-validation to assess predictive performance.
import sklearn.cross_validation
scores = sklearn.cross_validation.cross_val_score(clf, X, y, cv=10, n_jobs=4)
print scores

# Wow, 99% accurate! There is an issue though...
# Recall that we have ~13k 'malicious' SQL statements and only about 1k 'legit'
# SQL statements, so we dive in a bit and look at the predictive performance more deeply.
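# A quick sanity check (an added sketch, not part of the original analysis): with roughly
# 13k malicious and 1k legit samples, a degenerate classifier that always answers
# 'malicious' would already score around 93% accuracy, so raw accuracy alone is a weak
# yardstick for this imbalanced dataset and we need the per-class view that follows.
baseline = float((y == 'malicious').sum()) / len(y)
print 'Always-predict-malicious baseline accuracy: %.2f%%' % (baseline * 100)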
# Train on an 80/20 split
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test, index_train, index_test = train_test_split(X, y, dataframe.index, test_size=0.2)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Now plot the results of the 80/20 split in a confusion matrix
from sklearn.metrics import confusion_matrix
labels = ['legit', 'malicious']
cm = confusion_matrix(y_test, y_pred, labels)

def plot_cm(cm, labels):
    # Compute percentages
    percent = (cm * 100.0) / np.array(np.matrix(cm.sum(axis=1)).T)  # Derp, I'm sure there's a better way

    print 'Confusion Matrix Stats'
    for i, label_i in enumerate(labels):
        for j, label_j in enumerate(labels):
            print "%s/%s: %.2f%% (%d/%d)" % (label_i, label_j, (percent[i][j]), cm[i][j], cm[i].sum())

    # Show confusion matrix
    # Thanks kermit666 from stackoverflow :)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.grid(b=False)
    cax = ax.matshow(percent, cmap='coolwarm')
    plt.title('Confusion matrix of the classifier')
    fig.colorbar(cax)
    ax.set_xticklabels([''] + labels)
    ax.set_yticklabels([''] + labels)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.show()

plot_cm(cm, labels)

# Compute the prediction probabilities and use them to minimize our false positives.
# Note: this is simply a trade-off; it means we'll miss a few of the malicious
# statements, but false alarms are typically a death blow to any new 'fancy stuff',
# so we definitely want to minimize them.
y_probs = clf.predict_proba(X_test)[:, 1]  # Probability of the 'malicious' class
thres = .9  # This can be set to whatever you'd like
y_pred[y_probs < thres] = 'legit'
y_pred[y_probs >= thres] = 'malicious'
cm = confusion_matrix(y_test, y_pred, labels)
plot_cm(cm, labels)

# We can also look at which features the learning algorithm thought were the most important
importances = zip(['length', 'entropy', 'legit_g', 'malicious_g'], clf.feature_importances_)
importances

# These are our feature importance scores. There's a lot of feature selection,
# sensitivity analysis, etc. that you could do at this point if you wanted to.

# Now we're just going to do some post-analysis on how the ML algorithm performed.
# Let's look at the legit samples that were misclassified as malicious
test_set = dataframe.ix[index_test]
test_set['pred'] = y_pred
misclassified = test_set[(test_set['type'] == 'legit') & (test_set['pred'] == 'malicious')]
misclassified.head()

# Discussion of how to use the resulting models.
# Typically machine learning comes in two phases:
#   - Training of the model
#   - Evaluation of new observations against the model
# This notebook is about exploration of the data and training the model.
# After you have a model that you are satisfied with, just 'pickle' it
# at the end of your training script, and then in a separate
# evaluation script 'unpickle' it and evaluate/score new observations
# coming in (through a file, or ZeroMQ, or whatever...)
#
# In this case we'd have to pickle the RandomForest classifier.
# See 'test_it' below for how to use it in evaluation mode.

# test_it shows how to do evaluation, also fun for manual testing below :)
def test_it(sql):
    parsed_sql = parse_it(sql)
    ngram_list = ngrams(parsed_sql, 3)
    malicious_g = g_aggregate(ngram_list, 'malicious_g')
    legit_g = g_aggregate(ngram_list, 'legit_g')
    _X = [len(parsed_sql), entropy(sql), legit_g, malicious_g]
    print '%-40s: %s' % (sql, clf.predict(_X)[0])

test_it('select * from employees')
test_it("'; exec master..xp_cmdshell")
test_it("'any 'x'='x'")
test_it('from dorseys mom xp_cmdshell biache')
test_it('select * from your_mom')
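# A minimal sketch of the pickle/unpickle workflow described above; the file names are
# just placeholders. Note that besides the RandomForest classifier you'd also want to
# persist the df_stats G-test table, since g_aggregate() needs it at evaluation time.
import pickle

# End of the training script: persist the trained model and the G-test score table
with open('sql_clf.pkl', 'wb') as f:
    pickle.dump(clf, f)
df_stats.to_pickle('sql_gtest_scores.pkl')

# ...and later, in a separate evaluation script: reload both and score new observations
with open('sql_clf.pkl', 'rb') as f:
    clf = pickle.load(f)
df_stats = pd.read_pickle('sql_gtest_scores.pkl')
test_it("'; exec master..xp_cmdshell")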