import sklearn.feature_extraction
sklearn.__version__
import pandas as pd
pd.__version__

# Plotting defaults
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['font.size'] = 18.0
plt.rcParams['figure.figsize'] = 12.0, 5.0

# A plotting helper method
def plot_it(df, label_x, label_y):
    fig, ax = plt.subplots(subplot_kw={'axisbg': '#EEEEE5'})
    ax.grid(color='grey', linestyle='solid')
    df.T.plot(kind='bar', logx=True, rot=0, ax=ax, colormap='PiYG')
    ax.legend(loc=0, prop={'size': 14})
    plt.subplots_adjust(left=0, right=1, bottom=0, top=1, wspace=0, hspace=0)
    plt.xlabel(label_x)
    plt.ylabel(label_y)
    return ax

# Read in a set of SQL statements from various sources
import os
basedir = 'data'
filelist = os.listdir(basedir)
df_list = []
for file in filelist:
    df = pd.read_csv(os.path.join(basedir, file), sep='|||', names=['raw_sql'], header=None)
    df['type'] = 'legit' if file.split('.')[0] == 'legit' else 'malicious'
    df_list.append(df)
dataframe = pd.concat(df_list, ignore_index=True)
dataframe.dropna(inplace=True)
print dataframe['type'].value_counts()
dataframe.head()

# Use the sqlparse module: sqlparse is a non-validating SQL parser module for Python
# https://github.com/andialbrecht/sqlparse
import sqlparse

def parse_it(raw_sql):
    parsed = sqlparse.parse(unicode(raw_sql, 'utf-8'))
    return [token._get_repr_name() for parse in parsed
            for token in parse.tokens if token._get_repr_name() != 'Whitespace']

dataframe['parsed_sql'] = dataframe['raw_sql'].map(lambda x: parse_it(x))
dataframe.head()

# Looking at individual SQL tokens is 'kinda' interesting, but sequences of tokens and the
# transitions between tokens seem more meaningful, so we also compute N-grams
# for every SQL statement...
def ngrams(lst, N):
    ngrams = []
    for n in xrange(0, N):
        ngrams += zip(*(lst[i:] for i in xrange(n + 1)))
    return [str(tuple) for tuple in ngrams]

dataframe['sequences'] = dataframe['parsed_sql'].map(lambda x: ngrams(x, 3))

# Helper method
def token_expansion(series, types):
    _tokens, _types = zip(*[(token, token_type) for t_list, token_type in zip(series, types)
                            for token in t_list])
    return pd.Series(_tokens), pd.Series(_types)

dataframe['sequences']

# The data hacking repository has a simple stats module we're going to use
import data_hacking.simple_stats as ss

# Spin up our g_test class
g_test = ss.GTest()

# Here we'd like to see how various SQL tokens and transitions are related.
# Is there an association between particular token sequences and malicious SQL statements?
tokens, types = token_expansion(dataframe['sequences'], dataframe['type'])
df_ct, df_cd, df_stats = g_test.highest_gtest_scores(tokens, types, matches=0, N=0)
df_stats.sort('malicious_g', ascending=0).head(10)
# The resulting table shows raw counts, conditional distributions, expected counts, and G-test scores.

# Now plot the head() and the tail() of the dataframe to see who's been naughty or nice
sorted_df = df_stats.sort('malicious_g', ascending=0)
naughty = sorted_df.head(7)
nice = sorted_df.tail(7).sort('malicious_g', ascending=0)
naughty_and_nice = pd.concat([naughty, nice])
ax = plot_it(naughty_and_nice[['malicious_g']], 'SQL Command Types', 'G-Test Scores')
ax.set_xlim(.2, 1.4)

# Documentation for the sqlparse token mapping can be found here:
# https://github.com/andialbrecht/sqlparse/blob/master/sqlparse/keywords.py
# or here:
# https://github.com/andialbrecht/sqlparse/blob/master/sqlparse/lexer.py

# Here we look at examples of the SQL sequences that the G-test has indicated are good
# indicators of SQL injections.
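# As a quick illustrative sketch (not part of the original notebook): this is what the
# ngrams() helper above produces for a small, hypothetical three-token parse, which also
# explains the stringified-tuple form used in the membership tests below.
example_tokens = ['DML', 'Keyword', 'Identifier']   # hypothetical parse_it() output
print ngrams(example_tokens, 3)
# Expected output (uni-, bi-, and tri-grams, each as a string):
# ["('DML',)", "('Keyword',)", "('Identifier',)",
#  "('DML', 'Keyword')", "('Keyword', 'Identifier')",
#  "('DML', 'Keyword', 'Identifier')"]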
dataframe[dataframe['sequences'].map(lambda x: "('Single', 'Identifier')" in x)].head()

dataframe[dataframe['sequences'].map(lambda x: "('Punctuation',)" in x)].head()

# Generate additional feature dimensions for the machine learning to expand its mind into...
# We're basically building up the features that will go into our 'feature vector' for ML
import math
from collections import Counter

def entropy(s):
    p, lns = Counter(s), float(len(s))
    return -sum(count/lns * math.log(count/lns, 2) for count in p.values())

dataframe['length'] = dataframe['parsed_sql'].map(lambda x: len(x))
dataframe['entropy'] = dataframe['raw_sql'].map(lambda x: entropy(x))
dataframe.head()

# For each SQL statement, aggregate the malicious and legit G-test scores as features
import numpy as np

def g_aggregate(sequence, name):
    try:
        g_scores = [df_stats.ix[item][name] for item in sequence]
    except KeyError:
        return 0
    return sum(g_scores)/len(g_scores) if g_scores else 0  # Average

dataframe['malicious_g'] = dataframe['sequences'].map(lambda x: g_aggregate(x, 'malicious_g'))
dataframe['legit_g'] = dataframe['sequences'].map(lambda x: g_aggregate(x, 'legit_g'))
dataframe.head()

# Boxplots show you the distribution (spread) of the data.
# http://en.wikipedia.org/wiki/Box_plot

# Plot the length and entropy of SQL statements
# Fixme Brian: make these pretty
dataframe.boxplot('length', 'type')
plt.ylabel('SQL Statement Length')
dataframe.boxplot('entropy', 'type')
plt.ylabel('SQL Statement Entropy')

# Split the classes up so we can set colors, size, labels
fig, ax = plt.subplots(subplot_kw=dict(axisbg='#EEEEE5'))
ax.grid(color='grey', linestyle='solid')
cond = dataframe['type'] == 'malicious'
evil = dataframe[cond]
legit = dataframe[~cond]
plt.scatter(legit['length'], legit['entropy'], s=140, c='#aaaaff', label='Legit', alpha=.7)
plt.scatter(evil['length'], evil['entropy'], s=40, c='r', label='Injections', alpha=.3)
plt.legend()
plt.xlabel('SQL Statement Length')
plt.ylabel('SQL Statement Entropy')

# Split the classes up so we can set colors, size, labels
fig, ax = plt.subplots(subplot_kw=dict(axisbg='#EEEEE5'))
ax.grid(color='grey', linestyle='solid')
plt.scatter(legit['malicious_g'], legit['legit_g'], s=140, c='#aaaaff', label='Legit', alpha=.7)
plt.scatter(evil['malicious_g'], evil['legit_g'], s=40, c='r', label='Injections', alpha=.3)
plt.legend()
plt.ylabel('Legit SQL G-Test Score')
plt.xlabel('Malicious SQL G-Test Score')

# In preparation for using scikit-learn we set up a couple of handles that
# take us from pandas land to scikit land

# Matrix of feature vectors (scikit-learn uses 'X' for the matrix of feature vectors)
X = dataframe.as_matrix(['length', 'entropy', 'legit_g', 'malicious_g'])

# Labels (scikit-learn uses 'y' for classification labels)
y = np.array(dataframe['type'].tolist())  # Yes, this is weird, but it needs
                                          # to be an np.array of strings

# Random Forest is a popular ensemble machine learning classifier.
# http://scikit-learn.org/dev/modules/generated/sklearn.ensemble.RandomForestClassifier.html
import sklearn.ensemble
clf = sklearn.ensemble.RandomForestClassifier(n_estimators=20)  # Trees in the forest

# Now we can use scikit-learn's cross-validation to assess predictive performance.
import sklearn.cross_validation
scores = sklearn.cross_validation.cross_val_score(clf, X, y, cv=10, n_jobs=4)
print scores

# Wow, 99% accurate! There is an issue though...
# Recall that we have ~13k 'malicious' SQL statements and only about 1k 'legit'
# SQL statements, so we dive in a bit and look at the predictive performance more deeply.
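# A quick sanity check (an added sketch, not part of the original analysis): with roughly
# 13k malicious and 1k legit samples, a degenerate classifier that always answers
# 'malicious' would already score around 93% accuracy, so raw accuracy alone is a weak
# yardstick for this imbalanced dataset and we need the per-class view that follows.
baseline = float((y == 'malicious').sum()) / len(y)
print 'Always-predict-malicious baseline accuracy: %.2f%%' % (baseline * 100)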
# Train on an 80/20 split
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test, index_train, index_test = train_test_split(X, y, dataframe.index, test_size=0.2)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Now plot the results of the 80/20 split in a confusion matrix
from sklearn.metrics import confusion_matrix
labels = ['legit', 'malicious']
cm = confusion_matrix(y_test, y_pred, labels)

def plot_cm(cm, labels):
    # Compute percentages
    percent = (cm * 100.0) / np.array(np.matrix(cm.sum(axis=1)).T)  # Derp, I'm sure there's a better way

    print 'Confusion Matrix Stats'
    for i, label_i in enumerate(labels):
        for j, label_j in enumerate(labels):
            print "%s/%s: %.2f%% (%d/%d)" % (label_i, label_j, (percent[i][j]), cm[i][j], cm[i].sum())

    # Show confusion matrix
    # Thanks kermit666 from stackoverflow :)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.grid(b=False)
    cax = ax.matshow(percent, cmap='coolwarm')
    plt.title('Confusion matrix of the classifier')
    fig.colorbar(cax)
    ax.set_xticklabels([''] + labels)
    ax.set_yticklabels([''] + labels)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.show()

plot_cm(cm, labels)

# Compute the prediction probabilities and use them to minimize our false positives.
# Note: this is simply a trade-off; it means we'll miss a few of the malicious
# statements, but false alarms are typically a death blow to any new 'fancy stuff',
# so we definitely want to minimize them.
y_probs = clf.predict_proba(X_test)[:, 1]  # Probability of the 'malicious' class
thres = .9  # This can be set to whatever you'd like
y_pred[y_probs < thres] = 'legit'
y_pred[y_probs >= thres] = 'malicious'
cm = confusion_matrix(y_test, y_pred, labels)
plot_cm(cm, labels)

# We can also look at which features the learning algorithm thought were the most important
importances = zip(['length', 'entropy', 'legit_g', 'malicious_g'], clf.feature_importances_)
importances

# These are our feature importance scores. There's a lot of feature selection,
# sensitivity analysis, etc. that you could do at this point if you wanted to.

# Now we're just going to do some post-analysis on how the ML algorithm performed.
# Let's look at the legit samples that were misclassified as malicious
test_set = dataframe.ix[index_test]
test_set['pred'] = y_pred
misclassified = test_set[(test_set['type'] == 'legit') & (test_set['pred'] == 'malicious')]
misclassified.head()

# Discussion of how to use the resulting models.
# Typically machine learning comes in two phases:
#   - Training of the model
#   - Evaluation of new observations against the model
# This notebook is about exploration of the data and training the model.
# After you have a model that you are satisfied with, just 'pickle' it
# at the end of your training script, and then in a separate
# evaluation script 'unpickle' it and evaluate/score new observations
# coming in (through a file, or ZeroMQ, or whatever...)
#
# In this case we'd have to pickle the RandomForest classifier.
# See 'test_it' below for how to use it in evaluation mode.

# test_it shows how to do evaluation, also fun for manual testing below :)
def test_it(sql):
    parsed_sql = parse_it(sql)
    ngram_list = ngrams(parsed_sql, 3)
    malicious_g = g_aggregate(ngram_list, 'malicious_g')
    legit_g = g_aggregate(ngram_list, 'legit_g')
    _X = [len(parsed_sql), entropy(sql), legit_g, malicious_g]
    print '%-40s: %s' % (sql, clf.predict(_X)[0])

test_it('select * from employees')
test_it("'; exec master..xp_cmdshell")
test_it("'any 'x'='x'")
test_it('from dorseys mom xp_cmdshell biache')
test_it('select * from your_mom')
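# A minimal sketch of the pickle/unpickle workflow described above; the file names are
# just placeholders. Note that besides the RandomForest classifier you'd also want to
# persist the df_stats G-test table, since g_aggregate() needs it at evaluation time.
import pickle

# End of the training script: persist the trained model and the G-test score table
with open('sql_clf.pkl', 'wb') as f:
    pickle.dump(clf, f)
df_stats.to_pickle('sql_gtest_scores.pkl')

# ...and later, in a separate evaluation script: reload both and score new observations
with open('sql_clf.pkl', 'rb') as f:
    clf = pickle.load(f)
df_stats = pd.read_pickle('sql_gtest_scores.pkl')
test_it("'; exec master..xp_cmdshell")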