# Core data libraries. The bare ``__version__`` expressions have no effect on
# the program; presumably they exist to display the installed versions when
# run interactively.
import pandas as pd
pd.__version__

import numpy as np
np.__version__

# Plotting defaults: '%matplotlib inline' is an IPython magic (not plain
# Python), followed by a larger default font and a 12x8 default figure size.
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['font.size'] = 14.0
plt.rcParams['figure.figsize'] = 12.0, 8.0

# Create a Bro log file reader and read the HTTP header log.
import bro_log_reader
bro_log = bro_log_reader.BroLogReader()
headers = bro_log.read_log('data/http_headers.log')

# Toss the parsed records into a pandas DataFrame so they are easy to explore.
dataframe = pd.DataFrame(headers)

# What do we have: print the row/column counts and preview the first rows.
print 'Number of Rows: %d   Columns:%d' % (dataframe.shape[0], dataframe.shape[1])
dataframe.head()

# We're only interested in client header requests for this exercise, so keep
# just the rows whose 'origin' column equals 'client'.
dataframe = dataframe[dataframe['origin']=='client']

# Okay we also want to process the header events (that are in a JSON blob) 
# into a header feature vector (just pulling 'keys' not values).
import json
def make_header_features(json_header_info_series):
    """Extract the ordered header names from each JSON header blob.

    Every entry is parsed as JSON and iterated; for each element only the
    first key is kept (values are ignored). Presumably each blob is a list of
    single-key objects, one per header event.

    Args:
        json_header_info_series: Iterable of JSON documents, either as UTF-8
            encoded byte strings or as already-decoded text.

    Returns:
        A list with one entry per input, in input order. An entry is the list
        of header names, or the empty string ``''`` when the blob is missing,
        undecodable, not valid JSON, or not shaped as a list of non-empty
        objects. The ``''`` sentinel is kept so that ``':'.join(entry)`` keeps
        producing an empty feature string for such rows.
    """
    header_features = []
    for header_info in json_header_info_series:
        try:
            # Decode raw bytes only; text that is already decoded goes to the
            # JSON parser unchanged instead of being rejected.
            if isinstance(header_info, bytes):
                header_info = header_info.decode('utf8')
            header_list = json.loads(header_info)
            features = [list(item.keys())[0] for item in header_list]
        # There are some lines w/no features. Only data problems are handled
        # here (bad encoding / bad JSON -> ValueError, wrong types ->
        # TypeError / AttributeError, empty object -> IndexError) so genuine
        # programming errors are no longer silently swallowed.
        except (ValueError, TypeError, AttributeError, IndexError):
            features = ''
        header_features.append(features)
    return header_features

# Add two derived columns: 'feature_vector' holds the header names extracted
# from each JSON blob, and 'features' is the same data colon-joined into a
# single string.
dataframe['feature_vector'] = make_header_features(dataframe['header_events_json'])
dataframe['features'] = dataframe['feature_vector'].map(':'.join)

# Making shorter agent names based on information from
# http://msdn.microsoft.com/library/ms537503.aspx
import re
def replace_stuff(m):
    """Return the replacement text for one regex match on a user-agent string.

    Matches whose text contains '.net clr' are replaced by 'n' followed by
    capture group 1; every other match is removed (replaced by '').
    """
    if '.net clr' not in m.group():
        return ''
    return 'n' + m.group(1)
def short_agent_names(useragent_series, resolution=12):
    """Build shortened, colon-delimited names from user-agent strings.

    Each user agent is lower-cased, punctuation and filler tokens are removed
    (with '.net clr X.Y...;' collapsed to 'nX.Y' by replace_stuff), and the
    first ``resolution`` whitespace-separated tokens are joined with ':'.

    Args:
        useragent_series: Iterable of user-agent strings.
        resolution: Maximum number of tokens kept in each short name.

    Returns:
        A list of short agent names, one per input, in input order.
    """
    # The version dot is escaped so only a literal '.' is accepted between
    # the two digits (an unescaped '.' would match any character).
    excludes = re.compile(r',|;|\(|\)|compatible|\.net clr ([0-9]\.[0-9])[^;]*;|khtml,|like')
    short_agent_list = []
    for useragent in useragent_series:
        processed_user_agent = excludes.sub(replace_stuff, useragent.lower()).strip()
        short_agent_list.append(':'.join(processed_user_agent.split()[:resolution]))
    return short_agent_list

# Generate shorter agent names
dataframe['short_agent'] = short_agent_names(dataframe['useragent'])
# Remove any 'na' agents. Note the replace is applied to the whole frame, so
# a cell equal to 'na' in ANY column becomes NaN, and dropna() then removes
# every row that contains a NaN.
dataframe = dataframe.replace('na',np.nan)
dataframe = dataframe.dropna()

# Okay lets exercise some of the pandas dataframe functionality.
# Tag every request with a count of 1 so that summing within each
# (short_agent, features) group gives the number of requests for that pair.
dataframe['count'] = 1
agent_group_df = dataframe.groupby(['short_agent','features']).sum()
agent_group_df.head(20)

# Now lets get the number of different header sequence permutations per agent
# (count the grouped rows per value of the first index level, 'short_agent').
agent_counts = agent_group_df.count(level=0)

# Looks like MSIE agents have a higher number of permutations than all the other stuff
# So we 'groupby' a conditional statement (do you have msie in your agent string).
# The callable is applied to each index label, i.e. each short agent name.
agent_types = agent_counts.groupby(by=lambda x: 'msie' if 'msie' in x else 'other')
agent_types.head(20)

# Get some quick descriptive stats and plot it!
# NOTE(review): 'axisbg' is the axes-background keyword of older matplotlib
# releases (newer releases call it 'facecolor') -- confirm against the
# installed version.
fig, ax = plt.subplots(subplot_kw={'axisbg':'#EEEEE5'})
ax.grid(color='lightgrey', linestyle='solid')
# NOTE(review): the positional False is presumably the 'subplots' argument
# (draw both groups on one axes) -- confirm against the pandas version in use.
agent_types.boxplot(False)

# Now lets flip the group by around: index by header sequence first and by
# agent second, summing the per-request counts for each pair.
features = dataframe[['short_agent','features','count']].groupby(['features','short_agent']).sum()
print features.shape
features.head(20)

# Let's look at a few examples of Levenshtein distance between header
# sequences. The LSHSimilarities object is built from an empty list because
# only its levenshtein() method is exercised here.
import data_hacking.lsh_sims as lsh_sims
lsh = lsh_sims.LSHSimilarities([])
a = ['ACCEPT', 'USER-AGENT', 'HOST', 'COOKIE']
b = ['ACCEPT', 'USER-AGENT', 'HOST']
c = ['ACCEPT', 'USER-AGENT', 'DORSEYS-MOM']
d = ['COOKIE', 'ACCEPT', 'USER-AGENT', 'HOST']

# Compare: a vs. b (one header dropped), b vs. c (one header swapped),
# and a vs. d (same headers, COOKIE moved to the front).
print 'Levenshtein: %s -- %s   ( %f )' % (a, b, lsh.levenshtein(a, b))
print 'Levenshtein: %s -- %s   ( %f )' % (b, c, lsh.levenshtein(b, c))
print 'Levenshtein: %s -- %s   ( %f )' % (a, d, lsh.levenshtein(a, d))

# Lets compute levenshtein distance between the header sequences for each agent.
# These settings are handed to every LSHSimilarities instance via mh_params.
params = {'num_hashes':20, 'lsh_bands':20, 'lsh_rows':1, 'drop_duplicates':True}

# Map of short agent name -> sorted list of distance entries for that agent's
# header sequences.
agent_distances = {}
agent_groups = dataframe.groupby(['short_agent'])
for name, group in agent_groups:
    lsh = lsh_sims.LSHSimilarities(group['feature_vector'], mh_params=params)
    distances = lsh.batch_compute_similarities(distance_metric='levenshtein_tapered', threshold=10)
    # In-place ascending sort; presumably each entry leads with the distance
    # value, so the closest pairs come first -- TODO confirm entry layout.
    distances.sort() 
    agent_distances[name] = distances

# For one agent show the top 5 closest (levenshtein) header sequences
agent = 'mozilla/4.0:msie:7.0:windows:nt:6.1:wow64:trident/5.0:slcc2:n2.0:n3.5:n3.0'
distances = agent_distances[agent]

print '\nAgent: %s' % agent
print 'Distances:'
# NOTE: this rebinds `features` (until now the grouped DataFrame) to this
# agent's column of feature vectors.
features = agent_groups.get_group(agent)['feature_vector']
# Elements 1 and 2 of each distance entry are used as positional row indexes
# into the agent's feature vectors, i.e. the two sequences being compared.
for distance in distances[:5]:
    print '\n%s\n%s' % (features.iloc[distance[1]], features.iloc[distance[2]])

# mpld3 is a cool python module for using D3 as a back end to matplotlib;
# go to https://github.com/jakevdp/mpld3 and behold the awesome.

# Note: we're commenting this out so that nbviewer works correctly,
# but feel free to uncomment it if you download the notebook and play
# with it yourself.
'''
try:
    import mpld3
    mpld3.enable_notebook(d3_url="/files/d3/d3.v3.js")
except ImportError:
    print 'Info: Could not load mpld3 module. No worries stuff will still work fine...'
'''

# Compute a hierarchical clustering from the header similarities for each agent.
# agent_h_graphs maps short agent name -> {'graph': ..., 'root': ...} as
# returned by HCluster.sims_to_hcluster().
import data_hacking.hcluster as hcluster
agent_h_graphs = {}
# Materialise the groupby into a plain {agent name: sub-DataFrame} dict.
groups = dict(list(agent_groups))
for name, group in groups.iteritems():
    lsh = lsh_sims.LSHSimilarities(group['feature_vector'], mh_params=params)
    # Unlike the distance pass above, this uses the 'l_tapered_sim' metric
    # with a threshold of 0.
    distances = lsh.batch_compute_similarities(distance_metric='l_tapered_sim', threshold=0)
    h_clustering = hcluster.HCluster(group['feature_vector'])
    h_clustering.set_sim_method(lsh.l_sim)
    h_graph, root = h_clustering.sims_to_hcluster(distances, agg_sim=.2)
    agent_h_graphs[name] = {'graph':h_graph, 'root':root}

# Plot a couple of agents
import networkx as nx

def plot_h_tree(graph, layout='neato'):
    """Draw a clustering graph with its node labels and edge weights.

    Node positions come from the Graphviz program named by ``layout``. Each
    node is annotated with its 'label' attribute and each edge with the first
    four characters of its 'weight' attribute.
    """
    positions = nx.graphviz_layout(graph, prog=layout)

    # Node id -> display label, taken from every node's 'label' attribute.
    node_labels = {}
    for node_id, attrs in graph.nodes(data=True):
        node_labels[node_id] = attrs['label']
    nx.draw_networkx(graph, positions, node_size=800, alpha=.7,
                     node_color=[.6,.4,.6], labels=node_labels)

    # (source, target) -> edge weight rendered as text, cut to 4 characters.
    weight_labels = {}
    for src, dst, attrs in graph.edges(data=True):
        weight_labels[(src, dst)] = str(attrs['weight'])[:4]
    nx.draw_networkx_edge_labels(graph, positions, edge_labels=weight_labels)

# Plot the clustering graph for three example agents.
# MSIE 8
msie_8 = 'mozilla/4.0:msie:8.0:windows:nt:6.1:wow64:trident/4.0:gtb7.4:slcc2:n2.0:n3.5'
plot_h_tree(agent_h_graphs[msie_8]['graph'])

# MSIE 9
msie_9 = 'mozilla/5.0:msie:9.0:windows:nt:6.1:wow64:trident/5.0'
plot_h_tree(agent_h_graphs[msie_9]['graph'])

# Shockwave Flash
flash = 'shockwave:flash'
plot_h_tree(agent_h_graphs[flash]['graph'])

import collections
def subtree_labels(g, root):
    """Group the labels of the graph's leaf nodes by their parent node.

    A leaf is any node whose out-degree is 0; its parent is taken to be its
    first predecessor. ``root`` is accepted but not used.

    Returns:
        A defaultdict(list) mapping parent node -> list of leaf labels.
    """
    node_labels = nx.get_node_attributes(g, 'label')
    grouped = collections.defaultdict(list)
    for node, degree in g.out_degree().iteritems():
        if degree != 0:
            # Interior node: only leaves contribute labels.
            continue
        parent = g.predecessors(node)[0]
        grouped[parent].append(node_labels[node])
    return grouped

import pprint
# Pull out the clustering graph and root node for a single test agent.
# NOTE(review): `good_test_agent` is not assigned anywhere in the visible
# source, so these lines raise NameError unless it is defined elsewhere --
# confirm where it is supposed to come from.
g = agent_h_graphs[good_test_agent]['graph']
root = agent_h_graphs[good_test_agent]['root']

# Hmph, well just for fun we made a RE Morpher class; you simply keep adding
# strings to it and it figures out the RE that matches all the strings.
# It's very hack-tastic so a better way to auto-generate regular expressions 
# will be a fun task for some contributor :)
import re
import re_morpher

# Lets experiment a bit: feed two header sequences to the morpher and print
# the generated pattern after each one.
a = [u'HOST', u'CONNECTION', u'ACCEPT', u'USER-AGENT', u'ACCEPT-ENCODING']
# NOTE(review): `b` is assigned twice in a row, so the first list below is
# never used; only the 'DORSEYS-MOM' variant is added to the morpher.
# Presumably one of the two was meant to get a different name -- confirm.
b = [u'HOST', u'CONNECTION', u'AUTHORIZATION', u'ACCEPT', u'USER-AGENT', u'ACCEPT-ENCODING']
b = [u'HOST', u'CONNECTION', u'AUTHORIZATION', u'ACCEPT', u'USER-AGENT', u'DORSEYS-MOM']

my_re_morpher = re_morpher.REMorpher()
my_re_morpher.add_sequence(a)
print my_re_morpher.get_re_pattern()
my_re_morpher.add_sequence(b)
print my_re_morpher.get_re_pattern()

# Alright now try it out on our agents header sequences.
# agent_res maps short agent name -> list of generated regex patterns, one
# pattern per parent node returned by subtree_labels().
import collections
agent_res = collections.defaultdict(list)

# Start from a clean morpher: it still holds the sequences added during the
# experiment above, which would otherwise leak into the first pattern built.
my_re_morpher.reset_re()
for agent, graph_info in agent_h_graphs.iteritems():
    graph = graph_info['graph']
    root = graph_info['root']
    if graph:
        # Get the re patterns for this agent
        for sub_key, feature_list in subtree_labels(graph,root).iteritems():
            for f in feature_list:
                my_re_morpher.add_sequence(f.split(':'))

            # Append to my re list, then reset so the next subtree's pattern
            # is built only from its own sequences.
            agent_res[agent].append(my_re_morpher.get_re_pattern())
            my_re_morpher.reset_re()
    
# Dump every agent followed by its generated patterns (one per line,
# tab-indented) just to get an idea of what was produced.
for agent in agent_h_graphs:
    print('\n%s' % agent)
    for my_re in agent_res[agent]:
        print('\t%s' % my_re)

# An evaluation method for our auto-magically-generated RE expressions
import re
def evaluate_agents(agent_list, feature_list):
    """Check each request's header sequence against its agent's patterns.

    For every (agent, features) pair, all patterns stored in ``agent_res`` for
    that agent are compiled and matched against the feature string with its
    ':' separators removed. An alert line is printed for every request that
    matches none of them.

    Args:
        agent_list: Sized iterable of short agent names.
        feature_list: Iterable of colon-joined header sequences, aligned
            with ``agent_list``.
    """
    print('Evaluating %d requests' % len(agent_list))
    for agent, features in zip(agent_list, feature_list):
        # Compile every pattern up front so a bad pattern still fails loudly.
        compiled = [re.compile(pattern) for pattern in agent_res[agent]]
        if any(rx.match(features.replace(':','')) for rx in compiled):
            continue
        print('\nAlert: No Match on Agent(%s) Sequence(%s)' % (agent,features))

# Evaluation against the training set (there should be no alerts).
# t_agents is a boolean mask, one flag per row, that is True only for agents
# with at least one generated pattern.
t_agents = [(len(agent_res[agent])>0) for agent in dataframe['short_agent']] # Degenerate case where no H-Tree was built
training_agents = dataframe[t_agents]
evaluate_agents(training_agents['short_agent'], training_agents['features'])

# Read in the header log built from contagio dumps' pcap samples; it is the
# data set used for evaluation testing.
bro_log = bro_log_reader.BroLogReader()
contagio_headers = bro_log.read_log('data/contagio.headers.txt')
contagio_df = pd.DataFrame(contagio_headers)
contagio_df.head()

# A bit of processing on the raw data to prepare it for evaluation: the same
# steps applied to the training data (client rows only, short agent names,
# header feature vectors and their colon-joined string form).
contagio_df = contagio_df[contagio_df['origin']=='client']
contagio_df['short_agent'] = short_agent_names(contagio_df['useragent'])
contagio_df['feature_vector'] = make_header_features(contagio_df['header_events_json'])
contagio_df['features'] = contagio_df['feature_vector'].map(lambda x: ':'.join(x))

# Lets look at the overlap of agents from our training set and the contagio set
trained_agents = set(dataframe['short_agent'].unique())
evil_agents = set(contagio_df['short_agent'].unique())
# Keep only the contagio agents that also appear in the training data, and
# restrict the contagio requests to those agents.
evil_agents = evil_agents.intersection(trained_agents)
contagio_subset = contagio_df[contagio_df['short_agent'].isin(evil_agents)]
evil_agents

# Well, only a couple of agents overlap our training data, but that's okay;
# it is still a reasonable set of header requests to test against.

# Lets see how the Contagio CrimeWare PCAP requests measure up against our
# dataset of computed regexes (an alert is printed for each non-matching request).
evaluate_agents(contagio_subset['short_agent'],contagio_subset['features'])