import pandas as pd
pd.__version__
import numpy as np
np.__version__

# Just some plotting defaults
import matplotlib.pyplot as plt
# Function-call form of the notebook magic `%matplotlib inline`, so that this
# file is parseable as plain Python. `get_ipython` only exists inside IPython.
get_ipython().run_line_magic('matplotlib', 'inline')
plt.rcParams['font.size'] = 14.0
plt.rcParams['figure.figsize'] = 12.0, 8.0

# Create a BRO log file reader and pull from the logfile
import bro_log_reader
bro_log = bro_log_reader.BroLogReader()
headers = bro_log.read_log('data/http_headers.log')

# Nice, so lets look at some of the outputs by tossing them into a pandas dataframe
dataframe = pd.DataFrame(headers)

# What do we have
print('Number of Rows: %d Columns:%d' % (dataframe.shape[0], dataframe.shape[1]))
dataframe.head()

# Okay so we're only interested in client header requests for this exercise.
# Take an explicit copy: new columns are assigned below, and assigning onto a
# filtered slice is ambiguous (pandas chained-assignment warning).
dataframe = dataframe[dataframe['origin']=='client'].copy()

# Okay we also want to process the header events (that are in a JSON blob)
# into a header feature vector (just pulling 'keys' not values).
import json


def make_header_features(json_header_info_series):
    """Turn each JSON header blob into the ordered list of its header names.

    Every element of `json_header_info_series` is parsed with `json.loads`
    and iterated; the first key of each item becomes one feature.
    (Presumably each blob is a JSON list of single-key objects -- confirm
    against the log format.)

    Returns a list with one entry per input element: a list of keys, or the
    empty string '' when the element could not be parsed.
    """
    header_features = []
    for header_info in json_header_info_series:
        try:
            # Byte strings are decoded as UTF-8 before parsing.
            if isinstance(header_info, bytes):
                header_info = header_info.decode('utf8')
            header_list = json.loads(header_info)
            features = [list(item.keys())[0] for item in header_list]
        except (ValueError, TypeError, AttributeError, IndexError):
            # There are some lines w/no features. '' is kept as the fallback
            # value; ':'.join('') below still yields an empty feature string.
            features = ''
        header_features.append(features)
    return header_features


# Create a nicely formatted feature vector and a string representation
dataframe['feature_vector'] = make_header_features(dataframe['header_events_json'])
dataframe['features'] = dataframe['feature_vector'].map(lambda x: ':'.join(x))

# Making shorter agent names based on information from
# http://msdn.microsoft.com/library/ms537503.aspx
import re


def replace_stuff(m):
    """Substitution callback for `short_agent_names`.

    A '.net clr' match is replaced by 'n' plus the captured version
    (group 1); every other match is replaced by the empty string.
    """
    return 'n'+m.group(1) if '.net clr' in m.group() else ''


def short_agent_names(useragent_series, resolution=12):
    """Build a shortened, colon-joined name for each user-agent string.

    Each agent is lower-cased, the tokens matched by the exclusion pattern
    are removed (or abbreviated, for '.net clr' entries), and the first
    `resolution` whitespace-separated tokens are joined with ':'.

    Returns a list of short names, one per input agent.
    """
    short_agent_list = []
    # The '.' between the two version digits is escaped so that it matches a
    # literal dot only.
    excludes = re.compile(r',|;|\(|\)|compatible|\.net clr ([0-9]\.[0-9])[^;]*;|khtml,|like')
    for useragent in useragent_series:
        processed_user_agent = excludes.sub(replace_stuff, useragent.lower()).strip()
        short_agent = ':'.join(processed_user_agent.split()[:resolution])
        short_agent_list.append(short_agent)
    return short_agent_list


# Generate shorter agent names
dataframe['short_agent'] = short_agent_names(dataframe['useragent'])

# Remove any 'na' agents
dataframe = dataframe.replace('na',np.nan)
dataframe = dataframe.dropna()

# Okay lets exercise some of the pandas dataframe functionality
dataframe['count'] = 1
agent_group_df = dataframe.groupby(['short_agent','features']).sum()
agent_group_df.head(20)

# Now lets get the number of different header sequence permutations per agent
agent_counts = agent_group_df.count(level=0)

# Looks like MSIE agents have a higher number of permutations than all the other stuff
# So we 'groupby' a conditional statement (do you have msie in your agent string)
agent_types = agent_counts.groupby(by=lambda x: 'msie' if 'msie' in x else 'other')
agent_types.head(20)

# Get some quick descriptive stats and plot it!
fig, ax = plt.subplots(subplot_kw={'axisbg':'#EEEEE5'})
ax.grid(color='lightgrey', linestyle='solid')
agent_types.boxplot(False)

# Now lets flip the group by around
features = dataframe[['short_agent','features','count']].groupby(['features','short_agent']).sum()
print(features.shape)
features.head(20)

# Lets look at a few examples of Levenshtein distance
import data_hacking.lsh_sims as lsh_sims
lsh = lsh_sims.LSHSimilarities([])
a = ['ACCEPT', 'USER-AGENT', 'HOST', 'COOKIE']
b = ['ACCEPT', 'USER-AGENT', 'HOST']
c = ['ACCEPT', 'USER-AGENT', 'DORSEYS-MOM']
d = ['COOKIE', 'ACCEPT', 'USER-AGENT', 'HOST']
print('Levenshtein: %s -- %s ( %f )' % (a, b, lsh.levenshtein(a, b)))
print('Levenshtein: %s -- %s ( %f )' % (b, c, lsh.levenshtein(b, c)))
print('Levenshtein: %s -- %s ( %f )' % (a, d, lsh.levenshtein(a, d)))

# Lets compute levenshtein distance between the header sequences for each agent
params = {'num_hashes':20, 'lsh_bands':20, 'lsh_rows':1, 'drop_duplicates':True}
agent_distances = {}
agent_groups = dataframe.groupby(['short_agent'])
for name, group in agent_groups:
    lsh = lsh_sims.LSHSimilarities(group['feature_vector'], mh_params=params)
    distances = lsh.batch_compute_similarities(distance_metric='levenshtein_tapered', threshold=10)
    distances.sort()
    agent_distances[name] = distances

# For one agent show the top 5 closest (levenshtein) header sequences
agent = 'mozilla/4.0:msie:7.0:windows:nt:6.1:wow64:trident/5.0:slcc2:n2.0:n3.5:n3.0'
distances = agent_distances[agent]
print('\nAgent: %s' % agent)
print('Distances:')
# NOTE: this rebinds `features` (previously the grouped dataframe above).
features = agent_groups.get_group(agent)['feature_vector']
for distance in distances[:5]:
    print('\n%s\n%s' % (features.iloc[distance[1]], features.iloc[distance[2]]))

# mpld3 is a cool python module for using D3 as a back end to matplotlib
# go to https://github.com/jakevdp/mpld3 and behold the awesome.
# Note we're commenting this out so that nbviewer works correctly,
# but feel free to uncomment if you download the notebook and play
# with it yourself.
'''
try:
    import mpld3
    mpld3.enable_notebook(d3_url="/files/d3/d3.v3.js")
except ImportError:
    print 'Info: Could not load mpld3 module. No worries stuff will still work fine...'
'''

# Compute a hierarchical clustering from the header similarities for each agent
import data_hacking.hcluster as hcluster
agent_h_graphs = {}
groups = dict(list(agent_groups))
for name, group in groups.items():
    lsh = lsh_sims.LSHSimilarities(group['feature_vector'], mh_params=params)
    distances = lsh.batch_compute_similarities(distance_metric='l_tapered_sim', threshold=0)
    h_clustering = hcluster.HCluster(group['feature_vector'])
    h_clustering.set_sim_method(lsh.l_sim)
    h_graph, root = h_clustering.sims_to_hcluster(distances, agg_sim=.2)
    agent_h_graphs[name] = {'graph':h_graph, 'root':root}

# Plot a couple of agents
import networkx as nx


def plot_h_tree(graph, layout='neato'):
    """Draw `graph` using a Graphviz layout.

    Nodes are labelled with their 'label' attribute and edges with the first
    four characters of the string form of their 'weight' attribute. `layout`
    is passed to `nx.graphviz_layout` as the Graphviz program name.
    """
    pos = nx.graphviz_layout(graph, prog=layout)
    labels = {node[0]:node[1]['label'] for node in graph.nodes(data=True)}
    nx.draw_networkx(graph, pos, node_size=800, alpha=.7, node_color=[.6,.4,.6], labels=labels)
    edge_labels = {(u, v): str(d['weight'])[:4] for u, v, d in graph.edges(data=True)}
    nx.draw_networkx_edge_labels(graph,pos,edge_labels=edge_labels)


# MSIE 8
msie_8 = 'mozilla/4.0:msie:8.0:windows:nt:6.1:wow64:trident/4.0:gtb7.4:slcc2:n2.0:n3.5'
plot_h_tree(agent_h_graphs[msie_8]['graph'])
msie_9 = 'mozilla/5.0:msie:9.0:windows:nt:6.1:wow64:trident/5.0'
plot_h_tree(agent_h_graphs[msie_9]['graph'])
flash = 'shockwave:flash'
plot_h_tree(agent_h_graphs[flash]['graph'])

import collections


def subtree_labels(g, root):
    """Group the labels of leaf nodes by their parent node.

    A leaf is any node of `g` with out-degree 0. Each leaf's 'label'
    attribute is appended to the list keyed by the leaf's first predecessor.

    `root` is accepted for the callers' convenience but is not used.

    Returns a defaultdict(list) mapping parent node -> list of leaf labels.
    """
    labels = nx.get_node_attributes(g,'label')
    sub_labels = collections.defaultdict(list)
    leaves = [k for k,v in g.out_degree().items() if v == 0]
    for leaf in leaves:
        sub_labels[g.predecessors(leaf)[0]].append(labels[leaf])
    return sub_labels


import pprint
# NOTE(review): `good_test_agent` was read here without ever being assigned in
# this file (NameError). Defaulting to the MSIE 8 agent plotted above, which is
# known to be a key of agent_h_graphs -- confirm the intended agent.
good_test_agent = msie_8
g = agent_h_graphs[good_test_agent]['graph']
root = agent_h_graphs[good_test_agent]['root']

# Hmph, well just for fun we made a RE Morpher class; you simply keep adding
# strings to it and it figures out the RE that matches all the strings.
# It's very hack-tastic so a better way to auto-generate regular expressions
# will be a fun task for some contributor :)
import re
import re_morpher

# Lets experiment a bit
a = [u'HOST', u'CONNECTION', u'ACCEPT', u'USER-AGENT', u'ACCEPT-ENCODING']
# NOTE(review): `b` is assigned twice in a row, so this first value is never
# used -- possibly one of the two was meant to be a third sequence. Confirm.
b = [u'HOST', u'CONNECTION', u'AUTHORIZATION', u'ACCEPT', u'USER-AGENT', u'ACCEPT-ENCODING']
b = [u'HOST', u'CONNECTION', u'AUTHORIZATION', u'ACCEPT', u'USER-AGENT', u'DORSEYS-MOM']
my_re_morpher = re_morpher.REMorpher()
my_re_morpher.add_sequence(a)
print(my_re_morpher.get_re_pattern())
my_re_morpher.add_sequence(b)
print(my_re_morpher.get_re_pattern())

# Alright now try it out on our agents header sequences
import collections
agent_res = collections.defaultdict(list)

# The morpher still holds the two experiment sequences added above; clear it
# so that they do not leak into the first pattern generated below.
my_re_morpher.reset_re()

for agent, graph_info in agent_h_graphs.items():
    #for agent, graph_info in zip(good_test_agent,agent_h_graphs[good_test_agent]):
    graph = graph_info['graph']
    root = graph_info['root']
    if graph:
        # Get the re patterns for this agent: one pattern per sub-tree
        for sub_key, feature_list in subtree_labels(graph,root).items():
            for f in feature_list:
                my_re_morpher.add_sequence(f.split(':'))

            # Append to my re list, then start the next sub-tree from scratch
            agent_res[agent].append(my_re_morpher.get_re_pattern())
            my_re_morpher.reset_re()

# Print out the agent sets just to get an idea
for agent in agent_h_graphs:
    print('\n%s' % agent)
    for my_re in agent_res[agent]:
        print('\t%s' % my_re)

# An evaluation method for our auto-magically-generated RE expressions
import re


def evaluate_agents(agent_list, feature_list):
    """Print an alert for every request that matches none of its agent's REs.

    `agent_list` and `feature_list` are walked in lockstep. For each pair,
    the feature string (with ':' removed) is matched against every pattern
    stored for that agent in the module-level `agent_res`; if no pattern
    matches -- including when the agent has no patterns at all -- an alert
    line is printed. Nothing is returned.
    """
    print('Evaluating %d requests' % len(agent_list))

    # Compile each agent's patterns once instead of once per request.
    compiled_by_agent = {}
    for agent, features in zip(agent_list, feature_list):
        if agent not in compiled_by_agent:
            # .get() avoids inserting empty entries into the defaultdict.
            compiled_by_agent[agent] = [re.compile(my_re) for my_re in agent_res.get(agent, ())]
        flat_features = features.replace(':','')
        if not any(my_re.match(flat_features) for my_re in compiled_by_agent[agent]):
            print('\nAlert: No Match on Agent(%s) Sequence(%s)' % (agent,features))


# Evaluation against the training set (there should be no alerts)
t_agents = [(len(agent_res[agent])>0) for agent in dataframe['short_agent']] # Degenerate case where no H-Tree was built
# Evaluate only the training rows whose agent has at least one generated RE
# (t_agents is the boolean mask computed just above).
training_agents = dataframe[t_agents]
evaluate_agents(training_agents['short_agent'], training_agents['features'])

# Read in from contagio dumps' pcap samples for evaluation testing
bro_log = bro_log_reader.BroLogReader()
contagio_headers = bro_log.read_log('data/contagio.headers.txt')
contagio_df = pd.DataFrame(contagio_headers)
contagio_df.head()

# A bit of processing on the raw data to prepare it for evaluation.
# Take an explicit copy: new columns are assigned below, and assigning onto a
# filtered slice is ambiguous (pandas chained-assignment warning).
contagio_df = contagio_df[contagio_df['origin']=='client'].copy()
contagio_df['short_agent'] = short_agent_names(contagio_df['useragent'])
contagio_df['feature_vector'] = make_header_features(contagio_df['header_events_json'])
contagio_df['features'] = contagio_df['feature_vector'].map(lambda x: ':'.join(x))

# Lets look at the overlap of agents from our training set and the contagio set
trained_agents = set(dataframe['short_agent'].unique())
evil_agents = set(contagio_df['short_agent'].unique())
evil_agents = evil_agents.intersection(trained_agents)
contagio_subset = contagio_df[contagio_df['short_agent'].isin(evil_agents)]
evil_agents

# Well only a couple of agents overlap our training data, but that's okay
# still a reasonable set of header requests to test against.

# Lets see how the Contagio CrimeWare PCAP requests measure up against our dataset of computed regex's
# NOTE(review): unlike the training evaluation, agents that have no generated
# RE are not filtered out here, so every request from such an agent alerts.
# Confirm whether that is intended.
evaluate_agents(contagio_subset['short_agent'],contagio_subset['features'])