import pandas as pd
import numpy as np
import string
import pylab
import re
import time
import os
import collections
import matplotlib
import struct
import socket
import json
from datetime import datetime
from netaddr import IPNetwork, IPAddress
%matplotlib inline

print pd.__version__
pylab.rcParams['figure.figsize'] = (16.0, 5.0)

# Mapping of the fields in each log file we want to read in, plus initial setup of the pandas dataframes.
# Every column list ends with 'threat' and 'sample'; those two columns are added during processing
# (from the directory structure) rather than parsed from the Bro logs themselves.
logs_to_process = {
    'conn.log'       : ['ts','uid','id.orig_h','id.orig_p','id.resp_h','id.resp_p','proto','service','duration','orig_bytes','resp_bytes','conn_state','local_orig','missed_bytes','history','orig_pkts','orig_ip_bytes','resp_pkts','resp_ip_bytes','tunnel_parents','threat','sample'],
    'dns.log'        : ['ts','uid','id.orig_h','id.orig_p','id.resp_h','id.resp_p','proto','trans_id','query','qclass','qclass_name','qtype','qtype_name','rcode','rcode_name','AA','TC','RD','RA','Z','answers','TTLs','rejected','threat','sample'],
    'files.log'      : ['ts','fuid','tx_hosts','rx_hosts','conn_uids','source','depth','analyzers','mime_type','filename','duration','local_orig','is_orig','seen_bytes','total_bytes','missing_bytes','overflow_bytes','timedout','parent_fuid','md5','sha1','sha256','extracted','threat','sample'],
    'ftp.log'        : ['ts','uid','id.orig_h','id.orig_p','id.resp_h','id.resp_p','user','password','command','arg','mime_type','file_size','reply_code','reply_msg','data_channel.passive','data_channel.orig_h','data_channel.resp_h','data_channel.resp_p','fuid','threat','sample'],
    'http.log'       : ['ts','uid','id.orig_h','id.orig_p','id.resp_h','id.resp_p','trans_depth','method','host','uri','referrer','user_agent','request_body_len','response_body_len','status_code','status_msg','info_code','info_msg','filename','tags','username','password','proxied','orig_fuids','orig_mime_types','resp_fuids','resp_mime_types','threat','sample'],
    'notice.log'     : ['ts','uid','id.orig_h','id.orig_p','id.resp_h','id.resp_p','fuid','file_mime_type','file_desc','proto','note','msg','sub','src','dst','p','n','peer_descr','actions','suppress_for','dropped','remote_location.country_code','remote_location.region','remote_location.city','remote_location.latitude','remote_location.longitude','threat','sample'],
    'signatures.log' : ['ts','src_addr','src_port','dst_addr','dst_port','note','sig_id','event_msg','sub_msg','sig_count','host_count','threat','sample'],
    'smtp.log'       : ['ts','uid','id.orig_h','id.orig_p','id.resp_h','id.resp_p','trans_depth','helo','mailfrom','rcptto','date','from','to','reply_to','msg_id','in_reply_to','subject','x_originating_ip','first_received','second_received','last_reply','path','user_agent','fuids','is_webmail','threat','sample'],
    'ssl.log'        : ['ts','uid','id.orig_h','id.orig_p','id.resp_h','id.resp_p','version','cipher','server_name','session_id','subject','issuer_subject','not_valid_before','not_valid_after','last_alert','client_subject','client_issuer_subject','cert_hash','validation_status','threat','sample'],
    'tunnel.log'     : ['ts','uid','id.orig_h','id.orig_p','id.resp_h','id.resp_p','tunnel_type','action','threat','sample'],
    'weird.log'      : ['ts','uid','id.orig_h','id.orig_p','id.resp_h','id.resp_p','name','addl','notice','peer','threat','sample']
}
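# A quick sanity check of the column mapping above (a minimal sketch, assuming the
# mapping is defined as written): the last two names in every list, 'threat' and
# 'sample', are not in the Bro files themselves -- they get filled in from the
# directory tags during the walk below -- which is why read_csv is handed
# logs_to_process[fname][:-2].
print logs_to_process['conn.log'][-2:]       # ['threat', 'sample'] -- added after load
print len(logs_to_process['conn.log'][:-2])  # 20 columns actually parsed from conn.log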
conndf   = pd.DataFrame(columns=logs_to_process['conn.log'])
dnsdf    = pd.DataFrame(columns=logs_to_process['dns.log'])
filesdf  = pd.DataFrame(columns=logs_to_process['files.log'])
ftpdf    = pd.DataFrame(columns=logs_to_process['ftp.log'])
httpdf   = pd.DataFrame(columns=logs_to_process['http.log'])
noticedf = pd.DataFrame(columns=logs_to_process['notice.log'])
sigdf    = pd.DataFrame(columns=logs_to_process['signatures.log'])
smtpdf   = pd.DataFrame(columns=logs_to_process['smtp.log'])
ssldf    = pd.DataFrame(columns=logs_to_process['ssl.log'])
tunneldf = pd.DataFrame(columns=logs_to_process['tunnel.log'])
weirddf  = pd.DataFrame(columns=logs_to_process['weird.log'])

# Process the directory structure.
# If you download the complete PCAP zip from Contagio and unzip it, a structure like this appears:
#
# PCAPS_TRAFFIC_PATTERNS
# |->CRIME
# |->APT
# |->METASPLOIT
#
# That is the structure this walk expects: CRIME/APT/METASPLOIT make their way into the "threat" tag,
# while the sample/PCAP name winds up in "sample".
#
# The Bro data was generated via the "run_bro.sh" shell script (it places all Bro output in the
# respective sample directories and contributes to the directory structure above).
for dirName, subdirList, fileList in os.walk('.'):
    #print('Found directory: %s' % dirName)
    for fname in fileList:
        tags = dirName.split('/')
        if len(tags) == 4 and fname in logs_to_process:
            #print('%s/%s' % (dirName, fname))
            try:
                # Skip Bro's 8 header lines and the 1-line footer; the last two column
                # names ('threat' and 'sample') are filled in from the directory tags.
                tempdf = pd.read_csv(dirName+'/'+fname, sep='\t', skiprows=8, header=None,
                                     names=logs_to_process[fname][:-2], skipfooter=1)
                tempdf['threat'] = tags[2]
                tempdf['sample'] = tags[3]
                if tags[2] == "0":
                    print('%s/%s' % (dirName, fname))
                if fname == 'conn.log':
                    conndf = conndf.append(tempdf)
                if fname == 'dns.log':
                    dnsdf = dnsdf.append(tempdf)
                if fname == 'files.log':
                    filesdf = filesdf.append(tempdf)
                if fname == 'ftp.log':
                    ftpdf = ftpdf.append(tempdf)
                if fname == 'http.log':
                    httpdf = httpdf.append(tempdf)
                if fname == 'notice.log':
                    noticedf = noticedf.append(tempdf)
                if fname == 'signatures.log':
                    sigdf = sigdf.append(tempdf)
                if fname == 'smtp.log':
                    smtpdf = smtpdf.append(tempdf)
                if fname == 'ssl.log':
                    ssldf = ssldf.append(tempdf)
                if fname == 'tunnel.log':
                    tunneldf = tunneldf.append(tempdf)
                if fname == 'weird.log':
                    weirddf = weirddf.append(tempdf)
            except Exception as e:
                print "[*] error: %s, on %s/%s" % (str(e), dirName, fname)

# Read in and configure the MaxMind DB (free ASN data)
maxmind = pd.read_csv("./GeoIPASNum2.csv", sep=',', header=None, names=['low','high','asn'])
maxmind['low'] = maxmind['low'].astype(int)
maxmind['high'] = maxmind['high'].astype(int)

# Helper Functions
def ip2int(addr):
    # Convert a dotted-quad IPv4 string to its integer form; 0 signals failure
    try:
        return struct.unpack("!I", socket.inet_aton(addr))[0]
    except Exception as e:
        pass
        #print "Error: %s - %s" % (str(e), addr)
    return 0

maxcache = {}
def maxmind_lookup(ip):
    # Memoized lookup of an IP's ASN in the MaxMind low/high integer ranges
    if ip in maxcache:
        return maxcache[ip]
    i = ip2int(ip)
    if i == 0:
        return "UNKNOWN"
    results = list(maxmind.loc[(maxmind["low"] < i) & (maxmind['high'] > i)]['asn'])
    if len(results) > 0:
        maxcache[ip] = results[0]
        return results[0]
    maxcache[ip] = "UNKNOWN"
    return "UNKNOWN"

def box_plot_df_setup(series_a, series_b):
    # Count up all the times that a category from series_a
    # matches up with a category from series_b. This is
    # basically a gigantic contingency table.
    cont_table = collections.defaultdict(lambda: collections.Counter())
    for val_a, val_b in zip(series_a.values, series_b.values):
        cont_table[val_a][val_b] += 1
    # Create a dataframe with keys from series_a as the index,
    # series_b keys as the columns, and the counts as the values.
    dataframe = pd.DataFrame(cont_table.values(), index=cont_table.keys())
    dataframe.fillna(0, inplace=True)
    return dataframe
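# box_plot_df_setup in miniature (a toy sketch -- the 'tcp'/'udp' and 'http'/'dns'
# values are illustrative, not from the dataset): two aligned categorical series
# become a contingency-table dataframe, rows keyed by series_a, columns by series_b.
toy_a = pd.Series(['tcp', 'tcp', 'udp'])
toy_b = pd.Series(['http', 'dns', 'dns'])
print box_plot_df_setup(toy_a, toy_b)
# 'tcp' co-occurs once with 'http' and once with 'dns'; 'udp' once with 'dns'.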
def is_ip(ip):
    # True if the string parses as a dotted-quad IPv4 address
    try:
        socket.inet_aton(ip)
        return True
    except socket.error:
        return False

# Misc cleanup of the Bro conn.log dataframe: '-' means "no data" in Bro logs
try:
    conndf.orig_bytes[conndf.orig_bytes == '-'] = 0
except Exception as e:
    pass
try:
    conndf.resp_bytes[conndf.resp_bytes == '-'] = 0
except Exception as e:
    pass
conndf['orig_bytes'] = conndf['orig_bytes'].astype(long)
conndf['resp_bytes'] = conndf['resp_bytes'].astype(long)
conndf['total_bytes'] = conndf['orig_bytes'] + conndf['resp_bytes']

# ...and augmentation: tag each responder IP with its ASN
conndf['maxmind_asn'] = conndf['id.resp_h'].map(maxmind_lookup)

# Add a proper datetime column derived from the Bro timestamp
good_datetime = [datetime.fromtimestamp(float(date)) for date in conndf['ts'].values]
conndf['date'] = pd.Series(good_datetime, index=conndf.index)

# Re-index the dataframes after all the appends
conndf = conndf.reindex()
httpdf = httpdf.reindex()
dnsdf = dnsdf.reindex()
noticedf = noticedf.reindex()
filesdf = filesdf.reindex()
smtpdf = smtpdf.reindex()

# Plot connection counts over time, per sample, for the APT and CRIME threat tags
for threat in ['APT', 'CRIME']:
    subset = conndf[conndf['threat'] == threat][['date','sample']]
    subset['count'] = 1
    pivot = pd.pivot_table(subset, values='count', rows=['date'], cols=['sample'], fill_value=0)
    by = lambda x: lambda y: getattr(y, x)
    grouped = pivot.groupby([by('year'), by('month')]).sum()
    ax = grouped.plot()
    pylab.ylabel('Connections')
    pylab.xlabel('Date Recorded')
    patches, labels = ax.get_legend_handles_labels()
    ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.15), ncol=2, title="Sample Name")

print "Total Samples: %s" % conndf['sample'].nunique()
print ""
print "APT Samples: %s" % conndf[conndf['threat'] == 'APT']['sample'].nunique()
print "Crime Samples: %s" % conndf[conndf['threat'] == 'CRIME']['sample'].nunique()
print "Metasploit Samples: %s" % conndf[conndf['threat'] == 'METASPLOIT']['sample'].nunique()
print ""
print "Connection Log Entries: %s" % conndf.shape[0]
print "DNS Log Entries: %s" % dnsdf.shape[0]
print "HTTP Log Entries: %s" % httpdf.shape[0]
print "Files Log Entries: %s" % filesdf.shape[0]
print "SMTP Log Entries: %s" % smtpdf.shape[0]
print "Weird Log Entries: %s" % weirddf.shape[0]
print "SSL Log Entries: %s" % ssldf.shape[0]
print "Notice Log Entries: %s" % noticedf.shape[0]
print "Tunnel Log Entries: %s" % tunneldf.shape[0]
print "Signature Log Entries: %s" % sigdf.shape[0]
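# The by('year')/by('month') trick above is just a closure that pulls attributes
# off each datetime in the pivot's index; a minimal standalone illustration
# (the date is illustrative, not from the dataset):
by = lambda x: lambda y: getattr(y, x)
d = datetime(2013, 7, 4)
print by('year')(d), by('month')(d)    # 2013 7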
# Get all the destination addresses from all the signature hits; in this case there's only one.
sig_dst_ips = sigdf['dst_addr'].tolist()
sigdf[['dst_addr','dst_port','sig_id','sub_msg','threat','sample']]

# Let's see what other information we can gather about the network sessions surrounding that signature
for ip in sig_dst_ips:
    print "**** IP: %s ****" % ip
    print " ** Flow Information **"
    print conndf[conndf['id.resp_h'] == ip][['id.resp_p','proto','service','duration','conn_state','orig_ip_bytes','resp_ip_bytes']]
    print " ** HTTP Information **"
    print httpdf[httpdf['id.resp_h'] == ip][['method','host','uri','user_agent']]
    files = httpdf[httpdf['id.resp_h'] == ip]['orig_fuids']
    flist = files.append(httpdf[httpdf['id.resp_h'] == ip]['resp_fuids']).tolist()
    # We use SHA1 because that's what gets tossed in the Bro notice.log for the Team Cymru MHR alerts
    print " ** File SHA1 **"
    for f in flist:
        if f != '-':
            sha1 = filesdf[filesdf['fuid'] == f]['sha1'].tolist()
            for m in sha1:
                print "Sample Hash: %s" % m
                if noticedf[noticedf['sub'].str.contains(m)][['sub','sample']].shape[0] > 0:
                    print noticedf[noticedf['sub'].str.contains(m)][['sub','sample']]
                print "Filename: %s mime-type: %s" % (filesdf[filesdf['sha1'] == m]['filename'].tolist()[0],
                                                      filesdf[filesdf['sha1'] == m]['mime_type'].tolist()[0])
                print ""

print dnsdf.qtype_name.value_counts()

for q in dnsdf['qtype_name'].unique().tolist():
    print "Query Type: %s" % q
    print dnsdf[dnsdf['qtype_name'] == q]['query'].value_counts().head(5)
    print ""

dnsdf['rcode_name'].value_counts()
dnsdf[dnsdf['rcode_name'] == 'NXDOMAIN']['sample'].value_counts().head(10)

# Hostnames that show up in HTTP traffic but were never queried via DNS
# (and aren't bare IPs) are candidates for hard-coded hostnames.
intersect_hostnames = set(httpdf['host']).intersection(set(dnsdf['query']))
interesting = []
tempdf = pd.DataFrame()
for hn in list(set(httpdf['host'])):
    if hn not in intersect_hostnames and not is_ip(hn):
        #print hn
        interesting.append(hn)
        tempdf = tempdf.append(httpdf[httpdf['host'] == hn])
tempdf['count'] = 1
tempdf[['host','id.resp_h','sample','count']].groupby(['sample','host','id.resp_h']).sum().sort('count', ascending=0)

print dnsdf[dnsdf['query'] == "dgyqimolcqm.cm"]
print dnsdf[dnsdf.answers.str.contains('dgyqimolcqm.cm')]
print dnsdf[dnsdf['sample'] == "BIN_ZeroAccess_Sirefef_C2A9CCC8C6A6DF1CA1725F9"]['query'].value_counts().head(50)
httpdf[httpdf['host'] == "dgyqimolcqm.cm"][['id.orig_h','id.orig_p','id.resp_h','id.resp_p','uri','sample','threat']]
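# The set-intersection logic above in miniature (toy, hypothetical values): HTTP
# Host headers with no matching DNS query -- and which aren't bare IPs -- are
# exactly what lands in the "interesting" list.
toy_http_hosts = set(['evil.example.com', '10.0.0.5', 'cdn.example.net'])
toy_dns_queries = set(['cdn.example.net'])
print [h for h in toy_http_hosts - toy_dns_queries if not is_ip(h)]
# ['evil.example.com'] -- seen in HTTP traffic but never resolved via DNS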
print "%s Unique User-Agents in %s samples." % (httpdf['user_agent'].nunique(), httpdf['sample'].nunique())

tempdf = pd.DataFrame(columns=['sample','num_ua'])
for sample in list(set(httpdf['sample'])):
    tempdf = tempdf.append({'sample': sample,
                            'num_ua': httpdf[httpdf['sample'] == sample]['user_agent'].nunique()},
                           ignore_index=True)
tempdf.sort('num_ua', ascending=0).head()

# Well, at least we know what UA this sample uses for C2, and it seems we can see some other OS activity as well
tsample = 'purplehaze'
httpdf[httpdf['sample'] == tsample].user_agent.value_counts()

httpdf['count'] = 1
grouped = httpdf[httpdf['sample'] == tsample][['sample','user_agent','host','count']].groupby(['sample','user_agent','host']).sum()
grouped.sort('count', ascending=0).head(10)

tsample = 'BIN_dirtjumper_2011-10'
httpdf[httpdf['sample'] == tsample].user_agent.value_counts()
grouped = httpdf[httpdf['sample'] == tsample][['sample','user_agent','host','count']].groupby(['sample','host']).sum()
grouped.sort('count', ascending=0)
grouped = httpdf[httpdf['sample'] == tsample][['sample','user_agent','host','count']].groupby(['sample','user_agent','host']).sum()
grouped.sort('count', ascending=0)

conndf[conndf['maxmind_asn'] == "UNKNOWN"]['id.resp_h'].value_counts()

ax = box_plot_df_setup(conndf[conndf['threat'] == 'APT']['sample'],
                       conndf[conndf['threat'] == 'APT']['maxmind_asn']).T.plot(kind='bar', stacked=True)
pylab.ylabel('Sample Occurrences')
pylab.xlabel('ASN (Autonomous System Number)')
patches, labels = ax.get_legend_handles_labels()
ax.legend(patches, labels, bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., title="Sample Name")

conndf[conndf['sample'] == "BIN_8202_6d2c12085f0018daeb9c1a53e53fd4d1"][['maxmind_asn','id.resp_h']]

conndf['count'] = 1
grouped = conndf.groupby(['sample','id.resp_p']).sum()
grouped.sort('total_bytes', ascending=0).head(10)
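# The recurring df['count'] = 1 followed by groupby(...).sum() idiom above is just a
# manual frequency table; the same thing on toy rows (hypothetical values):
toy = pd.DataFrame({'sample': ['a', 'a', 'b'], 'user_agent': ['ua1', 'ua1', 'ua2']})
toy['count'] = 1
print toy.groupby(['sample', 'user_agent']).sum()
# sample 'a' used 'ua1' twice; sample 'b' used 'ua2' once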
# That port 1935 from above might be interesting -- where's it going?
conndf[conndf['id.resp_p'] == 1935][['id.resp_h','proto']]
# Same with port 336
conndf[conndf['id.resp_p'] == 336][['id.resp_h','proto']]

smtpdf.sample.value_counts()

print "Unique hosts found in the HELO portion of SMTP traffic: %s" % smtpdf.helo.nunique()
print ""
print "Some examples:"
print smtpdf.helo.value_counts().head(10)

smtpdf['count'] = 1
grouped = smtpdf[smtpdf['from'] != "-"][['from','subject','count']].groupby(['from','subject']).sum()
grouped.sort('count', ascending=0).head(20)

ax = box_plot_df_setup(filesdf['source'], filesdf['mime_type']).T.plot(kind='bar', stacked=True)
pylab.xlabel('Mime-Type')
pylab.ylabel('Number of Files')
patches, labels = ax.get_legend_handles_labels()
ax.legend(patches, labels, title="Service Type")

# Same plot, with the dominant text/html and text/plain mime-types filtered out
ax = box_plot_df_setup(filesdf.loc[(filesdf["mime_type"] != 'text/html') & (filesdf['mime_type'] != 'text/plain')]['source'],
                       filesdf.loc[(filesdf["mime_type"] != 'text/html') & (filesdf['mime_type'] != 'text/plain')]['mime_type']).T.plot(kind='bar', stacked=True)
pylab.xlabel('Mime-Type')
pylab.ylabel('Number of Files')
patches, labels = ax.get_legend_handles_labels()
ax.legend(patches, labels, title="Service Type")

filesdf['count'] = 1
filesdf[filesdf['filename'] != '-'][['source','mime_type','seen_bytes','count']].groupby(['source','mime_type']).sum().sort('count', ascending=0).head(10)
filesdf[filesdf['filename'] != '-'][['source','mime_type','filename','count']].groupby(['source','mime_type','filename']).sum().sort('count', ascending=0).head(10)
filesdf[filesdf['filename'] != '-'][['sample','mime_type','filename','count']].groupby(['sample','mime_type','filename']).sum().sort('count', ascending=0).head(10)

noticedf['count'] = 1
noticedf[['note','msg','count']].groupby(['note','msg']).sum().sort('count', ascending=0)
# We can get a slightly different look at the world by throwing some ports into the mix! Looks like we might have some winners here.
noticedf[['note','msg','id.resp_p','count']].groupby(['note','msg','id.resp_p']).sum().sort('count', ascending=0)
noticedf[noticedf['note'] == 'Scan::Address_Scan']['sample']

ssldf['id.resp_p'].value_counts()
ssldf.subject.value_counts().head(10)
ssldf['count'] = 1
ssldf[['version','cipher','count']].groupby(['version','cipher']).sum().sort('count', ascending=0)
ssldf[['sample','server_name','id.resp_p','count']].groupby(['sample','id.resp_p','server_name']).sum().sort('count', ascending=0)

# Build a nested sample -> port -> server_name hierarchy and dump it as JSON
data = {'name': 'ssl'}
samples = list(set(ssldf['sample'].tolist()))
data['children'] = list()
sampleindex = 0
for sample in samples:
    data['children'].append({'name': sample, 'children': list()})
    ports = set(ssldf[ssldf['sample'] == sample]['id.resp_p'].tolist())
    portindex = 0
    for port in ports:
        data['children'][sampleindex]['children'].append({'name': str(port), 'children': list()})
        hostnames = set(list(ssldf.loc[(ssldf['id.resp_p'] == int(port)) & (ssldf['sample'] == sample)]['server_name']))
        for hostname in hostnames:
            data['children'][sampleindex]['children'][portindex]['children'].append({'name': hostname, 'size': 1})
        portindex += 1
    sampleindex += 1
json.dump(data, open('ssl.json', 'w'))
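# The shape of the hierarchy serialized to ssl.json above, shown on hypothetical
# sample/port/hostname values (a minimal sketch of the sample -> port -> server_name nesting):
toy = {'name': 'ssl', 'children': [
    {'name': 'sample_x', 'children': [
        {'name': '443', 'children': [
            {'name': 'host.example.com', 'size': 1}]}]}]}
print json.dumps(toy, indent=2)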
# Ports per sample
ax = box_plot_df_setup(ssldf['id.resp_p'], ssldf['sample']).T.plot(kind='bar', stacked=True)
pylab.ylabel('Total # of connections')
pylab.xlabel('Samples')
patches, labels = ax.get_legend_handles_labels()
ax.legend(patches, labels, bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., title="Port")

# Or, as you might see it in an operational sense: ports per source IP
ax = box_plot_df_setup(ssldf['id.resp_p'], ssldf['id.orig_h']).T.plot(kind='bar', stacked=True)
pylab.ylabel('Total # of connections')
pylab.xlabel('Source IP')
patches, labels = ax.get_legend_handles_labels()
ax.legend(patches, labels, bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., title="Port")

weirddf.name.value_counts()