# All the imports and some basic level setting with various versions
from __future__ import division

import IPython
import os
import pylab
import string
import pickle
import collections
import numpy as np
import pandas as pd
import matplotlib as plt  # note: this aliases the top-level matplotlib package, so pyplot calls below are plt.pyplot.*

print "IPython version: %s" %IPython.__version__
print "pandas version: %s" %pd.__version__
print "numpy version: %s" %np.__version__
print "matplotlib version: %s" %plt.__version__

%matplotlib inline
pylab.rcParams['figure.figsize'] = (16.0, 5.0)

# Mapping of fields of the files we want to read in and initial setup of pandas dataframes.
# Borrowed from another notebook; this time we're just going to focus on notice and files for
# starters, but the rest are here when we need 'em
logs_to_process = {
    'conn.log'       : ['ts','uid','id.orig_h','id.orig_p','id.resp_h','id.resp_p','proto','service','duration','orig_bytes','resp_bytes','conn_state','local_orig','missed_bytes','history','orig_pkts','orig_ip_bytes','resp_pkts','resp_ip_bytes','tunnel_parents','sample'],
    'dns.log'        : ['ts','uid','id.orig_h','id.orig_p','id.resp_h','id.resp_p','proto','trans_id','query','qclass','qclass_name','qtype','qtype_name','rcode','rcode_name','AA','TC','RD','RA','Z','answers','TTLs','rejected','sample'],
    'files.log'      : ['ts','fuid','tx_hosts','rx_hosts','conn_uids','source','depth','analyzers','mime_type','filename','duration','local_orig','is_orig','seen_bytes','total_bytes','missing_bytes','overflow_bytes','timedout','parent_fuid','md5','sha1','sha256','extracted','sample'],
    'ftp.log'        : ['ts','uid','id.orig_h','id.orig_p','id.resp_h','id.resp_p','user','password','command','arg','mime_type','file_size','reply_code','reply_msg','data_channel.passive','data_channel.orig_h','data_channel.resp_h','data_channel.resp_p','fuid','sample'],
    'http.log'       : ['ts','uid','id.orig_h','id.orig_p','id.resp_h','id.resp_p','trans_depth','method','host','uri','referrer','user_agent','request_body_len','response_body_len','status_code','status_msg','info_code','info_msg','filename','tags','username','password','proxied','orig_fuids','orig_mime_types','resp_fuids','resp_mime_types','sample'],
    'irc.log'        : ['ts','uid','id.orig_h','id.orig_p','id.resp_h','id.resp_p','nick','user','command','value','addl','dcc_file_name','dcc_file_size','dcc_mime_type','fuid','sample'],
    'notice.log'     : ['ts','uid','id.orig_h','id.orig_p','id.resp_h','id.resp_p','fuid','file_mime_type','file_desc','proto','note','msg','sub','src','dst','p','n','peer_descr','actions','suppress_for','dropped','remote_location.country_code','remote_location.region','remote_location.city','remote_location.latitude','remote_location.longitude','sample'],
    'signatures.log' : ['ts','src_addr','src_port','dst_addr','dst_port','note','sig_id','event_msg','sub_msg','sig_count','host_count','sample'],
    'smtp.log'       : ['ts','uid','id.orig_h','id.orig_p','id.resp_h','id.resp_p','trans_depth','helo','mailfrom','rcptto','date','from','to','reply_to','msg_id','in_reply_to','subject','x_originating_ip','first_received','second_received','last_reply','path','user_agent','fuids','is_webmail','sample'],
    'ssl.log'        : ['ts','uid','id.orig_h','id.orig_p','id.resp_h','id.resp_p','version','cipher','server_name','session_id','subject','issuer_subject','not_valid_before','not_valid_after','last_alert','client_subject','client_issuer_subject','cert_hash','validation_status','sample'],
    'tunnel.log'     : ['ts','uid','id.orig_h','id.orig_p','id.resp_h','id.resp_p','tunnel_type','action','sample'],
    'weird.log'      : ['ts','uid','id.orig_h','id.orig_p','id.resp_h','id.resp_p','name','addl','notice','peer','sample']
}
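# Aside: why skiprows=8 and skipfooter=1 work in the read loop below. A standard Bro ASCII log
# starts with 8 '#'-prefixed header lines (#separator, #set_separator, #empty_field, #unset_field,
# #path, #open, #fields, #types) and ends with a single #close line. Rather than hardcoding column
# names, you could also lift them straight from the #fields header. This helper is a sketch of
# that idea (my addition, not part of the original notebook):
def bro_log_columns(path):
    # Return the column names declared in a Bro log's '#fields' header line, or None
    with open(path) as f:
        for line in f:
            if line.startswith('#fields'):
                # '#fields\tts\tuid\t...' -> ['ts', 'uid', ...]
                return line.rstrip('\n').split('\t')[1:]
    return None

#print bro_log_columns('../some_sample/files.log')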
conndf   = pd.DataFrame(columns=logs_to_process['conn.log'])
dnsdf    = pd.DataFrame(columns=logs_to_process['dns.log'])
filesdf  = pd.DataFrame(columns=logs_to_process['files.log'])
ftpdf    = pd.DataFrame(columns=logs_to_process['ftp.log'])
httpdf   = pd.DataFrame(columns=logs_to_process['http.log'])
ircdf    = pd.DataFrame(columns=logs_to_process['irc.log'])
noticedf = pd.DataFrame(columns=logs_to_process['notice.log'])
sigdf    = pd.DataFrame(columns=logs_to_process['signatures.log'])  # was missing, but the loop below references it
smtpdf   = pd.DataFrame(columns=logs_to_process['smtp.log'])
ssldf    = pd.DataFrame(columns=logs_to_process['ssl.log'])
tunneldf = pd.DataFrame(columns=logs_to_process['tunnel.log'])      # was missing, but the loop below references it
weirddf  = pd.DataFrame(columns=logs_to_process['weird.log'])

process_files = ['notice.log','files.log']

for dirName, subdirList, fileList in os.walk('..'):
    for fname in fileList:
        tags = dirName.split('/')
        if len(tags) == 2 and fname in logs_to_process:
            try:
                if fname in process_files:
                    #print "Processing %s - %s" %(tags[1], fname)
                    # Skip the 8 '#'-prefixed Bro header lines and the trailing '#close' line
                    tempdf = pd.read_csv(dirName+'/'+fname, sep='\t', skiprows=8, header=None,
                                         names=logs_to_process[fname][:-1], skipfooter=1)
                    tempdf['sample'] = tags[1]
                    if fname == 'conn.log':
                        conndf = conndf.append(tempdf)
                    if fname == 'dns.log':
                        dnsdf = dnsdf.append(tempdf)
                    if fname == 'files.log':
                        filesdf = filesdf.append(tempdf)
                    if fname == 'ftp.log':
                        ftpdf = ftpdf.append(tempdf)
                    if fname == 'http.log':
                        httpdf = httpdf.append(tempdf)
                    if fname == 'notice.log':
                        noticedf = noticedf.append(tempdf)
                    if fname == 'signatures.log':
                        sigdf = sigdf.append(tempdf)
                    if fname == 'smtp.log':
                        smtpdf = smtpdf.append(tempdf)
                    if fname == 'ssl.log':
                        ssldf = ssldf.append(tempdf)
                    if fname == 'tunnel.log':
                        tunneldf = tunneldf.append(tempdf)
                    if fname == 'weird.log':
                        weirddf = weirddf.append(tempdf)
            except Exception as e:
                print "[*] error: %s, on %s/%s" % (str(e), dirName, fname)

# You can use these to save a copy of the raw dataframe, because reading in the files over and over again is awful
#pickle.dump(filesdf, open('files.dataframe', 'wb'))
filesdf = pickle.load(open('files.dataframe', 'rb'))
#pickle.dump(noticedf, open('notice.dataframe', 'wb'))
noticedf = pickle.load(open('notice.dataframe', 'rb'))

noticedf.head(3)
filesdf.head()
noticedf.note.value_counts()

# Collect the hashes that notice.log flagged via VirusTotal lookups (the hash rides in the
# URL query string of the 'sub' field, hence the split on '=')
hashes = set()
def grab_hash(s):
    if 'virustotal' in s:
        hashes.add(s.split('=')[1])
    return ''

throwaway = noticedf['sub'].map(grab_hash)

def box_plot_df_setup(series_a, series_b):
    # Count up all the times that a category from series_a
    # matches up with a category from series_b. This is
    # basically a gigantic contingency table
    cont_table = collections.defaultdict(lambda: collections.Counter())
    for val_a, val_b in zip(series_a.values, series_b.values):
        cont_table[val_a][val_b] += 1

    # Create a dataframe with keys from series_a as the index,
    # series_b keys as the columns, and the counts as the values
    dataframe = pd.DataFrame(cont_table.values(), index=cont_table.keys())
    dataframe.fillna(0, inplace=True)
    return dataframe

ax = box_plot_df_setup(filesdf['source'], filesdf['mime_type']).T.plot(kind='bar', stacked=True)
pylab.xlabel('Mime-Type')
pylab.ylabel('Number of Files')
patches, labels = ax.get_legend_handles_labels()
ax.legend(patches, labels, title="Service Type")

filesdf.mime_type.value_counts().head()
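# Aside: box_plot_df_setup above is a hand-rolled contingency table, and pandas has a built-in
# that produces the same counts (row/column order may differ). A sketch of the equivalent,
# my addition rather than the original notebook's approach:
#ct = pd.crosstab(filesdf['source'], filesdf['mime_type'])
#ct.T.plot(kind='bar', stacked=True)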
print "Total # of files (across all samples): %s" %filesdf.shape[0] print "Total # of unique files: %s" %len(filesdf['sha1'].unique()) print "Total # of network sessions involving files: %s" %len(filesdf['conn_uids'].unique()) print "Total # of unique mime_types: %s" %len(filesdf['mime_type'].unique()) print "Total # of unique filenames: %s" %len(filesdf['filename'].unique()) # We can use some of the output from above and get rid of them and look are more exciting files # Just an example, I don't think we'll do much with this data frame today boring = set(['text/html','text/plain','image/jpeg','image/gif','image/png','application/xml','image/x-icon']) exciting_filesdf = filesdf[filesdf['mime_type'].apply(lambda x: x not in boring)] exciting_filesdf.head(2) filesdf['count'] = 1 filesdf[['source','mime_type','count']].groupby(['source','mime_type']).sum().sort('count', ascending=0).head(10) # We can get a slightly different view if we look at percentages of files # Wonder how accurate the percentages are vs. monitored network traffic? filesdf.groupby('source')['mime_type'].apply(lambda x: pd.value_counts(x)/x.count().astype(float)).head(20) filesdf['count'] = 1 filesdf[filesdf['filename'] != '-'][['source','mime_type','count']].groupby(['source','mime_type']).sum().sort('count', ascending=0).head(10) filesdf[filesdf['filename'] != '-'][['source','mime_type','filename','count']].groupby(['source','mime_type','filename']).sum().sort('count', ascending=0).head(20) # Filenames with a '/' in them?? # Just some random exploring, wonder why these have a path associated w/them and not the rest? Questions for another day. print filesdf[filesdf['filename'].str.contains('/')]['filename'].value_counts().head(10) print print filesdf[filesdf['filename'].str.contains('\.\.')]['filename'].value_counts() # Lots of duplicate files Wonder what these look like? 
# Lots of duplicate files. Wonder what these look like?
filesdf.md5.value_counts().head()
filesdf[filesdf['filename'] != '-'][['filename','mime_type','count']].groupby(['filename','mime_type']).sum().sort('count', ascending=0).head(10)
filesdf[filesdf['filename'] != '-'][['filename','md5','count']].groupby(['filename','md5']).sum().sort('count', ascending=0).head(10)

# Lookup to see what the Bro - Team Cymru Malware Hash Registry picks up
def tc_mhr_present_single(sha1):
    return sha1 in hashes

tempdf = filesdf
tempdf['count'] = 1
tempdf['mhr'] = tempdf['sha1'].map(tc_mhr_present_single)

# The following 2 commands print out the tables below
#tempdf[tempdf['filename'] != '-'][['mhr','filename','count']].groupby(['mhr','filename']).sum().sort('count', ascending=0).head(12)
#tempdf[tempdf['filename'] != '-'][['filename','mhr','count']].groupby(['filename','mhr']).sum().sort('count', ascending=0).head(12)
tempdf.groupby('mhr')['filename'].apply(lambda x: pd.value_counts(x)/len(tempdf.index))

# Number of files per sample
filesdf[['sample','count']].groupby(['sample']).sum().sort('count', ascending=0).head(10)
filesdf[['conn_uids','count']].groupby(['conn_uids']).sum().sort('count', ascending=0).head(10)
filesdf[['sample','conn_uids','count']].groupby(['sample','conn_uids']).sum().sort('count', ascending=0).head(20)

filesdf[filesdf['conn_uids'] == 'CVANI54q1eA8PJEPNl'].shape[0]
print filesdf[filesdf['conn_uids'] == 'CVANI54q1eA8PJEPNl']['sample'].unique()
print filesdf[filesdf['conn_uids'] == 'C9KyD7n902u8uko9j']['sample'].unique()

# These are so we can pick up combinations of executable/persistent mime-types along with a
# mime-type that is frequently associated with exploits/drive-bys
executable_types = set(['application/x-dosexec', 'application/octet-stream', 'binary', 'application/vnd.ms-cab-compressed'])
common_exploit_types = set(['application/x-java-applet','application/pdf','application/zip','application/jar','application/x-shockwave-flash'])

# If there is at least one executable type and one exploit type in a list of mime-types, report both
def interesting_combo_data(mimetypes):
    mt = set(mimetypes.tolist())
    et = mt.intersection(executable_types)
    cet = mt.intersection(common_exploit_types)
    if len(et) > 0 and len(cet) > 0:
        return ":".join(cet) + ":" + ":".join(et)
    if len(et) > 0 and len(cet) == 0:
        return ":".join(et)
    if len(cet) > 0 and len(et) == 0:
        return ":".join(cet)
    return "NONE"

def interesting_combo_label(mimetypes):
    mt = set(mimetypes.tolist())
    et = mt.intersection(executable_types)
    cet = mt.intersection(common_exploit_types)
    if len(et) > 0 and len(cet) > 0:
        return "C-c-c-combo"
    if len(et) > 0 and len(cet) == 0:
        return "Executable only"
    if len(cet) > 0 and len(et) == 0:
        return "Exploit only"
    return "NONE"

# Lookup to see what the Bro - Team Cymru Malware Hash Registry picks up
def tc_mhr_present(sha1):
    for h in set(sha1.tolist()):
        if h in hashes:
            return True
    return False
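# Aside: a quick sanity check of the combo logic above on hand-made inputs (toy data, my
# addition, not from the logs). A PDF plus a DOS executable should trip the combo label;
# an executable alone and a boring type alone should not:
print interesting_combo_label(pd.Series(['application/pdf','application/x-dosexec']))  # "C-c-c-combo"
print interesting_combo_data(pd.Series(['application/pdf','application/x-dosexec']))   # "application/pdf:application/x-dosexec"
print interesting_combo_label(pd.Series(['application/x-dosexec']))                    # "Executable only"
print interesting_combo_label(pd.Series(['text/html']))                                # "NONE"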
# Get the unique mime-types per sample as a Series of arrays
sample_groups = filesdf.groupby('sample')
s = sample_groups['mime_type'].apply(lambda x: x.unique())

# Rebuild the series into a dataframe and then "collapse" the dataframe with a reset index
sample_combos = pd.DataFrame(s, columns=['mime_types'])
sample_combos['sample'] = s.index
sample_combos['combos_data'] = s.map(interesting_combo_data)
sample_combos['combos_label'] = s.map(interesting_combo_label)

# Add some more columns and reset the index
sample_combos['sha1'] = sample_groups['sha1'].apply(lambda x: x.unique())
sample_combos['num_files'] = sample_groups['sha1'].apply(lambda x: len(x))
sample_combos = sample_combos.reset_index(drop=True)
sample_combos['mhr'] = sample_combos['sha1'].map(tc_mhr_present)
sample_combos.head()

# Now we have a nice "flat" dataframe that, for each sample, has the list of sha1s and mime-types
# associated with it, including the list of interesting mime-type combinations and whether any of
# the sha1 hashes were picked up in the MHR
print "Reminder"
print "Total files found: %s" %len(filesdf.index)
print "Total Samples: %s" %len(sample_combos.index)
print "\nData summary"
print sample_combos.combos_label.value_counts()

sample_combos['count'] = 1
sample_combos[['combos_label','combos_data','count']].groupby(['combos_label','combos_data']).sum().sort('count', ascending=0).head(15)

(100. * sample_combos.combos_data.value_counts() / len(sample_combos.index)).head(10)
# Sorry for the lousy formatting; I really wanted me (and you) to see all the crazy combinations
# of files in some of these samples. Maybe they're multiple malicious sites, or maybe it's just
# some crazy spray-and-pray happening!

sample_mhr = sample_combos[sample_combos['mhr'] == True]
print "Total Samples: %s" %len(sample_mhr.index)
print
print sample_mhr.combos_label.value_counts()
print
print (100. * sample_mhr.combos_data.value_counts() / len(sample_mhr.index)).head(10)

# Get the unique mime-types per connection uid as a Series of arrays
uid_groups = filesdf.groupby('conn_uids')
s = uid_groups['mime_type'].apply(lambda x: x.unique())

# Rebuild the series into a dataframe and then "collapse" the dataframe with a reset index
uid_combos = pd.DataFrame(s, columns=['mime_types'])
uid_combos['conn_uid'] = s.index
uid_combos['combos_data'] = s.map(interesting_combo_data)
uid_combos['combos_label'] = s.map(interesting_combo_label)

# Same trick, different day
uid_combos['sha1'] = uid_groups['sha1'].apply(lambda x: x.unique())
uid_combos['num_files'] = uid_groups['sha1'].apply(lambda x: len(x))
uid_combos = uid_combos.reset_index(drop=True)
uid_combos = uid_combos[uid_combos['conn_uid'] != '(empty)']
uid_combos['mhr'] = uid_combos['sha1'].map(tc_mhr_present)
uid_combos.head()
# Now we've got the same type of dataframe as above in sample_combos.

# Same deal as above, these 2 make the same tables as the images below
#uid_combos.describe()
#sample_combos.describe()

df_uid = pd.DataFrame()
df_uid['num_files'] = uid_combos['num_files']
df_uid['label'] = "session"
df_sample = pd.DataFrame()
df_sample['num_files'] = sample_combos['num_files']
df_sample['label'] = "sample"
df = pd.concat([df_sample, df_uid], ignore_index=True)

df.boxplot('num_files','label',vert=False)
plt.pyplot.xlabel('Number of Files, Session v. Sample')
plt.pyplot.ylabel('# of Files')
plt.pyplot.title('Comparison of # Files')
plt.pyplot.suptitle("")

print "Total connections: %s" %len(uid_combos.index)
#100. * uid_combos.combos.value_counts() / len(uid_combos.index)
uid_combos['count'] = 1
uid_combos[['combos_label','combos_data','count']].groupby(['combos_label','combos_data']).sum().sort('count', ascending=0).head(15)

uid_mhr = uid_combos[uid_combos['mhr'] == True]
print "Total connections: %s" %len(uid_mhr.index)
print
print uid_mhr.combos_label.value_counts()
print 100. * uid_mhr.combos_data.value_counts() / len(uid_mhr.index)
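# Aside: the "percent of total" computation keeps coming up in these cells. A tiny helper
# (my addition, not in the original) that captures it once; with the __future__ division
# import above, the division is true division:
def pct_of_total(series):
    # value_counts scaled to percentages of the total number of rows behind the series
    return 100. * series.value_counts() / len(series.index)

#print pct_of_total(uid_mhr.combos_data)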
uid_mhr = uid_combos[uid_combos['mhr'] != True]
print uid_mhr.combos_label.value_counts()
print uid_mhr[uid_mhr['combos_label'] == 'C-c-c-combo']['combos_data'].value_counts()

combos = uid_combos[uid_combos['combos_label'] == 'C-c-c-combo'].shape[0]
print "% of MHR hits in sessions with an \"interesting\" file combination"
print uid_combos[uid_combos['combos_label'] == 'C-c-c-combo']['mhr'].value_counts().apply(lambda x: x/combos)

print "\n% of MHR hits from samples (end systems) with an \"interesting\" file combination"
combos = sample_combos[sample_combos['combos_label'] == 'C-c-c-combo'].shape[0]
print sample_combos[sample_combos['combos_label'] == 'C-c-c-combo']['mhr'].value_counts().apply(lambda x: x/combos)
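# Aside: those last two cells ask the same conditional-rate question of two dataframes,
# P(MHR hit | interesting combo). A small wrapper (my naming, not the original's) that makes
# the shared structure explicit:
def mhr_rate_given_label(df, label):
    # True/False breakdown of MHR hits among rows carrying the given combos_label
    subset = df[df['combos_label'] == label]
    return subset['mhr'].value_counts().apply(lambda x: x / subset.shape[0])

#print mhr_rate_given_label(uid_combos, 'C-c-c-combo')
#print mhr_rate_given_label(sample_combos, 'C-c-c-combo')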