# All the imports and some basic level setting with various versions
from __future__ import division

import IPython
import os
import pylab
import string
import pickle
import collections
import numpy as np
import pandas as pd
import matplotlib as plt  # note: this aliases the top-level matplotlib package, so pyplot calls below are plt.pyplot.*

print "IPython version: %s" %IPython.__version__
print "pandas version: %s" %pd.__version__
print "numpy version: %s" %np.__version__
print "matplotlib version: %s" %plt.__version__

%matplotlib inline
pylab.rcParams['figure.figsize'] = (16.0, 5.0)

# Mapping of fields of the files we want to read in and initial setup of pandas dataframes.
# Borrowed from another notebook; this time we're just going to focus on notice and files for
# starters, but the rest are here when we need 'em
logs_to_process = {
    'conn.log'       : ['ts','uid','id.orig_h','id.orig_p','id.resp_h','id.resp_p','proto','service','duration','orig_bytes','resp_bytes','conn_state','local_orig','missed_bytes','history','orig_pkts','orig_ip_bytes','resp_pkts','resp_ip_bytes','tunnel_parents','sample'],
    'dns.log'        : ['ts','uid','id.orig_h','id.orig_p','id.resp_h','id.resp_p','proto','trans_id','query','qclass','qclass_name','qtype','qtype_name','rcode','rcode_name','AA','TC','RD','RA','Z','answers','TTLs','rejected','sample'],
    'files.log'      : ['ts','fuid','tx_hosts','rx_hosts','conn_uids','source','depth','analyzers','mime_type','filename','duration','local_orig','is_orig','seen_bytes','total_bytes','missing_bytes','overflow_bytes','timedout','parent_fuid','md5','sha1','sha256','extracted','sample'],
    'ftp.log'        : ['ts','uid','id.orig_h','id.orig_p','id.resp_h','id.resp_p','user','password','command','arg','mime_type','file_size','reply_code','reply_msg','data_channel.passive','data_channel.orig_h','data_channel.resp_h','data_channel.resp_p','fuid','sample'],
    'http.log'       : ['ts','uid','id.orig_h','id.orig_p','id.resp_h','id.resp_p','trans_depth','method','host','uri','referrer','user_agent','request_body_len','response_body_len','status_code','status_msg','info_code','info_msg','filename','tags','username','password','proxied','orig_fuids','orig_mime_types','resp_fuids','resp_mime_types','sample'],
    'irc.log'        : ['ts','uid','id.orig_h','id.orig_p','id.resp_h','id.resp_p','nick','user','command','value','addl','dcc_file_name','dcc_file_size','dcc_mime_type','fuid','sample'],
    'notice.log'     : ['ts','uid','id.orig_h','id.orig_p','id.resp_h','id.resp_p','fuid','file_mime_type','file_desc','proto','note','msg','sub','src','dst','p','n','peer_descr','actions','suppress_for','dropped','remote_location.country_code','remote_location.region','remote_location.city','remote_location.latitude','remote_location.longitude','sample'],
    'signatures.log' : ['ts','src_addr','src_port','dst_addr','dst_port','note','sig_id','event_msg','sub_msg','sig_count','host_count','sample'],
    'smtp.log'       : ['ts','uid','id.orig_h','id.orig_p','id.resp_h','id.resp_p','trans_depth','helo','mailfrom','rcptto','date','from','to','reply_to','msg_id','in_reply_to','subject','x_originating_ip','first_received','second_received','last_reply','path','user_agent','fuids','is_webmail','sample'],
    'ssl.log'        : ['ts','uid','id.orig_h','id.orig_p','id.resp_h','id.resp_p','version','cipher','server_name','session_id','subject','issuer_subject','not_valid_before','not_valid_after','last_alert','client_subject','client_issuer_subject','cert_hash','validation_status','sample'],
    'tunnel.log'     : ['ts','uid','id.orig_h','id.orig_p','id.resp_h','id.resp_p','tunnel_type','action','sample'],
    'weird.log'      : ['ts','uid','id.orig_h','id.orig_p','id.resp_h','id.resp_p','name','addl','notice','peer','sample']
}
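# Aside: why skiprows=8 and skipfooter=1 work in the read loop below. A standard Bro ASCII log
# starts with 8 '#'-prefixed header lines (#separator, #set_separator, #empty_field, #unset_field,
# #path, #open, #fields, #types) and ends with a single #close line. Rather than hardcoding column
# names, you could also lift them straight from the #fields header. This helper is a sketch of
# that idea (my addition, not part of the original notebook):
def bro_log_columns(path):
    # Return the column names declared in a Bro log's '#fields' header line, or None
    with open(path) as f:
        for line in f:
            if line.startswith('#fields'):
                # '#fields\tts\tuid\t...' -> ['ts', 'uid', ...]
                return line.rstrip('\n').split('\t')[1:]
    return None

#print bro_log_columns('../some_sample/files.log')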
conndf   = pd.DataFrame(columns=logs_to_process['conn.log'])
dnsdf    = pd.DataFrame(columns=logs_to_process['dns.log'])
filesdf  = pd.DataFrame(columns=logs_to_process['files.log'])
ftpdf    = pd.DataFrame(columns=logs_to_process['ftp.log'])
httpdf   = pd.DataFrame(columns=logs_to_process['http.log'])
ircdf    = pd.DataFrame(columns=logs_to_process['irc.log'])
noticedf = pd.DataFrame(columns=logs_to_process['notice.log'])
sigdf    = pd.DataFrame(columns=logs_to_process['signatures.log'])  # was missing, but the loop below references it
smtpdf   = pd.DataFrame(columns=logs_to_process['smtp.log'])
ssldf    = pd.DataFrame(columns=logs_to_process['ssl.log'])
tunneldf = pd.DataFrame(columns=logs_to_process['tunnel.log'])      # was missing, but the loop below references it
weirddf  = pd.DataFrame(columns=logs_to_process['weird.log'])

process_files = ['notice.log','files.log']

for dirName, subdirList, fileList in os.walk('..'):
    for fname in fileList:
        tags = dirName.split('/')
        if len(tags) == 2 and fname in logs_to_process:
            try:
                if fname in process_files:
                    #print "Processing %s - %s" %(tags[1], fname)
                    # Skip the 8 '#'-prefixed Bro header lines and the trailing '#close' line
                    tempdf = pd.read_csv(dirName+'/'+fname, sep='\t', skiprows=8, header=None,
                                         names=logs_to_process[fname][:-1], skipfooter=1)
                    tempdf['sample'] = tags[1]
                    if fname == 'conn.log':
                        conndf = conndf.append(tempdf)
                    if fname == 'dns.log':
                        dnsdf = dnsdf.append(tempdf)
                    if fname == 'files.log':
                        filesdf = filesdf.append(tempdf)
                    if fname == 'ftp.log':
                        ftpdf = ftpdf.append(tempdf)
                    if fname == 'http.log':
                        httpdf = httpdf.append(tempdf)
                    if fname == 'notice.log':
                        noticedf = noticedf.append(tempdf)
                    if fname == 'signatures.log':
                        sigdf = sigdf.append(tempdf)
                    if fname == 'smtp.log':
                        smtpdf = smtpdf.append(tempdf)
                    if fname == 'ssl.log':
                        ssldf = ssldf.append(tempdf)
                    if fname == 'tunnel.log':
                        tunneldf = tunneldf.append(tempdf)
                    if fname == 'weird.log':
                        weirddf = weirddf.append(tempdf)
            except Exception as e:
                print "[*] error: %s, on %s/%s" % (str(e), dirName, fname)

# You can use these to save a copy of the raw dataframe, because reading in the files over and over again is awful
#pickle.dump(filesdf, open('files.dataframe', 'wb'))
filesdf = pickle.load(open('files.dataframe', 'rb'))
#pickle.dump(noticedf, open('notice.dataframe', 'wb'))
noticedf = pickle.load(open('notice.dataframe', 'rb'))

noticedf.head(3)
filesdf.head()
noticedf.note.value_counts()

# Collect the hashes that notice.log flagged via VirusTotal lookups (the hash rides in the
# URL query string of the 'sub' field, hence the split on '=')
hashes = set()
def grab_hash(s):
    if 'virustotal' in s:
        hashes.add(s.split('=')[1])
    return ''

throwaway = noticedf['sub'].map(grab_hash)

def box_plot_df_setup(series_a, series_b):
    # Count up all the times that a category from series_a
    # matches up with a category from series_b. This is
    # basically a gigantic contingency table
    cont_table = collections.defaultdict(lambda: collections.Counter())
    for val_a, val_b in zip(series_a.values, series_b.values):
        cont_table[val_a][val_b] += 1

    # Create a dataframe with keys from series_a as the index,
    # series_b keys as the columns, and the counts as the values
    dataframe = pd.DataFrame(cont_table.values(), index=cont_table.keys())
    dataframe.fillna(0, inplace=True)
    return dataframe

ax = box_plot_df_setup(filesdf['source'], filesdf['mime_type']).T.plot(kind='bar', stacked=True)
pylab.xlabel('Mime-Type')
pylab.ylabel('Number of Files')
patches, labels = ax.get_legend_handles_labels()
ax.legend(patches, labels, title="Service Type")

filesdf.mime_type.value_counts().head()
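# Aside: box_plot_df_setup above is a hand-rolled contingency table, and pandas has a built-in
# that produces the same counts (row/column order may differ). A sketch of the equivalent,
# my addition rather than the original notebook's approach:
#ct = pd.crosstab(filesdf['source'], filesdf['mime_type'])
#ct.T.plot(kind='bar', stacked=True)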
print "Total # of files (across all samples): %s" %filesdf.shape[0] print "Total # of unique files: %s" %len(filesdf['sha1'].unique()) print "Total # of network sessions involving files: %s" %len(filesdf['conn_uids'].unique()) print "Total # of unique mime_types: %s" %len(filesdf['mime_type'].unique()) print "Total # of unique filenames: %s" %len(filesdf['filename'].unique()) # We can use some of the output from above and get rid of them and look are more exciting files # Just an example, I don't think we'll do much with this data frame today boring = set(['text/html','text/plain','image/jpeg','image/gif','image/png','application/xml','image/x-icon']) exciting_filesdf = filesdf[filesdf['mime_type'].apply(lambda x: x not in boring)] exciting_filesdf.head(2) filesdf['count'] = 1 filesdf[['source','mime_type','count']].groupby(['source','mime_type']).sum().sort('count', ascending=0).head(10) # We can get a slightly different view if we look at percentages of files # Wonder how accurate the percentages are vs. monitored network traffic? filesdf.groupby('source')['mime_type'].apply(lambda x: pd.value_counts(x)/x.count().astype(float)).head(20) filesdf['count'] = 1 filesdf[filesdf['filename'] != '-'][['source','mime_type','count']].groupby(['source','mime_type']).sum().sort('count', ascending=0).head(10) filesdf[filesdf['filename'] != '-'][['source','mime_type','filename','count']].groupby(['source','mime_type','filename']).sum().sort('count', ascending=0).head(20) # Filenames with a '/' in them?? # Just some random exploring, wonder why these have a path associated w/them and not the rest? Questions for another day. print filesdf[filesdf['filename'].str.contains('/')]['filename'].value_counts().head(10) print print filesdf[filesdf['filename'].str.contains('\.\.')]['filename'].value_counts() # Lots of duplicate files Wonder what these look like? 
# Lots of duplicate files. Wonder what these look like?
filesdf.md5.value_counts().head()
filesdf[filesdf['filename'] != '-'][['filename','mime_type','count']].groupby(['filename','mime_type']).sum().sort('count', ascending=0).head(10)
filesdf[filesdf['filename'] != '-'][['filename','md5','count']].groupby(['filename','md5']).sum().sort('count', ascending=0).head(10)

# Lookup to see what the Bro - Team Cymru Malware Hash Registry picks up
def tc_mhr_present_single(sha1):
    return sha1 in hashes

tempdf = filesdf
tempdf['count'] = 1
tempdf['mhr'] = tempdf['sha1'].map(tc_mhr_present_single)

# The following 2 commands print out the tables below
#tempdf[tempdf['filename'] != '-'][['mhr','filename','count']].groupby(['mhr','filename']).sum().sort('count', ascending=0).head(12)
#tempdf[tempdf['filename'] != '-'][['filename','mhr','count']].groupby(['filename','mhr']).sum().sort('count', ascending=0).head(12)
tempdf.groupby('mhr')['filename'].apply(lambda x: pd.value_counts(x)/len(tempdf.index))

# Number of files per sample
filesdf[['sample','count']].groupby(['sample']).sum().sort('count', ascending=0).head(10)
filesdf[['conn_uids','count']].groupby(['conn_uids']).sum().sort('count', ascending=0).head(10)
filesdf[['sample','conn_uids','count']].groupby(['sample','conn_uids']).sum().sort('count', ascending=0).head(20)

filesdf[filesdf['conn_uids'] == 'CVANI54q1eA8PJEPNl'].shape[0]
print filesdf[filesdf['conn_uids'] == 'CVANI54q1eA8PJEPNl']['sample'].unique()
print filesdf[filesdf['conn_uids'] == 'C9KyD7n902u8uko9j']['sample'].unique()

# These are so we can pick up combinations of executable/persistent mime-types along with a
# mime-type that is frequently associated with exploits/drive-bys
executable_types = set(['application/x-dosexec', 'application/octet-stream', 'binary', 'application/vnd.ms-cab-compressed'])
common_exploit_types = set(['application/x-java-applet','application/pdf','application/zip','application/jar','application/x-shockwave-flash'])

# If there is at least one executable type and one exploit type in a list of mime-types, report both
def interesting_combo_data(mimetypes):
    mt = set(mimetypes.tolist())
    et = mt.intersection(executable_types)
    cet = mt.intersection(common_exploit_types)
    if len(et) > 0 and len(cet) > 0:
        return ":".join(cet) + ":" + ":".join(et)
    if len(et) > 0 and len(cet) == 0:
        return ":".join(et)
    if len(cet) > 0 and len(et) == 0:
        return ":".join(cet)
    return "NONE"

def interesting_combo_label(mimetypes):
    mt = set(mimetypes.tolist())
    et = mt.intersection(executable_types)
    cet = mt.intersection(common_exploit_types)
    if len(et) > 0 and len(cet) > 0:
        return "C-c-c-combo"
    if len(et) > 0 and len(cet) == 0:
        return "Executable only"
    if len(cet) > 0 and len(et) == 0:
        return "Exploit only"
    return "NONE"

# Lookup to see what the Bro - Team Cymru Malware Hash Registry picks up
def tc_mhr_present(sha1):
    for h in set(sha1.tolist()):
        if h in hashes:
            return True
    return False
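# Aside: a quick sanity check of the combo logic above on hand-made inputs (toy data, my
# addition, not from the logs). A PDF plus a DOS executable should trip the combo label;
# an executable alone and a boring type alone should not:
print interesting_combo_label(pd.Series(['application/pdf','application/x-dosexec']))  # "C-c-c-combo"
print interesting_combo_data(pd.Series(['application/pdf','application/x-dosexec']))   # "application/pdf:application/x-dosexec"
print interesting_combo_label(pd.Series(['application/x-dosexec']))                    # "Executable only"
print interesting_combo_label(pd.Series(['text/html']))                                # "NONE"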
# Get the unique mime-types per sample as a Series of arrays
sample_groups = filesdf.groupby('sample')
s = sample_groups['mime_type'].apply(lambda x: x.unique())

# Rebuild the series into a dataframe and then "collapse" the dataframe with a reset index
sample_combos = pd.DataFrame(s, columns=['mime_types'])
sample_combos['sample'] = s.index
sample_combos['combos_data'] = s.map(interesting_combo_data)
sample_combos['combos_label'] = s.map(interesting_combo_label)

# Add some more columns and reset the index
sample_combos['sha1'] = sample_groups['sha1'].apply(lambda x: x.unique())
sample_combos['num_files'] = sample_groups['sha1'].apply(lambda x: len(x))
sample_combos = sample_combos.reset_index(drop=True)
sample_combos['mhr'] = sample_combos['sha1'].map(tc_mhr_present)
sample_combos.head()

# Now we have a nice "flat" dataframe that, for each sample, has the list of sha1s and mime-types
# associated with it, including the list of interesting mime-type combinations and whether any of
# the sha1 hashes were picked up in the MHR
print "Reminder"
print "Total files found: %s" %len(filesdf.index)
print "Total Samples: %s" %len(sample_combos.index)
print "\nData summary"
print sample_combos.combos_label.value_counts()

sample_combos['count'] = 1
sample_combos[['combos_label','combos_data','count']].groupby(['combos_label','combos_data']).sum().sort('count', ascending=0).head(15)

(100. * sample_combos.combos_data.value_counts() / len(sample_combos.index)).head(10)
# Sorry for the lousy formatting; I really wanted me (and you) to see all the crazy combinations
# of files in some of these samples. Maybe they're multiple malicious sites, or maybe it's just
# some crazy spray-and-pray happening!

sample_mhr = sample_combos[sample_combos['mhr'] == True]
print "Total Samples: %s" %len(sample_mhr.index)
print
print sample_mhr.combos_label.value_counts()
print
print (100. * sample_mhr.combos_data.value_counts() / len(sample_mhr.index)).head(10)

# Get the unique mime-types per connection uid as a Series of arrays
uid_groups = filesdf.groupby('conn_uids')
s = uid_groups['mime_type'].apply(lambda x: x.unique())

# Rebuild the series into a dataframe and then "collapse" the dataframe with a reset index
uid_combos = pd.DataFrame(s, columns=['mime_types'])
uid_combos['conn_uid'] = s.index
uid_combos['combos_data'] = s.map(interesting_combo_data)
uid_combos['combos_label'] = s.map(interesting_combo_label)

# Same trick, different day
uid_combos['sha1'] = uid_groups['sha1'].apply(lambda x: x.unique())
uid_combos['num_files'] = uid_groups['sha1'].apply(lambda x: len(x))
uid_combos = uid_combos.reset_index(drop=True)
uid_combos = uid_combos[uid_combos['conn_uid'] != '(empty)']
uid_combos['mhr'] = uid_combos['sha1'].map(tc_mhr_present)
uid_combos.head()
# Now we've got the same type of dataframe as above in sample_combos.

# Same deal as above, these 2 make the same tables as the images below
#uid_combos.describe()
#sample_combos.describe()

df_uid = pd.DataFrame()
df_uid['num_files'] = uid_combos['num_files']
df_uid['label'] = "session"
df_sample = pd.DataFrame()
df_sample['num_files'] = sample_combos['num_files']
df_sample['label'] = "sample"
df = pd.concat([df_sample, df_uid], ignore_index=True)

df.boxplot('num_files','label',vert=False)
plt.pyplot.xlabel('Number of Files, Session v. Sample')
plt.pyplot.ylabel('# of Files')
plt.pyplot.title('Comparison of # Files')
plt.pyplot.suptitle("")

print "Total connections: %s" %len(uid_combos.index)
#100. * uid_combos.combos.value_counts() / len(uid_combos.index)
uid_combos['count'] = 1
uid_combos[['combos_label','combos_data','count']].groupby(['combos_label','combos_data']).sum().sort('count', ascending=0).head(15)

uid_mhr = uid_combos[uid_combos['mhr'] == True]
print "Total connections: %s" %len(uid_mhr.index)
print
print uid_mhr.combos_label.value_counts()
print 100. * uid_mhr.combos_data.value_counts() / len(uid_mhr.index)
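# Aside: the "percent of total" computation keeps coming up in these cells. A tiny helper
# (my addition, not in the original) that captures it once; with the __future__ division
# import above, the division is true division:
def pct_of_total(series):
    # value_counts scaled to percentages of the total number of rows behind the series
    return 100. * series.value_counts() / len(series.index)

#print pct_of_total(uid_mhr.combos_data)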
uid_mhr = uid_combos[uid_combos['mhr'] != True]
print uid_mhr.combos_label.value_counts()
print uid_mhr[uid_mhr['combos_label'] == 'C-c-c-combo']['combos_data'].value_counts()

combos = uid_combos[uid_combos['combos_label'] == 'C-c-c-combo'].shape[0]
print "% of MHR hits in sessions with an \"interesting\" file combination"
print uid_combos[uid_combos['combos_label'] == 'C-c-c-combo']['mhr'].value_counts().apply(lambda x: x/combos)

print "\n% of MHR hits from samples (end systems) with an \"interesting\" file combination"
combos = sample_combos[sample_combos['combos_label'] == 'C-c-c-combo'].shape[0]
print sample_combos[sample_combos['combos_label'] == 'C-c-c-combo']['mhr'].value_counts().apply(lambda x: x/combos)
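# Aside: those last two cells ask the same conditional-rate question of two dataframes,
# P(MHR hit | interesting combo). A small wrapper (my naming, not the original's) that makes
# the shared structure explicit:
def mhr_rate_given_label(df, label):
    # True/False breakdown of MHR hits among rows carrying the given combos_label
    subset = df[df['combos_label'] == label]
    return subset['mhr'].value_counts().apply(lambda x: x / subset.shape[0])

#print mhr_rate_given_label(uid_combos, 'C-c-c-combo')
#print mhr_rate_given_label(sample_combos, 'C-c-c-combo')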