import zerorpc

# Connect to the workbench server listening on the local machine.
c = zerorpc.Client()
c.connect("tcp://127.0.0.1:4242")

# Store the pcap; store_sample returns the md5 that identifies the sample
# in every later request.
with open('evil.pcap', 'rb') as f:
    md5 = c.store_sample(f.read(), 'evil.pcap', 'pcap')

# Ask the 'pcap_meta' worker for its output on that sample.
print(c.work_request('pcap_meta', md5))
{'pcap_meta': {'encoding': 'binary', 'file_size': 54339570, 'file_type': 'tcpdump (little-endian) - version 2.4 (Ethernet, 65535)', 'filename': 'evil.pcap', 'import_time': '2014-02-08T22:15:50.282000Z', 'md5': 'bba97e16d7f92240196dc0caef9c457a', 'mime_type': 'application/vnd.tcpdump.pcap'}}
Run the workbench server (from somewhere, for the demo we're just going to start a local one)
$ workbench_server
# Let's start interacting with workbench. Please note there is NO specific client for workbench;
# just use the ZeroRPC Python, Node.js, or CLI interfaces.
import zerorpc
c = zerorpc.Client()
c.connect("tcp://127.0.0.1:4242")
[None]
# I forgot what stuff I can do with workbench
print c.help()
Welcome to Workbench: Here's a list of help commands: - Run c.help_basic() for beginner help - Run c.help_commands() for command help - Run c.help_workers() for a list of workers - Run c.help_advanced() for advanced help See https://github.com/SuperCowPowers/workbench for more information
print c.help_basic()
Workbench: Getting started... - 1) $ print c.help_commands() for a list of commands - 2) $ print c.help_command('store_sample') for into on a specific command - 3) $ print c.help_workers() for a list a workers - 4) $ print c.help_worker('meta') for info on a specific worker - 5) $ my_md5 = c.store_sample(...) - 6) $ output = c.work_request('meta', my_md5)
# STEP 1:
# Okay get the list of commands from workbench
print c.help_commands()
Workbench Commands: add_node(node_id, name, labels) add_rel(source_id, target_id, rel) clear_db() clear_graph_db() get_datastore_uri() get_sample(md5) get_sample_set(md5) get_sample_window(type_tag, size) has_node(node_id) have_sample(md5) help() help_advanced() help_basic() help_command(command) help_commands() help_worker(worker) help_workers() index_sample(md5, index_name) index_worker_output(worker_class, md5, index_name, subfield) search(index_name, query) store_sample(input_bytes, filename, type_tag) store_sample_set(md5_list) work_request(worker_class, md5, subkeys=None)
# STEP 2:
# Let's get the information on a specific command, 'store_sample'
print c.help_command('store_sample')
Command: store_sample(input_bytes, filename, type_tag) Store a sample into the DataStore. Args: filename: name of the file (used purely as meta data not for lookup) input_bytes: the actual bytes of the sample e.g. f.read() type_tag: ('exe','pcap','pdf','json','swf', or ...) Returns: the md5 of the sample
# STEP 3:
# Now let's get information about the dynamically loaded workers (your site may have many more!)
# Next to each worker name is the list of dependencies that worker has declared
print c.help_workers()
Workbench Workers: json_meta ['sample', 'meta'] log_meta ['sample', 'meta'] meta ['sample'] meta_deep ['sample', 'meta'] pcap_bro ['sample'] pcap_graph ['pcap_bro'] pcap_graph_0_1 ['pcap_bro'] pcap_http_graph ['pcap_bro'] pe_classifier ['pe_features', 'pe_indicators'] pe_deep_sim ['meta_deep'] pe_features ['sample'] pe_indicators ['sample'] pe_peid ['sample'] strings ['sample'] swf_meta ['sample', 'meta'] unzip ['sample'] url ['strings'] view ['meta'] view_customer ['meta'] view_log_meta ['log_meta'] view_meta ['meta'] view_pcap ['pcap_bro'] view_pcap_details ['view_pcap'] view_pdf ['meta', 'strings'] view_pe ['meta', 'strings', 'pe_peid', 'pe_indicators', 'pe_classifier', 'pe_disass'] view_zip ['meta', 'unzip'] vt_query ['meta'] yara_sigs ['sample']
# STEP 4:
# Let's get the information about the meta worker
print c.help_worker('meta')
Worker: meta ['sample'] This worker computes meta data for any file type.
# STEP 5:
# Okay when we load up a file, we get the md5 back
filename = '../data/pe/bad/0cb9aa6fb9c4aa3afad7a303e21ac0f3'
# Read the sample's bytes and store them; the returned md5 is the handle
# used for all later work requests on this sample.
with open(filename, 'rb') as f:
    my_md5 = c.store_sample(f.read(), filename, 'exe')
print(my_md5)
0cb9aa6fb9c4aa3afad7a303e21ac0f3
# STEP 6:
# Run a worker on my sample
output = c.work_request('meta', my_md5)
output
{'meta': {'customer': 'BearTron', 'encoding': 'binary', 'file_size': 20480, 'file_type': 'PE32 executable (GUI) Intel 80386, for MS Windows', 'filename': '../data/pe/bad/0cb9aa6fb9c4aa3afad7a303e21ac0f3', 'import_time': '2014-06-10T20:48:15.321000Z', 'length': 20480, 'md5': '0cb9aa6fb9c4aa3afad7a303e21ac0f3', 'mime_type': 'application/x-dosexec', 'type_tag': 'exe'}}
# Lets see what view_pe does
print c.help_worker('view_pe')
Worker: view_pe ['meta', 'strings', 'pe_peid', 'pe_indicators', 'pe_classifier', 'pe_disass'] Generates a high level summary view for PE files that incorporates a large set of workers
# Okay lets give it a try
c.work_request('view_pe', my_md5)
{'view_pe': {'classification': 'Evil!', 'customer': 'BearTron', 'disass': 'plugin_failed', 'encoding': 'binary', 'file_size': 20480, 'file_type': 'PE32 executable (GUI) Intel 80386, for MS Windows', 'filename': '../data/pe/bad/0cb9aa6fb9c4aa3afad7a303e21ac0f3', 'import_time': '2014-06-10T20:48:15.321000Z', 'indicators': [{'attributes': ['findwindowexa', 'findwindowa'], 'category': 'ANTI_DEBUG', 'description': 'Imported symbols related to anti-debugging', 'severity': 3}, {'category': 'MALFORMED', 'description': 'Checksum of Zero', 'severity': 1}, {'category': 'MALFORMED', 'description': 'Reported Checksum does not match actual checksum', 'severity': 2}, {'attributes': ['sendmessagea'], 'category': 'COMMUNICATION', 'description': 'Imported symbols related to network communication', 'severity': 1}, {'attributes': ['getmodulehandlea', 'getstartupinfoa'], 'category': 'PROCESS_MANIPULATION', 'description': 'Imported symbols related to process manipulation/injection', 'severity': 3}, {'attributes': ['getsystemmetrics'], 'category': 'PROCESS_SPAWN', 'description': 'Imported symbols related to spawning a new process', 'severity': 2}], 'length': 20480, 'md5': '0cb9aa6fb9c4aa3afad7a303e21ac0f3', 'mime_type': 'application/x-dosexec', 'peid_Matches': ['Microsoft Visual C++ v6.0'], 'type_tag': 'exe'}}
# Okay, that worker needed the output of pe_features and pe_indicators
# so what happened? The worker has a dependency list and workbench
# recursively satisfies that dependency list.. this is powerful because
# when we're interested in one particular analysis we just want to get
# the darn thing without having to worry about a bunch of details
# Well lets do this for a bunch of files!
import os
# Full path of every sample in the bad-PE directory.
pe_dir = '../data/pe/bad'
file_list = [os.path.join(pe_dir, child) for child in os.listdir(pe_dir)]

# md5 of every sample stored by this loop, in submission order.
working_set = []
for filename in file_list:
    with open(filename, 'rb') as f:
        md5 = c.store_sample(f.read(), filename, 'exe')
    # The file is already closed here; only the md5 is needed to request work.
    results = c.work_request('pe_classifier', md5)
    working_set.append(md5)
    print('Results: %s' % (results))
Results: {'pe_classifier': {'classification': 'Evil!', 'md5': '033d91aae8ad29ed9fbb858179271232'}} Results: {'pe_classifier': {'classification': 'Evil!', 'md5': '0cb9aa6fb9c4aa3afad7a303e21ac0f3'}} Results: {'pe_classifier': {'classification': 'Evil!', 'md5': '0e882ec9b485979ea84c7843d41ba36f'}} Results: {'pe_classifier': {'classification': 'Benign', 'md5': '0e8b030fb6ae48ffd29e520fc16b5641'}} Results: {'pe_classifier': {'classification': 'Benign', 'md5': '0eb9e990c521b30428a379700ec5ab3e'}} Results: {'pe_classifier': {'classification': 'Evil!', 'md5': '127f2bade752445b3dbf2cf2ea75c201'}} Results: {'pe_classifier': {'classification': 'Evil!', 'md5': '139385a91b9bca0833bdc1fa77e42b91'}} Results: {'pe_classifier': {'classification': 'Evil!', 'md5': '13dcc5b4570180118eb65529b77f6d89'}} Results: {'pe_classifier': {'classification': 'Evil!', 'md5': '1cac80a2147cd8f3860547e43edcaa00'}} Results: {'pe_classifier': {'classification': 'Evil!', 'md5': '1cea13cf888cd8ce4f869029f1dbb601'}} Results: {'pe_classifier': {'classification': 'Evil!', 'md5': '1d733a9e3e571ce5f5f633a0cfd3d5f0'}} Results: {'pe_classifier': {'classification': 'Evil!', 'md5': '2058c50de5976c67a09dfa5e0e1c7eb5'}} Results: {'pe_classifier': {'classification': 'Evil!', 'md5': '2d012cba541c22fb7250975d5ad0d065'}} Results: {'pe_classifier': {'classification': 'Evil!', 'md5': '2d015553c7388e4d78f05b24aba0819c'}} Results: {'pe_classifier': {'classification': 'Evil!', 'md5': '2d017ff228c39a0a727586b33f8168b0'}} Results: {'pe_classifier': {'classification': 'Evil!', 'md5': '2d09133abb48e1c7f3f5c8f8ced8fef4'}} Results: {'pe_classifier': {'classification': 'Evil!', 'md5': '2d094b6c69020091b68d1bcf5d11fa4b'}} Results: {'pe_classifier': {'classification': 'Evil!', 'md5': '2d095091983dd0bf6ab7c0bb6dd695f9'}} Results: {'pe_classifier': {'classification': 'Evil!', 'md5': '2d09546831b17d2cc0583362b6d312ae'}} Results: {'pe_classifier': {'classification': 'Evil!', 'md5': '2d099171876f7301c155bd775fff2b6a'}} Results: 
{'pe_classifier': {'classification': 'Benign', 'md5': '2d09a573b0e9d02a9fa47b16e9c01a48'}} Results: {'pe_classifier': {'classification': 'Evil!', 'md5': '2d09b5768e3617523d8afa110361919c'}} Results: {'pe_classifier': {'classification': 'Evil!', 'md5': '2d09b8d9852c3176259915e3509bcbd1'}} Results: {'pe_classifier': {'classification': 'Evil!', 'md5': '2d09ca902990545fec9ac190b0338b50'}} Results: {'pe_classifier': {'classification': 'Evil!', 'md5': '2d09cb38c268aa9297e5a7f27e677267'}} Results: {'pe_classifier': {'classification': 'Evil!', 'md5': '2d09cc92bbe29d96bb3a91b350d1725f'}} Results: {'pe_classifier': {'classification': 'Evil!', 'md5': '2d09e4aff42aebac87ae2fd737aba94f'}} Results: {'pe_classifier': {'classification': 'Evil!', 'md5': '32b24e73cfc3ac4c43f1926f8935e438'}} Results: {'pe_classifier': {'classification': 'Evil!', 'md5': '4ed28b6207560f127d267de639a4e1bf'}} Results: {'pe_classifier': {'classification': 'Evil!', 'md5': '505804ec7c7212a52ec85e075b91ed84'}} Results: {'pe_classifier': {'classification': 'Evil!', 'md5': '60a83c049e135cc199138c1f8861437c'}} Results: {'pe_classifier': {'classification': 'Evil!', 'md5': '69f9633fa6fd5dc1fd917cb435bba8ad'}} Results: {'pe_classifier': {'classification': 'Evil!', 'md5': '79f5e1af9fdb92476045989bda7515c7'}} Results: {'pe_classifier': {'classification': 'Evil!', 'md5': '7f313447b887b078215617fbed1a34a1'}} Results: {'pe_classifier': {'classification': 'Evil!', 'md5': '8006782bdf703e2f3fdf1d1650f45ffd'}} Results: {'pe_classifier': {'classification': 'Evil!', 'md5': '86714940f491bc38c2e842e80c7f778e'}} Results: {'pe_classifier': {'classification': 'Evil!', 'md5': '987bd46899b2a9493e6dec051edcb66c'}} Results: {'pe_classifier': {'classification': 'Evil!', 'md5': '9cd3d7b1b0f2aea5950cbf7d97776f2f'}} Results: {'pe_classifier': {'classification': 'Evil!', 'md5': '9ceccd9f32cb2ad0b140b6d15d8993b6'}} Results: {'pe_classifier': {'classification': 'Evil!', 'md5': '9e42ff1e6f75ae3e60b24e48367c8f26'}} Results: {'pe_classifier': 
{'classification': 'Evil!', 'md5': 'a7b0a9067d8292b252d741e6fae17cd9'}} Results: {'pe_classifier': {'classification': 'Benign', 'md5': 'afddc552b31a8f2438768c73674bf29e'}} Results: {'pe_classifier': {'classification': 'Evil!', 'md5': 'b681485cb9e0cad73ee85b9274c0d3c2'}} Results: {'pe_classifier': {'classification': 'Evil!', 'md5': 'bf1249a258cbcccec0f1b4ea1e9451a1'}} Results: {'pe_classifier': {'classification': 'Evil!', 'md5': 'c8c54ac7e827056174762c68db84534f'}} Results: {'pe_classifier': {'classification': 'Evil!', 'md5': 'cc113aa59c04b17e7cb832fc417f104d'}} Results: {'pe_classifier': {'classification': 'Benign', 'md5': 'd94da41e7e809f7366971b3b50f8ef68'}} Results: {'pe_classifier': {'classification': 'Evil!', 'md5': 'e9a6c83826deacfbc2281b6c7e401694'}} Results: {'pe_classifier': {'classification': 'Evil!', 'md5': 'ea5d95c96a23b21b9038f03b91955c18'}} Results: {'pe_classifier': {'classification': 'Evil!', 'md5': 'f6190648c2efb764ae1d73b0e9a4fd13'}}
# We just ran the classifier on 50 files and you'll note that we ONLY got back the
# information we asked for. On a large number of files (100k or greater), if you don't
# have a granular system, something this easy WILL NOT BE POSSIBLE! (dramatic enough?)
# So let's look at the features going into the classifier (btw the classifier is currently a TOY EXAMPLE)
c.work_request('pe_features', md5)
{'pe_features': {'dense_features': {'check_sum': 0, 'compile_date': 1074901182, 'datadir_IMAGE_DIRECTORY_ENTRY_BASERELOC_size': 2348796035, 'datadir_IMAGE_DIRECTORY_ENTRY_EXPORT_size': 0, 'datadir_IMAGE_DIRECTORY_ENTRY_IAT_size': 0, 'datadir_IMAGE_DIRECTORY_ENTRY_IMPORT_size': 20, 'datadir_IMAGE_DIRECTORY_ENTRY_RESOURCE_size': 262, 'debug_size': 0, 'export_size': 0, 'generated_check_sum': 62500, 'iat_rva': 86510, 'major_version': 0, 'minor_version': 57, 'number_of_bound_import_symbols': 0, 'number_of_bound_imports': 0, 'number_of_export_symbols': 0, 'number_of_import_symbols': 0, 'number_of_imports': 0, 'number_of_rva_and_sizes': 10, 'number_of_sections': 3, 'pe_char': 33167, 'pe_dll': 0, 'pe_driver': 0, 'pe_exe': 1, 'pe_i386': 1, 'pe_majorlink': 76, 'pe_minorlink': 111, 'pe_warnings': 1, 'sec_entropy_': 7.876937179741062, 'sec_entropy_@': 5.388003199139927, 'sec_entropy_data': 0, 'sec_entropy_ps': 5.388003199139927, 'sec_entropy_rdata': 0, 'sec_entropy_reloc': 0, 'sec_entropy_rsrc': 0, 'sec_entropy_text': 0, 'sec_raw_execsize': 6984, 'sec_rawptr_': 512, 'sec_rawptr_@': 16, 'sec_rawptr_data': 0, 'sec_rawptr_ps': 16, 'sec_rawptr_rsrc': 0, 'sec_rawptr_text': 0, 'sec_rawsize_': 5992, 'sec_rawsize_@': 496, 'sec_rawsize_data': 0, 'sec_rawsize_ps': 496, 'sec_rawsize_rsrc': 0, 'sec_rawsize_text': 0, 'sec_va_execsize': 86016, 'sec_vasize_': 36864, 'sec_vasize_@': 4096, 'sec_vasize_data': 0, 'sec_vasize_ps': 45056, 'sec_vasize_rsrc': 0, 'sec_vasize_text': 0, 'size_code': 1766614113, 'size_image': 90112, 'size_initdata': 1918988898, 'size_uninit': 16761, 'std_section_names': 0, 'total_size_pe': 6504, 'virtual_address': 4096, 'virtual_size': 45056, 'virtual_size_2': 36864}, 'md5': 'f6190648c2efb764ae1d73b0e9a4fd13', 'sparse_features': {'imp_hash': 'Not found: Install pefile 1.2.10-139 or later', 'imported_symbols': [], 'pe_warning_strings': ['Error parsing section 0. 
PointerToRawData should normally be a multiple of FileAlignment, this might imply the file is trying to confuse tools which parse this incorrectly', 'Suspicious flags set for section 0. Both IMAGE_SCN_MEM_WRITE and IMAGE_SCN_MEM_EXECUTE are set. This might indicate a packed executable.', 'Suspicious flags set for section 1. Both IMAGE_SCN_MEM_WRITE and IMAGE_SCN_MEM_EXECUTE are set. This might indicate a packed executable.', 'Error parsing section 2. PointerToRawData should normally be a multiple of FileAlignment, this might imply the file is trying to confuse tools which parse this incorrectly', 'Suspicious flags set for section 2. Both IMAGE_SCN_MEM_WRITE and IMAGE_SCN_MEM_EXECUTE are set. This might indicate a packed executable.', 'Corrupt header "IMAGE_IMPORT_DESCRIPTOR" at file offset 494. Exception: \'Data length less than expected header length.\'', "Invalid relocation information. Can't read data at RVA: 0x476ffa5"], 'section_names': ['ps', '', '@']}}}
c.work_request('pe_indicators', md5)
{'pe_indicators': {'indicator_list': [{'category': 'PE_WARN', 'description': 'Error parsing section 0. PointerToRawData should normally be a multiple of FileAlignment, this might imply the file is trying to confuse tools which parse this incorrectly', 'severity': 2}, {'category': 'PE_WARN', 'description': 'Suspicious flags set for section 0. Both IMAGE_SCN_MEM_WRITE and IMAGE_SCN_MEM_EXECUTE are set. This might indicate a packed executable.', 'severity': 2}, {'category': 'PE_WARN', 'description': 'Suspicious flags set for section 1. Both IMAGE_SCN_MEM_WRITE and IMAGE_SCN_MEM_EXECUTE are set. This might indicate a packed executable.', 'severity': 2}, {'category': 'PE_WARN', 'description': 'Error parsing section 2. PointerToRawData should normally be a multiple of FileAlignment, this might imply the file is trying to confuse tools which parse this incorrectly', 'severity': 2}, {'category': 'PE_WARN', 'description': 'Suspicious flags set for section 2. Both IMAGE_SCN_MEM_WRITE and IMAGE_SCN_MEM_EXECUTE are set. This might indicate a packed executable.', 'severity': 2}, {'category': 'PE_WARN', 'description': 'Corrupt header "IMAGE_IMPORT_DESCRIPTOR" at file offset 494. Exception: \'Data length less than expected header length.\'', 'severity': 2}, {'category': 'PE_WARN', 'description': "Invalid relocation information. Can't read data at RVA: 0x476ffa5", 'severity': 2}, {'category': 'MALFORMED', 'description': 'Checksum of Zero', 'severity': 1}, {'category': 'MALFORMED', 'description': 'Reported Checksum does not match actual checksum', 'severity': 2}, {'category': 'MALFORMED', 'description': 'Image size does not match reported size', 'severity': 3}, {'attributes': ['ps', '', '@'], 'category': 'MALFORMED', 'description': 'Section(s) with a non-standard name, tamper indication', 'severity': 3}, {'attributes': ['PS', '@\x00\x0f@\x00'], 'category': 'MALFORMED', 'description': 'Unaligned section, tamper indication', 'severity': 3}], 'md5': 'f6190648c2efb764ae1d73b0e9a4fd13'}}
On another note, did we just waste some time there? Did workbench have to recompute the features? No: everything done by workbench is pushed into the MongoDB backend, and if the work results for that md5 are already in the datastore then a very lightweight call is made to get the results. In fact results are never directly returned; the worker pushes into Mongo and then we pull them out and hand them to the client. That way we ^ensure^ that the bits in the datastore and the bits that you get are the exact same 'gold bits' (seems like overkill but it's important).
# Another example.. I want to look at strings for different types of files (not just pe_files)
# So we can load up a few pdfs (the pe's are already in the datastore)
# Full path of every sample in the bad-PDF directory.
pdf_dir = '../data/pdf/bad'
file_list = [os.path.join(pdf_dir, child) for child in os.listdir(pdf_dir)]
for filename in file_list:
    with open(filename, 'rb') as f:
        md5 = c.store_sample(f.read(), filename, 'pdf')
    # Add the PDF's md5 to the md5s already collected in working_set.
    working_set.append(md5)
# Now we rip the strings worker on them all
for md5 in working_set:
    result = c.work_request('strings', md5)
    # strings output is large so just showing the first 5
    print('results: %s' % (result['strings']['string_list'][:5]))
results: [' ', '!This program cannot be run in DOS mode.', 'Rich', '.text', '.rdata'] results: ['!This program cannot be run in DOS mode.', 'Rich3', '.text', '`.rdata', '@.data'] results: ['!This program cannot be run in DOS mode.', 'Rich', '.text', '`.data', '.rsrc'] results: ['This program must be run under Win32', 'CODE', '`DATA', '.idata', '.tls'] results: ['!This program cannot be run in DOS mode.', 'Rich', '.text', '`.rdata', '@DATA'] results: ['!This program cannot be run in DOS mode.', 'kRich', '.text', '`.rdata', '@.data'] results: ['!This program cannot be run in DOS mode.', '.text', '.code', '`.data', '.data3'] results: ['!This program cannot be run in DOS mode.', 'YRich', '.text', '`.rdata', '@.data'] results: ['!This program cannot be run in DOS mode.', '.text', '`.data', '.rsrc', 'blenkxr'] results: ['!This program cannot be run in DOS mode.', 'Rich', '.text', '`.rdata', '@.data'] results: ['!This program cannot be run in DOS mode.', '.text', '.data', '.rsrc', 'aitrfvl'] results: ['This program must be run under Win32', 'ATSEC0', '`ATSEC1', '@ATSEC2', '@idata'] results: ['This program must be run under Win32', 'CODE', '`DATA', '.idata', '.reloc'] results: ['yrf<[LordPE]', '.text', '.text', 'ExitProcess', 'KERNEL32.dll'] results: ['!This program cannot be run in DOS mode.', 'Rich', '.text', '`rdata', '.data'] results: ['!This program cannot be run in DOS mode.', '6Rich#', '^"+M', 'UPX0', 'UPX1'] results: ['!This program cannot be run in DOS mode.', 'Rich"', '.text', '@.code', '`.data'] results: ['yrf<[LordPE]', '.text', '.text', 'ExitProcess', 'KERNEL32.dll'] results: ['!This program cannot be run in DOS mode.', 'Riche', '.text', '.data', '.rsrc'] results: ['!This program cannot be run in DOS mode.', 'Rich', '.text', '`rdata', '.data'] results: ['!This program cannot be run in DOS mode.', 'Rich', '.data', '`.data', '`.data'] results: ['!This program cannot be run in DOS mode.', '.text', '`.data', '.idata', '@.rsrc'] results: ['!This program cannot be 
run in DOS mode.', ">O'X_!tX_!tX_!t", '@+tN_!t', 'C/tR_!t:@2tS_!tX_ t', '@*t[_!t'] results: ['!This program cannot be run in DOS mode.', 'RichX', 'E0TK', '.text', '`.rdata'] results: ['!This program cannot be run in DOS mode.', 'Rich', '.text', '`rdata', '.data'] results: ['!This program cannot be run in DOS mode.', 'Rich', '.text', '`.rdata', '@.data'] results: ['!This program cannot be run in DOS mode.', 'UPX0', 'UPX1', '.rsrc', '3.03'] results: [' ', '!This program cannot be run in DOS mode.', 'Rich', '.text', '.rdata'] results: ['!This program cannot be run in DOS mode.', 'Rich', '.data', '.pdata', '.ex_cod'] results: ['This program must be run under Win32', 'CODE', '`DATA', '.idata', '.tls'] results: ['!This program cannot be run in DOS mode.', '.text', '`.rdata', '@.data', '_TEXT2'] results: ['!This program cannot be run in DOS mode.', 'ERich', 'UPX0', 'UPX1', 'UPX2'] results: ['This program must be run under Win32', 'mnYD', 'CODE', '`DATA', '.idata'] results: ['!This program cannot be run in DOS mode.', '.text', '`.rdata', '@.data', '_TEXT2'] results: ['!This program cannot be run in DOS mode.', 'Rich', '.PEX', '`.PEX', 'Bome'] results: ['!This program cannot be run in DOS mode.', '|Richv', 'UPX0', 'UPX1', '.rsrc'] results: ['!This program cannot be run in DOS mode.', 'Rich', '.text', '`.rdata', '@.data'] results: ['!This program cannot be run in DOS mode.', 'sIPE', '.text', '.data', '.rsrc'] results: ['!This program cannot be run in DOS mode.', '.text', '.data', '.rsrc', 'mzphdwa'] results: ['!This program cannot be run in DOS mode.', '.text', '.data', '.rsrc', 'lsicbkg'] results: ['!This program cannot be run in DOS mode.', 'Rich', '.packed', '`.RLPack', 'a?/u'] results: ['!This program cannot be run in DOS mode.', 'Rich', '.text', 'h.rdata', 'H.reloc'] results: ['This program must be run under Win32', 'CODE', '`DATA', '.idata', '.tls'] results: [' ', '!This program cannot be run in DOS mode.', 'Rich', '.text', '.rdata'] results: ['!This program cannot be 
run in DOS mode.', '.text', '.data', '.rdata', '@.bss'] results: ['!This program cannot be run in DOS mode.', 'ssaR', "'#K'", '.data', '.text'] results: ['!This program cannot be run in DOS mode.', 'BZ.`', 'Richy', 'J!NH', '.text'] results: ['!This program cannot be run in DOS mode.', 'RichW', '.text', '.data', '.rsrc'] results: ['.text', 'KERNEL32.DLL', 'MSVCRT.DLL', 'USER32.DLL', 'ADVAPI32.DLL'] results: ['MZKERNEL32.DLL', 'LoadLibraryA', 'GetProcAddress', '^]YF', 'W8mu'] results: ['%PDF-1.6', '52 0 obj<</Length 51252212/Root 1 0 R/Info 3 0 R%/F/W[1 2 1]/Index[5 1 7 1 9 4 23 4 50 3]>>stream', '/Filter/FlateDecode/W[1 2 1]/Index[5 1 7 1 9 4 23 4 50 3]>>stream', 'bbb0b`b```', '310Z'] results: ['%PDF-1.1', '1 0 obj', ' /Type /Catalog', ' /Outlines 2 0 R', ' /Pages 3 0 R'] results: ['%PDF-1.3', '2 0 obj', '/OpenAction << /JS 9 0 R /S /JavaScript >>', '/Type /Catalog', '/Pages 3 0 R'] results: ['%PDF-1.4', '1 0 obj', '/Type /Catalog', '/Outlines 3 0 R', '/Pages 4 0 R'] results: ['%PDF-1.6', '11 0 obj', '<</Filter/FlateDecode /Length 2523>>', 'stream', 's--e'] results: ['%PDF-1.4', '1 0 obj', '<</Pages 2 0 R /OpenAction <<', '/JS 4 0 R /S /JavaScript /Type /Catalog>>>>', 'endobj'] results: ['%PDF-1.0', '1 0 obj<</Type/Catalog/Pages 2 0 R /Names 3 0 R >>endobj', '2 0 obj<</Type/Pages/Count 1/Kids[ 4 0 R ]>>endobj', '3 0 obj<</JavaScript 5 0 R >>endobj', '4 0 obj<</Type/Page/Parent 2 0 R /Contents 12 0 R>>endobj'] results: ['%PDF-1.6', '7 0 obj', '<</Count 1/Type/Pages/Kids[28 0 R]>>', 'endobj', '21 0 obj'] results: ['%PDF-1.3', '4 0 obj', 'endobj', '5 0 obj', '/Producer (substr)'] results: ['%PDF-1.5', '1 0 obj<</#54ype/#43atal#6fg/Outlin#65#73 2 0 R/#50ages 3 0 R/Ope#6e#41ctio#6e 5 0 R>>endobj', '2 0 obj<</T#79#70#65/#4fu#74li#6e#65s/C#6funt 0>>endobj', '3 0 obj<</Ty#70#65/Pa#67#65s/K#69ds[4 0 R]/#43o#75nt 1>>endobj', '4 0 obj<</#54#79pe/P#61g#65/#50a#72#65#6et 3 0 R/#4d#65#64i#61B#6f#78[0 0 612 792]>>endobj'] results: ['%PDF-1.3', '4 0 obj', '<< /Length 5 0 R /Filter 
/FlateDecode >>', 'stream', '}b%~T\\'] results: ['%PDF-1.7', '3 0 obj', '<</Type /Page', '/Parent 1 0 R', '/MediaBox [0 0 595.28 841.89]'] results: ['%PDF-1.3', '%вгПУ', '1 0 obj', '/Outlines 2 0 R', '/OpenAction 3 0 R'] results: ['%PDF-1.6', '10 0 obj', '<</Filter/FlateDecode /Length 1563>>', 'stream', '}V]o'] results: ['%PDF-1.3', '3 0 obj', '<</Type /Page', '/Parent 1 0 R', '/Resources 2 0 R'] results: ['%PDF-1.6', '12 0 obj', '<</Filter/FlateDecode /Length 2063>>', 'stream', 'W1OOO'] results: ['%PDF-1.4', '1 0 obj', '/Type /Catalog', '/Outlines 3 0 R', '/Pages 4 0 R'] results: ['%PDF-1.0', '1 0 obj<</Type/Catalog/Pages 2 0 R /Names 3 0 R >>endobj', '2 0 obj<</Type/Pages/Count 1/Kids[ 4 0 R ]>>endobj', '3 0 obj<</JavaScript 5 0 R >>endobj', '4 0 obj<</Type/Page/Parent 2 0 R /Contents 12 0 R>>endobj'] results: ['%PDF-1.6', '7 0 obj', '<</Length 2307 /Filter/FlateDecode>>', 'stream', '&^:_'] results: ['%PDF-1.0', '1 0 obj<</Type/Catalog/Pages 2 0 R /Names 3 0 R >>endobj', '2 0 obj<</Type/Pages/Count 1/Kids[ 4 0 R ]>>endobj', '3 0 obj<</JavaScript 5 0 R >>endobj', '4 0 obj<</Type/Page/Parent 2 0 R /Contents 12 0 R>>endobj'] results: ['%PDF-1.6', '9 0 obj', '<</Filter/FlateDecode /Length 2278>>', 'stream', '$z(R'] results: ['%PDF-1.6', '1 0 obj', '<</MediaBox [0 0 1 1] /Type/Page /Contents 3 0 R /Parent 5 0 R>>', 'endobj', '5 0 obj'] results: ['%PDF-1.3', '1 0 obj', '/Kids [ 4 0 R ]', '/Type /Pages', '/Count 1'] results: ['%PDF-1.6', '7 0 obj', '<</Count 1/Type/Pages/Kids[28 0 R]>>', 'endobj', '21 0 obj'] results: ['%PDF-1.6', '3 0 obj', '<</Filter/FlateDecode /Length 1905>>', 'stream', "z]%'"]
Views can also be precise or general (example shows the latter):
- Customer billing View
- Sample volume over time View
- All samples that use communications calls View
- DO_EVERYTHING_BECAUSE_I_WANT_TO_PUNCH_GRANULARITY_IN_THE_NUTS! View
So let's look at the last kind... it's called 'view' and, like many of the other workers, it's 20 lines of code.
But it's deceptively simple, if you think about what must be happening below... over a dozen workers are getting orchestrated and run only when it makes sense for that MIME type. So with a few 'pull' calls the recursive dependency chains are invoked; work is done if/when it's needed and the whole thing is fantastically elegant and efficient. If your mind isn't a little bit blown by what happens below then you might not be paying attention.
# This just grabs all the file_paths recursively
def tag_type(path):
    """Guess a type tag from the directory part of ``path``.

    Args:
        path: file path; only its directory portion (``os.path.dirname``)
            is inspected, never the file name itself.

    Returns:
        The first entry of the known type list that occurs as a substring
        of the directory name, or None when none of them occurs.
    """
    types = ('bro', 'json', 'log', 'pcap', 'pdf', 'exe', 'swf', 'zip')
    # The directory name does not change per candidate, so compute it once.
    directory = os.path.dirname(path)
    for try_type in types:
        # Substring test: a tag matches anywhere in the directory string,
        # and earlier entries in ``types`` win when several match.
        if try_type in directory:
            return try_type
    return None
# Collect the path of every file found anywhere under ../data.
file_list = []
for dirpath, _dirnames, filenames in os.walk('../data'):
    file_list.extend(os.path.join(dirpath, name) for name in filenames)
# We're going to load in all the files which include PE files, PCAPS, PDFs, and ZIPs and run 'view' on them.
# Note: This takes a while :)
import pprint
results = []
for filename in file_list:
    with open(filename, 'rb') as f:
        # Store under the bare file name, tagged by the directory it came from.
        md5 = c.store_sample(f.read(), os.path.basename(filename), tag_type(filename))
    results.append(c.work_request('view', md5))

# Only show the first five views; the full list is long.
pprint.pprint(results[:5])
[{'view': {'md5': '142372845adfdb668ba5bca0e81e6c19', 'meta': {'customer': 'Mega Corp', 'encoding': 'binary', 'file_size': 12292, 'file_type': 'Apple Desktop Services Store', 'filename': '.DS_Store', 'import_time': '2014-06-10T20:48:33.501000Z', 'length': 12292, 'md5': '142372845adfdb668ba5bca0e81e6c19', 'mime_type': 'binary', 'type_tag': None}}}, {'view': {'md5': 'f12f0237be84a8e353477e55ec43589b', 'meta': {'customer': 'Dorseys Mom', 'encoding': 'us-ascii', 'file_size': 25218, 'file_type': 'ASCII text', 'filename': 'conn.log', 'import_time': '2014-06-10T20:48:33.514000Z', 'length': 25218, 'md5': 'f12f0237be84a8e353477e55ec43589b', 'mime_type': 'text/plain', 'type_tag': 'bro'}}}, {'view': {'md5': 'a62fd3c72c1d688ff8041e0be87d07aa', 'meta': {'customer': 'Dorseys Mom', 'encoding': 'us-ascii', 'file_size': 541, 'file_type': 'ASCII text', 'filename': 'dhcp.log', 'import_time': '2014-06-10T20:48:33.532000Z', 'length': 541, 'md5': 'a62fd3c72c1d688ff8041e0be87d07aa', 'mime_type': 'text/plain', 'type_tag': 'bro'}}}, {'view': {'md5': '438022b94b10cada18414314b0c8584b', 'meta': {'customer': 'Huge Inc', 'encoding': 'us-ascii', 'file_size': 23896, 'file_type': 'ASCII text', 'filename': 'dns.log', 'import_time': '2014-06-10T20:48:33.551000Z', 'length': 23896, 'md5': '438022b94b10cada18414314b0c8584b', 'mime_type': 'text/plain', 'type_tag': 'bro'}}}, {'view': {'md5': 'f57da114f0f1e07b93f374d68c65c583', 'meta': {'customer': 'Dorseys Mom', 'encoding': 'us-ascii', 'file_size': 40283, 'file_type': 'ASCII text', 'filename': 'files.log', 'import_time': '2014-06-10T20:48:33.573000Z', 'length': 40283, 'md5': 'f57da114f0f1e07b93f374d68c65c583', 'mime_type': 'text/plain', 'type_tag': 'bro'}}}]
# Okay so views can either aggregate results from multiple workers or they
# can subset to just what you want (webpage presentation for instance)
results = c.batch_work_request('view_customer')
print results
<generator object iterator at 0x10e3375a0>
# At this granularity it opens up a new world
import pandas as pd
df = pd.DataFrame(results)
df.head(10)
customer | filename | import_time | length | md5 | type_tag | |
---|---|---|---|---|---|---|
0 | BearTron | ../data/pe/bad/0cb9aa6fb9c4aa3afad7a303e21ac0f3 | 2014-06-10T20:48:15.321000Z | 20480 | 0cb9aa6fb9c4aa3afad7a303e21ac0f3 | pe |
1 | Mega Corp | ../data/pe/bad/033d91aae8ad29ed9fbb858179271232 | 2014-06-10T20:48:19.036000Z | 85504 | 033d91aae8ad29ed9fbb858179271232 | pe |
2 | Dorseys Mom | ../data/pe/bad/0e882ec9b485979ea84c7843d41ba36f | 2014-06-10T20:48:19.146000Z | 64512 | 0e882ec9b485979ea84c7843d41ba36f | pe |
3 | Mega Corp | ../data/pe/bad/0e8b030fb6ae48ffd29e520fc16b5641 | 2014-06-10T20:48:19.258000Z | 81920 | 0e8b030fb6ae48ffd29e520fc16b5641 | pe |
4 | Dorseys Mom | ../data/pe/bad/0eb9e990c521b30428a379700ec5ab3e | 2014-06-10T20:48:19.519000Z | 97280 | 0eb9e990c521b30428a379700ec5ab3e | pe |
5 | Dorseys Mom | ../data/pe/bad/127f2bade752445b3dbf2cf2ea75c201 | 2014-06-10T20:48:19.697000Z | 66560 | 127f2bade752445b3dbf2cf2ea75c201 | pe |
6 | Huge Inc | ../data/pe/bad/139385a91b9bca0833bdc1fa77e42b91 | 2014-06-10T20:48:19.817000Z | 22510 | 139385a91b9bca0833bdc1fa77e42b91 | pe |
7 | Mega Corp | ../data/pe/bad/13dcc5b4570180118eb65529b77f6d89 | 2014-06-10T20:48:19.897000Z | 29184 | 13dcc5b4570180118eb65529b77f6d89 | pe |
8 | BearTron | ../data/pe/bad/1cac80a2147cd8f3860547e43edcaa00 | 2014-06-10T20:48:20.009000Z | 72704 | 1cac80a2147cd8f3860547e43edcaa00 | pe |
9 | Mega Corp | ../data/pe/bad/1cea13cf888cd8ce4f869029f1dbb601 | 2014-06-10T20:48:20.160000Z | 53248 | 1cea13cf888cd8ce4f869029f1dbb601 | pe |
10 rows × 6 columns
# Lets look at the file submission types broken down by customer
df['count'] = 1
df.groupby(['customer','type_tag']).sum()
length | count | ||
---|---|---|---|
customer | type_tag | ||
BearTron | bro | 52422 | 12 |
cab | 54007 | 1 | |
json | 288312 | 1 | |
own | 554 | 1 | |
pcap | 35057 | 3 | |
374438 | 13 | ||
pe | 1588841 | 28 | |
Dorseys Mom | bro | 146979 | 20 |
jar | 10629 | 1 | |
pcap | 1799435 | 1 | |
251385 | 9 | ||
pe | 1673734 | 27 | |
zip | 151268 | 2 | |
Huge Inc | bro | 197288 | 8 |
own | 52961 | 3 | |
pcap | 3461100 | 3 | |
512620 | 18 | ||
pe | 1120037 | 23 | |
swf | 7724 | 1 | |
Mega Corp | bro | 59415 | 12 |
jar | 18643 | 1 | |
own | 10252 | 4 | |
pcap | 693558 | 1 | |
390356 | 12 | ||
pe | 1507236 | 27 |
25 rows × 2 columns
# Plotting defaults
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['font.size'] = 12.0
plt.rcParams['figure.figsize'] = 18.0, 8.0
# Plot box plots based on customer (PDFs)
df[df['type_tag']=='pdf'].boxplot('length','customer')
plt.xlabel('Customer')
plt.ylabel('File Size')
plt.title('File Length (PDF) by Customer')
plt.suptitle('')
<matplotlib.text.Text at 0x10e5addd0>
# Plot box plots based on customer (PEs)
df[df['type_tag']=='exe'].boxplot('length','customer')
plt.xlabel('Customer')
plt.ylabel('File Size')
plt.title('File Length (PE) by Customer')
plt.suptitle('')
<matplotlib.text.Text at 0x10e661cd0>
# Okay now lets do some plots on the file meta-data
results = c.batch_work_request('meta_deep')
df_meta = pd.DataFrame(results)
df_meta.head()
customer | encoding | entropy | file_size | file_type | filename | import_time | length | md5 | mime_type | sha1 | sha256 | ssdeep | type_tag | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | BearTron | binary | 2.440069 | 20480 | PE32 executable (GUI) Intel 80386, for MS Windows | ../data/pe/bad/0cb9aa6fb9c4aa3afad7a303e21ac0f3 | 2014-06-10T20:48:15.321000Z | 20480 | 0cb9aa6fb9c4aa3afad7a303e21ac0f3 | application/x-dosexec | 96e85768a12b2f319f2a4f0c048460e1b73aa573 | 4ecf79302ba0439f62e15d0526a297975e6bb32ea25c8c... | 192:a8jJIFYrq9ATskBTp2jLDL3P1oynldvSo71nF:oFpN... | pe |
1 | Mega Corp | binary | 7.894680 | 85504 | PE32 executable (GUI) Intel 80386, for MS Windows | ../data/pe/bad/033d91aae8ad29ed9fbb858179271232 | 2014-06-10T20:48:19.036000Z | 85504 | 033d91aae8ad29ed9fbb858179271232 | application/x-dosexec | 83ab10907b254752f312c89125957f10d35cb9d4 | eb107c004e6e1bbd3b32ad7961661bbe28a577b0cb5dac... | 1536:h6+LbfPbI5dzmJu9Tgj5aOItvEqRCHW9pjVrs2ryr... | pe |
2 | Dorseys Mom | binary | 5.125292 | 64512 | PE32 executable (GUI) Intel 80386, for MS Windows | ../data/pe/bad/0e882ec9b485979ea84c7843d41ba36f | 2014-06-10T20:48:19.146000Z | 64512 | 0e882ec9b485979ea84c7843d41ba36f | application/x-dosexec | 12fb0a1b7d9c2b2a41f4da9ce5bbfb140fb16939 | 616cf9e729c883d979212eb55178b7aac80dd9f58cb449... | 768:5HyLMqtEM1Htz8kDmP9l+nZZYp41oj7EZmJxl/N9j6... | pe |
3 | Mega Corp | binary | 6.303055 | 81920 | PE32 executable (GUI) Intel 80386, for MS Windows | ../data/pe/bad/0e8b030fb6ae48ffd29e520fc16b5641 | 2014-06-10T20:48:19.258000Z | 81920 | 0e8b030fb6ae48ffd29e520fc16b5641 | application/x-dosexec | 82d57b8302b7497b2f6943f18e2d2687b9b0f5eb | feaf72bdad035e198d297bfb0b8d891645f1dacd78f0db... | 1536:1uNqjqzs1hQHhInEeJMzcmGqyF7Jwe9pvUo+5TDU4... | pe |
4 | Dorseys Mom | binary | 7.593283 | 97280 | PE32 executable (GUI) Intel 80386, for MS Windows | ../data/pe/bad/0eb9e990c521b30428a379700ec5ab3e | 2014-06-10T20:48:19.519000Z | 97280 | 0eb9e990c521b30428a379700ec5ab3e | application/x-dosexec | b778fc55f0538de865d4853099a3faa0b29f311d | dc5e8176a5f012ebdb4835f9b570a12c045d059f6f5bdc... | 1536:KcE4iMgXjTJpdGaaJG6Mhawv7r9ZaobsLBq+h5ttB... | pe |
5 rows × 14 columns
# Plot entropy box plots based on file type
df_meta.boxplot('entropy','type_tag')
plt.xlabel('Mime Type')
plt.ylabel('Entropy')
<matplotlib.text.Text at 0x10f436410>
# Plot customer submissions based on file type
# Take an explicit copy: adding a column to a slice of df is chained
# assignment and triggers pandas' SettingWithCopyWarning.
group_df = df[['customer', 'type_tag']].copy()
group_df['submissions'] = 1
# Sum the per-row 1s per (customer, type_tag), then pivot type_tag into columns.
group_df = group_df.groupby(['customer', 'type_tag']).sum().unstack()
group_df.head()
submissions | ||||||||||
---|---|---|---|---|---|---|---|---|---|---|
type_tag | bro | cab | jar | json | own | pcap | pe | swf | zip | |
customer | ||||||||||
BearTron | 12 | 1 | NaN | 1 | 1 | 3 | 13 | 28 | NaN | NaN |
Dorseys Mom | 20 | NaN | 1 | NaN | NaN | 1 | 9 | 27 | NaN | 2 |
Huge Inc | 8 | NaN | NaN | NaN | 3 | 3 | 18 | 23 | 1 | NaN |
Mega Corp | 12 | NaN | 1 | NaN | 4 | 1 | 12 | 27 | NaN | NaN |
4 rows × 10 columns
# Plot stacked submission counts per customer, one color per type_tag
my_colors = [(x/9.0, .8, 1.0-x/9.0) for x in range(10)] # Why the heck dosen't matplotlib have better categorical cmaps?
group_df['submissions'].plot(kind='bar', stacked=True, color=my_colors)
plt.xlabel('Customer')
plt.ylabel('Submissions')
<matplotlib.text.Text at 0x10f9f6e90>