#!/usr/bin/env python
# coding: utf-8

# ## General info
# Dates of interest
#
# * 01/06/2015
# * 01/27/2015
# * 12/11/2014
#
# Timeframes of interest: Between 4 and 6 PM (but should still consider the entire day)
#
# Keywords: JPMorgan, wire, transfer

# ## Mounting the image
# Mounting the image read-only with NTFS-specific parameters.
# NOTE: this is a shell command, not Python — run it outside the notebook.
# It is commented out so the exported script remains parseable.
#
#   mount -o ro,loop,show_sys_files,streams_interface=windows /mnt/hgfs/ssd/039533.001 /mnt/usb/

# ## Creating timeline
# Used 'log2timeline' from SANS SIFT VM
# Image local time is set to CET (Paris time)
# Output format set to CSV
# NOTE: shell command, commented out for the same reason as above.
#
#   log2timeline -z CET -r -p -f win7 -o csv -w /cases/bodyfile /mnt/usb

# ## Analyzing the timeline using Apache Spark
# Timeline analysis can be difficult and very time-consuming if the CSV files
# are too large, or if we have multiple images to go through.
# Spark can make that job much easier and more efficient.

# In[ ]:

# Create a Spark SQL context.
# `sc` is the SparkContext injected by the pyspark shell / notebook kernel.
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

# Load the CSV files into a Spark DataFrame
# (Spark 1.x API, using the spark-csv external package).
df = sqlContext.load(source="com.databricks.spark.csv",
                     header="true",
                     path="/user/cloudera/bodyfile")

# In[99]:

# Show the schema inferred from the CSV header and count the rows.
df.printSchema()
df.count()

# In[ ]:

# The DataFrame's column names, based on the CSV header.
df.columns

# In[ ]:

# Register the DataFrame as a Spark SQL table called 'tl' so we can run
# queries using SQL syntax, then cache it in memory for faster lookups.
sqlContext.registerDataFrameAsTable(df, 'tl')
sqlContext.cacheTable('tl')

# Observed cache status (from the Spark UI):
# RDD Name            Storage Level                  Cached Partitions  Fraction Cached  Size in Memory  Size in Tachyon  Size on Disk
# In-memory table tl  Memory Deserialized 1x Replicated  5              100%             286.2 MB        0.0 B            0.0 B
# ### Keyword search
# #### Did the user execute any files that seem potentially malicious?

# In[92]:

# Collect all the rows matching the keyword/date filter into a Python list.
filtered = sqlContext.sql(
    "select * from tl where `date` like '01/%/2015' and short like '%wire%'"
).collect()

# Print out the results.
# print() with a single argument works in both Python 2 and Python 3,
# unlike the original `print x` statement syntax.
for row in filtered:
    print(row.date + " " + row.time + " " + row.source + " " +
          row.MACB + " " + row.short + " " + row.desc)

# - On 01/27/2015 the user appears to have executed a file named
#   wire_tr91297_pdf.exe that was compressed as a ZIP file (Temp1_wire_tr91297.zip)
# - It was not possible to recover the file from the file-system
# - It would be reasonable to conclude this file was malware
# - We can be certain the malware was executed, but it is unclear whether it
#   continued to run normally

# ## System activity: Deleted files
#
# ### On which dates were files deleted the most?

# In[89]:

deletedFilesDF = sqlContext.sql(
    "SELECT `date`, short FROM tl "
    "WHERE `date` LIKE '%/%/2015' AND short LIKE '%DELETED%'"
)
deletedFilesRowList = deletedFilesDF.collect()

# In[90]:

# Split the rows into parallel date / description lists.
deletedFileListDate = [row.date for row in deletedFilesRowList]
deletedFileList = [row.short for row in deletedFilesRowList]

# In[91]:

import pandas as pd
from collections import Counter

# Deletions per date.  pd.Series accepts a dict/Counter directly, so there is
# no need for the Python-2-only Counter.iteritems() loop, and pd.TimeSeries
# (removed from modern pandas) is replaced by the plain pd.Series.
dates = Counter(deletedFileListDate)
ts = pd.Series(dates)
# figsize is passed to pandas' plot instead of relying on the implicit
# pylab `figure(...)` call, which only exists under %pylab magic.
ts.plot(kind="barh", figsize=(10, 8))

# #### Display the files that were deleted
# (limited to 10 results to avoid bloating the notebook with results)

# In[114]:

# `shown` caps the output at 10 rows; also avoids shadowing the builtin `file`.
shown = 0
for row in deletedFilesRowList:
    if row.date in ('07/15/2015', '07/13/2015', '08/11/2015'):
        if shown < 10:
            print(row)
            shown += 1

# - On 07/15/2015 and 07/13/2015 the user deleted browsing history
# - On 08/11/2015 an application was uninstalled
# ### Web history on the dates in question

# In[96]:

visitedList = []
myDates = ['12/11/2014', '01/06/2015', '01/27/2015']
# Renamed the loop variables: the original reused `i` for both the outer
# (date) and inner (row) loops, which shadowed the date mid-iteration.
for day in myDates:
    webhist = sqlContext.sql(
        "select * from tl where source='WEBHIST' and `date` like '%s' limit 20 " % (day)
    ).collect()
    for row in webhist:
        visitedList.append(row.date + " " + row.short)

# In[115]:

for entry in visitedList:
    # Each entry looks like "<date> <short description>"; token [2] is
    # presumably the URL within the WEBHIST short description — verify
    # against the actual log2timeline output format.
    parts = entry.split(" ")
    url = parts[0] + " " + parts[2]
    print(url)