# All the imports and some basic level setting with various versions
import IPython
import re
import os
import json
import time
import math
import pylab
import string
import pickle
import struct
import socket
import collections
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

print "IPython version: %s" % IPython.__version__
print "pandas version: %s" % pd.__version__
print "numpy version: %s" % np.__version__

%matplotlib inline

# The AV engines whose results we pull out of each VirusTotal report
engines = ['Symantec', 'Sophos', 'F-Prot', 'Kaspersky', 'McAfee', 'Malwarebytes']

# Flatten a VirusTotal report into a small dict: an overall label, the number
# of positives, and the per-engine detection names
def extract_vtdata(data):
    vt = {}
    if 'scans' in data:
        if data['positives'] > 0:
            vt['label'] = 'malicious'
        else:
            vt['label'] = 'nonmalicious'
        vt['positives'] = data['positives']
        for eng in engines:
            if eng in data['scans']:
                # Check that the 'result' key exists before dereferencing it
                if 'result' not in data['scans'][eng] or data['scans'][eng]['result'] is None:
                    vt[eng] = 'no detection'
                else:
                    vt[eng] = data['scans'][eng]['result']
    else:
        vt['label'] = 'no results'
        for eng in engines:
            vt[eng] = 'no results'
        vt['positives'] = 0
    return vt

# Load every .vtdata file and key the flattened report by sample name
def load_vt_data(file_list):
    features_list = {}
    for filename in file_list:
        with open(filename, 'rb') as f:
            features = extract_vtdata(json.loads(f.read()))
            fname = os.path.split(filename)[1].split('.')[0]
            features_list[fname] = features
    return features_list

import glob
file_list = glob.glob('/Users/user/vt_data/*.vtdata')
vt_data = load_vt_data(file_list)
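
# Not part of the original notebook: a minimal, hypothetical example of the
# report structure extract_vtdata() expects, so the flattening above is easier
# to follow. Field names mirror the VirusTotal v2 report format used by the
# code; the values here are made up.
sample_report = {
    'positives': 2,
    'scans': {
        'Symantec': {'detected': True, 'result': 'OSX.Trojan.Gen'},
        'Sophos': {'detected': False, 'result': None},
    }
}
print extract_vtdata(sample_report)
# -> {'label': 'malicious', 'positives': 2, 'Symantec': 'OSX.Trojan.Gen',
#     'Sophos': 'no detection'}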
# This simply loads up the JSON and flattens it. FAT binaries are broken down
# into a feature vector for each architecture.
def extract_features(filename, data):
    all_features = []
    if not 'error' in data['characteristics']['macho']:
        for i in range(data['characteristics']['macho']['number of architectures']):
            features = {}
            #features['magic'] = int(data['characteristics']['macho']['header'][i]['magic'], 0)
            #features['h_size'] = data['characteristics']['macho']['header'][i]['size']
            #features['h_offset'] = data['characteristics']['macho']['header'][i]['offset']
            for command in data['verbose']['macho']['header'][i]['commands']:
                if command['cmd_name'] in ['LC_SEGMENT', 'LC_SEGMENT_64']:
                    bits = ''
                    if command['cmd_name'] == 'LC_SEGMENT_64':
                        bits = '64'
                    if command['segname'] == '__PAGEZERO':
                        features['lc_segment_' + bits + '_vmaddr'] = command['vmaddr']
                        features['lc_segment_' + bits + '_vmsize'] = command['vmsize']
                        features['lc_segment_' + bits + '_filesize'] = command['filesize']
                        features['lc_segment_' + bits + '_fileoff'] = command['fileoff']
                if command['cmd_name'] == 'LC_VERSION_MIN_MACOSX':
                    features['lc_version_min_macosx_min_version'] = float('.'.join(command['version'].split('.')[:2]))
                if command['cmd_name'] == 'LC_SYMTAB':
                    features['lc_symtab_strsize'] = command['strsize']
                    features['lc_symtab_stroff'] = command['stroff']
                    features['lc_symtab_symoff'] = command['symoff']
                    features['lc_symtab_nsyms'] = command['nsyms']
                if command['cmd_name'] in ['LC_DYLD_INFO_ONLY', 'LC_DYLD_INFO']:
                    features['lc_dyld_info_lazy_bind_size'] = command['lazy_bind_size']
                    features['lc_dyld_info_rebase_size'] = command['rebase_size']
                    features['lc_dyld_info_lazy_bind_off'] = command['lazy_bind_off']
                    features['lc_dyld_info_export_off'] = command['export_off']
                    features['lc_dyld_info_export_size'] = command['export_size']
                    features['lc_dyld_info_bind_off'] = command['bind_off']
                    features['lc_dyld_info_rebase_off'] = command['rebase_off']
                    features['lc_dyld_info_bind_size'] = command['bind_size']
                    features['lc_dyld_info_weak_bind_size'] = command['weak_bind_size']
                    features['lc_dyld_info_weak_bind_off'] = command['weak_bind_off']
                if command['cmd_name'] == 'LC_DYSYMTAB':
                    features['lc_dysymtab_nextdefsym'] = command['nextdefsym']
                    features['lc_dysymtab_extreloff'] = command['extreloff']
                    features['lc_dysymtab_nlocrel'] = command['nlocrel']
                    features['lc_dysymtab_modtaboff'] = command['modtaboff']
                    features['lc_dysymtab_iundefsym'] = command['iundefsym']
                    features['lc_dysymtab_ntoc'] = command['ntoc']
                    features['lc_dysymtab_ilocalsym'] = command['ilocalsym']
                    features['lc_dysymtab_nundefsym'] = command['nundefsym']
                    features['lc_dysymtab_nextrefsyms'] = command['nextrefsyms']
                    features['lc_dysymtab_locreloff'] = command['locreloff']
                    features['lc_dysymtab_nmodtab'] = command['nmodtab']
                    features['lc_dysymtab_nlocalsym'] = command['nlocalsym']
                    features['lc_dysymtab_tocoff'] = command['tocoff']
                    features['lc_dysymtab_extrefsymoff'] = command['extrefsymoff']
                    features['lc_dysymtab_nindirectsyms'] = command['nindirectsyms']
                    features['lc_dysymtab_iextdefsym'] = command['iextdefsym']
                    features['lc_dysymtab_nextrel'] = command['nextrel']
                    features['lc_dysymtab_indirectsymoff'] = command['indirectsymoff']
            features.update(data['verbose']['macho']['header'][i]['command type count'])
            if 'LC_SEGMENT' in features:
                features['number of segments'] = features['LC_SEGMENT']
            else:
                features['number of segments'] = features.get('LC_SEGMENT_64', 0)
            # Strip the leading './' and trailing '.results' from the path
            features['filename'] = filename[2:-8]
            # Remove some more features
            for lc in ['LC_MAIN', 'LC_UNIXTHREAD']:
                if lc in features:
                    features.pop(lc, None)
            # Use a separate variable for the sample name; reassigning filename
            # here would corrupt the path on the next architecture of a FAT binary
            sample_name = os.path.split(filename)[1].split('.')[0]
            if sample_name in vt_data:
                for eng in engines:
                    if eng in vt_data[sample_name]:
                        features[eng] = vt_data[sample_name][eng]
                    else:
                        features[eng] = 'no result'
                features['label'] = vt_data[sample_name]['label']
                features['positives'] = vt_data[sample_name]['positives']
            else:
                for eng in engines:
                    features[eng] = 'no result'
                features['label'] = 'no result'
            all_features.append(features)
    return all_features

# Read each .results file (one JSON document per sample) and build the full
# feature list
def load_files(file_list):
    features_list = []
    for filename in file_list:
        with open(filename, 'rb') as f:
            features = extract_features(filename, json.loads(f.read()))
            features_list.extend(features)
    return features_list

file_list = glob.glob('./*.results')
features = load_files(file_list)
print "Files:", len(file_list)
print "Number of feature vectors:", len(features)

df = pd.DataFrame.from_records(features)
# Missing load-command counts default to 0, everything else to -1
for col in df.columns:
    if col.startswith('LC_'):
        df[col].fillna(0, inplace=True)
df.fillna(-1, inplace=True)
print df.shape
df.head()

# Brief overview of the various things detected by Symantec in this dataset;
# really we just want to verify that we have some data
df[engines].Symantec.value_counts().head(10)

ignore_cols = engines + ['filename', 'label', 'positives']
cols = [x for x in df.columns.tolist() if x not in ignore_cols]
X = df.as_matrix(cols)

from sklearn.preprocessing import scale
X = scale(X)

from sklearn.decomposition import PCA
DDD = PCA(n_components=3).fit_transform(X)
DD = PCA(n_components=2).fit_transform(X)

from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure(figsize=plt.figaspect(.5))
ax = fig.add_subplot(1, 2, 1, projection='3d')
ax.scatter(DDD[:,0], DDD[:,1], DDD[:,2], s=50)
ax.set_title("Raw Data 3D")
ax = fig.add_subplot(1, 2, 2)
ax.scatter(DD[:,0], DD[:,1], s=50)
ax.set_title("Raw Data 2D")
plt.show()

from sklearn.cluster import DBSCAN
X = df.as_matrix(cols)
dbscan = DBSCAN(min_samples=3)
dbscan.fit(X)
labels1 = dbscan.labels_
labels1_u = np.unique(labels1)
nclusters = len(labels1_u)

dbscan_df = df[['label', 'filename', 'positives'] + engines].copy()
dbscan_df['cluster'] = labels1
print "Number of clusters: %d" % nclusters
print "Labeled samples: %s" % dbscan_df[dbscan_df['cluster'] != -1].filename.value_counts().sum()
print "Unlabeled samples: %s" % dbscan_df[dbscan_df['cluster'] == -1].filename.value_counts().sum()
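
# Not in the original notebook: DBSCAN's other knob, eps, is left at the
# scikit-learn default (0.5) above. A common way to eyeball a better value is
# a k-distance plot: sort every point's distance to its k-th nearest neighbor
# and look for the "knee". A minimal sketch, assuming X is the matrix DBSCAN
# was just fit on:
from sklearn.neighbors import NearestNeighbors
knn = NearestNeighbors(n_neighbors=3).fit(X)
distances, _ = knn.kneighbors(X)
plt.plot(np.sort(distances[:, -1]))
plt.ylabel("Distance to 3rd nearest neighbor")
plt.title("k-distance plot for choosing DBSCAN eps")
plt.show()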
dbscan_df.groupby(['cluster', 'label']).count()[['filename']].head(10)

# Flag any cluster that contains both malicious and non-malicious samples
clusters = set()
for name, group in dbscan_df.groupby(['cluster', 'label'])['label']:
    if name[0] in clusters:
        print "%s Cluster has both Malicious and Non-Malicious Samples" % name[0]
    clusters.add(name[0])

sample_cluster = 3
dbscan_df[dbscan_df['cluster'] == sample_cluster][engines]
for eng in engines:
    print "%s - %s" % (eng, len(dbscan_df[dbscan_df['cluster'] == sample_cluster][eng].unique().tolist()))

# This is a ballpark to see what might be a good number of components to
# reduce our original 66 features to
X = df.as_matrix(cols)
X = scale(X)
pca = PCA().fit(X)
n_comp = len([x for x in pca.explained_variance_ if x > 1.0])
print "Number of components w/explained variance > 1: %s" % n_comp

X = df.as_matrix(cols)
X = scale(X)
X = PCA(n_components=n_comp).fit_transform(X)
dbscan = DBSCAN(min_samples=3)
dbscan.fit(X)
labels1 = dbscan.labels_
labels1_u = np.unique(labels1)
nclusters = len(labels1_u)

dbscan_df = df[['label', 'filename', 'positives'] + engines].copy()
dbscan_df['cluster'] = labels1
print "Number of clusters: %d" % nclusters
print "Labeled samples: %s" % dbscan_df[dbscan_df['cluster'] != -1].filename.value_counts().sum()
print "Unlabeled samples: %s" % dbscan_df[dbscan_df['cluster'] == -1].filename.value_counts().sum()

# Show what we're left with (number of samples per cluster)
dbscan_df.cluster.value_counts().head(10)

# Remove unlabeled samples for graphing to make it prettier
df['cluster'] = dbscan_df['cluster']
tempdf = df[df['cluster'] != -1].reset_index(drop=True)
X = tempdf.as_matrix(cols)
X = scale(X)
DDD = PCA(n_components=3).fit_transform(X)
DD = PCA(n_components=2).fit_transform(X)

fig = plt.figure(figsize=plt.figaspect(.5))
ax = fig.add_subplot(2, 2, 1, projection='3d')
ax.scatter(DDD[:,0], DDD[:,1], DDD[:,2], c=tempdf['cluster'], s=50)
ax.set_title("DBSCAN Clusters")
ax = fig.add_subplot(2, 2, 2, projection='3d')
ax.set_xlim(-10, 5)
ax.set_ylim(-10, 15)
ax.set_zlim(-30, 5)
ax.scatter(DDD[:,0], DDD[:,1], DDD[:,2], c=tempdf['cluster'], s=50)
ax.set_title("DBSCAN Clusters (zoomed in)")
ax = fig.add_subplot(2, 2, 3)
ax.scatter(DD[:,0], DD[:,1], c=tempdf['cluster'], s=50)
ax.set_title("DBSCAN Clusters")
ax = fig.add_subplot(2, 2, 4)
ax.set_xlim(-6, 5)
ax.set_ylim(-15, 10)
ax.scatter(DD[:,0], DD[:,1], c=tempdf['cluster'], s=50)
ax.set_title("DBSCAN Clusters (zoomed in)")
plt.show()
#df.drop('cluster', axis=1, inplace=True)

dbscan_df.groupby(['cluster', 'label']).count()[['filename']].head(10)

clusters = set()
print "Total Number of Clusters: %s\n" % (len(dbscan_df['cluster'].unique().tolist()))
for name, group in dbscan_df.groupby(['cluster', 'label'])['label']:
    if name[0] in clusters:
        print "%s Cluster has both Malicious and Non-Malicious Samples" % name[0]
    clusters.add(name[0])

sample_cluster = 0
dbscan_df[dbscan_df['cluster'] == sample_cluster][engines]
for eng in engines:
    print "%s - %s" % (eng, len(dbscan_df[dbscan_df['cluster'] == sample_cluster][eng].unique().tolist()))
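
# Not in the original notebook: a small, hypothetical helper that quantifies
# how mixed each cluster is, rather than eyeballing the groupby output above.
# Assumes a frame with 'cluster' and 'label' columns, like dbscan_df.
def cluster_purity(cluster_df):
    for cluster_id, group in cluster_df.groupby('cluster'):
        counts = group['label'].value_counts()
        purity = 100.0 * counts.iloc[0] / counts.sum()
        print "cluster %2d: %3d samples, %5.1f%% '%s'" % (cluster_id, counts.sum(), purity, counts.index[0])

cluster_purity(dbscan_df)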
import yara_signature

# Gather every sample in one DBSCAN cluster so we can build a YARA signature
# from the features they all share
name = 3
fdf = pd.DataFrame()
for f in dbscan_df[dbscan_df['cluster'] == name].filename.tolist():
    fdf = fdf.append(df[df['filename'] == f], ignore_index=True)

# Choose a sample from the cluster to use as the basis of the sig
# w/the attributes below
filename = fdf.filename.value_counts().index[0]
meta = {"author": "sconzo", "email": "sconzo_at_clicksecurity_dot_com"}
sig = yara_signature.yara_macho_generator.YaraMachoGenerator("/Users/sconzo/macho-yara/macho/" + filename, samplename="Cluster_" + str(name), meta=meta)

lc_cmds = []
lc_symtab = []
lc_dysymtab = []
lc_dyld_info = []
lc_segment = []
lc_segment_64 = []

# Keep only features that are identical (and set) across the whole cluster
for col in fdf.columns:
    if len(fdf[col].unique()) == 1:
        if fdf[col].unique()[0] != 0:
            lower = [s for s in col if s.islower()]
            if fdf[col].unique()[0] > 0 or (len(lower) == len(col)):
                if col.startswith('LC_'):
                    lc_cmds.append(col)
                # Check the 64-bit prefix first so those columns don't also
                # land in the 32-bit segment list
                if col.startswith('lc_segment_64_'):
                    lc_segment_64.append(col)
                elif col.startswith('lc_segment_'):
                    lc_segment.append(col)
                if col.startswith('lc_symtab_'):
                    lc_symtab.append(col)
                if col.startswith('lc_dysymtab_'):
                    lc_dysymtab.append(col)
                if col.startswith('lc_dyld_info_'):
                    lc_dyld_info.append(col)

if len(lc_symtab) > 0:
    lc_cmds = [x for x in lc_cmds if x != 'LC_SYMTAB']
    lc_symtab = set([x[10:] for x in lc_symtab])
    sig.add_symtab(lc_symtab)

if len(lc_dysymtab) > 0:
    lc_cmds = [x for x in lc_cmds if x != 'LC_DYSYMTAB']
    lc_dysymtab = set([x[12:] for x in lc_dysymtab])
    sig.add_dysymtab(lc_dysymtab)

if len(lc_dyld_info) > 0:
    lc_cmds = [x for x in lc_cmds if x != 'LC_DYLD_INFO']
    lc_cmds = [x for x in lc_cmds if x != 'LC_DYLD_INFO_ONLY']
    lc_dyld_info = set([x[13:] for x in lc_dyld_info])
    sig.add_dyld_info(lc_dyld_info)

if len(lc_segment) > 0:
    lc_cmds = [x for x in lc_cmds if x != 'LC_SEGMENT']
    lc_segment = set([x[12:] for x in lc_segment])
    sig.add_segment(lc_segment)

if len(lc_segment_64) > 0:
    lc_cmds = [x for x in lc_cmds if x != 'LC_SEGMENT_64']
    lc_segment_64 = set([x[14:] for x in lc_segment_64])
    sig.add_segment(lc_segment_64)

# Both version-min commands are handled with the same generator call here
if 'LC_VERSION_MIN_IPHONEOS' in lc_cmds:
    lc_cmds = [x for x in lc_cmds if x != 'LC_VERSION_MIN_IPHONEOS']
    sig.add_version_min_macosx()
if 'LC_VERSION_MIN_MACOSX' in lc_cmds:
    lc_cmds = [x for x in lc_cmds if x != 'LC_VERSION_MIN_MACOSX']
    sig.add_version_min_macosx()

[sig.add_lc(x) for x in lc_cmds]
print sig.get_signature()

from sklearn.cluster import KMeans
X = df.as_matrix(cols)
X = scale(X)
# Rule of thumb of k = sqrt(#samples/2), thanks Wikipedia :)
k_clusters = int(math.sqrt(len(X) / 2))
kmeans = KMeans(n_clusters=k_clusters)
kmeans.fit(X)
labels1 = kmeans.labels_
df['cluster'] = labels1
labels1_u = np.unique(labels1)
nclusters = len(labels1_u)

kmeans_df = df[['label', 'filename', 'positives'] + engines].copy()
kmeans_df['cluster'] = labels1
print "Number of clusters: %d" % k_clusters
kmeans_df['cluster'].value_counts().head(10)

X = df.as_matrix(cols)
X = scale(X)
X = PCA(n_components=3).fit_transform(X)
fig = plt.figure(figsize=plt.figaspect(.5))
ax = fig.add_subplot(1, 2, 1, projection='3d')
ax.scatter(X[:,0], X[:,1], X[:,2], c=kmeans_df['cluster'], s=50)
ax.set_title("KMeans Clusters")
ax = fig.add_subplot(1, 2, 2, projection='3d')
ax.set_xlim(-10, 2)
ax.set_ylim(10, 35)
ax.set_zlim(-20, 10)
ax.scatter(X[:,0], X[:,1], X[:,2], c=kmeans_df['cluster'], s=50)
ax.set_title("KMeans Clusters (zoomed in)")
plt.show()

clusters = set()
for name, group in kmeans_df.groupby(['cluster', 'label'])['label']:
    if name[0] in clusters:
        print "%s Cluster has both Malicious and Non-Malicious Samples" % name[0]
    clusters.add(name[0])

sample_cluster = 5
kmeans_df[kmeans_df['cluster'] == sample_cluster][engines].head()
for eng in engines:
    print "%s - %s" % (eng, len(kmeans_df[kmeans_df['cluster'] == sample_cluster][eng].unique().tolist()))
kmeans_df[kmeans_df['cluster'] == sample_cluster]['Symantec'].value_counts()
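
# Not in the original notebook: the sqrt(n/2) rule above is only a starting
# point for k. A quick sanity check is to sweep k and plot the KMeans inertia
# (within-cluster sum of squares), looking for an "elbow". A minimal sketch,
# assuming df and cols are still as defined above:
X_elbow = scale(df.as_matrix(cols))
k_values = range(2, 21)
inertias = [KMeans(n_clusters=k).fit(X_elbow).inertia_ for k in k_values]
plt.plot(k_values, inertias, marker='o')
plt.xlabel("k")
plt.ylabel("Inertia")
plt.title("Elbow check for choosing k")
plt.show()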
X = df.as_matrix(cols)
X = scale(X)
X = PCA(n_components=n_comp).fit_transform(X)
# Rule of thumb of k = sqrt(#samples/2), thanks Wikipedia :)
k_clusters = int(math.sqrt(len(X) / 2))
kmeans = KMeans(n_clusters=k_clusters)
kmeans.fit(X)
labels1 = kmeans.labels_
df['cluster'] = labels1
labels1_u = np.unique(labels1)
nclusters = len(labels1_u)

kmeans_df = df[['label', 'filename', 'positives'] + engines].copy()
kmeans_df['cluster'] = labels1
print "Number of clusters: %d" % nclusters
print
print "Cluster/Sample Layout"
print df.cluster.value_counts().head(10)
print

X = df.as_matrix(cols)
X = scale(X)
X = PCA(n_components=3).fit_transform(X)
fig = plt.figure(figsize=plt.figaspect(.5))
ax = fig.add_subplot(1, 2, 1, projection='3d')
ax.scatter(X[:,0], X[:,1], X[:,2], c=kmeans_df['cluster'], s=50)
ax.set_title("KMeans Clusters")
ax = fig.add_subplot(1, 2, 2, projection='3d')
ax.set_xlim(-10, 2)
ax.set_ylim(15, 30)
ax.set_zlim(-20, 0)
ax.scatter(X[:,0], X[:,1], X[:,2], c=kmeans_df['cluster'], s=50)
ax.set_title("KMeans Clusters (zoomed in)")
plt.show()

clusters = set()
for name, group in kmeans_df.groupby(['cluster', 'label'])['label']:
    if name[0] in clusters:
        print "%s Cluster has both Malicious and Non-Malicious Samples" % name[0]
    clusters.add(name[0])

sample_cluster = 4
kmeans_df[kmeans_df['cluster'] == sample_cluster][engines].head()
for eng in engines:
    print "%s - %s" % (eng, len(kmeans_df[kmeans_df['cluster'] == sample_cluster][eng].unique().tolist()))

from sklearn.cluster import MeanShift, estimate_bandwidth
X = df.as_matrix(cols)
X = scale(X)
ebw = estimate_bandwidth(X)
ms1 = MeanShift(bandwidth=ebw)
ms1.fit(X)
labels1 = ms1.labels_
labels1_u = np.unique(labels1)
nclusters = len(labels1_u)

meanshift_df = df[['label', 'filename', 'positives'] + engines].copy()
meanshift_df['cluster'] = labels1
print "Estimated Bandwidth: %s" % ebw
print "Number of clusters: %d" % nclusters

X = df.as_matrix(cols)
X = scale(X)
X = PCA(n_components=3).fit_transform(X)
fig = plt.figure(figsize=plt.figaspect(.5))
ax = fig.add_subplot(1, 2, 1, projection='3d')
ax.scatter(X[:,0], X[:,1], X[:,2], c=meanshift_df['cluster'], s=50)
ax.set_title("MeanShift Clusters")
ax = fig.add_subplot(1, 2, 2, projection='3d')
ax.set_xlim(-10, 2)
ax.set_ylim(15, 30)
ax.set_zlim(-20, 0)
ax.scatter(X[:,0], X[:,1], X[:,2], c=meanshift_df['cluster'], s=50)
ax.set_title("MeanShift Clusters (zoomed in)")
plt.show()

meanshift_df.cluster.value_counts().head(10)

clusters = set()
for name, group in meanshift_df.groupby(['cluster', 'label'])['label']:
    if name[0] in clusters:
        print "%s Cluster has both Malicious and Non-Malicious Samples" % name[0]
    clusters.add(name[0])

sample_cluster = 2
meanshift_df[meanshift_df['cluster'] == sample_cluster][engines].head()
for eng in engines:
    print "%s - %s" % (eng, len(meanshift_df[meanshift_df['cluster'] == sample_cluster][eng].unique().tolist()))

X = df.as_matrix(cols)
X = scale(X)
X = PCA(n_components=n_comp).fit_transform(X)
ebw = estimate_bandwidth(X)
ms1 = MeanShift(bandwidth=ebw)
ms1.fit(X)
labels1 = ms1.labels_
labels1_u = np.unique(labels1)
nclusters = len(labels1_u)

meanshift_df = df[['label', 'filename', 'positives'] + engines].copy()
meanshift_df['cluster'] = labels1
# Update df's cluster column before printing the layout, so we report the
# MeanShift clusters rather than the stale KMeans ones
df['cluster'] = meanshift_df['cluster']
print "Estimated Bandwidth: %s" % ebw
print "Number of clusters: %d" % nclusters
print
print "Cluster/Sample Layout"
print df.cluster.value_counts().head(10)
print
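
# Not in the original notebook: estimate_bandwidth's quantile parameter
# (default 0.3) strongly affects how many clusters MeanShift finds. A quick,
# hypothetical sweep to see how sensitive the result is, assuming X is the
# scaled, PCA-reduced matrix from the cell above:
for q in [0.1, 0.2, 0.3, 0.4, 0.5]:
    bw = estimate_bandwidth(X, quantile=q)
    ms = MeanShift(bandwidth=bw).fit(X)
    print "quantile %.1f -> bandwidth %.2f, %d clusters" % (q, bw, len(np.unique(ms.labels_)))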
# Once again we can remove, in this case, the largest cluster for a less dense graph
tempdf = df[df['cluster'] != 0].reset_index(drop=True)
X = tempdf.as_matrix(cols)
X = scale(X)
X = PCA(n_components=3).fit_transform(X)
fig = plt.figure(figsize=plt.figaspect(.5))
ax = fig.add_subplot(1, 2, 1, projection='3d')
ax.scatter(X[:,0], X[:,1], X[:,2], c=tempdf['cluster'], s=50)
ax.set_title("MeanShift Clusters")
ax = fig.add_subplot(1, 2, 2, projection='3d')
ax.set_xlim(-10, 2)
ax.set_ylim(15, 30)
ax.set_zlim(-20, 0)
ax.scatter(X[:,0], X[:,1], X[:,2], c=tempdf['cluster'], s=50)
ax.set_title("MeanShift Clusters (zoomed in)")
plt.show()

clusters = set()
for name, group in meanshift_df.groupby(['cluster', 'label'])['label']:
    if name[0] in clusters:
        print "%s Cluster has both Malicious and Non-Malicious Samples" % name[0]
    clusters.add(name[0])

sample_cluster = 2
meanshift_df[meanshift_df['cluster'] == sample_cluster][engines].head()
for eng in engines:
    print "%s - %s" % (eng, len(meanshift_df[meanshift_df['cluster'] == sample_cluster][eng].unique().tolist()))
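
# Not in the original notebook: a small sketch comparing how much the three
# algorithms agree with each other, using the adjusted Rand index (1.0 means
# identical partitions). Assumes dbscan_df, kmeans_df, and meanshift_df still
# hold their 'cluster' columns for the same row order.
from sklearn.metrics import adjusted_rand_score
print "DBSCAN vs KMeans:    %.3f" % adjusted_rand_score(dbscan_df['cluster'], kmeans_df['cluster'])
print "DBSCAN vs MeanShift: %.3f" % adjusted_rand_score(dbscan_df['cluster'], meanshift_df['cluster'])
print "KMeans vs MeanShift: %.3f" % adjusted_rand_score(kmeans_df['cluster'], meanshift_df['cluster'])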