from IPython.display import YouTubeVideo
YouTubeVideo('R4OlXb9aTvQ')

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%pylab inline

# Load the Alexa top-2k domains, keeping only the second column (the URI)
alexadf = pd.read_csv('top2k.csv', names=['uri'], usecols=[1], header=None, encoding='utf-8')
alexadf.head(2)

# Strip everything from the first '.' onward so only the domain name remains
alexadf['uri'] = alexadf['uri'].apply(lambda x: x[:x.find('.')]).astype(str)
alexadf = alexadf.dropna()
alexadf = alexadf.drop_duplicates()
alexadf['label'] = 0  # Create label identifying values as nominal
alexadf.head(2)

# Load the known-malicious domain list, skipping the 17-line file header
malicious_domains = pd.read_csv('suspiciousdomains_High.txt', encoding='utf-8', skiprows=17, header=None, nrows=2200)
malicious_domains.columns = ['uri']
malicious_domains['uri'] = malicious_domains['uri'].apply(lambda x: x[:x.find('.')]).astype(str)
malicious_domains = malicious_domains.dropna()
malicious_domains = malicious_domains.drop_duplicates()
malicious_domains['label'] = 1
malicious_domains.head(2)
#malicious_domains.info()

# Stack the nominal and malicious frames into one labeled dataset
df = pd.concat([alexadf, malicious_domains], ignore_index=True)
df['length'] = [len(x) for x in df['uri']]
df.head(2)

from collections import Counter

# Remember: mixing standard Python functions and numpy functions is very slow,
# so the sum over character counts uses the builtin sum()
def calcEntropy(x):
    """Shannon entropy (bits per character) of the string x."""
    p, lens = Counter(x), float(len(x))
    return -sum(count/lens * np.log2(count/lens) for count in p.values())

df['entropy'] = [calcEntropy(x) for x in df['uri']]
df.sort('entropy', ascending=False)[:5]

from scipy import stats

#plt.rcParams['figure.figsize'] = 13, 7
sns.set_context(rc={"figure.figsize": (7, 5)})
g = sns.JointGrid(df.length.astype(float), df.entropy.astype(float))
g.plot(sns.regplot, sns.distplot, stats.spearmanr);
print "Pearson's r: {0}".format(stats.pearsonr(df.length.astype(float), df.entropy.astype(float)))

sns.set_context(rc={"figure.figsize": (7, 5)})
dfNominal = df[df['label'] == 0]
dfDGA = df[df['label'] == 1]

def shadedHist(df, col, bins):
    """Histogram of df[col] with the mean and the mean + 2*std marked,
    and the region beyond two standard deviations shaded in red."""
    df[col].hist(bins=bins, color='dodgerblue', alpha=.6, normed=False)
    len_mean = df[col].mean()
    len_std = df[col].std()
    # mpl
    plt.plot([len_mean, len_mean], [0, 2500], 'k-', lw=3, color='black', alpha=.4)
    plt.plot([len_mean + (2 * len_std), len_mean + (2 * len_std)], [0, 2500], 'k-', lw=2, color='red', alpha=.4)
    plt.axvspan(len_mean + (2 * len_std), max(df[col]), facecolor='r', alpha=0.3)
    plt.title(col)

sns.set_context(rc={"figure.figsize": (7, 5)})
shadedHist(df[df['label'] == 0], 'entropy', 10)

nominal_parametric_upper = dfNominal['entropy'].mean() + \
                           2 * dfNominal['entropy'].std()
print nominal_parametric_upper

sns.set_context(rc={"figure.figsize": (7, 5)})
shadedHist(dfDGA, 'entropy', 10)

from sklearn.feature_extraction.text import CountVectorizer

# Character 3- and 4-grams as features; fit once on the nominal domains
# for inspection, then refit on the full corpus for modeling
cv = CountVectorizer(analyzer='char', ngram_range=(3, 4))
cv_nominal = cv.fit_transform(df[df['label'] == 0]['uri'])
cv_all = cv.fit_transform(df['uri'])
feature_names = cv.get_feature_names()

import operator
sorted(cv.vocabulary_.iteritems(), key=operator.itemgetter(1), reverse=True)[0:5]

# Columns 2:4 are length and entropy; append the dense n-gram count matrix
dfConcat = pd.concat([df.ix[:, 2:4], pd.DataFrame(cv_all.toarray())], join='outer', axis=1, ignore_index=False)
dfConcat.head(3)
# cv.vocabulary_
# cv_nominal.toarray()
# cv.get_feature_names()
# cv.vocabulary_

X = dfConcat.values
y = df.ix[:, 1]
print X[0:3]
print ''
print y[0:3]

from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=22)

from sklearn.dummy import DummyClassifier

# Sanity-check baselines: any real model should beat these
for strategy in ['stratified', 'most_frequent', 'uniform']:
    clf = DummyClassifier(strategy=strategy, random_state=None)
    clf.fit(X_train, y_train)
    print strategy + ': ' + str(clf.score(X_test, y_test))
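# A quick illustrative check (an addition, not part of the original notebook):
# compare calcEntropy and the fitted CountVectorizer on a dictionary-like
# domain versus a random-looking string. The example strings are made up;
# a DGA-style string should score higher entropy and hit far fewer of the
# 3/4-grams learned from the corpus above.
for s in ['google', 'facebook', 'xkwqhzdtplm']:
    ngrams = cv.transform([s])
    print '{0}: entropy={1:.3f}, matching 3/4-grams={2}'.format(s, calcEntropy(s), ngrams.nnz)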
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score

rf = RandomForestClassifier(n_jobs=-1)
rf.fit(X_train, y_train)
print 'RF' + ': ' + str(rf.score(X_test, y_test))
# scores = cross_val_score(clf, X, y, cv=2)
# print scores

# English dictionary words, one per line
dfEng = pd.read_csv('eng_words.txt', names=['word'], header=None, dtype={'word': np.str}, encoding='utf-8')
# Convert all words to lowercase to match the domain name dataframes
dfEng['word'] = dfEng['word'].map(lambda x: np.str(x).strip().lower())
dfEng = dfEng.drop_duplicates()
dfEng = dfEng.dropna()
dfEng[10:15]

from sklearn.feature_extraction.text import CountVectorizer

# Fit character 3/4-grams on the English dictionary
cvEng = CountVectorizer(analyzer='char', ngram_range=(3, 4))
cvEngfeatures = cvEng.fit_transform(dfEng['word'])

import operator
print sorted(cvEng.vocabulary_.iteritems(), key=operator.itemgetter(1), reverse=True)[0:5]
cvEngfeatures

# Score a string by how strongly its 3/4-grams overlap the dictionary:
# log10 of each n-gram's corpus frequency, dotted with the string's n-gram counts
def engDictMatch(x):
    return str(np.log10(cvEngfeatures.sum(axis=0).getA1()) * cvEng.transform([x]).T)

print 'English dictionary match: ' + str(engDictMatch('yahoo'))
print 'English dictionary match: ' + str(engDictMatch('drudgereport'))
print 'English dictionary match: ' + str(engDictMatch('32tsdgseg'))

# Vectorized version of the same score over the whole uri column
df['dictMatch'] = np.log10(cvEngfeatures.sum(axis=0).getA1()) * cvEng.transform(df['uri']).T
# df['dictMatch'] = df['uri'].apply(lambda x: engDictMatch(x))
df.head(3)

# Column 4 is the new dictMatch feature; prepend it to the earlier feature matrix
dfConcat2 = pd.concat([pd.DataFrame(df.ix[:, 4]), dfConcat], join='outer', axis=1, ignore_index=False)
X = dfConcat2.values
y = df.ix[:, 1]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=None)

rf = RandomForestClassifier(n_jobs=-1)
rf.fit(X_train, y_train)
print 'RF' + ': ' + str(rf.score(X_test, y_test))
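# An optional follow-up sketch (an addition, not part of the original
# notebook): accuracy alone hides the error types, so report per-class
# precision/recall and a cross-validated estimate, reusing the rf, X, y,
# and the train/test split defined above.
from sklearn.metrics import classification_report, confusion_matrix

y_pred = rf.predict(X_test)
print classification_report(y_test, y_pred, target_names=['nominal', 'DGA'])
print confusion_matrix(y_test, y_pred)

# 5-fold cross-validation gives a less split-dependent accuracy estimate
scores = cross_val_score(rf, X, y, cv=5)
print 'CV accuracy: {0:.3f} +/- {1:.3f}'.format(scores.mean(), scores.std())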