#!/usr/bin/env python
# coding: utf-8

# In[2]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
import gzip
import json
import time
import pickle

get_ipython().run_line_magic('matplotlib', 'inline')


# In[3]:

import xml.etree.ElementTree as ET

# Gather the space-separated entries of every <Keywords> element of the
# language model file into a single set of known keywords.
root = ET.parse('c:/tmp/langs.model.xml')
keywords = []
for elem in root.findall('.//Keywords'):
    if elem.text:
        keywords.extend(elem.text.split(' '))
keywords = {s.strip() for s in keywords}


# In[4]:

# One JSON record per line. For each record keep the set of its 'source'
# entries that are known keywords, plus the record's 'language' label.
# Records without any known keyword are skipped.
code = []
languages = []
# The context manager guarantees the gzip handle is closed, even when a
# record fails to parse.
with gzip.open('c:/tmp/out.json.gz', mode='r') as records:
    for line in records:
        raw = json.loads(line)
        stripped = [el.strip() for el in raw['source']]
        tokens = {el for el in stripped if el and el in keywords}
        if not tokens:
            continue
        code.append(tokens)
        languages.append(raw['language'])
        # Progress report every 10000 *kept* records.
        if len(code) % 10000 == 0:
            print('%s processing %ds line...'
                  % (time.strftime('%H:%M:%S'), len(code)))


# In[5]:

Counter(languages).most_common()


# In[6]:

# Collapse compiler/runtime variants into one label per language family.
pl_norm = {
    'GNU C++': u'C/C++',
    'GNU C++11': u'C/C++',
    'MS C++': u'C/C++',
    'GNU C': u'C/C++',
    'GNU C11': u'C/C++',
    'Java 8': u'Java',
    'Java 7': u'Java',
    'Python 3': u'Python',
    'Python 2': u'Python',
    'PyPy 2': u'Python',
    'PyPy 3': u'Python',
    'FPC': u'Pascal',
    'Delphi': u'Pascal',
    'MS C#': u'C#',
    'Mono C#': u'C#',
}


# In[7]:

# Labels missing from pl_norm are kept unchanged.
languages_normalized = [pl_norm.get(l, l) for l in languages]
freqs = Counter(languages_normalized).most_common()
freqs


# In[8]:

# Same bar chart twice: linear scale on top, log scale below.
langs, freqs = zip(*freqs)
plt.figure(figsize=(6, 5))
for row, use_log in ((1, False), (2, True)):
    plt.subplot(2, 1, row)
    plt.bar(range(len(langs)), freqs, log=use_log)
    plt.xticks(np.arange(len(langs)) + 0.5, langs, rotation='vertical')
plt.tight_layout()
plt.show()


# In[9]:

# Each sample is a set, so this counts in how many samples a token occurs.
global_frequency = Counter()
for c in code:
    global_frequency.update(c)

print('Most common')
print(global_frequency.most_common(50))
print('')
print('Least common')
print(global_frequency.most_common()[::-1][:50])


# In[10]:

len(global_frequency)


# Let's remove some tokens that don't appear frequently enough (say <= 30)

# In[11]:

# Mutate the sets in place; the list of rare tokens is built before the set
# is modified, so the set is never changed while it is being iterated.
for cnt in code:
    cnt.difference_update([c for c in cnt if global_frequency[c] <= 30])


# In[12]:

global_frequency = Counter()
for c in code:
    global_frequency.update(c)
len(global_frequency)


# In[13]:

df = pd.DataFrame(data={'code_bow': code, 'language': languages_normalized})


# In[14]:

# For every language, count tokens over its first 50 samples and keep the
# 50 most common (token, count) pairs for the word cloud.
bow_total = []
for lang, group in df.groupby('language'):
    bows = Counter()
    for c in group['code_bow'][:50]:
        bows.update(c)
    bow_total.extend(bows.most_common(50))


# In[15]:

from wordcloud import WordCloud
from scipy.misc import imread


# In[16]:

mask = imread('c:/tmp/ellipse.png', flatten=True)
wc = WordCloud(background_color='white', mask=mask,
               font_path='c:/tmp/consola.ttf')
wc.generate_from_frequencies(bow_total)

plt.figure(figsize=(6 * 2, 4 * 2))
plt.imshow(wc)
plt.axis('off')
plt.show()


# ## Let's build a classifier

# In[17]:

# NOTE(review): `sklearn.cross_validation` and the `StratifiedKFold(y, k)`
# call form used below look like the legacy scikit-learn API - confirm the
# pinned scikit-learn version before upgrading.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import StratifiedKFold, StratifiedShuffleSplit
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score


# In[18]:

def unwrap_set(s):
    """Analyzer for CountVectorizer: a sample is already a set of tokens."""
    return list(s)


vectorizer = CountVectorizer(analyzer=unwrap_set)
X = vectorizer.fit_transform(code)


# In[19]:

X.shape


# First, train a model C/C++ vs rest

# In[20]:

c_not_c = np.where(df.language == 'C/C++', 'C/C++', 'OTHER')
Counter(c_not_c)


# In[21]:

# 10-fold stratified CV of a shallow tree. Out-of-fold predictions are
# collected for the confusion matrix, and the per-fold AUC is recorded.
skf = StratifiedKFold(c_not_c, 10)
out_of_fold_pred = c_not_c.copy()
aucs = []
for i, (train, test) in enumerate(skf):
    t0 = time.time()

    y_train = c_not_c[train]
    y_test = c_not_c[test]
    X_train = X[train]
    X_test = X[test]

    dt = DecisionTreeClassifier(max_depth=3)
    dt.fit(X_train, y_train)

    pred = dt.predict(X_test)
    out_of_fold_pred[test] = pred

    y_pred = dt.predict_proba(X_test)
    # Look the positive-class column up explicitly instead of assuming
    # that 'C/C++' is the first entry of dt.classes_.
    positive_col = list(dt.classes_).index('C/C++')
    auc = roc_auc_score(y_test == 'C/C++', y_pred[:, positive_col])
    aucs.append(auc)

    print("done in %fs" % (time.time() - t0))

print('%s %s' % (np.mean(aucs), np.std(aucs)))


# In[22]:
pd.DataFrame(confusion_matrix(c_not_c, out_of_fold_pred))


# Let's see what is misclassified

# In[23]:

# Share of each language's samples that the C/C++-vs-rest model got wrong.
totl = Counter(languages_normalized)
misclassified = Counter(df.language[c_not_c != out_of_fold_pred])
for name, n_wrong in misclassified.items():
    print('%s: %0.2f%%' % (name, n_wrong * 100.0 / totl[name]))


# Almost all Ocaml submissions are misclassified, and ~ 13% of Go.
# The rest seems fine. Will use this model for further analysis

# In[24]:

# Keep only the non-C/C++ samples. Rows are selected with an explicit
# integer index array rather than the tuple returned by np.where.
is_not_c = (df.language != 'C/C++').values
X_not_c = X[np.flatnonzero(is_not_c)]
lang_not_c = np.array(df.language[is_not_c])
y_dummies = pd.get_dummies(lang_not_c)
X_not_c.shape


# Let's reduce dimensionality with one-vs-all LASSO SVM

# In[25]:

from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel


# In[26]:

lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X_not_c, lang_not_c)
selection_model = SelectFromModel(lsvc, prefit=True)
X_new = selection_model.transform(X_not_c)
X_new.shape


# Important features:

# In[27]:

# get_support() is the public accessor for the selected-feature mask.
np.array(vectorizer.get_feature_names())[selection_model.get_support()]


# In[28]:

# 5-fold stratified CV. Feature selection is refitted on each training fold
# and then applied to both the training and the test part of that fold.
skf = StratifiedKFold(lang_not_c, 5)
out_of_fold_pred = lang_not_c.copy()
aucs = []
for i, (train, test) in enumerate(skf):
    t0 = time.time()

    y_train = lang_not_c[train]
    y_test = lang_not_c[test]
    X_train = X_not_c[train]
    X_test = X_not_c[test]

    lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X_train, y_train)
    selection_model = SelectFromModel(lsvc, prefit=True)
    X_train = selection_model.transform(X_train)
    X_test = selection_model.transform(X_test)

    dt_rest = DecisionTreeClassifier(max_depth=6)
    dt_rest.fit(X_train, y_train)

    pred = dt_rest.predict(X_test)
    out_of_fold_pred[test] = pred

    pred_score = dt_rest.predict_proba(X_test)
    # `test` holds positional row numbers, so select rows by position.
    auc = roc_auc_score(y_dummies.iloc[test], pred_score)
    aucs.append(auc)

    print("done in %fs" % (time.time() - t0))

print('%s %s' % (np.mean(aucs), np.std(aucs)))


# In[29]:

# Take the labels from the data itself, so the table does not depend on
# whichever classifier the last CV fold happened to leave behind.
labels = np.unique(lang_not_c)
cm = confusion_matrix(lang_not_c, out_of_fold_pred, labels=labels)
print('actual | predicted -->')
cm = pd.DataFrame(cm, columns=labels)
cm.index = labels
cm
# Model has some problems with Pascal, D and Go, but other than that the
# performance seems OK.
#
# So, the final model is:

# In[30]:

# http://chrisstrelioff.ws/sandbox/2015/06/08/decision_trees_in_python_with_scikit_learn_and_pandas.html
# http://stackoverflow.com/a/30104792
def get_code(tree, feature_names, spacer_base=' '):
    """Print a fitted decision tree as nested if/else pseudo-code.

    Every split is rendered as ``if contains('<feature>'):`` followed by
    the right subtree, then an ``else`` branch holding the left subtree.
    Every leaf is rendered as a ``return`` of its majority class, annotated
    with that class's share and count among the leaf's samples.

    Parameters
    ----------
    tree : fitted classifier exposing ``tree_`` and ``classes_``
    feature_names : sequence indexable by the feature ids stored in
        ``tree.tree_.feature``
    spacer_base : str
        Indentation unit, repeated once per nesting level.
    """
    left = tree.tree_.children_left
    right = tree.tree_.children_right
    feature = tree.tree_.feature
    value = tree.tree_.value
    target_names = tree.classes_

    def recurse(node, depth):
        spacer = spacer_base * depth
        # A node without children (-1) is a leaf. The feature name is only
        # looked up for split nodes, because leaves store a negative
        # placeholder feature id that must not be used as an index.
        if left[node] != -1:
            name = feature_names[feature[node]]
            # The "greater than threshold" (right) branch is printed first
            # and phrased as "contains" - it reads better for this problem.
            print(spacer + "if contains('" + name + "'):")
            recurse(right[node], depth + 1)
            print(spacer + "else: # doesn't contain '" + name + "'")
            recurse(left[node], depth + 1)
        else:
            target = value[node]
            total_sum = np.sum(target)
            target_name = target_names[target.argmax()]
            target_count = target.max()
            print(spacer + "return '%s' # (%0.3f, %d/%d examples)" %
                  (target_name, target_count / total_sum,
                   target_count, total_sum))

    recurse(0, 0)


# In[31]:

# Stage 1: C/C++ vs everything else, trained on all samples.
dt_first = DecisionTreeClassifier(max_depth=3)
dt_first.fit(X, np.where(df.language == 'C/C++', 'C/C++', 'OTHER'))

# Stage 2: tell the remaining languages apart, using only the features
# selected by the L1-penalised linear SVM.
is_other = df.language != 'C/C++'
y_noc = df.language[is_other]
X_noc = X[np.flatnonzero(is_other.values)]

lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X_noc, y_noc)
selection_model = SelectFromModel(lsvc, prefit=True)
X_noc = selection_model.transform(X_noc)

dt_rest = DecisionTreeClassifier(max_depth=6)
dt_rest.fit(X_noc, y_noc)


# In[32]:

get_code(dt_first, feature_names=vectorizer.get_feature_names())


# In[33]:

# Restrict the names to the selected columns so that they line up with the
# feature ids used by dt_rest.
features = np.array(vectorizer.get_feature_names())
features = features[selection_model.get_support()]
get_code(dt_rest, feature_names=features)