#!/usr/bin/env python
# coding: utf-8

# In[2]:


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from collections import Counter

import gzip
import json
import time
import pickle 
# `get_ipython` only exists inside an IPython/Jupyter session; when the file
# is run as a plain script the inline-plot magic is simply skipped.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass


# In[3]:


import xml.etree.ElementTree as ET

# Collect the distinct keywords listed in every <Keywords> element of the
# language-definition XML.
root = ET.parse('c:/tmp/langs.model.xml')

keywords = set()

for elem in root.findall('.//Keywords'):
    # Elements without text have nothing to contribute.
    if elem.text:
        # split() with no argument splits on any run of whitespace, so
        # keywords separated by newlines or tabs are separated as well and
        # no empty strings end up in the set.
        keywords.update(elem.text.split())


# In[4]:


# Parallel lists: code[k] is the keyword set of submission k and
# languages[k] is the language label stored with it.
code = []
languages = []

# Kept so that `i` is defined even when the dump contains no lines.
i = 0

# Stream the gzipped JSON-lines dump; the with-block closes the file even if
# a line fails to parse.
with gzip.open('c:/tmp/out.json.gz', mode='r') as dump:
    for i, line in enumerate(dump, 1):
        raw = json.loads(line)
        # Keep only the distinct, non-empty entries of raw['source'] that
        # are known keywords.
        tokens = {el.strip() for el in raw['source']}
        tokens = {el for el in tokens if el and el in keywords}
        # Submissions without any known keyword are dropped entirely.
        if tokens:
            code.append(tokens)
            languages.append(raw['language'])

        # Progress report every 10000 input lines.
        if i % 10000 == 0:
            print('%s processing %ds line...' % (time.strftime('%H:%M:%S'), i))


# In[5]:


Counter(languages).most_common()


# In[6]:


# Each language family paired with the compiler/interpreter labels that are
# folded into it.
_families = [
    (u'C/C++', ['GNU C++', 'GNU C++11', 'MS C++', 'GNU C', 'GNU C11']),
    (u'Java', ['Java 8', 'Java 7']),
    (u'Python', ['Python 3', 'Python 2', 'PyPy 2', 'PyPy 3']),
    (u'Pascal', ['FPC', 'Delphi']),
    (u'C#', ['MS C#', 'Mono C#']),
]
pl_norm = {variant: family
           for family, variants in _families
           for variant in variants}


# In[7]:


# Labels without an entry in pl_norm are kept unchanged.
languages_normalized = [pl_norm.get(label, label) for label in languages]
freqs = Counter(languages_normalized).most_common()
freqs


# In[8]:


# Split the (language, count) pairs into two parallel tuples.
# NOTE: this rebinds `freqs`, so the cell cannot be re-run without first
# rebuilding the list of pairs.
langs, freqs = zip(*freqs)

bar_positions = range(len(langs))
tick_positions = np.arange(len(langs)) + 0.5

plt.figure(figsize=(6, 5))

# The same bar chart twice: linear y-axis on top, logarithmic y-axis below.
# `log` is passed as a real boolean; the string 'true' only worked because
# any non-empty string is truthy.
for row, log_scale in enumerate([False, True], 1):
    plt.subplot(2, 1, row)
    plt.bar(bar_positions, freqs, log=log_scale)
    plt.xticks(tick_positions, langs, rotation='vertical')


plt.tight_layout()
plt.show()


# In[9]:


# For every keyword, count the submissions whose keyword set contains it
# (each submission is a set, so it contributes at most 1 per keyword).
global_frequency = Counter(tok for bag in code for tok in bag)

print('Most common')
print(global_frequency.most_common(50))
print('')

print('Least common')
# Reverse the full ranking and take its first 50 entries.
print(list(reversed(global_frequency.most_common()))[:50])


# In[10]:


len(global_frequency)


# Let's remove some tokens that don't appear frequently enough (say <= 30)

# In[11]:


# Remove, in place, every keyword whose global count is 30 or less from each
# submission's keyword set.
for bag in code:
    # Gather the rare tokens first so the set is not resized while it is
    # being iterated.
    rare = [tok for tok in bag if global_frequency[tok] <= 30]
    bag.difference_update(rare)


# In[12]:


# Recount the keyword frequencies from the current contents of `code`.
global_frequency = Counter(tok for bag in code for tok in bag)

len(global_frequency)


# In[13]:


# One row per submission: its keyword set and its normalized language.
_columns = {'code_bow': code, 'language': languages_normalized}
df = pd.DataFrame(data=_columns)


# In[14]:


# Concatenated (keyword, count) pairs: for each language, the 50 most common
# keywords among that language's first 50 submissions.
bow_total = []
for lang, group in df.groupby('language'):
    sample = group['code_bow'].iloc[:50]
    bows = Counter(tok for bag in sample for tok in bag)
    bow_total.extend(bows.most_common(50))

# The per-language counter is only a temporary.
del bows


# In[15]:


from wordcloud import WordCloud
from scipy.misc import imread


# In[16]:


# Image used as the word-cloud mask (read with flatten=True).
mask = imread('c:/tmp/ellipse.png', flatten=True)

_cloud_options = dict(
    background_color='white',
    mask=mask,
    font_path='c:/tmp/consola.ttf',
)
wc = WordCloud(**_cloud_options)
wc.generate_from_frequencies(bow_total)

# Render the cloud on a 12 x 8 figure without axes.
plt.figure(figsize=(12, 8))
plt.imshow(wc)
plt.axis('off')
plt.show()


# ## Let's build a classifier

# In[17]:


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import StratifiedKFold, StratifiedShuffleSplit
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score


# In[18]:


def unwrap_set(s):
    """Return the members of ``s`` as a new list.

    Used as the ``analyzer`` callable of ``CountVectorizer``: every document
    is already a collection of tokens, so it is passed through unchanged
    apart from the conversion to a list.
    """
    members = []
    members.extend(s)
    return members

# Document-term matrix built straight from the keyword sets; unwrap_set
# stands in for the tokenizer/analyzer.
vectorizer = CountVectorizer(analyzer=unwrap_set)
X = vectorizer.fit_transform(code)


# In[19]:


X.shape


# First, train a model C/C++ vs rest

# In[20]:


# Binary target: 'C/C++' for C/C++ submissions, 'OTHER' for everything else.
_is_c = df.language == 'C/C++'
c_not_c = np.where(_is_c, 'C/C++', 'OTHER')
Counter(c_not_c)


# In[21]:


# 10-fold stratified cross-validation of the C/C++-vs-rest decision tree.
# out_of_fold_pred ends up holding, for every sample, the prediction made by
# the model that did not see that sample during training.
skf = StratifiedKFold(c_not_c, 10)
out_of_fold_pred = c_not_c.copy()

aucs = []
for train, test in skf:
    t0 = time.time()

    y_train, y_test = c_not_c[train], c_not_c[test]
    X_train, X_test = X[train], X[test]

    dt = DecisionTreeClassifier(max_depth=3)
    dt.fit(X_train, y_train)

    pred = dt.predict(X_test)
    out_of_fold_pred[test] = pred

    # predict_proba columns are ordered like dt.classes_, so look the
    # positive class up instead of assuming it is column 0.
    positive_col = list(dt.classes_).index('C/C++')
    y_pred = dt.predict_proba(X_test)
    auc = roc_auc_score(y_test == 'C/C++', y_pred[:, positive_col])
    aucs.append(auc)

    print("done in %fs" % (time.time() - t0))

# Mean and standard deviation of the per-fold AUC.
print('%s %s' % (np.mean(aucs), np.std(aucs)))


# In[22]:


pd.DataFrame(confusion_matrix(c_not_c, out_of_fold_pred))


# Let's see what is misclassified

# In[23]:


totl = Counter(languages_normalized)

# For every language with at least one wrong out-of-fold prediction, print
# the percentage of its submissions that were misclassified.
_wrong = c_not_c != out_of_fold_pred
_wrong_by_language = Counter(df.language[_wrong])
for lang_name, n_wrong in _wrong_by_language.items():
    print('%s: %0.2f%%' % (lang_name, n_wrong * 100.0 / totl[lang_name]))


# Almost all Ocaml submissions are misclassified, and ~13% of Go. The rest seems fine. Will use this model for further analysis.

# In[24]:


# Restrict the data to submissions that are not C/C++: the matching rows of
# X, their language labels as an array, and one indicator column per label.
_not_c = df.language != 'C/C++'
X_not_c = X[np.where(_not_c)]
lang_not_c = np.array(df.language[_not_c])
y_dummies = pd.get_dummies(lang_not_c)
X_not_c.shape


# Let's reduce dimensionality with one-vs-all LASSO SVM

# In[25]:


from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel


# In[26]:


# An L1-penalized linear SVM is fitted on the non-C/C++ data, and
# SelectFromModel uses the fitted model to pick the columns of X_not_c that
# are kept in X_new.
lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X_not_c, lang_not_c)
selection_model = SelectFromModel(lsvc, prefit=True)
X_new = selection_model.transform(X_not_c)
X_new.shape


# Important features:

# In[27]:


# get_support() is the public accessor for the boolean mask of selected
# columns; it replaces the private _get_support_mask() call.
np.array(vectorizer.get_feature_names())[selection_model.get_support()]


# In[28]:


# 5-fold stratified cross-validation of the multi-class model on the
# non-C/C++ submissions. out_of_fold_pred collects, for every sample, the
# prediction of the model that was trained without it.
skf = StratifiedKFold(lang_not_c, 5)
out_of_fold_pred = lang_not_c.copy()

aucs = []

for train, test in skf:
    t0 = time.time()

    y_train, y_test = lang_not_c[train], lang_not_c[test]
    X_train, X_test = X_not_c[train], X_not_c[test]

    # Feature selection is refitted on the training part of each fold and
    # then applied to both parts.
    lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X_train, y_train)
    selection_model = SelectFromModel(lsvc, prefit=True)
    X_train = selection_model.transform(X_train)
    X_test = selection_model.transform(X_test)

    dt_rest = DecisionTreeClassifier(max_depth=6)
    dt_rest.fit(X_train, y_train)

    pred = dt_rest.predict(X_test)
    out_of_fold_pred[test] = pred

    pred_score = dt_rest.predict_proba(X_test)
    # `test` holds row positions, so select rows with .iloc (the ambiguous
    # .ix indexer is avoided). The indicator columns are put in the order of
    # dt_rest.classes_, which is the column order of predict_proba.
    y_true = y_dummies.iloc[test][list(dt_rest.classes_)]
    auc = roc_auc_score(y_true, pred_score)
    aucs.append(auc)

    print("done in %fs" % (time.time() - t0))

# Mean and standard deviation of the per-fold AUC.
print('%s %s' % (np.mean(aucs), np.std(aucs)))


# In[29]:


# Pass the label order explicitly and reuse it for the row/column names, so
# the table does not depend on whichever classifier the last
# cross-validation fold happened to leave in `dt_rest`.
cm_labels = np.unique(lang_not_c)
cm = confusion_matrix(lang_not_c, out_of_fold_pred, labels=cm_labels)

print('actual | predicted -->')
cm = pd.DataFrame(cm, index=cm_labels, columns=cm_labels)
cm


# Model has some problems with Pascal, D and Go, but other than that the performance seems OK.
# 
# So, the final model is:

# In[30]:


# http://chrisstrelioff.ws/sandbox/2015/06/08/decision_trees_in_python_with_scikit_learn_and_pandas.html
# http://stackoverflow.com/a/30104792

def get_code(tree, feature_names, spacer_base='  '):
    """Print a fitted decision tree as nested ``if contains(...)`` pseudo-code.

    Every split node is printed as ``if contains('<feature>'):`` followed by
    its right subtree, then ``else:`` followed by its left subtree. Every
    leaf is printed as a ``return`` of the class with the largest entry in
    the node's value array, annotated with that entry's share and the totals.

    Parameters
    ----------
    tree : fitted classifier exposing ``tree_`` (with ``children_left``,
        ``children_right``, ``feature`` and ``value``) and ``classes_``.
    feature_names : sequence of str, indexed by the tree's feature indices.
    spacer_base : str, indentation emitted once per depth level.

    Returns
    -------
    None; the pseudo-code is written to standard output.
    """
    left = tree.tree_.children_left
    right = tree.tree_.children_right
    feature = tree.tree_.feature
    value = tree.tree_.value
    target_names = tree.classes_

    def recurse(node, depth):
        spacer = spacer_base * depth

        # A node without a left child (-1) is a leaf.
        if left[node] == -1:
            # NOTE(review): the annotation assumes tree_.value holds
            # per-class sample counts - confirm for the installed
            # scikit-learn version.
            target = value[node]
            total_sum = np.sum(target)
            target_name = target_names[target.argmax()]
            target_count = target.max()
            print(spacer + "return '%s' # (%0.3f, %d/%d examples)" %
                  (target_name, target_count / total_sum,
                   target_count, total_sum))
            return

        # The name is looked up for split nodes only: leaves carry the
        # placeholder feature index -2, which must not be used to index
        # feature_names.
        name = feature_names[feature[node]]

        # The right child is printed under "contains" and the left child
        # under "else", which reads better for this problem.
        print(spacer + "if contains('" + name + "'):")
        recurse(right[node], depth + 1)
        print(spacer + "else: # doesn't contain '" + name + "'")
        recurse(left[node], depth + 1)

    recurse(0, 0)


# In[31]:


# Stage 1: a depth-3 tree on all data that separates C/C++ from the rest.
_is_c = df.language == 'C/C++'
dt_first = DecisionTreeClassifier(max_depth=3)
dt_first.fit(X, np.where(_is_c, 'C/C++', 'OTHER'))

# Stage 2 uses only the submissions that are not C/C++.
_not_c = df.language != 'C/C++'
y_noc = df.language[_not_c]
X_noc = X[np.where(_not_c)]

# L1-penalized linear SVM drives the feature selection for stage 2.
lsvc = LinearSVC(C=0.01, penalty="l1", dual=False)
lsvc.fit(X_noc, y_noc)
selection_model = SelectFromModel(lsvc, prefit=True)
X_noc = selection_model.transform(X_noc)

# Depth-6 tree over the selected features, predicting the language.
dt_rest = DecisionTreeClassifier(max_depth=6)
dt_rest.fit(X_noc, y_noc)


# In[32]:


# Stage 1 tree was trained on all columns of X, so it gets the full list of
# feature names.
get_code(dt_first, feature_names=vectorizer.get_feature_names())


# In[33]:


# Stage 2 tree was trained on the selected columns only, so the names are
# filtered with the same mask. get_support() is the public accessor for that
# mask; it replaces the private _get_support_mask() call.
features = np.array(vectorizer.get_feature_names())
features = features[selection_model.get_support()]
get_code(dt_rest, feature_names=features)