#!/usr/bin/env python
# coding: utf-8

# # OSP Syllabus Classification [work in progress]
#
# The [Open Syllabus Project](http://opensyllabusproject.org/) has a collection of 1M+ documents to sift through for syllabi.
# This is a classifier for whether a document is a syllabus or not. It turns out that roughly half of the documents are syllabi.

# In[30]:

from osp.corpus.syllabus import Syllabus
import pandas as pd
import numpy as np
import scipy
import pickle
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')

from collections import defaultdict

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.cross_validation import KFold, cross_val_score
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import roc_curve, roc_auc_score, precision_recall_curve, auc
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression, RandomizedLogisticRegression


# In[32]:

with open('./training_data.p', 'rb') as pf:
    training_3 = pickle.load(pf)


# In[36]:

training_df_3 = pd.DataFrame(training_3).rename(columns={'labels': 'syllabus'})


# In[37]:

training_df_1 = pd.read_csv('/home/ubuntu/data/syllabus_tags.csv')

# A second labeled set of 500 documents
training_df_2 = pd.read_csv('/home/ubuntu/data/refinement.csv')

training_df = pd.concat([training_df_1, training_df_2, training_df_3])


# In[38]:

training_df.head()


# We tokenize the text of the positive and negative examples and featurize it for a classifier.
#
# First pass: tf-idf features of text tokens, classified with multinomial Naive Bayes.

# In[12]:

text_preprocessing = Pipeline([('vect', CountVectorizer()),
                               ('tfidf', TfidfTransformer())
                               ])
clf_nb = MultinomialNB()


# In[39]:

features = text_preprocessing.fit_transform(training_df.text.values)


# In[40]:

# Need dense features so we can index into them by row below
features_dense = features.todense()


# In[44]:

full_clf = Pipeline([('vect', CountVectorizer(max_df=0.5, ngram_range=(1, 2))),
                     ('tfidf', TfidfTransformer()),
                     ('clf_lr', LogisticRegression())
                     ])
full_clf.fit(training_df.text.values, training_df.syllabus.values)


# In[45]:

with open('model2.p', 'wb') as pout:
    pickle.dump(full_clf, pout)


# In[16]:

kf = KFold(n=len(training_df), n_folds=5, shuffle=True, random_state=983214)
cv_results = cross_val_score(clf_nb, features_dense, training_df.syllabus.values, cv=kf)
cv_results.mean()


# In[31]:

# Pipeline version of the Naive Bayes model, so we can cross-validate on raw text.
# (The name `text_clf` was used but never defined in the original notebook; reconstructed here.)
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf_nb', MultinomialNB())
                     ])

kf = KFold(n=len(training_df), n_folds=5, shuffle=True, random_state=983214)
cv_results = cross_val_score(text_clf, training_df.text.values, training_df.syllabus.values,
                             cv=kf, scoring='roc_auc')
cv_results.mean()


# #### We get 86.4% mean accuracy and a 94.22% mean ROC AUC using out-of-the-box features and the multinomial NB classifier.

# One question we might ask is: is this good?
#
# The classifier returns a probability between 0 and 1 that a given document is a syllabus. In the ROC curves below, the movement of the line represents the changing false positive and true positive rates at different cutoff values. For example, if the cutoff is 0, then every document with a probability greater than 0 of being a syllabus (i.e., every document) is classified as a syllabus, giving us a perfect true positive rate but also a 100% false positive rate -- the upper right corner.
#
# This chart shows us that we can choose a threshold somewhere on that line. For example, we can achieve a true positive rate (recall) of 90% with only a 20% false positive rate (also known as fallout). How useful this is in practice will depend on the ratio of syllabi to non-syllabi in the corpus, and on our tolerance for errors of either kind. The helper defined below shows one way to read such a cutoff off the curve.
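# A small helper (my own addition, hedged -- not part of the original analysis): given true
# labels and predicted probabilities, find the most permissive cutoff whose false positive
# rate stays within a budget, and report the recall achieved there.

# In[ ]:

import numpy as np
from sklearn.metrics import roc_curve

def cutoff_for_fpr_budget(y_true, y_score, max_fpr=0.2):
    """Return (threshold, tpr, fpr) for the highest-recall cutoff with fpr <= max_fpr."""
    fpr, tpr, thresholds = roc_curve(y_true, y_score)
    within_budget = np.where(fpr <= max_fpr)[0]
    i = within_budget[-1]  # roc_curve sorts by increasing fpr, so the last index has the highest recall
    return thresholds[i], tpr[i], fpr[i]

# Example usage on one fold's held-out predictions (computed in the next cell):
# cutoff_for_fpr_budget(training_df.syllabus.values[test], predictions[:, 1])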
# In[17]:

fprs = []
tprs = []
thresholds = []

kf = KFold(n=len(training_df), n_folds=5, shuffle=True, random_state=983214)
for train, test in kf:
    clf_nb.fit(features_dense[train], training_df.syllabus.values[train])
    predictions = clf_nb.predict_proba(features_dense[test])
    fpr, tpr, threshold = roc_curve(training_df.syllabus.values[test], predictions[:, 1])
    fprs.append(fpr)
    tprs.append(tpr)
    thresholds.append(threshold)


# In[104]:

for i in range(len(fprs)):
    plt.plot(fprs[i], tprs[i], lw=1)
plt.show()


# ## Experimentation with additional classifiers and parameters

# In[23]:

features_dense.shape, training_df.syllabus.shape, train.shape, test.shape


# In[19]:

classifiers = {'rf': RandomForestClassifier(),
               'lr': LogisticRegression(),
               'nb': clf_nb,
               'dt': DecisionTreeClassifier()
               }

fprs = defaultdict(list)
tprs = defaultdict(list)
thresholds = defaultdict(list)

mean_fprs = {}
mean_tprs = {}
mean_aucs = {}

kf = KFold(n=len(training_df), n_folds=5, shuffle=True, random_state=983214)


# In[25]:

for train, test in kf:
    for clf_type, clf in classifiers.items():
        # Train and predict using the selected classifier
        clf.fit(features_dense[train], training_df.syllabus.values[train])
        predictions = clf.predict_proba(features_dense[test])
        fpr, tpr, threshold = roc_curve(training_df.syllabus.values[test], predictions[:, 1])

        # Append results to that classifier's dictionary entry
        fprs[clf_type].append(fpr)
        tprs[clf_type].append(tpr)
        thresholds[clf_type].append(threshold)


# In[28]:

for clf_type in classifiers:
    mean_fprs[clf_type] = [np.mean(x) for x in zip(*fprs[clf_type])]
    mean_tprs[clf_type] = [np.mean(x) for x in zip(*tprs[clf_type])]
    mean_aucs[clf_type] = auc(mean_fprs[clf_type], mean_tprs[clf_type])
    plt.plot(mean_fprs[clf_type], mean_tprs[clf_type], lw=1,
             label='%s (AUC = %0.2f)' % (clf_type, mean_aucs[clf_type]))

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Classifier Performance ROC Curves')
plt.legend(loc="lower right")
plt.show()


# In[130]:

# Old performance
for clf_type in classifiers:
    mean_fprs[clf_type] = [np.mean(x) for x in zip(*fprs[clf_type])]
    mean_tprs[clf_type] = [np.mean(x) for x in zip(*tprs[clf_type])]
    mean_aucs[clf_type] = auc(mean_fprs[clf_type], mean_tprs[clf_type])
    plt.plot(mean_fprs[clf_type], mean_tprs[clf_type], lw=1,
             label='%s (AUC = %0.2f)' % (clf_type, mean_aucs[clf_type]))

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Classifier Performance ROC Curves')
plt.legend(loc="lower right")
plt.show()


# Given that logistic regression performed as well as Naive Bayes in cross-validation without any parameter tuning, I expect it to pull ahead once we tune it.

# ### Feature Analysis

# In[150]:

# We still need to grid-search for the right logit parameters.
rand_logit = RandomizedLogisticRegression(C=1, scaling=0.5, n_resampling=100)
rand_logit.fit(features, training_df.syllabus.values)  # the original used an undefined `is_syllabus`; the labels live in training_df


# In[151]:

# The original referenced an undefined `vect`; the fitted vectorizer lives inside text_preprocessing.
sorted(zip(rand_logit.all_scores_, text_preprocessing.named_steps['vect'].get_feature_names()),
       reverse=True)


# ### Baseline comparisons

# My suspicion is that a small set of hand-crafted rules could perform remarkably well at classifying syllabi. I'm curious to see how it compares to the automated methods above. The rules would be along the lines of:
#
# Has one of the following words:
# - syllabus
# - class
# - assignment
# - due
# - spring
# - fall
#
# A quick sketch of this baseline follows the word list below.

# In[194]:

syllabus_words = ['syllabus', 'class', 'assignment', 'due', 'spring', 'fall']

# TODO
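# A minimal sketch of that rule-based baseline (my own addition, hedged -- it assumes
# training_df.syllabus holds boolean/0-1 labels and is not part of the original analysis):
# call a document a syllabus if any keyword appears, and score it on the training set.

# In[ ]:

import numpy as np

def rule_based_is_syllabus(text, words=syllabus_words):
    # Flag a document as a syllabus if any keyword appears, case-insensitively.
    # str() guards against stray non-string values in the text column.
    lowered = str(text).lower()
    return any(w in lowered for w in words)

rule_preds = np.array([rule_based_is_syllabus(t) for t in training_df.text.values])
(rule_preds == training_df.syllabus.values).mean()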
# ### Parameter Tuning (TODO)

# In[24]:

parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    #'vect__max_features': (None, 5000, 10000, 50000),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    #'tfidf__use_idf': (True, False),
    #'tfidf__norm': ('l1', 'l2')
}

grid_search = GridSearchCV(text_clf, parameters, n_jobs=-1, verbose=1)
grid_search.fit(training_df.text.values, training_df.syllabus.values)  # the original used an undefined `is_syllabus`

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))


# In this grid search, a cutoff that drops words with document frequency above 0.5 beats cutoffs at 0.75 and 1.0, and bigrams outperform unigrams. The best score here is lower than the mean score in our cross-validation run above, so more tests are warranted.

# ## Classify documents

# The first step is to fit the classifier on all of the training data.

# In[238]:

text_preprocessing.fit(training_df.text.values)


# In[239]:

f = text_preprocessing.transform(training_df.text)


# In[248]:

from scipy.sparse import csr_matrix
csr_matrix(features_dense)


# In[249]:

# The original called fit() on an undefined `lr`; a plain logistic regression on the tf-idf features.
lr = LogisticRegression()
lr.fit(f, training_df.syllabus.values)


# Now, run the corpus documents through the classifier.

# In[289]:

from playhouse.postgres_ext import ServerSide
from osp.corpus.models.text import Document_Text

# Select all texts.
query = Document_Text.select()

# Mapping from a syllabus id to its predicted probability of being a syllabus
predictions = {}

# Collect raw texts alongside the predictions (left undefined in the original cell).
examples = []

# Counter
i = 0

# Wrap the query in a server-side cursor, to avoid
# loading the plaintext for all docs into memory.
for sy in ServerSide(query):
    examples.append(sy.text)

    # Featurize text of document
    sy_features = text_preprocessing.transform([sy.text])

    # Predict probability
    p = lr.predict_proba(sy_features)[0, 1]
    predictions[sy.document] = p

    if i % 100 == 0:
        print('{}. {}: {}'.format(i, sy.document, p))
    i += 1


# How well did we do? I stop at this point to label some documents by hand, to make sure our training sample was representative.

# In[304]:

labels = {}
for d in predictions:
    t = [x.text for x in Document_Text.select().where(Document_Text.document == d)]
    print(t)
    label = input('y/n/q')
    if label == 'y':
        labels[d] = True
    elif label == 'n':
        labels[d] = False
    elif label == 'q':
        break
    else:
        print('Skipping...')
        continue


# In[314]:

tp = 0
fp = 0
tn = 0
fn = 0
for l in labels:
    if predictions[l] > 0.5:
        if labels[l] == True:
            tp += 1
        else:
            fp += 1
    if predictions[l] < 0.5:
        if labels[l] == False:
            tn += 1
        else:
            fn += 1


# In[320]:

tp, fp, tn, fn


# I labeled 98 documents, 48 of which were syllabi. That gives an overall accuracy of (45 + 34) / 98 = 80.6% -- slightly lower than what we achieved in cross-validation. The majority of the error comes from false negatives, which means a threshold of 0.5 might be too high.

# In[329]:

fpr, tpr, t = roc_curve([labels[x] for x in labels], [predictions[x] for x in labels])


# In[331]:

plt.plot(fpr, tpr), auc(fpr, tpr)


# In[439]:

p, r, t = precision_recall_curve([labels[x] for x in labels], [predictions[x] for x in labels])
plt.plot(r, p)


# In this graph you can see a sharp dropoff in precision around 80% recall. To me, this implies that while many documents are obviously syllabi or obviously not, at a certain point there is a sharp increase in ambiguity. This mirrors my experience labeling the documents, where some were close enough to the line that I answered inconsistently across passes. For example, I didn't have a consistent rule for how to classify course teasers in catalogs.
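# Before moving on, a hedged sketch (my own addition, not part of the original notebook)
# that summarizes the same hand-labeled evaluation at the 0.5 cutoff with sklearn's
# classification_report, as a cross-check on the tp/fp/tn/fn counting above.

# In[ ]:

from sklearn.metrics import classification_report

y_true = [labels[d] for d in labels]
y_pred = [predictions[d] > 0.5 for d in labels]
print(classification_report(y_true, y_pred, target_names=['non-syllabus', 'syllabus']))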
# In[347]:

additional_training = []
for l in labels:
    t = [x.text for x in Document_Text.select().where(Document_Text.document == l)][0]
    additional_training.append({
        'id': l,
        'text': t,
        'is_syllabus': labels[l]
    })


# In[349]:

additional_training_df = pd.DataFrame(additional_training)


# In[370]:

training_texts = np.concatenate((training_df.text.values, additional_training_df.text.values))
training_labels = np.concatenate((training_df.syllabus.values,  # the original used an undefined `is_syllabus`
                                  additional_training_df.is_syllabus.values))


# In[29]:

import pickle


# In[423]:

import json

with open('extra_labels.json', 'w') as out:
    json.dump(labels, out)

with open('training_data.p', 'wb') as out:
    pickle.dump({'text': training_texts, 'labels': training_labels}, out)


# In[371]:

full_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf_lr', LogisticRegression())
                     ])

parameters = {
    'vect__max_df': (0.5, 1),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    'clf_lr__C': (1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10),
    #'tfidf__use_idf': (True, False),
    #'tfidf__norm': ('l1', 'l2')
}

grid_search = GridSearchCV(full_clf, parameters, n_jobs=1, verbose=1, scoring='roc_auc')
grid_search.fit(training_texts, training_labels)

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))


# In[373]:

full_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf_lr', LogisticRegression())
                     ])

parameters = {
    'vect__max_df': (0.5,),
    'vect__ngram_range': ((1, 2),),  # bigrams only this time
    'clf_lr__C': (10, 100, 1000),
    #'tfidf__use_idf': (True, False),
    #'tfidf__norm': ('l1', 'l2')
}

grid_search = GridSearchCV(full_clf, parameters, n_jobs=1, verbose=1, scoring='roc_auc')
grid_search.fit(training_texts, training_labels)

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))


# In[407]:

full_clf = Pipeline([('vect', CountVectorizer(max_df=0.5, ngram_range=(1, 2))),
                     ('tfidf', TfidfTransformer()),
                     ('clf_lr', LogisticRegression())
                     ])


# In[398]:

# Extract additional features from the syllabus text.
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction import DictVectorizer

class BasicInfoExtractor(BaseEstimator, TransformerMixin):

    def fit(self, x, y=None):
        return self

    def transform(self, docs):
        # One dict of summary statistics per document; for now, just its length.
        return [{'length': len(x)} for x in docs]


# In[401]:

full_clf_with_length = Pipeline([
    ('features', FeatureUnion(
        transformer_list=[
            # Calculate the length of the document
            ('summary_stats', Pipeline([
                ('stats', BasicInfoExtractor()),
                ('dv', DictVectorizer())])),

            # Grab the bag of words and do tf-idf
            ('vocab', Pipeline([
                ('vect', CountVectorizer(max_df=0.5, ngram_range=(1, 2))),
                ('tfidf', TfidfTransformer())]))],
    )),
    ('clf_lr', LogisticRegression(C=100))])
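# A quick wiring check for the feature union (my own sketch, hedged -- not part of the
# original run): on a couple of toy documents, the combined matrix should have one length
# column from `summary_stats` followed by the tf-idf vocabulary columns.

# In[ ]:

toy_docs = ['Syllabus: weekly readings and assignments due each Friday',
            'Lorem ipsum dolor sit amet']
toy_features = full_clf_with_length.named_steps['features'].fit_transform(toy_docs)
toy_features.shape  # (2, 1 + size of the toy vocabulary)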
# In[403]:

kf = KFold(n=len(training_texts), n_folds=3, shuffle=True, random_state=983214)
cv_results = cross_val_score(full_clf_with_length, training_texts, training_labels,
                             cv=kf, scoring='roc_auc')
cv_results.mean()


# Whoa. This is atrocious. Let's run it without the length feature.

# In[409]:

full_clf.fit(training_texts, training_labels)


# In[ ]:

from playhouse.postgres_ext import ServerSide
from osp.corpus.models.text import Document_Text
import time

# Select all texts.
query = Document_Text.select()

# Mapping from a syllabus id to its predicted probability of being a syllabus
predictions2 = {}

# Counter
i = 0

# Wrap the query in a server-side cursor, to avoid
# loading the plaintext for all docs into memory.
for sy in ServerSide(query):
    # Predict probability
    p = full_clf.predict_proba([sy.text])[0, 1]
    predictions2[sy.document] = p  # the original wrote into `predictions`, clobbering the earlier run

    if i < 5 or i % 10000 == 0:
        print('{}. {}: {}'.format(i, sy.document, p))
    i += 1


# In[ ]:

with open('predictions.p', 'wb') as out:
    pickle.dump(predictions2, out)


# In[435]:

with open('predictions.csv', 'w') as out:
    for k in predictions2:
        out.write('{},{}\n'.format(k, predictions2[k]))


# In[447]:

t.shape, p.shape


# In[462]:

plt.plot(t, p[:-1], label='precision')
plt.plot(t, 1 - r[:-1], label='1 - recall')  # the original labeled this curve 'recall', but it plots 1 - recall
plt.xlabel('Threshold')
plt.ylabel('Precision/Recall')
plt.xticks(np.arange(0, 1, 0.1))
plt.vlines(0.325, 0, 1, label='0.325')
plt.legend(loc='lower left')
plt.plot()


# In[467]:

fprs = []
tprs = []
thresholds = []

ps = []
rs = []
ts = []

kf = KFold(n=len(training_labels), n_folds=5, shuffle=True, random_state=983214)
for train, test in kf:
    full_clf.fit(training_texts[train], training_labels[train])
    fold_probs = full_clf.predict_proba(training_texts[test])  # renamed from `predictions` to avoid clobbering the corpus-wide dict
    p, r, t = precision_recall_curve(training_labels[test], fold_probs[:, 1])
    ps.append(p)
    rs.append(r)
    ts.append(t)


# In[468]:

mean_p = [np.mean(x) for x in zip(*ps)]
mean_r = [np.mean(x) for x in zip(*rs)]
mean_t = [np.mean(x) for x in zip(*ts)]


# In[480]:

plt.plot(mean_t, mean_p[1:], label='precision')
plt.plot(mean_t, mean_r[1:], label='recall')
plt.xlabel('Threshold')
plt.ylabel('Precision/Recall')
plt.xticks(np.arange(0, 1, 0.1))
plt.vlines(0.43, 0, 1, label='0.43')
plt.legend(loc='lower left')
plt.plot()


# In[479]:

c = full_clf.named_steps['vect']
c.stop_words_
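# As a last step (my own sketch, hedged -- not part of the original notebook), the cutoff
# eyeballed from the curves above (0.43 here) can be applied to the corpus-wide scores saved
# in predictions.csv to get a binary syllabus / non-syllabus call for every document.

# In[ ]:

import csv

THRESHOLD = 0.43  # cutoff marked on the mean precision/recall plot above

syllabus_ids = set()
with open('predictions.csv') as f:
    for doc_id, prob in csv.reader(f):
        if float(prob) >= THRESHOLD:
            syllabus_ids.add(doc_id)

len(syllabus_ids)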