#!/usr/bin/env python
# coding: utf-8

# # OSP Syllabus Classification [work in progress]
#
# The [Open Syllabus Project](http://opensyllabusproject.org/) has a collection of 1M+ documents to sift through for syllabi.
# This is a classifier for whether a document is a syllabus or not. It turns out that roughly half of the documents are syllabi.

# In[30]:

from osp.corpus.syllabus import Syllabus
import pandas as pd
import numpy as np
import scipy
import pickle
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')

from collections import defaultdict

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.cross_validation import KFold, cross_val_score
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import roc_curve, roc_auc_score, precision_recall_curve, auc
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression, RandomizedLogisticRegression


# In[32]:

with open('./training_data.p', 'rb') as pf:
    training_3 = pickle.load(pf)


# In[36]:

training_df_3 = pd.DataFrame(training_3).rename(columns={'labels': 'syllabus'})


# In[37]:

training_df_1 = pd.read_csv('/home/ubuntu/data/syllabus_tags.csv')

# A second labeled set of 500 documents
training_df_2 = pd.read_csv('/home/ubuntu/data/refinement.csv')

training_df = pd.concat([training_df_1, training_df_2, training_df_3])


# In[38]:

training_df.head()


# We tokenize the text of the positive and negative examples and featurize it for a classifier.
#
# First pass: tf-idf features of text tokens, classified with multinomial Naive Bayes.

# In[12]:

text_preprocessing = Pipeline([('vect', CountVectorizer()),
                               ('tfidf', TfidfTransformer())
                               ])
clf_nb = MultinomialNB()


# In[39]:

features = text_preprocessing.fit_transform(training_df.text.values)


# In[40]:

# Need dense features so we can index into them by row below
features_dense = features.todense()


# In[44]:

full_clf = Pipeline([('vect', CountVectorizer(max_df=0.5, ngram_range=(1, 2))),
                     ('tfidf', TfidfTransformer()),
                     ('clf_lr', LogisticRegression())
                     ])
full_clf.fit(training_df.text.values, training_df.syllabus.values)


# In[45]:

with open('model2.p', 'wb') as pout:
    pickle.dump(full_clf, pout)


# In[16]:

kf = KFold(n=len(training_df), n_folds=5, shuffle=True, random_state=983214)
cv_results = cross_val_score(clf_nb, features_dense, training_df.syllabus.values, cv=kf)
cv_results.mean()


# In[31]:

# Pipeline version of the Naive Bayes model, so we can cross-validate on raw text.
# (The name `text_clf` was used but never defined in the original notebook; reconstructed here.)
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf_nb', MultinomialNB())
                     ])

kf = KFold(n=len(training_df), n_folds=5, shuffle=True, random_state=983214)
cv_results = cross_val_score(text_clf, training_df.text.values, training_df.syllabus.values,
                             cv=kf, scoring='roc_auc')
cv_results.mean()


# #### We get 86.4% mean accuracy and a 94.22% mean ROC AUC using out-of-the-box features and the multinomial NB classifier.

# One question we might ask is: is this good?
#
# The classifier returns a probability between 0 and 1 that a given document is a syllabus. In the ROC curves below, the movement of the line represents the changing false positive and true positive rates at different cutoff values. For example, if the cutoff is 0, then every document with a probability greater than 0 of being a syllabus (i.e., every document) is classified as a syllabus, giving us a perfect true positive rate but also a 100% false positive rate -- the upper right corner.
#
# This chart shows us that we can choose a threshold somewhere on that line. For example, we can achieve a true positive rate (recall) of 90% with only a 20% false positive rate (also known as fallout). How useful this is in practice will depend on the ratio of syllabi to non-syllabi in the corpus, and on our tolerance for errors of either kind. The helper defined below shows one way to read such a cutoff off the curve.
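# A small helper (my own addition, hedged -- not part of the original analysis): given true
# labels and predicted probabilities, find the most permissive cutoff whose false positive
# rate stays within a budget, and report the recall achieved there.

# In[ ]:

import numpy as np
from sklearn.metrics import roc_curve

def cutoff_for_fpr_budget(y_true, y_score, max_fpr=0.2):
    """Return (threshold, tpr, fpr) for the highest-recall cutoff with fpr <= max_fpr."""
    fpr, tpr, thresholds = roc_curve(y_true, y_score)
    within_budget = np.where(fpr <= max_fpr)[0]
    i = within_budget[-1]  # roc_curve sorts by increasing fpr, so the last index has the highest recall
    return thresholds[i], tpr[i], fpr[i]

# Example usage on one fold's held-out predictions (computed in the next cell):
# cutoff_for_fpr_budget(training_df.syllabus.values[test], predictions[:, 1])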
# In[17]:

fprs = []
tprs = []
thresholds = []

kf = KFold(n=len(training_df), n_folds=5, shuffle=True, random_state=983214)
for train, test in kf:
    clf_nb.fit(features_dense[train], training_df.syllabus.values[train])
    predictions = clf_nb.predict_proba(features_dense[test])
    fpr, tpr, threshold = roc_curve(training_df.syllabus.values[test], predictions[:, 1])
    fprs.append(fpr)
    tprs.append(tpr)
    thresholds.append(threshold)


# In[104]:

for i in range(len(fprs)):
    plt.plot(fprs[i], tprs[i], lw=1)
plt.show()


# ## Experimentation with additional classifiers and parameters

# In[23]:

features_dense.shape, training_df.syllabus.shape, train.shape, test.shape


# In[19]:

classifiers = {'rf': RandomForestClassifier(),
               'lr': LogisticRegression(),
               'nb': clf_nb,
               'dt': DecisionTreeClassifier()
               }

fprs = defaultdict(list)
tprs = defaultdict(list)
thresholds = defaultdict(list)

mean_fprs = {}
mean_tprs = {}
mean_aucs = {}

kf = KFold(n=len(training_df), n_folds=5, shuffle=True, random_state=983214)


# In[25]:

for train, test in kf:
    for clf_type, clf in classifiers.items():
        # Train and predict using the selected classifier
        clf.fit(features_dense[train], training_df.syllabus.values[train])
        predictions = clf.predict_proba(features_dense[test])
        fpr, tpr, threshold = roc_curve(training_df.syllabus.values[test], predictions[:, 1])

        # Append results to that classifier's dictionary entry
        fprs[clf_type].append(fpr)
        tprs[clf_type].append(tpr)
        thresholds[clf_type].append(threshold)


# In[28]:

for clf_type in classifiers:
    mean_fprs[clf_type] = [np.mean(x) for x in zip(*fprs[clf_type])]
    mean_tprs[clf_type] = [np.mean(x) for x in zip(*tprs[clf_type])]
    mean_aucs[clf_type] = auc(mean_fprs[clf_type], mean_tprs[clf_type])
    plt.plot(mean_fprs[clf_type], mean_tprs[clf_type], lw=1,
             label='%s (AUC = %0.2f)' % (clf_type, mean_aucs[clf_type]))

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Classifier Performance ROC Curves')
plt.legend(loc="lower right")
plt.show()


# In[130]:

# Old performance
for clf_type in classifiers:
    mean_fprs[clf_type] = [np.mean(x) for x in zip(*fprs[clf_type])]
    mean_tprs[clf_type] = [np.mean(x) for x in zip(*tprs[clf_type])]
    mean_aucs[clf_type] = auc(mean_fprs[clf_type], mean_tprs[clf_type])
    plt.plot(mean_fprs[clf_type], mean_tprs[clf_type], lw=1,
             label='%s (AUC = %0.2f)' % (clf_type, mean_aucs[clf_type]))

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Classifier Performance ROC Curves')
plt.legend(loc="lower right")
plt.show()


# Given that logistic regression performed as well as Naive Bayes in cross-validation without any parameter tuning, I expect it to pull ahead once we tune it.

# ### Feature Analysis

# In[150]:

# We still need to grid-search for the right logit parameters.
rand_logit = RandomizedLogisticRegression(C=1, scaling=0.5, n_resampling=100)
rand_logit.fit(features, training_df.syllabus.values)  # the original used an undefined `is_syllabus`; the labels live in training_df


# In[151]:

# The original referenced an undefined `vect`; the fitted vectorizer lives inside text_preprocessing.
sorted(zip(rand_logit.all_scores_, text_preprocessing.named_steps['vect'].get_feature_names()),
       reverse=True)


# ### Baseline comparisons

# My suspicion is that a small set of hand-crafted rules could perform remarkably well at classifying syllabi. I'm curious to see how it compares to the automated methods above. The rules would be along the lines of:
#
# Has one of the following words:
# - syllabus
# - class
# - assignment
# - due
# - spring
# - fall
#
# A quick sketch of this baseline follows the word list below.

# In[194]:

syllabus_words = ['syllabus', 'class', 'assignment', 'due', 'spring', 'fall']

# TODO
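# A minimal sketch of that rule-based baseline (my own addition, hedged -- it assumes
# training_df.syllabus holds boolean/0-1 labels and is not part of the original analysis):
# call a document a syllabus if any keyword appears, and score it on the training set.

# In[ ]:

import numpy as np

def rule_based_is_syllabus(text, words=syllabus_words):
    # Flag a document as a syllabus if any keyword appears, case-insensitively.
    # str() guards against stray non-string values in the text column.
    lowered = str(text).lower()
    return any(w in lowered for w in words)

rule_preds = np.array([rule_based_is_syllabus(t) for t in training_df.text.values])
(rule_preds == training_df.syllabus.values).mean()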
# ### Parameter Tuning (TODO)

# In[24]:

parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    #'vect__max_features': (None, 5000, 10000, 50000),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    #'tfidf__use_idf': (True, False),
    #'tfidf__norm': ('l1', 'l2')
}

grid_search = GridSearchCV(text_clf, parameters, n_jobs=-1, verbose=1)
grid_search.fit(training_df.text.values, training_df.syllabus.values)  # the original used an undefined `is_syllabus`

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))


# In this grid search, a cutoff that drops words with document frequency above 0.5 beats cutoffs at 0.75 and 1.0, and bigrams outperform unigrams. The best score here is lower than the mean score in our cross-validation run above, so more tests are warranted.

# ## Classify documents

# The first step is to fit the classifier on all of the training data.

# In[238]:

text_preprocessing.fit(training_df.text.values)


# In[239]:

f = text_preprocessing.transform(training_df.text)


# In[248]:

from scipy.sparse import csr_matrix
csr_matrix(features_dense)


# In[249]:

# The original called fit() on an undefined `lr`; a plain logistic regression on the tf-idf features.
lr = LogisticRegression()
lr.fit(f, training_df.syllabus.values)


# Now, run the corpus documents through the classifier.

# In[289]:

from playhouse.postgres_ext import ServerSide
from osp.corpus.models.text import Document_Text

# Select all texts.
query = Document_Text.select()

# Mapping from a syllabus id to its predicted probability of being a syllabus
predictions = {}

# Collect raw texts alongside the predictions (left undefined in the original cell).
examples = []

# Counter
i = 0

# Wrap the query in a server-side cursor, to avoid
# loading the plaintext for all docs into memory.
for sy in ServerSide(query):
    examples.append(sy.text)

    # Featurize text of document
    sy_features = text_preprocessing.transform([sy.text])

    # Predict probability
    p = lr.predict_proba(sy_features)[0, 1]
    predictions[sy.document] = p

    if i % 100 == 0:
        print('{}. {}: {}'.format(i, sy.document, p))
    i += 1


# How well did we do? I stop at this point to label some documents by hand, to make sure our training sample was representative.

# In[304]:

labels = {}
for d in predictions:
    t = [x.text for x in Document_Text.select().where(Document_Text.document == d)]
    print(t)
    label = input('y/n/q')
    if label == 'y':
        labels[d] = True
    elif label == 'n':
        labels[d] = False
    elif label == 'q':
        break
    else:
        print('Skipping...')
        continue


# In[314]:

tp = 0
fp = 0
tn = 0
fn = 0
for l in labels:
    if predictions[l] > 0.5:
        if labels[l] == True:
            tp += 1
        else:
            fp += 1
    if predictions[l] < 0.5:
        if labels[l] == False:
            tn += 1
        else:
            fn += 1


# In[320]:

tp, fp, tn, fn


# I labeled 98 documents, 48 of which were syllabi. That gives an overall accuracy of (45 + 34) / 98 = 80.6% -- slightly lower than what we achieved in cross-validation. The majority of the error comes from false negatives, which means a threshold of 0.5 might be too high.

# In[329]:

fpr, tpr, t = roc_curve([labels[x] for x in labels], [predictions[x] for x in labels])


# In[331]:

plt.plot(fpr, tpr), auc(fpr, tpr)


# In[439]:

p, r, t = precision_recall_curve([labels[x] for x in labels], [predictions[x] for x in labels])
plt.plot(r, p)


# In this graph you can see a sharp dropoff in precision around 80% recall. To me, this implies that while many documents are obviously syllabi or obviously not, at a certain point there is a sharp increase in ambiguity. This mirrors my experience labeling the documents, where some were close enough to the line that I answered inconsistently across passes. For example, I didn't have a consistent rule for how to classify course teasers in catalogs.
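# Before moving on, a hedged sketch (my own addition, not part of the original notebook)
# that summarizes the same hand-labeled evaluation at the 0.5 cutoff with sklearn's
# classification_report, as a cross-check on the tp/fp/tn/fn counting above.

# In[ ]:

from sklearn.metrics import classification_report

y_true = [labels[d] for d in labels]
y_pred = [predictions[d] > 0.5 for d in labels]
print(classification_report(y_true, y_pred, target_names=['non-syllabus', 'syllabus']))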
# In[347]:

additional_training = []
for l in labels:
    t = [x.text for x in Document_Text.select().where(Document_Text.document == l)][0]
    additional_training.append({
        'id': l,
        'text': t,
        'is_syllabus': labels[l]
    })


# In[349]:

additional_training_df = pd.DataFrame(additional_training)


# In[370]:

training_texts = np.concatenate((training_df.text.values, additional_training_df.text.values))
training_labels = np.concatenate((training_df.syllabus.values,  # the original used an undefined `is_syllabus`
                                  additional_training_df.is_syllabus.values))


# In[29]:

import pickle


# In[423]:

import json

with open('extra_labels.json', 'w') as out:
    json.dump(labels, out)

with open('training_data.p', 'wb') as out:
    pickle.dump({'text': training_texts, 'labels': training_labels}, out)


# In[371]:

full_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf_lr', LogisticRegression())
                     ])

parameters = {
    'vect__max_df': (0.5, 1),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    'clf_lr__C': (1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10),
    #'tfidf__use_idf': (True, False),
    #'tfidf__norm': ('l1', 'l2')
}

grid_search = GridSearchCV(full_clf, parameters, n_jobs=1, verbose=1, scoring='roc_auc')
grid_search.fit(training_texts, training_labels)

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))


# In[373]:

full_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf_lr', LogisticRegression())
                     ])

parameters = {
    'vect__max_df': (0.5,),
    'vect__ngram_range': ((1, 2),),  # bigrams only this time
    'clf_lr__C': (10, 100, 1000),
    #'tfidf__use_idf': (True, False),
    #'tfidf__norm': ('l1', 'l2')
}

grid_search = GridSearchCV(full_clf, parameters, n_jobs=1, verbose=1, scoring='roc_auc')
grid_search.fit(training_texts, training_labels)

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))


# In[407]:

full_clf = Pipeline([('vect', CountVectorizer(max_df=0.5, ngram_range=(1, 2))),
                     ('tfidf', TfidfTransformer()),
                     ('clf_lr', LogisticRegression())
                     ])


# In[398]:

# Extract additional features from the syllabus text.
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction import DictVectorizer

class BasicInfoExtractor(BaseEstimator, TransformerMixin):

    def fit(self, x, y=None):
        return self

    def transform(self, docs):
        # One dict of summary statistics per document; for now, just its length.
        return [{'length': len(x)} for x in docs]


# In[401]:

full_clf_with_length = Pipeline([
    ('features', FeatureUnion(
        transformer_list=[
            # Calculate the length of the document
            ('summary_stats', Pipeline([
                ('stats', BasicInfoExtractor()),
                ('dv', DictVectorizer())])),

            # Grab the bag of words and do tf-idf
            ('vocab', Pipeline([
                ('vect', CountVectorizer(max_df=0.5, ngram_range=(1, 2))),
                ('tfidf', TfidfTransformer())]))],
    )),
    ('clf_lr', LogisticRegression(C=100))])
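# A quick wiring check for the feature union (my own sketch, hedged -- not part of the
# original run): on a couple of toy documents, the combined matrix should have one length
# column from `summary_stats` followed by the tf-idf vocabulary columns.

# In[ ]:

toy_docs = ['Syllabus: weekly readings and assignments due each Friday',
            'Lorem ipsum dolor sit amet']
toy_features = full_clf_with_length.named_steps['features'].fit_transform(toy_docs)
toy_features.shape  # (2, 1 + size of the toy vocabulary)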
# In[403]:

kf = KFold(n=len(training_texts), n_folds=3, shuffle=True, random_state=983214)
cv_results = cross_val_score(full_clf_with_length, training_texts, training_labels,
                             cv=kf, scoring='roc_auc')
cv_results.mean()


# Whoa. This is atrocious. Let's run it without the length feature.

# In[409]:

full_clf.fit(training_texts, training_labels)


# In[ ]:

from playhouse.postgres_ext import ServerSide
from osp.corpus.models.text import Document_Text
import time

# Select all texts.
query = Document_Text.select()

# Mapping from a syllabus id to its predicted probability of being a syllabus
predictions2 = {}

# Counter
i = 0

# Wrap the query in a server-side cursor, to avoid
# loading the plaintext for all docs into memory.
for sy in ServerSide(query):
    # Predict probability
    p = full_clf.predict_proba([sy.text])[0, 1]
    predictions2[sy.document] = p  # the original wrote into `predictions`, clobbering the earlier run

    if i < 5 or i % 10000 == 0:
        print('{}. {}: {}'.format(i, sy.document, p))
    i += 1


# In[ ]:

with open('predictions.p', 'wb') as out:
    pickle.dump(predictions2, out)


# In[435]:

with open('predictions.csv', 'w') as out:
    for k in predictions2:
        out.write('{},{}\n'.format(k, predictions2[k]))


# In[447]:

t.shape, p.shape


# In[462]:

plt.plot(t, p[:-1], label='precision')
plt.plot(t, 1 - r[:-1], label='1 - recall')  # the original labeled this curve 'recall', but it plots 1 - recall
plt.xlabel('Threshold')
plt.ylabel('Precision/Recall')
plt.xticks(np.arange(0, 1, 0.1))
plt.vlines(0.325, 0, 1, label='0.325')
plt.legend(loc='lower left')
plt.plot()


# In[467]:

fprs = []
tprs = []
thresholds = []

ps = []
rs = []
ts = []

kf = KFold(n=len(training_labels), n_folds=5, shuffle=True, random_state=983214)
for train, test in kf:
    full_clf.fit(training_texts[train], training_labels[train])
    fold_probs = full_clf.predict_proba(training_texts[test])  # renamed from `predictions` to avoid clobbering the corpus-wide dict
    p, r, t = precision_recall_curve(training_labels[test], fold_probs[:, 1])
    ps.append(p)
    rs.append(r)
    ts.append(t)


# In[468]:

mean_p = [np.mean(x) for x in zip(*ps)]
mean_r = [np.mean(x) for x in zip(*rs)]
mean_t = [np.mean(x) for x in zip(*ts)]


# In[480]:

plt.plot(mean_t, mean_p[1:], label='precision')
plt.plot(mean_t, mean_r[1:], label='recall')
plt.xlabel('Threshold')
plt.ylabel('Precision/Recall')
plt.xticks(np.arange(0, 1, 0.1))
plt.vlines(0.43, 0, 1, label='0.43')
plt.legend(loc='lower left')
plt.plot()


# In[479]:

c = full_clf.named_steps['vect']
c.stop_words_
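# As a last step (my own sketch, hedged -- not part of the original notebook), the cutoff
# eyeballed from the curves above (0.43 here) can be applied to the corpus-wide scores saved
# in predictions.csv to get a binary syllabus / non-syllabus call for every document.

# In[ ]:

import csv

THRESHOLD = 0.43  # cutoff marked on the mean precision/recall plot above

syllabus_ids = set()
with open('predictions.csv') as f:
    for doc_id, prob in csv.reader(f):
        if float(prob) >= THRESHOLD:
            syllabus_ids.add(doc_id)

len(syllabus_ids)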