#!/usr/bin/env python
# coding: utf-8

# A tutorial on using tree fragments for text classification
# ----------------------------------------------------------
#
# Tree fragments are arbitrarily sized connected subgraphs of parse trees. For a reference, see e.g. http://dare.uva.nl/record/371504
#
# Any set of parse trees can be used as input, obtained for example from the Charniak & Johnson parser (my recommendation, see http://github.com/BLLIP/bllip-parser ), the Stanford Parser, or the Berkeley Parser.
#
# This tutorial assumes you have successfully installed disco-dop, which contains the code for fragment extraction. See http://github.com/andreasvc/disco-dop
#
# For the machine learning part we rely on scikit-learn, see http://scikit-learn.org/

# In[1]:

from collections import defaultdict
from discodop import treebank, treetransforms, fragments
from sklearn import linear_model, preprocessing, feature_extraction, model_selection

# Read the trees. Here we read only the first 1000 parse trees of a single novel from Project Gutenberg.
#
# Trees need to be binarized for fragment extraction. There are many parameters for binarization, but the most important ones are related to Markovization; see Klein & Manning (2003), Accurate Unlexicalized Parsing.

# In[2]:

text = treebank.BracketCorpusReader('1027.txt.mrg.gz')
trees = [treetransforms.binarize(item.tree, horzmarkov=1, vertmarkov=1)
        for _, item in text.itertrees(0, 1000)]
sents = [item.sent for _, item in text.itertrees(0, 1000)]

# Run the fragment extraction. When running on a machine with multiple cores, the numproc parameter can be increased to run multiple processes in parallel.

# In[3]:

result = fragments.recurringfragments(trees, sents, numproc=1, disc=False, maxdepth=1)

# The result maps each fragment, in string form, to a dictionary of the sentence numbers in which that fragment occurs. Summing the values reduces this to a simple occurrence count.

# In[4]:

for a, b in list(result.items())[:5]:
    print('%3d\t%s' % (sum(b.values()), a))

# To use the fragments for a machine learning problem, we want a feature mapping for each sentence (or document).

# In[5]:

tmp = [defaultdict(int) for _ in range(len(sents))]
for a, b in result.items():
    for n in b:
        tmp[n][a] += 1

# In[6]:

# Convert the list of dicts to a sparse matrix
vectorizer = feature_extraction.DictVectorizer(sparse=True)
X = vectorizer.fit_transform(tmp)

# In[7]:

# Trivial machine learning objective: detect long sentences
target = ['long' if len(sent) > 20 else 'short' for sent in sents]
y = preprocessing.LabelEncoder().fit_transform(target)

# In[8]:

# Use an SVM-like classifier and 10-fold cross-validation for evaluation
classifier = linear_model.SGDClassifier(loss='hinge', penalty='elasticnet', max_iter=5, tol=None)
cv = model_selection.StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
scores = model_selection.cross_val_score(classifier, X, y, cv=cv)
print(scores)

# To further analyze the machine learning results, consult the scikit-learn documentation: http://scikit-learn.org/stable/documentation.html
#
# Also see my notebook on text classification with bag-of-words models, which shows how to list difficult-to-classify documents and find the most important features: http://nbviewer.ipython.org/gist/andreasvc/5d9b17fb981ee2a8b728
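#
# As a minimal sketch of that last point, the cell below fits the classifier on the full matrix and lists the fragments with the largest weights. It assumes scikit-learn >= 1.0 for get_feature_names_out(); on older versions, use get_feature_names() instead. Positive weights favor the class encoded as 1 (here 'short', since LabelEncoder sorts the labels alphabetically).

# In[ ]:

# Sketch: fit on all data and inspect the most strongly weighted fragments.
classifier.fit(X, y)
names = vectorizer.get_feature_names_out()  # assumes scikit-learn >= 1.0
weights = classifier.coef_[0]
# Ten fragments with the largest positive weights (most indicative of 'short'):
for i in weights.argsort()[-10:][::-1]:
    print('%8.3f\t%s' % (weights[i], names[i]))
# Ten fragments with the most negative weights (most indicative of 'long'):
for i in weights.argsort()[:10]:
    print('%8.3f\t%s' % (weights[i], names[i]))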