#!/usr/bin/env python
# coding: utf-8

# A tutorial on using tree fragments for text classification
# ----------------------------------------------------------
#
# Tree fragments are arbitrarily sized connected subgraphs of parse trees. For a reference, see e.g. http://dare.uva.nl/record/371504
#
# Any set of parse trees can be used as input, obtained for example from the Charniak & Johnson parser (my recommendation, see http://github.com/BLLIP/bllip-parser ), the Stanford Parser, or the Berkeley Parser.
#
# This tutorial assumes you have successfully installed disco-dop, which contains the code for fragment extraction. See http://github.com/andreasvc/disco-dop
#
# For the machine learning part we rely on scikit-learn, see http://scikit-learn.org/

# In[1]:

from collections import defaultdict
from discodop import treebank, treetransforms, fragments
from sklearn import linear_model, preprocessing, feature_extraction, model_selection

# Read the trees. Here we read only the first 1000 parse trees of a single novel from Project Gutenberg.
#
# Trees need to be binarized for fragment extraction. There are many parameters for binarization, but the most important ones are related to Markovization; see Klein & Manning (2003), Accurate Unlexicalized Parsing.

# In[2]:

text = treebank.BracketCorpusReader('1027.txt.mrg.gz')
trees = [treetransforms.binarize(item.tree, horzmarkov=1, vertmarkov=1)
        for _, item in text.itertrees(0, 1000)]
sents = [item.sent for _, item in text.itertrees(0, 1000)]

# Run the fragment extraction. When running on a machine with multiple cores, the numproc parameter can be increased to run multiple processes in parallel.

# In[3]:

result = fragments.recurringfragments(trees, sents, numproc=1, disc=False, maxdepth=1)

# The result maps each fragment, in string form, to a dictionary of the sentence numbers in which that fragment occurs. Summing the values reduces this to a simple occurrence count.

# In[4]:

for a, b in list(result.items())[:5]:
    print('%3d\t%s' % (sum(b.values()), a))

# To use the fragments for a machine learning problem, we want a feature mapping for each sentence (or document).

# In[5]:

tmp = [defaultdict(int) for _ in range(len(sents))]
for a, b in result.items():
    for n in b:
        tmp[n][a] += 1

# In[6]:

# Convert the list of dicts to a sparse matrix
vectorizer = feature_extraction.DictVectorizer(sparse=True)
X = vectorizer.fit_transform(tmp)

# In[7]:

# Trivial machine learning objective: detect long sentences
target = ['long' if len(sent) > 20 else 'short' for sent in sents]
y = preprocessing.LabelEncoder().fit_transform(target)

# In[8]:

# Use an SVM-like classifier and 10-fold cross-validation for evaluation
classifier = linear_model.SGDClassifier(loss='hinge', penalty='elasticnet', max_iter=5, tol=None)
cv = model_selection.StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
scores = model_selection.cross_val_score(classifier, X, y, cv=cv)
print(scores)

# To further analyze the machine learning results, consult the scikit-learn documentation: http://scikit-learn.org/stable/documentation.html
#
# Also see my notebook on text classification with bag-of-words models, which shows how to list difficult-to-classify documents and find the most important features: http://nbviewer.ipython.org/gist/andreasvc/5d9b17fb981ee2a8b728
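#
# As a minimal sketch of that last point, the cell below fits the classifier on the full matrix and lists the fragments with the largest weights. It assumes scikit-learn >= 1.0 for get_feature_names_out(); on older versions, use get_feature_names() instead. Positive weights favor the class encoded as 1 (here 'short', since LabelEncoder sorts the labels alphabetically).

# In[ ]:

# Sketch: fit on all data and inspect the most strongly weighted fragments.
classifier.fit(X, y)
names = vectorizer.get_feature_names_out()  # assumes scikit-learn >= 1.0
weights = classifier.coef_[0]
# Ten fragments with the largest positive weights (most indicative of 'short'):
for i in weights.argsort()[-10:][::-1]:
    print('%8.3f\t%s' % (weights[i], names[i]))
# Ten fragments with the most negative weights (most indicative of 'long'):
for i in weights.argsort()[:10]:
    print('%8.3f\t%s' % (weights[i], names[i]))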