from sklearn import tree
#10 samples of x data, each with 6 binary features
from numpy.random import randint  # so that randint(2) draws 0 or 1
mx=[]
my=[] # classification into 4 bins
for k in range(10):
    x=[randint(2) for i in range(6)]
    mx.append(x)
    n=int(''.join(map(str,x)),base=2)  # read the six bits as one number 0-63
    my.append(n/16)  # integer division gives the bin, 0-3
#the six features can be read as the bits of a number from 0 to 63
#the class is that number divided by 16, i.e., which block of 16 it falls in
#so the first sample 110111 = 55 is in bin 3,
#and the fifth sample 001001 = 9 is in bin 0
mx,my
([[1, 1, 0, 1, 1, 1], [1, 0, 1, 1, 0, 0], [0, 1, 1, 0, 0, 0], [0, 1, 0, 1, 1, 1], [0, 0, 1, 0, 0, 1], [1, 1, 0, 1, 1, 1], [1, 1, 1, 0, 0, 0], [0, 0, 0, 0, 1, 1], [1, 1, 0, 0, 1, 1], [1, 1, 1, 1, 0, 0]], [3, 2, 1, 1, 0, 3, 3, 0, 3, 3])
from IPython.display import Image #needed to render in notebook
import StringIO, pydot #needed to convert dot format to png
clf = tree.DecisionTreeClassifier(criterion='entropy')
clf = clf.fit(mx, my)
dot_data = StringIO.StringIO()
tree.export_graphviz(clf, out_file=dot_data)
graph = pydot.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())
#notice that it figures out to use the most significant bit x[0] for the first split,
#then checks the next most significant bit, and never uses the last four bits
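#as a quick sanity check (not part of the original run), the fitted tree can
#bin a code it never saw: 100000 = 32 should land in bin 2
print clf.predict([[1, 0, 0, 0, 0, 0]])  # expect [2], via x[0]=1 then x[1]=0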
#below is the same decision tree trained with nltk,
#which prints its result as pseudocode rather than as a diagram
samples=[({'x[0]': 1, 'x[1]': 1, 'x[2]': 0, 'x[3]': 1, 'x[4]': 1, 'x[5]': 1}, '48-63'),
({'x[0]': 1, 'x[1]': 0, 'x[2]': 1, 'x[3]': 1, 'x[4]': 0, 'x[5]': 0}, '32-47'),
({'x[0]': 0, 'x[1]': 1, 'x[2]': 1, 'x[3]': 0, 'x[4]': 0, 'x[5]': 0}, '16-31'),
({'x[0]': 0, 'x[1]': 1, 'x[2]': 0, 'x[3]': 1, 'x[4]': 1, 'x[5]': 1}, '16-31'),
({'x[0]': 0, 'x[1]': 0, 'x[2]': 1, 'x[3]': 0, 'x[4]': 0, 'x[5]': 1}, '0-15'),
({'x[0]': 1, 'x[1]': 1, 'x[2]': 0, 'x[3]': 1, 'x[4]': 1, 'x[5]': 1}, '48-63'),
({'x[0]': 1, 'x[1]': 1, 'x[2]': 1, 'x[3]': 0, 'x[4]': 0, 'x[5]': 0}, '48-63'),
({'x[0]': 0, 'x[1]': 0, 'x[2]': 0, 'x[3]': 0, 'x[4]': 1, 'x[5]': 1}, '0-15'),
({'x[0]': 1, 'x[1]': 1, 'x[2]': 0, 'x[3]': 0, 'x[4]': 1, 'x[5]': 1}, '48-63'),
({'x[0]': 1, 'x[1]': 1, 'x[2]': 1, 'x[3]': 1, 'x[4]': 0, 'x[5]': 0}, '48-63')]
import nltk
#the samples are doubled, presumably so the 20 items clear the classifier's
#default support_cutoff of 10
classbin=nltk.DecisionTreeClassifier.train(samples+samples)
print classbin.pseudocode(depth=4)
if x[1] == 0:
  if x[0] == 0: return '0-15'
  if x[0] == 1: return '32-47'
if x[1] == 1:
  if x[0] == 0: return '16-31'
  if x[0] == 1: return '48-63'
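#the trained classifier can then label new featuresets directly; e.g. 101000 = 40,
#which should come back as '32-47' (this check is not part of the original run)
print classbin.classify({'x[0]': 1, 'x[1]': 0, 'x[2]': 1, 'x[3]': 0, 'x[4]': 0, 'x[5]': 0})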
import re
#here are the sample categories
categories=['astro-ph', 'cond-mat', 'cs', 'gr-qc', 'hep', 'math', 'nucl', 'physics.optics', 'q-bio', 'q-fin', 'quant-ph']
#load the json file
import json
arxiv_data=json.load(open('arxiv_data.json'))
#e.g., arxiv_data['astro-ph'] is a list of 200 entries, each with id,title,authors,abstract
#you need only the abstract text, e.g.
# arxiv_data['astro-ph'][0]['abstract']
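#a quick structural check (assuming the file loaded as above); each category
#should report 200 entries like astro-ph does:
for c in categories:
    print c, len(arxiv_data[c])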
stopwords=['the', 'of', 'a', 'is', 'in', 'to', 'and', 'that', 'we', 'be', 'for', 'by', 'are', 'with', 'can', 'this', 'an', 'it', 'on', 'as', 'if', 'not', 'one', 'but', 'will', 'then', 'which', 'have', 'where', 'from', 'at', 'or', 'has', 'two', 'may', 'so', 'each', 'all', 'there', 'more', 'some', 'these', 'also', 'such', 'only', 'any', 'our', 'no', 'see', 'us', 'other', 'first', 'between', 'using', 'case', 'given', 'same', 'its', 'was', 'been', 'into', 'both', 'than', 'when', 'however', 'different', 'here', 'used', 'since', 'their', 'shown', 'show', 'above', 'well', 'following', 'result', 'they', 'number', 'use', 'thus', 'work', 'large', 'over', 'very', 'similar', 'possible', 'due', 'note', 'now', 'does', 'therefore', 'out', 'present', 'consider', 'most', 'would', 'while', 'small', 'respectively', 'general', 'even', 'under', 'do', 'because', 'shows', 'should', 'up', 'particular', 'follows', 'were', 'find', 'obtain', 'way', 'known', 'about', 'after', 'through', 'based', 'further', 'those', 'below', 'many', 'without', 'gives', 'much', 'give', 'like', 'being', 'finally', 'could', 'another', 'take', 'within', 'higher', 'must', 'them', 'high', 'several', 'previous', 'still', 'lower', 'larger', 'taken', 'becomes', 'either', 'hence', 'how', 'less', 'associated', 'along', 'discussed', 'expected', 'change', 'although', 'need', 'too', 'had', 'toward', 'though', 'called', 'moreover']
#here's one way to make features:
#skip stopwords and words shorter than four letters,
#then keep only the first three letters of each remaining word
def mkfeatures(abstext):
    features={}
    abswords = re.findall('[a-z]+',abstext.encode('ascii','ignore')) #convert unicode to ascii
    for w in abswords:
        if w in stopwords or len(w) < 4: continue
        features[w[:3]] = 1
    return features
#then
mkfeatures(arxiv_data['astro-ph'][0]['abstract'])
#will create a feature dict for the first astro-ph abstract
#you'll want to make a training set, a list of (featureset, label) pairs, e.g.:
train_set=[({'acc':1, ...},'astro-ph'),...]
#and a test set in the same format
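#a minimal sketch of one way to build both sets (the 150/50 split per
#category is an assumption, not from the original):
train_set=[]
test_set=[]
for c in categories:
    feats=[(mkfeatures(e['abstract']), c) for e in arxiv_data[c]]
    train_set += feats[:150]  # first 150 abstracts of each category for training
    test_set += feats[150:]   # remaining 50 held out for testing
#then e.g. nltk.classify.accuracy(clf2, test_set) would score a trained classifier clf2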
#try pairwise classifiers, or multiclass classifiers
#here's the result of picking 64 most frequent features from q-bio and quant-ph
#for a 2-way classifier, 'qua...', 'pho...', 'bio...', 'pop...' are important
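#a rough sketch of how such a classifier could be built: count how often each
#three-letter prefix appears across the two categories, keep the 64 most
#frequent, and train a two-way nltk tree on featuresets restricted to those
#(the variable names below are illustrative, not from the original)
from collections import Counter
counts=Counter()
for c in ['q-bio','quant-ph']:
    for e in arxiv_data[c]:
        counts.update(mkfeatures(e['abstract']))
top64=set(f for f,n in counts.most_common(64))
pairs=[({f: True for f in mkfeatures(e['abstract']) if f in top64}, c)
       for c in ['q-bio','quant-ph'] for e in arxiv_data[c]]
myclass=nltk.DecisionTreeClassifier.train(pairs)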
print myclass.pseudocode(depth=100)
if qua == None:
  if pho == None:
    if qub == None: return 'q-bio'
    if qub == True: return 'quant-ph'
  if pho == True:
    if app == None: return 'quant-ph'
    if app == True: return 'q-bio'
if qua == True:
  if bio == None:
    if pop == None:
      if dat == None: return 'quant-ph'
      if dat == True:
        if ana == None: return 'quant-ph'
        if ana == True: return 'q-bio'
    if pop == True:
      if app == None: return 'quant-ph'
      if app == True: return 'q-bio'
  if bio == True: return 'q-bio'