from sklearn import tree
#10 samples of x data, each with 6 binary features
from numpy.random import randint  # so that randint(2) draws 0 or 1
mx=[]
my=[] # classification into 4 bins
for k in range(10):
    x=[randint(2) for i in range(6)]
    mx.append(x)
    n=int(''.join(map(str,x)),base=2)  # read the six bits as one number 0-63
    my.append(n/16)  # integer division gives the bin, 0-3
#the six features can be read as the bits of a number from 0 to 63
#the class is that number divided by 16, i.e., which block of 16 it falls in
#so the first sample 110111 = 55 is in bin 3,
#and the fifth sample 001001 = 9 is in bin 0
mx,my
([[1, 1, 0, 1, 1, 1], [1, 0, 1, 1, 0, 0], [0, 1, 1, 0, 0, 0], [0, 1, 0, 1, 1, 1], [0, 0, 1, 0, 0, 1], [1, 1, 0, 1, 1, 1], [1, 1, 1, 0, 0, 0], [0, 0, 0, 0, 1, 1], [1, 1, 0, 0, 1, 1], [1, 1, 1, 1, 0, 0]], [3, 2, 1, 1, 0, 3, 3, 0, 3, 3])
from IPython.display import Image #needed to render in notebook
import StringIO, pydot #needed to convert dot format to png
clf = tree.DecisionTreeClassifier(criterion='entropy')
clf = clf.fit(mx, my)
dot_data = StringIO.StringIO()
tree.export_graphviz(clf, out_file=dot_data)
graph = pydot.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())
#notice that it figures out to use the most significant bit x[0] for the first split,
#then checks the next most significant bit, and never uses the last four bits
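#as a quick sanity check (not part of the original run), the fitted tree can
#bin a code it never saw: 100000 = 32 should land in bin 2
print clf.predict([[1, 0, 0, 0, 0, 0]])  # expect [2], via x[0]=1 then x[1]=0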
#below is the same decision tree trained with nltk,
#which prints its result as pseudocode rather than as a diagram
samples=[({'x[0]': 1, 'x[1]': 1, 'x[2]': 0, 'x[3]': 1, 'x[4]': 1, 'x[5]': 1}, '48-63'),
({'x[0]': 1, 'x[1]': 0, 'x[2]': 1, 'x[3]': 1, 'x[4]': 0, 'x[5]': 0}, '32-47'),
({'x[0]': 0, 'x[1]': 1, 'x[2]': 1, 'x[3]': 0, 'x[4]': 0, 'x[5]': 0}, '16-31'),
({'x[0]': 0, 'x[1]': 1, 'x[2]': 0, 'x[3]': 1, 'x[4]': 1, 'x[5]': 1}, '16-31'),
({'x[0]': 0, 'x[1]': 0, 'x[2]': 1, 'x[3]': 0, 'x[4]': 0, 'x[5]': 1}, '0-15'),
({'x[0]': 1, 'x[1]': 1, 'x[2]': 0, 'x[3]': 1, 'x[4]': 1, 'x[5]': 1}, '48-63'),
({'x[0]': 1, 'x[1]': 1, 'x[2]': 1, 'x[3]': 0, 'x[4]': 0, 'x[5]': 0}, '48-63'),
({'x[0]': 0, 'x[1]': 0, 'x[2]': 0, 'x[3]': 0, 'x[4]': 1, 'x[5]': 1}, '0-15'),
({'x[0]': 1, 'x[1]': 1, 'x[2]': 0, 'x[3]': 0, 'x[4]': 1, 'x[5]': 1}, '48-63'),
({'x[0]': 1, 'x[1]': 1, 'x[2]': 1, 'x[3]': 1, 'x[4]': 0, 'x[5]': 0}, '48-63')]
import nltk
#the samples are doubled, presumably so the 20 items clear the classifier's
#default support_cutoff of 10
classbin=nltk.DecisionTreeClassifier.train(samples+samples)
print classbin.pseudocode(depth=4)
if x[1] == 0:
  if x[0] == 0: return '0-15'
  if x[0] == 1: return '32-47'
if x[1] == 1:
  if x[0] == 0: return '16-31'
  if x[0] == 1: return '48-63'
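#the trained classifier can then label new featuresets directly; e.g. 101000 = 40,
#which should come back as '32-47' (this check is not part of the original run)
print classbin.classify({'x[0]': 1, 'x[1]': 0, 'x[2]': 1, 'x[3]': 0, 'x[4]': 0, 'x[5]': 0})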
import re
#here are the sample categories
categories=['astro-ph', 'cond-mat', 'cs', 'gr-qc', 'hep', 'math', 'nucl', 'physics.optics', 'q-bio', 'q-fin', 'quant-ph']
#load the json file
import json
arxiv_data=json.load(open('arxiv_data.json'))
#e.g., arxiv_data['astro-ph'] is a list of 200 entries, each with id,title,authors,abstract
#you need only the abstract text, e.g.
# arxiv_data['astro-ph'][0]['abstract']
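#a quick structural check (assuming the file loaded as above); each category
#should report 200 entries like astro-ph does:
for c in categories:
    print c, len(arxiv_data[c])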
stopwords=['the', 'of', 'a', 'is', 'in', 'to', 'and', 'that', 'we', 'be', 'for', 'by', 'are', 'with', 'can', 'this', 'an', 'it', 'on', 'as', 'if', 'not', 'one', 'but', 'will', 'then', 'which', 'have', 'where', 'from', 'at', 'or', 'has', 'two', 'may', 'so', 'each', 'all', 'there', 'more', 'some', 'these', 'also', 'such', 'only', 'any', 'our', 'no', 'see', 'us', 'other', 'first', 'between', 'using', 'case', 'given', 'same', 'its', 'was', 'been', 'into', 'both', 'than', 'when', 'however', 'different', 'here', 'used', 'since', 'their', 'shown', 'show', 'above', 'well', 'following', 'result', 'they', 'number', 'use', 'thus', 'work', 'large', 'over', 'very', 'similar', 'possible', 'due', 'note', 'now', 'does', 'therefore', 'out', 'present', 'consider', 'most', 'would', 'while', 'small', 'respectively', 'general', 'even', 'under', 'do', 'because', 'shows', 'should', 'up', 'particular', 'follows', 'were', 'find', 'obtain', 'way', 'known', 'about', 'after', 'through', 'based', 'further', 'those', 'below', 'many', 'without', 'gives', 'much', 'give', 'like', 'being', 'finally', 'could', 'another', 'take', 'within', 'higher', 'must', 'them', 'high', 'several', 'previous', 'still', 'lower', 'larger', 'taken', 'becomes', 'either', 'hence', 'how', 'less', 'associated', 'along', 'discussed', 'expected', 'change', 'although', 'need', 'too', 'had', 'toward', 'though', 'called', 'moreover']
#here's one way to make features:
#skip stopwords and words shorter than four letters,
#then keep only the first three letters of each remaining word
def mkfeatures(abstext):
    features={}
    abswords = re.findall('[a-z]+',abstext.encode('ascii','ignore')) #convert unicode to ascii
    for w in abswords:
        if w in stopwords or len(w) < 4: continue
        features[w[:3]] = 1
    return features
#then
mkfeatures(arxiv_data['astro-ph'][0]['abstract'])
#will create a feature dict for the first astro-ph abstract
#you'll want to make a training set, a list of (featureset, label) pairs, e.g.:
train_set=[({'acc':1, ...},'astro-ph'),...]
#and a test set in the same format
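#a minimal sketch of one way to build both sets (the 150/50 split per
#category is an assumption, not from the original):
train_set=[]
test_set=[]
for c in categories:
    feats=[(mkfeatures(e['abstract']), c) for e in arxiv_data[c]]
    train_set += feats[:150]  # first 150 abstracts of each category for training
    test_set += feats[150:]   # remaining 50 held out for testing
#then e.g. nltk.classify.accuracy(clf2, test_set) would score a trained classifier clf2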
#try pairwise classifiers, or multiclass classifiers
#here's the result of picking 64 most frequent features from q-bio and quant-ph
#for a 2-way classifier, 'qua...', 'pho...', 'bio...', 'pop...' are important
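#a rough sketch of how such a classifier could be built: count how often each
#three-letter prefix appears across the two categories, keep the 64 most
#frequent, and train a two-way nltk tree on featuresets restricted to those
#(the variable names below are illustrative, not from the original)
from collections import Counter
counts=Counter()
for c in ['q-bio','quant-ph']:
    for e in arxiv_data[c]:
        counts.update(mkfeatures(e['abstract']))
top64=set(f for f,n in counts.most_common(64))
pairs=[({f: True for f in mkfeatures(e['abstract']) if f in top64}, c)
       for c in ['q-bio','quant-ph'] for e in arxiv_data[c]]
myclass=nltk.DecisionTreeClassifier.train(pairs)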
print myclass.pseudocode(depth=100)
if qua == None:
  if pho == None:
    if qub == None: return 'q-bio'
    if qub == True: return 'quant-ph'
  if pho == True:
    if app == None: return 'quant-ph'
    if app == True: return 'q-bio'
if qua == True:
  if bio == None:
    if pop == None:
      if dat == None: return 'quant-ph'
      if dat == True:
        if ana == None: return 'quant-ph'
        if ana == True: return 'q-bio'
    if pop == True:
      if app == None: return 'quant-ph'
      if app == True: return 'q-bio'
  if bio == True: return 'q-bio'