%matplotlib inline
import itertools
import math
import matplotlib.pyplot as plt
# also, down below, we use pattern, numpy, and scipy

# code example from Building Machine Learning Systems with Python (Richert & Coelho)
# - modified slightly by Lynn

def tfidf(t, d, D):
    """ tf-idf of term t in document d, against the document set D. """
    tf = float(d.count(t)) / sum(d.count(w) for w in set(d))  # normalized term frequency
    # Note the book's version doesn't add +1 to the denominator (no smoothing).
    idf = math.log(float(len(D)) / len([doc for doc in D if t in doc]))
    return tf * idf

a, abb, abc = ["a"], ["a", "b", "b"], ["a", "b", "c"]  # try adding another c to the last doc!
D = [a, abb, abc]
print(tfidf("a", a, D))    # a is in all of them
print(tfidf("a", abc, D))  # a is in all of them
print(tfidf("b", abc, D))  # b occurs only once here, but in 2 docs
print(tfidf("b", abb, D))  # b occurs more frequently in this doc
print(tfidf("c", abc, D))  # c is unique in the doc set

from pattern.vector import Document, Model, TFIDF, TF, LEMMA, PORTER, COSINE, KMEANS, HIERARCHICAL

filelist = !ls data/stories/
filelist

# Load in the stories...
def load_texts(filenames, dirpath):
    """ filenames are the leaves, dirpath is the path to them with the trailing / """
    loaded_text = {}
    for filen in filenames:
        with open(dirpath + filen) as handle:
            loaded_text[filen] = handle.read()
    return loaded_text

loaded_text = load_texts(filelist, 'data/stories/')
loaded_text.items()[0]

def make_pattern_docs(texts):
    """ texts is a dictionary! key is the name of the text or its filename """
    docs = []
    # Create a pattern.vector Document object for each text, lemmatizing as it goes in.
    for key, val in texts.iteritems():
        typestring = key[0]  # will be a G or A, for Grimms or Andersen
        docs.append(Document(val, name=key, type=typestring, stemmer=LEMMA))
    return docs

docs = make_pattern_docs(loaded_text)
docs[1]

docs[1].keywords()  # normalized counts in the document (TF)

sorted(docs[1].features)[0:10]  # the words = features

# The normalized vector for the word occurrences in this document -
# these scores are the same as the keywords above.
docs[1].vector['sister']

# TF-IDF is a property of the doc set. The "Model" object handles operations across the doc set.
mtfidf = Model(documents=docs, weight=TFIDF)
mtfidf.documents

mtfidf.document_frequency('sister')
mtfidf.inverse_document_frequency('sister')

doc1 = mtfidf.document(name='G_LITTLE ONE-EYE, TWO-EYES AND THREE-EYES.txt')
# equivalent: doc1 = mtfidf.documents[1]

doc1.term_frequency('sister')  # note this is the same as doing it above on the doc object!
doc1.tf_idf('sister')

mtfidf.documents[4].tf('sister')
mtfidf.documents[4].tf_idf('sister')

# Taken from the pattern.vector doc page: http://www.clips.ua.ac.be/pages/pattern-vector
from pattern.vector import Document, Model
d0 = Document('A tiger is a big yellow cat with stripes.', type='tiger')
d1 = Document('A lion is a big yellow cat with manes.', type='lion')
d2 = Document('An elephant is a big grey animal with a slurf.', type='elephant')
d3 = Document('An elephant is an animal.', type='elephant')
print "Before model, vector for d1:", d1.vector
simple = Model(documents=[d0, d1, d2, d3], weight=TFIDF)
print "After model, vector for d1:", d1.vector  # vector now weighted according to the document collection!
print
print "Tiger vs. lion text similarity:", simple.similarity(d0, d1)      # cosine similarity (1 - cosine distance)
print "Tiger vs. elephant text similarity:", simple.similarity(d0, d2)  # cosine similarity (1 - cosine distance)
print "Elephant 1 vs. Elephant 2 similarity:", simple.similarity(d2, d3)
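# Aside (a minimal sketch, not from the pattern docs): Model.similarity is just the cosine
# similarity of the two documents' tf-idf vectors. A hand-rolled check, assuming the d0/d1
# Documents and the `simple` model above, and that Document.vector behaves like a
# {word: weight} dict (which is how pattern exposes it):
def cosine_sim(vec1, vec2):
    """ Cosine similarity between two {word: weight} mappings. """
    dot = sum(vec1[w] * vec2[w] for w in set(vec1) & set(vec2))
    norm1 = math.sqrt(sum(v * v for v in vec1.values()))
    norm2 = math.sqrt(sum(v * v for v in vec2.values()))
    return dot / (norm1 * norm2) if norm1 and norm2 else 0.0

print "hand-rolled cosine, tiger vs. lion:", cosine_sim(dict(d0.vector), dict(d1.vector))
print "should roughly match:", simple.similarity(d0, d1)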
# This exports the array of tf-idf weights, plus some extra columns we can parse out.
# Will be large for real data.
simple.export('data/csv/simple_tfidf.tsv')

mtfidf.similarity(docs[1], docs[1])  # similarity to self is 1.

mtfidf.similarity(mtfidf.docs[1], mtfidf.docs[6])  # try some different docs

# check what those were
mtfidf.docs[6]
docs[1]

mtfidf.neighbors(docs[1])  # finds the closest matches in similarity

# Model.search() returns a sorted list of (similarity, Document)-tuples,
# based on a list of query words. A Document is created on-the-fly for the
# given list, using the given optional arguments.
mtfidf.search(['witch', 'girl', 'boy'])

# You can do hierarchical clustering right inside pattern, without having to use scipy for it.
# k is the number of "clusters" you want to produce.
hier = mtfidf.cluster(method=HIERARCHICAL, k=5)
hier.depth

# Get a giant listing of the Cluster objects in the tree structure.
# There doesn't seem to be a built-in tool to visualize them, though!
hier

import csv
import numpy as np
from scipy.spatial.distance import pdist, squareform
from scipy.cluster.hierarchy import linkage, dendrogram

def read_weka_tfidf(filen):
    """ Read in the Weka-style file output by pattern's Model.export and keep just the tf-idf scores. """
    rows = []
    with open(filen, 'rb') as csvfile:
        spamreader = csv.reader(csvfile, delimiter='\t')
        count = 0
        for row in spamreader:
            # skip the first row, which is the word labels
            if count > 0:
                # drop the extra junk in the last 2 cols; convert the scores to floats
                rows.append([float(x) for x in row[:-2]])
            count += 1
    return rows

simplerows = read_weka_tfidf('data/csv/simple_tfidf.tsv')
simplerows

dist = pdist(simplerows, metric='cosine')  # look at the scipy docs and pick a different measure to try
linkage(dist)

from pylab import rcParams
rcParams['figure.figsize'] = 6, 5
dendrogram(linkage(dist))  # this plotting function has a ton of things you can manipulate if you look at the docs.

# Reminder:
print "d0", d0.words
print "d1", d1.words
print "d2", d2.words
print "d3", d3.words

# Show the distances, which are used to get the hierarchy:
print "d2, d3 distance", 1 - simple.similarity(d2, d3)
print "d0, d1 distance", 1 - simple.similarity(d0, d1)

mtfidf.export('data/csv/fairy.tsv')
fairyrows = read_weka_tfidf('data/csv/fairy.tsv')
len(fairyrows)
len(fairyrows[0])  # words in the vector

def make_dend(data, labels=None, height=6):
    """ Cosine-distance dendrogram of the rows in data; returns the condensed distance matrix. """
    from pylab import rcParams
    dist = pdist(data, metric='cosine')
    link = linkage(dist, method='complete')
    rcParams['figure.figsize'] = 6, height
    rcParams['axes.labelsize'] = 5
    if not labels:
        dend = dendrogram(link, orientation='right')
    else:
        dend = dendrogram(link, orientation='right', labels=labels)
    return dist

# if you want to label by doc names, pass labels=names
names = [doc.name for doc in docs]
dist = make_dend(fairyrows, height=15)

# cosine distances between pairs of stories (1 - similarity):
1 - mtfidf.similarity(docs[12], docs[11])
1 - mtfidf.similarity(docs[25], docs[17])
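# Side note (a sketch, not part of the original walkthrough): the heatmap function below needs
# the full square distance matrix, while pdist returns a "condensed" 1-D array of the
# n*(n-1)/2 pairwise distances. squareform converts between the two. Assuming the fairyrows
# and names built above:
import numpy as np
from scipy.spatial.distance import pdist, squareform

weights = np.array(fairyrows, dtype=float)   # tf-idf rows as a numeric matrix
condensed = pdist(weights, metric='cosine')  # condensed distances, shape (n*(n-1)/2,)
square = squareform(condensed)               # full symmetric n x n matrix, zeros on the diagonal
print condensed.shape, square.shape
print names[0], "vs.", names[3], "distance:", square[0, 3]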
# Code borrowed from: http://nbviewer.ipython.org/github/OxanaSachenkova/hclust-python/blob/master/hclust.ipynb
def make_heatmap_matrix(dist, method='complete'):
    """ Pass in the condensed distance matrix; method options are complete or single. """
    # Compute and plot the first (left-side) dendrogram.
    fig = plt.figure(figsize=(10, 10))
    # axes rectangle is [x, y, width, height]
    ax1 = fig.add_axes([0.05, 0.1, 0.2, 0.6])
    Y = linkage(dist, method=method)
    Z1 = dendrogram(Y, orientation='right')
    # remove the tick marks
    ax1.set_xticks([])
    # Compute and plot the second (top) dendrogram.
    ax2 = fig.add_axes([0.3, 0.71, 0.6, 0.2])
    Z2 = dendrogram(Y)
    ax2.set_xticks([])
    ax2.set_yticks([])
    # Compute and plot the heatmap, with rows/columns reordered by the dendrogram leaves.
    axmatrix = fig.add_axes([0.3, 0.1, 0.6, 0.6])
    idx1 = Z1['leaves']
    idx2 = Z2['leaves']
    D = squareform(dist)
    D = D[idx1, :]
    D = D[:, idx2]
    im = axmatrix.matshow(D, aspect='auto', origin='lower', cmap=plt.cm.YlGnBu)
    axmatrix.set_xticks([])
    axmatrix.set_yticks([])
    # Plot the colorbar.
    axcolor = fig.add_axes([0.91, 0.1, 0.02, 0.6])
    plt.colorbar(im, cax=axcolor)

make_heatmap_matrix(dist, method='complete')

books = !ls data/books
books

booktexts = load_texts(books, 'data/books/')
bookdocs = make_pattern_docs(booktexts)
booktfidf = Model(documents=bookdocs, weight=TFIDF)
booktfidf.docs

booknames = [doc.name for doc in booktfidf.docs]
booktfidf.export('data/csv/books_tfidf.tsv')
bookweights = read_weka_tfidf('data/csv/books_tfidf.tsv')
dist = make_dend(bookweights, labels=booknames)
make_heatmap_matrix(dist, method='complete')

kmeans = mtfidf.cluster(method=KMEANS, k=5)

from pattern.vector import centroid
import operator
# For each cluster center, look at the most important (highest-weighted) features.
for i in range(5):
    print i
    print sorted(centroid(kmeans[i]).items(), key=operator.itemgetter(1), reverse=True)[0:10]
    print
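# Follow-up sketch (not in the original notebook): pattern's Model.cluster() returns a list of
# clusters, where each cluster is a plain list of Documents, so we can also peek at which
# stories k-means grouped together (assuming the `kmeans` result above):
for i, cluster in enumerate(kmeans):
    print i, len(cluster), "docs"
    print [doc.name for doc in cluster][0:5]  # first few story names in this cluster
    print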