%matplotlib inline
import itertools
import math
import matplotlib.pyplot as plt
# also, down below, we use pattern, numpy, and scipy

# code example from Building Machine Learning Systems with Python (Richert & Coelho)
# - modified slightly by Lynn

def tfidf(t, d, D):
    """ tf-idf of term t in document d, against the document set D. """
    tf = float(d.count(t)) / sum(d.count(w) for w in set(d))  # normalized term frequency
    # Note the book's version doesn't add +1 to the denominator (no smoothing).
    idf = math.log(float(len(D)) / len([doc for doc in D if t in doc]))
    return tf * idf

a, abb, abc = ["a"], ["a", "b", "b"], ["a", "b", "c"]  # try adding another c to the last doc!
D = [a, abb, abc]
print(tfidf("a", a, D))    # a is in all of them
print(tfidf("a", abc, D))  # a is in all of them
print(tfidf("b", abc, D))  # b occurs only once here, but in 2 docs
print(tfidf("b", abb, D))  # b occurs more frequently in this doc
print(tfidf("c", abc, D))  # c is unique in the doc set

from pattern.vector import Document, Model, TFIDF, TF, LEMMA, PORTER, COSINE, KMEANS, HIERARCHICAL

filelist = !ls data/stories/
filelist

# Load in the stories...
def load_texts(filenames, dirpath):
    """ filenames are the leaves, dirpath is the path to them with the trailing / """
    loaded_text = {}
    for filen in filenames:
        with open(dirpath + filen) as handle:
            loaded_text[filen] = handle.read()
    return loaded_text

loaded_text = load_texts(filelist, 'data/stories/')
loaded_text.items()[0]

def make_pattern_docs(texts):
    """ texts is a dictionary! key is the name of the text or its filename """
    docs = []
    # Create a pattern.vector Document object for each text, lemmatizing as it goes in.
    for key, val in texts.iteritems():
        typestring = key[0]  # will be a G or A, for Grimms or Andersen
        docs.append(Document(val, name=key, type=typestring, stemmer=LEMMA))
    return docs

docs = make_pattern_docs(loaded_text)
docs[1]

docs[1].keywords()  # normalized counts in the document (TF)

sorted(docs[1].features)[0:10]  # the words = features

# The normalized vector for the word occurrences in this document -
# these scores are the same as the keywords above.
docs[1].vector['sister']

# TF-IDF is a property of the doc set. The "Model" object handles operations across the doc set.
mtfidf = Model(documents=docs, weight=TFIDF)
mtfidf.documents

mtfidf.document_frequency('sister')
mtfidf.inverse_document_frequency('sister')

doc1 = mtfidf.document(name='G_LITTLE ONE-EYE, TWO-EYES AND THREE-EYES.txt')
# equivalent: doc1 = mtfidf.documents[1]

doc1.term_frequency('sister')  # note this is the same as doing it above on the doc object!
doc1.tf_idf('sister')

mtfidf.documents[4].tf('sister')
mtfidf.documents[4].tf_idf('sister')

# Taken from the pattern.vector doc page: http://www.clips.ua.ac.be/pages/pattern-vector
from pattern.vector import Document, Model
d0 = Document('A tiger is a big yellow cat with stripes.', type='tiger')
d1 = Document('A lion is a big yellow cat with manes.', type='lion')
d2 = Document('An elephant is a big grey animal with a slurf.', type='elephant')
d3 = Document('An elephant is an animal.', type='elephant')
print "Before model, vector for d1:", d1.vector
simple = Model(documents=[d0, d1, d2, d3], weight=TFIDF)
print "After model, vector for d1:", d1.vector  # vector now weighted according to the document collection!
print
print "Tiger vs. lion text similarity:", simple.similarity(d0, d1)      # cosine similarity (1 - cosine distance)
print "Tiger vs. elephant text similarity:", simple.similarity(d0, d2)  # cosine similarity (1 - cosine distance)
print "Elephant 1 vs. Elephant 2 similarity:", simple.similarity(d2, d3)
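# Aside (a minimal sketch, not from the pattern docs): Model.similarity is just the cosine
# similarity of the two documents' tf-idf vectors. A hand-rolled check, assuming the d0/d1
# Documents and the `simple` model above, and that Document.vector behaves like a
# {word: weight} dict (which is how pattern exposes it):
def cosine_sim(vec1, vec2):
    """ Cosine similarity between two {word: weight} mappings. """
    dot = sum(vec1[w] * vec2[w] for w in set(vec1) & set(vec2))
    norm1 = math.sqrt(sum(v * v for v in vec1.values()))
    norm2 = math.sqrt(sum(v * v for v in vec2.values()))
    return dot / (norm1 * norm2) if norm1 and norm2 else 0.0

print "hand-rolled cosine, tiger vs. lion:", cosine_sim(dict(d0.vector), dict(d1.vector))
print "should roughly match:", simple.similarity(d0, d1)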
# This exports the array of tf-idf weights, plus some extra columns we can parse out.
# Will be large for real data.
simple.export('data/csv/simple_tfidf.tsv')

mtfidf.similarity(docs[1], docs[1])  # similarity to self is 1.

mtfidf.similarity(mtfidf.docs[1], mtfidf.docs[6])  # try some different docs

# check what those were
mtfidf.docs[6]
docs[1]

mtfidf.neighbors(docs[1])  # finds the closest matches in similarity

# Model.search() returns a sorted list of (similarity, Document)-tuples,
# based on a list of query words. A Document is created on-the-fly for the
# given list, using the given optional arguments.
mtfidf.search(['witch', 'girl', 'boy'])

# You can do hierarchical clustering right inside pattern, without having to use scipy for it.
# k is the number of "clusters" you want to produce.
hier = mtfidf.cluster(method=HIERARCHICAL, k=5)
hier.depth

# Get a giant listing of the Cluster objects in the tree structure.
# There doesn't seem to be a built-in tool to visualize them, though!
hier

import csv
import numpy as np
from scipy.spatial.distance import pdist, squareform
from scipy.cluster.hierarchy import linkage, dendrogram

def read_weka_tfidf(filen):
    """ Read in the Weka-style file output by pattern's Model.export and keep just the tf-idf scores. """
    rows = []
    with open(filen, 'rb') as csvfile:
        spamreader = csv.reader(csvfile, delimiter='\t')
        count = 0
        for row in spamreader:
            # skip the first row, which is the word labels
            if count > 0:
                # drop the extra junk in the last 2 cols; convert the scores to floats
                rows.append([float(x) for x in row[:-2]])
            count += 1
    return rows

simplerows = read_weka_tfidf('data/csv/simple_tfidf.tsv')
simplerows

dist = pdist(simplerows, metric='cosine')  # look at the scipy docs and pick a different measure to try
linkage(dist)

from pylab import rcParams
rcParams['figure.figsize'] = 6, 5
dendrogram(linkage(dist))  # this plotting function has a ton of things you can manipulate if you look at the docs.

# Reminder:
print "d0", d0.words
print "d1", d1.words
print "d2", d2.words
print "d3", d3.words

# Show the distances, which are used to get the hierarchy:
print "d2, d3 distance", 1 - simple.similarity(d2, d3)
print "d0, d1 distance", 1 - simple.similarity(d0, d1)

mtfidf.export('data/csv/fairy.tsv')
fairyrows = read_weka_tfidf('data/csv/fairy.tsv')
len(fairyrows)
len(fairyrows[0])  # words in the vector

def make_dend(data, labels=None, height=6):
    """ Cosine-distance dendrogram of the rows in data; returns the condensed distance matrix. """
    from pylab import rcParams
    dist = pdist(data, metric='cosine')
    link = linkage(dist, method='complete')
    rcParams['figure.figsize'] = 6, height
    rcParams['axes.labelsize'] = 5
    if not labels:
        dend = dendrogram(link, orientation='right')
    else:
        dend = dendrogram(link, orientation='right', labels=labels)
    return dist

# if you want to label by doc names, pass labels=names
names = [doc.name for doc in docs]
dist = make_dend(fairyrows, height=15)

# cosine distances between pairs of stories (1 - similarity):
1 - mtfidf.similarity(docs[12], docs[11])
1 - mtfidf.similarity(docs[25], docs[17])
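# Side note (a sketch, not part of the original walkthrough): the heatmap function below needs
# the full square distance matrix, while pdist returns a "condensed" 1-D array of the
# n*(n-1)/2 pairwise distances. squareform converts between the two. Assuming the fairyrows
# and names built above:
import numpy as np
from scipy.spatial.distance import pdist, squareform

weights = np.array(fairyrows, dtype=float)   # tf-idf rows as a numeric matrix
condensed = pdist(weights, metric='cosine')  # condensed distances, shape (n*(n-1)/2,)
square = squareform(condensed)               # full symmetric n x n matrix, zeros on the diagonal
print condensed.shape, square.shape
print names[0], "vs.", names[3], "distance:", square[0, 3]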
# Code borrowed from: http://nbviewer.ipython.org/github/OxanaSachenkova/hclust-python/blob/master/hclust.ipynb
def make_heatmap_matrix(dist, method='complete'):
    """ Pass in the condensed distance matrix; method options are complete or single. """
    # Compute and plot the first (left-side) dendrogram.
    fig = plt.figure(figsize=(10, 10))
    # axes rectangle is [x, y, width, height]
    ax1 = fig.add_axes([0.05, 0.1, 0.2, 0.6])
    Y = linkage(dist, method=method)
    Z1 = dendrogram(Y, orientation='right')
    # remove the tick marks
    ax1.set_xticks([])
    # Compute and plot the second (top) dendrogram.
    ax2 = fig.add_axes([0.3, 0.71, 0.6, 0.2])
    Z2 = dendrogram(Y)
    ax2.set_xticks([])
    ax2.set_yticks([])
    # Compute and plot the heatmap, with rows/columns reordered by the dendrogram leaves.
    axmatrix = fig.add_axes([0.3, 0.1, 0.6, 0.6])
    idx1 = Z1['leaves']
    idx2 = Z2['leaves']
    D = squareform(dist)
    D = D[idx1, :]
    D = D[:, idx2]
    im = axmatrix.matshow(D, aspect='auto', origin='lower', cmap=plt.cm.YlGnBu)
    axmatrix.set_xticks([])
    axmatrix.set_yticks([])
    # Plot the colorbar.
    axcolor = fig.add_axes([0.91, 0.1, 0.02, 0.6])
    plt.colorbar(im, cax=axcolor)

make_heatmap_matrix(dist, method='complete')

books = !ls data/books
books

booktexts = load_texts(books, 'data/books/')
bookdocs = make_pattern_docs(booktexts)
booktfidf = Model(documents=bookdocs, weight=TFIDF)
booktfidf.docs

booknames = [doc.name for doc in booktfidf.docs]
booktfidf.export('data/csv/books_tfidf.tsv')
bookweights = read_weka_tfidf('data/csv/books_tfidf.tsv')
dist = make_dend(bookweights, labels=booknames)
make_heatmap_matrix(dist, method='complete')

kmeans = mtfidf.cluster(method=KMEANS, k=5)

from pattern.vector import centroid
import operator
# For each cluster center, look at the most important (highest-weighted) features.
for i in range(5):
    print i
    print sorted(centroid(kmeans[i]).items(), key=operator.itemgetter(1), reverse=True)[0:10]
    print
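# Follow-up sketch (not in the original notebook): pattern's Model.cluster() returns a list of
# clusters, where each cluster is a plain list of Documents, so we can also peek at which
# stories k-means grouped together (assuming the `kmeans` result above):
for i, cluster in enumerate(kmeans):
    print i, len(cluster), "docs"
    print [doc.name for doc in cluster][0:5]  # first few story names in this cluster
    print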