#according to one of the people who helped write the tf-idf implementation in scikit-learn:
#http://stackoverflow.com/a/8897648
from sklearn.feature_extraction.text import TfidfVectorizer

mydoclist = ['Julie loves me more than Linda loves me',
             'Jane likes me more than Julie loves me',
             'He likes basketball more than baseball']

tfidf_vectorizer = TfidfVectorizer(min_df=1)
tfidf_matrix = tfidf_vectorizer.fit_transform(mydoclist)

#TfidfVectorizer L2-normalizes each row by default, so the product of the matrix with its
#transpose is a document-document cosine similarity matrix
document_similarities = (tfidf_matrix * tfidf_matrix.T)

print 'Created a ' + str(document_similarities.get_shape()[0]) + ' by ' + str(document_similarities.get_shape()[1]) + ' document-document cosine similarity matrix.'
print document_similarities.toarray()

#code taken from here: http://stackoverflow.com/a/12128777
from sklearn.metrics.pairwise import linear_kernel
#the linear kernel is the same as cosine similarity when the vectors are tf-idf weighted and
#euclidean-normalized (L2 norm = 1); this is the benefit of sticking to scikit-learn from
#beginning to end of an analysis
cosine_similarities = linear_kernel(tfidf_matrix[0:1], tfidf_matrix).flatten()  # similarity to the very first document
related_docs_indices = cosine_similarities.argsort()[:-len(mydoclist)-1:-1]  # order from most to least similar
print mydoclist
print cosine_similarities[related_docs_indices]  # what are the cosine similarities?

import os
import csv

#os.chdir('/path/to/wherever/you/downloaded/data/from/textcleaning')
os.chdir('/Users/rweiss/Dropbox/presentations/IRiSS2013/text2/extra/amazon')

amazon_reviews = []
target_labels = []

for infile in os.listdir(os.getcwd()):
    if infile.endswith('csv'):
        label = infile.split('.')[0]
        target_labels.append(label)

        with open(infile, 'rb') as csvfile:
            amazon_reader = csv.DictReader(csvfile, delimiter=',')
            infile_rows = [{label: row['review_text']} for row in amazon_reader]

            for doc in infile_rows:
                amazon_reviews.append(doc)

print 'There are ' + str(len(amazon_reviews)) + ' total reviews.'
print 'The labels are ' + ', '.join(target_labels) + '.'

#first, we need to shuffle the docs into random order
#this makes it easier to split them into train and test sets
from random import shuffle
x = list(amazon_reviews)  # shuffle a copy so amazon_reviews keeps its original order
shuffle(x)

import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from operator import itemgetter

trainset_size = int(round(len(amazon_reviews) * 0.75))  # i chose this threshold arbitrarily...

print 'The training set size for this classifier is ' + str(trainset_size) + '\n'

X_train = np.array([''.join(el.values()) for el in x[0:trainset_size]])
y_train = np.array([''.join(el.keys()) for el in x[0:trainset_size]])

#note: slice from trainset_size (not trainset_size + 1) so the test set picks up exactly where
#the training set leaves off and no document is dropped
X_test = np.array([''.join(el.values()) for el in x[trainset_size:len(amazon_reviews)]])
y_test = np.array([''.join(el.keys()) for el in x[trainset_size:len(amazon_reviews)]])

vectorizer = TfidfVectorizer(min_df=1,
                             ngram_range=(1, 1),
                             stop_words='english',
                             strip_accents='unicode',
                             norm='l2')

X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

classifier = MultinomialNB().fit(X_train, y_train)
y_predicted = classifier.predict(X_test)

print 'The precision for this classifier is ' + str(metrics.precision_score(y_test, y_predicted))
print 'The recall for this classifier is ' + str(metrics.recall_score(y_test, y_predicted))
print 'The f1 for this classifier is ' + str(metrics.f1_score(y_test, y_predicted))

#hey, not bad! shouldn't be surprising; there's a lot of data
#a simple thing to do would be to up the n-grams to bigrams; try varying ngram_range from (1, 1) to (1, 2)
#we could also modify the vectorizer to stem or lemmatize -- a sketch of both ideas follows below
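#a minimal sketch of the bigram + lemmatization idea above -- this is not part of the original
#workflow; it assumes NLTK (with its punkt and wordnet data) is installed, and the names
#LemmaTokenizer, lemma_vectorizer, and lemma_classifier are made up for illustration
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

class LemmaTokenizer(object):
    #TfidfVectorizer accepts any callable as its tokenizer, so lemmatization can happen inside the vectorizer
    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.lemmatizer.lemmatize(t) for t in word_tokenize(doc)]

lemma_vectorizer = TfidfVectorizer(min_df=1,
                                   ngram_range=(1, 2),  # unigrams and bigrams instead of unigrams only
                                   stop_words='english',
                                   strip_accents='unicode',
                                   norm='l2',
                                   tokenizer=LemmaTokenizer())

#same fit/transform/train steps as before, just with the richer feature space
X_train_lemma = lemma_vectorizer.fit_transform([''.join(el.values()) for el in x[0:trainset_size]])
X_test_lemma = lemma_vectorizer.transform([''.join(el.values()) for el in x[trainset_size:len(amazon_reviews)]])
lemma_classifier = MultinomialNB().fit(X_train_lemma, y_train)
print 'The f1 with lemmatized bigrams is ' + str(metrics.f1_score(y_test, lemma_classifier.predict(X_test_lemma)))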
print '\nHere is the confusion matrix:'
print metrics.confusion_matrix(y_test, y_predicted)

#What are the top N most predictive features per class?
N = 10
vocabulary = np.array([t for t, i in sorted(vectorizer.vocabulary_.iteritems(), key=itemgetter(1))])

#coef_ rows are ordered by classifier.classes_ (the sorted label names), which is not necessarily
#the order the label files were read in, so iterate over classes_ rather than target_labels
for i, label in enumerate(classifier.classes_):
    topN = np.argsort(classifier.coef_[i])[-N:]
    print "\nThe top %d most informative features for %s: \n%s" % (N, label, " ".join(vocabulary[topN]))

import os
import zipfile

for review in amazon_reviews:
    label = ''.join(review.keys())
    text = ''.join(review.values())

    etcMLdir = os.path.join(os.getcwd(), 'etcML', label)

    if not os.path.exists(etcMLdir):
        try:
            os.makedirs(etcMLdir)
        except OSError:
            print "Skipping creation of %s because it exists already." % etcMLdir

    #it would probably be better to store a unique ID for each review and name the file after
    #that ID rather than the index number
    with open(os.path.join(etcMLdir, 'review_' + str(amazon_reviews.index(review)) + '.txt'), 'wb') as outfile:
        outfile.write(text)

#note that it wasn't really necessary to write these files out to a directory first...
#we could have written a function that added to a zipfile dynamically -- see the sketch at the end
def zipdir(path, zipf):
    for root, dirs, files in os.walk(path):
        for fname in files:
            zipf.write(os.path.join(root, fname))

zipf = zipfile.ZipFile('amazon.zip', 'w')
zipdir('etcML/', zipf)
zipf.close()
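#a minimal sketch of the "add to a zipfile dynamically" idea above -- not part of the original
#workflow; it writes each review straight into an archive with ZipFile.writestr instead of
#creating the etcML/ directories first (the archive name amazon_direct.zip is made up)
direct_zip = zipfile.ZipFile('amazon_direct.zip', 'w')
for i, review in enumerate(amazon_reviews):
    label = ''.join(review.keys())
    text = ''.join(review.values())
    #the archive path mirrors the <label>/review_<n>.txt layout used above
    direct_zip.writestr(label + '/review_' + str(i) + '.txt', text)
direct_zip.close()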