from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from operator import itemgetter
from sklearn.metrics import classification_report
import csv
import os

os.chdir('/Users/rweiss/Dropbox/presentations/MozFest2013/data/')

#note that if you generated this from R, you will need to delete the row
#"NYT_sample.Topic.Code","NYT_sample.Title"
#from the top of the file.
nyt = open('../data/nyt_title_data.csv') # check the structure of this file!
nyt_data = []
nyt_labels = []
csv_reader = csv.reader(nyt)

for line in csv_reader:
 nyt_labels.append(int(line[0]))
 nyt_data.append(line[1])

nyt.close()

trainset_size = int(round(len(nyt_data)*0.75)) # i chose this threshold arbitrarily...to discuss
print 'The training set size for this classifier is ' + str(trainset_size) + '\n'

X_train = np.array([''.join(el) for el in nyt_data[0:trainset_size]])
y_train = np.array([el for el in nyt_labels[0:trainset_size]])

X_test = np.array([''.join(el) for el in nyt_data[trainset_size+1:len(nyt_data)]]) 
y_test = np.array([el for el in nyt_labels[trainset_size+1:len(nyt_labels)]]) 

#print(X_train)

vectorizer = TfidfVectorizer(min_df=2, 
 ngram_range=(1, 2), 
 stop_words='english', 
 strip_accents='unicode', 
 norm='l2')
 
test_string = unicode(nyt_data[0])

print "Example string: " + test_string
print "Preprocessed string: " + vectorizer.build_preprocessor()(test_string)
print "Tokenized string:" + str(vectorizer.build_tokenizer()(test_string))
print "N-gram data string:" + str(vectorizer.build_analyzer()(test_string))
print "\n"
 

X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

nb_classifier = MultinomialNB().fit(X_train, y_train)

y_nb_predicted = nb_classifier.predict(X_test)

print "MODEL: Multinomial Naive Bayes\n"

print 'The precision for this classifier is ' + str(metrics.precision_score(y_test, y_nb_predicted))
print 'The recall for this classifier is ' + str(metrics.recall_score(y_test, y_nb_predicted))
print 'The f1 for this classifier is ' + str(metrics.f1_score(y_test, y_nb_predicted))
print 'The accuracy for this classifier is ' + str(metrics.accuracy_score(y_test, y_nb_predicted))

print '\nHere is the classification report:'
print classification_report(y_test, y_nb_predicted)

#simple thing to do would be to up the n-grams to bigrams; try varying ngram_range from (1, 1) to (1, 2)
#we could also modify the vectorizer to stem or lemmatize
print '\nHere is the confusion matrix:'
print metrics.confusion_matrix(y_test, y_nb_predicted, labels=unique(nyt_labels))


#What are the top N most predictive features per class?
N = 10
vocabulary = np.array([t for t, i in sorted(vectorizer.vocabulary_.iteritems(), key=itemgetter(1))])

for i, label in enumerate(nyt_labels):
 if i == 7: # hack...
 break
 topN = np.argsort(nb_classifier.coef_[i])[-N:]
 print "\nThe top %d most informative features for topic code %s: \n%s" % (N, label, " ".join(vocabulary[topN]))
 #print topN

from sklearn.svm import LinearSVC

svm_classifier = LinearSVC().fit(X_train, y_train)

y_svm_predicted = svm_classifier.predict(X_test)
print "MODEL: Linear SVC\n"

print 'The precision for this classifier is ' + str(metrics.precision_score(y_test, y_svm_predicted))
print 'The recall for this classifier is ' + str(metrics.recall_score(y_test, y_svm_predicted))
print 'The f1 for this classifier is ' + str(metrics.f1_score(y_test, y_svm_predicted))
print 'The accuracy for this classifier is ' + str(metrics.accuracy_score(y_test, y_svm_predicted))

print '\nHere is the classification report:'
print classification_report(y_test, y_svm_predicted)

#simple thing to do would be to up the n-grams to bigrams; try varying ngram_range from (1, 1) to (1, 2)
#we could also modify the vectorizer to stem or lemmatize
print '\nHere is the confusion matrix:'
print metrics.confusion_matrix(y_test, y_svm_predicted, labels=unique(nyt_labels))

#What are the top N most predictive features per class?
N = 10
vocabulary = np.array([t for t, i in sorted(vectorizer.vocabulary_.iteritems(), key=itemgetter(1))])

for i, label in enumerate(nyt_labels):
 if i == 7: # hack...
 break
 topN = np.argsort(svm_classifier.coef_[i])[-N:]
 print "\nThe top %d most informative features for topic code %s: \n%s" % (N, label, " ".join(vocabulary[topN]))
 #print topN

from sklearn.linear_model import LogisticRegression

maxent_classifier = LogisticRegression().fit(X_train, y_train)

y_maxent_predicted = maxent_classifier.predict(X_test)
print "MODEL: Maximum Entropy\n"

print 'The precision for this classifier is ' + str(metrics.precision_score(y_test, y_maxent_predicted))
print 'The recall for this classifier is ' + str(metrics.recall_score(y_test, y_maxent_predicted))
print 'The f1 for this classifier is ' + str(metrics.f1_score(y_test, y_maxent_predicted))
print 'The accuracy for this classifier is ' + str(metrics.accuracy_score(y_test, y_maxent_predicted))

print '\nHere is the classification report:'
print classification_report(y_test, y_maxent_predicted)

#simple thing to do would be to up the n-grams to bigrams; try varying ngram_range from (1, 1) to (1, 2)
#we could also modify the vectorizer to stem or lemmatize
print '\nHere is the confusion matrix:'
print metrics.confusion_matrix(y_test, y_maxent_predicted, labels=unique(nyt_labels))

#What are the top N most predictive features per class?
N = 10
vocabulary = np.array([t for t, i in sorted(vectorizer.vocabulary_.iteritems(), key=itemgetter(1))])

for i, label in enumerate(nyt_labels):
 if i == 7: # hack...
 break
 topN = np.argsort(maxent_classifier.coef_[i])[-N:]
 print "\nThe top %d most informative features for topic code %s: \n%s" % (N, label, " ".join(vocabulary[topN]))
 #print topN

from gensim import corpora, models, similarities
from itertools import chain
import nltk
from nltk.corpus import stopwords
from operator import itemgetter
import re

url_pattern = r'https?:\/\/(.*[\r\n]*)+'

documents = [nltk.clean_html(document) for document in nyt_data]
stoplist = stopwords.words('english')
texts = [[word for word in document.lower().split() if word not in stoplist]
 for document in documents]

dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

tfidf = models.TfidfModel(corpus) 
corpus_tfidf = tfidf[corpus]

#lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=100)
#lsi.print_topics(20)

n_topics = 60
lda = models.LdaModel(corpus_tfidf, id2word=dictionary, num_topics=n_topics)

for i in range(0, n_topics):
 temp = lda.show_topic(i, 10)
 terms = []
 for term in temp:
 terms.append(term[1])
 print "Top 10 terms for topic #" + str(i) + ": "+ ", ".join(terms)
 
print 
print 'Which LDA topic maximally describes a document?\n'
print 'Original document: ' + documents[1]
print 'Preprocessed document: ' + str(texts[1])
print 'Matrix Market format: ' + str(corpus[1])
print 'Topic probability mixture: ' + str(lda[corpus[1]])
print 'Maximally probable topic: topic #' + str(max(lda[corpus[1]],key=itemgetter(1))[0])