#according to one of the people who helped write the tf-idf implementation in scikit-learn:
#http://stackoverflow.com/a/8897648
from sklearn.feature_extraction.text import TfidfVectorizer

mydoclist = ['Julie loves me more than Linda loves me',
             'Jane likes me more than Julie loves me',
             'He likes basketball more than baseball']

tfidf_vectorizer = TfidfVectorizer(min_df=1)
tfidf_matrix = tfidf_vectorizer.fit_transform(mydoclist)

#TfidfVectorizer L2-normalizes each row by default, so the product of the matrix with its
#transpose is a document-document cosine similarity matrix
document_similarities = (tfidf_matrix * tfidf_matrix.T)

print 'Created a ' + str(document_similarities.get_shape()[0]) + ' by ' + str(document_similarities.get_shape()[1]) + ' document-document cosine similarity matrix.'
print document_similarities.toarray()

#code taken from here: http://stackoverflow.com/a/12128777
from sklearn.metrics.pairwise import linear_kernel
#the linear kernel is the same as cosine similarity when the vectors are tf-idf weighted and
#euclidean-normalized (L2 norm = 1); this is the benefit of sticking to scikit-learn from
#beginning to end of an analysis
cosine_similarities = linear_kernel(tfidf_matrix[0:1], tfidf_matrix).flatten()  # similarity to the very first document
related_docs_indices = cosine_similarities.argsort()[:-len(mydoclist)-1:-1]  # order from most to least similar
print mydoclist
print cosine_similarities[related_docs_indices]  # what are the cosine similarities?

import os
import csv

#os.chdir('/path/to/wherever/you/downloaded/data/from/textcleaning')
os.chdir('/Users/rweiss/Dropbox/presentations/IRiSS2013/text2/extra/amazon')

amazon_reviews = []
target_labels = []

for infile in os.listdir(os.getcwd()):
    if infile.endswith('csv'):
        label = infile.split('.')[0]
        target_labels.append(label)

        with open(infile, 'rb') as csvfile:
            amazon_reader = csv.DictReader(csvfile, delimiter=',')
            infile_rows = [{label: row['review_text']} for row in amazon_reader]

            for doc in infile_rows:
                amazon_reviews.append(doc)

print 'There are ' + str(len(amazon_reviews)) + ' total reviews.'
print 'The labels are ' + ', '.join(target_labels) + '.'

#first, we need to shuffle the docs into random order
#this makes it easier to split them into train and test sets
from random import shuffle
x = list(amazon_reviews)  # shuffle a copy so amazon_reviews keeps its original order
shuffle(x)

import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from operator import itemgetter

trainset_size = int(round(len(amazon_reviews) * 0.75))  # i chose this threshold arbitrarily...

print 'The training set size for this classifier is ' + str(trainset_size) + '\n'

X_train = np.array([''.join(el.values()) for el in x[0:trainset_size]])
y_train = np.array([''.join(el.keys()) for el in x[0:trainset_size]])

#note: slice from trainset_size (not trainset_size + 1) so the test set picks up exactly where
#the training set leaves off and no document is dropped
X_test = np.array([''.join(el.values()) for el in x[trainset_size:len(amazon_reviews)]])
y_test = np.array([''.join(el.keys()) for el in x[trainset_size:len(amazon_reviews)]])

vectorizer = TfidfVectorizer(min_df=1,
                             ngram_range=(1, 1),
                             stop_words='english',
                             strip_accents='unicode',
                             norm='l2')

X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

classifier = MultinomialNB().fit(X_train, y_train)
y_predicted = classifier.predict(X_test)

print 'The precision for this classifier is ' + str(metrics.precision_score(y_test, y_predicted))
print 'The recall for this classifier is ' + str(metrics.recall_score(y_test, y_predicted))
print 'The f1 for this classifier is ' + str(metrics.f1_score(y_test, y_predicted))

#hey, not bad! shouldn't be surprising; there's a lot of data
#a simple thing to do would be to up the n-grams to bigrams; try varying ngram_range from (1, 1) to (1, 2)
#we could also modify the vectorizer to stem or lemmatize -- a sketch of both ideas follows below
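#a minimal sketch of the bigram + lemmatization idea above -- this is not part of the original
#workflow; it assumes NLTK (with its punkt and wordnet data) is installed, and the names
#LemmaTokenizer, lemma_vectorizer, and lemma_classifier are made up for illustration
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

class LemmaTokenizer(object):
    #TfidfVectorizer accepts any callable as its tokenizer, so lemmatization can happen inside the vectorizer
    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.lemmatizer.lemmatize(t) for t in word_tokenize(doc)]

lemma_vectorizer = TfidfVectorizer(min_df=1,
                                   ngram_range=(1, 2),  # unigrams and bigrams instead of unigrams only
                                   stop_words='english',
                                   strip_accents='unicode',
                                   norm='l2',
                                   tokenizer=LemmaTokenizer())

#same fit/transform/train steps as before, just with the richer feature space
X_train_lemma = lemma_vectorizer.fit_transform([''.join(el.values()) for el in x[0:trainset_size]])
X_test_lemma = lemma_vectorizer.transform([''.join(el.values()) for el in x[trainset_size:len(amazon_reviews)]])
lemma_classifier = MultinomialNB().fit(X_train_lemma, y_train)
print 'The f1 with lemmatized bigrams is ' + str(metrics.f1_score(y_test, lemma_classifier.predict(X_test_lemma)))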
print '\nHere is the confusion matrix:'
print metrics.confusion_matrix(y_test, y_predicted)

#What are the top N most predictive features per class?
N = 10
vocabulary = np.array([t for t, i in sorted(vectorizer.vocabulary_.iteritems(), key=itemgetter(1))])

#coef_ rows are ordered by classifier.classes_ (the sorted label names), which is not necessarily
#the order the label files were read in, so iterate over classes_ rather than target_labels
for i, label in enumerate(classifier.classes_):
    topN = np.argsort(classifier.coef_[i])[-N:]
    print "\nThe top %d most informative features for %s: \n%s" % (N, label, " ".join(vocabulary[topN]))

import os
import zipfile

for review in amazon_reviews:
    label = ''.join(review.keys())
    text = ''.join(review.values())

    etcMLdir = os.path.join(os.getcwd(), 'etcML', label)

    if not os.path.exists(etcMLdir):
        try:
            os.makedirs(etcMLdir)
        except OSError:
            print "Skipping creation of %s because it exists already." % etcMLdir

    #it would probably be better to store a unique ID for each review and name the file after
    #that ID rather than the index number
    with open(os.path.join(etcMLdir, 'review_' + str(amazon_reviews.index(review)) + '.txt'), 'wb') as outfile:
        outfile.write(text)

#note that it wasn't really necessary to write these files out to a directory first...
#we could have written a function that added to a zipfile dynamically -- see the sketch at the end
def zipdir(path, zipf):
    for root, dirs, files in os.walk(path):
        for fname in files:
            zipf.write(os.path.join(root, fname))

zipf = zipfile.ZipFile('amazon.zip', 'w')
zipdir('etcML/', zipf)
zipf.close()
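#a minimal sketch of the "add to a zipfile dynamically" idea above -- not part of the original
#workflow; it writes each review straight into an archive with ZipFile.writestr instead of
#creating the etcML/ directories first (the archive name amazon_direct.zip is made up)
direct_zip = zipfile.ZipFile('amazon_direct.zip', 'w')
for i, review in enumerate(amazon_reviews):
    label = ''.join(review.keys())
    text = ''.join(review.values())
    #the archive path mirrors the <label>/review_<n>.txt layout used above
    direct_zip.writestr(label + '/review_' + str(i) + '.txt', text)
direct_zip.close()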