#!/usr/bin/python
"""Label a sample of reviews with the SASA sentiment classifier.

Steps performed at import/run time:

1. Read tab-separated rows with ``label`` and ``content`` columns from
   ``../sentiment_examples/reviews_sample.csv``.
2. Shuffle the rows and split them into ``review_data`` (texts) and
   ``target_labels`` (the labels that came with the file).
3. Classify every text with ``sasa.classifier.Classifier`` and map the sign
   of the returned valence to ``"positive"`` / ``"negative"``.
4. Write ``label<TAB>text`` lines, re-encoded as UTF-8, to
   ``../sentiment_examples/reviews_sample_3.csv``.

Code adapted from the SASA script classifyFromCmdLine.py.

NOTE: written for Python 2 (the csv fields are byte strings that are
``decode``d / ``encode``d below). Only the single-argument ``print(...)``
form is used so the file also parses under Python 3.
"""

import csv
import os
import sys
from random import shuffle

# Fail fast, before any data is read, if the SASA tool is not importable.
try:
    from sasa.classifier import Classifier
except ImportError:
    raise ImportError("Did you try to run '. setup.env'?\n" +
                      "(or add the sasa-tool directory to PYTHONPATH, ie export PYTHONPATH=)?")

# Point this at wherever you downloaded the data from the text-cleaning step,
# e.g. os.chdir('/path/to/wherever/you/downloaded/data/from/textcleaning').
# All relative paths below are resolved against this directory.
os.chdir('/Users/rweiss/Dropbox/presentations/MozFest2013/data/')

# ---------------------------------------------------------------------------
# 1. Load the labelled reviews.
# ---------------------------------------------------------------------------
labeled_reviews = []  # (label, content) pairs, in file order
labels = set()        # distinct labels seen in the file

# Take a look at what this .csv looks like and check the file structure:
# despite the extension it is tab-separated with a header row.
with open('../sentiment_examples/reviews_sample.csv', 'r') as tsvfile:
    csv_reader = csv.DictReader(tsvfile, delimiter='\t')
    for row in csv_reader:
        labeled_reviews.append((row['label'], row['content']))
        labels.add(row['label'])

print('There are ' + str(len(labeled_reviews)) + ' total reviews.')
# Sorted so the listing is the same on every run (set order is arbitrary).
print('The labels are ' + ', '.join(sorted(labels)) + '.')

# ---------------------------------------------------------------------------
# 2. Shuffle and split into texts / labels.
# ---------------------------------------------------------------------------
shuffled_reviews = list(labeled_reviews)
shuffle(shuffled_reviews)  # shuffling just to show you how to

review_data = [content for _, content in shuffled_reviews]
target_labels = [label for label, _ in shuffled_reviews]

# ---------------------------------------------------------------------------
# 3. Classify every review.
# ---------------------------------------------------------------------------
classifier = Classifier()
classified_reviews = []  # list of single-entry {predicted_label: text} dicts

for text in review_data:
    print("classifying %s" % text.strip())
    sentiment, valence, posterior = classifier.classifyFromText(text)
    # A plain two-way split: anything that is not >= 0 (including NaN) is
    # labelled negative, so `score` can never be stale or unbound.
    score = "positive" if valence >= 0 else "negative"
    classified_reviews.append({score: text})

# ---------------------------------------------------------------------------
# 4. Write the predictions as UTF-8 encoded "label<TAB>text" lines.
# ---------------------------------------------------------------------------
with open('../sentiment_examples/reviews_sample_3.csv', 'w') as outfile:
    for entry in classified_reviews:
        # Each entry holds exactly one label -> text pair.
        for label, body in entry.items():
            try:
                # This might not be the encoding of your data!
                body = body.decode('iso-8859-1')
                # utf-8 is usually the best encoding to use.
                body = body.encode('utf-8')
                label = label.encode('utf-8')
                outfile.write(label + '\t' + body + '\n')
            except UnicodeError:
                # Report the offending record and keep going. %r is used
                # because `entry` is a dict and cannot be concatenated to a
                # string.
                print("Unicode error %r" % (entry,))