"""Train and evaluate a Naive Bayes sentiment classifier on movie_reviews.

Each review in NLTK's ``movie_reviews`` corpus becomes a bag-of-words
feature dict labelled 'neg' or 'pos'.  The first 1/16 of each category is
used for training and the remainder for testing; the script then prints the
split sizes, the test accuracy, and the classifier's most informative
features.
"""
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews

print(movie_reviews.readme())


def word_feats(words):
    """Return a feature dict mapping every word in *words* to True.

    Repeated words collapse into a single key, so the result records only
    which words are present, not how often they occur.
    """
    return {word: True for word in words}


negids = movie_reviews.fileids('neg')
posids = movie_reviews.fileids('pos')

# One (features, label) pair per review file.
negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]

# Floor division keeps the cutoffs integers on both Python 2 and Python 3;
# with true division they would be floats and the slices below would raise
# TypeError.
negcutoff = len(negfeats) // 16
poscutoff = len(posfeats) // 16

# The first 1/16 of each category trains the model; the rest is held out.
trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]

# Single-argument print calls produce the same output under Python 2's print
# statement and Python 3's print function.
print('train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats)))

classifier = NaiveBayesClassifier.train(trainfeats)
print('accuracy: %s' % nltk.classify.util.accuracy(classifier, testfeats))
classifier.show_most_informative_features()