Now let's do the same classification task with a more flexible tool, scikit-learn. Full repo here: https://github.com/arnicas/NLP-in-Python
For scikit-learn, it's convenient to have the data files in separate directories, one per class. So there are "yes" and "no" subdirectories holding the labeled files for my training and test data.
ls data/fiftyshades/
no/ yes/
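If your labeled files start out in one flat directory, a short script can sort them into this layout. Here's a minimal sketch, assuming the files live in a hypothetical data/fiftyshades_raw directory and carry their label in the filename (neither the directory nor the naming convention is part of the repo):

import os, shutil

src = 'data/fiftyshades_raw'   # hypothetical flat directory of labeled files
for fname in os.listdir(src):
    # assumed naming convention: filenames start with 'yes_' or 'no_'
    label = 'yes' if fname.startswith('yes_') else 'no'
    dest = os.path.join('data/fiftyshades', label)
    os.makedirs(dest, exist_ok=True)
    shutil.copy(os.path.join(src, fname), dest)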
from sklearn.datasets import load_files
# To load data into sklearn, put your text files into directories
# named for their categories (the 'labels')
bunchf = load_files('data/fiftyshades/', categories=['yes','no'])
# This is what sklearn creates:
bunchf.keys()
['target_names', 'data', 'target', 'DESCR', 'filenames']
len(bunchf.data)
382
bunchf.filenames[0:10]
array(['data/fiftyshades/no/no_fifty_500_77', 'data/fiftyshades/no/no_fifty_500_239', 'data/fiftyshades/yes/yes_fifty_500_328', 'data/fiftyshades/yes/yes_fifty_500_208', 'data/fiftyshades/no/no_fifty_500_30', 'data/fiftyshades/no/no_fifty_500_183', 'data/fiftyshades/yes/yes_fifty_500_114', 'data/fiftyshades/no/no_fifty_500_17', 'data/fiftyshades/yes/yes_fifty_500_92', 'data/fiftyshades/no/no_fifty_500_15'], dtype='|S38')
# Try uncommenting each of the lines to see what's what
bunchf.filenames[0]
#bunchf.target[0]
#bunchf.target_names[0]
#bunchf.data[0]
'data/fiftyshades/no/no_fifty_500_77'
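The target array holds integers that index into target_names, so you can always recover the label for any file, and check the class balance while you're at it:

# The integer in .target indexes into .target_names:
print(bunchf.target_names[bunchf.target[0]])   # 'no' - matches the no/ path above
# Quick look at how many files are in each class:
from collections import Counter
print(Counter(bunchf.target))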
# instead of a simple true/false for a feature (word), we'll use the TF-IDF weight
from sklearn.feature_extraction.text import TfidfVectorizer
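If TF-IDF is new to you, here's a tiny illustration on a made-up toy corpus: each document becomes a vector of weights, where a term's weight rises with its count in that document and falls with the number of documents it appears in.

# Toy example (invented sentences, not from our data):
toy = ["the cat sat", "the cat ran", "dogs bark"]
toy_vec = TfidfVectorizer()
toy_matrix = toy_vec.fit_transform(toy)
print(toy_vec.get_feature_names_out())   # the learned vocabulary
print(toy_matrix.toarray())              # one row of weights per document
# 'the' and 'cat' appear in multiple documents, so they get a lower
# idf weight than a rarer word like 'bark'.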
from sklearn.model_selection import train_test_split  # was sklearn.cross_validation in older releases
# try changing the random_state and the % of test data - you get interesting differences in results.
Xf_train, Xf_test, yf_train, yf_test = train_test_split(bunchf.data,
                                                        bunchf.target,
                                                        test_size=0.30,
                                                        random_state=4)
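A quick sanity check on the split sizes - with 382 documents and test_size=0.30 you get 267 for training and 115 for testing (the 267 matches the matrix shape below):

print(len(Xf_train), len(Xf_test))   # 267 115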
from nltk import word_tokenize  # wordpunct_tokenize is an alternative tokenizer worth trying too
from nltk.stem import WordNetLemmatizer, PorterStemmer
import string

class LemmaTokenizer(object):
    """ You can use either the lemmatizer or the stemmer here - try both? """
    def __init__(self):
        self.wnl = WordNetLemmatizer()
        self.port = PorterStemmer()
    def __call__(self, doc):
        # Comment/uncomment whichever one you want to try
        #return [self.wnl.lemmatize(t.lower()) for t in word_tokenize(doc) if t not in string.punctuation and len(t) > 2]
        return [self.port.stem(t.lower()) for t in word_tokenize(doc) if t not in string.punctuation and len(t) > 2]
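You can call the tokenizer directly on a sample string to see what the stemmer does before wiring it into the vectorizer (the sentence below is mine, and word_tokenize needs NLTK's 'punkt' data installed):

toker = LemmaTokenizer()
print(toker("She was whispering slowly, breathing hard."))
# The Porter stemmer produces truncated forms like 'whisper' and
# 'slowli' - the same stems that show up in the feature lists below.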
# The sklearn TF-IDF vectorizer takes a built-in stopword list as an option.
tfidfvec = TfidfVectorizer(tokenizer=LemmaTokenizer(), stop_words='english')
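If you're curious what stop_words='english' actually removes, sklearn exposes its built-in list:

from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
print(len(ENGLISH_STOP_WORDS))         # size of the built-in English list
print(sorted(ENGLISH_STOP_WORDS)[:10]) # a peek at the first few entries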
# we create the tf-idf model from the training data:
vectors_train = tfidfvec.fit_transform(Xf_train)
# Depending on whether you stemmed or lemmatized, you'll get different column numbers here!
vectors_train.shape
(267, 5405)
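Those 5405 columns are the vocabulary learned from the training data; the fitted vectorizer keeps it as a term-to-column-index dict you can inspect:

print(len(tfidfvec.vocabulary_))          # 5405, matching the shape above
print(sorted(tfidfvec.vocabulary_)[:10])  # a peek at the first few terms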
from sklearn.naive_bayes import MultinomialNB
import sklearn.metrics as metrics
# We pick our classifier - alpha is the additive (Laplace/Lidstone) smoothing parameter
clf = MultinomialNB(alpha=.01)
# We train the classifier on the training data and target classes (yes/no)
clf.fit(vectors_train, yf_train)
# We use the model on the test data:
vectors_test = tfidfvec.transform(Xf_test)
# We get a prediction from the test data
pred = clf.predict(vectors_test)
# We check the accuracy against the "truth" in the yf_test var
metrics.accuracy_score(yf_test, pred)
0.93043478260869561
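Accuracy is just one number; the confusion matrix and per-class precision/recall give a fuller picture (standard sklearn calls, outputs omitted here):

# Rows are true classes, columns are predicted classes:
print(metrics.confusion_matrix(yf_test, pred))
# Precision, recall, and F1 broken out per class:
print(metrics.classification_report(yf_test, pred, target_names=bunchf.target_names))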
Unfortunately, getting the most informative features out of sklearn is a little uglier:
# code for binary classification case posted here:
# http://stackoverflow.com/questions/11116697/how-to-get-most-informative-features-for-scikit-learn-classifiers
def show_most_informative_features(vectorizer, clf, n=20):
    # use vectorizer.get_feature_names() on scikit-learn versions before 1.0
    feature_names = vectorizer.get_feature_names_out()
    # feature_log_prob_[1] holds the log probability of each term in class 1
    # ('yes'); older scikit-learn exposed the same thing as clf.coef_[0]
    coefs_with_fns = sorted(zip(clf.feature_log_prob_[1], feature_names))
    top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1])
    for (coef_1, fn_1), (coef_2, fn_2) in top:
        print("\t%.4f\t%-15s\t\t%.4f\t%-15s" % (coef_1, fn_1, coef_2, fn_2))
show_most_informative_features(tfidfvec, clf, n=15) # least likely 'yes' terms on the left, most likely on the right.
	-11.1539	-nine-tail     		-3.9677	hi
	-11.1539	146963         		-5.1706	hand
	-11.1539	15.1           		-5.3202	eye
	-11.1539	15.10          		-5.3398	thi
	-11.1539	15.14          		-5.4032	want
	-11.1539	15.19          		-5.4302	breath
	-11.1539	15.2           		-5.5105	kiss
	-11.1539	15.21          		-5.5657	feel
	-11.1539	15.22          		-5.5670	finger
	-11.1539	15.24          		-5.5917	mouth
	-11.1539	15.3           		-5.6324	slowli
	-11.1539	15.4           		-5.6421	whisper
	-11.1539	15.5           		-5.6838	pull
	-11.1539	15.6           		-5.6875	bodi
	-11.1539	15.7           		-5.6961	groan
Why do you think the features differ? What's different about how we created the inputs?
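One thing that jumps out of the left column: tokens like '15.1' and '146963' are chapter and page numbers that survive our tokenizer. A possible tweak (my suggestion, not from the original notebook) is to keep only alphabetic tokens, and/or drop terms that appear in just one training document using min_df:

class AlphaTokenizer(LemmaTokenizer):
    """ Like LemmaTokenizer, but keeps only purely alphabetic tokens. """
    def __call__(self, doc):
        return [self.port.stem(t.lower()) for t in word_tokenize(doc)
                if t.isalpha() and len(t) > 2]

# min_df=2 drops any term that appears in only one training document:
tfidfvec2 = TfidfVectorizer(tokenizer=AlphaTokenizer(), stop_words='english', min_df=2)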