import nltk
!conda install nltk
Fetching package metadata: .. Solving package specifications: ................. # All requested packages already installed. # packages in environment at /Applications/anaconda: # nltk 2.0.4 np18py27_0
# NLTK has a corpus with a list of male and female names
from nltk.corpus import names
names.abspath
<bound method WordListCorpusReader.abspath of <WordListCorpusReader in '/Users/dave/nltk_data/corpora/names'>>
# Let's read about it:
print(names.readme())
Names Corpus, Version 1.3 (1994-03-29) Copyright (C) 1991 Mark Kantrowitz Additions by Bill Ross This corpus contains 5001 female names and 2943 male names, sorted alphabetically, one per line. You may use the lists of names for any purpose, so long as credit is given in any published work. You may also redistribute the list if you provide the recipients with a copy of this README file. The lists are not in the public domain (I retain the copyright on the lists) but are freely redistributable. If you have any additions to the lists of names, I would appreciate receiving them. Mark Kantrowitz <mkant+@cs.cmu.edu> http://www-2.cs.cmu.edu/afs/cs/project/ai-repository/ai/areas/nlp/corpora/names/
# Let's review the filenames
names.fileids()
# Let's take a look at the content:
names.words('female.txt')
# Let's create a list with each element containing value and category.
# The corpus reader is imported under its own alias because this cell rebinds
# `names` to the labelled list: reading the words from `names` itself would
# raise AttributeError (a list has no .words) the second time the cell is run.
import random
from nltk.corpus import names as names_corpus
names = ([(name, 'male') for name in names_corpus.words('male.txt')] +
         [(name, 'female') for name in names_corpus.words('female.txt')])
# As built, the list is all male names followed by all female names; shuffle
# it in place so that slicing it by position yields a mix of both.
random.shuffle(names)
# For example:
names[2]
# Now we'll create our classifier using features of each word -- NLTK uses dictionary formats for features
# Step 1 is choosing features!
def gender_features(name):
    """Return the feature dictionary used to classify ``name`` by gender.

    The only feature is ``"lastletter"``: the final character of ``name``,
    lower-cased.

    Args:
        name: The name to featurize (a string).

    Returns:
        A one-entry dict, e.g. ``{"lastletter": "e"}`` for ``"Dave"``. An
        empty ``name`` yields ``{"lastletter": ""}``.
    """
    # Slice instead of indexing so an empty name gives "" rather than raising
    # IndexError.
    return {"lastletter": name[-1:].lower()}
# For example
gender_features('Dave')
# Turn every (name, gender) pair into a (feature dictionary, gender) pair --
# the labelled form the classifier is trained on below.
featuresets = [(gender_features(name), gender) for name, gender in names]
# Each element is (dictionary, category), i.e. "x" and "c" in our Bayes notation.
# In this case there is one 'feature'
featuresets
# Hold out the first 500 featuresets for testing and train on the rest
test_set = featuresets[:500]
train_set = featuresets[500:]
# NLTK Magic
classifier = nltk.NaiveBayesClassifier.train(train_set)
# Not so magic -- Uses frequency distributions
nltk.NaiveBayesClassifier??
# Let's try it
classifier.classify(gender_features('Neo'))
'male'
classifier.classify(gender_features('Trinity'))
# Check accuracy on the held-out test set.
# print is written with parentheses (as in print(names.readme()) earlier in
# this file): with a single argument it behaves the same under Python 2 and
# is also valid Python 3.
print(nltk.classify.accuracy(classifier, test_set))
classifier.show_most_informative_features(5)
# Using a development set: split the shuffled names into three disjoint
# slices -- first 500 for test, next 1000 for dev-test, the rest for training.
test_names = names[:500]
devtest_names = names[500:1500]
train_names = names[1500:]
# Featurize each slice into (feature dictionary, gender) pairs.
train_set = [(gender_features(name), gender) for name, gender in train_names]
devtest_set = [(gender_features(name), gender) for name, gender in devtest_names]
test_set = [(gender_features(name), gender) for name, gender in test_names]
classifier = nltk.NaiveBayesClassifier.train(train_set)
# Parenthesised single-argument print works under both Python 2 and Python 3.
print(nltk.classify.accuracy(classifier, devtest_set))
# Collect every dev-test name the classifier labels incorrectly, stored as
# (correct tag, guessed tag, name).
errors = []
for (name, tag) in devtest_names:
    guess = classifier.classify(gender_features(name))
    if guess != tag:
        errors.append((tag, guess, name))
# Sorting the tuples orders the report by correct tag, then guess, then name.
for (tag, guess, name) in sorted(errors):
    # Parenthesised single-argument print works under both Python 2 and 3.
    print('correct=%-8s guess=%-8s name=%-30s' % (tag, guess, name))
Task: Modify the gender_features() function to provide the classifier with features encoding the length of the name and any other features that seem like they might be informative. Retrain the classifier with these new features, and test its accuracy.