Regularization and feature selection
Given a labeled dataset $D = \{(x_1, y_1), \ldots, (x_n, y_n)\}$, define the error as the negative product of the probabilities of the correct labels:

$$E(D, \theta) = -\prod_{i \in D} p(y_i \mid x_i)$$

import numpy as np
import math
x = np.array([1,0,0,1]) # feature vector
theta = np.array([.9, -.9, .1, -.1]) # model coefficients
x.dot(theta)
# E.g.,
# feature 0 (.9) is strongly predictive of y=1
# feature 1 (-.9) is strongly predictive of y=-1
# feature 2 (.1) is weakly predictive of y=1
# feature 3 (-.1) is weakly predictive of y=-1
# dot product: (1 * .9) + (1 * -.1) = 0.8
0.8
def logistic(x, theta, y):
    """Logistic function :=
    probability of class y for feature vector x
    """
    return 1 / (1 + math.exp(-y * x.dot(theta)))
print('for x=', x)
print('p(y=1|x)=%.3g' % logistic(x, theta, 1))
print('p(y=-1|x)=%.3g' % logistic(x, theta, -1))
for x= [1 0 0 1]
p(y=1|x)=0.69
p(y=-1|x)=0.31
x = np.array([1, 0, 1, 0])
print('for x=', x)
print('p(y=1|x)=%.3g' % logistic(x, theta, 1.))
print('p(y=-1|x)=%.3g' % logistic(x, theta, -1.))
for x= [1 0 1 0]
p(y=1|x)=0.731
p(y=-1|x)=0.269
x = np.array([0, 1, 1, 0])
print('for x=', x)
print('p(y=1|x)=%.3g' % logistic(x, theta, 1.))
print('p(y=-1|x)=%.3g' % logistic(x, theta, -1.))
for x= [0 1 1 0]
p(y=1|x)=0.31
p(y=-1|x)=0.69
# A way to interpret coefficient theta_j: if only feature j is active,
# the probability of the positive class is 1 / (1 + e^-theta_j)
x = np.array([1, 0, 0, 0])
print('for x=', x)
print('p(y=1|x)=%.3g' % logistic(x, theta, 1.))
for x= [1 0 0 0]
p(y=1|x)=0.711
1/ (1 + math.exp(-.9))
0.7109495026250039
def error(X, y, theta):
    """
    Negative product of probabilities of the
    correct labels for each instance in X.
    """
    error = 1
    for xi, yi in zip(X, y):
        prob = logistic(xi, theta, yi)
        print('probability of %d for %s=%.3f' %
              (yi, str(xi), prob))
        error *= prob
    return -error
X = np.array([
    [1, 0, 0, 0],
    [1, 0, 1, 0],
    [0, 1, 0, 0],
    [0, 1, 0, 1],
])
y = np.array([
    1,
    1,
    -1,
    -1
])
print('error for coefficients theta=', theta)
error(X, y, theta)
error for coefficients theta= [ 0.9 -0.9  0.1 -0.1]
probability of 1 for [1 0 0 0]=0.711
probability of 1 for [1 0 1 0]=0.731
probability of -1 for [0 1 0 0]=0.711
probability of -1 for [0 1 0 1]=0.731
-0.2701356268331891
# Make theta even better:
theta = np.array([10, -10, 10, -10])
error(X, y, theta)
probability of 1 for [1 0 0 0]=1.000
probability of 1 for [1 0 1 0]=1.000
probability of -1 for [0 1 0 0]=1.000
probability of -1 for [0 1 0 1]=1.000
-0.9999092022016287
# Make theta much, much worse:
theta = np.array([-10, 10, -10, 10])
error(X, y, theta)
probability of 1 for [1 0 0 0]=0.000
probability of 1 for [1 0 1 0]=0.000
probability of -1 for [0 1 0 0]=0.000
probability of -1 for [0 1 0 1]=0.000
-8.755715690797854e-27
# Make theta even better:
theta = np.array([100, -100, 100, -100])
error(X, y, theta)
probability of 1 for [1 0 0 0]=1.000
probability of 1 for [1 0 1 0]=1.000
probability of -1 for [0 1 0 0]=1.000
probability of -1 for [0 1 0 1]=1.000
-1.0
Note that error $\in [-1, 0]$:
theta = np.array([-10, 10, -10, 10]) # error=0
theta = np.array([100, -100, 100, -100]) # error=-1.0
theta = np.array([10, -10, 10, -10]) # error=-0.999
Is $\theta = [100, -100, 100, -100]$ (error=-1.0) really better than $\theta = [10, -10, 10, -10]$ (error=-0.999)?
Regularization controls how many parameters the model has, or how large each parameter can get.
E.g., L2 regularization for logistic regression:

$$E(D, \theta) = -\prod_{i \in D} p(y_i \mid x_i) + \frac{1}{C} \|\theta\|_2^2$$

As $C \rightarrow 0$, the classifier prefers smaller and smaller coefficients.
This is the C parameter of the LogisticRegression
class in sklearn.
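To make the tradeoff concrete, here is a minimal sketch using the toy setup above (`regularized_error` is our own illustrative helper, not sklearn's exact objective):

def regularized_error(X, y, theta, C):
    """Toy L2-regularized objective (illustrative helper, not sklearn's
    exact formulation): the negative product of probabilities of the
    correct labels plus (1/C) * ||theta||_2^2."""
    neg_prod = -np.prod([logistic(xi, theta, yi) for xi, yi in zip(X, y)])
    return neg_prod + (1. / C) * np.sum(theta ** 2)

# The huge coefficients win the unregularized comparison, but once the
# penalty is added they only win when regularization is extremely weak:
for C in [0.1, 1e6, 1e9]:
    e10 = regularized_error(X, y, np.array([10., -10., 10., -10.]), C)
    e100 = regularized_error(X, y, np.array([100., -100., 100., -100.]), C)
    print('C=%g: theta=+/-10 -> %g, theta=+/-100 -> %g' % (C, e10, e100))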
Below, we will investigate the effect of regularization using the IMDB sentiment classification data.
from collections import Counter
import glob
import hashlib
import io
import math
import matplotlib.pyplot as plt
import numpy as np
import os
import re
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import tarfile
import urllib.request
%matplotlib inline
def get_data():
    """Download and unzip the data."""
    urllib.request.urlretrieve('https://www.dropbox.com/s/8oehplrobcgi9cq/imdb.tgz?dl=1', 'imdb.tgz')
    tar = tarfile.open("imdb.tgz")
    tar.extractall()
    tar.close()
get_data()
# Here is the path to the data directory.
path = 'data'
print('subdirectories are:' + str(os.listdir(path)))
subdirectories are:['test', 'train']
def get_files(path):
    """Return a list of file names in this directory that end in .txt.
    The list should be sorted alphabetically by file name.
    Params:
      path....a directory containing .txt review files.
    Returns:
      a list of .txt file names, sorted alphabetically.
    """
    return sorted(glob.glob(path + os.sep + '*.txt'))
pos_train_files = get_files(path + os.sep + 'train' + os.sep + 'pos')
neg_train_files = get_files(path + os.sep + 'train' + os.sep + 'neg')
all_train_files = pos_train_files + neg_train_files
print('found %d positive and %d negative training files' %
(len(pos_train_files), len(neg_train_files)))
print('first positive file: %s' % pos_train_files[0])
print('first negative file: %s' % neg_train_files[0])
found 200 positive and 200 negative training files
first positive file: data/train/pos/10057_9.txt
first negative file: data/train/neg/10108_1.txt
def get_true_labels(file_names):
    """Return a *numpy array* of ints for the true sentiment labels of each file.
    1 means positive, 0 means negative. Use the name of the file to determine
    the true label.
    Params:
      file_names....a list of .txt file paths, e.g., data/train/pos/10057_9.txt
    Returns:
      a numpy array of 1 or 0 values corresponding to each element
      of file_names, where 1 indicates a positive review, and 0
      indicates a negative review.
    """
    return np.array([1 if 'pos' in f else 0 for f in file_names])
labels = get_true_labels(all_train_files)
print('first 3 and last 3 labels are: %s' % str(labels[[0,1,2,-3,-2,-1]]))
first 3 and last 3 labels are: [1 1 1 0 0 0]
# Here's what a positive review looks like.
def file2string(filename):
    return io.open(filename, encoding='utf8').readlines()[0]
file2string(pos_train_files[10])
"This is a great film!! The first time I saw it I thought it was absorbing from start to finish and I still do now. I may not have seen the play, but even if I had it wouldn't stop me thinking that the film is just as good."
def tokenize(text):
    """Given a string, return a list of tokens such that: (1) all
    tokens are lowercase, (2) all punctuation is removed. Note that
    underscore (_) is not considered punctuation.
    Params:
      text....a string
    Returns:
      a list of tokens
    """
    return re.findall(r'\w+', text.lower())
tokenize("Hi! How's it going??? an_underscore is not *really* punctuation.")
['hi', 'how', 's', 'it', 'going', 'an_underscore', 'is', 'not', 'really', 'punctuation']
def do_vectorize(filenames, tokenizer_fn=tokenize, min_df=1,
                 max_df=1., binary=True, ngram_range=(1,1)):
    """
    Convert a list of filenames into a sparse csr_matrix, where
    each row is a file and each column represents a unique word.
    Use sklearn's CountVectorizer: http://goo.gl/eJ2PJ5
    Params:
      filenames.......list of review file names
      tokenizer_fn....the function used to tokenize each document
      min_df..........remove terms from the vocabulary that don't appear
                      in at least this many documents
      max_df..........remove terms from the vocabulary that appear in more
                      than this fraction of documents
      binary..........if True, each document is represented by a binary
                      vector, where 1 means a term occurs at least once in
                      the document; if False, the term frequency is used instead
      ngram_range.....a tuple (n, m) means to use phrases of length n to m inclusive;
                      e.g., (1, 2) means consider unigrams and bigrams
    Returns:
      A tuple (X, vec), where X is the csr_matrix of feature vectors,
      and vec is the CountVectorizer object.
    """
    vec = CountVectorizer(input='filename', tokenizer=tokenizer_fn,
                          binary=binary, min_df=min_df, max_df=max_df,
                          ngram_range=ngram_range)
    X = vec.fit_transform(filenames)
    return (X, vec)
matrix, vec = do_vectorize(all_train_files)
print('matrix represents %d documents with %d features' % (matrix.shape[0], matrix.shape[1]))
print('first doc has terms:\n%s' % (str(sorted(matrix[0].nonzero()[1]))))
matrix represents 400 documents with 10708 features first doc has terms: [128, 170, 202, 253, 260, 312, 355, 439, 504, 514, 560, 673, 683, 702, 750, 860, 869, 961, 985, 1013, 1222, 1254, 1312, 1341, 1403, 1444, 1451, 1469, 1504, 1657, 1664, 1742, 2467, 2539, 2998, 3111, 3208, 3231, 3358, 3370, 3517, 3636, 3708, 3718, 3761, 3812, 3928, 4017, 4061, 4063, 4089, 4141, 4207, 4209, 4224, 4312, 4369, 4387, 4415, 4438, 4475, 4513, 4527, 4634, 4693, 4760, 4801, 5065, 5077, 5228, 5279, 5292, 5294, 5317, 5365, 5423, 5614, 5615, 5651, 5698, 5766, 5893, 5937, 5953, 6121, 6248, 6263, 6299, 6428, 6444, 6583, 6624, 6680, 6700, 6864, 6946, 7098, 7629, 8053, 8248, 8336, 8341, 8474, 8767, 8988, 9204, 9411, 9435, 9439, 9504, 9507, 9522, 9549, 9557, 9633, 9683, 9689, 9834, 9854, 9856, 10045, 10335, 10351, 10429, 10439, 10444, 10446, 10517]
"""
Shuffle order of documents (since all positive documents
come before all negatives).
"""
import random
def shuffle(X, y, filenames):
random.seed(42)
indices = sorted(range(X.shape[0]), key=lambda x: random.random())
return X[indices], y[indices], np.array(filenames)[indices]
X, y, filenames = shuffle(matrix, labels, all_train_files)
print('first shuffled document %s has label %d and terms: %s' %
(filenames[0], y[0], sorted(X[0].nonzero()[1])))
first shuffled document data/train/neg/971_3.txt has label 0 and terms: [8, 101, 170, 195, 266, 278, 289, 355, 439, 464, 504, 514, 702, 750, 762, 765, 834, 913, 961, 962, 990, 997, 1013, 1068, 1103, 1248, 1254, 1403, 1445, 1883, 2048, 2107, 2280, 2399, 2508, 2742, 2920, 2937, 3008, 3029, 3047, 3124, 3153, 3154, 3314, 3363, 3377, 3397, 3535, 3596, 3619, 3625, 3653, 3684, 3718, 3921, 4029, 4035, 4089, 4094, 4155, 4387, 4475, 4527, 4599, 4611, 4614, 4630, 4693, 4732, 4744, 4801, 5065, 5077, 5083, 5217, 5481, 5569, 5614, 5617, 5793, 5953, 5992, 6115, 6116, 6206, 6293, 6299, 6446, 6460, 6583, 6624, 6626, 6663, 6700, 6765, 6854, 7475, 7615, 7618, 7620, 7625, 7908, 7912, 8104, 8187, 8206, 8258, 8474, 8698, 8699, 8767, 8794, 8800, 8909, 9068, 9099, 9347, 9499, 9504, 9507, 9532, 9541, 9549, 9557, 9571, 9572, 9613, 9633, 9796, 9856, 9977, 10045, 10158, 10162, 10222, 10225, 10339, 10346, 10351, 10362, 10444, 10455, 10483, 10517, 10561, 10567, 10580, 10657]
# Creates a LogisticRegression object.
def get_clf(c=1, penalty='l2'):
    return LogisticRegression(random_state=42, C=c, penalty=penalty, solver='liblinear')
def do_cross_validation(X, y, n_folds=5, c=1, penalty='l2', verbose=False):
    """
    Perform n-fold cross validation, calling get_clf() to train n
    different classifiers. Use sklearn's KFold class: http://goo.gl/wmyFhi
    Be sure not to shuffle the data, otherwise your output will differ.
    Params:
      X.........a csr_matrix of feature vectors
      y.........the true labels of each document
      n_folds...the number of folds of cross-validation to do
      c.........the C (inverse regularization strength) parameter
      penalty...the regularization penalty ('l1' or 'l2')
      verbose...if True, report the testing accuracy for each fold
    Returns:
      a tuple (mean test accuracy, standard error of test accuracy,
      mean train accuracy, standard error of train accuracy).
    """
    cv = KFold(n_splits=n_folds, shuffle=False)
    accuracies = []
    train_accuracies = []
    for foldi, (train, test) in enumerate(cv.split(X)):
        clf = get_clf(c=c, penalty=penalty)
        clf.fit(X[train], y[train])
        train_accuracies.append(accuracy_score(clf.predict(X[train]), y[train]))
        pred = clf.predict(X[test])
        acc = accuracy_score(pred, y[test])
        accuracies.append(acc)
        if verbose:
            print('fold %d accuracy=%.4g' % (foldi, acc))
    return (np.mean(accuracies),
            np.std(accuracies) / math.sqrt(n_folds),
            np.mean(train_accuracies),
            np.std(train_accuracies) / math.sqrt(n_folds))
def print_results(results):
    print('test accuracy=%.4f (%.2f) train accuracy=%.4f (%.2f)' %
          results)
results = do_cross_validation(X, y, verbose=True)
print_results(results)
fold 0 accuracy=0.825
fold 1 accuracy=0.8
fold 2 accuracy=0.7125
fold 3 accuracy=0.7375
fold 4 accuracy=0.6875
test accuracy=0.7525 (0.02) train accuracy=1.0000 (0.00)
def do_expt(filenames, y, tokenizer_fn=tokenize,
            min_df=1, max_df=1., binary=True,
            ngram_range=(1,1), n_folds=5, c=1, penalty='l2'):
    """
    Run one experiment, which consists of vectorizing each file,
    performing cross-validation, and returning the average accuracy.
    You should call do_vectorize and do_cross_validation here.
    Params:
      filenames.......list of review file names
      y...............the true sentiment labels for each file
      tokenizer_fn....the function used to tokenize each document
      min_df..........remove terms from the vocabulary that don't appear
                      in at least this many documents
      max_df..........remove terms from the vocabulary that appear in more
                      than this fraction of documents
      binary..........if True, each document is represented by a binary
                      vector, where 1 means a term occurs at least once in
                      the document; if False, the term frequency is used instead
      ngram_range.....a tuple (n, m) means to use phrases of length n to m inclusive;
                      e.g., (1, 2) means consider unigrams and bigrams
      n_folds.........the number of cross-validation folds to use
    Returns:
      the tuple of cross-validation results from do_cross_validation.
    """
    X, vec = do_vectorize(filenames, tokenizer_fn=tokenizer_fn,
                          binary=binary, min_df=min_df,
                          max_df=max_df, ngram_range=ngram_range)
    return do_cross_validation(X, y, verbose=False, n_folds=n_folds, c=c, penalty=penalty)
print('accuracy using default settings:')
print_results(do_expt(filenames, y))
accuracy using default settings:
test accuracy=0.7525 (0.02) train accuracy=1.0000 (0.00)
Next, we'll try out a few different settings to see how they affect cross-validation accuracy.
def compare_c(filenames, y, penalty='l2', cs=[1, 5, 10, 1000, 10000]):
    accs = []
    train_accs = []
    test_sds = []
    train_sds = []
    for c in cs:
        test_acc, test_sd, train_acc, train_sd = \
            do_expt(filenames, y, c=c, penalty=penalty)
        accs.append(test_acc)
        train_accs.append(train_acc)
        test_sds.append(test_sd)
        train_sds.append(train_sd)
    # plot train accuracy
    plt.figure()
    plt.errorbar(cs, train_accs, fmt='go-', label='train acc', yerr=train_sds)
    plt.xlabel('C')
    plt.ylabel('train accuracy')
    plt.xscale('log')
    plt.show()
    # plot test accuracy
    plt.figure()
    plt.errorbar(cs, accs, fmt='bo-', label='test acc', yerr=test_sds)
    plt.xlabel('C')
    plt.ylabel('test accuracy')
    plt.xscale('log')
    plt.show()
    return accs, train_accs
cs = [.001, .01, .1, 1, 5, 10, 1000, 10000]
compare_c(filenames, y, cs=cs)
([0.71, 0.7275, 0.7449999999999999, 0.7525000000000001, 0.7550000000000001, 0.7550000000000001, 0.7449999999999999, 0.735], [0.9425000000000001, 0.9893750000000001, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
def compare_c_coef(X, y, cs, penalty='l2'):
    max_coef = []
    for c in cs:
        clf = get_clf(c=c, penalty=penalty)
        clf.fit(X, y)
        max_coef.append(max(np.abs(clf.coef_[0])))
    plt.figure()
    plt.plot(cs, max_coef, 'bo-')
    plt.xlabel('C')
    plt.ylabel('max coef')
    plt.xscale('log')
    plt.show()
    return max_coef
compare_c_coef(X, y, cs=cs)
[0.01640794018703204, 0.1146999934443533, 0.40789919547410275, 0.8404559900333362, 1.1763613493643088, 1.3264186628108598, 2.3516251487475905, 2.7393176574515805]
Penalize by the absolute value of the coefficients (rather than the square):

$$E(D, \theta) = -\prod_{i \in D} p(y_i \mid x_i) + \frac{1}{C} \sum_k |\theta_k|$$

compare_c(filenames, y, penalty='l1', cs=[.1, .5, 1., 10, 1e2, 1e3, 1e4, 1e5])
([0.6199999999999999, 0.68, 0.6725, 0.665, 0.6825, 0.7375, 0.7150000000000001, 0.7050000000000001], [0.683125, 0.9643750000000001, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
X.shape
(400, 10708)
def compare_num_feats(X, y, cs=[.1, 1., 10, 1e2, 1e3, 1e4, 1e5]):
    nnzs = []
    for c in cs:
        clf = get_clf(c=c, penalty='l1')
        clf.fit(X, y)
        nnz = len(np.where(clf.coef_[0] != 0)[0])
        nnzs.append(nnz)
    plt.figure()
    plt.plot(cs, nnzs, 'bo-')
    plt.xlabel('C')
    plt.ylabel('# nonzero coefficients')
    plt.xscale('log')
    plt.show()
    return nnzs
compare_num_feats(X, y)
[9, 181, 236, 331, 783, 3303, 5876]
# What features are selected?
def print_top_terms(model, vec, n=10):
    terms = np.array(vec.get_feature_names())
    print('\nTop Coefficients')
    coef = model.coef_[0]
    srted = np.argsort(coef)
    topi = srted[::-1][:n]
    boti = srted[:n]
    print('Positive Terms:\n' + '\n'.join('%s (%g)' % (term, c) for term, c in zip(terms[topi], coef[topi])))
    print('\nNegative Terms:\n' + '\n'.join('%s (%g)' % (term, c) for term, c in zip(terms[boti], coef[boti])))
clf = get_clf(c=.1, penalty='l1')
clf.fit(X, y)
print_top_terms(clf, vec)
Top Coefficients
Positive Terms:
great (0.207901)
best (0.207508)
will (0.112245)
film (0.0667848)
worth (0.0246236)
well (0.00193315)
fantastic (0)
farce (0)
far (0)
fanzine (0)

Negative Terms:
worst (-0.726754)
bad (-0.313761)
nothing (-0.299507)
policeman (0)
point (0)
pointed (0)
pointless (0)
points (0)
poise (0)
poison (0)
clf.intercept_
array([0.])
# How do we predict when no features are active?
clf.intercept_=.1
print(clf.predict_proba([[0] * len(vec.vocabulary_)]))
print('intercept=%g' % clf.intercept_)
print(clf.predict([[0] * len(vec.vocabulary_)]))
[[0.47502081 0.52497919]]
intercept=0.1
[1]
clf = get_clf(c=1000, penalty='l1')
clf.fit(X, y)
print(clf.predict_proba([[0] * len(vec.vocabulary_)]))
print('intercept=%g' % clf.intercept_)
print(Counter(y))
[[0.50358714 0.49641286]]
intercept=-0.0143488
Counter({0: 200, 1: 200})
clf = get_clf(c=1, penalty='l2')
clf.fit(X, y)
clf.intercept_
array([0.05059168])
Rather than using L1 regularization, we can explicitly look for terms that are highly correlated with the class label.
Intuitively, we want terms that are very probable in one class, but not the other.
One way to compute this is the chi-squared statistic, $\chi^2$.
The chi-squared statistic is used in hypothesis testing to determine whether two variables are independent:

$$A \perp B \iff P(A, B) \equiv P(A)P(B)$$

To apply this to feature selection, we wish to know whether the occurrence of a term is independent of the class label.
To compute the chi-squared statistic for term t and class label y:
So, for each term we have a table like this:
|     | y=1        | y=0        |
|-----|------------|------------|
| t=1 | $N^t_{11}$ | $N^t_{10}$ |
| t=0 | $N^t_{01}$ | $N^t_{00}$ |
The chi-squared statistic measures how much these values deviate from what would be expected if t and y were independent.
If we assume independence between t and y, then the probability that t appears in a document with class y=1 is just $p(t=1) \cdot p(y=1)$.
If N is the total number of documents, then with

$$p(t=1) = \frac{N^t_{11} + N^t_{10}}{N} \qquad p(y=1) = \frac{N^t_{11} + N^t_{01}}{N}$$

the expected value $E^t_{11}$ (for t=1, y=1) is:

$$E^t_{11} = N \cdot p(t=1) \cdot p(y=1)$$

Then, the chi-squared statistic is as follows:

$$\chi^2 = \sum_{ij} \frac{(N^t_{ij} - E^t_{ij})^2}{E^t_{ij}}$$
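As a quick worked example (with made-up counts, just to exercise the formula), here is a sketch computing $\chi^2$ for a single hypothetical term from its 2x2 contingency table:

# Hypothetical contingency table for a term t in 100 documents (50 per class):
#           y=1   y=0
#   t=1      30    10
#   t=0      20    40
N = np.array([[30., 10.],
              [20., 40.]])
total = N.sum()
# expected counts under independence: E_ij = total * p(t=i) * p(y=j)
E = np.outer(N.sum(axis=1), N.sum(axis=0)) / total
print('expected counts:\n%s' % E)                     # [[20. 20.] [30. 30.]]
print('chi-squared=%.2f' % ((N - E) ** 2 / E).sum())  # 16.67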
from sklearn.feature_selection import chi2
chi, pvals = chi2(X,y)
plt.figure()
plt.hist(pvals, bins=20)
plt.show()
plt.figure()
plt.hist(chi, bins=20)
plt.show()
feats = vec.get_feature_names()
# Top chi-sq terms:
for i in np.argsort(chi)[::-1][:10]:
    print('index=%d chisq=%.2f %s' % (i, chi[i], feats[i]))
index=10575 chisq=29.88 worst
index=10342 chisq=20.00 waste
index=9471 chisq=16.13 terrible
index=6501 chisq=16.07 nothing
index=10576 chisq=15.21 worth
index=1216 chisq=14.44 boring
index=869 chisq=13.79 bad
index=9244 chisq=13.00 superb
index=7161 chisq=12.57 poor
index=837 chisq=12.57 awful
# Bottom chi-sq terms:
for i in np.argsort(chi)[:10]:
    print('index=%d chisq=%.2f %s' % (i, chi[i], feats[i]))
index=4601 chisq=0.00 hopeless
index=927 chisq=0.00 barry
index=2809 chisq=0.00 distinct
index=10222 chisq=0.00 visit
index=2818 chisq=0.00 distraught
index=8498 chisq=0.00 shifty
index=2830 chisq=0.00 division
index=2833 chisq=0.00 diy
index=10217 chisq=0.00 virtually
index=8503 chisq=0.00 shining
# How many times does "worst" appear?
X.sum(axis=0).A1[10575]
41
# What labels do the documents containing "worst" have?
Counter(y[np.where(X[:, 10575].T.toarray()[0]==1)])
# Clearly skewed toward negative class, so high chisq value.
Counter({0: 38, 1: 3})
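As a sanity check, we can reproduce the reported value for "worst" from these counts. Note that sklearn's chi2 compares only the observed per-class counts of documents containing the term (the t=1 row of the table above) against their expected values under independence, so with balanced classes:

observed = np.array([38., 3.])                   # docs containing "worst": 38 negative, 3 positive
expected = np.array([.5, .5]) * observed.sum()   # balanced classes: 20.5 each
print('chi-squared=%.2f' % ((observed - expected) ** 2 / expected).sum())  # ~29.88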
# How many times does "superb" appear?
X.sum(axis=0).A1[9244]
13
# What labels do the documents containing "superb" have?
Counter(y[np.where(X[:, 9244].T.toarray()[0]==1)])
# all positive
Counter({1: 13})
# How about "bad"
X.sum(axis=0).A1[869]
94
Counter(y[np.where(X[:, 869].T.toarray()[0]==1)])
Counter({0: 65, 1: 29})
So, 29 positive reviews contain the term "bad", which is why its chi-squared value is lower than that of "worst", even though "bad" is more frequent overall.
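The same back-of-the-envelope check reproduces the reported value for "bad":

observed = np.array([65., 29.])                  # docs containing "bad": 65 negative, 29 positive
expected = np.array([.5, .5]) * observed.sum()   # balanced classes: 47 each
print('chi-squared=%.2f' % ((observed - expected) ** 2 / expected).sum())  # ~13.79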
clf = get_clf(c=10, penalty='l2')
clf.fit(X, y)
topi = np.argsort(chi)[::-1][:20]
plt.figure(figsize=(12,10))
for feature_idx in topi:
    chiscore = chi[feature_idx]
    coef = clf.coef_[0][feature_idx]
    plt.plot(chiscore, coef, 'b.')
    plt.text(chiscore, coef, vec.get_feature_names()[feature_idx], fontsize=14)
plt.xlabel('chisq', size=16)
plt.ylabel('logistic regression coefficient', size=16)
plt.show()