The explanation of this implementation can be found at: http://www.rosariomgomez.me/
Index:
import tweepy
from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream
from random import shuffle
import numpy as np
import json
from time import sleep
#Twitter credentials
# NOTE(review): blank placeholders -- fill in with your own app keys and
# never commit real secrets to source control
consumer_key=""
consumer_secret=""
access_token=""
access_token_secret=""
#authentication process
# OAuth 1a: register the app credentials, then attach the user access token
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
#construct the API instance
api = tweepy.API(auth)
def get_db():
    '''Create a client object to a mongod localhost instance and create or retrieve a database
    Output: Database connection'''
    # docstring moved inside the function so it is attached as get_db.__doc__
    # (as a module-level string above the def it was silently discarded)
    from pymongo import MongoClient
    client = MongoClient('localhost:27017')
    # create or retrieve the twitter database (created lazily on first write)
    db = client.twitter
    return db
def add_tweet(db, tweet):
    '''Insert a tweet on the 'tweets' collection from the <db> database
    Input: Database name, tweet dictionary'''
    # docstring moved inside the function so it is attached as add_tweet.__doc__
    # NOTE(review): find_one(tweet) matches on the full document; since the
    # tweet dict carries its "_id", this effectively deduplicates by tweet id
    if not db.tweets.find_one(tweet):
        db.tweets.insert(tweet)
# create a list of users ids from the fashion list
# Iterate through all of the members on list using Cursor
users = [member.id
         for member in tweepy.Cursor(api.list_members, slug="fashion",
                                     owner_screen_name="rosariomgomez").items()]
#get the 150 most recent tweets from user and store on the db
def get_user_tweets(db, user, items=150):
    """Store the <items> most recent tweets from <user>'s timeline in <db>,
    labeled as fashion (is_fashion=1); the tweet id becomes the Mongo _id."""
    for tweet in tweepy.Cursor(api.user_timeline, id=user).items(items):
        add_tweet(db, {"_id": tweet.id, "text": tweet.text, "is_fashion": 1})
db = get_db()
# Collect the fashion-labeled tweets for every list member.
# Twitter rate limits GET requests in API v1.1: 15 calls every 15 minutes.
for user in users:
    try:
        get_user_tweets(db, user)
    except tweepy.TweepError:
        # wait out the rate-limit window, then retry this user once so their
        # tweets are not silently skipped (the original moved on after sleeping)
        sleep(900)  # 15 min
        try:
            get_user_tweets(db, user)
        except tweepy.TweepError:
            pass  # persistent error (e.g. protected account) -- skip this user
class StdOutListener(StreamListener):
def __init__(self):
self.db = get_db()
def on_data(self, data):
tweet = json.loads(data)
self.db.tweets.insert({"_id": tweet["id"], "text": tweet["text"], "is_fashion": 0})
return True
def on_error(self, status):
print status
# stream tweets matching deliberately non-fashion keywords; the listener
# above stores them with is_fashion=0 (negative class)
listener = StdOutListener()
stream = Stream(auth, listener)
# NOTE(review): filter() blocks until the process is interrupted
stream.filter(track=['mclaren', 'hybrid', 'python', 'recipe', 'race', 'robot', 'opera', 'gardening', 'democrats',
'linux', 'brownie', 'airbus', 'nsa', 'police', '49ers', 'baseball', 'highway' ], languages=['en'])
#retrieve all tweets from the database and create a data and label sets
tweets = db.tweets.find()
data = []
labels = []
for tweet in tweets:
data.append(tweet["text"])
labels.append(tweet["is_fashion"])
print len(tweets)
31412
# NOTE(review): sklearn.cross_validation is deprecated/removed in modern
# scikit-learn; train_test_split now lives in sklearn.model_selection
from sklearn.cross_validation import train_test_split
training_data, test_data, training_labels, test_labels = train_test_split(data, labels, test_size=0.3, random_state=0) #70-30 split
print len(training_data), len(test_data)
21988 9424
#we will split the training data into 2 more subsets: development and evaluation in order to first estimate the pipeline parameters
#with the grid search and then evaluate the accuracy of the model with cross validation
dev_data, eval_data, dev_labels, eval_labels = train_test_split(training_data, training_labels, test_size=0.5, random_state=0)
print len(dev_data), len(eval_data)
10994 10994
from sklearn.grid_search import GridSearchCV
from sklearn import metrics
from sklearn.metrics import classification_report
def print_grid_search_metrics(gs):
    '''helper function for displaying best found features on grid_search'''
    print("Best score: %0.3f" % gs.best_score_)
    print("Best parameters set:")
    best_parameters = gs.best_params_
    # iterate the parameter names from the result itself; the original read
    # the global `parameters` dict, which is reassigned for each model and
    # could be out of sync with the GridSearchCV object being reported
    for param_name in sorted(best_parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
def print_metrics(model_name, y_labels, y_predicted):
    '''helper function for displaying the algorithm metrics'''
    # docstring moved inside the def so it is attached to the function;
    # single-argument parenthesized prints behave identically under Python 2
    # while keeping the helper syntactically valid under Python 3
    print("MODEL: " + model_name)
    print('Test Accuracy: ' + str(metrics.accuracy_score(y_labels, y_predicted)))
    print('\nClassification report:')
    print(classification_report(y_labels, y_predicted, target_names=['non-fashion tweets', 'fashion tweets']))
    print('\nConfusion matrix:')
    print(metrics.confusion_matrix(y_labels, y_predicted))
def show_most_informative_features(vectorizer, clf, n=20):
    '''helper to display the most informative features for each group'''
    # docstring moved inside the def; prints parenthesized (identical output
    # under Python 2, valid syntax under Python 3)
    feature_names = vectorizer.get_feature_names()
    # pairs (weight, term) sorted by weight, most negative first
    coefs_with_names = sorted(zip(clf.coef_[0], feature_names))
    top_features = zip(coefs_with_names[:n], coefs_with_names[:-(n + 1):-1]) #top features for both groups
    for (coef_1, fn_1), (coef_2, fn_2) in top_features:
        print("\t%.4f\t%-15s\t\t%.4f\t%-15s" % (coef_1, fn_1, coef_2, fn_2))
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
#add some tweets specific stop words to the built-in english list
remove = ['amp', 'cc', 'did', 'don', 'rt', 'll', 'oh', 've', 'yes', 'let', 'going', 'via', 're', 'tweet' ]
stop = list(ENGLISH_STOP_WORDS) + remove
import re
# import the stemmer explicitly instead of `from nltk.stem.snowball import *`,
# which dumps every name the module exports into this namespace
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer('english')
class NoUrls_TfidfVectorizer(TfidfVectorizer):
    """TfidfVectorizer that strips http(s) URLs and pic.twitter.com links
    from each document before the standard preprocessing runs."""
    def build_preprocessor(self):
        # dots escaped so they match literally; the original `pic.twitter.com`
        # pattern treated each dot as "any character"
        url_pattern = re.compile(r'https?://[\w./]+')
        pic_pattern = re.compile(r'pic\.twitter\.com/[\w.]+')
        preprocessor = super(NoUrls_TfidfVectorizer, self).build_preprocessor()
        return lambda doc: (pic_pattern.sub('', url_pattern.sub('', preprocessor(doc))))
class NoUrls_Stemmed_TfidfVectorizer(TfidfVectorizer):
    """Like NoUrls_TfidfVectorizer, but additionally Snowball-stems every
    token produced by the standard tokenizer."""
    def build_preprocessor(self):
        # dots escaped so they match literally; the original `pic.twitter.com`
        # pattern treated each dot as "any character"
        url_pattern = re.compile(r'https?://[\w./]+')
        pic_pattern = re.compile(r'pic\.twitter\.com/[\w.]+')
        preprocessor = super(NoUrls_Stemmed_TfidfVectorizer, self).build_preprocessor()
        return lambda doc: (pic_pattern.sub('', url_pattern.sub('', preprocessor(doc))))
    def build_tokenizer(self):
        tokenizer = super(NoUrls_Stemmed_TfidfVectorizer, self).build_tokenizer()
        # stem lazily; the analyzer iterates the generator once per document
        return lambda doc: (stemmer.stem(w) for w in tokenizer(doc))
#ngram_range: lower and upper boundary of the range of n-values for different n-grams to be extracted
#I use words and bi-grams (to consider for example "New York" as unique feature)
#min_df: ignore terms that have a term frequency strictly lower than the given threshold
#because tweets are very short, we consider min_df=1 (consider all)
tfidf = NoUrls_TfidfVectorizer(ngram_range=(1, 2), min_df=1, stop_words=stop, strip_accents='unicode')
#Example process with an specific tweet
tweet = u'rt @harpersbazaar The top 7 swimsuit trends of the season—which will you wear? http://hbazaar.co/60109eOj pic.twitter.com/7J2hR4auMc #pretty'
# preprocess step alone: lowercase + strip the URL and the pic.twitter link
print 'Preprocess:', tfidf.build_preprocessor()(tweet)
print
# full analysis: preprocess + tokenize + stop-word removal + n-gram expansion
print 'Analyze:', tfidf.build_analyzer()(tweet)
tfidf.fit_transform([tweet])
# term -> column-index mapping learned by fit_transform
tfidf.vocabulary_
Preprocess: rt @harpersbazaar the top 7 swimsuit trends of the season—which will you wear? #pretty Analyze: [u'harpersbazaar', u'swimsuit', u'trend', u'season', u'wear', u'pretti', u'harpersbazaar swimsuit', u'swimsuit trend', u'trend season', u'season wear', u'wear pretti']
{u'harpersbazaar': 0, u'harpersbazaar swimsuit': 1, u'pretti': 2, u'season': 3, u'season wear': 4, u'swimsuit': 5, u'swimsuit trend': 6, u'trend': 7, u'trend season': 8, u'wear': 9, u'wear pretti': 10}
from sklearn.pipeline import Pipeline
from sklearn.cross_validation import cross_val_score
from sklearn.naive_bayes import BernoulliNB
# BernoulliNB with binarize=None assumes the input features are already binary
# NOTE(review): tf-idf values are continuous -- confirm this is intended
Bern_classifier = BernoulliNB(binarize=None)
Bern_pipeline = Pipeline([('tfidf', tfidf), ('clf', Bern_classifier)])
# show the classifier's default hyperparameters
Bern_classifier.get_params()
{'alpha': 1.0, 'binarize': None, 'class_prior': None, 'fit_prior': True}
#estimate the tfidf and classifier parameters by using grid search with a nested cross validation
parameters = {
    'tfidf__max_df': (0.8, 1.0),
    'tfidf__ngram_range': ((1, 1), (1, 2)), # unigrams or bigrams
    'tfidf__norm': ('l1', 'l2'),
    'clf__alpha': (0.1, 0.5, 1)
}
# refit=False: we only need the best parameter combination, not a refit model
bern_gs = GridSearchCV(Bern_pipeline, parameters, cv=5, verbose=1, refit=False)
bern_gs.fit(dev_data, dev_labels)
[Parallel(n_jobs=1)]: Done 1 jobs | elapsed: 3.7s [Parallel(n_jobs=1)]: Done 50 jobs | elapsed: 3.1min
Fitting 5 folds for each of 24 candidates, totalling 120 fits
[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed: 7.4min finished
GridSearchCV(cv=5, estimator=Pipeline(steps=[('tfidf', NoUrls_TfidfVectorizer(analyzer=u'word', binary=False, charset=None, charset_error=None, decode_error=u'strict', dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content', lowercase=True, max_df=1.0, max_features=None, min_df=1, ...vocabulary=None)), ('clf', BernoulliNB(alpha=1.0, binarize=None, class_prior=None, fit_prior=True))]), fit_params={}, iid=True, loss_func=None, n_jobs=1, param_grid={'tfidf__max_df': (0.8, 1.0), 'tfidf__norm': ('l1', 'l2'), 'tfidf__ngram_range': ((1, 1), (1, 2)), 'clf__alpha': (0.1, 0.5, 1)}, pre_dispatch='2*n_jobs', refit=False, score_func=None, scoring=None, verbose=1)
# report the best score and parameter combination found by the grid search
print_grid_search_metrics(bern_gs)
Best score: 0.955 Best parameters set: clf__alpha: 0.5 tfidf__max_df: 0.8 tfidf__ngram_range: (1, 2) tfidf__norm: 'l2'
#build the model with the best parameters set from the grid search
Bern_vect = NoUrls_TfidfVectorizer(ngram_range=(1, 2), min_df=1, max_df=0.8, norm='l2', stop_words=stop, strip_accents='unicode')
Bern_classifier = BernoulliNB(alpha=0.5, binarize=None)
Bern_pipeline = Pipeline([('tfidf', Bern_vect), ('clf', Bern_classifier)])
#score: Array of scores of the estimator for each run of the cross validation
# evaluated on the held-out evaluation split, separate from the grid-search data
score = cross_val_score(Bern_pipeline, eval_data, eval_labels, cv=10)
print "10-fold cross validation accuracy: " + str(np.mean(score))
10-fold cross validation accuracy: 0.949698817106
#now we build the final model with all the training data we have and predict the class for the testing data
# Pipeline.fit returns the pipeline itself, so predictive_model is Bern_pipeline
predictive_model = Bern_pipeline.fit(training_data, training_labels)
y_Bern_predicted = Bern_pipeline.predict(test_data)
print_metrics("Bernoulli Naive Bayes", test_labels, y_Bern_predicted)
MODEL: Bernoulli Naive Bayes Test Accuracy: 0.964346349745 Classification report: precision recall f1-score support non-fashion tweets 0.97 0.96 0.96 4693 fashion tweets 0.96 0.97 0.96 4731 avg / total 0.96 0.96 0.96 9424 Confusion matrix: [[4487 206] [ 130 4601]]
# terms with the most negative (non-fashion) and most positive (fashion) weights
show_most_informative_features(Bern_vect, Bern_classifier)
-10.0195 00 109 -4.7106 new -10.0195 00 12 -5.0410 fashion -10.0195 00 15 -5.1579 spring -10.0195 00 bids -5.1903 today -10.0195 00 finds -5.1980 love -10.0195 00 gallon -5.3214 look -10.0195 00 lincoln -5.3766 day -10.0195 00 pa -5.4279 thank -10.0195 00 purchase -5.4301 style -10.0195 00 thunder -5.4485 just -10.0195 000 acquire -5.4713 thanks -10.0195 000 coin -5.4778 best -10.0195 000 democrats -5.5333 like -10.0195 000 gb -5.6368 week -10.0195 000 info -5.6411 post -10.0195 000 missing -5.6555 photo -10.0195 000 pistol -5.7137 time -10.0195 000 pot -5.7828 happy -10.0195 000 race -5.8122 outfit -10.0195 000 remark -5.8530 dress
from sklearn.linear_model import LogisticRegression
# logistic regression model: default (unigram) tf-idf with the same stop words
logistic_tfidf = NoUrls_TfidfVectorizer(min_df=1, stop_words=stop, strip_accents='unicode')
logistic_classifier = LogisticRegression()
logistic_pipeline = Pipeline([('tfidf', logistic_tfidf), ('clf', logistic_classifier)])
# show the classifier's default hyperparameters
logistic_classifier.get_params()
{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'penalty': 'l2', 'random_state': None, 'tol': 0.0001}
# same tfidf grid as before; C is the inverse regularization strength
parameters = {
    'tfidf__max_df': (0.8, 1.0),
    'tfidf__ngram_range': ((1, 1), (1, 2)),
    'tfidf__norm': ('l1', 'l2'),
    'clf__C': (1, 5, 7)
}
# cv not given: this GridSearchCV defaults to 3-fold cross validation
logistic_gs = GridSearchCV(logistic_pipeline, parameters, verbose=1, refit=False)
logistic_gs.fit(dev_data, dev_labels)
[Parallel(n_jobs=1)]: Done 1 jobs | elapsed: 3.5s [Parallel(n_jobs=1)]: Done 50 jobs | elapsed: 3.1min
Fitting 3 folds for each of 24 candidates, totalling 72 fits
[Parallel(n_jobs=1)]: Done 72 out of 72 | elapsed: 4.5min finished
GridSearchCV(cv=None, estimator=Pipeline(steps=[('tfidf', NoUrls_TfidfVectorizer(analyzer=u'word', binary=False, charset=None, charset_error=None, decode_error=u'strict', dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content', lowercase=True, max_df=1.0, max_features=None, min_df=1, ...e, fit_intercept=True, intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001))]), fit_params={}, iid=True, loss_func=None, n_jobs=1, param_grid={'tfidf__max_df': (0.8, 1.0), 'clf__C': (1, 5, 7), 'tfidf__norm': ('l1', 'l2'), 'tfidf__ngram_range': ((1, 1), (1, 2))}, pre_dispatch='2*n_jobs', refit=False, score_func=None, scoring=None, verbose=1)
# report the best score and parameter combination found by the grid search
print_grid_search_metrics(logistic_gs)
Best score: 0.990 Best parameters set: clf__C: 5 tfidf__max_df: 0.8 tfidf__ngram_range: (1, 1) tfidf__norm: 'l2'
#build the model with the best parameters set from the grid search
logistic_vect = NoUrls_TfidfVectorizer(ngram_range=(1, 1), min_df=1, max_df=0.8, norm='l2', stop_words=stop, strip_accents='unicode')
logistic_classifier = LogisticRegression(C=5)
logistic_pipeline = Pipeline([('tfidf', logistic_vect), ('clf', logistic_classifier)])
#score: Array of scores of the estimator for each run of the cross validation
# evaluated on the held-out evaluation split, separate from the grid-search data
score = cross_val_score(logistic_pipeline, eval_data, eval_labels, cv=10)
print "10-fold cross validation accuracy: " + str(np.mean(score))
10-fold cross validation accuracy: 0.987356274299
#now we build the final model with all the training data we have and predict the class for the testing data
# Pipeline.fit returns the pipeline itself, so predictive_model is logistic_pipeline
predictive_model = logistic_pipeline.fit(training_data, training_labels)
y_logistic_predicted = logistic_pipeline.predict(test_data)
print_metrics("Logistic Regression", test_labels, y_logistic_predicted)
MODEL: Logistic Regression Test Accuracy: 0.990980475382 Classification report: precision recall f1-score support non-fashion tweets 0.99 0.99 0.99 4693 fashion tweets 0.99 0.99 0.99 4731 avg / total 0.99 0.99 0.99 9424 Confusion matrix: [[4633 60] [ 25 4706]]
# terms with the most negative (non-fashion) and most positive (fashion) weights
show_most_informative_features(logistic_vect, logistic_classifier)
-39.8939 baseball 6.2993 brunch -32.8749 police 5.3912 fashion -32.1957 race 4.4813 nyfw -20.0285 recipe 4.0389 style -17.4650 democrats 3.9377 row -16.6505 robot 3.8305 thank -15.9959 highway 3.8244 yesterday -14.5942 nsa 3.6027 rapper -14.3154 brownie 3.5777 spring -14.3127 opera 3.5750 voguemagazine -13.4855 hybrid 3.5127 dress -13.4015 linux 3.3977 beauty -12.9324 gardening 3.3425 chanel -11.4597 49ers 3.2145 collection -10.4569 python 3.1684 paris -8.0506 mclaren 3.0480 dressed -6.9610 airbus 2.9573 love -4.2125 game 2.9400 runway -4.1144 win 2.9260 sxsw -3.7430 car 2.9141 oscars
from sklearn.svm import LinearSVC
# linear SVM model: default (unigram) tf-idf with the same stop words
SVM_tfidf = NoUrls_TfidfVectorizer(min_df=1, stop_words=stop, strip_accents='unicode')
SVM_classifier = LinearSVC()
SVM_pipeline = Pipeline([('tfidf', SVM_tfidf), ('clf', SVM_classifier)])
# show the classifier's default hyperparameters
SVM_classifier.get_params()
{'C': 1.0, 'class_weight': None, 'dual': True, 'fit_intercept': True, 'intercept_scaling': 1, 'loss': 'l2', 'multi_class': 'ovr', 'penalty': 'l2', 'random_state': None, 'tol': 0.0001, 'verbose': 0}
# same tfidf grid as before; C is the SVM regularization parameter
parameters = {
    'tfidf__max_df': (0.8, 1.0),
    'tfidf__ngram_range': ((1, 1), (1, 2)),
    'tfidf__norm': ('l1', 'l2'),
    'clf__C': (1, 5, 7)
}
# cv not given: this GridSearchCV defaults to 3-fold cross validation
SVM_gs = GridSearchCV(SVM_pipeline, parameters, verbose=1, refit=False)
SVM_gs.fit(dev_data, dev_labels)
[Parallel(n_jobs=1)]: Done 1 jobs | elapsed: 3.6s [Parallel(n_jobs=1)]: Done 50 jobs | elapsed: 3.1min
Fitting 3 folds for each of 24 candidates, totalling 72 fits
[Parallel(n_jobs=1)]: Done 72 out of 72 | elapsed: 4.4min finished
GridSearchCV(cv=None, estimator=Pipeline(steps=[('tfidf', NoUrls_TfidfVectorizer(analyzer=u'word', binary=False, charset=None, charset_error=None, decode_error=u'strict', dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content', lowercase=True, max_df=1.0, max_features=None, min_df=1, ...ling=1, loss='l2', multi_class='ovr', penalty='l2', random_state=None, tol=0.0001, verbose=0))]), fit_params={}, iid=True, loss_func=None, n_jobs=1, param_grid={'tfidf__max_df': (0.8, 1.0), 'clf__C': (1, 5, 7), 'tfidf__norm': ('l1', 'l2'), 'tfidf__ngram_range': ((1, 1), (1, 2))}, pre_dispatch='2*n_jobs', refit=False, score_func=None, scoring=None, verbose=1)
# report the best score and parameter combination found by the grid search
print_grid_search_metrics(SVM_gs)
Best score: 0.990 Best parameters set: clf__C: 5 tfidf__max_df: 0.8 tfidf__ngram_range: (1, 1) tfidf__norm: 'l2'
#build the model with the best parameters set from the grid search
SVM_vect = NoUrls_TfidfVectorizer(ngram_range=(1, 1), min_df=1, max_df=0.8, norm='l2', stop_words=stop, strip_accents='unicode')
SVM_classifier = LinearSVC(C=5)
SVM_pipeline = Pipeline([('tfidf', SVM_vect), ('clf', SVM_classifier)])
#score: Array of scores of the estimator for each run of the cross validation
# evaluated on the held-out evaluation split, separate from the grid-search data
score = cross_val_score(SVM_pipeline, eval_data, eval_labels, cv=10)
print "10-fold cross validation accuracy: " + str(np.mean(score))
10-fold cross validation accuracy: 0.988720903301
#now we build the final model with all the training data we have and predict the class for the testing data
# Pipeline.fit returns the pipeline itself, so predictive_model is SVM_pipeline
predictive_model = SVM_pipeline.fit(training_data, training_labels)
y_SVM_predicted = SVM_pipeline.predict(test_data)
print_metrics("SVM", test_labels, y_SVM_predicted)
MODEL: SVM Test Accuracy: 0.991511035654 Classification report: precision recall f1-score support non-fashion tweets 0.99 0.99 0.99 4693 fashion tweets 0.99 0.99 0.99 4731 avg / total 0.99 0.99 0.99 9424 Confusion matrix: [[4644 49] [ 31 4700]]
# terms with the most negative (non-fashion) and most positive (fashion) weights
show_most_informative_features(SVM_vect, SVM_classifier)
-14.1210 baseball 7.2903 brunch -11.9195 race 4.6183 yesterday -11.9044 police 4.2024 row -7.1655 recipe 3.6168 baseballdoucher -7.0651 democrats 3.2017 rapper -6.5867 robot 2.6846 nyfw -6.5067 highway 2.2212 certainly -6.0213 nsa 2.1878 senrandpaul -6.0183 opera 2.0882 dressed -5.8695 brownie 1.8473 paris -5.7914 linux 1.7594 stayed -5.7502 hybrid 1.7474 frankcentrone -5.3438 gardening 1.7423 msrachelhollis -4.8445 49ers 1.7389 flair -4.7230 python 1.7097 staple -3.7592 mclaren 1.7096 theme -3.3689 airbus 1.6891 sydneyoperahouse -2.0326 vote 1.6381 swaps -1.8247 showed 1.5574 prime -1.7162 sis 1.5556 hmm
# boolean mask of test tweets the logistic model misclassified
wrong_classified = y_logistic_predicted != test_labels
wrong_classified
array([False, False, False, ..., False, False, False], dtype=bool)
# NOTE(review): boolean indexing assumes test_data / y_logistic_predicted are
# numpy arrays (as this sklearn version returns), not plain Python lists
wrong_data = test_data[wrong_classified == True]
# predicted labels of the misclassified tweets
wrong_labels = y_logistic_predicted[wrong_classified == True]
false_positive = wrong_data[wrong_labels == 1] #labeled as 1 (fashion) when should be 0 (non-fashion)
len(false_positive)
60
#example tweet
# a non-fashion tweet the model wrongly classified as fashion
false_positive[3]
u'@nationalgridus, thanks for destroying winter. @TWC, thanks for destroying spring baseball. Large corps, thanks for putting profit over ppl.'
# misclassified tweets the model predicted as class 0
false_negative = wrong_data[wrong_labels == 0] #labeled as 0 (non-fashion) when they should belong to 1 (fashion)
len(false_negative)
25
#example tweet
# a fashion tweet the model missed
false_negative[24]
u'Help @VanityFair win a Webby! Keep us in 1st place in The #Webbys People\u2019s Voice. VOTE: http://t.co/JQ8wC4OUPL'