%pylab inline
Populating the interactive namespace from numpy and matplotlib
import json, re
from collections import defaultdict
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import roc_curve, auc
dfs = {}
for name in ['train', 'test']:
df = pd.read_json('../data/%s.json' % name)
df['_data'] = name
dfs[name] = df
# combine train and test data into one df
df = dfs['train'].append(dfs['test'])
df = df.reset_index(drop=True)
# limit to shared columns (plus predictor)
cols = list(dfs['test'].columns) + ['requester_received_pizza']
df = df[cols]
# rename a few columns to be pithier
df.rename(columns={
'request_title': 'title',
'request_text_edit_aware': 'body',
'requester_upvotes_minus_downvotes_at_request': 'karma',
'requester_number_of_posts_at_request': 'prior_posts',
'requester_number_of_posts_on_raop_at_request': 'prior_raop_posts',
'requester_account_age_in_days_at_request': 'requester_age',
'unix_timestamp_of_request_utc': 'epoch',
'requester_received_pizza': 'got_pizza',
}, inplace=True)
# convert got pizza indicator to ints
df['got_pizza'] = df['got_pizza'].apply(lambda x: -1 if pd.isnull(x) else int(x))
df.iloc[0]
giver_username_if_known N/A request_id t3_l25d7 body Hi I am in need of food for my 4 children we a... title Request Colorado Springs Help Us Please requester_age 0 requester_days_since_first_post_on_raop_at_request 0 requester_number_of_comments_at_request 0 requester_number_of_comments_in_raop_at_request 0 prior_posts 0 prior_raop_posts 0 requester_number_of_subreddits_at_request 0 requester_subreddits_at_request [] karma 0 requester_upvotes_plus_downvotes_at_request 0 requester_username nickylvst unix_timestamp_of_request 1317852607 epoch 1317849007 _data train got_pizza 0 Name: 0, dtype: object
# clean up text field (lowercase, letters only)
def clean_txt(raw, remove_stop=False):
# remove non-letters
letters_only = re.sub("[^a-zA-Z]", " ", raw)
# convert to lower case, split into individual words
words = letters_only.lower().split()
if remove_stop:
stops = set(stopwords.words("english"))
words = [w for w in words if not w in stops]
# join cleaned words
return " ".join(words)
# combine title and body columns, then clean
df['txt_raw'] = df['title'] + ' ' + df['body']
df['txt_clean'] = df['txt_raw'].apply(clean_txt)
# check that it worked
for col in ['txt_raw', 'txt_clean']:
print df.iloc[0][col]
print '--'
Request Colorado Springs Help Us Please Hi I am in need of food for my 4 children we are a military family that has really hit hard times and we have exahusted all means of help just to be able to feed my family and make it through another night is all i ask i know our blessing is coming so whatever u can find in your heart to give is greatly appreciated -- request colorado springs help us please hi i am in need of food for my children we are a military family that has really hit hard times and we have exahusted all means of help just to be able to feed my family and make it through another night is all i ask i know our blessing is coming so whatever u can find in your heart to give is greatly appreciated --
# temporal features
dt = pd.to_datetime(df['epoch'], unit='s')
dt = pd.DatetimeIndex(dt)
df['date'] = dt.date
df['day'] = dt.day
df['month'] = dt.month
df['dow'] = dt.dayofweek
df['community_age'] = (dt - min(dt)).days.astype(int)
temporal_cols = [
'day',
'month',
'community_age',
]
print df[['date'] + temporal_cols].head()
date day month community_age 0 2011-10-05 5 10 232 1 2012-03-25 25 3 404 2 2011-10-26 26 10 253 3 2011-12-02 2 12 290 4 2013-07-12 12 7 878
# status features
status_cols = [
'karma',
'prior_raop_posts',
'prior_posts',
'requester_age',
]
print df[status_cols].describe()
karma prior_raop_posts prior_posts requester_age count 5671.000000 5671.000000 5671.000000 5671.000000 mean 1163.804796 0.058014 21.353024 252.905251 std 3486.076626 0.310860 50.203577 302.625587 min -173.000000 0.000000 0.000000 0.000000 25% 3.000000 0.000000 0.000000 3.584606 50% 170.000000 0.000000 5.000000 155.647593 75% 1159.000000 0.000000 22.000000 386.932639 max 155010.000000 5.000000 867.000000 2809.750787
# narrative groupings from paper
# source: http://cs.stanford.edu/~althoff/raop-dataset/altruistic_requests_icwsm.pdf
narrative_cats = {
'money': [
'money', 'now', 'broke', 'week', 'until', 'time',
'last', 'day', 'when', 'today', 'tonight', 'paid', 'next',
'first', 'night', 'after', 'tomorrow', 'month', 'while',
'account', 'before', 'long', 'Friday', 'rent', 'buy',
'bank', 'still', 'bills', 'bills', 'ago', 'cash', 'due', 'due',
'soon', 'past', 'never', 'paycheck', 'check', 'spent',
'years', 'poor', 'till', 'yesterday', 'morning', 'dollars',
'financial', 'hour', 'bill', 'evening', 'credit',
'budget', 'loan', 'bucks', 'deposit', 'dollar', 'current',
'payed',
],
'job': [
'work', 'job', 'paycheck', 'unemployment', 'interview',
'fired', 'employment', 'hired', 'hire',
],
'student': [
'college', 'student', 'school', 'roommate',
'studying', 'university', 'finals', 'semester',
'class', 'study', 'project', 'dorm', 'tuition',
],
'family': [
'family', 'mom', 'wife', 'parents', 'mother', 'husband',
'dad', 'son', 'daughter', 'father', 'parent',
'mum',
],
'craving': [
'friend', 'girlfriend', 'craving', 'birthday',
'boyfriend', 'celebrate', 'party', 'game', 'games',
'movie', 'date', 'drunk', 'beer', 'celebrating', 'invited',
'drinks', 'crave', 'wasted', 'invite',
],
}
# list of narrative category names
cat_list = sorted(narrative_cats.keys())
print 'cat list: %s\n' % cat_list
# create word to category mapping
word_to_cats = defaultdict(list)
for cat, words in narrative_cats.iteritems():
for word in words:
word_to_cats[word].append(cat)
word_to_cats = dict(word_to_cats)
# check that things are working
print 'checking word to category lookups:'
for word in ['university', 'parent', 'cash']:
print '%s - categories: %s' % (
word,
word_to_cats.get(word, 'NONE')
)
cat list: ['craving', 'family', 'job', 'money', 'student'] checking word to category lookups: university - categories: ['student'] parent - categories: ['family'] cash - categories: ['money']
# loop through cleaned text and count occurrences
# of words in each narrative category
def categorize(words):
cats = defaultdict(int)
for word in words.split():
matches = word_to_cats.get(word)
if matches:
for m in matches:
cats[m] += 1
return dict(cats)
df['txt_cats'] = df['txt_clean'].apply(categorize)
# check that it worked
for i in range(3):
ex = df.iloc[i]
print ex['txt_clean']
print ex['txt_cats']
print '\n---\n'
request colorado springs help us please hi i am in need of food for my children we are a military family that has really hit hard times and we have exahusted all means of help just to be able to feed my family and make it through another night is all i ask i know our blessing is coming so whatever u can find in your heart to give is greatly appreciated {'money': 1, 'family': 2} --- request california no cash and i could use some dinner i spent the last money i had on gas today im broke until next thursday {'money': 8} --- request hungry couple in dundee scotland would love some pizza my girlfriend decided it would be a good idea to get off at perth bus station when she was coming to visit me and has since had to spend all her money on a taxi to get to me here in dundee any chance some kind soul would get us some pizza since we don t have any cash anymore {'money': 3, 'craving': 1} ---
# turn data dict into indiv columns (narrative features)
def to_freq(row, cat):
cats, txt = row['txt_cats'], row['txt_clean']
if cats.get(cat) > 0:
return cats.get(cat) * 1.0 / len(txt.split())
else:
return 0
for cat in cat_list:
df['narr_%s' % cat] = df.apply(lambda row: to_freq(row, cat), axis=1)
# assign variable to the list of these new cols
narrative_cols = [c for c in df.columns if c.startswith('narr_')]
# check that it worked
df[['txt_cats'] + narrative_cols].iloc[0]
txt_cats {u'money': 1, u'family': 2} narr_craving 0 narr_family 0.02777778 narr_job 0 narr_money 0.01388889 narr_student 0 Name: 0, dtype: object
# add a few more, potentially useful features
# has link
df['hyperlink'] = df['body'].apply(lambda x: 1 if re.search("http|www", x) else 0)
# character length of title + body fields
df['txt_chars'] = df['txt_clean'].apply(lambda x: len(x))
# politeness indicator
df['polite'] = df['txt_clean'].apply(lambda x: 1 if re.search("thank|appreciate|advance", x) else 0)
# reciprocity indicator
df['reciprocity'] = df['txt_clean'].apply(lambda x:
1 if re.search("repay|pay.+back|pay.+forward|return.+favor", x)
else 0)
# check their distributions
for col in ['polite', 'hyperlink', 'reciprocity']:
print '%s: %s' % (
col,
df[col].value_counts().to_dict()
)
# combine these new cols together
additional_cols = [
'txt_chars',
'polite',
'hyperlink',
'reciprocity',
]
polite: {0: 3344, 1: 2327} hyperlink: {0: 5267, 1: 404} reciprocity: {0: 4497, 1: 1174}
# combine features (and check that things look good)
x_cols = temporal_cols + status_cols + narrative_cols + additional_cols
print x_cols
df[x_cols].head()
['day', 'month', 'community_age', 'karma', 'prior_raop_posts', 'prior_posts', 'requester_age', 'narr_craving', 'narr_family', 'narr_job', 'narr_money', 'narr_student', 'txt_chars', 'polite', 'hyperlink', 'reciprocity']
day | month | community_age | karma | prior_raop_posts | prior_posts | requester_age | narr_craving | narr_family | narr_job | narr_money | narr_student | txt_chars | polite | hyperlink | reciprocity | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 5 | 10 | 232 | 0 | 0 | 0 | 0.000000 | 0.000000 | 0.027778 | 0 | 0.013889 | 0.000000 | 354 | 1 | 0 | 0 |
1 | 25 | 3 | 404 | 34 | 0 | 15 | 501.111100 | 0.000000 | 0.000000 | 0 | 0.320000 | 0.000000 | 125 | 0 | 0 | 0 |
2 | 26 | 10 | 253 | 0 | 0 | 0 | 0.000000 | 0.014286 | 0.000000 | 0 | 0.042857 | 0.000000 | 338 | 0 | 0 | 0 |
3 | 2 | 12 | 290 | 54 | 0 | 1 | 6.518438 | 0.000000 | 0.022222 | 0 | 0.022222 | 0.022222 | 227 | 0 | 0 | 0 |
4 | 12 | 7 | 878 | 1121 | 0 | 14 | 162.063252 | 0.033898 | 0.000000 | 0 | 0.033898 | 0.000000 | 548 | 0 | 0 | 0 |
# set up framework to quickly iterate on
# different feature sets and algorithm params
def get_data():
data = df[df['_data'] == 'train'].copy()
return data
def prep_data(data, input_cols):
X = data[input_cols].as_matrix()
y = data['got_pizza'].astype(int).as_matrix()
return X, y
def predict(input_cols, model_params={}):
data = get_data()
X, y = prep_data(data, input_cols)
rando = 123
Xr, Xt, yr, yt = train_test_split(X, y, random_state=rando)
model_params.update({
'random_state': rando,
})
model = GradientBoostingClassifier(**model_params)
model = model.fit(Xr, yr)
ypred = model.predict_proba(Xt)[:, 1]
fpr, tpr, thresholds = roc_curve(yt, ypred)
auc_val = auc(fpr, tpr)
return auc_val
# try out a few different feature sets + model params
# just narrative features
print predict(narrative_cols)
# just temporal features
print predict(temporal_cols)
# all features
print predict(x_cols)
# all features with more n_estimators
print predict(x_cols, {'n_estimators': 1000})
0.546242789819 0.580598346078 0.688667943786 0.657436579432
# model parameter tuning
# (this takes a little while to run)
param_grid = {
'n_estimators': [100, 500, 1000],
'learning_rate': [0.005, 0.01, 0.02],
'max_depth': [2, 3, 4],
}
model = GradientBoostingClassifier(random_state=123)
grid_search = GridSearchCV(model, param_grid, cv=6, verbose=0, scoring='roc_auc')
grid_search.fit(X_train, y_train)
print grid_search.best_score_
print grid_search.best_params_
0.693051973072 {'n_estimators': 500, 'learning_rate': 0.01, 'max_depth': 4}
# finally, train classifier over entire training set
# with best params from grid search and save predictions
df_train = df[df['_data'] == 'train'].copy()
X_train = df_train[x_cols].as_matrix()
y_train = df_train['got_pizza'].astype(int).as_matrix()
model = GradientBoostingClassifier(
n_estimators=500,
learning_rate=0.01,
max_depth=4,
random_state=123
)
model = model.fit(X_train, y_train)
df_test = df[df['_data'] == 'test'].copy()
X_test = df_test[x_cols].as_matrix()
ypred = model.predict_proba(X_test)[:, 1]
df_test['requester_received_pizza'] = ypred
final_df = df_test[['request_id', 'requester_received_pizza']]
final_df.to_csv('../output/predicted.csv', index=False)
print 'boom.'
boom.