In [89]:

%pylab inline

Populating the interactive namespace from numpy and matplotlib

In [90]:

import json, re
from collections import defaultdict
import pandas as pd
import numpy as np

from nltk.corpus import stopwords
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import roc_curve, auc

In [91]:

dfs = {}
for name in ['train', 'test']:
    df = pd.read_json('../data/%s.json' % name)
    df['_data'] = name
    dfs[name] = df

# combine train and test data into one df
df = dfs['train'].append(dfs['test'])
df = df.reset_index(drop=True)

# limit to shared columns (plus predictor)
cols = list(dfs['test'].columns) + ['requester_received_pizza']
df = df[cols]

# rename a few columns to be pithier
df.rename(columns={
        'request_title': 'title', 
        'request_text_edit_aware': 'body',
        'requester_upvotes_minus_downvotes_at_request': 'karma',
        'requester_number_of_posts_at_request': 'prior_posts',
        'requester_number_of_posts_on_raop_at_request': 'prior_raop_posts',
        'requester_account_age_in_days_at_request': 'requester_age',
        'unix_timestamp_of_request_utc': 'epoch',
        'requester_received_pizza': 'got_pizza',
}, inplace=True)

# convert got pizza indicator to ints
df['got_pizza'] = df['got_pizza'].apply(lambda x: -1 if pd.isnull(x) else int(x))

df.iloc[0]

Out[91]:

giver_username_if_known                                                                             N/A
request_id                                                                                     t3_l25d7
body                                                  Hi I am in need of food for my 4 children we a...
title                                                           Request Colorado Springs Help Us Please
requester_age                                                                                         0
requester_days_since_first_post_on_raop_at_request                                                    0
requester_number_of_comments_at_request                                                               0
requester_number_of_comments_in_raop_at_request                                                       0
prior_posts                                                                                           0
prior_raop_posts                                                                                      0
requester_number_of_subreddits_at_request                                                             0
requester_subreddits_at_request                                                                      []
karma                                                                                                 0
requester_upvotes_plus_downvotes_at_request                                                           0
requester_username                                                                            nickylvst
unix_timestamp_of_request                                                                    1317852607
epoch                                                                                        1317849007
_data                                                                                             train
got_pizza                                                                                             0
Name: 0, dtype: object

In [92]:

# clean up text field (lowercase, letters only)

def clean_txt(raw, remove_stop=False):
    # remove non-letters
    letters_only = re.sub("[^a-zA-Z]", " ", raw) 

    # convert to lower case, split into individual words
    words = letters_only.lower().split()                             

    if remove_stop:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
    
    # join cleaned words
    return " ".join(words)


# combine title and body columns, then clean
df['txt_raw'] = df['title'] + ' ' + df['body']
df['txt_clean'] = df['txt_raw'].apply(clean_txt)


# check that it worked
for col in ['txt_raw', 'txt_clean']:
    print df.iloc[0][col]
    print '--'

Request Colorado Springs Help Us Please Hi I am in need of food for my 4 children we are a military family that has really hit hard times and we have exahusted all means of help just to be able to feed my family and make it through another night is all i ask i know our blessing is coming so whatever u can find in your heart to give is greatly appreciated
--
request colorado springs help us please hi i am in need of food for my children we are a military family that has really hit hard times and we have exahusted all means of help just to be able to feed my family and make it through another night is all i ask i know our blessing is coming so whatever u can find in your heart to give is greatly appreciated
--

In [93]:

# temporal features

dt = pd.to_datetime(df['epoch'], unit='s')
dt = pd.DatetimeIndex(dt)

df['date'] = dt.date
df['day'] = dt.day
df['month'] = dt.month
df['dow'] = dt.dayofweek
df['community_age'] = (dt - min(dt)).days.astype(int)

temporal_cols = [
    'day',
    'month',
    'community_age',
]

print df[['date'] + temporal_cols].head()

         date  day  month  community_age
0  2011-10-05    5     10            232
1  2012-03-25   25      3            404
2  2011-10-26   26     10            253
3  2011-12-02    2     12            290
4  2013-07-12   12      7            878

In [94]:

# status features

status_cols = [
    'karma',
    'prior_raop_posts',
    'prior_posts',
    'requester_age',
]

print df[status_cols].describe()

               karma  prior_raop_posts  prior_posts  requester_age
count    5671.000000       5671.000000  5671.000000    5671.000000
mean     1163.804796          0.058014    21.353024     252.905251
std      3486.076626          0.310860    50.203577     302.625587
min      -173.000000          0.000000     0.000000       0.000000
25%         3.000000          0.000000     0.000000       3.584606
50%       170.000000          0.000000     5.000000     155.647593
75%      1159.000000          0.000000    22.000000     386.932639
max    155010.000000          5.000000   867.000000    2809.750787

In [95]:

# narrative groupings from paper
# source: http://cs.stanford.edu/~althoff/raop-dataset/altruistic_requests_icwsm.pdf

narrative_cats = {
    'money': [
        'money', 'now', 'broke', 'week', 'until', 'time',
        'last', 'day', 'when', 'today', 'tonight', 'paid', 'next',
        'first', 'night', 'after', 'tomorrow', 'month', 'while',
        'account', 'before', 'long', 'Friday', 'rent', 'buy',
        'bank', 'still', 'bills', 'bills', 'ago', 'cash', 'due', 'due',
        'soon', 'past', 'never', 'paycheck', 'check', 'spent',
        'years', 'poor', 'till', 'yesterday', 'morning', 'dollars',
        'financial', 'hour', 'bill', 'evening', 'credit',
        'budget', 'loan', 'bucks', 'deposit', 'dollar', 'current',
        'payed',        
    ],
    'job': [
        'work', 'job', 'paycheck', 'unemployment', 'interview',
        'fired', 'employment', 'hired', 'hire',        
    ],
    'student': [
        'college', 'student', 'school', 'roommate',
        'studying', 'university', 'finals', 'semester',
        'class', 'study', 'project', 'dorm', 'tuition',        
    ],
    'family': [
        'family', 'mom', 'wife', 'parents', 'mother', 'husband',
        'dad', 'son', 'daughter', 'father', 'parent',
        'mum',        
    ],
    'craving': [
        'friend', 'girlfriend', 'craving', 'birthday',
        'boyfriend', 'celebrate', 'party', 'game', 'games',
        'movie', 'date', 'drunk', 'beer', 'celebrating', 'invited',
        'drinks', 'crave', 'wasted', 'invite',        
    ],
}


# list of narrative category names
cat_list = sorted(narrative_cats.keys())
print 'cat list: %s\n' % cat_list


# create word to category mapping
word_to_cats = defaultdict(list)
for cat, words in narrative_cats.iteritems():
    for word in words:
        word_to_cats[word].append(cat)
word_to_cats = dict(word_to_cats)


# check that things are working
print 'checking word to category lookups:'
for word in ['university', 'parent', 'cash']:
    print '%s - categories: %s' % (
        word,
        word_to_cats.get(word, 'NONE')
    )

cat list: ['craving', 'family', 'job', 'money', 'student']

checking word to category lookups:
university - categories: ['student']
parent - categories: ['family']
cash - categories: ['money']

In [96]:

# loop through cleaned text and count occurrences
# of words in each narrative category

def categorize(words):
    cats = defaultdict(int)
    for word in words.split():
        matches = word_to_cats.get(word)
        if matches:
            for m in matches:
                cats[m] += 1
    return dict(cats)


df['txt_cats'] = df['txt_clean'].apply(categorize)


# check that it worked
for i in range(3):
    ex = df.iloc[i]
    print ex['txt_clean']
    print ex['txt_cats']
    print '\n---\n'

request colorado springs help us please hi i am in need of food for my children we are a military family that has really hit hard times and we have exahusted all means of help just to be able to feed my family and make it through another night is all i ask i know our blessing is coming so whatever u can find in your heart to give is greatly appreciated
{'money': 1, 'family': 2}

---

request california no cash and i could use some dinner i spent the last money i had on gas today im broke until next thursday
{'money': 8}

---

request hungry couple in dundee scotland would love some pizza my girlfriend decided it would be a good idea to get off at perth bus station when she was coming to visit me and has since had to spend all her money on a taxi to get to me here in dundee any chance some kind soul would get us some pizza since we don t have any cash anymore
{'money': 3, 'craving': 1}

---

In [97]:

# turn data dict into indiv columns (narrative features)

def to_freq(row, cat):
    cats, txt = row['txt_cats'], row['txt_clean']
    if cats.get(cat) > 0:
        return cats.get(cat) * 1.0 / len(txt.split())
    else:
        return 0

for cat in cat_list:
    df['narr_%s' % cat] = df.apply(lambda row: to_freq(row, cat), axis=1)

# assign variable to the list of these new cols
narrative_cols = [c for c in df.columns if c.startswith('narr_')]

# check that it worked
df[['txt_cats'] + narrative_cols].iloc[0]

Out[97]:

txt_cats        {u'money': 1, u'family': 2}
narr_craving                              0
narr_family                      0.02777778
narr_job                                  0
narr_money                       0.01388889
narr_student                              0
Name: 0, dtype: object

In [98]:

# add a few more, potentially useful features

# has link
df['hyperlink'] = df['body'].apply(lambda x: 1 if re.search("http|www", x) else 0)

# character length of title + body fields
df['txt_chars'] = df['txt_clean'].apply(lambda x: len(x))

# politeness indicator
df['polite'] = df['txt_clean'].apply(lambda x: 1 if re.search("thank|appreciate|advance", x) else 0)

# reciprocity indicator
df['reciprocity'] = df['txt_clean'].apply(lambda x: 
                                           1 if re.search("repay|pay.+back|pay.+forward|return.+favor", x) 
                                           else 0)


# check their distributions
for col in ['polite', 'hyperlink', 'reciprocity']:
    print '%s: %s' % (
        col, 
        df[col].value_counts().to_dict()
    )


# combine these new cols together
additional_cols = [
    'txt_chars',
    'polite',
    'hyperlink',
    'reciprocity',
]

polite: {0: 3344, 1: 2327}
hyperlink: {0: 5267, 1: 404}
reciprocity: {0: 4497, 1: 1174}

In [99]:

# combine features (and check that things look good)
x_cols = temporal_cols + status_cols + narrative_cols + additional_cols

print x_cols
df[x_cols].head()

['day', 'month', 'community_age', 'karma', 'prior_raop_posts', 'prior_posts', 'requester_age', 'narr_craving', 'narr_family', 'narr_job', 'narr_money', 'narr_student', 'txt_chars', 'polite', 'hyperlink', 'reciprocity']

Out[99]:

	day	month	community_age	karma	prior_posts	requester_age	narr_craving	narr_family	narr_money	narr_student	txt_chars	polite
0	5	10	232	0	0	0.000000	0.000000	0.027778	0.013889	0.000000	354	1
1	25	3	404	34	15	501.111100	0.000000	0.000000	0.320000	0.000000	125	0
2	26	10	253	0	0	0.000000	0.014286	0.000000	0.042857	0.000000	338	0
3	2	12	290	54	1	6.518438	0.000000	0.022222	0.022222	0.022222	227	0
4	12	7	878	1121	14	162.063252	0.033898	0.000000	0.033898	0.000000	548	0

In [100]:

# set up framework to quickly iterate on
# different feature sets and algorithm params

def get_data():
    data = df[df['_data'] == 'train'].copy()
    return data


def prep_data(data, input_cols):
    X = data[input_cols].as_matrix()
    y = data['got_pizza'].astype(int).as_matrix()
    
    return X, y

    
def predict(input_cols, model_params={}):
    data = get_data()    
    X, y = prep_data(data, input_cols)
    rando = 123
    
    Xr, Xt, yr, yt = train_test_split(X, y, random_state=rando)

    model_params.update({
        'random_state': rando,
    })
    model = GradientBoostingClassifier(**model_params)
    
    model = model.fit(Xr, yr)
    ypred = model.predict_proba(Xt)[:, 1]
    
    fpr, tpr, thresholds = roc_curve(yt, ypred)
    auc_val = auc(fpr, tpr)
    return auc_val

In [101]:

# try out a few different feature sets + model params

# just narrative features
print predict(narrative_cols)

# just temporal features
print predict(temporal_cols)

# all features
print predict(x_cols)

# all features with more n_estimators
print predict(x_cols, {'n_estimators': 1000})

0.546242789819
0.580598346078
0.688667943786
0.657436579432

In [102]:

# model parameter tuning
# (this takes a little while to run)

param_grid = {
    'n_estimators': [100, 500, 1000],
    'learning_rate': [0.005, 0.01, 0.02],
    'max_depth': [2, 3, 4],
}

model = GradientBoostingClassifier(random_state=123)
grid_search = GridSearchCV(model, param_grid, cv=6, verbose=0, scoring='roc_auc')
grid_search.fit(X_train, y_train)

print grid_search.best_score_
print grid_search.best_params_

0.693051973072
{'n_estimators': 500, 'learning_rate': 0.01, 'max_depth': 4}

In [103]:

# finally, train classifier over entire training set 
# with best params from grid search and save predictions

df_train = df[df['_data'] == 'train'].copy()
X_train = df_train[x_cols].as_matrix()
y_train = df_train['got_pizza'].astype(int).as_matrix()

model = GradientBoostingClassifier(
    n_estimators=500, 
    learning_rate=0.01, 
    max_depth=4, 
    random_state=123
)
model = model.fit(X_train, y_train)

df_test = df[df['_data'] == 'test'].copy()
X_test = df_test[x_cols].as_matrix()

ypred = model.predict_proba(X_test)[:, 1]
df_test['requester_received_pizza'] = ypred

final_df = df_test[['request_id', 'requester_received_pizza']]
final_df.to_csv('../output/predicted.csv', index=False)
print 'boom.'

boom.