#!/usr/bin/env python
# coding: utf-8

# In[89]:

get_ipython().run_line_magic('pylab', 'inline')


# In[90]:

import json, re
from collections import defaultdict

import pandas as pd
import numpy as np

from nltk.corpus import stopwords
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import roc_curve, auc


# In[91]:

dfs = {}
for name in ['train', 'test']:
    df = pd.read_json('../data/%s.json' % name)
    df['_data'] = name
    dfs[name] = df

# combine train and test data into one df
df = dfs['train'].append(dfs['test'])
df = df.reset_index(drop=True)

# limit to the columns shared by both sets (plus the predictor)
cols = list(dfs['test'].columns) + ['requester_received_pizza']
df = df[cols]

# rename a few columns to be pithier
df.rename(columns={
    'request_title': 'title',
    'request_text_edit_aware': 'body',
    'requester_upvotes_minus_downvotes_at_request': 'karma',
    'requester_number_of_posts_at_request': 'prior_posts',
    'requester_number_of_posts_on_raop_at_request': 'prior_raop_posts',
    'requester_account_age_in_days_at_request': 'requester_age',
    'unix_timestamp_of_request_utc': 'epoch',
    'requester_received_pizza': 'got_pizza',
}, inplace=True)

# convert the got-pizza indicator to ints (-1 marks unlabeled test rows)
df['got_pizza'] = df['got_pizza'].apply(lambda x: -1 if pd.isnull(x) else int(x))

df.iloc[0]


# In[92]:

# clean up a text field (lowercase, letters only)
def clean_txt(raw, remove_stop=False):
    # remove non-letters
    letters_only = re.sub("[^a-zA-Z]", " ", raw)

    # convert to lower case, split into individual words
    words = letters_only.lower().split()

    if remove_stop:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]

    # join the cleaned words back together
    return " ".join(words)

# combine title and body columns, then clean
df['txt_raw'] = df['title'] + ' ' + df['body']
df['txt_clean'] = df['txt_raw'].apply(clean_txt)

# check that it worked
for col in ['txt_raw', 'txt_clean']:
    print df.iloc[0][col]
    print '--'


# In[93]:

# temporal features
dt = pd.to_datetime(df['epoch'], unit='s')
dt = pd.DatetimeIndex(dt)

df['date'] = dt.date
df['day'] = dt.day
df['month'] = dt.month
df['dow'] = dt.dayofweek
df['community_age'] = (dt - dt.min()).days.astype(int)

temporal_cols = [
    'day',
    'month',
    'community_age',
]

print df[['date'] + temporal_cols].head()


# In[94]:

# status features
status_cols = [
    'karma',
    'prior_raop_posts',
    'prior_posts',
    'requester_age',
]

print df[status_cols].describe()


# In[95]:

# narrative groupings from the paper
# source: http://cs.stanford.edu/~althoff/raop-dataset/altruistic_requests_icwsm.pdf
# (words are matched against lowercased cleaned text, so they are all lowercase here)
narrative_cats = {
    'money': [
        'money', 'now', 'broke', 'week', 'until', 'time', 'last', 'day', 'when',
        'today', 'tonight', 'paid', 'next', 'first', 'night', 'after', 'tomorrow',
        'month', 'while', 'account', 'before', 'long', 'friday', 'rent', 'buy',
        'bank', 'still', 'bills', 'ago', 'cash', 'due', 'soon', 'past', 'never',
        'paycheck', 'check', 'spent', 'years', 'poor', 'till', 'yesterday',
        'morning', 'dollars', 'financial', 'hour', 'bill', 'evening', 'credit',
        'budget', 'loan', 'bucks', 'deposit', 'dollar', 'current', 'payed',
    ],
    'job': [
        'work', 'job', 'paycheck', 'unemployment', 'interview', 'fired',
        'employment', 'hired', 'hire',
    ],
    'student': [
        'college', 'student', 'school', 'roommate', 'studying', 'university',
        'finals', 'semester', 'class', 'study', 'project', 'dorm', 'tuition',
    ],
    'family': [
        'family', 'mom', 'wife', 'parents', 'mother', 'husband', 'dad', 'son',
        'daughter', 'father', 'parent', 'mum',
    ],
    'craving': [
        'friend', 'girlfriend', 'craving', 'birthday', 'boyfriend', 'celebrate',
        'party', 'game', 'games', 'movie', 'date', 'drunk', 'beer', 'celebrating',
        'invited', 'drinks', 'crave', 'wasted', 'invite',
    ],
}

# list of narrative category names
cat_list = sorted(narrative_cats.keys())
print 'cat list: %s\n' % cat_list

# create a word -> categories mapping (a word can belong to more than one category)
word_to_cats = defaultdict(list)
for cat, words in narrative_cats.iteritems():
    for word in words:
        word_to_cats[word].append(cat)
word_to_cats = dict(word_to_cats)

# check that things are working
print 'checking word to category lookups:'
for word in ['university', 'parent', 'cash']:
    print '%s - categories: %s' % (
        word, word_to_cats.get(word, 'NONE')
    )
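
# In[ ]:

# (illustrative aside, not from the original notebook: 'paycheck' is listed
# under both 'money' and 'job' above, so a single word can contribute to
# multiple narrative categories at once)
print word_to_cats['paycheck']  # expect 'money' and 'job', in either order
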
# In[96]:

# loop through the cleaned text and count occurrences
# of words in each narrative category
def categorize(words):
    cats = defaultdict(int)
    for word in words.split():
        matches = word_to_cats.get(word)
        if matches:
            for m in matches:
                cats[m] += 1
    return dict(cats)

df['txt_cats'] = df['txt_clean'].apply(categorize)

# check that it worked
for i in range(3):
    ex = df.iloc[i]
    print ex['txt_clean']
    print ex['txt_cats']
    print '\n---\n'


# In[97]:

# turn the category-count dicts into individual columns (narrative features),
# normalizing each count by the number of words in the request
def to_freq(row, cat):
    cats, txt = row['txt_cats'], row['txt_clean']
    if cats.get(cat, 0) > 0:
        return cats[cat] * 1.0 / len(txt.split())
    else:
        return 0

for cat in cat_list:
    df['narr_%s' % cat] = df.apply(lambda row: to_freq(row, cat), axis=1)

# assign a variable to the list of these new cols
narrative_cols = [c for c in df.columns if c.startswith('narr_')]

# check that it worked
df[['txt_cats'] + narrative_cols].iloc[0]


# In[98]:

# add a few more, potentially useful features

# has a hyperlink
df['hyperlink'] = df['body'].apply(lambda x: 1 if re.search("http|www", x) else 0)

# character length of title + body fields
df['txt_chars'] = df['txt_clean'].apply(len)

# politeness indicator
df['polite'] = df['txt_clean'].apply(lambda x: 1 if re.search("thank|appreciate|advance", x) else 0)

# reciprocity indicator
df['reciprocity'] = df['txt_clean'].apply(lambda x: 1 if re.search("repay|pay.+back|pay.+forward|return.+favor", x) else 0)

# check their distributions
for col in ['polite', 'hyperlink', 'reciprocity']:
    print '%s: %s' % (
        col, df[col].value_counts().to_dict()
    )

# combine these new cols together
additional_cols = [
    'txt_chars',
    'polite',
    'hyperlink',
    'reciprocity',
]


# In[99]:

# combine all feature groups (and check that things look good)
x_cols = temporal_cols + status_cols + narrative_cols + additional_cols
print x_cols
df[x_cols].head()


# In[100]:

# set up a small framework to quickly iterate on
# different feature sets and algorithm params

def get_data():
    data = df[df['_data'] == 'train'].copy()
    return data

def prep_data(data, input_cols):
    X = data[input_cols].as_matrix()
    y = data['got_pizza'].astype(int).as_matrix()
    return X, y

def predict(input_cols, model_params=None):
    data = get_data()
    X, y = prep_data(data, input_cols)

    rando = 123
    Xr, Xt, yr, yt = train_test_split(X, y, random_state=rando)

    # copy so we never mutate a dict shared across calls
    model_params = dict(model_params or {})
    model_params.update({
        'random_state': rando,
    })

    model = GradientBoostingClassifier(**model_params)
    model = model.fit(Xr, yr)

    ypred = model.predict_proba(Xt)[:, 1]
    fpr, tpr, thresholds = roc_curve(yt, ypred)
    return auc(fpr, tpr)


# In[101]:

# try out a few different feature sets + model params

# just narrative features
print predict(narrative_cols)

# just temporal features
print predict(temporal_cols)

# all features
print predict(x_cols)

# all features with more n_estimators
print predict(x_cols, {'n_estimators': 1000})
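
# In[ ]:

# (illustrative sketch, not from the original notebook: the single train/test
# split above gives a noisy AUC estimate; cross_val_score -- from the same
# deprecated sklearn.cross_validation module used above -- averages the score
# over several folds for a steadier read)
from sklearn.cross_validation import cross_val_score

data = get_data()
X, y = prep_data(data, x_cols)
scores = cross_val_score(
    GradientBoostingClassifier(random_state=123),
    X, y, cv=5, scoring='roc_auc')
print 'mean AUC over 5 folds: %.3f (+/- %.3f)' % (scores.mean(), scores.std())
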
# In[102]:

# model parameter tuning
# (this takes a little while to run)

# prepare the full training matrices via the helpers above
data = get_data()
X_train, y_train = prep_data(data, x_cols)

param_grid = {
    'n_estimators': [100, 500, 1000],
    'learning_rate': [0.005, 0.01, 0.02],
    'max_depth': [2, 3, 4],
}

model = GradientBoostingClassifier(random_state=123)
grid_search = GridSearchCV(model, param_grid, cv=6, verbose=0, scoring='roc_auc')
grid_search.fit(X_train, y_train)

print grid_search.best_score_
print grid_search.best_params_


# In[103]:

# finally, train the classifier over the entire training set
# with the best params from the grid search and save predictions

df_train = df[df['_data'] == 'train'].copy()
X_train = df_train[x_cols].as_matrix()
y_train = df_train['got_pizza'].astype(int).as_matrix()

model = GradientBoostingClassifier(
    n_estimators=500,
    learning_rate=0.01,
    max_depth=4,
    random_state=123
)
model = model.fit(X_train, y_train)

df_test = df[df['_data'] == 'test'].copy()
X_test = df_test[x_cols].as_matrix()
ypred = model.predict_proba(X_test)[:, 1]

df_test['requester_received_pizza'] = ypred
final_df = df_test[['request_id', 'requester_received_pizza']]
final_df.to_csv('../output/predicted.csv', index=False)

print 'boom.'
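
# In[ ]:

# (illustrative check, not part of the original pipeline: a fitted
# GradientBoostingClassifier exposes feature_importances_, which pairs up
# with x_cols to show which features the final model leaned on most)
importances = sorted(zip(model.feature_importances_, x_cols), reverse=True)
for imp, col in importances[:10]:
    print '%.3f  %s' % (imp, col)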