#!/usr/bin/env python
# coding: utf-8

# In[89]:

get_ipython().run_line_magic('pylab', 'inline')


# In[90]:

import json, re
from collections import defaultdict

import pandas as pd
import numpy as np

from nltk.corpus import stopwords
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import roc_curve, auc


# In[91]:

dfs = {}
for name in ['train', 'test']:
    df = pd.read_json('../data/%s.json' % name)
    df['_data'] = name
    dfs[name] = df

# combine train and test data into one df
df = dfs['train'].append(dfs['test'])
df = df.reset_index(drop=True)

# limit to the columns shared by both sets (plus the predictor)
cols = list(dfs['test'].columns) + ['requester_received_pizza']
df = df[cols]

# rename a few columns to be pithier
df.rename(columns={
    'request_title': 'title',
    'request_text_edit_aware': 'body',
    'requester_upvotes_minus_downvotes_at_request': 'karma',
    'requester_number_of_posts_at_request': 'prior_posts',
    'requester_number_of_posts_on_raop_at_request': 'prior_raop_posts',
    'requester_account_age_in_days_at_request': 'requester_age',
    'unix_timestamp_of_request_utc': 'epoch',
    'requester_received_pizza': 'got_pizza',
}, inplace=True)

# convert the got-pizza indicator to ints (-1 marks unlabeled test rows)
df['got_pizza'] = df['got_pizza'].apply(lambda x: -1 if pd.isnull(x) else int(x))

df.iloc[0]


# In[92]:

# clean up a text field (lowercase, letters only)
def clean_txt(raw, remove_stop=False):
    # remove non-letters
    letters_only = re.sub("[^a-zA-Z]", " ", raw)

    # convert to lower case, split into individual words
    words = letters_only.lower().split()

    if remove_stop:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]

    # join the cleaned words back together
    return " ".join(words)

# combine title and body columns, then clean
df['txt_raw'] = df['title'] + ' ' + df['body']
df['txt_clean'] = df['txt_raw'].apply(clean_txt)

# check that it worked
for col in ['txt_raw', 'txt_clean']:
    print df.iloc[0][col]
    print '--'


# In[93]:

# temporal features
dt = pd.to_datetime(df['epoch'], unit='s')
dt = pd.DatetimeIndex(dt)

df['date'] = dt.date
df['day'] = dt.day
df['month'] = dt.month
df['dow'] = dt.dayofweek
df['community_age'] = (dt - dt.min()).days.astype(int)

temporal_cols = [
    'day',
    'month',
    'community_age',
]

print df[['date'] + temporal_cols].head()


# In[94]:

# status features
status_cols = [
    'karma',
    'prior_raop_posts',
    'prior_posts',
    'requester_age',
]

print df[status_cols].describe()


# In[95]:

# narrative groupings from the paper
# source: http://cs.stanford.edu/~althoff/raop-dataset/altruistic_requests_icwsm.pdf
# (words are matched against lowercased cleaned text, so they are all lowercase here)
narrative_cats = {
    'money': [
        'money', 'now', 'broke', 'week', 'until', 'time', 'last', 'day', 'when',
        'today', 'tonight', 'paid', 'next', 'first', 'night', 'after', 'tomorrow',
        'month', 'while', 'account', 'before', 'long', 'friday', 'rent', 'buy',
        'bank', 'still', 'bills', 'ago', 'cash', 'due', 'soon', 'past', 'never',
        'paycheck', 'check', 'spent', 'years', 'poor', 'till', 'yesterday',
        'morning', 'dollars', 'financial', 'hour', 'bill', 'evening', 'credit',
        'budget', 'loan', 'bucks', 'deposit', 'dollar', 'current', 'payed',
    ],
    'job': [
        'work', 'job', 'paycheck', 'unemployment', 'interview', 'fired',
        'employment', 'hired', 'hire',
    ],
    'student': [
        'college', 'student', 'school', 'roommate', 'studying', 'university',
        'finals', 'semester', 'class', 'study', 'project', 'dorm', 'tuition',
    ],
    'family': [
        'family', 'mom', 'wife', 'parents', 'mother', 'husband', 'dad', 'son',
        'daughter', 'father', 'parent', 'mum',
    ],
    'craving': [
        'friend', 'girlfriend', 'craving', 'birthday', 'boyfriend', 'celebrate',
        'party', 'game', 'games', 'movie', 'date', 'drunk', 'beer', 'celebrating',
        'invited', 'drinks', 'crave', 'wasted', 'invite',
    ],
}

# list of narrative category names
cat_list = sorted(narrative_cats.keys())
print 'cat list: %s\n' % cat_list

# create a word -> categories mapping (a word can belong to more than one category)
word_to_cats = defaultdict(list)
for cat, words in narrative_cats.iteritems():
    for word in words:
        word_to_cats[word].append(cat)
word_to_cats = dict(word_to_cats)

# check that things are working
print 'checking word to category lookups:'
for word in ['university', 'parent', 'cash']:
    print '%s - categories: %s' % (
        word, word_to_cats.get(word, 'NONE')
    )
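
# In[ ]:

# (illustrative aside, not from the original notebook: 'paycheck' is listed
# under both 'money' and 'job' above, so a single word can contribute to
# multiple narrative categories at once)
print word_to_cats['paycheck']  # expect 'money' and 'job', in either order
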
# In[96]:

# loop through the cleaned text and count occurrences
# of words in each narrative category
def categorize(words):
    cats = defaultdict(int)
    for word in words.split():
        matches = word_to_cats.get(word)
        if matches:
            for m in matches:
                cats[m] += 1
    return dict(cats)

df['txt_cats'] = df['txt_clean'].apply(categorize)

# check that it worked
for i in range(3):
    ex = df.iloc[i]
    print ex['txt_clean']
    print ex['txt_cats']
    print '\n---\n'


# In[97]:

# turn the category-count dicts into individual columns (narrative features),
# normalizing each count by the number of words in the request
def to_freq(row, cat):
    cats, txt = row['txt_cats'], row['txt_clean']
    if cats.get(cat, 0) > 0:
        return cats[cat] * 1.0 / len(txt.split())
    else:
        return 0

for cat in cat_list:
    df['narr_%s' % cat] = df.apply(lambda row: to_freq(row, cat), axis=1)

# assign a variable to the list of these new cols
narrative_cols = [c for c in df.columns if c.startswith('narr_')]

# check that it worked
df[['txt_cats'] + narrative_cols].iloc[0]


# In[98]:

# add a few more, potentially useful features

# has a hyperlink
df['hyperlink'] = df['body'].apply(lambda x: 1 if re.search("http|www", x) else 0)

# character length of title + body fields
df['txt_chars'] = df['txt_clean'].apply(len)

# politeness indicator
df['polite'] = df['txt_clean'].apply(lambda x: 1 if re.search("thank|appreciate|advance", x) else 0)

# reciprocity indicator
df['reciprocity'] = df['txt_clean'].apply(lambda x: 1 if re.search("repay|pay.+back|pay.+forward|return.+favor", x) else 0)

# check their distributions
for col in ['polite', 'hyperlink', 'reciprocity']:
    print '%s: %s' % (
        col, df[col].value_counts().to_dict()
    )

# combine these new cols together
additional_cols = [
    'txt_chars',
    'polite',
    'hyperlink',
    'reciprocity',
]


# In[99]:

# combine all feature groups (and check that things look good)
x_cols = temporal_cols + status_cols + narrative_cols + additional_cols
print x_cols
df[x_cols].head()


# In[100]:

# set up a small framework to quickly iterate on
# different feature sets and algorithm params

def get_data():
    data = df[df['_data'] == 'train'].copy()
    return data

def prep_data(data, input_cols):
    X = data[input_cols].as_matrix()
    y = data['got_pizza'].astype(int).as_matrix()
    return X, y

def predict(input_cols, model_params=None):
    data = get_data()
    X, y = prep_data(data, input_cols)

    rando = 123
    Xr, Xt, yr, yt = train_test_split(X, y, random_state=rando)

    # copy so we never mutate a dict shared across calls
    model_params = dict(model_params or {})
    model_params.update({
        'random_state': rando,
    })

    model = GradientBoostingClassifier(**model_params)
    model = model.fit(Xr, yr)

    ypred = model.predict_proba(Xt)[:, 1]
    fpr, tpr, thresholds = roc_curve(yt, ypred)
    return auc(fpr, tpr)


# In[101]:

# try out a few different feature sets + model params

# just narrative features
print predict(narrative_cols)

# just temporal features
print predict(temporal_cols)

# all features
print predict(x_cols)

# all features with more n_estimators
print predict(x_cols, {'n_estimators': 1000})
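
# In[ ]:

# (illustrative sketch, not from the original notebook: the single train/test
# split above gives a noisy AUC estimate; cross_val_score -- from the same
# deprecated sklearn.cross_validation module used above -- averages the score
# over several folds for a steadier read)
from sklearn.cross_validation import cross_val_score

data = get_data()
X, y = prep_data(data, x_cols)
scores = cross_val_score(
    GradientBoostingClassifier(random_state=123),
    X, y, cv=5, scoring='roc_auc')
print 'mean AUC over 5 folds: %.3f (+/- %.3f)' % (scores.mean(), scores.std())
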
# In[102]:

# model parameter tuning
# (this takes a little while to run)

# prepare the full training matrices via the helpers above
data = get_data()
X_train, y_train = prep_data(data, x_cols)

param_grid = {
    'n_estimators': [100, 500, 1000],
    'learning_rate': [0.005, 0.01, 0.02],
    'max_depth': [2, 3, 4],
}

model = GradientBoostingClassifier(random_state=123)
grid_search = GridSearchCV(model, param_grid, cv=6, verbose=0, scoring='roc_auc')
grid_search.fit(X_train, y_train)

print grid_search.best_score_
print grid_search.best_params_


# In[103]:

# finally, train the classifier over the entire training set
# with the best params from the grid search and save predictions

df_train = df[df['_data'] == 'train'].copy()
X_train = df_train[x_cols].as_matrix()
y_train = df_train['got_pizza'].astype(int).as_matrix()

model = GradientBoostingClassifier(
    n_estimators=500,
    learning_rate=0.01,
    max_depth=4,
    random_state=123
)
model = model.fit(X_train, y_train)

df_test = df[df['_data'] == 'test'].copy()
X_test = df_test[x_cols].as_matrix()
ypred = model.predict_proba(X_test)[:, 1]

df_test['requester_received_pizza'] = ypred
final_df = df_test[['request_id', 'requester_received_pizza']]
final_df.to_csv('../output/predicted.csv', index=False)

print 'boom.'
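
# In[ ]:

# (illustrative check, not part of the original pipeline: a fitted
# GradientBoostingClassifier exposes feature_importances_, which pairs up
# with x_cols to show which features the final model leaned on most)
importances = sorted(zip(model.feature_importances_, x_cols), reverse=True)
for imp, col in importances[:10]:
    print '%.3f  %s' % (imp, col)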