import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import folium as fm
import geopy
from sklearn import linear_model, naive_bayes, feature_selection, metrics, tree
from sklearn.cross_validation import train_test_split
%matplotlib inline
# Load the pre-processed tips, the per-tip adjective indicator matrix, and the
# adjective frequency table produced by earlier pipeline stages.
tips_with_adjectives_1 = pd.read_pickle('./dumps/tips_with_adjectives.pkl')
adj_dummies = pd.read_pickle('./dumps/adjective_dataframe.pkl')
adj_df = pd.read_csv('./dumps/adjective_count_list.csv')
# Sanity check: the dummy matrix must align row-for-row with the tips frame.
# (print() replaces the Python-2 print statement, a syntax error on Python 3.)
print(len(adj_dummies))
print(len(tips_with_adjectives_1))
15527 15527
# Keep only adjectives appearing in more than 10 tips; rarer words are too
# sparse to act as useful predictors downstream.
adjective_list = list(adj_df[adj_df['count'] > 10]['word'])
print("Number of tips: ", len(tips_with_adjectives_1))
print("Number of adjectives: ", len(adj_df))
print("Number of significant adjectives (appears in more than 10 tips): ", len(adjective_list))
Number of tips: 15527 Number of adjectives: 1838 Number of significant adjectives (appears in more than 10 tips): 255
# (address construction retained for reference)
# tips_with_adjectives['address'] = tips_with_adjectives.apply(lambda x: "{0} {1} {2}, {3}, New York, NY".format(x['BUILDING'].strip(), x['STREET'].strip(), int(x['ZIPCODE']), x['BORO']), axis=1)
latlong_df = pd.read_pickle('./dumps/with_lat_long.pkl')[['foursquare_id', 'lat_long']]
# Attach the adjective dummy columns to each tip row (index-aligned join).
tips_adj_df = tips_with_adjectives_1.join(adj_dummies)
# Keep one row per (venue, tip text) pair: the source data can repeat tips
# across multiple inspection records.
tips_adj_df.drop_duplicates(['foursquare_id', 'description'], inplace=True)
tips_adj_df = tips_adj_df.merge(latlong_df, on='foursquare_id', how='left')
# A bare `len(tips_adj_df)` expression only displays in a notebook; print it
# explicitly so the script reports the post-dedupe row count.
print(len(tips_adj_df))
10762
# desired_columns = [
# 'foursquare_id',
# 'DBA',
# 'description',
# 'tip_words',
# 'tip_adjs',
# 'adj_string',
# 'foursquare_rating',
# 'foursquare_num_of_users',
# 'foursquare_price_tier',
# 'grade_A',
# 'grade_C',
# 'GRADE',
# 'lat_long'
# ]
# tips_df = tips_with_adjectives[desired_columns]
# len(tips_df)
def score_and_predict(model, x_features, y_targets, columns, model_type):
    """Score a fitted classifier/regressor and print its top adjective predictors.

    Parameters
    ----------
    model : fitted sklearn estimator exposing ``score``, ``predict``, ``coef_``
    x_features : array-like feature matrix (adjective indicator columns)
    y_targets : array-like binary target aligned with ``x_features``
    columns : feature names, one per column of ``x_features``
    model_type : one of 'naive-bayes', 'logistic', 'linear' — controls how
        coefficients are reported (exponentiated log-odds vs. raw values)

    Returns
    -------
    pandas.DataFrame with columns adjective / coef / p-value, sorted by
    ascending p-value then descending coefficient.

    Raises
    ------
    ValueError if ``model_type`` is not one of the three supported strings
    (previously an unknown value fell through and raised NameError later).
    """
    score = model.score(x_features, y_targets)
    y_pred = model.predict(x_features)
    auc = metrics.roc_auc_score(y_targets, y_pred)
    # Univariate ANOVA F-test per feature; p_values[0] holds the F statistics,
    # p_values[1] the p-values — index [0] below matches the original usage.
    p_values = feature_selection.f_classif(x_features, y_targets)
    if model_type in ('naive-bayes', 'logistic'):
        # coef_ holds log-probabilities / log-odds; exponentiate for readability.
        coef_list = [np.exp(round(x, 4)) for x in model.coef_[0]]
    elif model_type == 'linear':
        coef_list = [round(x, 4) for x in model.coef_]
    else:
        raise ValueError("unknown model_type: {0}".format(model_type))
    model_df = pd.DataFrame({'adjective': columns, 'p-value': p_values[0], 'coef': coef_list})
    # DataFrame.sort() was removed in pandas 0.20; sort_values is the
    # equivalent modern call.
    model_df.sort_values(['p-value', 'coef'], ascending=[True, False], inplace=True)
    print('MODEL: ', model)
    print('SCORE: ', score)
    print('AUC: ', auc)
    print('\n')
    print('TOP PREDICTORS (p-value < 0.05):')
    print(model_df[model_df['p-value'] <= 0.05])
    print('\n')
    # Returning the frame is backward-compatible (callers ignore the result)
    # and makes the ranking reusable outside the printed report.
    return model_df
# for index, column in enumerate(tips_adj_df.columns.values):
#     print column, index
# ADJECTIVE COLUMNS ARE 31 till second to last column
# based on adjectives included in tip descriptions
# .ix was removed from pandas; .iloc is the positional equivalent for the
# integer slice 34:-1 (last column is lat_long, excluded).
X_adjs = tips_adj_df.iloc[:, 34:-1]
# based on ratings, number of users, and price tier
# NOTE(review): dropna(axis=1) drops any of the three columns that contain a
# NaN anywhere — if the intent was to drop incomplete *rows*, this should be
# axis=0; confirm (X_foursquare_info is not used further in this chunk).
X_foursquare_info = tips_adj_df[['foursquare_rating', 'foursquare_num_of_users', 'foursquare_price_tier']].dropna(axis=1)
y = tips_adj_df['grade_A']
# Fixed random_state keeps the train/test split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X_adjs, y, random_state=8)
clf_multi_nb = naive_bayes.MultinomialNB()
clf_multi_nb = clf_multi_nb.fit(X_train, y_train)
print('Using training set')
score_and_predict(clf_multi_nb, X_train, y_train, X_adjs.columns, 'naive-bayes')
print('Using testing set')
score_and_predict(clf_multi_nb, X_test, y_test, X_adjs.columns, 'naive-bayes')
print('Using all data')
score_and_predict(clf_multi_nb, X_adjs.values, y.values, X_adjs.columns, 'naive-bayes')
Using training set MODEL: MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True) SCORE: 0.901251393879 AUC: 0.502431233668 TOP PREDICTORS (p-value < 0.05): adjective coef p-value 163 other 0.005634 0.000050 26 cant 0.009299 0.000081 73 epic 0.000679 0.000087 210 spacious 0.000679 0.000087 250 yellow 0.000679 0.000087 87 flavorful 0.001290 0.000174 5 affordable 0.001901 0.000261 96 general 0.002579 0.001122 209 solid 0.001968 0.001918 122 irish 0.001358 0.003553 215 steak 0.006109 0.004008 95 garlic 0.004277 0.004226 46 cozy 0.001833 0.006104 115 hot 0.012015 0.006348 227 terrible 0.002647 0.007124 158 olive 0.000747 0.008315 237 unbelievable 0.000747 0.008315 38 clean 0.004548 0.008604 138 long 0.005430 0.009899 84 first 0.004209 0.011209 56 dish 0.002987 0.013767 140 magic 0.000611 0.014504 151 natural 0.000611 0.014504 191 sad 0.000611 0.014504 216 stellar 0.000611 0.014504 119 indian 0.001765 0.020247 157 old 0.004005 0.020831 223 sweet 0.007738 0.023556 118 incredible 0.002104 0.025013 169 personal 0.001154 0.029040 170 phenomenal 0.001154 0.029040 228 terrific 0.001154 0.029040 107 grilled 0.005974 0.031371 81 fat 0.000815 0.033534 127 kid 0.000815 0.033534 162 original 0.000815 0.033534 178 public 0.000815 0.033534 239 usual 0.000815 0.033534 54 different 0.002308 0.033618 82 favorite 0.006652 0.038926 Using testing set MODEL: MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True) SCORE: 0.888888888889 AUC: 0.5 TOP PREDICTORS (p-value < 0.05): adjective coef p-value 83 few 0.003665 0.000000 56 dish 0.002987 0.000000 14 baked 0.002715 0.000000 93 full 0.002579 0.000000 120 inexpensive 0.001290 0.000000 35 chinese 0.006517 0.002318 179 quick 0.003462 0.004851 130 large 0.003937 0.006621 114 horrible 0.003326 0.006621 154 next 0.003326 0.006621 124 ive 0.008349 0.009112 113 high 0.002715 0.012537 118 incredible 0.002104 0.012537 68 eggplant 0.001833 0.012537 146 mediocre 0.001222 0.012537 149 much 0.008077 0.014883 10 available 0.002104 0.015660 47 crazy 
0.001968 0.015660 199 short 0.001901 0.015660 25 busy 0.001629 0.015660 104 greasy 0.001154 0.015660 133 later 0.001154 0.015660 163 other 0.005634 0.020173 116 huge 0.004141 0.025169 254 yummy 0.006992 0.040728 131 last 0.002579 0.045608 54 different 0.002308 0.045608 23 bubble 0.002240 0.045608 5 affordable 0.001901 0.045608 51 decent 0.005023 0.049499 Using all data MODEL: MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True) SCORE: 0.898160193273 AUC: 0.50176809245 TOP PREDICTORS (p-value < 0.05): adjective coef p-value 230 tiny 0.001968 0.000085 151 natural 0.000611 0.000490 3 addictive 0.000543 0.000490 228 terrific 0.001154 0.000981 238 unique 0.001086 0.000981 169 personal 0.001154 0.002052 54 different 0.002308 0.002205 163 other 0.005634 0.002261 90 fresh 0.014595 0.002581 13 bad 0.008077 0.003905 66 eat 0.009299 0.007615 29 casual 0.000407 0.007943 224 swiss 0.000407 0.007943 9 authentic 0.003190 0.008253 180 quiet 0.001968 0.009322 5 affordable 0.001901 0.009322 40 close 0.001901 0.009322 58 dont 0.025929 0.009461 182 real 0.003734 0.010006 95 garlic 0.004277 0.010360 1 accept 0.001154 0.010862 30 central 0.001154 0.010862 56 dish 0.002987 0.011253 140 magic 0.000611 0.015087 191 sad 0.000611 0.015087 204 slow 0.007195 0.016624 157 old 0.004005 0.018804 79 fantastic 0.004887 0.020690 38 clean 0.004548 0.023304 119 indian 0.001765 0.024510 202 simple 0.001697 0.024510 20 black 0.002851 0.029148 57 dive 0.001018 0.030205 143 many 0.003055 0.032713 130 large 0.003937 0.036375 251 youll 0.003937 0.036375 26 cant 0.009299 0.037648 118 incredible 0.002104 0.038944 110 healthy 0.002308 0.039847 209 solid 0.001968 0.039847 45 cool 0.004277 0.043770 234 turkish 0.000815 0.044770 193 saltfish 0.000475 0.045706 178 public 0.000815 0.046227 210 spacious 0.000679 0.046227 15 basic 0.000475 0.046227
# Linear regression on the same grade_A target — the R^2 score is expected to
# be low for a binary target; the coefficient ranking is the point here.
clf_linear = linear_model.LinearRegression()
clf_linear = clf_linear.fit(X_train, y_train)
print('Using training set')
score_and_predict(clf_linear, X_train, y_train, X_adjs.columns, 'linear')
print('Using testing set')
score_and_predict(clf_linear, X_test, y_test, X_adjs.columns, 'linear')
print('Using all data')
score_and_predict(clf_linear, X_adjs.values, y.values, X_adjs.columns, 'linear')
Using training set MODEL: LinearRegression(copy_X=True, fit_intercept=True, normalize=False) SCORE: 0.0306220058522 AUC: 0.65453453789 TOP PREDICTORS (p-value < 0.05): adjective coef p-value 163 other -0.0005 0.000050 26 cant 0.0110 0.000081 73 epic 0.0196 0.000087 250 yellow 0.0143 0.000087 210 spacious 0.0123 0.000087 87 flavorful 0.0218 0.000174 5 affordable 0.0061 0.000261 96 general 0.0292 0.001122 209 solid 0.0167 0.001918 122 irish -0.0082 0.003553 215 steak -0.0008 0.004008 95 garlic -0.0077 0.004226 46 cozy -0.0200 0.006104 115 hot 0.0042 0.006348 227 terrible -0.0021 0.007124 158 olive 0.0464 0.008315 237 unbelievable 0.0079 0.008315 38 clean 0.0037 0.008604 138 long -0.0036 0.009899 84 first -0.0020 0.011209 56 dish 0.0091 0.013767 151 natural 0.0361 0.014504 191 sad 0.0085 0.014504 216 stellar -0.0123 0.014504 140 magic -0.0354 0.014504 119 indian -0.0069 0.020247 157 old -0.0030 0.020831 223 sweet 0.0028 0.023556 118 incredible 0.0127 0.025013 228 terrific 0.0222 0.029040 169 personal -0.0176 0.029040 170 phenomenal -0.0186 0.029040 107 grilled 0.0041 0.031371 239 usual 0.0336 0.033534 127 kid 0.0176 0.033534 162 original 0.0168 0.033534 81 fat 0.0163 0.033534 178 public 0.0145 0.033534 54 different -0.0040 0.033618 82 favorite 0.0016 0.038926 Using testing set MODEL: LinearRegression(copy_X=True, fit_intercept=True, normalize=False) SCORE: -0.0270734866002 AUC: 0.514883782061 TOP PREDICTORS (p-value < 0.05): adjective coef p-value 120 inexpensive 0.1140 0.000000 93 full 0.0438 0.000000 14 baked 0.0327 0.000000 83 few 0.0294 0.000000 56 dish 0.0091 0.000000 35 chinese -0.0829 0.002318 179 quick -0.0168 0.004851 154 next 0.0210 0.006621 130 large -0.0158 0.006621 114 horrible -0.0324 0.006621 124 ive -0.0264 0.009112 113 high 0.0792 0.012537 118 incredible 0.0127 0.012537 68 eggplant -0.0432 0.012537 146 mediocre -0.1384 0.012537 149 much 0.0396 0.014883 199 short 0.1068 0.015660 47 crazy 0.0316 0.015660 25 busy -0.0378 0.015660 104 greasy -0.0779 
0.015660 10 available -0.0937 0.015660 133 later -0.1298 0.015660 163 other -0.0005 0.020173 116 huge -0.0639 0.025169 254 yummy 0.0469 0.040728 23 bubble 0.0391 0.045608 5 affordable 0.0061 0.045608 54 different -0.0040 0.045608 131 last -0.0163 0.045608 51 decent 0.0553 0.049499 Using all data MODEL: LinearRegression(copy_X=True, fit_intercept=True, normalize=False) SCORE: 0.0153673116839 AUC: 0.616453415498 TOP PREDICTORS (p-value < 0.05): adjective coef p-value 230 tiny -0.0225 0.000085 3 addictive 0.0950 0.000490 151 natural 0.0361 0.000490 228 terrific 0.0222 0.000981 238 unique -0.0312 0.000981 169 personal -0.0176 0.002052 54 different -0.0040 0.002205 163 other -0.0005 0.002261 90 fresh 0.0131 0.002581 13 bad -0.0062 0.003905 66 eat -0.0031 0.007615 224 swiss 0.0856 0.007943 29 casual -0.0786 0.007943 9 authentic 0.0092 0.008253 180 quiet 0.0412 0.009322 5 affordable 0.0061 0.009322 40 close -0.0286 0.009322 58 dont 0.0163 0.009461 182 real 0.0094 0.010006 95 garlic -0.0077 0.010360 1 accept 0.0405 0.010862 30 central 0.0343 0.010862 56 dish 0.0091 0.011253 191 sad 0.0085 0.015087 140 magic -0.0354 0.015087 204 slow -0.0116 0.016624 157 old -0.0030 0.018804 79 fantastic -0.0188 0.020690 38 clean 0.0037 0.023304 202 simple 0.0226 0.024510 119 indian -0.0069 0.024510 20 black 0.0394 0.029148 57 dive -0.0270 0.030205 143 many 0.0036 0.032713 130 large -0.0158 0.036375 251 youll -0.0307 0.036375 26 cant 0.0110 0.037648 118 incredible 0.0127 0.038944 110 healthy 0.0384 0.039847 209 solid 0.0167 0.039847 45 cool 0.0238 0.043770 234 turkish -0.0607 0.044770 193 saltfish -0.0485 0.045706 15 basic 0.1099 0.046227 178 public 0.0145 0.046227 210 spacious 0.0123 0.046227
# Logistic regression on grade_A; coefficients are reported as exponentiated
# log-odds by score_and_predict.
clf_logistic = linear_model.LogisticRegression()
clf_logistic.fit(X_train, y_train)
print('Using training set')
score_and_predict(clf_logistic, X_train, y_train, X_adjs.columns, 'logistic')
print('Using testing set')
score_and_predict(clf_logistic, X_test, y_test, X_adjs.columns, 'logistic')
print('Using all data')
score_and_predict(clf_logistic, X_adjs.values, y.values, X_adjs.columns, 'logistic')
Using training set MODEL: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001) SCORE: 0.901003593111 AUC: 0.500625 TOP PREDICTORS (p-value < 0.05): adjective coef p-value 163 other 1.057386 0.000050 26 cant 1.184831 0.000081 73 epic 1.098779 0.000087 250 yellow 1.097571 0.000087 210 spacious 1.052954 0.000087 87 flavorful 1.143622 0.000174 5 affordable 1.044564 0.000261 96 general 1.248696 0.001122 209 solid 1.152807 0.001918 122 irish 0.962232 0.003553 215 steak 0.986887 0.004008 95 garlic 0.938474 0.004226 46 cozy 0.893776 0.006104 115 hot 1.031176 0.006348 227 terrible 1.015316 0.007124 158 olive 1.137463 0.008315 237 unbelievable 1.029219 0.008315 38 clean 1.051376 0.008604 138 long 0.981474 0.009899 84 first 0.957337 0.011209 56 dish 1.086107 0.013767 151 natural 1.067479 0.014504 191 sad 1.002704 0.014504 216 stellar 0.941576 0.014504 140 magic 0.860020 0.014504 119 indian 0.969282 0.020247 157 old 0.971514 0.020831 223 sweet 1.015316 0.023556 118 incredible 1.085239 0.025013 228 terrific 1.048332 0.029040 169 personal 0.903481 0.029040 170 phenomenal 0.886300 0.029040 107 grilled 1.022244 0.031371 239 usual 1.144651 0.033534 81 fat 1.101199 0.033534 162 original 1.086650 0.033534 127 kid 1.086107 0.033534 178 public 1.072186 0.033534 54 different 0.943933 0.033618 82 favorite 1.031589 0.038926 Using testing set MODEL: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001) SCORE: 0.888888888889 AUC: 0.5 TOP PREDICTORS (p-value < 0.05): adjective coef p-value 120 inexpensive 2.360091 0.000000 93 full 1.620282 0.000000 83 few 1.384307 0.000000 14 baked 1.355134 0.000000 56 dish 1.086107 0.000000 35 chinese 0.505807 0.002318 179 quick 0.853679 0.004851 154 next 1.204061 0.006621 130 large 0.860708 0.006621 114 horrible 0.753294 0.006621 124 ive 0.785920 0.009112 113 high 2.387866 
0.012537 118 incredible 1.085239 0.012537 68 eggplant 0.727894 0.012537 146 mediocre 0.431538 0.012537 149 much 1.625800 0.014883 199 short 2.895622 0.015660 47 crazy 1.344874 0.015660 25 busy 0.810098 0.015660 104 greasy 0.628449 0.015660 10 available 0.507987 0.015660 133 later 0.476589 0.015660 163 other 1.057386 0.020173 116 huge 0.606652 0.025169 254 yummy 1.751548 0.040728 23 bubble 1.441234 0.045608 5 affordable 1.044564 0.045608 54 different 0.943933 0.045608 131 last 0.852740 0.045608 51 decent 1.715321 0.049499 Using all data MODEL: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001) SCORE: 0.897974354209 AUC: 0.500454959054 TOP PREDICTORS (p-value < 0.05): adjective coef p-value 230 tiny 0.858216 0.000085 3 addictive 1.555816 0.000490 151 natural 1.067479 0.000490 228 terrific 1.048332 0.000981 238 unique 0.815136 0.000981 169 personal 0.903481 0.002052 54 different 0.943933 0.002205 163 other 1.057386 0.002261 90 fresh 1.188509 0.002581 13 bad 0.933607 0.003905 66 eat 0.951800 0.007615 224 swiss 1.352156 0.007943 29 casual 0.736460 0.007943 9 authentic 1.049171 0.008253 180 quiet 1.439218 0.009322 5 affordable 1.044564 0.009322 40 close 0.794613 0.009322 58 dont 1.234542 0.009461 182 real 1.113268 0.010006 95 garlic 0.938474 0.010360 1 accept 1.302128 0.010862 30 central 1.269979 0.010862 56 dish 1.086107 0.011253 191 sad 1.002704 0.015087 140 magic 0.860020 0.015087 204 slow 0.896999 0.016624 157 old 0.971514 0.018804 79 fantastic 0.846623 0.020690 38 clean 1.051376 0.023304 202 simple 1.174568 0.024510 119 indian 0.969282 0.024510 20 black 1.402000 0.029148 57 dive 0.859590 0.030205 143 many 1.043938 0.032713 130 large 0.860708 0.036375 251 youll 0.745575 0.036375 26 cant 1.184831 0.037648 118 incredible 1.085239 0.038944 110 healthy 1.474177 0.039847 209 solid 1.152807 0.039847 45 cool 1.258222 0.043770 234 turkish 0.737713 0.044770 193 saltfish 0.823576 
0.045706 15 basic 1.494662 0.046227 178 public 1.072186 0.046227 210 spacious 1.052954 0.046227
# Switch the target to grade_C and repeat the Naive Bayes fit; same split seed
# keeps the partition comparable with the grade_A runs.
y = tips_adj_df['grade_C']
X_train, X_test, y_train, y_test = train_test_split(X_adjs, y, random_state=8)
clf_multi_nb = naive_bayes.MultinomialNB()
clf_multi_nb = clf_multi_nb.fit(X_train, y_train)
print('Using training set')
score_and_predict(clf_multi_nb, X_train, y_train, X_adjs.columns, 'naive-bayes')
print('Using testing set')
score_and_predict(clf_multi_nb, X_test, y_test, X_adjs.columns, 'naive-bayes')
print('Using all data')
score_and_predict(clf_multi_nb, X_adjs.values, y.values, X_adjs.columns, 'naive-bayes')
Using training set MODEL: MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True) SCORE: 0.987981662743 AUC: 0.499937304075 TOP PREDICTORS (p-value < 0.05): adjective coef p-value 53 delish 0.004338 0.000169 74 excellent 0.006507 0.000341 52 delicious 0.015184 0.001837 106 green 0.004338 0.003959 249 wrong 0.004338 0.005746 22 breakfast 0.006507 0.006458 0 20500daily 0.002169 0.012036 24 bushy 0.002169 0.012036 117 iconic 0.002169 0.012036 176 priceless 0.002169 0.012036 253 yous 0.002169 0.012036 153 new 0.008677 0.017581 107 grilled 0.004338 0.018048 102 good 0.036876 0.023534 171 pic 0.002169 0.024075 192 salad 0.008677 0.029159 19 big 0.004338 0.034574 67 efficient 0.002169 0.036117 99 ginormous 0.002169 0.036117 196 separate 0.002169 0.036117 203 sized 0.002169 0.036117 221 superior 0.002169 0.036117 84 first 0.004338 0.046109 188 royal 0.002169 0.048163 Using testing set MODEL: MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True) SCORE: 0.990338164251 AUC: 0.5 TOP PREDICTORS (p-value < 0.05): adjective coef p-value 43 complimentary 0.004338 0.009753 190 russian 0.004338 0.009753 44 congested 0.002169 0.009753 48 creative 0.002169 0.009753 67 efficient 0.002169 0.009753 69 empty 0.002169 0.009753 117 iconic 0.002169 0.009753 151 natural 0.002169 0.009753 169 personal 0.002169 0.009753 171 pic 0.002169 0.009753 188 royal 0.002169 0.009753 193 saltfish 0.002169 0.009753 196 separate 0.002169 0.009753 198 several 0.002169 0.009753 236 unbeatable 0.002169 0.009753 243 weak 0.002169 0.009753 161 organic 0.004338 0.019512 210 spacious 0.004338 0.019512 228 terrific 0.004338 0.019512 237 unbelievable 0.004338 0.019512 2 actual 0.002169 0.019512 31 certain 0.002169 0.019512 39 clear 0.002169 0.019512 103 gorgeous 0.002169 0.019512 140 magic 0.002169 0.019512 144 massive 0.002169 0.019512 187 rosemary 0.002169 0.019512 191 sad 0.002169 0.019512 201 similar 0.002169 0.019512 203 sized 0.002169 0.019512 .. ... ... ... 
63 earth 0.002169 0.039054 119 indian 0.002169 0.039054 127 kid 0.002169 0.039054 145 mean 0.002169 0.039054 152 nearby 0.002169 0.039054 174 poor 0.002169 0.039054 207 social 0.002169 0.039054 214 standard 0.002169 0.039054 224 swiss 0.002169 0.039054 155 nice 0.013016 0.041094 64 east 0.004338 0.048836 16 bean 0.002169 0.048836 70 english 0.002169 0.048836 72 entire 0.002169 0.048836 73 epic 0.002169 0.048836 112 helpful 0.002169 0.048836 121 interesting 0.002169 0.048836 129 korean 0.002169 0.048836 134 light 0.002169 0.048836 139 low 0.002169 0.048836 148 modern 0.002169 0.048836 156 normal 0.002169 0.048836 177 private 0.002169 0.048836 186 ridiculous 0.002169 0.048836 197 serious 0.002169 0.048836 200 sicilian 0.002169 0.048836 216 stellar 0.002169 0.048836 218 strong 0.002169 0.048836 235 typical 0.002169 0.048836 244 weird 0.002169 0.048836 [83 rows x 3 columns] Using all data MODEL: MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True) SCORE: 0.988570897603 AUC: 0.499953007519 TOP PREDICTORS (p-value < 0.05): adjective coef p-value 130 large 0.004338 0.003783 251 youll 0.004338 0.003783 84 first 0.004338 0.004120 157 old 0.004338 0.004120 41 cold 0.002169 0.007306 123 italian 0.004338 0.011308 0 20500daily 0.002169 0.011465 24 bushy 0.002169 0.011465 44 congested 0.002169 0.011465 176 priceless 0.002169 0.011465 253 yous 0.002169 0.011465 13 bad 0.006507 0.022223 150 music 0.004338 0.022223 117 iconic 0.002169 0.022932 53 delish 0.004338 0.024571 106 green 0.004338 0.031070 114 horrible 0.004338 0.031512 99 ginormous 0.002169 0.034402 171 pic 0.002169 0.034402 221 superior 0.002169 0.034402 102 good 0.036876 0.036221 192 salad 0.008677 0.036831 204 slow 0.004338 0.045644 67 efficient 0.002169 0.045873 196 separate 0.002169 0.045873 154 next 0.004338 0.048159
# Linear regression against grade_C using the current split.
clf_linear = linear_model.LinearRegression()
clf_linear = clf_linear.fit(X_train, y_train)
print('Using training set')
score_and_predict(clf_linear, X_train, y_train, X_adjs.columns, 'linear')
print('Using testing set')
score_and_predict(clf_linear, X_test, y_test, X_adjs.columns, 'linear')
print('Using all data')
score_and_predict(clf_linear, X_adjs.values, y.values, X_adjs.columns, 'linear')
Using training set MODEL: LinearRegression(copy_X=True, fit_intercept=True, normalize=False) SCORE: 0.0265341264606 AUC: 0.813025731452 TOP PREDICTORS (p-value < 0.05): adjective coef p-value 53 delish 0.0025 0.000169 74 excellent 0.0006 0.000341 52 delicious 0.0004 0.001837 106 green 0.0031 0.003959 249 wrong 0.0009 0.005746 22 breakfast -0.0010 0.006458 117 iconic 0.0080 0.012036 24 bushy -0.0028 0.012036 0 20500daily -0.0091 0.012036 176 priceless -0.0091 0.012036 253 yous -0.0103 0.012036 153 new 0.0012 0.017581 107 grilled 0.0003 0.018048 102 good 0.0016 0.023534 171 pic -0.0208 0.024075 192 salad -0.0021 0.029159 19 big 0.0016 0.034574 203 sized 0.0032 0.036117 196 separate 0.0028 0.036117 221 superior -0.0084 0.036117 67 efficient -0.0124 0.036117 99 ginormous -0.0202 0.036117 84 first 0.0046 0.046109 188 royal -0.0046 0.048163 Using testing set MODEL: LinearRegression(copy_X=True, fit_intercept=True, normalize=False) SCORE: -0.0391889077113 AUC: 0.464410448838 TOP PREDICTORS (p-value < 0.05): adjective coef p-value 190 russian 0.0802 0.009753 43 complimentary 0.0609 0.009753 117 iconic 0.0080 0.009753 196 separate 0.0028 0.009753 44 congested 0.0000 0.009753 198 several -0.0005 0.009753 188 royal -0.0046 0.009753 236 unbeatable -0.0051 0.009753 193 saltfish -0.0058 0.009753 243 weak -0.0068 0.009753 69 empty -0.0101 0.009753 151 natural -0.0121 0.009753 67 efficient -0.0124 0.009753 48 creative -0.0159 0.009753 169 personal -0.0189 0.009753 171 pic -0.0208 0.009753 210 spacious 0.0875 0.019512 237 unbelievable 0.0850 0.019512 161 organic 0.0622 0.019512 228 terrific 0.0401 0.019512 203 sized 0.0032 0.019512 140 magic -0.0048 0.019512 252 young -0.0055 0.019512 31 certain -0.0061 0.019512 103 gorgeous -0.0062 0.019512 229 thin -0.0064 0.019512 2 actual -0.0072 0.019512 191 sad -0.0095 0.019512 39 clear -0.0099 0.019512 239 usual -0.0128 0.019512 .. ... ... ... 
207 social -0.0083 0.039054 30 central -0.0092 0.039054 174 poor -0.0092 0.039054 119 indian -0.0101 0.039054 145 mean -0.0110 0.039054 152 nearby -0.0124 0.039054 28 caribbean -0.0149 0.039054 63 earth -0.0177 0.039054 214 standard -0.0345 0.039054 155 nice 0.0044 0.041094 64 east 0.0230 0.048836 139 low -0.0045 0.048836 72 entire -0.0055 0.048836 218 strong -0.0068 0.048836 148 modern -0.0078 0.048836 235 typical -0.0092 0.048836 70 english -0.0093 0.048836 156 normal -0.0102 0.048836 200 sicilian -0.0102 0.048836 197 serious -0.0103 0.048836 177 private -0.0112 0.048836 134 light -0.0129 0.048836 73 epic -0.0130 0.048836 186 ridiculous -0.0146 0.048836 129 korean -0.0153 0.048836 121 interesting -0.0164 0.048836 244 weird -0.0174 0.048836 112 helpful -0.0176 0.048836 16 bean -0.0188 0.048836 216 stellar -0.0217 0.048836 [83 rows x 3 columns] Using all data MODEL: LinearRegression(copy_X=True, fit_intercept=True, normalize=False) SCORE: 0.0125850437395 AUC: 0.738506486503 TOP PREDICTORS (p-value < 0.05): adjective coef p-value 130 large 0.0080 0.003783 251 youll 0.0069 0.003783 157 old 0.0081 0.004120 84 first 0.0046 0.004120 41 cold -0.0127 0.007306 123 italian 0.0008 0.011308 44 congested 0.0000 0.011465 24 bushy -0.0028 0.011465 0 20500daily -0.0091 0.011465 176 priceless -0.0091 0.011465 253 yous -0.0103 0.011465 13 bad 0.0046 0.022223 150 music -0.0060 0.022223 117 iconic 0.0080 0.022932 53 delish 0.0025 0.024571 106 green 0.0031 0.031070 114 horrible 0.0104 0.031512 221 superior -0.0084 0.034402 99 ginormous -0.0202 0.034402 171 pic -0.0208 0.034402 102 good 0.0016 0.036221 192 salad -0.0021 0.036831 204 slow -0.0001 0.045644 196 separate 0.0028 0.045873 67 efficient -0.0124 0.045873 154 next 0.0127 0.048159
# Logistic regression against grade_C using the current split.
clf_logistic = linear_model.LogisticRegression()
clf_logistic = clf_logistic.fit(X_train, y_train)
print('Using training set')
score_and_predict(clf_logistic, X_train, y_train, X_adjs.columns, 'logistic')
print('Using testing set')
score_and_predict(clf_logistic, X_test, y_test, X_adjs.columns, 'logistic')
print('Using all data')
score_and_predict(clf_logistic, X_adjs.values, y.values, X_adjs.columns, 'logistic')
Using training set MODEL: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001) SCORE: 0.988105563127 AUC: 0.5 TOP PREDICTORS (p-value < 0.05): adjective coef p-value 53 delish 1.041227 0.000169 74 excellent 0.962809 0.000341 52 delicious 0.975505 0.001837 106 green 1.059291 0.003959 249 wrong 0.952657 0.005746 22 breakfast 0.981376 0.006458 24 bushy 0.992131 0.012036 117 iconic 0.990644 0.012036 0 20500daily 0.989060 0.012036 176 priceless 0.989060 0.012036 253 yous 0.988665 0.012036 153 new 1.037486 0.017581 107 grilled 1.029425 0.018048 102 good 1.020814 0.023534 171 pic 0.962424 0.024075 192 salad 0.970543 0.029159 19 big 0.932674 0.034574 203 sized 0.980787 0.036117 196 separate 0.975017 0.036117 221 superior 0.968507 0.036117 67 efficient 0.962713 0.036117 99 ginormous 0.961655 0.036117 84 first 1.126257 0.046109 188 royal 0.962617 0.048163 Using testing set MODEL: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001) SCORE: 0.990338164251 AUC: 0.5 TOP PREDICTORS (p-value < 0.05): adjective coef p-value 43 complimentary 1.939247 0.009753 190 russian 1.936341 0.009753 44 congested 1.000000 0.009753 117 iconic 0.990644 0.009753 196 separate 0.975017 0.009753 67 efficient 0.962713 0.009753 188 royal 0.962617 0.009753 171 pic 0.962424 0.009753 198 several 0.942895 0.009753 193 saltfish 0.939507 0.009753 236 unbeatable 0.934354 0.009753 243 weak 0.933233 0.009753 151 natural 0.912926 0.009753 69 empty 0.881615 0.009753 48 creative 0.845354 0.009753 169 personal 0.800675 0.009753 237 unbelievable 2.080275 0.019512 210 spacious 2.074873 0.019512 161 organic 2.036230 0.019512 228 terrific 1.697913 0.019512 203 sized 0.980787 0.019512 144 massive 0.949424 0.019512 31 certain 0.945917 0.019512 201 similar 0.939037 0.019512 103 gorgeous 0.937536 0.019512 2 actual 0.925334 0.019512 187 rosemary 
0.920351 0.019512 191 sad 0.920351 0.019512 140 magic 0.915395 0.019512 39 clear 0.907012 0.019512 .. ... ... ... 28 caribbean 0.918696 0.039054 127 kid 0.888607 0.039054 63 earth 0.877306 0.039054 174 poor 0.857358 0.039054 152 nearby 0.850952 0.039054 30 central 0.842653 0.039054 145 mean 0.819058 0.039054 214 standard 0.796602 0.039054 119 indian 0.760332 0.039054 155 nice 1.240234 0.041094 64 east 1.565492 0.048836 72 entire 0.947337 0.048836 156 normal 0.939695 0.048836 177 private 0.916769 0.048836 186 ridiculous 0.904204 0.048836 73 epic 0.903662 0.048836 148 modern 0.894849 0.048836 216 stellar 0.888963 0.048836 235 typical 0.882938 0.048836 197 serious 0.873366 0.048836 200 sicilian 0.871534 0.048836 139 low 0.857443 0.048836 244 weird 0.855474 0.048836 129 korean 0.825472 0.048836 121 interesting 0.819468 0.048836 70 english 0.816850 0.048836 218 strong 0.816115 0.048836 112 helpful 0.786707 0.048836 16 bean 0.784036 0.048836 134 light 0.686053 0.048836 [83 rows x 3 columns] Using all data MODEL: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001) SCORE: 0.988663817134 AUC: 0.5 TOP PREDICTORS (p-value < 0.05): adjective coef p-value 130 large 1.219938 0.003783 251 youll 1.153268 0.003783 157 old 1.202016 0.004120 84 first 1.126257 0.004120 41 cold 0.602480 0.007306 123 italian 1.073367 0.011308 44 congested 1.000000 0.011465 24 bushy 0.992131 0.011465 0 20500daily 0.989060 0.011465 176 priceless 0.989060 0.011465 253 yous 0.988665 0.011465 13 bad 1.179039 0.022223 150 music 0.842400 0.022223 117 iconic 0.990644 0.022932 53 delish 1.041227 0.024571 106 green 1.059291 0.031070 114 horrible 1.266301 0.031512 221 superior 0.968507 0.034402 171 pic 0.962424 0.034402 99 ginormous 0.961655 0.034402 102 good 1.020814 0.036221 192 salad 0.970543 0.036831 204 slow 0.879502 0.045644 196 separate 0.975017 0.045873 67 efficient 0.962713 0.045873 154 next 1.290978 0.048159
def _evaluate_tree(clf, features, targets, label):
    """Print accuracy, classification report, and confusion matrix for clf."""
    print(label)
    y_pred = clf.predict(features)
    print("Accuracy:{0:.3f}".format(metrics.accuracy_score(targets, y_pred)), "\n")
    print("Classification report")
    print(metrics.classification_report(targets, y_pred), "\n")
    print("Confusion matrix")
    print(metrics.confusion_matrix(targets, y_pred), "\n")

# Shallow decision tree on the adjective indicators; the depth and leaf-size
# limits guard against overfitting sparse dummy features.
clf_tree = tree.DecisionTreeClassifier(criterion='entropy', max_depth=3, min_samples_leaf=5)
clf_tree = clf_tree.fit(X_train, y_train)
# The three copy-pasted report stanzas are collapsed into one helper; the
# unused `score` locals from the original are dropped.
_evaluate_tree(clf_tree, X_train, y_train, 'Using training set')
_evaluate_tree(clf_tree, X_test, y_test, 'Using testing set')
_evaluate_tree(clf_tree, X_adjs.values, y.values, 'Using all data')
Using training set Accuracy:0.991 Classification report precision recall f1-score support 0.0 0.99 1.00 1.00 1631 1.0 0.00 0.00 0.00 15 avg / total 0.98 0.99 0.99 1646 Confusion matrix [[1631 0] [ 15 0]] Using testing set Accuracy:0.993 Classification report precision recall f1-score support 0.0 0.99 1.00 1.00 545 1.0 0.00 0.00 0.00 4 avg / total 0.99 0.99 0.99 549 Confusion matrix [[545 0] [ 4 0]] Using all data Accuracy:0.991 Classification report precision recall f1-score support 0.0 0.99 1.00 1.00 2176 1.0 0.00 0.00 0.00 19 avg / total 0.98 0.99 0.99 2195 Confusion matrix [[2176 0] [ 19 0]]
# Restrict to the cheapest price tier (tier 1) and re-fit Naive Bayes on the
# grade_C target within that subset. Filter once instead of twice.
cheap_df = tips_adj_df[tips_adj_df['foursquare_price_tier'] == 1]
y = cheap_df['grade_C']
# Adjective columns occupy the same positional slice as in the full frame;
# .iloc replaces the removed .ix indexer.
X_adjs = cheap_df.iloc[:, 34:-1]
X_train, X_test, y_train, y_test = train_test_split(X_adjs, y, random_state=8)
clf_multi_nb = naive_bayes.MultinomialNB()
clf_multi_nb.fit(X_train, y_train)
print('Using training set')
score_and_predict(clf_multi_nb, X_train, y_train, X_adjs.columns, 'naive-bayes')
print('Using testing set')
score_and_predict(clf_multi_nb, X_test, y_test, X_adjs.columns, 'naive-bayes')
print('Using all data')
score_and_predict(clf_multi_nb, X_adjs.values, y.values, X_adjs.columns, 'naive-bayes')
Using training set MODEL: MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True) SCORE: 0.990886998785 AUC: 0.5 TOP PREDICTORS (p-value < 0.05): adjective coef p-value 10 available 0.003484 0.009191 11 average 0.003484 0.009191 24 bushy 0.003484 0.009191 31 certain 0.003484 0.009191 67 efficient 0.003484 0.009191 73 epic 0.003484 0.009191 86 flat 0.003484 0.009191 111 heavy 0.003484 0.009191 126 key 0.003484 0.009191 148 modern 0.003484 0.009191 151 natural 0.003484 0.009191 161 organic 0.003484 0.009191 171 pic 0.003484 0.009191 172 pleasant 0.003484 0.009191 186 ridiculous 0.003484 0.009191 190 russian 0.003484 0.009191 193 saltfish 0.003484 0.009191 200 sicilian 0.003484 0.009191 201 similar 0.003484 0.009191 203 sized 0.003484 0.009191 216 stellar 0.003484 0.009191 234 turkish 0.003484 0.009191 236 unbeatable 0.003484 0.009191 238 unique 0.003484 0.009191 5 affordable 0.003484 0.018394 7 asian 0.003484 0.018394 8 attentive 0.003484 0.018394 15 basic 0.003484 0.018394 30 central 0.003484 0.018394 48 creative 0.003484 0.018394 .. ... ... ... 
224 swiss 0.003484 0.027608 230 tiny 0.003484 0.027608 233 true 0.003484 0.027608 252 young 0.003484 0.027608 102 good 0.010453 0.028718 12 awful 0.003484 0.036833 18 beautiful 0.003484 0.036833 47 crazy 0.003484 0.036833 57 dive 0.003484 0.036833 72 entire 0.003484 0.036833 101 goat 0.003484 0.036833 104 greasy 0.003484 0.036833 113 high 0.003484 0.036833 158 olive 0.003484 0.036833 165 overall 0.003484 0.036833 166 own 0.003484 0.036833 202 simple 0.003484 0.036833 211 spanish 0.003484 0.036833 217 straight 0.003484 0.036833 219 such 0.003484 0.036833 232 traditional 0.003484 0.036833 16 bean 0.003484 0.046069 40 close 0.003484 0.046069 55 dirty 0.003484 0.046069 65 easy 0.003484 0.046069 81 fat 0.003484 0.046069 87 flavorful 0.003484 0.046069 183 reasonable 0.003484 0.046069 235 typical 0.003484 0.046069 244 weird 0.003484 0.046069 [116 rows x 3 columns] Using testing set MODEL: MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True) SCORE: 0.992714025501 AUC: 0.5 TOP PREDICTORS (p-value < 0.05): adjective coef p-value 75 expensive 0.006969 0.007326 77 fabulous 0.006969 0.007326 3 addictive 0.003484 0.007326 5 affordable 0.003484 0.007326 6 american 0.003484 0.007326 7 asian 0.003484 0.007326 8 attentive 0.003484 0.007326 10 available 0.003484 0.007326 11 average 0.003484 0.007326 14 baked 0.003484 0.007326 23 bubble 0.003484 0.007326 29 casual 0.003484 0.007326 31 certain 0.003484 0.007326 46 cozy 0.003484 0.007326 55 dirty 0.003484 0.007326 56 dish 0.003484 0.007326 61 dry 0.003484 0.007326 65 easy 0.003484 0.007326 68 eggplant 0.003484 0.007326 69 empty 0.003484 0.007326 73 epic 0.003484 0.007326 86 flat 0.003484 0.007326 87 flavorful 0.003484 0.007326 89 french 0.003484 0.007326 93 full 0.003484 0.007326 94 funny 0.003484 0.007326 97 generous 0.003484 0.007326 99 ginormous 0.003484 0.007326 101 goat 0.003484 0.007326 112 helpful 0.003484 0.007326 .. ... ... ... 
106 green 0.003484 0.029467 132 late 0.003484 0.029467 152 nearby 0.003484 0.029467 157 old 0.003484 0.029467 181 ready 0.003484 0.029467 184 red 0.003484 0.029467 83 few 0.006969 0.036902 242 want 0.006969 0.036902 12 awful 0.003484 0.036902 20 black 0.003484 0.036902 25 busy 0.003484 0.036902 49 cute 0.003484 0.036902 51 decent 0.003484 0.036902 53 delish 0.003484 0.036902 85 fish 0.003484 0.036902 96 general 0.003484 0.036902 182 real 0.003484 0.036902 215 steak 0.003484 0.036902 13 bad 0.006969 0.044365 38 clean 0.003484 0.044365 74 excellent 0.003484 0.044365 80 fast 0.003484 0.044365 100 give 0.003484 0.044365 108 happy 0.003484 0.044365 116 huge 0.003484 0.044365 168 perfect 0.003484 0.044365 185 regular 0.003484 0.044365 205 small 0.003484 0.044365 209 solid 0.003484 0.044365 251 youll 0.003484 0.044365 [144 rows x 3 columns] Using all data MODEL: MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True) SCORE: 0.991343963554 AUC: 0.5 TOP PREDICTORS (p-value < 0.05): adjective coef p-value 3 addictive 0.003484 0.008728 24 bushy 0.003484 0.008728 29 casual 0.003484 0.008728 67 efficient 0.003484 0.008728 99 ginormous 0.003484 0.008728 148 modern 0.003484 0.008728 161 organic 0.003484 0.008728 164 outstanding 0.003484 0.008728 171 pic 0.003484 0.008728 173 polish 0.003484 0.008728 186 ridiculous 0.003484 0.008728 187 rosemary 0.003484 0.008728 190 russian 0.003484 0.008728 193 saltfish 0.003484 0.008728 200 sicilian 0.003484 0.008728 203 sized 0.003484 0.008728 214 standard 0.003484 0.008728 216 stellar 0.003484 0.008728 238 unique 0.003484 0.008728 10 available 0.003484 0.017463 11 average 0.003484 0.017463 15 basic 0.003484 0.017463 30 central 0.003484 0.017463 31 certain 0.003484 0.017463 48 creative 0.003484 0.017463 73 epic 0.003484 0.017463 78 famous 0.003484 0.017463 86 flat 0.003484 0.017463 121 interesting 0.003484 0.017463 126 key 0.003484 0.017463 .. ... ... ... 
72 entire 0.003484 0.034959 94 funny 0.003484 0.034959 97 generous 0.003484 0.034959 119 indian 0.003484 0.034959 120 inexpensive 0.003484 0.034959 127 kid 0.003484 0.034959 133 later 0.003484 0.034959 139 low 0.003484 0.034959 158 olive 0.003484 0.034959 162 original 0.003484 0.034959 178 public 0.003484 0.034959 180 quiet 0.003484 0.034959 211 spanish 0.003484 0.034959 217 straight 0.003484 0.034959 232 traditional 0.003484 0.034959 243 weak 0.003484 0.034959 252 young 0.003484 0.034959 16 bean 0.003484 0.043719 33 cheesy 0.003484 0.043719 98 giant 0.003484 0.043719 101 goat 0.003484 0.043719 113 high 0.003484 0.043719 118 incredible 0.003484 0.043719 122 irish 0.003484 0.043719 140 magic 0.003484 0.043719 183 reasonable 0.003484 0.043719 195 second 0.003484 0.043719 202 simple 0.003484 0.043719 206 smile 0.003484 0.043719 218 strong 0.003484 0.043719 [102 rows x 3 columns]
# tips_adj_df.to_pickle('./dumps/tips_complete_features.pkl')