import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import folium as fm
import geopy
from sklearn import linear_model, naive_bayes, feature_selection, metrics, tree
from sklearn.cross_validation import train_test_split
%matplotlib inline
# Load the pre-processed tips, the per-tip adjective indicator matrix, and the
# adjective frequency table produced by earlier pipeline stages.
tips_with_adjectives_1 = pd.read_pickle('./dumps/tips_with_adjectives.pkl')
adj_dummies = pd.read_pickle('./dumps/adjective_dataframe.pkl')
adj_df = pd.read_csv('./dumps/adjective_count_list.csv')
# Sanity check: the dummy matrix must align row-for-row with the tips frame.
# (print() replaces the Python-2 print statement, a syntax error on Python 3.)
print(len(adj_dummies))
print(len(tips_with_adjectives_1))
15527 15527
# Keep only adjectives appearing in more than 10 tips; rarer words are too
# sparse to act as useful predictors downstream.
adjective_list = list(adj_df[adj_df['count'] > 10]['word'])
print("Number of tips: ", len(tips_with_adjectives_1))
print("Number of adjectives: ", len(adj_df))
print("Number of significant adjectives (appears in more than 10 tips): ", len(adjective_list))
Number of tips: 15527 Number of adjectives: 1838 Number of significant adjectives (appears in more than 10 tips): 255
# (address construction retained for reference)
# tips_with_adjectives['address'] = tips_with_adjectives.apply(lambda x: "{0} {1} {2}, {3}, New York, NY".format(x['BUILDING'].strip(), x['STREET'].strip(), int(x['ZIPCODE']), x['BORO']), axis=1)
latlong_df = pd.read_pickle('./dumps/with_lat_long.pkl')[['foursquare_id', 'lat_long']]
# Attach the adjective dummy columns to each tip row (index-aligned join).
tips_adj_df = tips_with_adjectives_1.join(adj_dummies)
# Keep one row per (venue, tip text) pair: the source data can repeat tips
# across multiple inspection records.
tips_adj_df.drop_duplicates(['foursquare_id', 'description'], inplace=True)
tips_adj_df = tips_adj_df.merge(latlong_df, on='foursquare_id', how='left')
# A bare `len(tips_adj_df)` expression only displays in a notebook; print it
# explicitly so the script reports the post-dedupe row count.
print(len(tips_adj_df))
10762
# desired_columns = [
# 'foursquare_id',
# 'DBA',
# 'description',
# 'tip_words',
# 'tip_adjs',
# 'adj_string',
# 'foursquare_rating',
# 'foursquare_num_of_users',
# 'foursquare_price_tier',
# 'grade_A',
# 'grade_C',
# 'GRADE',
# 'lat_long'
# ]
# tips_df = tips_with_adjectives[desired_columns]
# len(tips_df)
def score_and_predict(model, x_features, y_targets, columns, model_type):
    """Score a fitted classifier/regressor and print its top adjective predictors.

    Parameters
    ----------
    model : fitted sklearn estimator exposing ``score``, ``predict``, ``coef_``
    x_features : array-like feature matrix (adjective indicator columns)
    y_targets : array-like binary target aligned with ``x_features``
    columns : feature names, one per column of ``x_features``
    model_type : one of 'naive-bayes', 'logistic', 'linear' — controls how
        coefficients are reported (exponentiated log-odds vs. raw values)

    Returns
    -------
    pandas.DataFrame with columns adjective / coef / p-value, sorted by
    ascending p-value then descending coefficient.

    Raises
    ------
    ValueError if ``model_type`` is not one of the three supported strings
    (previously an unknown value fell through and raised NameError later).
    """
    score = model.score(x_features, y_targets)
    y_pred = model.predict(x_features)
    auc = metrics.roc_auc_score(y_targets, y_pred)
    # Univariate ANOVA F-test per feature; p_values[0] holds the F statistics,
    # p_values[1] the p-values — index [0] below matches the original usage.
    p_values = feature_selection.f_classif(x_features, y_targets)
    if model_type in ('naive-bayes', 'logistic'):
        # coef_ holds log-probabilities / log-odds; exponentiate for readability.
        coef_list = [np.exp(round(x, 4)) for x in model.coef_[0]]
    elif model_type == 'linear':
        coef_list = [round(x, 4) for x in model.coef_]
    else:
        raise ValueError("unknown model_type: {0}".format(model_type))
    model_df = pd.DataFrame({'adjective': columns, 'p-value': p_values[0], 'coef': coef_list})
    # DataFrame.sort() was removed in pandas 0.20; sort_values is the
    # equivalent modern call.
    model_df.sort_values(['p-value', 'coef'], ascending=[True, False], inplace=True)
    print('MODEL: ', model)
    print('SCORE: ', score)
    print('AUC: ', auc)
    print('\n')
    print('TOP PREDICTORS (p-value < 0.05):')
    print(model_df[model_df['p-value'] <= 0.05])
    print('\n')
    # Returning the frame is backward-compatible (callers ignore the result)
    # and makes the ranking reusable outside the printed report.
    return model_df
# for index, column in enumerate(tips_adj_df.columns.values):
#     print column, index
# ADJECTIVE COLUMNS ARE 31 till second to last column
# based on adjectives included in tip descriptions
# .ix was removed from pandas; .iloc is the positional equivalent for the
# integer slice 34:-1 (last column is lat_long, excluded).
X_adjs = tips_adj_df.iloc[:, 34:-1]
# based on ratings, number of users, and price tier
# NOTE(review): dropna(axis=1) drops any of the three columns that contain a
# NaN anywhere — if the intent was to drop incomplete *rows*, this should be
# axis=0; confirm (X_foursquare_info is not used further in this chunk).
X_foursquare_info = tips_adj_df[['foursquare_rating', 'foursquare_num_of_users', 'foursquare_price_tier']].dropna(axis=1)
y = tips_adj_df['grade_A']
# Fixed random_state keeps the train/test split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X_adjs, y, random_state=8)
clf_multi_nb = naive_bayes.MultinomialNB()
clf_multi_nb = clf_multi_nb.fit(X_train, y_train)
print('Using training set')
score_and_predict(clf_multi_nb, X_train, y_train, X_adjs.columns, 'naive-bayes')
print('Using testing set')
score_and_predict(clf_multi_nb, X_test, y_test, X_adjs.columns, 'naive-bayes')
print('Using all data')
score_and_predict(clf_multi_nb, X_adjs.values, y.values, X_adjs.columns, 'naive-bayes')
Using training set MODEL: MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True) SCORE: 0.901251393879 AUC: 0.502431233668 TOP PREDICTORS (p-value < 0.05): adjective coef p-value 163 other 0.005634 0.000050 26 cant 0.009299 0.000081 73 epic 0.000679 0.000087 210 spacious 0.000679 0.000087 250 yellow 0.000679 0.000087 87 flavorful 0.001290 0.000174 5 affordable 0.001901 0.000261 96 general 0.002579 0.001122 209 solid 0.001968 0.001918 122 irish 0.001358 0.003553 215 steak 0.006109 0.004008 95 garlic 0.004277 0.004226 46 cozy 0.001833 0.006104 115 hot 0.012015 0.006348 227 terrible 0.002647 0.007124 158 olive 0.000747 0.008315 237 unbelievable 0.000747 0.008315 38 clean 0.004548 0.008604 138 long 0.005430 0.009899 84 first 0.004209 0.011209 56 dish 0.002987 0.013767 140 magic 0.000611 0.014504 151 natural 0.000611 0.014504 191 sad 0.000611 0.014504 216 stellar 0.000611 0.014504 119 indian 0.001765 0.020247 157 old 0.004005 0.020831 223 sweet 0.007738 0.023556 118 incredible 0.002104 0.025013 169 personal 0.001154 0.029040 170 phenomenal 0.001154 0.029040 228 terrific 0.001154 0.029040 107 grilled 0.005974 0.031371 81 fat 0.000815 0.033534 127 kid 0.000815 0.033534 162 original 0.000815 0.033534 178 public 0.000815 0.033534 239 usual 0.000815 0.033534 54 different 0.002308 0.033618 82 favorite 0.006652 0.038926 Using testing set MODEL: MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True) SCORE: 0.888888888889 AUC: 0.5 TOP PREDICTORS (p-value < 0.05): adjective coef p-value 83 few 0.003665 0.000000 56 dish 0.002987 0.000000 14 baked 0.002715 0.000000 93 full 0.002579 0.000000 120 inexpensive 0.001290 0.000000 35 chinese 0.006517 0.002318 179 quick 0.003462 0.004851 130 large 0.003937 0.006621 114 horrible 0.003326 0.006621 154 next 0.003326 0.006621 124 ive 0.008349 0.009112 113 high 0.002715 0.012537 118 incredible 0.002104 0.012537 68 eggplant 0.001833 0.012537 146 mediocre 0.001222 0.012537 149 much 0.008077 0.014883 10 available 0.002104 0.015660 47 crazy 
0.001968 0.015660 199 short 0.001901 0.015660 25 busy 0.001629 0.015660 104 greasy 0.001154 0.015660 133 later 0.001154 0.015660 163 other 0.005634 0.020173 116 huge 0.004141 0.025169 254 yummy 0.006992 0.040728 131 last 0.002579 0.045608 54 different 0.002308 0.045608 23 bubble 0.002240 0.045608 5 affordable 0.001901 0.045608 51 decent 0.005023 0.049499 Using all data MODEL: MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True) SCORE: 0.898160193273 AUC: 0.50176809245 TOP PREDICTORS (p-value < 0.05): adjective coef p-value 230 tiny 0.001968 0.000085 151 natural 0.000611 0.000490 3 addictive 0.000543 0.000490 228 terrific 0.001154 0.000981 238 unique 0.001086 0.000981 169 personal 0.001154 0.002052 54 different 0.002308 0.002205 163 other 0.005634 0.002261 90 fresh 0.014595 0.002581 13 bad 0.008077 0.003905 66 eat 0.009299 0.007615 29 casual 0.000407 0.007943 224 swiss 0.000407 0.007943 9 authentic 0.003190 0.008253 180 quiet 0.001968 0.009322 5 affordable 0.001901 0.009322 40 close 0.001901 0.009322 58 dont 0.025929 0.009461 182 real 0.003734 0.010006 95 garlic 0.004277 0.010360 1 accept 0.001154 0.010862 30 central 0.001154 0.010862 56 dish 0.002987 0.011253 140 magic 0.000611 0.015087 191 sad 0.000611 0.015087 204 slow 0.007195 0.016624 157 old 0.004005 0.018804 79 fantastic 0.004887 0.020690 38 clean 0.004548 0.023304 119 indian 0.001765 0.024510 202 simple 0.001697 0.024510 20 black 0.002851 0.029148 57 dive 0.001018 0.030205 143 many 0.003055 0.032713 130 large 0.003937 0.036375 251 youll 0.003937 0.036375 26 cant 0.009299 0.037648 118 incredible 0.002104 0.038944 110 healthy 0.002308 0.039847 209 solid 0.001968 0.039847 45 cool 0.004277 0.043770 234 turkish 0.000815 0.044770 193 saltfish 0.000475 0.045706 178 public 0.000815 0.046227 210 spacious 0.000679 0.046227 15 basic 0.000475 0.046227
# Linear regression on the same grade_A target — the R^2 score is expected to
# be low for a binary target; the coefficient ranking is the point here.
clf_linear = linear_model.LinearRegression()
clf_linear = clf_linear.fit(X_train, y_train)
print('Using training set')
score_and_predict(clf_linear, X_train, y_train, X_adjs.columns, 'linear')
print('Using testing set')
score_and_predict(clf_linear, X_test, y_test, X_adjs.columns, 'linear')
print('Using all data')
score_and_predict(clf_linear, X_adjs.values, y.values, X_adjs.columns, 'linear')
Using training set MODEL: LinearRegression(copy_X=True, fit_intercept=True, normalize=False) SCORE: 0.0306220058522 AUC: 0.65453453789 TOP PREDICTORS (p-value < 0.05): adjective coef p-value 163 other -0.0005 0.000050 26 cant 0.0110 0.000081 73 epic 0.0196 0.000087 250 yellow 0.0143 0.000087 210 spacious 0.0123 0.000087 87 flavorful 0.0218 0.000174 5 affordable 0.0061 0.000261 96 general 0.0292 0.001122 209 solid 0.0167 0.001918 122 irish -0.0082 0.003553 215 steak -0.0008 0.004008 95 garlic -0.0077 0.004226 46 cozy -0.0200 0.006104 115 hot 0.0042 0.006348 227 terrible -0.0021 0.007124 158 olive 0.0464 0.008315 237 unbelievable 0.0079 0.008315 38 clean 0.0037 0.008604 138 long -0.0036 0.009899 84 first -0.0020 0.011209 56 dish 0.0091 0.013767 151 natural 0.0361 0.014504 191 sad 0.0085 0.014504 216 stellar -0.0123 0.014504 140 magic -0.0354 0.014504 119 indian -0.0069 0.020247 157 old -0.0030 0.020831 223 sweet 0.0028 0.023556 118 incredible 0.0127 0.025013 228 terrific 0.0222 0.029040 169 personal -0.0176 0.029040 170 phenomenal -0.0186 0.029040 107 grilled 0.0041 0.031371 239 usual 0.0336 0.033534 127 kid 0.0176 0.033534 162 original 0.0168 0.033534 81 fat 0.0163 0.033534 178 public 0.0145 0.033534 54 different -0.0040 0.033618 82 favorite 0.0016 0.038926 Using testing set MODEL: LinearRegression(copy_X=True, fit_intercept=True, normalize=False) SCORE: -0.0270734866002 AUC: 0.514883782061 TOP PREDICTORS (p-value < 0.05): adjective coef p-value 120 inexpensive 0.1140 0.000000 93 full 0.0438 0.000000 14 baked 0.0327 0.000000 83 few 0.0294 0.000000 56 dish 0.0091 0.000000 35 chinese -0.0829 0.002318 179 quick -0.0168 0.004851 154 next 0.0210 0.006621 130 large -0.0158 0.006621 114 horrible -0.0324 0.006621 124 ive -0.0264 0.009112 113 high 0.0792 0.012537 118 incredible 0.0127 0.012537 68 eggplant -0.0432 0.012537 146 mediocre -0.1384 0.012537 149 much 0.0396 0.014883 199 short 0.1068 0.015660 47 crazy 0.0316 0.015660 25 busy -0.0378 0.015660 104 greasy -0.0779 
0.015660 10 available -0.0937 0.015660 133 later -0.1298 0.015660 163 other -0.0005 0.020173 116 huge -0.0639 0.025169 254 yummy 0.0469 0.040728 23 bubble 0.0391 0.045608 5 affordable 0.0061 0.045608 54 different -0.0040 0.045608 131 last -0.0163 0.045608 51 decent 0.0553 0.049499 Using all data MODEL: LinearRegression(copy_X=True, fit_intercept=True, normalize=False) SCORE: 0.0153673116839 AUC: 0.616453415498 TOP PREDICTORS (p-value < 0.05): adjective coef p-value 230 tiny -0.0225 0.000085 3 addictive 0.0950 0.000490 151 natural 0.0361 0.000490 228 terrific 0.0222 0.000981 238 unique -0.0312 0.000981 169 personal -0.0176 0.002052 54 different -0.0040 0.002205 163 other -0.0005 0.002261 90 fresh 0.0131 0.002581 13 bad -0.0062 0.003905 66 eat -0.0031 0.007615 224 swiss 0.0856 0.007943 29 casual -0.0786 0.007943 9 authentic 0.0092 0.008253 180 quiet 0.0412 0.009322 5 affordable 0.0061 0.009322 40 close -0.0286 0.009322 58 dont 0.0163 0.009461 182 real 0.0094 0.010006 95 garlic -0.0077 0.010360 1 accept 0.0405 0.010862 30 central 0.0343 0.010862 56 dish 0.0091 0.011253 191 sad 0.0085 0.015087 140 magic -0.0354 0.015087 204 slow -0.0116 0.016624 157 old -0.0030 0.018804 79 fantastic -0.0188 0.020690 38 clean 0.0037 0.023304 202 simple 0.0226 0.024510 119 indian -0.0069 0.024510 20 black 0.0394 0.029148 57 dive -0.0270 0.030205 143 many 0.0036 0.032713 130 large -0.0158 0.036375 251 youll -0.0307 0.036375 26 cant 0.0110 0.037648 118 incredible 0.0127 0.038944 110 healthy 0.0384 0.039847 209 solid 0.0167 0.039847 45 cool 0.0238 0.043770 234 turkish -0.0607 0.044770 193 saltfish -0.0485 0.045706 15 basic 0.1099 0.046227 178 public 0.0145 0.046227 210 spacious 0.0123 0.046227
# Logistic regression on grade_A; coefficients are reported as exponentiated
# log-odds by score_and_predict.
clf_logistic = linear_model.LogisticRegression()
clf_logistic.fit(X_train, y_train)
print('Using training set')
score_and_predict(clf_logistic, X_train, y_train, X_adjs.columns, 'logistic')
print('Using testing set')
score_and_predict(clf_logistic, X_test, y_test, X_adjs.columns, 'logistic')
print('Using all data')
score_and_predict(clf_logistic, X_adjs.values, y.values, X_adjs.columns, 'logistic')
Using training set MODEL: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001) SCORE: 0.901003593111 AUC: 0.500625 TOP PREDICTORS (p-value < 0.05): adjective coef p-value 163 other 1.057386 0.000050 26 cant 1.184831 0.000081 73 epic 1.098779 0.000087 250 yellow 1.097571 0.000087 210 spacious 1.052954 0.000087 87 flavorful 1.143622 0.000174 5 affordable 1.044564 0.000261 96 general 1.248696 0.001122 209 solid 1.152807 0.001918 122 irish 0.962232 0.003553 215 steak 0.986887 0.004008 95 garlic 0.938474 0.004226 46 cozy 0.893776 0.006104 115 hot 1.031176 0.006348 227 terrible 1.015316 0.007124 158 olive 1.137463 0.008315 237 unbelievable 1.029219 0.008315 38 clean 1.051376 0.008604 138 long 0.981474 0.009899 84 first 0.957337 0.011209 56 dish 1.086107 0.013767 151 natural 1.067479 0.014504 191 sad 1.002704 0.014504 216 stellar 0.941576 0.014504 140 magic 0.860020 0.014504 119 indian 0.969282 0.020247 157 old 0.971514 0.020831 223 sweet 1.015316 0.023556 118 incredible 1.085239 0.025013 228 terrific 1.048332 0.029040 169 personal 0.903481 0.029040 170 phenomenal 0.886300 0.029040 107 grilled 1.022244 0.031371 239 usual 1.144651 0.033534 81 fat 1.101199 0.033534 162 original 1.086650 0.033534 127 kid 1.086107 0.033534 178 public 1.072186 0.033534 54 different 0.943933 0.033618 82 favorite 1.031589 0.038926 Using testing set MODEL: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001) SCORE: 0.888888888889 AUC: 0.5 TOP PREDICTORS (p-value < 0.05): adjective coef p-value 120 inexpensive 2.360091 0.000000 93 full 1.620282 0.000000 83 few 1.384307 0.000000 14 baked 1.355134 0.000000 56 dish 1.086107 0.000000 35 chinese 0.505807 0.002318 179 quick 0.853679 0.004851 154 next 1.204061 0.006621 130 large 0.860708 0.006621 114 horrible 0.753294 0.006621 124 ive 0.785920 0.009112 113 high 2.387866 
0.012537 118 incredible 1.085239 0.012537 68 eggplant 0.727894 0.012537 146 mediocre 0.431538 0.012537 149 much 1.625800 0.014883 199 short 2.895622 0.015660 47 crazy 1.344874 0.015660 25 busy 0.810098 0.015660 104 greasy 0.628449 0.015660 10 available 0.507987 0.015660 133 later 0.476589 0.015660 163 other 1.057386 0.020173 116 huge 0.606652 0.025169 254 yummy 1.751548 0.040728 23 bubble 1.441234 0.045608 5 affordable 1.044564 0.045608 54 different 0.943933 0.045608 131 last 0.852740 0.045608 51 decent 1.715321 0.049499 Using all data MODEL: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001) SCORE: 0.897974354209 AUC: 0.500454959054 TOP PREDICTORS (p-value < 0.05): adjective coef p-value 230 tiny 0.858216 0.000085 3 addictive 1.555816 0.000490 151 natural 1.067479 0.000490 228 terrific 1.048332 0.000981 238 unique 0.815136 0.000981 169 personal 0.903481 0.002052 54 different 0.943933 0.002205 163 other 1.057386 0.002261 90 fresh 1.188509 0.002581 13 bad 0.933607 0.003905 66 eat 0.951800 0.007615 224 swiss 1.352156 0.007943 29 casual 0.736460 0.007943 9 authentic 1.049171 0.008253 180 quiet 1.439218 0.009322 5 affordable 1.044564 0.009322 40 close 0.794613 0.009322 58 dont 1.234542 0.009461 182 real 1.113268 0.010006 95 garlic 0.938474 0.010360 1 accept 1.302128 0.010862 30 central 1.269979 0.010862 56 dish 1.086107 0.011253 191 sad 1.002704 0.015087 140 magic 0.860020 0.015087 204 slow 0.896999 0.016624 157 old 0.971514 0.018804 79 fantastic 0.846623 0.020690 38 clean 1.051376 0.023304 202 simple 1.174568 0.024510 119 indian 0.969282 0.024510 20 black 1.402000 0.029148 57 dive 0.859590 0.030205 143 many 1.043938 0.032713 130 large 0.860708 0.036375 251 youll 0.745575 0.036375 26 cant 1.184831 0.037648 118 incredible 1.085239 0.038944 110 healthy 1.474177 0.039847 209 solid 1.152807 0.039847 45 cool 1.258222 0.043770 234 turkish 0.737713 0.044770 193 saltfish 0.823576 
0.045706 15 basic 1.494662 0.046227 178 public 1.072186 0.046227 210 spacious 1.052954 0.046227
# Switch the target to grade_C and repeat the Naive Bayes fit; same split seed
# keeps the partition comparable with the grade_A runs.
y = tips_adj_df['grade_C']
X_train, X_test, y_train, y_test = train_test_split(X_adjs, y, random_state=8)
clf_multi_nb = naive_bayes.MultinomialNB()
clf_multi_nb = clf_multi_nb.fit(X_train, y_train)
print('Using training set')
score_and_predict(clf_multi_nb, X_train, y_train, X_adjs.columns, 'naive-bayes')
print('Using testing set')
score_and_predict(clf_multi_nb, X_test, y_test, X_adjs.columns, 'naive-bayes')
print('Using all data')
score_and_predict(clf_multi_nb, X_adjs.values, y.values, X_adjs.columns, 'naive-bayes')
Using training set MODEL: MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True) SCORE: 0.987981662743 AUC: 0.499937304075 TOP PREDICTORS (p-value < 0.05): adjective coef p-value 53 delish 0.004338 0.000169 74 excellent 0.006507 0.000341 52 delicious 0.015184 0.001837 106 green 0.004338 0.003959 249 wrong 0.004338 0.005746 22 breakfast 0.006507 0.006458 0 20500daily 0.002169 0.012036 24 bushy 0.002169 0.012036 117 iconic 0.002169 0.012036 176 priceless 0.002169 0.012036 253 yous 0.002169 0.012036 153 new 0.008677 0.017581 107 grilled 0.004338 0.018048 102 good 0.036876 0.023534 171 pic 0.002169 0.024075 192 salad 0.008677 0.029159 19 big 0.004338 0.034574 67 efficient 0.002169 0.036117 99 ginormous 0.002169 0.036117 196 separate 0.002169 0.036117 203 sized 0.002169 0.036117 221 superior 0.002169 0.036117 84 first 0.004338 0.046109 188 royal 0.002169 0.048163 Using testing set MODEL: MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True) SCORE: 0.990338164251 AUC: 0.5 TOP PREDICTORS (p-value < 0.05): adjective coef p-value 43 complimentary 0.004338 0.009753 190 russian 0.004338 0.009753 44 congested 0.002169 0.009753 48 creative 0.002169 0.009753 67 efficient 0.002169 0.009753 69 empty 0.002169 0.009753 117 iconic 0.002169 0.009753 151 natural 0.002169 0.009753 169 personal 0.002169 0.009753 171 pic 0.002169 0.009753 188 royal 0.002169 0.009753 193 saltfish 0.002169 0.009753 196 separate 0.002169 0.009753 198 several 0.002169 0.009753 236 unbeatable 0.002169 0.009753 243 weak 0.002169 0.009753 161 organic 0.004338 0.019512 210 spacious 0.004338 0.019512 228 terrific 0.004338 0.019512 237 unbelievable 0.004338 0.019512 2 actual 0.002169 0.019512 31 certain 0.002169 0.019512 39 clear 0.002169 0.019512 103 gorgeous 0.002169 0.019512 140 magic 0.002169 0.019512 144 massive 0.002169 0.019512 187 rosemary 0.002169 0.019512 191 sad 0.002169 0.019512 201 similar 0.002169 0.019512 203 sized 0.002169 0.019512 .. ... ... ... 
63 earth 0.002169 0.039054 119 indian 0.002169 0.039054 127 kid 0.002169 0.039054 145 mean 0.002169 0.039054 152 nearby 0.002169 0.039054 174 poor 0.002169 0.039054 207 social 0.002169 0.039054 214 standard 0.002169 0.039054 224 swiss 0.002169 0.039054 155 nice 0.013016 0.041094 64 east 0.004338 0.048836 16 bean 0.002169 0.048836 70 english 0.002169 0.048836 72 entire 0.002169 0.048836 73 epic 0.002169 0.048836 112 helpful 0.002169 0.048836 121 interesting 0.002169 0.048836 129 korean 0.002169 0.048836 134 light 0.002169 0.048836 139 low 0.002169 0.048836 148 modern 0.002169 0.048836 156 normal 0.002169 0.048836 177 private 0.002169 0.048836 186 ridiculous 0.002169 0.048836 197 serious 0.002169 0.048836 200 sicilian 0.002169 0.048836 216 stellar 0.002169 0.048836 218 strong 0.002169 0.048836 235 typical 0.002169 0.048836 244 weird 0.002169 0.048836 [83 rows x 3 columns] Using all data MODEL: MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True) SCORE: 0.988570897603 AUC: 0.499953007519 TOP PREDICTORS (p-value < 0.05): adjective coef p-value 130 large 0.004338 0.003783 251 youll 0.004338 0.003783 84 first 0.004338 0.004120 157 old 0.004338 0.004120 41 cold 0.002169 0.007306 123 italian 0.004338 0.011308 0 20500daily 0.002169 0.011465 24 bushy 0.002169 0.011465 44 congested 0.002169 0.011465 176 priceless 0.002169 0.011465 253 yous 0.002169 0.011465 13 bad 0.006507 0.022223 150 music 0.004338 0.022223 117 iconic 0.002169 0.022932 53 delish 0.004338 0.024571 106 green 0.004338 0.031070 114 horrible 0.004338 0.031512 99 ginormous 0.002169 0.034402 171 pic 0.002169 0.034402 221 superior 0.002169 0.034402 102 good 0.036876 0.036221 192 salad 0.008677 0.036831 204 slow 0.004338 0.045644 67 efficient 0.002169 0.045873 196 separate 0.002169 0.045873 154 next 0.004338 0.048159
# Linear regression against grade_C using the current split.
clf_linear = linear_model.LinearRegression()
clf_linear = clf_linear.fit(X_train, y_train)
print('Using training set')
score_and_predict(clf_linear, X_train, y_train, X_adjs.columns, 'linear')
print('Using testing set')
score_and_predict(clf_linear, X_test, y_test, X_adjs.columns, 'linear')
print('Using all data')
score_and_predict(clf_linear, X_adjs.values, y.values, X_adjs.columns, 'linear')
Using training set MODEL: LinearRegression(copy_X=True, fit_intercept=True, normalize=False) SCORE: 0.0265341264606 AUC: 0.813025731452 TOP PREDICTORS (p-value < 0.05): adjective coef p-value 53 delish 0.0025 0.000169 74 excellent 0.0006 0.000341 52 delicious 0.0004 0.001837 106 green 0.0031 0.003959 249 wrong 0.0009 0.005746 22 breakfast -0.0010 0.006458 117 iconic 0.0080 0.012036 24 bushy -0.0028 0.012036 0 20500daily -0.0091 0.012036 176 priceless -0.0091 0.012036 253 yous -0.0103 0.012036 153 new 0.0012 0.017581 107 grilled 0.0003 0.018048 102 good 0.0016 0.023534 171 pic -0.0208 0.024075 192 salad -0.0021 0.029159 19 big 0.0016 0.034574 203 sized 0.0032 0.036117 196 separate 0.0028 0.036117 221 superior -0.0084 0.036117 67 efficient -0.0124 0.036117 99 ginormous -0.0202 0.036117 84 first 0.0046 0.046109 188 royal -0.0046 0.048163 Using testing set MODEL: LinearRegression(copy_X=True, fit_intercept=True, normalize=False) SCORE: -0.0391889077113 AUC: 0.464410448838 TOP PREDICTORS (p-value < 0.05): adjective coef p-value 190 russian 0.0802 0.009753 43 complimentary 0.0609 0.009753 117 iconic 0.0080 0.009753 196 separate 0.0028 0.009753 44 congested 0.0000 0.009753 198 several -0.0005 0.009753 188 royal -0.0046 0.009753 236 unbeatable -0.0051 0.009753 193 saltfish -0.0058 0.009753 243 weak -0.0068 0.009753 69 empty -0.0101 0.009753 151 natural -0.0121 0.009753 67 efficient -0.0124 0.009753 48 creative -0.0159 0.009753 169 personal -0.0189 0.009753 171 pic -0.0208 0.009753 210 spacious 0.0875 0.019512 237 unbelievable 0.0850 0.019512 161 organic 0.0622 0.019512 228 terrific 0.0401 0.019512 203 sized 0.0032 0.019512 140 magic -0.0048 0.019512 252 young -0.0055 0.019512 31 certain -0.0061 0.019512 103 gorgeous -0.0062 0.019512 229 thin -0.0064 0.019512 2 actual -0.0072 0.019512 191 sad -0.0095 0.019512 39 clear -0.0099 0.019512 239 usual -0.0128 0.019512 .. ... ... ... 
207 social -0.0083 0.039054 30 central -0.0092 0.039054 174 poor -0.0092 0.039054 119 indian -0.0101 0.039054 145 mean -0.0110 0.039054 152 nearby -0.0124 0.039054 28 caribbean -0.0149 0.039054 63 earth -0.0177 0.039054 214 standard -0.0345 0.039054 155 nice 0.0044 0.041094 64 east 0.0230 0.048836 139 low -0.0045 0.048836 72 entire -0.0055 0.048836 218 strong -0.0068 0.048836 148 modern -0.0078 0.048836 235 typical -0.0092 0.048836 70 english -0.0093 0.048836 156 normal -0.0102 0.048836 200 sicilian -0.0102 0.048836 197 serious -0.0103 0.048836 177 private -0.0112 0.048836 134 light -0.0129 0.048836 73 epic -0.0130 0.048836 186 ridiculous -0.0146 0.048836 129 korean -0.0153 0.048836 121 interesting -0.0164 0.048836 244 weird -0.0174 0.048836 112 helpful -0.0176 0.048836 16 bean -0.0188 0.048836 216 stellar -0.0217 0.048836 [83 rows x 3 columns] Using all data MODEL: LinearRegression(copy_X=True, fit_intercept=True, normalize=False) SCORE: 0.0125850437395 AUC: 0.738506486503 TOP PREDICTORS (p-value < 0.05): adjective coef p-value 130 large 0.0080 0.003783 251 youll 0.0069 0.003783 157 old 0.0081 0.004120 84 first 0.0046 0.004120 41 cold -0.0127 0.007306 123 italian 0.0008 0.011308 44 congested 0.0000 0.011465 24 bushy -0.0028 0.011465 0 20500daily -0.0091 0.011465 176 priceless -0.0091 0.011465 253 yous -0.0103 0.011465 13 bad 0.0046 0.022223 150 music -0.0060 0.022223 117 iconic 0.0080 0.022932 53 delish 0.0025 0.024571 106 green 0.0031 0.031070 114 horrible 0.0104 0.031512 221 superior -0.0084 0.034402 99 ginormous -0.0202 0.034402 171 pic -0.0208 0.034402 102 good 0.0016 0.036221 192 salad -0.0021 0.036831 204 slow -0.0001 0.045644 196 separate 0.0028 0.045873 67 efficient -0.0124 0.045873 154 next 0.0127 0.048159
# Logistic regression against grade_C using the current split.
clf_logistic = linear_model.LogisticRegression()
clf_logistic = clf_logistic.fit(X_train, y_train)
print('Using training set')
score_and_predict(clf_logistic, X_train, y_train, X_adjs.columns, 'logistic')
print('Using testing set')
score_and_predict(clf_logistic, X_test, y_test, X_adjs.columns, 'logistic')
print('Using all data')
score_and_predict(clf_logistic, X_adjs.values, y.values, X_adjs.columns, 'logistic')
Using training set MODEL: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001) SCORE: 0.988105563127 AUC: 0.5 TOP PREDICTORS (p-value < 0.05): adjective coef p-value 53 delish 1.041227 0.000169 74 excellent 0.962809 0.000341 52 delicious 0.975505 0.001837 106 green 1.059291 0.003959 249 wrong 0.952657 0.005746 22 breakfast 0.981376 0.006458 24 bushy 0.992131 0.012036 117 iconic 0.990644 0.012036 0 20500daily 0.989060 0.012036 176 priceless 0.989060 0.012036 253 yous 0.988665 0.012036 153 new 1.037486 0.017581 107 grilled 1.029425 0.018048 102 good 1.020814 0.023534 171 pic 0.962424 0.024075 192 salad 0.970543 0.029159 19 big 0.932674 0.034574 203 sized 0.980787 0.036117 196 separate 0.975017 0.036117 221 superior 0.968507 0.036117 67 efficient 0.962713 0.036117 99 ginormous 0.961655 0.036117 84 first 1.126257 0.046109 188 royal 0.962617 0.048163 Using testing set MODEL: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001) SCORE: 0.990338164251 AUC: 0.5 TOP PREDICTORS (p-value < 0.05): adjective coef p-value 43 complimentary 1.939247 0.009753 190 russian 1.936341 0.009753 44 congested 1.000000 0.009753 117 iconic 0.990644 0.009753 196 separate 0.975017 0.009753 67 efficient 0.962713 0.009753 188 royal 0.962617 0.009753 171 pic 0.962424 0.009753 198 several 0.942895 0.009753 193 saltfish 0.939507 0.009753 236 unbeatable 0.934354 0.009753 243 weak 0.933233 0.009753 151 natural 0.912926 0.009753 69 empty 0.881615 0.009753 48 creative 0.845354 0.009753 169 personal 0.800675 0.009753 237 unbelievable 2.080275 0.019512 210 spacious 2.074873 0.019512 161 organic 2.036230 0.019512 228 terrific 1.697913 0.019512 203 sized 0.980787 0.019512 144 massive 0.949424 0.019512 31 certain 0.945917 0.019512 201 similar 0.939037 0.019512 103 gorgeous 0.937536 0.019512 2 actual 0.925334 0.019512 187 rosemary 
0.920351 0.019512 191 sad 0.920351 0.019512 140 magic 0.915395 0.019512 39 clear 0.907012 0.019512 .. ... ... ... 28 caribbean 0.918696 0.039054 127 kid 0.888607 0.039054 63 earth 0.877306 0.039054 174 poor 0.857358 0.039054 152 nearby 0.850952 0.039054 30 central 0.842653 0.039054 145 mean 0.819058 0.039054 214 standard 0.796602 0.039054 119 indian 0.760332 0.039054 155 nice 1.240234 0.041094 64 east 1.565492 0.048836 72 entire 0.947337 0.048836 156 normal 0.939695 0.048836 177 private 0.916769 0.048836 186 ridiculous 0.904204 0.048836 73 epic 0.903662 0.048836 148 modern 0.894849 0.048836 216 stellar 0.888963 0.048836 235 typical 0.882938 0.048836 197 serious 0.873366 0.048836 200 sicilian 0.871534 0.048836 139 low 0.857443 0.048836 244 weird 0.855474 0.048836 129 korean 0.825472 0.048836 121 interesting 0.819468 0.048836 70 english 0.816850 0.048836 218 strong 0.816115 0.048836 112 helpful 0.786707 0.048836 16 bean 0.784036 0.048836 134 light 0.686053 0.048836 [83 rows x 3 columns] Using all data MODEL: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001) SCORE: 0.988663817134 AUC: 0.5 TOP PREDICTORS (p-value < 0.05): adjective coef p-value 130 large 1.219938 0.003783 251 youll 1.153268 0.003783 157 old 1.202016 0.004120 84 first 1.126257 0.004120 41 cold 0.602480 0.007306 123 italian 1.073367 0.011308 44 congested 1.000000 0.011465 24 bushy 0.992131 0.011465 0 20500daily 0.989060 0.011465 176 priceless 0.989060 0.011465 253 yous 0.988665 0.011465 13 bad 1.179039 0.022223 150 music 0.842400 0.022223 117 iconic 0.990644 0.022932 53 delish 1.041227 0.024571 106 green 1.059291 0.031070 114 horrible 1.266301 0.031512 221 superior 0.968507 0.034402 171 pic 0.962424 0.034402 99 ginormous 0.961655 0.034402 102 good 1.020814 0.036221 192 salad 0.970543 0.036831 204 slow 0.879502 0.045644 196 separate 0.975017 0.045873 67 efficient 0.962713 0.045873 154 next 1.290978 0.048159
def _evaluate_tree(clf, features, targets, label):
    """Print accuracy, classification report, and confusion matrix for clf."""
    print(label)
    y_pred = clf.predict(features)
    print("Accuracy:{0:.3f}".format(metrics.accuracy_score(targets, y_pred)), "\n")
    print("Classification report")
    print(metrics.classification_report(targets, y_pred), "\n")
    print("Confusion matrix")
    print(metrics.confusion_matrix(targets, y_pred), "\n")

# Shallow decision tree on the adjective indicators; the depth and leaf-size
# limits guard against overfitting sparse dummy features.
clf_tree = tree.DecisionTreeClassifier(criterion='entropy', max_depth=3, min_samples_leaf=5)
clf_tree = clf_tree.fit(X_train, y_train)
# The three copy-pasted report stanzas are collapsed into one helper; the
# unused `score` locals from the original are dropped.
_evaluate_tree(clf_tree, X_train, y_train, 'Using training set')
_evaluate_tree(clf_tree, X_test, y_test, 'Using testing set')
_evaluate_tree(clf_tree, X_adjs.values, y.values, 'Using all data')
Using training set Accuracy:0.991 Classification report precision recall f1-score support 0.0 0.99 1.00 1.00 1631 1.0 0.00 0.00 0.00 15 avg / total 0.98 0.99 0.99 1646 Confusion matrix [[1631 0] [ 15 0]] Using testing set Accuracy:0.993 Classification report precision recall f1-score support 0.0 0.99 1.00 1.00 545 1.0 0.00 0.00 0.00 4 avg / total 0.99 0.99 0.99 549 Confusion matrix [[545 0] [ 4 0]] Using all data Accuracy:0.991 Classification report precision recall f1-score support 0.0 0.99 1.00 1.00 2176 1.0 0.00 0.00 0.00 19 avg / total 0.98 0.99 0.99 2195 Confusion matrix [[2176 0] [ 19 0]]
# Restrict to the cheapest price tier (tier 1) and re-fit Naive Bayes on the
# grade_C target within that subset. Filter once instead of twice.
cheap_df = tips_adj_df[tips_adj_df['foursquare_price_tier'] == 1]
y = cheap_df['grade_C']
# Adjective columns occupy the same positional slice as in the full frame;
# .iloc replaces the removed .ix indexer.
X_adjs = cheap_df.iloc[:, 34:-1]
X_train, X_test, y_train, y_test = train_test_split(X_adjs, y, random_state=8)
clf_multi_nb = naive_bayes.MultinomialNB()
clf_multi_nb.fit(X_train, y_train)
print('Using training set')
score_and_predict(clf_multi_nb, X_train, y_train, X_adjs.columns, 'naive-bayes')
print('Using testing set')
score_and_predict(clf_multi_nb, X_test, y_test, X_adjs.columns, 'naive-bayes')
print('Using all data')
score_and_predict(clf_multi_nb, X_adjs.values, y.values, X_adjs.columns, 'naive-bayes')
Using training set MODEL: MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True) SCORE: 0.990886998785 AUC: 0.5 TOP PREDICTORS (p-value < 0.05): adjective coef p-value 10 available 0.003484 0.009191 11 average 0.003484 0.009191 24 bushy 0.003484 0.009191 31 certain 0.003484 0.009191 67 efficient 0.003484 0.009191 73 epic 0.003484 0.009191 86 flat 0.003484 0.009191 111 heavy 0.003484 0.009191 126 key 0.003484 0.009191 148 modern 0.003484 0.009191 151 natural 0.003484 0.009191 161 organic 0.003484 0.009191 171 pic 0.003484 0.009191 172 pleasant 0.003484 0.009191 186 ridiculous 0.003484 0.009191 190 russian 0.003484 0.009191 193 saltfish 0.003484 0.009191 200 sicilian 0.003484 0.009191 201 similar 0.003484 0.009191 203 sized 0.003484 0.009191 216 stellar 0.003484 0.009191 234 turkish 0.003484 0.009191 236 unbeatable 0.003484 0.009191 238 unique 0.003484 0.009191 5 affordable 0.003484 0.018394 7 asian 0.003484 0.018394 8 attentive 0.003484 0.018394 15 basic 0.003484 0.018394 30 central 0.003484 0.018394 48 creative 0.003484 0.018394 .. ... ... ... 
224 swiss 0.003484 0.027608 230 tiny 0.003484 0.027608 233 true 0.003484 0.027608 252 young 0.003484 0.027608 102 good 0.010453 0.028718 12 awful 0.003484 0.036833 18 beautiful 0.003484 0.036833 47 crazy 0.003484 0.036833 57 dive 0.003484 0.036833 72 entire 0.003484 0.036833 101 goat 0.003484 0.036833 104 greasy 0.003484 0.036833 113 high 0.003484 0.036833 158 olive 0.003484 0.036833 165 overall 0.003484 0.036833 166 own 0.003484 0.036833 202 simple 0.003484 0.036833 211 spanish 0.003484 0.036833 217 straight 0.003484 0.036833 219 such 0.003484 0.036833 232 traditional 0.003484 0.036833 16 bean 0.003484 0.046069 40 close 0.003484 0.046069 55 dirty 0.003484 0.046069 65 easy 0.003484 0.046069 81 fat 0.003484 0.046069 87 flavorful 0.003484 0.046069 183 reasonable 0.003484 0.046069 235 typical 0.003484 0.046069 244 weird 0.003484 0.046069 [116 rows x 3 columns] Using testing set MODEL: MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True) SCORE: 0.992714025501 AUC: 0.5 TOP PREDICTORS (p-value < 0.05): adjective coef p-value 75 expensive 0.006969 0.007326 77 fabulous 0.006969 0.007326 3 addictive 0.003484 0.007326 5 affordable 0.003484 0.007326 6 american 0.003484 0.007326 7 asian 0.003484 0.007326 8 attentive 0.003484 0.007326 10 available 0.003484 0.007326 11 average 0.003484 0.007326 14 baked 0.003484 0.007326 23 bubble 0.003484 0.007326 29 casual 0.003484 0.007326 31 certain 0.003484 0.007326 46 cozy 0.003484 0.007326 55 dirty 0.003484 0.007326 56 dish 0.003484 0.007326 61 dry 0.003484 0.007326 65 easy 0.003484 0.007326 68 eggplant 0.003484 0.007326 69 empty 0.003484 0.007326 73 epic 0.003484 0.007326 86 flat 0.003484 0.007326 87 flavorful 0.003484 0.007326 89 french 0.003484 0.007326 93 full 0.003484 0.007326 94 funny 0.003484 0.007326 97 generous 0.003484 0.007326 99 ginormous 0.003484 0.007326 101 goat 0.003484 0.007326 112 helpful 0.003484 0.007326 .. ... ... ... 
106 green 0.003484 0.029467 132 late 0.003484 0.029467 152 nearby 0.003484 0.029467 157 old 0.003484 0.029467 181 ready 0.003484 0.029467 184 red 0.003484 0.029467 83 few 0.006969 0.036902 242 want 0.006969 0.036902 12 awful 0.003484 0.036902 20 black 0.003484 0.036902 25 busy 0.003484 0.036902 49 cute 0.003484 0.036902 51 decent 0.003484 0.036902 53 delish 0.003484 0.036902 85 fish 0.003484 0.036902 96 general 0.003484 0.036902 182 real 0.003484 0.036902 215 steak 0.003484 0.036902 13 bad 0.006969 0.044365 38 clean 0.003484 0.044365 74 excellent 0.003484 0.044365 80 fast 0.003484 0.044365 100 give 0.003484 0.044365 108 happy 0.003484 0.044365 116 huge 0.003484 0.044365 168 perfect 0.003484 0.044365 185 regular 0.003484 0.044365 205 small 0.003484 0.044365 209 solid 0.003484 0.044365 251 youll 0.003484 0.044365 [144 rows x 3 columns] Using all data MODEL: MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True) SCORE: 0.991343963554 AUC: 0.5 TOP PREDICTORS (p-value < 0.05): adjective coef p-value 3 addictive 0.003484 0.008728 24 bushy 0.003484 0.008728 29 casual 0.003484 0.008728 67 efficient 0.003484 0.008728 99 ginormous 0.003484 0.008728 148 modern 0.003484 0.008728 161 organic 0.003484 0.008728 164 outstanding 0.003484 0.008728 171 pic 0.003484 0.008728 173 polish 0.003484 0.008728 186 ridiculous 0.003484 0.008728 187 rosemary 0.003484 0.008728 190 russian 0.003484 0.008728 193 saltfish 0.003484 0.008728 200 sicilian 0.003484 0.008728 203 sized 0.003484 0.008728 214 standard 0.003484 0.008728 216 stellar 0.003484 0.008728 238 unique 0.003484 0.008728 10 available 0.003484 0.017463 11 average 0.003484 0.017463 15 basic 0.003484 0.017463 30 central 0.003484 0.017463 31 certain 0.003484 0.017463 48 creative 0.003484 0.017463 73 epic 0.003484 0.017463 78 famous 0.003484 0.017463 86 flat 0.003484 0.017463 121 interesting 0.003484 0.017463 126 key 0.003484 0.017463 .. ... ... ... 
72 entire 0.003484 0.034959 94 funny 0.003484 0.034959 97 generous 0.003484 0.034959 119 indian 0.003484 0.034959 120 inexpensive 0.003484 0.034959 127 kid 0.003484 0.034959 133 later 0.003484 0.034959 139 low 0.003484 0.034959 158 olive 0.003484 0.034959 162 original 0.003484 0.034959 178 public 0.003484 0.034959 180 quiet 0.003484 0.034959 211 spanish 0.003484 0.034959 217 straight 0.003484 0.034959 232 traditional 0.003484 0.034959 243 weak 0.003484 0.034959 252 young 0.003484 0.034959 16 bean 0.003484 0.043719 33 cheesy 0.003484 0.043719 98 giant 0.003484 0.043719 101 goat 0.003484 0.043719 113 high 0.003484 0.043719 118 incredible 0.003484 0.043719 122 irish 0.003484 0.043719 140 magic 0.003484 0.043719 183 reasonable 0.003484 0.043719 195 second 0.003484 0.043719 202 simple 0.003484 0.043719 206 smile 0.003484 0.043719 218 strong 0.003484 0.043719 [102 rows x 3 columns]
# tips_adj_df.to_pickle('./dumps/tips_complete_features.pkl')