#!/usr/bin/env python
# coding: utf-8

# In[1]:

get_ipython().run_line_magic('pylab', 'inline')


# # Supervised Learning: Yelp Reviews
# * In this project, I'm building machine learning models on Yelp's data (http://www.yelp.com/dataset_challenge) to study features of businesses and make predictions that could help those businesses.
# * This dataset includes:
#     * 1.6M reviews and 500K tips by 366K users for 61K businesses
#     * 481K business attributes, e.g., hours, parking availability, ambience
#     * Social network of 366K users for a total of 2.9M social edges
#     * Aggregated check-ins over time for each of the 61K businesses
# * 5 files:
#     1. **Business**: dataset about business attributes, e.g., hours, parking availability, ambience
#     2. **Review**: text reviews of the businesses
#     3. **User**: information about the users who wrote the reviews
#     4. **Check-in**: check-in records for businesses
#     5. **Tip**: short texts giving tips about businesses

# ## Data Cleaning and Transformation
# This dataset comes in .json format. I used `json-to-csv-convert.py` from https://github.com/Yelp/dataset-examples to convert all the data to .csv files so it's easier to load into dataframes.

# ## Explore Business Features

# In[2]:

import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

biz = pd.read_csv('./data/yelp_data/yelp_academic_dataset_business.csv', low_memory=False)
reviews = pd.read_csv('./data/yelp_data/yelp_academic_dataset_review.csv')
users = pd.read_csv('./data/yelp_data/yelp_academic_dataset_user.csv')
checkin = pd.read_csv('./data/yelp_data/yelp_academic_dataset_checkin.csv')
tip = pd.read_csv('./data/yelp_data/yelp_academic_dataset_tip.csv')
biz.head()


# In[3]:

reviews.head()


# ### Select restaurants from the business dataset

# In[4]:

biz['ifRestaurants'] = biz.categories.apply(lambda x: 1 if 'Restaurants' in x else 0)
restaurants = biz[biz.ifRestaurants == 1]
restaurants.shape


# ### Show distribution of ratings

# In[5]:

sns.set_style('darkgrid')
plt.figure(figsize=(8, 6))
ax = restaurants['stars'].hist(color='g', alpha=0.5, bins=25)
ax.set_xlabel("Stars")


# In[6]:

users.head()


# In[7]:

checkin.head()


# In[8]:

tip.head()


# In[9]:

print biz.shape
print biz.dtypes


# In[10]:

biz.describe()


# In[11]:

cols = ['review_count', 'stars', 'attributes.Price Range']
pd.scatter_matrix(biz[cols], figsize=(12, 12))


# * In the business dataset, each of the feature attributes such as 'Wi-Fi' and 'Good for Kids' is represented as a binary value, and there are a lot of missing values.
# * Stars and price range look like continuous variables, but they are actually discrete because stars are rounded to the nearest half point and price range can only take values from 1 to 4.
# * Most of the businesses in this dataset have stars above 3.5.

# ## Predicting business features based on text reviews
# * When people post reviews on Yelp, the text review is required, but ratings of business features such as 'Good for Kids', 'Good for Groups', 'Has Wifi', etc. are voluntary. So it would be helpful if we could predict some of these features from the text reviews (a quick check of how often the attribute is actually labelled follows below).
# * To simplify the study, I only predict one feature: 'Good for Kids'. But the methodology could be replicated to study other features as well.
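# Before modeling, a quick sanity check (a sketch, not part of the original analysis): how sparsely is the 'Good for Kids' attribute filled in for restaurants, given that attribute ratings are voluntary? This assumes the column is named 'attributes.Good For Kids', the same name used in the modeling cells below.

# In[ ]:

# Fraction of restaurants with no 'Good For Kids' label at all
print restaurants['attributes.Good For Kids'].isnull().mean()
# Label balance among the restaurants that do have one
print restaurants['attributes.Good For Kids'].value_counts()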
# ### Merge reviews and business on business_id

# In[12]:

biz_review = pd.merge(reviews, restaurants, on='business_id')


# In[13]:

biz_review.shape


# In[14]:

## This helper function takes in a string and returns the string with stopwords removed
def remove_stop_words(text):
    from nltk.corpus import stopwords
    stop = set(stopwords.words('english'))
    return ' '.join([w for w in text.split() if not w in stop])


# In[15]:

df = biz_review[['attributes.Good For Kids', 'text', 'votes.useful']]
df = df.dropna()
df.shape


# In[22]:

df.head()


# In[16]:

df['text_sw'] = df.text.apply(remove_stop_words)


# In[111]:

# Generate the feature vector and target variable
# from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# vectorizer = CountVectorizer(ngram_range=(1, 2))
vectorizer = TfidfVectorizer(min_df=2)
x = vectorizer.fit_transform(df.text_sw)
y = df['attributes.Good For Kids'].values.astype(int)


# In[ ]:

# # Convert the sparse matrix back to a normal array to run feature selection
# x_back = x.toarray()
# x_df = pd.DataFrame(x_back, columns=vectorizer.get_feature_names())
# x_columns = list(x_df.columns)

# # Feature selection
# from sklearn import feature_selection as f_select
# significant_features = []
# pvals = []
# for feature in x_columns:
#     pval = f_select.f_classif(x_df[feature], y)
#     if pval[1][0] < 0.05:
#         significant_features.append(feature)
#         pvals.append(pval[1][0])


# ### Create OOS data

# In[112]:

## Create test data
from sklearn import cross_validation
xtrain, xtest, ytrain, ytest = cross_validation.train_test_split(x, y, test_size=0.2, random_state=1234)


# In[113]:

from sklearn import naive_bayes, linear_model
# new_features = x_df[significant_features]
from sklearn.cross_validation import cross_val_score
scores = cross_val_score(naive_bayes.MultinomialNB(), xtrain, ytrain, scoring='roc_auc', cv=5)


# In[114]:

scores.mean()


# In[115]:

plt.figure(figsize=(8, 6))
sns.kdeplot(np.random.normal(loc=np.array(scores).mean(), scale=np.array(scores).std(), size=10000), shade=True)


# In[116]:

from sklearn import metrics
clf = naive_bayes.MultinomialNB().fit(xtrain, ytrain)
print 'fpr', metrics.roc_curve(ytest, clf.predict(xtest))[0][1]  # fpr
print 'tpr', metrics.roc_curve(ytest, clf.predict(xtest))[1][1]  # tpr
print 'precision', metrics.precision_score(ytest, clf.predict(xtest))
print 'accuracy', metrics.accuracy_score(ytest, clf.predict(xtest))

roc = metrics.roc_curve(ytest, clf.predict(xtest))
plt.figure()
plt.plot([0, 0.5, 1], [0, 0.5, 1])
plt.plot(roc[0], roc[1])
plt.xlabel('false positive rate')
plt.ylabel('true positive rate')


# We got an AUC of about 0.86 from cross-validation on the training set, which I think is pretty good.

# ## Can I improve the prediction by taking the useful votes of the review into account?
# * Each review has several vote columns such as votes.funny, votes.useful, and votes.cool, which are vote counts for those categories. So it might be helpful to take the useful votes into consideration; a quick look at these votes follows below, before refitting on the filtered reviews.
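# A quick look (a sketch added for context): how many labelled reviews would survive the `votes.useful > 0` filter used in the next cell, and how the useful votes are distributed. This only relies on the `df` built above; no new columns are assumed.

# In[ ]:

# Count and share of labelled reviews with at least one useful vote
print 'reviews with at least one useful vote:', (df['votes.useful'] > 0).sum()
print 'share of all labelled reviews:', (df['votes.useful'] > 0).mean()
# Summary statistics of the useful-vote counts
print df['votes.useful'].describe()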
# In[117]:

review_votes = df[df['votes.useful'] > 0]
# features = list(review_votes.columns)
# features.remove('attributes.Good For Kids')
x = vectorizer.fit_transform(review_votes.text_sw)
y = review_votes['attributes.Good For Kids'].values.astype(int)
xtrain, xtest, ytrain, ytest = cross_validation.train_test_split(x, y, test_size=0.2, random_state=1234)


# In[118]:

new_score = cross_val_score(naive_bayes.MultinomialNB(), x, y, scoring='roc_auc', cv=5)
print new_score.mean()


# In[119]:

plt.figure(figsize=(8, 6))
sns.kdeplot(np.random.normal(loc=np.array(new_score).mean(), scale=np.array(new_score).std(), size=10000), shade=True)


# In[120]:

clf = naive_bayes.MultinomialNB().fit(xtrain, ytrain)
print 'fpr', metrics.roc_curve(ytest, clf.predict(xtest))[0][1]  # fpr
print 'tpr', metrics.roc_curve(ytest, clf.predict(xtest))[1][1]  # tpr
print 'precision', metrics.precision_score(ytest, clf.predict(xtest))
print 'accuracy', metrics.accuracy_score(ytest, clf.predict(xtest))

roc = metrics.roc_curve(ytest, clf.predict(xtest))
plt.figure()
plt.plot([0, 0.5, 1], [0, 0.5, 1])
plt.plot(roc[0], roc[1])
plt.xlabel('false positive rate')
plt.ylabel('true positive rate')


# ## Predict Stars of the review
# * Predicting a multiclass target is harder, so instead I'm predicting whether a review is a 5-star review.

# In[126]:

stars = biz_review[['stars_x', 'text', 'votes.useful']]
stars = stars.dropna()
stars['if_five'] = stars.stars_x.apply(lambda x: 1 if x == 5 else 0)
stars.shape


# In[39]:

stars.head()


# In[127]:

stars.if_five.mean()


# In[128]:

stars['text_sw'] = stars.text.apply(remove_stop_words)


# In[122]:

x = vectorizer.fit_transform(stars.text_sw)
y = stars.if_five
xtrain, xtest, ytrain, ytest = cross_validation.train_test_split(x, y, test_size=0.2, random_state=1234)
new_score = cross_val_score(naive_bayes.MultinomialNB(), xtrain, ytrain, scoring='roc_auc', cv=5)
print new_score.mean()


# In[123]:

plt.figure(figsize=(8, 6))
ax = sns.kdeplot(np.random.normal(loc=np.array(new_score).mean(), scale=np.array(new_score).std(), size=10000), shade=True, color='g', alpha=0.3)
ax.set_xlabel("AUC SCORE")


# In[130]:

clf = naive_bayes.MultinomialNB().fit(xtrain, ytrain)
print 'fpr', metrics.roc_curve(ytest, clf.predict(xtest))[0][1]  # fpr
print 'tpr', metrics.roc_curve(ytest, clf.predict(xtest))[1][1]  # tpr
print 'precision', metrics.precision_score(ytest, clf.predict(xtest))
print 'accuracy', metrics.accuracy_score(ytest, clf.predict(xtest))

roc = metrics.roc_curve(ytest, clf.predict(xtest))
plt.figure()
plt.plot([0, 0.5, 1], [0, 0.5, 1])
plt.plot(roc[0], roc[1])
plt.xlabel('false positive rate')
plt.ylabel('true positive rate')


# ## Conclusions and Next Steps
# 1. Training only on reviews with useful votes helped very little compared to using all reviews.
# 2. Finding five-star reviews is harder than finding the reviews that are not five-star.
# 3. Next: dig into what makes a review five-star and what insights businesses can draw from it (see the sketch below).
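# One possible way to start on next step 3, sketched here rather than a definitive approach: look at which tokens the last Naive Bayes model weights most heavily toward the five-star class, using the difference of per-class feature log-probabilities. This assumes `clf` and `vectorizer` are still the objects fitted in the five-star cells above; note that newer scikit-learn versions rename `get_feature_names()` to `get_feature_names_out()`.

# In[ ]:

import numpy as np

# Vocabulary of the fitted TF-IDF vectorizer
feature_names = np.asarray(vectorizer.get_feature_names())

# Locate the five-star class in clf.classes_ (y was 0/1, with 1 = five-star)
five_idx = list(clf.classes_).index(1)

# log P(token | five-star) - log P(token | not five-star); higher = more indicative of five stars
log_ratio = clf.feature_log_prob_[five_idx] - clf.feature_log_prob_[1 - five_idx]

print 'tokens most associated with five-star reviews:'
print feature_names[np.argsort(log_ratio)[::-1][:20]]
print 'tokens most associated with non-five-star reviews:'
print feature_names[np.argsort(log_ratio)[:20]]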