%matplotlib inline from collections import defaultdict import json import numpy as np import scipy as sp import matplotlib.pyplot as plt import pandas as pd from matplotlib import rcParams import matplotlib.cm as cm import matplotlib as mpl #colorbrewer2 Dark2 qualitative color table dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667), (0.8509803921568627, 0.37254901960784315, 0.00784313725490196), (0.4588235294117647, 0.4392156862745098, 0.7019607843137254), (0.9058823529411765, 0.1607843137254902, 0.5411764705882353), (0.4, 0.6509803921568628, 0.11764705882352941), (0.9019607843137255, 0.6705882352941176, 0.00784313725490196), (0.6509803921568628, 0.4627450980392157, 0.11372549019607843)] rcParams['figure.figsize'] = (10, 6) rcParams['figure.dpi'] = 150 rcParams['axes.color_cycle'] = dark2_colors rcParams['lines.linewidth'] = 2 rcParams['axes.facecolor'] = 'white' rcParams['font.size'] = 14 rcParams['patch.edgecolor'] = 'white' rcParams['patch.facecolor'] = dark2_colors[0] rcParams['font.family'] = 'StixGeneral' def remove_border(axes=None, top=False, right=False, left=True, bottom=True): """ Minimize chartjunk by stripping out unnecesasry plot borders and axis ticks The top/right/left/bottom keywords toggle whether the corresponding plot border is drawn """ ax = axes or plt.gca() ax.spines['top'].set_visible(top) ax.spines['right'].set_visible(right) ax.spines['left'].set_visible(left) ax.spines['bottom'].set_visible(bottom) #turn off all ticks ax.yaxis.set_ticks_position('none') ax.xaxis.set_ticks_position('none') #now re-enable visibles if top: ax.xaxis.tick_top() if bottom: ax.xaxis.tick_bottom() if left: ax.yaxis.tick_left() if right: ax.yaxis.tick_right() pd.set_option('display.width', 500) pd.set_option('display.max_columns', 100) fulldf=pd.read_csv("bigdf.csv") fulldf.head(2) 'stars': (star rating, integer 1-5), 'date': (date, formatted like '2011-04-19'), 'review_id': (unique id for the review). 'business_id': (a unique identifier for this business), 'biz_name': (the full business name), 'latitude': (latitude), 'longitude': (longitude), 'business_review_count': (review count for the restaurant[this is a repeated field for all reviews of the restaurant]), 'categories': [(localized category names)], 'business_avg': (average stars over all users reviews for business[this is a repeated field for all reviews of the restaurant]). 'user_id': (unique user identifier), 'user_name': (first name, last initial, like 'Matt J.'), 'user_review_count': (count of restaurants reviewed by user[this is a repeated field for all reviews by the user]), 'user_avg': (floating point average of users reviews over all businesses, like 4.31[this is a repeated field for all reviews by the user]). """ grouped_by_userid = fulldf.groupby('user_id') marker = False for name, group in grouped_by_userid: if len(group.business_id.drop_duplicates()) != len(group.business_id): print 'OW MY GAWD' marker = True if marker == False: print 'they were right after all.' """ #your code here fig = plt.figure() # plot histogram for reviews per user ax = fig.add_subplot(2,1,1) num_reviews_per_user = [numrev for numrev in fulldf['user_id'].value_counts()] #print num_reviews_per_user ax.set_title('Frequency of Restaurant Reviews per User') ax.hist(num_reviews_per_user, bins=100, label="reviews per user") # scale review count as logarithmic ax.set_xscale('log') # add ticks, labels, etc ax.set_xlim(0,1000) ax.set_xticks([1,2,3,4,5,10,50,100,max(num_reviews_per_user),500,1000]) ax.set_xticklabels([1,2,3,4,5,10,50,100,max(num_reviews_per_user),500,1000], horizontalalignment='center') ax.set_yscale('linear') maxline = ax.vlines(max(num_reviews_per_user), 1, 10000, colors='red', label='most reviews per user') handles, labels = ax.get_legend_handles_labels() ax.legend(handles, labels) # plot histogram for reviews per restaurant ax = fig.add_subplot(2,1,2) num_reviews_per_joint = [numrev for numrev in fulldf['business_id'].value_counts()] #print num_reviews_per_user ax.set_title('Frequency of Restaurant Reviews per Restaurant') ax.hist(num_reviews_per_joint, bins=100, label="reviews per joint") # scale review count as logarithmic ax.set_xscale('log') # add ticks, labels, etc ax.set_xlim(0,1000) ax.set_xticks([1,2,3,4,5,10,50,100,500,max(num_reviews_per_joint),1000]) ax.set_xticklabels([1,2,3,4,5,10,50,100,500,max(num_reviews_per_joint),1000], horizontalalignment='center') ax.set_yscale('linear') ax.set_ylim(1,2000) maxline = ax.vlines(max(num_reviews_per_joint), 1, 500, colors='red', label='most reviews per joint') handles, labels = ax.get_legend_handles_labels() ax.legend(handles, labels) #your code here if len(fulldf['user_id'].drop_duplicates()) > len(fulldf['business_id'].drop_duplicates()): print 'more users than joints' else: print 'more joints than users' #your code here # not sure which of these you're looking for: print 'Average of all ratings of restaurants:',round(fulldf.stars.mean(),2) means_by_joint = np.mean(fulldf.groupby('business_id')['stars'].mean()) print "Average rating across each restaurant's aggregated rating:",round(means_by_joint,2) def recompute_frame(ldf): """ takes a dataframe ldf, makes a copy of it, and returns the copy with all averages and review counts recomputed this is used when a frame is subsetted. """ ldfu=ldf.groupby('user_id') ldfb=ldf.groupby('business_id') user_avg=ldfu.stars.mean() user_review_count=ldfu.review_id.count() business_avg=ldfb.stars.mean() business_review_count=ldfb.review_id.count() nldf=ldf.copy() nldf.set_index(['business_id'], inplace=True) nldf['business_avg']=business_avg nldf['business_review_count']=business_review_count nldf.reset_index(inplace=True) nldf.set_index(['user_id'], inplace=True) nldf['user_avg']=user_avg nldf['user_review_count']=user_review_count nldf.reset_index(inplace=True) return nldf #your code here # we need to make an actual copy of fulldf here # otherwise straight assignment (eg. x = fulldf) just creates a symbolic referent, and manipulations on x change fulldf! copydf = fulldf.copy() # reduce df by given parameters smalldf = copydf[(copydf.user_review_count > 60) & (copydf.business_review_count > 150)] # recompute frame smalldf = recompute_frame(smalldf) print 'Number unique items:',smalldf.business_id.drop_duplicates().count() print 'Number unique users:',smalldf.user_id.drop_duplicates().count() #your code here fig = plt.figure() # plot histogram for reviews per user ax = fig.add_subplot(2,1,1) num_reviews_per_user = [numrev for numrev in smalldf['user_id'].value_counts()] #print num_reviews_per_user ax.set_title('Frequency of Restaurant Reviews per User') ax.hist(num_reviews_per_user, bins=40, label="reviews per user") # add ticks, labels, etc ax.set_xlim(0,90) maxline = ax.vlines(max(num_reviews_per_user), 0, 5, colors='red', label='most reviews per user') handles, labels = ax.get_legend_handles_labels() ax.legend(handles, labels) # plot histogram for reviews per restaurant ax = fig.add_subplot(2,1,2) num_reviews_per_joint = [numrev for numrev in smalldf['business_id'].value_counts()] #print num_reviews_per_user ax.set_title('Frequency of Restaurant Reviews per Restaurant') ax.hist(num_reviews_per_joint, bins=40, label="reviews per joint") # add ticks, labels, etc ax.set_xlim(0,120) ax.set_ylim(1,20) maxline = ax.vlines(max(num_reviews_per_joint), 0, 5, colors='red', label='most reviews per joint') handles, labels = ax.get_legend_handles_labels() ax.legend(handles, labels) plt.subplots_adjust(top=1.5) plt.subplots_adjust(bottom=.5) print 'Average User Rating' a,b,c = plt.hist(smalldf.user_avg) plt.show() print '\n' print 'Average Business Rating' a,b,c = plt.hist(smalldf.business_avg) plt.show() print '\n\nOverall mean:', round(smalldf.stars.mean(),3),'\n\n' restaurants=smalldf.business_id.unique() supports=[] for i,rest1 in enumerate(restaurants): for j,rest2 in enumerate(restaurants): if i < j: rest1_reviewers = smalldf[smalldf.business_id==rest1].user_id.unique() rest2_reviewers = smalldf[smalldf.business_id==rest2].user_id.unique() common_reviewers = set(rest1_reviewers).intersection(rest2_reviewers) supports.append(len(common_reviewers)) print "Mean support is:",np.mean(supports) plt.hist(supports) from scipy.stats.stats import pearsonr def pearson_sim(rest1_reviews, rest2_reviews, n_common): """ Given a subframe of restaurant 1 reviews and a subframe of restaurant 2 reviews, where the reviewers are those who have reviewed both restaurants, return the pearson correlation coefficient between the user average subtracted ratings. The case for zero common reviewers is handled separately. Its ok to return a NaN if any of the individual variances are 0. """ if n_common==0: rho=0. else: diff1=rest1_reviews['stars']-rest1_reviews['user_avg'] diff2=rest2_reviews['stars']-rest2_reviews['user_avg'] rho=pearsonr(diff1, diff2)[0] return rho def get_restaurant_reviews(restaurant_id, df, set_of_users): """ given a resturant id and a set of reviewers, return the sub-dataframe of their reviews. """ mask = (df.user_id.isin(set_of_users)) & (df.business_id==restaurant_id) reviews = df[mask] reviews = reviews[reviews.user_id.duplicated()==False] return reviews """ Function -------- calculate_similarity Parameters ---------- rest1 : string The id of restaurant 1 rest2 : string The id of restaurant 2 df : DataFrame A dataframe of reviews, such as the smalldf above similarity_func : func A function like pearson_sim above which takes two dataframes of individual restaurant reviews made by a common set of reviewers, and the number of common reviews. This function returns the similarity of the two restaurants based on the common reviews. Returns -------- A tuple The first element of the tuple is the similarity and the second the common support n_common. If the similarity is a NaN, set it to 0 """ #your code here def calculate_similarity(rest1, rest2, df, similarity_func): # adapted from the code we were given above rest1_reviewers = df[df.business_id==rest1].user_id.unique() rest2_reviewers = df[df.business_id==rest2].user_id.unique() common_reviewers = set(rest1_reviewers).intersection(rest2_reviewers) n_common = len(common_reviewers) # this is a list comprehension, but it's only really getting at the two rest_ids passed into the function # for each one, it runs get_restaurant_reviews and stores both outputs together in a list revs = [get_restaurant_reviews(rid, df, common_reviewers) for rid in [rest1,rest2]] # compute similarity sim = similarity_func(revs[0], revs[1], n_common) # check for NaN, if so return 0 similarity = sim if (not np.isnan(sim)) else 0 return (similarity, n_common) # test # print calculate_similarity(smalldf.business_id.ix[0], smalldf.business_id.ix[210], smalldf, pearson_sim) class Database: "A class representing a database of similaries and common supports" def __init__(self, df): "the constructor, takes a reviews dataframe like smalldf as its argument" database={} self.df=df self.uniquebizids={v:k for (k,v) in enumerate(df.business_id.unique())} keys=self.uniquebizids.keys() l_keys=len(keys) self.database_sim=np.zeros([l_keys,l_keys]) self.database_sup=np.zeros([l_keys, l_keys], dtype=np.int) def populate_by_calculating(self, similarity_func): """ a populator for every pair of businesses in df. takes similarity_func like pearson_sim as argument """ items=self.uniquebizids.items() for b1, i1 in items: for b2, i2 in items: if i1 < i2: sim, nsup=calculate_similarity(b1, b2, self.df, similarity_func) self.database_sim[i1][i2]=sim self.database_sim[i2][i1]=sim self.database_sup[i1][i2]=nsup self.database_sup[i2][i1]=nsup elif i1==i2: nsup=self.df[self.df.business_id==b1].user_id.count() self.database_sim[i1][i1]=1. self.database_sup[i1][i1]=nsup def get(self, b1, b2): "returns a tuple of similarity,common_support given two business ids" sim=self.database_sim[self.uniquebizids[b1]][self.uniquebizids[b2]] nsup=self.database_sup[self.uniquebizids[b1]][self.uniquebizids[b2]] return (sim, nsup) db=Database(smalldf) db.populate_by_calculating(pearson_sim) db.get("z3yFuLVrmH-3RJruPEMYKw", "zruUQvFySeXyEd7_rQixBg") def shrunk_sim(sim, n_common, reg=3.): "takes a similarity and shrinks it down by using the regularizer" ssim=(n_common*sim)/(n_common+reg) return ssim """ Function -------- knearest Parameters ---------- restaurant_id : string The id of the restaurant whose nearest neighbors we want set_of_restaurants : array The set of restaurants from which we want to find the nearest neighbors dbase : instance of Database class. A database of similarities, on which the get method can be used to get the similarity of two businessed. e.g. dbase.get(rid1,rid2) k : int the number of nearest neighbors desired, default 7 reg: float the regularization. Returns -------- A sorted list of the top k similar restaurants. The list is a list of tuples (business_id, shrunken similarity, common support). """ #your code here from operator import itemgetter def knearest(restaurant_id, set_of_restaurants, dbase, k, reg): neighbors = [] rid = restaurant_id for joint in set_of_restaurants: # make sure we're not comparing a restaurant against itself if joint != rid: # get similarity and ncommon (sim, ncom) = dbase.get(rid, joint) # shrink via regularization shrunken = shrunk_sim(sim, ncom, reg) # add to nearest neighbors list neighbors.append((joint, shrunken, ncom)) # sort neighbors so that the closest are on top neighbors.sort(key=itemgetter(1), reverse=True) # return k nearest return neighbors[:k] testbizid="eIxSLxzIlfExI6vgAbn2JA" testbizid2="L-uPZxooP_ziXCtRrWi8Pw" def biznamefromid(df, theid): return df['biz_name'][df['business_id']==theid].values[0] def usernamefromid(df, theid): return df['user_name'][df['user_id']==theid].values[0] print testbizid, biznamefromid(smalldf,testbizid) print testbizid2, biznamefromid(smalldf, testbizid2) tops=knearest(testbizid, smalldf.business_id.unique(), db, k=7, reg=3.) print "For ",biznamefromid(smalldf, testbizid), ", top matches are:" for i, (biz_id, sim, nc) in enumerate(tops): print i,biznamefromid(smalldf,biz_id), "| Sim", sim, "| Support",nc tops2=knearest(testbizid2, smalldf.business_id.unique(), db, k=7, reg=3.) print "For ",biznamefromid(smalldf, testbizid2), ", top matches are:" for i, (biz_id, sim, nc) in enumerate(tops2): print i,biznamefromid(smalldf,biz_id), "| Sim", sim, "| Support",nc def get_user_top_choices(user_id, df, numchoices=5): "get the sorted top 5 restaurants for a user by the star rating the user gave them" udf=df[df.user_id==user_id][['business_id','stars']].sort(['stars'], ascending=False).head(numchoices) return udf testuserid="7cR92zkDv4W3kqzii6axvg" print "For user", usernamefromid(smalldf,testuserid), "top choices are:" bizs=get_user_top_choices(testuserid, smalldf)['business_id'].values [biznamefromid(smalldf, biz_id) for biz_id in bizs] """ Function -------- get_top_recos_for_user Parameters ---------- userid : string The id of the user for whom we want the top recommendations df : Dataframe The dataframe of restaurant reviews such as smalldf dbase : instance of Database class. A database of similarities, on which the get method can be used to get the similarity of two businessed. e.g. dbase.get(rid1,rid2) n: int the n top choices of the user by star rating k : int the number of nearest neighbors desired, default 8 reg: float the regularization. Returns -------- A sorted list of the top recommendations. The list is a list of tuples (business_id, business_avg). You are combining the k-nearest recommendations for each of the user's n top choices, removing duplicates and the ones the user has already rated. """ #your code here def get_top_recos_for_user(userid, df, dbase, n, k, reg): # put top n user choices into 'tops' tops = get_user_top_choices(userid, df, n) # included is a set because sets only allow unique instances included = set() neighbors = [] # loop through the list of top choices' ids for joint in tops.business_id: # loop through knearest data for each top user choice for (jid, sim, ncom) in knearest(joint, df.business_id.unique(), dbase, k, reg): # set mask to identify businesses that the user has already rated mask = (df.business_id == jid) & (df.user_id == userid) # if we haven't already seen this biz, AND if user hasn't already rated, then keep it if (jid not in included) & (not df[mask]): included.add(jid) # grab avg rating rating = df[df.business_id == jid].stars.mean() # append id/rating tuple to list of neighbors neighbors.append((jid, rating)) # sort in the usual way # does it matter if we use x.sort() or y = sorted(x...)? final_neighbors = sorted(neighbors, key=itemgetter(1), reverse=True) return final_neighbors print "For user", usernamefromid(smalldf,testuserid), "the top recommendations are:" toprecos=get_top_recos_for_user(testuserid, smalldf, db, n=5, k=7, reg=3.) for biz_id, biz_avg in toprecos: print biznamefromid(smalldf,biz_id), "| Average Rating |", round(biz_avg,2) """ Function -------- knearest_amongst_userrated Parameters ---------- restaurant_id : string The id of the restaurant whose nearest neighbors we want user_id : string The id of the user, in whose reviewed restaurants we want to find the neighbors df: Dataframe The dataframe of reviews such as smalldf dbase : instance of Database class. A database of similarities, on which the get method can be used to get the similarity of two businesses. e.g. dbase.get(rid1,rid2) k : int the number of nearest neighbors desired, default 7 reg: float the regularization. Returns -------- A sorted list of the top k similar restaurants. The list is a list of tuples (business_id, shrunken similarity, common support). """ #your code here def knearest_among_userrated(restaurant_id, user_id, df, dbase, k, reg): # trim df to only instances where our specific user has rated mydf = df[df['user_id'] == user_id] # run knearest on the unique rest_ids in this trimmed df neighbors = knearest(restaurant_id, mydf.business_id.unique(), dbase, k, reg) return neighbors[:k] """ Function -------- rating Parameters ---------- df: Dataframe The dataframe of reviews such as smalldf dbase : instance of Database class. A database of similarities, on which the get method can be used to get the similarity of two businessed. e.g. dbase.get(rid1,rid2) restaurant_id : string The id of the restaurant whose nearest neighbors we want user_id : string The id of the user, in whose reviewed restaurants we want to find the neighbors k : int the number of nearest neighbors desired, default 7 reg: float the regularization. Returns -------- A float which is the imputed rating that we predict that user_id will make for restaurant_id """ #your code here def rating(df, dbase, restaurant_id, user_id, k, reg): rid = restaurant_id # get knearest neighbor nearest = knearest_among_userrated(restaurant_id, user_id, df, dbase, k, reg) # compute formula elements Yavg_U = df[df.user_id==user_id].stars.mean() Yavg_M = df[df.business_id==rid].stars.mean() Yavg = df.stars.mean() Yhat_base_M = Yavg + (Yavg_U - Yavg) + (Yavg_M - Yavg) # this list is for collecting S_mj and Y_diff values across all restaurants in set neighbor_data = [] # loop through all joints in k nearest for (jid, S_Mj, ncom) in nearest: # compute all of the 'j' elements of our formula Yavg_J = df[df.business_id==jid].stars.mean() Yhat_base_J = Yavg + (Yavg_U - Yavg) + (Yavg_J - Yavg) Y_Uj = df[(df.user_id == user_id) & (df.business_id == jid)].stars.iloc[0] # compute Yuj - Yhat_base_J Y_diff = Y_Uj - Yhat_base_J # compute atomic (ie. un-summed) numerator of our neighbor ratio expression atomic_numerator = float(S_Mj) * float(Y_diff) # append similarity and atomic numerator before iterating neighbor_data.append((S_Mj, atomic_numerator)) # from: stackoverflow.com/questions/638048/how-do-i-sum-the-first-value-in-each-tuple-in-a-list-of-tuples-in-python numerator = sum([pair[1] for pair in neighbor_data]) denominator = sum([pair[0] for pair in neighbor_data]) if denominator == 0: return Yhat_base_M else: neighbor_factor = float(numerator) / float(denominator) Yhat_M = Yhat_base_M + neighbor_factor return Yhat_M print "User Average", round(smalldf[smalldf.user_id==testuserid].stars.mean(),3),"for",usernamefromid(smalldf,testuserid) print "Predicted ratings for top choices calculated earlier:" for biz_id,biz_avg in toprecos: print biznamefromid(smalldf, biz_id),"|",round(rating(smalldf, db, biz_id, testuserid, k=7, reg=3.),2),"|","Average",round(biz_avg,2) def get_other_ratings(restaurant_id, user_id, df): "get a user's rating for a restaurant and the restaurant's average rating" choice=df[(df.business_id==restaurant_id) & (df.user_id==user_id)] users_score=choice.stars.values[0] average_score=choice.business_avg.values[0] return users_score, average_score print "for user",usernamefromid(smalldf,testuserid), 'avg', smalldf[smalldf.user_id==testuserid].stars.mean() for biz_id in bizs: print "----------------------------------" print biznamefromid(smalldf, biz_id) print "Predicted Rating:",rating(smalldf, db, biz_id, testuserid, k=7, reg=3.) u,a=get_other_ratings(biz_id, testuserid, smalldf) print "Actual User Rating:",u,"Avg Rating",a def compare_results(stars_actual, stars_predicted, ylow=-10, yhigh=15, title=""): """ plot predicted results against actual results. Takes 2 arguments: a numpy array of actual ratings and a numpy array of predicted ratings scatterplots the predictions, a unit slope line, line segments joining the mean, and a filled in area of the standard deviations." """ fig=plt.figure() df=pd.DataFrame(dict(actual=stars_actual, predicted=stars_predicted)) ax=plt.scatter(df.actual, df.predicted, alpha=0.2, s=30, label="predicted") plt.ylim([ylow,yhigh]) plt.plot([1,5],[1,5], label="slope 1") xp=[1,2,3,4,5] yp=df.groupby('actual').predicted.mean().values plt.plot(xp,yp,'k', label="means") sig=df.groupby('actual').predicted.std().values plt.fill_between(xp, yp - sig, yp + sig, color='k', alpha=0.2) plt.xlabel("actual") plt.ylabel("predicted") plt.legend(frameon=False) remove_border() plt.grid(False) plt.title(title) print np.mean(np.abs(df.predicted) < 15) #your code here predictdf = smalldf.copy() # i tried a number of different implementations here, as speed was certainly an issue # in the end i stuck with brian's numpy vectorization solution from Piazza @1133, # but from what i can tell there aren't any solutions that are far better than all the (viable) others. # vrating = np.vectorize(rating, excluded=['df','dbase','k','reg'], otypes=[np.float]) pred_3_3 = vrating(df=smalldf, dbase=db, restaurant_id=smalldf['business_id'], user_id=smalldf['user_id'], k=3, reg=3) pred_3_15 = vrating(df=smalldf, dbase=db, restaurant_id=smalldf['business_id'], user_id=smalldf['user_id'], k=3, reg=15) pred_10_3 = vrating(df=smalldf, dbase=db, restaurant_id=smalldf['business_id'], user_id=smalldf['user_id'], k=10, reg=3) pred_10_15 = vrating(df=predictdf, dbase=db, restaurant_id=predictdf['business_id'], user_id=predictdf['user_id'], k=10, reg=15) predictdf['predict_3_3'] = pred_3_3 predictdf['predict_3_15'] = pred_3_15 predictdf['predict_10_3'] = pred_10_3 predictdf['predict_10_15'] = pred_10_15 #predictdf.to_csv('predictdf.csv', index=False) #predictdf = pd.read_csv('predictdf.csv') actual = predictdf['stars'] compare_results(actual, predictdf['predict_3_3'], title='k=3 and reg=3') compare_results(actual, predictdf['predict_3_15'], title='k=3 and reg=15') compare_results(actual, predictdf['predict_10_3'], title='k=10 and reg=3') compare_results(actual, predictdf['predict_10_15'], title='k=10 and reg=15') print 'user count:',smalldf.user_id.drop_duplicates().count() print 'item count:',smalldf.business_id.drop_duplicates().count() """ Function -------- gamma_m_draw Draw a single sample from the conditional posterior distribution of gamma_m. Inputs ------- X_m: A g-by-L+1 matrix, defined above. Y_m: A 1D vector of length g, defined above. sig2: Residual _variance_, as defined above. Lambda_gamma: Prior precision matrix. Outputs -------- Single draw from conditional posterior, defined above. """ #Item-specific parameters given all else #your code here def gamma_m_draw(X_m, Y_m, sig2, Lambda_gamma): Q = (1/sig2)*X_m.T.dot(X_m) + Lambda_gamma mvn_mean = (1/sig2)*np.linalg.inv(Q).dot(X_m.T).dot(Y_m) mvn_cov = np.linalg.inv(Q) mvn = np.random.multivariate_normal(mvn_mean, mvn_cov) return mvn """ Function -------- theta_u_draw Draw a single sample from the conditional posterior distribution of gamma_m. Inputs ------- X_u: A g-by-L+1 matrix, defined above. Y_u: A 1D vector of length g, defined above. sig2: Residual _variance_, as defined above. Lambda_theta: Prior precision matrix. Outputs -------- Single draw from conditional posterior, defined above. """ #User-specific parameters given all else #your code here def theta_u_draw(X_u, Y_u, sig2, Lambda_theta): Q = (1/sig2)*X_u.T.dot(X_u) + Lambda_theta mvn_mean = (1/sig2)*np.linalg.inv(Q).dot(X_u.T).dot(Y_u) mvn_cov = np.linalg.inv(Q) mvn = np.random.multivariate_normal(mvn_mean, mvn_cov) return mvn """ Function -------- factor_gibbs Runs a gibbs sampler to infer mean, variance, user-specific, and item-specific parameters. Inputs ------- data: A dataframe containing ratings data. L: Dimension of latent factors. maxit: Number of samples to draw from posterior. Lambda_theta_diag: Hyperparameter controlling regularization of Theta. Lambda_gamma_diag: Hyperparameter controlling regularization of Gamma. progress: if true, print iteration number every 100 iterations. Outputs -------- Dictionary with elements mu: Draws of mu. 1D array of length maxiter. sig2: Draws of sig2, residual _variance_. 1D array of length maxiter. theta: Draws of Theta. U-by-L-by-maxiter array. gamma: Draws of Gamma. M-by-L-by-maxiter array. EY: Draws of fitted values of Y. N-by-maxiter array. """ def factor_gibbs(data, L, maxit, Lambda_theta_diag, Lambda_gamma_diag, progress=True): data = data.copy() N = data.shape[0] #Create indices that allow us to map users and restaurants to rows #in parameter vectors. uusers, uidx = np.unique(data.user_id, return_inverse=True) uitems, midx = np.unique(data.business_id, return_inverse=True) nusers = uusers.size nitems = uitems.size #Add numerical indices to dataframe. data["uidx"] = uidx data["midx"] = midx #Group observations by user and by business. ugroups = data.groupby("uidx") mgroups = data.groupby("midx") all_avg = data.stars.mean() u_avg = ugroups.stars.mean() m_avg = mgroups.stars.mean() #Initialize parameters and set up data structures for #holding draws. #Overall mean mu = all_avg mu_draws = np.zeros(maxit) #Residual variance sig2 = 0.5 sig2_draws = np.zeros(maxit) #Matrix of user-specific bias and L latent factors. theta = np.zeros([nusers, L+1]) theta[:,0] = u_avg-all_avg theta_draws = np.zeros([nusers, L+1, maxit]) #Matrix of item-specific bias and L latent factors. gamma = np.zeros([nitems, L+1]) gamma[:,0] = m_avg-all_avg gamma_draws = np.zeros([nitems, L+1, maxit]) #Matrix for holding the expected number of stars #for each observation at each draw from the posterior. EY_draws = np.zeros([data.shape[0], maxit]) #Inverse covariance matrices from the prior on each theta_u #and gamma_b. These are diagonal, like Ridge regression. Lambda_theta = np.eye(L+1)*Lambda_theta_diag Lambda_gamma = np.eye(L+1)*Lambda_gamma_diag #Main sampler code for i in range(maxit): if i%100==0 and progress: print i #The entire regression equation except for the overall mean. nomu = np.sum(theta[data.uidx,1:]*gamma[data.midx,1:], axis=1) +\ theta[data.uidx,0] + gamma[data.midx,0] #Compute the expectation of each observation given the current #parameter values. EY_draws[:,i]=mu+nomu #Draw overall mean from a normal distribution mu = np.random.normal(np.mean(data.stars-nomu), np.sqrt(sig2/N)) #Draw overall residual variance from a scaled inverse-Chi squared distribution. sig2 = np.sum(np.power(data.stars-nomu-mu,2))/np.random.chisquare(N-2) #For each item for mi,itemdf in mgroups: #Gather relevant observations, and subtract out overall mean and #user-specific biases, which we are holding fixed. Y_m = itemdf.stars-mu-theta[itemdf.uidx,0] #Build the regression design matrix implied by holding user factors #fixed. X_m = np.hstack((np.ones([itemdf.shape[0],1]), theta[itemdf.uidx,1:])) gamma[mi,:] = gamma_m_draw(X_m, Y_m, sig2, Lambda_gamma) #For each user for ui,userdf in ugroups: #Gather relevant observations, and subtract out overall mean and #business-specific biases, which we are holding fixed. Y_u = userdf.stars-mu-gamma[userdf.midx,0] #Build the regression design matrix implied by holding business factors #fixed. X_u = np.hstack((np.ones([userdf.shape[0],1]), gamma[userdf.midx,1:])) theta[ui,:] = theta_u_draw(X_u, Y_u, sig2, Lambda_theta) #Record draws mu_draws[i] = mu sig2_draws[i] = sig2 theta_draws[:,:,i] = theta gamma_draws[:,:,i] = gamma return {"mu": mu_draws, "sig2": sig2_draws, "theta": theta_draws, "gamma": gamma_draws, "EY": EY_draws} #your code here gibbs = factor_gibbs(predictdf, 2, 1000, 0.1, 0.1, progress=True) prediction = gibbs['EY'] gibbs_means = np.mean(prediction, axis=1, keepdims=False) #your code here compare_results(predictdf.stars, gibbs_means, title='Gibbs 2-Factor Model Ratings Predictions') subsetoffull=fulldf[['user_id','business_id', 'stars','business_avg','user_avg']] subsetoffull.to_csv("subset-full.csv", index=False, header=False) subsetofsmall=smalldf[['user_id','business_id', 'stars','business_avg','user_avg']] subsetofsmall.to_csv("subset-small.csv", index=False, header=False) #from pygments import highlight #from pygments.lexers import PythonLexer #from pygments.formatters import HtmlFormatter #from IPython.display import HTML #import urllib #skelcode = urllib.urlopen("https://raw.github.com/cs109/content/master/skeleton.py").read() #skelhtml=highlight(skelcode, PythonLexer(), HtmlFormatter()) #HTML(skelhtml) def upper_generator(words): for word in words: yield word.upper() words = ['a', 'couple', 'of', 'words', 'to', 'process'] print upper_generator(words) print list(upper_generator(words)) for u in upper_generator(words): print u thecode = open("computesim.py").read() thehtml=highlight(thecode, PythonLexer(), HtmlFormatter()) HTML(thehtml) output_small_local=[[json.loads(j) for j in line.strip().split("\t")] for line in open("./output.small.local.txt")] output_small_local[0] def make_database_from_pairs(df, bizpairs): """ make the database from the pairs returned from mrjob. df is the dataframe, smalldf or fulldf. bizpairs are a list of elements, each of which is a list of two lists. The first of these lists has the two business id's, while the second has the similarity and the common support Returns an instance of the Database class. """ dbase=Database(df) cache={} for bp,corrs in bizpairs: b1,b2=bp i1=dbase.uniquebizids[b1] i2=dbase.uniquebizids[b2] sim,nsup=corrs dbase.database_sim[i1][i2]=sim dbase.database_sim[i2][i1]=sim dbase.database_sup[i1][i2]=nsup dbase.database_sup[i2][i1]=nsup if cache.has_key(b1): nsup1=cache[b1] else: nsup1=dbase.df[dbase.df.business_id==b1].user_id.count() cache[b1]=nsup1 if cache.has_key(b2): nsup2=cache[b2] else: nsup2=dbase.df[dbase.df.business_id==b2].user_id.count() cache[b2]=nsup2 dbase.database_sim[i1][i1]=1.0 dbase.database_sim[i2][i2]=1.0 dbase.database_sup[i1][i1]=nsup1 dbase.database_sup[i2][i2]=nsup2 return dbase db_mrjob_local=make_database_from_pairs(smalldf, output_small_local) print db.get("zruUQvFySeXyEd7_rQixBg", "z3yFuLVrmH-3RJruPEMYKw") print db_mrjob_local.get("zruUQvFySeXyEd7_rQixBg", "z3yFuLVrmH-3RJruPEMYKw") sums=0. count=0 for k in db.uniquebizids.keys(): for k2 in db.uniquebizids.keys(): count=count+1 sums=sums+db.get(k,k2)[0]-db_mrjob_local.get(k,k2)[0] print sums, count output_full_emr=[[json.loads(j) for j in l.strip().split("\t")] for l in open("./output.full.emr.txt")] dbfull=make_database_from_pairs(fulldf, output_full_emr) #your code here print "for user",usernamefromid(fulldf,testuserid), 'avg', fulldf[fulldf.user_id==testuserid].stars.mean() for biz_id in bizs: print "----------------------------------" print biznamefromid(fulldf, biz_id) print "Predicted Rating:",rating(fulldf, dbfull, biz_id, testuserid, k=7, reg=3.) u,a=get_other_ratings(biz_id, testuserid, fulldf) print "Actual User Rating:",u,"Avg Rating",a