from IPython.core.display import Image Image(filename='./recsys_arch.png') import pandas as pd unames = ['user_id', 'gender', 'age', 'occupation', 'zip'] users = pd.read_table('../data/ml-1m/users.dat', sep='::', header=None, names=unames) rnames = ['user_id', 'movie_id', 'rating', 'timestamp'] ratings = pd.read_table('../data/ml-1m/ratings.dat', sep='::', header=None, names=rnames) mnames = ['movie_id', 'title', 'genres'] movies = pd.read_table('../data/ml-1m/movies.dat', sep='::', header=None, names=mnames) # show how one of them looks ratings.head(5) # show how one of them looks users[:5] movies[:5] movielens = pd.merge(pd.merge(ratings, users), movies) movielens movielens.ix[0] mean_ratings = movielens.pivot_table('rating', rows='title', cols='gender', aggfunc='mean') mean_ratings[:5] ratings_by_title = movielens.groupby('title').size() ratings_by_title[:10] active_titles = ratings_by_title.index[ratings_by_title >= 250] active_titles[:10] mean_ratings = mean_ratings.ix[active_titles] mean_ratings top_female_ratings = mean_ratings.sort_index(by='F', ascending=False) top_female_ratings[:10] mean_ratings['diff'] = mean_ratings['M'] - mean_ratings['F'] sorted_by_diff = mean_ratings.sort_index(by='diff') sorted_by_diff[:15] # Reverser order of rows, take first 15 rows sorted_by_diff[::-1][:15] # Standard deviation of rating grouped by title rating_std_by_title = movielens.groupby('title')['rating'].std() # Filter down to active_titles rating_std_by_title = rating_std_by_title.ix[active_titles] # Order Series by value in descending order rating_std_by_title.order(ascending=False)[:10] # let's work with a smaller subset for speed reasons import numpy as np movielens = movielens.ix[np.random.choice(movielens.index, size=10000, replace=False)] print movielens.shape print movielens.user_id.nunique() print movielens.movie_id.nunique() user_ids_larger_1 = pd.value_counts(movielens.user_id, sort=False) > 1 user_ids_larger_1 movielens = movielens[user_ids_larger_1[movielens.user_id].values] print movielens.shape np.all(movielens.user_id.value_counts() > 1) import numpy as np def assign_to_set(df): sampled_ids = np.random.choice(df.index, size=np.int64(np.ceil(df.index.size * 0.2)), replace=False) df.ix[sampled_ids, 'for_testing'] = True return df movielens['for_testing'] = False grouped = movielens.groupby('user_id', group_keys=False).apply(assign_to_set) movielens_train = movielens[grouped.for_testing == False] movielens_test = movielens[grouped.for_testing == True] print movielens_train.shape print movielens_test.shape print movielens_train.index & movielens_test.index movielens_train.to_csv('../data/movielens_train.csv') movielens_test.to_csv('../data/movielens_test.csv') def compute_rmse(y_pred, y_true): """ Compute Root Mean Squared Error. """ return np.sqrt(np.mean(np.power(y_pred - y_true, 2))) def evaluate(estimate_f): """ RMSE-based predictive performance evaluation with pandas. """ ids_to_estimate = zip(movielens_test.user_id, movielens_test.movie_id) estimated = np.array([estimate_f(u,i) for (u,i) in ids_to_estimate]) real = movielens_test.rating.values return compute_rmse(estimated, real) def estimate1(user_id, item_id): """ Simple content-filtering based on mean ratings. """ return movielens_train.ix[movielens_train.user_id == user_id, 'rating'].mean() print 'RMSE for estimate1: %s' % evaluate(estimate1) def estimate2(user_id, movie_id): """ Simple collaborative filter based on mean ratings. """ ratings_by_others = movielens_train[movielens_train.movie_id == movie_id] if ratings_by_others.empty: return 3.0 return ratings_by_others.rating.mean() print 'RMSE for estimate2: %s' % evaluate(estimate2) # transform the ratings frame into a ratings matrix ratings_mtx_df = movielens_train.pivot_table(values='rating', rows='user_id', cols='movie_id') ratings_mtx_df # with an integer axis index only label-based indexing is possible ratings_mtx_df.ix[ratings_mtx_df.index[-15:],ratings_mtx_df.columns[:15]] import numpy as np; import pandas as pd; from pandas import Series, DataFrame rating = pd.read_csv('../data/movie_rating.csv') rp = rating.pivot_table(cols=['critic'],rows=['title'],values='rating') rp rating_toby = rp['Toby'] sim_toby = rp.corrwith(rating_toby) sim_toby criteria = ((rating_toby[rating.title].isnull()) & (rating.critic != 'Toby')).values rating_c = rating[criteria] rating_c['similarity'] = rating_c['critic'].map(sim_toby.get) rating_c['sim_rating'] = rating_c.similarity * rating_c.rating rating_c recommendation = rating_c.groupby('title').apply(lambda s: s.sim_rating.sum() / s.similarity.sum()) recommendation.order(ascending=False) def pearson(s1, s2): """Take two pd.Series objects and return a pearson correlation.""" s1_c = s1 - s1.mean() s2_c = s2 - s2.mean() return np.sum(s1_c * s2_c) / np.sqrt(np.sum(s1_c ** 2) * np.sum(s2_c ** 2)) class CollabFiltering: """ Collaborative filtering using a custom sim(u,u'). """ def learn(self): """ Prepare datastructures for estimation. """ self.all_user_profiles = movielens.pivot_table('rating', rows='movie_id', cols='user_id') def estimate(self, user_id, movie_id): """ Ratings weighted by correlation similarity. """ ratings_by_others = movielens_train[movielens_train.movie_id == movie_id] if ratings_by_others.empty: return 3.0 ratings_by_others.set_index('user_id', inplace=True) their_ids = ratings_by_others.index their_ratings = ratings_by_others.rating their_profiles = self.all_user_profiles[their_ids] user_profile = self.all_user_profiles[user_id] sims = their_profiles.apply(lambda profile: pearson(profile, user_profile), axis=0) ratings_sims = pd.DataFrame({'sim': sims, 'rating': their_ratings}) ratings_sims = ratings_sims[ ratings_sims.sim > 0] if ratings_sims.empty: return their_ratings.mean() else: return np.average(ratings_sims.rating, weights=ratings_sims.sim) reco = CollabFiltering() reco.learn() print 'RMSE for CollabFiltering: %s' % evaluate(reco.estimate)