#!/usr/bin/env python
# coding: utf-8

# # Collaborative filtering for 'implicit feedback' data
# ** *
# This project consists of building a recommender system to recommend songs according to the history of tracks played by a given user, from a subset of the [MillionSong TasteProfile data](https://labrosa.ee.columbia.edu/millionsong/tasteprofile) that was made into a [Kaggle competition](https://www.kaggle.com/c/msdchallenge), using [Hierarchical Poisson Factorization](https://arxiv.org/abs/1311.1704). Unlike recommendations based on movie ratings, for example, recommending based on listening activity is harder, as playcounts are only an indirect measure of user preference and don't usually signal user dislikes.
#
# HPF (Hierarchical Poisson Factorization) is a probabilistic model that tries to factorize the user-item interaction count matrix as the product of two lower-dimensional matrices, just like the regular factorization methods used with explicit feedback data, but taking this product as the rate parameter of a Poisson random variable (thus optimizing the model likelihood is different from minimizing a sum of squared errors). Additionally, the user-attribute and item-attribute matrices are given Bayesian Gamma priors, making them non-negative and adjusting for each user's activity level and each song's popularity.
#
# Unlike other methods such as BPR (Bayesian Personalized Ranking) or weighted implicit ALS, it only requires iterating over the data for which an interaction was observed, and not over the data for which no interaction was observed (i.e. it doesn't iterate over the songs not played by each user), thus being more scalable while at the same time producing better results.
#
# The implementation here is based on the paper _Gopalan, P., Hofman, J. M., & Blei, D. M. (2013). Scalable recommendation with Poisson factorization. arXiv preprint arXiv:1311.1704_, and is done with PyMC3 without relying on variational inference, so speed is perhaps not as great as with the authors' original coordinate-ascent optimization algorithm.
#
# It's possible to extend this model to make use of song side information, such as artist, tags and genre, but that increases the model complexity and computation time a lot without bringing much of an improvement (as seen in the paper _Gopalan, P. K., Charlin, L., & Blei, D. (2014). Content-based recommendations with Poisson factorization. In Advances in Neural Information Processing Systems (pp. 3176-3184)_).
# ** *
# # Sections
#
# [1. Loading the data](#p1)
#
# [2. Implementing the model](#p2)
#
# [3. Checking some recommendations](#p3)
# ** *
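# For reference, this is the generative process of the HPF model as described in the paper, which the PyMC3 code in section 2 mirrors (all Gamma distributions are parameterized by shape and rate, and $a, a', b', c, c', d'$ are fixed hyperparameters):
#
# $$\xi_u \sim \text{Gamma}(a', a'/b') \qquad \theta_{uk} \sim \text{Gamma}(a, \xi_u)$$
#
# $$\eta_i \sim \text{Gamma}(c', c'/d') \qquad \beta_{ik} \sim \text{Gamma}(c, \eta_i)$$
#
# $$y_{ui} \sim \text{Poisson}(\theta_u^T \beta_i)$$
#
# where $\xi_u$ is the activity level of user $u$, $\eta_i$ is the popularity of song $i$, and the expected playcount is the inner product of the latent user and song factors $\theta_u$ and $\beta_i$.
# ** *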
# ## 1. Loading the data
#
# Reading and reindexing the data:

# In[1]:

import numpy as np, pandas as pd, matplotlib.pyplot as plt, pymc3 as pm, theano

playcounts = list()
user_id_to_int = dict()
user_int_to_id = dict()
song_id_to_int = dict()
song_int_to_id = dict()
cnt_users = 0
cnt_songs = 0

def parse_line(line):
    global playcounts, cnt_users, cnt_songs
    user, song, playcount = line.decode('utf-8').split('\t')
    # assign a sequential integer index to each new user and song
    if user not in user_id_to_int:
        user_id_to_int[user] = cnt_users
        user_int_to_id[cnt_users] = user
        cnt_users += 1
    if song not in song_id_to_int:
        song_id_to_int[song] = cnt_songs
        song_int_to_id[cnt_songs] = song
        cnt_songs += 1
    user = user_id_to_int[user]
    song = song_id_to_int[song]
    playcount = int(playcount.strip())
    playcounts.append((user, song, playcount))

with open('D:\\Downloads\\millionsong\\kaggle_visible_evaluation_triplets.txt', 'rb') as f:
    for line in f:
        parse_line(line)

print(cnt_users)
print(cnt_songs)
print(len(playcounts))

# _Note that an algorithm like implicit-ALS would require constructing a matrix with more than 10^10 entries for this dataset, thus not being feasible on a cheap laptop._

# In[2]:

del user_id_to_int
del user_int_to_id
del song_id_to_int

# The dataset doesn't contain timestamps, so I'll make a random train-test split:

# In[3]:

from sklearn.model_selection import train_test_split

playcounts = pd.DataFrame(playcounts, columns=['UserId', 'SongId', 'Playcount'])
train, test = train_test_split(playcounts, test_size=.2, random_state=1)

# keep only the test entries whose user and song both appear in the training set
users_train = set(train.UserId)
items_train = set(train.SongId)
test = test.loc[test.UserId.isin(users_train)]
test = test.loc[test.SongId.isin(items_train)]
test = test.loc[test.Playcount > 1]
del users_train
del items_train

print(train.shape)
print(test.shape)
train.head()

# ## 2. Implementing the model
#
# PyMC3 implementation. The hyperparameters are the ones suggested in the paper:

# In[4]:

# hyperparameters
a = .3
a_prime = .3
b_prime = 1.0
c = .3
c_prime = .3
d_prime = 1.0

# number of latent factors
k = 30

# In[5]:

np.random.seed(1)
theta_init = np.random.gamma(.3, .3, size=(cnt_users, k))
beta_init = np.random.gamma(.3, .3, size=(cnt_songs, k))

with pm.Model():
    # user activity levels and user latent factors
    user_activity = pm.Gamma('user_activity', a_prime, a_prime/b_prime, shape=(cnt_users, 1))
    user_prior = theano.tensor.tile(user_activity, k, ndim=2)
    theta = pm.Gamma('theta', a, user_prior, shape=(cnt_users, k), testval=theta_init)

    # song popularities and song latent factors
    item_pop = pm.Gamma('item_pop', c_prime, c_prime/d_prime, shape=(cnt_songs, 1))
    item_prior = theano.tensor.tile(item_pop, k, ndim=2)
    beta = pm.Gamma('beta', c, item_prior, shape=(cnt_songs, k), testval=beta_init)

    # Poisson likelihood, with the factors' inner product as the rate parameter
    xhat = theano.tensor.sum(theta[train.UserId] * beta[train.SongId], axis=1)
    R = pm.Poisson('R', mu=xhat, observed=train.Playcount.astype('float32'))

    HPF = pm.find_MAP()

# Now extracting the results:

# In[6]:

theta_star = HPF['theta']
beta_star = HPF['beta'].T
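# As a side note, PyMC3 also offers mean-field ADVI, which would be closer in spirit to the paper's variational inference approach than the MAP point estimate used above. A minimal sketch (left commented out, not run here) of what would replace the `pm.find_MAP()` call inside the model block:

# In[ ]:

# inside the same pm.Model() context as above:
#     approx = pm.fit(n=100000, method='advi')      # fit a mean-field approximation
#     posterior = approx.sample(100)                # draw samples from it
# theta_star = posterior['theta'].mean(axis=0)      # posterior-mean user factors
# beta_star = posterior['beta'].mean(axis=0).T      # posterior-mean song factors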
# ## 3. Checking some recommendations
#
# Now examining the Top-20 recommended songs for some users. The list of song names, artists, and other side information is also made available:

# In[7]:

song_info = pd.read_table('D:\\Downloads\\millionsong\\unique_tracks.txt', sep='<SEP>', engine='python',
                          header=None, names=['TrackId', 'SongId', 'Artist', 'Title'])
del song_info['TrackId']
song_info.head()

# In[8]:

def top_n(user_id, n=20):
    global theta_star, beta_star, song_info
    # sort songs by predicted score in decreasing order
    recommended_list = np.argsort(-theta_star[user_id].dot(beta_star))
    recommended_list = [song_int_to_id[song] for song in recommended_list[:n]]
    recommended_list = pd.DataFrame(pd.Series(recommended_list), columns=['SongId'])
    return pd.merge(recommended_list, song_info, on='SongId', how='left')

top_n(user_id=0)

# In[9]:

top_n(user_id=123)

# In[10]:

top_n(user_id=53000)

# In[11]:

top_n(user_id=100000)

# Unfortunately, for implicit feedback data such as playcounts, there is no intuitive metric like the root mean squared error or the average rating of Top-N recommendations to compare against. There are metrics such as Precision@K and Recall@K, which try to measure how much the Top-K recommended list for each user includes the items in the hold-out set (a sketch is given at the end of this notebook), but these are very slow to calculate and not entirely good metrics either.
#
# Nevertheless, it's possible to see how well the model would rank the songs each user played in the hold-out set and compare that to the number of playcounts observed.
#
# It's also possible to do some common-sense checks, such as comparing the mean predicted score for the songs in the test set against that of randomly selected songs:

# In[12]:

test['Predicted'] = test.apply(lambda x: theta_star[int(x['UserId'])].dot(beta_star[:, int(x['SongId'])]), axis=1)

np.random.seed(1)
test['RandomSongId'] = np.random.randint(cnt_songs, size=test.shape[0])
test['PredictedRandom'] = test.apply(lambda x: theta_star[int(x['UserId'])].dot(beta_star[:, int(x['RandomSongId'])]), axis=1)

print('Average Score for songs in hold-out set: ', test.Predicted.mean())
print('Average Score for random songs: ', test.PredictedRandom.mean())
test.head(10)

# In[13]:

np.corrcoef(test.Predicted, test.Playcount)[0, 1]

# In[14]:

# per-user Kendall rank correlation between observed playcounts and predicted
# scores, restricted to users with more than 2 songs in the hold-out set
users_more_2_songs = test.groupby('UserId')['SongId'].agg(lambda x: len(tuple(x)) > 2)
users_more_2_songs = users_more_2_songs.loc[users_more_2_songs]
users_more_2_songs = set(users_more_2_songs.index)
test_eval = test.loc[test.UserId.isin(users_more_2_songs)]
test_eval = test_eval.groupby('UserId')[['Playcount', 'Predicted']].corr(method='kendall').iloc[0::2]['Predicted']
np.nanmean(test_eval.to_frame().values)

# In[15]:

test_eval.head(15)
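# Finally, a rough sketch of the Recall@K idea mentioned above, evaluated on a small random sample of users to keep it tractable (the `recall_at_k` helper and the sample size of 100 users are illustrative choices, not part of the original pipeline). For each sampled user, it computes the fraction of their hold-out songs that appear among their Top-K recommendations:

# In[ ]:

def recall_at_k(user_ids, top=20):
    # hold-out songs for each user
    test_by_user = test.groupby('UserId')['SongId'].apply(set)
    recalls = list()
    for user in user_ids:
        # score every song for this user and take the Top-K highest-scored ones
        scores = theta_star[user].dot(beta_star)
        top_k = set(np.argsort(-scores)[:top])
        held_out = test_by_user[user]
        recalls.append(len(top_k & held_out) / len(held_out))
    return np.mean(recalls)

np.random.seed(1)
sample_users = np.random.choice(list(set(test.UserId)), size=100, replace=False)
recall_at_k(sample_users, top=20)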