#!/usr/bin/env python
# coding: utf-8

# # Combining recommended lists
# ***
# This IPython notebook combines the Top-N recommended items from different recommender methodologies (here, one list each from collaborative filtering, content-based, and most-popular) for a given user using interleaved ranking, in order to obtain a final recommended list.
#
# A simple approach to combining recommendations from different sources is to add or multiply the scores that each item gets for a given user under each algorithm, but this might not change the recommendations much if the scores are on dissimilar scales, and it is not applicable at all if they only come in the form of a ranking. Interleaved ranking – originally an algorithm for mixing search engine results – offers a way to force the final recommended list to be more "mixed", by guaranteeing that it contains elements from each source list.
#
# There are different algorithms for producing an interleaved ranked list – here I'll use the simplest one, team drafting (intuitively, like picking soccer teams): the recommended lists take turns contributing items to the final list, each trying to add its top-ranked item while skipping items that were already put in the final list by another recommended list. A toy sketch of this drafting step follows below; the full implementation is in section 6.
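# As a quick toy illustration of the drafting step (made-up item IDs, not the MovieLens data):

# In[ ]:

lists = [["a", "b", "c"], ["b", "d", "e"], ["a", "f", "g"]]  # three hypothetical Top-3 lists
combined = []
while len(combined) < 6:
    for lst in lists:                  # each list takes a turn per round
        for item in lst:               # it contributes its highest-ranked item...
            if item not in combined:   # ...that wasn't already taken by another list
                combined.append(item)
                break
print(combined)  # ['a', 'b', 'f', 'c', 'd', 'g'] - every source is represented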
# Here I'll produce three different recommended lists of 20 items each, using the [MovieLens 1M dataset](https://grouplens.org/datasets/movielens/1m/), for user number 100 (userId = 100), as follows:
# * Most-popular: each item's score is the sum of the ratings it gets across all users, thus favoring both highly rated and frequently rated movies. This is a non-personalized list (i.e. it's the same for all users).
# * Collaborative filtering: a low-rank matrix factorization of the ratings matrix, fit with alternating least squares.
# * Content-based: a regression of the (centered) ratings against the outer product of user and movie features – this is a more involved process and the details can be found [in this other IPython notebook](http://nbviewer.ipython.org/github/david-cortes/datascienceprojects/blob/master/machine_learning/recommender_system_w_coldstart.ipynb).
#
# ***
# ## Sections
#
# [1. Loading the data](#p1)
#
# [2. Producing a Most-Popular recommended list](#p2)
#
# [3. Producing a Collaborative Filtering recommended list](#p3)
#
# [4. Producing a Content-Based recommended list](#p4)
#
# [5. Examining the recommendations](#p5)
#
# [6. Combining recommended lists](#p6)
# ***

# ## 1. Loading the data
#
# Initializing Spark locally (it will be used for most of the computations) and loading the necessary libraries:

# In[1]:

import numpy as np, pandas as pd, re, findspark
from collections import defaultdict
from sklearn.decomposition import PCA
from scipy.sparse import csc_matrix

findspark.init("/home/david/Downloads/spark-2.1.1-bin-hadoop2.7/")
import pyspark
sc = pyspark.SparkContext()
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)
from pyspark.mllib.regression import LabeledPoint
from pyspark.ml.regression import LinearRegression
from pyspark.ml.recommendation import ALS

# Loading the MovieLens-1M ratings:

# In[2]:

ratings = pd.read_table("/home/david/movielens/ml-1m/ml-1m/ratings.dat", sep="::",
                        names=["userId", "movieId", "Rating", "Timestamp"], engine='python')
ratings.head()

# Loading the movieId-to-title mapping - it will be used later to examine the recommended lists:

# In[3]:

movie_titles = pd.read_csv('/home/david/movielens/ml-1m/ml-1m/movies.dat', sep="::",
                           names=['movieId', 'MovieTitle', 'genres'], engine='python')
movie_titles = {i.movieId: i.MovieTitle for i in movie_titles.itertuples()}

# ## 2. Producing a Most-Popular recommended list
#
# Items are ranked by the sum of their ratings (average rating times number of ratings):

# In[4]:

user = 100
movies_watched_by_user = set(ratings.movieId.loc[ratings.userId == user])
avg_ratings = ratings.groupby('movieId')['Rating'].mean().to_frame().rename(columns={'Rating': 'AvgRating'})
num_ratings = ratings.groupby('movieId')['Rating'].count().to_frame().rename(columns={'Rating': 'NumRatings'})
pop_rec = num_ratings.join(avg_ratings)
# exclude the movies this user has already watched
pop_rec = pop_rec.loc[~pop_rec.index.isin(movies_watched_by_user)]
pop_rec['score'] = pop_rec.NumRatings * pop_rec.AvgRating
pop_rec = pop_rec.sort_values('score', ascending=False)
pop20 = list(pop_rec.index[:20])
pop_rec['Title'] = pop_rec.index.map(lambda x: movie_titles[x])
pop_rec.head()

# ## 3. Producing a Collaborative Filtering recommended list
#
# Here I'm using ALS from PySpark to factorize the ratings matrix:

# In[5]:

ratings_df = sqlContext.createDataFrame(ratings)
cfmodel = ALS(rank=50, regParam=0.5, userCol="userId", itemCol="movieId", ratingCol="Rating").fit(ratings_df)

# score every movie the user hasn't watched yet
movies_available = set(ratings.movieId)
movies_available = movies_available.difference(movies_watched_by_user)
preds = pd.DataFrame([(user, m) for m in movies_available], columns=['userId', 'movieId'])
preds_df = sqlContext.createDataFrame(preds)
preds_scores = cfmodel.transform(preds_df).collect()
preds_scores = pd.DataFrame(preds_scores, columns=['userId', 'movieId', 'score_cf'])
preds_scores = preds_scores.sort_values('score_cf', ascending=False)
cf20 = list(preds_scores.movieId.iloc[:20])
preds_scores['Title'] = preds_scores.movieId.map(lambda x: movie_titles[x])
preds_scores.head()

# ## 4. Producing a Content-Based recommended list
#
# The overall idea is to build user features - demographic info, including a geographical region derived from each user's zip code with the help of some free zip code databases - and movie features, taking the movie tags from the latest MovieLens release, matching them by title to the MovieLens-1M movies, and adding the movie genres and the release year as a discretized category.
#
# Then, a regression is fit of the centered ratings against the outer product of the user and movie features (see the small sketch below) - a more detailed and explained version can be found [here](http://nbviewer.ipython.org/github/david-cortes/datascienceprojects/blob/master/machine_learning/recommender_system_w_coldstart.ipynb).
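# To make the "outer product of features" concrete before diving in (tiny made-up
# vectors, not the real features built below): every pair of one user feature and
# one movie feature becomes its own regression input, so the model can pick up
# interactions such as a given region rating a given genre higher.

# In[ ]:

example_user = np.array([1.0, 0.0])        # hypothetical user features
example_movie = np.array([0.5, 1.0, 0.0])  # hypothetical movie features
np.kron(example_user, example_movie)       # -> array([0.5, 1. , 0. , 0. , 0. , 0. ])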
# In[6]:

movies = pd.read_csv('/home/david/movielens/ml-latest/ml-latest/movies.csv')
movies_humanreadable = movies.copy()

# extract the release year from titles such as "Toy Story (1995)"
movies['hasYear'] = movies.title.map(lambda x: bool(re.search(r"\s\((\d{4})\)$", x.strip())))
movies['Year'] = 'unknown'
movies.loc[movies.hasYear, 'Year'] = movies.title.loc[movies.hasYear].map(lambda x: re.search(r"\s\((\d{4})\)$", x.strip()).group(1))
del movies['hasYear']

# one-hot encode the genres
movies['genres'] = movies.genres.map(lambda x: set(x.split('|')))
present_genres = set()
for movie in movies.itertuples():
    present_genres = present_genres.union(movie.genres)
for genre in present_genres:
    movies['genre' + genre] = movies.genres.map(lambda x: 1.0 * (genre in x))

# compress the tag genome into its first 50 principal components
tags = pd.read_csv('/home/david/movielens/ml-latest/ml-latest/genome-scores.csv')
tags_wide = tags.pivot(index='movieId', columns='tagId', values='relevance')
tags_wide = tags_wide.fillna(0)
pca = PCA(svd_solver='full')
pca.fit(tags_wide)
tags_pca = pd.DataFrame(pca.transform(tags_wide)[:, :50])
tags_pca.columns = ["pc" + str(x) for x in tags_pca.columns.values]
tags_pca['movieId'] = tags_wide.index
movies = pd.merge(movies, tags_pca, how='inner', on='movieId')

# bucket the release year into coarser categories
def discretize_year(x):
    if x == 'unknown':
        return x
    x = int(x)
    if x >= 2000:
        return '>=2000'
    if 1995 <= x <= 1999:
        return str(x)
    if 1990 <= x <= 1994:
        return 'low90s'
    if 1980 <= x <= 1989:
        return '80s'
    if 1970 <= x <= 1979:
        return '70s'
    if 1960 <= x <= 1969:
        return '60s'
    if 1950 <= x <= 1959:
        return '50s'
    if 1940 <= x <= 1949:
        return '40s'
    if x < 1940:
        return '<1940'
    return 'unknown'

movies_features = movies.copy()
del movies_features['title']
del movies_features['genres']
del movies_features['genre(no genres listed)']
movies_features['Year'] = movies_features.Year.map(lambda x: discretize_year(x))
movies_features = pd.get_dummies(movies_features, columns=['Year'])
movies_features.set_index('movieId', inplace=True)

# map US states to broad regions, then zip codes to regions
zipcode_abbs = pd.read_csv("/home/david/movielens/zips/states.csv")
zipcode_abbs_dct = {z.State: z.Abbreviation for z in zipcode_abbs.itertuples()}
us_regs_table = [
    ('New England', 'Connecticut, Maine, Massachusetts, New Hampshire, Rhode Island, Vermont'),
    ('Middle Atlantic', 'Delaware, Maryland, New Jersey, New York, Pennsylvania'),
    ('South', 'Alabama, Arkansas, Florida, Georgia, Kentucky, Louisiana, Mississippi, Missouri, North Carolina, South Carolina, Tennessee, Virginia, West Virginia'),
    ('Midwest', 'Illinois, Indiana, Iowa, Kansas, Michigan, Minnesota, Nebraska, North Dakota, Ohio, South Dakota, Wisconsin'),
    ('Southwest', 'Arizona, New Mexico, Oklahoma, Texas'),
    ('West', 'Alaska, California, Colorado, Hawaii, Idaho, Montana, Nevada, Oregon, Utah, Washington, Wyoming')
]
us_regs_table = [(x[0], [i.strip() for i in x[1].split(",")]) for x in us_regs_table]
us_regs_dct = dict()
for r in us_regs_table:
    for s in r[1]:
        us_regs_dct[zipcode_abbs_dct[s]] = r[0]

zipcode_info = pd.read_csv("/home/david/movielens/free-zipcode-database.csv")
zipcode_info = zipcode_info.groupby('Zipcode').first().reset_index()
zipcode_info.loc[zipcode_info.Country != "US", 'State'] = 'UnknownOrNonUS'
zipcode_info['Region'] = zipcode_info['State'].copy()
zipcode_info.loc[zipcode_info.Country == "US", 'Region'] = zipcode_info.Region.loc[zipcode_info.Country == "US"].map(lambda x: us_regs_dct[x] if x in us_regs_dct else 'UsOther')
zipcode_info = zipcode_info[['Zipcode', 'Region']]

users = pd.read_table("/home/david/movielens/ml-1m/ml-1m/users.dat", sep='::',
                      names=["userId", "Gender", "Age", "Occupation", "Zipcode"], engine='python')
users["Zipcode"] = users.Zipcode.map(lambda x: int(re.sub("-.*", "", x)))
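# note: the line above strips ZIP+4 suffixes, e.g. a hypothetical "98107-1234"
# becomes the integer 98107, so that it can be joined against the Zipcode column
# of the zip code database in the merge that follows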
users = pd.merge(users, zipcode_info, on='Zipcode', how='left')
users['Region'] = users.Region.fillna('UnknownOrNonUS')

# one-hot encode the user demographics
users_features = users.copy()
users_features['Gender'] = users_features.Gender.map(lambda x: 1.0 * (x == 'M'))
del users_features['Zipcode']
users_features['Age'] = users_features.Age.map(lambda x: str(x))
users_features['Occupation'] = users_features.Occupation.map(lambda x: str(x))
users_features = pd.get_dummies(users_features, columns=['Age', 'Occupation', 'Region'])
users_features.set_index('userId', inplace=True)

# keep only ratings of movies with side information, and center each user's ratings
movies_w_sideinfo = set(movies.movieId)
ratings = ratings.loc[ratings.movieId.map(lambda x: x in movies_w_sideinfo)]
avg_rating_by_user = ratings.groupby('userId')['Rating'].mean().to_frame().rename(columns={'Rating': 'AvgRating'})
ratings_train = pd.merge(ratings, avg_rating_by_user, left_on='userId', right_index=True)
ratings_train['RatingCentered'] = ratings_train.Rating - ratings_train.AvgRating

# each observation's features are the (sparse) outer product of its user's and its movie's features
def generate_features(user, movie, users_features_bc, movies_features_bc):
    user_feats = users_features_bc.value.loc[user].as_matrix()
    movie_feats = movies_features_bc.value.loc[movie].as_matrix()
    return csc_matrix(np.kron(user_feats, movie_feats).reshape(-1, 1))

users_features_bc = sc.broadcast(users_features)
movies_features_bc = sc.broadcast(movies_features)

trainset = sc.parallelize([(i.userId, i.movieId, i.RatingCentered) for i in ratings_train.itertuples()])\
    .map(lambda x: LabeledPoint(x[2], generate_features(x[0], x[1], users_features_bc, movies_features_bc)))\
    .map(lambda x: (float(x.label), x.features.asML())).toDF(['label', 'features'])
trainset = trainset.repartition(50)
recommender = LinearRegression(regParam=1e-4).fit(trainset)
formula_coeffs = recommender.coefficients.toArray()

def generate_features_series(user, movie):
    user_feats = users_features.loc[user].as_matrix()
    movie_feats = movies_features.loc[movie].as_matrix()
    return pd.Series(np.kron(user_feats, movie_feats).astype('float64'))

# score the candidate movies for this user and take the Top-20
preds_scores = preds_scores.loc[preds_scores.movieId.map(lambda x: x in movies_w_sideinfo)]
X_predict = preds_scores.movieId.apply(lambda x: generate_features_series(user, x))
preds_scores['score_cb'] = X_predict.dot(formula_coeffs)
preds_scores = preds_scores.sort_values('score_cb', ascending=False)
cb20 = list(preds_scores.movieId.iloc[:20])
preds_scores.head()

# ## 5. Examining the recommendations
#
# Now taking a look at what each of these lists actually recommends - the three are quite different, with little overlap, and as expected, collaborative filtering tends to favor less popular items for this user. First, the Most-Popular recommended list:

# In[7]:

def print_reclist(reclist):
    list_w_info = [str(m + 1) + ") - " + movie_titles[reclist[m]] +
                   " - Average Rating: " + str(np.round(avg_ratings.loc[reclist[m]].iloc[0], 2)) +
                   " - Number of ratings: " + str(num_ratings.loc[reclist[m]].iloc[0])
                   for m in range(len(reclist))]
    print("\n".join(list_w_info))

print_reclist(pop20)

# Collaborative filtering recommended list:

# In[8]:

print_reclist(cf20)

# Content-based recommended list:

# In[9]:

print_reclist(cb20)

# ## 6. Combining recommended lists
#
# Finally, combining these three lists through interleaved ranking, prioritizing them in this order: CF-CB-MP:

# In[10]:

def interleaved_ranking(lst_of_lists, n):
    # team drafting, as described in the introduction: the lists take turns
    # (in priority order) contributing their highest-ranked item that is not
    # already in the final list
    final_list = list()
    while len(final_list) < n:
        for lst in lst_of_lists:
            for item in lst:
                if item not in final_list:
                    final_list.append(item)
                    break
            if len(final_list) >= n:
                break
    return final_list
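# A sketch of the closing cell: the three Top-20 lists from sections 2-4, interleaved
# in the CF-CB-MP priority order stated above, printed with their rating statistics.

# In[ ]:

final_reclist = interleaved_ranking([cf20, cb20, pop20], 20)
print_reclist(final_reclist)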