# This follows chapter 2 of "Programming Collective Intelligence" by T. Segaran:
# ask your friends, weight according to similarity.

# Plotting and numeric names used below (figure, plot, text, xlim, xlabel,
# array, norm, isnan, ...) come from pylab: run under "ipython --pylab"
# or do "from pylab import *" first.
from scipy import stats
# r, p = stats.pearsonr(xdata, ydata)
# slope, intercept, r_value, p_value, std_err = stats.linregress(xdata, ydata)

# A dictionary of movie critics and their ratings of a small set of movies
critics={'Lisa Rose': {'Lady in the Water': 2.5, 'Snakes on a Plane': 3.5,
                       'Just My Luck': 3.0, 'Superman Returns': 3.5,
                       'You, Me and Dupree': 2.5, 'The Night Listener': 3.0},
         'Gene Seymour': {'Lady in the Water': 3.0, 'Snakes on a Plane': 3.5,
                          'Just My Luck': 1.5, 'Superman Returns': 5.0,
                          'The Night Listener': 3.0, 'You, Me and Dupree': 3.5},
         'Michael Phillips': {'Lady in the Water': 2.5, 'Snakes on a Plane': 3.0,
                              'Superman Returns': 3.5, 'The Night Listener': 4.0},
         'Claudia Puig': {'Snakes on a Plane': 3.5, 'Just My Luck': 3.0,
                          'The Night Listener': 4.5, 'Superman Returns': 4.0,
                          'You, Me and Dupree': 2.5},
         'Mick LaSalle': {'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,
                          'Just My Luck': 2.0, 'Superman Returns': 3.0,
                          'The Night Listener': 3.0, 'You, Me and Dupree': 2.0},
         'Jack Matthews': {'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,
                           'The Night Listener': 3.0, 'Superman Returns': 5.0,
                           'You, Me and Dupree': 3.5},
         'Toby': {'Snakes on a Plane': 4.5, 'You, Me and Dupree': 1.0,
                  'Superman Returns': 4.0}}

len(critics), map(len, critics.values())

# What has Toby reviewed?
critics['Toby']

# Look at just the critics who reviewed these two movies
dupree='You, Me and Dupree'
snakes='Snakes on a Plane'
ds={c.split()[-1]: (critics[c][dupree], critics[c][snakes])
    for c in critics if dupree in critics[c] and snakes in critics[c]}
ds

# Now plot them
figure(figsize=(5,5))
xlim(0,5)
ylim(0,5)
offset={c: .01 for c in ds}
offset['Rose']=0
offset['Puig']=-.15   # move Puig down to avoid collision
plot([ds[crit][0] for crit in ds], [ds[crit][1] for crit in ds], 'o')
for crit in ds:
    text(ds[crit][0]+.05, ds[crit][1]+offset[crit], crit)
xlabel('dupree')
ylabel('snakes');

# Who's close to whom in the above figure?
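# A quick check of "closeness" in that figure (a minimal sketch, assuming the
# ds dictionary built above; plot_distance is a hypothetical helper, not from
# the book): the straight-line distance between two critics in the
# (dupree, snakes) plane. A smaller distance means more similar taste on these
# two movies; sim_distance below generalizes this to all shared movies.
from math import sqrt

def plot_distance(a, b):
    # Euclidean distance between critics a and b in the 2-D rating plane
    return sqrt((ds[a][0]-ds[b][0])**2 + (ds[a][1]-ds[b][1])**2)

print 'Rose-Puig:', plot_distance('Rose','Puig')        # 0.0: same point, which is why their labels collided
print 'Rose-Seymour:', plot_distance('Rose','Seymour')  # 1.0: farther apart in the figure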
# Returns a distance-based similarity score for person1 and person2
def sim_distance(prefs, person1, person2):
    # Get the list of shared items
    si=[item for item in prefs[person1] if item in prefs[person2]]
    # If they have no ratings in common, return 0
    if len(si)==0: return 0
    v1=array([prefs[person1][item] for item in si])
    v2=array([prefs[person2][item] for item in si])
    # Use numpy Euclidean distance (sqrt of the sum of squares),
    dist=norm(v1-v2)
    # transform it to a similarity ranging from 0 to 1,
    # and truncate to three digits after the decimal point
    return float("%.3f" % (1/(1+dist**2)))

sim_distance(critics,'Lisa Rose','Gene Seymour')

# Consider a different measure, the Pearson correlation coefficient:
# +1 if perfectly correlated, -1 if perfectly anticorrelated.
def show_pearson(prefs, c1, c2):
    si=[item for item in prefs[c1] if item in prefs[c2]]
    figure(figsize=(5,5))
    xlim(0,5)
    ylim(0,5)
    xdata=[prefs[c1][item] for item in si]
    ydata=[prefs[c2][item] for item in si]
    slope, intercept, r_value, p_value, std_err = stats.linregress(xdata, ydata)
    print 'pearson r =', '%.2f' % r_value
    xlabel(c1)
    ylabel(c2)
    plot(xdata, ydata, 'o')
    plot(range(0,6), slope*array(range(0,6))+intercept, '--')  # best-fit line
    for item in si:
        text(prefs[c1][item], prefs[c2][item], item)

# Two fake critics, roughly correlated
fcritics={'critic1':{'Dupree':1,'Night':2.5,'Lady':3,'Snakes':3.5,'Superman':4.5},
          'critic2':{'Dupree':2,'Night':3,'Lady':2.5,'Snakes':3.5,'Superman':3.5}}
show_pearson(fcritics,'critic1','critic2')

# Two from the original set, not quite as well correlated
show_pearson(critics,'Mick LaSalle','Gene Seymour')

# Now define a similarity measure, analogous to sim_distance
def sim_pearson(prefs, c1, c2):
    si=[item for item in prefs[c1] if item in prefs[c2]]
    if len(si)==0: return 0
    xdata=[prefs[c1][item] for item in si]
    ydata=[prefs[c2][item] for item in si]
    r,p=stats.pearsonr(xdata,ydata)
    if isnan(r): return 0
    return float("%.3f" % r)

# Note that Pearson corrects for "grade inflation", unlike Euclidean distance:
# one critic can be systematically higher than the other, and the constant
# offset won't matter.

# Now rank the critics, finding the ones with similar taste.
# Returns the best matches for person from the prefs dictionary;
# the number of results and the similarity function are optional params.
def topMatches(prefs, person, n=5, similarity=sim_pearson):
    scores=[(other, similarity(prefs,person,other))
            for other in prefs if other!=person]
    return sorted(scores, key=lambda x: x[1], reverse=True)[:n]

topMatches(critics,'Toby',n=6)

# See how topMatches works using the other similarity measure
topMatches(critics,'Toby', n=3, similarity=sim_distance)
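# A quick demonstration of the grade-inflation point made above (a minimal
# sketch, using two hypothetical critics not in the book): 'easy' rates every
# movie exactly one point higher than 'harsh'. Pearson sees perfectly
# correlated taste, while the Euclidean measure is penalized by the offset.
inflated={'harsh':{'Dupree':1.0,'Night':2.0,'Snakes':3.0,'Superman':4.0},
          'easy': {'Dupree':2.0,'Night':3.0,'Snakes':4.0,'Superman':5.0}}
print sim_pearson(inflated,'harsh','easy')    # 1.0: identical taste
print sim_distance(inflated,'harsh','easy')   # 0.2: the constant offset hurts the score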
# But what we really want is a recommendation. We could use just the most
# similar person, but that person might not have seen the relevant movie,
# or might be an outlier on a particular movie. Instead, get recommendations
# for a person by using a weighted average of every other user's rankings.
def getRecommendations(prefs, person, similarity=sim_pearson):
    totals={}
    simSums={}
    for other in prefs:
        # don't compare me to myself
        if other==person: continue
        sim=similarity(prefs,person,other)
        # ignore scores of zero or lower
        if sim<=0: continue
        for item in prefs[other]:
            # only score movies I haven't seen yet
            if item not in prefs[person] or prefs[person][item]==0:
                if item not in totals:
                    totals[item]=0
                    simSums[item]=0
                # Similarity * Score
                totals[item] += prefs[other][item]*sim
                # Sum of similarities
                simSums[item] += sim
    # Create the normalized list
    rankings=[(item, float("%.3f" % (totals[item]/simSums[item])))
              for item in totals]
    # Return the sorted list
    return sorted(rankings, key=lambda x: x[1], reverse=True)

getRecommendations(critics,'Toby')   # also gives a likely rating

# Or use the other distance measure
getRecommendations(critics,'Toby',similarity=sim_distance)

# Now suppose you want matching products instead, i.e. Amazon's
# "customers who bought this also bought".
# First reverse the roles of people and items.
def transformPrefs(prefs):
    result={}
    for person in prefs:
        for item in prefs[person]:
            if item not in result: result[item]={}
            # Flip item and person
            result[item][person]=prefs[person][item]
    return result

movies=transformPrefs(critics)
movies

# Now topMatches gives similar movies rather than similar reviewers
topMatches(movies,'Superman Returns')

# Note the negative scores: reviewers who like one dislike the other
show_pearson(movies,'Just My Luck','Superman Returns')

# This finds critics for a movie ... invite them to a premiere?
getRecommendations(movies,'Just My Luck')

# Now apply this to del.icio.us data.
# Python code to interact with the delicious API is here:
#   http://code.google.com/p/pydelicious/source
#   svn checkout http://pydelicious.googlecode.com/svn/trunk/ pydelicious
#   sudo python setup.py install
from pydelicious import get_popular, get_userposts, get_urlposts
import time

# Get some users from the most popular posts for a tag
def initializeUserDict(tag, count=5):
    user_dict={}
    # get the top count popular posts
    for p1 in get_popular(tag=tag)[0:count]:
        print 'p1=', {k: v.encode('ascii','ignore') for k,v in p1.items()}
        # find all users who posted this
        for p2 in get_urlposts(p1['url']):
            user=p2['user']
            userasc=user.encode('ascii','ignore')
            if user==userasc and len(user)>1:
                user_dict[userasc]={}
    return user_dict

# Get users from ten urls with the 'programming' tag
del_users=initializeUserDict('programming',10)
print len(del_users.keys()),'users'
print del_users.keys()[:50]
[user for user in del_users.keys() if len(user)<3]

def fillItems(user_dict):
    all_items={}
    # Find links posted by all users initialized in user_dict
    for user in user_dict:
        if len(user)<2: continue   # skip blank users
        posts=[]
        for i in range(3):   # retry up to three times on API failures
            try:
                posts=get_userposts(user)
                break
            except:
                print "Failed user "+user+", retrying"
                time.sleep(4)
        for post in posts:
            url=post['url'].encode('ascii','ignore')
            user_dict[user][url]=1.0
            all_items[url]=1
    # Fill in missing items with 0
    for ratings in user_dict.values():
        for item in all_items:
            if item not in ratings:
                ratings[item]=0.0

fillItems(del_users)

import random
user=random.choice(del_users.keys())
user

# Find users similar to that one
topMatches(del_users,user)

# Other ideas: recommended urls for that user (a recommendation engine for
# del.icio.us); tags similar to one another; or spotting people who try to
# manipulate popular pages by posting the same links via multiple accounts.
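# A sketch of that manipulation check (an assumption, not from the book):
# accounts that post nearly identical link sets will have a distance-based
# similarity close to 1.0, so scan each user's top match and flag the
# suspicious pairs. suspicious_pairs and its threshold are hypothetical.
def suspicious_pairs(prefs, threshold=0.99):
    pairs=[]
    for u in prefs:
        for other, score in topMatches(prefs, u, n=1, similarity=sim_distance):
            # record each near-identical pair once
            if score>=threshold and (other,u) not in pairs:
                pairs.append((u,other))
    return pairs

print suspicious_pairs(del_users)[:10]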
# Back to recommendations: find recommended urls for that user
getRecommendations(del_users,user)[:10]

url=getRecommendations(del_users,user)[0][0]
print 'url=',url
# url='http://ebookbrowse.com/'

# Find urls co-liked with this one, hence 'similar'
topMatches(transformPrefs(del_users),url)

# User-based filtering can be slow; use item-based collaborative filtering
# instead: precompute the most similar items for each item, then look at a
# user's top-rated items and create a weighted list of the items most similar
# to those, based on the precomputed similarities. (Comparisons between items
# don't change as quickly as new users are added.)

# Create a dictionary of items showing which other items each is most similar to
def calculateSimilarItems(prefs, n=10):
    result={}
    # Invert the preference matrix to be item-centric
    itemPrefs=transformPrefs(prefs)
    c=0
    for item in itemPrefs:
        # Status updates for large datasets
        c+=1
        if c%100==0: print "%d / %d" % (c,len(itemPrefs))
        # Find the most similar items to this one
        scores=topMatches(itemPrefs,item,n=n,similarity=sim_distance)
        result[item]=scores
    return result

itemsim=calculateSimilarItems(critics)
itemsim

# This recommender uses the precomputed item similarities
def getRecommendedItems(prefs, itemMatch, user):
    userRatings=prefs[user]
    scores={}
    totalSim={}
    # Loop over items rated by this user
    for (item,rating) in userRatings.items():
        # Loop over items similar to this one
        for (item2,similarity) in itemMatch[item]:
            # Ignore if this user has already rated this item
            if item2 in userRatings: continue
            if item2 not in scores:
                scores[item2]=0
                totalSim[item2]=0
            # Weighted sum of rating times similarity
            scores[item2]+=similarity*rating
            # Sum of all the similarities
            totalSim[item2]+=similarity
    # Divide each total score by the total weighting to get an average
    rankings=[(item, float("%.3f" % (scores[item]/totalSim[item])))
              for item in scores]
    return sorted(rankings, key=lambda x: x[1], reverse=True)

# Works the same on the small critics set
getRecommendedItems(critics,itemsim,'Toby')

# Now get a real movie dataset from MovieLens, the smallest one:
#   http://www.grouplens.org/node/12
#   http://www.grouplens.org/system/files/ml-100k.zip
# It contains
#   u.item   list of movie ids and titles
#   u.data   ratings: user id, movie id, rating, timestamp
# stored in the ml-100k/ folder.
def loadMovieLens(path='ml-100k/'):
    # Get movie titles
    movies={}
    for line in open(path+'u.item'):
        (id,title)=line.split('|')[0:2]
        movies[id]=title
    # Load data
    prefs={}
    for line in open(path+'u.data'):
        (user,movieid,rating,ts)=line.split('\t')
        if user not in prefs: prefs[user]={}
        prefs[user][movies[movieid]]=float(rating)
    return prefs

mprefs=loadMovieLens()
len(mprefs), map(len, mprefs.values()[:10])

topMatches(transformPrefs(mprefs),'Terminator, The (1984)',n=10)

# Get recommendations for user 87
getRecommendations(mprefs,'87')[:30]

# Now item-based: the similarity table takes a while to build, but after that
# recommendations are instantaneous and independent of the number of users.
itemsim=calculateSimilarItems(mprefs,n=50)

# Experiment with this dataset...
getRecommendedItems(mprefs,itemsim,'189')[:30]
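# One possible experiment (a sketch; overlap is a hypothetical helper): compare
# user-based and item-based recommendations for the same MovieLens user and
# see how much the two top-n lists agree.
def overlap(prefs, itemsim, user, n=20):
    ub=[item for item,score in getRecommendations(prefs,user)[:n]]
    ib=[item for item,score in getRecommendedItems(prefs,itemsim,user)[:n]]
    return set(ub) & set(ib)

print overlap(mprefs, itemsim, '189')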