from scipy import stats
#r,p=stats.pearsonr(xdata,ydata)
#slope, intercept, r_value, p_value, std_err = stats.linregress(xdata,ydata)
from collections import defaultdict

# NOTE(review): the plotting/array names used below (figure, plot, text,
# subplot, array, norm, ...) are not imported anywhere in this file.
# Presumably they come from an interactive pylab session -- confirm before
# running this as a stand-alone script.

# A dictionary of movie reviewers and their ratings of a small set of movies
reviews={
    'Lisa Rose': {'Lady in the Water': 2.5, 'Snakes on a Plane': 3.5,
                  'Just My Luck': 3.0, 'Superman Returns': 3.5,
                  'You, Me and Dupree': 2.5, 'The Night Listener': 3.0},
    'Gene Seymour': {'Lady in the Water': 3.0, 'Snakes on a Plane': 3.5,
                     'Just My Luck': 1.5, 'Superman Returns': 5.0,
                     'The Night Listener': 3.0, 'You, Me and Dupree': 3.5},
    'Michael Phillips': {'Lady in the Water': 2.5, 'Snakes on a Plane': 3.0,
                         'Superman Returns': 3.5, 'The Night Listener': 4.0},
    'Claudia Puig': {'Snakes on a Plane': 3.5, 'Just My Luck': 3.0,
                     'The Night Listener': 4.5, 'Superman Returns': 4.0,
                     'You, Me and Dupree': 2.5},
    'Mick LaSalle': {'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,
                     'Just My Luck': 2.0, 'Superman Returns': 3.0,
                     'The Night Listener': 3.0, 'You, Me and Dupree': 2.0},
    'Jack Matthews': {'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,
                      'The Night Listener': 3.0, 'Superman Returns': 5.0,
                      'You, Me and Dupree': 3.5},
    'Toby': {'Snakes on a Plane':4.5,'You, Me and Dupree':1.0,'Superman Returns':4.0}
}

# number of critics, and how many movies each of them rated
# (a list comprehension so the value is a list on Python 2 and 3 alike)
len(reviews), [len(r) for r in reviews.values()]

#what has Toby reviewed:
reviews['Toby']

# plot the critics who reviewed these two:
dupree='You, Me and Dupree'
snakes='Snakes on a Plane'
crit2 = [critic for critic in reviews
         if dupree in reviews[critic] and snakes in reviews[critic]]
du_ratings=[reviews[c][dupree] for c in crit2]
sn_ratings=[reviews[c][snakes] for c in crit2]
figure(figsize=(4.5,4.5))
xlim(.9,5.1),ylim(.9,5.1)
xlabel(dupree), ylabel(snakes)
plot(du_ratings,sn_ratings,'bo')
for crit,x,y in zip(crit2,du_ratings,sn_ratings):
    # single pre-formatted argument: prints the same text on Python 2 and 3
    print('{}: ({},{})'.format(crit,x,y))
    #move Puig down to avoid collision
    text(x+.05,y+(.01 if 'Puig' not in crit else -.2),crit.split()[-1])


def sim_distance(prefs,person1,person2):
    """Distance-based similarity between two entries of prefs.

    Only items rated by both person1 and person2 are compared.

    Returns 0 if they share no items; otherwise 1/(1+d**2), where d is the
    Euclidean norm of the difference of their ratings on the shared items.
    Identical ratings therefore give 1. The value is not rounded here;
    callers round it themselves.
    """
    # Get the list of shared_items
    shared_items=[item for item in prefs[person1] if item in prefs[person2]]
    # if they have no ratings in common, return 0
    if not shared_items:
        return 0
    v1=array([prefs[person1][item] for item in shared_items])
    v2=array([prefs[person2][item] for item in shared_items])
    # use numpy euclidean distance (sqrt(sum of squares))
    dist=norm(v1-v2)
    # transform to a similarity: 1 for identical ratings, shrinking towards 0
    # as the squared distance grows
    return 1/(1+dist**2)


#'Lisa Rose' happened to rate all movies
# (the movie order used on the plots is by her rating, ascending)
all_movies = sorted(reviews['Lisa Rose'],key=reviews['Lisa Rose'].get)


def ratings(critic):
    """Return critic's ratings as an array, ordered like all_movies.

    Raises KeyError if the critic has not rated every movie in all_movies.
    """
    return array([reviews[critic][m] for m in all_movies])


for crit in ('Lisa Rose','Gene Seymour','Mick LaSalle'):
    # same output as the Python 2 statement `print crit,ratings(crit)`
    print('{} {}'.format(crit,ratings(crit)))
    plot(ratings(crit),'o-',label=crit)
ylim(0,6),xlim(-.5,5.5)
legend(loc='upper left');

#for example, the green and blue
round(sim_distance(reviews,'Lisa Rose','Gene Seymour'),3)

from IPython.display import Image
Image('Pearson.png',width=600)

#three critics who rated all 6
crit6 = [crit for crit in reviews if len(reviews[crit])==6]
#three pairs from above set
pairs = [(crit6[i],crit6[j]) for i in range(len(crit6)) for j in range(i)]

#compare the similarity based on distance with Pearson for these three pairs:
figure(figsize=(10.5,6.6))
for j,(c1,c2) in enumerate(pairs):
    # top row: distance view -- red segments join each point to the diagonal
    subplot(2,3,j+1)
    scatter(ratings(c1),ratings(c2))
    for (x,y) in zip(ratings(c1),ratings(c2)):
        plot((x,x),(x,y),'r-',marker='_',ms=10)
    grid(True)
    xticks(range(1,6)),yticks(range(1,6))
    axis((.9,5.1,.9,5.1))
    xlabel(c1),ylabel(c2)
    plot((1,5),(1,5),'b--')
    d=norm(ratings(c1)-ratings(c2))
    text(1,4.25,
         'sim_distance={:.2f}\n(distance={:.2f})'.format(sim_distance(reviews,c1,c2),d),
         fontsize=12,color='b')
    # bottom row: same scatter, annotated with Pearson r and its p-value
    subplot(2,3,j+4)
    scatter(ratings(c1),ratings(c2))
    xticks(range(1,6)),yticks(range(1,6))
    axis((.9,5.1,.9,5.1))
    xlabel(c1),ylabel(c2)
    r,p=stats.pearsonr(ratings(c1),ratings(c2))
    text(1,4.25,'Pearson r={:.2f}\n(p={:.2f})'.format(r,p),fontsize=12,color='b')

#define function to look at the
# Pearson r for a few critic pairs
def show_pearson(prefs,crit1,crit2):
    """Scatter-plot crit1's ratings against crit2's with a fitted line.

    Only items present under both prefs[crit1] and prefs[crit2] are used.
    Draws into a new 5x5 figure: the points, the least-squares line from
    stats.linregress, a text label for every shared item, and the Pearson r
    value. Returns None.
    """
    shared_items=[item for item in prefs[crit1] if item in prefs[crit2]]
    figure(figsize=(5,5))
    xlim(.8,5.2),ylim(.8,5.2)
    xdata = [prefs[crit1][item] for item in shared_items]
    ydata = [prefs[crit2][item] for item in shared_items]
    # p-value and standard error of the fit are not needed for the plot
    slope, intercept, r_value, _, _ = stats.linregress(xdata,ydata)
    xlabel(crit1),ylabel(crit2)
    plot(xdata,ydata,'o')
    # no x given, so the line is drawn over x = 0..5 (the array indices)
    plot(slope*arange(6)+intercept,'--')
    # per-point vertical label offset: each further label at the same (x,y)
    # is pushed down so coinciding points do not overprint their labels
    voffset={(x,y):.01 for x,y in zip(xdata,ydata)}
    for item in shared_items:
        x,y=prefs[crit1][item],prefs[crit2][item]
        text(x+.05, y+voffset[(x,y)], item)
        voffset[(x,y)]-=.15
    text(1.25,4.5,'Pearson r = {:.2f}'.format(r_value),fontsize=14,color='b')


#two fake critics roughly correlated
fcritics={'critic1':{'Dupree':1,'Night':2.5,'Lady':3,'Snakes':3.5,'Superman':4.5},
          'critic2':{'Dupree':2,'Night':3,'Lady':2.5,'Snakes':3.5,'Superman':3.5}}
show_pearson(fcritics,'critic1','critic2')

#two from original set not quite as well correlated
show_pearson(reviews,'Mick LaSalle','Gene Seymour')


#now define similarity measure, analogous to sim_distance
def sim_pearson(prefs,crit1,crit2):
    """Pearson-correlation similarity between two entries of prefs.

    Only items present under both prefs[crit1] and prefs[crit2] are used
    (equivalently, the intersection of the two key sets).

    Returns 0 when fewer than two items are shared or when the correlation
    comes out as nan; otherwise the Pearson r from stats.pearsonr.
    """
    shared_items=[item for item in prefs[crit1] if item in prefs[crit2]]
    # A correlation needs at least two points. With a single shared item
    # stats.pearsonr yields nan or raises ValueError depending on the SciPy
    # version, so treat that case like "nothing in common".
    if len(shared_items)<2:
        return 0
    xdata = [prefs[crit1][item] for item in shared_items]
    ydata = [prefs[crit2][item] for item in shared_items]
    r,p=stats.pearsonr(xdata,ydata)
    # nan result (e.g. one side's ratings are all equal) counts as no similarity
    if isnan(r):
        return 0
    return r


# Returns the best matches for person from the prefs dictionary.
# Number of results and similarity function are optional params.
def topMatches(prefs, person, n=5, similarity=sim_pearson):
    """Return up to n (other, score) pairs most similar to person.

    Every other key of prefs is scored with similarity(prefs, person, other),
    rounded to three decimals; the pairs come back highest score first.
    """
    ranked = []
    for candidate in prefs:
        if candidate == person:
            continue
        score = round(similarity(prefs, person, candidate), 3)
        ranked.append((candidate, score))
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:n]


topMatches(reviews, 'Toby', 6)

# see how topmatches function works using other similarity measure
topMatches(reviews, 'Toby', n=3, similarity=sim_distance)


def getRecommendations(prefs, person, similarity=sim_pearson):
    """Estimate person's rating for items they have not rated yet.

    For each such item the estimate is the similarity-weighted average of the
    ratings given by the others; others whose similarity to person is zero or
    negative are left out. Returns (item, estimate) pairs, estimates rounded
    to three decimals, highest estimate first.
    """
    weighted_sum = {}   # item -> sum of (rating * similarity)
    weight_total = {}   # item -> sum of similarities that contributed
    for other in prefs:
        # don't compare me to myself
        if other == person:
            continue
        sim = similarity(prefs, person, other)
        # ignore scores of zero or lower
        if sim <= 0:
            continue
        for item, rating in prefs[other].items():
            # only score movies I haven't seen yet
            already_rated = item in prefs[person] and prefs[person][item] != 0
            if already_rated:
                continue
            if item not in weighted_sum:
                weighted_sum[item] = 0
                weight_total[item] = 0
            weighted_sum[item] += rating * sim
            weight_total[item] += sim

    # normalise each total by the similarity mass behind it
    rankings = []
    for item in weighted_sum:
        estimate = round(weighted_sum[item] / weight_total[item], 3)
        rankings.append((item, estimate))
    rankings.sort(key=lambda pair: pair[1], reverse=True)
    return rankings


getRecommendations(reviews, 'Toby')  #also gives likely rating

#or use other distance measure
getRecommendations(reviews, 'Toby', similarity=sim_distance)


#first reverse role of items and objects
def transformPrefs(prefs):
    """Swap the two key levels: {person: {item: r}} -> {item: {person: r}}.

    The result is a defaultdict(dict), as built below.
    """
    flipped = defaultdict(dict)
    for person, rated in prefs.items():
        for item, rating in rated.items():
            flipped[item][person] = rating
    return flipped


movies = transformPrefs(reviews)
movies

#now topmatches gives similar movies rather than similar reviewers
topMatches(movies, 'Superman Returns')

#note negative scores, reviewers who like one dislike the other
show_pearson(movies, 'Just My Luck', 'Superman Returns')

getRecommendations(movies, 'Just My Luck')  #find critics for movie ... invite to premiere?