from scipy import stats
#r,p=stats.pearsonr(xdata,ydata)
#slope, intercept, r_value, p_value, std_err = stats.linregress(xdata,ydata)
from collections import defaultdict

# Sample data: each critic's name maps to a dictionary of the movies that
# critic rated and the rating given. Not every critic rated every movie.
reviews = {
    'Lisa Rose': {
        'Lady in the Water': 2.5,
        'Snakes on a Plane': 3.5,
        'Just My Luck': 3.0,
        'Superman Returns': 3.5,
        'You, Me and Dupree': 2.5,
        'The Night Listener': 3.0,
    },
    'Gene Seymour': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 3.5,
        'Just My Luck': 1.5,
        'Superman Returns': 5.0,
        'The Night Listener': 3.0,
        'You, Me and Dupree': 3.5,
    },
    'Michael Phillips': {
        'Lady in the Water': 2.5,
        'Snakes on a Plane': 3.0,
        'Superman Returns': 3.5,
        'The Night Listener': 4.0,
    },
    'Claudia Puig': {
        'Snakes on a Plane': 3.5,
        'Just My Luck': 3.0,
        'The Night Listener': 4.5,
        'Superman Returns': 4.0,
        'You, Me and Dupree': 2.5,
    },
    'Mick LaSalle': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 4.0,
        'Just My Luck': 2.0,
        'Superman Returns': 3.0,
        'The Night Listener': 3.0,
        'You, Me and Dupree': 2.0,
    },
    'Jack Matthews': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 4.0,
        'The Night Listener': 3.0,
        'Superman Returns': 5.0,
        'You, Me and Dupree': 3.5,
    },
    'Toby': {
        'Snakes on a Plane': 4.5,
        'You, Me and Dupree': 1.0,
        'Superman Returns': 4.0,
    },
}

# Number of critics, and how many movies each of them rated
len(reviews), map(len, reviews.values())

# What has Toby reviewed?
reviews['Toby']

# Scatter-plot the critics who reviewed both of these movies.
# NOTE(review): figure, xlim, ylim, xlabel, ylabel, plot and text are not
# imported in the visible part of this file; presumably they come from a
# pylab-style interactive namespace -- confirm.
dupree = 'You, Me and Dupree'
snakes = 'Snakes on a Plane'

# Critics who rated both movies, and their two ratings in matching order
crit2 = [critic for critic in reviews if dupree in reviews[critic] and snakes in reviews[critic]]
du_ratings = [reviews[c][dupree] for c in crit2]
sn_ratings = [reviews[c][snakes] for c in crit2]

figure(figsize=(4.5, 4.5))
xlim(.9, 5.1), ylim(.9, 5.1)
xlabel(dupree), ylabel(snakes)

plot(du_ratings, sn_ratings, 'bo')

for crit, x, y in zip(crit2, du_ratings, sn_ratings):
    # A single parenthesised argument prints the same text whether print is
    # the Python 2 statement or the Python 3 function.
    print('{}: ({},{})'.format(crit, x, y))
    # Label the point with the critic's last name; move Puig down to avoid
    # a collision with another label.
    text(x + .05, y + (.01 if 'Puig' not in crit else -.2), crit.split()[-1])

def sim_distance(prefs,person1,person2):
    """Distance-based similarity between two entries of prefs.

    Only the items rated by both person1 and person2 are compared. The
    score is 1/(1+d**2), where d is the norm of the difference between the
    two rating vectors, so identical ratings give 1 and the score shrinks
    towards 0 as the ratings move apart.

    Returns 0 when the two have no items in common. The result is not
    rounded here; callers round it when they need to.
    """
    common = [item for item in prefs[person1] if item in prefs[person2]]
    if not common:
        return 0

    # Ratings of the common items, in matching order
    ratings1 = array([prefs[person1][item] for item in common])
    ratings2 = array([prefs[person2][item] for item in common])

    # norm of the difference = sqrt(sum of squared differences)
    distance = norm(ratings1 - ratings2)
    return 1/(1+distance**2)

# 'Lisa Rose' happened to rate all movies, so her titles serve as the full
# movie list; they are ordered by the rating she gave, lowest first.
all_movies = sorted(reviews['Lisa Rose'], key=lambda title: reviews['Lisa Rose'][title])


def ratings(critic):
    """Return critic's ratings as an array, ordered like all_movies.

    Every title in all_movies is looked up, so a critic who skipped any of
    them raises KeyError.
    """
    return array([reviews[critic][movie] for movie in all_movies])

# Plot the ratings of three critics across all_movies, one line per critic.
for crit in ('Lisa Rose', 'Gene Seymour', 'Mick LaSalle'):
    # One pre-formatted argument keeps the output "<name> <ratings>" the same
    # whether print is the Python 2 statement or the Python 3 function.
    print('{} {}'.format(crit, ratings(crit)))
    plot(ratings(crit), 'o-', label=crit)

ylim(0, 6), xlim(-.5, 5.5)
legend(loc='upper left');

# For example, the similarity of the first two critics plotted above
round(sim_distance(reviews, 'Lisa Rose', 'Gene Seymour'), 3)

# Show the Pearson illustration image (read from the working directory)
from IPython.display import Image
Image('Pearson.png', width=600)

# The three critics who rated all 6 movies
crit6 = [crit for crit in reviews if len(reviews[crit]) == 6]
# The three pairs from that set (the later-listed critic comes first)
pairs = [(crit6[i], crit6[j]) for i in range(len(crit6)) for j in range(i)]

# Compare the distance-based similarity with Pearson r for these pairs.
# The 2x3 grid puts the distance view of pair j in the top row and the
# Pearson view of the same pair directly below it.
ticks = range(1, 6)
limits = (.9, 5.1, .9, 5.1)

figure(figsize=(10.5, 6.6))
for j, (c1, c2) in enumerate(pairs):
    ratings1 = ratings(c1)
    ratings2 = ratings(c2)

    # Top row: scatter plus a red segment from the diagonal y=x to each point
    subplot(2, 3, j + 1)
    scatter(ratings1, ratings2)
    for (x, y) in zip(ratings1, ratings2):
        plot((x, x), (x, y), 'r-', marker='_', ms=10)
    grid('on')
    xticks(ticks)
    yticks(ticks)
    axis(limits)
    xlabel(c1)
    ylabel(c2)
    plot((1, 5), (1, 5), 'b--')
    d = norm(ratings1 - ratings2)
    distance_label = 'sim_distance={:.2f}\n(distance={:.2f})'.format(
        sim_distance(reviews, c1, c2), d)
    text(1, 4.25, distance_label, fontsize=12, color='b')

    # Bottom row: the same scatter annotated with Pearson r and its p-value
    subplot(2, 3, j + 4)
    scatter(ratings1, ratings2)
    xticks(ticks)
    yticks(ticks)
    axis(limits)
    xlabel(c1)
    ylabel(c2)
    r, p = stats.pearsonr(ratings1, ratings2)
    text(1, 4.25, 'Pearson r={:.2f}\n(p={:.2f})'.format(r, p), fontsize=12, color='b')

#define function to look at the Pearson r for a few critic pairs

def show_pearson(prefs,crit1,crit2):
    """Plot the items rated by both crit1 and crit2, with a fitted line.

    Opens a new figure, plots crit1's rating (x) against crit2's rating (y)
    for every shared item, draws the line fitted by stats.linregress as a
    dashed line, labels each point with its item name and writes the
    r value from the fit on the plot.
    """
    common = [item for item in prefs[crit1] if item in prefs[crit2]]

    figure(figsize=(5, 5))
    xlim(.8, 5.2)
    ylim(.8, 5.2)
    xdata = [prefs[crit1][item] for item in common]
    ydata = [prefs[crit2][item] for item in common]

    # Only slope, intercept and r are needed from the regression result
    slope, intercept, r_value = stats.linregress(xdata, ydata)[:3]
    xlabel(crit1)
    ylabel(crit2)

    plot(xdata, ydata, 'o')
    # Fitted line evaluated at x = 0..5; plot() uses those indices as x
    plot(slope*arange(6)+intercept, '--')

    # Items sharing the same (x, y) point get their labels stacked downwards:
    # the first label sits .01 above the point, each further one .15 lower.
    offsets = {}
    for item, x, y in zip(common, xdata, ydata):
        dy = offsets.get((x, y), .01)
        text(x+.05, y+dy, item)
        offsets[(x, y)] = dy - .15
    text(1.25, 4.5, 'Pearson r = {:.2f}'.format(r_value), fontsize=14, color='b')

# Two made-up critics whose ratings are roughly correlated
fcritics = {
    'critic1': {'Dupree': 1, 'Night': 2.5, 'Lady': 3, 'Snakes': 3.5, 'Superman': 4.5},
    'critic2': {'Dupree': 2, 'Night': 3, 'Lady': 2.5, 'Snakes': 3.5, 'Superman': 3.5},
}
show_pearson(fcritics, 'critic1', 'critic2')

# Two critics from the original set, not quite as well correlated
show_pearson(reviews, 'Mick LaSalle', 'Gene Seymour')

#now define similarity measure, analogous to sim_distance
def sim_pearson(prefs,crit1,crit2):
    """Pearson-correlation similarity between two entries of prefs.

    Only the items rated by both crit1 and crit2 are used.

    Returns the Pearson r of the two rating lists, or 0 when no meaningful
    r exists: fewer than two shared items, or an r that comes back as NaN.
    """
    shared_items=[item for item in prefs[crit1] if item in prefs[crit2]]
    # A correlation needs at least two points. Some SciPy versions raise
    # ValueError for shorter input rather than returning NaN, so the "no
    # usable correlation" case is answered here instead of inside pearsonr.
    if len(shared_items)<2: return 0
    xdata = [prefs[crit1][item] for item in shared_items]
    ydata = [prefs[crit2][item] for item in shared_items]
    r,p=stats.pearsonr(xdata,ydata)
    # NaN r (no defined correlation) is reported as "no similarity"
    if isnan(r): return 0
    return r

# Returns the best matches for person from the prefs dictionary. 
# Number of results and similarity function are optional params.

def topMatches(prefs, person, n=5, similarity=sim_pearson):
    """Return up to n (other, score) pairs most similar to person, best first.

    Every other key of prefs is scored with similarity(prefs, person, other)
    and the score is rounded to three decimal places before ranking.
    """
    scored = []
    for other in prefs:
        if other == person:
            continue
        scored.append((other, round(similarity(prefs, person, other), 3)))
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return scored[:n]

# Toby's closest critics by Pearson similarity (at most six are returned)
topMatches(reviews, 'Toby', n=6)

# See how topMatches works with the other similarity measure, top three only
topMatches(reviews, 'Toby', 3, sim_distance)

def getRecommendations(prefs,person,similarity=sim_pearson):
    """Recommend items that person has not rated yet.

    Every other entry of prefs contributes its ratings weighted by its
    similarity to person; entries whose similarity is zero or lower are
    ignored. Items person already rated with a non-zero score are skipped.

    Returns a list of (item, predicted_rating) pairs sorted best first,
    where the prediction is the similarity-weighted mean rating rounded to
    three decimal places.
    """
    weighted_sum = defaultdict(int)  # item -> sum of rating * similarity
    weight_sum = defaultdict(int)    # item -> sum of similarities

    for other in prefs:
        # don't compare person with itself
        if other == person:
            continue
        sim = similarity(prefs, person, other)
        # only positively similar entries get a vote
        if sim <= 0:
            continue
        for item in prefs[other]:
            # skip what person has already rated (a rating of 0 counts as unrated)
            if item in prefs[person] and prefs[person][item] != 0:
                continue
            weighted_sum[item] += prefs[other][item]*sim
            weight_sum[item] += sim

    # Normalise by the total similarity so heavily reviewed items get no bonus
    predictions = [(item, round(weighted_sum[item]/weight_sum[item], 3))
                   for item in weighted_sum]
    predictions.sort(key=lambda pair: pair[1], reverse=True)
    return predictions

# Recommendations for Toby; each score is also the rating Toby is likely to give
getRecommendations(reviews, 'Toby')

# ...or use the distance-based similarity measure instead
getRecommendations(reviews, 'Toby', similarity=sim_distance)

#first reverse role of items and objects
def transformPrefs(prefs):
    """Swap the two key levels of prefs.

    Turns {person: {item: rating}} into {item: {person: rating}}. The input
    is left untouched; the result is returned as a defaultdict(dict).
    """
    flipped = defaultdict(dict)
    for person, rated in prefs.items():
        for item, rating in rated.items():
            # the item becomes the outer key, the person the inner key
            flipped[item][person] = rating
    return flipped

# Movie-centric view of the same data: movie -> {critic: rating}
movies = transformPrefs(reviews)
movies

# Now topMatches gives similar movies rather than similar reviewers
topMatches(movies, 'Superman Returns')

# Note the negative scores: reviewers who like one movie dislike the other
show_pearson(movies, 'Just My Luck', 'Superman Returns')

# With the roles swapped this ranks critics for a movie ... invite to premiere?
getRecommendations(movies, 'Just My Luck')