# This follows chapter 2 of "Programming Collective Intelligence" by T. Segaran:
# ask your friends, weight according to similarity.

# Plotting and numeric names used below (figure, plot, text, xlim, xlabel,
# array, norm, isnan, ...) come from pylab: run under "ipython --pylab"
# or do "from pylab import *" first.
from scipy import stats
# r, p = stats.pearsonr(xdata, ydata)
# slope, intercept, r_value, p_value, std_err = stats.linregress(xdata, ydata)

# A dictionary of movie critics and their ratings of a small set of movies
critics={'Lisa Rose': {'Lady in the Water': 2.5, 'Snakes on a Plane': 3.5,
                       'Just My Luck': 3.0, 'Superman Returns': 3.5,
                       'You, Me and Dupree': 2.5, 'The Night Listener': 3.0},
         'Gene Seymour': {'Lady in the Water': 3.0, 'Snakes on a Plane': 3.5,
                          'Just My Luck': 1.5, 'Superman Returns': 5.0,
                          'The Night Listener': 3.0, 'You, Me and Dupree': 3.5},
         'Michael Phillips': {'Lady in the Water': 2.5, 'Snakes on a Plane': 3.0,
                              'Superman Returns': 3.5, 'The Night Listener': 4.0},
         'Claudia Puig': {'Snakes on a Plane': 3.5, 'Just My Luck': 3.0,
                          'The Night Listener': 4.5, 'Superman Returns': 4.0,
                          'You, Me and Dupree': 2.5},
         'Mick LaSalle': {'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,
                          'Just My Luck': 2.0, 'Superman Returns': 3.0,
                          'The Night Listener': 3.0, 'You, Me and Dupree': 2.0},
         'Jack Matthews': {'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,
                           'The Night Listener': 3.0, 'Superman Returns': 5.0,
                           'You, Me and Dupree': 3.5},
         'Toby': {'Snakes on a Plane': 4.5, 'You, Me and Dupree': 1.0,
                  'Superman Returns': 4.0}}

len(critics), map(len, critics.values())

# What has Toby reviewed?
critics['Toby']

# Look at just the critics who reviewed these two movies
dupree='You, Me and Dupree'
snakes='Snakes on a Plane'
ds={c.split()[-1]: (critics[c][dupree], critics[c][snakes])
    for c in critics if dupree in critics[c] and snakes in critics[c]}
ds

# Now plot them
figure(figsize=(5,5))
xlim(0,5)
ylim(0,5)
offset={c: .01 for c in ds}
offset['Rose']=0
offset['Puig']=-.15   # move Puig down to avoid collision
plot([ds[crit][0] for crit in ds], [ds[crit][1] for crit in ds], 'o')
for crit in ds:
    text(ds[crit][0]+.05, ds[crit][1]+offset[crit], crit)
xlabel('dupree')
ylabel('snakes');

# Who's close to whom in the above figure?
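# A quick check of "closeness" in that figure (a minimal sketch, assuming the
# ds dictionary built above; plot_distance is a hypothetical helper, not from
# the book): the straight-line distance between two critics in the
# (dupree, snakes) plane. A smaller distance means more similar taste on these
# two movies; sim_distance below generalizes this to all shared movies.
from math import sqrt

def plot_distance(a, b):
    # Euclidean distance between critics a and b in the 2-D rating plane
    return sqrt((ds[a][0]-ds[b][0])**2 + (ds[a][1]-ds[b][1])**2)

print 'Rose-Puig:', plot_distance('Rose','Puig')        # 0.0: same point, which is why their labels collided
print 'Rose-Seymour:', plot_distance('Rose','Seymour')  # 1.0: farther apart in the figure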
# Returns a distance-based similarity score for person1 and person2
def sim_distance(prefs, person1, person2):
    # Get the list of shared items
    si=[item for item in prefs[person1] if item in prefs[person2]]
    # If they have no ratings in common, return 0
    if len(si)==0: return 0
    v1=array([prefs[person1][item] for item in si])
    v2=array([prefs[person2][item] for item in si])
    # Use numpy Euclidean distance (sqrt of the sum of squares),
    dist=norm(v1-v2)
    # transform it to a similarity ranging from 0 to 1,
    # and truncate to three digits after the decimal point
    return float("%.3f" % (1/(1+dist**2)))

sim_distance(critics,'Lisa Rose','Gene Seymour')

# Consider a different measure, the Pearson correlation coefficient:
# +1 if perfectly correlated, -1 if perfectly anticorrelated.
def show_pearson(prefs, c1, c2):
    si=[item for item in prefs[c1] if item in prefs[c2]]
    figure(figsize=(5,5))
    xlim(0,5)
    ylim(0,5)
    xdata=[prefs[c1][item] for item in si]
    ydata=[prefs[c2][item] for item in si]
    slope, intercept, r_value, p_value, std_err = stats.linregress(xdata, ydata)
    print 'pearson r =', '%.2f' % r_value
    xlabel(c1)
    ylabel(c2)
    plot(xdata, ydata, 'o')
    plot(range(0,6), slope*array(range(0,6))+intercept, '--')  # best-fit line
    for item in si:
        text(prefs[c1][item], prefs[c2][item], item)

# Two fake critics, roughly correlated
fcritics={'critic1':{'Dupree':1,'Night':2.5,'Lady':3,'Snakes':3.5,'Superman':4.5},
          'critic2':{'Dupree':2,'Night':3,'Lady':2.5,'Snakes':3.5,'Superman':3.5}}
show_pearson(fcritics,'critic1','critic2')

# Two from the original set, not quite as well correlated
show_pearson(critics,'Mick LaSalle','Gene Seymour')

# Now define a similarity measure, analogous to sim_distance
def sim_pearson(prefs, c1, c2):
    si=[item for item in prefs[c1] if item in prefs[c2]]
    if len(si)==0: return 0
    xdata=[prefs[c1][item] for item in si]
    ydata=[prefs[c2][item] for item in si]
    r,p=stats.pearsonr(xdata,ydata)
    if isnan(r): return 0
    return float("%.3f" % r)

# Note that Pearson corrects for "grade inflation", unlike Euclidean distance:
# one critic can be systematically higher than the other, and the constant
# offset won't matter.

# Now rank the critics, finding the ones with similar taste.
# Returns the best matches for person from the prefs dictionary;
# the number of results and the similarity function are optional params.
def topMatches(prefs, person, n=5, similarity=sim_pearson):
    scores=[(other, similarity(prefs,person,other))
            for other in prefs if other!=person]
    return sorted(scores, key=lambda x: x[1], reverse=True)[:n]

topMatches(critics,'Toby',n=6)

# See how topMatches works using the other similarity measure
topMatches(critics,'Toby', n=3, similarity=sim_distance)
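# A quick demonstration of the grade-inflation point made above (a minimal
# sketch, using two hypothetical critics not in the book): 'easy' rates every
# movie exactly one point higher than 'harsh'. Pearson sees perfectly
# correlated taste, while the Euclidean measure is penalized by the offset.
inflated={'harsh':{'Dupree':1.0,'Night':2.0,'Snakes':3.0,'Superman':4.0},
          'easy': {'Dupree':2.0,'Night':3.0,'Snakes':4.0,'Superman':5.0}}
print sim_pearson(inflated,'harsh','easy')    # 1.0: identical taste
print sim_distance(inflated,'harsh','easy')   # 0.2: the constant offset hurts the score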
# But what we really want is a recommendation. We could use just the most
# similar person, but that person might not have seen the relevant movie,
# or might be an outlier on a particular movie. Instead, get recommendations
# for a person by using a weighted average of every other user's rankings.
def getRecommendations(prefs, person, similarity=sim_pearson):
    totals={}
    simSums={}
    for other in prefs:
        # don't compare me to myself
        if other==person: continue
        sim=similarity(prefs,person,other)
        # ignore scores of zero or lower
        if sim<=0: continue
        for item in prefs[other]:
            # only score movies I haven't seen yet
            if item not in prefs[person] or prefs[person][item]==0:
                if item not in totals:
                    totals[item]=0
                    simSums[item]=0
                # Similarity * Score
                totals[item] += prefs[other][item]*sim
                # Sum of similarities
                simSums[item] += sim
    # Create the normalized list
    rankings=[(item, float("%.3f" % (totals[item]/simSums[item])))
              for item in totals]
    # Return the sorted list
    return sorted(rankings, key=lambda x: x[1], reverse=True)

getRecommendations(critics,'Toby')   # also gives a likely rating

# Or use the other distance measure
getRecommendations(critics,'Toby',similarity=sim_distance)

# Now suppose you want matching products instead, i.e. Amazon's
# "customers who bought this also bought".
# First reverse the roles of people and items.
def transformPrefs(prefs):
    result={}
    for person in prefs:
        for item in prefs[person]:
            if item not in result: result[item]={}
            # Flip item and person
            result[item][person]=prefs[person][item]
    return result

movies=transformPrefs(critics)
movies

# Now topMatches gives similar movies rather than similar reviewers
topMatches(movies,'Superman Returns')

# Note the negative scores: reviewers who like one dislike the other
show_pearson(movies,'Just My Luck','Superman Returns')

# This finds critics for a movie ... invite them to a premiere?
getRecommendations(movies,'Just My Luck')

# Now apply this to del.icio.us data.
# Python code to interact with the delicious API is here:
#   http://code.google.com/p/pydelicious/source
#   svn checkout http://pydelicious.googlecode.com/svn/trunk/ pydelicious
#   sudo python setup.py install
from pydelicious import get_popular, get_userposts, get_urlposts
import time

# Get some users from the most popular posts for a tag
def initializeUserDict(tag, count=5):
    user_dict={}
    # get the top count popular posts
    for p1 in get_popular(tag=tag)[0:count]:
        print 'p1=', {k: v.encode('ascii','ignore') for k,v in p1.items()}
        # find all users who posted this
        for p2 in get_urlposts(p1['url']):
            user=p2['user']
            userasc=user.encode('ascii','ignore')
            if user==userasc and len(user)>1:
                user_dict[userasc]={}
    return user_dict

# Get users from ten urls with the 'programming' tag
del_users=initializeUserDict('programming',10)
print len(del_users.keys()),'users'
print del_users.keys()[:50]
[user for user in del_users.keys() if len(user)<3]

def fillItems(user_dict):
    all_items={}
    # Find links posted by all users initialized in user_dict
    for user in user_dict:
        if len(user)<2: continue   # skip blank users
        posts=[]
        for i in range(3):   # retry up to three times on API failures
            try:
                posts=get_userposts(user)
                break
            except:
                print "Failed user "+user+", retrying"
                time.sleep(4)
        for post in posts:
            url=post['url'].encode('ascii','ignore')
            user_dict[user][url]=1.0
            all_items[url]=1
    # Fill in missing items with 0
    for ratings in user_dict.values():
        for item in all_items:
            if item not in ratings:
                ratings[item]=0.0

fillItems(del_users)

import random
user=random.choice(del_users.keys())
user

# Find users similar to that one
topMatches(del_users,user)

# Other ideas: recommended urls for that user (a recommendation engine for
# del.icio.us); tags similar to one another; or spotting people who try to
# manipulate popular pages by posting the same links via multiple accounts.
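# A sketch of that manipulation check (an assumption, not from the book):
# accounts that post nearly identical link sets will have a distance-based
# similarity close to 1.0, so scan each user's top match and flag the
# suspicious pairs. suspicious_pairs and its threshold are hypothetical.
def suspicious_pairs(prefs, threshold=0.99):
    pairs=[]
    for u in prefs:
        for other, score in topMatches(prefs, u, n=1, similarity=sim_distance):
            # record each near-identical pair once
            if score>=threshold and (other,u) not in pairs:
                pairs.append((u,other))
    return pairs

print suspicious_pairs(del_users)[:10]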
# Back to recommendations: find recommended urls for that user
getRecommendations(del_users,user)[:10]

url=getRecommendations(del_users,user)[0][0]
print 'url=',url
# url='http://ebookbrowse.com/'

# Find urls co-liked with this one, hence 'similar'
topMatches(transformPrefs(del_users),url)

# User-based filtering can be slow; use item-based collaborative filtering
# instead: precompute the most similar items for each item, then look at a
# user's top-rated items and create a weighted list of the items most similar
# to those, based on the precomputed similarities. (Comparisons between items
# don't change as quickly as new users are added.)

# Create a dictionary of items showing which other items each is most similar to
def calculateSimilarItems(prefs, n=10):
    result={}
    # Invert the preference matrix to be item-centric
    itemPrefs=transformPrefs(prefs)
    c=0
    for item in itemPrefs:
        # Status updates for large datasets
        c+=1
        if c%100==0: print "%d / %d" % (c,len(itemPrefs))
        # Find the most similar items to this one
        scores=topMatches(itemPrefs,item,n=n,similarity=sim_distance)
        result[item]=scores
    return result

itemsim=calculateSimilarItems(critics)
itemsim

# This recommender uses the precomputed item similarities
def getRecommendedItems(prefs, itemMatch, user):
    userRatings=prefs[user]
    scores={}
    totalSim={}
    # Loop over items rated by this user
    for (item,rating) in userRatings.items():
        # Loop over items similar to this one
        for (item2,similarity) in itemMatch[item]:
            # Ignore if this user has already rated this item
            if item2 in userRatings: continue
            if item2 not in scores:
                scores[item2]=0
                totalSim[item2]=0
            # Weighted sum of rating times similarity
            scores[item2]+=similarity*rating
            # Sum of all the similarities
            totalSim[item2]+=similarity
    # Divide each total score by the total weighting to get an average
    rankings=[(item, float("%.3f" % (scores[item]/totalSim[item])))
              for item in scores]
    return sorted(rankings, key=lambda x: x[1], reverse=True)

# Works the same on the small critics set
getRecommendedItems(critics,itemsim,'Toby')

# Now get a real movie dataset from MovieLens, the smallest one:
#   http://www.grouplens.org/node/12
#   http://www.grouplens.org/system/files/ml-100k.zip
# It contains
#   u.item   list of movie ids and titles
#   u.data   ratings: user id, movie id, rating, timestamp
# stored in the ml-100k/ folder.
def loadMovieLens(path='ml-100k/'):
    # Get movie titles
    movies={}
    for line in open(path+'u.item'):
        (id,title)=line.split('|')[0:2]
        movies[id]=title
    # Load data
    prefs={}
    for line in open(path+'u.data'):
        (user,movieid,rating,ts)=line.split('\t')
        if user not in prefs: prefs[user]={}
        prefs[user][movies[movieid]]=float(rating)
    return prefs

mprefs=loadMovieLens()
len(mprefs), map(len, mprefs.values()[:10])

topMatches(transformPrefs(mprefs),'Terminator, The (1984)',n=10)

# Get recommendations for user 87
getRecommendations(mprefs,'87')[:30]

# Now item-based: the similarity table takes a while to build, but after that
# recommendations are instantaneous and independent of the number of users.
itemsim=calculateSimilarItems(mprefs,n=50)

# Experiment with this dataset...
getRecommendedItems(mprefs,itemsim,'189')[:30]
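# One possible experiment (a sketch; overlap is a hypothetical helper): compare
# user-based and item-based recommendations for the same MovieLens user and
# see how much the two top-n lists agree.
def overlap(prefs, itemsim, user, n=20):
    ub=[item for item,score in getRecommendations(prefs,user)[:n]]
    ib=[item for item,score in getRecommendedItems(prefs,itemsim,user)[:n]]
    return set(ub) & set(ib)

print overlap(mprefs, itemsim, '189')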