%matplotlib inline
from collections import defaultdict
import json

import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import pandas as pd

from matplotlib import rcParams
import matplotlib.cm as cm
import matplotlib as mpl

#colorbrewer2 Dark2 qualitative color table
dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667),
                (0.8509803921568627, 0.37254901960784315, 0.00784313725490196),
                (0.4588235294117647, 0.4392156862745098, 0.7019607843137254),
                (0.9058823529411765, 0.1607843137254902, 0.5411764705882353),
                (0.4, 0.6509803921568628, 0.11764705882352941),
                (0.9019607843137255, 0.6705882352941176, 0.00784313725490196),
                (0.6509803921568628, 0.4627450980392157, 0.11372549019607843)]

rcParams['figure.figsize'] = (10, 6)
rcParams['figure.dpi'] = 150
rcParams['axes.color_cycle'] = dark2_colors
rcParams['lines.linewidth'] = 2
rcParams['axes.facecolor'] = 'white'
rcParams['font.size'] = 14
rcParams['patch.edgecolor'] = 'white'
rcParams['patch.facecolor'] = dark2_colors[0]
rcParams['font.family'] = 'StixGeneral'


def remove_border(axes=None, top=False, right=False, left=True, bottom=True):
    """
    Minimize chartjunk by stripping out unnecesasry plot borders and axis ticks
    
    The top/right/left/bottom keywords toggle whether the corresponding plot border is drawn
    """
    ax = axes or plt.gca()
    ax.spines['top'].set_visible(top)
    ax.spines['right'].set_visible(right)
    ax.spines['left'].set_visible(left)
    ax.spines['bottom'].set_visible(bottom)
    
    #turn off all ticks
    ax.yaxis.set_ticks_position('none')
    ax.xaxis.set_ticks_position('none')
    
    #now re-enable visibles
    if top:
        ax.xaxis.tick_top()
    if bottom:
        ax.xaxis.tick_bottom()
    if left:
        ax.yaxis.tick_left()
    if right:
        ax.yaxis.tick_right()
        
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)

fulldf=pd.read_csv("bigdf.csv")
fulldf.head(2)
  'stars': (star rating, integer 1-5),
  'date': (date, formatted like '2011-04-19'),
  'review_id': (unique id for the review).  'business_id': (a unique identifier for this business),
  'biz_name': (the full business name),
  'latitude': (latitude),
  'longitude': (longitude),
  'business_review_count': (review count for the restaurant[this is a repeated field for all reviews of the restaurant]),
  'categories': [(localized category names)],
  'business_avg': (average stars over all users reviews for business[this is a repeated field for all reviews of the restaurant]).  'user_id': (unique user identifier),
  'user_name': (first name, last initial, like 'Matt J.'),
  'user_review_count': (count of restaurants reviewed by user[this is a repeated field for all reviews by the user]),
  'user_avg': (floating point average of users reviews over all businesses, like 4.31[this is a repeated field for all reviews by the user]).
"""
grouped_by_userid = fulldf.groupby('user_id')
marker = False

for name, group in grouped_by_userid:
    if len(group.business_id.drop_duplicates()) != len(group.business_id):
        print 'OW MY GAWD'
        marker = True

if marker == False: print 'they were right after all.'
"""

#your code here

fig = plt.figure()

# plot histogram for reviews per user

ax = fig.add_subplot(2,1,1)
num_reviews_per_user = [numrev for numrev in fulldf['user_id'].value_counts()] 
#print num_reviews_per_user
ax.set_title('Frequency of Restaurant Reviews per User')
ax.hist(num_reviews_per_user, bins=100, label="reviews per user")
# scale review count as logarithmic
ax.set_xscale('log')
# add ticks, labels, etc
ax.set_xlim(0,1000)
ax.set_xticks([1,2,3,4,5,10,50,100,max(num_reviews_per_user),500,1000])
ax.set_xticklabels([1,2,3,4,5,10,50,100,max(num_reviews_per_user),500,1000], horizontalalignment='center')
ax.set_yscale('linear')
maxline = ax.vlines(max(num_reviews_per_user), 1, 10000, colors='red', label='most reviews per user')
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles, labels)

# plot histogram for reviews per restaurant

ax = fig.add_subplot(2,1,2)
num_reviews_per_joint = [numrev for numrev in fulldf['business_id'].value_counts()] 
#print num_reviews_per_user
ax.set_title('Frequency of Restaurant Reviews per Restaurant')
ax.hist(num_reviews_per_joint, bins=100, label="reviews per joint")
# scale review count as logarithmic
ax.set_xscale('log')
# add ticks, labels, etc
ax.set_xlim(0,1000)
ax.set_xticks([1,2,3,4,5,10,50,100,500,max(num_reviews_per_joint),1000])
ax.set_xticklabels([1,2,3,4,5,10,50,100,500,max(num_reviews_per_joint),1000], horizontalalignment='center')
ax.set_yscale('linear')
ax.set_ylim(1,2000)
maxline = ax.vlines(max(num_reviews_per_joint), 1, 500, colors='red', label='most reviews per joint')
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles, labels)

#your code here
if len(fulldf['user_id'].drop_duplicates()) > len(fulldf['business_id'].drop_duplicates()): 
    print 'more users than joints'
else: 
    print 'more joints than users'

#your code here

# not sure which of these you're looking for:

print 'Average of all ratings of restaurants:',round(fulldf.stars.mean(),2)

means_by_joint = np.mean(fulldf.groupby('business_id')['stars'].mean())
print "Average rating across each restaurant's aggregated rating:",round(means_by_joint,2)

def recompute_frame(ldf):
    """
    takes a dataframe ldf, makes a copy of it, and returns the copy
    with all averages and review counts recomputed
    this is used when a frame is subsetted.
    """
    ldfu=ldf.groupby('user_id')
    ldfb=ldf.groupby('business_id')
    user_avg=ldfu.stars.mean()
    user_review_count=ldfu.review_id.count()
    business_avg=ldfb.stars.mean()
    business_review_count=ldfb.review_id.count()
    nldf=ldf.copy()
    nldf.set_index(['business_id'], inplace=True)
    nldf['business_avg']=business_avg
    nldf['business_review_count']=business_review_count
    nldf.reset_index(inplace=True)
    nldf.set_index(['user_id'], inplace=True)
    nldf['user_avg']=user_avg
    nldf['user_review_count']=user_review_count
    nldf.reset_index(inplace=True)
    return nldf

#your code here

# we need to make an actual copy of fulldf here
# otherwise straight assignment (eg. x = fulldf) just creates a symbolic referent, and manipulations on x change fulldf!
copydf = fulldf.copy()
# reduce df by given parameters
smalldf = copydf[(copydf.user_review_count > 60) & (copydf.business_review_count > 150)]
# recompute frame
smalldf = recompute_frame(smalldf)

print 'Number unique items:',smalldf.business_id.drop_duplicates().count()
print 'Number unique users:',smalldf.user_id.drop_duplicates().count()

#your code here

fig = plt.figure()

# plot histogram for reviews per user

ax = fig.add_subplot(2,1,1)
num_reviews_per_user = [numrev for numrev in smalldf['user_id'].value_counts()] 

#print num_reviews_per_user
ax.set_title('Frequency of Restaurant Reviews per User')
ax.hist(num_reviews_per_user, bins=40, label="reviews per user")

# add ticks, labels, etc
ax.set_xlim(0,90)
maxline = ax.vlines(max(num_reviews_per_user), 0, 5, colors='red', label='most reviews per user')
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles, labels)

# plot histogram for reviews per restaurant

ax = fig.add_subplot(2,1,2)
num_reviews_per_joint = [numrev for numrev in smalldf['business_id'].value_counts()] 

#print num_reviews_per_user
ax.set_title('Frequency of Restaurant Reviews per Restaurant')
ax.hist(num_reviews_per_joint, bins=40, label="reviews per joint")

# add ticks, labels, etc
ax.set_xlim(0,120)
ax.set_ylim(1,20)
maxline = ax.vlines(max(num_reviews_per_joint), 0, 5, colors='red', label='most reviews per joint')
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles, labels)

plt.subplots_adjust(top=1.5)
plt.subplots_adjust(bottom=.5)

print 'Average User Rating'
a,b,c = plt.hist(smalldf.user_avg)
plt.show()

print '\n'

print 'Average Business Rating'
a,b,c = plt.hist(smalldf.business_avg)
plt.show()

print '\n\nOverall mean:', round(smalldf.stars.mean(),3),'\n\n'

restaurants=smalldf.business_id.unique()
supports=[]
for i,rest1 in enumerate(restaurants):
    for j,rest2 in enumerate(restaurants):
        if  i < j:
            rest1_reviewers = smalldf[smalldf.business_id==rest1].user_id.unique()
            rest2_reviewers = smalldf[smalldf.business_id==rest2].user_id.unique()
            common_reviewers = set(rest1_reviewers).intersection(rest2_reviewers)
            supports.append(len(common_reviewers))
print "Mean support is:",np.mean(supports)
plt.hist(supports)

from scipy.stats.stats import pearsonr
def pearson_sim(rest1_reviews, rest2_reviews, n_common):
    """
    Given a subframe of restaurant 1 reviews and a subframe of restaurant 2 reviews,
    where the reviewers are those who have reviewed both restaurants, return 
    the pearson correlation coefficient between the user average subtracted ratings.
    The case for zero common reviewers is handled separately. Its
    ok to return a NaN if any of the individual variances are 0.
    """
    if n_common==0:
        rho=0.
    else:
        diff1=rest1_reviews['stars']-rest1_reviews['user_avg']
        diff2=rest2_reviews['stars']-rest2_reviews['user_avg']
        rho=pearsonr(diff1, diff2)[0]
    return rho

def get_restaurant_reviews(restaurant_id, df, set_of_users):
    """
    given a resturant id and a set of reviewers, return the sub-dataframe of their
    reviews.
    """
    mask = (df.user_id.isin(set_of_users)) & (df.business_id==restaurant_id)
    reviews = df[mask]
    reviews = reviews[reviews.user_id.duplicated()==False]
    return reviews

"""
Function
--------
calculate_similarity

Parameters
----------
rest1 : string
    The id of restaurant 1
rest2 : string
    The id of restaurant 2
df : DataFrame
  A dataframe of reviews, such as the smalldf above
similarity_func : func
  A function like pearson_sim above which takes two dataframes of individual
  restaurant reviews made by a common set of reviewers, and the number of
  common reviews. This function returns the similarity of the two restaurants
  based on the common reviews.
  
Returns
--------
A tuple
  The first element of the tuple is the similarity and the second the
  common support n_common. If the similarity is a NaN, set it to 0
"""
#your code here

def calculate_similarity(rest1, rest2, df, similarity_func):

    # adapted from the code we were given above
    rest1_reviewers = df[df.business_id==rest1].user_id.unique()
    rest2_reviewers = df[df.business_id==rest2].user_id.unique()
    common_reviewers = set(rest1_reviewers).intersection(rest2_reviewers)
    n_common = len(common_reviewers)
    
    # this is a list comprehension, but it's only really getting at the two rest_ids passed into the function
    # for each one, it runs get_restaurant_reviews and stores both outputs together in a list
    revs = [get_restaurant_reviews(rid, df, common_reviewers) for rid in [rest1,rest2]]
    # compute similarity
    sim = similarity_func(revs[0], revs[1], n_common)
    # check for NaN, if so return 0
    similarity = sim if (not np.isnan(sim)) else 0

    return (similarity, n_common)


# test
# print calculate_similarity(smalldf.business_id.ix[0], smalldf.business_id.ix[210], smalldf, pearson_sim)

class Database:
    "A class representing a database of similaries and common supports"
    
    def __init__(self, df):
        "the constructor, takes a reviews dataframe like smalldf as its argument"
        database={}
        self.df=df
        self.uniquebizids={v:k for (k,v) in enumerate(df.business_id.unique())}
        keys=self.uniquebizids.keys()
        l_keys=len(keys)
        self.database_sim=np.zeros([l_keys,l_keys])
        self.database_sup=np.zeros([l_keys, l_keys], dtype=np.int)
        
    def populate_by_calculating(self, similarity_func):
        """
        a populator for every pair of businesses in df. takes similarity_func like
        pearson_sim as argument
        """
        items=self.uniquebizids.items()
        for b1, i1 in items:
            for b2, i2 in items:
                if i1 < i2:
                    sim, nsup=calculate_similarity(b1, b2, self.df, similarity_func)
                    self.database_sim[i1][i2]=sim
                    self.database_sim[i2][i1]=sim
                    self.database_sup[i1][i2]=nsup
                    self.database_sup[i2][i1]=nsup
                elif i1==i2:
                    nsup=self.df[self.df.business_id==b1].user_id.count()
                    self.database_sim[i1][i1]=1.
                    self.database_sup[i1][i1]=nsup
                    

    def get(self, b1, b2):
        "returns a tuple of similarity,common_support given two business ids"
        sim=self.database_sim[self.uniquebizids[b1]][self.uniquebizids[b2]]
        nsup=self.database_sup[self.uniquebizids[b1]][self.uniquebizids[b2]]
        return (sim, nsup)

db=Database(smalldf)
db.populate_by_calculating(pearson_sim)

db.get("z3yFuLVrmH-3RJruPEMYKw", "zruUQvFySeXyEd7_rQixBg")

def shrunk_sim(sim, n_common, reg=3.):
    "takes a similarity and shrinks it down by using the regularizer"
    ssim=(n_common*sim)/(n_common+reg)
    return ssim

"""
Function
--------
knearest

Parameters
----------
restaurant_id : string
    The id of the restaurant whose nearest neighbors we want
set_of_restaurants : array
    The set of restaurants from which we want to find the nearest neighbors
dbase : instance of Database class.
    A database of similarities, on which the get method can be used to get the similarity
  of two businessed. e.g. dbase.get(rid1,rid2)
k : int
    the number of nearest neighbors desired, default 7
reg: float
    the regularization.
    
  
Returns
--------
A sorted list
    of the top k similar restaurants. The list is a list of tuples
    (business_id, shrunken similarity, common support).
"""
#your code here

from operator import itemgetter

def knearest(restaurant_id, set_of_restaurants, dbase, k, reg):
    
    neighbors = []
    rid = restaurant_id
    
    for joint in set_of_restaurants:
        
        # make sure we're not comparing a restaurant against itself
        if joint != rid:
            
            # get similarity and ncommon
            (sim, ncom) = dbase.get(rid, joint)
            # shrink via regularization
            shrunken = shrunk_sim(sim, ncom, reg)
            # add to nearest neighbors list
            neighbors.append((joint, shrunken, ncom))
    
    # sort neighbors so that the closest are on top
    neighbors.sort(key=itemgetter(1), reverse=True)
    
    # return k nearest
    return neighbors[:k]


testbizid="eIxSLxzIlfExI6vgAbn2JA"
testbizid2="L-uPZxooP_ziXCtRrWi8Pw"

def biznamefromid(df, theid):
    return df['biz_name'][df['business_id']==theid].values[0]
def usernamefromid(df, theid):
    return df['user_name'][df['user_id']==theid].values[0]

print testbizid, biznamefromid(smalldf,testbizid)
print testbizid2, biznamefromid(smalldf, testbizid2)

tops=knearest(testbizid, smalldf.business_id.unique(), db, k=7, reg=3.)
print "For ",biznamefromid(smalldf, testbizid), ", top matches are:"
for i, (biz_id, sim, nc) in enumerate(tops):
    print i,biznamefromid(smalldf,biz_id), "| Sim", sim, "| Support",nc

tops2=knearest(testbizid2, smalldf.business_id.unique(), db, k=7, reg=3.)
print "For ",biznamefromid(smalldf, testbizid2), ", top matches are:"
for i, (biz_id, sim, nc) in enumerate(tops2):
    print i,biznamefromid(smalldf,biz_id), "| Sim", sim, "| Support",nc

def get_user_top_choices(user_id, df, numchoices=5):
    "get the sorted top 5 restaurants for a user by the star rating the user gave them"
    udf=df[df.user_id==user_id][['business_id','stars']].sort(['stars'], ascending=False).head(numchoices)
    return udf
testuserid="7cR92zkDv4W3kqzii6axvg"
print "For user", usernamefromid(smalldf,testuserid), "top choices are:" 
bizs=get_user_top_choices(testuserid, smalldf)['business_id'].values
[biznamefromid(smalldf, biz_id) for biz_id in bizs]

"""
Function
--------
get_top_recos_for_user

Parameters
----------
userid : string
    The id of the user for whom we want the top recommendations
df : Dataframe
    The dataframe of restaurant reviews such as smalldf
dbase : instance of Database class.
    A database of similarities, on which the get method can be used to get the similarity
  of two businessed. e.g. dbase.get(rid1,rid2)
n: int
    the n top choices of the user by star rating
k : int
    the number of nearest neighbors desired, default 8
reg: float
    the regularization.
    
  
Returns
--------
A sorted list
    of the top recommendations. The list is a list of tuples
    (business_id, business_avg). You are combining the k-nearest recommendations 
    for each of the user's n top choices, removing duplicates and the ones the user
    has already rated.
"""
#your code here
def get_top_recos_for_user(userid, df, dbase, n, k, reg):
    
    # put top n user choices into 'tops'
    tops = get_user_top_choices(userid, df, n)
    
    # included is a set because sets only allow unique instances
    included = set()
    neighbors = []

    # loop through the list of top choices' ids
    for joint in tops.business_id:

        # loop through knearest data for each top user choice
        for (jid, sim, ncom) in knearest(joint, df.business_id.unique(), dbase, k, reg):
            
            # set mask to identify businesses that the user has already rated
            mask = (df.business_id == jid) & (df.user_id == userid)
            
            # if we haven't already seen this biz, AND if user hasn't already rated, then keep it
            if (jid not in included) & (not df[mask]):
                
                included.add(jid)
                # grab avg rating
                rating = df[df.business_id == jid].stars.mean()
                # append id/rating tuple to list of neighbors
                neighbors.append((jid, rating))
                
    # sort in the usual way
    # does it matter if we use x.sort() or y = sorted(x...)?
    final_neighbors = sorted(neighbors, key=itemgetter(1), reverse=True)
    
    return final_neighbors


print "For user", usernamefromid(smalldf,testuserid), "the top recommendations are:"
toprecos=get_top_recos_for_user(testuserid, smalldf, db, n=5, k=7, reg=3.)
for biz_id, biz_avg in toprecos:
    print biznamefromid(smalldf,biz_id), "| Average Rating |", round(biz_avg,2)

"""
Function
--------
knearest_amongst_userrated

Parameters
----------
restaurant_id : string
    The id of the restaurant whose nearest neighbors we want
user_id : string
    The id of the user, in whose reviewed restaurants we want to find the neighbors
df: Dataframe
    The dataframe of reviews such as smalldf
dbase : instance of Database class.
    A database of similarities, on which the get method can be used to get the similarity
  of two businesses. e.g. dbase.get(rid1,rid2)
k : int
    the number of nearest neighbors desired, default 7
reg: float
    the regularization.
    
  
Returns
--------
A sorted list
    of the top k similar restaurants. The list is a list of tuples
    (business_id, shrunken similarity, common support).
"""
#your code here
def knearest_among_userrated(restaurant_id, user_id, df, dbase, k, reg):
    
    # trim df to only instances where our specific user has rated
    mydf = df[df['user_id'] == user_id]
    
    # run knearest on the unique rest_ids in this trimmed df
    neighbors = knearest(restaurant_id, mydf.business_id.unique(), dbase, k, reg)
    
    return neighbors[:k]
 

"""

Function
--------
rating

Parameters
----------
df: Dataframe
    The dataframe of reviews such as smalldf
dbase : instance of Database class.
    A database of similarities, on which the get method can be used to get the similarity
  of two businessed. e.g. dbase.get(rid1,rid2)
restaurant_id : string
    The id of the restaurant whose nearest neighbors we want
user_id : string
    The id of the user, in whose reviewed restaurants we want to find the neighbors
k : int
    the number of nearest neighbors desired, default 7
reg: float
    the regularization.
    
  
Returns
--------
A float
    which is the imputed rating that we predict that user_id will make for restaurant_id
"""

#your code here
def rating(df, dbase, restaurant_id, user_id, k, reg):

    rid = restaurant_id
    
    # get knearest neighbor
    nearest = knearest_among_userrated(restaurant_id, user_id, df, dbase, k, reg)

    # compute formula elements
    Yavg_U = df[df.user_id==user_id].stars.mean()
    Yavg_M = df[df.business_id==rid].stars.mean()
    Yavg = df.stars.mean()
    Yhat_base_M = Yavg + (Yavg_U - Yavg) + (Yavg_M - Yavg)
    
    # this list is for collecting S_mj and Y_diff values across all restaurants in set
    neighbor_data = []
    
    # loop through all joints in k nearest
    for (jid, S_Mj, ncom) in nearest:
        
        # compute  all of the 'j' elements of our formula
        Yavg_J = df[df.business_id==jid].stars.mean()
        Yhat_base_J = Yavg + (Yavg_U - Yavg) + (Yavg_J - Yavg)
        Y_Uj = df[(df.user_id == user_id) & (df.business_id == jid)].stars.iloc[0]

        # compute Yuj - Yhat_base_J
        Y_diff = Y_Uj - Yhat_base_J
        
        # compute atomic (ie. un-summed) numerator of our neighbor ratio expression
        atomic_numerator = float(S_Mj) * float(Y_diff)
        
        # append similarity and atomic numerator before iterating
        neighbor_data.append((S_Mj, atomic_numerator))
   
    # from: stackoverflow.com/questions/638048/how-do-i-sum-the-first-value-in-each-tuple-in-a-list-of-tuples-in-python
    numerator = sum([pair[1] for pair in neighbor_data])
    denominator = sum([pair[0] for pair in neighbor_data])
    
    if denominator == 0:
        
        return Yhat_base_M
    
    else:
    
        neighbor_factor = float(numerator) / float(denominator)

        Yhat_M = Yhat_base_M + neighbor_factor

        return Yhat_M
  

print "User Average", round(smalldf[smalldf.user_id==testuserid].stars.mean(),3),"for",usernamefromid(smalldf,testuserid)
print "Predicted ratings for top choices calculated earlier:"
for biz_id,biz_avg in toprecos:
    print biznamefromid(smalldf, biz_id),"|",round(rating(smalldf, db, biz_id, testuserid, k=7, reg=3.),2),"|","Average",round(biz_avg,2) 

    
def get_other_ratings(restaurant_id, user_id, df):
    "get a user's rating for a restaurant and the restaurant's average rating"
    choice=df[(df.business_id==restaurant_id) & (df.user_id==user_id)]
    users_score=choice.stars.values[0]
    average_score=choice.business_avg.values[0]
    return users_score, average_score

print "for user",usernamefromid(smalldf,testuserid), 'avg', smalldf[smalldf.user_id==testuserid].stars.mean() 
for biz_id in bizs:
    print "----------------------------------"
    print biznamefromid(smalldf, biz_id)
    print "Predicted Rating:",rating(smalldf, db, biz_id, testuserid, k=7, reg=3.) 
    u,a=get_other_ratings(biz_id, testuserid, smalldf)
    print "Actual User Rating:",u,"Avg Rating",a

def compare_results(stars_actual, stars_predicted, ylow=-10, yhigh=15, title=""):
    """
    plot predicted results against actual results. Takes 2 arguments: a
    numpy array of actual ratings and a numpy array of predicted ratings
    scatterplots the predictions, a unit slope line, line segments joining the mean,
    and a filled in area of the standard deviations."
    """
    fig=plt.figure()
    df=pd.DataFrame(dict(actual=stars_actual, predicted=stars_predicted))
    ax=plt.scatter(df.actual, df.predicted, alpha=0.2, s=30, label="predicted")
    plt.ylim([ylow,yhigh])
    plt.plot([1,5],[1,5], label="slope 1")
    xp=[1,2,3,4,5]
    yp=df.groupby('actual').predicted.mean().values
    plt.plot(xp,yp,'k', label="means")
    sig=df.groupby('actual').predicted.std().values
    plt.fill_between(xp, yp - sig, yp + sig, 
                 color='k', alpha=0.2)
    plt.xlabel("actual")
    plt.ylabel("predicted")
    plt.legend(frameon=False)
    remove_border()
    plt.grid(False)
    plt.title(title)
    print np.mean(np.abs(df.predicted) < 15)


#your code here
predictdf = smalldf.copy()

# i tried a number of different implementations here, as speed was certainly an issue
# in the end i stuck with brian's numpy vectorization solution from Piazza @1133, 
# but from what i can tell there aren't any solutions that are far better than all the (viable) others.

# 
vrating = np.vectorize(rating, excluded=['df','dbase','k','reg'], otypes=[np.float])
pred_3_3 = vrating(df=smalldf, dbase=db, restaurant_id=smalldf['business_id'], user_id=smalldf['user_id'], k=3, reg=3)
pred_3_15 = vrating(df=smalldf, dbase=db, restaurant_id=smalldf['business_id'], user_id=smalldf['user_id'], k=3, reg=15)
pred_10_3 = vrating(df=smalldf, dbase=db, restaurant_id=smalldf['business_id'], user_id=smalldf['user_id'], k=10, reg=3)
pred_10_15 = vrating(df=predictdf, dbase=db, restaurant_id=predictdf['business_id'], user_id=predictdf['user_id'], k=10, reg=15)

predictdf['predict_3_3'] = pred_3_3
predictdf['predict_3_15'] = pred_3_15
predictdf['predict_10_3'] = pred_10_3
predictdf['predict_10_15'] = pred_10_15

#predictdf.to_csv('predictdf.csv', index=False)

#predictdf = pd.read_csv('predictdf.csv')

actual = predictdf['stars']

compare_results(actual, predictdf['predict_3_3'], title='k=3 and reg=3')    
    
compare_results(actual, predictdf['predict_3_15'], title='k=3 and reg=15')
    
compare_results(actual, predictdf['predict_10_3'], title='k=10 and reg=3')
    
compare_results(actual, predictdf['predict_10_15'], title='k=10 and reg=15')

print 'user count:',smalldf.user_id.drop_duplicates().count()
print 'item count:',smalldf.business_id.drop_duplicates().count()


"""
Function
--------
gamma_m_draw

Draw a single sample from the conditional posterior distribution
of gamma_m.

Inputs
-------
X_m: A g-by-L+1 matrix, defined above. 
Y_m: A 1D vector of length g, defined above.
sig2: Residual _variance_, as defined above.
Lambda_gamma: Prior precision matrix.

Outputs
--------
Single draw from conditional posterior, defined above.
"""
#Item-specific parameters given all else
#your code here
def gamma_m_draw(X_m, Y_m, sig2, Lambda_gamma):
    
    Q = (1/sig2)*X_m.T.dot(X_m) + Lambda_gamma
    
    mvn_mean = (1/sig2)*np.linalg.inv(Q).dot(X_m.T).dot(Y_m)
    
    mvn_cov = np.linalg.inv(Q)
    
    mvn = np.random.multivariate_normal(mvn_mean, mvn_cov)
    
    return mvn

"""
Function
--------
theta_u_draw

Draw a single sample from the conditional posterior distribution
of gamma_m.

Inputs
-------
X_u: A g-by-L+1 matrix, defined above. 
Y_u: A 1D vector of length g, defined above.
sig2: Residual _variance_, as defined above.
Lambda_theta: Prior precision matrix.

Outputs
--------
Single draw from conditional posterior, defined above.
"""
#User-specific parameters given all else
#your code here
def theta_u_draw(X_u, Y_u, sig2, Lambda_theta):
    
    Q = (1/sig2)*X_u.T.dot(X_u) + Lambda_theta
    
    mvn_mean = (1/sig2)*np.linalg.inv(Q).dot(X_u.T).dot(Y_u)
    
    mvn_cov = np.linalg.inv(Q)
    
    mvn = np.random.multivariate_normal(mvn_mean, mvn_cov)
    
    return mvn

"""
Function
--------
factor_gibbs

Runs a gibbs sampler to infer mean, variance, user-specific, and item-specific
parameters.

Inputs
-------
data: A dataframe containing ratings data.
L: Dimension of latent factors.
maxit: Number of samples to draw from posterior.
Lambda_theta_diag: Hyperparameter controlling regularization of Theta.
Lambda_gamma_diag: Hyperparameter controlling regularization of Gamma.
progress: if true, print iteration number every 100 iterations.

Outputs
--------
Dictionary with elements
mu: Draws of mu. 1D array of length maxiter.
sig2: Draws of sig2, residual _variance_. 1D array of length maxiter.
theta: Draws of Theta. U-by-L-by-maxiter array.
gamma: Draws of Gamma. M-by-L-by-maxiter array.
EY: Draws of fitted values of Y. N-by-maxiter array.
"""
def factor_gibbs(data, L, maxit, Lambda_theta_diag, Lambda_gamma_diag, progress=True):
    data = data.copy()
    N = data.shape[0]

    #Create indices that allow us to map users and restaurants to rows
    #in parameter vectors.
    uusers, uidx = np.unique(data.user_id, return_inverse=True)
    uitems, midx = np.unique(data.business_id, return_inverse=True)

    nusers = uusers.size
    nitems = uitems.size

    #Add numerical indices to dataframe.
    data["uidx"] = uidx
    data["midx"] = midx

    #Group observations by user and by business.
    ugroups = data.groupby("uidx")
    mgroups = data.groupby("midx")

    all_avg = data.stars.mean()
    u_avg = ugroups.stars.mean()
    m_avg = mgroups.stars.mean()

    #Initialize parameters and set up data structures for
    #holding draws.
    #Overall mean
    mu = all_avg
    mu_draws = np.zeros(maxit)
    #Residual variance
    sig2 = 0.5
    sig2_draws = np.zeros(maxit)

    #Matrix of user-specific bias and L latent factors.
    theta = np.zeros([nusers, L+1])
    theta[:,0] = u_avg-all_avg
    theta_draws = np.zeros([nusers, L+1, maxit])

    #Matrix of item-specific bias and L latent factors.
    gamma = np.zeros([nitems, L+1])
    gamma[:,0] = m_avg-all_avg
    gamma_draws = np.zeros([nitems, L+1, maxit])

    #Matrix for holding the expected number of stars
    #for each observation at each draw from the posterior.
    EY_draws = np.zeros([data.shape[0], maxit])

    #Inverse covariance matrices from the prior on each theta_u
    #and gamma_b. These are diagonal, like Ridge regression.
    Lambda_theta = np.eye(L+1)*Lambda_theta_diag
    Lambda_gamma = np.eye(L+1)*Lambda_gamma_diag

    #Main sampler code
    for i in range(maxit):
        if i%100==0 and progress:
            print i

        #The entire regression equation except for the overall mean.
        nomu = np.sum(theta[data.uidx,1:]*gamma[data.midx,1:], axis=1) +\
                  theta[data.uidx,0] + gamma[data.midx,0]

        #Compute the expectation of each observation given the current
        #parameter values.
        EY_draws[:,i]=mu+nomu

        #Draw overall mean from a normal distribution
        mu = np.random.normal(np.mean(data.stars-nomu), np.sqrt(sig2/N))
        #Draw overall residual variance from a scaled inverse-Chi squared distribution.
        sig2 = np.sum(np.power(data.stars-nomu-mu,2))/np.random.chisquare(N-2)
        
        #For each item
        for mi,itemdf in mgroups:
            #Gather relevant observations, and subtract out overall mean and
            #user-specific biases, which we are holding fixed.
            Y_m = itemdf.stars-mu-theta[itemdf.uidx,0]
            #Build the regression design matrix implied by holding user factors
            #fixed.
            X_m = np.hstack((np.ones([itemdf.shape[0],1]),
                             theta[itemdf.uidx,1:]))
            gamma[mi,:] = gamma_m_draw(X_m, Y_m, sig2, Lambda_gamma)
            
        #For each user
        for ui,userdf in ugroups:
            #Gather relevant observations, and subtract out overall mean and
            #business-specific biases, which we are holding fixed.
            Y_u = userdf.stars-mu-gamma[userdf.midx,0]
            #Build the regression design matrix implied by holding business factors
            #fixed.
            X_u = np.hstack((np.ones([userdf.shape[0],1]),
                             gamma[userdf.midx,1:]))
            
            theta[ui,:] = theta_u_draw(X_u, Y_u, sig2, Lambda_theta)

        #Record draws
        mu_draws[i] = mu
        sig2_draws[i] = sig2
        theta_draws[:,:,i] = theta
        gamma_draws[:,:,i] = gamma

    return {"mu": mu_draws, "sig2": sig2_draws,
            "theta": theta_draws, "gamma": gamma_draws,
            "EY": EY_draws}

#your code here
gibbs = factor_gibbs(predictdf, 2, 1000, 0.1, 0.1, progress=True)


prediction = gibbs['EY']
gibbs_means = np.mean(prediction, axis=1, keepdims=False)


#your code here
compare_results(predictdf.stars, gibbs_means, title='Gibbs 2-Factor Model Ratings Predictions')

subsetoffull=fulldf[['user_id','business_id', 'stars','business_avg','user_avg']]
subsetoffull.to_csv("subset-full.csv", index=False, header=False)
subsetofsmall=smalldf[['user_id','business_id', 'stars','business_avg','user_avg']]
subsetofsmall.to_csv("subset-small.csv", index=False, header=False)

#from pygments import highlight
#from pygments.lexers import PythonLexer
#from pygments.formatters import HtmlFormatter
#from IPython.display import HTML
#import urllib
#skelcode = urllib.urlopen("https://raw.github.com/cs109/content/master/skeleton.py").read()
#skelhtml=highlight(skelcode, PythonLexer(), HtmlFormatter())
#HTML(skelhtml)

def upper_generator(words):
    for word in words:
        yield word.upper()

words = ['a', 'couple', 'of', 'words', 'to', 'process']

print upper_generator(words)
print list(upper_generator(words))
for u in upper_generator(words):
     print u


thecode = open("computesim.py").read()
thehtml=highlight(thecode, PythonLexer(), HtmlFormatter())
HTML(thehtml)

output_small_local=[[json.loads(j) for j in line.strip().split("\t")] for line in open("./output.small.local.txt")]
output_small_local[0]

def make_database_from_pairs(df, bizpairs):
    """
    make the database from the pairs returned from mrjob.
    df is the dataframe, smalldf or fulldf.
    bizpairs are a list of elements, each of which is a list of two
        lists. The first of these lists has the two business id's, while
        the second has the similarity and the common support
    Returns an instance of the Database class.
    """
    dbase=Database(df)
    cache={}
    for bp,corrs in bizpairs:
        b1,b2=bp
        i1=dbase.uniquebizids[b1]
        i2=dbase.uniquebizids[b2]
        sim,nsup=corrs
        dbase.database_sim[i1][i2]=sim
        dbase.database_sim[i2][i1]=sim
        dbase.database_sup[i1][i2]=nsup
        dbase.database_sup[i2][i1]=nsup
        if cache.has_key(b1):
            nsup1=cache[b1]
        else:
            nsup1=dbase.df[dbase.df.business_id==b1].user_id.count()
            cache[b1]=nsup1
        if cache.has_key(b2):
            nsup2=cache[b2]
        else:
            nsup2=dbase.df[dbase.df.business_id==b2].user_id.count()
            cache[b2]=nsup2
        dbase.database_sim[i1][i1]=1.0
        dbase.database_sim[i2][i2]=1.0
        dbase.database_sup[i1][i1]=nsup1
        dbase.database_sup[i2][i2]=nsup2
    return dbase

db_mrjob_local=make_database_from_pairs(smalldf, output_small_local)

print db.get("zruUQvFySeXyEd7_rQixBg", "z3yFuLVrmH-3RJruPEMYKw")
print db_mrjob_local.get("zruUQvFySeXyEd7_rQixBg", "z3yFuLVrmH-3RJruPEMYKw")

sums=0.
count=0
for k in db.uniquebizids.keys():
    for k2 in db.uniquebizids.keys():
        count=count+1
        sums=sums+db.get(k,k2)[0]-db_mrjob_local.get(k,k2)[0]
print sums, count

output_full_emr=[[json.loads(j) for j in l.strip().split("\t")] for l in open("./output.full.emr.txt")]

dbfull=make_database_from_pairs(fulldf, output_full_emr)

#your code here
print "for user",usernamefromid(fulldf,testuserid), 'avg', fulldf[fulldf.user_id==testuserid].stars.mean() 
for biz_id in bizs:
    print "----------------------------------"
    print biznamefromid(fulldf, biz_id)
    print "Predicted Rating:",rating(fulldf, dbfull, biz_id, testuserid, k=7, reg=3.) 
    u,a=get_other_ratings(biz_id, testuserid, fulldf)
    print "Actual User Rating:",u,"Avg Rating",a