Building a Recommendation System in Python

In this tutorial we'll show you how to build a recommendation system using pandas, scikit-learn, and NumPy. We've provided a dataset of beer reviews that we'll use to build our product recommender, but the same approach works just as well for any other product.

In [26]:
import pandas as pd
import numpy as np
import pylab as pl

Download the data

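Grab the dataset from our data demos bucket on S3 and save it to ~/Downloads, then decompress it. Extraction creates a beer_reviews directory inside whatever directory you run the command from; the rest of this tutorial assumes you ran it from ~/Downloads, giving ~/Downloads/beer_reviews.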

In [5]:
%%sh
tar xvf ~/Downloads/beer_reviews.tar.gz
x beer_reviews/
x beer_reviews/beer_reviews.csv
x beer_reviews/load_into_postgres.sql

In [35]:
# Build the path from the current user's home directory so it works for any
# username and on Windows as well as macOS/Linux. Edit DATA_PATH if you
# extracted the archive somewhere other than ~/Downloads.
import os

DATA_PATH = os.path.join(os.path.expanduser("~"), "Downloads",
                         "beer_reviews", "beer_reviews.csv")
df = pd.read_csv(DATA_PATH)
df.head()
Out[35]:
brewery_id brewery_name review_time review_overall review_aroma review_appearance review_profilename beer_style review_palate review_taste beer_name beer_abv beer_beerid
0 10325 Vecchio Birraio 1234817823 1.5 2.0 2.5 stcules Hefeweizen 1.5 1.5 Sausa Weizen 5.0 47986
1 10325 Vecchio Birraio 1235915097 3.0 2.5 3.0 stcules English Strong Ale 3.0 3.0 Red Moon 6.2 48213
2 10325 Vecchio Birraio 1235916604 3.0 2.5 3.0 stcules Foreign / Export Stout 3.0 3.0 Black Horse Black Beer 6.5 48215
3 10325 Vecchio Birraio 1234725145 3.0 3.0 3.5 stcules German Pilsener 2.5 3.0 Sausa Pils 5.0 47969
4 1075 Caldera Brewing Company 1293735206 4.0 4.5 4.0 johnmichaelsen American Double / Imperial IPA 4.0 4.5 Cauldron DIPA 7.7 64883

Finding People Who Have Reviewed Both Beers

In [32]:
# The two beers whose reviewer overlap we want to inspect.
beer_1, beer_2 = "Dale's Pale Ale", "Fat Tire Amber Ale"

# Distinct reviewer names for each beer.
beer_1_reviewers = df[df.beer_name == beer_1].review_profilename.unique()
beer_2_reviewers = df[df.beer_name == beer_2].review_profilename.unique()

# Reviewers who have rated both beers.
common_reviewers = set(beer_1_reviewers).intersection(beer_2_reviewers)

# A parenthesized, single-argument print behaves the same on Python 2 and 3.
print("Users in the sameset: %d" % len(common_reviewers))
# Sets are unordered, so this is an arbitrary sample of ten names.
list(common_reviewers)[:10]
Users in the sameset: 499

Out[32]:
['womencantsail',
 'Marty30',
 'Winter',
 'Lothore',
 'bump8628',
 'gford217',
 'lackenhauser',
 'wspscott',
 'mjurney',
 'LiquidBread219']

Extracting Reviews

In [13]:
def get_beer_reviews(beer, common_users):
    """Return one review of `beer` per user in `common_users`, sorted by user.

    Reads the module-level ``df`` review table.

    Parameters
    ----------
    beer : value compared against ``df.beer_name``
    common_users : collection of ``review_profilename`` values to keep

    Returns
    -------
    DataFrame with at most one row per reviewer, ordered by
    ``review_profilename``. When a reviewer has reviewed the beer more than
    once, the review that appears first in ``df`` is the one kept.
    """
    mask = df.review_profilename.isin(common_users) & (df.beer_name == beer)
    # De-duplicate before sorting so the review that survives does not depend
    # on how the (non-stable) default sort orders equal reviewer names.
    reviews = df[mask].drop_duplicates(subset='review_profilename')
    # DataFrame.sort() has been removed from pandas; sort_values() replaces it.
    return reviews.sort_values('review_profilename')
# Fetch the per-reviewer review tables for both beers, restricted to the
# people who reviewed each of them.
beer_1_reviews, beer_2_reviews = (
    get_beer_reviews(name, common_reviewers) for name in (beer_1, beer_2)
)

# Who rated which beer, followed by the four rating dimensions.
cols = [
    'beer_name', 'review_profilename',
    'review_overall', 'review_aroma', 'review_palate', 'review_taste',
]
beer_2_reviews[cols].head()
Out[13]:
beer_name review_profilename review_overall review_aroma review_palate review_taste
202456 Fat Tire Amber Ale ATPete 4.5 4.0 4.0 4.5
201458 Fat Tire Amber Ale AdamBear 3.5 2.5 4.5 3.5
201886 Fat Tire Amber Ale AlCaponeJunior 2.0 3.0 3.5 3.0
202481 Fat Tire Amber Ale AltBock 4.0 3.0 3.0 3.0
201803 Fat Tire Amber Ale Andreji 4.0 4.5 4.0 4.0

Calculating Distance

In [38]:
# choose your own way to calculate distance
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics.pairwise import manhattan_distances
from scipy.stats.stats import pearsonr


# Rating columns that are compared between two beers, in output order.
ALL_FEATURES = ['review_overall', 'review_aroma', 'review_palate', 'review_taste']


def calculate_similarity(beer1, beer2):
    """Return the per-feature Euclidean distance between two beers' ratings.

    Only reviewers who rated both beers are considered. For each column in
    ``ALL_FEATURES`` the two beers' ratings (one per shared reviewer) are
    treated as two vectors and the distance between them is computed.

    Parameters
    ----------
    beer1, beer2 : values compared against ``df.beer_name``

    Returns
    -------
    list with one distance per entry of ``ALL_FEATURES``, in the same order.
    Smaller means the shared reviewers rated the two beers more alike.
    """
    # find common reviewers
    beer_1_reviewers = df[df.beer_name == beer1].review_profilename.unique()
    beer_2_reviewers = df[df.beer_name == beer2].review_profilename.unique()
    common_reviewers = set(beer_1_reviewers).intersection(beer_2_reviewers)

    # get reviews -- each frame is sorted by reviewer with one row apiece, so
    # row i of both frames belongs to the same person
    beer_1_reviews = get_beer_reviews(beer1, common_reviewers)
    beer_2_reviews = get_beer_reviews(beer2, common_reviewers)

    dists = []
    for f in ALL_FEATURES:
        # scikit-learn wants 2-D (n_samples, n_features) input. Present each
        # beer as a single sample whose features are the reviewers' ratings,
        # which yields a 1x1 distance matrix.
        ratings_1 = beer_1_reviews[f].values.reshape(1, -1)
        ratings_2 = beer_2_reviews[f].values.reshape(1, -1)
        dists.append(euclidean_distances(ratings_1, ratings_2)[0][0])

    return dists

calculate_similarity(beer_1, beer_2)
Out[38]:
[17.592612085759182, 17.38533865071371, 16.454482671904334, 17.613914953808536]

Calculate the Similarity for a Set of Beers

In [40]:
# calculate only a subset for the demo
beers = ["Dale's Pale Ale", "Sierra Nevada Pale Ale", "Michelob Ultra",
         "Natural Light", "Bud Light", "Fat Tire Amber Ale", "Coors Light",
         "Blue Moon Belgian White", "60 Minute IPA", "Guinness Draught"]

# calculate everything for real production
# beers = df.beer_name.unique()

simple_distances = []
for beer1 in beers:
    print "starting", beer1
    for beer2 in beers:
        if beer1 != beer2:
            row = [beer1, beer2] + calculate_similarity(beer1, beer2)
            simple_distances.append(row)
starting Dale's Pale Ale
starting Sierra Nevada Pale Ale
starting Michelob Ultra
starting Natural Light
starting Bud Light
starting Fat Tire Amber Ale
starting Coors Light
starting Blue Moon Belgian White
starting 60 Minute IPA
starting Guinness Draught

Inspect the Results

In [15]:
# Label the raw rows: the beer pair first, then one distance per rating feature.
cols = ["beer1", "beer2"]
cols += ["overall_dist", "aroma_dist", "palate_dist", "taste_dist"]
simple_distances = pd.DataFrame(data=simple_distances, columns=cols)
simple_distances.tail()
Out[15]:
beer1 beer2 overall_dist aroma_dist palate_dist taste_dist
85 Guinness Draught Bud Light 44.260592 42.520583 45.825757 44.452222
86 Guinness Draught Fat Tire Amber Ale 23.958297 21.023796 25.014996 22.798026
87 Guinness Draught Coors Light 41.237119 38.823318 43.508620 40.620192
88 Guinness Draught Blue Moon Belgian White 27.147744 22.477767 25.806976 23.727621
89 Guinness Draught 60 Minute IPA 31.348844 32.310989 32.256782 34.510868

Allow the User to Customize the Weights

In [34]:
def calc_distance(dists, beer1, beer2, weights):
    """Return the weighted sum of the stored distances for one beer pair.

    Parameters
    ----------
    dists : DataFrame with ``beer1``, ``beer2`` and the four ``*_dist`` columns
    beer1, beer2 : the ordered pair to look up
    weights : four multipliers, applied in the order overall, aroma, palate,
        taste

    Returns
    -------
    The weighted total for the first row matching the pair.

    Raises
    ------
    IndexError if no row matches ``(beer1, beer2)``.
    """
    feature_cols = ['overall_dist', 'aroma_dist', 'palate_dist', 'taste_dist']
    is_pair = (dists.beer1 == beer1) & (dists.beer2 == beer2)
    pair_dists = dists.loc[is_pair, feature_cols]
    # The weights list lines up with the columns position by position.
    weighted_totals = (weights * pair_dists).sum(axis=1)
    return weighted_totals.tolist()[0]

# Order is overall, aroma, palate, taste: the overall rating counts double.
weights = [2, 1, 1, 1]
# print(...) with a single argument works on both Python 2 and Python 3.
print(calc_distance(simple_distances, "Fat Tire Amber Ale", "Dale's Pale Ale", weights))
print(calc_distance(simple_distances, "Fat Tire Amber Ale", "Michelob Ultra", weights))
86.6389604479
153.0466855

Find Similar Beers for Coors Light

In [17]:
my_beer = "Coors Light"
# Weighted distance from my_beer to every other beer in the demo set.
results = [
    (my_beer, other_beer,
     calc_distance(simple_distances, my_beer, other_beer, weights))
    for other_beer in beers
    if other_beer != my_beer
]
# Closest (smallest distance) first.
sorted(results, key=lambda rec: rec[2])
Out[17]:
[('Coors Light', 'Natural Light', 69.52320122830363),
 ('Coors Light', 'Michelob Ultra', 72.4303764655898),
 ('Coors Light', 'Bud Light', 100.45382254092895),
 ('Coors Light', 'Blue Moon Belgian White', 175.24657417286627),
 ('Coors Light', 'Fat Tire Amber Ale', 176.31863930228485),
 ('Coors Light', "Dale's Pale Ale", 181.20123311633913),
 ('Coors Light', 'Guinness Draught', 205.42636799646337),
 ('Coors Light', '60 Minute IPA', 233.40510433819486),
 ('Coors Light', 'Sierra Nevada Pale Ale', 254.78216241090442)]

Wrap it in Yhat

In [18]:
from yhat import Yhat, BaseModel

class BeerRec(BaseModel):
    """Yhat model that ranks the other beers by weighted distance from one beer.

    The methods read ``self.beers`` and ``self.simple_distances``.
    NOTE(review): these are presumably populated from the keyword arguments
    given to the constructor -- confirm against the Yhat BaseModel docs.
    """

    def transform(self, raw_data):
        """Pull the query beer and normalized weights out of a request.

        Parameters
        ----------
        raw_data : dict-like with a required ``'beer'`` entry and an optional
            ``'weights'`` entry (defaults to equal weights ``[1, 1, 1, 1]``)

        Returns
        -------
        ``(beer, weights)`` where the weights have been scaled to sum to 1.0.

        Raises
        ------
        KeyError if a dict ``raw_data`` has no ``'beer'`` entry;
        ZeroDivisionError if the weights sum to zero.
        """
        beer = raw_data['beer']
        weights = raw_data.get("weights", [1, 1, 1, 1])
        # normalize the weights so they sum to 1.0; the total is computed once
        # instead of once per element
        total = sum(weights)
        weights = [float(w) / total for w in weights]
        return (beer, weights)

    def predict(self, data):
        """Rank every other known beer by weighted distance from the query.

        Parameters
        ----------
        data : the ``(beer, weights)`` tuple produced by ``transform``

        Returns
        -------
        list of ``(beer, other_beer, distance)`` tuples, smallest distance
        first.
        """
        beer, weights = data
        results = [
            (beer, beer_cmp,
             calc_distance(self.simple_distances, beer, beer_cmp, weights))
            for beer_cmp in self.beers
            if beer != beer_cmp
        ]
        return sorted(results, key=lambda x: x[2])

Deploy to Yhat

In [20]:
# Replace the {USERNAME} and {APIKEY} placeholders with your own Yhat credentials.
yh = Yhat("{USERNAME}", "{APIKEY}")
# Give the model the distance table and beer list that predict() reads.
# NOTE(review): calc_distance is listed in udfs presumably so that it is
# shipped alongside the model -- confirm against the Yhat client docs.
br = BeerRec(simple_distances=simple_distances, beers=beers,
             udfs=[calc_distance])
# Upload the model under the name used by the predict calls in this tutorial.
yh.deploy("PydataBeerRec", br)
uploading... done!

Out[20]:
{u'modelname': u'PydataBeerRec', u'status': u'success', u'version': 1}

Test it Out

In [24]:
yh.predict("PydataBeerRec", 1, {"beer": "Coors Light", "weights": [1, 1, 1, 1]})
Out[24]:
[[u'Coors Light', u'Natural Light', 13.10332],
 [u'Coors Light', u'Michelob Ultra', 13.58854],
 [u'Coors Light', u'Bud Light', 18.89981],
 [u'Coors Light', u'Fat Tire Amber Ale', 35.31271],
 [u'Coors Light', u'Blue Moon Belgian White', 35.50784],
 [u'Coors Light', u"Dale's Pale Ale", 36.80674],
 [u'Coors Light', u'Guinness Draught', 41.04731],
 [u'Coors Light', u'60 Minute IPA', 47.50289],
 [u'Coors Light', u'Sierra Nevada Pale Ale', 51.62802]]
In [25]:
yh.predict("PydataBeerRec", 1, {"beer": "Coors Light", "weights": [2, 1, 0, 0]})
Out[25]:
[[u'Coors Light', u'Natural Light', 14.79369],
 [u'Coors Light', u'Michelob Ultra', 15.81099],
 [u'Coors Light', u'Bud Light', 21.75517],
 [u'Coors Light', u'Blue Moon Belgian White', 34.41245],
 [u'Coors Light', u'Fat Tire Amber Ale', 35.19777],
 [u'Coors Light', u"Dale's Pale Ale", 35.41338],
 [u'Coors Light', u'Guinness Draught', 40.43252],
 [u'Coors Light', u'60 Minute IPA', 45.5498],
 [u'Coors Light', u'Sierra Nevada Pale Ale', 49.73314]]
In []: