In this tutorial we'll show you how to build a recommendation system using pandas, scikit-learn, and numpy. We've provided a dataset of beer reviews, which we'll use to build our product recommender, but the same approach works just as well for other kinds of products.
import pandas as pd
import numpy as np
import pylab as pl
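Grab the dataset from our data demos bucket on S3, then decompress it. This creates a directory called beer_reviews (~/Downloads/beer_reviews if you extract it in your Downloads folder); make sure the path passed to read_csv below points at it.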
# Load the reviews into a DataFrame. The path is relative to the current
# working directory: point it at wherever you decompressed the dataset
# (if you're on windows you'll need a different filepath).
df = pd.read_csv("../data/beer_reviews/beer_reviews.csv")
# Preview the first rows to check that the columns loaded as expected.
df.head()
brewery_id | brewery_name | review_time | review_overall | review_aroma | review_appearance | review_profilename | beer_style | review_palate | review_taste | beer_name | beer_abv | beer_beerid | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 10325 | Vecchio Birraio | 1234817823 | 1.5 | 2.0 | 2.5 | stcules | Hefeweizen | 1.5 | 1.5 | Sausa Weizen | 5.0 | 47986 |
1 | 10325 | Vecchio Birraio | 1235915097 | 3.0 | 2.5 | 3.0 | stcules | English Strong Ale | 3.0 | 3.0 | Red Moon | 6.2 | 48213 |
2 | 10325 | Vecchio Birraio | 1235916604 | 3.0 | 2.5 | 3.0 | stcules | Foreign / Export Stout | 3.0 | 3.0 | Black Horse Black Beer | 6.5 | 48215 |
3 | 10325 | Vecchio Birraio | 1234725145 | 3.0 | 3.0 | 3.5 | stcules | German Pilsener | 2.5 | 3.0 | Sausa Pils | 5.0 | 47969 |
4 | 1075 | Caldera Brewing Company | 1293735206 | 4.0 | 4.5 | 4.0 | johnmichaelsen | American Double / Imperial IPA | 4.0 | 4.5 | Cauldron DIPA | 7.7 | 64883 |
# Pick two beers and find the reviewers who have rated both of them.
beer_1, beer_2 = "Dale's Pale Ale", "Fat Tire Amber Ale"

beer_1_reviewers = df[df.beer_name == beer_1].review_profilename.unique()
beer_2_reviewers = df[df.beer_name == beer_2].review_profilename.unique()
common_reviewers = set(beer_1_reviewers).intersection(beer_2_reviewers)

# A parenthesised single-argument print behaves the same on Python 2 and 3.
print("Users in the sameset: %d" % len(common_reviewers))
# Show a small sample of the shared reviewers.
list(common_reviewers)[:10]
Users in the sameset: 499
['womencantsail', 'Marty30', 'Winter', 'Lothore', 'bump8628', 'gford217', 'lackenhauser', 'wspscott', 'mjurney', 'LiquidBread219']
def get_beer_reviews(beer, common_users, data=None):
    """Return one review of `beer` per user in `common_users`, sorted by user.

    Parameters
    ----------
    beer : str
        Value to match against the ``beer_name`` column.
    common_users : iterable of str
        Reviewer names (``review_profilename`` values) to keep.
    data : pandas.DataFrame, optional
        Reviews table to search. Defaults to the module-level ``df``.

    Returns
    -------
    pandas.DataFrame
        The matching rows ordered by ``review_profilename``, keeping only the
        first row for each reviewer.
    """
    if data is None:
        data = df
    mask = data.review_profilename.isin(common_users) & (data.beer_name == beer)
    # DataFrame.sort() was removed from pandas; sort_values() replaces it.
    # A stable sort keeps each reviewer's rows in their original relative
    # order, so which duplicate survives below is deterministic.
    reviews = data[mask].sort_values("review_profilename", kind="mergesort")
    # Keep a single review per reviewer (the first one after sorting).
    return reviews.drop_duplicates(subset="review_profilename")
# Fetch each beer's reviews written by the shared reviewers.
beer_1_reviews = get_beer_reviews(beer_1, common_reviewers)
beer_2_reviews = get_beer_reviews(beer_2, common_reviewers)
# Columns to display: the identifying columns plus four of the rating columns.
cols = ['beer_name', 'review_profilename', 'review_overall', 'review_aroma', 'review_palate', 'review_taste']
# Preview the second beer's reviews.
beer_2_reviews[cols].head()
beer_name | review_profilename | review_overall | review_aroma | review_palate | review_taste | |
---|---|---|---|---|---|---|
202456 | Fat Tire Amber Ale | ATPete | 4.5 | 4.0 | 4.0 | 4.5 |
201458 | Fat Tire Amber Ale | AdamBear | 3.5 | 2.5 | 4.5 | 3.5 |
201886 | Fat Tire Amber Ale | AlCaponeJunior | 2.0 | 3.0 | 3.5 | 3.0 |
202481 | Fat Tire Amber Ale | AltBock | 4.0 | 3.0 | 3.0 | 3.0 |
201803 | Fat Tire Amber Ale | Andreji | 4.0 | 4.5 | 4.0 | 4.0 |
# choose your own way to calculate distance
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics.pairwise import manhattan_distances
from scipy.stats.stats import pearsonr
# Rating columns that are compared between two beers.
ALL_FEATURES = ['review_overall', 'review_aroma', 'review_palate', 'review_taste']


def calculate_similarity(beer1, beer2, metric=None):
    """Return one distance per rating feature between two beers.

    Only users who reviewed both beers are considered. For each column in
    ALL_FEATURES, the two beers' ratings (as returned by get_beer_reviews)
    are treated as two vectors and the distance between them is computed.

    Parameters
    ----------
    beer1, beer2 : str
        Values of the ``beer_name`` column identifying the beers to compare.
    metric : callable, optional
        Pairwise-distance function taking two 2-D arrays and returning a 2-D
        array, e.g. ``euclidean_distances`` or ``manhattan_distances``.
        Defaults to ``euclidean_distances``.

    Returns
    -------
    list of float
        Distances in ALL_FEATURES order. Every entry is NaN when the two
        beers have no reviewer in common.
    """
    if metric is None:
        metric = euclidean_distances

    # find common reviewers
    beer_1_reviewers = df[df.beer_name == beer1].review_profilename.unique()
    beer_2_reviewers = df[df.beer_name == beer2].review_profilename.unique()
    common_reviewers = set(beer_1_reviewers).intersection(beer_2_reviewers)
    if not common_reviewers:
        # Nothing to compare; report "unknown" rather than failing on empty input.
        return [float("nan")] * len(ALL_FEATURES)

    # get reviews
    beer_1_reviews = get_beer_reviews(beer1, common_reviewers)
    beer_2_reviews = get_beer_reviews(beer2, common_reviewers)

    dists = []
    for f in ALL_FEATURES:
        # The pairwise metrics expect 2-D (n_samples, n_features) input, so
        # present each rating column as a single row vector.
        ratings_1 = beer_1_reviews[f].values.reshape(1, -1)
        ratings_2 = beer_2_reviews[f].values.reshape(1, -1)
        dists.append(metric(ratings_1, ratings_2)[0][0])
    return dists
# Per-feature distances between the two example beers, in ALL_FEATURES order.
calculate_similarity(beer_1, beer_2)
[17.592612085759182, 17.38533865071371, 16.454482671904334, 17.613914953808536]
# calculate only a subset for the demo
beers = ["Dale's Pale Ale", "Sierra Nevada Pale Ale", "Michelob Ultra",
         "Natural Light", "Bud Light", "Fat Tire Amber Ale", "Coors Light",
         "Blue Moon Belgian White", "60 Minute IPA", "Guinness Draught"]
# calculate everything for real production
# beers = df.beer_name.unique()

# One row per ordered pair of distinct beers:
# [beer1, beer2, <one distance per feature>].
simple_distances = []
for beer1 in beers:
    # %-formatting keeps the progress output identical on Python 2 and 3.
    print("starting %s" % beer1)
    for beer2 in beers:
        if beer1 != beer2:
            row = [beer1, beer2] + calculate_similarity(beer1, beer2)
            simple_distances.append(row)
starting Dale's Pale Ale starting Sierra Nevada Pale Ale starting Michelob Ultra starting Natural Light starting Bud Light starting Fat Tire Amber Ale starting Coors Light starting Blue Moon Belgian White starting 60 Minute IPA starting Guinness Draught
# Column names: the beer pair followed by one distance column per feature.
cols = ["beer1", "beer2", "overall_dist", "aroma_dist", "palate_dist", "taste_dist"]
# Rebind simple_distances from a list of rows to a DataFrame.
simple_distances = pd.DataFrame(simple_distances, columns=cols)
# Display the last few rows.
simple_distances.tail()
beer1 | beer2 | overall_dist | aroma_dist | palate_dist | taste_dist | |
---|---|---|---|---|---|---|
85 | Guinness Draught | Bud Light | 44.260592 | 42.520583 | 45.825757 | 44.452222 |
86 | Guinness Draught | Fat Tire Amber Ale | 23.958297 | 21.023796 | 25.014996 | 22.798026 |
87 | Guinness Draught | Coors Light | 41.237119 | 38.823318 | 43.508620 | 40.620192 |
88 | Guinness Draught | Blue Moon Belgian White | 27.147744 | 22.477767 | 25.806976 | 23.727621 |
89 | Guinness Draught | 60 Minute IPA | 31.348844 | 32.310989 | 32.256782 | 34.510868 |
def calc_distance(dists, beer1, beer2, weights):
    """Return the weighted sum of the per-feature distances for a beer pair.

    Parameters
    ----------
    dists : pandas.DataFrame
        Table with columns ``beer1``, ``beer2``, ``overall_dist``,
        ``aroma_dist``, ``palate_dist`` and ``taste_dist``.
    beer1, beer2 : str
        Names identifying the row to look up (the order matters).
    weights : sequence of four numbers
        Multipliers for the overall, aroma, palate and taste distances, in
        that order.

    Returns
    -------
    float
        Sum of each distance multiplied by its weight.

    Raises
    ------
    IndexError
        If `dists` has no row for the (beer1, beer2) pair.
    """
    feature_cols = ['overall_dist', 'aroma_dist', 'palate_dist', 'taste_dist']
    mask = (dists.beer1 == beer1) & (dists.beer2 == beer2)
    row = dists.loc[mask, feature_cols]
    if row.empty:
        # Same exception type the bare [0] lookup used to raise, but with a
        # message that names the missing pair.
        raise IndexError("no distances stored for (%r, %r)" % (beer1, beer2))
    # Scale each distance column by its corresponding weight.
    weighted = row.mul(weights, axis="columns")
    # If the pair appears more than once, the first matching row is used.
    return weighted.sum(axis=1).tolist()[0]
# Relative importance of the overall, aroma, palate and taste distances
# (the order of the *_dist columns read by calc_distance).
weights = [2, 1, 1, 1]
# print(...) with a single argument works under both Python 2 and Python 3.
print(calc_distance(simple_distances, "Fat Tire Amber Ale", "Dale's Pale Ale", weights))
print(calc_distance(simple_distances, "Fat Tire Amber Ale", "Michelob Ultra", weights))
86.6389604479 153.0466855
# Rank every other beer by its weighted distance from the chosen beer.
my_beer = "Coors Light"
results = []
for b in beers:
    if b == my_beer:
        # a beer is never compared with itself
        continue
    weighted_dist = calc_distance(simple_distances, my_beer, b, weights)
    results.append((my_beer, b, weighted_dist))
# Smallest distance first.
sorted(results, key=lambda entry: entry[2])
[('Coors Light', 'Natural Light', 69.52320122830363), ('Coors Light', 'Michelob Ultra', 72.4303764655898), ('Coors Light', 'Bud Light', 100.45382254092895), ('Coors Light', 'Blue Moon Belgian White', 175.24657417286627), ('Coors Light', 'Fat Tire Amber Ale', 176.31863930228485), ('Coors Light', "Dale's Pale Ale", 181.20123311633913), ('Coors Light', 'Guinness Draught', 205.42636799646337), ('Coors Light', '60 Minute IPA', 233.40510433819486), ('Coors Light', 'Sierra Nevada Pale Ale', 254.78216241090442)]
from yhat import Yhat, BaseModel
class BeerRec(BaseModel):
    """Model that ranks the other beers by weighted distance to a given beer.

    The methods read ``self.simple_distances`` and ``self.beers`` and call the
    ``calc_distance`` helper, so all three must be available on the instance /
    in scope when the model runs.
    """

    def transform(self, raw_data):
        """Extract the beer name and normalized weights from a request.

        raw_data: mapping with a required "beer" key and an optional
            "weights" key (defaults to four equal weights).

        Returns a ``(beer, weights)`` tuple whose weights sum to 1.0.
        Raises ValueError if the weights sum to zero.
        """
        beer = raw_data['beer']
        weights = raw_data.get("weights", [1, 1, 1, 1])
        # Sum once up front instead of once per weight.
        total = float(sum(weights))
        if total == 0:
            raise ValueError("weights must not sum to zero")
        # normalize the weights so they sum to 1.0
        weights = [w / total for w in weights]
        return (beer, weights)

    def predict(self, data):
        """Return (beer, other_beer, distance) tuples, smallest distance first.

        data: a ``(beer, weights)`` pair, presumably the tuple returned by
            transform().
        """
        beer, weights = data
        results = [
            (beer, beer_cmp,
             calc_distance(self.simple_distances, beer, beer_cmp, weights))
            for beer_cmp in self.beers
            if beer != beer_cmp
        ]
        return sorted(results, key=lambda x: x[2])
import os

# Read the Yhat credentials from the environment instead of hard-coding them:
# an API key committed to source is exposed to everyone who can read the file.
# Set YHAT_USERNAME and YHAT_APIKEY before running this cell.
# NOTE(review): any key that was previously embedded here should be treated as
# compromised and rotated.
yh = Yhat(os.environ["YHAT_USERNAME"], os.environ["YHAT_APIKEY"])
# Bundle the distance table, the beer list and the helper with the model.
br = BeerRec(simple_distances=simple_distances, beers=beers,
             udfs=[calc_distance])
yh.deploy("PythonBeerRec", br)
uploading... done!
{u'modelname': u'PythonBeerRec', u'status': u'success', u'version': 1}
# Ask the deployed model for beers similar to Coors Light using equal weights.
# The second argument is presumably the model version (deploy reported version 1).
yh.predict("PythonBeerRec", 1, {"beer": "Coors Light", "weights": [1, 1, 1, 1]})
[[u'Coors Light', u'Natural Light', 13.10332], [u'Coors Light', u'Michelob Ultra', 13.58854], [u'Coors Light', u'Bud Light', 18.89981], [u'Coors Light', u'Fat Tire Amber Ale', 35.31271], [u'Coors Light', u'Blue Moon Belgian White', 35.50784], [u'Coors Light', u"Dale's Pale Ale", 36.80674], [u'Coors Light', u'Guinness Draught', 41.04731], [u'Coors Light', u'60 Minute IPA', 47.50289], [u'Coors Light', u'Sierra Nevada Pale Ale', 51.62802]]