"""Beer recommender demo.

Loads the beer reviews dataset, keeps the ``n`` most-reviewed beers, builds a
beer x reviewer matrix of mean ``review_overall`` scores, computes the pairwise
cosine similarity between beers, and exposes the result through
``get_similar`` and the ``BeerRec`` Yhat model, which is then deployed.
"""
import pandas as pd
import numpy as np
import pprint as pp
import os

# don't try this at home: this silences every warning for the whole process
import warnings
warnings.filterwarnings("ignore")

# ! say "Welcome to the presentation!"

# ! curl -O https://s3.amazonaws.com/demo-datasets/beer_reviews.tar.gz

# expanduser("~") honours $HOME when it is set and still resolves a home
# directory on platforms where it is not, instead of raising KeyError
home = os.path.expanduser("~")
# or wherever you've saved it locally
filename = os.path.join(home, "Dropbox/yhathq/datasets/beer_reviews/beer_reviews.csv")
df = pd.read_csv(filename)

# keep only the n most frequently reviewed beers
n = 250
top_n = df.beer_name.value_counts().index[:n]
df = df[df.beer_name.isin(top_n)]
df.head()

# one row per beer, one column per reviewer, cells hold the mean overall score
# (`index=` replaces the removed `rows=` keyword of pivot_table)
df_wide = pd.pivot_table(df, values=["review_overall"],
                         index=["beer_name", "review_profilename"],
                         aggfunc=np.mean).unstack()
df_wide.shape

# any cells that are missing data (i.e. a user didn't review a particular
# beer) we're going to set to 0
df_wide = df_wide.fillna(0)
# positional peek at the top-left corner (`.iloc` replaces the removed `.ix`)
df_wide.iloc[0:5, 0:5]

df_wide.columns[:10]

pd.Series(df_wide.index[:10])

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import manhattan_distances
from sklearn.metrics.pairwise import euclidean_distances
#from sklearn.metrics.pairwise import

dists = cosine_similarity(df_wide)
dists

# give the indices (equivalent to rownames in R) and the columns the beer
# names, so the matrix can be looked up by name in both directions
dists = pd.DataFrame(dists, index=df_wide.index, columns=df_wide.index)

dists.iloc[0:10, 0:10]

beers_i_like = ['Sierra Nevada Pale Ale', '120 Minute IPA', 'Allagash White']
dists[beers_i_like].head()

# axis = 1 b/c we want 1 score per beer, which are rows
beers_summed = dists[beers_i_like].sum(axis=1)
#beers_summed = beers_summed.reset_index()
#beers_summed.columns = ['beer_name', 'total_distance']
#beers_summed.sort(['beer_name'], ascending=False).head(10)
#beers_summed.sort(['total_distance'], ascending=False).head(10)
# sort_values returns a new Series, so the result has to be kept; otherwise
# ranked_beers below would not actually be ranked
beers_summed = beers_summed.sort_values(ascending=False)

ranked_beers = beers_summed.index[~beers_summed.index.isin(beers_i_like)]
ranked_beers = ranked_beers.tolist()
ranked_beers[:5]


def get_similar(beers, n=None):
    """
    Rank the beers that are most similar to the beers provided.

    The similarity scores against each provided beer are read from the
    module-level ``dists`` matrix and summed per candidate beer. The beers
    that were provided are not returned.

    Parameters
    ----------
    beers: list
        beer names; names that are not in ``dists.columns`` are ignored
    n: int, optional
        maximum number of beers to return; ``None`` returns all of them

    Returns
    -------
    ranked_beers: list
        beer names ordered from highest to lowest summed similarity; empty
        when none of the provided beers is known
    """
    known = [beer for beer in beers if beer in dists.columns]
    if not known:
        # nothing to compare against, so any ranking would be meaningless
        return []
    beers_summed = dists[known].sum(axis=1).sort_values(ascending=False)
    ranked_beers = beers_summed.index[~beers_summed.index.isin(known)].tolist()
    # slicing with n=None keeps the whole list
    return ranked_beers[:n]


# single-argument print(...) behaves the same on Python 2 and Python 3
for beer in get_similar(["120 Minute IPA"], 10):
    print(beer)

for i, beer in enumerate(get_similar(["Coors Light", "Bud Light", "Amstel Light"], 10)):
    print("%d) %s" % (i+1, beer))

from yhat import Yhat, YhatModel, preprocess


class BeerRec(YhatModel):
    """Yhat model wrapping ``get_similar`` for deployment."""

    @preprocess(in_type=dict, out_type=dict)
    def execute(self, data):
        """
        Return beer suggestions for a request payload.

        Parameters
        ----------
        data: dict
            ``"beers"`` holds the list of beer names to match against and
            the optional ``"n"`` caps the number of suggestions

        Returns
        -------
        result: list
            one ``{"beer": name}`` dict per suggested beer, best match first
        """
        beers = data.get("beers")
        n = data.get("n")
        suggested_beers = get_similar(beers, n)
        return [{"beer": beer} for beer in suggested_beers]


BeerRec().execute({"beers": ["Coors Light", "Bud Light", "Amstel Light"]})

payload = {"beers": ["Sierra Nevada Pale Ale", "Bud Light", "Amstel Light"], "n": 5}
BeerRec().execute(payload)

# BeerRec().run()

# NOTE(review): credentials are committed in source. The YHAT_USERNAME and
# YHAT_APIKEY environment variables take precedence; the literals remain only
# as a backward-compatible fallback and should be rotated and removed.
yh = Yhat(os.environ.get("YHAT_USERNAME", "hmardukas"),
          os.environ.get("YHAT_APIKEY", "0D86697D12FC4E5F802DFD4D51829FFE"),
          "http://cloud.yhathq.com/")

# this will ask you if you want to deploy so just make sure you accept!
yh.deploy("BeerRec", BeerRec, globals())

payload = {"beers": ["Coors Light", "Bud Light", "Amstel Light"]}
yh.predict("BeerRec", payload)

payload = {"beers": ["Sierra Nevada Pale Ale", "Bud Light"]}
yh.predict("BeerRec", payload)