#!/usr/bin/env python
# coding: utf-8

# In[52]:

import re
import glob

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

get_ipython().run_line_magic('matplotlib', 'inline')
pd.options.display.max_rows = 10
plt.rcParams['figure.figsize'] = (12, 6)


# We'll use the same dataset of beer reviews.

# In[2]:

df = pd.read_csv('data/beer_subset.csv.gz', parse_dates=['time'],
                 compression='gzip')
review_cols = ['review_appearance', 'review_aroma', 'review_overall',
               'review_palate', 'review_taste']
df.head()


# In[3]:

df.beer_id.nunique()


# In[4]:

df.profile_name.nunique()


# In[5]:

df.beer_id.value_counts().plot(kind='hist', bins=10, color='k', log=True,
                               title='Log reviews per beer');


# In[6]:

ax = df.brewer_id.value_counts().plot(kind='hist', bins=15, color='k',
                                      log=True, title='Log reviews per brewer');


# In[7]:

df.profile_name.value_counts().plot(kind='hist', bins=15, color='k', log=True,
                                    title='Log reviews per person');


# In[8]:

df.review_overall.value_counts().sort_index().plot(kind='bar', width=.8, rot=0);


# In[9]:

fig, ax = plt.subplots(figsize=(5, 10))
sns.countplot(hue='kind', y='stars',
              data=(df[review_cols]
                    .stack()
                    .reset_index(level=1)
                    .rename(columns={'level_1': 'kind', 0: 'stars'})),
              ax=ax, order=np.arange(0, 5.5, .5));


# # Groupby

# Groupby is a fundamental operation in pandas and data analysis.
#
# The components of a groupby operation are to
#
# 1. Split a table into groups
# 2. Apply a function to each group
# 3. Combine the results
#
# In pandas the first step looks like
#
# ```python
# df.groupby( grouper )
# ```
#
# `grouper` can be many things
#
# - Series (or string indicating a column in `df`)
# - function (to be applied on the index)
# - dict : groups by *values*
# - `levels=[]`, names of levels in a MultiIndex

# In[10]:

gr = df.groupby('beer_style')
gr


# We haven't really done anything yet, just some book-keeping to figure out which **keys** go with which rows. Keys are the things we've grouped by (each `beer_style` in this case).
#
# The last two steps, apply and combine, are just:

# In[11]:

gr.agg('mean')


# This says apply the `mean` function to each column. Non-numeric (nuisance) columns are excluded; note that recent versions of pandas raise a `TypeError` instead, so you may need to select the numeric columns first. We can also select a subset of columns to perform the aggregation on.

# In[12]:

gr[review_cols].agg('mean')


# `.` attribute lookup works as well.

# In[13]:

gr.abv.agg('mean')


# Certain operations are attached directly to the `GroupBy` object, letting you bypass the `.agg` part.

# In[14]:

gr.abv.mean()


# Exercise: Find the `beer_style`s with the greatest variance in `abv`.
#
# - hint: `.std` calculates the standard deviation, and is available on `GroupBy` objects like `gr.abv`.
# - hint: use `.sort_values` to sort a Series

# In[87]:

# your code goes here


# In[89]:

get_ipython().run_line_magic('load', '-r 15:17 solutions_groupby.py')


# Now we'll run the gamut on a bunch of grouper / apply combinations.
# Keep sight of the target though: split, apply, combine.
#
# Single grouper, multiple aggregations:

# Multiple aggregations on one column

# In[15]:

gr['review_aroma'].agg([np.mean, np.std, 'count']).head()


# Single aggregation on multiple columns

# In[16]:

gr[review_cols].mean()


# Multiple aggregations on multiple columns

# In[17]:

gr[review_cols].agg(['mean', 'count', 'std'])


# Hierarchical indexes in the columns can be awkward to work with, so I'll usually
# move a level to the index with `.stack`.

# In[18]:

gr[review_cols].agg(['mean', 'count', 'std']).stack(level=0)


# You can group by **levels** of a MultiIndex.
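# Before doing that with the real aggregates, here's a minimal sketch on a throwaway frame (the toy level names `letter` / `number` are made up for illustration):

# In[ ]:

toy = pd.DataFrame({'x': range(4)},
                   index=pd.MultiIndex.from_product([['a', 'b'], [1, 2]],
                                                    names=['letter', 'number']))
# collapses the 'number' level: a -> 0 + 1, b -> 2 + 3
toy.groupby(level='letter').sum()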
# Now with the real aggregates:

# In[19]:

multi = gr[review_cols].agg(['mean', 'count', 'std']).stack(level=0)
multi.head()


# In[20]:

multi.groupby(level='beer_style')['mean'].agg(['min', 'max'])


# Group by **multiple** columns

# In[21]:

df.groupby(['brewer_id', 'beer_style']).review_overall.mean()


# In[22]:

df.groupby(['brewer_id', 'beer_style'])[review_cols].mean()


# ### Exercise: Plot the relationship between review length (the `text` column) and average `review_overall`.
#
# Hint: Break the problem into pieces:
#
# - Find the **len**gth of each review (remember the `df.text.str` namespace?)
# - Group by that Series of review lengths
# - I used `style='k.'` in the plot

# In[93]:

# Your code goes here


# In[162]:

get_ipython().run_line_magic('load', '-r 1:5 solutions_groupby.py')


# Bonus exercise:
#
# - Try grouping by the number of words.
# - Try grouping by the number of sentences.
#
# Remember that `str.count` accepts a regular expression.
#
# Don't worry too much about these, especially if you don't remember the syntax
# for regular expressions (I never can). Just jump to the next exercise.

# In[ ]:



# In[99]:

get_ipython().run_line_magic('load', '-r 18:20 solutions_groupby.py')


# In[ ]:



# In[101]:

get_ipython().run_line_magic('load', '-r 21:26 solutions_groupby.py')


# In[40]:

# Your code goes here


# ### Exercise: Which **brewer** (`brewer_id`) has the largest gap between the min and max average `review_overall` across their beers?
#
# Hint: You'll need to do this in two steps.
#
# 1. Find the average `review_overall` by brewer and beer name.
# 2. Find the difference between the max and min by brewer (remember `.groupby(level=)`)

# In[43]:

# Your code goes here. You've got this!


# In[105]:

get_ipython().run_line_magic('load', '-r 6:13 solutions_groupby.py')


# In[106]:

# Show for those with counts > 20ish


# Create our own "kind" of beer, which aggregates `style`.

# In[23]:

style = df.beer_style.str.lower()
style.head()


# In[24]:

kinds = ['ipa', 'apa', 'amber ale', 'rye', 'scotch', 'stout', 'barleywine',
         'porter', 'brown ale', 'lager', 'pilsner', 'tripel', 'bitter',
         'farmhouse', 'malt liquor', 'rice']


# In[25]:

expr = '|'.join(['(?P<{name}>{pat})'.format(pat=kind, name=kind.replace(' ', '_'))
                 for kind in kinds])
expr


# In[26]:

beer_kind = (style.replace({'india pale ale': 'ipa',
                            'american pale ale': 'apa'})
                  .str.extract(expr).fillna('').sum(axis=1)
                  .str.lower().replace('', 'other'))
beer_kind.head()


# In[27]:

df.groupby(['brewer_id', beer_kind]).review_overall.mean()


# In[28]:

df.groupby(['brewer_id', beer_kind]).beer_id.nunique().unstack(1).fillna(0)


# ### Exercise: Which brewers have the most different `kinds` of beer?
#
# Hint: we used `df.profile_name.nunique()` to find the number of different profile names.
# What are we grouping, and what is our grouper?

# In[ ]:



# In[33]:

# %load -r 27:29 solutions_groupby.py


# ### Exercise: Which kinds of beer have the most brewers?

# In[ ]:



# In[121]:

get_ipython().run_line_magic('load', '-r 30:32 solutions_groupby.py')


# We've seen a lot of permutations among number of groupers, number of columns to aggregate, and number of aggregators.
# In fact, `.agg`, which returns one row per group, is just one way to combine the results. The three ways, contrasted in the toy sketch below, are
#
# - `agg`: one row per group
# - `transform`: identically shaped output as input
# - `apply`: anything goes
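# To make the shape contrast concrete, here's a minimal sketch on a throwaway frame (the toy names `key` / `val` are made up for illustration):

# In[ ]:

tiny = pd.DataFrame({'key': ['a', 'a', 'b'], 'val': [1.0, 2.0, 4.0]})
g = tiny.groupby('key')['val']

g.agg('mean')        # one row per group: a -> 1.5, b -> 4.0
g.transform('mean')  # same shape as the input: [1.5, 1.5, 4.0]
g.apply(lambda v: v - v.min())  # anything goes; here it happens to keep the shape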
# # Transform

# The combined Series / DataFrame is the same shape as the input. For example, say you want to standardize the reviews by subtracting the mean.

# In[29]:

def de_mean(reviews):
    """Subtract the mean of `reviews` from each review."""
    s = reviews - reviews.mean()
    return s


# In[30]:

de_mean(df.review_overall)


# We can do this at the *person* level with `groupby` and `transform`.

# In[31]:

df.groupby('profile_name')[review_cols].transform(de_mean)


# # Apply

# So there's `gr.agg`, `gr.transform`, and finally `gr.apply`. We're going to skip apply for now. I have an example in a later notebook.

# # Resample

# Resample is a special kind of groupby operation for when you have a `DatetimeIndex`.

# In[32]:

review_times = df.time.value_counts().sort_index()
review_times


# In[33]:

review_times.index


# The number of reviews within a given second isn't that interesting.

# In[34]:

review_times.plot()


# Right now the frequency is way too high to be meaningful. `resample` lets you adjust the frequency.

# In[35]:

review_times.resample('D').mean().plot()


# We've essentially grouped by day here (but the syntax of `.resample('D')` is much nicer than what we'd have to do with the `.groupby(grouper)` spelling). `resample` returns a deferred, groupby-like object; chain an aggregation such as `.mean()` to take the average of all the values that fall on that day. You can also sum.

# In[36]:

review_times.resample('D').sum().plot()


# The `freq` you pass in to `resample` is pretty flexible.

# In[37]:

review_times.resample('5D').sum()


# In[38]:

c = review_times.resample('3H').sum()
ax = c.plot(alpha=.5)
c.rolling(window=8).mean().plot(ax=ax)


# Note that `resample` bins the points into fixed intervals while `rolling` slides a window over the existing points; see the toy sketch after the exercise below.

# ### Exercise: Plot the number of distinct brewers reviewed per day
#
# - Hint: The aggregation you chain onto `.resample(...)` is *really* flexible. Try the first thing that comes to mind. What function do you want to apply to get the number of unique values?
# - Hint2: (Sorry this is harder than I thought). `resample` needs a `DatetimeIndex`. We have datetimes in `time`. If only there was a way to take a column and **set** it as the **index**...

# In[ ]:

get_ipython().run_line_magic('load', '-r 33:36 solutions_groupby.py')
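# Here's that `resample` vs. `rolling` contrast on a minimal synthetic series (the dates and values are made up for illustration):

# In[ ]:

idx = pd.date_range('2024-01-01', periods=6, freq='12H')
toy_ts = pd.Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], index=idx)

toy_ts.resample('D').sum()       # 3 rows, one per calendar day: 3, 7, 11
toy_ts.rolling(window=2).mean()  # 6 rows: NaN, then the mean of each adjacent pair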
# # Aside: Beer Recommender
#
# See [Harvard CS109](https://github.com/cs109/content) for a more complete example (with chocolate instead of beer).
#
# One place where transform comes in handy is as a preprocessing step for any kind of recommender. In some sense, the raw score I assign a beer is less important than that score relative to *my* mean.

# In[40]:

deduped = df[['beer_id', 'profile_name', 'review_overall']].drop_duplicates()
deduped.head()


# In[41]:

user_counts = deduped.profile_name.value_counts()
top_users = user_counts[user_counts > user_counts.quantile(.75)].index


# In[42]:

beer_counts = deduped.beer_id.value_counts()
top_beers = beer_counts[beer_counts > beer_counts.quantile(.9)].index


# In[43]:

top = deduped.query('beer_id in @top_beers and profile_name in @top_users')
user_means = top.groupby('profile_name').review_overall.mean()
beer_means = top.groupby('beer_id').review_overall.mean()


# In[44]:

fig, axes = plt.subplots(figsize=(16, 4), ncols=2, sharey=True, sharex=True)
sns.histplot(user_means, ax=axes[0], color='k', stat='density', alpha=1)
sns.histplot(beer_means, ax=axes[1], color='k', stat='density', alpha=1)
axes[0].set_title("User Averages")
axes[1].set_title("Beer Averages")


# In[45]:

s = top.set_index(['beer_id', 'profile_name']).review_overall.sort_index()
s.head()


# ### `de_mean` the scores in `s`

# In[46]:

standardized = s.groupby(level='profile_name').transform(de_mean)
standardized.head()


# In[47]:

from scipy.stats import pearsonr


# In[48]:

def pearson_sim(reviews_1, reviews_2, reg=2):
    """
    (Regularized) Pearson correlation coefficient between sets of
    reviews for two beers, made by a common subset of reviewers.

    `reviews_1` and `reviews_2` should have the same index: the
    `profile_name`s of the people who reviewed both beers.
    """
    n_common = len(reviews_1)
    if n_common == 0:
        similarity = 0
    else:
        rho = pearsonr(reviews_1, reviews_2)[0]
        # shrink the similarity toward 0 when there are few common reviews
        similarity = (n_common * rho) / (n_common + reg)
    return similarity, n_common


# In[49]:

def beer_similarity(standardized, beer_1, beer_2, simfunc=pearson_sim,
                    **simfunc_kwargs):
    """
    Compute the similarity between two beers.
    """
    # get the common subset of reviewers...
    reviewers_1 = standardized.loc[beer_1].index
    reviewers_2 = standardized.loc[beer_2].index
    common_idx = reviewers_1.intersection(reviewers_2)

    # slice the MultiIndex, unstack to be N x 2
    common_reviews = standardized.loc[[beer_1, beer_2], common_idx].unstack('beer_id')

    # ... and compute the review similarity for that subset
    rho, n_common = simfunc(common_reviews[beer_1], common_reviews[beer_2],
                            **simfunc_kwargs)
    return rho, n_common


# In[50]:

beer_ids = s.index.levels[0]
len(beer_ids)


# In[51]:

beer_similarity(standardized, beer_ids[0], beer_ids[10])


# In[509]:

get_ipython().run_cell_magic('time', '', "sims = []\n\nfor i, beer_1 in enumerate(beer_ids):\n    for j, beer_2 in enumerate(beer_ids):\n        if j >= i:\n            continue\n        sim, n_common = beer_similarity(standardized, beer_1, beer_2)\n        sims.append((beer_1, beer_2, sim, n_common))\n        print((i, j), end='\\r')\n\nsim = pd.DataFrame(sims, columns=['beer_1', 'beer_2', 'score', 'n_common'])\nsim.to_csv('beer_subset_similarity.csv', index=False)\n")


# In[53]:

sim = pd.read_csv('beer_subset_similarity.csv.gz')
sim.head()


# In[61]:

sns.kdeplot(sim[sim.score != 0].dropna().score)


# In[62]:

sim = sim.set_index(['beer_1', 'beer_2']).score


# In[63]:

sim.loc[21690].nlargest(5)
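# One caveat when querying the similarity Series: the loop above stored each pair only once (it skips `j >= i`), so a given beer can appear in either level of the index, and `sim.loc[21690]` only finds the pairs where it is `beer_1`. Here's a sketch of a lookup helper that checks both levels; the name `most_similar` is mine, not part of the original notebook, and it assumes the id appears in both levels:

# In[ ]:

def most_similar(sim, beer_id, n=5):
    """Top-`n` most similar beers to `beer_id`, whichever side of the pair it's on."""
    as_first = sim.xs(beer_id, level='beer_1')   # indexed by beer_2
    as_second = sim.xs(beer_id, level='beer_2')  # indexed by beer_1
    return pd.concat([as_first, as_second]).nlargest(n)

most_similar(sim, 21690)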