#!/usr/bin/env python
# coding: utf-8

# In[52]:

import re
import glob

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

get_ipython().run_line_magic('matplotlib', 'inline')
pd.options.display.max_rows = 10
plt.rcParams['figure.figsize'] = (12, 6)


# We'll use the same dataset of beer reviews.

# In[2]:

df = pd.read_csv('data/beer_subset.csv.gz', parse_dates=['time'],
                 compression='gzip')
review_cols = ['review_appearance', 'review_aroma', 'review_overall',
               'review_palate', 'review_taste']
df.head()


# In[3]:

df.beer_id.nunique()


# In[4]:

df.profile_name.nunique()


# In[5]:

df.beer_id.value_counts().plot(kind='hist', bins=10, color='k', log=True,
                               title='Log reviews per beer');


# In[6]:

ax = df.brewer_id.value_counts().plot(kind='hist', bins=15, color='k',
                                      log=True, title='Log reviews per brewer');


# In[7]:

df.profile_name.value_counts().plot(kind='hist', bins=15, color='k', log=True,
                                    title='Log reviews per person');


# In[8]:

df.review_overall.value_counts().sort_index().plot(kind='bar', width=.8, rot=0);


# In[9]:

fig, ax = plt.subplots(figsize=(5, 10))
sns.countplot(hue='kind', y='stars',
              data=(df[review_cols]
                    .stack()
                    .reset_index(level=1)
                    .rename(columns={'level_1': 'kind', 0: 'stars'})),
              ax=ax, order=np.arange(0, 5.5, .5));


# # Groupby

# Groupby is a fundamental operation in pandas and data analysis.
#
# The components of a groupby operation are to
#
# 1. Split a table into groups
# 2. Apply a function to each group
# 3. Combine the results
#
# In pandas the first step looks like
#
# ```python
# df.groupby( grouper )
# ```
#
# `grouper` can be many things
#
# - Series (or string indicating a column in `df`)
# - function (to be applied on the index)
# - dict : groups by *values*
# - `levels=[]`, names of levels in a MultiIndex

# In[10]:

gr = df.groupby('beer_style')
gr


# We haven't really done anything yet, just some book-keeping to figure out which **keys** go with which rows. Keys are the things we've grouped by (each `beer_style` in this case).
#
# The last two steps, apply and combine, are just:

# In[11]:

gr.agg('mean')


# This says apply the `mean` function to each column. Non-numeric (nuisance) columns are excluded; note that recent versions of pandas raise a `TypeError` instead, so you may need to select the numeric columns first. We can also select a subset of columns to perform the aggregation on.

# In[12]:

gr[review_cols].agg('mean')


# `.` attribute lookup works as well.

# In[13]:

gr.abv.agg('mean')


# Certain operations are attached directly to the `GroupBy` object, letting you bypass the `.agg` part.

# In[14]:

gr.abv.mean()


# Exercise: Find the `beer_style`s with the greatest variance in `abv`.
#
# - hint: `.std` calculates the standard deviation, and is available on `GroupBy` objects like `gr.abv`.
# - hint: use `.sort_values` to sort a Series

# In[87]:

# your code goes here


# In[89]:

get_ipython().run_line_magic('load', '-r 15:17 solutions_groupby.py')


# Now we'll run the gamut on a bunch of grouper / apply combinations.
# Keep sight of the target though: split, apply, combine.
#
# Single grouper, multiple aggregations:

# Multiple aggregations on one column

# In[15]:

gr['review_aroma'].agg([np.mean, np.std, 'count']).head()


# Single aggregation on multiple columns

# In[16]:

gr[review_cols].mean()


# Multiple aggregations on multiple columns

# In[17]:

gr[review_cols].agg(['mean', 'count', 'std'])


# Hierarchical indexes in the columns can be awkward to work with, so I'll usually
# move a level to the index with `.stack`.

# In[18]:

gr[review_cols].agg(['mean', 'count', 'std']).stack(level=0)


# You can group by **levels** of a MultiIndex.
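# Before doing that with the real aggregates, here's a minimal sketch on a throwaway frame (the toy level names `letter` / `number` are made up for illustration):

# In[ ]:

toy = pd.DataFrame({'x': range(4)},
                   index=pd.MultiIndex.from_product([['a', 'b'], [1, 2]],
                                                    names=['letter', 'number']))
# collapses the 'number' level: a -> 0 + 1, b -> 2 + 3
toy.groupby(level='letter').sum()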
# Now with the real aggregates:

# In[19]:

multi = gr[review_cols].agg(['mean', 'count', 'std']).stack(level=0)
multi.head()


# In[20]:

multi.groupby(level='beer_style')['mean'].agg(['min', 'max'])


# Group by **multiple** columns

# In[21]:

df.groupby(['brewer_id', 'beer_style']).review_overall.mean()


# In[22]:

df.groupby(['brewer_id', 'beer_style'])[review_cols].mean()


# ### Exercise: Plot the relationship between review length (the `text` column) and average `review_overall`.
#
# Hint: Break the problem into pieces:
#
# - Find the **len**gth of each review (remember the `df.text.str` namespace?)
# - Group by that Series of review lengths
# - I used `style='k.'` in the plot

# In[93]:

# Your code goes here


# In[162]:

get_ipython().run_line_magic('load', '-r 1:5 solutions_groupby.py')


# Bonus exercise:
#
# - Try grouping by the number of words.
# - Try grouping by the number of sentences.
#
# Remember that `str.count` accepts a regular expression.
#
# Don't worry too much about these, especially if you don't remember the syntax
# for regular expressions (I never can). Just jump to the next exercise.

# In[ ]:



# In[99]:

get_ipython().run_line_magic('load', '-r 18:20 solutions_groupby.py')


# In[ ]:



# In[101]:

get_ipython().run_line_magic('load', '-r 21:26 solutions_groupby.py')


# In[40]:

# Your code goes here


# ### Exercise: Which **brewer** (`brewer_id`) has the largest gap between the min and max average `review_overall` across their beers?
#
# Hint: You'll need to do this in two steps.
#
# 1. Find the average `review_overall` by brewer and beer name.
# 2. Find the difference between the max and min by brewer (remember `.groupby(level=)`)

# In[43]:

# Your code goes here. You've got this!


# In[105]:

get_ipython().run_line_magic('load', '-r 6:13 solutions_groupby.py')


# In[106]:

# Show for those with counts > 20ish


# Create our own "kind" of beer, which aggregates `style`.

# In[23]:

style = df.beer_style.str.lower()
style.head()


# In[24]:

kinds = ['ipa', 'apa', 'amber ale', 'rye', 'scotch', 'stout', 'barleywine',
         'porter', 'brown ale', 'lager', 'pilsner', 'tripel', 'bitter',
         'farmhouse', 'malt liquor', 'rice']


# In[25]:

expr = '|'.join(['(?P<{name}>{pat})'.format(pat=kind, name=kind.replace(' ', '_'))
                 for kind in kinds])
expr


# In[26]:

beer_kind = (style.replace({'india pale ale': 'ipa',
                            'american pale ale': 'apa'})
                  .str.extract(expr).fillna('').sum(axis=1)
                  .str.lower().replace('', 'other'))
beer_kind.head()


# In[27]:

df.groupby(['brewer_id', beer_kind]).review_overall.mean()


# In[28]:

df.groupby(['brewer_id', beer_kind]).beer_id.nunique().unstack(1).fillna(0)


# ### Exercise: Which brewers have the most different `kinds` of beer?
#
# Hint: we used `df.profile_name.nunique()` to find the number of different profile names.
# What are we grouping, and what is our grouper?

# In[ ]:



# In[33]:

# %load -r 27:29 solutions_groupby.py


# ### Exercise: Which kinds of beer have the most brewers?

# In[ ]:



# In[121]:

get_ipython().run_line_magic('load', '-r 30:32 solutions_groupby.py')


# We've seen a lot of permutations among number of groupers, number of columns to aggregate, and number of aggregators.
# In fact, `.agg`, which returns one row per group, is just one way to combine the results. The three ways, contrasted in the toy sketch below, are
#
# - `agg`: one row per group
# - `transform`: identically shaped output as input
# - `apply`: anything goes
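# To make the shape contrast concrete, here's a minimal sketch on a throwaway frame (the toy names `key` / `val` are made up for illustration):

# In[ ]:

tiny = pd.DataFrame({'key': ['a', 'a', 'b'], 'val': [1.0, 2.0, 4.0]})
g = tiny.groupby('key')['val']

g.agg('mean')        # one row per group: a -> 1.5, b -> 4.0
g.transform('mean')  # same shape as the input: [1.5, 1.5, 4.0]
g.apply(lambda v: v - v.min())  # anything goes; here it happens to keep the shape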
# # Transform

# The combined Series / DataFrame is the same shape as the input. For example, say you want to standardize the reviews by subtracting the mean.

# In[29]:

def de_mean(reviews):
    """Subtract the mean of `reviews` from each review."""
    s = reviews - reviews.mean()
    return s


# In[30]:

de_mean(df.review_overall)


# We can do this at the *person* level with `groupby` and `transform`.

# In[31]:

df.groupby('profile_name')[review_cols].transform(de_mean)


# # Apply

# So there's `gr.agg`, `gr.transform`, and finally `gr.apply`. We're going to skip apply for now. I have an example in a later notebook.

# # Resample

# Resample is a special kind of groupby operation for when you have a `DatetimeIndex`.

# In[32]:

review_times = df.time.value_counts().sort_index()
review_times


# In[33]:

review_times.index


# The number of reviews within a given second isn't that interesting.

# In[34]:

review_times.plot()


# Right now the frequency is way too high to be meaningful. `resample` lets you adjust the frequency.

# In[35]:

review_times.resample('D').mean().plot()


# We've essentially grouped by day here (but the syntax of `.resample('D')` is much nicer than what we'd have to do with the `.groupby(grouper)` spelling). `resample` returns a deferred, groupby-like object; chain an aggregation such as `.mean()` to take the average of all the values that fall on that day. You can also sum.

# In[36]:

review_times.resample('D').sum().plot()


# The `freq` you pass in to `resample` is pretty flexible.

# In[37]:

review_times.resample('5D').sum()


# In[38]:

c = review_times.resample('3H').sum()
ax = c.plot(alpha=.5)
c.rolling(window=8).mean().plot(ax=ax)


# Note that `resample` bins the points into fixed intervals while `rolling` slides a window over the existing points; see the toy sketch after the exercise below.

# ### Exercise: Plot the number of distinct brewers reviewed per day
#
# - Hint: The aggregation you chain onto `.resample(...)` is *really* flexible. Try the first thing that comes to mind. What function do you want to apply to get the number of unique values?
# - Hint2: (Sorry this is harder than I thought). `resample` needs a `DatetimeIndex`. We have datetimes in `time`. If only there was a way to take a column and **set** it as the **index**...

# In[ ]:

get_ipython().run_line_magic('load', '-r 33:36 solutions_groupby.py')
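# Here's that `resample` vs. `rolling` contrast on a minimal synthetic series (the dates and values are made up for illustration):

# In[ ]:

idx = pd.date_range('2024-01-01', periods=6, freq='12H')
toy_ts = pd.Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], index=idx)

toy_ts.resample('D').sum()       # 3 rows, one per calendar day: 3, 7, 11
toy_ts.rolling(window=2).mean()  # 6 rows: NaN, then the mean of each adjacent pair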
# # Aside: Beer Recommender
#
# See [Harvard CS109](https://github.com/cs109/content) for a more complete example (with chocolate instead of beer).
#
# One place where transform comes in handy is as a preprocessing step for any kind of recommender. In some sense, the raw score I assign a beer is less important than that score relative to *my* mean.

# In[40]:

deduped = df[['beer_id', 'profile_name', 'review_overall']].drop_duplicates()
deduped.head()


# In[41]:

user_counts = deduped.profile_name.value_counts()
top_users = user_counts[user_counts > user_counts.quantile(.75)].index


# In[42]:

beer_counts = deduped.beer_id.value_counts()
top_beers = beer_counts[beer_counts > beer_counts.quantile(.9)].index


# In[43]:

top = deduped.query('beer_id in @top_beers and profile_name in @top_users')
user_means = top.groupby('profile_name').review_overall.mean()
beer_means = top.groupby('beer_id').review_overall.mean()


# In[44]:

fig, axes = plt.subplots(figsize=(16, 4), ncols=2, sharey=True, sharex=True)
sns.histplot(user_means, ax=axes[0], color='k', stat='density', alpha=1)
sns.histplot(beer_means, ax=axes[1], color='k', stat='density', alpha=1)
axes[0].set_title("User Averages")
axes[1].set_title("Beer Averages")


# In[45]:

s = top.set_index(['beer_id', 'profile_name']).review_overall.sort_index()
s.head()


# ### `de_mean` the scores in `s`

# In[46]:

standardized = s.groupby(level='profile_name').transform(de_mean)
standardized.head()


# In[47]:

from scipy.stats import pearsonr


# In[48]:

def pearson_sim(reviews_1, reviews_2, reg=2):
    """
    (Regularized) Pearson correlation coefficient between sets of
    reviews for two beers, made by a common subset of reviewers.

    `reviews_1` and `reviews_2` should have the same index: the
    `profile_name`s of the people who reviewed both beers.
    """
    n_common = len(reviews_1)
    if n_common == 0:
        similarity = 0
    else:
        rho = pearsonr(reviews_1, reviews_2)[0]
        # shrink the similarity toward 0 when there are few common reviews
        similarity = (n_common * rho) / (n_common + reg)
    return similarity, n_common


# In[49]:

def beer_similarity(standardized, beer_1, beer_2, simfunc=pearson_sim,
                    **simfunc_kwargs):
    """
    Compute the similarity between two beers.
    """
    # get the common subset of reviewers...
    reviewers_1 = standardized.loc[beer_1].index
    reviewers_2 = standardized.loc[beer_2].index
    common_idx = reviewers_1.intersection(reviewers_2)

    # slice the MultiIndex, unstack to be N x 2
    common_reviews = standardized.loc[[beer_1, beer_2], common_idx].unstack('beer_id')

    # ... and compute the review similarity for that subset
    rho, n_common = simfunc(common_reviews[beer_1], common_reviews[beer_2],
                            **simfunc_kwargs)
    return rho, n_common


# In[50]:

beer_ids = s.index.levels[0]
len(beer_ids)


# In[51]:

beer_similarity(standardized, beer_ids[0], beer_ids[10])


# In[509]:

get_ipython().run_cell_magic('time', '', "sims = []\n\nfor i, beer_1 in enumerate(beer_ids):\n    for j, beer_2 in enumerate(beer_ids):\n        if j >= i:\n            continue\n        sim, n_common = beer_similarity(standardized, beer_1, beer_2)\n        sims.append((beer_1, beer_2, sim, n_common))\n        print((i, j), end='\\r')\n\nsim = pd.DataFrame(sims, columns=['beer_1', 'beer_2', 'score', 'n_common'])\nsim.to_csv('beer_subset_similarity.csv', index=False)\n")


# In[53]:

sim = pd.read_csv('beer_subset_similarity.csv.gz')
sim.head()


# In[61]:

sns.kdeplot(sim[sim.score != 0].dropna().score)


# In[62]:

sim = sim.set_index(['beer_1', 'beer_2']).score


# In[63]:

sim.loc[21690].nlargest(5)
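# One caveat when querying the similarity Series: the loop above stored each pair only once (it skips `j >= i`), so a given beer can appear in either level of the index, and `sim.loc[21690]` only finds the pairs where it is `beer_1`. Here's a sketch of a lookup helper that checks both levels; the name `most_similar` is mine, not part of the original notebook, and it assumes the id appears in both levels:

# In[ ]:

def most_similar(sim, beer_id, n=5):
    """Top-`n` most similar beers to `beer_id`, whichever side of the pair it's on."""
    as_first = sim.xs(beer_id, level='beer_1')   # indexed by beer_2
    as_second = sim.xs(beer_id, level='beer_2')  # indexed by beer_1
    return pd.concat([as_first, as_second]).nlargest(n)

most_similar(sim, 21690)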