#!/usr/bin/env python
# coding: utf-8

# # Grouping

# In[1]:


import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')

pd.options.display.max_rows = 6
pd.options.display.max_columns = 6
pd.options.display.width = 80


# We'll use the same dataset of beer reviews.

# In[2]:


df = pd.read_hdf('data/beer.hdf')


# # Groupby

# Groupby is a fundamental operation to pandas and data analysis.
# 
# The components of a groupby operation are to
# 
# 1. Split a table into groups
# 2. Apply a function to each groups
# 3. Combine the results
# 
# http://pandas.pydata.org/pandas-docs/stable/groupby.html

# In pandas the first step looks like

# ``df.groupby( grouper )``

# `grouper` can be many things
# 
# - ``Series`` (or string indicating a column in a ``DataFrame``)
# - function (to be applied on the index)
# - dict : groups by *values*
# - `levels=[]`, names of levels in a MultiIndex

# In[3]:


gr = df.groupby('beer_style')
gr


# Haven't really done anything yet. Just some book-keeping to figure out which **keys** go with which rows. Keys are the things we've grouped by (each `beer_style` in this case).

# In[4]:


gr.ngroups


# In[5]:


list(gr.groups)[0:5]


# In[6]:


cols = ['beer_style'] + df.columns.difference(['beer_style']).tolist()
cols


# In[7]:


gr.get_group('Tripel')[cols]


# In[8]:


df.loc[df.beer_style=='Tripel',cols]


# The last two steps, apply and combine:

# In[9]:


gr.agg('mean')


# This says apply the `mean` function to each column. Non-numeric columns (nusiance columns) are excluded. We can also select a subset of columns to perform the aggregation on.

# In[10]:


review_columns = ['abv','review_overall','review_appearance',
                  'review_palate','review_taste']
gr[review_columns].agg('mean')


# `.` attribute lookup works as well.

# In[11]:


gr.abv.agg('mean')


# Find the `beer_style` with the greatest variance in `abv`.

# In[12]:


(df
   .groupby('beer_style')
   .abv
   .std()
   .sort_values(ascending=False)
 )


# Multiple Aggregations on one column

# In[13]:


gr['review_aroma'].agg([np.mean, np.std, 'count'])


# Single Aggregation on multiple columns

# In[14]:


gr[review_columns].mean()


# Multiple aggregations on multiple columns

# In[15]:


result = gr[review_columns].agg(['mean', 'count', 'std'])
result.columns.names=['characteristic','measure']
result


# Hierarchical Indexes in the columns can be awkward to work with, so I'll usually
# move a level to the Index with `.stack`.

# http://pandas.pydata.org/pandas-docs/stable/reshaping.html#reshaping-by-stacking-and-unstacking

# In[16]:


result


# In[17]:


multi = result.stack(level='characteristic')
multi


# In[18]:


result.stack(level='measure')


# In[19]:


# stack-unstack are inverses
(result
      .stack(level='measure')
      .unstack(level='measure')
 )


# You can group by **levels** of a MultiIndex.

# In[20]:


(result.stack(level='characteristic')
       .groupby(level='beer_style')
       ['mean']
       .agg(['min', 'max' ])
 )


# Group by **multiple** columns

# In[21]:


df.groupby(['brewer_id', 'beer_style'])[review_columns].mean()


# ### Exercise: Plot the relationship between review length (the `text` column) and average `review_overall`.
# 
# - Find the **len**gth of each reivew (remember the `df.text.str` namespace?)
# - Group by that Series of review lengths
# - Using the '.k' plotting style

# In[22]:


(df.groupby(df.text.str.len())
   .review_overall
   .mean()
   .plot(style='.k', figsize=(12,8))
 )


# ## What are we doing

# In[23]:


df.text.str.len()


# In[24]:


df.groupby(df.text.str.len()).ngroups


# We've seen a lot of permutations among number of groupers, number of columns to aggregate, and number of aggregators.
# In fact, the `.agg`, which returns one row per group, is just one kind of way to combine the results. The three ways are
# 
# - `agg`: one row per results
# - `transform`: identicaly shaped output as input
# - `apply`: anything goes
# 

# # Transform

# Combined Series / DataFrame is the same shape as the input. For example, say you want to standardize the reviews by subtracting the mean.

# In[25]:


def de_mean(reviews):
    s = reviews - reviews.mean()
    return s


# In[26]:


de_mean(df.review_overall)


# In[27]:


df.groupby('profile_name').transform(de_mean)


# Oftentimes is better to work with the groupby object directly

# In[28]:


(df-df.groupby('profile_name').transform('mean')
).select_dtypes(exclude=['object'])


# In[29]:


get_ipython().run_line_magic('timeit', "df.groupby('profile_name').transform(de_mean)")


# In[30]:


get_ipython().run_line_magic('timeit', "(df-df.groupby('profile_name').transform('mean')).select_dtypes(exclude=['object'])")


# In[31]:


df.groupby('profile_name').ngroups