#!/usr/bin/env python # coding: utf-8 # # Grouping # In[1]: import numpy as np import pandas as pd import seaborn as sns import matplotlib.pyplot as plt get_ipython().run_line_magic('matplotlib', 'inline') pd.options.display.max_rows = 6 pd.options.display.max_columns = 6 pd.options.display.width = 80 # We'll use the same dataset of beer reviews. # In[2]: df = pd.read_hdf('data/beer.hdf') # # Groupby # Groupby is a fundamental operation to pandas and data analysis. # # The components of a groupby operation are to # # 1. Split a table into groups # 2. Apply a function to each groups # 3. Combine the results # # http://pandas.pydata.org/pandas-docs/stable/groupby.html # In pandas the first step looks like # ``df.groupby( grouper )`` # `grouper` can be many things # # - ``Series`` (or string indicating a column in a ``DataFrame``) # - function (to be applied on the index) # - dict : groups by *values* # - `levels=[]`, names of levels in a MultiIndex # In[3]: gr = df.groupby('beer_style') gr # Haven't really done anything yet. Just some book-keeping to figure out which **keys** go with which rows. Keys are the things we've grouped by (each `beer_style` in this case). # In[4]: gr.ngroups # In[5]: list(gr.groups)[0:5] # In[6]: cols = ['beer_style'] + df.columns.difference(['beer_style']).tolist() cols # In[7]: gr.get_group('Tripel')[cols] # In[8]: df.loc[df.beer_style=='Tripel',cols] # The last two steps, apply and combine: # In[9]: gr.agg('mean') # This says apply the `mean` function to each column. Non-numeric columns (nusiance columns) are excluded. We can also select a subset of columns to perform the aggregation on. # In[10]: review_columns = ['abv','review_overall','review_appearance', 'review_palate','review_taste'] gr[review_columns].agg('mean') # `.` attribute lookup works as well. # In[11]: gr.abv.agg('mean') # Find the `beer_style` with the greatest variance in `abv`. # In[12]: (df .groupby('beer_style') .abv .std() .sort_values(ascending=False) ) # Multiple Aggregations on one column # In[13]: gr['review_aroma'].agg([np.mean, np.std, 'count']) # Single Aggregation on multiple columns # In[14]: gr[review_columns].mean() # Multiple aggregations on multiple columns # In[15]: result = gr[review_columns].agg(['mean', 'count', 'std']) result.columns.names=['characteristic','measure'] result # Hierarchical Indexes in the columns can be awkward to work with, so I'll usually # move a level to the Index with `.stack`. # http://pandas.pydata.org/pandas-docs/stable/reshaping.html#reshaping-by-stacking-and-unstacking # In[16]: result # In[17]: multi = result.stack(level='characteristic') multi # In[18]: result.stack(level='measure') # In[19]: # stack-unstack are inverses (result .stack(level='measure') .unstack(level='measure') ) # You can group by **levels** of a MultiIndex. # In[20]: (result.stack(level='characteristic') .groupby(level='beer_style') ['mean'] .agg(['min', 'max' ]) ) # Group by **multiple** columns # In[21]: df.groupby(['brewer_id', 'beer_style'])[review_columns].mean() # ### Exercise: Plot the relationship between review length (the `text` column) and average `review_overall`. # # - Find the **len**gth of each reivew (remember the `df.text.str` namespace?) # - Group by that Series of review lengths # - Using the '.k' plotting style # In[22]: (df.groupby(df.text.str.len()) .review_overall .mean() .plot(style='.k', figsize=(12,8)) ) # ## What are we doing # In[23]: df.text.str.len() # In[24]: df.groupby(df.text.str.len()).ngroups # We've seen a lot of permutations among number of groupers, number of columns to aggregate, and number of aggregators. # In fact, the `.agg`, which returns one row per group, is just one kind of way to combine the results. The three ways are # # - `agg`: one row per results # - `transform`: identicaly shaped output as input # - `apply`: anything goes # # # Transform # Combined Series / DataFrame is the same shape as the input. For example, say you want to standardize the reviews by subtracting the mean. # In[25]: def de_mean(reviews): s = reviews - reviews.mean() return s # In[26]: de_mean(df.review_overall) # In[27]: df.groupby('profile_name').transform(de_mean) # Oftentimes is better to work with the groupby object directly # In[28]: (df-df.groupby('profile_name').transform('mean') ).select_dtypes(exclude=['object']) # In[29]: get_ipython().run_line_magic('timeit', "df.groupby('profile_name').transform(de_mean)") # In[30]: get_ipython().run_line_magic('timeit', "(df-df.groupby('profile_name').transform('mean')).select_dtypes(exclude=['object'])") # In[31]: df.groupby('profile_name').ngroups