#!/usr/bin/env python # coding: utf-8 # # Split apply and combine # In[1]: import addutils.toc ; addutils.toc.js(ipy_notebook=True) # In this tutorial we are going to see advanced data management with `pandas` data structures. # In[2]: import numpy as np import pandas as pd from IPython.display import (display, HTML) from addutils import side_by_side2 from addutils import css_notebook css_notebook() # Categorizing a data set and applying a function to each group, is often a critical component of a data analysis workflow. After # loading, merging, and preparing a data set, a familiar task is to compute group statistics or possibly pivot tables for reporting or visualization purposes. *pandas* provides a flexible and high-performance groupby facility. # # By *'group by'* we refer to a process involving one or more of the following steps: # # * **Splitting** the data into groups based on some criteria # * **Applying** a function to each group independently # * **Combining** the results into a data structure # # Suppose we are managing a website and we have a log-file with number of *wiews* and *likes* coming from different cities: # In[3]: d1 = pd.read_csv('temp/p07_d1.txt', index_col=0) d1 = d1.reindex(columns=['State','City','Views','Likes']) display(d1) # ## 1 Groupby # `groupby` groups DataFrame or Series by a parameter on a given axis: # In[4]: g1 = d1.groupby('State') print (g1.groups) # The variable `groups` of a `GroupBy` object is a dictionary containing indexes of each group member. # In[5]: for name,group in g1: print (name) print (group) print ('Total Views: %d - Total Likes: %d\n\n' %(group['Views'].sum(), group['Likes'].sum())) # It is also possibile to apply a `groupby` over a hierarchical index `DataFrame` # In[6]: d2 = d1.set_index(['State','City']) display(d2) # ## 2 Aggregate # Once the GroupBy object has been created, several methods are available to perform a computation on the grouped data. Here we use `aggregate`. The result of the aggregation will have the group names as the new index along the grouped axis. In the case of multiple keys, the result is a MultiIndex by default, though this can be changed by using the `as_index option`: # In[7]: g2 = d2.groupby(level=[0]) print (g2.groups) g2.aggregate(np.sum) # In[8]: g3 = d2.groupby(level=[0,1]) g4 = d2.groupby(level=[0,1], as_index=False) HTML(side_by_side2(g3.aggregate(np.sum), g4.aggregate(np.sum))) # `aggregate` allows to pass any function that returns a scalar value from a vector and can handle list of functions: # In[9]: d1[['State', 'Views']].groupby('State').aggregate([np.sum, np.mean, np.std]) # ## 3 Apply # `apply` will extend the previous concepts to any Python function: # In[10]: pd.set_option('display.float_format', lambda x: '{:.1f}'.format(x)) def add_field(group): group['Tot.Views'] = group['Views'].sum() group['Likes[%]'] = 100.0*group['Likes']/group['Likes'].sum() return group HTML(side_by_side2(d1, d1.groupby('State').apply(add_field))) # ## 4 A pratical example: Normalize by year # In[11]: idx = pd.date_range('1999/5/28', periods=1500, freq='1B') s1 = pd.Series(np.random.normal(5.5, 2, 1500), idx) s1 = s1.rolling(10,10).mean().dropna() #s1 = s1.rolling_mean(s1, 10, 10).dropna() # Here we define a grouping key for months and one for years: # In[12]: def my_groupby_key_year(timestamp): return timestamp.year def my_groupby_key_month(timestamp): return timestamp.month def my_normalization(group): return (group-group.mean())/group.std() # Here we normalize the data on a monthly base and check mean and std on an yearly base: # In[13]: t1 = s1.groupby(my_groupby_key_month).apply(my_normalization) HTML(side_by_side2(s1.head(8), t1.head(8), t1.groupby(my_groupby_key_year).aggregate([np.mean, np.std]))) # ## 5 A practical example: Group and standardize by dimension # In[14]: d3 = pd.read_csv('example_data/company.csv', index_col=0) display(d3.head()) # Since the column "Value" is made by strings with a space separator we need a simpel intermediate step to convert values from string to floats: # In[15]: d3['Value'] = d3['Value'].apply(lambda x: float(x.replace(' ', ''))) d3.head() # In[16]: d3.groupby('Dimension').mean() # --- # # Visit [www.add-for.com]() for more tutorials and updates. # # This work is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License.