#!/usr/bin/env python
# coding: utf-8

# # Split apply and combine

# In[1]:


import addutils.toc ; addutils.toc.js(ipy_notebook=True)


# In this tutorial we are going to see advanced data management with `pandas` data structures.

# In[2]:


import numpy as np
import pandas as pd
from IPython.display import (display, HTML)
from addutils import side_by_side2
from addutils import css_notebook
css_notebook()


# Categorizing a data set and applying a function to each group, is often a critical component of a data analysis workflow. After
# loading, merging, and preparing a data set, a familiar task is to compute group statistics or possibly pivot tables for reporting or visualization purposes. *pandas* provides a flexible and high-performance groupby facility.
# 
# By *'group by'* we refer to a process involving one or more of the following steps:
# 
# * **Splitting** the data into groups based on some criteria
# * **Applying** a function to each group independently
# * **Combining** the results into a data structure
# 
# Suppose we are managing a website and we have a log-file with number of *wiews* and *likes* coming from different cities:

# In[3]:


d1 = pd.read_csv('temp/p07_d1.txt', index_col=0)
d1 = d1.reindex(columns=['State','City','Views','Likes'])
display(d1)


# ## 1 Groupby

# `groupby` groups DataFrame or Series by a parameter on a given axis:

# In[4]:


g1 = d1.groupby('State')
print (g1.groups)


# The variable `groups` of a `GroupBy` object is a dictionary containing indexes of each group member.

# In[5]:


for name,group in g1:
    print (name)
    print (group)
    print ('Total Views: %d - Total Likes: %d\n\n' %(group['Views'].sum(),
                                                    group['Likes'].sum()))


# It is also possibile to apply a `groupby` over a hierarchical index `DataFrame`

# In[6]:


d2 = d1.set_index(['State','City'])
display(d2)


# ## 2 Aggregate

# Once the GroupBy object has been created, several methods are available to perform a computation on the grouped data. Here we use `aggregate`. The result of the aggregation will have the group names as the new index along the grouped axis. In the case of multiple keys, the result is a MultiIndex by default, though this can be changed by using the `as_index option`:

# In[7]:


g2 = d2.groupby(level=[0])
print (g2.groups)
g2.aggregate(np.sum)


# In[8]:


g3 = d2.groupby(level=[0,1])
g4 = d2.groupby(level=[0,1], as_index=False)
HTML(side_by_side2(g3.aggregate(np.sum), g4.aggregate(np.sum)))


# `aggregate` allows to pass any function that returns a scalar value from a vector and can handle list of functions:

# In[9]:


d1[['State', 'Views']].groupby('State').aggregate([np.sum, np.mean, np.std])


# ## 3 Apply

# `apply` will extend the previous concepts to any Python function:

# In[10]:


pd.set_option('display.float_format', lambda x: '{:.1f}'.format(x))

def add_field(group):
    group['Tot.Views'] = group['Views'].sum()
    group['Likes[%]'] = 100.0*group['Likes']/group['Likes'].sum()
    return group

HTML(side_by_side2(d1, d1.groupby('State').apply(add_field)))


# ## 4 A pratical example: Normalize by year

# In[11]:


idx = pd.date_range('1999/5/28', periods=1500, freq='1B')
s1 = pd.Series(np.random.normal(5.5, 2, 1500), idx)
s1 = s1.rolling(10,10).mean().dropna()
#s1 = s1.rolling_mean(s1, 10, 10).dropna()


# Here we define a grouping key for months and one for years:

# In[12]:


def my_groupby_key_year(timestamp):
    return timestamp.year

def my_groupby_key_month(timestamp):
    return timestamp.month

def my_normalization(group):
    return (group-group.mean())/group.std()


# Here we normalize the data on a monthly base and check mean and std on an yearly base:

# In[13]:


t1 = s1.groupby(my_groupby_key_month).apply(my_normalization)

HTML(side_by_side2(s1.head(8),
                  t1.head(8),
                  t1.groupby(my_groupby_key_year).aggregate([np.mean, np.std])))


# ## 5 A practical example: Group and standardize by dimension

# In[14]:


d3 = pd.read_csv('example_data/company.csv', index_col=0)
display(d3.head())


# Since the column "Value" is made by strings with a space separator we need a simpel intermediate step to convert values from string to floats:

# In[15]:


d3['Value'] = d3['Value'].apply(lambda x: float(x.replace(' ', '')))
d3.head()


# In[16]:


d3.groupby('Dimension').mean()


# ---
# 
# Visit [www.add-for.com](<http://www.add-for.com/IT>) for more tutorials and updates.
# 
# This work is licensed under a <a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/">Creative Commons Attribution-ShareAlike 4.0 International License</a>.