#!/usr/bin/env python
# coding: utf-8
#
Reshaping data
#
# > *© 2016, Joris Van den Bossche and Stijn Van Hoey (, ). Licensed under [CC BY 4.0 Creative Commons](http://creativecommons.org/licenses/by/4.0/)*
#
# ---
# In[ ]:
get_ipython().run_line_magic('matplotlib', 'inline')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# # Pivoting data
# ## Cfr. excel
# People who know Excel, probably know the **Pivot** functionality:
# ![](img/pivot_excel.png)
# The data of the table:
# In[ ]:
excelample = pd.DataFrame({'Month': ["January", "January", "January", "January",
"February", "February", "February", "February",
"March", "March", "March", "March"],
'Category': ["Transportation", "Grocery", "Household", "Entertainment",
"Transportation", "Grocery", "Household", "Entertainment",
"Transportation", "Grocery", "Household", "Entertainment"],
'Amount': [74., 235., 175., 100., 115., 240., 225., 125., 90., 260., 200., 120.]})
# In[ ]:
excelample
# In[ ]:
excelample_pivot = excelample.pivot(index="Category", columns="Month", values="Amount")
excelample_pivot
# Interested in *Grand totals*?
# In[ ]:
# sum columns
excelample_pivot.sum(axis=1)
# In[ ]:
# sum rows
excelample_pivot.sum(axis=0)
# ## Pivot is just reordering your data
# Small subsample of the titanic dataset:
# In[ ]:
df = pd.DataFrame({'Fare': [7.25, 71.2833, 51.8625, 30.0708, 7.8542, 13.0],
'Pclass': [3, 1, 1, 2, 3, 2],
'Sex': ['male', 'female', 'male', 'female', 'female', 'male'],
'Survived': [0, 1, 0, 1, 0, 1]})
# In[ ]:
df
# In[ ]:
df.pivot(index='Pclass', columns='Sex', values='Fare')
# In[ ]:
df.pivot(index='Pclass', columns='Sex', values='Survived')
# So far, so good...
# Let's now use the full titanic dataset:
# In[ ]:
df = pd.read_csv("data/titanic.csv")
# In[ ]:
df.head()
# And try the same pivot (*no worries about the try-except, this is here just used to catch a loooong error*):
# In[ ]:
try:
df.pivot(index='Sex', columns='Pclass', values='Fare')
except Exception as e:
print("Exception!", e)
# This does not work, because we would end up with multiple values for one cell of the resulting frame, as the error says: `duplicated` values for the columns in the selection. As an example, consider the following rows of our three columns of interest:
# In[ ]:
df.loc[[1, 3], ["Sex", 'Pclass', 'Fare']]
# Since `pivot` is just restructuring data, where would both values of `Fare` for the same combination of `Sex` and `Pclass` need to go?
#
# Well, they need to be combined, according to an `aggregation` functionality, which is supported by the function`pivot_table`
#
#
#
NOTE:
#
#
# - **Pivot** is purely restructuring: a single value for each index/column combination is required.
#
#
# # Pivot tables - aggregating while pivoting
# In[ ]:
df = pd.read_csv("data/titanic.csv")
# In[ ]:
df.pivot_table(index='Sex', columns='Pclass', values='Fare')
#
#
#
REMEMBER:
#
#
# - By default, `pivot_table` takes the **mean** of all values that would end up into one cell. However, you can also specify other aggregation functions using the `aggfunc` keyword.
#
#
# In[ ]:
df.pivot_table(index='Sex', columns='Pclass',
values='Fare', aggfunc='max')
# In[ ]:
df.pivot_table(index='Sex', columns='Pclass',
values='Fare', aggfunc='count')
#
#
#
REMEMBER:
#
#
# - There is a shortcut function for a `pivot_table` with a `aggfunc=count` as aggregation: `crosstab`
#
#
# In[ ]:
pd.crosstab(index=df['Sex'], columns=df['Pclass'])
#
#
#
EXERCISE:
#
#
# - Make a pivot table with the survival rates (= number of persons survived / total number of persons) for Pclass vs Sex.
# - Plot the result as a bar plot.
#
#
# In[ ]:
# %load snippets/06 - Reshaping data20.py
# In[ ]:
# %load snippets/06 - Reshaping data21.py
#
#
#
EXERCISE:
#
#
# - Make a table of the median Fare payed by aged/underaged vs Sex.
#
#
# In[ ]:
# %load snippets/06 - Reshaping data22.py
# In[ ]:
# %load snippets/06 - Reshaping data23.py
# # Melt
# The `melt` function performs the inverse operation of a `pivot`. This can be used to make your frame longer, i.e. to make a *tidy* version of your data.
# In[ ]:
pivoted = df.pivot_table(index='Sex', columns='Pclass', values='Fare').reset_index()
pivoted.columns.name = None
# In[ ]:
pivoted
# Assume we have a DataFrame like the above. The observations (the average Fare people payed) are spread over different columns. In a tidy dataset, each observation is stored in one row. To obtain this, we can use the `melt` function:
# In[ ]:
pd.melt(pivoted)
# As you can see above, the `melt` function puts all column labels in one column, and all values in a second column.
#
# In this case, this is not fully what we want. We would like to keep the 'Sex' column separately:
# In[ ]:
pd.melt(pivoted, id_vars=['Sex']) #, var_name='Pclass', value_name='Fare')
# # Reshaping with `stack` and `unstack`
# The docs say:
#
# > Pivot a level of the (possibly hierarchical) column labels, returning a
# DataFrame (or Series in the case of an object with a single level of
# column labels) having a hierarchical index with a new inner-most level
# of row labels.
#
# Indeed...
#
#
# Before we speak about `hierarchical index`, first check it in practice on the following dummy example:
# In[ ]:
df = pd.DataFrame({'A':['one', 'one', 'two', 'two'],
'B':['a', 'b', 'a', 'b'],
'C':range(4)})
df
# To use `stack`/`unstack`, we need the values we want to shift from rows to columns or the other way around as the index:
# In[ ]:
df = df.set_index(['A', 'B']) # Indeed, you can combine two indices
df
# In[ ]:
result = df['C'].unstack()
result
# In[ ]:
df = result.stack().reset_index(name='C')
df
#
#
#
REMEMBER:
#
#
# - **stack**: make your data *longer* and *smaller*
# - **unstack**: make your data *shorter* and *wider*
#
#
# ## Mimick pivot table
# To better understand and reason about pivot tables, we can express this method as a combination of more basic steps. In short, the pivot is a convenient way of expressing the combination of a `groupby` and `stack/unstack`.
# In[ ]:
df = pd.read_csv("data/titanic.csv")
# In[ ]:
df.head()
# In[ ]:
df.pivot_table(index='Pclass', columns='Sex',
values='Survived', aggfunc='mean')
#
#
#
EXERCISE:
#
#
# - Get the same result as above based on a combination of `groupby` and `unstack`
# - First use `groupby` to calculate the survival ratio for all groups
# - Then, use `unstack` to reshape the output of the groupby operation
#
#
# In[ ]:
# %load snippets/06 - Reshaping data37.py
# ## Mimick melt
# Like the pivot table above, we can now also obtain the result of `melt` with stack/unstack.
#
# Let's use the same `pivoted` frame as above, and look at the final melt result:
# In[ ]:
pivoted = df.pivot_table(index='Sex', columns='Pclass', values='Fare').reset_index()
pivoted.columns.name = None
pivoted
# In[ ]:
pd.melt(pivoted, id_vars=['Sex'], var_name='Pclass', value_name='Fare')
#
#
#
EXERCISE:
#
#
# - Get the same result as above using `stack`/`unstack` (combined with `set_index` / `reset_index`)
# - Tip: set those columns as the index that you do not want to stack
#
#
# In[ ]:
# %load snippets/06 - Reshaping data40.py
# In[ ]:
# %load snippets/06 - Reshaping data41.py
# In[ ]:
# %load snippets/06 - Reshaping data42.py
# In[ ]:
# %load snippets/06 - Reshaping data43.py
# # Exercises: use the reshaping methods with the movie data
# These exercises are based on the [PyCon tutorial of Brandon Rhodes](https://github.com/brandon-rhodes/pycon-pandas-tutorial/) (so credit to him!) and the datasets he prepared for that. You can download these data from here: [`titles.csv`](https://drive.google.com/open?id=0B3G70MlBnCgKajNMa1pfSzN6Q3M) and [`cast.csv`](https://drive.google.com/open?id=0B3G70MlBnCgKal9UYTJSR2ZhSW8) and put them in the `/data` folder.
# In[ ]:
cast = pd.read_csv('data/cast.csv')
cast.head()
# In[ ]:
titles = pd.read_csv('data/titles.csv')
titles.head()
#
#
#
EXERCISE:
#
#
# - Plot the number of actor roles each year and the number of actress roles each year over the whole period of available movie data.
#
#
# In[ ]:
# %load snippets/06 - Reshaping data46.py
# In[ ]:
# %load snippets/06 - Reshaping data47.py
# In[ ]:
# %load snippets/06 - Reshaping data48.py
#
#
#
EXERCISE:
#
#
# - Plot the number of actor roles each year and the number of actress roles each year. Use kind='area' as plot type
#
#
# In[ ]:
# %load snippets/06 - Reshaping data49.py
#
#
#
EXERCISE:
#
#
# - Plot the fraction of roles that have been 'actor' roles each year over the whole period of available movie data.
#
#
# In[ ]:
# %load snippets/06 - Reshaping data50.py
#
#
#
EXERCISE:
#
#
# - Define a year as a "Superman year" when films of that year feature more Superman characters than Batman characters. How many years in film history have been Superman years?
#
#
# In[ ]:
# %load snippets/06 - Reshaping data51.py
# In[ ]:
# %load snippets/06 - Reshaping data52.py