#!/usr/bin/env python # coding: utf-8 # # Reshaping data with `stack` and `unstack` # In[ ]: get_ipython().run_line_magic('matplotlib', 'inline') import pandas as pd import numpy as np import matplotlib.pyplot as plt try: import seaborn except ImportError: pass pd.options.display.max_rows = 8 # ## Case study: air quality data of European monitoring stations (AirBase) # # Going further with the time series case study [test](05 - Time series data.ipynb) on the AirBase (The European Air quality dataBase) data: the actual data downloaded from the Airbase website did not look like a nice csv file (`data/airbase_data.csv`). # One of the actual downloaded raw data files of AirBase is included in the repo: # In[ ]: get_ipython().system('head -1 ./data/BETR8010000800100hour.1-1-1990.31-12-2012') # Just reading the tab-delimited data: # In[ ]: data = pd.read_csv("data/BETR8010000800100hour.1-1-1990.31-12-2012", sep='\t')#, header=None) # In[ ]: data.head() # The above data is clearly not ready to be used! Each row contains the 24 measurements for each hour of the day, and also contains a flag (0/1) indicating the quality of the data. #
# EXERCISE: Clean up this dataframe using more options of `read_csv` #
# # - specify that the values of -999 and -9999 should be regarded as NaN # - specify are own column names (http://stackoverflow.com/questions/6356041/python-intertwining-two-lists) # In[ ]: hours = ["{:02d}".format(i) for i in range(24)] # In[ ]: # %load snippets/06 - Reshaping data6.py # In[ ]: # %load snippets/06 - Reshaping data7.py # For now, we disregard the 'flag' columns # In[ ]: # %load snippets/06 - Reshaping data8.py # Now, we want to reshape it: our goal is to have the different hours as row indices, merged with the date into a datetime-index. # ## Overview: reshaping your data with `stack`, `unstack` and `pivot` # The docs say: # # > Pivot a level of the (possibly hierarchical) column labels, returning a # DataFrame (or Series in the case of an object with a single level of # column labels) having a hierarchical index with a new inner-most level # of row labels. # # # In[ ]: df = pd.DataFrame({'A':['one', 'one', 'two', 'two'], 'B':['a', 'b', 'a', 'b'], 'C':range(4)}) df # To use `stack`/`unstack`, we need the values we want to shift from rows to columns or the other way around as the index: # In[ ]: df = df.set_index(['A', 'B']) df # In[ ]: result = df['C'].unstack() result # In[ ]: df = result.stack().reset_index(name='C') df # `pivot` is similar to `unstack`, but let you specify column names: # In[ ]: df.pivot(index='A', columns='B', values='C') # `pivot_table` is similar as `pivot`, but can work with duplicate indices and let you specify an aggregation function: # In[ ]: df = pd.DataFrame({'A':['one', 'one', 'two', 'two', 'one', 'two'], 'B':['a', 'b', 'a', 'b', 'a', 'b'], 'C':range(6)}) df # In[ ]: df.pivot_table(index='A', columns='B', values='C', aggfunc='count') #'mean' # ## Back to our case study # We can now use `stack` and some other functions to create a timeseries from the original dataframe: # In[ ]: colnames = ['date'] + [item for pair in zip(["{:02d}".format(i) for i in range(24)], ['flag']*24) for item in pair] data = pd.read_csv("data/BETR8010000800100hour.1-1-1990.31-12-2012", sep='\t', header=None, na_values=[-999, -9999], names=colnames) data = data.drop('flag', axis=1) # In[ ]: data.head() #
# EXERCISE: Reshape the dataframe to a timeseries #
# # The end result should look like: # # #
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
BETR801
1990-01-02 09:00:0048.0
1990-01-02 12:00:0048.0
1990-01-02 13:00:0050.0
1990-01-02 14:00:0055.0
......
2012-12-31 20:00:0016.5
2012-12-31 21:00:0014.5
2012-12-31 22:00:0016.5
2012-12-31 23:00:0015.0
#

170794 rows × 1 columns

#
# First, reshape the dataframe so that each row consists of one observation for one date + hour combination: # In[ ]: # %load snippets/06 - Reshaping data18.py # In[ ]: # %load snippets/06 - Reshaping data19.py # In[ ]: # %load snippets/06 - Reshaping data20.py # Now, combine the date and hour colums into a datetime (tip: string columns can be summed to concatenate the strings): # In[ ]: # %load snippets/06 - Reshaping data21.py # In[ ]: # %load snippets/06 - Reshaping data22.py # In[ ]: # %load snippets/06 - Reshaping data23.py # In[ ]: # %load snippets/06 - Reshaping data24.py # In[ ]: # %load snippets/06 - Reshaping data25.py #
# QUESTION: What is the difference in the typical diurnal profile between week and weekend days? (and visualise it) #
# In[ ]: # use the data of above data = data_stacked # or read from the processed csv file #data = pd.read_csv('data/airbase_data.csv', index_col=0, parse_dates=True, na_values=[-9999])[['BETR801']] #data = data['1999':] # In[ ]: get_ipython().run_line_magic('pinfo', 'data.index.weekday') # In[ ]: # %load snippets/06 - Reshaping data28.py # Add a column indicating week/weekend # In[ ]: # %load snippets/06 - Reshaping data29.py # In[ ]: # %load snippets/06 - Reshaping data30.py # In[ ]: # %load snippets/06 - Reshaping data31.py # In[ ]: # %load snippets/06 - Reshaping data32.py # In[ ]: # %load snippets/06 - Reshaping data33.py #
# QUESTION: Visualize the typical week profile for the different stations as boxplots (where the values in one boxplot are the daily means for the different weeks for a certain weekday). #
# # Tip: the boxplot method of a DataFrame expects the data for the different boxes in different columns). For this, you can either use `pivot_table` as a combination of `groupby` and `unstack` # In[ ]: # %load snippets/06 - Reshaping data34.py # In[ ]: # %load snippets/06 - Reshaping data35.py # In[ ]: # %load snippets/06 - Reshaping data36.py # An alternative method using `groupby` and `unstack`: # In[ ]: # %load snippets/06 - Reshaping data37.py # ## We can also use the reshaping methods with the movie data # In[ ]: cast = pd.read_csv('data/cast.csv') cast.head() # In[ ]: titles = pd.read_csv('data/titles.csv') titles.head() #
# EXERCISE: Define a year as a "Superman year" whose films feature more Superman characters than Batman. How many years in film history have been Superman years? #
# In[ ]: # %load snippets/06 - Reshaping data40.py # In[ ]: # %load snippets/06 - Reshaping data41.py #
# EXERCISE: Plot the number of actor roles each year and the number of actress roles each year over the history of film. #
# In[ ]: # %load snippets/06 - Reshaping data42.py #
# EXERCISE: Plot the number of actor roles each year and the number of actress roles each year, but this time as a kind='area' plot. #
# In[ ]: # %load snippets/06 - Reshaping data43.py # In[ ]: