#!/usr/bin/env python # coding: utf-8 # This notebook covers the basics of operations on Series and DataFrames. Again, a quick check on whether you know the material. # # 1. Can you use numpy ufuncs (e.g. `np.exp`) on DataFrames? # 2. What happens when you add two DataFrames with different indicies? # # If you know the answers already, you can skip to our first [application](#Application:-Economic-Timeseries). Maybe download some additional datasets from [FRED](https://research.stlouisfed.org) and play with those; try to figure out how to fix the economy. You've got about 5 minutes. # `DataFrame`s and `Series` support all the usual math operations. Additionally, # numpy [`ufuncs`](http://docs.scipy.org/doc/numpy/reference/ufuncs.html) (e.g. `np.log`) can be used as expected. # In[1]: import numpy as np import pandas as pd # # `ufuncs` # In[2]: np.random.seed(42) df = pd.DataFrame(np.random.uniform(0, 10, size=(3, 3))) df # In[3]: df + 1 # In[4]: df ** 2 # In[5]: np.log(df) # # Alignment # But we don't just have NumPy arrays. We have `DataFrames` and `Series`, which have *labels*. # Pandas automatically aligns by label when doing operations between `DataFrames` and `Series`. # In[6]: df1 = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}) df2 = pd.DataFrame({'A': [2, 4, 6], 'B': [1, 4, 9]}) df1 # In[7]: df2 # In[8]: df1 + df2 # In[9]: # Note the index order df2 = pd.DataFrame({'A': [6, 2, 4], 'B': [9, 1, 4]}, index=[2, 0, 1]) df2 # In[10]: df1 + df2 # In[11]: # Different index entirely df3 = pd.DataFrame(np.random.randn(5, 3), columns=['A', 'B', 'C']) df3 # In[12]: df1 + df3 # # Recap # # - `ufuncs` work just fine # - broadcasting works fine # - Operations **align by label** and then are performed # # Application: Economic Timeseries # In[13]: from itertools import chain import pandas as pd import seaborn as sns import statsmodels.api as sm # from pandas_datareader import data # from pandas.io import data import matplotlib.pyplot as plt from toolz import partitionby get_ipython().run_line_magic('matplotlib', 'inline') pd.options.display.max_rows = 10 sns.set_style('ticks') # Pandas recently split off some web-based data reading functionality into its own package `pandas-datareader`. Earlier, I used it to grab data from [FRED](http://research.stlouisfed.org). # # ```python # gdp = data.DataReader("GDP", data_source='fred', start='1929', end='2014').squeeze() # cpi = data.DataReader("CPIAUCSL", data_source='fred', start='1947-01', end='2015-05').squeeze() # rec = data.DataReader('USREC', data_source='fred', start='1854-12-01', end='2014-08-01').squeeze() # gdp.to_csv('data/gdp.csv', header=True) # cpi.to_csv('data/cpi.csv', header=True) # rec.to_csv('data/rec.csv', header=True) # ``` # In[14]: get_ipython().system('head data/gdp.csv') # In[15]: get_ipython().run_line_magic('pinfo', 'pd.read_csv') # In[16]: gdp = pd.read_csv('data/gdp.csv', index_col='DATE', parse_dates=['DATE']).squeeze() cpi = pd.read_csv('data/cpi.csv', index_col='DATE', parse_dates=['DATE']).squeeze() rec = pd.read_csv('data/rec.csv', index_col='DATE', parse_dates=['DATE']).squeeze() # In[17]: gdp.head(n=5) # In[18]: cpi.head() # In[19]: rec.head() # # Table Summarization # Pandas has a few methods for summarizing the contents of a DataFrame or Series # In[20]: gdp.describe() # `describe` actually combines and few different *aggregation* methods that summarise a DataFrame or Series. These include `mean`, `max`, `std`, `quantile`... # In[21]: print('mean: ', gdp.mean()) print('std: ', gdp.std()) print('quantile:', gdp.quantile(.66)) print('max: ', gdp.max()) # It can also be useful to get the argmax, or the index label where the maximum occurs. # In[22]: gdp.idxmax() # # Plotting # DataFrames and Series can make decent looking plots with a few lines of code. # # I wanted to incude bars for recession indicators. # The only pandas-relavent bit is `Series.iteritems`, which you *almost* never want to use since it's relatively slow. # In[23]: next(rec.iteritems()) # In[24]: groups = partitionby(lambda x: x[1] == 1, rec.iteritems()) recessions = filter(lambda x: x[0][1] == 1, groups) spans = [(months[0][0], months[-1][0]) for months in recessions] # In[25]: def add_rec_bars(ax=None): ax = ax or plt.gca() for span in spans: ax.fill_between(span, *ax.get_ylim(), color='k', alpha=.25) return ax plt.rcParams['figure.figsize'] = (12, 6) # In[26]: ax = gdp.plot(title='GDP', linewidth=3) add_rec_bars(ax) sns.despine() # In[27]: ax = cpi.plot(title='CPI', linewidth=3) add_rec_bars(ax) sns.despine() # Let's put some of those operations to use. # # Execrise: Convert CPI to be base 2009 # # CPI is the Consumer Price Index. The Index part just means that it doesn't really have any units, # just some (arbitrary) time span that is set to 100, and every other observation is relative to that time. # The CPI we have is based on 1982-1984. # # **Convert the CPI base-2009 by dividing the entire Series `cpi` by the average CPI in 2009.** # # Try breaking the problem into pieces: # 1. Select just the rows from 2009 (`.loc`) # + Timeseries (dates in the index) have speical rules for slicing. Pass in a string with the subset you want # + `.loc['2010-01-01']` select all observations from that day # + `.loc['2010-01']` selects all observations from that month # 2. calculate the average on those rows # 3. divide `cpi` by that number (assign that to `cpi09`) # In[28]: # Your code goes here # Assign the result to cpi09 # In[31]: get_ipython().run_line_magic('load', '-r 1:5 solutions_operations.py') # In[32]: ax = cpi09.plot(title='CPI-2009') add_rec_bars(ax=ax) sns.despine(); # # Real GDP: Alignment # # Let's say we want real GDP (adjusted for inflation). # # \begin{equation} # rGDP_t = \frac{GDP_t}{CPI_t} # \end{equation} # # Problem: our CPI is monthly but GDP is quarterly. Also the two Series have different start and end points. If you didn't have automatic label alignment, you'd have to jump through hoops to select the correct subset of each series. # # # ```python # # this is boolean indexing, we'll see more later # gdp / cpi09[(cpi.index.month % 3 == 1) & (cpi09.index.year <= 2014)] # ``` # ... but that's unneccesary. The operations will automatically align for you. # In[33]: rgdp = (gdp / cpi09) rgdp # The `NaN`s are missing value indicators. `NaN`s can crop up for many reasons, but in this case it's because the labels didn't overlap perfectly. # # Many pandas methods, e.g. aggregations like `sum` or `mean`, will ignore missing values. # Pandas provides methods for detetcing missing data `.isnull`, filling missing data `.fillna`, # or you can dropping it, which is what I'll do here. # In[36]: rgdp.dropna() # Pandas methods are non-mutating by default. This means that even though I called `.dropna()` above, `rgdp` still has missing values. # In[41]: rgdp # To capture the change, assign a name to the result. In this case, I just the same name `rgdp`. # In[42]: rgdp = rgdp.dropna() rgdp.head() # In[43]: ax = gdp.pct_change().plot(figsize=(15, 5), label='GDP', legend=True, linewidth=2) rgdp.pct_change().plot(ax=ax, label='real GDP', legend=True, linewidth=2) add_rec_bars(ax) sns.despine() # # Recap # # DataFrames allow you to rapidly explore data, iterating on ideas. # # - `ufunc`s work as expected # - operations between pandas objects automatically align on labels # In[ ]: