#!/usr/bin/env python
# coding: utf-8

# # Tidy Data
#
# > Structuring datasets to facilitate analysis [(Wickham 2014)](http://www.jstatsoft.org/v59/i10/paper)
#
# If there's one maxim I can impart, it's that your tools shouldn't get in the way of your analysis.
# Your problem is already difficult enough; don't let the data or your tools make it any harder.
#
# ## The Rules
#
# In a tidy dataset...
#
# 1. Each variable forms a column
# 2. Each observation forms a row
# 3. Each type of observational unit forms a table
#
# We'll cover a few methods that help you get there.

# Based on [this](http://stackoverflow.com/questions/22695680/python-pandas-timedelta-specific-rows) StackOverflow question.

# In[1]:

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

pd.options.display.max_rows = 10
get_ipython().run_line_magic('matplotlib', 'inline')

# Earlier, I fetched some data:
#
# ```python
# tables = pd.read_html("http://www.basketball-reference.com/leagues/NBA_2015_games.html")
# games = tables[0]
# games.to_csv('data/games.csv', index=False)
# ```

# In[2]:

get_ipython().run_line_magic('pinfo', 'pd.read_html')

# In[3]:

get_ipython().system('head -n 2 data/games.csv')

# The Question:
#
# > **How many days of rest did each team get between each game?**
#
# Whether or not your dataset is tidy depends on your question. Given our question, what is an observation?

# In[4]:

column_names = ['date', '_', 'away_team', 'away_points', 'home_team',
                'home_points', 'n_ot', 'notes']
games = (pd.read_csv('data/games.csv', names=column_names,
                     parse_dates=['date'], skiprows=1)
           .drop(['_', 'notes', 'n_ot'], axis='columns')
           .set_index('date', append=True))
games.index.names = ['game_id', 'date']
games.head()

# Is `games` a tidy dataset, given our question? No: we have multiple observations (teams) per row. We'll use `pd.melt` to fix that.

# In[5]:

tidy = pd.melt(games.sort_index().reset_index(),
               id_vars=['game_id', 'date'],
               value_vars=['away_team', 'home_team'],
               value_name='team')

# In[6]:

tidy.head()

# Now the translation from question to operation is direct:

# In[7]:

# For each team... get the number of days of rest between games
tidy.groupby('team')['date'].diff().dt.days - 1

# In[8]:

tidy['rest'] = tidy.sort_values('date').groupby('team').date.diff().dt.days - 1
tidy.dropna().head()

# In[9]:

un = pd.pivot_table(tidy, values='rest',
                    index=['game_id', 'date'],
                    columns='variable').rename(
    columns={'away_team': 'away_rest', 'home_team': 'home_rest'}
)
un.columns.name = None

# In[10]:

un.dropna().head()

# In[11]:

df = pd.concat([games, un], axis=1)
df

# In[12]:

g = sns.FacetGrid(data=tidy.dropna(), col='team', col_wrap=5, hue='team')
g.map(sns.barplot, "variable", "rest");

# In[13]:

delta = (un.home_rest - un.away_rest).dropna().astype(int)
(delta.value_counts()
      .reindex(np.arange(delta.min(), delta.max() + 1), fill_value=0)
      .sort_index()
      .plot(kind='bar', color='k', width=.9, rot=0, figsize=(12, 6)))

# # Stack / Unstack

# An "observation" depends on the question. Home team advantage?

# In[14]:

home_adv = games.home_points - games.away_points
ax = home_adv.plot(kind='hist', bins=80, color='k', figsize=(10, 5))
ax.set_xlim(-40, 40)
mu = home_adv.mean()
ax.vlines(mu, *ax.get_ylim(), color='steelblue', linewidth=3)
print('Home win percent:', (home_adv > 0).mean())
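# The section heading names `stack` / `unstack`, but the reshaping earlier used `melt`
# and `pivot_table`. As a rough sketch of the index-based alternative (assuming `tidy`
# still carries the default `variable` column created by `pd.melt`; `un_alt` is just an
# illustrative name), the same long-to-wide step can also be written with `unstack`:

# In[ ]:

# put the identifying columns into the index, keep `rest` as the values,
# then move the `variable` level out into the columns
un_alt = (tidy.set_index(['game_id', 'date', 'variable'])['rest']
              .unstack('variable')
              .rename(columns={'away_team': 'away_rest',
                               'home_team': 'home_rest'}))
un_alt.columns.name = None
un_alt.dropna().head()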
# # Team Strength
#
# Mini Project: Home Court Advantage?
#
# What's the effect (in terms of probability of winning) of being the home team?

# ### Step 1: Calculate Win %
#
# We need to create an indicator for whether the home team won.
# Add it as a column called `home_win` in `games`.

# In[ ]:

games['home_win'] = ...  # fill this in

# In[18]:

#%load -r 1:4 solutions_tidy.py

# ### Step 2: Find the win percent for each team
#
# Teams are split across two columns. It's easiest to calculate the number of wins and
# the number of games as the away team, and the number of wins and the number of games
# as the home team, then combine those two results to get the win percent.

# In[34]:

wins_as_home = games.groupby('').agg([])  # hint: use `~` to flip an array of booleans
wins_as_away = ...

wins_as_home.columns = ['n_wins', 'n_games']
wins_as_away.columns = ['n_wins', 'n_games']

# In[ ]:

get_ipython().run_line_magic('load', '-r 5:13 solutions_tidy.py')

# Now add `wins_as_home` and `wins_as_away` to get a DataFrame with two columns,
# `n_wins` and `n_games`, and one row per team.
#
# Finally, calculate the win percent.

# In[59]:

get_ipython().run_line_magic('load', '-r 14:20 solutions_tidy.py')

# In[60]:

strength.sort_values().plot(kind='barh', figsize=(5, 12))

# Bring the `strength` values in for each team, for each game.

# In[61]:

games.head()

# For SQL people:
#
# ```sql
# SELECT *
# FROM games NATURAL JOIN strength
# ```
#
# We just need to get the names worked out.

# In[62]:

strength.head().reset_index().rename(columns=lambda x: 'away_' + x)

# In[63]:

(pd.merge(games.reset_index(), strength.reset_index().add_prefix('away_'))
   .pipe(pd.merge, strength.reset_index().add_prefix('home_'))
   .set_index(['game_id', 'date']))

# For Python people:

# In[64]:

games = games.assign(away_strength=games.away_team.map(strength),
                     home_strength=games.home_team.map(strength))
games.head()

# In[65]:

X = (pd.concat([games, un], axis=1)
       .set_index(['away_team', 'home_team'], append=True)
       .dropna())

# In[66]:

X.head()

# In[67]:

X['home_win'] = X.home_win.astype(int)  # convert bool to int for statsmodels

# In[68]:

import statsmodels.api as sm

# In[69]:

mod = sm.Logit.from_formula('home_win ~ home_strength + away_strength + home_rest + away_rest', X)
res = mod.fit()
res.summary()

# In[70]:

mod = sm.Logit.from_formula('home_win ~ rest_difference',
                            X.assign(rest_difference=lambda df: df.home_rest - df.away_rest))
res = mod.fit()
res.summary()

# In[71]:

mod = sm.OLS.from_formula('spread ~ home_strength + away_strength + rest_difference',
                          X.assign(rest_difference=lambda df: df.home_rest - df.away_rest,
                                   spread=lambda df: df.home_points - df.away_points))
res = mod.fit()
res.summary()

# # Recap
#
# - Tidy data: one row per observation
# - melt / stack: wide to long
# - pivot_table / unstack: long to wide
#
# A tiny round-trip example of the last two is sketched below.
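# A minimal wide -> long -> wide round trip on a toy frame (the numbers here are
# made up purely for illustration, and it reuses the `pd` import from the top of
# the notebook):

# In[ ]:

wide = pd.DataFrame({'game_id': [0, 1],
                     'away_rest': [0, 2],
                     'home_rest': [1, 1]})

# wide -> long: one row per (game, team-role) pair
long_ = pd.melt(wide, id_vars=['game_id'],
                value_vars=['away_rest', 'home_rest'],
                var_name='variable', value_name='rest')

# long -> wide: back to one row per game
pd.pivot_table(long_, index='game_id', columns='variable', values='rest')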