#!/usr/bin/env python
# coding: utf-8

# # Tidy Data
#
# Structuring datasets to facilitate analysis
# (Wickham 2014): http://www.jstatsoft.org/v59/i10/paper
#
# If there's one maxim I can impart it's that your tools shouldn't get in
# the way of your analysis. Your problem is already difficult enough,
# don't let the data or your tools make it any harder.
#
# In a tidy dataset...
#
#   1. Each variable forms a column
#   2. Each observation forms a row
#   3. Each type of observational unit forms a table
#
# We'll cover a few methods that help you get there.

# In[1]:
import os

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

get_ipython().run_line_magic('matplotlib', 'inline')

# Keep the notebook display compact.
pd.options.display.max_rows = 6
pd.options.display.max_columns = 6

# In[2]:
# Scrape the 2014-15 NBA schedule and cache it locally so later cells
# (and re-runs) don't need the network.
url = "http://www.basketball-reference.com/leagues/NBA_2015_games.html"
tables = pd.read_html(url)
games = tables[0]
# FIX: to_csv does not create intermediate directories; make sure the
# cache directory exists before writing.
os.makedirs('data', exist_ok=True)
games.to_csv('data/games.csv', index=False)

# In[3]:
get_ipython().system('head -n 5 data/games.csv')

# The Question:
# > **How many days of rest did each team get between each game?**
#
# Whether or not your dataset is tidy depends on your question. Given our
# question, what is an observation?

# In[4]:
# Parse and convert: name the raw columns, combine date + time into a
# single datetime, and drop fields we don't need for the question.
column_names = ['date1', 'time1', '_', 'away_team', 'away_points',
                'home_team', 'home_points', 'n_ot', 'notes']
games = (pd.read_csv('data/games.csv', names=column_names,
                     header=None, skiprows=2)
         # errors='coerce' turns unparseable date/time rows into NaT
         # rather than raising.
         .assign(date=lambda x: pd.to_datetime(x['date1'] + ' ' + x['time1'],
                                               errors='coerce'))
         .drop(['_', 'date1', 'time1', 'notes', 'n_ot'], axis='columns')
         .set_index('date', append=True))
games

# In[5]:
# The (row-number, date) MultiIndex identifies each game.
games.index.names = ['game_id', 'date']
games

# Is `games` a tidy dataset, given our question? No, we have multiple
# observations (teams) per row. We'll use `pd.melt` to fix that.
# In[6]:
# Wide to long: melt the two team columns so each row is a single
# (game, team) observation — the tidy shape our question needs.
tidy = pd.melt(games.reset_index(),
               id_vars=['game_id', 'date'],
               value_vars=['away_team', 'home_team'],
               value_name='team')

# In[7]:
tidy

# In[8]:
tidy[tidy.game_id == 0]

# In[9]:
# long to wide — the inverse transformation, for reference
(tidy
 .pivot(index='game_id', columns='variable')
 .reset_index())

# Now the translation from question to operation is direct:

# In[10]:
tidy

# In[11]:
# For each team... get number of dates between games
tidy.groupby('team')['date'].diff().dt.days - 1

# In[12]:
tidy.groupby('team').get_group('Los Angeles Lakers')

# In[13]:
tidy.groupby('team').get_group('Los Angeles Lakers')['date'].diff()

# In[14]:
# Days of rest between consecutive games per team. Sorting by date first
# makes each within-team diff run in chronological order; subtracting 1
# converts "days between games" into "full days of rest".
tidy['rest'] = (tidy
                .sort_values('date')
                .groupby('team')
                .date.diff()
                .dt.days - 1)
tidy.dropna()

# In[15]:
# One small barplot of rest (away vs. home) per team.
(tidy.dropna()
 .pipe(sns.FacetGrid, col='team', col_wrap=9, hue='team')
 .map(sns.barplot, "variable", "rest"))

# What are we doing?

# In[16]:
# Same plot, restricted to a single team to see the pieces.
(tidy
 .dropna()
 .query('team == "Los Angeles Lakers"')
 .pipe(sns.FacetGrid, col='team', hue='team')
 .map(sns.barplot, "variable", "rest"))

# In[17]:
# Mean rest by home/away status for one team: sum / count == mean.
grouped = (tidy
           .dropna()
           .query('team == "Los Angeles Lakers"')
           .groupby('variable'))
grouped.rest.sum() / grouped.rest.count()

# # Stack / Unstack
#
# An "observation" depends on the question. Home team advantage?

# In[18]:
# Point margin from the home team's perspective.
margin = games.home_points - games.away_points
ax = margin.plot(kind='hist', bins=80, figsize=(10, 5))
ax.set_xlim(-40, 40)
ax.vlines(margin.mean(), *ax.get_ylim(), color='red', linewidth=3)
print('Home win percent:', (margin > 0).mean())

# # Team Strength
#
# ### Mini Project: Home Court Advantage?
#
# What's the effect (in terms of probability to win) of being
# the home team.
#
# ### Step 1. Calculate Win %
#
# We need to create an indicator for whether the home team won.
# Add it as a column called `home_win` in `games`.

# In[19]:
games['home_win'] = games['home_points'] > games['away_points']
games

# ### Step 2: Find the win percent for each team
#
# Teams are split across two columns.
# It's easiest to calculate the number of wins and number of games as
# away, and the number of wins and number of games as home. Then combine
# those two results to get the win percent.

# In[20]:
# FIX: dict-of-name->func aggregation on a SeriesGroupBy (the "nested
# renamer") was deprecated in pandas 0.20 and removed in pandas 1.0
# (raises SpecificationError). Named aggregation via keyword arguments
# produces the same two-column result on modern pandas.
wins_as_away = games.groupby('away_team').home_win.agg(
    n_games='count',
    # The away team wins exactly when the home team did not.
    n_wins=lambda x: (~x).sum(),
)
wins_as_home = games.groupby('home_team').home_win.agg(
    n_games='count',
    n_wins='sum',
)
# Indexes align on team name, so addition combines home + away totals.
wins = wins_as_away + wins_as_home
wins

# Finally, calculate the win percent.

# In[21]:
strength = wins.n_wins / wins.n_games
strength.index.name = 'team'
strength.name = 'strength'
strength

# In[22]:
strength.sort_values().plot.barh(figsize=(4, 8))

# Bring the `strength` values in for each team, for each game.
# For SQL people:
#
#   SELECT *
#   FROM games NATURAL JOIN strength
#
# We just need to get the names worked out.

# In[23]:
(strength
 .head()
 .reset_index()
 .rename(columns=lambda x: 'away_' + x))

# In[24]:
# Prefixing makes the join keys ('away_team', 'home_team') line up, so
# merge finds the common column automatically (a natural join).
(pd.merge(games.reset_index(), strength.reset_index().add_prefix('away_'))
 .pipe(pd.merge, strength.reset_index().add_prefix('home_'))
 .set_index(['game_id', 'date']))

# For python people

# In[25]:
# Series.map is the lightweight alternative to the merges above.
games = games.assign(away_strength=games.away_team.map(strength),
                     home_strength=games.home_team.map(strength))
games

# # Pivoting

# In[26]:
tidy

# In[27]:
# summarizing pivot: mean rest per (home/away, team)
un = pd.pivot_table(tidy, values='rest', index='variable', columns='team')
un

# In[28]:
# equivalent groupby/unstack spelling
tidy.groupby(['team', 'variable']).rest.mean().unstack('team')

# In[29]:
# transformational pivot: back to one row per game, with separate
# away_rest / home_rest columns
un = (pd.pivot_table(tidy, values='rest',
                     index=['game_id', 'date'],
                     columns='variable', aggfunc='mean')
      .rename(columns={'away_team': 'away_rest',
                       'home_team': 'home_rest'}))
un.columns.name = None

# In[30]:
un.dropna()

# In[31]:
# glue: both share the (game_id, date) index, so concat aligns rows
res = pd.concat([games, un], axis=1).reset_index('date')
res

# In[32]:
# NOTE: requires the optional PyTables dependency.
res.to_hdf('data/games.hdf', 'df', mode='w', format='table')