#!/usr/bin/env python
# coding: utf-8

# # Tidy Data
#
# Structuring datasets to facilitate analysis
# (Wickham 2014): http://www.jstatsoft.org/v59/i10/paper
#
# If there's one maxim I can impart it's that your tools shouldn't get in
# the way of your analysis. Your problem is already difficult enough,
# don't let the data or your tools make it any harder.
#
# In a tidy dataset...
#
#   1. Each variable forms a column
#   2. Each observation forms a row
#   3. Each type of observational unit forms a table
#
# We'll cover a few methods that help you get there.

# In[1]:
import os

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

get_ipython().run_line_magic('matplotlib', 'inline')

# Keep the notebook display compact.
pd.options.display.max_rows = 6
pd.options.display.max_columns = 6

# In[2]:
# Scrape the 2014-15 NBA schedule and cache it locally so later cells
# (and re-runs) don't need the network.
url = "http://www.basketball-reference.com/leagues/NBA_2015_games.html"
tables = pd.read_html(url)
games = tables[0]
# FIX: to_csv does not create intermediate directories; make sure the
# cache directory exists before writing.
os.makedirs('data', exist_ok=True)
games.to_csv('data/games.csv', index=False)

# In[3]:
get_ipython().system('head -n 5 data/games.csv')

# The Question:
# > **How many days of rest did each team get between each game?**
#
# Whether or not your dataset is tidy depends on your question. Given our
# question, what is an observation?

# In[4]:
# Parse and convert: name the raw columns, combine date + time into a
# single datetime, and drop fields we don't need for the question.
column_names = ['date1', 'time1', '_', 'away_team', 'away_points',
                'home_team', 'home_points', 'n_ot', 'notes']
games = (pd.read_csv('data/games.csv', names=column_names,
                     header=None, skiprows=2)
         # errors='coerce' turns unparseable date/time rows into NaT
         # rather than raising.
         .assign(date=lambda x: pd.to_datetime(x['date1'] + ' ' + x['time1'],
                                               errors='coerce'))
         .drop(['_', 'date1', 'time1', 'notes', 'n_ot'], axis='columns')
         .set_index('date', append=True))
games

# In[5]:
# The (row-number, date) MultiIndex identifies each game.
games.index.names = ['game_id', 'date']
games

# Is `games` a tidy dataset, given our question? No, we have multiple
# observations (teams) per row. We'll use `pd.melt` to fix that.
# In[6]:
# Wide to long: melt the two team columns so each row is a single
# (game, team) observation — the tidy shape our question needs.
tidy = pd.melt(games.reset_index(),
               id_vars=['game_id', 'date'],
               value_vars=['away_team', 'home_team'],
               value_name='team')

# In[7]:
tidy

# In[8]:
tidy[tidy.game_id == 0]

# In[9]:
# long to wide — the inverse transformation, for reference
(tidy
 .pivot(index='game_id', columns='variable')
 .reset_index())

# Now the translation from question to operation is direct:

# In[10]:
tidy

# In[11]:
# For each team... get number of dates between games
tidy.groupby('team')['date'].diff().dt.days - 1

# In[12]:
tidy.groupby('team').get_group('Los Angeles Lakers')

# In[13]:
tidy.groupby('team').get_group('Los Angeles Lakers')['date'].diff()

# In[14]:
# Days of rest between consecutive games per team. Sorting by date first
# makes each within-team diff run in chronological order; subtracting 1
# converts "days between games" into "full days of rest".
tidy['rest'] = (tidy
                .sort_values('date')
                .groupby('team')
                .date.diff()
                .dt.days - 1)
tidy.dropna()

# In[15]:
# One small barplot of rest (away vs. home) per team.
(tidy.dropna()
 .pipe(sns.FacetGrid, col='team', col_wrap=9, hue='team')
 .map(sns.barplot, "variable", "rest"))

# What are we doing?

# In[16]:
# Same plot, restricted to a single team to see the pieces.
(tidy
 .dropna()
 .query('team == "Los Angeles Lakers"')
 .pipe(sns.FacetGrid, col='team', hue='team')
 .map(sns.barplot, "variable", "rest"))

# In[17]:
# Mean rest by home/away status for one team: sum / count == mean.
grouped = (tidy
           .dropna()
           .query('team == "Los Angeles Lakers"')
           .groupby('variable'))
grouped.rest.sum() / grouped.rest.count()

# # Stack / Unstack
#
# An "observation" depends on the question. Home team advantage?

# In[18]:
# Point margin from the home team's perspective.
margin = games.home_points - games.away_points
ax = margin.plot(kind='hist', bins=80, figsize=(10, 5))
ax.set_xlim(-40, 40)
ax.vlines(margin.mean(), *ax.get_ylim(), color='red', linewidth=3)
print('Home win percent:', (margin > 0).mean())

# # Team Strength
#
# ### Mini Project: Home Court Advantage?
#
# What's the effect (in terms of probability to win) of being
# the home team.
#
# ### Step 1. Calculate Win %
#
# We need to create an indicator for whether the home team won.
# Add it as a column called `home_win` in `games`.

# In[19]:
games['home_win'] = games['home_points'] > games['away_points']
games

# ### Step 2: Find the win percent for each team
#
# Teams are split across two columns.
# It's easiest to calculate the number of wins and number of games as
# away, and the number of wins and number of games as home. Then combine
# those two results to get the win percent.

# In[20]:
# FIX: dict-of-name->func aggregation on a SeriesGroupBy (the "nested
# renamer") was deprecated in pandas 0.20 and removed in pandas 1.0
# (raises SpecificationError). Named aggregation via keyword arguments
# produces the same two-column result on modern pandas.
wins_as_away = games.groupby('away_team').home_win.agg(
    n_games='count',
    # The away team wins exactly when the home team did not.
    n_wins=lambda x: (~x).sum(),
)
wins_as_home = games.groupby('home_team').home_win.agg(
    n_games='count',
    n_wins='sum',
)
# Indexes align on team name, so addition combines home + away totals.
wins = wins_as_away + wins_as_home
wins

# Finally, calculate the win percent.

# In[21]:
strength = wins.n_wins / wins.n_games
strength.index.name = 'team'
strength.name = 'strength'
strength

# In[22]:
strength.sort_values().plot.barh(figsize=(4, 8))

# Bring the `strength` values in for each team, for each game.
# For SQL people:
#
#   SELECT *
#   FROM games NATURAL JOIN strength
#
# We just need to get the names worked out.

# In[23]:
(strength
 .head()
 .reset_index()
 .rename(columns=lambda x: 'away_' + x))

# In[24]:
# Prefixing makes the join keys ('away_team', 'home_team') line up, so
# merge finds the common column automatically (a natural join).
(pd.merge(games.reset_index(), strength.reset_index().add_prefix('away_'))
 .pipe(pd.merge, strength.reset_index().add_prefix('home_'))
 .set_index(['game_id', 'date']))

# For python people

# In[25]:
# Series.map is the lightweight alternative to the merges above.
games = games.assign(away_strength=games.away_team.map(strength),
                     home_strength=games.home_team.map(strength))
games

# # Pivoting

# In[26]:
tidy

# In[27]:
# summarizing pivot: mean rest per (home/away, team)
un = pd.pivot_table(tidy, values='rest', index='variable', columns='team')
un

# In[28]:
# equivalent groupby/unstack spelling
tidy.groupby(['team', 'variable']).rest.mean().unstack('team')

# In[29]:
# transformational pivot: back to one row per game, with separate
# away_rest / home_rest columns
un = (pd.pivot_table(tidy, values='rest',
                     index=['game_id', 'date'],
                     columns='variable', aggfunc='mean')
      .rename(columns={'away_team': 'away_rest',
                       'home_team': 'home_rest'}))
un.columns.name = None

# In[30]:
un.dropna()

# In[31]:
# glue: both share the (game_id, date) index, so concat aligns rows
res = pd.concat([games, un], axis=1).reset_index('date')
res

# In[32]:
# NOTE: requires the optional PyTables dependency.
res.to_hdf('data/games.hdf', 'df', mode='w', format='table')