#!/usr/bin/env python
# coding: utf-8

# # Baby Names

# In[1]:


# NOTE(review): presumably injects the notebook table of contents -- confirm
# against the addutils package.
import addutils.toc
addutils.toc.js(ipy_notebook=True)


# In[2]:


import numpy as np
import pandas as pd
import addutils
from IPython.display import display
import bokeh.plotting as bk
# Direct Bokeh output to the notebook so that figures shown with bk.show()
# are rendered inline in the cell output.
bk.output_notebook()


# In[3]:


# NOTE(review): presumably applies the tutorial's notebook CSS styling --
# confirm against the addutils package.
addutils.css_notebook()


# ## 1 Load and prepare the data

# We downloaded statistics about the baby names chosen over the years in the U.S. from:
# http://www.babycenter.com/baby-names and stored them in our example data folder.

# In[4]:


# Column labels are supplied explicitly when reading the files, and every
# yearly file lives under the same data folder.
columnNames = ['name', 'sex', 'births']
dataFolder = 'temp/baby_names/'

# Load a single year (1880) as a first look at the data.
names1880 = pd.read_csv('{0}yob1880.txt'.format(dataFolder),
                        names=columnNames)
names1880.head()


# This shows some of the names chosen during 1880.
# 
# Now we want to read all the files for the years that span from 1880 to 2011.

# In[5]:


# Read one file per year and tag every row with the year it came from.
years = range(1880, 2012)  # 1880 through 2011 inclusive
parts = []
for year in years:
    path = dataFolder + 'yob{0}.txt'.format(year)
    frame = pd.read_csv(path, names=columnNames)
    frame['year'] = year
    parts.append(frame)


# Now parts is a `python` `list` containing `pandas.DataFrame`(s). Let's create a single `DataFrame` containing all the names.

# In[6]:


# Stack the per-year frames into a single table; ignore_index=True discards
# the per-file indexes and numbers the rows 0..n-1.
names = pd.concat(parts, ignore_index=True)
# Preview one row out of every 100000.
names[::100000]


# `pandas.concat` concatenates pandas objects along a particular axis. If the optional parameter `ignore_index` is True, `concat` won't use the original index values in the concatenation. Values from `0` to `n-1` will be used instead.

# ## 2 Pivoting

# `DataFrame.pivot_table` creates a spreadsheet-style pivot table as a DataFrame. The `aggfunc` parameter specifies the aggregation function (or list of functions) to apply to the elements, and `margins` controls whether grand totals/subtotals are added to all columns/rows.

# In[7]:


from bokeh.models.ranges import Range1d
# Total births per year (rows) and sex (columns).  The aggregation is given
# by name ('sum') rather than as the Python builtin `sum`: recent pandas
# versions warn when handed the builtin and will stop mapping it to the
# optimized groupby sum.
totalBirths = names.pivot_table('births', index='year', columns='sex',
                                aggfunc='sum', margins=False)
#display(totalBirths.head())
#totalBirths[['F', 'M']][:-1].plot(title='Total births by sex and year')

# One line per sex, plotted against the year index.
fig = bk.figure(plot_width=750, plot_height=300, title=None)
fig.line(x=totalBirths.index, y=totalBirths['F'], legend='F', line_color='magenta')
fig.line(x=totalBirths.index, y=totalBirths['M'], legend='M', line_color='royalblue')
fig.legend.location = 'bottom_right'
fig.xaxis.axis_label = 'Year'
fig.yaxis.axis_label = 'Total births'
# Show plain numbers on the y axis instead of scientific notation.
fig.yaxis[0].formatter.use_scientific = False
bk.show(fig)


# ## 3 Splitting

# Let's see a couple of examples of splitting data.
# 
# In the first example we are going to view the number of births grouped by year and sex.

# In[8]:


# Sum the births inside every (year, sex) group and preview the first rows.
birthsByYearSex = names.groupby(['year', 'sex'])['births'].sum()
birthsByYearSex.head()


# The second example shows how to split the names in two groups: Boys and Girls.

# In[9]:


# Boolean masks on the 'sex' column split the table into two frames.
isBoy = names['sex'] == 'M'
isGirl = names['sex'] == 'F'
boys = names[isBoy]
girls = names[isGirl]
# Show every 100th row among the first 2000 rows of the boys' table.
display(boys[:2000:100])


# We can see how many boys with a specific name were born each year.

# In[10]:


# Select the rows of the boys' table whose name is 'Jayden'.
isJayden = boys['name'] == 'Jayden'
boys[isJayden]


# In[11]:


# Births per year (rows) for every boy's name (columns).  The aggregation is
# given by name ('sum') rather than as the Python builtin `sum`: recent
# pandas versions warn when handed the builtin and will stop mapping it to
# the optimized groupby sum.
bBirths = boys.pivot_table('births', index='year', columns='name',
                           aggfunc='sum', margins=False)
subset = bBirths[['Ray', 'Elvis', 'Sam', 'John', 'Marvin', 'Bob']]

# Build one small figure per selected name; each figure is wrapped in its
# own list so that gridplot stacks them as single-figure rows.
plots = []
for name in subset.columns:
    fig = bk.figure(plot_height=200, plot_width=700, title=None)
    fig.line(x=np.asarray(subset.index), y=np.asarray(subset[name]),
             line_color='black', legend=name)
    plots.append([fig])
bk.show(bk.gridplot(plots))

# Or directly using Pandas (which uses Matplotlib, not Bokeh): 
#subset.plot(subplots=True, figsize=(12, 10), grid=False,
#            title="Number of births per year")


# ## 4 Using 'groupby'

# Now we are going to add a column named `prop` that shows the ratio: $\frac{\text{children with a specific name}}{\text{total children}}$

# In[12]:


def add_prop(group):
    """Return a copy of *group* with an added 'prop' column.

    'prop' is each row's 'births' value divided by the sum of 'births'
    over the whole group.

    The input frame is left untouched: functions handed to
    ``groupby(...).apply`` must not mutate the object they receive, so the
    new column is attached to a fresh frame via ``assign``.
    """
    births = group['births']
    # float() keeps this a true division on Python 2 as well.
    return group.assign(prop=births / float(births.sum()))


# In[13]:


# Compute 'prop' inside every (year, sex) group.  group_keys=False keeps the
# original row index on the result; otherwise newer pandas versions prepend
# the 'year'/'sex' keys as index levels, and the later
# names.groupby(['year', 'sex']) calls would find those labels both in the
# index and in the columns.
names = names.groupby(['year', 'sex'], group_keys=False).apply(add_prop)
display(names.head())


# Let's check our calculations by verifying that the sum of all proportions within each year/sex group is equal (or at least close) to 1.

# In[14]:


# Sum 'prop' within each (year, sex) group and compare every total with 1
# using a floating-point tolerance.
propTotals = names.groupby(['year', 'sex'])['prop'].sum()
np.allclose(propTotals, 1)


# Now we want to extract the top names for each sex/year combination.

# In[15]:


def get_top(group, topNumber):
    """Return the first *topNumber* rows of *group* ordered by descending 'births'."""
    ranked = group.sort_values(by='births', ascending=False)
    return ranked[:topNumber]

# Keep the ten rows with the most births in every (year, sex) group.
grouped = names.groupby(['year', 'sex'])
topNames = grouped.apply(get_top, topNumber=10)
# Rename the index levels to avoid a warning: index levels and columns
# should not share the same names.
indexLevelNames = ['year_', 'sex_', None]
topNames.index.rename(indexLevelNames, inplace=True)
topNames[:50]


# This is our concluding example: we want to measure the increase in name diversity.

# In[16]:


from bokeh.models.ranges import Range1d

# Sum of 'prop' over the rows of topNames, per year (rows) and sex (columns).
# The aggregation is given by name ('sum') rather than as the Python builtin
# `sum`: recent pandas versions warn when handed the builtin and will stop
# mapping it to the optimized groupby sum.
diversity = topNames.pivot_table('prop', index='year', columns='sex', aggfunc='sum')

fig = bk.figure(plot_width=750, plot_height=300, title=None)
fig.line(x=diversity.index, y=diversity['F'], line_color='green', legend='F')
fig.line(x=diversity.index, y=diversity['M'], line_color='blue',  legend='M')
# Fix the y axis to the range 0..1.2.
fig.y_range = Range1d(0, 1.2)
bk.show(fig)

# Or, using directly Pandas' "plot" method (which calls Matplotlib, not Bokeh)
# diversity.plot(title='Sum of diversity.prop by year and sex',
#                yticks=np.linspace(0, 1.2, 13), xticks=range(1880, 2020, 10))


# ---
# 
# Visit [www.add-for.com](<http://www.add-for.com/IT>) for more tutorials and updates.
# 
# This work is licensed under a <a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/">Creative Commons Attribution-ShareAlike 4.0 International License</a>.