#!/usr/bin/env python
# coding: utf-8

# # Baby Names

# In[1]:

import addutils.toc
addutils.toc.js(ipy_notebook=True)

# In[2]:

import numpy as np
import pandas as pd
import addutils
from IPython.display import display
import bokeh.plotting as bk
bk.output_notebook()

# In[3]:

addutils.css_notebook()

# ## 1 Load and prepare the data

# We downloaded statistics about baby names chosen over the years in the U.S. from:
# http://www.babycenter.com/baby-names and we stored them in our example data folder.

# In[4]:

columnNames = ['name', 'sex', 'births']
dataFolder = 'temp/baby_names/'
names1880 = pd.read_csv(dataFolder + 'yob1880.txt', names=columnNames)
names1880.head()

# This shows some of the names chosen during 1880.
#
# Now we want to read all the files for the years spanning from 1880 to 2011.

# In[5]:

years = range(1880, 2012)
# One DataFrame per yearly file, each tagged with the year it was read from.
parts = [
    pd.read_csv('{0}yob{1}.txt'.format(dataFolder, year),
                names=columnNames).assign(year=year)
    for year in years
]

# Now parts is a `python` `list` containing `pandas.DataFrame`(s).
# Let's create a single `DataFrame` containing all the names.

# In[6]:

names = pd.concat(parts, ignore_index=True)
names[::10**5]

# `pandas.concat` concatenates pandas objects along a particular axis.
# If the optional parameter `ignore_index` is True, `concat` won't use the index
# values on the concatenation. Values from `0` to `n-1` will be used instead.

# ## 2 Pivoting

# `DataFrame.pivot_table` creates a spreadsheet-style pivot table as a DataFrame.
# The `aggfunc` parameter specifies a list of aggregation functions to use on
# elements, `margins` tells if grand totals/subtotals are to be added to all
# columns/rows.
# In[7]:

from bokeh.models.ranges import Range1d

# Total births per year, one column per sex. The aggregation is given by name
# ('sum') rather than as the Python builtin `sum`: the result is the same, but
# recent pandas releases warn when the builtin callable is passed.
totalBirths = names.pivot_table('births', index='year', columns='sex',
                                aggfunc='sum', margins=False)
#display(totalBirths.head())
#totalBirths[['F', 'M']][:-1].plot(title='Total births by sex and year')

# One line per sex on a shared figure.
fig = bk.figure(plot_width=750, plot_height=300, title=None)
fig.line(x=totalBirths.index, y=totalBirths['F'], legend='F', line_color='magenta')
fig.line(x=totalBirths.index, y=totalBirths['M'], legend='M', line_color='royalblue')
fig.legend.location = 'bottom_right'
fig.xaxis.axis_label = 'Year'
fig.yaxis.axis_label = 'Total births'
# Show plain integers on the y axis instead of scientific notation.
fig.yaxis[0].formatter.use_scientific = False
bk.show(fig)

# ## 3 Splitting

# Let's see a couple of examples about splitting data.
#
# In the first example we are going to view the number of births grouped by
# year and sex.

# In[8]:

names.groupby(['year', 'sex'])['births'].sum().head()

# The second example shows how to split the names in two groups: Boys and Girls.

# In[9]:

boys = names[names.sex == 'M']
girls = names[names.sex == 'F']
# Every 100th row of the first 2000 boys' rows.
display(boys[:2000:100])

# We can see how many boys with a specific name were born each year.
# In[10]:

boys[boys['name'] == 'Jayden']

# In[11]:

# Births per year (rows) for every boy name (columns). The aggregation is
# given by name ('sum') rather than as the Python builtin `sum`: same result,
# but recent pandas releases warn when the builtin callable is passed.
bBirths = boys.pivot_table('births', index='year', columns='name',
                           aggfunc='sum', margins=False)
subset = bBirths[['Ray', 'Elvis', 'Sam', 'John', 'Marvin', 'Bob']]

# One small figure per selected name, stacked vertically in a grid.
plots = []
for name in subset.columns:
    fig = bk.figure(plot_height=200, plot_width=700, title=None)
    fig.line(x=np.asarray(subset.index),
             y=np.asarray(subset[name]),
             line_color='black', legend=name)
    plots.append([fig])
bk.show(bk.gridplot(plots))

# Or directly using Pandas (which uses Matplotlib, not Bokeh):
#subset.plot(subplots=True, figsize=(12, 10), grid=False,
#            title="Number of births per year")

# ## 4 Using 'groupby'

# Now we are going to add a `column` named `prop` that shows the ratio:
# $\frac{\text{children with a specific name}}{\text{total children}}$

# In[12]:

def add_prop(group):
    """Return a copy of *group* with an added 'prop' column.

    'prop' is each row's 'births' divided by the sum of 'births' over the
    whole group. The input frame is left untouched, so the helper is safe to
    use on the sub-frames handed out by `groupby`.
    """
    births = group['births']
    return group.assign(prop=births / float(births.sum()))

# In[13]:

# Compute the proportion within each year/sex group with `transform`, which
# returns a result aligned on the original index. A transform-like
# `groupby(...).apply(add_prop)` gets the group keys prepended to its index on
# pandas >= 2.0, which makes the later `groupby(['year', 'sex'])` ambiguous.
names = names.assign(
    prop=names['births'] / names.groupby(['year', 'sex'])['births'].transform('sum'))
display(names.head())

# Let's check our calculations by verifying that the sum of all proportions by
# sex must be equal (or at least close) to 1.

# In[14]:

np.allclose(names.groupby(['year', 'sex'])['prop'].sum(), 1)

# Now we want to extract the top names for each sex/year combination.

# In[15]:

def get_top(group, topNumber):
    """Return the first *topNumber* rows of *group* sorted by descending 'births'."""
    return group.sort_values(by='births', ascending=False)[:topNumber]

grouped = names.groupby(['year', 'sex'])
topNames = grouped.apply(get_top, topNumber=10)
# rename indexes to avoid warning; index and columns should have different names
topNames.index.rename(['year_', 'sex_', None], inplace=True)
topNames[:50]

# This is our concluding example and we want to measure the increase in name
# diversity.
# In[16]:

from bokeh.models.ranges import Range1d

# Share of births covered by the top names, per year and sex. The aggregation
# is given by name ('sum') rather than as the Python builtin `sum`: same
# result, but recent pandas releases warn when the builtin callable is passed.
diversity = topNames.pivot_table('prop', index='year', columns='sex', aggfunc='sum')

fig = bk.figure(plot_width=750, plot_height=300, title=None)
fig.line(x=diversity.index, y=diversity['F'], line_color='green', legend='F')
fig.line(x=diversity.index, y=diversity['M'], line_color='blue', legend='M')
# Fix the y axis to the range 0..1.2.
fig.y_range = Range1d(0, 1.2)
bk.show(fig)

# Or, using directly Pandas' "plot" method (which calls Matplotlib, not Bokeh)
# diversity.plot(title='Sum of diversity.prop by year and sex',
#                yticks=np.linspace(0, 1.2, 13), xticks=range(1880, 2020, 10))

# ---
#
# Visit [www.add-for.com](http://www.add-for.com) for more tutorials and updates.
#
# This work is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License.