# Start with our normal batch of imports and settings
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Following is optional: set plotting styles
import seaborn; seaborn.set()

def load_year(year):
    """Read one year's baby-name file and tag every row with that year.

    Parameters
    ----------
    year
        Value substituted into the file name ``../data/names/yob<year>.txt``
        and stored in the ``year`` column of the result.

    Returns
    -------
    pandas.DataFrame
        Columns ``name``, ``gender``, ``births`` (read from the header-less
        file) plus a constant ``year`` column.
    """
    path = '../data/names/yob{0}.txt'.format(year)
    columns = ['name', 'gender', 'births']
    frame = pd.read_csv(path, names=columns)
    frame['year'] = year
    return frame

# Stack every yearly file into one long table. ignore_index=True gives each
# row a unique label; without it every year's frame restarts at 0, so the
# combined index would contain the same labels over and over.
names = pd.concat([load_year(year) for year in range(1880, 2014)],
                  ignore_index=True)
names.head()

# Plot the prevalence of my name over time
my_name = 'Jacob'
subset = names[names.name == my_name]

# One row per year, one column per gender, cells hold the birth counts.
births = subset.pivot_table('births', index='year',
                            columns='gender', aggfunc='sum')
births.plot();

# Whoa... there are some female "Jacob"s? Let's look at this:
# (years without any female entry are NaN in the pivot, hence fillna(0))
births['F'].fillna(0).plot();

# Now we'll normalize the births against the total for each year and gender
def add_birth_frac(group):
    """Return ``group`` with an added ``birth_frac`` column.

    ``birth_frac`` is each row's ``births`` divided by the sum of ``births``
    over the whole of ``group``.

    The input frame is left untouched: a new frame is returned instead of
    writing the column into ``group``. Functions handed to
    ``groupby(...).apply`` should not mutate the group they receive.

    Parameters
    ----------
    group : pandas.DataFrame
        Frame with a numeric ``births`` column.

    Returns
    -------
    pandas.DataFrame
        Copy of ``group`` with the extra ``birth_frac`` column.
    """
    total_births = group.births.sum()
    return group.assign(birth_frac=group.births / total_births)

# Attach each row's share of the births recorded for its (year, gender) group.
# transform('sum') broadcasts every group's total back onto the original rows,
# so the index and the existing columns of `names` stay exactly as they were.
# This yields the same column as applying add_birth_frac to each group, but
# avoids groupby(...).apply, which on pandas >= 2.0 prepends the group keys to
# the index and thereby makes 'year' ambiguous (index level *and* column) in
# the pivot below.
group_totals = names.groupby(['year', 'gender'])['births'].transform('sum')
names['birth_frac'] = names['births'] / group_totals
names.head()

# Same plot as for the raw counts, now as a fraction of that year's births
# within each gender.
names[names.name == my_name].pivot_table('birth_frac', index='year',
                                         columns='gender', aggfunc='sum').plot();

# Birth counts indexed by (year, name) with one column per gender.
# aggfunc is spelled out so counts are summed rather than silently averaged
# (pivot_table's default) should a (year, name, gender) triple ever repeat.
births = names.pivot_table('births', index=['year', 'name'],
                           columns='gender', aggfunc='sum')
# A name absent for one gender in a given year shows up as NaN -> 0 births.
births = births.fillna(0)
births.head()

# Select the periods by *year label*. A plain integer slice such as
# births[:1920] is positional (the first 1920 rows), not "years up to 1920".
year_labels = births.index.get_level_values('year')

# Total births per name and gender within each period. groupby(level=...)
# replaces sum(level=1), which was removed in pandas 2.0.
early_period = births[year_labels <= 1920].groupby(level='name').sum()
late_period = births[year_labels >= 1980].groupby(level='name').sum()

early_period.head()

# Percentage of each name's births that were male, per period.
early_pct_M = early_period['M'].mul(100) / early_period.sum(axis=1)
late_pct_M = late_period['M'].mul(100) / late_period.sum(axis=1)

# Side-by-side table; a name present in only one period gets NaN in the other
# column and therefore never satisfies the comparisons below.
fractions = pd.DataFrame({'early_pct_M': early_pct_M,
                         'late_pct_M': late_pct_M})
fractions.head()

# Names that were mostly female early on and mostly male later.
mostly_f_early = fractions['early_pct_M'] < 40
mostly_m_late = fractions['late_pct_M'] > 60
fractions[mostly_f_early & mostly_m_late]

# Names that were mostly male early on and mostly female later.
mostly_f_late = fractions['late_pct_M'] < 40
mostly_m_early = fractions['early_pct_M'] > 60
fractions[mostly_f_late & mostly_m_early]