# Start with our normal batch of imports and settings.
# ``%matplotlib inline`` is IPython-only syntax; routing it through
# get_ipython() keeps this file valid Python while still enabling inline
# figures when it is run inside a notebook.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass  # plain Python interpreter: there is no IPython shell to configure

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Following is optional: set plotting styles
import seaborn
seaborn.set()


def load_year(year):
    """Load the baby-name counts for a single year.

    Reads ``../data/names/yob<year>.txt`` as CSV, labelling its three columns
    ``name``, ``gender`` and ``births``, and adds a ``year`` column holding
    the ``year`` argument.

    Returns the resulting DataFrame.
    """
    data = pd.read_csv('../data/names/yob{0}.txt'.format(year),
                       names=['name', 'gender', 'births'])
    data['year'] = year
    return data


# ignore_index gives every row a unique label instead of repeating each
# yearly file's own 0..n-1 index in the combined table.
names = pd.concat([load_year(year) for year in range(1880, 2014)],
                  ignore_index=True)
names.head()

# Plot the prevalence of my name over time
# (a trailing ';' only suppresses the Axes repr in notebook output)
my_name = 'Jacob'
subset = names[names.name == my_name]
births = subset.pivot_table('births', index='year',
                            columns='gender', aggfunc='sum')
births.plot();

# Whoa... there are some female "Jacob"s? Let's look at this:
births['F'].fillna(0).plot();


# Now we'll normalize the births against the total for each year and gender
def add_birth_frac(group):
    """Add a ``birth_frac`` column to ``group`` and return it.

    ``birth_frac`` is each row's ``births`` divided by the total births of
    the (year, gender) cell that row belongs to.  ``group`` must have
    ``year``, ``gender`` and ``births`` columns; it may be the whole table
    or a single (year, gender) slice.  The column is added in place and the
    same object is returned.
    """
    # transform('sum') broadcasts each cell's total back onto its rows, so
    # the division is one vectorised operation and the frame keeps its
    # original index and columns (no groupby.apply needed).
    totals = group.groupby(['year', 'gender'])['births'].transform('sum')
    group['birth_frac'] = group['births'] / totals
    return group


names = add_birth_frac(names)
names.head()

names[names.name == my_name].pivot_table('birth_frac', index='year',
                                         columns='gender',
                                         aggfunc='sum').plot();

# Births per (year, name) with one column per gender.  These are counts, so
# aggregate with 'sum'; a name/gender pair absent in a year counts as 0.
births = names.pivot_table('births', index=['year', 'name'],
                           columns='gender', aggfunc='sum')
births = births.fillna(0)
births.head()

# Select by year *label* with .loc: a bare integer slice such as
# births[:1920] picks rows by position, not by year.  Label slices are
# inclusive, so the early period is every year up to and including 1920 and
# the late period is 1980 onward.  Then total the births per name.
early_period = births.loc[:1920].groupby(level='name').sum()
late_period = births.loc[1980:].groupby(level='name').sum()
early_period.head()

# Percentage of each name's births that were male in each period
early_pct_M = 100 * early_period['M'] / early_period.sum(axis=1)
late_pct_M = 100 * late_period['M'] / late_period.sum(axis=1)
fractions = pd.DataFrame({'early_pct_M': early_pct_M,
                          'late_pct_M': late_pct_M})
fractions.head()

# Names that were mostly female early on and mostly male later
fractions[(fractions.early_pct_M < 40) & (fractions.late_pct_M > 60)]

# Names that were mostly male early on and mostly female later
fractions[(fractions.late_pct_M < 40) & (fractions.early_pct_M > 60)]