import pandas as pd
names1880 = pd.read_csv('ch02/names/yob1880.txt', names=['name', 'sex', 'births'])
names1880
<class 'pandas.core.frame.DataFrame'> Int64Index: 2000 entries, 0 to 1999 Data columns (total 3 columns): name 2000 non-null values sex 2000 non-null values births 2000 non-null values dtypes: int64(1), object(2)
names1880.groupby('sex').births.sum()
sex F 90993 M 110493 Name: births, dtype: int64
years = range(1880, 2011)
pieces = []
columns = ['name', 'sex', 'births']
for year in years:
path = 'ch02/names/yob{0}.txt'.format(year)
frame = pd.read_csv(path, names=columns)
frame['year'] = year
pieces.append(frame)
names = pd.concat(pieces, ignore_index=True)
names
<class 'pandas.core.frame.DataFrame'> Int64Index: 1690784 entries, 0 to 1690783 Data columns (total 4 columns): name 1690784 non-null values sex 1690784 non-null values births 1690784 non-null values year 1690784 non-null values dtypes: int64(2), object(2)
names.head()
name | sex | births | year | |
---|---|---|---|---|
0 | Mary | F | 7065 | 1880 |
1 | Anna | F | 2604 | 1880 |
2 | Emma | F | 2003 | 1880 |
3 | Elizabeth | F | 1939 | 1880 |
4 | Minnie | F | 1746 | 1880 |
total_births = names.pivot_table('births', rows='year', cols='sex', aggfunc=sum)
total_births.tail()
sex | F | M |
---|---|---|
year | ||
2006 | 1896468 | 2050234 |
2007 | 1916888 | 2069242 |
2008 | 1883645 | 2032310 |
2009 | 1827643 | 1973359 |
2010 | 1759010 | 1898382 |
total_births.plot(title='Total births by sex and year')
<matplotlib.axes.AxesSubplot at 0xcfe32d0>
# don't need this if py3
# py2 doesn't integer division (floor) by default).
from __future__ import division
def add_prop(group):
# don't want integer division
#births = group.births.astype(float)
group['prop'] = group.births / group.births.sum()
return group
names = names.groupby(['year', 'sex']).apply(add_prop)
names.head()
name | sex | births | year | prop | |
---|---|---|---|---|---|
0 | Mary | F | 7065 | 1880 | 0.077643 |
1 | Anna | F | 2604 | 1880 | 0.028618 |
2 | Emma | F | 2003 | 1880 | 0.022013 |
3 | Elizabeth | F | 1939 | 1880 | 0.021309 |
4 | Minnie | F | 1746 | 1880 | 0.019188 |
names
<class 'pandas.core.frame.DataFrame'> Int64Index: 1690784 entries, 0 to 1690783 Data columns (total 5 columns): name 1690784 non-null values sex 1690784 non-null values births 1690784 non-null values year 1690784 non-null values prop 1690784 non-null values dtypes: float64(1), int64(2), object(2)
np.allclose(names.groupby(['year', 'sex']).prop.sum(), 1)
True
def get_top1000(group):
return group.sort_index(by='births', ascending=False)[:1000]
grouped = names.groupby(['year', 'sex'])
top1000 = grouped.apply(get_top1000)
top1000.head()
name | sex | births | year | prop | |||
---|---|---|---|---|---|---|---|
year | sex | ||||||
1880 | F | 0 | Mary | F | 7065 | 1880 | 0.077643 |
1 | Anna | F | 2604 | 1880 | 0.028618 | ||
2 | Emma | F | 2003 | 1880 | 0.022013 | ||
3 | Elizabeth | F | 1939 | 1880 | 0.021309 | ||
4 | Minnie | F | 1746 | 1880 | 0.019188 |
top1000
<class 'pandas.core.frame.DataFrame'> MultiIndex: 261877 entries, (1880, F, 0) to (2010, M, 1677643) Data columns (total 5 columns): name 261877 non-null values sex 261877 non-null values births 261877 non-null values year 261877 non-null values prop 261877 non-null values dtypes: float64(1), int64(2), object(2)
boys = top1000[top1000.sex == 'M']
girls = top1000[top1000.sex == 'F']
total_births = top1000.pivot_table('births', rows='year', cols='name', aggfunc=sum)
total_births.ix[1990].dropna().head()
name Aaron 14545 Abbey 507 Abbie 311 Abby 1293 Abdul 122 Name: 1990, dtype: float64
subset = total_births[['John', 'Harry', 'Mary', 'Marilyn']]
subset.plot(subplots=True, figsize=(12, 10), grid=False,
title='Number of births per year')
array([<matplotlib.axes.AxesSubplot object at 0x373e310>, <matplotlib.axes.AxesSubplot object at 0xb829550>, <matplotlib.axes.AxesSubplot object at 0x6f48090>, <matplotlib.axes.AxesSubplot object at 0x6f6e8d0>], dtype=object)
table = top1000.pivot_table('prop', rows='year', cols='sex', aggfunc=sum)
table.plot(title='Sum of table1000.prop by year and sex',
yticks=np.linspace(0, 1.2, 13),
xticks=range(1880, 2020, 10))
<matplotlib.axes.AxesSubplot at 0x7a9c1d0>
df = boys[boys.year == 2010]
df
<class 'pandas.core.frame.DataFrame'> MultiIndex: 1000 entries, (2010, M, 1676644) to (2010, M, 1677643) Data columns (total 5 columns): name 1000 non-null values sex 1000 non-null values births 1000 non-null values year 1000 non-null values prop 1000 non-null values dtypes: float64(1), int64(2), object(2)
prop_cumsum = df.sort_index(by='prop', ascending=False).prop.cumsum()
prop_cumsum[:10]
year sex 2010 M 1676644 0.011523 1676645 0.020934 1676646 0.029959 1676647 0.038930 1676648 0.047817 1676649 0.056579 1676650 0.065155 1676651 0.073414 1676652 0.081528 1676653 0.089621 dtype: float64
# Alternate way of doing same thing.
prop_cumsum = df.prop.order(ascending=False).cumsum()
prop_cumsum[:10]
year sex 2010 M 1676644 0.011523 1676645 0.020934 1676646 0.029959 1676647 0.038930 1676648 0.047817 1676649 0.056579 1676650 0.065155 1676651 0.073414 1676652 0.081528 1676653 0.089621 dtype: float64
prop_cumsum.searchsorted(0.5) + 1
117
df = boys[boys.year == 1900]
in1900 = df.sort_index(by='prop', ascending=False).prop.cumsum()
in1900.searchsorted(0.5) + 1
25
def get_quantile_count(group, q=0.5):
group = group.sort_index(by='prop', ascending=False)
return group.prop.cumsum().searchsorted(q) + 1
diversity = top1000.groupby(['year', 'sex']).apply(get_quantile_count)
diversity.head()
year sex 1880 F 38 M 14 1881 F 38 M 14 1882 F 38 dtype: int64
diversity = diversity.unstack('sex')
diversity.head()
sex | F | M |
---|---|---|
year | ||
1880 | 38 | 14 |
1881 | 38 | 14 |
1882 | 38 | 15 |
1883 | 39 | 15 |
1884 | 39 | 16 |
diversity.plot(title='Number of popular names in top 50%')
<matplotlib.axes.AxesSubplot at 0xc4c7350>
# extract last letter from name column
last_letters = names.name.str[-1]
last_letters.name = 'last_letter'
table = names.pivot_table('births', rows=last_letters,
cols=['sex', 'year'], aggfunc=sum)
table.ix['a']
sex year F 1880 31446 1881 31581 1882 36536 1883 38330 1884 43680 1885 45408 1886 49100 1887 48942 1888 59442 1889 58631 1890 62313 1891 60582 1892 68331 1893 67821 1894 70631 ... M 1996 42739 1997 41458 1998 41281 1999 40608 2000 40837 2001 39124 2002 38815 2003 37825 2004 38650 2005 36838 2006 36156 2007 34654 2008 32901 2009 31430 2010 28438 Name: a, Length: 262, dtype: float64
subtable = table.reindex(columns=[1910, 1960, 2010], level='year')
subtable.head()
sex | F | M | ||||
---|---|---|---|---|---|---|
year | 1910 | 1960 | 2010 | 1910 | 1960 | 2010 |
last_letter | ||||||
a | 108376 | 691247 | 670605 | 977 | 5204 | 28438 |
b | NaN | 694 | 450 | 411 | 3912 | 38859 |
c | 5 | 49 | 946 | 482 | 15476 | 23125 |
d | 6750 | 3729 | 2607 | 22111 | 262112 | 44398 |
e | 133569 | 435013 | 313833 | 28655 | 178823 | 129012 |
subtable.sum()
sex year F 1910 396416 1960 2022062 2010 1759010 M 1910 194198 1960 2132588 2010 1898382 dtype: float64
from __future__ import division
letter_prop = subtable / subtable.sum()
letter_prop.head()
sex | F | M | ||||
---|---|---|---|---|---|---|
year | 1910 | 1960 | 2010 | 1910 | 1960 | 2010 |
last_letter | ||||||
a | 0.273390 | 0.341853 | 0.381240 | 0.005031 | 0.002440 | 0.014980 |
b | NaN | 0.000343 | 0.000256 | 0.002116 | 0.001834 | 0.020470 |
c | 0.000013 | 0.000024 | 0.000538 | 0.002482 | 0.007257 | 0.012181 |
d | 0.017028 | 0.001844 | 0.001482 | 0.113858 | 0.122908 | 0.023387 |
e | 0.336941 | 0.215133 | 0.178415 | 0.147556 | 0.083853 | 0.067959 |
import matplotlib.pyplot as plt
fix, axes = plt.subplots(2, 1, figsize=(10, 8))
letter_prop['M'].plot(kind='bar', rot=0, ax=axes[0], title='Male')
letter_prop['F'].plot(kind='bar', rot=0, ax=axes[1], title='Female')
<matplotlib.axes.AxesSubplot at 0xd9eaad0>
letter_prop = table / table.sum()
dny_ts = letter_prop.ix[['d', 'n', 'y'], 'M'].T
dny_ts.head()
d | n | y | |
---|---|---|---|
year | |||
1880 | 0.083055 | 0.153213 | 0.075760 |
1881 | 0.083247 | 0.153214 | 0.077451 |
1882 | 0.085340 | 0.149560 | 0.077537 |
1883 | 0.084066 | 0.151646 | 0.079144 |
1884 | 0.086120 | 0.149915 | 0.080405 |
dny_ts.plot()
<matplotlib.axes.AxesSubplot at 0x2a094690>
all_names = top1000.name.unique()
mask = np.array(['lesl' in x.lower() for x in all_names])
pd.Series(mask).value_counts()
False 6860 True 5 dtype: int64
lesley_like = all_names[mask]
lesley_like
array(['Leslie', 'Lesley', 'Leslee', 'Lesli', 'Lesly'], dtype=object)
# alternate way of obtaining lesley_like
lesley_like = top1000.name[top1000.name.str.lower().str.contains('lesl')].unique()
lesley_like
array(['Leslie', 'Lesley', 'Leslee', 'Lesli', 'Lesly'], dtype=object)
filtered = top1000[top1000.name.isin(lesley_like)]
filtered.groupby('name').births.sum()
name Leslee 1082 Lesley 35022 Lesli 929 Leslie 370429 Lesly 10067 Name: births, dtype: int64
table = filtered.pivot_table('births', rows='year', cols='sex', aggfunc=sum)
table.head()
sex | F | M |
---|---|---|
year | ||
1880 | 8 | 79 |
1881 | 11 | 92 |
1882 | 9 | 128 |
1883 | 7 | 125 |
1884 | 15 | 125 |
table = table.div(table.sum(axis=1), axis=0)
table.tail()
sex | F | M |
---|---|---|
year | ||
2006 | 1 | NaN |
2007 | 1 | NaN |
2008 | 1 | NaN |
2009 | 1 | NaN |
2010 | 1 | NaN |
table.plot(style={'M': 'k-', 'F': 'k--'})
<matplotlib.axes.AxesSubplot at 0x2a095a90>