In [1]:

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:

names1880 = pd.read_csv('ch02/names/yob1880.txt', names=['name', 'sex', 'births'])

In [3]:

names1880

Out[3]:

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2000 entries, 0 to 1999
Data columns (total 3 columns):
name      2000  non-null values
sex       2000  non-null values
births    2000  non-null values
dtypes: int64(1), object(2)

In [4]:

names1880.groupby('sex').births.sum()

Out[4]:

sex
F       90993
M      110493
Name: births, dtype: int64

In [6]:

# Read one file per year (1880 through 2010; the range end is exclusive),
# tag each frame with the year it came from, then stack everything into one
# frame with a fresh 0..N-1 index.
columns = ['name', 'sex', 'births']
years = range(1880, 2011)
pieces = []
for year in years:
    frame = pd.read_csv('ch02/names/yob{0}.txt'.format(year), names=columns)
    frame['year'] = year
    pieces.append(frame)
names = pd.concat(pieces, ignore_index=True)

In [7]:

names

Out[7]:

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1690784 entries, 0 to 1690783
Data columns (total 4 columns):
name      1690784  non-null values
sex       1690784  non-null values
births    1690784  non-null values
year      1690784  non-null values
dtypes: int64(2), object(2)

In [8]:

names.head()

Out[8]:

	name	sex	births	year
0	Mary	F	7065	1880
1	Anna	F	2604	1880
2	Emma	F	2003	1880
3	Elizabeth	F	1939	1880
4	Minnie	F	1746	1880

In [9]:

# Total births per year (one row per year) split by sex (one column per sex).
# NOTE(review): `rows=`/`cols=` are the legacy pivot_table keyword names; newer
# pandas releases spell them `index=`/`columns=` -- confirm the target version.
total_births = names.pivot_table('births', rows='year', cols='sex', aggfunc=sum)

In [10]:

total_births.tail()

Out[10]:

sex	F	M
year
2006	1896468	2050234
2007	1916888	2069242
2008	1883645	2032310
2009	1827643	1973359
2010	1759010	1898382

In [11]:

total_births.plot(title='Total births by sex and year')

Out[11]:

<matplotlib.axes.AxesSubplot at 0xcfe32d0>

In [15]:

# Not needed on Python 3, where `/` is always true division.
# On Python 2, `/` between two integers floors the result by default.
from __future__ import division

def add_prop(group):
    """Return a copy of ``group`` with a ``prop`` column added.

    ``prop`` is each row's ``births`` divided by the total ``births`` of
    the whole group.

    Parameters
    ----------
    group : DataFrame
        Rows of one group; must contain a numeric ``births`` column.

    Returns
    -------
    DataFrame
        A copy of ``group`` with the extra float ``prop`` column. The frame
        passed in is left unmodified.
    """
    # Work on a copy so the caller's frame is never changed as a side effect.
    result = group.copy()
    # Cast to float explicitly so the ratio is a true division even on
    # Python 2 without `from __future__ import division`.
    births = result['births'].astype(float)
    result['prop'] = births / births.sum()
    return result

names = names.groupby(['year', 'sex']).apply(add_prop)

In [16]:

names.head()

Out[16]:

	name	sex	births	year	prop
0	Mary	F	7065	1880	0.077643
1	Anna	F	2604	1880	0.028618
2	Emma	F	2003	1880	0.022013
3	Elizabeth	F	1939	1880	0.021309
4	Minnie	F	1746	1880	0.019188

In [17]:

names

Out[17]:

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1690784 entries, 0 to 1690783
Data columns (total 5 columns):
name      1690784  non-null values
sex       1690784  non-null values
births    1690784  non-null values
year      1690784  non-null values
prop      1690784  non-null values
dtypes: float64(1), int64(2), object(2)

In [18]:

np.allclose(names.groupby(['year', 'sex']).prop.sum(), 1)

Out[18]:

True

In [19]:

def get_top1000(group, n=1000):
    """Return the ``n`` rows of ``group`` with the largest ``births``.

    Parameters
    ----------
    group : DataFrame
        Frame with a ``births`` column.
    n : int, optional
        Maximum number of rows to keep (default 1000). A group with fewer
        rows is returned whole.

    Returns
    -------
    DataFrame
        Rows ordered by descending ``births``, keeping their original
        index labels.
    """
    # DataFrame.sort_index(by=...) is the legacy spelling; newer pandas only
    # offers sort_values. Prefer the new method and fall back when absent.
    if hasattr(group, 'sort_values'):
        ranked = group.sort_values(by='births', ascending=False)
    else:
        ranked = group.sort_index(by='births', ascending=False)
    return ranked[:n]
grouped = names.groupby(['year', 'sex'])
top1000 = grouped.apply(get_top1000)

In [20]:

top1000.head()

Out[20]:

			name	sex	births	year	prop
year	sex
1880	F	0	Mary	F	7065	1880	0.077643
		1	Anna	F	2604	1880	0.028618
		2	Emma	F	2003	1880	0.022013
		3	Elizabeth	F	1939	1880	0.021309
		4	Minnie	F	1746	1880	0.019188

In [21]:

top1000

Out[21]:

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 261877 entries, (1880, F, 0) to (2010, M, 1677643)
Data columns (total 5 columns):
name      261877  non-null values
sex       261877  non-null values
births    261877  non-null values
year      261877  non-null values
prop      261877  non-null values
dtypes: float64(1), int64(2), object(2)

Analyzing Naming Trends

In [22]:

boys = top1000[top1000.sex == 'M']

In [23]:

girls = top1000[top1000.sex == 'F']

In [24]:

# Births per year (rows) for every name in top1000 (one column per name).
# This rebinds `total_births`, which an earlier cell defined as births per
# year by sex.
# NOTE(review): `rows=`/`cols=` are the legacy pivot_table keyword names; newer
# pandas releases spell them `index=`/`columns=` -- confirm the target version.
total_births = top1000.pivot_table('births', rows='year', cols='name', aggfunc=sum)

In [31]:

# Row for the year 1990; names with no value that year are dropped.
# NOTE(review): `.ix` is a legacy indexer; `.loc[1990]` is the label-based
# spelling in newer pandas -- confirm the target version.
total_births.ix[1990].dropna().head()

Out[31]:

name
Aaron    14545
Abbey      507
Abbie      311
Abby      1293
Abdul      122
Name: 1990, dtype: float64

In [32]:

subset = total_births[['John', 'Harry', 'Mary', 'Marilyn']]

In [33]:

subset.plot(subplots=True, figsize=(12, 10), grid=False,
            title='Number of births per year')

Out[33]:

array([<matplotlib.axes.AxesSubplot object at 0x373e310>,
       <matplotlib.axes.AxesSubplot object at 0xb829550>,
       <matplotlib.axes.AxesSubplot object at 0x6f48090>,
       <matplotlib.axes.AxesSubplot object at 0x6f6e8d0>], dtype=object)

Measuring the increase in naming diversity

In [34]:

# Sum of `prop` over the top1000 rows, per year (rows) and sex (columns).
# NOTE(review): `rows=`/`cols=` are the legacy pivot_table keyword names; newer
# pandas releases spell them `index=`/`columns=` -- confirm the target version.
table = top1000.pivot_table('prop', rows='year', cols='sex', aggfunc=sum)

In [35]:

# Plot the per-year sum of top1000.prop, one line per sex. Ticks are fixed:
# 13 evenly spaced y ticks from 0 to 1.2 and an x tick every ten years from
# 1880 to 2010.
table.plot(title='Sum of top1000.prop by year and sex',
           yticks=np.linspace(0, 1.2, 13),
           xticks=range(1880, 2020, 10))

Out[35]:

<matplotlib.axes.AxesSubplot at 0x7a9c1d0>

In [36]:

df = boys[boys.year == 2010]

In [37]:

df

Out[37]:

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 1000 entries, (2010, M, 1676644) to (2010, M, 1677643)
Data columns (total 5 columns):
name      1000  non-null values
sex       1000  non-null values
births    1000  non-null values
year      1000  non-null values
prop      1000  non-null values
dtypes: float64(1), int64(2), object(2)

In [38]:

prop_cumsum = df.sort_index(by='prop', ascending=False).prop.cumsum()

In [39]:

prop_cumsum[:10]

Out[39]:

year  sex         
2010  M    1676644    0.011523
           1676645    0.020934
           1676646    0.029959
           1676647    0.038930
           1676648    0.047817
           1676649    0.056579
           1676650    0.065155
           1676651    0.073414
           1676652    0.081528
           1676653    0.089621
dtype: float64

In [44]:

# Alternate way of doing same thing.
prop_cumsum = df.prop.order(ascending=False).cumsum()
prop_cumsum[:10]

Out[44]:

year  sex         
2010  M    1676644    0.011523
           1676645    0.020934
           1676646    0.029959
           1676647    0.038930
           1676648    0.047817
           1676649    0.056579
           1676650    0.065155
           1676651    0.073414
           1676652    0.081528
           1676653    0.089621
dtype: float64

In [49]:

prop_cumsum.searchsorted(0.5) + 1

Out[49]:

In [46]:

df = boys[boys.year == 1900]

In [47]:

in1900 = df.sort_index(by='prop', ascending=False).prop.cumsum()

In [48]:

in1900.searchsorted(0.5) + 1

Out[48]:

In [53]:

def get_quantile_count(group, q=0.5):
    """Count how many of the largest ``prop`` values are needed to reach ``q``.

    Rows are ranked by descending ``prop`` and accumulated; the result is
    the number of leading rows whose cumulative ``prop`` first reaches at
    least ``q``.

    Parameters
    ----------
    group : DataFrame
        Frame with a numeric ``prop`` column.
    q : float, optional
        Cumulative threshold to reach (default 0.5).

    Returns
    -------
    integer scalar
        1-based count of rows. If the cumulative sum never reaches ``q``
        the result is ``len(group) + 1``.
    """
    # DataFrame.sort_index(by=...) is the legacy spelling; newer pandas only
    # offers sort_values. Prefer the new method and fall back when absent.
    if hasattr(group, 'sort_values'):
        ranked = group.sort_values(by='prop', ascending=False)
    else:
        ranked = group.sort_index(by='prop', ascending=False)
    # Search the underlying ndarray: ndarray.searchsorted returns a scalar for
    # a scalar q, whereas Series.searchsorted has returned a one-element array
    # in some pandas releases.
    cumulative = ranked['prop'].cumsum().values
    # searchsorted gives a 0-based position; +1 turns it into a count.
    return cumulative.searchsorted(q) + 1

diversity = top1000.groupby(['year', 'sex']).apply(get_quantile_count)
diversity.head()

Out[53]:

year  sex
1880  F      38
      M      14
1881  F      38
      M      14
1882  F      38
dtype: int64

In [54]:

diversity = diversity.unstack('sex')
diversity.head()

Out[54]:

sex	F	M
year
1880	38	14
1881	38	14
1882	38	15
1883	39	15
1884	39	16

In [55]:

diversity.plot(title='Number of popular names in top 50%')

Out[55]:

<matplotlib.axes.AxesSubplot at 0xc4c7350>

The "Last letter" Revolution

In [57]:

# extract last letter from name column
last_letters = names.name.str[-1]
last_letters.name = 'last_letter'

In [58]:

# Total births per last letter (rows) for every sex/year pair (columns).
# NOTE(review): `rows=`/`cols=` are the legacy pivot_table keyword names; newer
# pandas releases spell them `index=`/`columns=` -- confirm the target version.
table = names.pivot_table('births', rows=last_letters,
                          cols=['sex', 'year'], aggfunc=sum)

In [69]:

table.ix['a']

Out[69]:

sex  year
F    1880    31446
     1881    31581
     1882    36536
     1883    38330
     1884    43680
     1885    45408
     1886    49100
     1887    48942
     1888    59442
     1889    58631
     1890    62313
     1891    60582
     1892    68331
     1893    67821
     1894    70631
...
M    1996    42739
     1997    41458
     1998    41281
     1999    40608
     2000    40837
     2001    39124
     2002    38815
     2003    37825
     2004    38650
     2005    36838
     2006    36156
     2007    34654
     2008    32901
     2009    31430
     2010    28438
Name: a, Length: 262, dtype: float64

In [64]:

subtable = table.reindex(columns=[1910, 1960, 2010], level='year')

In [65]:

subtable.head()

Out[65]:

sex	F			M
year	1910	1960	2010	1910	1960	2010
last_letter
a	108376	691247	670605	977	5204	28438
b	NaN	694	450	411	3912	38859
c	5	49	946	482	15476	23125
d	6750	3729	2607	22111	262112	44398
e	133569	435013	313833	28655	178823	129012

In [70]:

subtable.sum()

Out[70]:

sex  year
F    1910     396416
     1960    2022062
     2010    1759010
M    1910     194198
     1960    2132588
     2010    1898382
dtype: float64

In [82]:

# Divide every (sex, year) column by its own total to turn counts into
# proportions. Both operands are already float64, so this is a true division
# without repeating `from __future__ import division` (an earlier cell
# already enables it as well).
letter_prop = subtable / subtable.sum()

In [83]:

letter_prop.head()

Out[83]:

sex	F			M
year	1910	1960	2010	1910	1960	2010
last_letter
a	0.273390	0.341853	0.381240	0.005031	0.002440	0.014980
b	NaN	0.000343	0.000256	0.002116	0.001834	0.020470
c	0.000013	0.000024	0.000538	0.002482	0.007257	0.012181
d	0.017028	0.001844	0.001482	0.113858	0.122908	0.023387
e	0.336941	0.215133	0.178415	0.147556	0.083853	0.067959

In [77]:

# Two stacked bar charts of last-letter proportions: male columns of
# letter_prop on the top axes, female columns on the bottom axes.
# `plt` comes from the import cell at the top of the notebook.
fig, axes = plt.subplots(2, 1, figsize=(10, 8))
letter_prop['M'].plot(kind='bar', rot=0, ax=axes[0], title='Male')
letter_prop['F'].plot(kind='bar', rot=0, ax=axes[1], title='Female')

Out[77]:

<matplotlib.axes.AxesSubplot at 0xd9eaad0>

In [84]:

letter_prop = table / table.sum()

In [85]:

# Select the rows for last letters d, n and y under the 'M' columns, then
# transpose so each year becomes a row and each letter a column.
# NOTE(review): `.ix` is a legacy indexer; `.loc` is the label-based spelling
# in newer pandas -- confirm the target version.
dny_ts = letter_prop.ix[['d', 'n', 'y'], 'M'].T

In [86]:

dny_ts.head()

Out[86]:

	d	n	y
year
1880	0.083055	0.153213	0.075760
1881	0.083247	0.153214	0.077451
1882	0.085340	0.149560	0.077537
1883	0.084066	0.151646	0.079144
1884	0.086120	0.149915	0.080405

In [87]:

dny_ts.plot()

Out[87]:

<matplotlib.axes.AxesSubplot at 0x2a094690>

Boy names that became girl names (and vice versa)

In [88]:

all_names = top1000.name.unique()

In [89]:

mask = np.array(['lesl' in x.lower() for x in all_names])

In [92]:

pd.Series(mask).value_counts()

Out[92]:

False    6860
True        5
dtype: int64

In [93]:

lesley_like = all_names[mask]

In [94]:

lesley_like

Out[94]:

array(['Leslie', 'Lesley', 'Leslee', 'Lesli', 'Lesly'], dtype=object)

In [97]:

# alternate way of obtaining lesley_like
lesley_like = top1000.name[top1000.name.str.lower().str.contains('lesl')].unique()
lesley_like

Out[97]:

array(['Leslie', 'Lesley', 'Leslee', 'Lesli', 'Lesly'], dtype=object)

In [98]:

filtered = top1000[top1000.name.isin(lesley_like)]

In [99]:

filtered.groupby('name').births.sum()

Out[99]:

name
Leslee      1082
Lesley     35022
Lesli        929
Leslie    370429
Lesly      10067
Name: births, dtype: int64

In [100]:

# Births for the Lesley-like names only, per year (rows) and sex (columns).
# NOTE(review): `rows=`/`cols=` are the legacy pivot_table keyword names; newer
# pandas releases spell them `index=`/`columns=` -- confirm the target version.
table = filtered.pivot_table('births', rows='year', cols='sex', aggfunc=sum)

In [101]:

table.head()

Out[101]:

sex	F	M
year
1880	8	79
1881	11	92
1882	9	128
1883	7	125
1884	15	125

In [106]:

table = table.div(table.sum(axis=1), axis=0)

In [107]:

table.tail()

Out[107]:

sex	F	M
year
2006	1	NaN
2007	1	NaN
2008	1	NaN
2009	1	NaN
2010	1	NaN

In [108]:

table.plot(style={'M': 'k-', 'F': 'k--'})

Out[108]:

<matplotlib.axes.AxesSubplot at 0x2a095a90>

In [ ]: