We are going to play with recreating this animation: http://www.prooffreader.com/2014/04/baby-names-rise-of-n.html

The first step of this is to get the data, so let's pop over to David Taylor's notebook and download the files.

Then let's load those dataframes.

In [3]:
data_path = "baby_names" 

import os
import pandas as pd

os.chdir(data_path)
yob = pd.read_pickle('yob.pickle')
names = pd.read_pickle('names.pickle')
years = pd.read_pickle('years.pickle')
In [56]:
print yob
              name sex  births  year       pct  ranked firstletter
0             Mary   F    7065  1880  7.764334       1           M
1             Anna   F    2604  1880  2.861759       2           A
2             Emma   F    2003  1880  2.201268       3           E
3        Elizabeth   F    1939  1880  2.130933       4           E
4           Minnie   F    1746  1880  1.918829       5           M
5         Margaret   F    1578  1880  1.734199       6           M
6              Ida   F    1472  1880  1.617707       7           I
7            Alice   F    1414  1880  1.553966       8           A
8           Bertha   F    1320  1880  1.450661       9           B
9            Sarah   F    1288  1880  1.415493      10           S
10           Annie   F    1258  1880  1.382524      11           A
11           Clara   F    1226  1880  1.347356      12           C
12            Ella   F    1156  1880  1.270427      13           E
13        Florence   F    1063  1880  1.168222      14           F
14            Cora   F    1045  1880  1.148440      15           C
15          Martha   F    1040  1880  1.142945      16           M
16           Laura   F    1012  1880  1.112173      17           L
17          Nellie   F     995  1880  1.093491      18           N
18           Grace   F     982  1880  1.079204      19           G
19          Carrie   F     949  1880  1.042937      20           C
20           Maude   F     858  1880  0.942930      21           M
21           Mabel   F     808  1880  0.887980      22           M
22          Bessie   F     796  1880  0.874793      23           B
23          Jennie   F     793  1880  0.871496      24           J
24        Gertrude   F     787  1880  0.864902      25           G
25           Julia   F     783  1880  0.860506      26           J
26          Hattie   F     769  1880  0.845120      27           H
27           Edith   F     768  1880  0.844021      28           E
28          Mattie   F     704  1880  0.773686      29           M
29            Rose   F     700  1880  0.769290      30           R
...            ...  ..     ...   ...       ...     ...         ...
1792061    Zayceon   M       5  2013  0.000267   12995           Z
1792062      Zayid   M       5  2013  0.000267   12995           Z
1792063    Zaylynn   M       5  2013  0.000267   12995           Z
1792064  Zecheriah   M       5  2013  0.000267   12995           Z
1792065     Zedric   M       5  2013  0.000267   12995           Z
1792066     Zefram   M       5  2013  0.000267   12995           Z
1792067      Zekhi   M       5  2013  0.000267   12995           Z
1792068     Zenith   M       5  2013  0.000267   12995           Z
1792069     Zennon   M       5  2013  0.000267   12995           Z
1792070    Zepplin   M       5  2013  0.000267   12995           Z
1792071      Zevon   M       5  2013  0.000267   12995           Z
1792072    Zhaiden   M       5  2013  0.000267   12995           Z
1792073       Zhen   M       5  2013  0.000267   12995           Z
1792074      Zhian   M       5  2013  0.000267   12995           Z
1792075       Zien   M       5  2013  0.000267   12995           Z
1792076     Zierre   M       5  2013  0.000267   12995           Z
1792077      Zimri   M       5  2013  0.000267   12995           Z
1792078     Ziquan   M       5  2013  0.000267   12995           Z
1792079     Ziyaad   M       5  2013  0.000267   12995           Z
1792080     Ziyang   M       5  2013  0.000267   12995           Z
1792081      Zmari   M       5  2013  0.000267   12995           Z
1792082      Zolan   M       5  2013  0.000267   12995           Z
1792083     Zurich   M       5  2013  0.000267   12995           Z
1792084      Zyeer   M       5  2013  0.000267   12995           Z
1792085      Zyere   M       5  2013  0.000267   12995           Z
1792086     Zyhier   M       5  2013  0.000267   12995           Z
1792087      Zylar   M       5  2013  0.000267   12995           Z
1792088     Zymari   M       5  2013  0.000267   12995           Z
1792089     Zymeer   M       5  2013  0.000267   12995           Z
1792090      Zyree   M       5  2013  0.000267   12995           Z

[1792091 rows x 7 columns]
In [55]:
print yob[yob.year == 1880]
           name sex  births  year       pct  ranked firstletter
0          Mary   F    7065  1880  7.764334     1.0           M
1          Anna   F    2604  1880  2.861759     2.0           A
2          Emma   F    2003  1880  2.201268     3.0           E
3     Elizabeth   F    1939  1880  2.130933     4.0           E
4        Minnie   F    1746  1880  1.918829     5.0           M
5      Margaret   F    1578  1880  1.734199     6.0           M
6           Ida   F    1472  1880  1.617707     7.0           I
7         Alice   F    1414  1880  1.553966     8.0           A
8        Bertha   F    1320  1880  1.450661     9.0           B
9         Sarah   F    1288  1880  1.415493    10.0           S
10        Annie   F    1258  1880  1.382524    11.0           A
11        Clara   F    1226  1880  1.347356    12.0           C
12         Ella   F    1156  1880  1.270427    13.0           E
13     Florence   F    1063  1880  1.168222    14.0           F
14         Cora   F    1045  1880  1.148440    15.0           C
15       Martha   F    1040  1880  1.142945    16.0           M
16        Laura   F    1012  1880  1.112173    17.0           L
17       Nellie   F     995  1880  1.093491    18.0           N
18        Grace   F     982  1880  1.079204    19.0           G
19       Carrie   F     949  1880  1.042937    20.0           C
20        Maude   F     858  1880  0.942930    21.0           M
21        Mabel   F     808  1880  0.887980    22.0           M
22       Bessie   F     796  1880  0.874793    23.0           B
23       Jennie   F     793  1880  0.871496    24.0           J
24     Gertrude   F     787  1880  0.864902    25.0           G
25        Julia   F     783  1880  0.860506    26.0           J
26       Hattie   F     769  1880  0.845120    27.0           H
27        Edith   F     768  1880  0.844021    28.0           E
28       Mattie   F     704  1880  0.773686    29.0           M
29         Rose   F     700  1880  0.769290    30.0           R
...         ...  ..     ...   ...       ...     ...         ...
1970      Philo   M       5  1880  0.004525   983.5           P
1971    Phineas   M       5  1880  0.004525   983.5           P
1972    Presley   M       5  1880  0.004525   983.5           P
1973     Ransom   M       5  1880  0.004525   983.5           R
1974      Reece   M       5  1880  0.004525   983.5           R
1975       Rene   M       5  1880  0.004525   983.5           R
1976    Roswell   M       5  1880  0.004525   983.5           R
1977    Rowland   M       5  1880  0.004525   983.5           R
1978    Sampson   M       5  1880  0.004525   983.5           S
1979     Samual   M       5  1880  0.004525   983.5           S
1980     Santos   M       5  1880  0.004525   983.5           S
1981   Schuyler   M       5  1880  0.004525   983.5           S
1982   Sheppard   M       5  1880  0.004525   983.5           S
1983   Spurgeon   M       5  1880  0.004525   983.5           S
1984   Starling   M       5  1880  0.004525   983.5           S
1985   Sylvanus   M       5  1880  0.004525   983.5           S
1986   Theadore   M       5  1880  0.004525   983.5           T
1987  Theophile   M       5  1880  0.004525   983.5           T
1988     Tilmon   M       5  1880  0.004525   983.5           T
1989      Tommy   M       5  1880  0.004525   983.5           T
1990    Unknown   M       5  1880  0.004525   983.5           U
1991       Vann   M       5  1880  0.004525   983.5           V
1992        Wes   M       5  1880  0.004525   983.5           W
1993    Winston   M       5  1880  0.004525   983.5           W
1994       Wood   M       5  1880  0.004525   983.5           W
1995     Woodie   M       5  1880  0.004525   983.5           W
1996     Worthy   M       5  1880  0.004525   983.5           W
1997     Wright   M       5  1880  0.004525   983.5           W
1998       York   M       5  1880  0.004525   983.5           Y
1999  Zachariah   M       5  1880  0.004525   983.5           Z

[2000 rows x 7 columns]
In [29]:
print yob[(yob['name'].str.startswith('A')) & (yob.year == 1880)]
           name sex  births  year       pct  ranked
1          Anna   F    2604  1880  2.861759     2.0
7         Alice   F    1414  1880  1.553966     8.0
10        Annie   F    1258  1880  1.382524    11.0
32          Ada   F     652  1880  0.716539    33.0
53        Agnes   F     473  1880  0.519820    54.0
81         Alma   F     277  1880  0.304419    82.0
82        Addie   F     274  1880  0.301122    83.0
88       Amanda   F     241  1880  0.264856    89.0
95       Amelia   F     221  1880  0.242876    96.5
107         Amy   F     167  1880  0.183531   108.0
112     Augusta   F     151  1880  0.165947   113.0
119        Anne   F     136  1880  0.149462   120.0
121         Ann   F     131  1880  0.143967   123.5
143       Allie   F     105  1880  0.115393   144.5
150        Alta   F      91  1880  0.100008   151.0
170     Alberta   F      76  1880  0.083523   171.5
177       Abbie   F      71  1880  0.078028   178.0
185    Adelaide   F      65  1880  0.071434   188.0
207     Adeline   F      54  1880  0.059345   209.0
239       Adele   F      41  1880  0.045058   243.0
253       Angie   F      36  1880  0.039563   257.0
287       Artie   F      29  1880  0.031871   290.5
293      Alvina   F      28  1880  0.030772   299.0
294     Annette   F      28  1880  0.030772   299.0
308      Adella   F      26  1880  0.028574   312.5
309       Alpha   F      26  1880  0.028574   312.5
316    Angeline   F      25  1880  0.027475   321.0
325        Adah   F      24  1880  0.026376   329.0
332     Adaline   F      23  1880  0.025277   337.0
351      Almeda   F      21  1880  0.023079   359.0
...         ...  ..     ...   ...       ...     ...
1690      Arlie   M       7  1880  0.006335   776.5
1750      Adolf   M       6  1880  0.005430   858.5
1751      Albin   M       6  1880  0.005430   858.5
1752     Albion   M       6  1880  0.005430   858.5
1753    Allison   M       6  1880  0.005430   858.5
1754      Alpha   M       6  1880  0.005430   858.5
1755    Alpheus   M       6  1880  0.005430   858.5
1756  Anastacio   M       6  1880  0.005430   858.5
1757      Andre   M       6  1880  0.005430   858.5
1758      Annie   M       6  1880  0.005430   858.5
1759  Arlington   M       6  1880  0.005430   858.5
1760     Armand   M       6  1880  0.005430   858.5
1761    Asberry   M       6  1880  0.005430   858.5
1762     Asbury   M       6  1880  0.005430   858.5
1763      Asher   M       6  1880  0.005430   858.5
1764   Augustin   M       6  1880  0.005430   858.5
1765     Auther   M       6  1880  0.005430   858.5
1766     Author   M       6  1880  0.005430   858.5
1850         Ab   M       5  1880  0.004525   983.5
1851     Abbott   M       5  1880  0.004525   983.5
1852    Agustus   M       5  1880  0.004525   983.5
1853   Albertus   M       5  1880  0.004525   983.5
1854      Almer   M       5  1880  0.004525   983.5
1855   Alphonso   M       5  1880  0.004525   983.5
1856      Alvia   M       5  1880  0.004525   983.5
1857      Artie   M       5  1880  0.004525   983.5
1858      Arvid   M       5  1880  0.004525   983.5
1859      Ashby   M       5  1880  0.004525   983.5
1860    Augusta   M       5  1880  0.004525   983.5
1861    Aurthur   M       5  1880  0.004525   983.5

[190 rows x 6 columns]
In [30]:
yob['firstletter'] = yob['name'].str[0]
In [89]:
# Now let's add dummy data to sanitize things. 
temp_frame = pd.DataFrame(columns=yob.columns.values.tolist(), index=range((2014 - 1880)* 26))
for y in range(1880, 2014):
    for c in range(65, 91):
        ind = 26*(y - 1880) + c - 65
        temp_frame.ix[ind].year = y
        temp_frame.ix[ind].firstletter = chr(c)
        temp_frame.ix[ind].name = chr(c)
        temp_frame.ix[ind].sex = 'F'
        temp_frame.ix[ind].births = 0
        temp_frame.ix[ind].pct = 0
        temp_frame.ix[ind].ranked = 0
print temp_frame.ix[1]
name            NaN
sex               F
births            0
year           1880
pct               0
ranked            0
firstletter       B
Name: 1, dtype: object
In [88]:
yob_aggregated = yob.groupby(['year', 'firstletter', 'sex']).sum().reset_index()
In [90]:
yob_sanitized = yob_aggregated.append(temp_frame)
print yob_sanitized
     births firstletter name         pct   ranked sex  year
0      9334           A  NaN    10.25793    43318   F  1880
1      7406           A  NaN    6.702808  54054.5   M  1880
2      3876           B  NaN    4.259668  12487.5   F  1880
3      2115           B  NaN    1.914183  37631.5   M  1880
4      5868           C  NaN    6.448848    36263   F  1880
5      9949           C  NaN    9.004353  40421.5   M  1880
6      2218           D  NaN     2.43755  21554.5   F  1880
7      2488           D  NaN    2.251767    24340   M  1880
8     11444           E  NaN    12.57679    42338   F  1880
9      6894           E  NaN    6.239422    43971   M  1880
10     2957           F  NaN    3.249701    11487   F  1880
11     6529           F  NaN    5.909079  16127.5   M  1880
12     2463           G  NaN    2.706802  12885.5   F  1880
13     6274           G  NaN    5.678291    21138   M  1880
14     2743           H  NaN    3.014518    11620   F  1880
15     7599           H  NaN    6.877483  43150.5   M  1880
16     2480           I  NaN    2.725484    12726   F  1880
17      947           I  NaN   0.8570834    11240   M  1880
18     3800           J  NaN    4.176145  20455.5   F  1880
19    22272           J  NaN     20.1573  23451.5   M  1880
20     1514           K  NaN    1.663864   5389.5   F  1880
21      106           K  NaN  0.09593542   5792.5   M  1880
22     8713           L  NaN    9.575462  48888.5   F  1880
23     4086           L  NaN    3.698039  31886.5   M  1880
24    19779           M  NaN    21.73684  46765.5   F  1880
25     3166           M  NaN    2.865392  38498.5   M  1880
26     3026           N  NaN    3.325531  11614.5   F  1880
27      893           N  NaN   0.8082106    12978   M  1880
28      968           O  NaN    1.063818    14405   F  1880
29     1736           O  NaN    1.571169    21460   M  1880
...     ...         ...  ...         ...      ...  ..   ...
3454      0           W  NaN           0        0   F  2012
3455      0           X  NaN           0        0   F  2012
3456      0           Y  NaN           0        0   F  2012
3457      0           Z  NaN           0        0   F  2012
3458      0           A  NaN           0        0   F  2013
3459      0           B  NaN           0        0   F  2013
3460      0           C  NaN           0        0   F  2013
3461      0           D  NaN           0        0   F  2013
3462      0           E  NaN           0        0   F  2013
3463      0           F  NaN           0        0   F  2013
3464      0           G  NaN           0        0   F  2013
3465      0           H  NaN           0        0   F  2013
3466      0           I  NaN           0        0   F  2013
3467      0           J  NaN           0        0   F  2013
3468      0           K  NaN           0        0   F  2013
3469      0           L  NaN           0        0   F  2013
3470      0           M  NaN           0        0   F  2013
3471      0           N  NaN           0        0   F  2013
3472      0           O  NaN           0        0   F  2013
3473      0           P  NaN           0        0   F  2013
3474      0           Q  NaN           0        0   F  2013
3475      0           R  NaN           0        0   F  2013
3476      0           S  NaN           0        0   F  2013
3477      0           T  NaN           0        0   F  2013
3478      0           U  NaN           0        0   F  2013
3479      0           V  NaN           0        0   F  2013
3480      0           W  NaN           0        0   F  2013
3481      0           X  NaN           0        0   F  2013
3482      0           Y  NaN           0        0   F  2013
3483      0           Z  NaN           0        0   F  2013

[10381 rows x 7 columns]
In [92]:
# sanity check to make sure that column percent adds up to 100
letters = yob_aggregated[yob_aggregated.year == 1880]['firstletter']
print yob_aggregated[(yob_aggregated.year == 1884) & (yob_aggregated.sex == 'F')]['pct'].sum()
100.0
In [97]:
print yob_aggregated[(yob_aggregated.year == 1884) & (yob_aggregated.sex == 'F')][['firstletter', 'pct']]
    firstletter        pct
197           A  10.326921
199           B   4.654245
201           C   6.137713
203           D   2.371689
205           E  12.761390
207           F   3.252159
209           G   2.937484
211           H   3.079320
213           I   2.626684
215           J   4.060548
217           K   1.527646
219           L   9.493730
221           M  20.987894
223           N   3.279286
225           O   1.258700
227           P   1.464091
229           Q   0.024802
231           R   3.056068
233           S   3.940413
235           T   0.654152
237           U   0.028677
239           V   1.211421
241           W   0.605323
243           Y   0.003875
245           Z   0.255770
In [93]:
import bokeh
bokeh.load_notebook()