We are going to play with recreating this animation: http://www.prooffreader.com/2014/04/baby-names-rise-of-n.html
The first step is to get the data, so let's pop over to David Taylor's notebook and download the pickled data files (`yob.pickle`, `names.pickle` and `years.pickle`).
Then let's load those dataframes.
import os

import pandas as pd

# Directory that holds the downloaded pickle files.
data_path = "baby_names"

# NOTE: this changes the process-wide working directory, so every relative
# path used after this point is resolved inside `data_path`.
os.chdir(data_path)

# SECURITY NOTE(review): unpickling can execute arbitrary code. These files
# are an external download, so only load them if the source is trusted.
yob = pd.read_pickle('yob.pickle')
names = pd.read_pickle('names.pickle')
years = pd.read_pickle('years.pickle')

# print() with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare `print yob` statement is a SyntaxError on Python 3.
print(yob)
name sex births year pct ranked firstletter 0 Mary F 7065 1880 7.764334 1 M 1 Anna F 2604 1880 2.861759 2 A 2 Emma F 2003 1880 2.201268 3 E 3 Elizabeth F 1939 1880 2.130933 4 E 4 Minnie F 1746 1880 1.918829 5 M 5 Margaret F 1578 1880 1.734199 6 M 6 Ida F 1472 1880 1.617707 7 I 7 Alice F 1414 1880 1.553966 8 A 8 Bertha F 1320 1880 1.450661 9 B 9 Sarah F 1288 1880 1.415493 10 S 10 Annie F 1258 1880 1.382524 11 A 11 Clara F 1226 1880 1.347356 12 C 12 Ella F 1156 1880 1.270427 13 E 13 Florence F 1063 1880 1.168222 14 F 14 Cora F 1045 1880 1.148440 15 C 15 Martha F 1040 1880 1.142945 16 M 16 Laura F 1012 1880 1.112173 17 L 17 Nellie F 995 1880 1.093491 18 N 18 Grace F 982 1880 1.079204 19 G 19 Carrie F 949 1880 1.042937 20 C 20 Maude F 858 1880 0.942930 21 M 21 Mabel F 808 1880 0.887980 22 M 22 Bessie F 796 1880 0.874793 23 B 23 Jennie F 793 1880 0.871496 24 J 24 Gertrude F 787 1880 0.864902 25 G 25 Julia F 783 1880 0.860506 26 J 26 Hattie F 769 1880 0.845120 27 H 27 Edith F 768 1880 0.844021 28 E 28 Mattie F 704 1880 0.773686 29 M 29 Rose F 700 1880 0.769290 30 R ... ... .. ... ... ... ... ... 
1792061 Zayceon M 5 2013 0.000267 12995 Z 1792062 Zayid M 5 2013 0.000267 12995 Z 1792063 Zaylynn M 5 2013 0.000267 12995 Z 1792064 Zecheriah M 5 2013 0.000267 12995 Z 1792065 Zedric M 5 2013 0.000267 12995 Z 1792066 Zefram M 5 2013 0.000267 12995 Z 1792067 Zekhi M 5 2013 0.000267 12995 Z 1792068 Zenith M 5 2013 0.000267 12995 Z 1792069 Zennon M 5 2013 0.000267 12995 Z 1792070 Zepplin M 5 2013 0.000267 12995 Z 1792071 Zevon M 5 2013 0.000267 12995 Z 1792072 Zhaiden M 5 2013 0.000267 12995 Z 1792073 Zhen M 5 2013 0.000267 12995 Z 1792074 Zhian M 5 2013 0.000267 12995 Z 1792075 Zien M 5 2013 0.000267 12995 Z 1792076 Zierre M 5 2013 0.000267 12995 Z 1792077 Zimri M 5 2013 0.000267 12995 Z 1792078 Ziquan M 5 2013 0.000267 12995 Z 1792079 Ziyaad M 5 2013 0.000267 12995 Z 1792080 Ziyang M 5 2013 0.000267 12995 Z 1792081 Zmari M 5 2013 0.000267 12995 Z 1792082 Zolan M 5 2013 0.000267 12995 Z 1792083 Zurich M 5 2013 0.000267 12995 Z 1792084 Zyeer M 5 2013 0.000267 12995 Z 1792085 Zyere M 5 2013 0.000267 12995 Z 1792086 Zyhier M 5 2013 0.000267 12995 Z 1792087 Zylar M 5 2013 0.000267 12995 Z 1792088 Zymari M 5 2013 0.000267 12995 Z 1792089 Zymeer M 5 2013 0.000267 12995 Z 1792090 Zyree M 5 2013 0.000267 12995 Z [1792091 rows x 7 columns]
# Show only the records for 1880 (print() form works on Python 2 and 3).
print(yob[yob.year == 1880])
name sex births year pct ranked firstletter 0 Mary F 7065 1880 7.764334 1.0 M 1 Anna F 2604 1880 2.861759 2.0 A 2 Emma F 2003 1880 2.201268 3.0 E 3 Elizabeth F 1939 1880 2.130933 4.0 E 4 Minnie F 1746 1880 1.918829 5.0 M 5 Margaret F 1578 1880 1.734199 6.0 M 6 Ida F 1472 1880 1.617707 7.0 I 7 Alice F 1414 1880 1.553966 8.0 A 8 Bertha F 1320 1880 1.450661 9.0 B 9 Sarah F 1288 1880 1.415493 10.0 S 10 Annie F 1258 1880 1.382524 11.0 A 11 Clara F 1226 1880 1.347356 12.0 C 12 Ella F 1156 1880 1.270427 13.0 E 13 Florence F 1063 1880 1.168222 14.0 F 14 Cora F 1045 1880 1.148440 15.0 C 15 Martha F 1040 1880 1.142945 16.0 M 16 Laura F 1012 1880 1.112173 17.0 L 17 Nellie F 995 1880 1.093491 18.0 N 18 Grace F 982 1880 1.079204 19.0 G 19 Carrie F 949 1880 1.042937 20.0 C 20 Maude F 858 1880 0.942930 21.0 M 21 Mabel F 808 1880 0.887980 22.0 M 22 Bessie F 796 1880 0.874793 23.0 B 23 Jennie F 793 1880 0.871496 24.0 J 24 Gertrude F 787 1880 0.864902 25.0 G 25 Julia F 783 1880 0.860506 26.0 J 26 Hattie F 769 1880 0.845120 27.0 H 27 Edith F 768 1880 0.844021 28.0 E 28 Mattie F 704 1880 0.773686 29.0 M 29 Rose F 700 1880 0.769290 30.0 R ... ... .. ... ... ... ... ... 
1970 Philo M 5 1880 0.004525 983.5 P 1971 Phineas M 5 1880 0.004525 983.5 P 1972 Presley M 5 1880 0.004525 983.5 P 1973 Ransom M 5 1880 0.004525 983.5 R 1974 Reece M 5 1880 0.004525 983.5 R 1975 Rene M 5 1880 0.004525 983.5 R 1976 Roswell M 5 1880 0.004525 983.5 R 1977 Rowland M 5 1880 0.004525 983.5 R 1978 Sampson M 5 1880 0.004525 983.5 S 1979 Samual M 5 1880 0.004525 983.5 S 1980 Santos M 5 1880 0.004525 983.5 S 1981 Schuyler M 5 1880 0.004525 983.5 S 1982 Sheppard M 5 1880 0.004525 983.5 S 1983 Spurgeon M 5 1880 0.004525 983.5 S 1984 Starling M 5 1880 0.004525 983.5 S 1985 Sylvanus M 5 1880 0.004525 983.5 S 1986 Theadore M 5 1880 0.004525 983.5 T 1987 Theophile M 5 1880 0.004525 983.5 T 1988 Tilmon M 5 1880 0.004525 983.5 T 1989 Tommy M 5 1880 0.004525 983.5 T 1990 Unknown M 5 1880 0.004525 983.5 U 1991 Vann M 5 1880 0.004525 983.5 V 1992 Wes M 5 1880 0.004525 983.5 W 1993 Winston M 5 1880 0.004525 983.5 W 1994 Wood M 5 1880 0.004525 983.5 W 1995 Woodie M 5 1880 0.004525 983.5 W 1996 Worthy M 5 1880 0.004525 983.5 W 1997 Wright M 5 1880 0.004525 983.5 W 1998 York M 5 1880 0.004525 983.5 Y 1999 Zachariah M 5 1880 0.004525 983.5 Z [2000 rows x 7 columns]
# Show the 1880 records whose name starts with 'A'
# (print() form works on Python 2 and 3).
print(yob[yob['name'].str.startswith('A') & (yob.year == 1880)])
name sex births year pct ranked 1 Anna F 2604 1880 2.861759 2.0 7 Alice F 1414 1880 1.553966 8.0 10 Annie F 1258 1880 1.382524 11.0 32 Ada F 652 1880 0.716539 33.0 53 Agnes F 473 1880 0.519820 54.0 81 Alma F 277 1880 0.304419 82.0 82 Addie F 274 1880 0.301122 83.0 88 Amanda F 241 1880 0.264856 89.0 95 Amelia F 221 1880 0.242876 96.5 107 Amy F 167 1880 0.183531 108.0 112 Augusta F 151 1880 0.165947 113.0 119 Anne F 136 1880 0.149462 120.0 121 Ann F 131 1880 0.143967 123.5 143 Allie F 105 1880 0.115393 144.5 150 Alta F 91 1880 0.100008 151.0 170 Alberta F 76 1880 0.083523 171.5 177 Abbie F 71 1880 0.078028 178.0 185 Adelaide F 65 1880 0.071434 188.0 207 Adeline F 54 1880 0.059345 209.0 239 Adele F 41 1880 0.045058 243.0 253 Angie F 36 1880 0.039563 257.0 287 Artie F 29 1880 0.031871 290.5 293 Alvina F 28 1880 0.030772 299.0 294 Annette F 28 1880 0.030772 299.0 308 Adella F 26 1880 0.028574 312.5 309 Alpha F 26 1880 0.028574 312.5 316 Angeline F 25 1880 0.027475 321.0 325 Adah F 24 1880 0.026376 329.0 332 Adaline F 23 1880 0.025277 337.0 351 Almeda F 21 1880 0.023079 359.0 ... ... .. ... ... ... ... 
1690 Arlie M 7 1880 0.006335 776.5 1750 Adolf M 6 1880 0.005430 858.5 1751 Albin M 6 1880 0.005430 858.5 1752 Albion M 6 1880 0.005430 858.5 1753 Allison M 6 1880 0.005430 858.5 1754 Alpha M 6 1880 0.005430 858.5 1755 Alpheus M 6 1880 0.005430 858.5 1756 Anastacio M 6 1880 0.005430 858.5 1757 Andre M 6 1880 0.005430 858.5 1758 Annie M 6 1880 0.005430 858.5 1759 Arlington M 6 1880 0.005430 858.5 1760 Armand M 6 1880 0.005430 858.5 1761 Asberry M 6 1880 0.005430 858.5 1762 Asbury M 6 1880 0.005430 858.5 1763 Asher M 6 1880 0.005430 858.5 1764 Augustin M 6 1880 0.005430 858.5 1765 Auther M 6 1880 0.005430 858.5 1766 Author M 6 1880 0.005430 858.5 1850 Ab M 5 1880 0.004525 983.5 1851 Abbott M 5 1880 0.004525 983.5 1852 Agustus M 5 1880 0.004525 983.5 1853 Albertus M 5 1880 0.004525 983.5 1854 Almer M 5 1880 0.004525 983.5 1855 Alphonso M 5 1880 0.004525 983.5 1856 Alvia M 5 1880 0.004525 983.5 1857 Artie M 5 1880 0.004525 983.5 1858 Arvid M 5 1880 0.004525 983.5 1859 Ashby M 5 1880 0.004525 983.5 1860 Augusta M 5 1880 0.004525 983.5 1861 Aurthur M 5 1880 0.004525 983.5 [190 rows x 6 columns]
# Tag every record with the first character of its name.
yob['firstletter'] = yob['name'].str[0]

# Now let's add dummy data to sanitize things: one all-zero placeholder row
# for every (year, letter) pair, covering 1880 through 2013 and 'A'..'Z'.
# All placeholder rows carry sex 'F'.
#
# The rows are built up front and handed to the DataFrame constructor in one
# go. Writing cell by cell through `frame.ix[i].col = value` relied on the
# removed `.ix` indexer and on chained assignment, and `row.name = ...` set
# the row Series' own `name` attribute instead of the 'name' column, which
# therefore stayed NaN. Here the 'name' column really holds the letter.
placeholder_first_year = 1880
placeholder_last_year = 2013
placeholder_letters = [chr(code) for code in range(ord('A'), ord('Z') + 1)]
placeholder_rows = [
    {
        'name': letter,
        'sex': 'F',
        'births': 0,
        'year': year,
        'pct': 0,
        'ranked': 0,
        'firstletter': letter,
    }
    # Year-major, letter-minor order, so row i is
    # (1880 + i // 26, letter number i % 26) under the default 0..N-1 index.
    for year in range(placeholder_first_year, placeholder_last_year + 1)
    for letter in placeholder_letters
]
temp_frame = pd.DataFrame(placeholder_rows, columns=yob.columns.values.tolist())

# Spot-check one placeholder row (label 1 -> year 1880, letter 'B').
print(temp_frame.loc[1])
name NaN sex F births 0 year 1880 pct 0 ranked 0 firstletter B Name: 1, dtype: object
# Collapse the per-name records into one row per (year, first letter, sex).
# The numeric columns are selected explicitly so the string 'name' column is
# never fed to sum(); the result has the three keys plus births/pct/ranked.
yob_aggregated = (
    yob.groupby(['year', 'firstletter', 'sex'])[['births', 'pct', 'ranked']]
    .sum()
    .reset_index()
)

# Stack the zero-valued placeholder rows underneath the aggregated rows.
# pd.concat replaces DataFrame.append (removed in pandas 2.0) and, like
# append's default, keeps each frame's own index labels, so labels repeat.
yob_sanitized = pd.concat([yob_aggregated, temp_frame])
print(yob_sanitized)
births firstletter name pct ranked sex year 0 9334 A NaN 10.25793 43318 F 1880 1 7406 A NaN 6.702808 54054.5 M 1880 2 3876 B NaN 4.259668 12487.5 F 1880 3 2115 B NaN 1.914183 37631.5 M 1880 4 5868 C NaN 6.448848 36263 F 1880 5 9949 C NaN 9.004353 40421.5 M 1880 6 2218 D NaN 2.43755 21554.5 F 1880 7 2488 D NaN 2.251767 24340 M 1880 8 11444 E NaN 12.57679 42338 F 1880 9 6894 E NaN 6.239422 43971 M 1880 10 2957 F NaN 3.249701 11487 F 1880 11 6529 F NaN 5.909079 16127.5 M 1880 12 2463 G NaN 2.706802 12885.5 F 1880 13 6274 G NaN 5.678291 21138 M 1880 14 2743 H NaN 3.014518 11620 F 1880 15 7599 H NaN 6.877483 43150.5 M 1880 16 2480 I NaN 2.725484 12726 F 1880 17 947 I NaN 0.8570834 11240 M 1880 18 3800 J NaN 4.176145 20455.5 F 1880 19 22272 J NaN 20.1573 23451.5 M 1880 20 1514 K NaN 1.663864 5389.5 F 1880 21 106 K NaN 0.09593542 5792.5 M 1880 22 8713 L NaN 9.575462 48888.5 F 1880 23 4086 L NaN 3.698039 31886.5 M 1880 24 19779 M NaN 21.73684 46765.5 F 1880 25 3166 M NaN 2.865392 38498.5 M 1880 26 3026 N NaN 3.325531 11614.5 F 1880 27 893 N NaN 0.8082106 12978 M 1880 28 968 O NaN 1.063818 14405 F 1880 29 1736 O NaN 1.571169 21460 M 1880 ... ... ... ... ... ... .. ... 3454 0 W NaN 0 0 F 2012 3455 0 X NaN 0 0 F 2012 3456 0 Y NaN 0 0 F 2012 3457 0 Z NaN 0 0 F 2012 3458 0 A NaN 0 0 F 2013 3459 0 B NaN 0 0 F 2013 3460 0 C NaN 0 0 F 2013 3461 0 D NaN 0 0 F 2013 3462 0 E NaN 0 0 F 2013 3463 0 F NaN 0 0 F 2013 3464 0 G NaN 0 0 F 2013 3465 0 H NaN 0 0 F 2013 3466 0 I NaN 0 0 F 2013 3467 0 J NaN 0 0 F 2013 3468 0 K NaN 0 0 F 2013 3469 0 L NaN 0 0 F 2013 3470 0 M NaN 0 0 F 2013 3471 0 N NaN 0 0 F 2013 3472 0 O NaN 0 0 F 2013 3473 0 P NaN 0 0 F 2013 3474 0 Q NaN 0 0 F 2013 3475 0 R NaN 0 0 F 2013 3476 0 S NaN 0 0 F 2013 3477 0 T NaN 0 0 F 2013 3478 0 U NaN 0 0 F 2013 3479 0 V NaN 0 0 F 2013 3480 0 W NaN 0 0 F 2013 3481 0 X NaN 0 0 F 2013 3482 0 Y NaN 0 0 F 2013 3483 0 Z NaN 0 0 F 2013 [10381 rows x 7 columns]
# Sanity check: for a single year and a single sex, the 'pct' column
# should add up to 100.
# `letters` holds the 'firstletter' value of every aggregated 1880 row.
letters = yob_aggregated[yob_aggregated.year == 1880]['firstletter']
female_1884_mask = (yob_aggregated.year == 1884) & (yob_aggregated.sex == 'F')
print(yob_aggregated[female_1884_mask]['pct'].sum())
100.0
# Per-letter share of female births in 1884
# (print() form works on Python 2 and 3).
print(yob_aggregated[(yob_aggregated.year == 1884) & (yob_aggregated.sex == 'F')][['firstletter', 'pct']])
firstletter pct 197 A 10.326921 199 B 4.654245 201 C 6.137713 203 D 2.371689 205 E 12.761390 207 F 3.252159 209 G 2.937484 211 H 3.079320 213 I 2.626684 215 J 4.060548 217 K 1.527646 219 L 9.493730 221 M 20.987894 223 N 3.279286 225 O 1.258700 227 P 1.464091 229 Q 0.024802 231 R 3.056068 233 S 3.940413 235 T 0.654152 237 U 0.028677 239 V 1.211421 241 W 0.605323 243 Y 0.003875 245 Z 0.255770
# Import bokeh and call its notebook loader.
# NOTE(review): presumably this prepares bokeh for inline display in the
# notebook; `load_notebook` may not exist in every bokeh release - confirm
# against the installed version.
import bokeh
bokeh.load_notebook()