#Python/Pandas script to analyze baby names database 1880-2012
#from the U.S. Social Security Administration
#
#By David Taylor Feb. 2014
#
#www.prooffreader.com (for blogged results)
#
#prooffreaderplus.blogspot.com (for scripts, calculations, links, gits, etc.)
#
#Disclaimer: not a professional programmer, more interested right now in results in a reasonable time.
# That said, constructive critique and suggestions are always totally welcome. I'm not proud.
# In particular, there is a lot of very needless duplication of boy and girl databases and code
# in loops that refer to them, when I could just subset a larger database every time,
# but I have enough memory in my computer and not enough in my head so I just found it easier
# to work in this inefficient fashion. Your mileage may vary.
# Note that I usually use column names instead of indexes, my brain just deals with them better
# right now, as I get more used to pandas I'm already starting to adapt.
# Also, sometimes I just print-dump everything to a csv file and work with it in Excel. Sorry!
#Instructions:
#1. Download data set at (as of Feb. 2014) http://www.ssa.gov/OACT/babynames/names.zip
#2. Unzip into a working directory
#3. Change the working directory strings in this script.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re as re
import scipy
import os
# Load every yearly SSA file (yob1880.txt ... yob2012.txt) and stack them into
# one data frame. The pattern is adapted from O'Reilly's "Python for Data Analysis".
os.chdir("C:/_Dropbox/Dropbox/py/babynames/yobs") #change this to your working directory
yobcolumns = ['name', 'sex', 'births']
years = range(1880, 2013) # the upper bound is exclusive, so the last year read is 2012
pieces = []
for year in years:
    path = 'yob%d.txt' % year
    frame = pd.read_csv(path, names=yobcolumns)
    # The year is not one of the columns read, so tag every row with it.
    frame['year'] = year
    pieces.append(frame)
# ignore_index=True gives the combined frame a fresh 0..n-1 index.
df = pd.concat(pieces, ignore_index=True)
# Step back up to the project directory; figures saved later land here.
os.chdir("C:/_Dropbox/Dropbox/py/babynames/") #change this to your working directory
#add column 'pct' that is the number of births of that name and sex in that year
#divided by the total number of births of that sex in that year, multiplied by
#100 to turn into a percentage and reduce all those leading zeroes
def add_pct(group):
    """Return a copy of *group* with a 'pct' column added.

    'pct' is each row's 'births' divided by the total 'births' in the
    group, multiplied by 100. Intended for use with
    ``df.groupby([...]).apply(add_pct)``.

    The input is copied first: functions passed to ``groupby.apply``
    should not mutate the group they are handed.
    """
    group = group.copy()
    births = group.births.astype(float)  # float so the division is never integer division
    group['pct'] = births / births.sum() * 100
    return group
# Percentage share of every name within its (year, sex) group.
df = df.groupby(['year', 'sex']).apply(add_pct)
# Rank of every name within its (year, sex) group; the most births gets rank 1.
# rank() uses its default tie handling, so tied names share an averaged rank.
df['ranked'] = df.groupby(['year', 'sex'])['births'].rank(ascending=False)
# Girls-only and boys-only views of the full frame.
dff = df[df['sex'] == 'F']
dfm = df[df['sex'] == 'M']
#create names dataframe. This DF discards individual birth or pct values, and instead collects data on unique names.
#There is one row per unique combination of name and sex.
def _summarize_names(frame, sex):
    """Return one row per name in *frame* (which must hold a single sex).

    Columns: name, sex, year_count (number of rows, i.e. years, the name
    appears in), year_min, year_max, pct_sum and pct_max.
    """
    by_name = frame.groupby('name')
    # value_counts() sets the row order (names present in the most years first).
    # Building the frame from a dict names the column explicitly; passing a
    # Series plus columns=[...] selects by label and can yield an all-NaN column.
    summary = pd.DataFrame({'year_count': frame['name'].value_counts()})
    # Column assignment aligns on the index (the name), so row order is kept.
    summary['year_min'] = by_name['year'].min()
    summary['year_max'] = by_name['year'].max()
    summary['pct_sum'] = by_name['pct'].sum()
    summary['pct_max'] = by_name['pct'].max()
    summary['sex'] = sex
    summary = summary.reset_index()
    # The label reset_index() gives the old index varies, so set all labels explicitly.
    summary.columns = ['name', 'year_count', 'year_min', 'year_max', 'pct_sum', 'pct_max', 'sex']
    return summary[['name', 'sex', 'year_count', 'year_min', 'year_max', 'pct_sum', 'pct_max']]

temp_f = _summarize_names(dff, "F")
temp_m = _summarize_names(dfm, "M")
names = pd.concat([temp_f, temp_m], ignore_index=True)
# create years dataframe. This DF discards individual name data, aggregating by year.
#births dataframes, just the number of births per year (rows: year, columns: sex)
# groupby/unstack is used instead of pivot_table(rows=..., cols=...) because
# those keyword names are not accepted by current pandas.
births = df.groupby(['year', 'sex'])['births'].sum().unstack('sex')
births_f = dff.groupby(['year', 'sex'])['births'].sum().unstack('sex')
births_m = dfm.groupby(['year', 'sex'])['births'].sum().unstack('sex')

# Total births per year: girls, boys and both together.
total = births.reset_index()
total.columns = ['year', 'births_f', 'births_m']
total['births_t'] = total.births_f + total.births_m

# Number of name-sex combinations whose first recorded year is each year.
first_year_counts = names.groupby('year_min')['year_min'].count()
newnames = pd.DataFrame({'year': first_year_counts.index.values,
                         'new_names': first_year_counts.values},
                        columns=['year', 'new_names'])

# Number of distinct names (both sexes pooled) recorded in each year.
# One grouped pass replaces the per-year filter plus row-by-row append.
distinct_per_year = df.groupby('year')['name'].agg(lambda s: len(s.unique()))
uniquenames = pd.DataFrame({'year': distinct_per_year.index.values,
                            'unique_names': distinct_per_year.values},
                           columns=['year', 'unique_names'])

years = pd.merge(left=total, right=newnames, on='year')
years = pd.merge(left=years, right=uniquenames, on='year')
# It takes my $400 Acer desktop computer about 15 seconds for this cell. If you want to compare, put the following, uncommented,
# as the first line:
# %%timeit
# Some custom functions to help exploration:
def headtail(df, num=5):
    """Return the first and last *num* rows of *df* as one object.

    arguments: dataframe, integer number of rows in each of head and tail.
    The original index labels are kept. When *df* has at most 2*num rows,
    a copy of the whole of *df* is returned, so no row appears twice.
    """
    if num >= 0 and len(df) <= 2 * num:
        # head and tail would overlap (or exactly cover df)
        return df.copy()
    return pd.concat([df.head(num), df.tail(num)], ignore_index=False)
def subset(df, returncol='', searchval='', searchcol=''):
    """Select rows and/or one column of *df*.

    df        -- dataframe to select from
    returncol -- column to return; '' returns whole rows
    searchval -- keep only rows where the search column equals this value;
                 '' applies no filter
    searchcol -- column compared with searchval; '' means use returncol

    Returns a dataframe (whole rows) or a single column. Unsupported
    combinations (whole rows without both searchval and searchcol, or a
    searchcol given without a searchval) return the string
    'Error in argument formulation'.
    """
    if returncol == '':
        # Whole rows: both the value and the column to search are required.
        if searchval != '' and searchcol != '':
            return df[df[searchcol] == searchval]
        return 'Error in argument formulation'
    if searchval == '':
        # No filter requested: a search column without a value is an error.
        if searchcol == '':
            return df[returncol]
        return 'Error in argument formulation'
    if searchcol == '':
        searchcol = returncol
    return df[df[searchcol] == searchval][returncol]
#Main dataframe of SSA data: one row per name, sex and year.
# 'pct' is the name's share of that sex's births in that year, as a percentage;
# 'ranked' is its rank by births within that year and sex (1 = most births).
# headtail() with its default num shows the first five and last five rows.
headtail(df)
| | name | sex | births | year | pct | ranked |
|---|---|---|---|---|---|---|
0 | Mary | F | 7065 | 1880 | 7.764334 | 1 |
1 | Anna | F | 2604 | 1880 | 2.861759 | 2 |
2 | Emma | F | 2003 | 1880 | 2.201268 | 3 |
3 | Elizabeth | F | 1939 | 1880 | 2.130933 | 4 |
4 | Minnie | F | 1746 | 1880 | 1.918829 | 5 |
1758725 | Zylin | M | 5 | 2012 | 0.000266 | 13166 |
1758726 | Zymari | M | 5 | 2012 | 0.000266 | 13166 |
1758727 | Zyrin | M | 5 | 2012 | 0.000266 | 13166 |
1758728 | Zyrus | M | 5 | 2012 | 0.000266 | 13166 |
1758729 | Zytaevius | M | 5 | 2012 | 0.000266 | 13166 |
# Summary statistics of the numeric columns. print(...) with a single
# argument behaves the same on Python 2 and Python 3.
print(df.describe())
births year pct ranked count 1758730.000000 1758730.000000 1758730.000000 1758730.000000 mean 187.499369 1971.073360 0.015125 5114.759420 std 1590.400193 33.184493 0.125716 4316.878964 min 5.000000 1880.000000 0.000232 1.000000 25% 7.000000 1947.000000 0.000417 1728.000000 50% 12.000000 1980.000000 0.000790 3875.000000 75% 32.000000 1999.000000 0.002521 7395.000000 max 99685.000000 2012.000000 8.738268 19076.000000
#for comparison with Dept of Health and Human Services database:
#normally this would be better as a dict, but for brevity's sake I output as a list because the DHS data is in Excel
#and I already have an excel macro to parse a list like this.
# The list alternates year and total births: [1910, total, 1911, total, ...].
dhscompare = []
for yr in range(1910, 2010):
    yearly_births = subset(df, 'births', yr, 'year')
    dhscompare.extend([yr, yearly_births.sum()])
dhscompare
[1910, 590696, 1911, 644242, 1912, 987950, 1913, 1136990, 1914, 1416254, 1915, 1832497, 1916, 1934414, 1917, 2006750, 1918, 2171217, 1919, 2110293, 1920, 2262790, 1921, 2334387, 1922, 2289215, 1923, 2302408, 1924, 2381572, 1925, 2333195, 1926, 2295691, 1927, 2318839, 1928, 2260302, 1929, 2191451, 1930, 2222667, 1931, 2103195, 1932, 2110686, 1933, 1998556, 1934, 2076055, 1935, 2089180, 1936, 2077021, 1937, 2129965, 1938, 2211879, 1939, 2202888, 1940, 2302046, 1941, 2435671, 1942, 2731210, 1943, 2821662, 1944, 2689449, 1945, 2652581, 1946, 3194707, 1947, 3601997, 1948, 3450802, 1949, 3481993, 1950, 3504681, 1951, 3677885, 1952, 3797756, 1953, 3849398, 1954, 3978865, 1955, 4012414, 1956, 4121009, 1957, 4199845, 1958, 4131232, 1959, 4156399, 1960, 4154859, 1961, 4139305, 1962, 4035299, 1963, 3958833, 1964, 3887558, 1965, 3626291, 1966, 3475626, 1967, 3395191, 1968, 3378931, 1969, 3476680, 1970, 3607444, 1971, 3432364, 1972, 3143737, 1973, 3017152, 1974, 3040252, 1975, 3019558, 1976, 3034648, 1977, 3176263, 1978, 3173337, 1979, 3326319, 1980, 3442974, 1981, 3457508, 1982, 3505823, 1983, 3461025, 1984, 3486048, 1985, 3566396, 1986, 3554197, 1987, 3603080, 1988, 3691763, 1989, 3843030, 1990, 3949807, 1991, 3893707, 1992, 3839222, 1993, 3768219, 1994, 3715625, 1995, 3660115, 1996, 3645024, 1997, 3623401, 1998, 3675594, 1999, 3691023, 2000, 3776494, 2001, 3739758, 2002, 3734405, 2003, 3798084, 2004, 3816078, 2005, 3839196, 2006, 3949493, 2007, 3989796, 2008, 3921184, 2009, 3809014]
#verify that dff and dfm are just df split into sex == F and M, respectively.
# Every row of df should be in exactly one of the two subsets, so the row
# counts must add up. Single-argument print(...) works on Python 2 and 3.
n_girls, n_boys, n_all = len(dff), len(dfm), len(df)
if n_all == n_girls + n_boys:
    print("Dataframes add up correctly: " + str(n_girls) + " girls + " + str(n_boys) + " boys = " + str(n_all) + " total.")
else:
    print("Dataframes do not add up: " + str(n_girls) + " girls + " + str(n_boys) + " boys != " + str(n_all) + " total; difference - " + str(n_all - n_boys - n_girls))
Dataframes add up correctly: 1043165 girls + 715565 boys = 1758730 total.
#names dataframe
#there are 101261 unique name-sex combinations (rows 0 through 101260); e.g., "Jeon" for a boy occurred once only, in 1999.
#per-year births and rank have been discarded (pct is kept only as a sum and a maximum),
#but they can easily be looked up from df, dff or dfm.
headtail(names)
| | name | sex | year_count | year_min | year_max | pct_sum | pct_max |
|---|---|---|---|---|---|---|---|
0 | Georgianna | F | 133 | 1880 | 2012 | 1.047215 | 0.031525 |
1 | Winifred | F | 133 | 1880 | 2012 | 6.207711 | 0.141134 |
2 | Joan | F | 133 | 1880 | 2012 | 37.098464 | 1.972498 |
3 | Miriam | F | 133 | 1880 | 2012 | 8.853062 | 0.149273 |
4 | Alva | F | 133 | 1880 | 2012 | 1.165021 | 0.031943 |
101256 | Zelda | M | 1 | 1932 | 1932 | 0.000479 | 0.000479 |
101257 | Schell | M | 1 | 1962 | 1962 | 0.000338 | 0.000338 |
101258 | Aadvik | M | 1 | 2012 | 2012 | 0.000320 | 0.000320 |
101259 | Mikequan | M | 1 | 1995 | 1995 | 0.000263 | 0.000263 |
101260 | Jeon | M | 1 | 1999 | 1999 | 0.000261 | 0.000261 |
# Summary statistics of the per-name columns. print(...) with a single
# argument behaves the same on Python 2 and Python 3.
print(names.describe())
year_count year_min year_max pct_sum pct_max count 101261.000000 101261.000000 101261.000000 101261.000000 101261.000000 mean 17.368286 1970.788675 1996.402603 0.262688 0.008826 std 26.002521 36.164306 23.476958 4.739686 0.104103 min 1.000000 1880.000000 1880.000000 0.000232 0.000232 25% 2.000000 1951.000000 1991.000000 0.000599 0.000344 50% 6.000000 1983.000000 2008.000000 0.002548 0.000556 75% 21.000000 1999.000000 2012.000000 0.013662 0.001301 max 133.000000 2012.000000 2012.000000 558.006916 8.738268
#years dataframe: one row per year.
# births_f / births_m are total births per sex and births_t is their sum;
# new_names counts name-sex combinations whose first recorded year is that year;
# unique_names counts the distinct names recorded in that year.
headtail(years)
| | year | births_f | births_m | births_t | new_names | unique_names |
|---|---|---|---|---|---|---|
0 | 1880 | 90993 | 110491 | 201484 | 2000 | 1889 |
1 | 1881 | 91955 | 100746 | 192701 | 310 | 1830 |
2 | 1882 | 107850 | 113687 | 221537 | 302 | 2012 |
3 | 1883 | 112322 | 104630 | 216952 | 195 | 1962 |
4 | 1884 | 129022 | 114445 | 243467 | 233 | 2158 |
128 | 2008 | 1886109 | 2035075 | 3921184 | 2047 | 32474 |
129 | 2009 | 1831382 | 1977632 | 3809014 | 1787 | 32203 |
130 | 2010 | 1770632 | 1911572 | 3682204 | 1634 | 31581 |
131 | 2011 | 1750078 | 1889557 | 3639635 | 1534 | 31388 |
132 | 2012 | 1743626 | 1877705 | 3621331 | 1506 | 31109 |
# Summary statistics of the per-year columns. print(...) with a single
# argument behaves the same on Python 2 and Python 3.
print(years.describe())
year births_f births_m births_t new_names unique_names count 133.00000 133.000000 133.000000 133.000000 133.000000 133.000000 mean 1946.00000 1229499.000000 1249905.248120 2479404.248120 761.360902 12052.827068 std 38.53786 644533.683153 739035.563215 1382297.397633 637.666524 8597.344768 min 1880.00000 90993.000000 100746.000000 192701.000000 103.000000 1830.000000 25% 1913.00000 624463.000000 512527.000000 1136990.000000 215.000000 6263.000000 50% 1946.00000 1457578.000000 1559084.000000 3017152.000000 467.000000 9516.000000 75% 1979.00000 1770632.000000 1902493.000000 3675594.000000 1208.000000 17325.000000 max 2012.00000 2044078.000000 2155767.000000 4199845.000000 2189.000000 32474.000000
#Make dataframe and graph of first-ranked
# Rows whose rank is exactly 1, i.e. the top name of each year for each sex.
# NOTE(review): ranks are averaged on ties, so a year with a tie for first
# place would have no row with ranked == 1 - confirm this never occurs.
rank1m = dfm[dfm.ranked == 1]
rank1f = dff[dff.ranked == 1]
# np.zeros replaces scipy.zeros, a deprecated alias of the NumPy function.
zeroline = np.zeros(len(range(1880, 2013)))


def _plot_top_name_share(top, color, label, title, ylabel):
    """Plot the yearly pct of the #1 name in *top* as a filled line chart."""
    # plt.figure: a bare figure() only exists when pylab has been star-imported.
    plt.figure(num=None, figsize=(15, 6), dpi=150, facecolor='w', edgecolor='k')
    plt.plot(top.year, top.pct, color=color, linewidth=2, label=label)
    plt.fill_between(top.year, top.pct, color=color, alpha=0.1, interpolate=True)
    plt.xlim(1880, 2012)
    plt.ylim(0, 9)  # same scale for both sexes so the charts are comparable
    plt.xticks(np.arange(1880, 2012, 10))
    plt.title(title, size=18, color=color)
    plt.xlabel('Year', size=15)
    plt.ylabel(ylabel, size=15)
    plt.show()
    plt.close()


_plot_top_name_share(rank1m, "blue", 'Boys',
                     "Popularity of #1 boys' name by year", '% of male births')
_plot_top_name_share(rank1f, "red", 'Girls',
                     "Popularity of #1 girls' name by year", '% of female births')
# make graphs of gender miscategorizations for all names that were #1 ranked in any year
# For each name two stacked panels are drawn and saved as fm_<name>.png:
#   top    - pct of births for the sex that dominates the name
#   bottom - the other sex's pct expressed as a percentage of the dominant sex's pct
namelist = list(rank1m.name.unique()) + list(rank1f.name.unique())
recent_years = [1980, 1985, 1990, 1995, 2000, 2005, 2010]
for nm in namelist:
    # Year-by-sex table of pct for this name. groupby/unstack replaces
    # pivot_table(rows=..., cols=...), whose keywords current pandas rejects.
    f_or_m = df[df.name == nm].groupby(['year', 'sex'])['pct'].mean().unstack('sex')
    if 'F' not in f_or_m.columns or 'M' not in f_or_m.columns:
        continue  # name never recorded for one of the sexes: nothing to compare
    # Keep only years in which the name was recorded for both sexes.
    f_or_m = f_or_m.dropna()
    if len(f_or_m) == 0:
        continue
    f_or_m['temp'] = f_or_m.F / f_or_m.M
    f_or_m['sex_max'] = f_or_m['temp'].apply(lambda x: 'F' if x >= 1 else 'M')
    # Exponent -1 inverts the F/M ratio when F dominates, so the ratio is
    # always minority sex over majority sex.
    f_or_m['temp2'] = f_or_m['sex_max'].apply(lambda x: -1 if x == 'F' else 1)
    f_or_m['pctratio'] = 100 * (f_or_m.temp ** f_or_m.temp2)
    # The dominant sex is taken from the first remaining year only.
    majority_sex = f_or_m.iloc[0]['sex_max']
    x = f_or_m.index
    y1 = f_or_m[majority_sex]
    y2 = f_or_m.pctratio
    sexvar = 'female' if majority_sex == 'M' else 'male'
    plt.figure(num=None, figsize=(6,4), dpi=150, facecolor='w', edgecolor='k')
    plt.subplot(211)
    plt.xlim(1880,2012)
    plt.xticks([])
    plt.ylabel('% of all births', size=10)
    # Put the label on the right when the name's recent popularity is under
    # half its peak, otherwise on the left. reindex() yields NaN for a sampled
    # year that is absent instead of raising.
    if y1.reindex(recent_years).max() / y1.max() < 0.5:
        plt.annotate(nm, xy=(.96, .92), xycoords='axes fraction', size = 17, horizontalalignment='right', verticalalignment='top')
    else:
        plt.annotate(nm, xy=(.05, .92), xycoords='axes fraction', size = 17, horizontalalignment='left', verticalalignment='top')
    plt.plot(x, y1, color='black', linewidth = 2, label = '2')
    plt.subplot(212)
    plt.xlim(1880,2012)
    plt.ylabel('% ' + nm + ' ' + sexvar, size=10)
    plt.plot(x, y2, color='green', linewidth = 2, label = '2')
    plt.savefig('fm_' + nm + '.png')
    plt.show()
    plt.close()
# Note: algorithm does not work for Ashley, because it changed from a male to a female name (see below)
#Try to solve the mystery of the high error rate of Emma ca. 1900
#posted on Pastebin and prooffreaderplus.blogspot.com
# Boys' rows for 1900, renumbered from 0. reset_index() returns a new frame,
# which avoids modifying a filtered slice in place.
dfm1900 = subset(dfm, '', 1900, 'year').reset_index(drop=True)
# str.contains() takes a regular expression: '^E' = starts with E, 'a$' = ends with a.
dfm1900E = dfm1900[dfm1900.name.str.contains('^E')]
dfm1900E_a = dfm1900E[dfm1900E.name.str.contains('a$')]
dfm1900_a = dfm1900[dfm1900.name.str.contains('a$')]
# Single-argument print(...) works on both Python 2 and Python 3.
print('Boys born in 1900 whose names begin with E:\n')
print(dfm1900E[['name', 'births', 'ranked']].head(60))
print('\n\nBoys born in 1900 whose names end with a:\n')
print(dfm1900_a[['name', 'births', 'ranked']].head(50))
print('\n\nIntersection of both sets:\n')
print(dfm1900E_a[['name', 'births', 'ranked']].head(50))
Boys born in 1900 whose names begin with E: name births ranked 8 Edward 2721 9.0 27 Earl 1037 28.0 29 Ernest 1012 30.0 37 Elmer 699 38.0 44 Eugene 587 45.0 62 Eddie 425 63.0 70 Edgar 374 71.0 73 Edwin 350 74.0 86 Ed 297 87.0 99 Everett 237 100.0 105 Earnest 223 105.0 164 Emil 127 165.5 168 Elbert 125 168.5 172 Ellis 120 173.5 174 Emmett 119 175.0 201 Edmund 94 202.0 208 Ervin 88 209.0 210 Elijah 86 211.5 262 Edd 61 263.5 286 Emory 52 288.0 289 Elwood 51 290.5 306 Edmond 47 306.0 308 Earle 46 310.0 316 Eli 45 315.5 317 Emanuel 45 315.5 325 Erwin 43 327.0 329 Emery 42 333.0 330 Ezra 42 333.0 344 Elmo 40 346.0 350 Elton 39 351.5 361 Elisha 37 360.5 366 Enoch 36 369.5 381 Emerson 34 383.0 385 Elliott 33 388.5 404 Elmore 31 407.5 405 Emile 31 407.5 418 Early 29 421.0 435 Ellsworth 27 437.5 436 Emmet 27 437.5 470 Elvin 24 472.5 471 Evan 24 472.5 479 Elias 23 483.0 480 Emmitt 23 483.0 518 Emma 21 525.5 519 Eric 21 525.5 536 Eldon 20 544.0 537 Elzie 20 544.0 538 Ethel 20 544.0 560 Elie 19 566.0 561 Ezekiel 19 566.0 587 Ennis 18 590.0 607 Eldridge 17 612.5 631 Elizabeth 16 635.5 632 Elza 16 635.5 633 Everette 16 635.5 653 Elsie 15 664.0 654 Emilio 15 664.0 705 Edna 13 720.5 706 Effie 13 720.5 707 Eunice 13 720.5 Boys born in 1900 whose names end with a: name births ranked 106 Ira 218 107.5 236 Alva 73 237.5 330 Ezra 42 333.0 333 Ora 42 333.0 361 Elisha 37 360.5 370 Joshua 36 369.5 413 Asa 30 415.5 454 Anna 25 461.5 491 Bertha 22 503.5 518 Emma 21 525.5 526 Otha 21 525.5 559 Dana 19 566.0 591 Hosea 18 590.0 632 Elza 16 635.5 699 Alma 13 720.5 705 Edna 13 720.5 713 Ida 13 720.5 752 Clara 12 763.5 753 Ella 12 763.5 856 Eliga 10 868.5 892 Rolla 10 868.5 944 Martha 9 934.5 950 Nora 9 934.5 980 Cora 8 1008.5 989 Eva 8 1008.5 1001 Iva 8 1008.5 1008 Lea 8 1008.5 1075 Eligha 7 1105.5 1091 Georgia 7 1105.5 1114 Laura 7 1105.5 1145 Roma 7 1105.5 1163 Alberta 6 1235.5 1167 Alonza 6 1235.5 1168 Alpha 6 1235.5 1176 Augusta 6 1235.5 1241 Julia 6 1235.5 1254 Lonza 6 1235.5 1271 Oda 6 
1235.5 1274 Orla 6 1235.5 1287 Stella 6 1235.5 1351 Claudia 5 1408.5 1361 Dora 5 1408.5 1373 Elva 5 1408.5 1446 Ola 5 1408.5 1471 Rosa 5 1408.5 1486 Theresa 5 1408.5 1496 Viola 5 1408.5 Intersection of both sets: name births ranked 330 Ezra 42 333.0 361 Elisha 37 360.5 518 Emma 21 525.5 632 Elza 16 635.5 705 Edna 13 720.5 753 Ella 12 763.5 856 Eliga 10 868.5 989 Eva 8 1008.5 1075 Eligha 7 1105.5 1373 Elva 5 1408.5
I said in the blog text that nobody (in the database) was named Ashley in 1900. Here's the proof:
NOTE: I wrote this for my first blog post on Feb. 24, 2014. Then I found out that Ashley was a BOYS' name in 1900!
# First five rows for 'Ashley' in the girls' subset.
dff[dff.name == 'Ashley'].head()
| | name | sex | births | year | pct | ranked |
|---|---|---|---|---|---|---|
143423 | Ashley | F | 5 | 1917 | 0.000462 | 4954.0 |
351811 | Ashley | F | 7 | 1938 | 0.000635 | 3647.5 |
379234 | Ashley | F | 6 | 1941 | 0.000497 | 4187.5 |
387796 | Ashley | F | 8 | 1942 | 0.000592 | 3591.0 |
396803 | Ashley | F | 10 | 1943 | 0.000717 | 3142.0 |
# First five rows for 'Ashley' in the boys' subset.
dfm[dfm.name == 'Ashley'].head() #Added on March 3
| | name | sex | births | year | pct | ranked |
|---|---|---|---|---|---|---|
1625 | Ashley | M | 8 | 1880 | 0.007240 | 713 |
3726 | Ashley | M | 6 | 1881 | 0.005956 | 823 |
5723 | Ashley | M | 7 | 1882 | 0.006157 | 795 |
7764 | Ashley | M | 9 | 1883 | 0.008602 | 660 |
10175 | Ashley | M | 6 | 1884 | 0.005243 | 909 |
# plot percent male and female births of Ashley
# Two stacked panels share the same data: the top one at full scale, the
# bottom one with the y axis cut at 0.05 so the small boys' values are visible.
nm = 'Ashley'
# Year-by-sex table of pct; groupby/unstack replaces pivot_table(rows=..., cols=...),
# whose keywords current pandas rejects.
f_and_m = df[df.name == nm].groupby(['year', 'sex'])['pct'].mean().unstack('sex')
# plt.figure: a bare figure() only exists when pylab has been star-imported.
plt.figure(num=None, figsize=(8,7), dpi=150, facecolor='w', edgecolor='k')
# No xlim call before the first subplot: it would create an extra axes.
plt.subplot(211)
plt.title("Ashley used to be a boys' name\n", size=20)
plt.xlim(1880, 2012)
plt.ylabel('% of births', size=10)
plt.plot(f_and_m.index, f_and_m.F, color='#cc0000', linewidth = 2, label = 'Girls')
plt.plot(f_and_m.index, f_and_m.M, color='#0000cc', linewidth = 2, label = 'Boys')
plt.legend(loc = 'upper left')
plt.subplot(212)
plt.ylim(0, 0.05)
plt.xlim(1880, 2012)
plt.ylabel('% of births', size=10)
plt.plot(f_and_m.index, f_and_m.F, color='#cc0000', linewidth = 2, label = 'Girls')
plt.plot(f_and_m.index, f_and_m.M, color='#0000cc', linewidth = 2, label = 'Boys')
plt.annotate('magnification of top graph\n(10% of height of first tick mark)\nto see male names', xy=(.03, .95), xycoords='axes fraction', size = 12, horizontalalignment='left', verticalalignment='top')
plt.savefig('ashleyMF.png')
plt.show()
plt.close()
# Plot the yearly popularity (pct of births of the chosen sex) of one name.
name_chosen = "Sigourney"
sex_chosen = "F"
if sex_chosen == "F":
    y_axis_sex = "female"
else:
    y_axis_sex = "male"
# Rows for the chosen name and sex only.
graphdf = df[(df.sex == sex_chosen) & (df.name == name_chosen)]
# plt.figure: a bare figure() only exists when pylab has been star-imported.
plt.figure(num=None, figsize=(11, 8), dpi=300, facecolor='w', edgecolor='w')
plt.plot(graphdf.year, graphdf.pct, color="black", linewidth = 2)
plt.xlim(1880,2012)
# np.arange replaces scipy.arange, a deprecated alias of the NumPy function.
plt.xticks(np.arange(1880,2012,10))
plt.title("Popularity of baby name '" + name_chosen + "', 1880-2012", size=18, color="black")
plt.xlabel('Year', size=15)
plt.ylabel('Percent of ' + y_axis_sex + ' births', size=14)
#plt.savefig(name_chosen + '.png') # un-comment to save graph
plt.show()
plt.close()
#graphdf.to_csv(name_chosen + "_" + sex_chosen + ".csv")
# Plot the yearly popularity (pct of births of the chosen sex) of one name
# and save the figure as <name>.png.
name_chosen = "Sharona"
sex_chosen = "F"
if sex_chosen == "F":
    y_axis_sex = "female"
else:
    y_axis_sex = "male"
# Rows for the chosen name and sex only.
graphdf = df[(df.sex == sex_chosen) & (df.name == name_chosen)]
# plt.figure: a bare figure() only exists when pylab has been star-imported.
plt.figure(num=None, figsize=(8, 4), dpi=300, facecolor='w', edgecolor='w')
plt.plot(graphdf.year, graphdf.pct, color="black", linewidth = 2)
plt.xlim(1880,2012)
# np.arange replaces scipy.arange, a deprecated alias of the NumPy function.
plt.xticks(np.arange(1880,2012,10))
plt.title("Popularity of baby name '" + name_chosen + "', 1880-2012", size=18, color="black")
plt.xlabel('Year', size=15)
plt.ylabel('Percent of ' + y_axis_sex + ' births', size=14)
plt.savefig(name_chosen + '.png') # comment out to skip saving the graph
plt.show()
plt.close()
#graphdf.to_csv(name_chosen + "_" + sex_chosen + ".csv")
#compare two names of same sex
# Both names are drawn on one chart, which is saved as <name>_v_<name2>.png.
name_chosen = "Marilyn"
name_chosen_2 = "Norma"
sex_chosen = "F"
if sex_chosen == "F":
    y_axis_sex = "female"
else:
    y_axis_sex = "male"
# Filter by sex once, then pull out each name from that subset.
same_sex = df[df.sex == sex_chosen]
graphdf = same_sex[same_sex.name == name_chosen]
graphdf2 = same_sex[same_sex.name == name_chosen_2]
# plt.figure: a bare figure() only exists when pylab has been star-imported.
plt.figure(num=None, figsize=(7.5, 5), dpi=300, facecolor='w', edgecolor='w')
plt.plot(graphdf.year, graphdf.pct, color="#6500A4", linewidth = 3, label=name_chosen)
plt.plot(graphdf2.year, graphdf2.pct, color="#005E75", linewidth = 3, label=name_chosen_2)
plt.xlim(1880,2012)
# np.arange replaces scipy.arange, a deprecated alias of the NumPy function.
plt.xticks(np.arange(1880,2012,10))
plt.title("Popularity of U.S. baby names\n" + name_chosen + " and " + name_chosen_2 + ", 1880-2012", size=16, color="black")
plt.xlabel('Year', size=12)
plt.ylabel('Percent of ' + y_axis_sex + ' births', size=14)
plt.legend(loc = 'upper left')
plt.savefig(name_chosen + '_v_' + name_chosen_2 + '.png') # comment out to skip saving the graph
plt.show()
plt.close()
# plot rate of babies named "Unknown" or "Baby"
# Each frame is indexed by year with one column named after the name. Where a
# year has a row for both sexes, the value is the mean of the two pct values.
# groupby/unstack replaces pivot_table(rows=..., cols=...), whose keywords
# current pandas rejects.
Unkn = df[df.name == 'Unknown'].groupby(['year', 'name'])['pct'].mean().unstack('name')
Baby = df[df.name == 'Baby'].groupby(['year', 'name'])['pct'].mean().unstack('name')
# plt.figure: a bare figure() only exists when pylab has been star-imported.
plt.figure(num=None, figsize=(8,4), dpi=150, facecolor='w', edgecolor='k')
plt.title("U.S. babies named 'Unknown' or 'Baby', 1880-2012", size=18)
plt.xlim(1880, 2012)
plt.ylabel('% of births', size=10)
plt.plot(Unkn.index, Unkn.Unknown, color='#660066', linewidth = 2, label = 'Unknown')
plt.plot(Baby.index, Baby.Baby, color='#dd8833', linewidth = 2, label = 'Baby')
plt.legend(loc = 'upper left')
plt.savefig('Unknown_Baby.png')
plt.show()
plt.close()
# Plot how many names appear for the first time each year, per 1000 births.
years2 = years[years.year > 1880] # the first year, 1880, mucks up the 'new names' data
newperthou = years2.new_names * 1000.0 / years2.births_t
# Baseline for the filled area, sized from the data so the two always match.
# np.zeros replaces scipy.zeros, a deprecated alias of the NumPy function.
zeroline = np.zeros(len(years2))
# plt.figure: a bare figure() only exists when pylab has been star-imported.
plt.figure(num=None, figsize=(15, 6), dpi=150, facecolor='w', edgecolor='w')
plt.plot(years2.year, newperthou, color="#0A5711", linewidth = 2)
plt.fill_between(years2.year, zeroline, newperthou, color="#20912B", alpha = 1.0, interpolate=True)
plt.xlim(1880,2012)
plt.xticks(np.arange(1880,2012,10))
plt.title("New names introduced per 1000 births", size=18, color="#0A5711")
plt.xlabel('Year', size=15)
plt.ylabel('Names appearing for first time / 1000 births', size=11)
plt.show()
plt.close()