Baby names IPython notebook¶

www.prooffreader.com¶

In [1]:

#Python/Pandas script to analyze baby names database 1880-2012
#from the U.S. Social Security Administration
#
#By David Taylor Feb. 2014
#
#www.prooffreader.com (for blogged results)
#
#prooffreaderplus.blogspot.com (for scripts, calculations, links, gits, etc.)
#
#Disclaimer: not a professional programmer, more interested right now in results in a reasonable time.
#  That said, constructive critique and suggestions are always totally welcome. I'm not proud.
#  In particular, there is a lot of very needless duplication of boy and girl databases and code
#    in loops that refer to them, when I could just subset a larger database every time,
#    but I have enough memory in my computer and not enough in my head so I just found it easier
#    to work in this inefficient fashion. Your mileage may vary.
#  Note that I usually use column names instead of indexes, my brain just deals with them better
#    right now, as I get more used to pandas I'm already starting to adapt.
#  Also, sometimes I just print-dump everything to a csv file and work with it in Excel. Sorry!

#Instructions:
#1. Download data set at (as of Feb. 2014) http://www.ssa.gov/OACT/babynames/names.zip
#2. Unzip into a working directory
#3. Change the working directory strings in this script.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re as re
import scipy
import os

# Step into the folder that holds the unzipped yobYYYY.txt files.
os.chdir("C:/_Dropbox/Dropbox/py/babynames/yobs") #change this to your working directory

#read yob files, arrange into data frames and concatenate into one data frame
#this portion taken from O'Reilly's Python for Data Analysis (2009)
yobcolumns = ['name', 'sex', 'births']
years = range(1880, 2013) #upper bound is exclusive, so the last year read is 2012


def _read_yob(year):
    """Read the yob file for one year and tag every row with that year."""
    frame = pd.read_csv('yob{0}.txt'.format(year), names=yobcolumns)
    frame['year'] = year
    return frame


pieces = []
for year in years:
    pieces.append(_read_yob(year))
df = pd.concat(pieces, ignore_index=True)

# Back to the project folder; relative paths used after this point (e.g. saved
# figures) resolve against it.
os.chdir("C:/_Dropbox/Dropbox/py/babynames/") #change this to your working directory

#add column 'pct' that is the number of births of that name and sex in that year
#divided by the total number of births of that sex in that year, multiplied by
#100 to turn into a percentage and reduce all those leading zeroes
def add_pct(group):
    """Return a copy of `group` with a 'pct' column added.

    'pct' is each row's `births` divided by the sum of `births` over the
    whole group, times 100 (i.e. a percentage of the group's births).

    The input frame is left untouched: the function is meant to be used with
    groupby(...).apply, and mutating the object pandas hands to an applied
    function can produce unexpected results.
    """
    births = group.births.astype(float)
    result = group.copy()
    result['pct'] = births / births.sum() * 100
    return result
# Percentage of each name within its (year, sex) group. This is the vectorised
# equivalent of applying add_pct to every group: transform('sum') broadcasts the
# group total back onto each row, so df keeps its original flat index and
# column order (groupby.apply may prepend the group keys to the index on
# recent pandas versions, which would break the groupby below).
df['pct'] = df.births.astype(float) / df.groupby(['year', 'sex'])['births'].transform('sum') * 100

#add rank of each name each year each sex (1 = most births; ties get the average rank)
df['ranked'] = df.groupby(['year', 'sex'])['births'].rank(ascending=False)

#subset girls and boys
dff = df[df.sex == 'F']
dfm = df[df.sex == 'M']

#create names dataframe. This DF discards individual birth or pct values, and instead collects data on unique names.
#There is one row per unique combination of name and sex.
def _summarize_names(frame, sex):
    """Collapse `frame` (rows of a single sex) to one row per name.

    Returned columns, in order:
      name, sex, year_count (number of rows, i.e. years, the name appears in),
      year_min / year_max (first / last year seen),
      pct_sum / pct_max (sum / maximum of the yearly 'pct' values).
    Rows follow the order of value_counts(), i.e. most frequently seen first.
    """
    year_count = frame['name'].value_counts()
    by_name = frame.groupby('name')
    # Building from a dict names the column explicitly; the per-name
    # aggregates assigned below are aligned on the name index.
    summary = pd.DataFrame({'year_count': year_count})
    summary['year_min'] = by_name['year'].min()
    summary['year_max'] = by_name['year'].max()
    summary['pct_sum'] = by_name['pct'].sum()
    summary['pct_max'] = by_name['pct'].max()
    summary['sex'] = sex
    summary.index.name = 'name'
    summary = summary.reset_index()
    return summary[['name', 'sex', 'year_count', 'year_min', 'year_max', 'pct_sum', 'pct_max']]


temp_f = _summarize_names(dff, "F")
temp_m = _summarize_names(dfm, "M")
names = pd.concat([temp_f, temp_m], ignore_index=True)

# create years dataframe. This DF discards individual name data, aggregating by year.

# Total births per year, one column per sex (F, M), plus their sum.
total = df.groupby(['year', 'sex'])['births'].sum().unstack('sex').reset_index()
total.columns = ['year', 'births_f', 'births_m']
total['births_t'] = total.births_f + total.births_m

# Number of name-sex combinations whose first recorded year is each year.
newnames = names.groupby('year_min').size().reset_index()
newnames.columns = ['year', 'new_names']

# Number of distinct names (both sexes pooled) recorded in each year; one
# grouped pass instead of growing a frame row by row.
uniquenames = df.groupby('year')['name'].nunique().reset_index()
uniquenames.columns = ['year', 'unique_names']

# NOTE: from here on `years` is this per-year DataFrame, not a range of years.
years = pd.merge(left=total, right=newnames, on='year')
years = pd.merge(left=years, right=uniquenames, on='year')

#births dataframes, just the number of births per year
# (rows = year, columns = sex; groupby + unstack gives the same table as a
#  sum pivot and does not depend on pivot_table's keyword names)
births = df.groupby(['year', 'sex'])['births'].sum().unstack('sex')
births_f = dff.groupby(['year', 'sex'])['births'].sum().unstack('sex')
births_m = dfm.groupby(['year', 'sex'])['births'].sum().unstack('sex')

# It takes my $400 Acer desktop computer about 15 seconds for this cell. If you want to compare, put the following, uncommented,
# as the first line:
# %%timeit

In [2]:

# Some custom functions to help exploration:

def headtail(df, num = 5):
    """Preview both ends of a dataframe.

    arguments: dataframe, integer number of rows to take from each end
    returns: the first `num` rows followed by the last `num` rows, stacked
    into one dataframe that keeps the original row labels
    """
    top_rows = df.head(num)
    bottom_rows = df.tail(num)
    return pd.concat([top_rows, bottom_rows])

def subset(df, returncol='', searchval = '', searchcol = ''):
    """Select rows and/or one column of a dataframe.

    arguments:
      df        -- dataframe to select from
      returncol -- name of the column to return ('' returns entire rows)
      searchval -- value to look for ('' applies no row filter)
      searchcol -- column in which to look for searchval
                   ('' means: search in returncol)

    returns:
      returncol == '':  rows where df[searchcol] == searchval
                        (both searchval and searchcol are then required)
      searchval == '':  the whole column df[returncol]
                        (searchcol must then be omitted as well)
      otherwise:        returncol of the rows where the search column
                        equals searchval

    Any other combination of arguments returns the string
    'Error in argument formulation' instead of raising.
    """
    if returncol == '':
        # Whole rows: both the value and the column to search are needed.
        if searchval != '' and searchcol != '':
            return df[df[searchcol] == searchval]
        return 'Error in argument formulation'

    if searchval == '':
        # No filter: a search column without a search value makes no sense.
        if searchcol == '':
            return df[returncol]
        return 'Error in argument formulation'

    # Filtered column; search in returncol itself when no searchcol is given.
    if searchcol == '':
        searchcol = returncol
    return df[df[searchcol] == searchval][returncol]

Overview of primary dataframes:¶

In [3]:

#Main dataframe of SSA data (columns: name, sex, births, year, pct, ranked).
#headtail shows the first five and last five rows.
headtail(df)

Out[3]:

	name	sex	births	year	pct	ranked
0	Mary	F	7065	1880	7.764334	1
1	Anna	F	2604	1880	2.861759	2
2	Emma	F	2003	1880	2.201268	3
3	Elizabeth	F	1939	1880	2.130933	4
4	Minnie	F	1746	1880	1.918829	5
1758725	Zylin	M	5	2012	0.000266	13166
1758726	Zymari	M	5	2012	0.000266	13166
1758727	Zyrin	M	5	2012	0.000266	13166
1758728	Zyrus	M	5	2012	0.000266	13166
1758729	Zytaevius	M	5	2012	0.000266	13166

In [4]:

# Summary statistics of the numeric columns of the main dataframe.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(df.describe())

               births            year             pct          ranked
count  1758730.000000  1758730.000000  1758730.000000  1758730.000000
mean       187.499369     1971.073360        0.015125     5114.759420
std       1590.400193       33.184493        0.125716     4316.878964
min          5.000000     1880.000000        0.000232        1.000000
25%          7.000000     1947.000000        0.000417     1728.000000
50%         12.000000     1980.000000        0.000790     3875.000000
75%         32.000000     1999.000000        0.002521     7395.000000
max      99685.000000     2012.000000        8.738268    19076.000000

In [5]:

#for comparison with Dept of Health and Human Services database:
#the result is a flat list [year, total births, year, total births, ...] for 1910-2009.
#normally this would be better as a dict, but for brevity's sake I output as a list because the DHS data is in Excel
#and I already have an excel macro to parse a list like this.
dhscompare = []
for yr in range(1910,2010):
    dhscompare.extend([yr, subset(df, 'births', yr, 'year').sum()])
dhscompare

Out[5]:

In [6]:

#verify that dff and dfm are just df split into sex == F and M, respectively.
# print(...) with a single argument behaves the same on Python 2 and Python 3.

if len(df) == len(dff) + len(dfm):
    print("Dataframes add up correctly: {0} girls + {1} boys = {2} total.".format(len(dff), len(dfm), len(df)))
else:
    print("Dataframes do not add up: {0} girls + {1} boys != {2} total; difference - {3}".format(len(dff), len(dfm), len(df), len(df) - len(dfm) - len(dff)))

Dataframes add up correctly: 1043165 girls + 715565 boys = 1758730 total.

In [7]:

#names dataframe
#there are 101261 unique name-sex combinations (row labels 0 to 101260); e.g., "Jeon" for a boy occurred once only, in 1999.
#births and pct and rank have been discarded, but can easily be looked up from df, dff or dfm.
headtail(names)

Out[7]:

	name	sex	year_count	year_min	year_max	pct_sum	pct_max
0	Georgianna	F	133	1880	2012	1.047215	0.031525
1	Winifred	F	133	1880	2012	6.207711	0.141134
2	Joan	F	133	1880	2012	37.098464	1.972498
3	Miriam	F	133	1880	2012	8.853062	0.149273
4	Alva	F	133	1880	2012	1.165021	0.031943
101256	Zelda	M	1	1932	1932	0.000479	0.000479
101257	Schell	M	1	1962	1962	0.000338	0.000338
101258	Aadvik	M	1	2012	2012	0.000320	0.000320
101259	Mikequan	M	1	1995	1995	0.000263	0.000263
101260	Jeon	M	1	1999	1999	0.000261	0.000261

In [8]:

# Summary statistics of the per-name dataframe.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(names.describe())

          year_count       year_min       year_max        pct_sum        pct_max
count  101261.000000  101261.000000  101261.000000  101261.000000  101261.000000
mean       17.368286    1970.788675    1996.402603       0.262688       0.008826
std        26.002521      36.164306      23.476958       4.739686       0.104103
min         1.000000    1880.000000    1880.000000       0.000232       0.000232
25%         2.000000    1951.000000    1991.000000       0.000599       0.000344
50%         6.000000    1983.000000    2008.000000       0.002548       0.000556
75%        21.000000    1999.000000    2012.000000       0.013662       0.001301
max       133.000000    2012.000000    2012.000000     558.006916       8.738268

In [9]:

#years dataframe: one row per year with births by sex (births_f, births_m) and in total (births_t),
#the number of names first recorded that year (new_names) and the number of distinct names (unique_names)
headtail(years)

Out[9]:

	year	births_f	births_m	births_t	new_names	unique_names
0	1880	90993	110491	201484	2000	1889
1	1881	91955	100746	192701	310	1830
2	1882	107850	113687	221537	302	2012
3	1883	112322	104630	216952	195	1962
4	1884	129022	114445	243467	233	2158
128	2008	1886109	2035075	3921184	2047	32474
129	2009	1831382	1977632	3809014	1787	32203
130	2010	1770632	1911572	3682204	1634	31581
131	2011	1750078	1889557	3639635	1534	31388
132	2012	1743626	1877705	3621331	1506	31109

In [10]:

# Summary statistics of the per-year dataframe.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(years.describe())

             year        births_f        births_m        births_t    new_names  unique_names
count   133.00000      133.000000      133.000000      133.000000   133.000000    133.000000
mean   1946.00000  1229499.000000  1249905.248120  2479404.248120   761.360902  12052.827068
std      38.53786   644533.683153   739035.563215  1382297.397633   637.666524   8597.344768
min    1880.00000    90993.000000   100746.000000   192701.000000   103.000000   1830.000000
25%    1913.00000   624463.000000   512527.000000  1136990.000000   215.000000   6263.000000
50%    1946.00000  1457578.000000  1559084.000000  3017152.000000   467.000000   9516.000000
75%    1979.00000  1770632.000000  1902493.000000  3675594.000000  1208.000000  17325.000000
max    2012.00000  2044078.000000  2155767.000000  4199845.000000  2189.000000  32474.000000

Make dataframe and graphs of top-ranked name for girls and boys, 1880-2012¶

In [11]:

#Make dataframe and graph of first-ranked
rank1m = dfm[dfm.ranked == 1]
rank1f = dff[dff.ranked == 1]
zeroline = np.zeros(len(range(1880, 2013)))  # not used by the plots in this cell


def _plot_top_name(top, color, label, title, ylabel):
    """Draw one filled line chart of the #1 name's share of births per year.

    top    -- dataframe with 'year' and 'pct' columns (one row per year)
    color  -- colour used for the line, the fill and the title
    label  -- legend label of the line
    title  -- figure title
    ylabel -- y axis label
    """
    # plt.figure is called explicitly: a bare figure() only exists when the
    # notebook was started in pylab mode.
    plt.figure(num=None, figsize=(15, 6), dpi=150, facecolor='w', edgecolor='k')
    plt.plot(top.year, top.pct, color=color, linewidth = 2, label = label)
    plt.fill_between(top.year, top.pct, color=color, alpha = 0.1, interpolate=True)
    plt.xlim(1880,2012)
    plt.ylim(0,9)
    plt.xticks(np.arange(1880,2012,10))
    plt.title(title, size=18, color=color)
    plt.xlabel('Year', size=15)
    plt.ylabel(ylabel, size=15)
    plt.show()
    plt.close()


_plot_top_name(rank1m, "blue", 'Boys', "Popularity of #1 boys' name by year", '% of male births')
_plot_top_name(rank1f, "red", 'Girls', "Popularity of #1 girls' name by year", '% of female births')

Check assignments of names into boys and girls categories¶

In [12]:

# make graphs of gender miscategorizations for all names that were #1 ranked in any year

namelist = list(rank1m.name.unique()) + list(rank1f.name.unique())

# Years sampled to decide whether a name is still popular in recent decades.
recent_years = [1980, 1985, 1990, 1995, 2000, 2005, 2010]

for nm in namelist:
    # pct per year with one column per sex (F, M); keep only the years in
    # which the name is recorded for both sexes.
    f_or_m = df[df.name == nm]
    f_or_m = f_or_m.groupby(['year', 'sex'])['pct'].mean().unstack('sex')
    f_or_m = f_or_m.dropna()
    # temp is the F/M ratio; pctratio is 100 * (smaller pct / larger pct),
    # obtained by inverting the ratio in the years where F is the larger one.
    f_or_m['temp'] = f_or_m.F / f_or_m.M
    f_or_m['sex_max'] = np.where(f_or_m.temp >= 1, 'F', 'M')
    f_or_m['temp2'] = np.where(f_or_m.sex_max == 'F', -1, 1)
    f_or_m['pctratio'] = 100 * (f_or_m.temp ** f_or_m.temp2)
    y2 = f_or_m.pctratio
    # The dominant sex is taken from the FIRST year only and used for the
    # whole series (which is why a name that switches sex is mislabelled).
    first_sex = f_or_m.iloc[0]['sex_max']
    y1 = f_or_m[first_sex]
    x = f_or_m.index

    if first_sex == 'M':
        sexvar = 'female'
    else:
        sexvar = 'male'

    # plt.figure is called explicitly: a bare figure() only exists in pylab mode.
    plt.figure(num=None, figsize=(6,4), dpi=150, facecolor='w', edgecolor='k')

    plt.subplot(211)
    plt.xlim(1880,2012)
    plt.xticks([])
    plt.ylabel('% of all births', size=10)

    # Put the name where the curve is low: on the right when the recent values
    # stay below half of the all-time peak, otherwise on the left.
    # reindex tolerates sampled years that are absent from the series (they
    # become NaN and are ignored by max) instead of raising KeyError.
    if y1.reindex(recent_years).max() / y1.max() < 0.5:
        plt.annotate(nm, xy=(.96, .92),  xycoords='axes fraction', size = 17, horizontalalignment='right', verticalalignment='top')
    else:
        plt.annotate(nm, xy=(.05, .92),  xycoords='axes fraction', size = 17, horizontalalignment='left', verticalalignment='top')

    plt.plot(x, y1, color='black', linewidth = 2, label = '2')

    plt.subplot(212)
    plt.xlim(1880,2012)
    plt.ylabel('% ' + nm + ' ' + sexvar, size=10)
    plt.plot(x, y2, color='green', linewidth = 2, label = '2')
    plt.savefig('fm_' + nm + '.png')
    plt.show()
    plt.close()

# Note: algorithm does not work for Ashley, because it changed from a male to a female name (see below)

In [13]:

#Try to solve the mystery of the high error rate of Emma ca. 1900
#posted on Pastebin and prooffreaderplus.blogspot.com
# Boys' rows for 1900, renumbered from 0 (reset_index returns a new frame, so
# the filtered slice is not modified in place).
dfm1900 = subset(dfm, '', 1900, 'year').reset_index(drop=True)
dfm1900E = dfm1900[dfm1900.name.str.contains('^E')]       # names starting with E
dfm1900E_a = dfm1900E[dfm1900E.name.str.contains('a$')]   # ... that also end with a
dfm1900_a = dfm1900[dfm1900.name.str.contains('a$')]      # names ending with a
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print('Boys born in 1900 whose names begin with E:\n')
print(dfm1900E[['name', 'births', 'ranked']].head(60))
print('\n\nBoys born in 1900 whose names end with a:\n')
print(dfm1900_a[['name', 'births', 'ranked']].head(50))
print('\n\nIntersection of both sets:\n')
print(dfm1900E_a[['name', 'births', 'ranked']].head(50))

Boys born in 1900 whose names begin with E:

          name  births  ranked
8       Edward    2721     9.0
27        Earl    1037    28.0
29      Ernest    1012    30.0
37       Elmer     699    38.0
44      Eugene     587    45.0
62       Eddie     425    63.0
70       Edgar     374    71.0
73       Edwin     350    74.0
86          Ed     297    87.0
99     Everett     237   100.0
105    Earnest     223   105.0
164       Emil     127   165.5
168     Elbert     125   168.5
172      Ellis     120   173.5
174     Emmett     119   175.0
201     Edmund      94   202.0
208      Ervin      88   209.0
210     Elijah      86   211.5
262        Edd      61   263.5
286      Emory      52   288.0
289     Elwood      51   290.5
306     Edmond      47   306.0
308      Earle      46   310.0
316        Eli      45   315.5
317    Emanuel      45   315.5
325      Erwin      43   327.0
329      Emery      42   333.0
330       Ezra      42   333.0
344       Elmo      40   346.0
350      Elton      39   351.5
361     Elisha      37   360.5
366      Enoch      36   369.5
381    Emerson      34   383.0
385    Elliott      33   388.5
404     Elmore      31   407.5
405      Emile      31   407.5
418      Early      29   421.0
435  Ellsworth      27   437.5
436      Emmet      27   437.5
470      Elvin      24   472.5
471       Evan      24   472.5
479      Elias      23   483.0
480     Emmitt      23   483.0
518       Emma      21   525.5
519       Eric      21   525.5
536      Eldon      20   544.0
537      Elzie      20   544.0
538      Ethel      20   544.0
560       Elie      19   566.0
561    Ezekiel      19   566.0
587      Ennis      18   590.0
607   Eldridge      17   612.5
631  Elizabeth      16   635.5
632       Elza      16   635.5
633   Everette      16   635.5
653      Elsie      15   664.0
654     Emilio      15   664.0
705       Edna      13   720.5
706      Effie      13   720.5
707     Eunice      13   720.5


Boys born in 1900 whose names end with a:

         name  births  ranked
106       Ira     218   107.5
236      Alva      73   237.5
330      Ezra      42   333.0
333       Ora      42   333.0
361    Elisha      37   360.5
370    Joshua      36   369.5
413       Asa      30   415.5
454      Anna      25   461.5
491    Bertha      22   503.5
518      Emma      21   525.5
526      Otha      21   525.5
559      Dana      19   566.0
591     Hosea      18   590.0
632      Elza      16   635.5
699      Alma      13   720.5
705      Edna      13   720.5
713       Ida      13   720.5
752     Clara      12   763.5
753      Ella      12   763.5
856     Eliga      10   868.5
892     Rolla      10   868.5
944    Martha       9   934.5
950      Nora       9   934.5
980      Cora       8  1008.5
989       Eva       8  1008.5
1001      Iva       8  1008.5
1008      Lea       8  1008.5
1075   Eligha       7  1105.5
1091  Georgia       7  1105.5
1114    Laura       7  1105.5
1145     Roma       7  1105.5
1163  Alberta       6  1235.5
1167   Alonza       6  1235.5
1168    Alpha       6  1235.5
1176  Augusta       6  1235.5
1241    Julia       6  1235.5
1254    Lonza       6  1235.5
1271      Oda       6  1235.5
1274     Orla       6  1235.5
1287   Stella       6  1235.5
1351  Claudia       5  1408.5
1361     Dora       5  1408.5
1373     Elva       5  1408.5
1446      Ola       5  1408.5
1471     Rosa       5  1408.5
1486  Theresa       5  1408.5
1496    Viola       5  1408.5


Intersection of both sets:

        name  births  ranked
330     Ezra      42   333.0
361   Elisha      37   360.5
518     Emma      21   525.5
632     Elza      16   635.5
705     Edna      13   720.5
753     Ella      12   763.5
856    Eliga      10   868.5
989      Eva       8  1008.5
1075  Eligha       7  1105.5
1373    Elva       5  1408.5

I said in the blog text that nobody (in the database) was named Ashley in 1900. Here's the proof:

NOTE: I wrote this for my first blog post on Feb. 24, 2014. Then I found out that Ashley was a BOYS' name in 1900!

In [14]:

# First rows of the girls' data for the name Ashley
dff[dff.name == 'Ashley'].head()

Out[14]:

	name	sex	births	year	pct	ranked
143423	Ashley	F	5	1917	0.000462	4954.0
351811	Ashley	F	7	1938	0.000635	3647.5
379234	Ashley	F	6	1941	0.000497	4187.5
387796	Ashley	F	8	1942	0.000592	3591.0
396803	Ashley	F	10	1943	0.000717	3142.0

In [15]:

# First rows of the boys' data for the name Ashley
dfm[dfm.name == 'Ashley'].head() #Added on March 3

Out[15]:

	name	sex	births	year	pct	ranked
1625	Ashley	M	8	1880	0.007240	713
3726	Ashley	M	6	1881	0.005956	823
5723	Ashley	M	7	1882	0.006157	795
7764	Ashley	M	9	1883	0.008602	660
10175	Ashley	M	6	1884	0.005243	909

In [16]:

# plot percent male and female births of Ashley
nm = 'Ashley'
# pct per year with one column per sex (F, M); years missing for one sex stay NaN
f_and_m = df[df.name == nm]
f_and_m = f_and_m.groupby(['year', 'sex'])['pct'].mean().unstack('sex')

# plt.figure is called explicitly: a bare figure() only exists in pylab mode.
plt.figure(num=None, figsize=(8,7), dpi=150, facecolor='w', edgecolor='k')

# Top panel: full scale. (No axis calls before the first subplot, so that no
# extra full-figure axes is created behind the two panels.)
plt.subplot(211)
#plt.title(nm)
plt.title("Ashley used to be a boys' name\n", size=20)
plt.xlim(1880, 2012)
plt.ylabel('% of births', size=10)
plt.plot(f_and_m.index, f_and_m.F, color='#cc0000', linewidth = 2, label = 'Girls')
plt.plot(f_and_m.index, f_and_m.M, color='#0000cc', linewidth = 2, label = 'Boys')
plt.legend(loc = 'upper left')

# Bottom panel: same curves with the y axis clipped to 0-0.05 so the boys' line is visible.
plt.subplot(212)
plt.ylim(0, 0.05)
plt.xlim(1880, 2012)
plt.ylabel('% of births', size=10)
plt.plot(f_and_m.index, f_and_m.F, color='#cc0000', linewidth = 2, label = 'Girls')
plt.plot(f_and_m.index, f_and_m.M, color='#0000cc', linewidth = 2, label = 'Boys')
plt.annotate('magnification of top graph\n(10% of height of first tick mark)\nto see male names', xy=(.03, .95),  xycoords='axes fraction', size = 12, horizontalalignment='left', verticalalignment='top')
plt.savefig('ashleyMF.png')
plt.show()
plt.close()

Graph an individual name¶

In [17]:

# Name and sex ("F" or "M") to graph
name_chosen = "Sigourney"
sex_chosen = "F"

if sex_chosen == "F":
    y_axis_sex = "female"
else:
    y_axis_sex = "male"

# Rows of the chosen sex, then of the chosen name: one row per year it appears in
graphdf = df[df.sex == sex_chosen]
graphdf = graphdf[graphdf.name == name_chosen]

# plt.figure is called explicitly: a bare figure() only exists in pylab mode.
plt.figure(num=None, figsize=(11, 8), dpi=300, facecolor='w', edgecolor='w')
plt.plot(graphdf.year, graphdf.pct, color="black", linewidth = 2)
plt.xlim(1880,2012)
plt.xticks(np.arange(1880,2012,10))
plt.title("Popularity of baby name '" + name_chosen + "', 1880-2012", size=18, color="black")
plt.xlabel('Year', size=15)
plt.ylabel('Percent of ' + y_axis_sex + ' births', size=14)
#plt.savefig(name_chosen + '.png') # un-comment to save graph
plt.show()
plt.close()
#graphdf.to_csv(name_chosen + "_" + sex_chosen + ".csv")

In [18]:

# Name and sex ("F" or "M") to graph
name_chosen = "Sharona"
sex_chosen = "F"

if sex_chosen == "F":
    y_axis_sex = "female"
else:
    y_axis_sex = "male"

# Rows of the chosen sex, then of the chosen name: one row per year it appears in
graphdf = df[df.sex == sex_chosen]
graphdf = graphdf[graphdf.name == name_chosen]

# plt.figure is called explicitly: a bare figure() only exists in pylab mode.
plt.figure(num=None, figsize=(8, 4), dpi=300, facecolor='w', edgecolor='w')
plt.plot(graphdf.year, graphdf.pct, color="black", linewidth = 2)
plt.xlim(1880,2012)
plt.xticks(np.arange(1880,2012,10))
plt.title("Popularity of baby name '" + name_chosen + "', 1880-2012", size=18, color="black")
plt.xlabel('Year', size=15)
plt.ylabel('Percent of ' + y_axis_sex + ' births', size=14)
plt.savefig(name_chosen + '.png') # saves <name>.png in the working directory; comment out to skip
plt.show()
plt.close()
#graphdf.to_csv(name_chosen + "_" + sex_chosen + ".csv")

In [19]:

#compare two names of same sex
name_chosen = "Marilyn"
name_chosen_2 = "Norma"
sex_chosen = "F"

if sex_chosen == "F":
    y_axis_sex = "female"
else:
    y_axis_sex = "male"

# Rows of the chosen sex, split into one frame per name
graphdf = df[df.sex == sex_chosen]
graphdf2 = graphdf[graphdf.name == name_chosen_2]
graphdf = graphdf[graphdf.name == name_chosen]

# plt.figure is called explicitly: a bare figure() only exists in pylab mode.
plt.figure(num=None, figsize=(7.5, 5), dpi=300, facecolor='w', edgecolor='w')
plt.plot(graphdf.year, graphdf.pct, color="#6500A4", linewidth = 3, label=name_chosen)
plt.plot(graphdf2.year, graphdf2.pct, color="#005E75", linewidth = 3, label=name_chosen_2)
plt.xlim(1880,2012)
plt.xticks(np.arange(1880,2012,10))
plt.title("Popularity of U.S. baby names\n" + name_chosen + " and " + name_chosen_2 + ", 1880-2012", size=16, color="black")
plt.xlabel('Year', size=12)
plt.ylabel('Percent of ' + y_axis_sex + ' births', size=14)
plt.legend(loc = 'upper left')
plt.savefig(name_chosen + '_v_' + name_chosen_2 + '.png') # saves <name>_v_<name2>.png; comment out to skip
plt.show()
plt.close()

In [20]:

# plot rate of babies named "Unknown" or "Baby"
# Each frame has one row per year and a single column named after the name.
# NOTE: rows of both sexes are pooled, so the plotted value is the MEAN of the
# per-sex percentages for that year, not a percentage of all births.
Unkn = df[df.name == 'Unknown'].groupby(['year', 'name'])['pct'].mean().unstack('name')
Baby = df[df.name == 'Baby'].groupby(['year', 'name'])['pct'].mean().unstack('name')

# plt.figure is called explicitly: a bare figure() only exists in pylab mode.
plt.figure(num=None, figsize=(8,4), dpi=150, facecolor='w', edgecolor='k')

plt.title("U.S. babies named 'Unknown' or 'Baby', 1880-2012", size=18)
plt.xlim(1880, 2012)
plt.ylabel('% of births', size=10)
plt.plot(Unkn.index, Unkn.Unknown, color='#660066', linewidth = 2, label = 'Unknown')
plt.plot(Baby.index, Baby.Baby, color='#dd8833', linewidth = 2, label = 'Baby')
plt.legend(loc = 'upper left')
plt.savefig('Unknown_Baby.png')
plt.show()
plt.close()

Make dataframe and graph of new names that appear in database for every birth year¶

In [21]:

years2 = years[years.year > 1880] # the first year, 1880, mucks up the 'new names' data
# Names appearing for the first time per 1000 births (both sexes) in each year
newperthou = years2.new_names * 1000.0 / years2.births_t
# Baseline for the fill; sized from the data so it always matches the x values
zeroline = np.zeros(len(years2))
# plt.figure is called explicitly: a bare figure() only exists in pylab mode.
plt.figure(num=None, figsize=(15, 6), dpi=150, facecolor='w', edgecolor='w')
plt.plot(years2.year, newperthou, color="#0A5711", linewidth = 2)
plt.fill_between(years2.year, zeroline, newperthou, color="#20912B", alpha = 1.0, interpolate=True)
plt.xlim(1880,2012)
plt.xticks(np.arange(1880,2012,10))
plt.title("New names introduced per 1000 births", size=18, color="#0A5711")
plt.xlabel('Year', size=15)
plt.ylabel('Names appearing for first time / 1000 births', size=11)
plt.show()
plt.close()