Baby names IPython notebook

www.prooffreader.com

Create pandas dataframes from U.S. Social Security baby names database:

In [1]:
#Python/Pandas script to analyze baby names database 1880-2012
#from the U.S. Social Security Administration
#
#By David Taylor Feb. 2014
#
#www.prooffreader.com (for blogged results)
#
#prooffreaderplus.blogspot.com (for scripts, calculations, links, gits, etc.)
#
#Disclaimer: not a professional programmer, more interested right now in results in a reasonable time.
#  That said, constructive critique and suggestions are always totally welcome. I'm not proud.
#  In particular, there is a lot of very needless duplication of boy and girl databases and code
#    in loops that refer to them, when I could just subset a larger database every time,
#    but I have enough memory in my computer and not enough in my head so I just found it easier
#    to work in this inefficient fashion. Your mileage may vary.
#  Note that I usually use column names instead of indexes, my brain just deals with them better
#    right now, as I get more used to pandas I'm already starting to adapt.
#  Also, sometimes I just print-dump everything to a csv file and work with it in Excel. Sorry!

#Instructions:
#1. Download data set at (as of Feb. 2014) http://www.ssa.gov/OACT/babynames/names.zip
#2. Unzip into a working directory
#3. Change the working directory strings in this script.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re as re
import scipy
import os

# Base working directory: change this to wherever you unzipped names.zip.
# The yearly yobNNNN.txt files are expected in its "yobs" subfolder.
BASE_DIR = "C:/_Dropbox/Dropbox/py/babynames/"
YOB_DIR = os.path.join(BASE_DIR, "yobs")

# Relative paths used after this point resolve against BASE_DIR. The input
# files are read through explicit paths, so the working directory is only
# changed once instead of hopping into the data folder and back.
os.chdir(BASE_DIR)

#read yob files, arrange into data frames and concatenate into one data frame
#this portion taken from O'Reilly's Python for Data Analysis (2009)
yob_years = range(1880, 2013) # upper bound is exclusive: the last year read is 2012
pieces = []
yobcolumns = ['name', 'sex', 'births']
for year in yob_years:
    path = os.path.join(YOB_DIR, 'yob%d.txt' % year)
    frame = pd.read_csv(path, names=yobcolumns)  # files have no header row
    frame['year'] = year
    pieces.append(frame)
# One long frame with a fresh 0..n-1 index: columns name, sex, births, year.
df = pd.concat(pieces, ignore_index=True)

#add column 'pct' that is the number of births of that name and sex in that year
#divided by the total number of births of that sex in that year, multiplied by
#100 to turn into a percentage and reduce all those leading zeroes
def add_pct(group):
    """Add a 'pct' column to `group` and return the same object.

    'pct' is each row's births as a percentage of the total births in
    `group`. The frame passed in is modified in place.
    """
    as_float = group['births'].astype(float)
    total_births = as_float.sum()
    group['pct'] = as_float / total_births * 100
    return group
# Percentage share of each name within its (year, sex) group.
# transform('sum') broadcasts every group's total back onto the original rows,
# so df keeps its index and columns unchanged. This yields the same values as
# df.groupby(['year', 'sex']).apply(add_pct) without depending on how
# groupby.apply reassembles whole frames.
group_totals = df.groupby(['year', 'sex'])['births'].transform('sum')
df['pct'] = df['births'].astype(float) / group_totals * 100

#add rank of each name each year each sex (1 = most births; tied names share
#the average of their ranks, which is why fractional ranks such as 165.5 occur)
df['ranked'] = df.groupby(['year', 'sex'])['births'].rank(ascending=False)

#subset girls and boys
dff = df[df.sex == 'F']
dfm = df[df.sex == 'M']

#create names dataframe. This DF discards individual birth or pct values, and instead collects data on unique names.
#There is one row per unique combination of name and sex.
def _summarize_names(frame, sex):
    """Collapse a single-sex slice of df to one row per name.

    frame -- rows of df for one sex (needs 'name', 'year' and 'pct' columns)
    sex   -- label written to the 'sex' column of the result ("F" or "M")

    Returns a DataFrame with columns name, sex, year_count (number of rows,
    i.e. years, in which the name appears), year_min, year_max, pct_sum and
    pct_max, ordered by descending year_count (value_counts order).
    """
    year_count = frame['name'].value_counts()
    by_name = frame.groupby('name')
    summary = pd.DataFrame({
        'year_count': year_count,
        'year_min': by_name['year'].min(),
        'year_max': by_name['year'].max(),
        'pct_sum': by_name['pct'].sum(),
        'pct_max': by_name['pct'].max(),
    })
    # Building from a dict aligns on the name index; put the rows back in
    # value_counts order (most frequently listed names first).
    summary = summary.reindex(year_count.index)
    summary.index.name = 'name'
    summary = summary.reset_index()
    summary['sex'] = sex
    return summary[['name', 'sex', 'year_count', 'year_min', 'year_max', 'pct_sum', 'pct_max']]

temp_f = _summarize_names(dff, "F")
temp_m = _summarize_names(dfm, "M")
names = pd.concat([temp_f, temp_m], ignore_index=True)

# create years dataframe. This DF discards individual name data, aggregating by year.

# Births per year split by sex. values/index/columns are passed positionally
# because the keyword names differ between pandas versions (rows/cols in old
# releases, index/columns in newer ones).
total = df.pivot_table('births', 'year', 'sex', aggfunc=sum).reset_index()
total.columns = ['year', 'births_f', 'births_m']  # pivot columns come out as F, M
total['births_t'] = total.births_f + total.births_m

# Number of name/sex combinations whose first listed year is each year.
first_year_counts = names.groupby('year_min').size()
newnames = pd.DataFrame({'year': first_year_counts.index,
                         'new_names': first_year_counts.values},
                        columns=['year', 'new_names'])

# Number of distinct names (either sex) listed in each year, computed in one
# grouped pass instead of appending one single-row frame per year.
unique_per_year = df.groupby('year')['name'].apply(lambda s: s.nunique())
uniquenames = pd.DataFrame({'year': unique_per_year.index,
                            'unique_names': unique_per_year.values},
                           columns=['year', 'unique_names'])

# Inner joins on 'year' give one row per year with all aggregate columns.
years = pd.merge(left=total, right=newnames, on='year')
years = pd.merge(left=years, right=uniquenames, on='year')

#births dataframes, just the number of births per year
#(values, index, columns are passed positionally so the calls do not depend on
# the rows/cols vs. index/columns keyword naming of the installed pandas)
births = df.pivot_table('births', 'year', 'sex', aggfunc=sum)
births_f = dff.pivot_table('births', 'year', 'sex', aggfunc=sum)
births_m = dfm.pivot_table('births', 'year', 'sex', aggfunc=sum)

# It takes my $400 Acer desktop computer about 15 seconds for this cell. If you want to compare, put the following, uncommented,
# as the first line:
# %%timeit
In [2]:
# Some custom functions to help exploration:

def headtail(df, num = 5):
    """Return the first and last `num` rows of `df` stacked into one frame.

    df  -- dataframe to preview
    num -- number of rows taken from each end (default 5)

    The original index labels are kept.
    """
    first_rows = df.head(num)
    last_rows = df.tail(num)
    return pd.concat([first_rows, last_rows], ignore_index=False)

def subset(df, returncol='', searchval = '', searchcol = ''):
    """Select rows and/or a column of a dataframe by simple equality.

    df        -- dataframe to select from
    returncol -- column to return; '' means return whole rows
    searchval -- value rows must equal; '' means no row filtering
    searchcol -- column compared against searchval; '' means use returncol

    Supported combinations:
      returncol only                  -> df[returncol]
      returncol + searchval           -> returncol where returncol == searchval
      returncol + searchval/searchcol -> returncol where searchcol == searchval
      searchval + searchcol           -> whole rows where searchcol == searchval

    Any other combination returns the string 'Error in argument formulation'
    (no exception is raised). Because '' marks an omitted argument, an empty
    string cannot be used as a search value.
    """
    if returncol == '':
        # Whole-row mode needs both a value and the column to look in.
        if searchval != '' and searchcol != '':
            return df[df[searchcol] == searchval]
        return 'Error in argument formulation'

    if searchval == '':
        # No filtering requested: a search column on its own is meaningless.
        if searchcol == '':
            return df[returncol]
        return 'Error in argument formulation'

    if searchcol == '':
        searchcol = returncol
    # Single .loc lookup instead of chained df[mask][col] indexing.
    return df.loc[df[searchcol] == searchval, returncol]

Overview of primary dataframes:

In [3]:
#Main dataframe of SSA data: one row per name, sex and year, plus the derived
#'pct' and 'ranked' columns. Shows the first and last five rows.
headtail(df)
Out[3]:
name sex births year pct ranked
0 Mary F 7065 1880 7.764334 1
1 Anna F 2604 1880 2.861759 2
2 Emma F 2003 1880 2.201268 3
3 Elizabeth F 1939 1880 2.130933 4
4 Minnie F 1746 1880 1.918829 5
1758725 Zylin M 5 2012 0.000266 13166
1758726 Zymari M 5 2012 0.000266 13166
1758727 Zyrin M 5 2012 0.000266 13166
1758728 Zyrus M 5 2012 0.000266 13166
1758729 Zytaevius M 5 2012 0.000266 13166
In [4]:
# Summary statistics of the numeric columns. The parenthesized call form of
# print behaves the same under Python 2 and Python 3.
print(df.describe())
               births            year             pct          ranked
count  1758730.000000  1758730.000000  1758730.000000  1758730.000000
mean       187.499369     1971.073360        0.015125     5114.759420
std       1590.400193       33.184493        0.125716     4316.878964
min          5.000000     1880.000000        0.000232        1.000000
25%          7.000000     1947.000000        0.000417     1728.000000
50%         12.000000     1980.000000        0.000790     3875.000000
75%         32.000000     1999.000000        0.002521     7395.000000
max      99685.000000     2012.000000        8.738268    19076.000000
In [5]:
#for comparison with Dept of Health and Human Services database:
# Total births per year are computed once, instead of re-filtering the whole
# dataframe for each of the 100 years.
births_by_year = df.groupby('year')['births'].sum()
dhscompare = []
for yr in range(1910,2010):
    dhscompare.append(yr)
    # A year with no rows contributes 0; int() keeps the list as plain integers.
    dhscompare.append(int(births_by_year.get(yr, 0)))
dhscompare
#normally this would be better as a dict, but for brevity's sake I output as a list because the DHS data is in Excel
#and I already have an excel macro to parse a list like this.
Out[5]:
[1910,
 590696,
 1911,
 644242,
 1912,
 987950,
 1913,
 1136990,
 1914,
 1416254,
 1915,
 1832497,
 1916,
 1934414,
 1917,
 2006750,
 1918,
 2171217,
 1919,
 2110293,
 1920,
 2262790,
 1921,
 2334387,
 1922,
 2289215,
 1923,
 2302408,
 1924,
 2381572,
 1925,
 2333195,
 1926,
 2295691,
 1927,
 2318839,
 1928,
 2260302,
 1929,
 2191451,
 1930,
 2222667,
 1931,
 2103195,
 1932,
 2110686,
 1933,
 1998556,
 1934,
 2076055,
 1935,
 2089180,
 1936,
 2077021,
 1937,
 2129965,
 1938,
 2211879,
 1939,
 2202888,
 1940,
 2302046,
 1941,
 2435671,
 1942,
 2731210,
 1943,
 2821662,
 1944,
 2689449,
 1945,
 2652581,
 1946,
 3194707,
 1947,
 3601997,
 1948,
 3450802,
 1949,
 3481993,
 1950,
 3504681,
 1951,
 3677885,
 1952,
 3797756,
 1953,
 3849398,
 1954,
 3978865,
 1955,
 4012414,
 1956,
 4121009,
 1957,
 4199845,
 1958,
 4131232,
 1959,
 4156399,
 1960,
 4154859,
 1961,
 4139305,
 1962,
 4035299,
 1963,
 3958833,
 1964,
 3887558,
 1965,
 3626291,
 1966,
 3475626,
 1967,
 3395191,
 1968,
 3378931,
 1969,
 3476680,
 1970,
 3607444,
 1971,
 3432364,
 1972,
 3143737,
 1973,
 3017152,
 1974,
 3040252,
 1975,
 3019558,
 1976,
 3034648,
 1977,
 3176263,
 1978,
 3173337,
 1979,
 3326319,
 1980,
 3442974,
 1981,
 3457508,
 1982,
 3505823,
 1983,
 3461025,
 1984,
 3486048,
 1985,
 3566396,
 1986,
 3554197,
 1987,
 3603080,
 1988,
 3691763,
 1989,
 3843030,
 1990,
 3949807,
 1991,
 3893707,
 1992,
 3839222,
 1993,
 3768219,
 1994,
 3715625,
 1995,
 3660115,
 1996,
 3645024,
 1997,
 3623401,
 1998,
 3675594,
 1999,
 3691023,
 2000,
 3776494,
 2001,
 3739758,
 2002,
 3734405,
 2003,
 3798084,
 2004,
 3816078,
 2005,
 3839196,
 2006,
 3949493,
 2007,
 3989796,
 2008,
 3921184,
 2009,
 3809014]
In [6]:
#verify that dff and dfm are just df split into sex == F and M, respectively.
# (print is called with one parenthesized string so it works in Python 2 and 3)

n_girls, n_boys, n_total = len(dff), len(dfm), len(df)
if n_total == n_girls + n_boys:
    print("Dataframes add up correctly: " + str(n_girls) + " girls + " + str(n_boys) + " boys = " + str(n_total) + " total.")
else:
    print("Dataframes do not add up: " + str(n_girls) + " girls + " + str(n_boys) + " boys != " + str(n_total) + " total; difference - " + str(n_total - n_boys - n_girls))
Dataframes add up correctly: 1043165 girls + 715565 boys = 1758730 total.
In [7]:
#names dataframe
#there are 101261 unique name-sex combinations (row labels 0 to 101260); e.g., "Jeon" for a boy occurred once only, in 1999.
#births and pct and rank have been discarded, but can easily be looked up from df, dff or dfm.
headtail(names)
Out[7]:
name sex year_count year_min year_max pct_sum pct_max
0 Georgianna F 133 1880 2012 1.047215 0.031525
1 Winifred F 133 1880 2012 6.207711 0.141134
2 Joan F 133 1880 2012 37.098464 1.972498
3 Miriam F 133 1880 2012 8.853062 0.149273
4 Alva F 133 1880 2012 1.165021 0.031943
101256 Zelda M 1 1932 1932 0.000479 0.000479
101257 Schell M 1 1962 1962 0.000338 0.000338
101258 Aadvik M 1 2012 2012 0.000320 0.000320
101259 Mikequan M 1 1995 1995 0.000263 0.000263
101260 Jeon M 1 1999 1999 0.000261 0.000261
In [8]:
# Summary statistics of the per-name aggregates. The parenthesized call form
# of print behaves the same under Python 2 and Python 3.
print(names.describe())
          year_count       year_min       year_max        pct_sum        pct_max
count  101261.000000  101261.000000  101261.000000  101261.000000  101261.000000
mean       17.368286    1970.788675    1996.402603       0.262688       0.008826
std        26.002521      36.164306      23.476958       4.739686       0.104103
min         1.000000    1880.000000    1880.000000       0.000232       0.000232
25%         2.000000    1951.000000    1991.000000       0.000599       0.000344
50%         6.000000    1983.000000    2008.000000       0.002548       0.000556
75%        21.000000    1999.000000    2012.000000       0.013662       0.001301
max       133.000000    2012.000000    2012.000000     558.006916       8.738268
In [9]:
#years dataframe: one row per year with births by sex (births_f, births_m),
#total births (births_t), the number of name/sex combinations first listed in
#that year (new_names) and the number of distinct names listed (unique_names)
headtail(years)
Out[9]:
year births_f births_m births_t new_names unique_names
0 1880 90993 110491 201484 2000 1889
1 1881 91955 100746 192701 310 1830
2 1882 107850 113687 221537 302 2012
3 1883 112322 104630 216952 195 1962
4 1884 129022 114445 243467 233 2158
128 2008 1886109 2035075 3921184 2047 32474
129 2009 1831382 1977632 3809014 1787 32203
130 2010 1770632 1911572 3682204 1634 31581
131 2011 1750078 1889557 3639635 1534 31388
132 2012 1743626 1877705 3621331 1506 31109
In [10]:
# Summary statistics of the per-year aggregates. The parenthesized call form
# of print behaves the same under Python 2 and Python 3.
print(years.describe())
             year        births_f        births_m        births_t    new_names  unique_names
count   133.00000      133.000000      133.000000      133.000000   133.000000    133.000000
mean   1946.00000  1229499.000000  1249905.248120  2479404.248120   761.360902  12052.827068
std      38.53786   644533.683153   739035.563215  1382297.397633   637.666524   8597.344768
min    1880.00000    90993.000000   100746.000000   192701.000000   103.000000   1830.000000
25%    1913.00000   624463.000000   512527.000000  1136990.000000   215.000000   6263.000000
50%    1946.00000  1457578.000000  1559084.000000  3017152.000000   467.000000   9516.000000
75%    1979.00000  1770632.000000  1902493.000000  3675594.000000  1208.000000  17325.000000
max    2012.00000  2044078.000000  2155767.000000  4199845.000000  2189.000000  32474.000000

Make dataframe and graphs of top-ranked name for girls and boys, 1880-2012

In [11]:
#Make dataframe and graph of first-ranked
rank1m = dfm[dfm.ranked == 1]
rank1f = dff[dff.ranked == 1]
zeroline = np.zeros(len(range(1880, 2013)))  # not used by the plots in this cell

# The boys' and girls' charts differ only in data, colour and wording, so both
# are drawn by the same code. plt.figure / np.arange are spelled out so the
# cell does not depend on an implicit pylab namespace.
for top_ranked, line_color, legend_label, title_noun, sex_adjective in (
        (rank1m, "blue", 'Boys', "boys'", 'male'),
        (rank1f, "red", 'Girls', "girls'", 'female')):
    plt.figure(num=None, figsize=(15, 6), dpi=150, facecolor='w', edgecolor='k')
    plt.plot(top_ranked.year, top_ranked.pct, color=line_color, linewidth = 2, label = legend_label)
    plt.fill_between(top_ranked.year, top_ranked.pct, color=line_color, alpha = 0.1, interpolate=True)
    plt.xlim(1880,2012)
    plt.ylim(0,9)
    plt.xticks(np.arange(1880,2012,10))
    plt.title("Popularity of #1 " + title_noun + " name by year", size=18, color=line_color)
    plt.xlabel('Year', size=15)
    plt.ylabel('% of ' + sex_adjective + ' births', size=15)
    plt.show()
    plt.close()

Check assignments of names into boys and girls categories

In [12]:
# make graphs of gender miscategorizations for all names that were #1 ranked in any year

namelist = list(rank1m.name.unique()) + list(rank1f.name.unique())

# Years sampled to decide whether a name is still popular recently.
RECENT_YEARS = [1980, 1985, 1990, 1995, 2000, 2005, 2010]

for nm in namelist:
    # pct by year (rows) and sex (columns); positional arguments avoid the
    # rows/cols vs. index/columns keyword difference between pandas versions.
    f_or_m = pd.pivot_table(df[df.name == nm], 'pct', ['year'], 'sex')
    if 'F' not in f_or_m.columns or 'M' not in f_or_m.columns:
        continue  # name never listed for one of the sexes: nothing to compare
    f_or_m = f_or_m.dropna()  # keep only years in which both sexes are listed
    if len(f_or_m) == 0:
        continue

    ratio = f_or_m.F / f_or_m.M
    # Per year: which sex has the larger share, and the smaller share expressed
    # as a percentage of the larger one.
    f_or_m['sex_max'] = np.where(ratio >= 1, 'F', 'M')
    f_or_m['pctratio'] = 100 * np.where(ratio >= 1, f_or_m.M / f_or_m.F, ratio)

    # The sex that dominates in the first available year is treated as the
    # name's "real" sex for the whole series.
    first_sex = str(f_or_m['sex_max'].iloc[0])
    y2 = f_or_m.pctratio
    y1 = f_or_m[first_sex]
    x = f_or_m.index

    if first_sex == 'M':
        sexvar = 'female'
    else:
        sexvar = 'male'

    plt.figure(num=None, figsize=(6,4), dpi=150, facecolor='w', edgecolor='k')

    plt.subplot(211)
    plt.xlim(1880,2012)
    plt.xticks([])
    plt.ylabel('% of all births', size=10)

    # Label placement: if the name's recent peak is below half of its all-time
    # peak, put the label top-right; otherwise (name still popular recently)
    # put it top-left. reindex tolerates sampled years missing from the data.
    if y1.reindex(RECENT_YEARS).max() / y1.max() < 0.5:
        plt.annotate(nm, xy=(.96, .92),  xycoords='axes fraction', size = 17, horizontalalignment='right', verticalalignment='top')
    else:
        plt.annotate(nm, xy=(.05, .92),  xycoords='axes fraction', size = 17, horizontalalignment='left', verticalalignment='top')

    plt.plot(x, y1, color='black', linewidth = 2, label = '2')

    plt.subplot(212)
    plt.xlim(1880,2012)
    plt.ylabel('% ' + nm + ' ' + sexvar, size=10)
    plt.plot(x, y2, color='green', linewidth = 2, label = '2')
    plt.savefig('fm_' + nm + '.png')
    plt.show()
    plt.close()

# Note: algorithm does not work for Ashley, because it changed from a male to a female name (see below)
In [13]:
#Try to solve the mystery of the high error rate of Emma ca. 1900
#posted on Pastebin and prooffreaderplus.blogspot.com
# Boys' rows for 1900, renumbered from 0. reset_index returns a new frame here
# rather than modifying a filtered slice of dfm in place.
dfm1900 = subset(dfm, '', 1900, 'year').reset_index(drop=True)
dfm1900E = dfm1900[dfm1900.name.str.startswith('E')]
dfm1900E_a = dfm1900E[dfm1900E.name.str.endswith('a')]
dfm1900_a = dfm1900[dfm1900.name.str.endswith('a')]
# Each print call takes one parenthesized argument, so it works in Python 2 and 3.
print('Boys born in 1900 whose names begin with E:\n')
print(dfm1900E[['name', 'births', 'ranked']].head(60))
print('\n\nBoys born in 1900 whose names end with a:\n')
print(dfm1900_a[['name', 'births', 'ranked']].head(50))
print('\n\nIntersection of both sets:\n')
print(dfm1900E_a[['name', 'births', 'ranked']].head(50))
Boys born in 1900 whose names begin with E:

          name  births  ranked
8       Edward    2721     9.0
27        Earl    1037    28.0
29      Ernest    1012    30.0
37       Elmer     699    38.0
44      Eugene     587    45.0
62       Eddie     425    63.0
70       Edgar     374    71.0
73       Edwin     350    74.0
86          Ed     297    87.0
99     Everett     237   100.0
105    Earnest     223   105.0
164       Emil     127   165.5
168     Elbert     125   168.5
172      Ellis     120   173.5
174     Emmett     119   175.0
201     Edmund      94   202.0
208      Ervin      88   209.0
210     Elijah      86   211.5
262        Edd      61   263.5
286      Emory      52   288.0
289     Elwood      51   290.5
306     Edmond      47   306.0
308      Earle      46   310.0
316        Eli      45   315.5
317    Emanuel      45   315.5
325      Erwin      43   327.0
329      Emery      42   333.0
330       Ezra      42   333.0
344       Elmo      40   346.0
350      Elton      39   351.5
361     Elisha      37   360.5
366      Enoch      36   369.5
381    Emerson      34   383.0
385    Elliott      33   388.5
404     Elmore      31   407.5
405      Emile      31   407.5
418      Early      29   421.0
435  Ellsworth      27   437.5
436      Emmet      27   437.5
470      Elvin      24   472.5
471       Evan      24   472.5
479      Elias      23   483.0
480     Emmitt      23   483.0
518       Emma      21   525.5
519       Eric      21   525.5
536      Eldon      20   544.0
537      Elzie      20   544.0
538      Ethel      20   544.0
560       Elie      19   566.0
561    Ezekiel      19   566.0
587      Ennis      18   590.0
607   Eldridge      17   612.5
631  Elizabeth      16   635.5
632       Elza      16   635.5
633   Everette      16   635.5
653      Elsie      15   664.0
654     Emilio      15   664.0
705       Edna      13   720.5
706      Effie      13   720.5
707     Eunice      13   720.5


Boys born in 1900 whose names end with a:

         name  births  ranked
106       Ira     218   107.5
236      Alva      73   237.5
330      Ezra      42   333.0
333       Ora      42   333.0
361    Elisha      37   360.5
370    Joshua      36   369.5
413       Asa      30   415.5
454      Anna      25   461.5
491    Bertha      22   503.5
518      Emma      21   525.5
526      Otha      21   525.5
559      Dana      19   566.0
591     Hosea      18   590.0
632      Elza      16   635.5
699      Alma      13   720.5
705      Edna      13   720.5
713       Ida      13   720.5
752     Clara      12   763.5
753      Ella      12   763.5
856     Eliga      10   868.5
892     Rolla      10   868.5
944    Martha       9   934.5
950      Nora       9   934.5
980      Cora       8  1008.5
989       Eva       8  1008.5
1001      Iva       8  1008.5
1008      Lea       8  1008.5
1075   Eligha       7  1105.5
1091  Georgia       7  1105.5
1114    Laura       7  1105.5
1145     Roma       7  1105.5
1163  Alberta       6  1235.5
1167   Alonza       6  1235.5
1168    Alpha       6  1235.5
1176  Augusta       6  1235.5
1241    Julia       6  1235.5
1254    Lonza       6  1235.5
1271      Oda       6  1235.5
1274     Orla       6  1235.5
1287   Stella       6  1235.5
1351  Claudia       5  1408.5
1361     Dora       5  1408.5
1373     Elva       5  1408.5
1446      Ola       5  1408.5
1471     Rosa       5  1408.5
1486  Theresa       5  1408.5
1496    Viola       5  1408.5


Intersection of both sets:

        name  births  ranked
330     Ezra      42   333.0
361   Elisha      37   360.5
518     Emma      21   525.5
632     Elza      16   635.5
705     Edna      13   720.5
753     Ella      12   763.5
856    Eliga      10   868.5
989      Eva       8  1008.5
1075  Eligha       7  1105.5
1373    Elva       5  1408.5

I said in the blog text that nobody (in the database) was named Ashley in 1900. Here's the proof:

NOTE: I wrote this for my first blog post on Feb. 24, 2014. Then I found out that Ashley was a BOYS' name in 1900!

In [14]:
#first five girls' rows for 'Ashley'; the earliest one displayed is from 1917
dff[dff.name == 'Ashley'].head()
Out[14]:
name sex births year pct ranked
143423 Ashley F 5 1917 0.000462 4954.0
351811 Ashley F 7 1938 0.000635 3647.5
379234 Ashley F 6 1941 0.000497 4187.5
387796 Ashley F 8 1942 0.000592 3591.0
396803 Ashley F 10 1943 0.000717 3142.0
In [15]:
#same lookup among boys: the rows displayed start in 1880
dfm[dfm.name == 'Ashley'].head() #Added on March 3
Out[15]:
name sex births year pct ranked
1625 Ashley M 8 1880 0.007240 713
3726 Ashley M 6 1881 0.005956 823
5723 Ashley M 7 1882 0.006157 795
7764 Ashley M 9 1883 0.008602 660
10175 Ashley M 6 1884 0.005243 909
In [16]:
# plot percent male and female births of Ashley
nm = 'Ashley'
# pct by year (rows) and sex (columns); positional arguments avoid the
# rows/cols vs. index/columns keyword difference between pandas versions.
f_and_m = pd.pivot_table(df[df.name == nm], 'pct', ['year'], 'sex')

plt.figure(num=None, figsize=(8,7), dpi=150, facecolor='w', edgecolor='k')

# Top panel: full scale. (No plt.xlim call before the first subplot: that
# would implicitly create an extra full-figure axes.)
plt.subplot(211)
#plt.title(nm)
plt.title("Ashley used to be a boys' name\n", size=20)
plt.xlim(1880, 2012)
plt.ylabel('% of births', size=10)
plt.plot(f_and_m.index, f_and_m.F, color='#cc0000', linewidth = 2, label = 'Girls')
plt.plot(f_and_m.index, f_and_m.M, color='#0000cc', linewidth = 2, label = 'Boys')
plt.legend(loc = 'upper left')

# Bottom panel: same curves with the y axis clipped to 0-0.05 so the much
# smaller male values become visible.
plt.subplot(212)
plt.ylim(0, 0.05)
plt.xlim(1880, 2012)
plt.ylabel('% of births', size=10)
plt.plot(f_and_m.index, f_and_m.F, color='#cc0000', linewidth = 2, label = 'Girls')
plt.plot(f_and_m.index, f_and_m.M, color='#0000cc', linewidth = 2, label = 'Boys')
plt.annotate('magnification of top graph\n(10% of height of first tick mark)\nto see male names', xy=(.03, .95),  xycoords='axes fraction', size = 12, horizontalalignment='left', verticalalignment='top')
plt.savefig('ashleyMF.png')
plt.show()
plt.close()

Graph an individual name

In [17]:
# Plot the yearly share of one name among births of one sex.
name_chosen = "Sigourney"
sex_chosen = "F"

y_axis_sex = "female" if sex_chosen == "F" else "male"

# Rows for the chosen name and sex (one per year in which it is listed).
graphdf = df[(df.sex == sex_chosen) & (df.name == name_chosen)]

# plt.figure / np.arange are spelled out so the cell does not rely on an
# implicit pylab namespace or on a mid-notebook import.
plt.figure(num=None, figsize=(11, 8), dpi=300, facecolor='w', edgecolor='w')
plt.plot(graphdf.year, graphdf.pct, color="black", linewidth = 2)
plt.xlim(1880,2012)
plt.xticks(np.arange(1880,2012,10))
plt.title("Popularity of baby name '" + name_chosen + "', 1880-2012", size=18, color="black")
plt.xlabel('Year', size=15)
plt.ylabel('Percent of ' + y_axis_sex + ' births', size=14)
#plt.savefig(name_chosen + '.png') # un-comment to save graph
plt.show()
plt.close()
#graphdf.to_csv(name_chosen + "_" + sex_chosen + ".csv")
In [18]:
# Plot the yearly share of one name among births of one sex, and save it.
name_chosen = "Sharona"
sex_chosen = "F"

y_axis_sex = "female" if sex_chosen == "F" else "male"

# Rows for the chosen name and sex (one per year in which it is listed).
graphdf = df[(df.sex == sex_chosen) & (df.name == name_chosen)]

# plt.figure / np.arange are spelled out so the cell does not rely on an
# implicit pylab namespace or on a mid-notebook import.
plt.figure(num=None, figsize=(8, 4), dpi=300, facecolor='w', edgecolor='w')
plt.plot(graphdf.year, graphdf.pct, color="black", linewidth = 2)
plt.xlim(1880,2012)
plt.xticks(np.arange(1880,2012,10))
plt.title("Popularity of baby name '" + name_chosen + "', 1880-2012", size=18, color="black")
plt.xlabel('Year', size=15)
plt.ylabel('Percent of ' + y_axis_sex + ' births', size=14)
plt.savefig(name_chosen + '.png') # saves the graph; comment out to skip saving
plt.show()
plt.close()
#graphdf.to_csv(name_chosen + "_" + sex_chosen + ".csv")
In [19]:
#compare two names of same sex
name_chosen = "Marilyn"
name_chosen_2 = "Norma"
sex_chosen = "F"

y_axis_sex = "female" if sex_chosen == "F" else "male"

# Filter by sex once, then pick out each name's rows.
same_sex = df[df.sex == sex_chosen]
graphdf = same_sex[same_sex.name == name_chosen]
graphdf2 = same_sex[same_sex.name == name_chosen_2]

# plt.figure / np.arange are spelled out so the cell does not rely on an
# implicit pylab namespace or on a mid-notebook import.
plt.figure(num=None, figsize=(7.5, 5), dpi=300, facecolor='w', edgecolor='w')
plt.plot(graphdf.year, graphdf.pct, color="#6500A4", linewidth = 3, label=name_chosen)
plt.plot(graphdf2.year, graphdf2.pct, color="#005E75", linewidth = 3, label=name_chosen_2)
plt.xlim(1880,2012)
plt.xticks(np.arange(1880,2012,10))
plt.title("Popularity of U.S. baby names\n" + name_chosen + " and " + name_chosen_2 + ", 1880-2012", size=16, color="black")
plt.xlabel('Year', size=12)
plt.ylabel('Percent of ' + y_axis_sex + ' births', size=14)
plt.legend(loc = 'upper left')
plt.savefig(name_chosen + '_v_' + name_chosen_2 + '.png') # saves the graph; comment out to skip saving
plt.show()
plt.close()
In [20]:
# plot rate of babies named "Unknown" or "Baby"
# pct by year (rows) with one column per name; positional arguments avoid the
# rows/cols vs. index/columns keyword difference between pandas versions.
# NOTE(review): no aggfunc is given, so when a name is listed for both sexes in
# a year pivot_table's default aggregation combines the two per-sex
# percentages into one value -- confirm that is the intended "% of births".
Unkn = pd.pivot_table(df[df.name == 'Unknown'], 'pct', ['year'], 'name')
Baby = pd.pivot_table(df[df.name == 'Baby'], 'pct', ['year'], 'name')

plt.figure(num=None, figsize=(8,4), dpi=150, facecolor='w', edgecolor='k')

plt.title("U.S. babies named 'Unknown' or 'Baby', 1880-2012", size=18)
plt.xlim(1880, 2012)
plt.ylabel('% of births', size=10)
plt.plot(Unkn.index, Unkn.Unknown, color='#660066', linewidth = 2, label = 'Unknown')
plt.plot(Baby.index, Baby.Baby, color='#dd8833', linewidth = 2, label = 'Baby')
plt.legend(loc = 'upper left')
plt.savefig('Unknown_Baby.png')
plt.show()
plt.close()

Make dataframe and graph of new names that appear in database for every birth year

In [21]:
years2 = years[years.year > 1880] # the first year, 1880, mucks up the 'new names' data
# Names first appearing in a year, per 1000 births (both sexes) in that year.
newperthou = years2.new_names * 1000.0 / years2.births_t
# Baseline for the filled area; sized from the data so it always matches the
# number of plotted years.
zeroline = np.zeros(len(years2))
# plt.figure / np.* are spelled out so the cell does not rely on an implicit
# pylab namespace or on a mid-notebook import.
plt.figure(num=None, figsize=(15, 6), dpi=150, facecolor='w', edgecolor='w')
plt.plot(years2.year, newperthou, color="#0A5711", linewidth = 2)
plt.fill_between(years2.year, zeroline, newperthou, color="#20912B", alpha = 1.0, interpolate=True)
plt.xlim(1880,2012)
plt.xticks(np.arange(1880,2012,10))
plt.title("New names introduced per 1000 births", size=18, color="#0A5711")
plt.xlabel('Year', size=15)
plt.ylabel('Names appearing for first time / 1000 births', size=11)
plt.show()
plt.close()