Notebook

Baby names iPython notebooks¶

By David Taylor, www.prooffreader.com
using data from United States Social Security Administration
I am making this public to give a head start to those who want to explore this dataset, so they don't have to download and format the data and the python objects used to do preliminary analysis. Please let me know if you find this helpful!

Printing graphs of names that match a list of names¶

List of Pokémon names¶

Note: this is an interactive script with repeated code (alas, not yet in functions) that shows the process of getting the data desired, not just the final result.¶

First pass with raw list¶

In [1]:

listed_path = "lists/pokemon.list"
totals_title = ""
top_cutoff = 10

top_boys_title = ""
top_girls_title = ""
last_year = 2013 #change this when Social Security database is updated
save_path = "user_charts" # files created by this notebook will be saved in this directory

import time
import os
if not os.path.isdir(save_path): # creates path if it does not exist
    os.makedirs(save_path)
import math

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn # comment out if you don't have it, but it makes good-looking charts
print 'This is standard output from download_and_process.py'
%run download_and_process.py

print '--------------------\nFirst 80 characters of list:'
listed_file = open(listed_path, "r").read()
print listed_file[:80] + ' ...'
all_listed = eval(listed_file) # make sure you trust this file!
all_listed_set = set(all_listed) # to remove duplicates
all_listed = list(all_listed)
print "all_listed: list of length", len(all_listed)

# reduce names dataframe to those matching list
print '--------------------\nDataframe names filtered to those that match list'
print "%d records to begin." % (len(names))
names_listed = names[names.name.isin(all_listed)].copy()
names_listed.sort('pct_max', ascending=False, inplace=True)
print "%d records remaining." % (len(names_listed))
listed_in_df = list(names_listed.name)
print names_listed.head(10)
listed_m = list(names[(names.sex == 'M') & (names.name.isin(listed_in_df))]['name'])
listed_f = list(names[(names.sex == 'F') & (names.name.isin(listed_in_df))]['name'])

#reduce yob dataframe to those matching list
print '--------------------\nDataframe yob filtered to those that match list (count only)'
print "%d records to begin." % (len(yob))
yob_listed = yob[yob.name.isin(listed_in_df)].copy()
yob_listed.sort(['year', 'sex', 'name'], ascending=False, inplace=True)
print "%d records remaining." % (len(yob_listed))

# m and f totals
yob_listed_f_agg = pd.DataFrame(yob_listed[yob_listed.sex == 'F'].groupby('year').sum())[['births', 'pct']]
yob_listed_m_agg = pd.DataFrame(yob_listed[yob_listed.sex == 'M'].groupby('year').sum())[['births', 'pct']]
print '--------------------\nHead of total matching list per year, female'
print yob_listed_f_agg.head()

# print chart of m and f totals
print '\n'

# function to determine a nice y-axis limit a little above the maximum value
# rounds maximum y up to second-most-significant digit
def determine_y_limit(x):                      
    significance = int(math.floor((math.log10(x))))
    val = math.floor(x / (10 ** (significance - 1))) + 1
    val = val * (10 ** (significance - 1))
    return val

#data
xf = list(yob_listed_f_agg.index)
xm = list(yob_listed_m_agg.index)

plt.figure(figsize=(16,9))
plt.plot(xf, list(yob_listed_f_agg.pct), color="red")
plt.plot(xm, list(yob_listed_m_agg.pct), color="blue")

plt.ylim(0, determine_y_limit(max(list(yob_listed_f_agg.pct)
                                  +list(yob_listed_m_agg.pct))))
plt.xlim(1880, 2013)

plt.title(totals_title, fontsize = 20)
plt.xlabel("Year", fontsize = 14)
plt.ylabel("% of total births of that sex", fontsize = 14)

plt.show()

#function to make dataframe for top names

def top_df(yobdf, names, sexes):
    """ yobdf = dataframe derived from yob; normally it would just be yob itself.
        names = list of names
        sexes = list of length 1 for all the same sex, or same length as names. 'F' and 'M' allowed
        """

    df_chart = yobdf.copy()
    assert len(sexes) == 1 or len(names) == len(sexes)
    if len(sexes) == 1:
        sexes = sexes * len(names)

    df_chart = df_chart[df_chart['name'].isin(names)]   

    df_chart['temp'] = 0
    for row in range(len(df_chart)):
        for pos in range(len(names)):
            if df_chart.name.iloc[row] == names[pos] and df_chart.sex.iloc[row] == sexes[pos]:
                df_chart.temp.iloc[row] = 1
    df_chart = df_chart[df_chart.temp == 1]

    print "Tail of dataframe:"
    print df_chart.tail()

    output_df = pd.DataFrame(pd.pivot_table(df_chart, values='pct', index = 'year', columns=['name', 'sex']))

    col = output_df.columns[0]

    for yr in range(1880, last_year + 1): #inserts missing years
        if yr not in output_df.index:
            #output_df[col][yr] = 0.0
            output_df = output_df.append(pd.DataFrame(index=[yr], columns=[col], data=[0.0]))

    output_df = output_df.fillna(0)
    
    return output_df

listed_top_m = top_df(yob, listed_m[:top_cutoff], ['M'])
listed_top_f = top_df(yob, listed_f[:top_cutoff], ['F'])

#a single function to make the four different kinds of charts

def make_chart(df, form='line', title='', colors= [], smoothing=0, \
               groupedlist = [], baseline='sym', png_name=''):
    
    dataframe = df.copy()
    
    startyear = min(list(dataframe.index))
    endyear = max(list(dataframe.index))
    yearstr = '%d-%d' % (startyear, endyear)
    
    legend_size = 0.01
    
    has_male = False
    has_female = False
    has_both = False
    max_y = 0
    for name, sex in dataframe.columns:
        max_y = max(max_y, dataframe[(name, sex)].max())
        final_name = name
        if sex == 'M': has_male = True
        if sex == 'F': has_female = True
        if smoothing > 0:
            newvalues = []
            for row in range(len(dataframe)):
                start = max(0, row - smoothing)
                end = min(len(dataframe) - 1, row + smoothing)
                newvalues.append(dataframe[(name, sex)].iloc[start:end].mean())
            for row in range(len(dataframe)):
                dataframe[(name, sex)].iloc[row] = newvalues[row]
    if has_male and has_female:
        y_text = "% of births of indicated sex"
        has_both = True
    elif has_male:
        y_text = "Percent of male births"
    else:
        y_text = "Percent of female births"
    
    num_series = len(dataframe.columns)
    
    if colors == []:
        colors = ['#BB2114', '#0C5966', '#BA7814', '#4459AB', '#6B3838', 
                  '#B8327B', '#2B947F', '#0D83B5', '#684287', '#8C962C', 
                  '#92289E', '#242D7D']
        # my own list of dark contrasting colors
    num_colors = len(colors)
    
    if num_series > num_colors:
        print "Warning: colors will be repeated."
    
    if title == '':
        if num_series == 1:
            title = "Popularity of baby name %s in U.S., %s" % (final_name, yearstr)
        else:
            title = "Popularity of baby names in U.S., %s" % (yearstr)
    
    x_values = range(startyear, endyear + 1)
    y_zeroes = [0] * (endyear - startyear)
    
    if form == 'line':
        fig, ax = plt.subplots(num=None, figsize=(16, 9), dpi=300, facecolor='w', edgecolor='w')
        counter = 0
        for name, sex in dataframe.columns:
            color = colors[counter % num_colors]
            counter += 1
            if has_both:
                label = "%s (%s)" % (name, sex)
            else:
                label = name
            ax.plot(x_values, dataframe[(name, sex)], label=label, color=color, linewidth = 3)
        ax.set_ylim(0,determine_y_limit(max_y)) 
        ax.set_xlim(startyear, endyear)
        ax.set_ylabel(y_text, size = 13)
        ax.set_title(title, size = 18)
        box = ax.get_position()
        ax.set_position([box.x0, box.y0 + box.height * legend_size,
                 box.width, box.height * (1 - legend_size)])
        legend_cols = min(5, num_series)
        ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05), fancybox=True, shadow=True, ncol=legend_cols)

    if form == 'subplots_auto':
        counter = 0
        fig, axes = plt.subplots(num_series, 1, figsize=(12, 3.5*num_series))
        print 'Maximum alpha: %d percent' % (determine_y_limit(max_y))
        for name, sex in dataframe.columns:
            if sex=='M':
                sex_label = 'male'
            else:
                sex_label = 'female'
            label = "Percent of %s births for %s" % (sex_label, name)
            current_ymax = dataframe[(name, sex)].max()
            tint = 1.0 * current_ymax / determine_y_limit(max_y)
            axes[counter].plot(x_values, dataframe[(name, sex)], color='k')
            axes[counter].set_ylim(0,determine_y_limit(current_ymax))
            axes[counter].set_xlim(startyear, endyear)
            axes[counter].fill_between(x_values, dataframe[(name, sex)], color=colors[0], alpha=tint, interpolate=True)

            axes[counter].set_ylabel(label, size=11)
            plt.subplots_adjust(hspace=0.1)
            counter += 1
            
    if form == 'subplots_same':
        counter = 0
        fig, axes = plt.subplots(num_series, 1, figsize=(12, 3.5*num_series))
        print 'Maximum y axis: %d percent' % (determine_y_limit(max_y))
        for name, sex in dataframe.columns:
            if sex=='M':
                sex_label = 'male'
            else:
                sex_label = 'female'
            label = "Percent of %s births for %s" % (sex_label, name)
            axes[counter].plot(x_values, dataframe[(name, sex)], color='k')
            axes[counter].set_ylim(0,determine_y_limit(max_y))
            axes[counter].set_xlim(startyear, endyear)
            axes[counter].fill_between(x_values, dataframe[(name, sex)], color=colors[1], alpha=1, interpolate=True)
            axes[counter].set_ylabel(label, size=11)
            plt.subplots_adjust(hspace=0.1)
            counter += 1
        
    if form == 'stream':
        plt.figure(num=None, figsize=(20,10), dpi=150, facecolor='w', edgecolor='k')
        plt.title(title, size=17)        
        plt.xlim(startyear, endyear)
        
        if has_both:
            yaxtext = 'Percent of births of indicated sex (scale: '
        elif has_male:
            yaxtext = 'Percent of male births (scale: '
        else:
            yaxtext = 'Percent of female births (scale: '
        
        scale = str(determine_y_limit(max_y)) + ')'
        yaxtext += scale
        plt.ylabel(yaxtext, size=13)
        polys = plt.stackplot(x_values, *[dataframe[(name, sex)] for name, sex in dataframe.columns], 
                                 colors=colors, baseline=baseline)
        legendProxies = []
        for poly in polys:
            legendProxies.append(plt.Rectangle((0, 0), 1, 1, fc=poly.get_facecolor()[0]))
        namelist = []
        for name, sex in dataframe.columns:
            if has_both:
                namelist.append('%s (%s)' % (name, sex))
            else:
                namelist.append(name)
        plt.legend(legendProxies, namelist, loc=3, ncol=2)
        
        plt.tick_params(\
            axis='y',          
            which='both',      #  major and minor ticks 
            left='off',      
            right='off',       
            labelleft='off')
        
    plt.show()   
    if png_name != '':
        filename = save_path + "/" + png_name + ".png"
        plt.savefig(filename)
    plt.close()

# line charts

make_chart(df=listed_top_m,
           form='line', # line , subplots_auto , subplots_same , stream
           title=top_boys_title,
           colors= [],
           smoothing=0,
           baseline='zero',  # zero ,  sym ,  wiggle ,  weighted_wiggle
           )

make_chart(df=listed_top_f,
           form='line', # line , subplots_auto , subplots_same , stream
           title=top_girls_title,
           colors= [],
           smoothing=0,
           baseline='zero',  # zero ,  sym ,  wiggle ,  weighted_wiggle
           )

names_listed.reset_index(drop = True, inplace = True)
names_listed.head()
names_listed.to_csv('lists/names_matching_mythological_list.csv')

This is standard output from download_and_process.py
Data already downloaded.
Data already extracted.
Reading from pickle.
Tail of dataframe 'yob':
           name sex  births  year       pct  ranked
1792086  Zyhier   M       5  2013  0.000267   12995
1792087   Zylar   M       5  2013  0.000267   12995
1792088  Zymari   M       5  2013  0.000267   12995
1792089  Zymeer   M       5  2013  0.000267   12995
1792090   Zyree   M       5  2013  0.000267   12995

Tail of dataframe 'names':
                 name sex  year_count  year_min  year_max   pct_sum   pct_max
102685          Gross   M           1      1925      1925  0.000538  0.000538
102686           Elik   M           1      2012      2012  0.000318  0.000318
102687  Patrickjoseph   M           1      1998      1998  0.000262  0.000262
102688       Southern   M           1      1923      1923  0.000547  0.000547
102689           Jeon   M           1      1999      1999  0.000261  0.000261

Tail of dataframe 'years':
    year  births_f  births_m  births_t  new_names  unique_names_x    sexratio  \
68  2008   1886765   2035811   3922576       2046           32483  107.899553   
69  2009   1832276   1978582   3810858       1789           32210  107.984932   
70  2010   1771846   1912915   3684761       1635           31593  107.961696   
71  2011   1752198   1891800   3643998       1539           31412  107.967250   
72  2012   1751866   1886972   3638838       1531           31212  107.712120   

    unique_names_y_x  unique_names_x  unique_names_y_x  unique_names_x  \
68             32483           32483             32483           32483   
69             32210           32210             32210           32210   
70             31593           31593             31593           31593   
71             31412           31412             31412           31412   
72             31212           31212             31212           31212   

    unique_names_y_x  unique_names_x  unique_names_y  
68             32483           32483           32483  
69             32210           32210           32210  
70             31593           31593           31593  
71             31412           31412           31412  
72             31212           31212           31212  
--------------------
First 80 characters of list:
["Bulbasaur", "Fushigidane", "Ivysaur", "Fushigisou", "Venusaur", "Fushigibana", ...
all_listed: list of length 1318
--------------------
Dataframe names filtered to those that match list
102690 records to begin.
24 records remaining.
          name sex  year_count  year_min  year_max   pct_sum   pct_max
64911    Casey   M         120      1888      2013  5.999303  0.210989
3021     Casey   F          71      1921      2013  4.294258  0.206064
863    Unknown   F         118      1886      2013  0.563948  0.033599
64703  Unknown   M         131      1880      2013  0.693920  0.026963
64269     Aron   M         134      1880      2013  1.330348  0.020925
4806   Tangela   F          54      1956      2011  0.226490  0.012276
5139      Abra   F          52      1955      2012  0.050814  0.003892
65739    Lucky   M          98      1912      2013  0.171647  0.003567
68408   Durant   M          48      1914      2013  0.024203  0.001833
2555   Roselia   F          78      1908      2013  0.046634  0.001795
--------------------
Dataframe yob filtered to those that match list (count only)
1792091 records to begin.
1116 records remaining.
--------------------
Head of total matching list per year, female
      births       pct
year                  
1886       5  0.003459
1889       8  0.004485
1890       7  0.003677
1894       8  0.003589
1896       6  0.002522

Tail of dataframe:
            name sex  births  year       pct   ranked  temp
1781425    Lucky   M      33  2013  0.001763   3278.5     1
1784646  Thunder   M      13  2013  0.000695   6375.0     1
1787485     Onix   M       8  2013  0.000427   9152.0     1
1789150   Durant   M       6  2013  0.000321  11332.0     1
1792004    Yadon   M       5  2013  0.000267  12995.0     1
Tail of dataframe:
            name sex  births  year       pct   ranked  temp
1759744    Casey   F     373  2013  0.021478    726.0     1
1761965  Unknown   F      57  2013  0.003282   2934.0     1
1767797    Lucky   F      13  2013  0.000749   8724.5     1
1769055  Roselia   F      11  2013  0.000633   9843.5     1
1769230   Amaura   F      10  2013  0.000576  10526.0     1

In [2]:

print names_listed.name.unique()

['Casey' 'Unknown' 'Aron' 'Tangela' 'Abra' 'Lucky' 'Durant' 'Roselia'
 'Thunder' 'Windie' 'Paras' 'Kimori' 'Amaura' 'Onix' 'Eevee' 'Hassam'
 'Kameil' 'Yadon' 'Lizardo' 'Sand']

Refine list¶

Remove Casey, Aron, Tangela and Unknown.

In [3]:

cutoffn = 0
# how many names will remain to evaluate after duplicates removed

from collections import OrderedDict
evallistm = OrderedDict()
evallistf = OrderedDict()

# remove names with more common duplicates in other sex
# this happens frequently in ssa db

for name in listed_m:
    try:
        pctf = names_listed[(names_listed.sex == 'F') & 
                            (names_listed.name == name)].pct_max.iloc[0]
        pctm = names_listed[(names_listed.sex == 'M') & 
                            (names_listed.name == name)].pct_max.iloc[0]
    except:
        pctf = 98
        pctm = 99
    if (name not in names_listed[names_listed.sex == 'F'].name.unique() or
        pctf < pctm):
        evallistm[name] = ''
        
for name in listed_f:
    try:
        pctf = names_listed[(names_listed.sex == 'F') & 
                            (names_listed.name == name)].pct_max.iloc[0]
        pctm = names_listed[(names_listed.sex == 'M') & 
                            (names_listed.name == name)].pct_max.iloc[0]
    except:
        pctf = 99
        pctm = 98
    if (name not in names_listed[names_listed.sex == 'M'].name.unique() or
        pctm < pctf):
        evallistf[name] = ''
        
if cutoffn > 0:
    assert len(evallistm) > cutoffn
    assert len(evallistf) > cutoffn
    print evallistm[:cutoffn]
    print evallistf[:cutoffn]
else:
    print 'Length of lists: %d male, %d female\n' % (len(evallistm), len(evallistf))
    print evallistm
    print ' '
    print evallistf

Length of lists: 10 male, 10 female

OrderedDict([('Aron', ''), ('Casey', ''), ('Lucky', ''), ('Durant', ''), ('Paras', ''), ('Thunder', ''), ('Onix', ''), ('Yadon', ''), ('Lizardo', ''), ('Hassam', '')])
 
OrderedDict([('Unknown', ''), ('Roselia', ''), ('Tangela', ''), ('Abra', ''), ('Windie', ''), ('Amaura', ''), ('Kimori', ''), ('Kameil', ''), ('Sand', ''), ('Eevee', '')])

In [4]:

#manually copy and paste the above lists and assign 
#'acc' or 'rej' individually to accept or reject

evallistm = OrderedDict([('Aron', 'rej'), ('Casey', 'rej'), ('Lucky', 'acc'), 
                         ('Durant', 'acc'), ('Paras', 'acc'), ('Thunder', 'acc'), 
                         ('Onix', 'acc'), ('Yadon', 'acc'), ('Lizardo', 'acc'), 
                         ('Hassam', 'acc')])
evallistf = OrderedDict([('Unknown', 'rej'), ('Roselia', 'acc'), ('Tangela', 'rej'), 
                         ('Abra', 'acc'), ('Windie', 'acc'), ('Amaura', 'acc'), 
                         ('Kimori', 'acc'), ('Kameil', 'acc'), ('Sand', 'acc'), 
                         ('Eevee', 'acc')])

# Note, Demeter and Nike taken from males' list and moved to females'; for some reason it spiked in males higher than in females
# Similarly, Saturn moved from females' to males'

# Test that all names have 'acc' or 'rej' values

final_m = []
final_f = []

names_not_validated = []
for item in evallistm:
    if evallistm[item] not in ['acc', 'rej']:
        names_not_validated.append(item)
    elif evallistm[item] == 'acc':
        final_m.append(item)
for item in evallistf:
    if evallistf[item] not in ['acc', 'rej']:
        names_not_validated.append(item)
    elif evallistf[item] == 'acc':
        final_f.append(item)
        
final_all = final_m + final_f

if len(names_not_validated) > 0:
    print "The following names do not have 'acc' or 'rej' values: ", names_not_validated
    raise exception("Names not validated")
    
print 'Accepted male names:', final_m
print 'Accepted female names:', final_f

print 'Length: %d male, %d female\n' % (len(final_m), len(final_f))

cutmin = min(len(final_m), len(final_f))

final_m = final_m[:cutmin]
final_f = final_f[:cutmin]

print 'After resizing to %d names each:' % (cutmin)
print 'Accepted male names:', final_m
print 'Accepted female names:', final_f

Accepted male names: ['Lucky', 'Durant', 'Paras', 'Thunder', 'Onix', 'Yadon', 'Lizardo', 'Hassam']
Accepted female names: ['Roselia', 'Abra', 'Windie', 'Amaura', 'Kimori', 'Kameil', 'Sand', 'Eevee']
Length: 8 male, 8 female

After resizing to 8 names each:
Accepted male names: ['Lucky', 'Durant', 'Paras', 'Thunder', 'Onix', 'Yadon', 'Lizardo', 'Hassam']
Accepted female names: ['Roselia', 'Abra', 'Windie', 'Amaura', 'Kimori', 'Kameil', 'Sand', 'Eevee']

In [5]:

%run download_and_process.py

# reduce names dataframe to those matching list
# print '--------------------\nDataframe names filtered to those that match list'
# print "%d records to begin." % (len(names))
names_listed = names[((names.name.isin(final_m) & (names.sex == 'M')) |
                      (names.name.isin(final_f) & (names.sex == 'F')) )].copy()
names_listed.sort('pct_max', ascending=False, inplace=True)
# print "%d records remaining." % (len(names_listed))
# listed_in_df = list(names_listed.name)
# print names_listed.head(10)
# listed_m = list(names[(names.sex == 'M') & (names.name.isin(final_m))]['name'])
# listed_f = list(names[(names.sex == 'F') & (names.name.isin(final_f))]['name'])

#reduce yob dataframe to those matching list
print '--------------------\nDataframe yob filtered to those that match list (count only)'
print "%d records to begin." % (len(yob))
yob_listed_m = yob[(yob.name.isin(final_m)) & (yob.sex == 'M')].copy()
yob_listed_m.sort(['year', 'sex', 'name'], ascending=False, inplace=True)
yob_listed_f = yob[(yob.name.isin(final_f)) & (yob.sex == 'F')].copy()
yob_listed_f.sort(['year', 'sex', 'name'], ascending=False, inplace=True)
print "%d records remaining." % (len(yob_listed))

# m and f totals
yob_listed_f_agg = pd.DataFrame(yob_listed_f.groupby('year').sum())[['births', 'pct']]
yob_listed_m_agg = pd.DataFrame(yob_listed_m.groupby('year').sum())[['births', 'pct']]
print '--------------------\nHead of total matching list per year, female'
print yob_listed_f_agg.head()

# print chart of m and f totals
print '\n'

# function to determine a nice y-axis limit a little above the maximum value
# rounds maximum y up to second-most-significant digit
def determine_y_limit(x):                      
    significance = int(math.floor((math.log10(x))))
    val = math.floor(x / (10 ** (significance - 1))) + 1
    val = val * (10 ** (significance - 1))
    return val

#data
xf = list(yob_listed_f_agg.index)
xm = list(yob_listed_m_agg.index)

plt.figure(figsize=(16,9))
plt.plot(xf, list(yob_listed_f_agg.pct), color="red")
plt.plot(xm, list(yob_listed_m_agg.pct), color="blue")

plt.ylim(0, determine_y_limit(max(list(yob_listed_f_agg.pct)
                                  +list(yob_listed_m_agg.pct))))
plt.xlim(1880, 2013)

plt.title('boy=blue, girl=red', fontsize = 20)
plt.xlabel("Year", fontsize = 14)
plt.ylabel("% of total births of that sex", fontsize = 14)

plt.show()

Data already downloaded.
Data already extracted.
Reading from pickle.
Tail of dataframe 'yob':
           name sex  births  year       pct  ranked
1792086  Zyhier   M       5  2013  0.000267   12995
1792087   Zylar   M       5  2013  0.000267   12995
1792088  Zymari   M       5  2013  0.000267   12995
1792089  Zymeer   M       5  2013  0.000267   12995
1792090   Zyree   M       5  2013  0.000267   12995

Tail of dataframe 'names':
                 name sex  year_count  year_min  year_max   pct_sum   pct_max
102685          Gross   M           1      1925      1925  0.000538  0.000538
102686           Elik   M           1      2012      2012  0.000318  0.000318
102687  Patrickjoseph   M           1      1998      1998  0.000262  0.000262
102688       Southern   M           1      1923      1923  0.000547  0.000547
102689           Jeon   M           1      1999      1999  0.000261  0.000261

Tail of dataframe 'years':
    year  births_f  births_m  births_t  new_names  unique_names_x    sexratio  \
68  2008   1886765   2035811   3922576       2046           32483  107.899553   
69  2009   1832276   1978582   3810858       1789           32210  107.984932   
70  2010   1771846   1912915   3684761       1635           31593  107.961696   
71  2011   1752198   1891800   3643998       1539           31412  107.967250   
72  2012   1751866   1886972   3638838       1531           31212  107.712120   

    unique_names_y_x  unique_names_x  unique_names_y_x  unique_names_x  \
68             32483           32483             32483           32483   
69             32210           32210             32210           32210   
70             31593           31593             31593           31593   
71             31412           31412             31412           31412   
72             31212           31212             31212           31212   

    unique_names_y_x  unique_names_x  unique_names_y_x  unique_names  
68             32483           32483             32483         32483  
69             32210           32210             32210         32210  
70             31593           31593             31593         31593  
71             31412           31412             31412         31412  
72             31212           31212             31212         31212  
--------------------
Dataframe yob filtered to those that match list (count only)
1792091 records to begin.
1116 records remaining.
--------------------
Head of total matching list per year, female
      births       pct
year                  
1908       6  0.001795
1909       6  0.001728
1911       5  0.001195
1913       6  0.000961
1914       5  0.000657

In [6]:

# all names_listed, so we can see which ones to aggregate
# cutoff of 10 already done

print names_listed[names_listed.sex == 'M'].head(50)
print ''
print names_listed[names_listed.sex == 'F'].head(50)

          name sex  year_count  year_min  year_max   pct_sum   pct_max
65739    Lucky   M          98      1912      2013  0.171647  0.003567
68408   Durant   M          48      1914      2013  0.024203  0.001833
71899  Thunder   M          28      1975      2013  0.017100  0.000990
71211    Paras   M          32      1975      2011  0.013356  0.000731
72773     Onix   M          25      1956      2013  0.009148  0.000561
91673   Hassam   M           2      1994      2000  0.000617  0.000363
88031    Yadon   M           4      2010      2013  0.001163  0.000318
90614  Lizardo   M           2      1970      2010  0.000530  0.000269

          name sex  year_count  year_min  year_max   pct_sum   pct_max
5139      Abra   F          52      1955      2012  0.050814  0.003892
2555   Roselia   F          78      1908      2013  0.046634  0.001795
19024   Windie   F          17      1961      1982  0.008654  0.000892
27879   Kimori   F           9      2002      2011  0.003804  0.000583
26782   Amaura   F          10      2000      2013  0.003863  0.000576
48704    Eevee   F           2      2012      2013  0.000745  0.000457
43647   Kameil   F           2      2009      2012  0.000615  0.000342
46206     Sand   F           2      1958      1960  0.000496  0.000249

Values before WWII are HEAVILY waited towards white middle-class, so refine these graphs to show 1945-present¶

In [7]:

names = final_f[:10]
sexes = ['F'] # can be length 1 or same length as names

yearstart=1880 # for data, not graph
yearend=2013

xmin = 1940

start = time.time()
df_chart = yob.copy()
if len(sexes) == 1:
    sexes = sexes * len(names)
    
df_chart = df_chart[df_chart['name'].isin(names)]   

df_chart['temp'] = 0
for row in range(len(df_chart)):
    for pos in range(len(names)):
        if df_chart.name.iloc[row] == names[pos] and df_chart.sex.iloc[row] == sexes[pos]:
            df_chart.temp.iloc[row] = 1
df_chart = df_chart[df_chart.temp == 1]


#To keep more than one data set for charts in memory, change name of chart_1

chart_1 = pd.DataFrame(pd.pivot_table(df_chart, values='pct', index = 'year', columns=['name', 'sex']))

col = chart_1.columns[0]

for yr in range(yearstart, yearend+1): #inserts missing years
    if yr not in chart_1.index:
        #chart_1[col][yr] = 0.0
        chart_1 = chart_1.append(pd.DataFrame(index=[yr], columns=[col], data=[0.0]))

chart_1 = chart_1.fillna(0)

chart_1.sort(inplace=True, ascending=True)

#a single function to make the four different kinds of charts

def make_chart(df=chart_1, form='line', title='', colors= [], smoothing=0, \
               groupedlist = [], baseline='sym', png_name=''):
    
    dataframe = df.copy()
    
    startyear = min(list(dataframe.index))
    endyear = max(list(dataframe.index))
    yearstr = '%d-%d' % (startyear, endyear)
    
    legend_size = 0.01
    
    has_male = False
    has_female = False
    has_both = False
    max_y = 0
    for name, sex in dataframe.columns:
        max_y = max(max_y, dataframe[(name, sex)].max())
        final_name = name
        if sex == 'M': has_male = True
        if sex == 'F': has_female = True
        if smoothing > 0:
            newvalues = []
            for row in range(len(dataframe)):
                start = max(0, row - smoothing)
                end = min(len(dataframe) - 1, row + smoothing)
                newvalues.append(dataframe[(name, sex)].iloc[start:end].mean())
            for row in range(len(dataframe)):
                dataframe[(name, sex)].iloc[row] = newvalues[row]
    if has_male and has_female:
        y_text = "% of births of indicated sex"
        has_both = True
    elif has_male:
        y_text = "Percent of male births"
    else:
        y_text = "Percent of female births"
    
    num_series = len(dataframe.columns)
    
    if colors == []:
       colors = ["#1f78b4","#ae4ec9","#33a02c","#fb9a99","#e31a1c","#a6cee3",
                 "#fdbf6f","#ff7f00","#cab2d6","#6a3d9a","#ffff99","#b15928"]
        #colors = ['#ff0000', '#b00000', '#870000', '#550000', '#e4e400', '#baba00', '#878700', '#545400', '#00ff00', '#00b000', '#008700', '#005500', '#00ffff', '#00b0b0', '#008787', '#005555', '#b0b0ff', '#8484ff', '#4949ff', '#0000ff', '#ff00ff', '#b000b0', '#870087', '#550055', '#e4e4e4', '#bababa', '#878787', '#545454']
    from random import shuffle
    shuffle(colors)
    num_colors = len(colors)
    
    if num_series > num_colors:
        print "Warning: colors will be repeated."
    
    if title == '':
        if num_series == 1:
            title = "Popularity of baby name %s in U.S., %s" % (final_name, yearstr)
        else:
            title = "Popularity of baby names in U.S., %s" % (yearstr)
    
    x_values = range(startyear, endyear + 1)
    y_zeroes = [0] * (endyear - startyear)
    
    if form == 'line':
        fig, ax = plt.subplots(num=None, figsize=(16, 9), dpi=300, facecolor='w', edgecolor='w')
        counter = 0
        for name, sex in dataframe.columns:
            color = colors[counter % num_colors]
            counter += 1
            if has_both:
                label = "%s (%s)" % (name, sex)
            else:
                label = name
            ax.plot(x_values, dataframe[(name, sex)], label=label, color=color, linewidth = 3)
        ax.set_ylim(0,determine_y_limit(max_y)) 
        ax.set_xlim(xmin, endyear)
        ax.set_ylabel(y_text, size = 13)
        box = ax.get_position()
        ax.set_position([box.x0, box.y0 + box.height * legend_size,
                 box.width, box.height * (1 - legend_size)])
        legend_cols = min(5, num_series)
        ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05), fancybox=True, shadow=True, ncol=legend_cols)

    if form == 'subplots_auto':
        counter = 0
        fig, axes = plt.subplots(num_series, 1, figsize=(12, 3.5*num_series))
        print 'Maximum alpha: %d percent' % (determine_y_limit(max_y))
        for name, sex in dataframe.columns:
            if sex=='M':
                sex_label = 'male'
            else:
                sex_label = 'female'
            label = "Percent of %s births for %s" % (sex_label, name)
            current_ymax = dataframe[(name, sex)].max()
            tint = 1.0 * current_ymax / determine_y_limit(max_y)
            axes[counter].plot(x_values, dataframe[(name, sex)], color='k')
            axes[counter].set_ylim(0,determine_y_limit(current_ymax))
            axes[counter].set_xlim(xmin, endyear)
            axes[counter].fill_between(x_values, dataframe[(name, sex)], color=colors[0], alpha=tint, interpolate=True)

            axes[counter].set_ylabel(label, size=11)
            plt.subplots_adjust(hspace=0.1)
            counter += 1
            
    if form == 'subplots_same':
        counter = 0
        fig, axes = plt.subplots(num_series, 1, figsize=(12, 3.5*num_series))
        print 'Maximum y axis: %d percent' % (determine_y_limit(max_y))
        for name, sex in dataframe.columns:
            if sex=='M':
                sex_label = 'male'
            else:
                sex_label = 'female'
            label = "Percent of %s births for %s" % (sex_label, name)
            axes[counter].plot(x_values, dataframe[(name, sex)], color='k')
            axes[counter].set_ylim(0,determine_y_limit(max_y))
            axes[counter].set_xlim(xmin, endyear)
            axes[counter].fill_between(x_values, dataframe[(name, sex)], color=colors[1], alpha=1, interpolate=True)
            axes[counter].set_ylabel(label, size=11)
            plt.subplots_adjust(hspace=0.1)
            counter += 1
        
    if form == 'stream':
        plt.figure(num=None, figsize=(20,16.67), dpi=150, facecolor='w', edgecolor='k')
        plt.title(title, size=17)        
        plt.xlim(xmin, endyear)
        
        if has_both:
            yaxtext = 'Percent of births of indicated sex (scale: '
        elif has_male:
            yaxtext = 'Percent of male births (scale: '
        else:
            yaxtext = 'Percent of female births (scale: '
        
        scale = str(determine_y_limit(max_y)) + ')'
        yaxtext += scale
        plt.ylabel(yaxtext, size=13)
        polys = plt.stackplot(x_values, *[dataframe[(name, sex)] for name, sex in dataframe.columns], 
                                 colors=colors, baseline=baseline)
        legendProxies = []
        for poly in polys:
            legendProxies.append(plt.Rectangle((0, 0), 1, 1, fc=poly.get_facecolor()[0]))
        namelist = []
        for name, sex in dataframe.columns:
            if has_both:
                namelist.append('%s (%s)' % (name, sex))
            else:
                namelist.append(name)
        plt.legend(legendProxies, namelist, loc=3, ncol=2)
        
        plt.tick_params(\
            axis='y',          
            which='both',      #  major and minor ticks 
            left='off',      
            right='off',       
            labelleft='off')
        
    plt.show()   
    if png_name != '':
        filename = save_path + "/" + png_name + ".png"
        plt.savefig(filename)
    plt.close()
    
#stream graph

make_chart(df=chart_1,
           form='stream', # line , subplots_auto , subplots_same , stream
           title='',
           colors= [],
           smoothing=0,
           baseline='sym',  # zero ,  sym ,  wiggle ,  weighted_wiggle
           png_name = '',  # if '', will not be saved
           )

In [8]:

names = final_m[:10]
sexes = ['M'] # can be length 1 or same length as names

yearstart=1880 # for data, not graph
yearend=2013

xmin = 1940

start = time.time()
df_chart = yob.copy()
if len(sexes) == 1:
    sexes = sexes * len(names)
    
df_chart = df_chart[df_chart['name'].isin(names)]   

df_chart['temp'] = 0
for row in range(len(df_chart)):
    for pos in range(len(names)):
        if df_chart.name.iloc[row] == names[pos] and df_chart.sex.iloc[row] == sexes[pos]:
            df_chart.temp.iloc[row] = 1
df_chart = df_chart[df_chart.temp == 1]


#To keep more than one data set for charts in memory, change name of chart_1

chart_1 = pd.DataFrame(pd.pivot_table(df_chart, values='pct', index = 'year', columns=['name', 'sex']))

col = chart_1.columns[0]

for yr in range(yearstart, yearend+1): #inserts missing years
    if yr not in chart_1.index:
        #chart_1[col][yr] = 0.0
        chart_1 = chart_1.append(pd.DataFrame(index=[yr], columns=[col], data=[0.0]))

chart_1 = chart_1.fillna(0)

chart_1.sort(inplace=True, ascending=True)

#a single function to make the four different kinds of charts

def make_chart(df=chart_1, form='line', title='', colors= [], smoothing=0, \
               groupedlist = [], baseline='sym', png_name=''):
    
    dataframe = df.copy()
    
    startyear = min(list(dataframe.index))
    endyear = max(list(dataframe.index))
    yearstr = '%d-%d' % (startyear, endyear)
    
    legend_size = 0.01
    
    has_male = False
    has_female = False
    has_both = False
    max_y = 0
    for name, sex in dataframe.columns:
        max_y = max(max_y, dataframe[(name, sex)].max())
        final_name = name
        if sex == 'M': has_male = True
        if sex == 'F': has_female = True
        if smoothing > 0:
            newvalues = []
            for row in range(len(dataframe)):
                start = max(0, row - smoothing)
                end = min(len(dataframe) - 1, row + smoothing)
                newvalues.append(dataframe[(name, sex)].iloc[start:end].mean())
            for row in range(len(dataframe)):
                dataframe[(name, sex)].iloc[row] = newvalues[row]
    if has_male and has_female:
        y_text = "% of births of indicated sex"
        has_both = True
    elif has_male:
        y_text = "Percent of male births"
    else:
        y_text = "Percent of female births"
    
    num_series = len(dataframe.columns)
    
    if colors == []:
       colors = ["#1f78b4","#ae4ec9","#33a02c","#fb9a99","#e31a1c","#a6cee3",
                 "#fdbf6f","#ff7f00","#cab2d6","#6a3d9a","#ffff99","#b15928"]
        #colors = ['#ff0000', '#b00000', '#870000', '#550000', '#e4e400', '#baba00', '#878700', '#545400', '#00ff00', '#00b000', '#008700', '#005500', '#00ffff', '#00b0b0', '#008787', '#005555', '#b0b0ff', '#8484ff', '#4949ff', '#0000ff', '#ff00ff', '#b000b0', '#870087', '#550055', '#e4e4e4', '#bababa', '#878787', '#545454']
    from random import shuffle
    shuffle(colors)
    num_colors = len(colors)
    
    if num_series > num_colors:
        print "Warning: colors will be repeated."
    
    if title == '':
        if num_series == 1:
            title = "Popularity of baby name %s in U.S., %s" % (final_name, yearstr)
        else:
            title = "Popularity of baby names in U.S., %s" % (yearstr)
    
    x_values = range(startyear, endyear + 1)
    y_zeroes = [0] * (endyear - startyear)
    
    if form == 'line':
        fig, ax = plt.subplots(num=None, figsize=(16, 9), dpi=300, facecolor='w', edgecolor='w')
        counter = 0
        for name, sex in dataframe.columns:
            color = colors[counter % num_colors]
            counter += 1
            if has_both:
                label = "%s (%s)" % (name, sex)
            else:
                label = name
            ax.plot(x_values, dataframe[(name, sex)], label=label, color=color, linewidth = 3)
        ax.set_ylim(0,determine_y_limit(max_y)) 
        ax.set_xlim(xmin, endyear)
        ax.set_ylabel(y_text, size = 13)
        box = ax.get_position()
        ax.set_position([box.x0, box.y0 + box.height * legend_size,
                 box.width, box.height * (1 - legend_size)])
        legend_cols = min(5, num_series)
        ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05), fancybox=True, shadow=True, ncol=legend_cols)

    if form == 'subplots_auto':
        counter = 0
        fig, axes = plt.subplots(num_series, 1, figsize=(12, 3.5*num_series))
        print 'Maximum alpha: %d percent' % (determine_y_limit(max_y))
        for name, sex in dataframe.columns:
            if sex=='M':
                sex_label = 'male'
            else:
                sex_label = 'female'
            label = "Percent of %s births for %s" % (sex_label, name)
            current_ymax = dataframe[(name, sex)].max()
            tint = 1.0 * current_ymax / determine_y_limit(max_y)
            axes[counter].plot(x_values, dataframe[(name, sex)], color='k')
            axes[counter].set_ylim(0,determine_y_limit(current_ymax))
            axes[counter].set_xlim(xmin, endyear)
            axes[counter].fill_between(x_values, dataframe[(name, sex)], color=colors[0], alpha=tint, interpolate=True)

            axes[counter].set_ylabel(label, size=11)
            plt.subplots_adjust(hspace=0.1)
            counter += 1
            
    if form == 'subplots_same':
        counter = 0
        fig, axes = plt.subplots(num_series, 1, figsize=(12, 3.5*num_series))
        print 'Maximum y axis: %d percent' % (determine_y_limit(max_y))
        for name, sex in dataframe.columns:
            if sex=='M':
                sex_label = 'male'
            else:
                sex_label = 'female'
            label = "Percent of %s births for %s" % (sex_label, name)
            axes[counter].plot(x_values, dataframe[(name, sex)], color='k')
            axes[counter].set_ylim(0,determine_y_limit(max_y))
            axes[counter].set_xlim(xmin, endyear)
            axes[counter].fill_between(x_values, dataframe[(name, sex)], color=colors[1], alpha=1, interpolate=True)
            axes[counter].set_ylabel(label, size=11)
            plt.subplots_adjust(hspace=0.1)
            counter += 1
        
    if form == 'stream':
        plt.figure(num=None, figsize=(20,10), dpi=150, facecolor='w', edgecolor='k')
        plt.title(title, size=17)        
        plt.xlim(xmin, endyear)
        
        if has_both:
            yaxtext = 'Percent of births of indicated sex (scale: '
        elif has_male:
            yaxtext = 'Percent of male births (scale: '
        else:
            yaxtext = 'Percent of female births (scale: '
        
        scale = str(determine_y_limit(max_y)) + ')'
        yaxtext += scale
        plt.ylabel(yaxtext, size=13)
        polys = plt.stackplot(x_values, *[dataframe[(name, sex)] for name, sex in dataframe.columns], 
                                 colors=colors, baseline=baseline)
        legendProxies = []
        for poly in polys:
            legendProxies.append(plt.Rectangle((0, 0), 1, 1, fc=poly.get_facecolor()[0]))
        namelist = []
        for name, sex in dataframe.columns:
            if has_both:
                namelist.append('%s (%s)' % (name, sex))
            else:
                namelist.append(name)
        plt.legend(legendProxies, namelist, loc=3, ncol=2)
        
        plt.tick_params(\
            axis='y',          
            which='both',      #  major and minor ticks 
            left='off',      
            right='off',       
            labelleft='off')
        
    plt.show()   
    if png_name != '':
        filename = save_path + "/" + png_name + ".png"
        plt.savefig(filename)
    plt.close()
    
#stream graph

make_chart(df=chart_1,
           form='stream', # line , subplots_auto , subplots_same , stream
           title='',
           colors= [],
           smoothing=0,
           baseline='sym',  # zero ,  sym ,  wiggle ,  weighted_wiggle
           png_name = '',  # if '', will not be saved
           )

In [ ]: