Notebook

Baby names iPython notebooks¶

By David Taylor, www.prooffreader.com
using data from United States Social Security Administration
I am making this public to give a head start to those who want to explore this dataset, so they don't have to download and format the data and the python objects used to do preliminary analysis. Please let me know if you find this helpful!

Find singletons¶

The database only shows names that appear at least five times in a given year. This script will find names that appeared only in a single year.

Set working path, and import libraries and read dataframe pickles¶

In [1]:

last_year = 2013 #change this when Social Security database is updated
save_path = "user_singletons" # files created by this notebook will be saved in this directory

import time
import os
if not os.path.isdir(save_path): # creates path if it does not exist
    os.makedirs(save_path)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#import seaborn # comment out if you don't have it, but it makes good-looking charts
%run download_and_process.py

# used to round limit of y axis up to second-most-significant digit
def determine_y_limit(x):                      
    significance = int(floor((log10(x))))
    val = floor(x / (10 ** (significance - 1))) + 1
    val = val * (10 ** (significance - 1))
    return val

Data already downloaded.
Data already extracted.
Reading from pickle.
Tail of dataframe 'yob':
           name sex  births  year       pct  ranked
1792086  Zyhier   M       5  2013  0.000267   12995
1792087   Zylar   M       5  2013  0.000267   12995
1792088  Zymari   M       5  2013  0.000267   12995
1792089  Zymeer   M       5  2013  0.000267   12995
1792090   Zyree   M       5  2013  0.000267   12995

Tail of dataframe 'names':
                 name sex  year_count  year_min  year_max   pct_sum   pct_max
102685          Gross   M           1      1925      1925  0.000538  0.000538
102686           Elik   M           1      2012      2012  0.000318  0.000318
102687  Patrickjoseph   M           1      1998      1998  0.000262  0.000262
102688       Southern   M           1      1923      1923  0.000547  0.000547
102689           Jeon   M           1      1999      1999  0.000261  0.000261

Tail of dataframe 'years':
     year  births_f  births_m  births_t  new_names  unique_names    sexratio
128  2008   1886765   2035811   3922576       2046         32483  107.899553
129  2009   1832276   1978582   3810858       1789         32210  107.984932
130  2010   1771846   1912915   3684761       1635         31593  107.961696
131  2011   1752198   1891800   3643998       1539         31412  107.967250
132  2012   1751866   1886972   3638838       1531         31212  107.712120

In [2]:

df_oneyear = names[names.year_min == names.year_max]
df_oneyear = df_oneyear[['name', 'sex', 'year_min', 'pct_max']]
df_oneyear.columns = ['name', 'sex', 'year', 'pct']

In [4]:

oneyearnames = list(df_oneyear.name.unique())
yobcopy = yob.copy()
yobcopy = yobcopy[yobcopy.name.isin(oneyearnames)]

In [5]:

df_oneyear['births'] = 0
for i in range(len(df_oneyear)):
    df_oneyear.births.iloc[i] = yobcopy[(yobcopy.name == df_oneyear.name.iloc[i]) & 
                                        (yobcopy.sex == df_oneyear.sex.iloc[i])].births.iloc[0]

---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-5-a26a49c576f1> in <module>()
      1 df_oneyear['births'] = 0
      2 for i in range(len(df_oneyear)):
----> 3     df_oneyear.births.iloc[i] = yobcopy[(yobcopy.name == df_oneyear.name.iloc[i]) & 
      4                                         (yobcopy.sex == df_oneyear.sex.iloc[i])].births.iloc[0]

C:\Users\David\Anaconda\lib\site-packages\pandas\core\ops.pyc in wrapper(self, other)
    553         else:
    554 
--> 555             mask = isnull(self)
    556 
    557             values = self.values

C:\Users\David\Anaconda\lib\site-packages\pandas\core\common.pyc in isnull(obj)
    127     pandas.notnull: boolean inverse of pandas.isnull
    128     """
--> 129     return _isnull(obj)
    130 
    131 

C:\Users\David\Anaconda\lib\site-packages\pandas\core\common.pyc in _isnull_new(obj)
    137         raise NotImplementedError("isnull is not defined for MultiIndex")
    138     elif isinstance(obj, (ABCSeries, np.ndarray)):
--> 139         return _isnull_ndarraylike(obj)
    140     elif isinstance(obj, ABCGeneric):
    141         return obj._constructor(obj._data.isnull(func=isnull))

C:\Users\David\Anaconda\lib\site-packages\pandas\core\common.pyc in _isnull_ndarraylike(obj)
    225     if isinstance(obj, ABCSeries):
    226         from pandas import Series
--> 227         result = Series(result, index=obj.index, name=obj.name, copy=False)
    228 
    229     return result

C:\Users\David\Anaconda\lib\site-packages\pandas\core\series.pyc in __init__(self, data, index, dtype, name, copy, fastpath)
    229                                        raise_cast_failure=True)
    230 
--> 231                 data = SingleBlockManager(data, index, fastpath=True)
    232 
    233         generic.NDFrame.__init__(self, data, fastpath=True)

C:\Users\David\Anaconda\lib\site-packages\pandas\core\internals.pyc in __init__(self, block, axis, do_integrity_check, fastpath)
   2999         if fastpath:
   3000             self.axes = [axis]
-> 3001             if isinstance(block, list):
   3002 
   3003                 # empty block

KeyboardInterrupt:

In [5]:

df_oneyear.to_pickle(save_path+'/df_oneyear.pickle')

In [3]:

df_oneyear = pd.read_pickle(save_path+'/df_oneyear.pickle')

---------------------------------------------------------------------------
IOError                                   Traceback (most recent call last)
<ipython-input-3-0fd440f6d226> in <module>()
----> 1 df_oneyear = pd.read_pickle(save_path+'/df_oneyear.pickle')

C:\Users\David\Anaconda\lib\site-packages\pandas\io\pickle.pyc in read_pickle(path)
     59 
     60     try:
---> 61         return try_read(path)
     62     except:
     63         if PY3:

C:\Users\David\Anaconda\lib\site-packages\pandas\io\pickle.pyc in try_read(path, encoding)
     55             # compat pickle
     56             except:
---> 57                 with open(path, 'rb') as fh:
     58                     return pc.load(fh, encoding=encoding, compat=True)
     59 

IOError: [Errno 2] No such file or directory: 'user_singletons/df_oneyear.pickle'

In [6]:

df_oneyear.sort('births', inplace=True, ascending=False)
df_oneyear.head(50)

Out[6]:

	name	sex	year	pct	births

In [5]:

%matplotlib inline
dictionary = {0:1000, 1:20, 2:15, 3:0, 4:5}
xmax = df_oneyear.year.max()
plt.figure() # <- makes a new figure and sets it active (add this)
plt.hist(list(df_oneyear.year),xmax) # <- finds the current active axes/figure and plots to it
plt.title('Histogram of names appearing only once in the Social Security database') 
plt.xlabel('Year')
plt.ylabel('Number of names')
plt.show()

In [18]:

#determine what percentage of births were single-appearances for each year
df_oygrt = pd.DataFrame(df_oneyear.groupby('year').births.sum())
df_oygrm = pd.DataFrame(df_oneyear[df_oneyear.sex == 'M'].groupby('year').births.sum())
df_oygrf = pd.DataFrame(df_oneyear[df_oneyear.sex == 'F'].groupby('year').births.sum())
df_oygrt['pct'] = 0.0
df_oygrm['pct'] = 0.0
df_oygrf['pct'] = 0.0
for i in range(len(df_oygrt)):
    df_oygrt.pct.iloc[i] = df_oygrt.births.iloc[i] * 1.0 / 

In [19]:

df_oygr.head()

Out[19]:

	births
year
1880	5
1881	14
1885	15
1886	5
1887	5