In [1]:

import pandas as pd
from pandas import DataFrame, Series
import urllib2
import zipfile
import os

def download_name_data():
    """Download the SSA baby-names archive and save it as ``names.zip``.

    The archive is fetched from www.ssa.gov, read fully into memory and
    written unmodified to ``names.zip`` in the current working directory.
    Nothing is caught here, so network and file errors propagate to the
    caller.
    """
    name_data_url = "http://www.ssa.gov/oact/babynames/names.zip"
    response = urllib2.urlopen(name_data_url)
    try:
        zip_data = response.read()
    finally:
        # Release the HTTP connection even when the read fails.
        response.close()
    # The handle is deliberately not called "zipfile": that name would shadow
    # the imported zipfile module inside this function.
    with open("names.zip", "wb") as out_file:
        out_file.write(zip_data)
    print("Successfully saved names.zip")

def extract_zip(zip_file, output_dir):
    """Extract every file stored in ``zip_file`` into ``output_dir``.

    Parameters:
        zip_file: path (or file-like object) handed to ``zipfile.ZipFile``.
        output_dir: directory to extract into; created, including missing
            parent directories, when it does not exist yet.

    Raises:
        OSError: if ``output_dir`` cannot be created and is not already a
            directory.
        ValueError: if an archive member would be written outside
            ``output_dir`` (for example a name containing ``../``). All
            members are checked before any file is written.
    """
    zfile = zipfile.ZipFile(zip_file)
    try:
        # An already-existing directory is fine; any other failure
        # (permissions, a regular file in the way, ...) must not be hidden.
        try:
            os.makedirs(output_dir)
        except OSError:
            if not os.path.isdir(output_dir):
                raise

        output_root = os.path.abspath(output_dir)
        # The trailing separator stops "/data/out" from matching "/data/out2".
        root_prefix = os.path.join(output_root, '')

        # First pass: resolve and validate every member so that a malicious
        # archive is rejected before anything touches the disk.
        targets = []
        for filename in zfile.namelist():
            if filename.endswith('/'):
                continue  # Directory entry; parents are created on demand.
            target = os.path.abspath(os.path.join(output_root, filename))
            if not target.startswith(root_prefix):
                raise ValueError(
                    "Refusing to extract %r outside of %s" % (filename, output_dir))
            targets.append((filename, target))

        # Second pass: write the files, creating nested directories as needed.
        for filename, target in targets:
            parent = os.path.dirname(target)
            if not os.path.isdir(parent):
                os.makedirs(parent)
            with open(target, 'wb') as outfile:
                outfile.write(zfile.read(filename))
    finally:
        zfile.close()
    print("Extracted %s to %s" % (zip_file, output_dir))

def parse_names_data(names_dir, first_year=1880, last_year=2012):
    """Load the per-year ``yobYYYY.txt`` files into one DataFrame.

    Parameters:
        names_dir: directory containing the ``yob<year>.txt`` files.
        first_year: first year to load (inclusive); defaults to 1880.
        last_year: last year to load (inclusive); defaults to 2012.

    Returns:
        A DataFrame with columns ``name``, ``sex``, ``births`` and ``year``,
        holding the rows of every requested year in ascending year order
        under a fresh 0..n-1 index.

    Raises:
        ValueError: if ``first_year`` is greater than ``last_year``.
    """
    if first_year > last_year:
        # pd.concat on an empty list fails with a far less helpful message.
        raise ValueError(
            "first_year (%d) must not be greater than last_year (%d)"
            % (first_year, last_year))

    parts = []
    for year in range(first_year, last_year + 1):
        path = os.path.join(names_dir, "yob%d.txt" % year)
        # The files carry no header row, so the column names are supplied here.
        frame = pd.read_csv(path, names=['name', 'sex', 'births'])
        # The year is only encoded in the file name; record it on every row.
        frame['year'] = year
        parts.append(frame)
    return pd.concat(parts, ignore_index=True)

In [2]:

# Download the archive only when it is not already on disk, so the notebook
# runs from a fresh checkout without re-fetching the file on every execution.
names_zip = "names.zip"
names_data_dir = "names_data"
if not os.path.exists(names_zip):
    download_name_data()
extract_zip(names_zip, names_data_dir)
names = parse_names_data(names_data_dir)

Extracted names.zip to names_data

In [3]:

# Sanity check: show the first five rows of the combined frame.
names.head(5)

Out[3]:

	name	sex	births	year
0	Mary	F	7065	1880
1	Anna	F	2604	1880
2	Emma	F	2003	1880
3	Elizabeth	F	1939	1880
4	Minnie	F	1746	1880

In [4]:

# Sanity check: show the last five rows of the combined frame.
names.tail(5)

Out[4]:

	name	sex	births	year
1758725	Zylin	M	5	2012
1758726	Zymari	M	5	2012
1758727	Zyrin	M	5	2012
1758728	Zyrus	M	5	2012
1758729	Zytaevius	M	5	2012

In [5]:

# All rows recorded for the exact name "Caleb".
names[names['name'] == 'Caleb']

Out[5]:

<class 'pandas.core.frame.DataFrame'>
Int64Index: 166 entries, 1268 to 1744599
Data columns (total 4 columns):
name      166  non-null values
sex       166  non-null values
births    166  non-null values
year      166  non-null values
dtypes: int64(2), object(2)

In [34]:

def total_number_of_births(name, data=None):
    """Return the total number of births recorded for ``name``.

    Parameters:
        name: name to look up; compared for exact equality with the
            ``name`` column.
        data: optional DataFrame with ``name`` and ``births`` columns.
            When omitted, the notebook-level ``names`` frame is used, which
            keeps existing one-argument calls working unchanged.

    Returns:
        The sum of ``births`` over all matching rows (0 when none match).
    """
    if data is None:
        # Fall back to the frame loaded earlier in the notebook.
        data = names
    matching_rows = data[data['name'] == name]
    return matching_rows['births'].sum()

# Tabulate the all-time totals for a hand-picked list of names.
print("Total number of people born with each name between 1880 and 2012:")
name_series = Series([
    'Caleb', 'Whitney', 'Hastin', 'Rafe', 'Dan',
    'Mary', 'Leah', 'Wesley', 'Ambria', 'Tradley',
])
births_list = [total_number_of_births(name) for name in name_series]

# One row per name, in the order listed above.
births_by_name_frame = DataFrame(
    {'births since 1880 in the US': births_list},
    index=name_series,
)
births_by_name_frame

Total number of people born with each name between 1880 and 2012:

Out[34]:

	births since 1880 in the US
Caleb	234304
Whitney	99268
Hastin	43
Rafe	1632
Dan	105847
Mary	4124778
Leah	194257
Wesley	198311
Ambria	1855
Tradley	0

In [7]:

# Total births per year (rows) split by sex (columns).
# The grouping keys are passed positionally because their keyword names
# changed between pandas releases (rows/cols became index/columns).
total_births = names.pivot_table('births', 'year', 'sex', aggfunc='sum')
total_births.head()

Out[7]:

sex	F	M
year
1880	90993	110491
1881	91955	100746
1882	107850	113687
1883	112322	104630
1884	129022	114445

In [8]:

# One line per sex; the title lets the figure stand on its own when skimmed.
total_births.plot(title="Total births by sex and year", figsize=(10,5))

Out[8]:

<matplotlib.axes.AxesSubplot at 0x1111943d0>

In [9]:

# Births per name (rows) and year (columns), summed over both sexes.
# The grouping keys are passed positionally because their keyword names
# changed between pandas releases (rows/cols became index/columns).
births_by_year_by_name = names.pivot_table('births', 'name', 'year', aggfunc='sum')

In [10]:

# Label-based row lookup; .loc replaces the deprecated .ix indexer.
calebs_by_year = births_by_year_by_name.loc['Caleb']

In [11]:

# Title uses the plural "Calebs" (no apostrophe).
calebs_by_year.plot(title="Calebs born by year", figsize=(10,5))

Out[11]:

<matplotlib.axes.AxesSubplot at 0x11a60af50>

In [12]:

# Label-based row lookup via .loc (replaces the deprecated .ix); titled so the
# figure identifies the name it shows.
births_by_year_by_name.loc['Whitney'].plot(title="Whitneys born by year", figsize=(10,5))

Out[12]:

<matplotlib.axes.AxesSubplot at 0x11a4a65d0>

In [13]:

# Label-based row lookup via .loc (replaces the deprecated .ix); titled so the
# figure identifies the name it shows.
births_by_year_by_name.loc['Bertha'].plot(title="Berthas born by year", figsize=(10,5))

Out[13]:

<matplotlib.axes.AxesSubplot at 0x110a0c210>

In [ ]: