import pandas as pd
from pandas import DataFrame, Series
import urllib2
import zipfile
import os
def download_name_data():
    """Download the baby-names archive from ssa.gov and save it as ``names.zip``.

    The complete response body is read into memory and then written to
    ``names.zip`` in the current working directory, replacing any existing
    file of that name. A confirmation message is printed on success.
    """
    from contextlib import closing

    name_data_url = "http://www.ssa.gov/oact/babynames/names.zip"
    # closing() releases the connection even if read() raises, and works
    # whether or not the response object supports ``with`` on its own.
    with closing(urllib2.urlopen(name_data_url)) as response:
        zip_data = response.read()
    # The handle is called ``archive`` (not ``zipfile``) so the imported
    # ``zipfile`` module is not shadowed inside this function.
    with open("names.zip", "wb") as archive:
        archive.write(zip_data)
    # Single-argument, parenthesised form prints the same text on Python 2 and 3.
    print("Successfully saved names.zip")
def extract_zip(zip_file, output_dir):
    """Extract every member of *zip_file* into *output_dir*.

    Args:
        zip_file: Path of the zip archive to read.
        output_dir: Directory that receives the extracted files. It is
            created when missing; an already existing directory is reused.

    Raises:
        OSError: If *output_dir* cannot be created and is not already a
            directory (for example, a regular file is in the way).
    """
    # Opening the archive first means a missing or unreadable zip fails
    # before anything is created on disk.
    with zipfile.ZipFile(zip_file) as zfile:
        try:
            os.mkdir(output_dir)
        except OSError:
            # Only "the directory is already there" is safe to ignore; any
            # other failure would make the extraction below fail anyway.
            if not os.path.isdir(output_dir):
                raise
        # extractall() creates the sub-directories that nested members need
        # and, on Python versions whose zipfile module sanitises member names,
        # keeps absolute or ".." paths from landing outside output_dir.
        zfile.extractall(output_dir)
    # %-formatting yields the same space-separated text on Python 2 and 3.
    print("Extracted %s to %s" % (zip_file, output_dir))
def parse_names_data(names_dir, first_year=1880, last_year=2012):
    """Load the per-year name files in *names_dir* into one DataFrame.

    One file named ``yob<year>.txt`` is read for every year from
    *first_year* to *last_year*, both inclusive. Each file is parsed as
    header-less CSV whose columns are labelled ``name``, ``sex`` and
    ``births``; a ``year`` column holding the file's year is then added.

    Args:
        names_dir: Directory containing the ``yob<year>.txt`` files.
        first_year: First year to load (default 1880).
        last_year: Last year to load, inclusive (default 2012).

    Returns:
        A DataFrame with columns ``name``, ``sex``, ``births`` and ``year``
        and a fresh 0..n-1 index covering all loaded years.

    Raises:
        ValueError: Raised by ``pd.concat`` when the year range is empty.
    """
    parts = []
    for year in range(first_year, last_year + 1):
        path = os.path.join(names_dir, "yob%d.txt" % year)
        frame = pd.read_csv(path, names=['name', 'sex', 'births'])
        frame['year'] = year
        parts.append(frame)
    # ignore_index discards each file's own row numbers so the combined
    # frame gets a single continuous index.
    return pd.concat(parts, ignore_index=True)
# The download step is disabled: extract_zip() below opens "names.zip" from the
# current working directory, so the archive has to be there already.
# Uncomment the next line to fetch it first.
#download_name_data()
names_data_dir = "names_data"
extract_zip("names.zip", names_data_dir)
# Module-level frame; total_number_of_births() and the analysis further down
# read it directly.
names = parse_names_data(names_data_dir)
# Bare expressions: their values are discarded when this runs as a plain script
# (presumably these were notebook cell outputs -- TODO confirm).
names.head()
names.tail()
names[names.name == 'Caleb']
def total_number_of_births(name):
    """Return the sum of ``births`` over all rows of ``names`` matching *name*.

    Reads the module-level ``names`` frame; rows are selected by exact
    equality on its ``name`` column.
    """
    matching_rows = names[names['name'] == name]
    return matching_rows['births'].sum()
# Single-argument, parenthesised form prints the same text on Python 2 and 3.
print("Total number of people born with each name between 1880 and 2012:")
name_series = Series(['Caleb', 'Whitney', 'Hastin', 'Rafe', 'Dan', 'Mary', 'Leah', 'Wesley', 'Ambria', 'Tradley'])
births_list = [total_number_of_births(name) for name in name_series]
births_by_name_frame = DataFrame({'births since 1880 in the US': births_list}, index=name_series)
births_by_name_frame

# Births summed per year (rows) and sex (columns).
# ``index=``/``columns=`` replace the ``rows=``/``cols=`` keywords that pandas
# has removed, and the string 'sum' selects pandas' own sum aggregation
# instead of passing the Python builtin.
total_births = names.pivot_table('births', index='year', columns='sex', aggfunc='sum')
total_births.head()
total_births.plot(figsize=(10, 5))

# Births summed per name (rows) and year (columns).
births_by_year_by_name = names.pivot_table('births', index='name', columns='year', aggfunc='sum')
# ``.loc`` does the label lookup that the removed ``.ix`` indexer performed here.
calebs_by_year = births_by_year_by_name.loc['Caleb']
calebs_by_year.plot(title="Caleb's born by year", figsize=(10, 5))
births_by_year_by_name.loc['Whitney'].plot(figsize=(10, 5))
births_by_year_by_name.loc['Bertha'].plot(figsize=(10, 5))