import pandas as pd
from pandas import DataFrame, Series
import urllib2
import zipfile
import os
def download_name_data(url="http://www.ssa.gov/oact/babynames/names.zip",
                       destination="names.zip"):
    """Download the baby-names zip archive and save it to disk.

    Args:
        url: Address of the archive to fetch. Defaults to the URL this
            script has always used.
        destination: Path of the local file to write. Defaults to
            "names.zip" in the current working directory.

    Raises:
        Whatever ``urllib2.urlopen`` or ``open`` raise on failure; nothing
        is caught here.
    """
    from contextlib import closing

    # urllib2 responses are not context managers themselves, so closing()
    # guarantees the connection is released even if read() fails.
    with closing(urllib2.urlopen(url)) as response:
        zip_data = response.read()

    # The handle is deliberately not called `zipfile`, which would shadow the
    # imported zipfile module inside this function.
    with open(destination, "wb") as archive_file:
        archive_file.write(zip_data)

    print("Successfully saved %s" % destination)
def extract_zip(zip_file, output_dir):
    """Extract every member of a zip archive into a directory.

    Args:
        zip_file: Path (or file-like object) of the archive to read.
        output_dir: Directory to extract into. It is created when missing;
            an already-existing directory is reused.

    Raises:
        OSError: If ``output_dir`` cannot be created and is not already a
            directory (for example, a regular file is in the way).
        zipfile.BadZipfile: If ``zip_file`` is not a valid zip archive.
    """
    # The archive is opened first so that a missing or corrupt zip fails
    # before anything is created on disk; `with` closes it on every path.
    with zipfile.ZipFile(zip_file) as archive:
        try:
            os.mkdir(output_dir)
        except OSError:
            # Only "directory already exists" is tolerated. Any other failure
            # (permissions, a file with that name, ...) must surface.
            if not os.path.isdir(output_dir):
                raise
        # extractall creates intermediate directories for nested members and
        # closes each output file, unlike a hand-written open/write loop.
        archive.extractall(output_dir)

    print("Extracted %s to %s" % (zip_file, output_dir))
def parse_names_data(names_dir, first_year=1880, last_year=2012):
    """Load the per-year name files into a single DataFrame.

    For every year in ``first_year..last_year`` (inclusive) the file
    ``yob<year>.txt`` inside ``names_dir`` is read as header-less CSV with
    the columns ``name``, ``sex`` and ``births``; a ``year`` column holding
    the file's year is added to each frame before they are concatenated.

    Args:
        names_dir: Directory containing the ``yob<year>.txt`` files.
        first_year: First year to load. Defaults to 1880.
        last_year: Last year to load (inclusive). Defaults to 2012.

    Returns:
        A DataFrame with columns ``name``, ``sex``, ``births``, ``year`` and
        a fresh 0..n-1 index.

    Raises:
        ValueError: From ``pd.concat`` when the year range is empty.
    """
    parts = []
    for year in range(first_year, last_year + 1):
        path = os.path.join(names_dir, "yob%d.txt" % year)
        frame = pd.read_csv(path, names=['name', 'sex', 'births'])
        frame['year'] = year
        parts.append(frame)
    # ignore_index discards each file's own row numbers so the combined
    # frame gets one continuous index.
    return pd.concat(parts, ignore_index=True)
# Load step: unpack the already-downloaded names.zip and parse it into the
# module-level `names` frame that the cells below query.
# Uncomment the next line to fetch names.zip first (needs network access).
#download_name_data()
names_data_dir = "names_data"
extract_zip("names.zip", names_data_dir)
names = parse_names_data(names_data_dir)
Extracted names.zip to names_data
# Preview the first five rows of the combined frame.
names.head()
| | name | sex | births | year |
---|---|---|---|---|
0 | Mary | F | 7065 | 1880 |
1 | Anna | F | 2604 | 1880 |
2 | Emma | F | 2003 | 1880 |
3 | Elizabeth | F | 1939 | 1880 |
4 | Minnie | F | 1746 | 1880 |
# Preview the last five rows of the combined frame.
names.tail()
| | name | sex | births | year |
---|---|---|---|---|
1758725 | Zylin | M | 5 | 2012 |
1758726 | Zymari | M | 5 | 2012 |
1758727 | Zyrin | M | 5 | 2012 |
1758728 | Zyrus | M | 5 | 2012 |
1758729 | Zytaevius | M | 5 | 2012 |
# Boolean-mask selection: keep only the rows whose `name` column equals 'Caleb'.
names[names.name == 'Caleb']
<class 'pandas.core.frame.DataFrame'> Int64Index: 166 entries, 1268 to 1744599 Data columns (total 4 columns): name 166 non-null values sex 166 non-null values births 166 non-null values year 166 non-null values dtypes: int64(2), object(2)
def total_number_of_births(name):
    """Return the sum of `births` over all rows of `names` with this name.

    Reads the module-level `names` frame. A name that never occurs matches
    no rows, so the sum is 0.
    """
    is_match = names.name == name
    matching_rows = names[is_match]
    return matching_rows.births.sum()
# Tabulate the all-time birth total for a handful of sample names.
print("Total number of people born with each name between 1880 and 2012:")
name_series = Series([
    'Caleb', 'Whitney', 'Hastin', 'Rafe', 'Dan',
    'Mary', 'Leah', 'Wesley', 'Ambria', 'Tradley',
])
# One total per sample name, in the same order as name_series.
births_list = [total_number_of_births(name) for name in name_series]
births_by_name_frame = DataFrame(
    {'births since 1880 in the US': births_list},
    index=name_series,
)
births_by_name_frame
Total number of people born with each name between 1880 and 2012:
| | births since 1880 in the US |
---|---|
Caleb | 234304 |
Whitney | 99268 |
Hastin | 43 |
Rafe | 1632 |
Dan | 105847 |
Mary | 4124778 |
Leah | 194257 |
Wesley | 198311 |
Ambria | 1855 |
Tradley | 0 |
# Total births per year split by sex: one row per year, one column per sex.
# `index`/`columns` are the current spellings of the `rows`/`cols` keywords,
# which pandas deprecated in 0.14 and removed in later releases.
total_births = names.pivot_table(values='births', index='year', columns='sex', aggfunc=sum)
total_births.head()
sex | F | M |
---|---|---|
year | ||
1880 | 90993 | 110491 |
1881 | 91955 | 100746 |
1882 | 107850 | 113687 |
1883 | 112322 | 104630 |
1884 | 129022 | 114445 |
# Line plot of the total_births table, drawn at 10x5 inches.
total_births.plot(figsize=(10,5))
<matplotlib.axes.AxesSubplot at 0x1111943d0>
# Births per name per year: one row per name, one column per year.
# `index`/`columns` replace the removed `rows`/`cols` keywords, and `.loc`
# replaces the removed `.ix` indexer for the label lookup.
births_by_year_by_name = names.pivot_table(values='births', index='name', columns='year', aggfunc=sum)
calebs_by_year = births_by_year_by_name.loc['Caleb']
calebs_by_year.plot(title="Caleb's born by year", figsize=(10,5))
<matplotlib.axes.AxesSubplot at 0x11a60af50>
# Plot the per-year births for 'Whitney'; `.loc` replaces the removed `.ix` indexer.
births_by_year_by_name.loc['Whitney'].plot(figsize=(10,5))
<matplotlib.axes.AxesSubplot at 0x11a4a65d0>
# Plot the per-year births for 'Bertha'; `.loc` replaces the removed `.ix` indexer.
births_by_year_by_name.loc['Bertha'].plot(figsize=(10,5))
<matplotlib.axes.AxesSubplot at 0x110a0c210>