In [1]:

import pandas as pd
import numpy as np

In [2]:

# Load the 1880 file. `names=` supplies the column labels, so the first line
# of the file is read as data rather than as a header row.
name1880 = pd.read_csv(
    '/Users/lpd/Downloads/names/yob1880.txt',
    names=['name', 'sex', 'births'],
)

In [3]:

# Total recorded births in 1880, one value per sex.
name1880.groupby('sex')['births'].sum()

Out[3]:

sex
F       90994
M      110492
Name: births

In [4]:

# Read one file per year and collect the per-year frames in `frames`.
yob_columns = ['name', 'sex', 'births']  # labels applied to every file
years = range(1880, 2012)  # range() excludes the stop value, so 1880..2011
frames = []
for year in years:
    yob_path = '/Users/lpd/Downloads/names/yob%d.txt' % year
    frame = pd.read_csv(yob_path, names=yob_columns)
    # Tag every row with the year its file covers so the frames can be
    # stacked into one table without losing that information.
    frame['year'] = year
    frames.append(frame)

In [5]:

# Stack the per-year frames into a single table. ignore_index=True discards
# each frame's own row labels and renumbers the rows 0..n-1.
names = pd.concat(frames, ignore_index=True)

In [6]:

# Summary statistics for the numeric columns (births, year) of the combined frame.
names.describe()

Out[6]:

	births	year
count	1724892.000000	1724892.000000
mean	189.047366	1970.272885
std	1602.926761	2490.211478
min	5.000000	1880.000000
25%	7.000000	1947.000000
50%	12.000000	1979.000000
75%	32.000000	1998.000000
max	99653.000000	2011.000000

In [7]:

# Inspect the `year` column added while loading; pandas truncates the display.
names.year

Out[7]:

0     1880
1     1880
2     1880
3     1880
4     1880
5     1880
6     1880
7     1880
8     1880
9     1880
10    1880
11    1880
12    1880
13    1880
14    1880
...
1724877    2011
1724878    2011
1724879    2011
1724880    2011
1724881    2011
1724882    2011
1724883    2011
1724884    2011
1724885    2011
1724886    2011
1724887    2011
1724888    2011
1724889    2011
1724890    2011
1724891    2011
Name: year, Length: 1724892

In [8]:

# Total births per year (rows) split by sex (columns).
# `index=`/`columns=` replace the `rows=`/`cols=` keywords, which pandas
# deprecated in 0.14 and has since removed. The string 'sum' selects pandas'
# own sum reducer instead of passing the Python builtin.
total_births = names.pivot_table('births', index='year', columns='sex', aggfunc='sum')

In [9]:

# Show the last five years of the per-sex totals.
total_births.tail()

Out[9]:

sex	F	M
year
2007	1917881	2070445
2008	1885178	2034014
2009	1830134	1976208
2010	1768463	1909167
2011	1742410	1880633

In [10]:

# Plot the F and M yearly totals, one line per column, with year on the x-axis.
# The call returns the matplotlib Axes, which is what the cell echoes.
total_births.plot()

Out[10]:

<matplotlib.axes.AxesSubplot at 0x1015dc190>

In [11]:

# Display the combined frame. In the recorded output pandas rendered it as a
# per-column non-null count and dtype summary rather than as rows.
names

Out[11]:

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1724892 entries, 0 to 1724891
Data columns:
name      1724892  non-null values
sex       1724892  non-null values
births    1724892  non-null values
year      1724892  non-null values
dtypes: int64(2), object(2)

In [12]:

# Female-to-male ratio of yearly births. Despite the column name this is a
# quotient (F / M), not a difference. The column is added to `total_births`
# in place, so the frame gains a third column from this cell onward.
total_births['diff'] = total_births['F'].div(total_births['M'])

In [13]:

# Show the last five years again, now including the F/M ratio column `diff`.
total_births.tail()

Out[13]:

sex	F	M	diff
year
2007	1917881	2070445	0.926313
2008	1885178	2034014	0.926826
2009	1830134	1976208	0.926084
2010	1768463	1909167	0.926301
2011	1742410	1880633	0.926502

In [14]:

# Plot the F/M ratio by year.
total_births['diff'].plot()

Out[14]:

<matplotlib.axes.AxesSubplot at 0x110952d90>

In [15]:

# Inspect the F/M ratio values by year; pandas truncates the display.
total_births['diff']

Out[15]:

year
1880    0.823535
1881    0.912732
1882    0.948666
1883    1.073506
1884    1.127371
1885    1.234274
1886    1.304641
1887    1.439452
1888    1.478044
1889    1.612882
1890    1.714722
1891    1.832900
1892    1.740018
1893    1.895556
1894    1.925485
...
1997    0.923414
1998    0.924455
1999    0.923827
2000    0.924699
2001    0.926717
2002    0.925432
2003    0.924983
2004    0.925037
2005    0.925031
2006    0.925009
2007    0.926313
2008    0.926826
2009    0.926084
2010    0.926301
2011    0.926502
Name: diff, Length: 132

In [28]:

# Total births per name, summed over every year and both sexes.
# groupby(...).births.sum() yields a Series indexed by name (labelled
# 'births'), the same shape the recorded output shows. The original
# pivot_table call used the `rows=` keyword, which pandas has removed, and on
# current pandas pivot_table would return a one-column DataFrame here.
names_pivot1 = names.groupby('name').births.sum()

In [38]:

# The ten names with the largest totals, kept in ascending order so the
# largest comes last. Series.order() has been removed from pandas;
# sort_values() is its replacement and also sorts ascending by default.
# .iloc makes the positional "last ten" slice explicit.
top_names = names_pivot1.sort_values().iloc[-10:]

In [39]:

# Display the ten largest totals (ascending, so the largest is listed last).
top_names

Out[39]:

name
Charles    2354882
Joseph     2543111
Richard    2555644
David      3552443
William    4020092
Mary       4121992
Michael    4282648
Robert     4795444
John       5073452
James      5086540
Name: births

In [46]:

# Plain Python list of the ten names, in the same ascending-by-total order
# as `top_names`.
top_names_list = top_names.index.tolist()

In [47]:

# Display the list of names to confirm its contents and order.
top_names_list

Out[47]:

['Charles',
 'Joseph',
 'Richard',
 'David',
 'William',
 'Mary',
 'Michael',
 'Robert',
 'John',
 'James']

In [52]:

# Births per year (rows) for every name (columns), summed across both sexes.
# `index=`/`columns=` replace the `rows=`/`cols=` keywords, which pandas
# deprecated in 0.14 and has since removed. The string 'sum' selects pandas'
# own sum reducer instead of passing the Python builtin.
names_by_year = names.pivot_table('births', index='year', columns='name', aggfunc='sum')

In [58]:

# Plot yearly births for the ten selected names on a single set of axes.
names_by_year[top_names_list].plot()

Out[58]:

<matplotlib.axes.AxesSubplot at 0x10fafb210>

In [59]:

# The list is in ascending order of total births, so dropping the first five
# entries leaves the five names with the largest totals.
top5_names = top_names_list[5:]

In [60]:

# Plot yearly births for the five names; subplots=True draws each name on its
# own axes, so the call returns an array of Axes rather than a single one.
names_by_year[top5_names].plot(subplots=True)

Out[60]:

array([<matplotlib.axes.AxesSubplot object at 0x1018c8dd0>,
       <matplotlib.axes.AxesSubplot object at 0x125322a90>,
       <matplotlib.axes.AxesSubplot object at 0x1252c3f50>,
       <matplotlib.axes.AxesSubplot object at 0x12532f390>,
       <matplotlib.axes.AxesSubplot object at 0x12527b410>], dtype=object)

In [78]:

# Births for each (sex, name) pair (rows) by year (columns).
# `index=`/`columns=` replace the `rows=`/`cols=` keywords, which pandas
# deprecated in 0.14 and has since removed.
# No aggfunc is given, so pivot_table uses its default, the mean.
# NOTE(review): presumably each (sex, name, year) occurs at most once in
# `names`, in which case the mean equals the raw count - confirm, or pass
# aggfunc='sum' to match the other pivots.
same_names = names.pivot_table('births', index=['sex', 'name'], columns=['year'])

In [ ]: