import pandas as pd
import numpy as np
# Read the 1880 file, supplying the three column labels explicitly.
name1880 = pd.read_csv(
    '/Users/lpd/Downloads/names/yob1880.txt',
    names=['name', 'sex', 'births'],
)
# Sum the births column within each sex group.
name1880.groupby('sex')['births'].sum()
sex F 90994 M 110492 Name: births
# Years to load, one yob<year>.txt file each; range() excludes its end
# value, so this covers 1880 through 2011.
years = range(1880, 2012)
frames = []
for year in years:
    # The loop body must be indented: without indentation the script does
    # not parse and only a single frame would ever be appended.
    frame = pd.read_csv('/Users/lpd/Downloads/names/yob%d.txt' % year,
                        names=['name', 'sex', 'births'])
    # Tag every row with the year of the file it came from.
    frame['year'] = year
    frames.append(frame)
# Stack the per-year frames into one; ignore_index=True replaces the
# per-file row labels with a single fresh 0..n-1 index.
names = pd.concat(frames, ignore_index=True)
names.describe()
| births | year |
---|---|---|
count | 1724892.000000 | 1724892.000000 |
mean | 189.047366 | 1970.272885 |
std | 1602.926761 | 2490.211478 |
min | 5.000000 | 1880.000000 |
25% | 7.000000 | 1947.000000 |
50% | 12.000000 | 1979.000000 |
75% | 32.000000 | 1998.000000 |
max | 99653.000000 | 2011.000000 |
# Show the year column of the combined frame.
names['year']
0 1880 1 1880 2 1880 3 1880 4 1880 5 1880 6 1880 7 1880 8 1880 9 1880 10 1880 11 1880 12 1880 13 1880 14 1880 ... 1724877 2011 1724878 2011 1724879 2011 1724880 2011 1724881 2011 1724882 2011 1724883 2011 1724884 2011 1724885 2011 1724886 2011 1724887 2011 1724888 2011 1724889 2011 1724890 2011 1724891 2011 Name: year, Length: 1724892
# Total births per year (rows) split by sex (columns).
# The pivot_table keywords `rows`/`cols` were removed from pandas; `index`/
# `columns` are their replacements. The aggregation is passed by name
# ('sum') rather than as the Python builtin.
total_births = names.pivot_table('births', index='year', columns='sex',
                                 aggfunc='sum')
total_births.tail()
sex | F | M |
---|---|---|
year | ||
2007 | 1917881 | 2070445 |
2008 | 1885178 | 2034014 |
2009 | 1830134 | 1976208 |
2010 | 1768463 | 1909167 |
2011 | 1742410 | 1880633 |
# Plot every column of total_births against its index (year) using the
# default plot kind.
total_births.plot()
<matplotlib.axes.AxesSubplot at 0x1015dc190>
# Bare expression: in an interactive session this echoes the combined frame.
names
<class 'pandas.core.frame.DataFrame'> Int64Index: 1724892 entries, 0 to 1724891 Data columns: name 1724892 non-null values sex 1724892 non-null values births 1724892 non-null values year 1724892 non-null values dtypes: int64(2), object(2)
# Female-to-male birth ratio for each year. Despite the column label 'diff',
# this is a quotient (F divided by M), not a difference.
total_births['diff'] = total_births['F'].div(total_births['M'])
# Show the last five years.
total_births.tail(5)
sex | F | M | diff |
---|---|---|---|
year | |||
2007 | 1917881 | 2070445 | 0.926313 |
2008 | 1885178 | 2034014 | 0.926826 |
2009 | 1830134 | 1976208 | 0.926084 |
2010 | 1768463 | 1909167 | 0.926301 |
2011 | 1742410 | 1880633 | 0.926502 |
# Plot the F/M ratio stored in the 'diff' column against the year index.
total_births['diff'].plot()
<matplotlib.axes.AxesSubplot at 0x110952d90>
# Bare expression: echoes the 'diff' (F/M ratio) column when run interactively.
total_births['diff']
year 1880 0.823535 1881 0.912732 1882 0.948666 1883 1.073506 1884 1.127371 1885 1.234274 1886 1.304641 1887 1.439452 1888 1.478044 1889 1.612882 1890 1.714722 1891 1.832900 1892 1.740018 1893 1.895556 1894 1.925485 ... 1997 0.923414 1998 0.924455 1999 0.923827 2000 0.924699 2001 0.926717 2002 0.925432 2003 0.924983 2004 0.925037 2005 0.925031 2006 0.925009 2007 0.926313 2008 0.926826 2009 0.926084 2010 0.926301 2011 0.926502 Name: diff, Length: 132
# Total births per name over all loaded years, as a Series indexed by name.
# groupby/sum replaces pivot_table(rows=...): the `rows` keyword was removed
# from pandas, and the sort below needs a Series rather than a DataFrame.
names_pivot1 = names.groupby('name')['births'].sum()
# Series.order() was removed from pandas; sort_values() is its replacement.
# Sorting ascending and keeping the last ten entries gives the ten names
# with the most births, largest last.
top_names = names_pivot1.sort_values()[-10:]
top_names
name Charles 2354882 Joseph 2543111 Richard 2555644 David 3552443 William 4020092 Mary 4121992 Michael 4282648 Robert 4795444 John 5073452 James 5086540 Name: births
# Plain Python list of the labels in top_names' index, in index order.
top_names_list = top_names.index.tolist()
top_names_list
['Charles', 'Joseph', 'Richard', 'David', 'William', 'Mary', 'Michael', 'Robert', 'John', 'James']
# Births per year (rows) for every name (columns), summed over both sexes.
# The pivot_table keywords `rows`/`cols` were removed from pandas; `index`/
# `columns` are their replacements.
names_by_year = names.pivot_table('births', index='year', columns='name',
                                  aggfunc='sum')
# One line per name in top_names_list, plotted against year.
names_by_year[top_names_list].plot()
<matplotlib.axes.AxesSubplot at 0x10fafb210>
# Everything from position 5 onward; top_names_list is built from a ten-entry
# ascending sort, so this is the five names with the most births.
top5_names = top_names_list[5:]
# subplots=True draws each selected name on its own axes.
names_by_year[top5_names].plot(subplots=True)
array([<matplotlib.axes.AxesSubplot object at 0x1018c8dd0>, <matplotlib.axes.AxesSubplot object at 0x125322a90>, <matplotlib.axes.AxesSubplot object at 0x1252c3f50>, <matplotlib.axes.AxesSubplot object at 0x12532f390>, <matplotlib.axes.AxesSubplot object at 0x12527b410>], dtype=object)
# Births indexed by (sex, name) with one column per year.
# The pivot_table keywords `rows`/`cols` were removed from pandas; `index`/
# `columns` are their replacements. No aggfunc is given, so pivot_table's
# default aggregation applies, exactly as in the original call.
same_names = names.pivot_table('births', index=['sex', 'name'],
                               columns=['year'])