from pandas import *
set_printoptions(notebook_repr_html=False) # turn off html, i.e., without table.
set_printoptions(max_rows=500) # getting the summary of the view versus getting everything. Terminal takes time to buffer.
import pandas
pandas.__version__
'0.8.1'
names = read_csv('baby-names2.csv')
names
<class 'pandas.core.frame.DataFrame'> Int64Index: 258000 entries, 0 to 257999 Data columns: year 258000 non-null values name 258000 non-null values prop 258000 non-null values sex 258000 non-null values soundex 258000 non-null values dtypes: float64(1), int64(1), object(3)
names.head()
year name prop sex soundex 0 1880 John 0.081541 boy J500 1 1880 William 0.080511 boy W450 2 1880 James 0.050057 boy J520 3 1880 Charles 0.045167 boy C642 4 1880 George 0.043292 boy G620
names[names.year == 1880].head()
year name prop sex soundex 0 1880 John 0.081541 boy J500 1 1880 William 0.080511 boy W450 2 1880 James 0.050057 boy J520 3 1880 Charles 0.045167 boy C642 4 1880 George 0.043292 boy G620
names[names.year == 1880].tail()
year name prop sex soundex 129995 1880 Emaline 0.000041 girl E545 129996 1880 Ester 0.000041 girl E236 129997 1880 Eulah 0.000041 girl E400 129998 1880 Eulalie 0.000041 girl E440 129999 1880 Euna 0.000041 girl E500
boys = names[names.sex == 'boy'] # segment the data into boy and girl names. We have 2 dataframes now.
girls = names[names.sex == 'girl']
boys.groupby('year')
<pandas.core.groupby.DataFrameGroupBy at 0x106eba190>
boys.groupby('year').size()
year 1880 1000 1881 1000 1882 1000 1883 1000 1884 1000 1885 1000 1886 1000 1887 1000 1888 1000 1889 1000 1890 1000 1891 1000 1892 1000 1893 1000 1894 1000 1895 1000 1896 1000 1897 1000 1898 1000 1899 1000 1900 1000 1901 1000 1902 1000 1903 1000 1904 1000 1905 1000 1906 1000 1907 1000 1908 1000 1909 1000 1910 1000 1911 1000 1912 1000 1913 1000 1914 1000 1915 1000 1916 1000 1917 1000 1918 1000 1919 1000 1920 1000 1921 1000 1922 1000 1923 1000 1924 1000 1925 1000 1926 1000 1927 1000 1928 1000 1929 1000 1930 1000 1931 1000 1932 1000 1933 1000 1934 1000 1935 1000 1936 1000 1937 1000 1938 1000 1939 1000 1940 1000 1941 1000 1942 1000 1943 1000 1944 1000 1945 1000 1946 1000 1947 1000 1948 1000 1949 1000 1950 1000 1951 1000 1952 1000 1953 1000 1954 1000 1955 1000 1956 1000 1957 1000 1958 1000 1959 1000 1960 1000 1961 1000 1962 1000 1963 1000 1964 1000 1965 1000 1966 1000 1967 1000 1968 1000 1969 1000 1970 1000 1971 1000 1972 1000 1973 1000 1974 1000 1975 1000 1976 1000 1977 1000 1978 1000 1979 1000 1980 1000 1981 1000 1982 1000 1983 1000 1984 1000 1985 1000 1986 1000 1987 1000 1988 1000 1989 1000 1990 1000 1991 1000 1992 1000 1993 1000 1994 1000 1995 1000 1996 1000 1997 1000 1998 1000 1999 1000 2000 1000 2001 1000 2002 1000 2003 1000 2004 1000 2005 1000 2006 1000 2007 1000 2008 1000 Length: 129
names.groupby(['year', 'sex']).size()
year sex 1880 boy 1000 girl 1000 1881 boy 1000 girl 1000 1882 boy 1000 girl 1000 1883 boy 1000 girl 1000 1884 boy 1000 girl 1000 1885 boy 1000 girl 1000 1886 boy 1000 girl 1000 1887 boy 1000 girl 1000 1888 boy 1000 girl 1000 1889 boy 1000 girl 1000 1890 boy 1000 girl 1000 1891 boy 1000 girl 1000 1892 boy 1000 girl 1000 1893 boy 1000 girl 1000 1894 boy 1000 girl 1000 1895 boy 1000 girl 1000 1896 boy 1000 girl 1000 1897 boy 1000 girl 1000 1898 boy 1000 girl 1000 1899 boy 1000 girl 1000 1900 boy 1000 girl 1000 1901 boy 1000 girl 1000 1902 boy 1000 girl 1000 1903 boy 1000 girl 1000 1904 boy 1000 girl 1000 1905 boy 1000 girl 1000 1906 boy 1000 girl 1000 1907 boy 1000 girl 1000 1908 boy 1000 girl 1000 1909 boy 1000 girl 1000 1910 boy 1000 girl 1000 1911 boy 1000 girl 1000 1912 boy 1000 girl 1000 1913 boy 1000 girl 1000 1914 boy 1000 girl 1000 1915 boy 1000 girl 1000 1916 boy 1000 girl 1000 1917 boy 1000 girl 1000 1918 boy 1000 girl 1000 1919 boy 1000 girl 1000 1920 boy 1000 girl 1000 1921 boy 1000 girl 1000 1922 boy 1000 girl 1000 1923 boy 1000 girl 1000 1924 boy 1000 girl 1000 1925 boy 1000 girl 1000 1926 boy 1000 girl 1000 1927 boy 1000 girl 1000 1928 boy 1000 girl 1000 1929 boy 1000 girl 1000 1930 boy 1000 girl 1000 1931 boy 1000 girl 1000 1932 boy 1000 girl 1000 1933 boy 1000 girl 1000 1934 boy 1000 girl 1000 1935 boy 1000 girl 1000 1936 boy 1000 girl 1000 1937 boy 1000 girl 1000 1938 boy 1000 girl 1000 1939 boy 1000 girl 1000 1940 boy 1000 girl 1000 1941 boy 1000 girl 1000 1942 boy 1000 girl 1000 1943 boy 1000 girl 1000 1944 boy 1000 girl 1000 1945 boy 1000 girl 1000 1946 boy 1000 girl 1000 1947 boy 1000 girl 1000 1948 boy 1000 girl 1000 1949 boy 1000 girl 1000 1950 boy 1000 girl 1000 1951 boy 1000 girl 1000 1952 boy 1000 girl 1000 1953 boy 1000 girl 1000 1954 boy 1000 girl 1000 1955 boy 1000 girl 1000 1956 boy 1000 girl 1000 1957 boy 1000 girl 1000 1958 boy 1000 girl 1000 1959 boy 1000 girl 1000 1960 boy 1000 girl 1000 1961 boy 1000 girl 1000 1962 boy 1000 girl 1000 1963 boy 1000 girl 1000 1964 boy 1000 girl 1000 1965 boy 1000 girl 1000 1966 boy 1000 girl 1000 1967 boy 1000 girl 1000 1968 boy 1000 girl 1000 1969 boy 1000 girl 1000 1970 boy 1000 girl 1000 1971 boy 1000 girl 1000 1972 boy 1000 girl 1000 1973 boy 1000 girl 1000 1974 boy 1000 girl 1000 1975 boy 1000 girl 1000 1976 boy 1000 girl 1000 1977 boy 1000 girl 1000 1978 boy 1000 girl 1000 1979 boy 1000 girl 1000 1980 boy 1000 girl 1000 1981 boy 1000 girl 1000 1982 boy 1000 girl 1000 1983 boy 1000 girl 1000 1984 boy 1000 girl 1000 1985 boy 1000 girl 1000 1986 boy 1000 girl 1000 1987 boy 1000 girl 1000 1988 boy 1000 girl 1000 1989 boy 1000 girl 1000 1990 boy 1000 girl 1000 1991 boy 1000 girl 1000 1992 boy 1000 girl 1000 1993 boy 1000 girl 1000 1994 boy 1000 girl 1000 1995 boy 1000 girl 1000 1996 boy 1000 girl 1000 1997 boy 1000 girl 1000 1998 boy 1000 girl 1000 1999 boy 1000 girl 1000 2000 boy 1000 girl 1000 2001 boy 1000 girl 1000 2002 boy 1000 girl 1000 2003 boy 1000 girl 1000 2004 boy 1000 girl 1000 2005 boy 1000 girl 1000 2006 boy 1000 girl 1000 2007 boy 1000 girl 1000 2008 boy 1000 girl 1000 Length: 258
type(names.groupby(['year', 'sex']).size())
pandas.core.series.Series
names.groupby(['year', 'sex']).size().ix[2000] # select out by year: 2001, 2002. Only show the inner key.
sex boy 1000 girl 1000
boys[boys.year == 2000] # We would like to find out most popular boy names for each year.
<class 'pandas.core.frame.DataFrame'> Int64Index: 1000 entries, 120000 to 120999 Data columns: year 1000 non-null values name 1000 non-null values prop 1000 non-null values sex 1000 non-null values soundex 1000 non-null values dtypes: float64(1), int64(1), object(3)
boys[boys.year == 2000].prop
120000 0.016514 120001 0.015346 120002 0.013695 120003 0.013194 120004 0.011990 120005 0.011815 120006 0.011324 120007 0.010930 120008 0.010681 120009 0.010305 120010 0.009896 120011 0.009745 120012 0.009715 120013 0.009617 120014 0.009516 ... 120985 0.000072 120986 0.000072 120987 0.000072 120988 0.000072 120989 0.000071 120990 0.000071 120991 0.000071 120992 0.000071 120993 0.000071 120994 0.000071 120995 0.000071 120996 0.000071 120997 0.000071 120998 0.000071 120999 0.000071 Name: prop, Length: 1000
boys[boys.year == 2000][:5]
year name prop sex soundex 120000 2000 Jacob 0.016514 boy J210 120001 2000 Michael 0.015346 boy M240 120002 2000 Matthew 0.013695 boy M300 120003 2000 Joshua 0.013194 boy J200 120004 2000 Christopher 0.011990 boy C623
boys[boys.year == 2000].prop.idxmax()
120000
boys.ix[boys[boys.year == 2000].prop.idxmax()] # to get the whole row.
year 2000 name Jacob prop 0.016514 sex boy soundex J210 Name: 120000
def get_max_record(group):
return group.ix[group.prop.idxmax()]
get_max_record(boys)
year 1880 name John prop 0.081541 sex boy soundex J500 Name: 0
def get_max_record(group):
return group.ix[group.prop.idxmax()]
result = boys.groupby('year').apply(get_max_record)
result # the popularity of a certain name goes down over time.
year name prop sex soundex year 1880 1880 John 0.081541 boy J500 1881 1881 John 0.080975 boy J500 1882 1882 John 0.078314 boy J500 1883 1883 John 0.079066 boy J500 1884 1884 John 0.076476 boy J500 1885 1885 John 0.075517 boy J500 1886 1886 John 0.07582 boy J500 1887 1887 John 0.074181 boy J500 1888 1888 John 0.071186 boy J500 1889 1889 John 0.071804 boy J500 1890 1890 John 0.071034 boy J500 1891 1891 John 0.070292 boy J500 1892 1892 John 0.068759 boy J500 1893 1893 John 0.066495 boy J500 1894 1894 John 0.065961 boy J500 1895 1895 John 0.065699 boy J500 1896 1896 John 0.063051 boy J500 1897 1897 John 0.061907 boy J500 1898 1898 John 0.06177 boy J500 1899 1899 John 0.060674 boy J500 1900 1900 John 0.060619 boy J500 1901 1901 John 0.059677 boy J500 1902 1902 John 0.059568 boy J500 1903 1903 John 0.058833 boy J500 1904 1904 John 0.058535 boy J500 1905 1905 John 0.056266 boy J500 1906 1906 John 0.057359 boy J500 1907 1907 John 0.056635 boy J500 1908 1908 John 0.05615 boy J500 1909 1909 John 0.054234 boy J500 1910 1910 John 0.054914 boy J500 1911 1911 John 0.055709 boy J500 1912 1912 John 0.054465 boy J500 1913 1913 John 0.054678 boy J500 1914 1914 John 0.055531 boy J500 1915 1915 John 0.054005 boy J500 1916 1916 John 0.054202 boy J500 1917 1917 John 0.054062 boy J500 1918 1918 John 0.053923 boy J500 1919 1919 John 0.052724 boy J500 1920 1920 John 0.05171 boy J500 1921 1921 John 0.05116 boy J500 1922 1922 John 0.050889 boy J500 1923 1923 John 0.050741 boy J500 1924 1924 Robert 0.052025 boy R163 1925 1925 Robert 0.052875 boy R163 1926 1926 Robert 0.053366 boy R163 1927 1927 Robert 0.053075 boy R163 1928 1928 Robert 0.053177 boy R163 1929 1929 Robert 0.054029 boy R163 1930 1930 Robert 0.055021 boy R163 1931 1931 Robert 0.056598 boy R163 1932 1932 Robert 0.055166 boy R163 1933 1933 Robert 0.05314 boy R163 1934 1934 Robert 0.052583 boy R163 1935 1935 Robert 0.052852 boy R163 1936 1936 Robert 0.054959 boy R163 1937 1937 Robert 0.056541 boy R163 1938 1938 Robert 0.054806 boy R163 1939 1939 Robert 0.052645 boy R163 1940 1940 James 0.052662 boy J520 1941 1941 James 0.053182 boy J520 1942 1942 James 0.054812 boy J520 1943 1943 James 0.055183 boy J520 1944 1944 James 0.055406 boy J520 1945 1945 James 0.054288 boy J520 1946 1946 James 0.052999 boy J520 1947 1947 James 0.050987 boy J520 1948 1948 James 0.049697 boy J520 1949 1949 James 0.048244 boy J520 1950 1950 James 0.047336 boy J520 1951 1951 James 0.045662 boy J520 1952 1952 James 0.044132 boy J520 1953 1953 Robert 0.043059 boy R163 1954 1954 Michael 0.042792 boy M240 1955 1955 Michael 0.042293 boy M240 1956 1956 Michael 0.042244 boy M240 1957 1957 Michael 0.042392 boy M240 1958 1958 Michael 0.042029 boy M240 1959 1959 Michael 0.039381 boy M240 1960 1960 David 0.039669 boy D130 1961 1961 Michael 0.040331 boy M240 1962 1962 Michael 0.040453 boy M240 1963 1963 Michael 0.040569 boy M240 1964 1964 Michael 0.040776 boy M240 1965 1965 Michael 0.042762 boy M240 1966 1966 Michael 0.043997 boy M240 1967 1967 Michael 0.046329 boy M240 1968 1968 Michael 0.046184 boy M240 1969 1969 Michael 0.046564 boy M240 1970 1970 Michael 0.044784 boy M240 1971 1971 Michael 0.042687 boy M240 1972 1972 Michael 0.042659 boy M240 1973 1973 Michael 0.042049 boy M240 1974 1974 Michael 0.041444 boy M240 1975 1975 Michael 0.042178 boy M240 1976 1976 Michael 0.041012 boy M240 1977 1977 Michael 0.039554 boy M240 1978 1978 Michael 0.039308 boy M240 1979 1979 Michael 0.037823 boy M240 1980 1980 Michael 0.037039 boy M240 1981 1981 Michael 0.036931 boy M240 1982 1982 Michael 0.036157 boy M240 1983 1983 Michael 0.036506 boy M240 1984 1984 Michael 0.036117 boy M240 1985 1985 Michael 0.033745 boy M240 1986 1986 Michael 0.033429 boy M240 1987 1987 Michael 0.032652 boy M240 1988 1988 Michael 0.032053 boy M240 1989 1989 Michael 0.031213 boy M240 1990 1990 Michael 0.030358 boy M240 1991 1991 Michael 0.028698 boy M240 1992 1992 Michael 0.025914 boy M240 1993 1993 Michael 0.02401 boy M240 1994 1994 Michael 0.021831 boy M240 1995 1995 Michael 0.020594 boy M240 1996 1996 Michael 0.019156 boy M240 1997 1997 Michael 0.018805 boy M240 1998 1998 Michael 0.018066 boy M240 1999 1999 Jacob 0.017346 boy J210 2000 2000 Jacob 0.016514 boy J210 2001 2001 Jacob 0.015734 boy J210 2002 2002 Jacob 0.014792 boy J210 2003 2003 Jacob 0.014104 boy J210 2004 2004 Jacob 0.013196 boy J210 2005 2005 Jacob 0.012148 boy J210 2006 2006 Jacob 0.011331 boy J210 2007 2007 Jacob 0.010948 boy J210 2008 2008 Jacob 0.010355 boy J210
result.prop.plot()
<matplotlib.axes.AxesSubplot at 0x107afe7d0>
boys[boys.name == 'Travis']
year name prop sex soundex 563 1880 Travis 0.000101 boy T612 2648 1882 Travis 0.000082 boy T612 3672 1883 Travis 0.000080 boy T612 4656 1884 Travis 0.000081 boy T612 5602 1885 Travis 0.000095 boy T612 6598 1886 Travis 0.000092 boy T612 9871 1889 Travis 0.000059 boy T612 10733 1890 Travis 0.000075 boy T612 11696 1891 Travis 0.000082 boy T612 12593 1892 Travis 0.000099 boy T612 13831 1893 Travis 0.000066 boy T612 14849 1894 Travis 0.000064 boy T612 15735 1895 Travis 0.000079 boy T612 16774 1896 Travis 0.000077 boy T612 17645 1897 Travis 0.000098 boy T612 18991 1898 Travis 0.000053 boy T612 19658 1899 Travis 0.000095 boy T612 20442 1900 Travis 0.000166 boy T612 22552 1902 Travis 0.000121 boy T612 23778 1903 Travis 0.000077 boy T612 24510 1904 Travis 0.000137 boy T612 25505 1905 Travis 0.000147 boy T612 26587 1906 Travis 0.000118 boy T612 27535 1907 Travis 0.000132 boy T612 28512 1908 Travis 0.000150 boy T612 29451 1909 Travis 0.000181 boy T612 30510 1910 Travis 0.000149 boy T612 31467 1911 Travis 0.000166 boy T612 32448 1912 Travis 0.000171 boy T612 33413 1913 Travis 0.000203 boy T612 34388 1914 Travis 0.000220 boy T612 35426 1915 Travis 0.000182 boy T612 36404 1916 Travis 0.000204 boy T612 37393 1917 Travis 0.000215 boy T612 38381 1918 Travis 0.000233 boy T612 39384 1919 Travis 0.000228 boy T612 40385 1920 Travis 0.000218 boy T612 41383 1921 Travis 0.000219 boy T612 42374 1922 Travis 0.000231 boy T612 43385 1923 Travis 0.000209 boy T612 44382 1924 Travis 0.000210 boy T612 45388 1925 Travis 0.000208 boy T612 46404 1926 Travis 0.000191 boy T612 47364 1927 Travis 0.000220 boy T612 48371 1928 Travis 0.000214 boy T612 49338 1929 Travis 0.000246 boy T612 50360 1930 Travis 0.000225 boy T612 51325 1931 Travis 0.000266 boy T612 52329 1932 Travis 0.000260 boy T612 53311 1933 Travis 0.000290 boy T612 54281 1934 Travis 0.000344 boy T612 55292 1935 Travis 0.000324 boy T612 56295 1936 Travis 0.000314 boy T612 57300 1937 Travis 0.000293 boy T612 58290 1938 Travis 0.000312 boy T612 59295 1939 Travis 0.000312 boy T612 60294 1940 Travis 0.000299 boy T612 61309 1941 Travis 0.000278 boy T612 62319 1942 Travis 0.000256 boy T612 63347 1943 Travis 0.000216 boy T612 64330 1944 Travis 0.000240 boy T612 65342 1945 Travis 0.000225 boy T612 66318 1946 Travis 0.000246 boy T612 67337 1947 Travis 0.000218 boy T612 68347 1948 Travis 0.000211 boy T612 69349 1949 Travis 0.000207 boy T612 70343 1950 Travis 0.000211 boy T612 71353 1951 Travis 0.000197 boy T612 72356 1952 Travis 0.000184 boy T612 73355 1953 Travis 0.000189 boy T612 74370 1954 Travis 0.000176 boy T612 75380 1955 Travis 0.000171 boy T612 76384 1956 Travis 0.000177 boy T612 77409 1957 Travis 0.000164 boy T612 78371 1958 Travis 0.000201 boy T612 79346 1959 Travis 0.000233 boy T612 80327 1960 Travis 0.000256 boy T612 81323 1961 Travis 0.000271 boy T612 82301 1962 Travis 0.000312 boy T612 83257 1963 Travis 0.000430 boy T612 84234 1964 Travis 0.000507 boy T612 85221 1965 Travis 0.000568 boy T612 86187 1966 Travis 0.000718 boy T612 87165 1967 Travis 0.000902 boy T612 88140 1968 Travis 0.001162 boy T612 89112 1969 Travis 0.001567 boy T612 90068 1970 Travis 0.002270 boy T612 91060 1971 Travis 0.002762 boy T612 92056 1972 Travis 0.003068 boy T612 93052 1973 Travis 0.003637 boy T612 94053 1974 Travis 0.003496 boy T612 95050 1975 Travis 0.003674 boy T612 96045 1976 Travis 0.004188 boy T612 97043 1977 Travis 0.004259 boy T612 98045 1978 Travis 0.004072 boy T612 99035 1979 Travis 0.006169 boy T612 100036 1980 Travis 0.005808 boy T612 101039 1981 Travis 0.005435 boy T612 102038 1982 Travis 0.005525 boy T612 103040 1983 Travis 0.005627 boy T612 104041 1984 Travis 0.005461 boy T612 105045 1985 Travis 0.004725 boy T612 106048 1986 Travis 0.004416 boy T612 107041 1987 Travis 0.005335 boy T612 108040 1988 Travis 0.005711 boy T612 109040 1989 Travis 0.005500 boy T612 110041 1990 Travis 0.005325 boy T612 111043 1991 Travis 0.004786 boy T612 112047 1992 Travis 0.004239 boy T612 113049 1993 Travis 0.003845 boy T612 114052 1994 Travis 0.003338 boy T612 115065 1995 Travis 0.002788 boy T612 116072 1996 Travis 0.002576 boy T612 117080 1997 Travis 0.002346 boy T612 118097 1998 Travis 0.001937 boy T612 119101 1999 Travis 0.001770 boy T612 120114 2000 Travis 0.001576 boy T612 121133 2001 Travis 0.001424 boy T612 122143 2002 Travis 0.001306 boy T612 123158 2003 Travis 0.001209 boy T612 124165 2004 Travis 0.001164 boy T612 125173 2005 Travis 0.001114 boy T612 126162 2006 Travis 0.001186 boy T612 127169 2007 Travis 0.001087 boy T612 128176 2008 Travis 0.001025 boy T612
idf = boys.set_index(['name', 'year']) # pull out the indexes, and move them to the row.
idf[-50:]
prop sex soundex name year Reilly 2008 0.000097 boy R400 Sheldon 2008 0.000097 boy S435 Abdullah 2008 0.000097 boy A134 Jagger 2008 0.000097 boy J260 Thaddeus 2008 0.000097 boy T320 Case 2008 0.000096 boy C200 Kyson 2008 0.000096 boy K250 Lamont 2008 0.000096 boy L553 Chaz 2008 0.000096 boy C200 Makhi 2008 0.000096 boy M200 Jan 2008 0.000095 boy J500 Marques 2008 0.000095 boy M622 Oswaldo 2008 0.000095 boy O243 Donavan 2008 0.000095 boy D515 Keyon 2008 0.000095 boy K500 Kyan 2008 0.000095 boy K500 Simeon 2008 0.000095 boy S550 Trystan 2008 0.000095 boy T623 Andreas 2008 0.000094 boy A536 Dangelo 2008 0.000094 boy D524 Landin 2008 0.000094 boy L535 Reagan 2008 0.000094 boy R250 Turner 2008 0.000094 boy T656 Arnav 2008 0.000094 boy A651 Brenton 2008 0.000094 boy B653 Callum 2008 0.000094 boy C450 Jayvion 2008 0.000094 boy J150 Bridger 2008 0.000093 boy B632 Sammy 2008 0.000093 boy S500 Deegan 2008 0.000093 boy D250 Jaylan 2008 0.000093 boy J450 Lennon 2008 0.000093 boy L550 Odin 2008 0.000093 boy O350 Abdiel 2008 0.000092 boy A134 Jerimiah 2008 0.000092 boy J650 Eliezer 2008 0.000092 boy E426 Bronson 2008 0.000091 boy B652 Cornelius 2008 0.000091 boy C654 Pierre 2008 0.000091 boy P600 Cortez 2008 0.000091 boy C632 Baron 2008 0.000090 boy B650 Carlo 2008 0.000090 boy C640 Carsen 2008 0.000090 boy C625 Fletcher 2008 0.000090 boy F432 Izayah 2008 0.000090 boy I200 Kolten 2008 0.000090 boy K435 Damari 2008 0.000089 boy D560 Hugh 2008 0.000089 boy H200 Jensen 2008 0.000089 boy J525 Yurem 2008 0.000089 boy Y650
idf.ix['Travis']
prop sex soundex year 1880 0.000101 boy T612 1882 0.000082 boy T612 1883 0.000080 boy T612 1884 0.000081 boy T612 1885 0.000095 boy T612 1886 0.000092 boy T612 1889 0.000059 boy T612 1890 0.000075 boy T612 1891 0.000082 boy T612 1892 0.000099 boy T612 1893 0.000066 boy T612 1894 0.000064 boy T612 1895 0.000079 boy T612 1896 0.000077 boy T612 1897 0.000098 boy T612 1898 0.000053 boy T612 1899 0.000095 boy T612 1900 0.000166 boy T612 1902 0.000121 boy T612 1903 0.000077 boy T612 1904 0.000137 boy T612 1905 0.000147 boy T612 1906 0.000118 boy T612 1907 0.000132 boy T612 1908 0.000150 boy T612 1909 0.000181 boy T612 1910 0.000149 boy T612 1911 0.000166 boy T612 1912 0.000171 boy T612 1913 0.000203 boy T612 1914 0.000220 boy T612 1915 0.000182 boy T612 1916 0.000204 boy T612 1917 0.000215 boy T612 1918 0.000233 boy T612 1919 0.000228 boy T612 1920 0.000218 boy T612 1921 0.000219 boy T612 1922 0.000231 boy T612 1923 0.000209 boy T612 1924 0.000210 boy T612 1925 0.000208 boy T612 1926 0.000191 boy T612 1927 0.000220 boy T612 1928 0.000214 boy T612 1929 0.000246 boy T612 1930 0.000225 boy T612 1931 0.000266 boy T612 1932 0.000260 boy T612 1933 0.000290 boy T612 1934 0.000344 boy T612 1935 0.000324 boy T612 1936 0.000314 boy T612 1937 0.000293 boy T612 1938 0.000312 boy T612 1939 0.000312 boy T612 1940 0.000299 boy T612 1941 0.000278 boy T612 1942 0.000256 boy T612 1943 0.000216 boy T612 1944 0.000240 boy T612 1945 0.000225 boy T612 1946 0.000246 boy T612 1947 0.000218 boy T612 1948 0.000211 boy T612 1949 0.000207 boy T612 1950 0.000211 boy T612 1951 0.000197 boy T612 1952 0.000184 boy T612 1953 0.000189 boy T612 1954 0.000176 boy T612 1955 0.000171 boy T612 1956 0.000177 boy T612 1957 0.000164 boy T612 1958 0.000201 boy T612 1959 0.000233 boy T612 1960 0.000256 boy T612 1961 0.000271 boy T612 1962 0.000312 boy T612 1963 0.000430 boy T612 1964 0.000507 boy T612 1965 0.000568 boy T612 1966 0.000718 boy T612 1967 0.000902 boy T612 1968 0.001162 boy T612 1969 0.001567 boy T612 1970 0.002270 boy T612 1971 0.002762 boy T612 1972 0.003068 boy T612 1973 0.003637 boy T612 1974 0.003496 boy T612 1975 0.003674 boy T612 1976 0.004188 boy T612 1977 0.004259 boy T612 1978 0.004072 boy T612 1979 0.006169 boy T612 1980 0.005808 boy T612 1981 0.005435 boy T612 1982 0.005525 boy T612 1983 0.005627 boy T612 1984 0.005461 boy T612 1985 0.004725 boy T612 1986 0.004416 boy T612 1987 0.005335 boy T612 1988 0.005711 boy T612 1989 0.005500 boy T612 1990 0.005325 boy T612 1991 0.004786 boy T612 1992 0.004239 boy T612 1993 0.003845 boy T612 1994 0.003338 boy T612 1995 0.002788 boy T612 1996 0.002576 boy T612 1997 0.002346 boy T612 1998 0.001937 boy T612 1999 0.001770 boy T612 2000 0.001576 boy T612 2001 0.001424 boy T612 2002 0.001306 boy T612 2003 0.001209 boy T612 2004 0.001164 boy T612 2005 0.001114 boy T612 2006 0.001186 boy T612 2007 0.001087 boy T612 2008 0.001025 boy T612
idf.ix['Travis'].prop.plot()
<matplotlib.axes.AxesSubplot at 0x107ae5590>
boys.groupby('name')['prop'].mean()
name Aaden 0.000442 Aarav 0.000101 Aaron 0.002266 Ab 0.000044 Abb 0.000046 Abbie 0.000046 Abbott 0.000044 Abdiel 0.000092 Abdul 0.000064 Abdullah 0.000094 Abe 0.000225 Abel 0.000152 Abelardo 0.000041 Abie 0.000042 Abner 0.000130 ... Zander 0.000369 Zane 0.000175 Zavier 0.000098 Zayden 0.000185 Zayne 0.000091 Zeb 0.000080 Zebulon 0.000055 Zechariah 0.000112 Zed 0.000056 Zeke 0.000055 Zenas 0.000052 Zeno 0.000071 Zigmund 0.000052 Zion 0.000490 Zollie 0.000060 Name: prop, Length: 3437
boys.groupby('name')['prop'].mean().order()
name Danniel 0.000027 Kennard 0.000027 Deryl 0.000027 Grayling 0.000027 Cheryl 0.000028 Dwyane 0.000028 Michial 0.000028 Levern 0.000029 Diane 0.000029 Kimball 0.000029 Randolf 0.000029 Renard 0.000029 Theodis 0.000029 Patric 0.000029 Lex 0.000030 ... Paul 0.008322 Daniel 0.008517 Frank 0.010386 Edward 0.011069 Richard 0.014183 Thomas 0.014739 George 0.016254 David 0.016737 Joseph 0.017771 Michael 0.018342 Charles 0.019521 Robert 0.029625 William 0.034182 James 0.035465 John 0.041082 Name: prop, Length: 3437
boys['prop'].describe() # Getting the summary of statistics from the dataframe, based on proportion.e
count 129000.000000 mean 0.000916 std 0.003441 min 0.000026 25% 0.000071 50% 0.000142 75% 0.000453 max 0.081541
result = boys.groupby('year')['prop'].describe()
result[:50]
year 1880 count 1000.000000 mean 0.000931 std 0.004871 min 0.000042 25% 0.000059 50% 0.000118 75% 0.000372 max 0.081541 1881 count 1000.000000 mean 0.000930 std 0.004812 min 0.000037 25% 0.000065 50% 0.000120 75% 0.000369 max 0.080975 1882 count 1000.000000 mean 0.000928 std 0.004691 min 0.000041 25% 0.000066 50% 0.000115 75% 0.000377 max 0.078314 1883 count 1000.000000 mean 0.000929 std 0.004675 min 0.000044 25% 0.000062 50% 0.000116 75% 0.000382 max 0.079066 1884 count 1000.000000 mean 0.000927 std 0.004542 min 0.000041 25% 0.000065 50% 0.000122 75% 0.000385 max 0.076476 1885 count 1000.000000 mean 0.000926 std 0.004459 min 0.000043 25% 0.000069 50% 0.000121 75% 0.000373 max 0.075517 1886 count 1000.000000 mean 0.000926
df = boys[boys.year == 2008]
df.prop
128000 0.010355 128001 0.009437 128002 0.009301 128003 0.008799 128004 0.008702 128005 0.008566 128006 0.008442 128007 0.008438 128008 0.008268 128009 0.008061 128010 0.007877 128011 0.007694 128012 0.007570 128013 0.007467 128014 0.007262 ... 128985 0.000092 128986 0.000091 128987 0.000091 128988 0.000091 128989 0.000091 128990 0.000090 128991 0.000090 128992 0.000090 128993 0.000090 128994 0.000090 128995 0.000090 128996 0.000089 128997 0.000089 128998 0.000089 128999 0.000089 Name: prop, Length: 1000
df = boys[boys.year == 2008].sort_index(by='prop', ascending=False) # If not in descending order. Can also do ascending=True for ascending.
df.prop
128000 0.010355 128001 0.009437 128002 0.009301 128003 0.008799 128004 0.008702 128005 0.008566 128006 0.008442 128007 0.008438 128008 0.008268 128009 0.008061 128010 0.007877 128011 0.007694 128012 0.007570 128013 0.007467 128014 0.007262 ... 128985 0.000092 128986 0.000091 128987 0.000091 128988 0.000091 128989 0.000091 128992 0.000090 128994 0.000090 128993 0.000090 128995 0.000090 128991 0.000090 128990 0.000090 128997 0.000089 128998 0.000089 128996 0.000089 128999 0.000089 Name: prop, Length: 1000
df.prop.cumsum() # numpy
128000 0.010355 128001 0.019792 128002 0.029093 128003 0.037892 128004 0.046594 128005 0.055160 128006 0.063602 128007 0.072040 128008 0.080308 128009 0.088369 128010 0.096246 128011 0.103940 128012 0.111510 128013 0.118977 128014 0.126239 ... 128985 0.794154 128986 0.794245 128987 0.794336 128988 0.794427 128989 0.794518 128992 0.794608 128994 0.794698 128993 0.794788 128995 0.794878 128991 0.794968 128990 0.795058 128997 0.795147 128998 0.795236 128996 0.795325 128999 0.795414 Length: 1000
df.prop.cumsum().searchsorted(0.5) # how many does it take to reach 50%. Also called a measure of diversity.
127
df.prop.cumsum()[:130]
128000 0.010355 128001 0.019792 128002 0.029093 128003 0.037892 128004 0.046594 128005 0.055160 128006 0.063602 128007 0.072040 128008 0.080308 128009 0.088369 128010 0.096246 128011 0.103940 128012 0.111510 128013 0.118977 128014 0.126239 128015 0.133404 128016 0.140356 128017 0.147110 128018 0.153482 128019 0.159584 128020 0.165669 128021 0.171711 128022 0.177696 128023 0.183670 128024 0.189600 128025 0.195505 128026 0.201343 128027 0.207172 128028 0.212924 128029 0.218338 128030 0.223717 128031 0.229061 128032 0.234334 128033 0.239182 128034 0.244003 128035 0.248758 128036 0.253340 128037 0.257906 128038 0.262453 128039 0.266966 128040 0.271479 128041 0.275903 128042 0.280226 128043 0.284531 128044 0.288831 128045 0.293104 128046 0.297326 128047 0.301496 128048 0.305538 128049 0.309454 128050 0.313330 128051 0.317158 128052 0.320967 128053 0.324665 128054 0.328339 128055 0.331989 128056 0.335588 128057 0.339177 128058 0.342724 128059 0.346245 128060 0.349669 128061 0.353036 128062 0.356375 128063 0.359659 128064 0.362908 128065 0.366139 128066 0.369359 128067 0.372383 128068 0.375391 128069 0.378297 128070 0.381182 128071 0.383982 128072 0.386771 128073 0.389546 128074 0.392286 128075 0.395015 128076 0.397694 128077 0.400343 128078 0.402992 128079 0.405624 128080 0.408225 128081 0.410765 128082 0.413265 128083 0.415745 128084 0.418224 128085 0.420698 128086 0.423111 128087 0.425494 128088 0.427855 128089 0.430198 128090 0.432527 128091 0.434848 128092 0.437132 128093 0.439357 128094 0.441551 128095 0.443726 128096 0.445889 128097 0.447981 128098 0.450050 128099 0.452117 128100 0.454156 128101 0.456166 128102 0.458152 128103 0.460093 128104 0.462009 128105 0.463904 128106 0.465773 128107 0.467637 128108 0.469478 128109 0.471285 128110 0.473029 128111 0.474770 128112 0.476492 128113 0.478208 128114 0.479911 128115 0.481593 128116 0.483275 128117 0.484921 128118 0.486563 128119 0.488203 128120 0.489818 128121 0.491429 128122 0.493032 128123 0.494626 128124 0.496173 128125 0.497718 128126 0.499243 128127 0.500748 128128 0.502243 128129 0.503723 Length: 130
def get_quantile_count(group, quantile = 0.5):
df = group.sort_index(by='prop', ascending=False)
return df.prop.cumsum().searchsorted(quantile)
boys.groupby('year').apply(get_quantile_count).plot()
<matplotlib.axes.AxesSubplot at 0x106e90550>
def get_quantile_count(group, quantile=0.5): # Problem with no different colors for boys and girls.
group = group.groupby('soundex').sum()
df = group.sort_index(by='prop', ascending=False)
return df.prop.cumsum().searchsorted(quantile)
#f = lambda x: get_quantile_count(x, 0.1)
q = 0.25
boy_ct = boys.groupby('year').apply(get_quantile_count, quantile=q) # to pass different values for quantile
girl_ct = girls.groupby('year').apply(get_quantile_count, quantile=q)
boy_ct.plot(label='boy')
girl_ct.plot(label='girl')
legend(loc='best') # with --pylab=inline, we don't have to do plt.legend()
<matplotlib.legend.Legend at 0x1097ff650>
boys[boys.year == 2008].prop.rank() # mean rank by default.
128000 1000 128001 999 128002 998 128003 997 128004 996 128005 995 128006 994 128007 993 128008 992 128009 991 128010 990 128011 989 128012 988 128013 987 128014 986 ... 128985 16.0 128986 12.5 128987 12.5 128988 12.5 128989 12.5 128990 7.5 128991 7.5 128992 7.5 128993 7.5 128994 7.5 128995 7.5 128996 2.5 128997 2.5 128998 2.5 128999 2.5 Name: prop, Length: 1000
grouped = boys.groupby('year')['prop']
grouped.transform(Series.rank) # transform is more rigid than apply. Output the same size as the input.
0 1000 1 999 2 998 3 997 4 996 5 995 6 994 7 993 8 992 9 991 10 990 11 989 12 988 13 987 14 986 ... 128985 16.0 128986 12.5 128987 12.5 128988 12.5 128989 12.5 128990 7.5 128991 7.5 128992 7.5 128993 7.5 128994 7.5 128995 7.5 128996 2.5 128997 2.5 128998 2.5 128999 2.5 Name: prop, Length: 129000
boys['year_rank'] = grouped.transform(Series.rank)
boys[boys.name == 'Wesley'].year_rank.plot()
<matplotlib.axes.AxesSubplot at 0x10a348cd0>
idf = boys.set_index(['name', 'year']) # same as above
idf.ix['Wesley']
idf.ix['Wesley'].year_rank
idf.ix['Wesley'].year_rank.plot()
<matplotlib.axes.AxesSubplot at 0x10456c050>
% timeit result = grouped.transform(lambda x: x - x.mean())
10 loops, best of 3: 25.7 ms per loop
% timeit result = grouped.apply(lambda x: x - x.mean())
100 loops, best of 3: 19.5 ms per loop
names # Output only the proportion of each name, not the amount.
<class 'pandas.core.frame.DataFrame'> Int64Index: 258000 entries, 0 to 257999 Data columns: year 258000 non-null values name 258000 non-null values prop 258000 non-null values sex 258000 non-null values soundex 258000 non-null values dtypes: float64(1), int64(1), object(3)
births = read_csv('births.csv')
merged = merge(names, births, on=['year', 'sex']) # merge 2 tables: names and births.
merge(names, births) # same as above. Join is performed using the common columns: year and sex
# Many to many joins in SQL compute the cartesians products of duplicated keys.
<class 'pandas.core.frame.DataFrame'> Int64Index: 258000 entries, 0 to 257999 Data columns: year 258000 non-null values name 258000 non-null values prop 258000 non-null values sex 258000 non-null values soundex 258000 non-null values births 258000 non-null values dtypes: float64(1), int64(2), object(3)
merged['persons'] = np.floor(merged.prop * merged.births)
merged.head()
year name prop sex soundex births persons 0 1880 John 0.081541 boy J500 118405 9654 1 1880 William 0.080511 boy W450 118405 9532 2 1880 James 0.050057 boy J520 118405 5926 3 1880 Charles 0.045167 boy C642 118405 5347 4 1880 George 0.043292 boy G620 118405 5125
merged.groupby(['name', 'sex'])['persons'].sum() # slice and dice. It's a hierarhical labeling.
name sex Aaden boy 959 Aaliyah girl 39660 Aarav boy 219 Aaron boy 508034 girl 1365 Ab boy 24 Abagail girl 2680 Abb boy 15 Abbey girl 14333 Abbie boy 10 girl 16556 Abbigail girl 6792 Abbott boy 9 Abby girl 47845 Abdiel boy 199 ... Zilpah girl 8 Zilpha girl 70 Zina girl 2263 Zion boy 11428 girl 1530 Zita girl 831 Zoa girl 73 Zoe girl 66271 Zoey girl 19335 Zoie girl 4688 Zola girl 4847 Zollie boy 60 Zona girl 3218 Zora girl 4551 Zula girl 3578 Name: persons, Length: 7455
merged.groupby(['name', 'sex'])['persons'].sum().order()
name sex Achsah girl 3 Ala girl 3 Alabama girl 3 Alfred girl 3 Amey girl 3 Chester girl 3 Chestina girl 3 Clem girl 3 Crissie girl 3 Daisye girl 3 Dema girl 3 Dollye girl 3 Eithel girl 3 Abie boy 4 Acey boy 4 ... Elizabeth girl 1540430 Patricia girl 1567602 Daniel boy 1776483 Christopher boy 1924919 Thomas boy 2239606 Charles boy 2320818 Joseph boy 2490274 Richard boy 2535681 David boy 3495940 William boy 3951365 Mary girl 4097626 Michael boy 4207352 Robert boy 4752198 John boy 5016124 James boy 5021269 Name: persons, Length: 7455
mboys = merge(boys, births) # inner join by default.
mboys['persons'] = np.floor(mboys.prop * mboys.births)
persons = mboys.set_index(['year', 'name']).persons # Select out persons
type(persons) # One dimensional series.
pandas.core.series.Series
persons # hierarhical index
year name 1880 John 9654 William 9532 James 5926 Charles 5347 George 5125 Frank 3241 Joseph 2632 Thomas 2533 Henry 2443 Robert 2415 Edward 2363 Harry 2152 Walter 1754 Arthur 1598 Fred 1568 ... 2008 Eliezer 199 Bronson 197 Cornelius 197 Pierre 197 Cortez 197 Baron 195 Carlo 195 Carsen 195 Fletcher 195 Izayah 195 Kolten 195 Damari 193 Hugh 193 Jensen 193 Yurem 193 Name: persons, Length: 129000
persons.ix[:, 'Christopher'].plot(kind='bar', rot=90) # Select out all the people named Chris. Plot is kind of crowded. Matplotlib doesn't go more than 130 in x axis.
<matplotlib.axes.AxesSubplot at 0x10a3ad1d0>
persons.unstack('name') # Create a data frame whose columns are each unique names, and the row indexes are the years.
<class 'pandas.core.frame.DataFrame'> Int64Index: 129 entries, 1880 to 2008 Columns: 3437 entries, Aaden to Zollie dtypes: float64(3437)
result = _ # underscore in ipython: the output of the last statement, because we don't want to compute again the same thing.
result
<class 'pandas.core.frame.DataFrame'> Int64Index: 129 entries, 1880 to 2008 Columns: 3437 entries, Aaden to Zollie dtypes: float64(3437)
result['Wesley']
year 1880 123 1881 102 1882 144 1883 104 1884 120 1885 107 1886 120 1887 114 1888 147 1889 100 1890 130 1891 114 1892 141 1893 136 1894 149 1895 156 1896 144 1897 120 1898 151 1899 119 1900 180 1901 146 1902 144 1903 152 1904 165 1905 174 1906 155 1907 190 1908 184 1909 195 1910 208 1911 261 1912 484 1913 584 1914 789 1915 990 1916 1026 1917 1013 1918 1145 1919 1095 1920 1247 1921 1326 1922 1290 1923 1376 1924 1346 1925 1266 1926 1321 1927 1327 1928 1176 1929 1176 1930 1187 1931 1113 1932 1135 1933 1030 1934 1116 1935 1037 1936 1062 1937 1064 1938 1071 1939 1116 1940 1122 1941 1089 1942 1258 1943 1263 1944 1169 1945 1105 1946 1249 1947 1455 1948 1334 1949 1398 1950 1516 1951 1511 1952 1592 1953 1732 1954 1839 1955 1918 1956 2010 1957 1987 1958 1951 1959 2001 1960 2016 1961 2001 1962 1965 1963 2097 1964 1968 1965 1862 1966 1801 1967 1592 1968 1642 1969 1853 1970 2060 1971 1958 1972 1694 1973 1558 1974 1466 1975 1534 1976 2146 1977 4301 1978 3301 1979 3232 1980 3164 1981 3039 1982 3024 1983 2787 1984 2799 1985 3038 1986 3040 1987 3245 1988 3477 1989 3458 1990 3313 1991 3268 1992 3236 1993 3169 1994 3030 1995 2795 1996 2616 1997 2355 1998 2240 1999 2208 2000 2217 2001 2024 2002 2049 2003 2116 2004 2103 2005 2107 2006 2096 2007 2057 2008 2050 Name: Wesley, Length: 129
result['Wesley'].plot()
<matplotlib.axes.AxesSubplot at 0x10d4968d0>