%matplotlib inline import matplotlib.pyplot as plt import numpy as np from pylab import figure, show from pandas import DataFrame, Series import pandas as pd try: import mpld3 from mpld3 import enable_notebook from mpld3 import plugins enable_notebook() except Exception as e: print "Attempt to import and enable mpld3 failed", e # what would seaborn do? try: import seaborn as sns except Exception as e: print "Attempt to import and enable seaborn failed", e import os NAMES_DIR = os.path.join(os.pardir, "pydata-book", "ch02", "names") assert os.path.exists(NAMES_DIR) # show the first five files in the NAMES_DIR import glob glob.glob(NAMES_DIR + "/*")[:5] # 2010 is the last available year in the pydata-book repo import os years = range(1880, 2011) pieces = [] columns = ['name', 'sex', 'births'] for year in years: path = os.path.join(NAMES_DIR, 'yob%d.txt' % year) frame = pd.read_csv(path, names=columns) frame['year'] = year pieces.append(frame) # Concatenate everything into a single DataFrame names = pd.concat(pieces, ignore_index=True) # why floats? I'm not sure. names.describe() # how many people, names, males and females represented in names? 
# total number of births recorded across all years
names.births.sum()

# F vs M
names.groupby('sex')['births'].sum()

# total number of names
len(names.groupby('name'))

# NOTE(review): this block uses the pandas API of the time
# (pivot_table(rows=..., cols=...), sort_index(by=...)); newer pandas releases
# spell these index=/columns= and sort_values(...) -- confirm against the
# installed version before changing.

# use pivot_table to collect records by year (rows) and sex (columns)
total_births = names.pivot_table('births', rows='year', cols='sex', aggfunc=np.sum)
total_births.head()

# You can use groupby to get the equivalent pivot_table calculation.
# Summing only the 'births' column over (year, sex) and unstacking 'sex'
# gives the same year-by-sex table; it is computed once and reused for the
# plot below.
births_by_year_sex = names.groupby(['year', 'sex'])['births'].sum().unstack()
births_by_year_sex

# how to calculate the total births / year
# 'births' is selected explicitly so the result does not depend on how
# groupby().sum() treats the non-numeric 'name' and 'sex' columns.
names.groupby('year')[['births']].sum().plot(title="total births by year")

births_by_year_sex.plot(title="births (M/F) by year")


# from book: add prop to names
def add_prop(group):
    """Return a copy of *group* with an added ``prop`` column.

    ``prop`` is each row's ``births`` divided by the sum of ``births`` over
    the whole group, so the column sums to 1 within the group.
    """
    # Work on a copy: a function passed to groupby.apply should not mutate
    # the group it is handed.
    group = group.copy()
    # Integer division floors, so cast to float before dividing
    births = group.births.astype(float)
    group['prop'] = births / births.sum()
    return group


names = names.groupby(['year', 'sex']).apply(add_prop)

# verify prop --> all adds up to 1
np.allclose(names.groupby(['year', 'sex']).prop.sum(), 1)

# number of records in full names dataframe
len(names)

# from book: useful to work with top 1000 for each year/sex combo
# can use groupby/apply
# (inline form, result only displayed; the named helper below does the same)
names.groupby(['year', 'sex']).apply(lambda g: g.sort_index(by='births', ascending=False)[:1000])


def get_top1000(group):
    """Return the 1000 rows of *group* with the largest ``births`` values."""
    return group.sort_index(by='births', ascending=False)[:1000]


grouped = names.groupby(['year', 'sex'])
top1000 = grouped.apply(get_top1000)
top1000.head()

# Do pivot table: row: year and cols= names for top 1000
top_births = top1000.pivot_table('births', rows='year', cols='name', aggfunc=np.sum)
top_births.tail()

# is your name in the top_births list?
top_births['Raymond'].plot(title='plot for Raymond')

# for Aaden, which shows up at the end
top_births.Aaden.plot(xlim=[1880,2010])

# number of names represented in top_births
len(top_births.columns)

# how to get the most popular name of all time in top_births?
# Total births per name over all years, largest first.
# NOTE(review): Series.sort(...) sorts in place in the pandas release this
# notebook was written for; newer releases use sort_values(...) -- confirm
# against the installed version before changing.
most_common_names = top_births.sum()
most_common_names.sort(ascending=False)
most_common_names.head()

# as of v 0.1 (2014.03.04; presumably of mpld3), the name labeling doesn't
# work -- so disable mpld3 for this figure.
# The mpld3 import earlier in the notebook is best-effort, so the name may
# not exist; only toggle it when it was actually imported.
have_mpld3 = 'mpld3' in globals()
if have_mpld3:
    mpld3.disable_notebook()
try:
    plt.figure()
    # top 50 names, reversed so the most common one ends up at the top of
    # the horizontal bar chart
    most_common_names[:50][::-1].plot(kind='barh', figsize=(10,10))
finally:
    # turn mpld3 back on, even if the plot above failed
    if have_mpld3:
        mpld3.enable_notebook()

# instead of top_birth -- get all_births
# NOTE(review): rows=/cols= are the pivot_table keywords of the pandas release
# this notebook targets; newer releases use index=/columns=.
all_births = names.pivot_table('births', rows='year', cols='name', aggfunc=np.sum)
# a name with no records in a given year has no births that year
all_births = all_births.fillna(0)
all_births.tail()

# set up to do start/end calculation
# running total of births per name, accumulated down the year axis
all_births_cumsum = all_births.cumsum(axis=0)
all_births_cumsum.tail()

# remind ourselves of what's in names
names.head()

# columns in names
names.columns