Set working path, and import libraries and read dataframe pickles last_year = 2013 #change this when Social Security database is updated save_path = "user_charts" # files created by this notebook will be saved in this directory import time import os if not os.path.isdir(save_path): # creates path if it does not exist os.makedirs(save_path) #os.chdir("Baby_names_US_IPython") from math import floor import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn # comment out if you don't have it, but it makes good-looking charts %run download_and_process.py # used to round limit of y axis up to second-most-significant digit def determine_y_limit(x): significance = int(floor((log10(x)))) val = floor(x / (10 ** (significance - 1))) + 1 val = val * (10 ** (significance - 1)) return val namelist = ["William", "Will", "Willy", "Willie", "Billy", "Bill"] sexes = ['M'] # can be length 1 or same length as names yearstart = 1880 yearend = 2013 start = time.time() df_chart = yob.copy() if len(sexes) == 1: sexes = sexes * len(namelist) df_chart = df_chart[df_chart['name'].isin(namelist)] df_chart['temp'] = 0 # I think this whole block is now unnecessary due to the previous line .isin function for row in range(len(df_chart)): for pos in range(len(namelist)): if df_chart.name.iloc[row] == namelist[pos] and df_chart.sex.iloc[row] == sexes[pos]: df_chart.temp.iloc[row] = 1 df_chart = df_chart[df_chart.temp == 1] print "%0.1f minutes elapsed. %d records remain.\nTail of dataframe:" % ((time.time() - start)/60, len(df_chart)) print df_chart.tail() #To keep more than one data set for charts in memory, change name of chart_1 chart_1 = pd.DataFrame(pd.pivot_table(df_chart, values='pct', index = 'year', columns=['name', 'sex'])) col = chart_1.columns[0] for yr in range(yearstart, yearend+1): #inserts missing years if yr not in chart_1.index: #chart_1[col][yr] = 0.0 chart_1 = chart_1.append(pd.DataFrame(index=[yr], columns=[col], data=[0.0])) chart_1 = chart_1.fillna(0) chart_1.sort(inplace=True, ascending=True) temp = names[names.name != 'Jennifer'] sexes = list(temp[temp.name.str.contains("^J.+n+.(f|v|ph).+r$")].sort('pct_sum', ascending=False).sex) namelist = list(temp[temp.name.str.contains("^J.+n+.(f|v|ph).+r$")].sort('pct_sum', ascending=False).name) yearstart = 1880 yearend = 2013 start = time.time() df_chart = yob.copy() if len(sexes) == 1: sexes = sexes * len(namelist) df_chart = df_chart[df_chart['name'].isin(namelist)] df_chart['temp'] = 0 # I think this whole block is now unnecessary due to the previous line .isin function for row in range(len(df_chart)): for pos in range(len(namelist)): if df_chart.name.iloc[row] == namelist[pos] and df_chart.sex.iloc[row] == sexes[pos]: df_chart.temp.iloc[row] = 1 df_chart = df_chart[df_chart.temp == 1] print "%0.1f minutes elapsed. %d records remain.\nTail of dataframe:" % ((time.time() - start)/60, len(df_chart)) print df_chart.tail() #To keep more than one data set for charts in memory, change name of chart_1 chart_1 = pd.DataFrame(pd.pivot_table(df_chart, values='pct', index = 'year', columns=['name', 'sex'])) col = chart_1.columns[0] for yr in range(yearstart, yearend+1): #inserts missing years if yr not in chart_1.index: #chart_1[col][yr] = 0.0 chart_1 = chart_1.append(pd.DataFrame(index=[yr], columns=[col], data=[0.0])) chart_1 = chart_1.fillna(0) chart_1.sort(inplace=True, ascending=True) #a single function to make the four different kinds of charts def make_chart(df=chart_1, form='line', title='', colors= [], smoothing=0, \ groupedlist = [], baseline='sym', png_name=''): dataframe = df.copy() startyear = min(list(dataframe.index)) endyear = max(list(dataframe.index)) yearstr = '%d-%d' % (startyear, endyear) legend_size = 0.01 has_male = False has_female = False has_both = False max_y = 0 for name, sex in dataframe.columns: max_y = max(max_y, dataframe[(name, sex)].max()) final_name = name if sex == 'M': has_male = True if sex == 'F': has_female = True if smoothing > 0: newvalues = [] for row in range(len(dataframe)): start = max(0, row - smoothing) end = min(len(dataframe) - 1, row + smoothing) newvalues.append(dataframe[(name, sex)].iloc[start:end].mean()) for row in range(len(dataframe)): dataframe[(name, sex)].iloc[row] = newvalues[row] if has_male and has_female: y_text = "% of births of indicated sex" has_both = True elif has_male: y_text = "Percent of male births" else: y_text = "Percent of female births" num_series = len(dataframe.columns) if colors == []: colors = ["#1f78b4","#ae4ec9","#33a02c","#fb9a99","#e31a1c","#a6cee3", "#fdbf6f","#ff7f00","#cab2d6","#6a3d9a","#ffff99","#b15928"] num_colors = len(colors) if num_series > num_colors: print "Warning: colors will be repeated." if title == '': if num_series == 1: title = "Popularity of baby name %s in U.S., %s" % (final_name, yearstr) else: title = "Popularity of baby names in U.S., %s" % (yearstr) x_values = range(startyear, endyear + 1) y_zeroes = [0] * (endyear - startyear) if form == 'line': fig, ax = plt.subplots(num=None, figsize=(16, 9), dpi=300, facecolor='w', edgecolor='w') counter = 0 for name, sex in dataframe.columns: color = colors[counter % num_colors] counter += 1 if has_both: label = "%s (%s)" % (name, sex) else: label = name ax.plot(x_values, dataframe[(name, sex)], label=label, color=color, linewidth = 3) ax.set_ylim(0,determine_y_limit(max_y)) ax.set_xlim(startyear, endyear) ax.set_ylabel(y_text, size = 13) box = ax.get_position() ax.set_position([box.x0, box.y0 + box.height * legend_size, box.width, box.height * (1 - legend_size)]) legend_cols = min(5, num_series) ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05), fancybox=True, shadow=True, ncol=legend_cols) if form == 'subplots_auto': counter = 0 fig, axes = plt.subplots(num_series, 1, figsize=(12, 3.5*num_series)) print 'Maximum alpha: %d percent' % (determine_y_limit(max_y)) for name, sex in dataframe.columns: if sex=='M': sex_label = 'male' else: sex_label = 'female' label = "Percent of %s births for %s" % (sex_label, name) current_ymax = dataframe[(name, sex)].max() tint = 1.0 * current_ymax / determine_y_limit(max_y) axes[counter].plot(x_values, dataframe[(name, sex)], color='k') axes[counter].set_ylim(0,determine_y_limit(current_ymax)) axes[counter].set_xlim(startyear, endyear) axes[counter].fill_between(x_values, dataframe[(name, sex)], color=colors[0], alpha=tint, interpolate=True) axes[counter].set_ylabel(label, size=11) plt.subplots_adjust(hspace=0.1) counter += 1 if form == 'subplots_same': counter = 0 fig, axes = plt.subplots(num_series, 1, figsize=(12, 3.5*num_series)) print 'Maximum y axis: %d percent' % (determine_y_limit(max_y)) for name, sex in dataframe.columns: if sex=='M': sex_label = 'male' else: sex_label = 'female' label = "Percent of %s births for %s" % (sex_label, name) axes[counter].plot(x_values, dataframe[(name, sex)], color='k') axes[counter].set_ylim(0,determine_y_limit(max_y)) axes[counter].set_xlim(startyear, endyear) axes[counter].fill_between(x_values, dataframe[(name, sex)], color=colors[1], alpha=1, interpolate=True) axes[counter].set_ylabel(label, size=11) plt.subplots_adjust(hspace=0.1) counter += 1 if form == 'stream': plt.figure(num=None, figsize=(20,10), dpi=150, facecolor='w', edgecolor='k') plt.title(title, size=17) plt.xlim(startyear, endyear) if has_both: yaxtext = 'Percent of births of indicated sex (scale: ' elif has_male: yaxtext = 'Percent of male births (scale: ' else: yaxtext = 'Percent of female births (scale: ' scale = str(determine_y_limit(max_y)) + ')' yaxtext += scale plt.ylabel(yaxtext, size=13) polys = plt.stackplot(x_values, *[dataframe[(name, sex)] for name, sex in dataframe.columns], colors=colors, baseline=baseline) legendProxies = [] for poly in polys: legendProxies.append(plt.Rectangle((0, 0), 1, 1, fc=poly.get_facecolor()[0])) namelist = [] for name, sex in dataframe.columns: if has_both: namelist.append('%s (%s)' % (name, sex)) else: namelist.append(name) plt.legend(legendProxies, namelist, loc=3, ncol=2) plt.tick_params(\ axis='y', which='both', # major and minor ticks left='off', right='off', labelleft='off') plt.show() if png_name != '': filename = save_path + "/" + png_name + ".png" plt.savefig(filename) plt.close() #line graph make_chart(df=chart_1, form='line', # line , subplots_auto , subplots_same , stream title='', colors= [], smoothing=0, baseline='sym', # zero , sym , wiggle , weighted_wiggle png_name = '' # if '', will not be saved ) from math import log10 %matplotlib inline #subplots with autoscaling, with the intensity of the fill color proportional to peak maximum make_chart(df=chart_1, form='subplots_auto', # line , subplots_auto , subplots_same , stream title='', colors= [], smoothing=0, baseline='sym', # zero , sym , wiggle , weighted_wiggle png_name = '' # if '', will not be saved ) #subplots with all y axes having the same scale make_chart(df=chart_1, form='subplots_same', # line , subplots_auto , subplots_same , stream title='', colors= [], smoothing=0, baseline='sym', # zero , sym , wiggle , weighted_wiggle png_name = '' # if '', will not be saved ) #stream graph 'sym' make_chart(df=chart_1, form='stream', # line , subplots_auto , subplots_same , stream title='', colors= [], smoothing=0, baseline='sym', # zero , sym , wiggle , weighted_wiggle png_name = '' # if '', will not be saved ) #stream graph 'zero' make_chart(df=chart_1, form='stream', # line , subplots_auto , subplots_same , stream title='', colors= [], smoothing=0, baseline='zero', # zero , sym , wiggle , weighted_wiggle png_name = '' # if '', will not be saved ) #stream graph 'wiggle' make_chart(df=chart_1, form='stream', # line , subplots_auto , subplots_same , stream title='', colors= [], smoothing=0, baseline='wiggle', # zero , sym , wiggle , weighted_wiggle png_name = '' # if '', will not be saved ) #stream graph 'weighted_wiggle' make_chart(df=chart_1, form='stream', # line , subplots_auto , subplots_same , stream title='', colors= [], smoothing=0, baseline='weighted_wiggle', # zero , sym , wiggle , weighted_wiggle png_name = '' # if '', will not be saved ) names[names.name.str.contains("^J.+n+.(f|v|ph).+r$")].sort('pct_sum', ascending=False) x = list(names[names.name.str.contains("^J.+n+.(f|v|ph).+r$")].sort('pct_sum', ascending=False).name) y = list(names[names.name.str.contains("^J.+n+.(f|v|ph).+r$")].sort('pct_sum', ascending=False).sex)