listed_path = "lists/mythological_names_eg_gk_ro_no.list" totals_title = "Mythological names in U.S. Social Security baby names database, 1880-2013" top_cutoff = 6 top_boys_title = "Top %d mythological boys' names from U.S. Social Security database, 1880-2013" % (top_cutoff) top_girls_title = "Top %d mythological girls' names from U.S. Social Security database, 1880-2013" % (top_cutoff) last_year = 2013 #change this when Social Security database is updated save_path = "user_charts" # files created by this notebook will be saved in this directory import time import os if not os.path.isdir(save_path): # creates path if it does not exist os.makedirs(save_path) import math import pandas as pd import numpy as np import matplotlib.pyplot as plt %matplotlib inline import seaborn # comment out if you don't have it, but it makes good-looking charts print 'This is standard output from download_and_process.py' %run download_and_process.py print '--------------------\nFirst 80 characters of list:' listed_file = open(listed_path, "r").read() print listed_file[:80] + ' ...' all_listed = eval(listed_file) # make sure you trust this file! all_listed_set = set(all_listed) # to remove duplicates all_listed = list(all_listed) print "all_listed: list of length", len(all_listed) # reduce names dataframe to those matching list print '--------------------\nDataframe names filtered to those that match list' print "%d records to begin." % (len(names)) names_listed = names[names.name.isin(all_listed)].copy() names_listed.sort('pct_max', ascending=False, inplace=True) print "%d records remaining." 
% (len(names_listed)) listed_in_df = list(names_listed.name) print names_listed.head(10) listed_m = list(names[(names.sex == 'M') & (names.name.isin(listed_in_df))]['name']) listed_f = list(names[(names.sex == 'F') & (names.name.isin(listed_in_df))]['name']) #reduce yob dataframe to those matching list print '--------------------\nDataframe yob filtered to those that match list (count only)' print "%d records to begin." % (len(yob)) yob_listed = yob[yob.name.isin(listed_in_df)].copy() yob_listed.sort(['year', 'sex', 'name'], ascending=False, inplace=True) print "%d records remaining." % (len(yob_listed)) # m and f totals yob_listed_f_agg = pd.DataFrame(yob_listed[yob_listed.sex == 'F'].groupby('year').sum())[['births', 'pct']] yob_listed_m_agg = pd.DataFrame(yob_listed[yob_listed.sex == 'M'].groupby('year').sum())[['births', 'pct']] print '--------------------\nHead of total matching list per year, female' print yob_listed_f_agg.head() # print chart of m and f totals print '\n' # function to determine a nice y-axis limit a little above the maximum value # rounds maximum y up to second-most-significant digit def determine_y_limit(x): significance = int(math.floor((math.log10(x)))) val = math.floor(x / (10 ** (significance - 1))) + 1 val = val * (10 ** (significance - 1)) return val #data xf = list(yob_listed_f_agg.index) xm = list(yob_listed_m_agg.index) plt.figure(figsize=(16,9)) plt.plot(xf, list(yob_listed_f_agg.pct), color="red") plt.plot(xm, list(yob_listed_m_agg.pct), color="blue") plt.ylim(0, determine_y_limit(max(list(yob_listed_f_agg.pct) +list(yob_listed_m_agg.pct)))) plt.xlim(1880, 2013) plt.title(totals_title, fontsize = 20) plt.xlabel("Year", fontsize = 14) plt.ylabel("% of total births of that sex", fontsize = 14) plt.show() #function to make dataframe for top names def top_df(yobdf, names, sexes): """ yobdf = dataframe derived from yob; normally it would just be yob itself. 
names = list of names sexes = list of length 1 for all the same sex, or same length as names. 'F' and 'M' allowed """ df_chart = yobdf.copy() assert len(sexes) == 1 or len(names) == len(sexes) if len(sexes) == 1: sexes = sexes * len(names) df_chart = df_chart[df_chart['name'].isin(names)] df_chart['temp'] = 0 for row in range(len(df_chart)): for pos in range(len(names)): if df_chart.name.iloc[row] == names[pos] and df_chart.sex.iloc[row] == sexes[pos]: df_chart.temp.iloc[row] = 1 df_chart = df_chart[df_chart.temp == 1] print "Tail of dataframe:" print df_chart.tail() output_df = pd.DataFrame(pd.pivot_table(df_chart, values='pct', index = 'year', columns=['name', 'sex'])) col = output_df.columns[0] for yr in range(1880, last_year + 1): #inserts missing years if yr not in output_df.index: #output_df[col][yr] = 0.0 output_df = output_df.append(pd.DataFrame(index=[yr], columns=[col], data=[0.0])) output_df = output_df.fillna(0) return output_df listed_top_m = top_df(yob, listed_m[:top_cutoff], ['M']) listed_top_f = top_df(yob, listed_f[:top_cutoff], ['F']) #a single function to make the four different kinds of charts def make_chart(df, form='line', title='', colors= [], smoothing=0, \ groupedlist = [], baseline='sym', png_name=''): dataframe = df.copy() startyear = min(list(dataframe.index)) endyear = max(list(dataframe.index)) yearstr = '%d-%d' % (startyear, endyear) legend_size = 0.01 has_male = False has_female = False has_both = False max_y = 0 for name, sex in dataframe.columns: max_y = max(max_y, dataframe[(name, sex)].max()) final_name = name if sex == 'M': has_male = True if sex == 'F': has_female = True if smoothing > 0: newvalues = [] for row in range(len(dataframe)): start = max(0, row - smoothing) end = min(len(dataframe) - 1, row + smoothing) newvalues.append(dataframe[(name, sex)].iloc[start:end].mean()) for row in range(len(dataframe)): dataframe[(name, sex)].iloc[row] = newvalues[row] if has_male and has_female: y_text = "% of births of indicated sex" 
has_both = True elif has_male: y_text = "Percent of male births" else: y_text = "Percent of female births" num_series = len(dataframe.columns) if colors == []: colors = ['#BB2114', '#0C5966', '#BA7814', '#4459AB', '#6B3838', '#B8327B', '#2B947F', '#0D83B5', '#684287', '#8C962C', '#92289E', '#242D7D'] # my own list of dark contrasting colors num_colors = len(colors) if num_series > num_colors: print "Warning: colors will be repeated." if title == '': if num_series == 1: title = "Popularity of baby name %s in U.S., %s" % (final_name, yearstr) else: title = "Popularity of baby names in U.S., %s" % (yearstr) x_values = range(startyear, endyear + 1) y_zeroes = [0] * (endyear - startyear) if form == 'line': fig, ax = plt.subplots(num=None, figsize=(16, 9), dpi=300, facecolor='w', edgecolor='w') counter = 0 for name, sex in dataframe.columns: color = colors[counter % num_colors] counter += 1 if has_both: label = "%s (%s)" % (name, sex) else: label = name ax.plot(x_values, dataframe[(name, sex)], label=label, color=color, linewidth = 3) ax.set_ylim(0,determine_y_limit(max_y)) ax.set_xlim(startyear, endyear) ax.set_ylabel(y_text, size = 13) ax.set_title(title, size = 18) box = ax.get_position() ax.set_position([box.x0, box.y0 + box.height * legend_size, box.width, box.height * (1 - legend_size)]) legend_cols = min(5, num_series) ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05), fancybox=True, shadow=True, ncol=legend_cols) if form == 'subplots_auto': counter = 0 fig, axes = plt.subplots(num_series, 1, figsize=(12, 3.5*num_series)) print 'Maximum alpha: %d percent' % (determine_y_limit(max_y)) for name, sex in dataframe.columns: if sex=='M': sex_label = 'male' else: sex_label = 'female' label = "Percent of %s births for %s" % (sex_label, name) current_ymax = dataframe[(name, sex)].max() tint = 1.0 * current_ymax / determine_y_limit(max_y) axes[counter].plot(x_values, dataframe[(name, sex)], color='k') axes[counter].set_ylim(0,determine_y_limit(current_ymax)) 
axes[counter].set_xlim(startyear, endyear) axes[counter].fill_between(x_values, dataframe[(name, sex)], color=colors[0], alpha=tint, interpolate=True) axes[counter].set_ylabel(label, size=11) plt.subplots_adjust(hspace=0.1) counter += 1 if form == 'subplots_same': counter = 0 fig, axes = plt.subplots(num_series, 1, figsize=(12, 3.5*num_series)) print 'Maximum y axis: %d percent' % (determine_y_limit(max_y)) for name, sex in dataframe.columns: if sex=='M': sex_label = 'male' else: sex_label = 'female' label = "Percent of %s births for %s" % (sex_label, name) axes[counter].plot(x_values, dataframe[(name, sex)], color='k') axes[counter].set_ylim(0,determine_y_limit(max_y)) axes[counter].set_xlim(startyear, endyear) axes[counter].fill_between(x_values, dataframe[(name, sex)], color=colors[1], alpha=1, interpolate=True) axes[counter].set_ylabel(label, size=11) plt.subplots_adjust(hspace=0.1) counter += 1 if form == 'stream': plt.figure(num=None, figsize=(20,10), dpi=150, facecolor='w', edgecolor='k') plt.title(title, size=17) plt.xlim(startyear, endyear) if has_both: yaxtext = 'Percent of births of indicated sex (scale: ' elif has_male: yaxtext = 'Percent of male births (scale: ' else: yaxtext = 'Percent of female births (scale: ' scale = str(determine_y_limit(max_y)) + ')' yaxtext += scale plt.ylabel(yaxtext, size=13) polys = plt.stackplot(x_values, *[dataframe[(name, sex)] for name, sex in dataframe.columns], colors=colors, baseline=baseline) legendProxies = [] for poly in polys: legendProxies.append(plt.Rectangle((0, 0), 1, 1, fc=poly.get_facecolor()[0])) namelist = [] for name, sex in dataframe.columns: if has_both: namelist.append('%s (%s)' % (name, sex)) else: namelist.append(name) plt.legend(legendProxies, namelist, loc=3, ncol=2) plt.tick_params(\ axis='y', which='both', # major and minor ticks left='off', right='off', labelleft='off') plt.show() if png_name != '': filename = save_path + "/" + png_name + ".png" plt.savefig(filename) plt.close() # line charts 
make_chart(df=listed_top_m, form='line', # line , subplots_auto , subplots_same , stream title=top_boys_title, colors= [], smoothing=0, baseline='zero', # zero , sym , wiggle , weighted_wiggle ) make_chart(df=listed_top_f, form='line', # line , subplots_auto , subplots_same , stream title=top_girls_title, colors= [], smoothing=0, baseline='zero', # zero , sym , wiggle , weighted_wiggle ) names_listed.reset_index(drop = True, inplace = True) names_listed.head() names_listed.to_csv('lists/names_matching_mythological_list.csv') print names_listed.name.unique() cutoffn = 0 # how many names will remain to evaluate after duplicates removed from collections import OrderedDict evallistm = OrderedDict() evallistf = OrderedDict() # remove names with more common duplicates in other sex # this happens frequently in ssa db for name in listed_m: try: pctf = names_listed[(names_listed.sex == 'F') & (names_listed.name == name)].pct_max.iloc[0] pctm = names_listed[(names_listed.sex == 'M') & (names_listed.name == name)].pct_max.iloc[0] except: pctf = 98 pctm = 99 if (name not in names_listed[names_listed.sex == 'F'].name.unique() or pctf < pctm): evallistm[name] = '' for name in listed_f: try: pctf = names_listed[(names_listed.sex == 'F') & (names_listed.name == name)].pct_max.iloc[0] pctm = names_listed[(names_listed.sex == 'M') & (names_listed.name == name)].pct_max.iloc[0] except: pctf = 99 pctm = 98 if (name not in names_listed[names_listed.sex == 'M'].name.unique() or pctm < pctf): evallistf[name] = '' if cutoffn > 0: assert len(evallistm) > cutoffn assert len(evallistf) > cutoffn print evallistm[:cutoffn] print evallistf[:cutoffn] else: print 'Length of lists: %d male, %d female\n' % (len(evallistm), len(evallistf)) print evallistm print ' ' print evallistf #manually copy and paste the above lists and assign #'acc' or 'rej' individually to accept or reject evallistm = OrderedDict([('Sol', 'rej'), ('Seth', 'rej'), ('Griffin', 'rej'), ('Amon', 'rej'), ('Thor', 'acc'), 
('Hercules', 'acc'), ('Ladon', 'rej'), ('Odin', 'acc'), ('Hermes', 'acc'), ('Apollo', 'acc'), ('Osiris', 'acc'), ('Min', 'rej'), ('Clete', 'rej'), ('Zeus', 'acc'), ('Phoenix', 'acc'), ('Amen', 'rej'), ('Mars', 'acc'), ('Ares', 'acc'), ('Loki', 'acc'), ('Nike', 'rej'), ('Ran', 'rej'), ('Mercury', 'acc'), ('Tyr', 'acc'), ('Jupiter', 'acc'), ('Kore', 'rej'), ('Ra', 'acc'), ('Anubis', 'acc'), ('Helios', 'acc'), ('Poseidon', 'acc'), ('Makar', 'rej'), ('Pater', 'rej'), ('Amun', 'rej'), ('Fenris', 'acc'), ('Set', 'rej'), ('Demeter', 'rej'), ('Horus', 'acc'), ('Megale', 'rej'), ('Aten', 'acc'), ('Saturn', 'acc')]) evallistf = OrderedDict([('Athena', 'acc'), ('Delia', 'rej'), ('Minerva', 'acc'), ('Doris', 'rej'), ('Phoebe', 'rej'), ('Chloe', 'rej'), ('Diana', 'rej'), ('Flora', 'rej'), ('Sophia', 'rej'), ('Rhea', 'rej'), ('Venus', 'acc'), ('Vesta', 'acc'), ('Luna', 'rej'), ('Thalia', 'acc'), ('Lucina', 'rej'), ('Gerda', 'rej'), ('Eris', 'acc'), ('Artemis', 'acc'), ('Aphrodite', 'acc'), ('Isis', 'acc'), ('Clio', 'acc'), ('Persephone', 'acc'), ('Melaina', 'rej'), ('Shai', 'rej'), ('Andromeda', 'acc'), ('Lamia', 'acc'), ('Sia', 'rej'), ('Hera', 'acc'), ('Nanna', 'rej'), ('Urania', 'acc'), ('Gaia', 'acc'), ('Khloe', 'rej'), ('Chloris', 'rej'), ('Athene', 'acc'), ('Janus', 'rej'), ('Freyja', 'acc'), ('Valkyrie', 'acc'), ('Ourania', 'acc'), ('Juno', 'acc'), ('Vali', 'acc'), ('Holle', 'rej'), ('Cybele', 'acc'), ('Pelagia', 'rej'), ('Anat', 'rej'), ('Soteria', 'rej'), ('Pallas', 'acc'), ('Fortuna', 'rej'), ('Maat', 'acc'), ('Caliope', 'acc'), ('Chimera', 'acc'), ('Deianeira', 'rej'), ('Agathe', 'rej'), ('Lousia', 'rej'), ('Shu', 'rej'), ('Areion', 'rej'), ('Ceres', 'acc'), ('Areia', 'rej'), ('Saturn', 'rej'), ('Lakinia', 'rej'), ('Tyche', 'acc'), ('Khepri', 'acc'), ('Demeter', 'acc'), ('Nike', 'acc')]) # Note, Demeter and Nike taken from males' list and moved to females'; for some reason it spiked in males higher than in females # Similarly, Saturn moved from females' to males' # 
Test that all names have 'acc' or 'rej' values final_m = [] final_f = [] names_not_validated = [] for item in evallistm: if evallistm[item] not in ['acc', 'rej']: names_not_validated.append(item) elif evallistm[item] == 'acc': final_m.append(item) for item in evallistf: if evallistf[item] not in ['acc', 'rej']: names_not_validated.append(item) elif evallistf[item] == 'acc': final_f.append(item) final_all = final_m + final_f if len(names_not_validated) > 0: print "The following names do not have 'acc' or 'rej' values: ", names_not_validated raise exception("Names not validated") print 'Accepted male names:', final_m print 'Accepted female names:', final_f print 'Length: %d male, %d female\n' % (len(final_m), len(final_f)) cutmin = min(len(final_m), len(final_f)) final_m = final_m[:cutmin] final_f = final_f[:cutmin] print 'After resizing to %d names each:' % (cutmin) print 'Accepted male names:', final_m print 'Accepted female names:', final_f from copy import deepcopy oldm = deepcopy(evallistm) oldf = deepcopy(evallistf) cutoffn = 0 # how many names will remain to evaluate after duplicates removed from collections import OrderedDict evallistm = OrderedDict() evallistf = OrderedDict() # remove names with more common duplicates in other sex # this happens frequently in ssa db for name in listed_m: try: pctf = names_listed[(names_listed.sex == 'F') & (names_listed.name == name)].pct_max.iloc[0] pctm = names_listed[(names_listed.sex == 'M') & (names_listed.name == name)].pct_max.iloc[0] except: pctf = 98 pctm = 99 if (name not in ['Demeter', 'Nike'] and (name not in names_listed[names_listed.sex == 'F'].name.unique() or pctf < pctm or name == 'Saturn')): evallistm[name] = '' for name in listed_f: try: pctf = names_listed[(names_listed.sex == 'F') & (names_listed.name == name)].pct_max.iloc[0] pctm = names_listed[(names_listed.sex == 'M') & (names_listed.name == name)].pct_max.iloc[0] except: pctf = 99 pctm = 98 if (name != 'Saturn' and (name not in 
names_listed[names_listed.sex == 'M'].name.unique() or pctm < pctf or name in ['Demeter', 'Nike'])): evallistf[name] = '' for item in evallistm: # copy from above block try: evallistm[item] = oldm[item] except: pass for item in evallistf: try: evallistf[item] = oldf[item] except: pass if cutoffn > 0: assert len(evallistm) > cutoffn assert len(evallistf) > cutoffn print evallistm[:cutoffn] print evallistf[:cutoffn] else: print 'Length of lists: %d male, %d female\n' % (len(evallistm), len(evallistf)) print evallistm print ' ' print evallistf #manually copy and paste the above lists and assign #'acc' or 'rej' individually to accept or reject # 72, 29 and 80 character rule (PEP) do not reach this ->| #########1#########2#########3#########4#########5#########6#########7#2######9X evallistm = OrderedDict([('Sol', 'rej'), ('Seth', 'rej'), ('Griffin', 'rej'), ('Amon', 'rej'), ('Thor', 'acc'), ('Hercules', 'acc'), ('Ladon', 'rej'), ('Odin', 'acc'), ('Hermes', 'acc'), ('Apollo', 'acc'), ('Osiris', 'acc'), ('Min', 'rej'), ('Clete', 'rej'), ('Zeus', 'acc'), ('Phoenix', 'acc'), ('Amen', 'rej'), ('Mars', 'acc'), ('Ares', 'acc'), ('Loki', 'acc'), ('Ran', 'rej'), ('Mercury', 'acc'), ('Tyr', 'acc'), ('Jupiter', 'acc'), ('Kore', 'rej'), ('Ra', 'acc'), ('Anubis', 'acc'), ('Helios', 'acc'), ('Poseidon', 'acc'), ('Makar', 'rej'), ('Pater', 'rej'), ('Amun', 'rej'), ('Fenris', 'acc'), ('Set', 'rej'), ('Horus', 'acc'), ('Megale', 'rej'), ('Aten', 'acc')]) evallistf = OrderedDict([('Athena', 'acc'), ('Delia', 'rej'), ('Minerva', 'acc'), ('Doris', 'rej'), ('Phoebe', 'rej'), ('Chloe', 'rej'), ('Diana', 'rej'), ('Flora', 'rej'), ('Sophia', 'rej'), ('Rhea', 'rej'), ('Venus', 'acc'), ('Vesta', 'acc'), ('Luna', 'rej'), ('Thalia', 'acc'), ('Lucina', 'rej'), ('Gerda', 'rej'), ('Eris', 'acc'), ('Artemis', 'acc'), ('Aphrodite', 'acc'), ('Isis', 'acc'), ('Clio', 'acc'), ('Persephone', 'acc'), ('Melaina', 'rej'), ('Shai', 'rej'), ('Andromeda', 'acc'), ('Lamia', 'acc'), ('Sia', 'rej'), ('Hera', 
'acc'), ('Nanna', 'rej'), ('Urania', 'acc'), ('Gaia', 'acc'), ('Khloe', 'rej'), ('Chloris', 'rej'), ('Athene', 'acc'), ('Nike', 'acc'), ('Janus', 'rej'), ('Freyja', 'acc'), ('Valkyrie', 'acc'), ('Ourania', 'acc'), ('Juno', 'acc'), ('Vali', 'acc'), ('Holle', 'rej'), ('Cybele', 'acc'), ('Pelagia', 'rej'), ('Anat', 'rej'), ('Soteria', 'rej'), ('Pallas', 'acc'), ('Fortuna', 'rej'), ('Maat', 'acc'), ('Caliope', 'acc'), ('Chimera', 'acc'), ('Deianeira', 'rej'), ('Agathe', 'rej'), ('Lousia', 'rej'), ('Shu', 'rej'), ('Areion', 'rej'), ('Ceres', 'acc'), ('Areia', 'rej'), ('Lakinia', 'rej'), ('Tyche', 'acc'), ('Khepri', 'acc')]) # Note, Demeter and Nike taken from males' list and moved to females'; for some reason it spiked in males higher than in females # Similarly, Saturn moved from females' to males' # Test that all names have 'acc' or 'rej' values final_m = [] final_f = [] names_not_validated = [] for item in evallistm: if evallistm[item] not in ['acc', 'rej']: names_not_validated.append(item) elif evallistm[item] == 'acc': final_m.append(item) for item in evallistf: if evallistf[item] not in ['acc', 'rej']: names_not_validated.append(item) elif evallistf[item] == 'acc': final_f.append(item) if len(names_not_validated) > 0: print "The following names do not have 'acc' or 'rej' values: ", names_not_validated raise exception("Names not validated") print 'Accepted male names:', final_m print 'Accepted female names:', final_f print 'Length: %d male, %d female\n' % (len(final_m), len(final_f)) # manually limit to nice round number nice_round_number = 100 # if too high, there will be no change final_m = final_m[:nice_round_number] final_f = final_f[:nice_round_number] print 'After manually resizing to nice round number of %d names each:' % (nice_round_number) print 'Accepted male names:', final_m print 'Accepted female names:', final_f # BTW, here's those missassigned (it appears) genders: print names[names.name == 'Saturn'] print names[names.name == 'Demeter'] print 
names[names.name == 'Nike'] %run download_and_process.py # reduce names dataframe to those matching list # print '--------------------\nDataframe names filtered to those that match list' # print "%d records to begin." % (len(names)) names_listed = names[((names.name.isin(final_m) & (names.sex == 'M')) | (names.name.isin(final_f) & (names.sex == 'F')) )].copy() names_listed.sort('pct_max', ascending=False, inplace=True) # print "%d records remaining." % (len(names_listed)) # listed_in_df = list(names_listed.name) # print names_listed.head(10) # listed_m = list(names[(names.sex == 'M') & (names.name.isin(final_m))]['name']) # listed_f = list(names[(names.sex == 'F') & (names.name.isin(final_f))]['name']) #reduce yob dataframe to those matching list print '--------------------\nDataframe yob filtered to those that match list (count only)' print "%d records to begin." % (len(yob)) yob_listed_m = yob[(yob.name.isin(final_m)) & (yob.sex == 'M')].copy() yob_listed_m.sort(['year', 'sex', 'name'], ascending=False, inplace=True) yob_listed_f = yob[(yob.name.isin(final_f)) & (yob.sex == 'F')].copy() yob_listed_f.sort(['year', 'sex', 'name'], ascending=False, inplace=True) print "%d records remaining." 
% (len(yob_listed)) # m and f totals yob_listed_f_agg = pd.DataFrame(yob_listed_f.groupby('year').sum())[['births', 'pct']] yob_listed_m_agg = pd.DataFrame(yob_listed_m.groupby('year').sum())[['births', 'pct']] print '--------------------\nHead of total matching list per year, female' print yob_listed_f_agg.head() # print chart of m and f totals print '\n' # function to determine a nice y-axis limit a little above the maximum value # rounds maximum y up to second-most-significant digit def determine_y_limit(x): significance = int(math.floor((math.log10(x)))) val = math.floor(x / (10 ** (significance - 1))) + 1 val = val * (10 ** (significance - 1)) return val #data xf = list(yob_listed_f_agg.index) xm = list(yob_listed_m_agg.index) plt.figure(figsize=(16,9)) plt.plot(xf, list(yob_listed_f_agg.pct), color="red") plt.plot(xm, list(yob_listed_m_agg.pct), color="blue") plt.ylim(0, determine_y_limit(max(list(yob_listed_f_agg.pct) +list(yob_listed_m_agg.pct)))) plt.xlim(1880, 2013) plt.title('Top 10 mythological names, boy=blue, girl=red', fontsize = 20) plt.xlabel("Year", fontsize = 14) plt.ylabel("% of total births of that sex", fontsize = 14) plt.show() # all names_listed, so we can see which ones to aggregate # cutoff of 10 already done print names_listed[names_listed.sex == 'M'].head(50) print '' print names_listed[names_listed.sex == 'F'].head(50) # just take top 10 nice_round_number = 10 # if too high, there will be no change final_m = final_m[:nice_round_number] final_f = final_f[:nice_round_number] print 'After manually resizing to nice round number of %d names each:' % (nice_round_number) print 'Accepted male names:', final_m print 'Accepted female names:', final_f names = final_m[:10] sexes = ['M'] # can be length 1 or same length as names yearstart=1880 yearend=2013 start = time.time() df_chart = yob.copy() if len(sexes) == 1: sexes = sexes * len(names) df_chart = df_chart[df_chart['name'].isin(names)] df_chart['temp'] = 0 for row in range(len(df_chart)): for 
pos in range(len(names)): if df_chart.name.iloc[row] == names[pos] and df_chart.sex.iloc[row] == sexes[pos]: df_chart.temp.iloc[row] = 1 df_chart = df_chart[df_chart.temp == 1] #To keep more than one data set for charts in memory, change name of chart_1 chart_1 = pd.DataFrame(pd.pivot_table(df_chart, values='pct', index = 'year', columns=['name', 'sex'])) col = chart_1.columns[0] for yr in range(yearstart, yearend+1): #inserts missing years if yr not in chart_1.index: #chart_1[col][yr] = 0.0 chart_1 = chart_1.append(pd.DataFrame(index=[yr], columns=[col], data=[0.0])) chart_1 = chart_1.fillna(0) chart_1.sort(inplace=True, ascending=True) #a single function to make the four different kinds of charts def make_chart(df=chart_1, form='line', title='', colors= [], smoothing=0, \ groupedlist = [], baseline='sym', png_name=''): dataframe = df.copy() startyear = min(list(dataframe.index)) endyear = max(list(dataframe.index)) yearstr = '%d-%d' % (startyear, endyear) legend_size = 0.01 has_male = False has_female = False has_both = False max_y = 0 for name, sex in dataframe.columns: max_y = max(max_y, dataframe[(name, sex)].max()) final_name = name if sex == 'M': has_male = True if sex == 'F': has_female = True if smoothing > 0: newvalues = [] for row in range(len(dataframe)): start = max(0, row - smoothing) end = min(len(dataframe) - 1, row + smoothing) newvalues.append(dataframe[(name, sex)].iloc[start:end].mean()) for row in range(len(dataframe)): dataframe[(name, sex)].iloc[row] = newvalues[row] if has_male and has_female: y_text = "% of births of indicated sex" has_both = True elif has_male: y_text = "Percent of male births" else: y_text = "Percent of female births" num_series = len(dataframe.columns) if colors == []: colors = ["#1f78b4","#ae4ec9","#33a02c","#fb9a99","#e31a1c","#a6cee3", "#fdbf6f","#ff7f00","#cab2d6","#6a3d9a","#ffff99","#b15928"] #colors = ['#ff0000', '#b00000', '#870000', '#550000', '#e4e400', '#baba00', '#878700', '#545400', '#00ff00', '#00b000', 
'#008700', '#005500', '#00ffff', '#00b0b0', '#008787', '#005555', '#b0b0ff', '#8484ff', '#4949ff', '#0000ff', '#ff00ff', '#b000b0', '#870087', '#550055', '#e4e4e4', '#bababa', '#878787', '#545454'] from random import shuffle shuffle(colors) num_colors = len(colors) if num_series > num_colors: print "Warning: colors will be repeated." if title == '': if num_series == 1: title = "Popularity of baby name %s in U.S., %s" % (final_name, yearstr) else: title = "Popularity of baby names in U.S., %s" % (yearstr) x_values = range(startyear, endyear + 1) y_zeroes = [0] * (endyear - startyear) if form == 'line': fig, ax = plt.subplots(num=None, figsize=(16, 9), dpi=300, facecolor='w', edgecolor='w') counter = 0 for name, sex in dataframe.columns: color = colors[counter % num_colors] counter += 1 if has_both: label = "%s (%s)" % (name, sex) else: label = name ax.plot(x_values, dataframe[(name, sex)], label=label, color=color, linewidth = 3) ax.set_ylim(0,determine_y_limit(max_y)) ax.set_xlim(1980, endyear) ax.set_ylabel(y_text, size = 13) box = ax.get_position() ax.set_position([box.x0, box.y0 + box.height * legend_size, box.width, box.height * (1 - legend_size)]) legend_cols = min(5, num_series) ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05), fancybox=True, shadow=True, ncol=legend_cols) if form == 'subplots_auto': counter = 0 fig, axes = plt.subplots(num_series, 1, figsize=(12, 3.5*num_series)) print 'Maximum alpha: %d percent' % (determine_y_limit(max_y)) for name, sex in dataframe.columns: if sex=='M': sex_label = 'male' else: sex_label = 'female' label = "Percent of %s births for %s" % (sex_label, name) current_ymax = dataframe[(name, sex)].max() tint = 1.0 * current_ymax / determine_y_limit(max_y) axes[counter].plot(x_values, dataframe[(name, sex)], color='k') axes[counter].set_ylim(0,determine_y_limit(current_ymax)) axes[counter].set_xlim(startyear, endyear) axes[counter].fill_between(x_values, dataframe[(name, sex)], color=colors[0], alpha=tint, 
interpolate=True) axes[counter].set_ylabel(label, size=11) plt.subplots_adjust(hspace=0.1) counter += 1 if form == 'subplots_same': counter = 0 fig, axes = plt.subplots(num_series, 1, figsize=(12, 3.5*num_series)) print 'Maximum y axis: %d percent' % (determine_y_limit(max_y)) for name, sex in dataframe.columns: if sex=='M': sex_label = 'male' else: sex_label = 'female' label = "Percent of %s births for %s" % (sex_label, name) axes[counter].plot(x_values, dataframe[(name, sex)], color='k') axes[counter].set_ylim(0,determine_y_limit(max_y)) axes[counter].set_xlim(startyear, endyear) axes[counter].fill_between(x_values, dataframe[(name, sex)], color=colors[1], alpha=1, interpolate=True) axes[counter].set_ylabel(label, size=11) plt.subplots_adjust(hspace=0.1) counter += 1 if form == 'stream': plt.figure(num=None, figsize=(20,10), dpi=150, facecolor='w', edgecolor='k') plt.title(title, size=17) plt.xlim(startyear, endyear) if has_both: yaxtext = 'Percent of births of indicated sex (scale: ' elif has_male: yaxtext = 'Percent of male births (scale: ' else: yaxtext = 'Percent of female births (scale: ' scale = str(determine_y_limit(max_y)) + ')' yaxtext += scale plt.ylabel(yaxtext, size=13) polys = plt.stackplot(x_values, *[dataframe[(name, sex)] for name, sex in dataframe.columns], colors=colors, baseline=baseline) legendProxies = [] for poly in polys: legendProxies.append(plt.Rectangle((0, 0), 1, 1, fc=poly.get_facecolor()[0])) namelist = [] for name, sex in dataframe.columns: if has_both: namelist.append('%s (%s)' % (name, sex)) else: namelist.append(name) plt.legend(legendProxies, namelist, loc=3, ncol=2) plt.tick_params(\ axis='y', which='both', # major and minor ticks left='off', right='off', labelleft='off') plt.show() if png_name != '': filename = save_path + "/" + png_name + ".png" plt.savefig(filename) plt.close() #line graph make_chart(df=chart_1, form='stream', # line , subplots_auto , subplots_same , stream title='', colors= [], smoothing=0, baseline='sym', # 
zero , sym , wiggle , weighted_wiggle png_name = '', # if '', will not be saved ) names = final_f[:12] sexes = ['F'] # can be length 1 or same length as names yearstart=1880 yearend=2013 start = time.time() df_chart = yob.copy() if len(sexes) == 1: sexes = sexes * len(names) df_chart = df_chart[df_chart['name'].isin(names)] df_chart['temp'] = 0 for row in range(len(df_chart)): for pos in range(len(names)): if df_chart.name.iloc[row] == names[pos] and df_chart.sex.iloc[row] == sexes[pos]: df_chart.temp.iloc[row] = 1 df_chart = df_chart[df_chart.temp == 1] #To keep more than one data set for charts in memory, change name of chart_1 chart_1 = pd.DataFrame(pd.pivot_table(df_chart, values='pct', index = 'year', columns=['name', 'sex'])) col = chart_1.columns[0] for yr in range(yearstart, yearend+1): #inserts missing years if yr not in chart_1.index: #chart_1[col][yr] = 0.0 chart_1 = chart_1.append(pd.DataFrame(index=[yr], columns=[col], data=[0.0])) chart_1 = chart_1.fillna(0) chart_1.sort(inplace=True, ascending=True) #a single function to make the four different kinds of charts def make_chart(df=chart_1, form='line', title='', colors= [], smoothing=0, \ groupedlist = [], baseline='sym', png_name=''): dataframe = df.copy() startyear = min(list(dataframe.index)) endyear = max(list(dataframe.index)) yearstr = '%d-%d' % (startyear, endyear) legend_size = 0.01 has_male = False has_female = False has_both = False max_y = 0 for name, sex in dataframe.columns: max_y = max(max_y, dataframe[(name, sex)].max()) final_name = name if sex == 'M': has_male = True if sex == 'F': has_female = True if smoothing > 0: newvalues = [] for row in range(len(dataframe)): start = max(0, row - smoothing) end = min(len(dataframe) - 1, row + smoothing) newvalues.append(dataframe[(name, sex)].iloc[start:end].mean()) for row in range(len(dataframe)): dataframe[(name, sex)].iloc[row] = newvalues[row] if has_male and has_female: y_text = "% of births of indicated sex" has_both = True elif has_male: 
y_text = "Percent of male births" else: y_text = "Percent of female births" num_series = len(dataframe.columns) if colors == []: colors = ["#1f78b4","#ae4ec9","#33a02c","#fb9a99","#e31a1c","#a6cee3", "#fdbf6f","#ff7f00","#cab2d6","#6a3d9a","#ffff99","#b15928"] #colors = ['#ff0000', '#b00000', '#870000', '#550000', '#e4e400', '#baba00', '#878700', '#545400', '#00ff00', '#00b000', '#008700', '#005500', '#00ffff', '#00b0b0', '#008787', '#005555', '#b0b0ff', '#8484ff', '#4949ff', '#0000ff', '#ff00ff', '#b000b0', '#870087', '#550055', '#e4e4e4', '#bababa', '#878787', '#545454'] from random import shuffle shuffle(colors) num_colors = len(colors) if num_series > num_colors: print "Warning: colors will be repeated." if title == '': if num_series == 1: title = "Popularity of baby name %s in U.S., %s" % (final_name, yearstr) else: title = "Popularity of baby names in U.S., %s" % (yearstr) x_values = range(startyear, endyear + 1) y_zeroes = [0] * (endyear - startyear) if form == 'line': fig, ax = plt.subplots(num=None, figsize=(16, 9), dpi=300, facecolor='w', edgecolor='w') counter = 0 for name, sex in dataframe.columns: color = colors[counter % num_colors] counter += 1 if has_both: label = "%s (%s)" % (name, sex) else: label = name ax.plot(x_values, dataframe[(name, sex)], label=label, color=color, linewidth = 3) ax.set_ylim(0,determine_y_limit(max_y)) ax.set_xlim(1980, endyear) ax.set_ylabel(y_text, size = 13) box = ax.get_position() ax.set_position([box.x0, box.y0 + box.height * legend_size, box.width, box.height * (1 - legend_size)]) legend_cols = min(5, num_series) ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05), fancybox=True, shadow=True, ncol=legend_cols) if form == 'subplots_auto': counter = 0 fig, axes = plt.subplots(num_series, 1, figsize=(12, 3.5*num_series)) print 'Maximum alpha: %d percent' % (determine_y_limit(max_y)) for name, sex in dataframe.columns: if sex=='M': sex_label = 'male' else: sex_label = 'female' label = "Percent of %s births for %s" % 
(sex_label, name) current_ymax = dataframe[(name, sex)].max() tint = 1.0 * current_ymax / determine_y_limit(max_y) axes[counter].plot(x_values, dataframe[(name, sex)], color='k') axes[counter].set_ylim(0,determine_y_limit(current_ymax)) axes[counter].set_xlim(startyear, endyear) axes[counter].fill_between(x_values, dataframe[(name, sex)], color=colors[0], alpha=tint, interpolate=True) axes[counter].set_ylabel(label, size=11) plt.subplots_adjust(hspace=0.1) counter += 1 if form == 'subplots_same': counter = 0 fig, axes = plt.subplots(num_series, 1, figsize=(12, 3.5*num_series)) print 'Maximum y axis: %d percent' % (determine_y_limit(max_y)) for name, sex in dataframe.columns: if sex=='M': sex_label = 'male' else: sex_label = 'female' label = "Percent of %s births for %s" % (sex_label, name) axes[counter].plot(x_values, dataframe[(name, sex)], color='k') axes[counter].set_ylim(0,determine_y_limit(max_y)) axes[counter].set_xlim(startyear, endyear) axes[counter].fill_between(x_values, dataframe[(name, sex)], color=colors[1], alpha=1, interpolate=True) axes[counter].set_ylabel(label, size=11) plt.subplots_adjust(hspace=0.1) counter += 1 if form == 'stream': plt.figure(num=None, figsize=(20,10), dpi=150, facecolor='w', edgecolor='k') plt.title(title, size=17) plt.xlim(startyear, endyear) if has_both: yaxtext = 'Percent of births of indicated sex (scale: ' elif has_male: yaxtext = 'Percent of male births (scale: ' else: yaxtext = 'Percent of female births (scale: ' scale = str(determine_y_limit(max_y)) + ')' yaxtext += scale plt.ylabel(yaxtext, size=13) polys = plt.stackplot(x_values, *[dataframe[(name, sex)] for name, sex in dataframe.columns], colors=colors, baseline=baseline) legendProxies = [] for poly in polys: legendProxies.append(plt.Rectangle((0, 0), 1, 1, fc=poly.get_facecolor()[0])) namelist = [] for name, sex in dataframe.columns: if has_both: namelist.append('%s (%s)' % (name, sex)) else: namelist.append(name) plt.legend(legendProxies, namelist, loc=3, ncol=2) 
# NOTE(review): the statements down to plt.close() are the tail end of the
# previous make_chart definition, whose beginning lies above this span.
# Their tokens are unchanged; only line breaks were restored.
plt.tick_params(\
    axis='y',
    which='both',      # major and minor ticks
    left='off',
    right='off',
    labelleft='off')
plt.show()
if png_name != '':
    filename = save_path + "/" + png_name + ".png"
    plt.savefig(filename)
plt.close()

#stream graph (this call still uses the make_chart defined above it)
# NOTE(review): "2914" in the title is presumably a typo for an earlier
# start year (1914?) -- confirm against the data charted here and correct it.
make_chart(df=chart_1, form='stream',  # line , subplots_auto , subplots_same , stream
           title="10 most popular mythological girls' names, 2914-2013",
           colors= [],
           smoothing=0,
           baseline='sym',  # zero , sym , wiggle , weighted_wiggle
           png_name = '',  # if '', will not be saved
           )


#function to build the table that make_chart draws
def _chart_table(yobdf, names, sexes, yearstart, yearend):
    """
    Select the rows of yobdf matching the requested (name, sex) pairs and
    pivot them into one row per year and one column per (name, sex).

    yobdf = dataframe with 'name', 'sex', 'year' and 'pct' columns
    names = list of names
    sexes = list of 'M' / 'F', same length as names (names[i] goes with sexes[i])
    yearstart, yearend = every year in this inclusive range gets a row;
        years with no data are filled with 0

    Returns (rows, table): the filtered copy of yobdf and the pivoted table,
    sorted by year. Raises ValueError if names and sexes differ in length.
    """
    if len(names) != len(sexes):
        raise ValueError("names and sexes must have the same length")

    # Narrow to the requested names first so the pair test below only
    # touches a small number of rows.
    candidates = yobdf[yobdf['name'].isin(names)]
    wanted = set(zip(names, sexes))
    keep = np.array([pair in wanted
                     for pair in zip(candidates['name'], candidates['sex'])],
                    dtype=bool)
    rows = candidates[keep].copy()

    table = pd.pivot_table(rows, values='pct', index='year',
                           columns=['name', 'sex'])
    # Union rather than plain range, so years outside yearstart..yearend
    # that do have data are kept.
    all_years = sorted(set(table.index) | set(range(yearstart, yearend + 1)))
    table = table.reindex(all_years).fillna(0)
    return rows, table


#a single function to make the four different kinds of charts
def make_chart(df=None, form='line', title='', colors=None, smoothing=0,
               groupedlist=None, baseline='sym', png_name='',
               stream_figsize=(20, 10)):
    """
    Draw one chart of name popularity by year, show it, and optionally
    save it as a PNG.

    df = table with years as index and one column per (name, sex) pair;
        None means the module-level chart_1 as it is when the call is made
    form = 'line', 'subplots_auto' (each panel has its own y scale, fill
        opacity shows relative size), 'subplots_same' (shared y scale)
        or 'stream' (stacked areas)
    title = chart title; '' builds one from the data. Only the 'stream'
        form draws the title.
    colors = list of matplotlib colors; None or [] uses the built-in
        palette in random order. A list passed in is used in the given
        order and is never modified.
    smoothing = half-width, in rows, of a centered moving average applied
        to every column; 0 turns smoothing off
    groupedlist = not used; kept so existing calls keep working
    baseline = passed to plt.stackplot for the 'stream' form
    png_name = file name without extension, saved under save_path;
        '' means the chart is not saved
    stream_figsize = figure size in inches for the 'stream' form

    Reads the module-level names xmin (left edge of the x-axis),
    save_path and determine_y_limit.
    Raises ValueError for an unknown form or a table without columns.
    """
    known_forms = ('line', 'subplots_auto', 'subplots_same', 'stream')
    if form not in known_forms:
        raise ValueError("form must be one of %s, got %r"
                         % (', '.join(known_forms), form))
    if df is None:
        df = chart_1
    dataframe = df.copy()
    num_series = len(dataframe.columns)
    if num_series == 0:
        raise ValueError("make_chart needs at least one (name, sex) column")

    # Plot against the index itself so x and y always have the same length.
    x_values = list(dataframe.index)
    startyear = min(x_values)
    endyear = max(x_values)
    yearstr = '%d-%d' % (startyear, endyear)
    legend_size = 0.01

    has_male = False
    has_female = False
    max_y = 0
    for name, sex in dataframe.columns:
        series = dataframe[(name, sex)]
        max_y = max(max_y, series.max())   # taken before smoothing
        final_name = name
        if sex == 'M':
            has_male = True
        if sex == 'F':
            has_female = True
        if smoothing > 0:
            # Centered window row-smoothing .. row+smoothing inclusive;
            # slicing clips it at both ends of the column.
            dataframe[(name, sex)] = [
                series.iloc[max(0, row - smoothing):row + smoothing + 1].mean()
                for row in range(len(series))]
    has_both = has_male and has_female

    if has_both:
        y_text = "% of births of indicated sex"
    elif has_male:
        y_text = "Percent of male births"
    else:
        y_text = "Percent of female births"

    if not colors:
        from random import shuffle
        colors = ["#1f78b4","#ae4ec9","#33a02c","#fb9a99","#e31a1c","#a6cee3",
                  "#fdbf6f","#ff7f00","#cab2d6","#6a3d9a","#ffff99","#b15928"]
        shuffle(colors)
    else:
        colors = list(colors)   # work on a copy, leave the caller's list alone
    num_colors = len(colors)
    if num_series > num_colors:
        print("Warning: colors will be repeated.")

    if title == '':
        if num_series == 1:
            title = "Popularity of baby name %s in U.S., %s" % (final_name, yearstr)
        else:
            title = "Popularity of baby names in U.S., %s" % (yearstr)

    y_limit = determine_y_limit(max_y)

    if form == 'line':
        fig, ax = plt.subplots(num=None, figsize=(16, 9), dpi=300,
                               facecolor='w', edgecolor='w')
        for position, (name, sex) in enumerate(dataframe.columns):
            if has_both:
                label = "%s (%s)" % (name, sex)
            else:
                label = name
            ax.plot(x_values, dataframe[(name, sex)], label=label,
                    color=colors[position % num_colors], linewidth = 3)
        ax.set_ylim(0, y_limit)
        ax.set_xlim(xmin, endyear)
        ax.set_ylabel(y_text, size = 13)
        # Shrink the axes slightly to leave room for the legend below them.
        box = ax.get_position()
        ax.set_position([box.x0, box.y0 + box.height * legend_size,
                         box.width, box.height * (1 - legend_size)])
        legend_cols = min(5, num_series)
        ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05),
                  fancybox=True, shadow=True, ncol=legend_cols)

    elif form in ('subplots_auto', 'subplots_same'):
        own_scale = (form == 'subplots_auto')
        # squeeze=False keeps axes a 2-D array even for a single panel.
        fig, axes = plt.subplots(num_series, 1,
                                 figsize=(12, 3.5*num_series), squeeze=False)
        # %g rather than %d: the limit is usually a fraction of a percent.
        if own_scale:
            print('Maximum alpha: %g percent' % (y_limit))
        else:
            print('Maximum y axis: %g percent' % (y_limit))
        for panel, (name, sex) in zip(axes[:, 0], dataframe.columns):
            series = dataframe[(name, sex)]
            if sex == 'M':
                sex_label = 'male'
            else:
                sex_label = 'female'
            label = "Percent of %s births for %s" % (sex_label, name)
            panel.plot(x_values, series, color='k')
            if own_scale:
                current_ymax = series.max()
                panel.set_ylim(0, determine_y_limit(current_ymax))
                fill_color = colors[0]
                fill_alpha = 1.0 * current_ymax / y_limit
            else:
                panel.set_ylim(0, y_limit)
                fill_color = colors[1 % num_colors]
                fill_alpha = 1
            panel.set_xlim(xmin, endyear)
            panel.fill_between(x_values, series, color=fill_color,
                               alpha=fill_alpha, interpolate=True)
            panel.set_ylabel(label, size=11)
        plt.subplots_adjust(hspace=0.1)

    else:   # form == 'stream'
        plt.figure(num=None, figsize=stream_figsize, dpi=150,
                   facecolor='w', edgecolor='k')
        plt.title(title, size=17)
        plt.xlim(xmin, endyear)
        if has_both:
            yaxtext = 'Percent of births of indicated sex (scale: '
        elif has_male:
            yaxtext = 'Percent of male births (scale: '
        else:
            yaxtext = 'Percent of female births (scale: '
        yaxtext += str(y_limit) + ')'
        plt.ylabel(yaxtext, size=13)
        polys = plt.stackplot(x_values,
                              *[dataframe[column] for column in dataframe.columns],
                              colors=colors, baseline=baseline)
        # One colored rectangle per stacked area, used as legend handles.
        legendProxies = [plt.Rectangle((0, 0), 1, 1, fc=poly.get_facecolor()[0])
                         for poly in polys]
        if has_both:
            namelist = ['%s (%s)' % (name, sex) for name, sex in dataframe.columns]
        else:
            namelist = [name for name, sex in dataframe.columns]
        plt.legend(legendProxies, namelist, loc=3, ncol=2)
        # The y-axis of a stream graph carries no readable values; hide it.
        plt.tick_params(
            axis='y',
            which='both',      # major and minor ticks
            left=False,
            right=False,
            labelleft=False)

    # Save before showing: after plt.show() the figure may already be gone,
    # which would write an empty image.
    if png_name != '':
        plt.savefig(os.path.join(save_path, png_name + ".png"))
    plt.show()
    plt.close()


names = final_f[:10]
sexes = ['F']      # can be length 1 or same length as names
yearstart = 1880   # for data, not graph
yearend = 2013
xmin = 1940        # left edge of the graph; make_chart reads this

start = time.time()

if len(sexes) == 1:
    sexes = sexes * len(names)

#To keep more than one data set for charts in memory, change name of chart_1
df_chart, chart_1 = _chart_table(yob, names, sexes, yearstart, yearend)

#stream graph
make_chart(df=chart_1, form='stream',  # line , subplots_auto , subplots_same , stream
           title='',
           colors= [],
           smoothing=0,
           baseline='sym',  # zero , sym , wiggle , weighted_wiggle
           png_name = '',  # if '', will not be saved
           stream_figsize=(20, 16.67),
           )


names = final_m[:10]
sexes = ['M']      # can be length 1 or same length as names
yearstart = 1880   # for data, not graph
yearend = 2013
xmin = 1940        # left edge of the graph; make_chart reads this

start = time.time()

if len(sexes) == 1:
    sexes = sexes * len(names)

#To keep more than one data set for charts in memory, change name of chart_1
df_chart, chart_1 = _chart_table(yob, names, sexes, yearstart, yearend)

#stream graph
make_chart(df=chart_1, form='stream',  # line , subplots_auto , subplots_same , stream
           title='',
           colors= [],
           smoothing=0,
           baseline='sym',  # zero , sym , wiggle , weighted_wiggle
           png_name = '',  # if '', will not be saved
           )