# --- Notebook settings --------------------------------------------------------
# File holding the list of names to look up. Its text is passed to eval() further
# down, so it must contain a Python expression (the sample run shows a list of strings).
listed_path = "lists/pokemon.list"
# Title of the chart of yearly male/female totals.
totals_title = ""
# How many names are taken from the head of each per-sex list for the "top" charts.
top_cutoff = 10
# Titles of the top-boys / top-girls charts; make_chart generates a default when empty.
top_boys_title = ""
top_girls_title = ""
last_year = 2013 #change this when Social Security database is updated
save_path = "user_charts" # files created by this notebook will be saved in this directory
import time
import os
# Create the chart output directory on first use.
if not os.path.isdir(save_path):  # creates path if it does not exist
    os.makedirs(save_path)
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn # comment out if you don't have it, but it makes good-looking charts
print 'This is standard output from download_and_process.py'
%run download_and_process.py
print '--------------------\nFirst 80 characters of list:'
listed_file = open(listed_path, "r").read()
print listed_file[:80] + ' ...'
all_listed = eval(listed_file) # make sure you trust this file!
all_listed_set = set(all_listed) # to remove duplicates
all_listed = list(all_listed)
print "all_listed: list of length", len(all_listed)
# reduce names dataframe to those matching list
print '--------------------\nDataframe names filtered to those that match list'
print "%d records to begin." % (len(names))
names_listed = names[names.name.isin(all_listed)].copy()
names_listed.sort('pct_max', ascending=False, inplace=True)
print "%d records remaining." % (len(names_listed))
listed_in_df = list(names_listed.name)
print names_listed.head(10)
listed_m = list(names[(names.sex == 'M') & (names.name.isin(listed_in_df))]['name'])
listed_f = list(names[(names.sex == 'F') & (names.name.isin(listed_in_df))]['name'])
#reduce yob dataframe to those matching list
print '--------------------\nDataframe yob filtered to those that match list (count only)'
print "%d records to begin." % (len(yob))
yob_listed = yob[yob.name.isin(listed_in_df)].copy()
yob_listed.sort(['year', 'sex', 'name'], ascending=False, inplace=True)
print "%d records remaining." % (len(yob_listed))
# m and f totals
yob_listed_f_agg = pd.DataFrame(yob_listed[yob_listed.sex == 'F'].groupby('year').sum())[['births', 'pct']]
yob_listed_m_agg = pd.DataFrame(yob_listed[yob_listed.sex == 'M'].groupby('year').sum())[['births', 'pct']]
print '--------------------\nHead of total matching list per year, female'
print yob_listed_f_agg.head()
# print chart of m and f totals
print '\n'
# function to determine a nice y-axis limit a little above the maximum value
# rounds maximum y up to second-most-significant digit
def determine_y_limit(x):
    """Return a "nice" axis limit just above ``x``.

    ``x`` is truncated to its two most significant digits and then bumped up
    by one unit of the second digit, e.g. 123 -> 130 and 1000 -> 1100.

    x -- the largest value that will be plotted.

    For non-positive (or NaN) input, where log10 is undefined, a fallback
    limit of 1 is returned instead of raising ValueError, so an all-zero
    series still gets a usable axis.
    """
    if not x > 0:
        return 1
    significance = int(math.floor(math.log10(x)))
    step = 10 ** (significance - 1)  # one unit of the second significant digit
    return (math.floor(x / step) + 1) * step
# Chart, per year, the share of births of each sex given any name on the list.
#data
xf = list(yob_listed_f_agg.index)
xm = list(yob_listed_m_agg.index)
plt.figure(figsize=(16,9))
plt.plot(xf, list(yob_listed_f_agg.pct), color="red")    # female total
plt.plot(xm, list(yob_listed_m_agg.pct), color="blue")   # male total
plt.ylim(0, determine_y_limit(max(list(yob_listed_f_agg.pct)
                                  + list(yob_listed_m_agg.pct))))
# Use the configured last data year instead of a hard-coded 2013, so the axis
# follows the data when the database (and last_year) is updated.
plt.xlim(1880, last_year)
plt.title(totals_title, fontsize = 20)
plt.xlabel("Year", fontsize = 14)
plt.ylabel("% of total births of that sex", fontsize = 14)
plt.show()
#function to make dataframe for top names
def top_df(yobdf, names, sexes):
    """Build a year-indexed table of ``pct`` for the requested (name, sex) pairs.

    yobdf = dataframe derived from yob; normally it would just be yob itself.
            The 'name', 'sex', 'year' and 'pct' columns are read.
    names = list of names
    sexes = list of length 1 for all the same sex, or same length as names. 'F' and 'M' allowed

    Returns a dataframe indexed by year, in ascending order, with one
    (name, sex) column per matching pair. Every year from 1880 through the
    module-level ``last_year`` is present and missing values are 0.
    Prints the tail of the filtered rows as a diagnostic.
    """
    assert len(sexes) == 1 or len(names) == len(sexes)
    if len(sexes) == 1:
        sexes = sexes * len(names)
    wanted_pairs = set(zip(names, sexes))

    # Filter on name first, then take an independent copy so the flag column
    # below is written to our own data rather than to a slice of the input.
    df_chart = yobdf[yobdf['name'].isin(names)].copy()
    # Flag the rows whose (name, sex) pair was requested. This replaces a
    # row-by-row loop that assigned through chained indexing
    # (df.temp.iloc[row] = 1), which pandas does not guarantee to write back.
    df_chart['temp'] = [int(pair in wanted_pairs)
                        for pair in zip(df_chart['name'], df_chart['sex'])]
    df_chart = df_chart[df_chart['temp'] == 1]
    print("Tail of dataframe:")
    print(df_chart.tail())

    output_df = pd.pivot_table(df_chart, values='pct', index='year',
                               columns=['name', 'sex'])
    # Add the years that have no data and keep the index sorted. Previously the
    # missing years were appended at the end, leaving the rows out of year
    # order although make_chart plots them positionally against the years.
    all_years = sorted(set(output_df.index) | set(range(1880, last_year + 1)))
    output_df = output_df.reindex(pd.Index(all_years, name=output_df.index.name))
    return output_df.fillna(0)
# Year-by-name tables for the first `top_cutoff` matched names of each sex
# (each call also prints the tail of the rows it selected).
listed_top_m = top_df(yob, listed_m[:top_cutoff], ['M'])
listed_top_f = top_df(yob, listed_f[:top_cutoff], ['F'])
#a single function to make the four different kinds of charts
def make_chart(df, form='line', title='', colors= [], smoothing=0,
               groupedlist = [], baseline='sym', png_name=''):
    """Draw one of four chart types from a year-indexed popularity table.

    df          -- dataframe indexed by year whose columns are (name, sex)
                   pairs, such as the one returned by top_df.
    form        -- 'line', 'subplots_auto', 'subplots_same' or 'stream'.
    title       -- chart title for the 'line' and 'stream' forms; a default
                   is generated when empty.
    colors      -- list of matplotlib colours; an empty list selects the
                   built-in palette. The argument itself is never modified.
    smoothing   -- half-width, in rows, of a centred moving average applied
                   to every series; 0 disables smoothing.
    groupedlist -- not used; kept so existing calls keep working.
    baseline    -- stackplot baseline, only used by the 'stream' form.
    png_name    -- when non-empty the chart is also written to
                   <save_path>/<png_name>.png.

    Relies on the module-level determine_y_limit, save_path and plt.
    """
    # Sort by year and take the x values from the index itself, so every row
    # is plotted against its own year even if the incoming table had its
    # missing years appended out of order.
    dataframe = df.sort_index()
    x_values = list(dataframe.index)
    startyear = min(x_values)
    endyear = max(x_values)
    yearstr = '%d-%d' % (startyear, endyear)
    legend_size = 0.01
    has_male = False
    has_female = False
    max_y = 0
    final_name = ''
    for name, sex in dataframe.columns:
        series = dataframe[(name, sex)]
        # the axis limit is based on the raw (unsmoothed) maximum
        max_y = max(max_y, series.max())
        final_name = name
        if sex == 'M': has_male = True
        if sex == 'F': has_female = True
        if smoothing > 0:
            n_rows = len(series)
            # Centred window [row - smoothing, row + smoothing]. The slice end
            # is exclusive, hence the +1; the previous bound left out the
            # right edge (and, for smoothing=1, the row itself). Assigning the
            # whole column avoids chained .iloc assignment.
            dataframe[(name, sex)] = [
                series.iloc[max(0, row - smoothing):
                            min(n_rows, row + smoothing + 1)].mean()
                for row in range(n_rows)]
    has_both = has_male and has_female
    if has_both:
        y_text = "% of births of indicated sex"
    elif has_male:
        y_text = "Percent of male births"
    else:
        y_text = "Percent of female births"
    num_series = len(dataframe.columns)
    if colors:
        palette = list(colors)  # work on a copy; never touch the caller's list
    else:
        # my own list of dark contrasting colors
        palette = ['#BB2114', '#0C5966', '#BA7814', '#4459AB', '#6B3838',
                   '#B8327B', '#2B947F', '#0D83B5', '#684287', '#8C962C',
                   '#92289E', '#242D7D']
    num_colors = len(palette)
    if num_series > num_colors:
        print("Warning: colors will be repeated.")
    if title == '':
        if num_series == 1:
            title = "Popularity of baby name %s in U.S., %s" % (final_name, yearstr)
        else:
            title = "Popularity of baby names in U.S., %s" % (yearstr)

    if form == 'line':
        fig, ax = plt.subplots(num=None, figsize=(16, 9), dpi=300, facecolor='w', edgecolor='w')
        for position, (name, sex) in enumerate(dataframe.columns):
            if has_both:
                label = "%s (%s)" % (name, sex)
            else:
                label = name
            ax.plot(x_values, dataframe[(name, sex)], label=label,
                    color=palette[position % num_colors], linewidth = 3)
        ax.set_ylim(0,determine_y_limit(max_y))
        ax.set_xlim(startyear, endyear)
        ax.set_ylabel(y_text, size = 13)
        ax.set_title(title, size = 18)
        # shrink the axes slightly to leave room for the legend underneath
        box = ax.get_position()
        ax.set_position([box.x0, box.y0 + box.height * legend_size,
                         box.width, box.height * (1 - legend_size)])
        legend_cols = min(5, num_series)
        ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05), fancybox=True, shadow=True, ncol=legend_cols)

    if form in ('subplots_auto', 'subplots_same'):
        own_scale = (form == 'subplots_auto')
        # squeeze=False always returns a 2-D array of axes, so a table with a
        # single series no longer fails on axes[0].
        fig, axes = plt.subplots(num_series, 1, figsize=(12, 3.5*num_series), squeeze=False)
        overall_limit = determine_y_limit(max_y)
        if own_scale:
            print('Maximum alpha: %d percent' % (overall_limit))
        else:
            print('Maximum y axis: %d percent' % (overall_limit))
        for axis, (name, sex) in zip(axes[:, 0], dataframe.columns):
            series = dataframe[(name, sex)]
            if sex=='M':
                sex_label = 'male'
            else:
                sex_label = 'female'
            label = "Percent of %s births for %s" % (sex_label, name)
            if own_scale:
                # each panel has its own y scale; the fill opacity shows how
                # the series compares with the largest one
                current_ymax = series.max()
                axis_limit = determine_y_limit(current_ymax)
                fill_color = palette[0]
                fill_alpha = 1.0 * current_ymax / overall_limit
            else:
                axis_limit = overall_limit
                fill_color = palette[1]
                fill_alpha = 1
            axis.plot(x_values, series, color='k')
            axis.set_ylim(0, axis_limit)
            axis.set_xlim(startyear, endyear)
            axis.fill_between(x_values, series, color=fill_color, alpha=fill_alpha, interpolate=True)
            axis.set_ylabel(label, size=11)
        plt.subplots_adjust(hspace=0.1)

    if form == 'stream':
        plt.figure(num=None, figsize=(20,10), dpi=150, facecolor='w', edgecolor='k')
        plt.title(title, size=17)
        plt.xlim(startyear, endyear)
        if has_both:
            yaxtext = 'Percent of births of indicated sex (scale: '
        elif has_male:
            yaxtext = 'Percent of male births (scale: '
        else:
            yaxtext = 'Percent of female births (scale: '
        yaxtext += str(determine_y_limit(max_y)) + ')'
        plt.ylabel(yaxtext, size=13)
        polys = plt.stackplot(x_values, *[dataframe[column] for column in dataframe.columns],
                              colors=palette, baseline=baseline)
        # stackplot areas cannot be passed to legend() directly here, so
        # build one coloured rectangle per area as a legend handle
        legendProxies = [plt.Rectangle((0, 0), 1, 1, fc=poly.get_facecolor()[0])
                         for poly in polys]
        namelist = []
        for name, sex in dataframe.columns:
            if has_both:
                namelist.append('%s (%s)' % (name, sex))
            else:
                namelist.append(name)
        plt.legend(legendProxies, namelist, loc=3, ncol=2)
        # the y position in a stream graph carries no meaning: hide its ticks
        plt.tick_params(axis='y',
                        which='both',      # major and minor ticks
                        left=False,
                        right=False,
                        labelleft=False)

    # Save BEFORE show(): once show() returns the figure may already have been
    # released, in which case savefig() would write an empty image.
    if png_name != '':
        filename = save_path + "/" + png_name + ".png"
        plt.savefig(filename)
    plt.show()
    plt.close()
# line charts
# Top male names, one line per name. `baseline` is only read by form='stream',
# so it has no effect on these two calls.
make_chart(df=listed_top_m,
form='line', # line , subplots_auto , subplots_same , stream
title=top_boys_title,
colors= [],
smoothing=0,
baseline='zero', # zero , sym , wiggle , weighted_wiggle
)
# Top female names.
make_chart(df=listed_top_f,
form='line', # line , subplots_auto , subplots_same , stream
title=top_girls_title,
colors= [],
smoothing=0,
baseline='zero', # zero , sym , wiggle , weighted_wiggle
)
# Renumber the matched rows 0..n-1 (the old index is dropped) before exporting.
names_listed.reset_index(drop = True, inplace = True)
names_listed.head()
# NOTE(review): the output file name says "mythological" although listed_path
# points at the pokemon list -- looks like a leftover from another notebook;
# confirm it does not overwrite that notebook's export.
names_listed.to_csv('lists/names_matching_mythological_list.csv')
This is standard output from download_and_process.py Data already downloaded. Data already extracted. Reading from pickle. Tail of dataframe 'yob': name sex births year pct ranked 1792086 Zyhier M 5 2013 0.000267 12995 1792087 Zylar M 5 2013 0.000267 12995 1792088 Zymari M 5 2013 0.000267 12995 1792089 Zymeer M 5 2013 0.000267 12995 1792090 Zyree M 5 2013 0.000267 12995 Tail of dataframe 'names': name sex year_count year_min year_max pct_sum pct_max 102685 Gross M 1 1925 1925 0.000538 0.000538 102686 Elik M 1 2012 2012 0.000318 0.000318 102687 Patrickjoseph M 1 1998 1998 0.000262 0.000262 102688 Southern M 1 1923 1923 0.000547 0.000547 102689 Jeon M 1 1999 1999 0.000261 0.000261 Tail of dataframe 'years': year births_f births_m births_t new_names unique_names_x sexratio \ 68 2008 1886765 2035811 3922576 2046 32483 107.899553 69 2009 1832276 1978582 3810858 1789 32210 107.984932 70 2010 1771846 1912915 3684761 1635 31593 107.961696 71 2011 1752198 1891800 3643998 1539 31412 107.967250 72 2012 1751866 1886972 3638838 1531 31212 107.712120 unique_names_y_x unique_names_x unique_names_y_x unique_names_x \ 68 32483 32483 32483 32483 69 32210 32210 32210 32210 70 31593 31593 31593 31593 71 31412 31412 31412 31412 72 31212 31212 31212 31212 unique_names_y_x unique_names_x unique_names_y 68 32483 32483 32483 69 32210 32210 32210 70 31593 31593 31593 71 31412 31412 31412 72 31212 31212 31212 -------------------- First 80 characters of list: ["Bulbasaur", "Fushigidane", "Ivysaur", "Fushigisou", "Venusaur", "Fushigibana", ... all_listed: list of length 1318 -------------------- Dataframe names filtered to those that match list 102690 records to begin. 24 records remaining. 
name sex year_count year_min year_max pct_sum pct_max 64911 Casey M 120 1888 2013 5.999303 0.210989 3021 Casey F 71 1921 2013 4.294258 0.206064 863 Unknown F 118 1886 2013 0.563948 0.033599 64703 Unknown M 131 1880 2013 0.693920 0.026963 64269 Aron M 134 1880 2013 1.330348 0.020925 4806 Tangela F 54 1956 2011 0.226490 0.012276 5139 Abra F 52 1955 2012 0.050814 0.003892 65739 Lucky M 98 1912 2013 0.171647 0.003567 68408 Durant M 48 1914 2013 0.024203 0.001833 2555 Roselia F 78 1908 2013 0.046634 0.001795 -------------------- Dataframe yob filtered to those that match list (count only) 1792091 records to begin. 1116 records remaining. -------------------- Head of total matching list per year, female births pct year 1886 5 0.003459 1889 8 0.004485 1890 7 0.003677 1894 8 0.003589 1896 6 0.002522
Tail of dataframe: name sex births year pct ranked temp 1781425 Lucky M 33 2013 0.001763 3278.5 1 1784646 Thunder M 13 2013 0.000695 6375.0 1 1787485 Onix M 8 2013 0.000427 9152.0 1 1789150 Durant M 6 2013 0.000321 11332.0 1 1792004 Yadon M 5 2013 0.000267 12995.0 1 Tail of dataframe: name sex births year pct ranked temp 1759744 Casey F 373 2013 0.021478 726.0 1 1761965 Unknown F 57 2013 0.003282 2934.0 1 1767797 Lucky F 13 2013 0.000749 8724.5 1 1769055 Roselia F 11 2013 0.000633 9843.5 1 1769230 Amaura F 10 2013 0.000576 10526.0 1
# Distinct matched names (a name found for both sexes is shown once), to decide which to reject.
print names_listed.name.unique()
['Casey' 'Unknown' 'Aron' 'Tangela' 'Abra' 'Lucky' 'Durant' 'Roselia' 'Thunder' 'Windie' 'Paras' 'Kimori' 'Amaura' 'Onix' 'Eevee' 'Hassam' 'Kameil' 'Yadon' 'Lizardo' 'Sand']
Remove Casey, Aron, Tangela and Unknown.
cutoffn = 0
# how many names will remain to evaluate after duplicates removed
from collections import OrderedDict
evallistm = OrderedDict()
evallistf = OrderedDict()
# remove names with more common duplicates in other sex
# this happens frequently in ssa db
for name in listed_m:
try:
pctf = names_listed[(names_listed.sex == 'F') &
(names_listed.name == name)].pct_max.iloc[0]
pctm = names_listed[(names_listed.sex == 'M') &
(names_listed.name == name)].pct_max.iloc[0]
except:
pctf = 98
pctm = 99
if (name not in names_listed[names_listed.sex == 'F'].name.unique() or
pctf < pctm):
evallistm[name] = ''
for name in listed_f:
try:
pctf = names_listed[(names_listed.sex == 'F') &
(names_listed.name == name)].pct_max.iloc[0]
pctm = names_listed[(names_listed.sex == 'M') &
(names_listed.name == name)].pct_max.iloc[0]
except:
pctf = 99
pctm = 98
if (name not in names_listed[names_listed.sex == 'M'].name.unique() or
pctm < pctf):
evallistf[name] = ''
if cutoffn > 0:
assert len(evallistm) > cutoffn
assert len(evallistf) > cutoffn
print evallistm[:cutoffn]
print evallistf[:cutoffn]
else:
print 'Length of lists: %d male, %d female\n' % (len(evallistm), len(evallistf))
print evallistm
print ' '
print evallistf
Length of lists: 10 male, 10 female OrderedDict([('Aron', ''), ('Casey', ''), ('Lucky', ''), ('Durant', ''), ('Paras', ''), ('Thunder', ''), ('Onix', ''), ('Yadon', ''), ('Lizardo', ''), ('Hassam', '')]) OrderedDict([('Unknown', ''), ('Roselia', ''), ('Tangela', ''), ('Abra', ''), ('Windie', ''), ('Amaura', ''), ('Kimori', ''), ('Kameil', ''), ('Sand', ''), ('Eevee', '')])
#manually copy and paste the above lists and assign
#'acc' or 'rej' individually to accept or reject
evallistm = OrderedDict([('Aron', 'rej'), ('Casey', 'rej'), ('Lucky', 'acc'),
('Durant', 'acc'), ('Paras', 'acc'), ('Thunder', 'acc'),
('Onix', 'acc'), ('Yadon', 'acc'), ('Lizardo', 'acc'),
('Hassam', 'acc')])
evallistf = OrderedDict([('Unknown', 'rej'), ('Roselia', 'acc'), ('Tangela', 'rej'),
('Abra', 'acc'), ('Windie', 'acc'), ('Amaura', 'acc'),
('Kimori', 'acc'), ('Kameil', 'acc'), ('Sand', 'acc'),
('Eevee', 'acc')])
# Note, Demeter and Nike taken from males' list and moved to females'; for some reason it spiked in males higher than in females
# Similarly, Saturn moved from females' to males'
# Test that all names have 'acc' or 'rej' values
final_m = []
final_f = []
names_not_validated = []
for item in evallistm:
if evallistm[item] not in ['acc', 'rej']:
names_not_validated.append(item)
elif evallistm[item] == 'acc':
final_m.append(item)
for item in evallistf:
if evallistf[item] not in ['acc', 'rej']:
names_not_validated.append(item)
elif evallistf[item] == 'acc':
final_f.append(item)
final_all = final_m + final_f
if len(names_not_validated) > 0:
print "The following names do not have 'acc' or 'rej' values: ", names_not_validated
raise exception("Names not validated")
print 'Accepted male names:', final_m
print 'Accepted female names:', final_f
print 'Length: %d male, %d female\n' % (len(final_m), len(final_f))
cutmin = min(len(final_m), len(final_f))
final_m = final_m[:cutmin]
final_f = final_f[:cutmin]
print 'After resizing to %d names each:' % (cutmin)
print 'Accepted male names:', final_m
print 'Accepted female names:', final_f
Accepted male names: ['Lucky', 'Durant', 'Paras', 'Thunder', 'Onix', 'Yadon', 'Lizardo', 'Hassam'] Accepted female names: ['Roselia', 'Abra', 'Windie', 'Amaura', 'Kimori', 'Kameil', 'Sand', 'Eevee'] Length: 8 male, 8 female After resizing to 8 names each: Accepted male names: ['Lucky', 'Durant', 'Paras', 'Thunder', 'Onix', 'Yadon', 'Lizardo', 'Hassam'] Accepted female names: ['Roselia', 'Abra', 'Windie', 'Amaura', 'Kimori', 'Kameil', 'Sand', 'Eevee']
%run download_and_process.py
# reduce names dataframe to those matching list
# print '--------------------\nDataframe names filtered to those that match list'
# print "%d records to begin." % (len(names))
names_listed = names[((names.name.isin(final_m) & (names.sex == 'M')) |
(names.name.isin(final_f) & (names.sex == 'F')) )].copy()
names_listed.sort('pct_max', ascending=False, inplace=True)
# print "%d records remaining." % (len(names_listed))
# listed_in_df = list(names_listed.name)
# print names_listed.head(10)
# listed_m = list(names[(names.sex == 'M') & (names.name.isin(final_m))]['name'])
# listed_f = list(names[(names.sex == 'F') & (names.name.isin(final_f))]['name'])
#reduce yob dataframe to those matching list
print '--------------------\nDataframe yob filtered to those that match list (count only)'
print "%d records to begin." % (len(yob))
yob_listed_m = yob[(yob.name.isin(final_m)) & (yob.sex == 'M')].copy()
yob_listed_m.sort(['year', 'sex', 'name'], ascending=False, inplace=True)
yob_listed_f = yob[(yob.name.isin(final_f)) & (yob.sex == 'F')].copy()
yob_listed_f.sort(['year', 'sex', 'name'], ascending=False, inplace=True)
print "%d records remaining." % (len(yob_listed))
# m and f totals
yob_listed_f_agg = pd.DataFrame(yob_listed_f.groupby('year').sum())[['births', 'pct']]
yob_listed_m_agg = pd.DataFrame(yob_listed_m.groupby('year').sum())[['births', 'pct']]
print '--------------------\nHead of total matching list per year, female'
print yob_listed_f_agg.head()
# print chart of m and f totals
print '\n'
# function to determine a nice y-axis limit a little above the maximum value
# rounds maximum y up to second-most-significant digit
def determine_y_limit(x):
    """Return a "nice" axis limit just above ``x``.

    ``x`` is truncated to its two most significant digits and then bumped up
    by one unit of the second digit, e.g. 123 -> 130 and 1000 -> 1100.

    x -- the largest value that will be plotted.

    For non-positive (or NaN) input, where log10 is undefined, a fallback
    limit of 1 is returned instead of raising ValueError, so an all-zero
    series still gets a usable axis.
    """
    if not x > 0:
        return 1
    significance = int(math.floor(math.log10(x)))
    step = 10 ** (significance - 1)  # one unit of the second significant digit
    return (math.floor(x / step) + 1) * step
# Chart, per year, the share of births of each sex given an accepted name.
#data
xf = list(yob_listed_f_agg.index)
xm = list(yob_listed_m_agg.index)
plt.figure(figsize=(16,9))
plt.plot(xf, list(yob_listed_f_agg.pct), color="red")    # female total
plt.plot(xm, list(yob_listed_m_agg.pct), color="blue")   # male total
plt.ylim(0, determine_y_limit(max(list(yob_listed_f_agg.pct)
                                  + list(yob_listed_m_agg.pct))))
# Use the configured last data year instead of a hard-coded 2013, so the axis
# follows the data when the database (and last_year) is updated.
plt.xlim(1880, last_year)
plt.title('boy=blue, girl=red', fontsize = 20)
plt.xlabel("Year", fontsize = 14)
plt.ylabel("% of total births of that sex", fontsize = 14)
plt.show()
Data already downloaded. Data already extracted. Reading from pickle. Tail of dataframe 'yob': name sex births year pct ranked 1792086 Zyhier M 5 2013 0.000267 12995 1792087 Zylar M 5 2013 0.000267 12995 1792088 Zymari M 5 2013 0.000267 12995 1792089 Zymeer M 5 2013 0.000267 12995 1792090 Zyree M 5 2013 0.000267 12995 Tail of dataframe 'names': name sex year_count year_min year_max pct_sum pct_max 102685 Gross M 1 1925 1925 0.000538 0.000538 102686 Elik M 1 2012 2012 0.000318 0.000318 102687 Patrickjoseph M 1 1998 1998 0.000262 0.000262 102688 Southern M 1 1923 1923 0.000547 0.000547 102689 Jeon M 1 1999 1999 0.000261 0.000261 Tail of dataframe 'years': year births_f births_m births_t new_names unique_names_x sexratio \ 68 2008 1886765 2035811 3922576 2046 32483 107.899553 69 2009 1832276 1978582 3810858 1789 32210 107.984932 70 2010 1771846 1912915 3684761 1635 31593 107.961696 71 2011 1752198 1891800 3643998 1539 31412 107.967250 72 2012 1751866 1886972 3638838 1531 31212 107.712120 unique_names_y_x unique_names_x unique_names_y_x unique_names_x \ 68 32483 32483 32483 32483 69 32210 32210 32210 32210 70 31593 31593 31593 31593 71 31412 31412 31412 31412 72 31212 31212 31212 31212 unique_names_y_x unique_names_x unique_names_y_x unique_names 68 32483 32483 32483 32483 69 32210 32210 32210 32210 70 31593 31593 31593 31593 71 31412 31412 31412 31412 72 31212 31212 31212 31212 -------------------- Dataframe yob filtered to those that match list (count only) 1792091 records to begin. 1116 records remaining. -------------------- Head of total matching list per year, female births pct year 1908 6 0.001795 1909 6 0.001728 1911 5 0.001195 1913 6 0.000961 1914 5 0.000657
# all names_listed, so we can see which ones to aggregate
# cutoff of 10 already done
# Up to 50 rows per sex: male names first, a blank line, then female names.
print names_listed[names_listed.sex == 'M'].head(50)
print ''
print names_listed[names_listed.sex == 'F'].head(50)
name sex year_count year_min year_max pct_sum pct_max 65739 Lucky M 98 1912 2013 0.171647 0.003567 68408 Durant M 48 1914 2013 0.024203 0.001833 71899 Thunder M 28 1975 2013 0.017100 0.000990 71211 Paras M 32 1975 2011 0.013356 0.000731 72773 Onix M 25 1956 2013 0.009148 0.000561 91673 Hassam M 2 1994 2000 0.000617 0.000363 88031 Yadon M 4 2010 2013 0.001163 0.000318 90614 Lizardo M 2 1970 2010 0.000530 0.000269 name sex year_count year_min year_max pct_sum pct_max 5139 Abra F 52 1955 2012 0.050814 0.003892 2555 Roselia F 78 1908 2013 0.046634 0.001795 19024 Windie F 17 1961 1982 0.008654 0.000892 27879 Kimori F 9 2002 2011 0.003804 0.000583 26782 Amaura F 10 2000 2013 0.003863 0.000576 48704 Eevee F 2 2012 2013 0.000745 0.000457 43647 Kameil F 2 2009 2012 0.000615 0.000342 46206 Sand F 2 1958 1960 0.000496 0.000249
# Build chart_1: a year-indexed table of pct for the accepted female names.
# NOTE(review): this rebinds the global `names`, which until here referred to
# the dataframe loaded by download_and_process.py; that script must be re-run
# before any earlier cell that filters `names` is executed again.
names = final_f[:10]
sexes = ['F'] # can be length 1 or same length as names
yearstart=1880 # for data, not graph
yearend=2013
xmin = 1940  # left edge of the x axis, read by make_chart below
start = time.time()  # not read anywhere in this cell
if len(sexes) == 1:
    sexes = sexes * len(names)
wanted_pairs = set(zip(names, sexes))
# Filter on name first, then take an independent copy so the flag column
# below is written to our own data rather than to a slice of `yob`.
df_chart = yob[yob['name'].isin(names)].copy()
# Flag the rows whose (name, sex) pair was requested. This replaces a
# row-by-row loop that assigned through chained indexing
# (df.temp.iloc[row] = 1), which pandas does not guarantee to write back.
df_chart['temp'] = [int(pair in wanted_pairs)
                    for pair in zip(df_chart['name'], df_chart['sex'])]
df_chart = df_chart[df_chart['temp'] == 1]
#To keep more than one data set for charts in memory, change name of chart_1
chart_1 = pd.pivot_table(df_chart, values='pct', index='year', columns=['name', 'sex'])
# Add the years that have no data and keep the index in ascending order in a
# single reindex; this replaces the append-per-year loop and the separate
# DataFrame.sort call.
all_years = sorted(set(chart_1.index) | set(range(yearstart, yearend + 1)))
chart_1 = chart_1.reindex(pd.Index(all_years, name=chart_1.index.name)).fillna(0)
#a single function to make the four different kinds of charts
def make_chart(df=chart_1, form='line', title='', colors= [], smoothing=0,
               groupedlist = [], baseline='sym', png_name=''):
    """Draw one of four chart types from a year-indexed popularity table.

    df          -- dataframe indexed by year whose columns are (name, sex)
                   pairs. The default is the chart_1 object that exists when
                   this function is defined.
    form        -- 'line', 'subplots_auto', 'subplots_same' or 'stream'.
    title       -- chart title, shown by the 'stream' form; a default is
                   generated when empty.
    colors      -- list of matplotlib colours, used in the given order; an
                   empty list selects the built-in palette, which is shuffled
                   on every call. The argument itself is never modified.
    smoothing   -- half-width, in rows, of a centred moving average applied
                   to every series; 0 disables smoothing.
    groupedlist -- not used; kept so existing calls keep working.
    baseline    -- stackplot baseline, only used by the 'stream' form.
    png_name    -- when non-empty the chart is also written to
                   <save_path>/<png_name>.png.

    Relies on the module-level determine_y_limit, save_path, plt and xmin
    (the left edge of the x axis).
    """
    # Sort by year and take the x values from the index itself, so every row
    # is plotted against its own year.
    dataframe = df.sort_index()
    x_values = list(dataframe.index)
    startyear = min(x_values)
    endyear = max(x_values)
    yearstr = '%d-%d' % (startyear, endyear)
    legend_size = 0.01
    has_male = False
    has_female = False
    max_y = 0
    final_name = ''
    for name, sex in dataframe.columns:
        series = dataframe[(name, sex)]
        # the axis limit is based on the raw (unsmoothed) maximum
        max_y = max(max_y, series.max())
        final_name = name
        if sex == 'M': has_male = True
        if sex == 'F': has_female = True
        if smoothing > 0:
            n_rows = len(series)
            # Centred window [row - smoothing, row + smoothing]. The slice end
            # is exclusive, hence the +1; the previous bound left out the
            # right edge (and, for smoothing=1, the row itself). Assigning the
            # whole column avoids chained .iloc assignment.
            dataframe[(name, sex)] = [
                series.iloc[max(0, row - smoothing):
                            min(n_rows, row + smoothing + 1)].mean()
                for row in range(n_rows)]
    has_both = has_male and has_female
    if has_both:
        y_text = "% of births of indicated sex"
    elif has_male:
        y_text = "Percent of male births"
    else:
        y_text = "Percent of female births"
    num_series = len(dataframe.columns)
    if colors:
        palette = list(colors)  # work on a copy; never touch the caller's list
    else:
        palette = ["#1f78b4","#ae4ec9","#33a02c","#fb9a99","#e31a1c","#a6cee3",
                   "#fdbf6f","#ff7f00","#cab2d6","#6a3d9a","#ffff99","#b15928"]
        # shuffle so the colour given to each series varies between runs
        from random import shuffle
        shuffle(palette)
    num_colors = len(palette)
    if num_series > num_colors:
        print("Warning: colors will be repeated.")
    if title == '':
        if num_series == 1:
            title = "Popularity of baby name %s in U.S., %s" % (final_name, yearstr)
        else:
            title = "Popularity of baby names in U.S., %s" % (yearstr)

    if form == 'line':
        fig, ax = plt.subplots(num=None, figsize=(16, 9), dpi=300, facecolor='w', edgecolor='w')
        for position, (name, sex) in enumerate(dataframe.columns):
            if has_both:
                label = "%s (%s)" % (name, sex)
            else:
                label = name
            ax.plot(x_values, dataframe[(name, sex)], label=label,
                    color=palette[position % num_colors], linewidth = 3)
        ax.set_ylim(0,determine_y_limit(max_y))
        ax.set_xlim(xmin, endyear)
        ax.set_ylabel(y_text, size = 13)
        # shrink the axes slightly to leave room for the legend underneath
        box = ax.get_position()
        ax.set_position([box.x0, box.y0 + box.height * legend_size,
                         box.width, box.height * (1 - legend_size)])
        legend_cols = min(5, num_series)
        ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05), fancybox=True, shadow=True, ncol=legend_cols)

    if form in ('subplots_auto', 'subplots_same'):
        own_scale = (form == 'subplots_auto')
        # squeeze=False always returns a 2-D array of axes, so a table with a
        # single series no longer fails on axes[0].
        fig, axes = plt.subplots(num_series, 1, figsize=(12, 3.5*num_series), squeeze=False)
        overall_limit = determine_y_limit(max_y)
        if own_scale:
            print('Maximum alpha: %d percent' % (overall_limit))
        else:
            print('Maximum y axis: %d percent' % (overall_limit))
        for axis, (name, sex) in zip(axes[:, 0], dataframe.columns):
            series = dataframe[(name, sex)]
            if sex=='M':
                sex_label = 'male'
            else:
                sex_label = 'female'
            label = "Percent of %s births for %s" % (sex_label, name)
            if own_scale:
                # each panel has its own y scale; the fill opacity shows how
                # the series compares with the largest one
                current_ymax = series.max()
                axis_limit = determine_y_limit(current_ymax)
                fill_color = palette[0]
                fill_alpha = 1.0 * current_ymax / overall_limit
            else:
                axis_limit = overall_limit
                fill_color = palette[1]
                fill_alpha = 1
            axis.plot(x_values, series, color='k')
            axis.set_ylim(0, axis_limit)
            axis.set_xlim(xmin, endyear)
            axis.fill_between(x_values, series, color=fill_color, alpha=fill_alpha, interpolate=True)
            axis.set_ylabel(label, size=11)
        plt.subplots_adjust(hspace=0.1)

    if form == 'stream':
        plt.figure(num=None, figsize=(20,16.67), dpi=150, facecolor='w', edgecolor='k')
        plt.title(title, size=17)
        plt.xlim(xmin, endyear)
        if has_both:
            yaxtext = 'Percent of births of indicated sex (scale: '
        elif has_male:
            yaxtext = 'Percent of male births (scale: '
        else:
            yaxtext = 'Percent of female births (scale: '
        yaxtext += str(determine_y_limit(max_y)) + ')'
        plt.ylabel(yaxtext, size=13)
        polys = plt.stackplot(x_values, *[dataframe[column] for column in dataframe.columns],
                              colors=palette, baseline=baseline)
        # stackplot areas cannot be passed to legend() directly here, so
        # build one coloured rectangle per area as a legend handle
        legendProxies = [plt.Rectangle((0, 0), 1, 1, fc=poly.get_facecolor()[0])
                         for poly in polys]
        namelist = []
        for name, sex in dataframe.columns:
            if has_both:
                namelist.append('%s (%s)' % (name, sex))
            else:
                namelist.append(name)
        plt.legend(legendProxies, namelist, loc=3, ncol=2)
        # the y position in a stream graph carries no meaning: hide its ticks
        plt.tick_params(axis='y',
                        which='both',      # major and minor ticks
                        left=False,
                        right=False,
                        labelleft=False)

    # Save BEFORE show(): once show() returns the figure may already have been
    # released, in which case savefig() would write an empty image.
    if png_name != '':
        filename = save_path + "/" + png_name + ".png"
        plt.savefig(filename)
    plt.show()
    plt.close()
#stream graph
# Stacked "stream" chart of the table currently held in chart_1; with
# png_name left empty nothing is written to disk.
make_chart(df=chart_1,
form='stream', # line , subplots_auto , subplots_same , stream
title='',
colors= [],
smoothing=0,
baseline='sym', # zero , sym , wiggle , weighted_wiggle
png_name = '', # if '', will not be saved
)
# Build chart_1: a year-indexed table of pct for the accepted male names.
# NOTE(review): this rebinds the global `names`, which the earlier cells use
# as the dataframe loaded by download_and_process.py; that script must be
# re-run before any cell that filters `names` is executed again.
names = final_m[:10]
sexes = ['M'] # can be length 1 or same length as names
yearstart=1880 # for data, not graph
yearend=2013
xmin = 1940  # left edge of the x axis, read by make_chart below
start = time.time()  # not read anywhere in this cell
if len(sexes) == 1:
    sexes = sexes * len(names)
wanted_pairs = set(zip(names, sexes))
# Filter on name first, then take an independent copy so the flag column
# below is written to our own data rather than to a slice of `yob`.
df_chart = yob[yob['name'].isin(names)].copy()
# Flag the rows whose (name, sex) pair was requested. This replaces a
# row-by-row loop that assigned through chained indexing
# (df.temp.iloc[row] = 1), which pandas does not guarantee to write back.
df_chart['temp'] = [int(pair in wanted_pairs)
                    for pair in zip(df_chart['name'], df_chart['sex'])]
df_chart = df_chart[df_chart['temp'] == 1]
#To keep more than one data set for charts in memory, change name of chart_1
chart_1 = pd.pivot_table(df_chart, values='pct', index='year', columns=['name', 'sex'])
# Add the years that have no data and keep the index in ascending order in a
# single reindex; this replaces the append-per-year loop and the separate
# DataFrame.sort call.
all_years = sorted(set(chart_1.index) | set(range(yearstart, yearend + 1)))
chart_1 = chart_1.reindex(pd.Index(all_years, name=chart_1.index.name)).fillna(0)
#a single function to make the four different kinds of charts
def make_chart(df=chart_1, form='line', title='', colors= [], smoothing=0,
               groupedlist = [], baseline='sym', png_name=''):
    """Draw one of four chart types from a year-indexed popularity table.

    df          -- dataframe indexed by year whose columns are (name, sex)
                   pairs. The default is the chart_1 object that exists when
                   this function is defined.
    form        -- 'line', 'subplots_auto', 'subplots_same' or 'stream'.
    title       -- chart title, shown by the 'stream' form; a default is
                   generated when empty.
    colors      -- list of matplotlib colours, used in the given order; an
                   empty list selects the built-in palette, which is shuffled
                   on every call. The argument itself is never modified.
    smoothing   -- half-width, in rows, of a centred moving average applied
                   to every series; 0 disables smoothing.
    groupedlist -- not used; kept so existing calls keep working.
    baseline    -- stackplot baseline, only used by the 'stream' form.
    png_name    -- when non-empty the chart is also written to
                   <save_path>/<png_name>.png.

    Relies on the module-level determine_y_limit, save_path, plt and xmin
    (the left edge of the x axis).
    """
    # Sort by year and take the x values from the index itself, so every row
    # is plotted against its own year.
    dataframe = df.sort_index()
    x_values = list(dataframe.index)
    startyear = min(x_values)
    endyear = max(x_values)
    yearstr = '%d-%d' % (startyear, endyear)
    legend_size = 0.01
    has_male = False
    has_female = False
    max_y = 0
    final_name = ''
    for name, sex in dataframe.columns:
        series = dataframe[(name, sex)]
        # the axis limit is based on the raw (unsmoothed) maximum
        max_y = max(max_y, series.max())
        final_name = name
        if sex == 'M': has_male = True
        if sex == 'F': has_female = True
        if smoothing > 0:
            n_rows = len(series)
            # Centred window [row - smoothing, row + smoothing]. The slice end
            # is exclusive, hence the +1; the previous bound left out the
            # right edge (and, for smoothing=1, the row itself). Assigning the
            # whole column avoids chained .iloc assignment.
            dataframe[(name, sex)] = [
                series.iloc[max(0, row - smoothing):
                            min(n_rows, row + smoothing + 1)].mean()
                for row in range(n_rows)]
    has_both = has_male and has_female
    if has_both:
        y_text = "% of births of indicated sex"
    elif has_male:
        y_text = "Percent of male births"
    else:
        y_text = "Percent of female births"
    num_series = len(dataframe.columns)
    if colors:
        palette = list(colors)  # work on a copy; never touch the caller's list
    else:
        palette = ["#1f78b4","#ae4ec9","#33a02c","#fb9a99","#e31a1c","#a6cee3",
                   "#fdbf6f","#ff7f00","#cab2d6","#6a3d9a","#ffff99","#b15928"]
        # shuffle so the colour given to each series varies between runs
        from random import shuffle
        shuffle(palette)
    num_colors = len(palette)
    if num_series > num_colors:
        print("Warning: colors will be repeated.")
    if title == '':
        if num_series == 1:
            title = "Popularity of baby name %s in U.S., %s" % (final_name, yearstr)
        else:
            title = "Popularity of baby names in U.S., %s" % (yearstr)

    if form == 'line':
        fig, ax = plt.subplots(num=None, figsize=(16, 9), dpi=300, facecolor='w', edgecolor='w')
        for position, (name, sex) in enumerate(dataframe.columns):
            if has_both:
                label = "%s (%s)" % (name, sex)
            else:
                label = name
            ax.plot(x_values, dataframe[(name, sex)], label=label,
                    color=palette[position % num_colors], linewidth = 3)
        ax.set_ylim(0,determine_y_limit(max_y))
        ax.set_xlim(xmin, endyear)
        ax.set_ylabel(y_text, size = 13)
        # shrink the axes slightly to leave room for the legend underneath
        box = ax.get_position()
        ax.set_position([box.x0, box.y0 + box.height * legend_size,
                         box.width, box.height * (1 - legend_size)])
        legend_cols = min(5, num_series)
        ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05), fancybox=True, shadow=True, ncol=legend_cols)

    if form in ('subplots_auto', 'subplots_same'):
        own_scale = (form == 'subplots_auto')
        # squeeze=False always returns a 2-D array of axes, so a table with a
        # single series no longer fails on axes[0].
        fig, axes = plt.subplots(num_series, 1, figsize=(12, 3.5*num_series), squeeze=False)
        overall_limit = determine_y_limit(max_y)
        if own_scale:
            print('Maximum alpha: %d percent' % (overall_limit))
        else:
            print('Maximum y axis: %d percent' % (overall_limit))
        for axis, (name, sex) in zip(axes[:, 0], dataframe.columns):
            series = dataframe[(name, sex)]
            if sex=='M':
                sex_label = 'male'
            else:
                sex_label = 'female'
            label = "Percent of %s births for %s" % (sex_label, name)
            if own_scale:
                # each panel has its own y scale; the fill opacity shows how
                # the series compares with the largest one
                current_ymax = series.max()
                axis_limit = determine_y_limit(current_ymax)
                fill_color = palette[0]
                fill_alpha = 1.0 * current_ymax / overall_limit
            else:
                axis_limit = overall_limit
                fill_color = palette[1]
                fill_alpha = 1
            axis.plot(x_values, series, color='k')
            axis.set_ylim(0, axis_limit)
            axis.set_xlim(xmin, endyear)
            axis.fill_between(x_values, series, color=fill_color, alpha=fill_alpha, interpolate=True)
            axis.set_ylabel(label, size=11)
        plt.subplots_adjust(hspace=0.1)

    if form == 'stream':
        plt.figure(num=None, figsize=(20,10), dpi=150, facecolor='w', edgecolor='k')
        plt.title(title, size=17)
        plt.xlim(xmin, endyear)
        if has_both:
            yaxtext = 'Percent of births of indicated sex (scale: '
        elif has_male:
            yaxtext = 'Percent of male births (scale: '
        else:
            yaxtext = 'Percent of female births (scale: '
        yaxtext += str(determine_y_limit(max_y)) + ')'
        plt.ylabel(yaxtext, size=13)
        polys = plt.stackplot(x_values, *[dataframe[column] for column in dataframe.columns],
                              colors=palette, baseline=baseline)
        # stackplot areas cannot be passed to legend() directly here, so
        # build one coloured rectangle per area as a legend handle
        legendProxies = [plt.Rectangle((0, 0), 1, 1, fc=poly.get_facecolor()[0])
                         for poly in polys]
        namelist = []
        for name, sex in dataframe.columns:
            if has_both:
                namelist.append('%s (%s)' % (name, sex))
            else:
                namelist.append(name)
        plt.legend(legendProxies, namelist, loc=3, ncol=2)
        # the y position in a stream graph carries no meaning: hide its ticks
        plt.tick_params(axis='y',
                        which='both',      # major and minor ticks
                        left=False,
                        right=False,
                        labelleft=False)

    # Save BEFORE show(): once show() returns the figure may already have been
    # released, in which case savefig() would write an empty image.
    if png_name != '':
        filename = save_path + "/" + png_name + ".png"
        plt.savefig(filename)
    plt.show()
    plt.close()
#stream graph
# Stacked "stream" chart of the table currently held in chart_1; with
# png_name left empty nothing is written to disk.
make_chart(df=chart_1,
form='stream', # line , subplots_auto , subplots_same , stream
title='',
colors= [],
smoothing=0,
baseline='sym', # zero , sym , wiggle , weighted_wiggle
png_name = '', # if '', will not be saved
)