import os
import time

# Most recent year in the Social Security database; change this when the database is updated.
last_year = 2013
# Every file created by this notebook is written into this directory.
save_path = "user_charts"

# Make sure the output directory exists before any chart is saved.
if not os.path.isdir(save_path):
    os.makedirs(save_path)
#os.chdir("Baby_names_US_IPython")
from math import floor
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn # comment out if you don't have it, but it makes good-looking charts
%run download_and_process.py
# used to round limit of y axis up to second-most-significant digit
def determine_y_limit(x):
    """Return an axis limit just above ``x``, stepped at its second significant digit.

    The result is the smallest multiple of ``10 ** (significance - 1)`` that is
    strictly greater than ``x``, where ``significance`` is the decimal exponent
    of ``x``.  For example 1234 gives 1300 and 55 gives 56.

    Parameters:
        x: the largest value that has to fit on the axis.

    Returns:
        The rounded-up limit.  A non-positive ``x`` has no decimal exponent
        (its logarithm is undefined), so 1 is returned as a usable limit for
        an all-zero series.
    """
    # Imported here because the notebook only brings ``floor`` into scope
    # before this definition; ``log10`` is imported much further down.
    from math import floor, log10

    if x <= 0:
        return 1
    significance = int(floor(log10(x)))
    step = 10 ** (significance - 1)
    # The "+ 1" always moves to the next step, so the limit sits above x
    # even when x is already an exact multiple of the step.
    return (floor(x / step) + 1) * step
Data already downloaded. Data already extracted. Reading from pickle. Tail of dataframe 'yob': name sex births year pct ranked 1792086 Zyhier M 5 2013 0.000267 12995 1792087 Zylar M 5 2013 0.000267 12995 1792088 Zymari M 5 2013 0.000267 12995 1792089 Zymeer M 5 2013 0.000267 12995 1792090 Zyree M 5 2013 0.000267 12995 Tail of dataframe 'names': name sex year_count year_min year_max pct_sum pct_max 102685 Gross M 1 1925 1925 0.000538 0.000538 102686 Elik M 1 2012 2012 0.000318 0.000318 102687 Patrickjoseph M 1 1998 1998 0.000262 0.000262 102688 Southern M 1 1923 1923 0.000547 0.000547 102689 Jeon M 1 1999 1999 0.000261 0.000261 Tail of dataframe 'years': year births_f births_m births_t new_names unique_names_x sexratio \ 68 2008 1886765 2035811 3922576 2046 32483 107.899553 69 2009 1832276 1978582 3810858 1789 32210 107.984932 70 2010 1771846 1912915 3684761 1635 31593 107.961696 71 2011 1752198 1891800 3643998 1539 31412 107.967250 72 2012 1751866 1886972 3638838 1531 31212 107.712120 unique_names_y unique_names 68 32483 32483 69 32210 32210 70 31593 31593 71 31412 31412 72 31212 31212 Tail of dataframe 'yob1940': name sex births year pct ranked 1792086 Zyhier M 5 2013 0.000267 12995 1792087 Zylar M 5 2013 0.000267 12995 1792088 Zymari M 5 2013 0.000267 12995 1792089 Zymeer M 5 2013 0.000267 12995 1792090 Zyree M 5 2013 0.000267 12995 Tail of dataframe 'names1940': name sex year_count year_min year_max pct_sum pct_max 96949 Nyah M 1 2001 2001 0.000258 0.000258 96950 Dajan M 1 2002 2002 0.000309 0.000309 96951 Maung M 1 2009 2009 0.000253 0.000253 96952 Charger M 1 2013 2013 0.000321 0.000321 96953 Chrystal M 1 1987 1987 0.000268 0.000268
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) C:\Users\David\Documents\Dropbox\IPython_Synced\GitHub\Baby_names_US_IPython\download_and_process.py in <module>() 321 322 years1940 = years1940[['year', 'births_f', 'births_m', 'births_t', 'new_names', 'unique_names_x', 'sexratio']] --> 323 years1940.columns = ['year', 'births_f', 'births_m', 'births_t', 'new_names', 'unique_names', 'sexratio'] 324 # above lines correct outer merge problem 325 print "Tail of dataframe 'years1940':" C:\Users\David\Anaconda\lib\site-packages\pandas\core\generic.pyc in __setattr__(self, name, value) 1847 This allows simpler access to columns for interactive use.""" 1848 if name in self._internal_names_set: -> 1849 object.__setattr__(self, name, value) 1850 elif name in self._metadata: 1851 return object.__setattr__(self, name, value) C:\Users\David\Anaconda\lib\site-packages\pandas\lib.pyd in pandas.lib.AxisProperty.__set__ (pandas\lib.c:38173)() C:\Users\David\Anaconda\lib\site-packages\pandas\core\generic.pyc in _set_axis(self, axis, labels) 398 399 def _set_axis(self, axis, labels): --> 400 self._data.set_axis(axis, labels) 401 self._clear_item_cache() 402 C:\Users\David\Anaconda\lib\site-packages\pandas\core\internals.pyc in set_axis(self, axis, new_labels) 1953 if new_len != old_len: 1954 raise ValueError('Length mismatch: Expected axis has %d elements, ' -> 1955 'new values have %d elements' % (old_len, new_len)) 1956 1957 self.axes[axis] = new_labels ValueError: Length mismatch: Expected axis has 8 elements, new values have 7 elements
# Names to chart, and the sex to use for each of them.
namelist = ["William", "Will", "Willy", "Willie", "Billy", "Bill"]
sexes = ['M'] # can be length 1 or same length as names
yearstart = 1880
yearend = 2013

start = time.time()
df_chart = yob.copy()
if len(sexes) == 1:
    sexes = sexes * len(namelist)
if len(sexes) != len(namelist):
    raise ValueError("sexes must have length 1 or the same length as namelist")

# Cheap pre-filter on the name alone; this still lets through rows whose name
# is wanted but whose sex is not, so the (name, sex) pairs are checked next.
df_chart = df_chart[df_chart['name'].isin(namelist)]
wanted_pairs = set(zip(namelist, sexes))
# One boolean mask replaces the row-by-row marking loop, which wrote through
# chained indexing and therefore could end up modifying a temporary copy.
keep = np.array([pair in wanted_pairs
                 for pair in zip(df_chart['name'], df_chart['sex'])], dtype=bool)
df_chart = df_chart[keep]
print("%0.1f minutes elapsed. %d records remain.\nTail of dataframe:" % ((time.time() - start)/60, len(df_chart)))
print(df_chart.tail())

#To keep more than one data set for charts in memory, change name of chart_1
chart_1 = pd.DataFrame(pd.pivot_table(df_chart, values='pct', index='year', columns=['name', 'sex']))
# Insert every year of the requested range that has no data, as a row of zeros,
# and keep the rows in ascending year order.
all_years = chart_1.index.union(pd.Index(list(range(yearstart, yearend + 1))))
chart_1 = chart_1.reindex(all_years).fillna(0).sort_index()
0.0 minutes elapsed. 778 records remain. Tail of dataframe: name sex births year pct ranked temp 1778822 Billy M 343 2013 0.018328 690.5 1 1778840 Willie M 328 2013 0.017526 708.0 1 1778872 Will M 313 2013 0.016725 739.5 1 1780576 Bill M 50 2013 0.002672 2458.5 1 1781108 Willy M 38 2013 0.002030 2950.5 1
# Chart every spelling that matches the Jennifer-like pattern, leaving out the
# spelling 'Jennifer' itself.
temp = names[names.name != 'Jennifer']
# Filter and sort once, so that namelist and sexes are read from the same rows
# in the same order.
# NOTE: DataFrame.sort is the spelling used by the pandas release this notebook
# runs on; later pandas releases name it sort_values.
variants = temp[temp.name.str.contains("^J.+n+.(f|v|ph).+r$")].sort('pct_sum', ascending=False)
sexes = list(variants.sex)
namelist = list(variants.name)
yearstart = 1880
yearend = 2013

start = time.time()
df_chart = yob.copy()
if len(sexes) == 1:
    sexes = sexes * len(namelist)
if len(sexes) != len(namelist):
    raise ValueError("sexes must have length 1 or the same length as namelist")

# Cheap pre-filter on the name alone; this still lets through rows whose name
# is wanted but whose sex is not, so the (name, sex) pairs are checked next.
df_chart = df_chart[df_chart['name'].isin(namelist)]
wanted_pairs = set(zip(namelist, sexes))
# One boolean mask replaces the row-by-row marking loop, which wrote through
# chained indexing and therefore could end up modifying a temporary copy.
keep = np.array([pair in wanted_pairs
                 for pair in zip(df_chart['name'], df_chart['sex'])], dtype=bool)
df_chart = df_chart[keep]
print("%0.1f minutes elapsed. %d records remain.\nTail of dataframe:" % ((time.time() - start)/60, len(df_chart)))
print(df_chart.tail())

#To keep more than one data set for charts in memory, change name of chart_1
chart_1 = pd.DataFrame(pd.pivot_table(df_chart, values='pct', index='year', columns=['name', 'sex']))
# Insert every year of the requested range that has no data, as a row of zeros,
# and keep the rows in ascending year order.
all_years = chart_1.index.union(pd.Index(list(range(yearstart, yearend + 1))))
chart_1 = chart_1.reindex(all_years).fillna(0).sort_index()
0.0 minutes elapsed. 722 records remain. Tail of dataframe: name sex births year pct ranked temp 1739464 Jenniffer F 7 2012 0.000400 14172.0 1 1741107 Jenyfer F 6 2012 0.000342 15852.5 1 1762483 Jenifer F 45 2013 0.002591 3462.0 1 1764701 Jennyfer F 23 2013 0.001324 5693.0 1 1769515 Jeniffer F 10 2013 0.000576 10526.0 1
df : dataframe created above
form : line = line chart with all series sharing x and y axis
    subplots_same = area charts sharing x axis, separate y axes but all have the same range
    subplots_auto = area charts sharing x axis, separate y axes each autoscaled to the maximum of its subplot;
        differences in y axis maxima are shown by shading of area under curve
    stream = a stream graph
title : if left blank, an automated title will be generated
colors : list of colors, one per series; the default list has 12 values
smoothing : amount of neighbour smoothing to be applied, default=0
baseline : type of stream graph; valid values are zero, sym, wiggle, weighted_wiggle
png_name : if blank, figure will not be saved.
#a single function to make the four different kinds of charts
def make_chart(df=chart_1, form='line', title='', colors=None, smoothing=0,
               groupedlist=None, baseline='sym', png_name=''):
    """Draw one popularity chart from a pivoted name dataframe.

    Parameters:
        df: dataframe whose index holds consecutive integer years and whose
            columns are (name, sex) pairs.  The default is the value
            ``chart_1`` had when this function was defined, not its current
            value.
        form: 'line' (all series on shared axes), 'subplots_auto' (one area
            chart per series, each with its own y range, fill opacity
            proportional to the series maximum), 'subplots_same' (one area
            chart per series, common y range) or 'stream' (stacked stream
            graph).
        title: chart title; generated automatically when left as ''.
        colors: list of colors; None or an empty list selects a built-in
            palette of 12 colors.  Colors repeat when there are more series
            than colors.
        smoothing: number of neighbouring rows on each side that are averaged
            into every point; 0 disables smoothing.
        groupedlist: accepted but not used.
        baseline: baseline argument handed to ``plt.stackplot`` for 'stream'
            charts; ignored by the other forms.
        png_name: when not '', the figure is saved as
            ``<save_path>/<png_name>.png``.

    Raises:
        ValueError: if ``form`` is not one of the four supported values or
            ``df`` has no columns.
    """
    valid_forms = ('line', 'subplots_auto', 'subplots_same', 'stream')
    if form not in valid_forms:
        raise ValueError("form must be one of %s; got %r" % (", ".join(valid_forms), form))

    dataframe = df.copy()
    num_series = len(dataframe.columns)
    if num_series == 0:
        raise ValueError("df has no series to plot")

    startyear = min(dataframe.index)
    endyear = max(dataframe.index)
    yearstr = '%d-%d' % (startyear, endyear)
    legend_size = 0.01

    def axis_limit(value):
        # determine_y_limit takes a logarithm, so an all-zero series needs a fallback.
        return determine_y_limit(value) if value > 0 else 1

    # Smooth first so that every limit computed below describes the data
    # that is actually drawn.
    if smoothing > 0:
        num_rows = len(dataframe)
        for column in dataframe.columns:
            raw = dataframe[column]
            # Centered window of `smoothing` rows on each side, clipped at the
            # ends; the "+ 1" makes the slice include its last row.
            dataframe[column] = [
                raw.iloc[max(0, row - smoothing):min(num_rows, row + smoothing + 1)].mean()
                for row in range(num_rows)]

    has_male = False
    has_female = False
    max_y = 0
    for name, sex in dataframe.columns:
        max_y = max(max_y, dataframe[(name, sex)].max())
        final_name = name
        if sex == 'M':
            has_male = True
        if sex == 'F':
            has_female = True
    has_both = has_male and has_female
    y_limit = axis_limit(max_y)

    if has_both:
        y_text = "% of births of indicated sex"
    elif has_male:
        y_text = "Percent of male births"
    else:
        y_text = "Percent of female births"

    if not colors:
        colors = ["#1f78b4","#ae4ec9","#33a02c","#fb9a99","#e31a1c","#a6cee3",
                  "#fdbf6f","#ff7f00","#cab2d6","#6a3d9a","#ffff99","#b15928"]
    num_colors = len(colors)
    if num_series > num_colors:
        print("Warning: colors will be repeated.")

    if title == '':
        if num_series == 1:
            title = "Popularity of baby name %s in U.S., %s" % (final_name, yearstr)
        else:
            title = "Popularity of baby names in U.S., %s" % (yearstr)

    # One x value per row; this relies on the index being consecutive years.
    x_values = list(range(startyear, endyear + 1))

    if form == 'line':
        fig, ax = plt.subplots(num=None, figsize=(16, 9), dpi=300, facecolor='w', edgecolor='w')
        for counter, (name, sex) in enumerate(dataframe.columns):
            color = colors[counter % num_colors]
            label = "%s (%s)" % (name, sex) if has_both else name
            ax.plot(x_values, dataframe[(name, sex)], label=label, color=color, linewidth=3)
        ax.set_title(title, size=17)
        ax.set_ylim(0, y_limit)
        ax.set_xlim(startyear, endyear)
        ax.set_ylabel(y_text, size=13)
        # Shrink the axes slightly to leave room for the legend below them.
        box = ax.get_position()
        ax.set_position([box.x0, box.y0 + box.height * legend_size,
                         box.width, box.height * (1 - legend_size)])
        legend_cols = min(5, num_series)
        ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05), fancybox=True, shadow=True, ncol=legend_cols)

    elif form in ('subplots_auto', 'subplots_same'):
        autoscale = (form == 'subplots_auto')
        # squeeze=False keeps `axes` a 2-D array even when there is one series.
        fig, axes = plt.subplots(num_series, 1, figsize=(12, 3.5*num_series), squeeze=False)
        if autoscale:
            print('Maximum alpha: %g percent' % y_limit)
            fill_color = colors[0]
        else:
            print('Maximum y axis: %g percent' % y_limit)
            fill_color = colors[1 % num_colors]  # falls back to colors[0] for a one-color list
        for axis, (name, sex) in zip(axes[:, 0], dataframe.columns):
            sex_label = 'male' if sex == 'M' else 'female'
            series = dataframe[(name, sex)]
            if autoscale:
                current_ymax = series.max()
                # Fill opacity shows how this series compares with the largest one.
                tint = 1.0 * current_ymax / y_limit
                subplot_limit = axis_limit(current_ymax)
            else:
                tint = 1
                subplot_limit = y_limit
            axis.plot(x_values, series, color='k')
            axis.set_ylim(0, subplot_limit)
            axis.set_xlim(startyear, endyear)
            axis.fill_between(x_values, series, color=fill_color, alpha=tint, interpolate=True)
            axis.set_ylabel("Percent of %s births for %s" % (sex_label, name), size=11)
        axes[0, 0].set_title(title, size=17)
        plt.subplots_adjust(hspace=0.1)

    else:  # form == 'stream'
        plt.figure(num=None, figsize=(20,10), dpi=150, facecolor='w', edgecolor='k')
        plt.title(title, size=17)
        plt.xlim(startyear, endyear)
        if has_both:
            yaxtext = 'Percent of births of indicated sex (scale: '
        elif has_male:
            yaxtext = 'Percent of male births (scale: '
        else:
            yaxtext = 'Percent of female births (scale: '
        yaxtext += str(y_limit) + ')'
        plt.ylabel(yaxtext, size=13)
        polys = plt.stackplot(x_values, *[dataframe[(name, sex)] for name, sex in dataframe.columns],
                              colors=colors, baseline=baseline)
        # Stacked areas are not passed to the legend directly; one rectangle in
        # each area's face color is built to stand in for it.
        legend_proxies = [plt.Rectangle((0, 0), 1, 1, fc=poly.get_facecolor()[0]) for poly in polys]
        if has_both:
            legend_labels = ['%s (%s)' % (name, sex) for name, sex in dataframe.columns]
        else:
            legend_labels = [name for name, sex in dataframe.columns]
        plt.legend(legend_proxies, legend_labels, loc=3, ncol=2)
        # The y axis of a stream graph carries no readable scale, so hide its ticks.
        plt.tick_params(axis='y',
                        which='both',  # major and minor ticks
                        left=False,
                        right=False,
                        labelleft=False)

    # Save before showing: once show() returns, the backend may already have
    # discarded the figure, and savefig would then write an empty image.
    if png_name != '':
        plt.savefig(os.path.join(save_path, png_name + ".png"))
    plt.show()
    plt.close()
# Line graph: every series drawn on one shared pair of axes.
#   form     -> line , subplots_auto , subplots_same , stream
#   baseline -> zero , sym , wiggle , weighted_wiggle
#   png_name -> leave as '' and the figure will not be saved
make_chart(df=chart_1, form='line', title='', colors=[], smoothing=0,
           baseline='sym', png_name='')
from math import log10
%matplotlib inline
# Subplots with autoscaling; the intensity of the fill color is proportional
# to each series' peak maximum.
#   form     -> line , subplots_auto , subplots_same , stream
#   baseline -> zero , sym , wiggle , weighted_wiggle
#   png_name -> leave as '' and the figure will not be saved
make_chart(df=chart_1, form='subplots_auto', title='', colors=[], smoothing=0,
           baseline='sym', png_name='')
Maximum alpha: 8 percent
# Subplots in which all y axes have the same scale.
#   form     -> line , subplots_auto , subplots_same , stream
#   baseline -> zero , sym , wiggle , weighted_wiggle
#   png_name -> leave as '' and the figure will not be saved
make_chart(df=chart_1, form='subplots_same', title='', colors=[], smoothing=0,
           baseline='sym', png_name='')
Maximum y axis: 8 percent
# Stream graph with the 'sym' baseline.
#   form     -> line , subplots_auto , subplots_same , stream
#   baseline -> zero , sym , wiggle , weighted_wiggle
#   png_name -> leave as '' and the figure will not be saved
make_chart(df=chart_1, form='stream', title='', colors=[], smoothing=0,
           baseline='sym', png_name='')
Warning: colors will be repeated.
# Stream graph with the 'zero' baseline.
#   form     -> line , subplots_auto , subplots_same , stream
#   baseline -> zero , sym , wiggle , weighted_wiggle
#   png_name -> leave as '' and the figure will not be saved
make_chart(df=chart_1, form='stream', title='', colors=[], smoothing=0,
           baseline='zero', png_name='')
Warning: colors will be repeated.
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-43-306c3a4517f7> in <module>() 7 smoothing=0, 8 baseline='zero', # zero , sym , wiggle , weighted_wiggle ----> 9 png_name = '' # if '', will not be saved 10 ) <ipython-input-42-d6feb851b029> in make_chart(df, form, title, colors, smoothing, groupedlist, baseline, png_name) 129 yaxtext += scale 130 plt.ylabel(yaxtext, size=13) --> 131 polys = pyplot.stackplot(x_values, *[dataframe[(name, sex)] for name, sex in dataframe.columns], 132 colors=colors, baseline=baseline) 133 legendProxies = [] NameError: global name 'pyplot' is not defined
# Stream graph with the 'wiggle' baseline.
#   form     -> line , subplots_auto , subplots_same , stream
#   baseline -> zero , sym , wiggle , weighted_wiggle
#   png_name -> leave as '' and the figure will not be saved
make_chart(df=chart_1, form='stream', title='', colors=[], smoothing=0,
           baseline='wiggle', png_name='')
# Stream graph with the 'weighted_wiggle' baseline.
#   form     -> line , subplots_auto , subplots_same , stream
#   baseline -> zero , sym , wiggle , weighted_wiggle
#   png_name -> leave as '' and the figure will not be saved
make_chart(df=chart_1, form='stream', title='', colors=[], smoothing=0,
           baseline='weighted_wiggle', png_name='')
Warning: colors will be repeated.
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-45-69231189cf48> in <module>() 7 smoothing=0, 8 baseline='weighted_wiggle', # zero , sym , wiggle , weighted_wiggle ----> 9 png_name = '' # if '', will not be saved 10 ) <ipython-input-44-9a402c6609e8> in make_chart(df, form, title, colors, smoothing, groupedlist, baseline, png_name) 133 legendProxies = [] 134 for poly in polys: --> 135 legendProxies.append(pyplot.Rectangle((0, 0), 1, 1, fc=poly.get_facecolor()[0])) 136 namelist = [] 137 for name, sex in dataframe.columns: NameError: global name 'pyplot' is not defined
# Show every row of `names` whose name matches the Jennifer-like spelling
# pattern, largest pct_sum first.
names[names['name'].str.contains("^J.+n+.(f|v|ph).+r$")].sort('pct_sum', ascending=False)
 | name | sex | year_count | year_min | year_max | pct_sum | pct_max |
---|---|---|---|---|---|---|---|
1652 | Jennifer | F | 96 | 1916 | 2013 | 88.233240 | 4.300706 |
2491 | Jenifer | F | 79 | 1933 | 2013 | 1.499543 | 0.062833 |
67227 | Jennifer | M | 63 | 1946 | 2012 | 0.272530 | 0.014403 |
4411 | Jeniffer | F | 57 | 1947 | 2013 | 0.209707 | 0.006657 |
3816 | Jenniffer | F | 63 | 1947 | 2012 | 0.172112 | 0.008436 |
6674 | Jennyfer | F | 44 | 1969 | 2013 | 0.093090 | 0.004847 |
4283 | Jennefer | F | 58 | 1950 | 2010 | 0.089288 | 0.004021 |
6851 | Jennafer | F | 43 | 1964 | 2009 | 0.062570 | 0.004710 |
4775 | Jannifer | F | 54 | 1944 | 2001 | 0.050104 | 0.001859 |
6810 | Jennipher | F | 43 | 1965 | 2008 | 0.038018 | 0.002020 |
12144 | Jennifier | F | 27 | 1957 | 1993 | 0.031601 | 0.002607 |
8154 | Janifer | F | 38 | 1942 | 1999 | 0.019424 | 0.000765 |
8959 | Jenefer | F | 36 | 1965 | 2003 | 0.019251 | 0.001249 |
15029 | Jennfier | F | 22 | 1969 | 1992 | 0.014234 | 0.001205 |
11435 | Jenipher | F | 29 | 1969 | 2011 | 0.013397 | 0.000958 |
16087 | Jenafer | F | 20 | 1972 | 1998 | 0.010676 | 0.001070 |
15745 | Jenyfer | F | 21 | 1977 | 2012 | 0.009356 | 0.000736 |
18068 | Jinnifer | F | 18 | 1967 | 1990 | 0.009290 | 0.000886 |
19859 | Jennfer | F | 16 | 1970 | 1987 | 0.007968 | 0.000686 |
20489 | Jeanifer | F | 15 | 1966 | 1987 | 0.006330 | 0.000723 |
25260 | Jeannifer | F | 11 | 1970 | 1991 | 0.004879 | 0.000681 |
32322 | Jennnifer | F | 6 | 1973 | 1986 | 0.002365 | 0.000479 |
33481 | Jeneffer | F | 6 | 1972 | 1994 | 0.002359 | 0.000685 |
33602 | Jenniver | F | 6 | 1969 | 1987 | 0.002178 | 0.000422 |
86665 | Jenifer | M | 4 | 1964 | 1989 | 0.001144 | 0.000325 |
41247 | Jenneffer | F | 3 | 1983 | 1987 | 0.000879 | 0.000299 |
56079 | Jenniferr | F | 1 | 1973 | 1973 | 0.000480 | 0.000480 |
52198 | Jeenifer | F | 1 | 1973 | 1973 | 0.000343 | 0.000343 |
61704 | Jhenifer | F | 1 | 2004 | 2004 | 0.000273 | 0.000273 |
# x: names matching the Jennifer-like spelling pattern, largest pct_sum first.
# y: the sex recorded for each of those rows, in the same order.
x, y = (
    list(names[names['name'].str.contains("^J.+n+.(f|v|ph).+r$")]
         .sort('pct_sum', ascending=False)[column])
    for column in ('name', 'sex')
)