# --- Notebook configuration & setup (part 1) ---
last_year = 2013 #change this when Social Security database is updated
sex = 'M' # change this to 'F' to do same analysis on girls' names
save_path = "user_last_letters_" + sex # files created by this notebook will be saved in this directory
import os
if not os.path.isdir(save_path): # creates path if it does not exist
os.makedirs(save_path)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# IPython magic: executes the data-loading script in this namespace.
# It defines (at least) the 'yob' DataFrame used below — presumably also
# 'names' and 'years' per the printed tails; confirm in download_and_process.py.
%run download_and_process.py
Data already downloaded. Data already extracted. Reading from pickle. Tail of dataframe 'yob': name sex births year pct ranked 1792086 Zyhier M 5 2013 0.000267 12995 1792087 Zylar M 5 2013 0.000267 12995 1792088 Zymari M 5 2013 0.000267 12995 1792089 Zymeer M 5 2013 0.000267 12995 1792090 Zyree M 5 2013 0.000267 12995 Tail of dataframe 'names': name sex year_count year_min year_max pct_sum pct_max 102685 Gross M 1 1925 1925 0.000538 0.000538 102686 Elik M 1 2012 2012 0.000318 0.000318 102687 Patrickjoseph M 1 1998 1998 0.000262 0.000262 102688 Southern M 1 1923 1923 0.000547 0.000547 102689 Jeon M 1 1999 1999 0.000261 0.000261 Tail of dataframe 'years': year births_f births_m births_t new_names unique_names 128 2008 1886765 2035811 3922576 2046 32483 129 2009 1832276 1978582 3810858 1789 32210 130 2010 1771846 1912915 3684761 1635 31593 131 2011 1752198 1891800 3643998 1539 31412 132 2012 1751866 1886972 3638838 1531 31212
# create df_half with only one sex
# aggregate percentages of last letter into df_last
df_half = yob[yob.sex == sex]
df_half = df_half[['name', 'year', 'pct']]
df_half.name = df_half.name.str.lower() #just in case any name ends in a capital letter
pieces = []
for yr in range(1880, last_year + 1):
loopdf = df_half[df_half.year == yr]
last_letter = []
letter_count = []
for idx, row in loopdf.iterrows():
current_name = row['name']
current_length = len(current_name)
current_pct = row['pct']
last_letter.append(current_name[current_length-1])
letter_count.append(current_pct)
lettersdf = pd.DataFrame(last_letter)
lettersdf.rename(columns = {0: 'letter'}, inplace=True)
countsdf = pd.DataFrame(letter_count)
countsdf.rename(columns = {0: 'pct'}, inplace=True)
frame = lettersdf.join(countsdf)
frame['year'] = yr
pieces.append(frame)
pieces_concat = pd.concat(pieces, ignore_index=True)
df_last = pd.DataFrame(pieces_concat.groupby(['letter', 'year']).pct.sum())
df_last.reset_index(inplace=True, drop=False)
print df_last.tail()
letter year pct 3315 z 2009 0.184172 3316 z 2010 0.182653 3317 z 2011 0.186965 3318 z 2012 0.190464 3319 z 2013 0.192950
#df_last_max shows most popular letter and its percentage every year
df_last_max = df_last.groupby('year').apply(lambda t: t[t.pct==t.pct.max()])
df_last_max.reset_index(inplace=True, drop=True)
max_overall = df_last_max.pct.max()
print "Tail of 'df_last_max':"
print df_last_max.tail()
print "\nLast letters that were most popular in any given year:",
last_list = list(df_last_max.letter.unique())
for ltr in last_list:
print ltr,
print "\nMaximum overall popularity of a last letter: %0.2f%%" % max_overall
Tail of 'df_last_max': letter year pct 129 n 2009 36.245958 130 n 2010 36.248030 131 n 2011 36.418279 132 n 2012 36.032649 133 n 2013 35.469127 Last letters that were most popular in any given year: s n e d y Maximum overall popularity of a last letter: 36.42%
# Grid of 26 subplots (one per letter) of last-letter frequency by year.
# In this version each panel auto-scales its own y axis; the next cell
# draws the same grid with a shared y scale.
os.chdir(save_path)
alphabet = 'abcdefghijklmnopqrstuvwxyz'
fig = plt.figure(figsize=(12, 7), dpi=150, facecolor='w', edgecolor='k')
fig.subplots_adjust(hspace=.4)
for i, ltr in enumerate(alphabet):
    subdf = df_last[df_last.letter == ltr]
    plt.subplot(5, 6, i + 1)
    plt.xlabel('')
    plt.xlim(1880, last_year)
    # panels are too small for readable ticks, so hide them entirely
    plt.tick_params(axis='both', labelsize=0, length=0, width=0, color='#ffffff')
    plt.ylabel("")
    plt.title(ltr, size=11)
    plt.plot(list(subdf.year), list(subdf.pct), color='black')
# Save once, after all 26 panels are drawn; the original called savefig
# inside the loop, rewriting the png 26 times. The unused alphalist /
# curcol / currow variables were dropped (currow was computed with py2
# integer division and never read).
plt.savefig("grid_lastletters_unscaled.png")
plt.show()
os.chdir("../")
# Same grid as above, but every panel shares the same y scale
# (0 .. max_overall, the highest single-year popularity of any letter),
# so panels are directly comparable.
os.chdir(save_path)
alphabet = 'abcdefghijklmnopqrstuvwxyz'
fig = plt.figure(figsize=(12, 7), dpi=150, facecolor='w', edgecolor='k')
fig.subplots_adjust(hspace=.4)
for i, ltr in enumerate(alphabet):
    subdf = df_last[df_last.letter == ltr]
    plt.subplot(5, 6, i + 1)
    plt.xlabel('')
    plt.xlim(1880, last_year)
    plt.ylim(0, max_overall)  # shared scale across all panels
    # panels are too small for readable ticks, so hide them entirely
    plt.tick_params(axis='both', labelsize=0, length=0, width=0, color='#ffffff')
    plt.ylabel("")
    plt.title(ltr, size=11)
    plt.plot(list(subdf.year), list(subdf.pct), color='black')
# Save once after the loop; the original re-saved the file on every
# iteration. Unused alphalist / curcol / currow dropped as in the cell above.
plt.savefig("grid_lastletters_scaled.png")
plt.show()
os.chdir("../")
# Helper lookups for the per-year histograms below.

# last_by_year: year -> most common last letter that year
# (when a year has ties, .iloc[0] keeps the first row, as before)
last_by_year = {}
for year in range(1880, last_year + 1):
    last_by_year[year] = df_last_max[df_last_max.year == year].letter.iloc[0]

# last_max_dict: letter -> all-time maximum yearly percentage.
# I used this in my blog to add fills to the by-letter plots above;
# fills can be done in matplotlib, of course — this was more expedient.
alphabet = 'abcdefghijklmnopqrstuvwxyz'
# NOTE(review): rows=/cols= is the pre-0.14 pandas pivot_table API
# (index=/columns= in newer pandas) — kept as-is for this environment.
temppivot = pd.pivot_table(df_last, values='pct', rows='year', cols='letter')
temppivot = temppivot.fillna(value=0)
last_max_dict = {ltr: temppivot[ltr].max() for ltr in alphabet}

# dflastpivott: transposed pivot (letter rows x year columns) of percent
# values; letters absent from the database in some years come back as
# NaN and are replaced with zeros.
dflastpivott = pd.pivot_table(df_last, values='pct', rows='letter', cols='year')
dflastpivott = dflastpivott.fillna(value=0)

# y_dict: year -> y position for the histogram letter label.
# Start from the winner's pct rounded up to the next even number; while
# the SAME letter stays on top in consecutive years, the label is only
# allowed to move up, never down (a ratchet), so it doesn't jitter.
for year in range(1880, last_year + 1):
    if year == 1880:
        y_dict = {}
    winner_pct = df_last_max[df_last_max.year == year].pct.iloc[0]
    ycalc = (round(winner_pct / 2, 0) + 1) * 2
    if year > 1880 and last_by_year[year] == last_by_year[year - 1]:
        ycalc = max(ycalc, last_ycalc)  # ratchet while the top letter is unchanged
    y_dict[year] = ycalc
    last_ycalc = ycalc
# Create histograms of the last-letter distribution, one per selected year.
save_histograms = False  # change to True to save each histogram as a png graphic
use_full_range = False   # change to True to get histograms of the entire dataset;
                         # kept False so the GitHub/nbviewer version is not too huge
if use_full_range:
    start_year = 1880
    end_year = last_year
    skip_year = 1   # these values will plot every year
else:
    start_year = 1943  # change if desired
    end_year = 2013    # change if desired
    skip_year = 10     # at 10, histograms will be made once every decade
os.chdir(save_path)
alphabet = 'abcdefghijklmnopqrstuvwxyz'
alphadict = {}
for i in range(26):
    alphadict[alphabet[i]] = i  # letter -> bar index
# Make the chart title follow the configured sex: the original hard-coded
# "boys'" even when sex was set to 'F' (inconsistent with the comment at
# the top of the notebook). Output is unchanged for the default sex='M'.
sex_word = "boys" if sex == 'M' else "girls"
for year in range(start_year, end_year + 1, skip_year):
    percentlist = list(dflastpivott[year])
    maxlet = last_by_year[year]  # most common last letter this year
    width = 0.9  # the width of the bars (the unused np.arange(26) 'ind'
                 # array from the original was dropped)
    fig = plt.figure(figsize=(10, 6), dpi=150, facecolor='w', edgecolor='k')
    ax = plt.subplot(111)
    barlist = plt.bar(range(26), percentlist, width=width, color='#aa4444')
    barlist[alphadict[maxlet]].set_color('#000088')  # highlight the winner
    ax.set_xticks(np.arange(26) + width / 2)
    ax.set_xticklabels(tuple(alphabet))  # same labels as the explicit a..z tuple
    ax.set_ylabel('% of names ending with letter', size=14)
    ax.set_title("Distribution of last letter in newborn %s' names" % sex_word, size=20, color="#222222")
    plt.annotate(year, xy=(.98, .96), xycoords='axes fraction', size=32, color='#aa4444', horizontalalignment='right', verticalalignment='top')
    plt.annotate("Source: U.S. Social Security Administration", xy=(0.05, 0.03), xycoords='figure fraction', size=10, horizontalalignment='left', verticalalignment='bottom')
    plt.annotate("prooffreader.com", xy=(0.95, 0.03), xycoords='figure fraction', size=13, horizontalalignment='right', verticalalignment='bottom')
    # letter label placed just above its bar, at the precomputed y position
    plt.annotate(maxlet, xy=(alphadict[maxlet] + 0.45, y_dict[year] - 0.5), xycoords='data', size='19', color="#000088", horizontalalignment='center', verticalalignment='bottom')
    plt.ylim(0, 40)
    plt.xlim(0, 26)
    ax.xaxis.set_tick_params(width=0)
    if save_histograms:
        plt.savefig("histogram_last_letter_%s_%d.png" % (sex, year))
    plt.show()
    plt.close()
os.chdir("../")
Nothing in this part depends on part one; the notebook can be restarted from here.
# --- Notebook configuration & setup (part 2) ---
# NOTE(review): save_path here is "last_letters_<sex>", while part 1 used
# "user_last_letters_<sex>" — confirm the difference is intentional.
last_year = 2013 #change this when Social Security database is updated
sex = 'M' # change this to 'F' to do same analysis on girls' names
save_path = "last_letters_" + sex # files created by this notebook will be saved in this directory
import os
if not os.path.isdir(save_path): # creates path if it does not exist
os.makedirs(save_path)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# IPython magic: reloads the 'yob' DataFrame (and friends) so this part
# can run independently of part 1.
%run download_and_process.py
Data already downloaded. Data already extracted. Reading from pickle. Tail of dataframe 'yob': name sex births year pct ranked 1792086 Zyhier M 5 2013 0.000267 12995 1792087 Zylar M 5 2013 0.000267 12995 1792088 Zymari M 5 2013 0.000267 12995 1792089 Zymeer M 5 2013 0.000267 12995 1792090 Zyree M 5 2013 0.000267 12995 Tail of dataframe 'names': name sex year_count year_min year_max pct_sum pct_max 102685 Gross M 1 1925 1925 0.000538 0.000538 102686 Elik M 1 2012 2012 0.000318 0.000318 102687 Patrickjoseph M 1 1998 1998 0.000262 0.000262 102688 Southern M 1 1923 1923 0.000547 0.000547 102689 Jeon M 1 1999 1999 0.000261 0.000261 Tail of dataframe 'years': year births_f births_m births_t new_names unique_names 128 2008 1886765 2035811 3922576 2046 32483 129 2009 1832276 1978582 3810858 1789 32210 130 2010 1771846 1912915 3684761 1635 31593 131 2011 1752198 1891800 3643998 1539 31412 132 2012 1751866 1886972 3638838 1531 31212
# create df_quint dataframe
df_quint = yob[yob.sex == 'M']
df_quint.sort(columns=['year', 'births'], ascending=[True, False], inplace=True)
df_quint['cumul_sum'] = df_quint.groupby('year').pct.cumsum()
df_quint['quintile'] = 0
df_quint['endsn'] = 0
for idx, row in df_quint.iterrows():
if df_quint['name'][idx][-1] == 'n': df_quint['endsn'][idx] = 1
df_quint['quintile'][idx] = (df_quint['cumul_sum'][idx] / 20) + 1
if df_quint['quintile'][idx] == 6: df_quint['quintile'][idx] = 5 # quick inelegant fix to fencepost error
# note that if a name straddles a quintile, it is put in the lower quintile
# one could distribute such names between quintiles, but doing so does not add anything substantive to the analysis
print "Tail of df_quint:"
print df_quint.tail()
Tail of df_quint: name sex births year pct ranked cumul_sum quintile endsn 1792086 Zyhier M 5 2013 0.000267 12995 99.998931 5 0 1792087 Zylar M 5 2013 0.000267 12995 99.999198 5 0 1792088 Zymari M 5 2013 0.000267 12995 99.999466 5 0 1792089 Zymeer M 5 2013 0.000267 12995 99.999733 5 0 1792090 Zyree M 5 2013 0.000267 12995 100.000000 5 0
# Build the lookup dicts used by the quintile graphs:
#   by_year_n[(year, q)]     - births with names ending in 'n' in that year/quintile
#   by_year_notn[(year, q)]  - births with names NOT ending in 'n'
#   by_year_propn[(year, q)] - proportion of births ending in 'n'
#   overall_n / overall_notn / overall_propn - same, per year over all quintiles
by_year_n = {}
by_year_notn = {}
by_year_propn = {}
for i in range(1880, last_year + 1):
    for q in range(1, 6):
        by_year_n[(i, q)] = 0
        by_year_notn[(i, q)] = 0
# A single groupby sum replaces the original full iterrows pass over
# ~1.8M rows (same totals, vastly faster).
quint_births = df_quint.groupby(['year', 'quintile', 'endsn']).births.sum()
for (yr, q, ends_n), total in quint_births.iteritems():
    if ends_n == 1:
        by_year_n[(yr, q)] += total
    else:
        by_year_notn[(yr, q)] += total
for i in range(1880, last_year + 1):
    for q in range(1, 6):
        by_year_propn[(i, q)] = 1.0 * by_year_n[(i, q)] / (by_year_n[(i, q)] + by_year_notn[(i, q)])
overall_n = {}
overall_notn = {}
for i in range(1880, last_year + 1):
    overall_n[i] = 0
    overall_notn[i] = 0
# Second groupby replaces the second iterrows pass of the original.
year_births = df_quint.groupby(['year', 'endsn']).births.sum()
for (yr, ends_n), total in year_births.iteritems():
    if ends_n == 1:
        overall_n[yr] += total
    else:
        overall_notn[yr] += total
overall_propn = {}
for i in range(1880, last_year + 1):
    overall_propn[i] = 1.0 * overall_n[i] / (overall_n[i] + overall_notn[i])
# import seaborn # Uncomment if you have seaborn installed and want nicer-looking graphs
# Note that much of the graphics processing for the graphs shown on
# prooffreader.com was done after the fact in Photoshop.
# The top five panels are the five popularity quintiles and the bottom
# panel is the overall frequency of names ending in 'n'.
os.chdir(save_path)
fig = plt.figure(figsize=(12, 12), dpi=150, facecolor='w', edgecolor='k')
#fig.subplots_adjust(hspace=.4)
for q in range(1, 6):
    listx = range(1880, last_year + 1)
    # comprehension replaces the original append loop — same series
    listy = [by_year_propn[(yr, q)] for yr in listx]
    plt.subplot(6, 1, q)
    plt.xlabel('')
    plt.xlim(1880, last_year)
    plt.ylim(0, 1)
    plt.ylabel("")
    plt.title(' ', size=11)
    plt.plot(listx, listy, color='black')
# bottom panel: overall proportion across all quintiles
listx = range(1880, last_year + 1)
listy = [overall_propn[yr] for yr in listx]
plt.subplot(6, 1, 6)
plt.xlabel('')
plt.xlim(1880, last_year)
plt.ylim(0, 1)
plt.ylabel("")
plt.title(' ', size=11)
plt.plot(listx, listy, color='black')
plt.savefig("quintiles_n_" + sex + ".png")
plt.show()
os.chdir("../")
# Persist df_quint inside save_path; the original rebuilt the
# "last_letters_<sex>" path by hand, duplicating the save_path literal.
df_quint.to_pickle(os.path.join(save_path, "df_quint_" + sex + ".pickle"))