This should be done only for 2013.
last_year = 2013 #change this when Social Security database is updated
save_path = "user_charts" # files created by this notebook will be saved in this directory
import time
import os
if not os.path.isdir(save_path): # creates path if it does not exist
os.makedirs(save_path)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#import seaborn # comment out if you don't have it, but it makes good-looking charts
%run download_and_process.py
# used to round limit of y axis up to second-most-significant digit
def determine_y_limit(x):
    """Return x rounded up at its second-most-significant digit.

    The value is truncated to two significant digits and then increased by
    one unit of the second digit, so a value that already sits on such a
    boundary is still moved up one step (1234 -> 1300, 1300 -> 1400).

    x must be a positive number. ValueError is raised otherwise, because
    log10 is undefined for zero and negative values.
    """
    # Local import so the function does not depend on bare floor/log10
    # names happening to exist in the surrounding namespace.
    import math

    if x <= 0:
        raise ValueError("determine_y_limit requires a positive value, got %r" % (x,))
    # Exponent of the leading digit, e.g. 3 for 1234.
    significance = int(math.floor(math.log10(x)))
    # Size of one unit of the second-most-significant digit, e.g. 100 for 1234.
    step = 10 ** (significance - 1)
    return (math.floor(x / step) + 1) * step
Data already downloaded. Data already extracted. Reading from pickle. Tail of dataframe 'yob': name sex births year pct ranked 1792086 Zyhier M 5 2013 0.000267 12995 1792087 Zylar M 5 2013 0.000267 12995 1792088 Zymari M 5 2013 0.000267 12995 1792089 Zymeer M 5 2013 0.000267 12995 1792090 Zyree M 5 2013 0.000267 12995 Tail of dataframe 'names': name sex year_count year_min year_max pct_sum pct_max 102685 Gross M 1 1925 1925 0.000538 0.000538 102686 Elik M 1 2012 2012 0.000318 0.000318 102687 Patrickjoseph M 1 1998 1998 0.000262 0.000262 102688 Southern M 1 1923 1923 0.000547 0.000547 102689 Jeon M 1 1999 1999 0.000261 0.000261 Tail of dataframe 'years': year births_f births_m births_t new_names unique_names 128 2008 1886765 2035811 3922576 2046 32483 129 2009 1832276 1978582 3810858 1789 32210 130 2010 1771846 1912915 3684761 1635 31593 131 2011 1752198 1891800 3643998 1539 31412 132 2012 1751866 1886972 3638838 1531 31212
# Group yob dataframe to just have birth counts; separate into M and F sexes.
# Each result has one row per name (the index) and a single 'births' column
# holding that name's total over all rows of yob for that sex.
# 'births' is selected before aggregating so the other columns of yob are
# not summed only to be thrown away.
dfm = yob[yob.sex == 'M'].groupby('name')[['births']].sum()
dff = yob[yob.sex == 'F'].groupby('name')[['births']].sum()
# Grouping dropped the sex column; restore it as a constant per frame.
dfm['sex'] = 'M'
dff['sex'] = 'F'
# Count how often each letter and each adjacent-letter pair (bigram) occurs
# in names, weighting every name by its total births, separately for the
# female (dff) and male (dfm) frames, then convert the counts to proportions.
import time
start = time.time()
alphabet = 'abcdefghijklmnopqrstuvwxyz'


def _empty_counts():
    """Return (letters, bigrams) dicts with every possible key preset to 0."""
    letters = {letter: 0 for letter in alphabet}
    bigrams = {first + second: 0 for first in alphabet for second in alphabet}
    return letters, bigrams


def _tally(births_by_name, letters, bigrams):
    """Add each name's birth count to every letter and bigram it contains.

    births_by_name is a frame indexed by name with a 'births' column.
    A character outside `alphabet` raises KeyError, as the dicts only hold
    a-z keys.
    """
    for raw_name, count in zip(births_by_name.index, births_by_name.births):
        name = raw_name.lower()
        for char in name:
            letters[char] += count
        # Pair every character with its successor to walk the bigrams.
        for first, second in zip(name, name[1:]):
            bigrams[first + second] += count


def _normalize(counts, total):
    """Replace every count in `counts` (in place) by its share of `total`."""
    for key in counts:
        counts[key] = counts[key] * 1.0 / total


# Create zeroed dicts to hold frequencies, then iterate over dfs to fill them
lettersf, bigramsf = _empty_counts()
lettersm, bigramsm = _empty_counts()
_tally(dff, lettersf, bigramsf)
_tally(dfm, lettersm, bigramsm)

# count totals
letsumf = sum(lettersf.values())
letsumm = sum(lettersm.values())
bigrsumf = sum(bigramsf.values())
bigrsumm = sum(bigramsm.values())

# recalculate dicts as proportions
_normalize(lettersf, letsumf)
_normalize(lettersm, letsumm)
_normalize(bigramsf, bigrsumf)
# Male bigrams use the male counts and the male total; the earlier version
# derived them from the (already normalized) female values and female total.
_normalize(bigramsm, bigrsumm)

# Parenthesized single-argument form prints the same on Python 2 and 3.
print("%0.1f seconds elapsed." % (time.time() - start))
4.7 seconds elapsed.
# make dataframe of results: for every letter one row per sex with an empty
# 'secondlet' (single-letter share), followed by one row per sex for each
# bigram starting with that letter.
# Rows are collected in a list and converted once; appending one-row frames
# in a loop copies the whole frame on every iteration.
rows = []
for letter in alphabet:
    rows.append((letter, lettersf[letter], '', 'F'))
    rows.append((letter, lettersm[letter], '', 'M'))
    for second_letter in alphabet:
        bigram = letter + second_letter
        rows.append((letter, bigramsf[bigram], second_letter, 'F'))
        rows.append((letter, bigramsm[bigram], second_letter, 'M'))

# `alphabet` is ascending and '' sorts before every letter, so the rows are
# already ordered by (firstlet, secondlet) with a fresh 0..n-1 index; no
# explicit sort or index reset is needed.
# NOTE(review): the column order is the alphabetical one that building from a
# dict gives on pandas versions old enough to have DataFrame.sort - confirm
# nothing downstream relies on a different order.
df = pd.DataFrame(rows, columns=['firstlet', 'pct', 'secondlet', 'sex'])

# Join with a path separator so the pickle lands inside save_path; plain
# string concatenation wrote 'user_chartsbaby_name_...' beside the directory.
df.to_pickle(os.path.join(save_path, 'baby_name_letter_bigram_freqs.pickle'))