Set working path, and import libraries and read dataframe pickles last_year = 2013 #change this when Social Security database is updated save_path = "user_charts" # files created by this notebook will be saved in this directory import time import os if not os.path.isdir(save_path): # creates path if it does not exist os.makedirs(save_path) import pandas as pd import numpy as np import matplotlib.pyplot as plt #import seaborn # comment out if you don't have it, but it makes good-looking charts %run download_and_process.py # used to round limit of y axis up to second-most-significant digit def determine_y_limit(x): significance = int(floor((log10(x)))) val = floor(x / (10 ** (significance - 1))) + 1 val = val * (10 ** (significance - 1)) return val # Group yob dataframe to just have birth counts; separate into M and F sexes dfm = yob[yob.sex == 'M'] dff = yob[yob.sex == 'F'] dfm = pd.DataFrame(dfm.groupby('name').sum()['births']) dff = pd.DataFrame(dff.groupby('name').sum()['births']) dfm['sex'] = 'M' dff['sex'] = 'F' # Create empty dicts to hold frequencies, then iterate over dfs to fill them import time start = time.time() alphabet = 'abcdefghijklmnopqrstuvwxyz' lettersf = {} lettersm = {} bigramsf = {} bigramsm = {} for letter in alphabet: lettersf[letter] = 0 lettersm[letter] = 0 for second_letter in alphabet: bigram = letter+second_letter bigramsf[bigram] = 0 bigramsm[bigram] = 0 for i in range(len(dff)): name = dff.index[i].lower() count = dff.births.iloc[i] for j in range(len(name)): lettersf[name[j]] += count if j < (len(name) - 1): bigram = name[j]+name[j+1] bigramsf[bigram] += count for i in range(len(dfm)): name = dfm.index[i].lower() count = dfm.births.iloc[i] for j in range(len(name)): lettersm[name[j]] += count if j < (len(name) - 1): bigram = name[j]+name[j+1] bigramsm[bigram] += count # count totals letsumf = 0 letsumm = 0 bigrsumf = 0 bigrsumm = 0 for letter in alphabet: letsumf += lettersf[letter] letsumm += lettersm[letter] for second_letter in alphabet: bigram = letter+second_letter bigrsumf += bigramsf[bigram] bigrsumm += bigramsm[bigram] # recalculate dicts as proportions for letter in alphabet: lettersf[letter] = lettersf[letter] * 1.0 / letsumf lettersm[letter] = lettersm[letter] * 1.0 / letsumm for second_letter in alphabet: bigram = letter+second_letter bigramsf[bigram] = bigramsf[bigram] * 1.0 / bigrsumf bigramsm[bigram] = bigramsf[bigram] * 1.0 / bigrsumf print "%0.1f seconds elapsed." % ((time.time() - start)) # make dataframe of results. df = pd.DataFrame() for letter in alphabet: df = df.append(pd.DataFrame({'firstlet':[letter], 'secondlet':[''], 'sex':['F'], 'pct':[lettersf[letter]]}), ignore_index=True) df = df.append(pd.DataFrame({'firstlet':[letter], 'secondlet':[''], 'sex':['M'], 'pct':[lettersm[letter]]}), ignore_index=True) for second_letter in alphabet: bigram = letter+second_letter df = df.append(pd.DataFrame({'firstlet':[letter], 'secondlet':[second_letter], 'sex':['F'], 'pct':[bigramsf[bigram]]}), ignore_index=True) df = df.append(pd.DataFrame({'firstlet':[letter], 'secondlet':[second_letter], 'sex':['M'], 'pct':[bigramsm[bigram]]}), ignore_index=True) df.sort(['firstlet', 'secondlet'], ascending=True, inplace=True) df.reset_index(drop=True, inplace=True) df.to_pickle(save_path + 'baby_name_letter_bigram_freqs.pickle')