# Notebook script: finds names that appear in exactly one year of the
# Social Security baby-name data, looks up how many births each had, plots a
# histogram of the years in which they occur, and starts computing per-year
# totals of those births.
#
# NOTE: this code must run inside IPython/Jupyter (it uses line magics via
# get_ipython()).  The data frames `names` and `yob` are not defined here;
# presumably they are created by download_and_process.py -- confirm there.

last_year = 2013  # change this when Social Security database is updated
save_path = "user_singletons"  # files created by this notebook will be saved in this directory

import math
import time
import os

if not os.path.isdir(save_path):  # creates path if it does not exist
    os.makedirs(save_path)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# import seaborn  # comment out if you don't have it, but it makes good-looking charts

# Equivalent of the notebook magic `%run download_and_process.py`, written in
# the form nbconvert exports so the file is syntactically valid Python.
get_ipython().run_line_magic("run", "download_and_process.py")


def determine_y_limit(x):
    """Round *x* up for use as a y-axis limit.

    The result is the next multiple of ``10 ** (significance - 1)`` strictly
    above ``x``, where ``significance`` is ``floor(log10(x))`` -- i.e. the
    value is bumped at its second-most-significant digit
    (1234 -> 1300, 1300 -> 1400).

    ``x`` must be positive; ``math.log10`` raises ``ValueError`` otherwise.
    """
    # math.* is used explicitly: the original relied on bare `floor`/`log10`
    # being present in the namespace, which nothing visible here guarantees.
    significance = int(math.floor(math.log10(x)))
    step = 10 ** (significance - 1)
    return (math.floor(x / step) + 1) * step


# Names whose first and last recorded year coincide, i.e. single-year names.
# .copy() makes this an independent frame so the column edits below do not
# act on a view of `names`.
df_oneyear = names.loc[
    names.year_min == names.year_max, ['name', 'sex', 'year_min', 'pct_max']
].copy()
df_oneyear.columns = ['name', 'sex', 'year', 'pct']

oneyearnames = list(df_oneyear.name.unique())
yobcopy = yob.copy()
yobcopy = yobcopy[yobcopy.name.isin(oneyearnames)]

# Attach the birth count of the first matching (name, sex) row in yobcopy.
# This replaces a per-row loop that wrote through chained indexing
# (`df_oneyear.births.iloc[i] = ...`), which is quadratic and does not modify
# the frame under pandas copy-on-write.
first_births = (
    yobcopy.drop_duplicates(subset=['name', 'sex'])
    .set_index(['name', 'sex'])['births']
)
pair_index = pd.MultiIndex.from_arrays([df_oneyear['name'], df_oneyear['sex']])
unmatched = ~pair_index.isin(first_births.index)
if unmatched.any():
    # The original loop failed with IndexError on the first unmatched pair;
    # keep failing loudly rather than silently storing NaN.
    raise IndexError(
        "no births record for %d (name, sex) pair(s)" % int(unmatched.sum())
    )
df_oneyear['births'] = first_births.reindex(pair_index).values

# Round-trip through a pickle so the result is cached on disk.
pickle_path = os.path.join(save_path, 'df_oneyear.pickle')
df_oneyear.to_pickle(pickle_path)

df_oneyear = pd.read_pickle(pickle_path)
# DataFrame.sort() was removed from pandas; sort_values() is its replacement.
df_oneyear = df_oneyear.sort_values('births', ascending=False)
df_oneyear.head(50)  # displayed only when this is the last expression of a notebook cell

# Equivalent of the notebook magic `%matplotlib inline`.
get_ipython().run_line_magic("matplotlib", "inline")

dictionary = {0: 1000, 1: 20, 2: 15, 3: 0, 4: 5}
xmax = df_oneyear.year.max()

plt.figure()  # makes a new figure and sets it active
# NOTE(review): the second positional argument of plt.hist is the number of
# bins, so this requests `xmax` (the latest year) bins -- confirm intended.
plt.hist(list(df_oneyear.year), xmax)  # plots to the current active axes/figure
plt.title('Histogram of names appearing only once in the Social Security database')
plt.xlabel('Year')
plt.ylabel('Number of names')
plt.show()

# determine what percentage of births were single-appearances for each year
# Per-year sums of single-year-name births: total, then boys only.
df_oygrt = pd.DataFrame(df_oneyear.groupby('year').births.sum())
df_oygrm = pd.DataFrame(df_oneyear[df_oneyear.sex == 'M'].groupby('year').births.sum())
df_oygrf = pd.DataFrame(df_oneyear[df_oneyear.sex == 'F'].groupby('year').births.sum()) df_oygrt['pct'] = 0.0 df_oygrm['pct'] = 0.0 df_oygrf['pct'] = 0.0 for i in range(len(df_oygrt)): df_oygrt.pct.iloc[i] = df_oygrt.births.iloc[i] * 1.0 / df_oygr.head()