last_year = 2013 #change this when Social Security database is updated
save_path = "user_singletons" # files created by this notebook will be saved in this directory
import time
import os

# Create the output directory if it does not exist. The creation is attempted
# directly and an "already there" failure is tolerated, instead of checking
# first: a directory that appears between a check and the call would
# otherwise make os.makedirs raise.
try:
    os.makedirs(save_path)
except OSError:
    # Only swallow the error when the directory really is present; anything
    # else (permissions, a regular file in the way, ...) is still reported.
    if not os.path.isdir(save_path):
        raise
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#import seaborn # comment out if you don't have it, but it makes good-looking charts
%run download_and_process.py
# used to round limit of y axis up to second-most-significant digit
def determine_y_limit(x):
    """Return an axis limit just above *x*, rounded at its second significant digit.

    The value is truncated to two significant digits and then bumped up by one
    unit of the second digit, so the result is always strictly greater than
    *x* (e.g. 123 -> 130, and an exact 2500 still becomes 2600).

    Args:
        x: A positive number, typically the largest value to be plotted.

    Returns:
        The rounded-up limit.

    Raises:
        ValueError: If *x* is not positive (its base-10 logarithm is undefined).
    """
    # Imported here so the function does not rely on `floor` / `log10`
    # having been injected into the namespace by some other script.
    import math

    if x <= 0:
        raise ValueError(
            "determine_y_limit requires a positive value, got %r" % (x,))

    # Size of one unit of the second-most-significant digit of x.
    significance = int(math.floor(math.log10(x)))
    step = 10 ** (significance - 1)
    return (math.floor(x / step) + 1) * step
Data already downloaded. Data already extracted. Reading from pickle. Tail of dataframe 'yob': name sex births year pct ranked 1792086 Zyhier M 5 2013 0.000267 12995 1792087 Zylar M 5 2013 0.000267 12995 1792088 Zymari M 5 2013 0.000267 12995 1792089 Zymeer M 5 2013 0.000267 12995 1792090 Zyree M 5 2013 0.000267 12995 Tail of dataframe 'names': name sex year_count year_min year_max pct_sum pct_max 102685 Gross M 1 1925 1925 0.000538 0.000538 102686 Elik M 1 2012 2012 0.000318 0.000318 102687 Patrickjoseph M 1 1998 1998 0.000262 0.000262 102688 Southern M 1 1923 1923 0.000547 0.000547 102689 Jeon M 1 1999 1999 0.000261 0.000261 Tail of dataframe 'years': year births_f births_m births_t new_names unique_names sexratio 128 2008 1886765 2035811 3922576 2046 32483 107.899553 129 2009 1832276 1978582 3810858 1789 32210 107.984932 130 2010 1771846 1912915 3684761 1635 31593 107.961696 131 2011 1752198 1891800 3643998 1539 31412 107.967250 132 2012 1751866 1886972 3638838 1531 31212 107.712120
# Names (per sex) whose first and last recorded year coincide, i.e. names that
# show up in exactly one year of the data. The explicit .copy() makes this an
# independent frame, so adding columns to it later does not write into (or
# warn about) a slice of `names`.
df_oneyear = names.loc[names.year_min == names.year_max,
                       ['name', 'sex', 'year_min', 'pct_max']].copy()
# With a single year, year_min is "the" year and pct_max is "the" percentage.
df_oneyear.columns = ['name', 'sex', 'year', 'pct']

oneyearnames = list(df_oneyear['name'].unique())

# Yearly rows restricted to those names. Filtering first and copying only the
# result avoids duplicating the whole `yob` frame just to discard most of it.
# The filter is by name only, so rows for the other sex of a name can remain.
yobcopy = yob[yob['name'].isin(oneyearnames)].copy()
# Attach a birth count to every (name, sex) row of df_oneyear.
#
# This is done with a single join rather than by re-filtering `yobcopy` once
# per row: the row-by-row version scans the whole frame for every name and
# assigns through chained indexing (df.births.iloc[i] = ...), which is both
# very slow and not guaranteed to write into df_oneyear at all.
#
# Keeping only the first yobcopy row per (name, sex) reproduces the
# "take the first match" behaviour of the per-row lookup.
first_births = yobcopy.drop_duplicates(['name', 'sex'])[['name', 'sex', 'births']]

# A left join keeps df_oneyear's rows in their existing order, one output row
# per input row, so the result can be assigned back positionally.
matched = df_oneyear[['name', 'sex']].merge(first_births,
                                            on=['name', 'sex'], how='left')

# A missing match would previously have failed on `.iloc[0]`; keep it an error
# instead of silently storing NaN.
if matched['births'].isnull().any():
    raise IndexError("some (name, sex) pairs in df_oneyear have no row in yobcopy")

df_oneyear['births'] = matched['births'].values.astype(yobcopy['births'].dtype)
--------------------------------------------------------------------------- KeyboardInterrupt Traceback (most recent call last) <ipython-input-5-a26a49c576f1> in <module>() 1 df_oneyear['births'] = 0 2 for i in range(len(df_oneyear)): ----> 3 df_oneyear.births.iloc[i] = yobcopy[(yobcopy.name == df_oneyear.name.iloc[i]) & 4 (yobcopy.sex == df_oneyear.sex.iloc[i])].births.iloc[0] C:\Users\David\Anaconda\lib\site-packages\pandas\core\ops.pyc in wrapper(self, other) 553 else: 554 --> 555 mask = isnull(self) 556 557 values = self.values C:\Users\David\Anaconda\lib\site-packages\pandas\core\common.pyc in isnull(obj) 127 pandas.notnull: boolean inverse of pandas.isnull 128 """ --> 129 return _isnull(obj) 130 131 C:\Users\David\Anaconda\lib\site-packages\pandas\core\common.pyc in _isnull_new(obj) 137 raise NotImplementedError("isnull is not defined for MultiIndex") 138 elif isinstance(obj, (ABCSeries, np.ndarray)): --> 139 return _isnull_ndarraylike(obj) 140 elif isinstance(obj, ABCGeneric): 141 return obj._constructor(obj._data.isnull(func=isnull)) C:\Users\David\Anaconda\lib\site-packages\pandas\core\common.pyc in _isnull_ndarraylike(obj) 225 if isinstance(obj, ABCSeries): 226 from pandas import Series --> 227 result = Series(result, index=obj.index, name=obj.name, copy=False) 228 229 return result C:\Users\David\Anaconda\lib\site-packages\pandas\core\series.pyc in __init__(self, data, index, dtype, name, copy, fastpath) 229 raise_cast_failure=True) 230 --> 231 data = SingleBlockManager(data, index, fastpath=True) 232 233 generic.NDFrame.__init__(self, data, fastpath=True) C:\Users\David\Anaconda\lib\site-packages\pandas\core\internals.pyc in __init__(self, block, axis, do_integrity_check, fastpath) 2999 if fastpath: 3000 self.axes = [axis] -> 3001 if isinstance(block, list): 3002 3003 # empty block KeyboardInterrupt:
# Save df_oneyear to a pickle file inside save_path so it can be reloaded
# later without rebuilding it.
df_oneyear.to_pickle(save_path+'/df_oneyear.pickle')
# Load df_oneyear back from that pickle file. This raises an I/O error if the
# file has not been written yet (i.e. the line above has never been run).
df_oneyear = pd.read_pickle(save_path+'/df_oneyear.pickle')
--------------------------------------------------------------------------- IOError Traceback (most recent call last) <ipython-input-3-0fd440f6d226> in <module>() ----> 1 df_oneyear = pd.read_pickle(save_path+'/df_oneyear.pickle') C:\Users\David\Anaconda\lib\site-packages\pandas\io\pickle.pyc in read_pickle(path) 59 60 try: ---> 61 return try_read(path) 62 except: 63 if PY3: C:\Users\David\Anaconda\lib\site-packages\pandas\io\pickle.pyc in try_read(path, encoding) 55 # compat pickle 56 except: ---> 57 with open(path, 'rb') as fh: 58 return pc.load(fh, encoding=encoding, compat=True) 59 IOError: [Errno 2] No such file or directory: 'user_singletons/df_oneyear.pickle'
# Order the single-year names from most to fewest births, in place.
# `DataFrame.sort` has been removed from pandas; `sort_values` is the
# replacement and takes the same column / ascending / inplace arguments.
df_oneyear.sort_values('births', ascending=False, inplace=True)
# Show the 50 single-year names with the most births.
df_oneyear.head(50)
name | sex | year | pct | births |
---|
%matplotlib inline
# NOTE(review): `dictionary` is not used by any code visible here; it is kept
# in case a later cell reads it -- confirm before removing.
dictionary = {0:1000, 1:20, 2:15, 3:0, 4:5}

# Histogram of the year in which each single-year name appeared.
first_year = int(df_oneyear.year.min())
xmax = int(df_oneyear.year.max())

# One bin per calendar year, centred on the year. Passing the largest year
# itself as the bin count would ask for ~2000 bins across a range of only
# (xmax - first_year) years, leaving most bins empty and the bars hair-thin.
year_bins = np.arange(first_year, xmax + 2) - 0.5

plt.figure() # makes a new figure and sets it active
plt.hist(df_oneyear.year.values, bins=year_bins) # plots to the current active axes/figure
plt.title('Histogram of names appearing only once in the Social Security database')
plt.xlabel('Year')
plt.ylabel('Number of names')
plt.show()
# Determine what percentage of births were single-appearance names, per year.
# Births given to single-year names: total, male only and female only,
# each indexed by year.
df_oygrt = pd.DataFrame(df_oneyear.groupby('year').births.sum())
df_oygrm = pd.DataFrame(df_oneyear[df_oneyear.sex == 'M'].groupby('year').births.sum())
df_oygrf = pd.DataFrame(df_oneyear[df_oneyear.sex == 'F'].groupby('year').births.sum())

# NOTE(review): the original statement computing `pct` broke off after the
# division sign, so the denominator below is a reconstruction -- confirm.
# It uses the per-year birth totals in the `years` frame (births_t / births_m
# / births_f) and expresses the share as a percentage (x100), which matches
# the scale of the existing `pct` column in `yob`.
year_totals = years.set_index('year')

# Whole-column division replaces the per-row `.pct.iloc[i] = ...` loop; the
# totals are reindexed to each frame's years so rows line up by year (years
# missing from `years` yield NaN).
df_oygrt['pct'] = df_oygrt['births'] * 100.0 / year_totals['births_t'].reindex(df_oygrt.index)
df_oygrm['pct'] = df_oygrm['births'] * 100.0 / year_totals['births_m'].reindex(df_oygrm.index)
df_oygrf['pct'] = df_oygrf['births'] * 100.0 / year_totals['births_f'].reindex(df_oygrf.index)

# Preview the combined-sex result (the original referred to an undefined
# `df_oygr`).
df_oygrt.head()
births | |
---|---|
year | |
1880 | 5 |
1881 | 14 |
1885 | 15 |
1886 | 5 |
1887 | 5 |