This notebook is based on the work by Randall Munroe (xkcd). The link to the original blog post is here: http://blog.xkcd.com/2012/07/12/a-morbid-python-script/. The script that Randall wrote is inserted below for reference.
First, let's start by taking a look at the data that Randall used and compare it to data from SSA.GOV found on the internet [link needed!].
# XKCD data
data_xkcd = array([[0.00756, 0.00052, 0.00035, 0.00025, 0.00020, 0.00018, 0.00017, 0.00016, 0.00014, 0.00011, 0.00009, 0.00010, 0.00015, 0.00027, 0.00043, 0.00061, 0.00078, 0.00094, 0.00107, 0.00119, 0.00131, 0.00142, 0.00149, 0.00151, 0.00148, 0.00143, 0.00140, 0.00138, 0.00137, 0.00139, 0.00141, 0.00143, 0.00147, 0.00152, 0.00158, 0.00165, 0.00174, 0.00186, 0.00202, 0.00221, 0.00243, 0.00267, 0.00291, 0.00317, 0.00344, 0.00373, 0.00405, 0.00441, 0.00480, 0.00524, 0.00573, 0.00623, 0.00671, 0.00714, 0.00756, 0.00800, 0.00853, 0.00917, 0.00995, 0.01086, 0.01190, 0.01301, 0.01413, 0.01522, 0.01635, 0.01760, 0.01906, 0.02073, 0.02265, 0.02482, 0.02729, 0.03001, 0.03289, 0.03592, 0.03918, 0.04292, 0.04715, 0.05173, 0.05665, 0.06206, 0.06821, 0.07522, 0.08302, 0.09163, 0.10119, 0.11183, 0.12367, 0.13679, 0.15124, 0.16702, 0.18414, 0.20255, 0.22224, 0.24314, 0.26520, 0.28709, 0.30846, 0.32891, 0.34803, 0.36544, 0.38371, 0.40289, 0.42304, 0.44419, 0.46640, 0.48972, 0.51421, 0.53992, 0.56691, 0.59526, 0.62502, 0.65628, 0.68909, 0.72354, 0.75972, 0.79771, 0.83759, 0.87947, 0.92345, 0.96962], [0.00615, 0.00041, 0.00025, 0.00018, 0.00015, 0.00014, 0.00014, 0.00013, 0.00012, 0.00011, 0.00010, 0.00010, 0.00012, 0.00016, 0.00021, 0.00028, 0.00034, 0.00039, 0.00042, 0.00043, 0.00045, 0.00047, 0.00048, 0.00049, 0.00050, 0.00051, 0.00052, 0.00053, 0.00056, 0.00059, 0.00063, 0.00068, 0.00073, 0.00078, 0.00084, 0.00091, 0.00098, 0.00108, 0.00118, 0.00130, 0.00144, 0.00158, 0.00173, 0.00189, 0.00206, 0.00225, 0.00244, 0.00264, 0.00285, 0.00306, 0.00329, 0.00355, 0.00382, 0.00409, 0.00437, 0.00468, 0.00505, 0.00549, 0.00603, 0.00665, 0.00736, 0.00813, 0.00890, 0.00967, 0.01047, 0.01136, 0.01239, 0.01357, 0.01491, 0.01641, 0.01816, 0.02008, 0.02210, 0.02418, 0.02641, 0.02902, 0.03206, 0.03538, 0.03899, 0.04301, 0.04766, 0.05307, 0.05922, 0.06618, 0.07403, 0.08285, 0.09270, 0.10365, 0.11574, 0.12899, 0.14343, 0.15907, 0.17591, 0.19393, 0.21312, 0.23254, 0.25193, 0.27097, 0.28933, 0.30670, 0.32510, 0.34460, 0.36528, 0.38720, 0.41043, 0.43505, 0.46116, 0.48883, 0.51816, 0.54925, 0.58220, 0.61714, 0.65416, 0.69341, 0.73502, 0.77912, 0.82587, 0.87542, 0.92345, 0.96962]]).T
# ssa data
data_ssa = loadtxt("prob_from_SSA.GOV.csv", delimiter=';')[:, [1, 4]]
figure(figsize=(10, 6))
plot(data_xkcd[:, 0], label='xkcd - male data')
plot(data_xkcd[:, 0], label='xkcd - female data')
plot(data_ssa[:, 0], label='ssa - male data')
plot(data_ssa[:, 1], label='ssa - female data')
legend(loc=2)
xlabel('Age')
ylabel('Probability of dying within a year at that age')
grid(True)
The idea behind the function below is based on what Randall did. Unless what he did, I did not include the following things:
def death_probability(sex, age, years, dataset):
if sex == 'm':
data = dataset[:, 0]
elif sex == 'f':
data = dataset[:, 1]
survival_probability = 1.0
for year in arange(age + 1, age + years - (years % 1)):
try:
survival_probability *= (1 - data[year])
except IndexError:
survival_probability *= (1 - data[-1])
if years % 1:
fraction = years % 1
try:
last_year_die = data[year]
except IndexError:
last_year_die = data[-1]
last_year_live = (1 - last_year_die) ** (fraction)
survival_probability *= last_year_live
return 1 - survival_probability
Some tests below, to make sure it works.
death_probability('m', 24, 30, data_xkcd)
0.08304389875929985
death_probability('f', 24, 30, data_xkcd)
0.047407458185668272
death_probability('m', 24, 799, data_xkcd)
1.0
print death_probability('m', 24.2, 15, data_xkcd)
print death_probability('m', 24, 15, data_xkcd)
0.0235966098158 0.0214339789092
Testing the interpolation scheme.
print death_probability('m', 24, 10, data_xkcd)
print death_probability('m', 24, 10.9999999, data_xkcd)
print death_probability('m', 24, 11, data_xkcd)
0.0127274323426 0.0142280864955 0.0142873229995
def calculate_expected_value(sex, age, target_prob, dataset):
assert(target_prob > 0 and target_prob < 1)
year = 0
for interval in [10, 1, 0.1, 0.01]:
prob = 0
while prob < target_prob:
year += interval
prob = death_probability(sex, age, year, dataset)
year -= interval
return year
calculate_expected_value('m', 26, .5, data_xkcd)
53.1
sex = 'm'
age = 26
data = data_xkcd
def plot_death_prob(age, sex, data, annotation_xy=(.25, .5), plot_age_offset=True):
""" plots death prob for: age, sex, data.
Optional arguments are 'annotation_xy': position of the life expectancy label on the plot
and 'plot_age_offset': adds an offset to the x axis corresponding to the current age"""
if plot_age_offset:
offset = age
else:
offset = 0
# calculating the data
years = arange(200)
probs = array([death_probability(sex, age, year, data) for year in years])
upper_limit = (probs > 0.99).nonzero()[0][0]
# plotting
plot(years[:upper_limit] + offset, probs[:upper_limit], label='sex: %s, current age: %s' % (sex, age))
title('probability of dying')
xlabel('years')
ylabel('probability')
legend(loc = 2)
grid(True)
# annotation
ind = (probs > 0.5).nonzero()[0][0]
mean = (probs[ind] * years[ind] + probs[ind - 1] * years[ind - 1])/(probs[ind] + probs[ind - 1])
annotate('life expectancy: %.2f years' % (mean + offset),
(mean + offset, 0.5),
xytext=annotation_xy, textcoords='axes fraction',
arrowprops=dict(arrowstyle="->", connectionstyle="arc3"))
figure(figsize=(10, 6))
plot_death_prob(age, sex, data)
figure(figsize=(10, 6))
plot_death_prob(26, 'm', data_xkcd, (.25, .5), False)
plot_death_prob(31, 'f', data_xkcd, (.65, .3), False)
The purpose of this paragraph is to define a function that calculates the age of the other person in the couple so that the expected life duration is the same.
def calculate_right_age(sex, age, dataset):
target_life_expectancy = calculate_expected_value(sex, age, 0.5, dataset)
if sex == 'm':
sex = 'f'
else:
sex = 'm'
age = 1
for interval in [10, 1, 0.1, 0.01]:
life_expectancy = calculate_expected_value(sex, age, 0.5, dataset)
while life_expectancy > target_life_expectancy:
age += interval
life_expectancy = calculate_expected_value(sex, age, 0.5, dataset)
age -= interval
return age
life_expectancy = calculate_expected_value('f', 1, 0.5, dataset)
life_expectancy
82.55
sex = 'm'
age = 26
dataset = data_xkcd
calculate_right_age(sex, age, dataset)
30.01
Let's plot the variation of the perfect age as a function of the age.
male_ages = arange(10, 80)
girl_ages = [calculate_right_age('m', age, data_xkcd) for age in male_ages]
plot(male_ages, girl_ages - male_ages)
title('Age difference necessary for couple with similar life expectancy')
xlabel('Age of male')
ylabel('Difference of age of female')
ylim((0, 6))
grid(True)
bfigure(figsize=(10, 6))
plot_death_prob(58, 'm', data_xkcd, (.25, .7), False)
plot_death_prob(58, 'f', data_xkcd, (.65, .3), False)