# Standard library: fetching/parsing the csv and running the simulation
import bisect
import csv
import math
import random as rand
import StringIO
import urllib2
from collections import Counter

# Third-party: tabular computations
import pandas as pd
Today we are going to do a rough version of the analysis done in this paper: http://crypto.stanford.edu/~pgolle/papers/census.pdf.
The idea is to show that it is possible to uniquely identify someone with high probability given only that person's (Gender, Birthdate, Zip Code).
It's easy to look up that the population of the US is about 313.9 million people, and that there are approximately 41,750 zip codes in the US.
# Average head count per zip code: US population divided by the number of zip
# codes. Floor division keeps this a whole number of people on every Python
# version, so it can be used directly as a loop count.
number_of_people_per_zipcode = 313900000 // 41750
# number_of_people_per_zipcode
Next, if we assume that the average lifespan in the US is 78.74 years, that there are 365 days in a year, and that each zip code has the same demographics, then we get:
# Number of distinct birthdates spanned by one average lifetime:
# 78.74 years of life times 365 days per (non-leap) year.
average_number_of_distinct_birthdays_per_zip = 78.74 * 365
# average_number_of_distinct_birthdays_per_zip
Finally, if we account for gender, we double the number of "buckets" that people could fall into:
# Gender doubles the number of (birthdate, gender) buckets available per zip.
categories_per_zip = average_number_of_distinct_birthdays_per_zip * 2

# Drop every resident of an average zip code into a uniformly random bucket.
rand.seed()
# randrange(n) draws from exactly n buckets (0 .. n-1); randint(0, n) would
# also return n itself and so simulate one bucket too many.
bucket_count = int(math.ceil(categories_per_zip))
distribution_of_people_in_zip = sorted(
    rand.randrange(bucket_count)
    for _ in range(int(number_of_people_per_zipcode))
)

# A person is uniquely identifiable only when nobody else shares their bucket,
# so count the buckets that hold exactly one person -- not every occupied
# bucket, which would also count buckets shared by several people.
bucket_occupancy = Counter(distribution_of_people_in_zip)
single_buckets = sum(
    1 for occupants in bucket_occupancy.values() if occupants == 1
)

# Share of the zip code's residents who are alone in their bucket, in percent.
# The 100.0 factor forces float arithmetic before the division.
percent_in_single_buckets = 100.0 * single_buckets / number_of_people_per_zipcode
# percent_in_single_buckets
OK, so this is a little crazy, right? Well, the paper referred to at the beginning says that in reality the numbers are much lower, so let's see if we can get a little closer using a better model.
The gender breakdown in the US seems fairly even (https://en.wikipedia.org/wiki/List_of_countries_by_sex_ratio), so we will pretend it is even.
Next let's look at the birthday breakdown using data from http://www.census.gov/population/age/.
I've already put a CSV of the data up on the repo, so we just need to pull it into Python.
# Fetch the age-breakdown CSV over HTTP and parse it into a DataFrame.
ages_url = 'https://raw.githubusercontent.com/ledeprogram/courses/master/platforms/anonymization/age_breakdown.csv'
ages_response = urllib2.urlopen(ages_url)
ages = pd.read_csv(ages_response)

# Keep rows 1 through 18 (by position) and only the two columns used below,
# converted to floats.
sample = ages.iloc[1:19][["Both sexes", "Max_Age"]].astype(float)

# Total of the "Both sexes" column over the selected rows.
sample_sum = sample["Both sexes"].sum()

# Each row's fraction of that total.
sample["percentages"] = sample["Both sexes"].div(sample_sum)

# Running total of the fractions, row by row.
sample["partitions"] = sample["percentages"].cumsum()
# sample["partitions"]
def find_interval(sample, value):
    """Return the ``[min_age, max_age]`` bracket whose cumulative share covers *value*.

    Rows are addressed by position, so the result does not depend on the
    index labels *sample* happens to carry.

    Args:
        sample: DataFrame with one row per age bracket, holding a "Max_Age"
            column (the bracket's upper age) and a "partitions" column.
            "partitions" is assumed to be sorted in non-decreasing order
            (e.g. a cumulative sum of non-negative shares).
        value: The number to locate, e.g. a draw from ``random.random()``.

    Returns:
        A two-item list ``[min_age, max_age]``. ``max_age`` is the "Max_Age"
        of the first row whose "partitions" entry is >= *value*; ``min_age``
        is 0 for the first row, otherwise the previous row's "Max_Age" + 1.
        A *value* above every partition is assigned to the last row.

    Raises:
        ValueError: If *sample* has no rows.
    """
    partitions = sample["partitions"].tolist()
    max_ages = sample["Max_Age"].tolist()
    if not partitions:
        raise ValueError("sample has no rows")

    # First row whose cumulative share reaches `value`. Clamp to the last row
    # because rounding in a cumulative sum can leave the final partition a
    # hair below the largest value a caller may pass.
    position = min(bisect.bisect_left(partitions, value), len(partitions) - 1)

    # The bracket starts right after the previous bracket's upper age.
    min_age = 0 if position == 0 else max_ages[position - 1] + 1
    return [min_age, max_ages[position]]
# Draw one uniform random number; the result is discarded.
rand.random()

# Look up the interval for a fresh random draw; the result is not stored.
find_interval(sample, rand.random())

# Print 0.1, 0.2, ..., 0.9, one per line.
for i in (step / 10.0 for step in range(1, 10)):
    print(i)

# Print the same values again, each followed by the interval found for it.
for i in (step / 10.0 for step in range(1, 10)):
    print(i)
    print(find_interval(sample, i))