# This is my notebook for exploring data about economic inequality in Cambodia.
%matplotlib inline
import pandas as pd
import matplotlib as mp
import matplotlib.pyplot as plt
import qgrid
from pylab import *
import seaborn as sb
# Hey, good news! We can remotely access the World Bank's World Development Indicators Database
# directly from pandas!
from pandas.io import wb
# Search the World Bank database for all poverty-related indicator names and
# keep only the first two columns of the result.
# qgrid is not used for this display because it did not render the id column
# correctly.
pov = wb.search('pov.*%').iloc[:, 0:2]
pov
id | name | |
---|---|---|
5453 | IN.POV.HCR.EST.RURL | Poverty HCR Estimates (%) - Rural |
5454 | IN.POV.HCR.EST.TOTL | Poverty HCR Estimates (%) - Total |
5455 | IN.POV.HCR.EST.URBN | Poverty HCR Estimates (%) - Urban |
7529 | SI.POV.25DAY | Poverty headcount ratio at $2.5 a day (PPP) (%... |
7530 | SI.POV.2DAY | Poverty headcount ratio at $2 a day (PPP) (% o... |
7531 | SI.POV.4DAY | Poverty headcount ratio at $4 a day (PPP) (% o... |
7532 | SI.POV.5DAY | Poverty headcount ratio at $5 a day (PPP) (% o... |
7533 | SI.POV.DDAY | Poverty headcount ratio at $1.25 a day (PPP) (... |
7534 | SI.POV.GAP2 | Poverty gap at $2 a day (PPP) (%) |
7535 | SI.POV.GAP25 | Poverty gap at $2.5 a day (PPP) (%) |
7536 | SI.POV.GAP4 | Poverty gap at $4 a day (PPP) (%) |
7537 | SI.POV.GAP5 | Poverty gap at $5 a day (PPP) (%) |
7538 | SI.POV.GAPS | Poverty gap at $1.25 a day (PPP) (%) |
7540 | SI.POV.NAGP | Poverty gap at national poverty lines (%) |
7541 | SI.POV.NAHC | Poverty headcount ratio at national poverty li... |
7547 | SI.POV.RUGP | Rural poverty gap at national poverty lines (%) |
7548 | SI.POV.RUHC | Rural poverty headcount ratio at national pove... |
7549 | SI.POV.URGP | Urban poverty gap at national poverty lines (%) |
7550 | SI.POV.URHC | Urban poverty headcount ratio at national pove... |
10162 | ccx_povchi_40_fem | Poverty headcount of children (below bottom 40... |
10163 | ccx_povchi_40_mal | Poverty headcount of children (below bottom 40... |
10164 | ccx_povchi_40_rur | Poverty headcount of children (below bottom 40... |
10165 | ccx_povchi_40_tot | Poverty headcount of children (below bottom 40%) |
10166 | ccx_povchi_40_urb | Poverty headcount of children (below bottom 40... |
10167 | ccx_poveld_40_fem | Poverty headcount of the elderly (below bottom... |
10168 | ccx_poveld_40_mal | Poverty headcount of the elderly (below bottom... |
10169 | ccx_poveld_40_rur | Poverty headcount of the elderly (below bottom... |
10170 | ccx_poveld_40_tot | Poverty headcount of the elderly (below bottom... |
10171 | ccx_poveld_40_urb | Poverty headcount of the elderly (below bottom... |
10172 | ccx_povwka_40_fem | Poverty headcount of working age adults (below... |
... | ... | ... |
12761 | per_si_allsi_p1_ep_preT_tot | Poverty Gap reduction (%) - All Social Insura... |
12762 | per_si_allsi_p1_ep_tot | Poverty Gap reduction (%) - All Social Insura... |
12763 | per_si_allsi_p1_preT_tot | Poverty Gap reduction (%) - All Social Insura... |
12764 | per_si_allsi_p1_rur | Poverty Gap reduction (%) - All Social Insura... |
12765 | per_si_allsi_p1_tot | Poverty Gap reduction (%) - All Social Insura... |
12766 | per_si_allsi_p1_urb | Poverty Gap reduction (%) - All Social Insura... |
12901 | per_si_oa_p0_ep_preT_tot | Poverty Headcount reduction (%) - Old Age Con... |
12902 | per_si_oa_p0_ep_tot | Poverty Headcount reduction (%) - Old Age Con... |
12903 | per_si_oa_p0_preT_tot | Poverty Headcount reduction (%) - Old Age Con... |
12904 | per_si_oa_p0_rur | Poverty Headcount reduction (%) - Old Age Con... |
12905 | per_si_oa_p0_tot | Poverty Headcount reduction (%) - Old Age Con... |
12906 | per_si_oa_p0_urb | Poverty Headcount reduction (%) - Old Age Con... |
12907 | per_si_oa_p1_ep_preT_tot | Poverty Gap reduction (%) - Old Age Contribut... |
12908 | per_si_oa_p1_ep_tot | Poverty Gap reduction (%) - Old Age Contribut... |
12909 | per_si_oa_p1_preT_tot | Poverty Gap reduction (%) - Old Age Contribut... |
12910 | per_si_oa_p1_rur | Poverty Gap reduction (%) - Old Age Contribut... |
12911 | per_si_oa_p1_tot | Poverty Gap reduction (%) - Old Age Contribut... |
12912 | per_si_oa_p1_urb | Poverty Gap reduction (%) - Old Age Contribut... |
13047 | per_si_ss_p0_ep_preT_tot | Poverty Headcount reduction (%) - Other Socia... |
13048 | per_si_ss_p0_ep_tot | Poverty Headcount reduction (%) - Other Socia... |
13049 | per_si_ss_p0_preT_tot | Poverty Headcount reduction (%) - Other Socia... |
13050 | per_si_ss_p0_rur | Poverty Headcount reduction (%) - Other Socia... |
13051 | per_si_ss_p0_tot | Poverty Headcount reduction (%) - Other Socia... |
13052 | per_si_ss_p0_urb | Poverty Headcount reduction (%) - Other Socia... |
13053 | per_si_ss_p1_ep_preT_tot | Poverty Gap reduction (%) - Other Social Insu... |
13054 | per_si_ss_p1_ep_tot | Poverty Gap reduction (%) - Other Social Insu... |
13055 | per_si_ss_p1_preT_tot | Poverty Gap reduction (%) - Other Social Insu... |
13056 | per_si_ss_p1_rur | Poverty Gap reduction (%) - Other Social Insu... |
13057 | per_si_ss_p1_tot | Poverty Gap reduction (%) - Other Social Insu... |
13058 | per_si_ss_p1_urb | Poverty Gap reduction (%) - Other Social Insu... |
231 rows × 2 columns
# Pull the English labels for rows 7529 through 7550 out into a plain list
povnames = pov.loc[7529:7550, 'name'].tolist()
# Rebind pov to just the indicator ids of those same rows, also as a list
pov = pov.loc[7529:7550, 'id'].tolist()
# Take a look
povnames
[u'Poverty headcount ratio at $2.5 a day (PPP) (% of population)', u'Poverty headcount ratio at $2 a day (PPP) (% of population)', u'Poverty headcount ratio at $4 a day (PPP) (% of population)', u'Poverty headcount ratio at $5 a day (PPP) (% of population)', u'Poverty headcount ratio at $1.25 a day (PPP) (% of population)', u'Poverty gap at $2 a day (PPP) (%)', u'Poverty gap at $2.5 a day (PPP) (%)', u'Poverty gap at $4 a day (PPP) (%)', u'Poverty gap at $5 a day (PPP) (%)', u'Poverty gap at $1.25 a day (PPP) (%)', u'Poverty gap at national poverty lines (%)', u'Poverty headcount ratio at national poverty lines (% of population)', u'Rural poverty gap at national poverty lines (%)', u'Rural poverty headcount ratio at national poverty lines (% of rural population)', u'Urban poverty gap at national poverty lines (%)', u'Urban poverty headcount ratio at national poverty lines (% of urban population)']
# Display pov, which at this point is the plain list of indicator ids
pov
[u'SI.POV.25DAY', u'SI.POV.2DAY', u'SI.POV.4DAY', u'SI.POV.5DAY', u'SI.POV.DDAY', u'SI.POV.GAP2', u'SI.POV.GAP25', u'SI.POV.GAP4', u'SI.POV.GAP5', u'SI.POV.GAPS', u'SI.POV.NAGP', u'SI.POV.NAHC', u'SI.POV.RUGP', u'SI.POV.RUHC', u'SI.POV.URGP', u'SI.POV.URHC']
# Map each poverty indicator id to its descriptive name
povdict = {indicator_id: label for indicator_id, label in zip(pov, povnames)}
povdict
{u'SI.POV.25DAY': u'Poverty headcount ratio at $2.5 a day (PPP) (% of population)', u'SI.POV.2DAY': u'Poverty headcount ratio at $2 a day (PPP) (% of population)', u'SI.POV.4DAY': u'Poverty headcount ratio at $4 a day (PPP) (% of population)', u'SI.POV.5DAY': u'Poverty headcount ratio at $5 a day (PPP) (% of population)', u'SI.POV.DDAY': u'Poverty headcount ratio at $1.25 a day (PPP) (% of population)', u'SI.POV.GAP2': u'Poverty gap at $2 a day (PPP) (%)', u'SI.POV.GAP25': u'Poverty gap at $2.5 a day (PPP) (%)', u'SI.POV.GAP4': u'Poverty gap at $4 a day (PPP) (%)', u'SI.POV.GAP5': u'Poverty gap at $5 a day (PPP) (%)', u'SI.POV.GAPS': u'Poverty gap at $1.25 a day (PPP) (%)', u'SI.POV.NAGP': u'Poverty gap at national poverty lines (%)', u'SI.POV.NAHC': u'Poverty headcount ratio at national poverty lines (% of population)', u'SI.POV.RUGP': u'Rural poverty gap at national poverty lines (%)', u'SI.POV.RUHC': u'Rural poverty headcount ratio at national poverty lines (% of rural population)', u'SI.POV.URGP': u'Urban poverty gap at national poverty lines (%)', u'SI.POV.URHC': u'Urban poverty headcount ratio at national poverty lines (% of urban population)'}
# Same kind of search, this time for the income-share indicators; again only
# the first two columns of the result are kept
inc = wb.search('income.*share.*%').iloc[:, 0:2]
inc
id | name | |
---|---|---|
7522 | SI.DST.02ND.20 | Income share held by second 20% |
7523 | SI.DST.03RD.20 | Income share held by third 20% |
7524 | SI.DST.04TH.20 | Income share held by fourth 20% |
7525 | SI.DST.05TH.20 | Income share held by highest 20% |
7526 | SI.DST.10TH.10 | Income share held by highest 10% |
7527 | SI.DST.FRST.10 | Income share held by lowest 10% |
7528 | SI.DST.FRST.20 | Income share held by lowest 20% |
# As with the poverty indicators: the names go into one list and inc is
# rebound to the list of ids
incnames = inc['name'].tolist()
inc = inc['id'].tolist()
incnames
[u'Income share held by second 20%', u'Income share held by third 20%', u'Income share held by fourth 20%', u'Income share held by highest 20%', u'Income share held by highest 10%', u'Income share held by lowest 10%', u'Income share held by lowest 20%']
# Display inc, which at this point is the plain list of indicator ids
inc
[u'SI.DST.02ND.20', u'SI.DST.03RD.20', u'SI.DST.04TH.20', u'SI.DST.05TH.20', u'SI.DST.10TH.10', u'SI.DST.FRST.10', u'SI.DST.FRST.20']
# Map each income indicator id to its descriptive name
incdict = {indicator_id: label for indicator_id, label in zip(inc, incnames)}
incdict
{u'SI.DST.02ND.20': u'Income share held by second 20%', u'SI.DST.03RD.20': u'Income share held by third 20%', u'SI.DST.04TH.20': u'Income share held by fourth 20%', u'SI.DST.05TH.20': u'Income share held by highest 20%', u'SI.DST.10TH.10': u'Income share held by highest 10%', u'SI.DST.FRST.10': u'Income share held by lowest 10%', u'SI.DST.FRST.20': u'Income share held by lowest 20%'}
# Master list of every indicator to download: the poverty ids followed by
# the income ids
idx = list(pov)
idx.extend(inc)
idx
[u'SI.POV.25DAY', u'SI.POV.2DAY', u'SI.POV.4DAY', u'SI.POV.5DAY', u'SI.POV.DDAY', u'SI.POV.GAP2', u'SI.POV.GAP25', u'SI.POV.GAP4', u'SI.POV.GAP5', u'SI.POV.GAPS', u'SI.POV.NAGP', u'SI.POV.NAHC', u'SI.POV.RUGP', u'SI.POV.RUHC', u'SI.POV.URGP', u'SI.POV.URHC', u'SI.DST.02ND.20', u'SI.DST.03RD.20', u'SI.DST.04TH.20', u'SI.DST.05TH.20', u'SI.DST.10TH.10', u'SI.DST.FRST.10', u'SI.DST.FRST.20']
# Fetch every indicator in idx for Cambodia (KHM), 2004 through 2012, and
# keep the result as a DataFrame
khm = wb.download(
    country='KHM',
    indicator=idx,
    start=2004,
    end=2012,
)
khm
SI.POV.25DAY | SI.POV.2DAY | SI.POV.4DAY | SI.POV.5DAY | SI.POV.DDAY | SI.POV.GAP2 | SI.POV.GAP25 | SI.POV.GAP4 | SI.POV.GAP5 | SI.POV.GAPS | ... | SI.POV.RUHC | SI.POV.URGP | SI.POV.URHC | SI.DST.02ND.20 | SI.DST.03RD.20 | SI.DST.04TH.20 | SI.DST.05TH.20 | SI.DST.10TH.10 | SI.DST.FRST.10 | SI.DST.FRST.20 | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
country | year | |||||||||||||||||||||
Cambodia | 2012 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 20.8 | NaN | 6.4 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2011 | 59.00 | 41.26 | 85.70 | 91.83 | 10.05 | 10.25 | 18.29 | 39.60 | 49.51 | 1.43 | ... | 23.6 | NaN | 8.7 | 12.46 | 16.11 | 21.24 | 41.20 | 26.91 | 4.04 | 8.99 | |
2010 | 57.59 | 40.88 | 84.06 | 90.63 | 11.25 | 10.59 | 18.37 | 39.00 | 48.74 | 1.70 | ... | 25.3 | NaN | 8.5 | 12.02 | 15.80 | 21.18 | 42.49 | 28.01 | 3.80 | 8.51 | |
2009 | 56.25 | 40.74 | 82.20 | 89.25 | 12.93 | 11.21 | 18.71 | 38.50 | 48.02 | 2.08 | ... | 27.5 | NaN | 8.0 | 11.66 | 15.68 | 21.48 | 43.15 | 28.20 | 3.55 | 8.03 | |
2008 | 65.90 | 51.05 | 87.33 | 92.48 | 20.89 | 16.27 | 24.78 | 45.05 | 54.08 | 4.39 | ... | 38.5 | NaN | 15.1 | 11.60 | 15.67 | 21.43 | 43.45 | 28.57 | 3.44 | 7.85 | |
2007 | 71.05 | 59.39 | 87.65 | 92.07 | 30.82 | 21.92 | 30.64 | 49.51 | 57.63 | 7.24 | ... | 51.4 | NaN | 18.3 | 10.05 | 13.95 | 20.16 | 48.89 | 33.99 | 3.12 | 6.95 | |
2006 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | |
2005 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | |
2004 | 76.54 | 64.43 | 91.55 | 94.95 | 32.77 | 23.64 | 33.09 | 52.85 | 60.96 | 7.79 | ... | 54.2 | NaN | 28.5 | 11.43 | 15.37 | 21.22 | 44.00 | 29.09 | 3.56 | 7.98 |
9 rows × 23 columns
# Tidy the download: drop the outer (country) index level, flip the row order
# so the years ascend, and discard every indicator column that is entirely NaN
khm.index = khm.index.droplevel(0)
khm = khm.iloc[::-1].dropna(axis=1, how='all')
qgrid.show_grid(khm, remote_js=True)
# Compare the percent of total income held by the highest 10% with that of
# the lowest 10%
incframe = khm[['SI.DST.10TH.10', 'SI.DST.FRST.10']].iloc[:8]  # first 8 rows only: no data for 2012
# Swap each indicator id for its descriptive name from incdict
incframe.columns = [incdict[indicator_id] for indicator_id in incframe.columns]
incframe
Income share held by highest 10% | Income share held by lowest 10% | |
---|---|---|
year | ||
2004 | 29.09 | 3.56 |
2005 | NaN | NaN |
2006 | NaN | NaN |
2007 | 33.99 | 3.12 |
2008 | 28.57 | 3.44 |
2009 | 28.20 | 3.55 |
2010 | 28.01 | 3.80 |
2011 | 26.91 | 4.04 |
# Shorter column labels give a more readable legend
incframe.columns = ['Highest 10% of Income Earners', 'Lowest 10% of Income Earners']
# DataFrame.plot opens its own figure when no `ax` is passed, so a preceding
# plt.figure() would only leave an empty extra figure behind. Label the axes
# object that plot() returns instead.
ax = incframe.plot(title='Income Share in Cambodia', style='o-')
ax.set_xlabel('Year')
ax.set_ylabel('Share of National Income (%)')
<matplotlib.text.Text at 0x10d85eb10>
<matplotlib.figure.Figure at 0x10d9f4910>
# Select the five income-share quintile columns, ordered lowest to highest
incframe1 = khm[['SI.DST.FRST.20', 'SI.DST.02ND.20', 'SI.DST.03RD.20',
                 'SI.DST.04TH.20', 'SI.DST.05TH.20']]
# Look each indicator id up in incdict to get its descriptive column name.
# A list comprehension replaces the index loop that assigned into range(),
# which only works where range() returns a list (Python 2).
newcolumns = [incdict[indicator_id] for indicator_id in incframe1.columns]
incframe1.columns = newcolumns
incframe1 = incframe1[0:8] #Omit 2012, no data
incframe1
Income share held by lowest 20% | Income share held by second 20% | Income share held by third 20% | Income share held by fourth 20% | Income share held by highest 20% | |
---|---|---|---|---|---|
year | |||||
2004 | 7.98 | 11.43 | 15.37 | 21.22 | 44.00 |
2005 | NaN | NaN | NaN | NaN | NaN |
2006 | NaN | NaN | NaN | NaN | NaN |
2007 | 6.95 | 10.05 | 13.95 | 20.16 | 48.89 |
2008 | 7.85 | 11.60 | 15.67 | 21.43 | 43.45 |
2009 | 8.03 | 11.66 | 15.68 | 21.48 | 43.15 |
2010 | 8.51 | 12.02 | 15.80 | 21.18 | 42.49 |
2011 | 8.99 | 12.46 | 16.11 | 21.24 | 41.20 |
# Change column names to short labels for the legend
incframe1.columns = ['Lowest 20%', 'Second Lowest 20%', 'Middle 20%', 'Second Highest 20%', 'Highest 20%']
# DataFrame.plot opens its own figure when no `ax` is passed, so no separate
# plt.figure() call is made (it would only leave an empty figure behind).
ax = incframe1.plot(title='Income Share in Cambodia', style='o-')
ax.set_xlabel('Year')
ax.set_ylabel('Share of National Income (%)')
# Fix the y-axis range at 0-50
ax.set_ylim([0, 50])
(0, 50)
<matplotlib.figure.Figure at 0x10d9f4850>
# Stacked bar graph of above dataframe.
# `stacked` is given as a real boolean: the string 'True' only worked because
# any non-empty string is truthy (so 'False' would have stacked as well).
# DataFrame.plot opens its own figure, so no separate plt.figure() is needed.
ax = incframe1.plot(title='Income Share in Cambodia', kind='bar', stacked=True)
ax.set_xlabel('Year')
ax.set_ylabel('Share of National Income (%)')
<matplotlib.text.Text at 0x10d816c90>
<matplotlib.figure.Figure at 0x10d967250>
# How does this compare to the US? Download the same income-share indicators
# for the USA, 2004 through 2011.
usa = wb.download(
    country='USA',
    indicator=[
        'SI.DST.10TH.10',
        'SI.DST.FRST.10',
        'SI.DST.FRST.20',
        'SI.DST.02ND.20',
        'SI.DST.03RD.20',
        'SI.DST.04TH.20',
        'SI.DST.05TH.20',
    ],
    start=2004,
    end=2011,
)
# Drop the outer index level, then flip the row order
usa.index = usa.index.droplevel(0)
usa = usa.iloc[::-1]
qgrid.show_grid(usa, remote_js=True)
# So there isn't nearly as much data for the US during this time period, but it's still worth looking at.
usainc = usa[['SI.DST.10TH.10', 'SI.DST.FRST.10']]
usainc.columns = ['Highest 10% of Income Earners', 'Lowest 10% of Income Earners']
# DataFrame.plot opens its own figure when no `ax` is passed, so a preceding
# plt.figure() would only leave an empty extra figure behind. Label the axes
# object that plot() returns instead.
ax = usainc.plot(title='Income Share in the USA', style='o-')
ax.set_xlabel('Year')
ax.set_ylabel('Share of National Income (%)')
<matplotlib.text.Text at 0x10aed8710>
<matplotlib.figure.Figure at 0x10d9dd210>
# Income share by quintile for the USA, ordered lowest to highest
usainc1 = usa[['SI.DST.FRST.20', 'SI.DST.02ND.20', 'SI.DST.03RD.20', 'SI.DST.04TH.20', 'SI.DST.05TH.20']]
usainc1.columns = ['Lowest 20%', 'Second Lowest 20%', 'Middle 20%', 'Second Highest 20%', 'Highest 20%']
# DataFrame.plot opens its own figure when no `ax` is passed, so no separate
# plt.figure() call is made (it would only leave an empty figure behind).
ax = usainc1.plot(title='Income Share in the USA', style='o-')
ax.set_xlabel('Year')
ax.set_ylabel('Share of National Income (%)')
<matplotlib.text.Text at 0x10d876950>
<matplotlib.figure.Figure at 0x10d9ff550>
# Stacked bar graph of the USA quintile shares.
# `stacked` is given as a real boolean: the string 'True' only worked because
# any non-empty string is truthy (so 'False' would have stacked as well).
# DataFrame.plot opens its own figure, so no separate plt.figure() is needed.
ax = usainc1.plot(title='Income Share in the USA', kind='bar', stacked=True)
ax.set_xlabel('Year')
ax.set_ylabel('Share of National Income (%)')
<matplotlib.text.Text at 0x10d69ed50>
<matplotlib.figure.Figure at 0x10d926610>
# So... we aren't much better (perhaps worse). Let's look at the Gross National Income per capita for each
# country to get a better look at the differences
khmgni = wb.download(indicator=['NY.GNP.PCAP.CD'], country='KHM', start=2004, end=2011)
# Drop the outer index level, then flip the row order
khmgni.index = khmgni.index.droplevel(0)
khmgni = khmgni.iloc[::-1]
qgrid.show_grid(khmgni, remote_js=True)
# DataFrame.plot opens its own figure when no `ax` is passed, so no separate
# plt.figure() call is made (it would only leave an empty figure behind).
ax = khmgni.plot(title='Gross National Income per Capita in Cambodia', style='o-')
ax.set_xlabel('Year')
ax.set_ylabel('Income ($)')
<matplotlib.text.Text at 0x10a84bfd0>
<matplotlib.figure.Figure at 0x10d067850>
# The same Gross National Income per capita indicator, this time for the USA
usagni = wb.download(indicator=['NY.GNP.PCAP.CD'], country='USA', start=2004, end=2011)
# Drop the outer index level, then flip the row order
usagni.index = usagni.index.droplevel(0)
usagni = usagni.iloc[::-1]
qgrid.show_grid(usagni, remote_js=True)
# DataFrame.plot opens its own figure when no `ax` is passed, so no separate
# plt.figure() call is made (it would only leave an empty figure behind).
ax = usagni.plot(title='Gross National Income per Capita in the USA', style='o-')
ax.set_xlabel('Year')
ax.set_ylabel('Income ($)')
<matplotlib.text.Text at 0x109ca6810>
<matplotlib.figure.Figure at 0x109d0b310>
# Explore how close people are to the poverty boundary and how it is changing over time
povline = khm[['SI.POV.25DAY', 'SI.POV.2DAY', 'SI.POV.4DAY', 'SI.POV.5DAY', 'SI.POV.DDAY', 'SI.POV.NAHC']]
# Look each indicator id up in povdict to get its descriptive column name.
# A list comprehension replaces the index loop that assigned into range(),
# which only works where range() returns a list (Python 2).
newcolumns = [povdict[indicator_id] for indicator_id in povline.columns]
povline.columns = newcolumns
povline
Poverty headcount ratio at $2.5 a day (PPP) (% of population) | Poverty headcount ratio at $2 a day (PPP) (% of population) | Poverty headcount ratio at $4 a day (PPP) (% of population) | Poverty headcount ratio at $5 a day (PPP) (% of population) | Poverty headcount ratio at $1.25 a day (PPP) (% of population) | Poverty headcount ratio at national poverty lines (% of population) | |
---|---|---|---|---|---|---|
year | ||||||
2004 | 76.54 | 64.43 | 91.55 | 94.95 | 32.77 | 50.2 |
2005 | NaN | NaN | NaN | NaN | NaN | NaN |
2006 | NaN | NaN | NaN | NaN | NaN | NaN |
2007 | 71.05 | 59.39 | 87.65 | 92.07 | 30.82 | 45.0 |
2008 | 65.90 | 51.05 | 87.33 | 92.48 | 20.89 | 34.0 |
2009 | 56.25 | 40.74 | 82.20 | 89.25 | 12.93 | 23.9 |
2010 | 57.59 | 40.88 | 84.06 | 90.63 | 11.25 | 22.1 |
2011 | 59.00 | 41.26 | 85.70 | 91.83 | 10.05 | 20.5 |
2012 | NaN | NaN | NaN | NaN | NaN | 17.7 |
# Change titles of columns for plotting, then plot
povline.columns = ['<$2.50 a day', '<$2 a day', '<$4 a day', '<$5 a day', '<$1.25 a day', '<National poverty lines']
# DataFrame.plot opens its own figure when no `ax` is passed, so no separate
# plt.figure() call is made (it would only leave an empty figure behind).
ax = povline.plot(title='Poverty Headcount Ratio at Different Incomes in Cambodia', style='o-')
ax.set_xlabel('Year')
ax.set_ylabel('% of population')
<matplotlib.text.Text at 0x10dfbee10>
<matplotlib.figure.Figure at 0x109cc6c90>
# The World Bank does not have most of this data for the USA, so I will find US Census data later.