In [1]:

# This is my notebook for exploring data about economic inequality in Cambodia.

%matplotlib inline
import pandas as pd
import matplotlib as mp
import matplotlib.pyplot as plt
import qgrid
from pylab import *
import seaborn as sb

# Hey, good news! We can remotely access the World Bank's World Development Indicators Database
# directly from pandas!

from pandas.io import wb

In [2]:

# Search the World Bank indicator catalog with the regular expression 'pov.*%'
# and keep only the first two columns of the result (displayed below as `id` and `name`).
# The plain pandas display is used here instead of qgrid, because qgrid did not
# render the id column correctly.

pov_matches = wb.search('pov.*%')
pov = pov_matches.iloc[:, :2]
pov

Out[2]:

	id	name
5453	IN.POV.HCR.EST.RURL	Poverty HCR Estimates (%) - Rural
5454	IN.POV.HCR.EST.TOTL	Poverty HCR Estimates (%) - Total
5455	IN.POV.HCR.EST.URBN	Poverty HCR Estimates (%) - Urban
7529	SI.POV.25DAY	Poverty headcount ratio at $2.5 a day (PPP) (%...
7530	SI.POV.2DAY	Poverty headcount ratio at $2 a day (PPP) (% o...
7531	SI.POV.4DAY	Poverty headcount ratio at $4 a day (PPP) (% o...
7532	SI.POV.5DAY	Poverty headcount ratio at $5 a day (PPP) (% o...
7533	SI.POV.DDAY	Poverty headcount ratio at $1.25 a day (PPP) (...
7534	SI.POV.GAP2	Poverty gap at $2 a day (PPP) (%)
7535	SI.POV.GAP25	Poverty gap at $2.5 a day (PPP) (%)
7536	SI.POV.GAP4	Poverty gap at $4 a day (PPP) (%)
7537	SI.POV.GAP5	Poverty gap at $5 a day (PPP) (%)
7538	SI.POV.GAPS	Poverty gap at $1.25 a day (PPP) (%)
7540	SI.POV.NAGP	Poverty gap at national poverty lines (%)
7541	SI.POV.NAHC	Poverty headcount ratio at national poverty li...
7547	SI.POV.RUGP	Rural poverty gap at national poverty lines (%)
7548	SI.POV.RUHC	Rural poverty headcount ratio at national pove...
7549	SI.POV.URGP	Urban poverty gap at national poverty lines (%)
7550	SI.POV.URHC	Urban poverty headcount ratio at national pove...
10162	ccx_povchi_40_fem	Poverty headcount of children (below bottom 40...
10163	ccx_povchi_40_mal	Poverty headcount of children (below bottom 40...
10164	ccx_povchi_40_rur	Poverty headcount of children (below bottom 40...
10165	ccx_povchi_40_tot	Poverty headcount of children (below bottom 40%)
10166	ccx_povchi_40_urb	Poverty headcount of children (below bottom 40...
10167	ccx_poveld_40_fem	Poverty headcount of the elderly (below bottom...
10168	ccx_poveld_40_mal	Poverty headcount of the elderly (below bottom...
10169	ccx_poveld_40_rur	Poverty headcount of the elderly (below bottom...
10170	ccx_poveld_40_tot	Poverty headcount of the elderly (below bottom...
10171	ccx_poveld_40_urb	Poverty headcount of the elderly (below bottom...
10172	ccx_povwka_40_fem	Poverty headcount of working age adults (below...
...	...	...
12761	per_si_allsi_p1_ep_preT_tot	Poverty Gap reduction (%) - All Social Insura...
12762	per_si_allsi_p1_ep_tot	Poverty Gap reduction (%) - All Social Insura...
12763	per_si_allsi_p1_preT_tot	Poverty Gap reduction (%) - All Social Insura...
12764	per_si_allsi_p1_rur	Poverty Gap reduction (%) - All Social Insura...
12765	per_si_allsi_p1_tot	Poverty Gap reduction (%) - All Social Insura...
12766	per_si_allsi_p1_urb	Poverty Gap reduction (%) - All Social Insura...
12901	per_si_oa_p0_ep_preT_tot	Poverty Headcount reduction (%) - Old Age Con...
12902	per_si_oa_p0_ep_tot	Poverty Headcount reduction (%) - Old Age Con...
12903	per_si_oa_p0_preT_tot	Poverty Headcount reduction (%) - Old Age Con...
12904	per_si_oa_p0_rur	Poverty Headcount reduction (%) - Old Age Con...
12905	per_si_oa_p0_tot	Poverty Headcount reduction (%) - Old Age Con...
12906	per_si_oa_p0_urb	Poverty Headcount reduction (%) - Old Age Con...
12907	per_si_oa_p1_ep_preT_tot	Poverty Gap reduction (%) - Old Age Contribut...
12908	per_si_oa_p1_ep_tot	Poverty Gap reduction (%) - Old Age Contribut...
12909	per_si_oa_p1_preT_tot	Poverty Gap reduction (%) - Old Age Contribut...
12910	per_si_oa_p1_rur	Poverty Gap reduction (%) - Old Age Contribut...
12911	per_si_oa_p1_tot	Poverty Gap reduction (%) - Old Age Contribut...
12912	per_si_oa_p1_urb	Poverty Gap reduction (%) - Old Age Contribut...
13047	per_si_ss_p0_ep_preT_tot	Poverty Headcount reduction (%) - Other Socia...
13048	per_si_ss_p0_ep_tot	Poverty Headcount reduction (%) - Other Socia...
13049	per_si_ss_p0_preT_tot	Poverty Headcount reduction (%) - Other Socia...
13050	per_si_ss_p0_rur	Poverty Headcount reduction (%) - Other Socia...
13051	per_si_ss_p0_tot	Poverty Headcount reduction (%) - Other Socia...
13052	per_si_ss_p0_urb	Poverty Headcount reduction (%) - Other Socia...
13053	per_si_ss_p1_ep_preT_tot	Poverty Gap reduction (%) - Other Social Insu...
13054	per_si_ss_p1_ep_tot	Poverty Gap reduction (%) - Other Social Insu...
13055	per_si_ss_p1_preT_tot	Poverty Gap reduction (%) - Other Social Insu...
13056	per_si_ss_p1_rur	Poverty Gap reduction (%) - Other Social Insu...
13057	per_si_ss_p1_tot	Poverty Gap reduction (%) - Other Social Insu...
13058	per_si_ss_p1_urb	Poverty Gap reduction (%) - Other Social Insu...

231 rows × 2 columns

In [3]:

# Narrow the search results down to the poverty series whose id starts with
# 'SI.POV.', then split them into two parallel lists (labels and ids).
#
# The rows are selected by id prefix instead of by the row labels 7529-7550:
# those labels come from the search result's index and are not guaranteed to
# stay the same, whereas the id prefix identifies the series directly.
#
# NOTE: `pov` is rebound from a DataFrame to a plain list at the end of this
# cell, so the search cell has to be re-run before running this cell again.

core_pov = pov[pov['id'].str.startswith('SI.POV.', na=False)]

# English labels, in the same order as the ids

povnames = core_pov['name'].tolist()

# Keep only the id's under the original name `pov`

pov = core_pov['id'].tolist()

# Take a look

povnames

Out[3]:

[u'Poverty headcount ratio at $2.5 a day (PPP) (% of population)',
 u'Poverty headcount ratio at $2 a day (PPP) (% of population)',
 u'Poverty headcount ratio at $4 a day (PPP) (% of population)',
 u'Poverty headcount ratio at $5 a day (PPP) (% of population)',
 u'Poverty headcount ratio at $1.25 a day (PPP) (% of population)',
 u'Poverty gap at $2 a day (PPP) (%)',
 u'Poverty gap at $2.5 a day (PPP) (%)',
 u'Poverty gap at $4 a day (PPP) (%)',
 u'Poverty gap at $5 a day (PPP) (%)',
 u'Poverty gap at $1.25 a day (PPP) (%)',
 u'Poverty gap at national poverty lines (%)',
 u'Poverty headcount ratio at national poverty lines (% of population)',
 u'Rural poverty gap at national poverty lines (%)',
 u'Rural poverty headcount ratio at national poverty lines (% of rural population)',
 u'Urban poverty gap at national poverty lines (%)',
 u'Urban poverty headcount ratio at national poverty lines (% of urban population)']

In [4]:

# Display the list of poverty indicator ids
pov

Out[4]:

[u'SI.POV.25DAY',
 u'SI.POV.2DAY',
 u'SI.POV.4DAY',
 u'SI.POV.5DAY',
 u'SI.POV.DDAY',
 u'SI.POV.GAP2',
 u'SI.POV.GAP25',
 u'SI.POV.GAP4',
 u'SI.POV.GAP5',
 u'SI.POV.GAPS',
 u'SI.POV.NAGP',
 u'SI.POV.NAHC',
 u'SI.POV.RUGP',
 u'SI.POV.RUHC',
 u'SI.POV.URGP',
 u'SI.POV.URHC']

In [5]:

# Lookup table: poverty indicator id -> English label
povdict = {indicator_id: label for indicator_id, label in zip(pov, povnames)}
povdict

Out[5]:

{u'SI.POV.25DAY': u'Poverty headcount ratio at $2.5 a day (PPP) (% of population)',
 u'SI.POV.2DAY': u'Poverty headcount ratio at $2 a day (PPP) (% of population)',
 u'SI.POV.4DAY': u'Poverty headcount ratio at $4 a day (PPP) (% of population)',
 u'SI.POV.5DAY': u'Poverty headcount ratio at $5 a day (PPP) (% of population)',
 u'SI.POV.DDAY': u'Poverty headcount ratio at $1.25 a day (PPP) (% of population)',
 u'SI.POV.GAP2': u'Poverty gap at $2 a day (PPP) (%)',
 u'SI.POV.GAP25': u'Poverty gap at $2.5 a day (PPP) (%)',
 u'SI.POV.GAP4': u'Poverty gap at $4 a day (PPP) (%)',
 u'SI.POV.GAP5': u'Poverty gap at $5 a day (PPP) (%)',
 u'SI.POV.GAPS': u'Poverty gap at $1.25 a day (PPP) (%)',
 u'SI.POV.NAGP': u'Poverty gap at national poverty lines (%)',
 u'SI.POV.NAHC': u'Poverty headcount ratio at national poverty lines (% of population)',
 u'SI.POV.RUGP': u'Rural poverty gap at national poverty lines (%)',
 u'SI.POV.RUHC': u'Rural poverty headcount ratio at national poverty lines (% of rural population)',
 u'SI.POV.URGP': u'Urban poverty gap at national poverty lines (%)',
 u'SI.POV.URHC': u'Urban poverty headcount ratio at national poverty lines (% of urban population)'}

In [6]:

# Run the same kind of catalog search for the income-share indicators
# (regular expression 'income.*share.*%') and keep the first two columns.

inc_matches = wb.search('income.*share.*%')
inc = inc_matches.iloc[:, :2]
inc

Out[6]:

	id	name
7522	SI.DST.02ND.20	Income share held by second 20%
7523	SI.DST.03RD.20	Income share held by third 20%
7524	SI.DST.04TH.20	Income share held by fourth 20%
7525	SI.DST.05TH.20	Income share held by highest 20%
7526	SI.DST.10TH.10	Income share held by highest 10%
7527	SI.DST.FRST.10	Income share held by lowest 10%
7528	SI.DST.FRST.20	Income share held by lowest 20%

In [7]:

# Split the income results the same way as the poverty ones:
# `incnames` gets the English labels, and `inc` is rebound to the list of ids.

incnames = inc['name'].tolist()
inc = inc['id'].tolist()
incnames

Out[7]:

[u'Income share held by second 20%',
 u'Income share held by third 20%',
 u'Income share held by fourth 20%',
 u'Income share held by highest 20%',
 u'Income share held by highest 10%',
 u'Income share held by lowest 10%',
 u'Income share held by lowest 20%']

In [8]:

# Display the list of income-share indicator ids
inc

Out[8]:

[u'SI.DST.02ND.20',
 u'SI.DST.03RD.20',
 u'SI.DST.04TH.20',
 u'SI.DST.05TH.20',
 u'SI.DST.10TH.10',
 u'SI.DST.FRST.10',
 u'SI.DST.FRST.20']

In [9]:

# Lookup table: income indicator id -> English label

incdict = {indicator_id: label for indicator_id, label in zip(inc, incnames)}
incdict

Out[9]:

{u'SI.DST.02ND.20': u'Income share held by second 20%',
 u'SI.DST.03RD.20': u'Income share held by third 20%',
 u'SI.DST.04TH.20': u'Income share held by fourth 20%',
 u'SI.DST.05TH.20': u'Income share held by highest 20%',
 u'SI.DST.10TH.10': u'Income share held by highest 10%',
 u'SI.DST.FRST.10': u'Income share held by lowest 10%',
 u'SI.DST.FRST.20': u'Income share held by lowest 20%'}

In [10]:

# Master list of every indicator id to download: poverty ids first, then income ids

idx = []
idx.extend(pov)
idx.extend(inc)
idx

Out[10]:

[u'SI.POV.25DAY',
 u'SI.POV.2DAY',
 u'SI.POV.4DAY',
 u'SI.POV.5DAY',
 u'SI.POV.DDAY',
 u'SI.POV.GAP2',
 u'SI.POV.GAP25',
 u'SI.POV.GAP4',
 u'SI.POV.GAP5',
 u'SI.POV.GAPS',
 u'SI.POV.NAGP',
 u'SI.POV.NAHC',
 u'SI.POV.RUGP',
 u'SI.POV.RUHC',
 u'SI.POV.URGP',
 u'SI.POV.URHC',
 u'SI.DST.02ND.20',
 u'SI.DST.03RD.20',
 u'SI.DST.04TH.20',
 u'SI.DST.05TH.20',
 u'SI.DST.10TH.10',
 u'SI.DST.FRST.10',
 u'SI.DST.FRST.20']

In [11]:

# Fetch every indicator in `idx` for Cambodia (country code 'KHM'), 2004 through 2012,
# and keep the result as a DataFrame

khm = wb.download(
    country='KHM',
    indicator=idx,
    start=2004,
    end=2012,
)
khm

Out[11]:

		SI.POV.25DAY	SI.POV.2DAY	SI.POV.4DAY	SI.POV.5DAY	SI.POV.DDAY	SI.POV.GAP2	SI.POV.GAP25	SI.POV.GAP4	SI.POV.GAP5	SI.POV.GAPS	...	SI.POV.RUHC	SI.POV.URGP	SI.POV.URHC	SI.DST.02ND.20	SI.DST.03RD.20	SI.DST.04TH.20	SI.DST.05TH.20	SI.DST.10TH.10	SI.DST.FRST.10	SI.DST.FRST.20
country	year
Cambodia	2012	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	20.8	NaN	6.4	NaN	NaN	NaN	NaN	NaN	NaN	NaN
	2011	59.00	41.26	85.70	91.83	10.05	10.25	18.29	39.60	49.51	1.43	...	23.6	NaN	8.7	12.46	16.11	21.24	41.20	26.91	4.04	8.99
	2010	57.59	40.88	84.06	90.63	11.25	10.59	18.37	39.00	48.74	1.70	...	25.3	NaN	8.5	12.02	15.80	21.18	42.49	28.01	3.80	8.51
	2009	56.25	40.74	82.20	89.25	12.93	11.21	18.71	38.50	48.02	2.08	...	27.5	NaN	8.0	11.66	15.68	21.48	43.15	28.20	3.55	8.03
	2008	65.90	51.05	87.33	92.48	20.89	16.27	24.78	45.05	54.08	4.39	...	38.5	NaN	15.1	11.60	15.67	21.43	43.45	28.57	3.44	7.85
	2007	71.05	59.39	87.65	92.07	30.82	21.92	30.64	49.51	57.63	7.24	...	51.4	NaN	18.3	10.05	13.95	20.16	48.89	33.99	3.12	6.95
	2006	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
	2005	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
	2004	76.54	64.43	91.55	94.95	32.77	23.64	33.09	52.85	60.96	7.79	...	54.2	NaN	28.5	11.43	15.37	21.22	44.00	29.09	3.56	7.98

9 rows × 23 columns

In [12]:

# Tidy the downloaded frame: drop the constant 'Cambodia' index level, put the years in
# ascending order, and drop the indicators that have no data at all.
#
# The cell is written so it can safely be run more than once:
#   * the country level is only dropped while the index still has more than one level;
#   * the rows are sorted by year rather than reversed, so a second run does not flip
#     them back to descending order.

if khm.index.nlevels > 1:
    khm.index = khm.index.droplevel(0)
khm = khm.sort_index()
khm = khm.dropna(axis=1, how='all')
qgrid.show_grid(khm, remote_js=True)

In [13]:

# Compare the share of total income held by the highest 10% of earners
# with the share held by the lowest 10%

top_and_bottom_ids = ['SI.DST.10TH.10', 'SI.DST.FRST.10']
incframe = khm[top_and_bottom_ids].iloc[:8]  # first 8 rows only: there is no data for 2012
incframe = incframe.rename(columns=incdict)  # swap the ids for their English labels
incframe

Out[13]:

	Income share held by highest 10%	Income share held by lowest 10%
year
2004	29.09	3.56
2005	NaN	NaN
2006	NaN	NaN
2007	33.99	3.12
2008	28.57	3.44
2009	28.20	3.55
2010	28.01	3.80
2011	26.91	4.04

In [14]:

# Shorter column names make a more readable legend
incframe.columns = ['Highest 10% of Income Earners', 'Lowest 10% of Income Earners']

# Draw on explicitly created axes. A bare plt.figure() followed by DataFrame.plot()
# leaves an extra, empty figure behind, because DataFrame.plot() opens its own figure
# when it is not given an `ax`.
fig, ax = plt.subplots()
incframe.plot(ax=ax, title='Income Share in Cambodia', style='o-')
ax.set_xlabel('Year')
ax.set_ylabel('Share of National Income (%)');

Out[14]:

<matplotlib.text.Text at 0x10d85eb10>

<matplotlib.figure.Figure at 0x10d9f4910>

In [15]:

# Income share of each fifth of the population, ordered from lowest to highest
quintile_ids = ['SI.DST.FRST.20', 'SI.DST.02ND.20', 'SI.DST.03RD.20', 'SI.DST.04TH.20', 'SI.DST.05TH.20']
incframe1 = khm[quintile_ids]

# Look each id up in the dictionary to rename the columns.
# A list comprehension is used instead of assigning into range(...), which only
# works on Python 2 (on Python 3 a range object does not support item assignment).
incframe1.columns = [incdict[indicator_id] for indicator_id in incframe1.columns]

incframe1 = incframe1[0:8] #Omit 2012, no data
incframe1

Out[15]:

	Income share held by lowest 20%	Income share held by second 20%	Income share held by third 20%	Income share held by fourth 20%	Income share held by highest 20%
year
2004	7.98	11.43	15.37	21.22	44.00
2005	NaN	NaN	NaN	NaN	NaN
2006	NaN	NaN	NaN	NaN	NaN
2007	6.95	10.05	13.95	20.16	48.89
2008	7.85	11.60	15.67	21.43	43.45
2009	8.03	11.66	15.68	21.48	43.15
2010	8.51	12.02	15.80	21.18	42.49
2011	8.99	12.46	16.11	21.24	41.20

In [16]:

# Change column names to short legend labels
incframe1.columns = ['Lowest 20%', 'Second Lowest 20%', 'Middle 20%', 'Second Highest 20%', 'Highest 20%']

# Plot on explicitly created axes; calling plt.figure() first would leave a stray
# empty figure, because DataFrame.plot() opens its own figure when no `ax` is given.
fig, ax = plt.subplots()
incframe1.plot(ax=ax, title='Income Share in Cambodia', style='o-')
ax.set_xlabel('Year')
ax.set_ylabel('Share of National Income (%)')
ax.set_ylim(0, 50);

Out[16]:

(0, 50)

<matplotlib.figure.Figure at 0x10d9f4850>

In [17]:

# Stacked bar graph of above dataframe

# `stacked` is a boolean flag: the string 'True' only worked because any non-empty
# string is truthy (the string 'False' would have stacked the bars as well).
# The plot is drawn on explicitly created axes so that no stray empty figure is left
# behind by a separate plt.figure() call.
fig, ax = plt.subplots()
incframe1.plot(ax=ax, title='Income Share in Cambodia', kind='bar', stacked=True)
ax.set_xlabel('Year')
ax.set_ylabel('Share of National Income (%)');

Out[17]:

<matplotlib.text.Text at 0x10d816c90>

<matplotlib.figure.Figure at 0x10d967250>

In [18]:

# How does this compare to the US?
# Download the same income-share indicators for the USA (2004 through 2011),
# drop the outer (country) index level and reverse the row order.

usa_income_ids = [
    'SI.DST.10TH.10', 'SI.DST.FRST.10',
    'SI.DST.FRST.20', 'SI.DST.02ND.20', 'SI.DST.03RD.20', 'SI.DST.04TH.20', 'SI.DST.05TH.20',
]
usa = wb.download(indicator=usa_income_ids, country='USA', start=2004, end=2011)

usa = usa.reset_index(level=0, drop=True).iloc[::-1]
qgrid.show_grid(usa, remote_js=True)

In [19]:

# So there isn't nearly as much data for the US during this time period, but it's still worth looking at.

# Highest 10% vs. lowest 10% of income earners, with legend-friendly column names
usainc = usa[['SI.DST.10TH.10', 'SI.DST.FRST.10']]
usainc.columns = ['Highest 10% of Income Earners', 'Lowest 10% of Income Earners']

# Plot on explicitly created axes; calling plt.figure() first would leave a stray
# empty figure, because DataFrame.plot() opens its own figure when no `ax` is given.
fig, ax = plt.subplots()
usainc.plot(ax=ax, title='Income Share in the USA', style='o-')
ax.set_xlabel('Year')
ax.set_ylabel('Share of National Income (%)');

Out[19]:

<matplotlib.text.Text at 0x10aed8710>

<matplotlib.figure.Figure at 0x10d9dd210>

In [20]:

# Income share of each fifth of the US population, ordered from lowest to highest
usainc1 = usa[['SI.DST.FRST.20', 'SI.DST.02ND.20', 'SI.DST.03RD.20', 'SI.DST.04TH.20', 'SI.DST.05TH.20']]
usainc1.columns = ['Lowest 20%', 'Second Lowest 20%', 'Middle 20%', 'Second Highest 20%', 'Highest 20%']

# Plot on explicitly created axes; calling plt.figure() first would leave a stray
# empty figure, because DataFrame.plot() opens its own figure when no `ax` is given.
fig, ax = plt.subplots()
usainc1.plot(ax=ax, title='Income Share in the USA', style='o-')
ax.set_xlabel('Year')
ax.set_ylabel('Share of National Income (%)');

Out[20]:

<matplotlib.text.Text at 0x10d876950>

<matplotlib.figure.Figure at 0x10d9ff550>

In [21]:

# Stacked bar graph of the US income shares.
# `stacked` is a boolean flag: the string 'True' only worked because any non-empty
# string is truthy. Explicit axes are used so that no stray empty figure is left
# behind by a separate plt.figure() call.
fig, ax = plt.subplots()
usainc1.plot(ax=ax, title='Income Share in the USA', kind='bar', stacked=True)
ax.set_xlabel('Year')
ax.set_ylabel('Share of National Income (%)');

Out[21]:

<matplotlib.text.Text at 0x10d69ed50>

<matplotlib.figure.Figure at 0x10d926610>

In [27]:

# So... the US isn't much better (perhaps worse). To get a better look at the differences,
# download the indicator 'NY.GNP.PCAP.CD' (Gross National Income per capita) for each country,
# starting with Cambodia: 2004 through 2011, country index level dropped, rows reversed.

khmgni = wb.download(
    country='KHM',
    indicator=['NY.GNP.PCAP.CD'],
    start=2004,
    end=2011,
)
khmgni = khmgni.reset_index(level=0, drop=True).iloc[::-1]

qgrid.show_grid(khmgni, remote_js=True)

In [28]:

# Line plot of Cambodia's GNI per capita.
# Explicit axes are used instead of a bare plt.figure(), which would leave a stray
# empty figure because DataFrame.plot() opens its own figure when no `ax` is given.
fig, ax = plt.subplots()
khmgni.plot(ax=ax, title='Gross National Income per Capita in Cambodia', style='o-')
ax.set_xlabel('Year')
ax.set_ylabel('Income ($)');

Out[28]:

<matplotlib.text.Text at 0x10a84bfd0>

<matplotlib.figure.Figure at 0x10d067850>

In [29]:

# Same download for the USA: indicator 'NY.GNP.PCAP.CD', 2004 through 2011,
# with the country index level dropped and the rows reversed.
usagni = wb.download(
    country='USA',
    indicator=['NY.GNP.PCAP.CD'],
    start=2004,
    end=2011,
)
usagni = usagni.reset_index(level=0, drop=True).iloc[::-1]

qgrid.show_grid(usagni, remote_js=True)

In [30]:

# Line plot of the USA's GNI per capita.
# Explicit axes are used instead of a bare plt.figure(), which would leave a stray
# empty figure because DataFrame.plot() opens its own figure when no `ax` is given.
fig, ax = plt.subplots()
usagni.plot(ax=ax, title='Gross National Income per Capita in the USA', style='o-')
ax.set_xlabel('Year')
ax.set_ylabel('Income ($)');

Out[30]:

<matplotlib.text.Text at 0x109ca6810>

<matplotlib.figure.Figure at 0x109d0b310>

In [24]:

# Explore how close people are to the poverty boundary and how it is changing over time

# Headcount ratios at several income thresholds, plus the national poverty line
povline = khm[['SI.POV.25DAY', 'SI.POV.2DAY', 'SI.POV.4DAY', 'SI.POV.5DAY', 'SI.POV.DDAY', 'SI.POV.NAHC']]

# Replace each indicator id with its English label from the dictionary.
# A list comprehension is used instead of assigning into range(...), which only
# works on Python 2 (on Python 3 a range object does not support item assignment).
povline.columns = [povdict[indicator_id] for indicator_id in povline.columns]

povline

Out[24]:

	Poverty headcount ratio at $2.5 a day (PPP) (% of population)	Poverty headcount ratio at $2 a day (PPP) (% of population)	Poverty headcount ratio at $4 a day (PPP) (% of population)	Poverty headcount ratio at $5 a day (PPP) (% of population)	Poverty headcount ratio at $1.25 a day (PPP) (% of population)	Poverty headcount ratio at national poverty lines (% of population)
year
2004	76.54	64.43	91.55	94.95	32.77	50.2
2005	NaN	NaN	NaN	NaN	NaN	NaN
2006	NaN	NaN	NaN	NaN	NaN	NaN
2007	71.05	59.39	87.65	92.07	30.82	45.0
2008	65.90	51.05	87.33	92.48	20.89	34.0
2009	56.25	40.74	82.20	89.25	12.93	23.9
2010	57.59	40.88	84.06	90.63	11.25	22.1
2011	59.00	41.26	85.70	91.83	10.05	20.5
2012	NaN	NaN	NaN	NaN	NaN	17.7

In [31]:

# Change titles of columns for plotting, then plot
povline.columns = ['<$2.50 a day', '<$2 a day', '<$4 a day', '<$5 a day', '<$1.25 a day', '<National poverty lines']

# Explicit axes are used instead of a bare plt.figure(), which would leave a stray
# empty figure because DataFrame.plot() opens its own figure when no `ax` is given.
fig, ax = plt.subplots()
povline.plot(ax=ax, title='Poverty Headcount Ratio at Different Incomes in Cambodia', style='o-')
ax.set_xlabel('Year')
ax.set_ylabel('% of population');

Out[31]:

<matplotlib.text.Text at 0x10dfbee10>

<matplotlib.figure.Figure at 0x109cc6c90>

In [26]:

# The World Bank does not have most of this data for the USA, so I will find US Census data later.