# This is my notebook for exploring data about economic inequality in Cambodia.

%matplotlib inline
import pandas as pd
import matplotlib as mp
import matplotlib.pyplot as plt
import qgrid
from pylab import *
import seaborn as sb

# Hey, good news! We can remotely access the World Bank's World Development Indicators Database
# directly from pandas!

from pandas.io import wb

# Query the World Development Indicators catalogue for poverty-related
# indicators and keep only the first two columns of the result (the code
# below reads them by the labels 'id' and 'name').
# qgrid is not used here because it failed to render the id column correctly;
# it would look nicer if it worked.

pov = wb.search('pov.*%').iloc[:, :2]
pov

# Pull the English names for the rows labelled 7529 through 7550
# (.loc slices by label and includes both endpoints).
# NOTE(review): these labels are hard-coded against one particular search
# result -- confirm they still select the intended indicators.

povnames = pov.loc[7529:7550, 'name'].tolist()

# Replace the table with a plain list of the ids for the same rows

pov = pov.loc[7529:7550, 'id'].tolist()

# Take a look at both lists

povnames

pov

# Map each indicator id to its English name
povdict = {code: label for code, label in zip(pov, povnames)}
povdict

# Next, search for the income-share indicators, again keeping only the
# first two columns of the result.

inc = wb.search('income.*share.*%').iloc[:, :2]
inc

# Same treatment as the poverty indicators, except every returned row is kept:
# names go into one list, ids into another (the ids replace the table itself).

incnames = inc['name'].tolist()
inc = inc['id'].tolist()
incnames

inc

# Map each income indicator id to its English name

incdict = {code: label for code, label in zip(inc, incnames)}
incdict

# Master list of every indicator id to download: poverty ids followed by
# income ids.

idx = pov + inc
idx

# Fetch all of those indicators for Cambodia, 2004 through 2012, as a DataFrame

khm = wb.download(indicator=idx, country='KHM', start=2004, end=2012)
khm

# Tidy the result:
#   1. drop the first index level (the Cambodia label), leaving the other level
#   2. reverse the row order so the years run in ascending order
#   3. discard indicator columns that contain nothing but NA

khm = khm.reset_index(level=0, drop=True)
khm = khm.iloc[::-1].dropna(axis=1, how='all')
qgrid.show_grid(khm, remote_js=True)

# Compare the percent of total income earned by the highest 10% of earners
# with that earned by the lowest 10%.

incframe = khm[['SI.DST.10TH.10', 'SI.DST.FRST.10']]
incframe = incframe.iloc[:8] # No data for 2012, so keep only the first eight rows
# Show the table under the World Bank's own indicator names first
incframe.columns = [incdict[code] for code in incframe.columns]
incframe


# Shorter labels for the plot legend (same column order as selected above)
incframe.columns = ['Highest 10% of Income Earners', 'Lowest 10% of Income Earners']

# Create the axes explicitly and hand them to pandas: DataFrame.plot() opens
# its own figure when no `ax` is given, which would leave a bare plt.figure()
# behind as an empty figure.
fig, ax = plt.subplots()
incframe.plot(ax=ax, title='Income Share in Cambodia', style='o-')
ax.set_xlabel('Year')
ax.set_ylabel('Share of National Income (%)')

# Income share held by each fifth of the population, lowest to highest
incframe1 = khm[['SI.DST.FRST.20', 'SI.DST.02ND.20', 'SI.DST.03RD.20', 'SI.DST.04TH.20', 'SI.DST.05TH.20'
                 ]]

# Look up each id in the dictionary to rename the columns. A list
# comprehension is used because assigning into a range object by index
# raises TypeError on Python 3.
newcolumns = [incdict[code] for code in incframe1.columns]
incframe1.columns = newcolumns

incframe1 = incframe1.iloc[:8] # Omit 2012, no data
incframe1

# Shorter column names for plotting (same order as the selection above)
incframe1.columns = ['Lowest 20%', 'Second Lowest 20%', 'Middle 20%', 'Second Highest 20%', 'Highest 20%']

# Create the axes explicitly and hand them to pandas: DataFrame.plot() opens
# its own figure when no `ax` is given, which would leave a bare plt.figure()
# behind as an empty figure.
fig, ax = plt.subplots()
incframe1.plot(ax=ax, title='Income Share in Cambodia', style='o-')
ax.set_xlabel('Year')
ax.set_ylabel('Share of National Income (%)')
ax.set_ylim([0, 50])

# Stacked bar graph of the quintile dataframe above

# Explicit axes avoid the empty extra figure that plt.figure() followed by
# DataFrame.plot() produces. `stacked` is a real boolean rather than the
# string 'True', which only worked because any non-empty string is truthy.
fig, ax = plt.subplots()
incframe1.plot(ax=ax, title='Income Share in Cambodia', kind='bar', stacked=True)
ax.set_xlabel('Year')
ax.set_ylabel('Share of National Income (%)')

# How does this compare to the US?
# Download the same decile and quintile income-share indicators for the USA,
# 2004 through 2011.

usa = wb.download(
    indicator=[
        'SI.DST.10TH.10',
        'SI.DST.FRST.10',
        'SI.DST.FRST.20',
        'SI.DST.02ND.20',
        'SI.DST.03RD.20',
        'SI.DST.04TH.20',
        'SI.DST.05TH.20',
    ],
    country='USA',
    start=2004,
    end=2011,
)

# Drop the first index level and flip the rows, as was done for Cambodia
usa = usa.reset_index(level=0, drop=True).iloc[::-1]
qgrid.show_grid(usa, remote_js=True)

# So there isn't nearly as much data for the US during this time period, but it's still worth looking at.

# Highest-10% and lowest-10% income shares for the USA
usainc = usa[['SI.DST.10TH.10', 'SI.DST.FRST.10']]

# Legend labels, in the same order as the columns selected above
usainc.columns = ['Highest 10% of Income Earners', 'Lowest 10% of Income Earners']

# Create the axes explicitly and hand them to pandas: DataFrame.plot() opens
# its own figure when no `ax` is given, which would leave a bare plt.figure()
# behind as an empty figure.
fig, ax = plt.subplots()
usainc.plot(ax=ax, title='Income Share in the USA', style='o-')
ax.set_xlabel('Year')
ax.set_ylabel('Share of National Income (%)')

# Income share held by each fifth of the US population, lowest to highest
usainc1 = usa[['SI.DST.FRST.20', 'SI.DST.02ND.20', 'SI.DST.03RD.20', 'SI.DST.04TH.20', 'SI.DST.05TH.20']]

# Legend labels, in the same order as the columns selected above
usainc1.columns = ['Lowest 20%', 'Second Lowest 20%', 'Middle 20%', 'Second Highest 20%', 'Highest 20%']

# Create the axes explicitly and hand them to pandas: DataFrame.plot() opens
# its own figure when no `ax` is given, which would leave a bare plt.figure()
# behind as an empty figure.
fig, ax = plt.subplots()
usainc1.plot(ax=ax, title='Income Share in the USA', style='o-')
ax.set_xlabel('Year')
ax.set_ylabel('Share of National Income (%)')

# Stacked bar graph of the US quintile shares.
# Explicit axes avoid the empty extra figure that plt.figure() followed by
# DataFrame.plot() produces. `stacked` is a real boolean rather than the
# string 'True', which only worked because any non-empty string is truthy.
fig, ax = plt.subplots()
usainc1.plot(ax=ax, title='Income Share in the USA', kind='bar', stacked=True)
ax.set_xlabel('Year')
ax.set_ylabel('Share of National Income (%)')

# So... we aren't much better (perhaps worse). Let's look at the Gross National Income per capita for each
# country to get a better look at the differences

# Download the NY.GNP.PCAP.CD indicator for Cambodia, 2004 through 2011, then
# drop the first index level and reverse the rows as with the earlier tables.
khmgni = wb.download(indicator=['NY.GNP.PCAP.CD'], country='KHM', start=2004, end=2011)
khmgni.index = khmgni.index.droplevel(0)
khmgni = khmgni.iloc[::-1]

qgrid.show_grid(khmgni, remote_js=True)

# Create the axes explicitly and hand them to pandas: DataFrame.plot() opens
# its own figure when no `ax` is given, which would leave a bare plt.figure()
# behind as an empty figure.
fig, ax = plt.subplots()
khmgni.plot(ax=ax, title='Gross National Income per Capita in Cambodia', style='o-')
ax.set_xlabel('Year')
ax.set_ylabel('Income ($)')

# Download the NY.GNP.PCAP.CD indicator for the USA, 2004 through 2011, then
# drop the first index level and reverse the rows, mirroring the Cambodia table.
usagni = wb.download(indicator=['NY.GNP.PCAP.CD'], country='USA', start=2004, end=2011)
usagni.index = usagni.index.droplevel(0)
usagni = usagni.iloc[::-1]

qgrid.show_grid(usagni, remote_js=True)

# Create the axes explicitly and hand them to pandas: DataFrame.plot() opens
# its own figure when no `ax` is given, which would leave a bare plt.figure()
# behind as an empty figure.
fig, ax = plt.subplots()
usagni.plot(ax=ax, title='Gross National Income per Capita in the USA', style='o-')
ax.set_xlabel('Year')
ax.set_ylabel('Income ($)')

# Explore how close people are to the poverty boundary and how it is changing over time

povline = khm[['SI.POV.25DAY', 'SI.POV.2DAY', 'SI.POV.4DAY', 'SI.POV.5DAY', 'SI.POV.DDAY', 'SI.POV.NAHC']]
# Swap the indicator ids for their names from povdict. A list comprehension
# is used because assigning into a range object by index raises TypeError on
# Python 3.
newcolumns = [povdict[code] for code in povline.columns]
povline.columns = newcolumns

povline

# Change titles of columns for plotting (same order as the selection above), then plot
povline.columns = ['<$2.50 a day', '<$2 a day', '<$4 a day', '<$5 a day', '<$1.25 a day', '<National poverty lines']

# Create the axes explicitly and hand them to pandas: DataFrame.plot() opens
# its own figure when no `ax` is given, which would leave a bare plt.figure()
# behind as an empty figure.
fig, ax = plt.subplots()
povline.plot(ax=ax, title='Poverty Headcount Ratio at Different Incomes in Cambodia', style='o-')
ax.set_xlabel('Year')
ax.set_ylabel('% of population')

# The World Bank does not have most of this data for the USA, so I will find US Census data later.