# This is my notebook for exploring data about economic inequality in Cambodia.

# `%matplotlib inline` is an IPython magic, not Python syntax. Calling it through
# get_ipython() keeps the inline plots inside Jupyter while letting this file
# still parse (and run) under a plain Python interpreter.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    # Not running under IPython/Jupyter, so there is no magic to apply.
    pass

import pandas as pd
import matplotlib as mp
import matplotlib.pyplot as plt
import qgrid
from pylab import *  # NOTE(review): star import kept as-is; later cells may rely on pylab names.
import seaborn as sb

# Hey, good news! We can remotely access the World Bank's World Development Indicators Database
# directly from pandas!
# NOTE(review): pandas.io.wb was removed from pandas (0.19+) in favour of
# pandas_datareader.wb -- confirm which pandas version this notebook is pinned to.
from pandas.io import wb

# First, search the database for all poverty-related indicator names and store them.
# I didn't use qgrid because it wouldn't display the id column correctly for some reason.
# It'd look nicer if it worked, though.
pov = wb.search('pov.*%').iloc[:, :2]
pov

# Strip the English labels from the id's and store them in a separate table.
# NOTE(review): 7529:7550 are hard-coded row labels of the search result; presumably
# they matched the wanted indicators when this was written -- verify against the
# current search output before trusting the selection.
povnames = pov.loc[7529:7550, 'name'].tolist()

# Keep only the id's in the original pov table
pov = pov.loc[7529:7550, 'id'].tolist()

# Take a look
povnames
pov

# Create a dictionary of the names and id's
povdict = dict(zip(pov, povnames))
povdict

# Now, look for all income related indicators and store them
inc = wb.search('income.*share.*%').iloc[:, :2]
inc

# Repeat what was done with the poverty indicators
incnames = inc.loc[:, 'name'].tolist()
inc = inc.loc[:, 'id'].tolist()
incnames
inc

# Create another dictionary for income
incdict = dict(zip(inc, incnames))
incdict

# Create master list of all of the data we want to download:
idx = pov + inc
idx

# Download data and store it as a DataFrame
khm = wb.download(indicator=idx, country='KHM', start=2004, end=2012)
khm

# Reverse the order of the DataFrame so the years are ascending, drop Cambodia index,
# drop categories with all NA's
khm.index = khm.index.droplevel(0)
khm = khm.iloc[::-1]
khm = khm.dropna(axis=1, how='all')
qgrid.show_grid(khm, remote_js=True)

# Let's look at the data of percent of total income earned from the highest 10% next to that of
# the lowest 10%
incframe = khm[['SI.DST.10TH.10', 'SI.DST.FRST.10']]
incframe = incframe[0:8]  # No data for 2012, so
# let's omit it

# Label both columns with their World Bank indicator names from incdict.
incframe.columns = [incdict[indicator] for indicator in incframe.columns]
incframe

incframe.columns = ['Highest 10% of Income Earners', 'Lowest 10% of Income Earners']
# DataFrame.plot() opens its own figure when no `ax` is passed, so a preceding
# plt.figure() would only leave an empty figure behind; it is omitted here.
incframe.plot(title='Income Share in Cambodia', style='o-')
plt.xlabel('Year')
plt.ylabel('Share of National Income (%)')

incframe1 = khm[['SI.DST.FRST.20', 'SI.DST.02ND.20', 'SI.DST.03RD.20',
                 'SI.DST.04TH.20', 'SI.DST.05TH.20']]

# Look up ids in dictionary to rename the columns.
# Built as a list: item assignment into range() raises TypeError on Python 3.
newcolumns = [incdict[indicator] for indicator in incframe1.columns]
incframe1.columns = newcolumns
incframe1 = incframe1[0:8]  # Omit 2012, no data
incframe1

# Change column names
incframe1.columns = ['Lowest 20%', 'Second Lowest 20%', 'Middle 20%',
                     'Second Highest 20%', 'Highest 20%']
incframe1.plot(title='Income Share in Cambodia', style='o-')
plt.xlabel('Year')
plt.ylabel('Share of National Income (%)')
plt.ylim([0, 50])

# Stacked bar graph of above dataframe
# `stacked` takes a boolean; the string 'True' only worked because it is truthy.
incframe1.plot(title='Income Share in Cambodia', kind='bar', stacked=True)
plt.xlabel('Year')
plt.ylabel('Share of National Income (%)')

# How does this compare to the US?
usa = wb.download(
    indicator=['SI.DST.10TH.10', 'SI.DST.FRST.10', 'SI.DST.FRST.20', 'SI.DST.02ND.20',
               'SI.DST.03RD.20', 'SI.DST.04TH.20', 'SI.DST.05TH.20'],
    country='USA', start=2004, end=2011)
# Drop the country index level and reverse the rows, as was done for Cambodia.
usa.index = usa.index.droplevel(0)
usa = usa.iloc[::-1]
qgrid.show_grid(usa, remote_js=True)

# So there isn't nearly as much data for the US during this time period, but it's still worth looking at.
usainc = usa[['SI.DST.10TH.10', 'SI.DST.FRST.10']]
usainc.columns = ['Highest 10% of Income Earners', 'Lowest 10% of Income Earners']
# DataFrame.plot() opens its own figure when no `ax` is passed, so a preceding
# plt.figure() would only leave an empty figure behind; it is omitted throughout.
usainc.plot(title='Income Share in the USA', style='o-')
plt.xlabel('Year')
plt.ylabel('Share of National Income (%)')

usainc1 = usa[['SI.DST.FRST.20', 'SI.DST.02ND.20', 'SI.DST.03RD.20',
               'SI.DST.04TH.20', 'SI.DST.05TH.20']]
usainc1.columns = ['Lowest 20%', 'Second Lowest 20%', 'Middle 20%',
                   'Second Highest 20%', 'Highest 20%']
usainc1.plot(title='Income Share in the USA', style='o-')
plt.xlabel('Year')
plt.ylabel('Share of National Income (%)')

# `stacked` takes a boolean; the string 'True' only worked because it is truthy.
usainc1.plot(title='Income Share in the USA', kind='bar', stacked=True)
plt.xlabel('Year')
plt.ylabel('Share of National Income (%)')

# So... we aren't much better (perhaps worse). Let's look at the Gross National Income per capita for each
# country to get a better look at the differences
khmgni = wb.download(indicator=['NY.GNP.PCAP.CD'], country='KHM', start=2004, end=2011)
# Drop the country index level and reverse the rows.
khmgni.index = khmgni.index.droplevel(0)
khmgni = khmgni.iloc[::-1]
qgrid.show_grid(khmgni, remote_js=True)

khmgni.plot(title='Gross National Income per Capita in Cambodia', style='o-')
plt.xlabel('Year')
plt.ylabel('Income ($)')

usagni = wb.download(indicator=['NY.GNP.PCAP.CD'], country='USA', start=2004, end=2011)
usagni.index = usagni.index.droplevel(0)
usagni = usagni.iloc[::-1]
qgrid.show_grid(usagni, remote_js=True)

usagni.plot(title='Gross National Income per Capita in the USA', style='o-')
plt.xlabel('Year')
plt.ylabel('Income ($)')

# Explore how close people are to the poverty boundary and how it is changing over time
povline = khm[['SI.POV.25DAY', 'SI.POV.2DAY', 'SI.POV.4DAY', 'SI.POV.5DAY',
               'SI.POV.DDAY', 'SI.POV.NAHC']]
# Rename the columns via the id -> name dictionary.
# Built as a list: item assignment into range() raises TypeError on Python 3.
newcolumns = [povdict[indicator] for indicator in povline.columns]
povline.columns = newcolumns
povline

# Change titles of columns for
plotting, then plot povline.columns = ['<$2.50 a day', '<$2 a day', '<$4 a day', '<$5 a day', '<$1.25 a day', '