import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from IPython.display import HTML
%matplotlib inline
# Load the GDP table from CSV. CSV columns 1 and 2 become a two-level row
# index, and the literal '..' used in the file for missing values is parsed
# as NaN.
df = pd.read_csv('../data/gdp_wordbank.csv',
                 index_col=[1, 2], na_values=['..'])
# Sort the MultiIndex so that the label-range slices below are valid.
# sort_index() replaces DataFrame.sortlevel(), which newer pandas removed.
df.sort_index(inplace=True)
# Preview: rows whose outer index label lies in the KD..KN range and whose
# inner label lies in the Andorra..Angola range, first five columns only.
# .iloc replaces the removed .ix indexer (the column slice is positional).
HTML(df.loc[(slice('NY.GDP.PCAP.KD', 'NY.GDP.PCAP.KN'),
             slice('Andorra', 'Angola')), :]
     .iloc[:, :5]
     .to_html())
Series Name | Country Code | 2004 [YR2004] | 2005 [YR2005] | 2006 [YR2006] | ||
---|---|---|---|---|---|---|
Series Code | Country Name | |||||
NY.GDP.PCAP.KD | Andorra | GDP per capita (constant 2005 US$) | ADO | 30329.589913 | 31268.966745 | 33125.386792 |
Angola | GDP per capita (constant 2005 US$) | AGO | 1494.296347 | 1706.543616 | 1990.839161 | |
NY.GDP.PCAP.KD.ZG | Andorra | GDP per capita growth (annual %) | ADO | 1.882501 | 3.097229 | 5.936941 |
Angola | GDP per capita growth (annual %) | AGO | 7.023286 | 14.203827 | 16.659143 | |
NY.GDP.PCAP.KN | Andorra | GDP per capita (constant LCU) | ADO | 14338.168009 | 14782.253896 | 15659.867560 |
Angola | GDP per capita (constant LCU) | AGO | 39736.007970 | 45380.041829 | 52939.967995 |
# Keep only the rows of the 'NY.GDP.PCAP.KD' series. Selecting a single
# outer index label leaves a frame indexed by the remaining level.
# .loc replaces the .ix indexer, which newer pandas removed.
df_selected = df.loc[u'NY.GDP.PCAP.KD']
# log10 of the 2012 column, skipping rows with no value for that year.
gdp2012 = np.log10(np.array(df_selected['2012 [YR2012]'].dropna(), dtype=np.float32))
# Distribution of the log-transformed values over 20 bins.
plt.hist(gdp2012, 20)
plt.xlabel('log GDP (US $)')
plt.ylabel('n/o countries')
<matplotlib.text.Text at 0x1111e9c10>
# log10 of the 2005 column, skipping rows with no value for that year.
values_2005 = df_selected['2005 [YR2005]'].dropna()
gdp2005 = np.log10(np.array(values_2005, dtype=np.float32))

# Dashed black line joining the two sample medians, drawn at x = 1 and
# x = 2 so that it lines up with the two boxes added next.
median_by_year = [np.median(gdp2005), np.median(gdp2012)]
plt.plot([1, 2], median_by_year, 'k--')

# Notched boxplot of both years side by side.
elements = plt.boxplot([gdp2005, gdp2012], notch=True, labels=['2005', '2012'])
plt.ylabel('log GDP per capita (US $)')

# Draw medians, whiskers and boxes in black; whiskers get a solid line.
black = dict(color='k')
whisker_style = dict(black, ls='solid')
plt.setp(elements['medians'], **black)
plt.setp(elements['whiskers'], **whisker_style)
plt.setp(elements['boxes'], **black)
[None, None]
# Overlay step histograms of both years on one shared set of bin edges.
# The edges span the combined range of the two samples; using the 2012
# range alone would silently drop any 2005 value lying outside it.
bins = np.linspace(min(gdp2005.min(), gdp2012.min()),
                   max(gdp2005.max(), gdp2012.max()), 30)
n2012, bins2012, patches = plt.hist(gdp2012, bins, histtype='step',
                                    label='2012')
n2005, bins2005, patches = plt.hist(gdp2005, bins, histtype='step',
                                    label='2005')
# The label= arguments above are only displayed once a legend is drawn.
plt.legend()
plt.xlabel('log GDP (US $)')
plt.ylabel('n/o countries')
<matplotlib.text.Text at 0x11128d190>
from scipy import stats
from matplotlib import transforms
# Gaussian kernel density estimates of the two log-GDP samples.
# gaussian_kde is taken from scipy.stats directly; the stats.kde
# sub-namespace is deprecated in newer SciPy.
kde2005 = stats.gaussian_kde(gdp2005)
kde2012 = stats.gaussian_kde(gdp2012)
# Evaluation grid for the density curves (log10 units, 40 points).
x = np.linspace(1, 6, 40)
ax = plt.subplot(111)
# Blended transform: x in data coordinates, y in axes-fraction coordinates,
# so the tick marks below keep a fixed height whatever the y-axis scale is.
trans = transforms.blended_transform_factory(ax.transData, ax.transAxes)
# Curve colours are set explicitly so each curve matches its rug marks.
l1, = plt.plot(x, kde2005(x), color='b')
l2, = plt.plot(x, kde2012(x), color='r')
plt.legend([l1, l2], ['2005', '2012'], frameon=False)
# Rug marks, one per observation: 2012 in the bottom 3% of the axes height,
# 2005 in the band just above it.
plt.vlines(gdp2012, 0.0, 0.03, color='r', lw=0.2,
           transform=trans)
plt.vlines(gdp2005, 0.03, 0.06, color='b', lw=0.2,
           transform=trans)
plt.xlabel('log GDP per capita (US $)')
plt.ylabel('prob. density')
# Attribution text placed in axes-fraction coordinates (top-left corner).
plt.text(0.02, 0.92, "Data source: World Bank", transform=ax.transAxes)
<matplotlib.text.Text at 0x11357a6d0>
# Two-sample Kolmogorov-Smirnov test comparing the 2005 and 2012 samples.
# ks_2samp returns (statistic, p-value), unpacked straight into the format
# string. The print() call form runs on both Python 2 and Python 3.
print("K-S stat: {:.2f}, p-value: {:.2f}".format(*stats.ks_2samp(gdp2005, gdp2012)))
K-S stat: 0.05, p-value: 0.98