"""Explore the distribution of GDP per capita in 2005 and 2012.

Notebook-style script: loads the GDP table from CSV, then draws a histogram,
a notched box plot, overlaid step histograms and kernel density estimates of
log10 GDP per capita, and finally prints a two-sample Kolmogorov-Smirnov
test comparing the two years.
"""
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import HTML
from matplotlib import transforms
from scipy import stats

# `%matplotlib inline` is IPython-only syntax; call the magic through the
# shell object so the file is still valid Python when run as a plain script.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    # Not running under IPython: keep matplotlib's default backend.
    pass


def _log10_column(frame, column):
    """Return log10 of the non-missing values of *column* as a float32 array."""
    values = np.array(frame[column].dropna(), dtype=np.float32)
    return np.log10(values)


# --- Load data ---------------------------------------------------------------
# CSV columns 1 and 2 form a two-level index; '..' marks missing values.
# NOTE(review): the levels look like (series code, country name) -- confirm
# against the CSV header.
df = pd.read_csv('../data/gdp_wordbank.csv', index_col=[1, 2], na_values=['..'])
# Label-based slicing on a MultiIndex requires a sorted index.
df.sort_index(inplace=True)

# Preview: a small slice of both index levels, first five columns only.
# As a bare expression this only renders when it ends a notebook cell.
preview = df.loc[(slice('NY.GDP.PCAP.KD', 'NY.GDP.PCAP.KN'),
                  slice('Andorra', 'Angola')), :].iloc[:, :5]
HTML(preview.to_html())

# Keep only the rows whose first index level is 'NY.GDP.PCAP.KD'.
df_selected = df.loc[u'NY.GDP.PCAP.KD']
gdp2012 = _log10_column(df_selected, '2012 [YR2012]')
gdp2005 = _log10_column(df_selected, '2005 [YR2005]')

# Each plot below gets its own figure; without this, running the file as one
# script would draw everything on top of the first axes.

# --- Histogram of 2012 values ------------------------------------------------
plt.figure()
plt.hist(gdp2012, 20)
plt.xlabel('log GDP (US $)')
plt.ylabel('n/o countries')

# --- Box plot, 2005 vs 2012 --------------------------------------------------
plt.figure()
# Dashed line joining the two medians (boxes sit at x=1 and x=2).
plt.plot([1, 2], [np.median(gdp2005), np.median(gdp2012)], 'k--')
elements = plt.boxplot([gdp2005, gdp2012], labels=['2005', '2012'], notch=True)
plt.ylabel('log GDP per capita (US $)')
# Draw every box element in solid black.
plt.setp(elements['medians'], color='k')
plt.setp(elements['whiskers'], color='k', ls='solid')
plt.setp(elements['boxes'], color='k')

# --- Step histograms on shared bins ------------------------------------------
plt.figure()
# Bin edges span the 2012 range and are reused for 2005 so the outlines are
# directly comparable.
bins = np.linspace(gdp2012.min(), gdp2012.max(), 30)
n2012, bins2012, patches = plt.hist(gdp2012, bins, histtype='step', label='2012')
n2005, bins2005, patches = plt.hist(gdp2005, bins, histtype='step', label='2005')
plt.xlabel('log GDP (US $)')
plt.ylabel('n/o countries')

# --- Kernel density estimates with rug marks ---------------------------------
plt.figure()
kde2005 = stats.gaussian_kde(gdp2005)
kde2012 = stats.gaussian_kde(gdp2012)
x = np.linspace(1, 6, 40)

ax = plt.subplot(111)
# x in data coordinates, y as a fraction of the axes height: lets the rug
# marks hug the bottom of the plot regardless of the density scale.
trans = transforms.blended_transform_factory(ax.transData, ax.transAxes)

l1, = plt.plot(x, kde2005(x))
l2, = plt.plot(x, kde2012(x), color='r')
plt.legend([l1, l2], ['2005', '2012'], frameon=False)
# One tick per observation: 2012 in the bottom 3% of the axes, 2005 above it.
plt.vlines(gdp2012, 0.0, 0.03, color='r', lw=0.2, transform=trans)
plt.vlines(gdp2005, 0.03, 0.06, color='b', lw=0.2, transform=trans)
plt.xlabel('log GDP per capita (US $)')
plt.ylabel('prob. density')
plt.text(0.02, 0.92, "Data source: World Bank", transform=ax.transAxes)

# --- Two-sample Kolmogorov-Smirnov test --------------------------------------
# ks_2samp's result unpacks to (statistic, p-value).
print("K-S stat: {:.2f}, p-value: {:.2f}".format(*stats.ks_2samp(gdp2005, gdp2012)))