import pandas as pd

population = pd.read_excel("sp.pop.totl_Indicator_en_excel_v2.xls", "Data",
                           header=2, index_col="Country Name")
publications = pd.read_excel("IP.JRN.ARTC.SC_Indicator_en_excel_v2.xls", "Data",
                             header=2, index_col="Country Name")
metac = pd.read_excel("IP.JRN.ARTC.SC_Indicator_en_excel_v2.xls", "Metadata - Countries",
                       header=0, index_col="Country Name")

okCountries = metac['Region'].dropna().index;
# as oposed to regions

publications = publications.loc[okCountries]
population = population.loc[okCountries]

pubs_per_cap = (publications['2009'] / population['2009']).dropna()
# there is no data for publications for 2010, 2011, 2012 or 2013

pubs_per_cap.sort(ascending=False)

pubs_per_cap

vals = pubs_per_cap.values
pops = population['2009'][pubs_per_cap.index].values / 1000000
poss = np.concatenate([np.array([0.]), np.cumsum(pops)])
countries = [name for name in pubs_per_cap.index]
# very dirty, I know...

n = 50

# dark magic for colors:
colors = [get_cmap("Set1")((hash(name)/1234567.) % 1) for name in pubs_per_cap.index[:n]]
# or alternatively something like:
# https://gist.github.com/endolith/2719900#file_accent.py
# colors = [get_cmap("Paired")(rand()) for i in range(n)]

figure(num=None, figsize=(14, 8), dpi=80, facecolor='w', edgecolor='k')
rects = bar(poss[:n], vals[:n], width=pops[:n],
    linewidth=0, color=colors)
for i, rect in enumerate(rects):
    height = rect.get_height()
    text(rect.get_x()+rect.get_width()/2., 1.05*height, countries[i],
         ha='left', va='bottom', rotation=45)
xlabel("Population [in millions]")
ylabel("Publications per capita per year (2009)")
show()