import pandas as pd population = pd.read_excel("sp.pop.totl_Indicator_en_excel_v2.xls", "Data", header=2, index_col="Country Name") publications = pd.read_excel("IP.JRN.ARTC.SC_Indicator_en_excel_v2.xls", "Data", header=2, index_col="Country Name") metac = pd.read_excel("IP.JRN.ARTC.SC_Indicator_en_excel_v2.xls", "Metadata - Countries", header=0, index_col="Country Name") okCountries = metac['Region'].dropna().index; # as oposed to regions publications = publications.loc[okCountries] population = population.loc[okCountries] pubs_per_cap = (publications['2009'] / population['2009']).dropna() # there is no data for publications for 2010, 2011, 2012 or 2013 pubs_per_cap.sort(ascending=False) pubs_per_cap vals = pubs_per_cap.values pops = population['2009'][pubs_per_cap.index].values / 1000000 poss = np.concatenate([np.array([0.]), np.cumsum(pops)]) countries = [name for name in pubs_per_cap.index] # very dirty, I know... n = 50 # dark magic for colors: colors = [get_cmap("Set1")((hash(name)/1234567.) % 1) for name in pubs_per_cap.index[:n]] # or alternatively something like: # https://gist.github.com/endolith/2719900#file_accent.py # colors = [get_cmap("Paired")(rand()) for i in range(n)] figure(num=None, figsize=(14, 8), dpi=80, facecolor='w', edgecolor='k') rects = bar(poss[:n], vals[:n], width=pops[:n], linewidth=0, color=colors) for i, rect in enumerate(rects): height = rect.get_height() text(rect.get_x()+rect.get_width()/2., 1.05*height, countries[i], ha='left', va='bottom', rotation=45) xlabel("Population [in millions]") ylabel("Publications per capita per year (2009)") show()