Quick and dirty code to address Research publications per capita? - Academia Stack Exchange.
import pandas as pd
population = pd.read_excel("sp.pop.totl_Indicator_en_excel_v2.xls", "Data",
header=2, index_col="Country Name")
publications = pd.read_excel("IP.JRN.ARTC.SC_Indicator_en_excel_v2.xls", "Data",
header=2, index_col="Country Name")
metac = pd.read_excel("IP.JRN.ARTC.SC_Indicator_en_excel_v2.xls", "Metadata - Countries",
header=0, index_col="Country Name")
okCountries = metac['Region'].dropna().index;
# as oposed to regions
publications = publications.loc[okCountries]
population = population.loc[okCountries]
pubs_per_cap = (publications['2009'] / population['2009']).dropna()
# there is no data for publications for 2010, 2011, 2012 or 2013
pubs_per_cap.sort(ascending=False)
pubs_per_cap
Country Name Switzerland 0.001223 Sweden 0.001019 Denmark 0.000961 Finland 0.000927 Norway 0.000919 Netherlands 0.000899 Australia 0.000869 Canada 0.000860 Israel 0.000842 Singapore 0.000839 Iceland 0.000814 New Zealand 0.000739 United Kingdom 0.000739 United States 0.000680 Belgium 0.000669 ... Korea, Dem. Rep. 3.405566e-07 Angola 3.275804e-07 Guinea 3.209592e-07 Congo, Dem. Rep. 3.157741e-07 Burundi 2.912615e-07 Chad 2.022631e-07 Myanmar 1.979027e-07 Turkmenistan 1.004225e-07 Somalia 7.462007e-08 Liberia 0.000000e+00 St. Lucia 0.000000e+00 Tuvalu 0.000000e+00 Antigua and Barbuda 0.000000e+00 Samoa 0.000000e+00 Kiribati 0.000000e+00 Name: 2009, Length: 188, dtype: float64
vals = pubs_per_cap.values
pops = population['2009'][pubs_per_cap.index].values / 1000000
poss = np.concatenate([np.array([0.]), np.cumsum(pops)])
countries = [name for name in pubs_per_cap.index]
# very dirty, I know...
n = 50
# dark magic for colors:
colors = [get_cmap("Set1")((hash(name)/1234567.) % 1) for name in pubs_per_cap.index[:n]]
# or alternatively something like:
# https://gist.github.com/endolith/2719900#file_accent.py
# colors = [get_cmap("Paired")(rand()) for i in range(n)]
figure(num=None, figsize=(14, 8), dpi=80, facecolor='w', edgecolor='k')
rects = bar(poss[:n], vals[:n], width=pops[:n],
linewidth=0, color=colors)
for i, rect in enumerate(rects):
height = rect.get_height()
text(rect.get_x()+rect.get_width()/2., 1.05*height, countries[i],
ha='left', va='bottom', rotation=45)
xlabel("Population [in millions]")
ylabel("Publications per capita per year (2009)")
show()