#!/usr/bin/env python
# coding: utf-8

# Get some data...
#
# Notebook export: downloads the ROARMAP policy records for each policymaker
# type, combines them into one table, adds country names scraped from the
# ROARMAP advanced-search form, saves the table as CSV and prints a few
# grouped counts. The get_ipython() calls mean it must be run under IPython.

# In[1]:


# JSON export URLs, one per ROARMAP policymaker type.
funder='http://roarmap.eprints.org/cgi/exportview/policymaker_type/funder/JSON/funder.js'
funderAndResearch='http://roarmap.eprints.org/cgi/exportview/policymaker_type/funder=5Fand=5Fresearch=5Forg/JSON/funder=5Fand=5Fresearch=5Forg.js'
multi_res_orgs='http://roarmap.eprints.org/cgi/exportview/policymaker_type/multiple=5Fresearch=5Forgs/JSON/multiple=5Fresearch=5Forgs.js'
res_org='http://roarmap.eprints.org/cgi/exportview/policymaker_type/research=5Forg/JSON/research=5Forg.js'
sub_unit='http://roarmap.eprints.org/cgi/exportview/policymaker_type/research=5Forg=5Fsubunit/JSON/research=5Forg=5Fsubunit.js'


# ### Get Country Codes
#
# We need to install a library to help scrape the country codes.

# In[8]:


# Must run before the bs4 import below.
get_ipython().system('pip install beautifulsoup4')


# In[2]:


import pandas as pd
import requests
#use seaborn for prettier charts
# NOTE(review): recent seaborn releases no longer restyle matplotlib on
# import; call sns.set() if the seaborn look is wanted - confirm.
import seaborn as sns
from bs4 import BeautifulSoup


# In[3]:


#Need this for charting
get_ipython().run_line_magic('matplotlib', 'inline')


# ### Grab the ROAR data down
#
# One way is to go in by org type (not sure if there is a single list?)
# and then combine the results into a single data table.

# In[6]:


funder_df=pd.read_json(funder)
funderAndResearch_df=pd.read_json(funderAndResearch)
multi_res_orgs_df=pd.read_json(multi_res_orgs)
res_org_df=pd.read_json(res_org)
sub_unit_df=pd.read_json(sub_unit)


# In[7]:


df=pd.concat([funder_df,funderAndResearch_df,multi_res_orgs_df,res_org_df,sub_unit_df])
#How many records do we have?
len(df)


# In[8]:


#Preview top few rows
df.head()


# In[4]:


#Countries have a code rather than country name associated with them...

#Grab web page containing country codes
# A timeout stops the script hanging on a dead connection, and
# raise_for_status() fails loudly instead of parsing an HTTP error page.
response=requests.get('http://roarmap.eprints.org/cgi/search/advanced',timeout=30)
response.raise_for_status()
# Name the parser explicitly so the result does not depend on which
# optional parsers happen to be installed (and to avoid the bs4 warning).
soup=BeautifulSoup(response.content,'html.parser')
countries=soup.find('select',id='country').find_all('option')

#generate lookup from country codes to country names
countryList={}      # option value as given            -> country name
countryListZKey={}  # option value, leading zeros cut  -> country name
for option in countries:
    code=option['value']
    name=option.text.strip('.')  # drop leading/trailing dots from the label
    countryList[code]=name
    countryListZKey[code.lstrip('0')]=name

#Add in country names from country codes
# Each value is looked up by its str() form in the zero-stripped table;
# a code missing from the scraped list raises KeyError.
df['country2']=df['country'].apply(lambda x: countryListZKey[str(x)])


# In[25]:


#Save data as a csv file
df.to_csv('roardata.csv')

#What I'd be tempted to do is load that data into RStudio and build a shiny app around it...
#Tutorial: http://shiny.rstudio.com/tutorial/


# ### Things by Country
#

# Series.order() was removed from pandas; sort_values() is the replacement.

# In[10]:


df.groupby('country2').size().sort_values(ascending=False)


# In[11]:


df.groupby('rights_holding').size().sort_values(ascending=False)


# In[12]:


df.columns


# In[13]:


df.groupby('policy_colour').size().sort_values(ascending=False)


# In[14]:


df.groupby(['policy_colour','country2']).size().sort_values(ascending=False)


# In[23]:


# Horizontal bar chart of record counts per rights_holding value.
fig=df.groupby('rights_holding').size().sort_values(ascending=False).plot(kind='barh')
fig


# In[ ]: