"""Notebook-style exploration of the IATI registry: codelists, publishers,
organisation metadata and dataset search."""
import json

import requests
import pandas as pd
from bs4 import BeautifulSoup

# IATI Codelists
iati_country = 'http://iatistandard.org/codelists/downloads/clv1/codelist/Country.csv'
iati_region = 'http://iatistandard.org/codelists/downloads/clv1/codelist/Region.csv'
iati_sector = 'http://iatistandard.org/codelists/downloads/clv1/codelist/Sector.csv'
iati_category = 'http://iatistandard.org/codelists/downloads/clv1/codelist/SectorCategory.csv'
iati_orgtype = 'http://iatistandard.org/codelists/downloads/clv1/codelist/OrganisationType.csv'


def codesgetter(url):
    """Load an IATI codelist CSV from *url* and return it indexed by its 'code' column.

    Empty columns are dropped; codes are coerced to strings because the
    codelists mix numeric and alphabetic codes.
    """
    df = pd.read_csv(url).dropna(axis=1, how='all')
    # Column assignment (not attribute assignment) is the safe pandas idiom.
    df['code'] = df['code'].astype('str')
    df.set_index(keys='code', inplace=True)
    return df


countries = codesgetter(iati_country)
countries[:3]
regions = codesgetter(iati_region)
regions[:3]
# I fell foul of looking up countries that were regions - generate a combined
# set and call them areas
areas = pd.concat([countries, regions])
sectors = codesgetter(iati_sector)
sectors[:3]
categories = codesgetter(iati_category)
categories[:3]
orgtypes = codesgetter(iati_orgtype)
orgtypes[:3]

# Grab the full list of organisation-file packages from the registry API.
url = 'http://iatiregistry.org/api/3/action/package_search?q=extras_filetype:organisation&rows=1000'
data = json.loads(requests.get(url).text)
len(data['result']['results'])

# Scrape the publisher table from the registry's HTML publisher page.
url = 'http://iatiregistry.org/publisher'
response = requests.get(url)
# Name the parser explicitly so behaviour doesn't depend on which parsers
# happen to be installed (and to silence BeautifulSoup's warning).
soup = BeautifulSoup(response.content, 'html.parser')
table = soup.find('tbody')
res = []
for row in table.findAll('tr'):
    # The publisher id is the tail of the link href; the cells hold the rest.
    tmp = [row.find('a').attrs['href'].replace('/publisher/', '')]
    for cell in row.findAll('td'):
        tmp.append(cell.text)
    res.append(tmp)
publishers = pd.DataFrame(res, columns=['ID', 'Name', 'OrgType', 'Country', 'Datasets'])
publishers[:3]


# Look up org details by code/id on IATI Register
def iati_org_info(k):
    """Fetch and flatten publisher metadata for registry group id *k*.

    Returns a dict with title/display_name/id plus any publisher_* extras,
    annotated with human-readable org type and country names where possible.
    """
    r = json.loads(requests.get('http://iatiregistry.org/api/1/rest/group/' + k).text)
    d = {}
    for x in ['title', 'display_name']:
        d[x] = r[x]
    d['id'] = r['name']
    # In the v1 REST API, 'extras' is a mapping of key -> value.
    for e in r['extras']:
        if e in ['publisher_organization_type', 'publisher_country', 'publisher_iati_id']:
            d[e] = r['extras'][e]
    if 'publisher_organization_type' in d:
        d['orgtype'] = orgtypes.loc[d['publisher_organization_type']]['name']
    # Guard against empty country codes, which would KeyError in areas.loc.
    if 'publisher_country' in d and d['publisher_country'] != '':
        d['pub_country'] = areas.loc[d['publisher_country']]['name']
    return d


iati_org_info('aauk')


# Trying this...
# It makes a few hundred requests to the API, so if we run this we may as well
# store the result. We can then use the stored data in any future investigations.
def scrapeIATIorgData():
    """Fetch metadata for every registry group and cache it to iati_orgdata.csv."""
    dx = []
    # Get a list of groups
    groups = json.loads(requests.get('http://iatiregistry.org/api/1/rest/group').text)
    for group in groups:
        dx.append(iati_org_info(group))
    admindata = pd.DataFrame(dx)
    admindata.to_csv('iati_orgdata.csv', index=False)


# scrapeIATIorgData()
admindata = pd.read_csv('iati_orgdata.csv')
admindata[:3]


def lookupOrg(name):
    """Return cached org rows whose title contains *name* (case-insensitive)."""
    return admindata[admindata['title'].str.contains(name, case=False)]


lookupOrg('oxfam')


# These routines parse results of searches on the API
def iati_org_annotate(d, lang='en'):
    """Add human-readable org-type and country names to a flattened org dict."""
    if 'publisher_organization_type' in d:
        d['orgtype'] = orgtypes.loc[d['publisher_organization_type']]['name']
    # Match iati_org_info: skip empty country codes rather than KeyError-ing.
    if 'publisher_country' in d and d['publisher_country'] != '':
        d['pub_country'] = areas.loc[d['publisher_country']]['name']
    return d


def iati_org_core(r):
    """Flatten a v3 package_search result into a dict of core org fields."""
    d = {}
    d['item'] = r['title']
    d['title'] = r['organization']['title']
    d['id'] = r['organization']['name']
    # In the v3 API, 'extras' is a list of {'key': ..., 'value': ...} dicts.
    for e in r['extras']:
        if e['key'] in ['publisher_organization_type', 'publisher_country', 'publisher_iati_id']:
            d[e['key']] = e['value']
    return d


def iati_org(r):
    """Flatten and annotate a single package_search result."""
    return iati_org_annotate(iati_org_core(r))


iati_org(data['result']['results'][0])


# Really scruffy search
def searchIATIdatasets(q):
    """Search the registry for activity files matching *q*; return a DataFrame.

    Each row carries the flattened org fields plus the first resource URL.
    """
    url = 'http://iatiregistry.org/api/3/action/package_search'
    params = {'rows': 100, 'q': ' '.join([q, 'extras_filetype:activity'])}
    response = requests.get(url, params=params)
    data = json.loads(response.text)
    results = []
    for result in data['result']['results']:
        tmp = iati_org(result)
        tmp['url'] = result['resources'][0]['url']
        results.append(tmp)
    return pd.DataFrame(results)


resp = searchIATIdatasets('bangladesh')
resp[:4]

# Try to load in a data file identified from the previous search
# Pass in the index val from the previous query
from io import StringIO


def grabIATIdata(df, ix):
    """Download and parse the activity CSV behind row *ix* of a search-result frame.

    df: DataFrame as returned by searchIATIdatasets (needs 'item' and 'url' columns).
    ix: index label of the row whose dataset should be fetched.
    Returns the parsed activity data as a new DataFrame.
    """
    print("Data for:", df.loc[ix]['item'])
    # aidinfolabs converts the registry's activity XML into a simple CSV.
    url = 'http://tools.aidinfolabs.org/csv/direct_from_registry/'
    params = {
        'xml': df.loc[ix]['url'],
        'download': 'true',
        'id': 'true',
        'format': 'simple',
    }
    response = requests.get(url, params=params)
    # Use a fresh name rather than clobbering the *df* argument.
    activities = pd.read_csv(StringIO(response.text))  # , skipfooter=2, engine='python'
    return activities


# This will grab the fourth result (index 3) - CAFOD Activity File for
# Bangladesh - and display the data
ex = grabIATIdata(resp, 3)
ex[:3]
# Here's a peek at the full range of columns available in the dataframe
ex.columns