An attempt to get data from the IATI Data Registry http://iatiregistry.org/
import requests,json
import pandas as pd
The IATI Datastore makes use of several code lists. Let's see what they're like...
#IATI Codelists
#CSV download URLs for the IATI codelists we need: countries, regions,
#sectors, sector categories and organisation types
iati_country='http://iatistandard.org/codelists/downloads/clv1/codelist/Country.csv'
iati_region='http://iatistandard.org/codelists/downloads/clv1/codelist/Region.csv'
iati_sector='http://iatistandard.org/codelists/downloads/clv1/codelist/Sector.csv'
iati_category='http://iatistandard.org/codelists/downloads/clv1/codelist/SectorCategory.csv'
iati_orgtype='http://iatistandard.org/codelists/downloads/clv1/codelist/OrganisationType.csv'
def codesgetter(url):
    """Read an IATI codelist CSV and return a DataFrame indexed by code.

    Parameters:
        url: URL (or any pandas-readable source) of a codelist CSV that
             contains a 'code' column.

    Returns:
        DataFrame indexed on the 'code' column (coerced to string so that
        numeric and alphabetic codes can be looked up uniformly), with
        entirely-empty columns dropped.
    """
    df = pd.read_csv(url).dropna(axis=1, how='all')
    # Column assignment rather than attribute assignment (df.code=...):
    # attribute-style setting is ambiguous in pandas and can silently
    # create an attribute instead of updating the column.
    df['code'] = df['code'].astype(str)
    df.set_index(keys='code', inplace=True)
    return df
#Load the country codelist, indexed by country code
countries=codesgetter(iati_country)
#Preview the first three rows
countries[:3]
name | language | |
---|---|---|
code | ||
AF | AFGHANISTAN | en |
AX | ÅLAND ISLANDS | en |
AL | ALBANIA | en |
#Load the region codelist, indexed by numeric region code
regions=codesgetter(iati_region)
#Preview the first three rows
regions[:3]
name | language | |
---|---|---|
code | ||
89 | Europe, regional | en |
189 | North of Sahara, regional | en |
289 | South of Sahara, regional | en |
#I fell foul of looking up countries that were regions - generate a combined set and call them areas
#(a single lookup table keyed on either a country code or a region code)
areas=pd.concat([countries,regions])
#Load the sector codelist (includes category code/name/description columns)
sectors=codesgetter(iati_sector)
#Preview the first three rows
sectors[:3]
name | description | language | category | category-name | category-description | |
---|---|---|---|---|---|---|
code | ||||||
11110 | Education policy and administrative management | Education sector policy, planning and programm... | en | 111 | Education, level unspecified | Education sector policy, planning and programm... |
11120 | Education facilities and training | Educational buildings, equipment, materials; s... | en | 111 | Education, level unspecified | Education sector policy, planning and programm... |
11130 | Teacher training | Teacher education (where the level of educatio... | en | 111 | Education, level unspecified | Education sector policy, planning and programm... |
#Load the sector category codelist
categories=codesgetter(iati_category)
#Preview the first three rows
categories[:3]
name | description | language | |
---|---|---|---|
code | |||
111 | Education, level unspecified | Education sector policy, planning and programm... | en |
112 | Basic education | Formal and non-formal primary education for ch... | en |
113 | Secondary education | Second cycle systematic instruction at both ju... | en |
#Load the organisation type codelist (e.g. Government, International NGO)
orgtypes=codesgetter(iati_orgtype)
#Preview the first three rows
orgtypes[:3]
name | language | |
---|---|---|
code | ||
10 | Government | en |
15 | Other Public Sector | en |
21 | International NGO | en |
Now let's see if we can start to work with the API - can we pull down the organisation names?
#Search the registry API for packages with filetype "organisation" (up to 1000 rows)
url='http://iatiregistry.org/api/3/action/package_search?q=extras_filetype:organisation&rows=1000'
data=json.loads(requests.get(url).text)
#How many results did we get back?
len(data['result']['results'])
170
The list of organisations from the API (returns 170 organisations) is different to the list of organisations (270 of them) described as publishers here: http://iatiregistry.org/publisher ( http://iatiregistry.org/organization forwards to that URL).
I don't really understand the API structure at all - so a quick hack workaround is just make use of the effort developing the website and scrape that instead!
#Scrape the publisher listing from the registry website rather than the API
from bs4 import BeautifulSoup
url='http://iatiregistry.org/publisher'
response = requests.get(url)
soup=BeautifulSoup(response.content)
results={}
#NOTE(review): `results` above appears unused - confirm before removing
#Walk each row of the publishers table
table=soup.find('tbody')
res=[]
for row in table.findAll('tr'):
    #The publisher ID (slug) comes from the row's link href;
    #the remaining cells supply name, org type, country and dataset count
    tmp=[row.find('a').attrs['href'].replace('/publisher/','')]
    for cell in row.findAll('td'):
        tmp.append(cell.text)
    res.append(tmp)
publishers=pd.DataFrame(res,columns=['ID','Name','OrgType','Country','Datasets'])
#Preview the first three publishers
publishers[:3]
ID | Name | OrgType | Country | Datasets | |
---|---|---|---|---|---|
0 | aa | International HIV/AIDS Alliance | International NGO | United Kingdom | 2 |
1 | aai | ActionAid International | International NGO | Netherlands | 12 |
2 | aauk | ActionAid UK | International NGO | United Kingdom | 7 |
The following function looks up a few more details about an organisation from its ID.
#Look up org details by code/id on IATI Register
def iati_org_info(k):
    """Fetch summary details for one organisation from the registry.

    Pulls the group record for slug k from the v1 REST API and returns a
    dict of title/display_name/id, selected publisher extras, and - where
    the codes are present - human-readable orgtype and pub_country names
    looked up from the codelist tables.
    """
    raw = json.loads(requests.get('http://iatiregistry.org/api/1/rest/group/' + k).text)
    info = {field: raw[field] for field in ['title', 'display_name']}
    info['id'] = raw['name']
    wanted = ['publisher_organization_type', 'publisher_country', 'publisher_iati_id']
    for key in raw['extras']:
        if key in wanted:
            info[key] = raw['extras'][key]
    #Annotate codes with readable names from the codelists
    if 'publisher_organization_type' in info:
        info['orgtype'] = orgtypes.loc[info['publisher_organization_type']]['name']
    if 'publisher_country' in info and info['publisher_country'] != '':
        info['pub_country'] = areas.loc[info['publisher_country']]['name']
    return info
#Example: look up ActionAid UK by its registry slug
iati_org_info('aauk')
{'title': 'ActionAid UK', 'publisher_country': 'GB', 'publisher_iati_id': 'GB-CHC-274467', 'pub_country': 'UNITED KINGDOM', 'id': 'aauk', 'publisher_organization_type': '21', 'orgtype': 'International NGO', 'display_name': 'ActionAid UK'}
We probably should try to make API calls rather than resorting to webpage scrapers though! How about the following (not sure where I found the URL crib?! I don't know what the list of organisations actually refers to, either?!)
#Trying this...
#It makes a few hundred requests to the API, so if we run this we may as well store the result
# We can then use the stored data in any future investigations
def scrapeIATIorgData(fn='iati_orgdata.csv'):
    """Fetch details for every registry group and cache them to CSV.

    Parameters:
        fn: path of the CSV cache file (default 'iati_orgdata.csv',
            matching what the rest of this script reads back in).

    Returns:
        The assembled DataFrame (previously it was built and saved but
        then thrown away; returning it lets callers use it immediately).
    """
    dx = []
    #Get a list of groups
    groups = json.loads(requests.get('http://iatiregistry.org/api/1/rest/group').text)
    for group in groups:
        dx.append(iati_org_info(group))
    orgdata = pd.DataFrame(dx)
    orgdata.to_csv(fn, index=False)
    return orgdata
#scrapeIATIorgData()
#Read the cached organisation data back in (as saved by scrapeIATIorgData)
admindata=pd.read_csv('iati_orgdata.csv')
#Preview the first three rows
admindata[:3]
display_name | id | orgtype | pub_country | publisher_country | publisher_iati_id | publisher_organization_type | title | |
---|---|---|---|---|---|---|---|---|
0 | International HIV/AIDS Alliance | aa | International NGO | UNITED KINGDOM | GB | 21020 | 21 | International HIV/AIDS Alliance |
1 | ActionAid International | aai | International NGO | NETHERLANDS | NL | NL-KVK-27264198 | 21 | ActionAid International |
2 | ActionAid UK | aauk | International NGO | UNITED KINGDOM | GB | GB-CHC-274467 | 21 | ActionAid UK |
We can search with that list for particular organisations.
def lookupOrg(name, df=None):
    """Case-insensitive substring search for organisations by title.

    Parameters:
        name: substring to look for in the 'title' column.
        df:   optional DataFrame to search; defaults to the module-level
              admindata table loaded from the cached CSV.

    Returns:
        The matching rows. na=False treats missing titles as non-matches
        instead of producing NA values that break boolean indexing.
    """
    df = admindata if df is None else df
    return df[df['title'].str.contains(name, case=False, na=False)]
#Example: all organisations with "oxfam" in the title
lookupOrg('oxfam')
display_name | id | orgtype | pub_country | publisher_country | publisher_iati_id | publisher_organization_type | title | |
---|---|---|---|---|---|---|---|---|
196 | Oxfam Novib | onl | International NGO | NETHERLANDS | NL | NL-KvK-27108436 | 21 | Oxfam Novib |
199 | Oxfam GB | oxfamgb | International NGO | UNITED KINGDOM | GB | GB-CHC-202918 | 21 | Oxfam GB |
200 | Oxfam India | oxfamindia | National NGO | INDIA | IN | NaN | 22 | Oxfam India |
Now let's think about doing some actual dataset searches.
#These routines parse results of searches on the API
def iati_org_annotate(d, lang='en'):
    """Add human-readable orgtype and country names to result dict d.

    Looks up publisher_organization_type in the orgtypes codelist and
    publisher_country in the combined areas codelist, adding 'orgtype'
    and 'pub_country' keys. The lang parameter is currently unused.
    Returns d (which is modified in place).
    """
    if 'publisher_organization_type' in d:
        d['orgtype'] = orgtypes.loc[ d['publisher_organization_type'] ]['name']
    #Guard against empty country codes, consistent with iati_org_info -
    #an empty string would otherwise raise a KeyError in the codelist lookup
    if 'publisher_country' in d and d['publisher_country']!='':
        d['pub_country'] = areas.loc[ d['publisher_country'] ]['name']
    return d
def iati_org_core(r):
    """Extract the core organisation fields from one package-search result.

    Returns a dict with the package title ('item'), the organisation's
    title and registry name ('id'), plus any publisher extras of interest.
    """
    wanted = ('publisher_organization_type', 'publisher_country', 'publisher_iati_id')
    info = {'item': r['title'],
            'title': r['organization']['title'],
            'id': r['organization']['name']}
    info.update({extra['key']: extra['value']
                 for extra in r['extras'] if extra['key'] in wanted})
    return info
def iati_org(r):
    """Core fields plus human-readable annotations for one search result."""
    core = iati_org_core(r)
    return iati_org_annotate(core)
#Parse the first result of the earlier organisation package search
iati_org(data['result']['results'][0])
{'title': 'Switzerland - Swiss Agency for Development and Cooperation (SDC)', 'item': '140521 Organisation file SDC', 'publisher_country': 'CH', 'pub_country': 'SWITZERLAND', 'id': 'sdc_ch', 'publisher_organization_type': '10', 'orgtype': 'Government', 'publisher_iati_id': 'CH-4'}
The following function runs a keyword search for activity data packages, with no other search limits.
#Really scruffy search
def searchIATIdatasets(q, rows=100):
    """Keyword search for activity data packages on the registry.

    Parameters:
        q:    free-text search term; combined with extras_filetype:activity.
        rows: maximum number of results to request (default 100, as before;
              now adjustable rather than hard-coded).

    Returns:
        DataFrame of annotated results, one row per package, including the
        URL of the package's first resource.
    """
    url = 'http://iatiregistry.org/api/3/action/package_search'
    params = {'rows': rows, 'q': ' '.join([q, 'extras_filetype:activity'])}
    response = requests.get(url, params=params)
    data = json.loads(response.text)
    results = []
    for result in data['result']['results']:
        tmp = iati_org(result)
        tmp['url'] = result['resources'][0]['url']
        results.append(tmp)
    return pd.DataFrame(results)
#Example: search activity datasets mentioning Bangladesh
resp=searchIATIdatasets('bangladesh')
#Preview the first four results
resp[:4]
id | item | orgtype | pub_country | publisher_country | publisher_iati_id | publisher_organization_type | title | url | |
---|---|---|---|---|---|---|---|---|---|
0 | scuk | Activities - Bangladesh | International NGO | UNITED KINGDOM | GB | GB-COH-213890 | 21 | Save the Children UK | http://iati.savethechildren.org.uk/files/activ... |
1 | asdb | Asian Development Bank Activity File for Bangl... | Multilateral | Asia, regional | 798 | 46004 | 40 | Asian Development Bank | http://www.adb.org/iati/iati-activities-bd.xml |
2 | ausgov | Australian Aid Country File Bangladesh | Government | AUSTRALIA | AU | AU-5 | 10 | Australia - Department of Foreign Affairs and... | http://dfat.gov.au/data/downloads/Australian_A... |
3 | cafod | CAFOD Activity File for Bangladesh | International NGO | UNITED KINGDOM | GB | GB-CHC-285776 | 21 | Catholic Agency For Overseas Development | http://www.cafod.org.uk/extra/data/iati/IATIFi... |
Having got a list of search results, we can specify the index (bold font, left hand column entry) of the dataset we're interested in, download it as a CSV file, and work with it as a pandas dataframe.
#Try to load in a data file identified from the previous search
#Pass in the index val from the previous query
from io import StringIO
def grabIATIdata(df, ix):
    """Download the activity file at index ix of df as a DataFrame.

    Uses the aidinfolabs converter service to render the registry XML
    referenced in the row's 'url' column as a simple CSV, then parses it.
    """
    row = df.loc[ix]
    print("Data for:", row['item'])
    query = {'xml': row['url'],
             'download': 'true',
             'id': 'true',
             'format': 'simple'
            }
    resp = requests.get('http://tools.aidinfolabs.org/csv/direct_from_registry/', params=query)
    #Parse the CSV response into a fresh DataFrame (kept separate from
    #the search-results frame passed in)
    activities = pd.read_csv(StringIO(resp.text))
    return activities
#This will grab the fourth result (index 3) - CAFOD Activity File for Bangladesh - and display the data
ex=grabIATIdata(resp,3)
#Preview the first three activities
ex[:3]
Data for: CAFOD Activity File for Bangladesh
reporting-organisation | iati-identifier | aid_project_title | activity_description | full_details | default-currency_for_amounts | total_commitments | total_disbursements | total_reimbursements | total_expenditure | ... | default-aid-type | default-finance-type_code | default-finance-type | default-flow-type_code | default-flow-type | default-tied-status_code | default-tied-status | related-activities | related-activity_types | last-updated-datetime | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
CAFOD | GB-CHC-285776-BAN098 | Promoting Social and Political Rights of Perso... | The objective of thisis project is to provide ... | http://tools.aidinfolabs.org/explorer/activity... | GBP | 0 | 471635 | 0 | 0 | 471661 | ... | 110 | Aid grant excluding debt reorganisation | 30 | Private grants | 5 | Untied | NaN | NaN | 2014-07-30T00:00:00 | NaN |
CAFOD | GB-CHC-285776-BAN101 | HIV and AIDS prevention project | For nine years 2003-12 CAFOD has worked with C... | http://tools.aidinfolabs.org/explorer/activity... | GBP | 0 | 187407 | 0 | 0 | 196963 | ... | 110 | Aid grant excluding debt reorganisation | 30 | Private grants | 5 | Untied | NaN | NaN | 2014-07-30T00:00:00 | NaN |
CAFOD | GB-CHC-285776-BAN105 | Improvement of livelihoods through sustainable... | This project began in 2006, focusing on two ve... | http://tools.aidinfolabs.org/explorer/activity... | GBP | 0 | 180820 | 0 | 0 | 177996 | ... | 110 | Aid grant excluding debt reorganisation | 30 | Private grants | 5 | Untied | NaN | NaN | 2014-07-30T00:00:00 | NaN |
3 rows × 63 columns
#Here's a peek at the full range of columns available in the dataframe
#(the 'simple' format of the converter service)
ex.columns
Index(['reporting-organisation', 'iati-identifier', 'aid_project_title', 'activity_description', 'full_details', 'default-currency_for_amounts', 'total_commitments', 'total_disbursements', 'total_reimbursements', 'total_expenditure', 'total_incoming-funds', 'total_loan-repayment', 'total_interest-repayment', 'start-planned_iso-date', 'start-planned', 'start-actual_iso-date', 'start-actual', 'end-planned_iso-date', 'end-planned', 'end-actual_iso-date', 'end-actual', 'funding-organisations', 'extending-organisations', 'accountable-organisations', 'implementing-organisations', 'recipient-country_codes', 'recipient-countries', 'recipient-country_percentages', 'recipient-region_codes', 'recipient-regions', 'recipient-region_percentages', 'activity-website', 'activity-status_code', 'activity-status', 'collaboration-type_code', 'collaboration-type', 'conditions-attached', 'condition_types', 'conditions', 'sectors', 'sector_vocabularies', 'sector_codes', 'sector_percentages', 'policy-markers', 'policy-marker_vocabularies', 'policy-marker_significance', 'policy-marker_codes', 'contact-info_organisation', 'contact-info_person-name', 'contact-info_person-name.1', 'contact-info_email', 'contact-info_mailing-address', 'default-aid-type_code', 'default-aid-type', 'default-finance-type_code', 'default-finance-type', 'default-flow-type_code', 'default-flow-type', 'default-tied-status_code', 'default-tied-status', 'related-activities', 'related-activity_types', 'last-updated-datetime'], dtype='object')
We've managed to grab a list of organisations listed on the IATI Data Register (though I've no idea in what capacity) along with informal short ID codes and IATI Registration codes.
We got a simple way of looking up organisation details by slug (eg iati_org_info('aauk')
) and a way of running queries over activity datasets (eg searchIATIdatasets('bangladesh')
). We can also get a dataframe containing data relating to a specified package from the results listing.