This notebook defines a few functions that download images matching given search terms from Wikimedia Commons, via the MediaWiki search API.
The results are used in the Parallel face detection notebook.
import sys, os
import requests
# Optional HTTP caching: when requests_cache is importable, install a cache
# named "mediawiki" so that re-running the notebook can reuse earlier
# responses; when it is not installed, carry on without a cache.
try:
    import requests_cache
except ImportError:
    print("no cache, no worries")
else:
    requests_cache.install_cache("mediawiki")

# Wikimedia Commons API endpoint. HTTPS is used directly: plain-HTTP requests
# to Wikimedia are redirected to HTTPS, costing an extra round trip per call.
api_url = "https://commons.wikimedia.org/w/api.php"
no cache, no worries
def api_request(**kwargs):
    """Make a request of the Wikimedia Commons API.

    Keyword arguments are sent as query parameters on top of the defaults
    ``action=query`` and ``format=json`` (which they may override).
    A ``.`` is written to stdout for every request as a progress marker.

    Returns the response body decoded from JSON.

    Raises ``requests.HTTPError`` for a 4xx/5xx response and
    ``requests.Timeout`` if the server does not answer in time.
    """
    sys.stdout.write('.')
    sys.stdout.flush()
    params = dict(
        action='query',
        format='json',
    )
    params.update(kwargs)
    # Without an explicit timeout, requests waits indefinitely on a stalled
    # connection and the notebook would hang.
    r = requests.get(api_url, params=params, timeout=30)
    r.raise_for_status()
    return r.json()
import json
def search_images(search, limit=100, size_limit=400000):
    """Search Wikimedia Commons for a given term.

    Runs a ``list=search`` query restricted to namespace 6 and then looks up
    the ``imageinfo`` of every hit. Only PNG and JPEG files whose reported
    ``size`` is at most `size_limit` are kept.

    Returns a list of at most `limit` image URLs; fewer are returned when the
    search runs out of results first.
    """
    urls = []
    continue_params = {}
    while limit > 0:
        data = api_request(
            srnamespace=6,
            prop='imageinfo',
            list='search',
            srsearch=search,
            srlimit=min(limit, 50),
            **continue_params
        )
        results = data.get('query', {}).get('search', [])
        if not results:
            break
        for r in results:
            info = api_request(
                prop='imageinfo',
                titles=r['title'],
                iiprop='url|size|mime',
            )
            pages = info.get('query', {}).get('pages', {})
            # next(iter(...)) works on Python 2 and 3; dict.values()[0]
            # only works on Python 2.
            page = next(iter(pages.values()), {})
            infos = page.get('imageinfo')
            if not infos:
                # No image info returned for this title: nothing to download.
                continue
            imageinfo = infos[0]
            if (imageinfo['mime'] in ('image/png', 'image/jpeg')
                    and imageinfo['size'] <= size_limit):
                urls.append(imageinfo['url'])
                limit -= 1
        # Work out how to ask for the next page only after this page's hits
        # have been processed, so the final page is not lost. Current
        # MediaWiki returns a top-level 'continue' object; the legacy format
        # nests the parameters under 'query-continue'. Neither is present on
        # the last page.
        if 'continue' in data:
            continue_params = data['continue']
        elif 'search' in data.get('query-continue', {}):
            continue_params = data['query-continue']['search']
        else:
            break
    return urls
def download_images(search, n):
    """Download images from Wikimedia Commons into ``images/<search>/``.

    Fetches the URLs returned by ``search_images(search, n)`` and writes each
    one to a file named after the last path segment of its URL. The target
    directory is created if needed. A ``+`` is written to stdout for every
    file saved and an ``x`` for every download that is skipped because the
    server answered with an error status.
    """
    tagdir = os.path.join('images', search)
    # makedirs creates 'images' and the per-term folder in a single step.
    if not os.path.isdir(tagdir):
        os.makedirs(tagdir)
    for url in search_images(search, n):
        r = requests.get(url, timeout=30)
        if not r.ok:
            # Do not save an HTTP error page under an image file name.
            sys.stdout.write('x')
            sys.stdout.flush()
            continue
        fname = url.rsplit('/', 1)[-1]
        dest = os.path.join(tagdir, fname)
        sys.stdout.write('+')
        sys.stdout.flush()
        with open(dest, 'wb') as f:
            f.write(r.content)
# Fetch images for the search term 'portrait' (limit 100) into images/portrait/.
download_images('portrait', 100)
....................................................................................................................................................................++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Fetch images for the search term 'face' (limit 100) into images/face/.
download_images('face', 100)
.....................................................................................................................................................................................................................................................++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Fetch images for the search term 'headshot' (limit 100) into images/headshot/.
download_images('headshot', 100)
..............................................................................................................................................................................................++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Fetch images for the search term 'castle' (limit 100) into images/castle/.
# NOTE: download_images must already be defined in the running session; the
# NameError recorded after this call shows it was run before the definition.
download_images('castle', 100)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-2-bf59cc0f091f> in <module>() ----> 1 download_images('castle', 100) NameError: name 'download_images' is not defined