#The BBC Programmes data is currently available as a JSON feed
import requests
import json

#We can get a list of the episodes associated with a series from the episodes/player page for the series
#seriesStub='http://www.bbc.co.uk/programmes/b006qshd/episodes/player.json?page=' #More or Less, Radio 4
seriesStub='http://www.bbc.co.uk/programmes/p00msxfl/episodes/player.json?page=' #More or Less, World Service

#The following lambda function gives the URL for a particular episode based on its programme ID
programmeURL = lambda uid: "http://www.bbc.co.uk/programmes/"+uid+".json"

def getData(url):
    ''' A simple function to request the JSON data from a URL and return it as a dict '''
    resp = requests.get(url)
    data = json.loads(resp.content.decode('utf8'))
    return data

#Test the function
data = getData(seriesStub)
data

#Test the episode URL creator
for episode in data['episodes'][:3]:
    print(programmeURL(episode['programme']['pid']))

#Test that we can get the full data for an episode from a created URL
episodeData = getData(programmeURL(episode['programme']['pid']))
episodeData

#The long_synopsis data is often good to search on
episodeData['programme']['long_synopsis']

#We're going to store the data in a MongoDB - so let's set one up
from pymongo import MongoClient
c = MongoClient('localhost', 27017)
db = c['bbc-database']
collection = db.more_or_less

#Test saving the data
#Let's use the episode ID as the MongoDB key _id
episodeData['programme']['_id'] = episodeData['programme']['pid']
#Use save rather than insert (insert throws an error if by chance there is a duplicate key _id)
collection.save(episodeData['programme'])

#Test a search
for r in collection.find({'long_synopsis': {'$regex':'deaths'}}, {'long_synopsis':1}):
    print(r)

#Now we're going to put all the pieces together
seriesStub='http://www.bbc.co.uk/programmes/b006qshd/episodes/player.json?page=' #More or Less, Radio 4

#The scraping flag just helps us keep track of the pages we're going to iterate through
scraping = True
pagecount = 1
while scraping:
    url = seriesStub + str(pagecount)
    resp = requests.get(url)
    data = json.loads(resp.content.decode('utf8'))
    #If the API returns a different page number than the one we asked for, we've run out of pages
    if data['page'] != pagecount:
        scraping = False
    else:
        for episode in data['episodes']:
            episodeData = getData(programmeURL(episode['programme']['pid']))
            episodeData['programme']['_id'] = episodeData['programme']['pid']
            collection.save(episodeData['programme'])
        pagecount += 1

collection.count()

#Generalise the approach: build the episodes/player stub URL from a series programme ID
seriesStubGenerator = lambda pid: 'http://www.bbc.co.uk/programmes/'+pid+'/episodes/player.json?page='

def bbcSeriesDataScraper(collection, pid):
    ''' Scrape every episode of a series into a MongoDB collection, keyed by episode pid '''
    scraping = True
    pagecount = 1
    seriesStub = seriesStubGenerator(pid)
    while scraping:
        url = seriesStub + str(pagecount)
        resp = requests.get(url)
        data = json.loads(resp.content.decode('utf8'))
        if data['page'] != pagecount:
            scraping = False
        else:
            for episode in data['episodes']:
                episodeData = getData(programmeURL(episode['programme']['pid']))
                episodeData['programme']['_id'] = episodeData['programme']['pid']
                collection.save(episodeData['programme'])
            pagecount += 1

#seriesStub='http://www.bbc.co.uk/programmes/p00msxfl/episodes/player.json?page=' #More or Less, World Service
bbcSeriesDataScraper(collection, 'p00msxfl')

collection.count()

#Test a search
for r in collection.find({'long_synopsis': {'$regex':'deaths'}}, {'long_synopsis':1}):
    print(r)
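
#A hedged aside: in more recent pymongo releases collection.save() and collection.count()
#have been removed. A minimal sketch of the modern equivalents (assuming pymongo 3.x or later):
#replace_one() with upsert=True gives the same insert-or-overwrite-by-_id behaviour as save(),
#and count_documents({}) replaces count().

#Insert the programme document, or overwrite any existing document with the same _id
programme = episodeData['programme']
collection.replace_one({'_id': programme['_id']}, programme, upsert=True)

#Count all the documents in the collection
collection.count_documents({})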