#The BBC Programmes data is currently available as a JSON feed
import requests
import json

#We can get a list of the episodes associated with a series from the episodes/player page for the series
#seriesStub='http://www.bbc.co.uk/programmes/b006qshd/episodes/player.json?page=' #More or Less, Radio 4
seriesStub='http://www.bbc.co.uk/programmes/p00msxfl/episodes/player.json?page=' #More or Less, World Service

#The following lambda function gives the URL for a particular episode based on its programme ID
programmeURL = lambda uid: "http://www.bbc.co.uk/programmes/"+uid+".json"

def getData(url):
    ''' A simple function to request the JSON data from a URL and return it as a dict '''
    resp = requests.get(url)
    data = json.loads(resp.content.decode('utf8'))
    return data

#Test the function
data = getData(seriesStub)
data

#Test the episode URL creator
for episode in data['episodes'][:3]:
    print(programmeURL(episode['programme']['pid']))

#Test that we can get the full data for an episode from a created URL
episodeData = getData(programmeURL(episode['programme']['pid']))
episodeData

#The long_synopsis data is often good to search on
episodeData['programme']['long_synopsis']

#We're going to store the data in a MongoDB - so let's set one up
from pymongo import MongoClient
c = MongoClient('localhost', 27017)
db = c['bbc-database']
collection = db.more_or_less

#Test saving the data
#Let's use the episode ID as the MongoDB key _id
episodeData['programme']['_id'] = episodeData['programme']['pid']
#Use save rather than insert (insert throws an error if by chance there is a duplicate key _id)
collection.save(episodeData['programme'])

#Test a search
for r in collection.find({'long_synopsis': {'$regex':'deaths'}}, {'long_synopsis':1}):
    print(r)

#Now we're going to put all the pieces together
seriesStub='http://www.bbc.co.uk/programmes/b006qshd/episodes/player.json?page=' #More or Less, Radio 4

#The scraping flag just helps us keep track of the pages we're going to iterate through
scraping = True
pagecount = 1
while scraping:
    url = seriesStub + str(pagecount)
    resp = requests.get(url)
    data = json.loads(resp.content.decode('utf8'))
    #If the API returns a different page number than the one we asked for, we've run out of pages
    if data['page'] != pagecount:
        scraping = False
    else:
        for episode in data['episodes']:
            episodeData = getData(programmeURL(episode['programme']['pid']))
            episodeData['programme']['_id'] = episodeData['programme']['pid']
            collection.save(episodeData['programme'])
        pagecount += 1

collection.count()

#Generalise the approach: build the episodes/player stub URL from a series programme ID
seriesStubGenerator = lambda pid: 'http://www.bbc.co.uk/programmes/'+pid+'/episodes/player.json?page='

def bbcSeriesDataScraper(collection, pid):
    ''' Scrape every episode of a series into a MongoDB collection, keyed by episode pid '''
    scraping = True
    pagecount = 1
    seriesStub = seriesStubGenerator(pid)
    while scraping:
        url = seriesStub + str(pagecount)
        resp = requests.get(url)
        data = json.loads(resp.content.decode('utf8'))
        if data['page'] != pagecount:
            scraping = False
        else:
            for episode in data['episodes']:
                episodeData = getData(programmeURL(episode['programme']['pid']))
                episodeData['programme']['_id'] = episodeData['programme']['pid']
                collection.save(episodeData['programme'])
            pagecount += 1

#seriesStub='http://www.bbc.co.uk/programmes/p00msxfl/episodes/player.json?page=' #More or Less, World Service
bbcSeriesDataScraper(collection, 'p00msxfl')

collection.count()

#Test a search
for r in collection.find({'long_synopsis': {'$regex':'deaths'}}, {'long_synopsis':1}):
    print(r)
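
#A hedged aside: in more recent pymongo releases collection.save() and collection.count()
#have been removed. A minimal sketch of the modern equivalents (assuming pymongo 3.x or later):
#replace_one() with upsert=True gives the same insert-or-overwrite-by-_id behaviour as save(),
#and count_documents({}) replaces count().

#Insert the programme document, or overwrite any existing document with the same _id
programme = episodeData['programme']
collection.replace_one({'_id': programme['_id']}, programme, upsert=True)

#Count all the documents in the collection
collection.count_documents({})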