In [1]:
import requests
from bs4 import BeautifulSoup
from collections import defaultdict

def text_to_dict(paragraph_array, d, candidates):
    '''takes an array of text paragraphs from debate and returns dict 
    where key is person and value is list of text spoken by that candidate'''
    # just a default speaker that won't end up in our returned data
    # will get replaced when an actual speaker is found
    speaker = "<START>"
    for paragraph in paragraph_array:
        words = paragraph.text.split(' ')
        first_word = words[0]
        # only new speaker when have SPEAKER: format
        if first_word[-1] == ":":
            speaker = first_word[:-1]
        # only keep candidates text
        if speaker in candidates:
    return d

def process_url(url, speaker_dict, candidates):
    # requests gets the source code from the url and extracts it as text
    html = requests.get(url).text
    # beautifulsoup is a library that takes in text source code and returns a structured format of that
    # source code that you can more easily search and parse.
    soup = BeautifulSoup(html, 'html5lib')
    # get all the 'p' tags from the source with class = 'story-body-text'
    # this was determined by looking at the source code
    # the first and last paragraphs are intro and ending
    paragraphs = soup('p', {'class': 'story-body-text'})[1:-1]
    text_to_dict(paragraphs, speaker_dict, candidates)
def process_url_list(urls, speaker_dict, candidates):
    for url in urls:
        process_url(url, speaker_dict, candidates)
candidates = ['BUSH', 'TRUMP', 'RUBIO', 'CARSON', 'FIORINA', 'KASICH', 'CRUZ', 'PAUL',
             'SANDERS', 'CLINTON', "O’MALLEY"]
urls = ['',
speaker_dict = defaultdict(list)

process_url_list(urls, speaker_dict, candidates)
/Users/tylerfolkman/anaconda/lib/python2.7/site-packages/ipykernel/ UnicodeWarning: Unicode equal comparison failed to convert both arguments to Unicode - interpreting them as being unequal
In [2]:
def pargraphs_to_words(speaker_dict):
    d = defaultdict(list)
    for candidate, paragraphs in speaker_dict.items():
        for paragraph in paragraphs:
            for word in paragraph:
    return d
In [3]:
candidate_words = pargraphs_to_words(speaker_dict)
In [4]:
import json

with open('candidate_words_dict.json', 'w') as f:
    json.dump(candidate_words, f)