```python
# special IPython command to prepare the notebook for matplotlib
%matplotlib inline

import urllib2         # module to read in HTML
import bs4             # BeautifulSoup: module to parse HTML and XML
import json            # module to encode and decode JSON
import datetime as dt  # module for manipulating dates and times
import pandas as pd
import numpy as np
```

`urllib2.urlopen()` opens a URL and returns a file-like object, which we can read the raw HTML from:

```python
x = urllib2.urlopen("http://www.google.com")
htmlSource = x.read()
x.close()
type(htmlSource)
```

```python
print htmlSource
```

```python
x = urllib2.urlopen("http://www.reddit.com")  # opens the URL
htmlSource = x.read()
x.close()
print htmlSource
```

### prettify()

Beautiful Soup gives us a `BeautifulSoup` object, which represents the document as a nested data structure. We can use the `prettify()` method to show the different levels of the HTML code.

```python
soup = bs4.BeautifulSoup(htmlSource)
print soup.prettify()
```

```python
print soup.head.prettify()
```

### Navigating the tree

A tag's `.contents` attribute returns its direct children as a list:

```python
soup.head.contents
```

```python
len(soup.head.contents)
```

```python
# Extract the first three elements from the list of contents
soup.head.contents[0:3]
```

`.children` returns the same children as an iterator instead of a list:

```python
soup.head.children
```

```python
for child in soup.head.children:
    print(child)
```

```python
# print the title of reddit
soup.head.title
```

```python
# print the string inside the title
soup.head.title.string
```

`.descendants` iterates over *all* of a tag's children, recursively:

```python
for child in soup.head.descendants:
    print child
```

`.strings` yields every string in the document, whitespace included; `.stripped_strings` skips the strings that are pure whitespace:

```python
for string in soup.strings:
    print(repr(string))
```

```python
for string in soup.stripped_strings:
    print(repr(string))
```

We can also move *up* the tree: every tag and string has a `.parent`.

```python
soup.title
```

```python
soup.title.string
```

```python
soup.title.string.parent
```

### find_all()

`find_all()` searches the whole document and returns a list of every tag that matches:

```python
# search for all <a> tags; returns a list
soup.find_all('a')
```

```python
# your turn: search for all the paragraph tags
# one possible answer:
soup.find_all('p')
```

```python
# your turn: search for all the table tags
# one possible answer:
soup.find_all('table')
```

```python
# your turn: search for all the <a> tags and use the limit argument
# one possible answer (stop after the first five matches):
soup.find_all('a', limit=5)
```

```python
# your turn: what does using the text argument do?
# it matches strings in the document instead of tags, e.g.:
soup.find_all(text="reddit")
```

Each tag behaves like a dictionary of its HTML attributes, so `get('href')` pulls the link target out of an `<a>` tag:

```python
soup.find_all('a')[1].get('href')
```

```python
# your turn: write a for loop printing all the links from reddit
# one possible answer:
for link in soup.find_all('a'):
    print link.get('href')
```

```python
# your turn: the same, but with a list comprehension this time;
# show the first 5 elements
# one possible answer:
links = [link.get('href') for link in soup.find_all('a')]
links[0:5]
```

```python
# your turn: split the first url by "/"
# one possible answer (assumes the first <a> tag has an href attribute):
links[0].split("/")
```

`get_text()` returns all the text of the document as a single string:

```python
print(soup.get_text())
```

### JSON

JSON is a lightweight text format for exchanging data. Python dictionaries map naturally onto JSON objects: `json.dumps()` encodes a dictionary as a JSON string, and `json.loads()` decodes a JSON string back into a dictionary.

```python
a = {'a': 1, 'b': 2}
s = json.dumps(a)
a2 = json.loads(s)
```

```python
a   # a dictionary
```

```python
s   # s is a string containing a in JSON encoding
```

```python
a2  # reading back: the keys are now unicode
```

### World Cup data

The [worldcup.sfg.io](http://worldcup.sfg.io) API serves the 2014 World Cup results as JSON, one object per match:

```python
url = "http://worldcup.sfg.io/matches"
raw = urllib2.urlopen(url).read()
wc = json.loads(raw.decode('utf-8'))
"Number of matches in 2014 World Cup: %i" % len(wc)
```

```python
# print the keys in one match
gameIndex = 60
wc[gameIndex].keys()
```

```python
wc[gameIndex]['status']
```

```python
wc[gameIndex]['match_number']
```

```python
wc[gameIndex]['away_team']
```

```python
wc[gameIndex]['away_team_events']
```

```python
wc[gameIndex]['home_team']
```

```python
# print the score of every match
for elem in wc:
    print elem['home_team']['country'], elem['home_team']['goals'], \
          elem['away_team']['country'], elem['away_team']['goals']
```

A list of dictionaries converts directly into a pandas `DataFrame`; the `columns` argument selects (and orders) the keys we want to keep:

```python
data = pd.DataFrame(wc, columns=['match_number', 'location', 'datetime',
                                 'home_team', 'away_team', 'winner',
                                 'home_team_events', 'away_team_events'])
data.head()
```

The `datetime` column holds both the date and the kickoff time; `pd.DatetimeIndex` lets us split it into separate date and time columns:

```python
data['gameDate'] = pd.DatetimeIndex(data.datetime).date
data['gameTime'] = pd.DatetimeIndex(data.datetime).time
data.head()
```
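
The `home_team` and `away_team` columns still hold nested dictionaries, which we saw above contain `country` and `goals` keys. As a possible next step, a minimal sketch of flattening them into plain columns with `apply`; the new column names (`homeCountry`, `homeGoals`, and so on) are illustrative choices, not part of the API:

```python
# Sketch: flatten the nested team dictionaries into plain columns
# (column names here are illustrative, not from the worldcup.sfg.io API)
data['homeCountry'] = data['home_team'].apply(lambda d: d['country'])
data['homeGoals']   = data['home_team'].apply(lambda d: d['goals'])
data['awayCountry'] = data['away_team'].apply(lambda d: d['country'])
data['awayGoals']   = data['away_team'].apply(lambda d: d['goals'])
data[['homeCountry', 'homeGoals', 'awayCountry', 'awayGoals']].head()
```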
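
With the date split out into `gameDate`, one quick sanity check is to count how many matches were played on each day of the tournament; a minimal sketch using `groupby`:

```python
# Sketch: number of matches played on each day of the tournament
data.groupby('gameDate').size()
```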