# special IPython command to prepare the notebook for matplotlib
%matplotlib inline
import urllib.request # module to fetch pages over HTTP (urllib2 in Python 2)
import bs4 # BeautifulSoup: module to parse HTML and XML
import json # module to encode and decode JSON data
import datetime as dt # module for manipulating dates and times
import pandas as pd # module for tabular data (DataFrames)
import numpy as np # module for numerical arrays
x = urllib.request.urlopen("http://www.google.com")
htmlSource = x.read()
x.close()
type(htmlSource)
print(htmlSource)
x = urllib.request.urlopen("http://www.reddit.com") # open the URL
htmlSource = x.read()
x.close()
print(htmlSource)
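Note: some sites (Reddit among them) throttle or reject requests that use the default Python user agent, returning an HTTP error instead of the page. If the request above fails, a minimal workaround sketch that sets an explicit User-Agent header (the header string here is just a placeholder):
req = urllib.request.Request("http://www.reddit.com",
                             headers={"User-Agent": "notebook-example/0.1"}) # placeholder UA string
x = urllib.request.urlopen(req)
htmlSource = x.read()
x.close()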
### prettify()
Beautiful Soup gives us a `BeautifulSoup` object, which represents the document as a nested data structure. We can use the `prettify()` function to show the different levels of the HTML code.
soup = bs4.BeautifulSoup(htmlSource, 'html.parser')
print(soup.prettify())
print(soup.head.prettify())
soup.head.contents
len(soup.head.contents)
# Extract first three elements from the list of contents
soup.head.contents[0:3]
# .children is an iterator over the direct children (compare .contents, which is a list)
soup.head.children
for child in soup.head.children:
    print(child)
# print the title of reddit
soup.head.title
# print the string in the title
soup.head.title.string
for child in soup.head.descendants:
    print(child)
for string in soup.strings:
    print(repr(string))
for string in soup.stripped_strings:
    print(repr(string))
soup.title
soup.title.string
# the parent of the title string is the <title> tag itself
soup.title.string.parent
# search for all the <a> (anchor) tags; find_all returns a list
soup.find_all('a')
# your turn
# search for all the paragraph tags
# your turn
# search for all the table tags
# your turn
# search for all the tags and use the limit argument
# your turn
# What does using the text argument do? (see the sketch after these exercises)
# get the href attribute of the second link
soup.find_all('a')[1].get('href')
# your turn
# write a for loop printing all the links from reddit
# your turn
# write a for loop, but use a list comprehension this time
# show the first 5 elements
# your turn
# split the first url by "/"
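One possible sketch for the exercises above (results depend on the page, and the variable names are just illustrative):
# the text argument matches strings inside tags instead of the tags themselves
soup.find_all(text="reddit")
# collect every link with a list comprehension, then keep the first 5
links = [a.get('href') for a in soup.find_all('a') if a.get('href') is not None]
links[0:5]
# split the first url by "/"
links[0].split("/")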
print(soup.get_text())
a = {'a': 1, 'b':2}
s = json.dumps(a)
a2 = json.loads(s)
a # a dictionary
s # s is a string containing a in JSON encoding
a2 # a2 is a new dictionary, decoded back from the JSON string
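`json.dumps` can also pretty-print nested structures; a quick sketch using the `indent` argument:
print(json.dumps(a, indent=2))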
url = "http://worldcup.sfg.io/matches"
data = urllib.request.urlopen(url).read()
wc = json.loads(data.decode('utf-8'))
"Number of matches in 2014 World Cup: %i" % len(wc)
# Print the keys available in one match
gameIndex = 60
wc[gameIndex].keys()
wc[gameIndex]['status']
wc[gameIndex]['match_number']
wc[gameIndex]['away_team']
wc[gameIndex]['away_team_events']
wc[gameIndex]['home_team']
for elem in wc:
    print(elem['home_team']['country'], elem['home_team']['goals'], elem['away_team']['country'], elem['away_team']['goals'])
data = pd.DataFrame(wc, columns = ['match_number', 'location', 'datetime', 'home_team', 'away_team', 'winner', 'home_team_events', 'away_team_events'])
data.head()
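The `home_team` and `away_team` columns still hold nested dictionaries. A sketch that flattens one of them into separate columns, assuming a pandas version with top-level `json_normalize` (1.0 or later):
home = pd.json_normalize(data['home_team'].tolist())
home.head()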
data['gameDate'] = pd.DatetimeIndex(data.datetime).date
data['gameTime'] = pd.DatetimeIndex(data.datetime).time
data.head()
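As a quick sanity check on the assembled table, one might tally the `winner` column (output depends on what the API returned):
data['winner'].value_counts().head()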