# Setup
# NOTE(review): this is a Python 2 notebook export (print statements, %pylab
# magic below) — it runs in an IPython kernel, not as a plain .py script.
import pattern.web as web  # CLiPS "pattern" web-mining toolkit (URL download + DOM parsing)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from cs109style import customize_mpl, customize_css  # course helper: plot/notebook styling
customize_mpl()
customize_css()
# IPython magic: pulls numpy/matplotlib into the namespace and renders plots inline.
%pylab inline
def get_links_from_front_pages(n):
    """Find URLs of comments pages, linked from the first n pages of reddit.

    Returns a list of unique comments-page URLs (strings).
    NOTE(review): this is an exercise skeleton — the ### steps below are
    unimplemented, so `comment_pages` stays empty and the function currently
    returns []. `dom` is parsed but not yet used, and `url` is never advanced
    to the next front page; both belong to the missing extraction step.
    """
    url = web.URL('http://www.reddit.com/')
    comment_pages = []
    for page_idx in range(n):
        dom = web.DOM(url.download(cached=False))
        ### Extract comments pages
        ### find the next page link - reddit has 25 links per page
    # use set() to remove repeated URLs
    return list(set(comment_pages))
print len(get_links_from_front_pages(6))
def info_from_comments_pages(links):
'fetch title, upvotes, downvotes, time of submission from a sequence of links'
results = []
for urltext in links:
url = web.URL(urltext)
print "fetching info for", url
try:
dom = web.DOM(url.download(cached=False))
### Extract title, upvotes, downvotes, submission time
results.append((title, upvotes, downvotes, pd.to_datetime(time)))
except KeyboardInterrupt:
# allow us to interrupt the kernel and still continue
break
except:
pass # some things that look like comment pages don't have the information above
return results
comments_pages = get_links_from_front_pages(5)
print "Fetching info for", len(comments_pages), "pages"
pages = info_from_comments_pages(comments_pages)
titles, upvotes, downvotes, dates = zip(*pages) # zip(*seq) transposes a sequence of sequences.
df = pd.DataFrame({'title' : titles, 'upvotes' : upvotes, 'downvotes' : downvotes, 'date' : dates}, index=dates)
print df
df.sort('date', inplace=True)
df['upvotes'].plot(c='g')
df['downvotes'].plot(c='r')
(df['upvotes'] - df['downvotes']).plot(c='k')