# Setup
import pattern.web as web
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from cs109style import customize_mpl, customize_css

customize_mpl()
customize_css()

# Function-call form of the notebook magic `%pylab inline`, so the cell is also
# syntactically valid outside a notebook.  `get_ipython` is only defined when
# running under IPython.
get_ipython().run_line_magic('pylab', 'inline')


def get_links_from_front_pages(n):
    """Find URLs of comments pages linked from the first n pages of reddit.

    Starts at http://www.reddit.com/ and follows the "next page" link up to
    n - 1 times, collecting the href of every `a.comments` element seen.

    Parameters
    ----------
    n : int
        Maximum number of listing pages to download.

    Returns
    -------
    list
        The distinct hrefs that were found.  Order is arbitrary because
        duplicates are removed with set().
    """
    url = web.URL('http://www.reddit.com/')
    comment_pages = []

    for page_idx in range(n):
        dom = web.DOM(url.download(cached=False))

        for entry in dom('a.comments'):
            href = entry.attributes.get('href', '')
            if href:
                comment_pages.append(href)

        # find the next page link - reddit has 25 links per page
        marker = 'count=%d' % ((page_idx + 1) * 25)
        next_href = None
        for a in dom('a'):
            href = a.attributes.get('href', '')
            if marker in href:
                # no break: the last matching link wins
                next_href = href

        if next_href is None:
            # Without a next link we would only download this same page
            # again on every remaining iteration, so stop paging here.
            break
        url = web.URL(next_href)

    # use set() to remove repeated pages
    return list(set(comment_pages))


print(len(get_links_from_front_pages(6)))


def _parse_count(text):
    """Convert a vote count such as '1,234' to an int."""
    return int(text.replace(',', ''))


def info_from_comments_pages(links):
    """Fetch title, upvotes, downvotes, time of submission from a sequence of links.

    Parameters
    ----------
    links : iterable of str
        URLs of comments pages.

    Returns
    -------
    list of tuple
        One (title, upvotes, downvotes, submission time) tuple per page that
        could be downloaded and parsed; the time is passed through
        pd.to_datetime.  Pages that fail are skipped.  A KeyboardInterrupt
        stops the loop early and returns what has been collected so far.
    """
    results = []
    for urltext in links:
        url = web.URL(urltext)
        print("fetching info for %s" % url)
        try:
            dom = web.DOM(url.download(cached=False))
            title = dom('title')[0].content
            upvotes = _parse_count(
                dom.by_class('upvotes')[0].children[0].content)
            downvotes = _parse_count(
                dom.by_class('downvotes')[0].children[0].content)
            submitted = dom.by_class('tagline')[0]('time')[0].attributes.get('datetime')
            results.append((title, upvotes, downvotes, pd.to_datetime(submitted)))
        except KeyboardInterrupt:
            # allow us to interrupt the kernel but use what we've already fetched
            break
        except Exception:
            # some things that look like comment pages don't have the
            # information above; skip them (best effort) without also
            # swallowing SystemExit the way a bare `except:` would
            continue
    return results


comments_pages = get_links_from_front_pages(5)
print("Fetching info for %d pages" % len(comments_pages))
pages = info_from_comments_pages(comments_pages)

if pages:
    # zip(*seq) transposes a sequence of sequences.
    titles, upvotes, downvotes, dates = zip(*pages)
else:
    # zip(*[]) yields nothing, so the four-way unpacking above would raise an
    # unhelpful ValueError; fall back to empty columns instead.
    print("No comments page could be parsed")
    titles, upvotes, downvotes, dates = (), (), (), ()
# One row per fetched submission, indexed by its submission time.
df = pd.DataFrame({'title': titles,
                   'upvotes': upvotes,
                   'downvotes': downvotes,
                   'date': dates},
                  index=dates)
print(df)

# The index is built from the same values as the 'date' column, so sorting by
# index orders the rows chronologically.  This replaces
# df.sort('date', inplace=True): DataFrame.sort was removed from pandas,
# while sort_index() exists in both old and current releases.
df = df.sort_index()

if len(df):
    # green: upvotes, red: downvotes, black: net score (upvotes - downvotes)
    df['upvotes'].plot(c='g')
    df['downvotes'].plot(c='r')
    (df['upvotes'] - df['downvotes']).plot(c='k')
else:
    # plotting an empty frame raises inside pandas; say so plainly instead
    print("No data to plot")