# Setup
# NOTE(review): this is a Python 2 notebook export (print statements, %pylab
# magic below) — it runs in an IPython kernel, not as a plain .py script.
import pattern.web as web  # CLiPS "pattern" web-mining toolkit (URL download + DOM parsing)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from cs109style import customize_mpl, customize_css  # course helper: plot/notebook styling
customize_mpl()
customize_css()
# IPython magic: pulls numpy/matplotlib into the namespace and renders plots inline.
%pylab inline
def get_links_from_front_pages(n):
    """Find URLs of comments pages, linked from the first n pages of reddit.

    Returns a list of unique comments-page URLs (strings).
    NOTE(review): this is an exercise skeleton — the ### steps below are
    unimplemented, so `comment_pages` stays empty and the function currently
    returns []. `dom` is parsed but not yet used, and `url` is never advanced
    to the next front page; both belong to the missing extraction step.
    """
    url = web.URL('http://www.reddit.com/')
    comment_pages = []
    for page_idx in range(n):
        dom = web.DOM(url.download(cached=False))
        ### Extract comments pages
        ### find the next page link - reddit has 25 links per page
    # use set() to remove repeated URLs
    return list(set(comment_pages))
print len(get_links_from_front_pages(6))
def info_from_comments_pages(links):
'fetch title, upvotes, downvotes, time of submission from a sequence of links'
results = []
for urltext in links:
url = web.URL(urltext)
print "fetching info for", url
try:
dom = web.DOM(url.download(cached=False))
### Extract title, upvotes, downvotes, submission time
results.append((title, upvotes, downvotes, pd.to_datetime(time)))
except KeyboardInterrupt:
# allow us to interrupt the kernel and still continue
break
except:
pass # some things that look like comment pages don't have the information above
return results
comments_pages = get_links_from_front_pages(5)
print "Fetching info for", len(comments_pages), "pages"
pages = info_from_comments_pages(comments_pages)
titles, upvotes, downvotes, dates = zip(*pages) # zip(*seq) transposes a sequence of sequences.
df = pd.DataFrame({'title' : titles, 'upvotes' : upvotes, 'downvotes' : downvotes, 'date' : dates}, index=dates)
print df
df.sort('date', inplace=True)
df['upvotes'].plot(c='g')
df['downvotes'].plot(c='r')
(df['upvotes'] - df['downvotes']).plot(c='k')