import time
import datetime
import json
import unicodedata
import urllib
import urllib2
import pandas as pd
# To use the reddit API we need to specify a user agent with a description
# and a link to our project
user_agent = ("Project for Data Science class v1.0" " https://github.com/jaysayre/intelligentdolphins")
def json_extract(baseurl, headrs=None, params=None, extraparam=None):
    '''
    Helper function to download and read JSON data. Takes optional request
    headers and query parameters and returns the parsed JSON as a dict.
    '''
    if params is not None:
        if extraparam is not None:
            params['t'] = extraparam
        form = urllib.urlencode(params)
        url = baseurl + form
    else:
        url = baseurl
    if headrs is not None:
        request = urllib2.Request(url, headers=headrs)
    else:
        request = urllib2.Request(url)
    return json.loads(urllib2.urlopen(request).read())
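# Example usage (a sketch; assumes the reddit endpoint is reachable and uses
# the user_agent defined above):
#   headers = {'User-agent': user_agent}
#   data = json_extract('http://www.reddit.com/r/python/top.json?', headers,
#                       {'limit': 5}, 'day')
#   print data['data']['children'][0]['data']['title']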
def subreddit_json_parser(data, ids, title, upvts, downvts, authors, comments, score, media, distin, selftxt, dates):
    '''
    Appends the fields of each post in the current API response to the
    relevant lists.
    '''
    # Maps reddit JSON keys to the lists that collect each field
    fields = {'title': title, 'ups': upvts, 'downs': downvts, 'author': authors,
              'num_comments': comments, 'score': score, 'media': media,
              'distinguished': distin, 'selftext': selftxt, 'created_utc': dates}
    for post in data['data']['children']:
        if post['kind'] == 't3':  # 't3' is reddit's type prefix for link/self posts
            for key, value in post['data'].items():
                if key == 'id':
                    ids.append(str(value))  # ids are coerced to str
                elif key in fields:
                    fields[key].append(value)
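# For reference, a listing response from reddit looks roughly like this (abridged):
#   {'kind': 'Listing',
#    'data': {'after': 't3_xxxxxx',
#             'children': [{'kind': 't3',
#                           'data': {'id': '1abcd', 'title': '...', 'ups': 10, ...}}]}}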
def get_subreddit_df(subreddit, sort_call, user_agent, n, t=None, api_call_limit=100, status=False):
    '''
    Builds a dataframe of posts from a subreddit.
    Parameters --
    subreddit: which subreddit to pull information from
    sort_call: how to sort the posts ('top', 'hot', 'rising', etc.)
    user_agent: identifies us so reddit won't reject our https requests
    n: int, how many posts to request
    t: time scope for 'top' sorting ('hour', 'day', 'week', 'month', 'year', 'all')
    api_call_limit: reddit doesn't serve more than 100 results at a time
    status: whether to print the status of the download to the console
    Returns --
    A pandas dataframe containing various bits of information about the posts.
    '''
    reddit_base = 'http://www.reddit.com/r/%s/%s.json?' % (subreddit, sort_call)  # Base api call
    headers = {'User-agent': user_agent}
    # Empty lists for the information we'll extract
    ids, title, upvts, downvts, authors, comments, score, media, distin, selftxt, dates = \
        [], [], [], [], [], [], [], [], [], [], []
    # Make sure n and api_call_limit aren't floats!
    n = int(n)
    api_call_limit = int(api_call_limit)
    # Since reddit serves at most api_call_limit results per request, split the
    # n requested posts into separate requests
    if n % api_call_limit != 0:
        remainder = n % api_call_limit
        num = (n // api_call_limit) + 1
    else:
        num = n // api_call_limit
        remainder = api_call_limit
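    # Worked example: n=250 with api_call_limit=100 gives remainder=50 and
    # num=3, i.e. two full calls of 100 posts plus a final call of 50.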
    # Makes an api call for each chunk of n, walking the listing via 'after'
    for i in range(num):
        if i == 0:
            # If only one call is needed, ask for just the remainder
            post_params = {'limit': api_call_limit if num > 1 else remainder}
        elif i == num - 1:
            post_params = {'limit': remainder, 'after': tostartfrom}  # 'after' names the post to continue from
        else:
            post_params = {'limit': api_call_limit, 'after': tostartfrom}
        jsondata = json_extract(reddit_base, headers, post_params, t)
        tostartfrom = jsondata['data']['after']
        subreddit_json_parser(jsondata, ids, title, upvts, downvts, authors, comments, score, media, distin, selftxt, dates)
        time.sleep(2)  # Be polite to the API between requests
        if status:
            print "Downloaded %s posts..." % len(set(ids))
    tempdict = {'id': ids, 'title': title, 'upvotes': upvts, 'media': media,
                'distinguished': distin, 'selftext': selftxt, 'downvotes': downvts,
                'comments': comments, 'score': score, 'author': authors,
                'time_created': dates}
    return pd.DataFrame(tempdict)
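# Example call (a sketch; assumes network access to reddit):
#   eli5 = get_subreddit_df('explainlikeimfive', 'top', user_agent, 250, t='month', status=True)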
# Helper function to determine whether x can be read as a float
def isfloat(x):
    try:
        float(x)
        return True
    except (TypeError, ValueError):
        return False
# Helper function to clean strings: keeps only letters and spaces
def filterstr(x):
    try:
        return ''.join(e for e in x if (e == ' ' or e.isalnum()) and not e.isdigit())
    except TypeError:
        return ''
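# Examples: filterstr('Hello, world! 42') returns 'Hello world ' (punctuation
# and digits dropped), isfloat('3.14') returns True, isfloat('n/a') returns False.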
# Function to clean the downloaded information
def subset_subreddit(df):
    df = df[df['distinguished'].apply(lambda x: x is None)]  # Makes sure poster isn't a moderator (distinguished)
    df = df[df['media'].apply(lambda x: x is None)]          # Also makes sure post is text based
    df = df[df['title'].apply(lambda x: x is not None)]      # Makes sure title isn't empty
    df = df.drop('distinguished', axis=1)
    df = df.drop('media', axis=1)
    # Occasionally, text strings appear in columns that should only hold numbers. This takes care of that
    df = df[df['comments'].apply(isfloat)]
    df = df[df['downvotes'].apply(isfloat)]
    df = df[df['score'].apply(isfloat)]
    df = df[df['upvotes'].apply(isfloat)]
    # Ids can be odd as well; sometimes they manifest as long numbers.
    # Reddit ids are base 36, so a reasonable id shouldn't be longer than 8 characters
    df = df[df['id'].apply(lambda x: len(str(x)) <= 8)]
    # Filter out characters that cause problems later
    for i in df.index:
        df.loc[i, 'title'] = filterstr(df.loc[i, 'title'])
        df.loc[i, 'author'] = filterstr(df.loc[i, 'author'])
        df.loc[i, 'selftext'] = filterstr(df.loc[i, 'selftext'])
    # Removes accented characters
    df['title'] = df['title'].apply(lambda x: unicodedata.normalize('NFKD', x).encode('ascii', 'ignore'))
    df['selftext'] = df['selftext'].apply(lambda x: unicodedata.normalize('NFKD', x).encode('ascii', 'ignore'))
    df = df.drop_duplicates()  # Remove duplicate entries, should they exist
    return df
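# Example: cleaning the frame downloaded in the sketch above (hypothetical variable):
#   eli5 = subset_subreddit(eli5)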
# Names of the popular text-based subreddits we want to look at
subreddits = ['explainlikeimfive', 'AskReddit', 'TalesFromTechsupport',
              'talesFromRetail', 'pettyrevenge', 'askhistorians',
              'askscience', 'tifu', 'nosleep', 'jokes', 'atheism', 'politics']
# Append a date stamp to a txt file to record when the download happened
timeformat = '%m-%d-%y-%H%M'
printdate = datetime.datetime.now().strftime(timeformat)
with open('Data/New/dltimestamp.txt', 'a') as the_file:
    the_file.write(printdate + '\n')
for subreddit in subreddits:
    # print subreddit
    # reddit only serves up to 1000 posts per section of a subreddit,
    # so we grab as much as we can from several sections
    top_all = get_subreddit_df(subreddit, 'top', user_agent, 1000, 'all', status=False)
    top_week = get_subreddit_df(subreddit, 'top', user_agent, 1000, 'week', status=False)
    top_day = get_subreddit_df(subreddit, 'top', user_agent, 1000, 'day', status=False)
    hot = get_subreddit_df(subreddit, 'hot', user_agent, 1000, status=False)
    new = get_subreddit_df(subreddit, 'new', user_agent, 1000, status=False)
    # Subset and clean each dataframe
    top_all = subset_subreddit(top_all)
    top_week = subset_subreddit(top_week)
    top_day = subset_subreddit(top_day)
    hot = subset_subreddit(hot)
    new = subset_subreddit(new)
    # Add a column to each denoting the name of the subreddit
    top_all['subreddit'] = subreddit
    top_week['subreddit'] = subreddit
    top_day['subreddit'] = subreddit
    hot['subreddit'] = subreddit
    new['subreddit'] = subreddit
    # Add a column denoting the section of the subreddit
    top_all['type'] = 'top_all'
    top_week['type'] = 'top_week'
    top_day['type'] = 'top_day'
    hot['type'] = 'hot'
    new['type'] = 'new'
    # Write the scraped subreddit information to csv files
    dltopall = 'Data/New/' + subreddit + 'top_all.csv'
    dltopweek = 'Data/New/' + subreddit + 'top_week.csv'
    dltopday = 'Data/New/' + subreddit + 'top_day.csv'
    dlhot = 'Data/New/' + subreddit + 'hot.csv'
    dlnew = 'Data/New/' + subreddit + 'new.csv'
    top_all.to_csv(dltopall, index=False, encoding='utf-8')
    top_week.to_csv(dltopweek, index=False, encoding='utf-8')
    top_day.to_csv(dltopday, index=False, encoding='utf-8')
    hot.to_csv(dlhot, index=False, encoding='utf-8')
    new.to_csv(dlnew, index=False, encoding='utf-8')