import time
import datetime
import json
import unicodedata
import urllib
import urllib2
import pandas as pd
# To use the reddit API we need to specify a user agent with a description
# and a link to our project
user_agent = ("Project for Data Science class v1.0" " https://github.com/jaysayre/intelligentdolphins")
def json_extract(baseurl, headrs=None, params=None, extraparam=None):
    '''
    Helper function to download and read JSON data. Takes optional request
    headers and query parameters and returns the parsed JSON as a dict.
    '''
    if params is not None:
        if extraparam is not None:
            params['t'] = extraparam
        form = urllib.urlencode(params)
        url = baseurl + form
    else:
        url = baseurl
    if headrs is not None:
        request = urllib2.Request(url, headers=headrs)
    else:
        request = urllib2.Request(url)
    return json.loads(urllib2.urlopen(request).read())
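# Example usage (a sketch; assumes the reddit endpoint is reachable and uses
# the user_agent defined above):
#   headers = {'User-agent': user_agent}
#   data = json_extract('http://www.reddit.com/r/python/top.json?', headers,
#                       {'limit': 5}, 'day')
#   print data['data']['children'][0]['data']['title']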
def subreddit_json_parser(data, ids, title, upvts, downvts, authors, comments, score, media, distin, selftxt, dates):
    '''
    Appends the fields of each post in the current API response to the
    relevant lists.
    '''
    # Maps reddit JSON keys to the lists that collect each field
    fields = {'title': title, 'ups': upvts, 'downs': downvts, 'author': authors,
              'num_comments': comments, 'score': score, 'media': media,
              'distinguished': distin, 'selftext': selftxt, 'created_utc': dates}
    for post in data['data']['children']:
        if post['kind'] == 't3':  # 't3' is reddit's type prefix for link/self posts
            for key, value in post['data'].items():
                if key == 'id':
                    ids.append(str(value))  # ids are coerced to str
                elif key in fields:
                    fields[key].append(value)
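# For reference, a listing response from reddit looks roughly like this (abridged):
#   {'kind': 'Listing',
#    'data': {'after': 't3_xxxxxx',
#             'children': [{'kind': 't3',
#                           'data': {'id': '1abcd', 'title': '...', 'ups': 10, ...}}]}}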
def get_subreddit_df(subreddit, sort_call, user_agent, n, t=None, api_call_limit=100, status=False):
    '''
    Builds a dataframe of posts from a subreddit.
    Parameters --
    subreddit: which subreddit to pull information from
    sort_call: how to sort the posts ('top', 'hot', 'rising', etc.)
    user_agent: identifies us so reddit won't reject our https requests
    n: int, how many posts to request
    t: time scope for 'top' sorting ('hour', 'day', 'week', 'month', 'year', 'all')
    api_call_limit: reddit doesn't serve more than 100 results at a time
    status: whether to print the status of the download to the console
    Returns --
    A pandas dataframe containing various bits of information about the posts.
    '''
    reddit_base = 'http://www.reddit.com/r/%s/%s.json?' % (subreddit, sort_call)  # Base api call
    headers = {'User-agent': user_agent}
    # Empty lists for the information we'll extract
    ids, title, upvts, downvts, authors, comments, score, media, distin, selftxt, dates = \
        [], [], [], [], [], [], [], [], [], [], []
    # Make sure n and api_call_limit aren't floats!
    n = int(n)
    api_call_limit = int(api_call_limit)
    # Since reddit serves at most api_call_limit results per request, split the
    # n requested posts into separate requests
    if n % api_call_limit != 0:
        remainder = n % api_call_limit
        num = (n // api_call_limit) + 1
    else:
        num = n // api_call_limit
        remainder = api_call_limit
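    # Worked example: n=250 with api_call_limit=100 gives remainder=50 and
    # num=3, i.e. two full calls of 100 posts plus a final call of 50.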
    # Makes an api call for each chunk of n, walking the listing via 'after'
    for i in range(num):
        if i == 0:
            # If only one call is needed, ask for just the remainder
            post_params = {'limit': api_call_limit if num > 1 else remainder}
        elif i == num - 1:
            post_params = {'limit': remainder, 'after': tostartfrom}  # 'after' names the post to continue from
        else:
            post_params = {'limit': api_call_limit, 'after': tostartfrom}
        jsondata = json_extract(reddit_base, headers, post_params, t)
        tostartfrom = jsondata['data']['after']
        subreddit_json_parser(jsondata, ids, title, upvts, downvts, authors, comments, score, media, distin, selftxt, dates)
        time.sleep(2)  # Be polite to the API between requests
        if status:
            print "Downloaded %s posts..." % len(set(ids))
    tempdict = {'id': ids, 'title': title, 'upvotes': upvts, 'media': media,
                'distinguished': distin, 'selftext': selftxt, 'downvotes': downvts,
                'comments': comments, 'score': score, 'author': authors,
                'time_created': dates}
    return pd.DataFrame(tempdict)
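# Example call (a sketch; assumes network access to reddit):
#   eli5 = get_subreddit_df('explainlikeimfive', 'top', user_agent, 250, t='month', status=True)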
# Helper function to determine whether x can be read as a float
def isfloat(x):
    try:
        float(x)
        return True
    except (TypeError, ValueError):
        return False
# Helper function to clean strings: keeps only letters and spaces
def filterstr(x):
    try:
        return ''.join(e for e in x if (e == ' ' or e.isalnum()) and not e.isdigit())
    except TypeError:
        return ''
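# Examples: filterstr('Hello, world! 42') returns 'Hello world ' (punctuation
# and digits dropped), isfloat('3.14') returns True, isfloat('n/a') returns False.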
# Function to clean the downloaded information
def subset_subreddit(df):
    df = df[df['distinguished'].apply(lambda x: x is None)]  # Makes sure poster isn't a moderator (distinguished)
    df = df[df['media'].apply(lambda x: x is None)]          # Also makes sure post is text based
    df = df[df['title'].apply(lambda x: x is not None)]      # Makes sure title isn't empty
    df = df.drop('distinguished', axis=1)
    df = df.drop('media', axis=1)
    # Occasionally, text strings appear in columns that should only hold numbers. This takes care of that
    df = df[df['comments'].apply(isfloat)]
    df = df[df['downvotes'].apply(isfloat)]
    df = df[df['score'].apply(isfloat)]
    df = df[df['upvotes'].apply(isfloat)]
    # Ids can be odd as well; sometimes they manifest as long numbers.
    # Reddit ids are base 36, so a reasonable id shouldn't be longer than 8 characters
    df = df[df['id'].apply(lambda x: len(str(x)) <= 8)]
    # Filter out characters that cause problems later
    for i in df.index:
        df.loc[i, 'title'] = filterstr(df.loc[i, 'title'])
        df.loc[i, 'author'] = filterstr(df.loc[i, 'author'])
        df.loc[i, 'selftext'] = filterstr(df.loc[i, 'selftext'])
    # Removes accented characters
    df['title'] = df['title'].apply(lambda x: unicodedata.normalize('NFKD', x).encode('ascii', 'ignore'))
    df['selftext'] = df['selftext'].apply(lambda x: unicodedata.normalize('NFKD', x).encode('ascii', 'ignore'))
    df = df.drop_duplicates()  # Remove duplicate entries, should they exist
    return df
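# Example: cleaning the frame downloaded in the sketch above (hypothetical variable):
#   eli5 = subset_subreddit(eli5)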
# Names of the popular text-based subreddits we want to look at
subreddits = ['explainlikeimfive', 'AskReddit', 'TalesFromTechsupport',
              'talesFromRetail', 'pettyrevenge', 'askhistorians',
              'askscience', 'tifu', 'nosleep', 'jokes', 'atheism', 'politics']
# Append a date stamp to a txt file to record when the download happened
timeformat = '%m-%d-%y-%H%M'
printdate = datetime.datetime.now().strftime(timeformat)
with open('Data/New/dltimestamp.txt', 'a') as the_file:
    the_file.write(printdate + '\n')
for subreddit in subreddits:
    # print subreddit
    # reddit only serves up to 1000 posts per section of a subreddit,
    # so we grab as much as we can from several sections
    top_all = get_subreddit_df(subreddit, 'top', user_agent, 1000, 'all', status=False)
    top_week = get_subreddit_df(subreddit, 'top', user_agent, 1000, 'week', status=False)
    top_day = get_subreddit_df(subreddit, 'top', user_agent, 1000, 'day', status=False)
    hot = get_subreddit_df(subreddit, 'hot', user_agent, 1000, status=False)
    new = get_subreddit_df(subreddit, 'new', user_agent, 1000, status=False)
    # Subset and clean each dataframe
    top_all = subset_subreddit(top_all)
    top_week = subset_subreddit(top_week)
    top_day = subset_subreddit(top_day)
    hot = subset_subreddit(hot)
    new = subset_subreddit(new)
    # Add a column to each denoting the name of the subreddit
    top_all['subreddit'] = subreddit
    top_week['subreddit'] = subreddit
    top_day['subreddit'] = subreddit
    hot['subreddit'] = subreddit
    new['subreddit'] = subreddit
    # Add a column denoting the section of the subreddit
    top_all['type'] = 'top_all'
    top_week['type'] = 'top_week'
    top_day['type'] = 'top_day'
    hot['type'] = 'hot'
    new['type'] = 'new'
    # Write the scraped subreddit information to csv files
    dltopall = 'Data/New/' + subreddit + 'top_all.csv'
    dltopweek = 'Data/New/' + subreddit + 'top_week.csv'
    dltopday = 'Data/New/' + subreddit + 'top_day.csv'
    dlhot = 'Data/New/' + subreddit + 'hot.csv'
    dlnew = 'Data/New/' + subreddit + 'new.csv'
    top_all.to_csv(dltopall, index=False, encoding='utf-8')
    top_week.to_csv(dltopweek, index=False, encoding='utf-8')
    top_day.to_csv(dltopday, index=False, encoding='utf-8')
    hot.to_csv(dlhot, index=False, encoding='utf-8')
    new.to_csv(dlnew, index=False, encoding='utf-8')