import time
import pandas as pd
import json
import urllib
import urllib2
import datetime
import unicodedata

user_agent = ("Project for Data Science class v1.0"
              " https://github.com/jaysayre/intelligentdolphins")


def json_extract(baseurl, headrs=None, params=None, extraparam=None):
    '''
    Helper function to download and parse JSON data. URL-encodes any query
    parameters, sends the given headers, and returns the parsed JSON dict.
    '''
    if params is not None:
        if extraparam is not None:
            params['t'] = extraparam
        form = urllib.urlencode(params)
        url = baseurl + form
    else:
        url = baseurl
    if headrs is not None:
        request = urllib2.Request(url, headers=headrs)
    else:
        request = urllib2.Request(url)
    return json.loads(urllib2.urlopen(request).read())


def subreddit_json_parser(data, ids, title, upvts, downvts, authors, comments,
                          score, media, distin, selftxt, dates):
    ''' Appends the fields of each post in the JSON response to the relevant list. '''
    for post in data['data']['children']:
        if post['kind'] == 't3':  # 't3' is reddit's type code for a link/self post
            for key, value in post['data'].items():
                if key == "title": title.append(value)
                elif key == "id": ids.append(str(value))
                elif key == "ups": upvts.append(value)
                elif key == "downs": downvts.append(value)
                elif key == "author": authors.append(value)
                elif key == "num_comments": comments.append(value)
                elif key == "score": score.append(value)
                elif key == "media": media.append(value)
                elif key == "distinguished": distin.append(value)
                elif key == "selftext": selftxt.append(value)
                elif key == "created_utc": dates.append(value)


def get_subreddit_df(subreddit, sort_call, user_agent, n, t=None, api_call_limit=100, status=False):
    '''
    Builds a subreddit dataframe.

    Parameters --
    subreddit: which reddit subreddit to pull information from
    sort_call: whether to sort by top, hot, rising, new, etc.
    user_agent: identifies us so reddit won't reject our requests
    n: int, how many top results to download
    t: time scope of the listing (hour, day, week, year, all)
    api_call_limit: reddit doesn't serve more than 100 results per request
    status: whether or not to print download progress to the console

    Returns --
    A pandas dataframe containing various bits of information about the top posts.
    '''
    reddit_base = 'http://www.reddit.com/r/%s/%s.json?' % (subreddit, sort_call)  # Base api call
    headers = {'User-agent': user_agent}

    # Empty lists for the information we'll extract
    ids, title, upvts, downvts, authors, comments, score, media, distin, selftxt, dates = \
        [], [], [], [], [], [], [], [], [], [], []

    # Makes sure n and api_call_limit aren't floats!
    n = int(n)
    api_call_limit = int(api_call_limit)

    # Since reddit serves at most api_call_limit results at a time, split the
    # n requested posts into separate paginated requests
    if n % api_call_limit != 0:
        remainder = n % api_call_limit
        num = (n / api_call_limit) + 1
    else:
        num = n / api_call_limit
        remainder = api_call_limit

    # Makes an api call for all n entries based on the api call limit
    for i in range(num):
        if i == 0:
            # First page: always requests a full page of api_call_limit posts
            post_params = {'limit': api_call_limit}
            jsondata = json_extract(reddit_base, headers, post_params, t)
            tostartfrom = jsondata['data']['after']
            subreddit_json_parser(jsondata, ids, title, upvts, downvts, authors,
                                  comments, score, media, distin, selftxt, dates)
        elif i == num - 1:
            post_params = {'limit': remainder, 'after': tostartfrom}  # 'after' indicates the post to continue from
            jsondata = json_extract(reddit_base, headers, post_params, t)
            subreddit_json_parser(jsondata, ids, title, upvts, downvts, authors,
                                  comments, score, media, distin, selftxt, dates)
        else:
            post_params = {'limit': api_call_limit, 'after': tostartfrom}
            jsondata = json_extract(reddit_base, headers, post_params, t)
            tostartfrom = jsondata['data']['after']
            subreddit_json_parser(jsondata, ids, title, upvts, downvts, authors,
                                  comments, score, media, distin, selftxt, dates)
        time.sleep(2)  # Be polite to reddit's servers between requests
        if status:
            print "Downloaded %s posts..." % len(set(ids))

    tempdict = {'id': ids, 'title': title, 'upvotes': upvts, 'media': media,
                'distinguished': distin, 'selftext': selftxt, 'downvotes': downvts,
                'comments': comments, 'score': score, 'author': authors,
                'time_created': dates}
    return pd.DataFrame(tempdict)
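
# A quick, optional sanity check for the downloader above -- a sketch, not
# called anywhere in this script. The subreddit and section used here are
# just examples; any public subreddit should work, assuming reddit's JSON
# listing endpoints are reachable from this machine.
def _demo_small_pull():
    sample = get_subreddit_df('askscience', 'top', user_agent, 100, t='week', status=True)
    print "Pulled %d posts from r/askscience (top, week)" % len(sample)
    print sample[['title', 'score', 'comments']].head()
    return sample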
# Helper function to determine if a value can be read as a float
def isfloat(x):
    try:
        float(x)
        return True
    except Exception:
        return False


# Helper function to clean strings: keeps letters and spaces, drops digits and punctuation
def filterstr(x):
    try:
        return ''.join(e for e in x if (e == ' ' or e.isalnum()) and not e.isdigit())
    except Exception:
        return u''


# Function to clean downloaded information
def subset_subreddit(df):
    df = df[df['distinguished'].apply(lambda x: x is None)]  # Makes sure poster isn't a moderator (distinguished)
    df = df[df['media'].apply(lambda x: x is None)]          # Also makes sure post is text based
    df = df[df['title'].apply(lambda x: x is not None)]      # Makes sure title isn't missing
    df = df.drop('distinguished', axis=1)
    df = df.drop('media', axis=1)

    # Occasionally, text strings will be in columns where only ints should be. This takes care of that
    df = df[df['comments'].apply(isfloat)]
    df = df[df['downvotes'].apply(isfloat)]
    df = df[df['score'].apply(isfloat)]
    df = df[df['upvotes'].apply(isfloat)]

    # Ids can be weird as well; sometimes they manifest as long numbers.
    # Reddit uses an id system in base 36, so a reasonable id shouldn't be longer than 8 characters
    df = df[df['id'].apply(lambda x: len(str(x)) <= 8)]

    # Filter out characters that cause problems later
    df['title'] = df['title'].apply(filterstr)
    df['author'] = df['author'].apply(filterstr)
    df['selftext'] = df['selftext'].apply(filterstr)

    # Removes accented characters
    df['title'] = df['title'].apply(lambda x: unicodedata.normalize('NFKD', x).encode('ascii', 'ignore'))
    df['selftext'] = df['selftext'].apply(lambda x: unicodedata.normalize('NFKD', x).encode('ascii', 'ignore'))

    df = df.drop_duplicates()  # Remove duplicate entries, should they exist
    return df
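
# Small, optional illustration of the cleaning helpers above -- a sketch that
# is not called by the scraper; the sample strings below are made up.
def _demo_cleaning_helpers():
    assert isfloat('42') and isfloat('3.14')
    assert not isfloat('banana')
    # filterstr keeps letters and spaces, drops digits and punctuation
    assert filterstr(u'C3PO & R2D2!') == u'CPO  RD'
    print "Cleaning helper checks passed."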
# Names of the popular text-based subreddits we want to look at
subreddits = ['explainlikeimfive', 'AskReddit', 'TalesFromTechsupport', 'talesFromRetail',
              'pettyrevenge', 'askhistorians', 'askscience', 'tifu', 'nosleep', 'jokes',
              'atheism', 'politics']

# Append a date stamp to a txt file to record when this download happened
timeformat = '%m-%d-%y-%H%M'
printdate = datetime.datetime.now().strftime(timeformat)
with open('Data/New/dltimestamp.txt', 'a') as the_file:
    the_file.write(printdate + '\n')

for subreddit in subreddits:
    #print subreddit
    # Reddit only serves up to 1000 posts per section of a subreddit,
    # so we grab as much as possible from each listing
    top_all = get_subreddit_df(subreddit, 'top', user_agent, 1000, 'all', status=False)
    top_week = get_subreddit_df(subreddit, 'top', user_agent, 1000, 'week', status=False)
    top_day = get_subreddit_df(subreddit, 'top', user_agent, 1000, 'day', status=False)
    hot = get_subreddit_df(subreddit, 'hot', user_agent, 1000, status=False)
    new = get_subreddit_df(subreddit, 'new', user_agent, 1000, status=False)

    # Subset and clean each dataframe
    top_all = subset_subreddit(top_all)
    top_week = subset_subreddit(top_week)
    top_day = subset_subreddit(top_day)
    hot = subset_subreddit(hot)
    new = subset_subreddit(new)

    # Add a column in each denoting the name of the subreddit
    top_all['subreddit'] = subreddit
    top_week['subreddit'] = subreddit
    top_day['subreddit'] = subreddit
    hot['subreddit'] = subreddit
    new['subreddit'] = subreddit

    # Add a column denoting the section of the subreddit
    top_all['type'] = 'top_all'
    top_week['type'] = 'top_week'
    top_day['type'] = 'top_day'
    hot['type'] = 'hot'
    new['type'] = 'new'

    # Write the scraped subreddit information to csv files
    dltopall = 'Data/New/' + subreddit + 'top_all.csv'
    dltopweek = 'Data/New/' + subreddit + 'top_week.csv'
    dltopday = 'Data/New/' + subreddit + 'top_day.csv'
    dlhot = 'Data/New/' + subreddit + 'hot.csv'
    dlnew = 'Data/New/' + subreddit + 'new.csv'
    top_all.to_csv(dltopall, index=False, encoding='utf-8')
    top_week.to_csv(dltopweek, index=False, encoding='utf-8')
    top_day.to_csv(dltopday, index=False, encoding='utf-8')
    hot.to_csv(dlhot, index=False, encoding='utf-8')
    new.to_csv(dlnew, index=False, encoding='utf-8')
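
# Optional check on one of the csv files written above -- a sketch, not called
# by the scraper; the default path assumes the current run produced output for
# r/AskReddit's 'new' section under Data/New/.
def _check_output(path='Data/New/AskRedditnew.csv'):
    check = pd.read_csv(path, encoding='utf-8')
    print "Rows: %d, columns: %s" % (len(check), list(check.columns))
    print check[['subreddit', 'type', 'score', 'comments']].head()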