import time
import pandas as pd
import numpy as np
import json
import os
import urllib
import urllib2
import datetime
user_agent = ("Project for Data Science class v1.0" " /u/Valedra" " https://github.com/jaysayre/intelligentdolphins")
Now that we've downloaded all of the data we need using Redditscraping.ipynb, we need to assemble it and download a few more pieces of information. First, let's combine all of the data frames.
file_dir = "Data/New/"
path, dirs, files = os.walk(file_dir).next()
csvfiles = [file_dir + i for i in files if ".csv" in i ] #Builds a list with .csv files
csvfiles.sort()
def filemerge(csvfiles):
    '''
    Concatenates a list of .csv files into a single pandas dataframe.
    '''
    if len(csvfiles) >= 2:
        df = pd.DataFrame()
        for csvfile in csvfiles:
            dfnew = pd.read_csv(csvfile, encoding='utf-8')
            df = df.append(dfnew)
        return df
    else:
        print 'Not enough files'
filemerge(csvfiles).to_csv('Data/combined.csv', index=False, encoding='utf-8')
Next, let's download a separate dataframe containing the total karma of each poster in our new, combined data frame.
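The karma comes from reddit's /user/&lt;name&gt;/about.json endpoint; the only fields we rely on below are data['comment_karma'] and data['link_karma']. As a minimal sketch of a single lookup (the username here is just a placeholder, not one from our data):
# Sketch of a single karma lookup -- 'some_user' is a placeholder username
import json
import urllib2
url = 'http://www.reddit.com/user/%s/about.json' % 'some_user'
request = urllib2.Request(url, headers={'User-agent': user_agent})
about = json.loads(urllib2.urlopen(request).read())
print about['data']['comment_karma'], about['data'].get('link_karma', 0)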
df = pd.read_csv('Data/combined.csv', encoding='utf-8')
def add_karma(df, user_agent, display=20, start=0, end=None):
    '''
    Adds in the karma score for every author, and returns a new dataframe with that information.
    This part takes a bit longer, as we have to make a get request for every user.
    Parameters --
    df: input dataframe
    user_agent: same as before
    display: how often to print status updates on the download progress
    start, end: slice of df.index to process (defaults to the whole frame)
    Returns --
    A pandas dataframe with the author of each post along with their overall (comment) and link karma
    '''
    if end is None:
        end = len(df)
    count = 0
    dfidlist = list(df.index)
    dfidlist = dfidlist[start:end]
    # Placeholder columns copied from df just to get a frame with the right index; values are overwritten below
    df2 = pd.DataFrame({'karma': df['id'], 'link_karma': df['subreddit'], 'author': df['author']})
    for i in dfidlist:
        try:
            reddit_url = 'http://www.reddit.com/user/%s/about.json' % df['author'][i]
            headers = {'User-agent': user_agent}
            df2['author'][i] = df['author'][i]
            jsondata = json_extract(reddit_url, headers)
            df2['karma'][i] = jsondata['data']['comment_karma']
            try:
                df2['link_karma'][i] = jsondata['data']['link_karma']
            except:
                df2['link_karma'][i] = 0
            count += 1
        except:
            # Request failed (e.g. a deleted account), so record zero karma
            df2['karma'][i] = 0
            df2['link_karma'][i] = 0
            count += 1
        if count % int(display) == 0:
            print "Retrieved karma for %s users" % count
    return df2
def json_extract(baseurl, headrs=None, params=None, extraparam=None):
    '''
    Helper function to download and read json data. Takes optional headers and GET parameters
    (plus an extra 't' parameter for reddit's time filters) and returns the parsed json dict.
    '''
    if params != None:
        if extraparam != None:
            params['t'] = extraparam
        form = urllib.urlencode(params)
        url = baseurl + form
    else:
        url = baseurl
    if headrs != None:
        request = urllib2.Request(url, headers=headrs)
    else:
        request = urllib2.Request(url)
    return json.loads(urllib2.urlopen(request).read())
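The extraparam argument is what lets us ask reddit's listing endpoints for a particular time window (t=day, t=week, t=all). As a rough illustration (the subreddit and limit here are arbitrary choices, not part of the pipeline), fetching the week's top posts would look like:
# Illustrative call -- subreddit and limit are arbitrary; 'week' becomes the t= parameter
headers = {'User-agent': user_agent}
weektop = json_extract('http://www.reddit.com/r/askscience/top.json?', headers, {'limit': 25}, 'week')
print len(weektop['data']['children'])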
#Add in karma... Be careful as this is really quite slow
df = pd.read_csv('Data/combined.csv', encoding='utf-8')
df2 = add_karma(df, user_agent, display=40) #Change the start and end
df2.to_csv('Data/karma.csv', index=False, encoding='utf-8')
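One caveat: reddit's API rules ask clients to make no more than about one request every two seconds, and add_karma fires one request per row with no pause. The time module imported at the top is useful here; a minimal throttled sketch (run on a small sample of authors purely for illustration) might look like this:
# Throttled sketch: same lookups as add_karma, but sleeping between requests.
# The ten-author sample and two-second delay are just illustrative choices.
sample_karma = {}
for author in df['author'].head(10):
    about = json_extract('http://www.reddit.com/user/%s/about.json' % author,
                         {'User-agent': user_agent})
    sample_karma[author] = about['data']['comment_karma']
    time.sleep(2)   # stay under roughly one request per two seconds
print sample_karma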
Now that we've downloaded the karma information, we can merge it back into our original data set. Since df2 was built with the same index as df, we can simply assign the new columns directly.
fulldf = df
fulldf['karma'] = df2['karma']
fulldf['link_karma'] = df2['link_karma']
fulldf.to_csv('Data/full.csv', index=False, encoding='utf-8')
Finally, we need to download one more thing: the comments attached to each post. Not all of them, just the top comments returned for a given post.
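A post's comments endpoint returns a two-element listing: element 0 describes the post itself and element 1 holds the top-level comments, which is why get_comments below reads jsondata[1]['data']['children']. As a quick sketch (the post id here is a placeholder, not a real one), pulling the first top comment of a single post looks like:
# Sketch: 'abc123' is a placeholder post id
headers = {'User-agent': user_agent}
listing = json_extract('http://www.reddit.com/r/askscience/comments/%s.json?' % 'abc123', headers, {'sort': 'top'})
top_comment = listing[1]['data']['children'][0]['data']
print top_comment['author'], top_comment['body']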
def get_comments(subreddit, postid, sort_call, subtype, user_agent):
    '''
    Parameters --
    subreddit: subreddit title
    postid: the post's short (base 36) id
    sort_call: one of confidence, top, new, hot, controversial, old, random
    subtype: which listing the post came from (hot, new, top_all, etc.)
    user_agent: same as before
    Returns --
    A pandas dataframe with one row per top-level comment on the post
    '''
    reddit_base = 'http://www.reddit.com/r/%s/comments/%s.json?' % (subreddit, postid)
    headers = {'User-agent': user_agent}
    post_params = {'sort': sort_call}
    jsondata = json_extract(reddit_base, headers, post_params)
    comments, ids, ups, downs, authors, distin = [], [], [], [], [], []
    for item in jsondata[1]['data']['children']:
        for key, value in item['data'].items():
            if key == "author":
                if value == None:
                    authors.append('null')
                elif value == '[deleted]':
                    authors.append('null')
                else:
                    authors.append(value)
            elif key == "id":
                if value == None:
                    ids.append('null')
                elif value == '[deleted]':
                    ids.append('null')
                else:
                    ids.append(str(value))
            elif key == "body":
                if value == None:
                    comments.append('null')
                elif value == '[deleted]':
                    comments.append('null')
                else:
                    comments.append(value)
            elif key == "ups":
                if value == None:
                    ups.append('null')
                elif value == '[deleted]':
                    ups.append('null')
                else:
                    ups.append(value)
            elif key == "downs":
                if value == None:
                    downs.append('null')
                elif value == '[deleted]':
                    downs.append('null')
                else:
                    downs.append(value)
            elif key == "distinguished":
                if value == None:
                    distin.append('null')
                elif value == '[deleted]':
                    distin.append('null')
                else:
                    distin.append(value)
            else:
                pass
    try:
        return pd.DataFrame({'comment': comments, 'id': ids, 'ups': ups, 'downs': downs, 'author': authors,
                             'distinguished': distin, 'subreddit': subreddit, 'post': postid, 'type': subtype})
    except:
        # The lists can end up with mismatched lengths (the "more comments" stub, for instance, has an id but no body)
        #print len(comments), len(ids), len(ups), len(downs), len(authors)
        ids.pop(0)  # Might need to be more formulaic
        return pd.DataFrame({'comment': comments, 'id': ids, 'ups': ups, 'downs': downs, 'author': authors,
                             'distinguished': distin, 'subreddit': subreddit, 'post': postid, 'type': subtype})
bigdf = df
subs = list(bigdf['subreddit'].unique())
types = list(bigdf['type'].unique())
This will build a separate dataframe with the comments for each subreddit and listing type. It will take a while, as it has to make a separate API call for every post to download its comments.
for sub in subs:
    print sub
    for typ in types:
        print typ
        smalldf = bigdf[bigdf['subreddit'] == sub]
        smalldf = smalldf[smalldf['type'] == typ]
        dfidlist = list(smalldf.index)
        comments = pd.DataFrame()
        for i in dfidlist:
            try:
                comments = comments.append(get_comments(smalldf['subreddit'][i], smalldf['id'][i], 'top', smalldf['type'][i], user_agent))
            except:
                # Record posts whose comments couldn't be fetched so they can be retried later
                print i
                with open('Data/comments/missingcomments.txt', 'a') as the_file:
                    the_file.write(smalldf['id'][i])
                    the_file.write(' ' + sub)
                    the_file.write(' ' + typ + '\n')
        filename = 'Data/comments/' + sub + typ + '.csv'
        comments.to_csv(filename, index=False, encoding='utf-8')
Now that we've downloaded all of the comment files, we can begin to clean them up a little bit.
comment_dir = "Data/comments/"
path, dirs, files = os.walk(comment_dir).next()
commentfiles = [comment_dir + i for i in files if ".csv" in i ] #Builds a list with .csv files
commentfiles.sort()
#commentfiles
# Function to determine whether a value can be read as a float
def isfloat(x):
    try:
        float(x)
        return True
    except:
        return False

# Function to clean strings: keep only letters and spaces
def filterstr(x):
    try:
        y = ''.join(e for e in x if (e == ' ' or e.isalnum()) and not e.isdigit())
        return y
    except:
        return ''
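A quick sanity check of what these two helpers do (the sample strings below are made up):
# Made-up examples just to illustrate the two helpers
print isfloat('12.5'), isfloat('null')                     # True False
print filterstr(u'Great post!!! 10/10 would read again')   # 'Great post  would read again'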
def subset_comment(df):
    # Makes sure the commenter isn't a moderator (distinguished comments are dropped)
    df = df[df['distinguished'].apply(lambda x: x == 'null')]
    df = df.drop('distinguished', 1)
    # Occasionally, text or null entries will be in areas where only ints should be
    df = df[df['downs'].apply(lambda x: isfloat(x))]
    df = df[df['ups'].apply(lambda x: isfloat(x))]
    # Removes null comments or authors
    df = df[df['comment'].apply(lambda x: x != 'null')]
    df = df[df['author'].apply(lambda x: x != 'null')]
    # Ids can be weird as well; sometimes they manifest as long numbers.
    # Reddit uses an id system in base 36, so I doubt any reasonable id will be longer than 8 characters
    df = df[df['id'].apply(lambda x: len(str(x)) <= 8)]
    df = df[df['post'].apply(lambda x: len(str(x)) <= 8)]
    # Filter out characters that cause problems later, including all punctuation...
    for i in df.index:
        df['comment'][i] = filterstr(df['comment'][i])
        df['author'][i] = filterstr(df['author'][i])
    df = df.drop_duplicates()  # Remove duplicate entries, should they exist
    return df
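The 8-character cutoff above is generous: reddit ids are typically 6 or 7 base-36 characters, and even 8 base-36 digits already cover an enormous range, as a quick check shows (the string below is just the largest 8-character base-36 value, not a real id):
# Largest 8-character base-36 value, for scale -- not an actual post id
print int('zzzzzzzz', 36)   # 2821109907455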
for commentfile in commentfiles:
    commentdf = pd.read_csv(commentfile, encoding='utf-8')
    commentdf = subset_comment(commentdf)
    commentdf['score'] = commentdf['ups'].astype(int) - commentdf['downs'].astype(int)
    commentdf.to_csv(commentfile, index=False, encoding='utf-8')
Let's merge all of the comment files for a given subreddit, which will help us with comment parsing.
ath = ['Data/comments/atheismhot.csv',
'Data/comments/atheismnew.csv',
'Data/comments/atheismtop_all.csv',
'Data/comments/atheismtop_day.csv',
'Data/comments/atheismtop_week.csv']
poli = ['Data/comments/politicshot.csv',
'Data/comments/politicsnew.csv',
'Data/comments/politicstop_all.csv',
'Data/comments/politicstop_day.csv',
'Data/comments/politicstop_week.csv']
nos = ['Data/comments/nosleephot.csv',
'Data/comments/nosleepnew.csv',
'Data/comments/nosleeptop_all.csv',
'Data/comments/nosleeptop_day.csv',
'Data/comments/nosleeptop_week.csv']
ptyr = ['Data/comments/pettyrevengehot.csv',
'Data/comments/pettyrevengenew.csv',
'Data/comments/pettyrevengetop_all.csv',
'Data/comments/pettyrevengetop_day.csv',
'Data/comments/pettyrevengetop_week.csv']
joke = ['Data/comments/jokeshot.csv',
'Data/comments/jokesnew.csv',
'Data/comments/jokestop_all.csv',
'Data/comments/jokestop_day.csv',
'Data/comments/jokestop_week.csv']
askh = ['Data/comments/askhistorianshot.csv',
'Data/comments/askhistoriansnew.csv',
'Data/comments/askhistorianstop_all.csv',
'Data/comments/askhistorianstop_day.csv',
'Data/comments/askhistorianstop_week.csv']
tfts = ['Data/comments/TalesFromTechsupporthot.csv',
'Data/comments/TalesFromTechsupportnew.csv',
'Data/comments/TalesFromTechsupporttop_all.csv',
'Data/comments/TalesFromTechsupporttop_day.csv',
'Data/comments/TalesFromTechsupporttop_week.csv']
ar = ['Data/comments/AskReddithot.csv',
'Data/comments/AskRedditnew.csv',
'Data/comments/AskReddittop_all.csv',
'Data/comments/AskReddittop_day.csv',
'Data/comments/AskReddittop_week.csv']
tfr = ['Data/comments/talesFromRetailhot.csv',
'Data/comments/talesFromRetailnew.csv',
'Data/comments/talesFromRetailtop_all.csv',
'Data/comments/talesFromRetailtop_day.csv',
'Data/comments/talesFromRetailtop_week.csv']
asksci = ['Data/comments/asksciencehot.csv',
'Data/comments/asksciencenew.csv',
'Data/comments/asksciencetop_all.csv',
'Data/comments/asksciencetop_day.csv',
'Data/comments/asksciencetop_week.csv']
tifu = ['Data/comments/tifuhot.csv',
'Data/comments/tifunew.csv',
'Data/comments/tifutop_all.csv',
'Data/comments/tifutop_day.csv',
'Data/comments/tifutop_week.csv']
eli5 = ['Data/comments/explainlikeimfivehot.csv',
'Data/comments/explainlikeimfivenew.csv',
'Data/comments/explainlikeimfivetop_all.csv',
'Data/comments/explainlikeimfivetop_day.csv',
'Data/comments/explainlikeimfivetop_week.csv']
filemerge(ath).to_csv('Data/combinedcomments/atheism.csv', index=False, encoding='utf-8')
filemerge(poli).to_csv('Data/combinedcomments/politics.csv', index=False, encoding='utf-8')
filemerge(nos).to_csv('Data/combinedcomments/nosleep.csv', index=False, encoding='utf-8')
filemerge(ptyr).to_csv('Data/combinedcomments/pettyrevenge.csv', index=False, encoding='utf-8')
filemerge(joke).to_csv('Data/combinedcomments/jokes.csv', index=False, encoding='utf-8')
filemerge(askh).to_csv('Data/combinedcomments/askhistorians.csv', index=False, encoding='utf-8')
filemerge(tfts).to_csv('Data/combinedcomments/TalesFromTechsupport.csv', index=False, encoding='utf-8')
filemerge(ar).to_csv('Data/combinedcomments/AskReddit.csv', index=False, encoding='utf-8')
filemerge(tfr).to_csv('Data/combinedcomments/talesFromRetail.csv', index=False, encoding='utf-8')
filemerge(asksci).to_csv('Data/combinedcomments/askscience.csv', index=False, encoding='utf-8')
filemerge(tifu).to_csv('Data/combinedcomments/tifu.csv', index=False, encoding='utf-8')
filemerge(eli5).to_csv('Data/combinedcomments/explainlikeimfive.csv', index=False, encoding='utf-8')