%matplotlib inline
import pandas as pd
import numpy as np
import json
import os
import matplotlib.pyplot as plt
import string
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 30)
# set some nicer defaults for matplotlib
from matplotlib import rcParams
#these colors come from colorbrewer2.org. Each is an RGB triplet
dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667),
                (0.8509803921568627, 0.37254901960784315, 0.00784313725490196),
                (0.4588235294117647, 0.4392156862745098, 0.7019607843137254),
                (0.9058823529411765, 0.1607843137254902, 0.5411764705882353),
                (0.4, 0.6509803921568628, 0.11764705882352941),
                (0.9019607843137255, 0.6705882352941176, 0.00784313725490196),
                (0.6509803921568628, 0.4627450980392157, 0.11372549019607843),
                (0.4, 0.4, 0.4)]
rcParams['figure.figsize'] = (10, 6)
rcParams['figure.dpi'] = 150
rcParams['axes.color_cycle'] = dark2_colors
rcParams['lines.linewidth'] = 2
rcParams['axes.grid'] = False
rcParams['axes.facecolor'] = 'white'
rcParams['font.size'] = 14
rcParams['patch.edgecolor'] = 'none'
def remove_border(axes=None, top=False, right=False, left=True, bottom=True):
    """
    Minimize chartjunk by stripping out unnecessary plot borders and axis ticks

    The top/right/left/bottom keywords toggle whether the corresponding plot border is drawn
    """
    ax = axes or plt.gca()
    ax.spines['top'].set_visible(top)
    ax.spines['right'].set_visible(right)
    ax.spines['left'].set_visible(left)
    ax.spines['bottom'].set_visible(bottom)

    # turn off all ticks
    ax.yaxis.set_ticks_position('none')
    ax.xaxis.set_ticks_position('none')

    # now re-enable ticks on the visible sides
    if top:
        ax.xaxis.tick_top()
    if bottom:
        ax.xaxis.tick_bottom()
    if left:
        ax.yaxis.tick_left()
    if right:
        ax.yaxis.tick_right()
'''
# NO LONGER USED - WILL READ IN MORE THAN ONE CSV
file_dir = "Data/"

path, dirs, files = os.walk(file_dir).next()
csvfiles = [file_dir + i for i in files if ".csv" in i]  # builds a list of the .csv files
csvfiles.sort()
'''
big_table = pd.read_csv('Data/full.csv', encoding='utf-8')
big_table = big_table[big_table['author'] != "deleted"] # throw deleted posts away for looking at usernames
#needed for later - lists of all types and subreddits
types = list(big_table['type'].unique())
subreddits = list(big_table['subreddit'].unique())
print types # print the types we are looking at
[u'top_all', u'hot', u'new', u'top_week', u'top_day']
print "Number of distinct authors: ", len(big_table.groupby('author'))
Number of distinct authors: 20044
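As an aside, len(df.groupby('author')) counts the groups; the same number can be obtained more directly with nunique (an equivalent alternative, shown here for reference):
# equivalent, and avoids building the full groupby object just to count groups
print "Number of distinct authors: ", big_table['author'].nunique()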
We will create grouped tables in order to explore the data.
author_table = big_table.groupby('author')
author_count = author_table['author'].count()
How many posts do the most active authors have?
author_count.sort()
author_count[-20:]
author
Gambatte           46
DoremusJe up       46
CharlieDarwin      46
kaykhosrow         46
AL                 49
Libertatea         50
day                53
yupko              64
davidrei           64
pnewell            65
Vladith            69
maxwellhill        74
UserName           75
mepper             78
shadowbanmeplz     81
AdelleChattre      83
FredFltStn         87
wattmeter          91
BurtonDesque      178
drewiepoodle      276
dtype: int64
We will plot the number of posts of each individual author.
plt.hist(author_count, bins = 20, log=True)
remove_border()
successful_authors = big_table[author_table.author.transform(lambda x: x.count() > 1).astype('bool')]

# collect the distinct names of those authors
authorset = set(successful_authors['author'])

print "number of authors with more than 1 submission: ", len(authorset)
#print "Their names are:"
#for a in authorset:
#    print a
number of authors with more than 1 submission: 10089
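The same count can be cross-checked with value_counts (an equivalent formulation, added here as a sanity check):
# sanity check: count authors whose submission count exceeds 1
print "via value_counts: ", (big_table['author'].value_counts() > 1).sum()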
You can see that there is a small number of viral posts with more than 20,000 comments or a score above 4,000. The vast majority have fewer than 2,500 comments and a score below 2,500.
plt.hist(big_table['comments'], bins = 50, log=True)
plt.title("comment number distribution")
remove_border()
plt.show()
plt.hist(big_table['score'], bins = 50, log=True)
plt.title("score distribution")
remove_border()
plt.show()
Plotting the two histograms, you can see how similar they are. Both show a huge number of posts with a low score and few comments, plus a handful of really successful posts with many comments and a high score. To check how the two quantities are correlated, we make a scatter plot.
# regression line
m_fit, b_fit = np.polyfit(big_table.comments, big_table.score, 1)
plt.plot(big_table.comments, big_table.score, 'yo',
         big_table.comments, m_fit * big_table.comments + b_fit, color='purple', alpha=0.3)
plt.title("Comments versus Score")
plt.xlabel("Comments")
plt.ylabel("Score")
plt.xlim(-10, max(big_table.comments) * 1.05)
plt.ylim(-10, max(big_table.score) * 1.05 )
remove_border()
You can see that there is a weak linear correlation between the number of comments and the score. You can also see that the line of best fit doesn't really fit most posts; this is due to the large number of unsuccessful posts in the bottom-left part of the plot. There also seems to be a magical border at a score of roughly 2,000-2,500 that posts rarely cross, no matter how many comments they have. We suspect that this is because only a small part of the reddit community actually participates in up- and downvoting and votes on every post. The majority of people rarely vote on anything, only on really great content, which is what lets a post break the 2,500-score border. We might want to investigate this further later: what causes people who generally do not vote to vote on content?
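To put a number on that weak correlation, a quick check we can run on the full table:
# Pearson correlation between comment count and score
print "Pearson r: ", np.corrcoef(big_table['comments'], big_table['score'])[0, 1]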
Next we will investigate whether comments and score are still correlated when both are really low.
big_table_filtered = big_table[big_table['comments'] < 50]
big_table_filtered = big_table_filtered[big_table_filtered['score'] < 100]
plt.scatter(big_table_filtered.comments, big_table_filtered.score, alpha=0.2)
plt.title("Comments versus Score")
plt.xlabel("Comments")
plt.ylabel("Score")
plt.xlim(-1, max(big_table_filtered.comments) * 1.05)
plt.ylim(-1, max(big_table_filtered.score) * 1.05 )
remove_border()
Visualizing the filtered data, with the successful posts removed, shows essentially no correlation between comments and score: almost every combination of the two occurs in this range. Since we want to predict a post's success while it is still in this bottom-left part of the chart, the comment-to-score ratio might not be a good indicator of whether a post can become successful.
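The same quick check on the filtered subset makes the visual impression concrete; we would expect r to be much closer to zero here than on the full table:
# Pearson correlation on the low-comment, low-score subset
print "Pearson r (filtered): ", np.corrcoef(big_table_filtered['comments'], big_table_filtered['score'])[0, 1]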
def get_sub_stats(subreddit):
    """Return (active_users, dist_authors, succ_ratio) for one slice of the data."""
    author_table = subreddit.groupby('author')
    dist_authors = len(author_table)
    #print "Number of distinct authors: ", dist_authors

    # posts whose author appears more than once in this slice
    successful_authors = subreddit[author_table.author.transform(lambda x: x.count() > 1).astype('bool')]
    authorset = set(successful_authors['author'])
    active_users = len(authorset)
    #print "number of authors with more than 1 submission in the top 1000: ", active_users

    succ_ratio = float(active_users) / dist_authors
    return active_users, dist_authors, succ_ratio
We wrote this function to find out whether there are super users who appear in all the top lists. We suspected that some users who are famous within the community get a higher score based on their name rather than their submission. That's why we tested how many distinct users there are in the data set and what percentage of them have multiple submissions in the top lists.
authorstats = {}
for ctype in types:
    curr_df = big_table[big_table['type'] == ctype]
    authorstats[ctype] = get_sub_stats(curr_df)
print authorstats
{u'top_week': (1029, 4968, 0.2071256038647343), u'new': (1060, 9421, 0.11251459505360366), u'top_all': (913, 9354, 0.09760530254436604), u'hot': (1091, 9494, 0.11491468295765747), u'top_day': (344, 1659, 0.20735382760699217)}
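For readability, the same statistics can be viewed as a small table (an optional reformatting of the dict above; the column names are just descriptive labels for the tuple entries returned by get_sub_stats):
# optional: the same numbers as a DataFrame, one row per type
stats_df = pd.DataFrame(authorstats, index=['active_users', 'distinct_authors', 'success_ratio']).T
print stats_df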
'''
# no longer needed, as we have all the data in one frame now. Maybe some of it
# can be used to investigate distinct subreddits later.
author_week_list = []
author_all_list = []
author_new_list = []

for i, v in enumerate(csvfiles):
    sr_name = string.split(v, sep='\\')[-1]
    sr_name = sr_name[5:]
    sr_name = string.split(sr_name, sep='.')[0]

    sr_stats = get_sub_stats(pd.read_csv(csvfiles[i], encoding='utf-8'))
    if "new" in sr_name:
        author_new_list.append((sr_name, sr_stats))
    elif "_all" in sr_name:
        author_all_list.append((sr_name, sr_stats))
    else:
        author_week_list.append((sr_name, sr_stats))

    #print "ratio of successful users: ", sub_stats
'''
After collecting the author statistics, we need to plot them. Since we want to look at two values per group, a scatter plot is the best way to visualize them.
def plot_author_success(successlist):
    # each dict value is an (active_users, dist_authors, succ_ratio) tuple from get_sub_stats
    xvals = [value[0] for key, value in successlist.iteritems()]
    yvals = [value[2] for key, value in successlist.iteritems()]
    labellist = [key for key, value in successlist.iteritems()]
    #plt.annotate([a[0] for a in author_week_list], xy=(1, 10), xytext=(1, 20))

    fig, ax = plt.subplots()
    ax.scatter(xvals, yvals)
    for i, txt in enumerate(labellist):
        ax.annotate(txt, (xvals[i], yvals[i]))

    plt.title("Active users and their success rate")
    plt.xlabel("Number of active users (authors with multiple posts)")
    plt.ylabel("Fraction of users with multiple posts")
    remove_border()
plot_author_success(authorstats)
subreddits = list(big_table['subreddit'].unique())
sr_stats = {}
for ctype in subreddits:
    curr_df = big_table[big_table['subreddit'] == ctype]
    sr_stats[ctype] = get_sub_stats(curr_df)
plot_author_success(sr_stats)
What you can see here is that the types with the higher success ratio are the ones where a user had to be successful within a short time span to make the list in the first place. It seems that getting onto one of the short-term top lists motivates users to post more content, which then becomes successful as well. This doesn't hold for top_all, though, probably because the posts there are very viral and unpredictable, and only a small fraction of users ever make that list.
It is confusing that hot and top_day yield different results, as you would expect them to be very similar.
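One way to probe that discrepancy, sketched here as a possible follow-up rather than a finished analysis, is to check how much the author sets of the two types actually overlap:
# follow-up sketch: overlap between the 'hot' and 'top_day' author sets
hot_authors = set(big_table[big_table['type'] == 'hot']['author'])
day_authors = set(big_table[big_table['type'] == 'top_day']['author'])
print "authors in both 'hot' and 'top_day': ", len(hot_authors & day_authors)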
def plot_subr_data(subreddit, ctype):
    '''
    plt.hist(subreddit['comments'], bins = 50, log=True)
    plt.title("comment number distribution")
    '''
    # fit and draw a regression line, as before
    m_fit, b_fit = np.polyfit(subreddit.comments, subreddit.score, 1)
    plt.plot(subreddit.comments, subreddit.score, 'yo',
             subreddit.comments, m_fit * subreddit.comments + b_fit, color='purple', alpha=0.3)
    plt.title("Comments versus Score - " + ctype)
    plt.xlabel("Comments")
    plt.ylabel("Score")
    #plt.xlim(-10, 13000)
    #plt.ylim(-10, 2700)
    remove_border()
for ctype in types:
    curr_df = big_table[big_table['type'] == ctype]
    plot_subr_data(curr_df, ctype)
    plt.show()
The code above produces the same visualization as earlier for each distinct type. You can see that the 2,000-2,500 score border shows up within the individual types as well.
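To compare the per-type regressions numerically rather than only visually, we could also print the fitted slopes (a small follow-up sketch using the same fit as plot_subr_data):
# fitted slope and intercept per type
for ctype in types:
    curr_df = big_table[big_table['type'] == ctype]
    m, b = np.polyfit(curr_df.comments, curr_df.score, 1)
    print "%s: slope %.2f, intercept %.1f" % (ctype, m, b)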
for ctype in subreddits:
    curr_df = big_table[big_table['subreddit'] == ctype]
    plot_subr_data(curr_df, ctype)
    plt.show()
Maybe it makes a difference whether the author entered some text in the description box of the post. Let's plot the scores of posts both with and without a "self" text.
def split_selftext_DataFrame(df):
    # missing self texts are read in by pandas as NaN, which is a float;
    # an actual self text comes in as a string
    is_string_list = []
    for idx, record in df['selftext'].iteritems():
        if type(record) == float:
            is_string_list.append(0)
        else:
            is_string_list.append(1)
    return is_string_list
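For reference, pandas offers a one-line equivalent, since the missing self texts are exactly the NaN entries (kept commented out here, as the function above is what we actually use):
# one-line equivalent: notnull() is True exactly where a self text exists
# big_table['islink'] = big_table['selftext'].notnull().astype(int)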
big_table['islink'] = split_selftext_DataFrame(big_table)
big_table_link = big_table[big_table['islink'] == 0]  # link posts without a self text
big_table_self = big_table[big_table['islink'] == 1]  # posts with a self text
def plot_link_vs_self(table_link, table_self):
    p1 = plt.scatter(table_link.score, table_link.comments, color='red', alpha=0.2)
    p2 = plt.scatter(table_self.score, table_self.comments, color='blue', alpha=0.2)
    plt.legend([p1, p2], ["no self text", "self texts"])
    plt.title("Comments versus Score")
    plt.xlabel("Score")  # note: score is plotted on the x-axis here
    plt.ylabel("Comments")
    plt.xlim(-10, 5000)
    plt.ylim(-10, 30000)
    remove_border()
plot_link_vs_self(big_table_link, big_table_self)
If we want to include the comment count in the prediction, we need to take into account that posts without a self text generally attract more comments than posts with one, which is surprising.
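To back up that observation with a number, we can compare the median comment counts of the two groups:
# median comment counts for link vs. self posts
print "median comments, link posts: ", big_table_link['comments'].median()
print "median comments, self posts: ", big_table_self['comments'].median()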