%matplotlib inline import pandas as pd import numpy as np import json import os import matplotlib.pyplot as plt import matplotlib.pylab as plt2 import string pd.set_option('display.width', 500) pd.set_option('display.max_columns', 30) # set some nicer defaults for matplotlib from matplotlib import rcParams #these colors come from colorbrewer2.org. Each is an RGB triplet dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667), (0.8509803921568627, 0.37254901960784315, 0.00784313725490196), (0.4588235294117647, 0.4392156862745098, 0.7019607843137254), (0.9058823529411765, 0.1607843137254902, 0.5411764705882353), (0.4, 0.6509803921568628, 0.11764705882352941), (0.9019607843137255, 0.6705882352941176, 0.00784313725490196), (0.6509803921568628, 0.4627450980392157, 0.11372549019607843), (0.4, 0.4, 0.4)] rcParams['figure.figsize'] = (10, 6) rcParams['figure.dpi'] = 150 rcParams['axes.color_cycle'] = dark2_colors rcParams['lines.linewidth'] = 2 rcParams['axes.grid'] = False rcParams['axes.facecolor'] = 'white' rcParams['font.size'] = 14 rcParams['patch.edgecolor'] = 'none' def remove_border(axes=None, top=False, right=False, left=True, bottom=True): """ Minimize chartjunk by stripping out unnecessary plot borders and axis ticks The top/right/left/bottom keywords toggle whether the corresponding plot border is drawn """ ax = axes or plt.gca() ax.spines['top'].set_visible(top) ax.spines['right'].set_visible(right) ax.spines['left'].set_visible(left) ax.spines['bottom'].set_visible(bottom) #turn off all ticks ax.yaxis.set_ticks_position('none') ax.xaxis.set_ticks_position('none') #now re-enable visibles if top: ax.xaxis.tick_top() if bottom: ax.xaxis.tick_bottom() if left: ax.yaxis.tick_left() if right: ax.yaxis.tick_right() ''' # NO LONGER USED - WILL READ IN MORE THAN ONE CSV file_dir = "Data/" path, dirs, files = os.walk(file_dir).next() csvfiles = [file_dir + i for i in files if ".csv" in i ] #Builds a list with .csv files csvfiles.sort() ''' 
# Load every scraped post into one frame.
big_table = pd.read_csv('Data/full.csv', encoding='utf-8')
# throw deleted posts away for looking at usernames
big_table = big_table[big_table['author'] != "deleted"]

# needed for later - lists of all types and subreddits
types = list(big_table['type'].unique())
subreddits = list(big_table['subreddit'].unique())
# Single-argument print(...) produces the same output under the Python 2
# print statement and the Python 3 print function.
print(types)  # print the types we are looking at

# Group once and reuse: the number of groups is the number of distinct authors.
author_table = big_table.groupby('author')
print("%s %d" % ("Number of distinct authors: ", len(author_table)))

# Posts per author, reordered ascending by count. Reordering through
# iloc/argsort avoids the in-place Series.sort(), which newer pandas removed.
author_count = author_table['author'].count()
author_count = author_count.iloc[np.argsort(author_count.values)]
author_count[-20:]  # the 20 most prolific authors (displayed only interactively)

plt.hist(author_count, bins=20, log=True)
remove_border()

# Rows whose author has more than one post in the data set.
successful_authors = big_table[
    author_table.author.transform(lambda x: x.count() > 1).astype('bool')
]
# Collect the distinct author names straight from the column rather than
# looking each row up individually.
authorset = set(successful_authors['author'])
print("%s %d" % ("number of authors with more than 1 submission: ", len(authorset)))
#print "Their names are:"
#for a in authorset:
#    print a

plt.hist(big_table['comments'], bins=50, log=True)
plt.title("comment number distribution")
remove_border()
plt.show()

plt.hist(big_table['score'], bins=50, log=True)
plt.title("score distribution")
remove_border()
plt.show()

# regression line: degree-1 polynomial fit of score against comments
m_fit, b_fit = plt2.polyfit(big_table.comments, big_table.score, 1)
plt2.plot(big_table.comments, big_table.score, 'yo',
          big_table.comments, m_fit * big_table.comments + b_fit,
          color='purple', alpha=0.3)
plt.title("Comments versus Score")
plt.xlabel("Comments")
plt.ylabel("Score")
plt.xlim(-10, max(big_table.comments) * 1.05)
plt.ylim(-10, max(big_table.score) * 1.05)
remove_border()

# Zoom in on the dense low end: fewer than 50 comments and a score below 100.
big_table_filtered = big_table[big_table['comments'] < 50]
big_table_filtered = big_table_filtered[big_table_filtered['score'] < 100]
plt.scatter(big_table_filtered.comments, big_table_filtered.score, alpha=0.2)
plt.title("Comments versus Score")
plt.xlabel("Comments")
plt.ylabel("Score")
plt.xlim(-1, max(big_table_filtered.comments) * 1.05)
plt.ylim(-1, max(big_table_filtered.score) * 1.05)
remove_border()
def get_sub_stats(subreddit):
    """Summarise how many authors post more than once in a frame of posts.

    ``subreddit`` is a DataFrame with an 'author' column (any slice of the
    post table works, not only a single subreddit).

    Returns a tuple ``(active_users, dist_authors, succ_ratio)``:
    the number of distinct authors with more than one row, the number of
    distinct authors overall, and the first divided by the second.
    A frame without any authors yields ``(0, 0, 0.0)`` instead of raising
    ZeroDivisionError.
    """
    author_table = subreddit.groupby('author')
    dist_authors = len(author_table)
    if dist_authors == 0:
        return 0, 0, 0.0

    # Keep only the rows whose author occurs more than once.
    successful_authors = subreddit[
        author_table.author.transform(lambda x: x.count() > 1).astype('bool')
    ]
    active_users = len(set(successful_authors['author']))
    succ_ratio = float(active_users) / dist_authors
    return active_users, dist_authors, succ_ratio


# Author statistics per post type.
authorstats = {}
for ctype in types:
    curr_df = big_table[big_table['type'] == ctype]
    authorstats[ctype] = get_sub_stats(curr_df)
print(authorstats)

# The string below is disabled code kept for reference; it is never executed.
'''
# not longer needed as we have all data in one frame now. Maybe come of it can be used to investigate distinct subreddits later
author_week_list = []
author_all_list = []
author_new_list = []
for i, v in enumerate(csvfiles):
    sr_name = string.split(v, sep='\\')[-1]
    sr_name = sr_name[5:]
    sr_name = string.split(sr_name, sep='.')[0]
    sr_stats = get_sub_stats(pd.read_csv(csvfiles[i], encoding='utf-8'))
    if "new" in sr_name:
        author_new_list.append((sr_name, sr_stats))
    elif "_all" in sr_name:
        author_all_list.append((sr_name, sr_stats))
    else:
        author_week_list.append((sr_name, sr_stats))
    #print "ratio of successful users: ", sub_stats
'''


def plot_author_success(successlist):
    """Scatter-plot the tuples produced by get_sub_stats, one labelled point per key.

    ``successlist`` maps a label to an ``(active_users, dist_authors,
    succ_ratio)`` tuple. Element 0 is used as the x value and element 2 as
    the y value; every point is annotated with its key.
    """
    # Fix the key order once so labels, x and y values stay aligned.
    labellist = list(successlist)
    xvals = [successlist[key][0] for key in labellist]
    yvals = [successlist[key][2] for key in labellist]

    fig, ax = plt.subplots()
    ax.scatter(xvals, yvals)
    for txt, x, y in zip(labellist, xvals, yvals):
        ax.annotate(txt, (x, y))
    plt.title("Active Users with their success rate")
    # NOTE(review): x is element 0 (authors with more than one post), while
    # this label speaks of distinct users (element 1) - confirm which is meant.
    plt.xlabel("No distinct users")
    plt.ylabel("fraction of users with multiple posts")
    remove_border()


plot_author_success(authorstats)

# Author statistics per subreddit.
subreddits = list(big_table['subreddit'].unique())
sr_stats = {}
for ctype in subreddits:
    curr_df = big_table[big_table['subreddit'] == ctype]
    sr_stats[ctype] = get_sub_stats(curr_df)
plot_author_success(sr_stats)


def plot_subr_data(subreddit, ctype):
    """Plot score against comments for one slice of posts with a fitted line.

    ``subreddit`` is a DataFrame with 'comments' and 'score' columns and
    ``ctype`` is the text appended to the plot title. The fitted line comes
    from a degree-1 polyfit of score on comments.
    """
    # Disabled alternative view, kept for reference:
    # plt.hist(subreddit['comments'], bins = 50, log=True)
    # plt.title("comment number distribution")
    slope, intercept = plt2.polyfit(subreddit.comments, subreddit.score, 1)
    plt2.plot(subreddit.comments, subreddit.score, 'yo',
              subreddit.comments, slope * subreddit.comments + intercept,
              color='purple', alpha=0.3)
    plt.title("Comments versus Score - " + ctype)
    plt.xlabel("Comments")
    plt.ylabel("Score")
    #plt.xlim(-10, 13000)
    #plt.ylim(-10, 2700)
    remove_border()


for ctype in types:
    curr_df = big_table[big_table['type'] == ctype]
    plot_subr_data(curr_df, ctype)
    plt.show()

for ctype in subreddits:
    curr_df = big_table[big_table['subreddit'] == ctype]
    plot_subr_data(curr_df, ctype)
    plt.show()


def split_selftext_DataFrame(df):
    """Return one 0/1 flag per row of ``df['selftext']``.

    The flag is 0 when the cell holds a float and 1 for anything else.
    Presumably the float case is the NaN pandas stores for a missing self
    text, i.e. a link post - verify against the data.
    """
    # isinstance also accepts float subclasses such as numpy.float64, which
    # an exact type comparison would misclassify as text.
    return [0 if isinstance(record, float) else 1 for record in df['selftext']]


# NOTE: despite its name, 'islink' is 1 for rows that HAVE self text and 0
# for rows without; the two selections below follow that encoding.
big_table['islink'] = split_selftext_DataFrame(big_table)
big_table_link = big_table[big_table['islink'] == 0]
big_table_self = big_table[big_table['islink'] == 1]


def plot_link_vs_self(table_link, table_self):
    """Overlay posts without self text (red) and with self text (blue).

    Both arguments are DataFrames with 'score' and 'comments' columns.
    Score is drawn on the x axis and comments on the y axis, within fixed
    axis limits.
    """
    p1 = plt.scatter(table_link.score, table_link.comments, color='red', alpha=0.2)
    p2 = plt.scatter(table_self.score, table_self.comments, color='blue', alpha=0.2)
    plt.legend([p1, p2], ["no self text", "self texts"])
    plt.title("Comments versus Score ")
    # The axis labels follow the plotted data: x is score, y is comments.
    plt.xlabel("Score")
    plt.ylabel("Comments")
    plt.xlim(-10, 5000)
    plt.ylim(-10, 30000)
    remove_border()


plot_link_vs_self(big_table_link, big_table_self)