%matplotlib inline

import pandas as pd
import numpy as np
import json
import os
import matplotlib.pyplot as plt
import matplotlib.pylab as plt2
import string

# Widen pandas' console output so wide frames are not wrapped/truncated early.
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 30)

# set some nicer defaults for matplotlib
from matplotlib import rcParams

#these colors come from colorbrewer2.org. Each is an RGB triplet
dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667),
                (0.8509803921568627, 0.37254901960784315, 0.00784313725490196),
                (0.4588235294117647, 0.4392156862745098, 0.7019607843137254),
                (0.9058823529411765, 0.1607843137254902, 0.5411764705882353),
                (0.4, 0.6509803921568628, 0.11764705882352941),
                (0.9019607843137255, 0.6705882352941176, 0.00784313725490196),
                (0.6509803921568628, 0.4627450980392157, 0.11372549019607843),
                (0.4, 0.4, 0.4)]

rcParams['figure.figsize'] = (10, 6)
rcParams['figure.dpi'] = 150
# 'axes.color_cycle' was deprecated in matplotlib 1.5 in favour of
# 'axes.prop_cycle' and is rejected by later releases. Prefer the new key and
# fall back to the old one only when the installed matplotlib predates it
# (no plt.cycler -> AttributeError, unknown rc key -> KeyError).
try:
    rcParams['axes.prop_cycle'] = plt.cycler(color=dark2_colors)
except (AttributeError, KeyError):
    rcParams['axes.color_cycle'] = dark2_colors
rcParams['lines.linewidth'] = 2
rcParams['axes.grid'] = False
rcParams['axes.facecolor'] = 'white'
rcParams['font.size'] = 14
rcParams['patch.edgecolor'] = 'none'


def remove_border(axes=None, top=False, right=False, left=True, bottom=True):
    """
    Strip unneeded plot borders ("spines") and axis ticks from a plot.

    axes is the Axes object to modify; when it is omitted (or falsy) the
    current axes returned by plt.gca() are used instead.

    Each of top/right/left/bottom says whether the border on that side is
    drawn. All ticks are switched off first and then re-enabled by calling
    tick_top / tick_bottom / tick_left / tick_right, in that order, for the
    sides whose flag is true.
    """
    ax = axes if axes else plt.gca()

    # show or hide every spine according to its keyword
    spine_flags = (('top', top), ('right', right),
                   ('left', left), ('bottom', bottom))
    for side, visible in spine_flags:
        ax.spines[side].set_visible(visible)

    # start from "no ticks anywhere" ...
    for axis in (ax.yaxis, ax.xaxis):
        axis.set_ticks_position('none')

    # ... and switch ticks back on for the sides that are still drawn
    tick_switches = ((top, ax.xaxis.tick_top),
                     (bottom, ax.xaxis.tick_bottom),
                     (left, ax.yaxis.tick_left),
                     (right, ax.yaxis.tick_right))
    for enabled, switch_on in tick_switches:
        if enabled:
            switch_on()

'''
# NO LONGER USED - WILL READ IN MORE THAN ONE CSV
file_dir = "Data/"

path, dirs, files = os.walk(file_dir).next()
csvfiles = [file_dir + i for i in files if ".csv" in i ] #Builds a list with .csv files
csvfiles.sort()
'''

# Load all submissions from the single combined CSV file.
big_table = pd.read_csv('Data/full.csv', encoding='utf-8')
big_table = big_table[big_table['author'] != "deleted"] # throw deleted posts away for looking at usernames

#needed for later - lists of all types and subreddits
types = list(big_table['type'].unique())
subreddits = list(big_table['subreddit'].unique())

# print(...) with a single argument behaves identically on Python 2 and 3.
print(types) # print the types we are looking at

# Group once and reuse the grouping for every per-author statistic below.
author_table = big_table.groupby('author')

# "%s %s" reproduces the output of the former two-argument print statement.
print("%s %s" % ("Number of distinct authors: ", len(author_table)))

# Number of rows per author, smallest first. Series.sort() (in place) no
# longer exists in current pandas, so sort_values() is used and re-assigned.
author_count = author_table['author'].count()
author_count = author_count.sort_values()

# The 20 largest counts; this bare expression is only displayed when it is
# the last statement of a notebook cell.
author_count[-20:]

plt.hist(author_count, bins = 20, log=True)
remove_border()
# Finish this figure so later plots do not draw into the same axes when the
# code runs outside a notebook.
plt.show()

# Rows whose author occurs more than once in the table.
successful_authors = big_table[author_table.author.transform(lambda x: x.count() > 1).astype('bool')]
# Distinct names taken straight from the column; avoids one label lookup per
# row through the removed .ix indexer.
authorset = set(successful_authors['author'])
print("%s %s" % ("number of authors with more than 1 submission: ", len(authorset)))
#print "Their names are:"
#for a in authorset:
#    print a

# Distribution of the number of comments per row (log-scaled counts).
plt.hist(big_table['comments'], bins = 50, log=True)
plt.title("comment number distribution")
remove_border()
plt.show()

# Distribution of the scores (log-scaled counts).
plt.hist(big_table['score'], bins = 50, log=True)
plt.title("score distribution")
remove_border()
plt.show()


#regression line
# Degree-1 least-squares fit of score against comments: slope and intercept.
m_fit,b_fit = plt2.polyfit(big_table.comments, big_table.score, 1)
# NOTE(review): the color keyword is passed for the whole call, so it
# presumably also overrides the yellow requested by 'yo' - confirm intent.
plt2.plot(big_table.comments, big_table.score, 'yo', big_table.comments, m_fit*big_table.comments+b_fit, color='purple', alpha=0.3)
plt.title("Comments versus Score")
plt.xlabel("Comments")
plt.ylabel("Score")
# Series.max() instead of the builtin max(): it does not iterate the column
# in Python and is not thrown off by NaN entries.
plt.xlim(-10, big_table.comments.max() * 1.05)
plt.ylim(-10, big_table.score.max() * 1.05 )
remove_border()
# Like the histograms above, close the figure off explicitly; otherwise the
# scatter plot below is drawn on top of it when run outside a notebook.
plt.show()


# Zoom in on the dense region: fewer than 50 comments and score below 100.
big_table_filtered = big_table[big_table['comments'] < 50]
big_table_filtered = big_table_filtered[big_table_filtered['score'] < 100]

plt.scatter(big_table_filtered.comments, big_table_filtered.score, alpha=0.2)
plt.title("Comments versus Score")
plt.xlabel("Comments")
plt.ylabel("Score")
plt.xlim(-1, big_table_filtered.comments.max() * 1.05)
plt.ylim(-1, big_table_filtered.score.max() * 1.05 )
remove_border()
plt.show()

def get_sub_stats(subreddit):
    """
    Summarise how many authors of a table appear more than once.

    subreddit is a DataFrame with an 'author' column; rows whose author is
    missing are ignored.

    Returns a tuple (active_users, dist_authors, succ_ratio):
      active_users -- number of authors that occur in more than one row
      dist_authors -- number of distinct authors
      succ_ratio   -- active_users / dist_authors as a float, or 0.0 when
                      the table contains no authors at all
    """
    # Rows per author in one vectorised pass. This replaces a per-row lookup
    # through the removed .ix indexer, which also failed on duplicate index
    # labels.
    posts_per_author = subreddit['author'].value_counts()

    dist_authors = len(posts_per_author)
    active_users = int((posts_per_author > 1).sum())

    # An empty table used to raise ZeroDivisionError here.
    if dist_authors == 0:
        return active_users, dist_authors, 0.0

    succ_ratio = float(active_users) / dist_authors
    return active_users, dist_authors, succ_ratio
    

# Author statistics per value of the 'type' column:
# type -> (active_users, dist_authors, succ_ratio) as returned by get_sub_stats.
authorstats = {}
for ctype in types:
    curr_df = big_table[big_table['type'] == ctype]
    authorstats[ctype] = get_sub_stats(curr_df)
# Single-argument print(...) gives the same output on Python 2 and Python 3.
print(authorstats)


'''
# no longer needed as we have all data in one frame now. Maybe some of it can be used to investigate distinct subreddits later
author_week_list = []
author_all_list = []
author_new_list = []

for i, v in enumerate(csvfiles):
    sr_name = string.split(v, sep='\\')[-1]
    sr_name = sr_name[5:]
    sr_name = string.split(sr_name, sep='.')[0]
    
    sr_stats = get_sub_stats(pd.read_csv(csvfiles[i], encoding='utf-8'))
    if "new" in sr_name:
        author_new_list.append((sr_name, sr_stats))
    elif "_all" in sr_name:
        author_all_list.append((sr_name, sr_stats))
    else:
        author_week_list.append((sr_name, sr_stats))
    
    #print "ratio of successful users: ", sub_stats
'''

def plot_author_success(successlist):
    """
    Scatter-plot author statistics, one annotated point per dictionary key.

    successlist maps a label to an indexable value; element 0 of the value
    is used as x coordinate and element 2 as y coordinate. The callers in
    this file pass the tuples returned by get_sub_stats, i.e.
    (active_users, dist_authors, succ_ratio).
    """
    # .items() works on Python 2 and Python 3 dicts alike (iteritems() is
    # Python 2 only); materialise it once so labels and values stay aligned.
    entries = list(successlist.items())
    labellist = [key for key, value in entries]
    xvals = [value[0] for key, value in entries]
    yvals = [value[2] for key, value in entries]

    fig, ax = plt.subplots()
    ax.scatter(xvals, yvals)

    # write each label next to its point
    for i, txt in enumerate(labellist):
        ax.annotate(txt, (xvals[i], yvals[i]))
    plt.title("Active Users with their success rate")
    # NOTE(review): x is element 0 of each value (active_users for the
    # callers in this file), while the label reads "distinct users" -
    # confirm which of the two was intended.
    plt.xlabel("No distinct users")
    plt.ylabel("fraction of users with multiple posts")
    remove_border(ax)

# First the per-type statistics ...
plot_author_success(authorstats)

# ... then the same statistics computed separately for every subreddit.
subreddits = list(big_table['subreddit'].unique())
sr_stats = {}
for ctype in subreddits:
    in_subreddit = big_table['subreddit'] == ctype
    curr_df = big_table[in_subreddit]
    sr_stats[ctype] = get_sub_stats(curr_df)
plot_author_success(sr_stats)

def plot_subr_data(subreddit, ctype):
    """
    Plot score against number of comments for one slice of the data,
    together with a straight line fitted through the points.

    subreddit -- table providing 'comments' and 'score' columns
    ctype     -- string appended to the plot title
    """
    comments = subreddit.comments
    score = subreddit.score

    # degree-1 polynomial fit: slope first, then intercept
    slope, intercept = plt2.polyfit(comments, score, 1)
    fitted = slope * comments + intercept

    # raw points ('yo' format) and the fitted line in a single plot call
    plt2.plot(comments, score, 'yo',
              comments, fitted,
              color='purple', alpha=0.3)

    plt.title("Comments versus Score - " + ctype)
    plt.xlabel("Comments")
    plt.ylabel("Score")
    # no fixed xlim/ylim here: every plot is scaled automatically
    remove_border()
    

# One "Comments versus Score" figure per type, then one per subreddit.
for column, values in (('type', types), ('subreddit', subreddits)):
    for ctype in values:
        curr_df = big_table[big_table[column] == ctype]
        plot_subr_data(curr_df, ctype)
        plt.show()

def split_selftext_DataFrame(df):
    """
    Flag which rows of df carry a self text.

    df must have a 'selftext' column. Returns a plain list of ints in row
    order: 1 where 'selftext' holds a value, 0 where it is missing (NaN).
    """
    # Vectorised missing-value test. The previous per-row loop relied on
    # Series.iteritems(), which current pandas no longer provides, and on
    # `type(x) == float`, which does not recognise numpy float NaNs.
    return df['selftext'].notnull().astype(int).tolist()
# NOTE: despite the column name, 'islink' is 1 for rows that have a self
# text and 0 for rows without one (see split_selftext_DataFrame).
big_table['islink'] = split_selftext_DataFrame(big_table)

# The flag only takes the values 0 and 1, so one mask and its negation
# split the table into the two groups.
has_selftext = big_table['islink'] == 1
big_table_link = big_table[~has_selftext]
big_table_self = big_table[has_selftext]

def plot_link_vs_self(table_link, table_self):
    """
    Scatter comments against score for two groups of rows in one figure.

    table_link -- rows drawn in red, legend entry "no self text"
    table_self -- rows drawn in blue, legend entry "self texts"

    Both tables must provide 'score' and 'comments' columns. The score goes
    on the x axis and the number of comments on the y axis; the axis limits
    are fixed at (-10, 5000) and (-10, 30000).
    """
    p1 = plt.scatter(table_link.score, table_link.comments, color='red', alpha = 0.2)
    p2 = plt.scatter(table_self.score, table_self.comments, color='blue', alpha = 0.2)

    plt.legend([p1, p2], ["no self text", "self texts"])
    plt.title("Comments versus Score ")
    # The scatter calls put score on x and comments on y; the labels were
    # previously the other way round and therefore mislabelled both axes.
    plt.xlabel("Score")
    plt.ylabel("Comments")
    plt.xlim(-10, 5000)
    plt.ylim(-10, 30000)
    remove_border()

# Compare rows without self text (big_table_link) with rows that have one
# (big_table_self) in a single scatter plot.
plot_link_vs_self(big_table_link, big_table_self)