%matplotlib inline
import json
import numpy as np
import copy
import pandas as pd
import networkx as nx
import requests
import scipy
from pattern import web
import matplotlib.pyplot as plt
import matplotlib.pylab as plt2
from scipy.stats import pearsonr
from datetime import datetime
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.metrics.pairwise import cosine_similarity
from myalchemy import MyAlchemy
from sklearn import svm, tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import RidgeClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.utils.extmath import density
from sklearn import metrics

# set some nicer defaults for matplotlib
from matplotlib import rcParams

# these colors come from colorbrewer2.org. Each is an RGB triplet
dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667),
                (0.8509803921568627, 0.37254901960784315, 0.00784313725490196),
                (0.4588235294117647, 0.4392156862745098, 0.7019607843137254),
                (0.9058823529411765, 0.1607843137254902, 0.5411764705882353),
                (0.4, 0.6509803921568628, 0.11764705882352941),
                (0.9019607843137255, 0.6705882352941176, 0.00784313725490196),
                (0.6509803921568628, 0.4627450980392157, 0.11372549019607843),
                (0.4, 0.4, 0.4)]

rcParams['figure.figsize'] = (10, 6)
rcParams['figure.dpi'] = 150
rcParams['axes.color_cycle'] = dark2_colors
rcParams['lines.linewidth'] = 2
rcParams['axes.grid'] = False
rcParams['axes.facecolor'] = 'white'
rcParams['font.size'] = 14
rcParams['patch.edgecolor'] = 'none'

def remove_border(axes=None, top=False, right=False, left=True, bottom=True):
    """
    Minimize chartjunk by stripping out unnecessary plot borders and axis ticks

    The top/right/left/bottom keywords toggle whether the corresponding plot border is drawn
    """
    ax = axes or plt.gca()
    ax.spines['top'].set_visible(top)
    ax.spines['right'].set_visible(right)
    ax.spines['left'].set_visible(left)
    ax.spines['bottom'].set_visible(bottom)

    # turn off all ticks
    ax.yaxis.set_ticks_position('none')
    ax.xaxis.set_ticks_position('none')

    # now re-enable visibles
    if top:
        ax.xaxis.tick_top()
    if bottom:
        ax.xaxis.tick_bottom()
    if left:
        ax.yaxis.tick_left()
    if right:
        ax.yaxis.tick_right()

big_table = pd.read_csv('Data/full.csv', encoding='utf-8')
big_table = big_table[big_table['author'] != "deleted"]

print "Number of posts: ", len(big_table)
print "Number of distinct authors: ", len(big_table.groupby('author'))

def get_author_stats():
    author_table = big_table.groupby('author')
    author_count = author_table['author'].count()
    author_count.sort()
    return author_count

author_count = get_author_stats()
author_count[-10:]

plt.hist(author_count, bins=20, log=True)
plt.title("Distribution of number of submissions")
remove_border()

types = list(big_table['type'].unique())
def get_sub_stats(subreddit):
    '''
    returns:
    - the number of active users with more than one post
    - the number of distinct authors
    - the ratio of active/distinct users for a subreddit
    '''
    author_table = subreddit.groupby('author')
    dist_authors = len(subreddit.groupby('author'))
    #print "Number of distinct authors: ", dist_authors
    successful_authors = subreddit[author_table.author.transform(lambda x: x.count() > 1).astype('bool')]
    authorset = set()
    for a in successful_authors.index:
        authorset.add(successful_authors.ix[a]['author'])
    active_users = len(authorset)
    #print "number of authors with more than 1 submission in the top 1000: ", active_users
    if dist_authors > 0:
        succ_ratio = float(active_users) / dist_authors
    else:
        succ_ratio = 0
    return active_users, dist_authors, succ_ratio

#get the values for all types of data
authorstats = {}
for ctype in types:
    curr_df = big_table[big_table['type'] == ctype]
    authorstats[ctype] = get_sub_stats(curr_df)
    del curr_df  #reduce memory

def plot_author_success(successlist):
    '''
    plots a scatterplot for a dict of subreddit stats calculated before
    X-Axis: Number of active users
    Y-Axis: Success ratio
    '''
    xvals = [value[0] for key, value in successlist.iteritems()]
    yvals = [value[2] for key, value in successlist.iteritems()]
    labellist = [key for key, value in successlist.iteritems()]
    fig, ax = plt.subplots()
    ax.scatter(xvals, yvals)
    for i, txt in enumerate(labellist):
        ax.annotate(txt, (xvals[i], yvals[i]))
    plt.title("Active Users with their success rate")
    plt.xlabel("No. active users")
    plt.ylabel("fraction of users with multiple posts")
    remove_border()

plot_author_success(authorstats)

subreddits = list(big_table['subreddit'].unique())
sr_stats = {}
for ctype in subreddits:
    curr_df = big_table[big_table['subreddit'] == ctype]
    sr_stats[ctype] = get_sub_stats(curr_df)
    del curr_df  #reduce memory

plot_author_success(sr_stats)
del sr_stats  #reduce memory

#regression line
m_fit, b_fit = plt2.polyfit(big_table.comments, big_table.score, 1)
plt2.plot(big_table.comments, big_table.score, 'yo',
          big_table.comments, m_fit * big_table.comments + b_fit,
          color='purple', alpha=0.3)
plt.title("Comments versus Score")
plt.xlabel("Comments")
plt.ylabel("Score")
plt.xlim(-10, max(big_table.comments) * 1.05)
plt.ylim(-10, max(big_table.score) * 1.05)
remove_border()

big_table_filtered = big_table[big_table['comments'] < 50]  #only look at posts with <50 comments
big_table_filtered = big_table_filtered[big_table_filtered['score'] < 100]  #and with a score below 100
plt.scatter(big_table_filtered.comments, big_table_filtered.score, alpha=0.2)
plt.title("Comments versus Score")
plt.xlabel("Comments")
plt.ylabel("Score")
plt.xlim(-1, max(big_table_filtered.comments) * 1.05)
plt.ylim(-1, max(big_table_filtered.score) * 1.05)
remove_border()
del big_table_filtered

def split_selftext_DataFrame(df):
    ''' returns a list with 0 if a post has no selftext and 1 if it has one '''
    is_string_list = []
    for idx, record in df['selftext'].iteritems():
        if type(record) == float:  #missing selftext is read in as NaN, i.e. a float
            is_string_list.append(0)
        else:
            is_string_list.append(1)
    return is_string_list

big_table['islink'] = split_selftext_DataFrame(big_table)
big_table_link = big_table[big_table['islink'] == 0]
big_table_self = big_table[big_table['islink'] == 1]
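# Sketch only, assuming the same big_table loaded above: missing selftext comes back
# as NaN (a float), which pandas can test for directly with pd.isnull, so the
# type(record) == float check is not strictly needed.
def split_selftext_with_isnull(df):
    # 0 for link posts (selftext missing), 1 for self posts (selftext present)
    return [0 if pd.isnull(record) else 1 for record in df['selftext']]

# e.g. big_table['islink'] = split_selftext_with_isnull(big_table)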
def plot_link_vs_self(table_link, table_self):
    ''' plots a scatterplot of scores and comments for two different datasets '''
    p1 = plt.scatter(table_link.comments, table_link.score, color='red', alpha=0.2)
    p2 = plt.scatter(table_self.comments, table_self.score, color='blue', alpha=0.2)
    plt.legend([p1, p2], ["no self text", "self texts"])
    plt.title("Comments versus Score")
    plt.xlabel("Comments")
    plt.ylabel("Score")
    plt.ylim(-10, 5000)
    plt.xlim(-10, 30000)
    remove_border()

plot_link_vs_self(big_table_link, big_table_self)
del big_table_link
del big_table_self

logkrm = np.log(big_table['karma'])
loglinkkrm = np.log(big_table['link_karma'])
logscore = np.log(big_table['score'])

plt.scatter(logkrm, logscore, c='g')
plt.title("Karma versus Score - Both on a Logarithmic Scale")
plt.xlabel("Karma (Log)")
plt.ylabel("Score (Log)")
plt.xlim(-0.5, 16)
plt.ylim(-0.5, 10)
remove_border()
plt.show()

r_row, p_value = pearsonr(big_table['karma'], big_table['score'])
print "Pearson coefficient is " + str(r_row) + " with a p-value of " + str(p_value)

plt.scatter(loglinkkrm, logscore, c='g')
plt.title("Link Karma versus Score - Both on a Logarithmic Scale")
plt.xlabel("Link Karma (Log)")
plt.ylabel("Score (Log)")
plt.xlim(-0.5, 16)
plt.ylim(-0.5, 10)
remove_border()
plt.show()

r_row, p_value = pearsonr(big_table['link_karma'], big_table['score'])
print "Pearson r coefficient is " + str(r_row) + " with a p-value of " + str(p_value)

del logkrm, loglinkkrm, logscore

r_row, p_value = pearsonr(big_table['karma'], big_table['link_karma'])
print "Pearson r coefficient is " + str(r_row) + " with a p-value of " + str(p_value)

big_table['length'] = big_table['comments']  #done simply to initialize the "length" column
for i in big_table.index:
    big_table['length'][i] = len(str(big_table['title'][i]))

plt.scatter(big_table['length'], big_table['score'], c='g')
plt.title("Post Title Length versus Post Score")
plt.xlabel("Title Length")
plt.ylabel("Score")
plt.xlim(0, 300)
plt.ylim(0, 9000)
remove_border()
plt.show()

r_row, p_value = pearsonr(big_table['length'], big_table['score'])
print "Pearson r coefficient is " + str(r_row) + " with a p-value of " + str(p_value)
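# Sketch only: an equivalent, vectorised way to build the length column
# (assuming every entry of big_table['title'] can be passed through str());
# it produces the same values while avoiding chained assignment inside a Python loop.
big_table['length'] = big_table['title'].map(lambda t: len(str(t)))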
dates = list(big_table['time_created'])

#Function to return the time between each post and the most recent post
def convertdate(dates, which):
    dts = []
    for date in dates:
        dts.append(datetime.utcfromtimestamp(date))
    until = max(dts)
    days = []
    hrs = []
    for date in dts:
        days.append((until - date).days)
        hrs.append((until - date).total_seconds() / 3600.0)
    #print "Last post in the data set has a date/time of", until.strftime('%Y-%m-%d %H:%M:%S')
    if which == 'days':
        return days
    elif which == 'hours':
        return hrs
    else:
        print 'Enter days or hours'

big_table['daysfrom'] = convertdate(dates, 'days')
big_table['hoursfrom'] = convertdate(dates, 'hours')

# Color each scatter plot point according to subreddit type
df = big_table

#Set the colors of each category for a nicer looking graph
colors = ['c', 'g', 'y', 'b', 'r', 'm', 'k', 'w']

talldf = df[df['type'] == types[0]]
talldf['color'] = colors[0]
tallcol = list(talldf['color'])

newdf = df[df['type'] == types[1]]
newdf['color'] = colors[1]
newcol = list(newdf['color'])

hotdf = df[df['type'] == types[2]]
hotdf['color'] = colors[2]
hotcol = list(hotdf['color'])

tweekdf = df[df['type'] == types[3]]
tweekdf['color'] = colors[3]
tweekcol = list(tweekdf['color'])

tdaydf = df[df['type'] == types[4]]
tdaydf['color'] = colors[4]
tdaycol = list(tdaydf['color'])

#Plot time vs. score
tall = plt.scatter(talldf['daysfrom'], talldf['score'], c=tallcol)
new = plt.scatter(newdf['daysfrom'], newdf['score'], c=newcol)
hot = plt.scatter(hotdf['daysfrom'], hotdf['score'], c=hotcol)
tweek = plt.scatter(tweekdf['daysfrom'], tweekdf['score'], c=tweekcol)
tday = plt.scatter(tdaydf['daysfrom'], tdaydf['score'], c=tdaycol)
plt.title("Post Date (in Days posted before November 11, 2013) versus Post Score")
plt.xlabel("Number of Days posted before last post date")
plt.ylabel("Score (Upvotes-Downvotes)")
plt.xlim(0, 2100)
plt.ylim(0, 9000)
plt.legend((tall, new, hot, tweek, tday),
           ('Top all', 'New', 'Hot', 'Top Weekly', 'Top Day'),
           loc='upper right')
remove_border()
plt.show()

r_row, p_value = pearsonr(talldf['length'], talldf['score'])
print "Pearson r coefficient for top all is " + str(r_row) + " with a p-value of " + str(p_value)

'''
The following code plots four scatterplots from different combinations of this data
and various axis limits to see if any patterns can be observed
'''

#exclude data older than 100 days from the plots
tall = plt.scatter(talldf['hoursfrom'] / 24.0, talldf['score'], c=tallcol)
new = plt.scatter(newdf['hoursfrom'] / 24.0, newdf['score'], c=newcol)
hot = plt.scatter(hotdf['hoursfrom'] / 24.0, hotdf['score'], c=hotcol)
tweek = plt.scatter(tweekdf['hoursfrom'] / 24.0, tweekdf['score'], c=tweekcol)
tday = plt.scatter(tdaydf['hoursfrom'] / 24.0, tdaydf['score'], c=tdaycol)
plt.title("Post Date (in Days posted before November 11, 2013) versus Post Score")
plt.xlabel("Number of Days posted before last post date")
plt.ylabel("Score (Upvotes-Downvotes)")
plt.xlim(0, 100)
plt.ylim(0, 5500)
plt.legend((tall, new, hot, tweek, tday),
           ('Top all', 'New', 'Hot', 'Top Weekly', 'Top Day'),
           loc='upper right')
remove_border()
plt.show()

#leave out all and hot
new = plt.scatter(newdf['hoursfrom'] / 24.0, newdf['score'], c=newcol)
tweek = plt.scatter(tweekdf['hoursfrom'] / 24.0, tweekdf['score'], c=tweekcol)
tday = plt.scatter(tdaydf['hoursfrom'] / 24.0, tdaydf['score'], c=tdaycol)
plt.title("Post Date (in Days posted before November 11, 2013) versus Post Score")
plt.xlabel("Number of Days posted before last post date")
plt.ylabel("Score (Upvotes-Downvotes)")
plt.xlim(0, 100)
plt.ylim(0, 4200)
plt.legend((new, tweek, tday), ('New', 'Top Weekly', 'Top Day'), loc='upper right')
remove_border()
plt.show()

#plot it with hot
new = plt.scatter(newdf['hoursfrom'] / 24.0, newdf['score'], c=newcol)
hot = plt.scatter(hotdf['hoursfrom'] / 24.0, hotdf['score'], c=hotcol)
tweek = plt.scatter(tweekdf['hoursfrom'] / 24.0, tweekdf['score'], c=tweekcol)
tday = plt.scatter(tdaydf['hoursfrom'] / 24.0, tdaydf['score'], c=tdaycol)
plt.title("Post Date (in Days posted before November 11, 2013) versus Post Score")
plt.xlabel("Number of Days posted before last post date")
plt.ylabel("Score (Upvotes-Downvotes)")
plt.xlim(0, 100)
plt.ylim(0, 4200)
plt.legend((new, hot, tweek, tday), ('New', 'Hot', 'Top Weekly', 'Top Day'), loc='upper right')
remove_border()
plt.show()

#look only at day and week
tweek = plt.scatter(tweekdf['hoursfrom'] / 24.0, tweekdf['score'], c=tweekcol)
tday = plt.scatter(tdaydf['hoursfrom'] / 24.0, tdaydf['score'], c=tdaycol)
plt.title("Post Date (in Days posted before November 11, 2013) versus Post Score")
plt.xlabel("Number of Days posted before last post date")
plt.ylabel("Score (Upvotes-Downvotes)")
plt.xlim(0, 8)
plt.ylim(0, 4200)
plt.legend((tweek, tday), ('Top Weekly', 'Top Day'), loc='upper right')
remove_border()
plt.show()
del talldf, tallcol, newdf, newcol, hotdf, hotcol, tweekdf, tweekcol, tdaydf, tdaycol

df = pd.read_csv('Data/full.csv', encoding='utf-8')  # Top all is our training data set
print len(df)

df['up/down'] = df['upvotes'].astype(float) / df['downvotes'].astype(float)  # Reddit fuzzes this so...
topcomments = float(max(df['comments']))
topsscore = float(max(df['score']))
leastcontro = max(df['up/down'])

# The following metric is something we invented for testing purposes
df['mymetric'] = (((df['comments'].astype(float) / topcomments) * 0.10) +
                  ((df['score'].astype(float) / topsscore) * 0.85) +
                  ((df['up/down'] / leastcontro) * 0.05)) ** (0.30)
df['nrmscore'] = (df['score'].astype(float) / topsscore) ** (0.30)

bigdf = df
df = df[df['subreddit'] == 'AskReddit']
df2 = df[df['type'] == 'top_week']
print len(df2)
df = df[df['type'] == 'top_all']
print len(df)

#It's important in cross validation that the sets are disjoint, so we are removing duplicates
dfids = list(df['id'])
df2ids = list(df2['id'])
dupids = []
for redditid in dfids:
    if redditid in df2ids:
        dupids.append(redditid)

#This part is slightly overengineered, but the motivation is that we didn't want to simply
#strip the duplicate posts out of one data set at will. Instead, we split the duplicates in
#half and assign each half to one of the data sets to avoid some sort of possible bias.
if len(dupids) % 2 != 0:
    a = len(dupids) / 2
    a = a + 1
    dup1 = dupids[0:a]
    dup2 = dupids[a:]
else:
    a = len(dupids) / 2
    dup1 = dupids[0:a]
    dup2 = dupids[a:]

if np.random.randint(2) == 0:
    df = df[df['id'].apply(lambda x: x in dup1) == False]
    df2 = df2[df2['id'].apply(lambda x: x in dup2) == False]
else:
    df = df[df['id'].apply(lambda x: x in dup2) == False]
    df2 = df2[df2['id'].apply(lambda x: x in dup1) == False]

print len(df)
print len(df2)

vectorizer = CountVectorizer(min_df=0.001)
title = list(df['title']) + list(df2['title'])
vectorizer.fit(title)

def category(x, df, num=2):
    '''Assign score x to one of num equal-sized score bins of df (1 = lowest scores).'''
    size = len(df)
    blocksize = size / num
    for i in range(num):
        blockmax = max(sorted(df['score'])[blocksize * i:blocksize * (i + 1)])
        if x < blockmax:
            return i + 1
    return num

x_train = vectorizer.transform(df['title'])
x_test = vectorizer.transform(df2['title'])
score = [category(i, df2) for i in df['score']]
score2 = [category(i, df2) for i in df2['score']]
y_train = np.array(score)
y_test = np.array(score2)

vectorizer2 = CountVectorizer(min_df=0.001)
title2 = df2['title']
vectorizer2.fit(title2)
X2 = vectorizer2.transform(title2)
Y2 = np.array(df2['score'])

clf = MultinomialNB(alpha=1)
clf.fit(x_train, y_train)
print "Training accuracy is", clf.score(x_train, y_train)
print "Test accuracy is", clf.score(x_test, y_test)

dftitles = df['title']
df2titles = df2['title']
vectorizer = CountVectorizer(min_df=0.001)
title = list(dftitles) + list(df2titles)
vectorizer.fit(title)

def category(x, df, num=2):
    size = len(df)
    blocksize = size / num
    for i in range(num):
        blockmax = max(sorted(df['score'])[blocksize * i:blocksize * (i + 1)])
        if x < blockmax:
            return i + 1
    return num

#scores = [category(i) for i in df2['score']]
#print scores
#X = vectorizer.transform(title)
#Y = np.array(scores)

x_train = vectorizer.transform(dftitles)
x_test = vectorizer.transform(df2titles)
score = [1 if i > np.mean(df['score']) else 0 for i in df['score']]
score2 = [1 if i > np.mean(df2['score']) else 0 for i in df2['score']]
y_train = np.array(score)
y_test = np.array(score2)

clf = MultinomialNB(alpha=1)
clf.fit(x_train, y_train)
print "Training accuracy is", clf.score(x_train, y_train)
print "Test accuracy is", clf.score(x_test, y_test)
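# Sketch only: category() above bins scores into equal-sized blocks by sorting, and the
# cell above uses a mean threshold instead. Quantile binning can also be expressed
# directly with pandas (an assumed alternative, not part of the original pipeline);
# exact handling of tied scores may differ from category().
def quantile_labels(scores, num=2):
    # rank with method='first' so every bin gets an (almost) equal number of posts
    # even when many posts share the same score; labels run 1..num, lowest scores first
    ranks = pd.Series(scores).rank(method='first')
    return pd.qcut(ranks, num, labels=False) + 1

# e.g. y_train = np.array(quantile_labels(df['score']))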
apikey = "e945cef59338f9e8e7bc962badde170e623fb7e5"  #Please insert your own key here
p = MyAlchemy(apikey)

dftitles = list(df['title'])
df2titles = list(df2['title'])
print dftitles[5]
print p.run_method(dftitles[5], 'concepts')
print p.run_method(dftitles[5], 'keywords')
print p.run_method(dftitles[5], 'category')
#print p.run_method(dftitles[5], 'sentiment')
print p.run_method(dftitles[5], 'entities')
print len(df)

#Concepts, keywords, category, sentiment, entities - all things Alchemy can provide
categories = []
concepts, concepts2 = [], []
for i in range(30):
    conceptlist = p.run_method(dftitles[i], 'concepts')
    for c in conceptlist:
        concepts.append(c[1])
for i in range(30):
    conceptlist = p.run_method(df2titles[i], 'concepts')
    for c in conceptlist:
        concepts2.append(c[1])
print concepts
print "--------"
print concepts2

vectorizer = CountVectorizer(min_df=0.001)
vectorizer.fit(concepts)
X = vectorizer.transform(concepts)
Y = np.array(df['score'][0:55])
title2 = df2['title']
print len(Y)

x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.5)  #I added the train size parameter.
clf = MultinomialNB(alpha=1)
clf.fit(x_train, y_train)
print "Training accuracy is", clf.score(x_train, y_train)
print "Test accuracy is", clf.score(x_test, y_test)

del big_table  #don't need this any longer

df = pd.read_csv('Data/full.csv', encoding='utf-8')  #using a fresh dataset for this
print "Original size of data set is", len(df)
df = df.drop_duplicates('id')
print "Size of data set with only unique posts is", len(df)

dfmean = np.mean(df['score'])
df = df.sort('score')
df = df.reset_index(level=0, drop=True)
median = len(df) / 2
md = df['score'][median]

def make_xy(titles, scores, vectorizer=None):
    #Set default vectorizer
    if not vectorizer:
        vectorizer = CountVectorizer(min_df=0.001)
    #Build the vocabulary by fitting the vectorizer to the list of titles
    vectorizer.fit(titles)
    #Convert into a bag-of-words and use a sparse array to save memory
    x = vectorizer.transform(titles)
    x = x.tocsc()
    #save into a numpy array, and return everything
    y = np.array(scores)
    return x, y, vectorizer

X, Y, vectorizer = make_xy(list(df['title']), df['score'])
x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.5)
clf = MultinomialNB(alpha=1)
clf.fit(x_train, y_train)
print "Training accuracy is", clf.score(x_train, y_train)
print "Test accuracy is", clf.score(x_test, y_test)

sorteddf = df.sort('score')
sorteddf['category'] = df['score']
size = len(df)
num = 5
blocksize = size / num
blocks = [blocksize * i for i in range(num)]
blocks.append(size)
for i in range(num):
    sorteddf['category'][blocks[i]:blocks[i + 1]] = i + 1

Xsort, Ysort, vectorizer2 = make_xy(list(sorteddf['title']), sorteddf['category'])
x_train3, x_test3, y_train3, y_test3 = train_test_split(Xsort, Ysort, train_size=0.5)
clf3 = MultinomialNB(alpha=1)
clf3.fit(x_train3, y_train3)
train_acc = clf3.score(x_train3, y_train3)
test_acc = clf3.score(x_test3, y_test3)
print "Training accuracy is", train_acc
print "Test accuracy is", test_acc
sorteddf = df.sort('score')
sorteddf['category'] = df['score']
size = len(df)

best_test = 0
best_vect = None
best_Ysort = None
best_clf = None

for num in range(2, 11):
    blocksize = size / num
    blocks = [blocksize * i for i in range(num)]
    blocks.append(size)
    for i in range(num):
        sorteddf['category'][blocks[i]:blocks[i + 1]] = i + 1

    Xsort, Ysort, vectorizer2 = make_xy(list(sorteddf['title']), sorteddf['category'])
    x_train3, x_test3, y_train3, y_test3 = train_test_split(Xsort, Ysort, train_size=0.5)

    clf3 = MultinomialNB(alpha=1)
    clf3.fit(x_train3, y_train3)
    train_acc = clf3.score(x_train3, y_train3)
    test_acc = clf3.score(x_test3, y_test3)

    if best_test < test_acc:
        best_test = test_acc
        best_vect = copy.deepcopy(vectorizer2)
        best_Ysort = copy.deepcopy(Ysort)
        best_clf = copy.deepcopy(clf3)

    print "For", num, "bins:"
    print "Training accuracy is", train_acc
    print "Test accuracy is", test_acc
    print "---------------------------------"

n_grams = CountVectorizer(ngram_range=[1, 5], analyzer='word')
n_grams.fit(list(sorteddf['title']))
Xngram = n_grams.transform(list(sorteddf['title']))
x_train4, x_test4, y_train4, y_test4 = train_test_split(Xngram, best_Ysort, train_size=0.5)
clf4 = MultinomialNB(alpha=1)
clf4.fit(x_train4, y_train4)
print "Training accuracy is", clf4.score(x_train4, y_train4)
print "Test accuracy is", clf4.score(x_test4, y_test4)

tfidf = TfidfVectorizer(ngram_range=[1, 5], sublinear_tf=True)
tfidf.fit(list(sorteddf['title']))
Xtfidf = tfidf.transform(list(sorteddf['title']))
x_train5, x_test5, y_train5, y_test5 = train_test_split(Xtfidf, best_Ysort, train_size=0.5)
clf5 = MultinomialNB(alpha=1)
clf5.fit(x_train5, y_train5)
print "Training accuracy is", clf5.score(x_train5, y_train5)
print "Test accuracy is", clf5.score(x_test5, y_test5)

#Calculate, for every word, the probability that it signals a popular or an unpopular post
#and create a new, sorted DataFrame for them
mywords = best_vect.get_feature_names()
print len(mywords)
diag = np.eye(len(mywords))
unpop, pop = zip(*best_clf.predict_proba(diag))
data = pd.DataFrame({'words': mywords, 'p_pop': pop, 'p_unpop': unpop})
sort = data.sort('p_pop', ascending=False).copy()

print 'Top 10 "Best" Words:'
print
for i in sort[:10].index:
    print "The word", sort.words[i], "has probability", sort.p_pop[i], "of being popular"
print
print 'Top 10 "Worst" Words:'
print
for i in sort[:-11:-1].index:
    print "The word", sort.words[i], "has probability", sort.p_unpop[i], "of being unpopular"

#let's get started with a new and clean data set once again
df = pd.read_csv('Data/full.csv', encoding='utf-8')
df = df.drop_duplicates('id')
df = df.sort('score')
df = df.reset_index(level=0, drop=True)
df = df.drop_duplicates()

subreddit_ngrams = {}
for subreddit in subreddits:
    smalldf = df[df['subreddit'] == subreddit]
    sortedsmalldf = smalldf.sort('score')
    sortedsmalldf['category'] = smalldf['score']
    size = len(smalldf)
    num = 2
    blocksize = size / num
    blocks = [blocksize * i for i in range(num)]
    blocks.append(size)
    for i in range(num):
        sortedsmalldf['category'][blocks[i]:blocks[i + 1]] = i + 1

    n_grams = CountVectorizer(ngram_range=[1, 3])
    n_grams.fit(list(sortedsmalldf['title']))
    X = n_grams.transform(list(sortedsmalldf['title']))
    Y = np.array(sortedsmalldf['category'])
    x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.5)

    clf = MultinomialNB(alpha=50)
    clf.fit(x_train, y_train)
    subreddit_ngrams[subreddit] = [clf, n_grams]
    train_acc = clf.score(x_train, y_train)
    test_acc = clf.score(x_test, y_test)

    print "For", subreddit, "subreddit:"
    print "Training accuracy is", train_acc
    print "Test accuracy is", test_acc
    print "---------------------------------"

# after http://stackoverflow.com/questions/12787650/finding-the-index-of-n-biggest-elements-in-python-array-list-efficiently
# we need this to efficiently get the largest cosine scores
def f(a):
    return np.argsort(a)[0][::-1]
def make_xy(titles, scores, vectorizer=None):
    #this one uses a tf-idf vectorizer as opposed to the earlier version
    #Set default vectorizer
    if not vectorizer:
        vectorizer = TfidfVectorizer()
    #Build the vocabulary and convert into a bag-of-words, kept as a sparse array to save memory
    x = vectorizer.fit_transform(titles)
    #x = x.tocsc()
    #save into a numpy array, and return everything
    y = np.array(scores)
    return x, y, vectorizer

X, Y, vectorizer = make_xy(list(df['title']), df['score'])

#this calculates close to 2.5 billion scores and sorts each of the 50,000 similarity lists
#of 50,000 entries -> it might take a while
def make_closest():
    '''
    calculates the cosine similarity between each pair of titles
    and returns a dictionary containing them
    '''
    closest_title_scores = {}
    i = 1
    for a in X:
        vec = cosine_similarity(a, X)
        #sort the results
        sorted_vec = f(vec)
        num = 0
        already_printed = 0
        closest_list = []
        while already_printed < 10:
            #try because of the dropped IDs -> there may be nonexistent entries
            try:
                curr = df['title'][sorted_vec[num]]
                sco = df['score'][sorted_vec[num]]
                closest_list.append((curr, sco, vec[0][sorted_vec[num]]))
                already_printed += 1
            except:
                pass
            num += 1
        #drop the first entry because it's the cosine of the title with itself (score 1)
        closest_title_scores[closest_list[0][0]] = closest_list[1:]
    return closest_title_scores

closest_title_scores = make_closest()

def knearest(title, k=7):
    """
    Given a title, return a sorted list of the k most similar titles
    (with their scores and cosine similarities) from the entire data set.
    """
    return closest_title_scores[title][:k]

#generate the new columns
df['max_cosine'] = df['title'].map(lambda x: 0)
df['avg_cosine'] = df['title'].map(lambda x: 0)
df['min_cosine'] = df['title'].map(lambda x: 0)
df['closest_cosine'] = df['title'].map(lambda x: 0)

#fill in the values of the new columns
for key, value in df.iterrows():
    max_score = 0
    mean_score = 0
    min_score = 0
    closest_score = 0
    try:
        tuple_list = knearest(value['title'])
        closest_score = tuple_list[0][1]
        max_score = max(tuple_list, key=lambda item: item[1])[1]
        min_score = min(tuple_list, key=lambda item: item[1])[1]
        mean_score = np.mean([a[1] for a in tuple_list])
    except:
        pass
    df['max_cosine'][key] = max_score
    df['avg_cosine'][key] = mean_score
    df['min_cosine'][key] = min_score
    df['closest_cosine'][key] = closest_score

#calculate the pearsonr
print "max cosine pearson"
r_row, p_value = pearsonr(df['max_cosine'], df['score'])
print "Pearson coefficient is " + str(r_row) + " with a p-value of " + str(p_value)
print "avg cosine pearson"
r_row, p_value = pearsonr(df['avg_cosine'], df['score'])
print "Pearson coefficient is " + str(r_row) + " with a p-value of " + str(p_value)
print "min cosine pearson"
r_row, p_value = pearsonr(df['min_cosine'], df['score'])
print "Pearson coefficient is " + str(r_row) + " with a p-value of " + str(p_value)
print "closest cosine pearson"
r_row, p_value = pearsonr(df['closest_cosine'], df['score'])
print "Pearson coefficient is " + str(r_row) + " with a p-value of " + str(p_value)
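# Sketch only: make_closest() above fully sorts a 50,000-entry similarity row for every
# title. As an assumed alternative (not part of the original pipeline), np.argpartition
# can pull just the top k+1 indices per row without a full sort, and working through X
# in chunks keeps the dense similarity block small in memory.
def top_k_similar(X, k=10, chunk=500):
    top = {}
    for start in range(0, X.shape[0], chunk):
        sims = cosine_similarity(X[start:start + chunk], X)
        for offset, row in enumerate(sims):
            idx = np.argpartition(row, -(k + 1))[-(k + 1):]   # unordered top k+1 indices
            idx = idx[np.argsort(row[idx])[::-1]]             # order them by similarity
            # drop the row itself and keep (index, similarity) pairs for the k neighbours
            top[start + offset] = [(j, row[j]) for j in idx if j != start + offset][:k]
    return top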
spec_probs = []
for i in df.index:
    title = df.title[i]
    subreddit = df.subreddit[i]
    clf = subreddit_ngrams[subreddit][0]
    n_grams_spec = subreddit_ngrams[subreddit][1]
    prob_spec = clf.predict_proba(n_grams_spec.transform([title]))[0][1]
    spec_probs.append(prob_spec)

df['spec_probs'] = spec_probs
df.to_csv("Data/new_full.csv", index=False, encoding='utf-8')

def plot_spec_prob(table):
    ''' plots a scatterplot of scores against the subreddit-specific probability '''
    m_fit, b_fit = plt2.polyfit(table.spec_probs, table.score, 1)
    plt2.plot(table.spec_probs, table.score, 'yo',
              table.spec_probs, m_fit * table.spec_probs + b_fit,
              color='red', alpha=.9)
    #p1 = plt.scatter(table.spec_probs, table.score, color='red', alpha = 0.2)
    plt.title("Specific probability against score")
    plt.xlabel("Specific probability")
    plt.ylabel("Score")
    plt.ylim([0, 7000])
    remove_border()

plot_spec_prob(df)

m, b, r, p, std = scipy.stats.linregress(np.array(df['spec_probs']), np.array(df['score']))
print "slope", m
print "intercept", b
print "squared correlation", r**2
print "p-value", p
print "standard error", std

def predict(title):
    #uses the most recently fitted subreddit model (clf, n_grams_spec) and the
    #regression coefficients m, b from the global scope
    x = clf.predict_proba(n_grams_spec.transform([title]))[0][1]
    y = m * x + b
    return y

subreddit_svm = {}
for subreddit in subreddits:
    smalldf = df[df['subreddit'] == subreddit]
    sortedsmalldf = smalldf.sort('score')
    sortedsmalldf['category'] = smalldf['score']
    size = len(smalldf)
    num = 2
    blocksize = size / num
    blocks = [blocksize * i for i in range(num)]
    blocks.append(size)
    for i in range(num):
        sortedsmalldf['category'][blocks[i]:blocks[i + 1]] = i + 1

    n_grams = CountVectorizer(ngram_range=[1, 3])
    n_grams.fit(list(sortedsmalldf['title']))
    X = n_grams.transform(list(sortedsmalldf['title']))
    Y = np.array(sortedsmalldf['category'])
    x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.5)

    clf = RidgeClassifier(tol=1e-2, solver="lsqr")
    clf.fit(x_train, y_train)
    subreddit_svm[subreddit] = [clf, n_grams]
    train_acc = clf.score(x_train, y_train)
    test_acc = clf.score(x_test, y_test)

    print "For", subreddit, "subreddit:"
    print "Training accuracy is", train_acc
    print "Test accuracy is", test_acc
    print "---------------------------------"

subreddit_alchemy = {}
for subreddit in subreddits:
    smalldf = df[df['subreddit'] == subreddit]
    sortedsmalldf = smalldf.sort('score')
    sortedsmalldf['category'] = smalldf['score']
    size = len(smalldf)
    num = 2
    blocksize = size / num
    blocks = [blocksize * i for i in range(num)]
    blocks.append(size)
    for i in range(num):
        sortedsmalldf['category'][blocks[i]:blocks[i + 1]] = i + 1

    alch_titles = []
    for title in list(sortedsmalldf['title']):
        titles = [lst.replace('(', '') for lst in sortedsmalldf[sortedsmalldf['title'] == title]['alchemy']]
        titles = [lst.replace(')', '') for lst in titles]
        titles = [lst.replace('[', '') for lst in titles]
        titles = [lst.replace(']', '') for lst in titles]
        titles = "".join(titles)
        titles = "".join(ch for ch in titles if ch in 'qwertyuiopasdfghjklzxcvbnm ')
        titles = titles.replace('  ', ' ')
        titles = titles.split(' ')
        alch_titles.append(" ".join(titles[1:]))

    n_grams = CountVectorizer(ngram_range=[1, 3])
    n_grams.fit(list(alch_titles))
    X = n_grams.transform(alch_titles)
    Y = np.array(sortedsmalldf['category'])
    x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.5)

    clf = RidgeClassifier(tol=1e-2, solver="lsqr")
    clf.fit(x_train, y_train)
    subreddit_alchemy[subreddit] = [clf, n_grams]
    train_acc = clf.score(x_train, y_train)
    test_acc = clf.score(x_test, y_test)

    print "For", subreddit, "subreddit:"
    print "Training accuracy is", train_acc
    print "Test accuracy is", test_acc
    print "---------------------------------"
subreddit_alchemy = {}
for subreddit in subreddits:
    smalldf = df[df['subreddit'] == subreddit]
    sortedsmalldf = smalldf.sort('score')
    sortedsmalldf['category'] = smalldf['score']
    size = len(smalldf)
    num = 2
    blocksize = size / num
    blocks = [blocksize * i for i in range(num)]
    blocks.append(size)
    for i in range(num):
        sortedsmalldf['category'][blocks[i]:blocks[i + 1]] = i + 1

    alch_titles = []
    for title in list(sortedsmalldf['title']):
        titles = [lst.replace('(', '') for lst in sortedsmalldf[sortedsmalldf['title'] == title]['alchemy']]
        titles = [lst.replace(')', '') for lst in titles]
        titles = [lst.replace('[', '') for lst in titles]
        titles = [lst.replace(']', '') for lst in titles]
        titles = "".join(titles)
        titles = "".join(ch for ch in titles if ch in 'qwertyuiopasdfghjklzxcvbnm ')
        titles = titles.replace('  ', ' ')
        titles = titles.split(' ')
        alch_titles.append(" ".join(titles[1:]))

    n_grams = CountVectorizer(ngram_range=[1, 3])
    n_grams.fit(list(alch_titles))
    X = n_grams.transform(alch_titles)
    Y = np.array(sortedsmalldf['category'])
    x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.5)

    clf = Perceptron(n_iter=50)
    clf.fit(x_train, y_train)
    subreddit_alchemy[subreddit] = [clf, n_grams]
    train_acc = clf.score(x_train, y_train)
    test_acc = clf.score(x_test, y_test)

    print "For", subreddit, "subreddit:"
    print "Training accuracy is", train_acc
    print "Test accuracy is", test_acc
    print "---------------------------------"

subreddit_alchemy = {}
for subreddit in subreddits:
    smalldf = df[df['subreddit'] == subreddit]
    sortedsmalldf = smalldf.sort('score')
    sortedsmalldf['category'] = smalldf['score']
    size = len(smalldf)
    num = 2
    blocksize = size / num
    blocks = [blocksize * i for i in range(num)]
    blocks.append(size)
    for i in range(num):
        sortedsmalldf['category'][blocks[i]:blocks[i + 1]] = i + 1

    alch_titles = []
    for title in list(sortedsmalldf['title']):
        titles = [lst.replace('(', '') for lst in sortedsmalldf[sortedsmalldf['title'] == title]['alchemy']]
        titles = [lst.replace(')', '') for lst in titles]
        titles = [lst.replace('[', '') for lst in titles]
        titles = [lst.replace(']', '') for lst in titles]
        titles = "".join(titles)
        titles = "".join(ch for ch in titles if ch in 'qwertyuiopasdfghjklzxcvbnm ')
        titles = titles.replace('  ', ' ')
        titles = titles.split(' ')
        alch_titles.append(" ".join(titles[1:]))

    n_grams = CountVectorizer(ngram_range=[1, 3])
    n_grams.fit(list(alch_titles))
    X = n_grams.transform(alch_titles)
    Y = np.array(sortedsmalldf['category'])
    x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.5)

    clf = PassiveAggressiveClassifier(n_iter=50)
    clf.fit(x_train, y_train)
    subreddit_alchemy[subreddit] = [clf, n_grams]
    train_acc = clf.score(x_train, y_train)
    test_acc = clf.score(x_test, y_test)

    print "For", subreddit, "subreddit:"
    print "Training accuracy is", train_acc
    print "Test accuracy is", test_acc
    print "---------------------------------"
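# Sketch only: the alchemy-title cleanup is repeated verbatim in each classifier cell
# above and again in the combined benchmark below. A helper like this (an assumed
# refactoring, not part of the original cells) would keep the string munging in one
# place; it mirrors the inline version: strip brackets and parentheses, keep only
# lowercase letters and spaces, then drop the leading token.
def clean_alchemy_titles(subframe):
    cleaned = []
    for title in list(subframe['title']):
        raw = "".join(lst.replace('(', '').replace(')', '').replace('[', '').replace(']', '')
                      for lst in subframe[subframe['title'] == title]['alchemy'])
        raw = "".join(ch for ch in raw if ch in 'qwertyuiopasdfghjklzxcvbnm ')
        tokens = raw.replace('  ', ' ').split(' ')
        cleaned.append(" ".join(tokens[1:]))
    return cleaned

# e.g. alch_titles = clean_alchemy_titles(sortedsmalldf)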
#this needs a lot of memory! you might get a memory error running it
from sklearn.base import clone

for use_alchemy, d in enumerate(['Not alchemy', 'Alchemy']):
    for clf, name in (
            (RidgeClassifier(tol=1e-2, solver="lsqr"), "Ridge Classifier"),
            (Perceptron(n_iter=50), "Perceptron"),
            (PassiveAggressiveClassifier(n_iter=50), "Passive-Aggressive")):
        subreddit_svm = {}
        for subreddit in subreddits:
            smalldf = df[df['subreddit'] == subreddit]
            sortedsmalldf = smalldf.sort('score')
            sortedsmalldf['category'] = smalldf['score']
            size = len(smalldf)
            num = 2
            blocksize = size / num
            blocks = [blocksize * i for i in range(num)]
            blocks.append(size)
            for i in range(num):
                sortedsmalldf['category'][blocks[i]:blocks[i + 1]] = i + 1

            titles = list(sortedsmalldf['title'])
            bins = list(sortedsmalldf['category'])

            if use_alchemy == 1:
                #use the cleaned alchemy keywords instead of the raw titles;
                #every keyword becomes its own training example with its title's bin
                alch_titles = []
                for title in list(sortedsmalldf['title']):
                    words = [lst.replace('(', '') for lst in sortedsmalldf[sortedsmalldf['title'] == title]['alchemy']]
                    words = [lst.replace(')', '') for lst in words]
                    words = [lst.replace('[', '') for lst in words]
                    words = [lst.replace(']', '') for lst in words]
                    words = "".join(words)
                    words = "".join(ch for ch in words if ch in 'qwertyuiopasdfghjklzxcvbnm ')
                    words = words.replace('  ', ' ')
                    words = words.split(' ')[1:]
                    alch_titles.append(words)

                alch_bins = []
                categories = np.array(sortedsmalldf['category'])
                for t, lst in enumerate(alch_titles):
                    b = categories[t]
                    for j in range(len(lst)):
                        alch_bins.append(b)

                alch_titles = [word for words in alch_titles for word in words]
                titles = alch_titles
                bins = alch_bins

            n_grams = CountVectorizer(ngram_range=[1, 3])
            n_grams.fit(titles)
            X = n_grams.transform(titles)
            Y = np.array(bins)
            x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.5)

            #clone the template classifier so every subreddit gets its own freshly fitted model
            clf2 = clone(clf)
            clf2.fit(x_train, y_train)
            subreddit_svm[subreddit] = [clf2, n_grams]
            train_acc = clf2.score(x_train, y_train)
            test_acc = clf2.score(x_test, y_test)

            print "For", d, "and", subreddit, "subreddit and", name, "classifier:"
            print "Training accuracy is", train_acc
            print "Test accuracy is", test_acc
            print "---------------------------------"