%matplotlib inline
import time
import json
import os
import math
import urllib
import urllib2

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.util import ngrams
from nltk.collocations import *
from nltk.corpus import stopwords
from scipy.stats import pearsonr

user_agent = ("Project for Data Science class v1.0"
              " /u/Valedra"
              " https://github.com/jaysayre/intelligentdolphins")

def json_extract(baseurl, headrs=None, params=None):
    '''
    Helper function to download and parse JSON data.
    Takes optional headers and query parameters and returns a json dict.
    '''
    if params is not None:
        url = baseurl + urllib.urlencode(params)
    else:
        url = baseurl
    if headrs is not None:
        request = urllib2.Request(url, headers=headrs)
    else:
        request = urllib2.Request(url)
    return json.loads(urllib2.urlopen(request).read())

def return_grams(sentence, n=[1, 3], minlength=3):
    '''
    Tokenizes a sentence, drops English stopwords and tokens shorter than
    minlength, and returns all n-grams for n in the inclusive range n[0]..n[1].
    '''
    gramslist = []
    mysentencetokens_sw = nltk.word_tokenize(sentence)
    mysentencetokens = [token for token in mysentencetokens_sw
                        if (token not in stopwords.words('english'))
                        and len(token) >= minlength]
    for j in range(n[0], n[1]+1):
        for grams in ngrams(mysentencetokens, j):
            gramslist.append(' '.join(grams))
    return gramslist
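# A quick sanity check of return_grams on a made-up title (the expected output
# below is illustrative and assumes NLTK's punkt tokenizer and stopword corpus
# have been downloaded, so treat it as a sketch rather than a guarantee):
example_title = "what is the airspeed velocity of an unladen swallow"
print return_grams(example_title, n=[2, 2])
# Expected form: ['airspeed velocity', 'velocity unladen', 'unladen swallow']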
def search_keyword(title, maxscores, user_agent, n=[3, 3], postid='bhjfb',
                   sort_call='relevance', t='all', subreddit=None,
                   api_call_limit=100):
    '''
    Searches reddit for each n-gram in a post title and returns the scores of
    the posts found, excluding the post (postid) the title came from.
    '''
    scores = {}
    header = {'User-agent': user_agent}
    for term in return_grams(title, [n[0], n[1]]):
        post_params = {'q': term, 'sort': sort_call, 't': t,
                       'limit': api_call_limit}
        if subreddit is None:
            # If we want to search all of reddit
            reddit_base = 'http://www.reddit.com/search.json?'
        else:
            reddit_base = 'http://www.reddit.com/r/%s/search.json?' % subreddit
            post_params.update({'restrict_sr': 'on'})
        # Makes sure maxscores isn't a float!
        maxscores = int(maxscores)
        api_call_limit = int(api_call_limit)
        # Since reddit only returns <= 100 results at a time, split the
        # maxscores requested entries across multiple requests
        if maxscores % api_call_limit != 0:
            remainder = maxscores % api_call_limit
            num = (maxscores // api_call_limit) + 1
        else:
            num = maxscores // api_call_limit
            remainder = api_call_limit
        # Page through the results, api_call_limit entries per request
        for i in range(num):
            if i == 0:
                jsondata = json_extract(reddit_base, header, post_params)
                tostartfrom = jsondata['data']['after']
                for item in jsondata['data']['children']:
                    if item['data']['score'] != 0:
                        scores.update({item['data']['id']: item['data']['score']})
            elif i == num - 1:
                # 'after' tells reddit which post to continue listing from
                post_params.update({'limit': remainder, 'after': tostartfrom})
                jsondata = json_extract(reddit_base, header, post_params)
                for item in jsondata['data']['children']:
                    if item['data']['score'] != 0:
                        scores.update({item['data']['id']: item['data']['score']})
            else:
                post_params.update({'after': tostartfrom})
                jsondata = json_extract(reddit_base, header, post_params)
                tostartfrom = jsondata['data']['after']
                for item in jsondata['data']['children']:
                    if item['data']['score'] != 0:
                        scores.update({item['data']['id']: item['data']['score']})
    # Don't let the post itself count toward its own title score
    try:
        scores.pop(postid)
    except KeyError:
        pass
    return scores.values()

df = pd.read_csv("Data/full.csv", encoding="utf-8")
df2 = df[df['titlescore'] == '[null]']
print len(df2)
dfids = list(df2.index)
for i in range(len(df2)):
    print df['id'][dfids[i]], i
    b = search_keyword(df['title'][dfids[i]], 100, user_agent, n=[2, 2],
                       postid=df['id'][dfids[i]],
                       subreddit=df['subreddit'][dfids[i]])
    if len(b) > 0:
        df['titlescore'][dfids[i]] = np.mean(b)
        df['titlestd'][dfids[i]] = np.std(b)
        print df['titlescore'][dfids[i]]
#df.to_csv("Data/titlescorebigram.csv", index=False, encoding='utf-8')

fulldf = pd.read_csv('Data/full.csv', encoding='utf-8')
fulldf['titlescore'] = ['null']*len(fulldf)
fulldf['titlestd'] = ['null']*len(fulldf)
fulldf.columns
# Copy the computed title scores over to the full dataset, matched on post id
for i in fulldf.index:
    fid = fulldf['id'][i]
    alc = df[df['id'] == fid]['titlescore']
    fulldf['titlescore'][i] = alc
    pop = df[df['id'] == fid]['titlestd']
    fulldf['titlestd'][i] = pop

def calc_cdf(mean, std):
    '''
    Assuming scores are normally distributed with the given mean and standard
    deviation, returns the probability that a score exceeds 600.
    '''
    cdf_dem = (1.0 + math.erf((600 - mean)/math.sqrt(2*(std**2))))/2.0
    return 1 - cdf_dem

df = fulldf
# Clean the data
df['titlescore'] = df['titlescore'].apply(lambda x: x[1:-1])
df['titlestd'] = df['titlestd'].apply(lambda x: x[1:-1])
df = df[df['titlescore'] != 'null']
df = df[df['titlescore'] != '#N/A']
df['titlescore'] = df['titlescore'].apply(lambda x: float(x))
df['titlestd'] = df['titlestd'].apply(lambda x: float(x))
# Get the probability of a score above 600 for each post
df['probprob'] = ['null'] * len(df)
for i in df.index:
    df['probprob'][i] = calc_cdf(df['titlescore'][i], df['titlestd'][i])
df['probprob'] = df['probprob'].astype(float)
pearsonr(df['probprob'], df['score'])

subs = list(df['subreddit'].unique())
types = list(df['type'].unique())

'''
df['titlescore'] = df['titlescore'].apply(lambda x: x[1:-1])
null_df = df[df['titlescore'] == 'null']
na_df = df[df['titlescore'] == '#N/A']
dfids = list(null_df.index)
for i in range(len(dfids)):
    df['titlescore'][dfids[i]] = '0'
dfids = list(na_df.index)
for i in range(len(dfids)):
    df['titlescore'][dfids[i]] = '0'
df['titlescore'] = df['titlescore'].apply(lambda x: float(x))
'''
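# The per-(subreddit, type) normalization done with explicit loops below can
# equivalently be written with a pandas groupby; a sketch under the same
# column assumptions ('normalized_titlescore_alt' is a hypothetical column
# used only for this illustration, not part of the original analysis):
group_mean_score = df.groupby(['subreddit', 'type'])['score'].transform(np.mean)
df['normalized_titlescore_alt'] = df['titlescore'] / group_mean_score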
# Normalize each title score by the mean score of its (subreddit, type) group
df['normalized_titlescore'] = ['null']*len(df)
for sub in subs:
    curr_df = df[df['subreddit'] == sub]
    for ctype in types:
        current_df = curr_df[curr_df['type'] == ctype]
        curr_mean = current_df['score'].mean()
        dfids = list(current_df.index)
        for i in range(len(dfids)):
            df['normalized_titlescore'][dfids[i]] = df['titlescore'][dfids[i]]/curr_mean

df2 = pd.read_csv('Data/full2.csv', encoding='utf-8')
df2 = df2[df2['normalized_titlescore'] != 0]
pearsonr(df2['normalized_titlescore'], df2['score'])

df2['titlestd'] = df2['titlestd'].apply(lambda x: x[1:-1])
df2 = df2[df2['titlestd'] != 'null']
df2 = df2[df2['titlestd'] != '#N/A']
df2['titlestd'] = df2['titlestd'].apply(lambda x: float(x))

plt.scatter(df2['titlestd'], df2['score'], c='g')
plt.title("Std versus Score")
plt.xlabel("Std")
plt.ylabel("Score")
plt.xlim(0, 2000)
plt.ylim(0, 8000)
plt.show()

r_row, p_value = pearsonr(df2['titlestd'], df2['score'])
print "Pearson coefficient is " + str(r_row) + " with a p-value of " + str(p_value)
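# pearsonr returns a (coefficient, p-value) pair. A tiny demo on made-up data
# (not the project's data) to illustrate the interpretation: a perfectly
# linear relationship gives r = 1.0 with p = 0.0.
r_demo, p_demo = pearsonr([1, 2, 3, 4], [2, 4, 6, 8])
print "Demo: Pearson r = " + str(r_demo) + ", p-value = " + str(p_demo)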