%matplotlib inline
import time
import pandas as pd
import numpy as np
import json
import os
import urllib
import urllib2
import matplotlib.pyplot as plt
import nltk
from nltk.util import ngrams
from nltk.collocations import *
from nltk.corpus import stopwords
import math
from scipy.stats import pearsonr
user_agent = ("Project for Data Science class v1.0" " /u/Valedra" " https://github.com/jaysayre/intelligentdolphins")
def json_extract(baseurl, headrs=None, params=None):
    '''
    Helper function to download and parse JSON data. Takes an optional header dict
    and optional query parameters, and returns the parsed JSON as a dict.
    '''
    if params is not None:
        form = urllib.urlencode(params)
        url = baseurl + form
    else:
        url = baseurl
    if headrs is not None:
        request = urllib2.Request(url, headers=headrs)
    else:
        request = urllib2.Request(url)
    return json.loads(urllib2.urlopen(request).read())
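For illustration, a single request against reddit's search endpoint could look like this; the query values below are made up, and the parameter names are the ones used further down in this file:
# Illustrative only: hypothetical query values, live reddit search endpoint
example_headers = {'User-agent': user_agent}
example_params = {'q': 'data science', 'sort': 'relevance', 't': 'all', 'limit': 25}
example_json = json_extract('http://www.reddit.com/search.json?', example_headers, example_params)
print example_json['data']['children'][0]['data']['score']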
def return_grams(sentence, n=[1, 3], minlength=3):
    '''
    Tokenizes a sentence, drops English stopwords and tokens shorter than minlength,
    and returns all n-grams (joined as strings) for n in the inclusive range n[0]..n[1].
    '''
    gramslist = []
    mysentencetokens_sw = nltk.word_tokenize(sentence)
    mysentencetokens = [token for token in mysentencetokens_sw
                        if token not in stopwords.words('english') and len(token) >= minlength]
    for j in range(n[0], n[1]+1):
        somegrams = ngrams(mysentencetokens, j)
        for grams in somegrams:
            gramslist.append(' '.join(grams))
    return gramslist
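A quick check of what return_grams produces for a made-up title (the exact output depends on NLTK's stopword list):
sample_title = "My cat learned to open the refrigerator door"
print return_grams(sample_title, n=[1, 2])
# ['cat', 'learned', 'open', 'refrigerator', 'door', 'cat learned', 'learned open',
#  'open refrigerator', 'refrigerator door']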
def search_keyword(title, maxscores, user_agent, n=[3,3], postid='bhjfb', sort_call='relevance', t='all', subreddit=None, api_call_limit=100):
    '''
    Searches reddit for every n-gram in the given title and collects the scores of up to
    maxscores matching posts, excluding the post with id postid itself.
    '''
    scores = {}
    header = {'User-agent': user_agent}
    for term in return_grams(title, [n[0], n[1]]):
        post_params = {'q': term, 'sort': sort_call, 't': t, 'limit': api_call_limit}
        if subreddit == None:
            reddit_base = 'http://www.reddit.com/search.json?' # Search all of reddit
        else:
            reddit_base = 'http://www.reddit.com/r/%s/search.json?' % subreddit
            post_params.update({'restrict_sr': 'on'})
        # Make sure maxscores and the call limit are ints
        maxscores = int(maxscores)
        api_call_limit = int(api_call_limit)
        # Since reddit returns at most 100 results per call, split the maxscores requested
        # into several requests
        if maxscores % api_call_limit != 0:
            remainder = maxscores % api_call_limit
            num = (maxscores / api_call_limit) + 1
        else:
            num = maxscores / api_call_limit
            remainder = api_call_limit
        # Make an API call for all maxscores entries, api_call_limit at a time
        for i in range(num):
            if i == 0:
                jsondata = json_extract(reddit_base, header, post_params)
                tostartfrom = jsondata['data']['after']
                for item in jsondata['data']['children']:
                    if item['data']['score'] != 0:
                        scores.update({item['data']['id']: item['data']['score']})
            elif i == num - 1:
                post_params.update({'limit': remainder, 'after': tostartfrom}) # 'after' indicates the post to continue from
                jsondata = json_extract(reddit_base, header, post_params)
                for item in jsondata['data']['children']:
                    if item['data']['score'] != 0:
                        scores.update({item['data']['id']: item['data']['score']})
            else:
                post_params.update({'after': tostartfrom})
                jsondata = json_extract(reddit_base, header, post_params)
                tostartfrom = jsondata['data']['after']
                for item in jsondata['data']['children']:
                    if item['data']['score'] != 0:
                        scores.update({item['data']['id']: item['data']['score']})
    # Exclude the post we are scoring from its own search results
    scores.pop(postid, None)
    return scores.values()
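A single call then looks like this; the title, post id and subreddit below are made up, and the call hits the live reddit API:
example_scores = search_keyword("machine learning tutorial for beginners", 100, user_agent,
                                n=[3, 3], postid='abc12', subreddit='MachineLearning')
if len(example_scores) > 0:
    print np.mean(example_scores), np.std(example_scores)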
In this file we generate trigrams from post titles.
Based on those we searched reddit and stored the scores of the search results.
If the trigram search returned nothing, we repeated the search with bigrams.
We were hoping that the scores of the search results would tell us something about the score of the post we were looking at.
Since this file runs against old data it should not be touched anymore; a full run takes upwards of 30 hours.
df = pd.read_csv("Data/full.csv", encoding="utf-8")
df2 = df[df['titlescore'] == '[null]']
print len(df2)
dfids = list(df2.index)
1938
for i in range(len(df2)):
print df['id'][dfids[i]], i
b = search_keyword(df['title'][dfids[i]], 100, user_agent, n=[2,2], postid=df['id'][dfids[i]], subreddit=df['subreddit'][dfids[i]])
if len(b) > 0:
df['titlescore'][dfids[i]] = np.mean(b)
df['titlestd'][dfids[i]] = np.std(b)
print df['titlescore'][dfids[i]]
#df.to_csv("Data/titlescorebigram.csv", index=False, encoding='utf-8')
"\nfor i in range(len(df2)):\n print df['id'][dfids[i]], i\n b = search_keyword(df['title'][dfids[i]], 100, user_agent, n=[2,2], postid=df['id'][dfids[i]], subreddit=df['subreddit'][dfids[i]])\n if len(b) > 0:\n df['titlescore'][dfids[i]] = np.mean(b)\n df['titlestd'][dfids[i]] = np.std(b)\n print df['titlescore'][dfids[i]]\n "
fulldf = pd.read_csv('Data/full.csv', encoding='utf-8')
fulldf['titlescore'] = ['null']*len(fulldf)
fulldf['titlestd'] = ['null']*len(fulldf)
fulldf.columns
Index([author, comments, downvotes, id, score, selftext, subreddit, time_created, title, type, upvotes, karma, link_karma, alchemy, titlescore, titlestd], dtype=object)
for i in fulldf.index:
fid = fulldf['id'][i]
alc = df[df['id'] == fid]['titlescore']
fulldf['titlescore'][i] = alc
pop = df[df['id'] == fid]['titlestd']
fulldf['titlestd'][i] = pop
On the title scores we then tried to use the CDF of a normal distribution to predict the actual post score: the title score mean and standard deviation parameterize the distribution, and we compute the probability that a draw from it exceeds 600.
def calc_cdf(mean, std):
    # Probability that a normally distributed score with the given mean and std exceeds 600
    cdf_dem = (1.0 + math.erf((600 - mean) / math.sqrt(2 * (std ** 2)))) / 2.0
    return 1 - cdf_dem
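As a quick sanity check of calc_cdf (the 600 cutoff is the one hard-coded above; the mean and standard deviation here are made up):
# P(score > 600) for a hypothetical title score distribution with mean 400 and std 200
print calc_cdf(400, 200)   # ~0.159, i.e. about a 16% chance of exceeding 600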
df = fulldf
#clean the data
df['titlescore'] = df['titlescore'].apply(lambda x: x[1:-1])
df['titlestd'] = df['titlestd'].apply(lambda x: x[1:-1])
df = df[df['titlescore'] != 'null']
df = df[df['titlescore'] != '#N/A']
df['titlescore'] = df['titlescore'].apply(lambda x:float(x))
df['titlestd'] = df['titlestd'].apply(lambda x:float(x))
#get the cdf
df['probprob'] = ['null'] * len(df)
for i in df.index:
df['probprob'][i] = calc_cdf(df['titlescore'][i], df['titlestd'][i])
df['probprob'] = df['probprob'].astype(float)
pearsonr(df['probprob'], df['score'])
Unfortunately this yielded a Pearson correlation of only about 0.10, which is too weak to use in our model. We still wanted to improve those numbers and thought normalizing the data might do the trick: we divide the title score by the average score of the type and subreddit of the post we are looking at.
subs = list(df['subreddit'].unique())
types = list(df['type'].unique())
'''
df['titlescore'] = df['titlescore'].apply(lambda x: x[1:-1])
null_df = df[df['titlescore'] == 'null']
na_df = df[df['titlescore'] == '#N/A']
dfids = list(null_df.index)
for i in range(len(dfids)):
df['titlescore'][dfids[i]] = '0'
dfids = list(na_df.index)
for i in range(len(dfids)):
df['titlescore'][dfids[i]] = '0'
df['titlescore'] = df['titlescore'].apply(lambda x:float(x))
'''
df['normalized_titlescore'] = ['null']*len(df)
for sub in subs:
curr_df = df[df['subreddit'] == sub]
for ctype in types:
current_df = curr_df[curr_df['type'] == ctype]
curr_mean = current_df['score'].mean()
dfids = list(current_df.index)
for i in range(len(dfids)):
df['normalized_titlescore'][dfids[i]] = df['titlescore'][dfids[i]]/curr_mean
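For reference, the same normalization can be written more compactly with a groupby; this is just a sketch of the equivalent computation, assuming the same column names:
# Divide each titlescore by the mean score of its (subreddit, type) group
group_means = df.groupby(['subreddit', 'type'])['score'].transform(np.mean)
df['normalized_titlescore'] = df['titlescore'] / group_means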
df2 = pd.read_csv('Data/full2.csv', encoding='utf-8')
df2 = df2[df2['normalized_titlescore'] != 0]
pearsonr(df2['normalized_titlescore'], df2['score'])
(-0.13039151874449112, 7.9110685237525519e-160)
df2['titlestd'] = df2['titlestd'].apply(lambda x: x[1:-1])
df2 = df2[df2['titlestd'] != 'null']
df2 = df2[df2['titlestd'] != '#N/A']
df2['titlestd'] = df2['titlestd'].apply(lambda x: float(x))
plt.scatter(df2['titlestd'], df2['score'], c='g')
plt.title("Std versus Score")
plt.xlabel("Std")
plt.ylabel("Score")
plt.xlim(0, 2000)
plt.ylim(0, 8000)
plt.show()
r_row, p_value = pearsonr(df2['titlestd'], df2['score'])
print "Pearson coefficient is" + str(r_row) + " with a p-value of " + str(p_value)
Pearson coefficient is0.153435569189 with a p-value of 3.04008690448e-221
As you can see, neither the normalized title score nor the standard deviation of the search-result scores correlates strongly enough with the post score to be useful in our model: the correlations are statistically significant but weak (about -0.13 and 0.15). We ended the experiment at this point, as we were making no progress, and tried different things.