%matplotlib inline
import time
import pandas as pd
import numpy as np
import json
import os
import urllib
import urllib2
import matplotlib.pyplot as plt
import nltk
from nltk.util import ngrams
from nltk.collocations import *
from nltk.corpus import stopwords
import math
from scipy.stats import pearsonr
user_agent = ("Project for Data Science class v1.0" " /u/Valedra" " https://github.com/jaysayre/intelligentdolphins")
def json_extract(baseurl, headrs=None, params=None):
    '''
    Helper function to download and parse JSON data. Takes an optional header dict
    and optional query parameters, and returns the parsed JSON as a dict.
    '''
    if params is not None:
        form = urllib.urlencode(params)
        url = baseurl + form
    else:
        url = baseurl
    if headrs is not None:
        request = urllib2.Request(url, headers=headrs)
    else:
        request = urllib2.Request(url)
    return json.loads(urllib2.urlopen(request).read())
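For illustration, a single request against reddit's search endpoint could look like this; the query values below are made up, and the parameter names are the ones used further down in this file:
# Illustrative only: hypothetical query values, live reddit search endpoint
example_headers = {'User-agent': user_agent}
example_params = {'q': 'data science', 'sort': 'relevance', 't': 'all', 'limit': 25}
example_json = json_extract('http://www.reddit.com/search.json?', example_headers, example_params)
print example_json['data']['children'][0]['data']['score']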
def return_grams(sentence, n=[1, 3], minlength=3):
    '''
    Tokenizes a sentence, drops English stopwords and tokens shorter than minlength,
    and returns all n-grams (joined as strings) for n in the inclusive range n[0]..n[1].
    '''
    gramslist = []
    mysentencetokens_sw = nltk.word_tokenize(sentence)
    mysentencetokens = [token for token in mysentencetokens_sw
                        if token not in stopwords.words('english') and len(token) >= minlength]
    for j in range(n[0], n[1]+1):
        somegrams = ngrams(mysentencetokens, j)
        for grams in somegrams:
            gramslist.append(' '.join(grams))
    return gramslist
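A quick check of what return_grams produces for a made-up title (the exact output depends on NLTK's stopword list):
sample_title = "My cat learned to open the refrigerator door"
print return_grams(sample_title, n=[1, 2])
# ['cat', 'learned', 'open', 'refrigerator', 'door', 'cat learned', 'learned open',
#  'open refrigerator', 'refrigerator door']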
def search_keyword(title, maxscores, user_agent, n=[3,3], postid='bhjfb', sort_call='relevance', t='all', subreddit=None, api_call_limit=100):
    '''
    Searches reddit for every n-gram in the given title and collects the scores of up to
    maxscores matching posts, excluding the post with id postid itself.
    '''
    scores = {}
    header = {'User-agent': user_agent}
    for term in return_grams(title, [n[0], n[1]]):
        post_params = {'q': term, 'sort': sort_call, 't': t, 'limit': api_call_limit}
        if subreddit == None:
            reddit_base = 'http://www.reddit.com/search.json?' # Search all of reddit
        else:
            reddit_base = 'http://www.reddit.com/r/%s/search.json?' % subreddit
            post_params.update({'restrict_sr': 'on'})
        # Make sure maxscores and the call limit are ints
        maxscores = int(maxscores)
        api_call_limit = int(api_call_limit)
        # Since reddit returns at most 100 results per call, split the maxscores requested
        # into several requests
        if maxscores % api_call_limit != 0:
            remainder = maxscores % api_call_limit
            num = (maxscores / api_call_limit) + 1
        else:
            num = maxscores / api_call_limit
            remainder = api_call_limit
        # Make an API call for all maxscores entries, api_call_limit at a time
        for i in range(num):
            if i == 0:
                jsondata = json_extract(reddit_base, header, post_params)
                tostartfrom = jsondata['data']['after']
                for item in jsondata['data']['children']:
                    if item['data']['score'] != 0:
                        scores.update({item['data']['id']: item['data']['score']})
            elif i == num - 1:
                post_params.update({'limit': remainder, 'after': tostartfrom}) # 'after' indicates the post to continue from
                jsondata = json_extract(reddit_base, header, post_params)
                for item in jsondata['data']['children']:
                    if item['data']['score'] != 0:
                        scores.update({item['data']['id']: item['data']['score']})
            else:
                post_params.update({'after': tostartfrom})
                jsondata = json_extract(reddit_base, header, post_params)
                tostartfrom = jsondata['data']['after']
                for item in jsondata['data']['children']:
                    if item['data']['score'] != 0:
                        scores.update({item['data']['id']: item['data']['score']})
    # Exclude the post we are scoring from its own search results
    scores.pop(postid, None)
    return scores.values()
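A single call then looks like this; the title, post id and subreddit below are made up, and the call hits the live reddit API:
example_scores = search_keyword("machine learning tutorial for beginners", 100, user_agent,
                                n=[3, 3], postid='abc12', subreddit='MachineLearning')
if len(example_scores) > 0:
    print np.mean(example_scores), np.std(example_scores)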
In this file we generate trigrams from post titles.
Based on those we searched reddit and stored the scores of the search results.
If the trigram search returned nothing, we repeated the search with bigrams.
We were hoping that the scores of the search results would tell us something about the score of the post we were looking at.
Since this file runs against old data it should not be touched anymore; a full run takes upwards of 30 hours.
df = pd.read_csv("Data/full.csv", encoding="utf-8")
df2 = df[df['titlescore'] == '[null]']
print len(df2)
dfids = list(df2.index)
1938
for i in range(len(df2)):
print df['id'][dfids[i]], i
b = search_keyword(df['title'][dfids[i]], 100, user_agent, n=[2,2], postid=df['id'][dfids[i]], subreddit=df['subreddit'][dfids[i]])
if len(b) > 0:
df['titlescore'][dfids[i]] = np.mean(b)
df['titlestd'][dfids[i]] = np.std(b)
print df['titlescore'][dfids[i]]
#df.to_csv("Data/titlescorebigram.csv", index=False, encoding='utf-8')
"\nfor i in range(len(df2)):\n print df['id'][dfids[i]], i\n b = search_keyword(df['title'][dfids[i]], 100, user_agent, n=[2,2], postid=df['id'][dfids[i]], subreddit=df['subreddit'][dfids[i]])\n if len(b) > 0:\n df['titlescore'][dfids[i]] = np.mean(b)\n df['titlestd'][dfids[i]] = np.std(b)\n print df['titlescore'][dfids[i]]\n "
fulldf = pd.read_csv('Data/full.csv', encoding='utf-8')
fulldf['titlescore'] = ['null']*len(fulldf)
fulldf['titlestd'] = ['null']*len(fulldf)
fulldf.columns
Index([author, comments, downvotes, id, score, selftext, subreddit, time_created, title, type, upvotes, karma, link_karma, alchemy, titlescore, titlestd], dtype=object)
for i in fulldf.index:
fid = fulldf['id'][i]
alc = df[df['id'] == fid]['titlescore']
fulldf['titlescore'][i] = alc
pop = df[df['id'] == fid]['titlestd']
fulldf['titlestd'][i] = pop
On the title scores we then tried to use the CDF of a normal distribution to predict the actual post score: the title score mean and standard deviation parameterize the distribution, and we compute the probability that a draw from it exceeds 600.
def calc_cdf(mean, std):
    # Probability that a normally distributed score with the given mean and std exceeds 600
    cdf_dem = (1.0 + math.erf((600 - mean) / math.sqrt(2 * (std ** 2)))) / 2.0
    return 1 - cdf_dem
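As a quick sanity check of calc_cdf (the 600 cutoff is the one hard-coded above; the mean and standard deviation here are made up):
# P(score > 600) for a hypothetical title score distribution with mean 400 and std 200
print calc_cdf(400, 200)   # ~0.159, i.e. about a 16% chance of exceeding 600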
df = fulldf
#clean the data
df['titlescore'] = df['titlescore'].apply(lambda x: x[1:-1])
df['titlestd'] = df['titlestd'].apply(lambda x: x[1:-1])
df = df[df['titlescore'] != 'null']
df = df[df['titlescore'] != '#N/A']
df['titlescore'] = df['titlescore'].apply(lambda x:float(x))
df['titlestd'] = df['titlestd'].apply(lambda x:float(x))
#get the cdf
df['probprob'] = ['null'] * len(df)
for i in df.index:
df['probprob'][i] = calc_cdf(df['titlescore'][i], df['titlestd'][i])
df['probprob'] = df['probprob'].astype(float)
pearsonr(df['probprob'], df['score'])
Unfortunately this yielded a Pearson correlation of only about 0.10, which is too weak to use in our model. We still wanted to improve those numbers and thought normalizing the data might do the trick: we divide the title score by the average score of the type and subreddit of the post we are looking at.
subs = list(df['subreddit'].unique())
types = list(df['type'].unique())
'''
df['titlescore'] = df['titlescore'].apply(lambda x: x[1:-1])
null_df = df[df['titlescore'] == 'null']
na_df = df[df['titlescore'] == '#N/A']
dfids = list(null_df.index)
for i in range(len(dfids)):
df['titlescore'][dfids[i]] = '0'
dfids = list(na_df.index)
for i in range(len(dfids)):
df['titlescore'][dfids[i]] = '0'
df['titlescore'] = df['titlescore'].apply(lambda x:float(x))
'''
df['normalized_titlescore'] = ['null']*len(df)
for sub in subs:
curr_df = df[df['subreddit'] == sub]
for ctype in types:
current_df = curr_df[curr_df['type'] == ctype]
curr_mean = current_df['score'].mean()
dfids = list(current_df.index)
for i in range(len(dfids)):
df['normalized_titlescore'][dfids[i]] = df['titlescore'][dfids[i]]/curr_mean
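For reference, the same normalization can be written more compactly with a groupby; this is just a sketch of the equivalent computation, assuming the same column names:
# Divide each titlescore by the mean score of its (subreddit, type) group
group_means = df.groupby(['subreddit', 'type'])['score'].transform(np.mean)
df['normalized_titlescore'] = df['titlescore'] / group_means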
df2 = pd.read_csv('Data/full2.csv', encoding='utf-8')
df2 = df2[df2['normalized_titlescore'] != 0]
pearsonr(df2['normalized_titlescore'], df2['score'])
(-0.13039151874449112, 7.9110685237525519e-160)
df2['titlestd'] = df2['titlestd'].apply(lambda x: x[1:-1])
df2 = df2[df2['titlestd'] != 'null']
df2 = df2[df2['titlestd'] != '#N/A']
df2['titlestd'] = df2['titlestd'].apply(lambda x: float(x))
plt.scatter(df2['titlestd'], df2['score'], c='g')
plt.title("Std versus Score")
plt.xlabel("Std")
plt.ylabel("Score")
plt.xlim(0, 2000)
plt.ylim(0, 8000)
plt.show()
r_row, p_value = pearsonr(df2['titlestd'], df2['score'])
print "Pearson coefficient is" + str(r_row) + " with a p-value of " + str(p_value)
Pearson coefficient is0.153435569189 with a p-value of 3.04008690448e-221
As you can see, neither the normalized title score nor the standard deviation of the search-result scores correlates strongly enough with the post score to be useful in our model: the correlations are statistically significant but weak (about -0.13 and 0.15). We ended the experiment at this point, as we were making no progress, and tried different things.