%matplotlib inline
import time
import json
import os
import math
import urllib
import urllib2

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.util import ngrams
from nltk.collocations import *
from nltk.corpus import stopwords
from scipy.stats import pearsonr

user_agent = ("Project for Data Science class v1.0"
              " /u/Valedra"
              " https://github.com/jaysayre/intelligentdolphins")

def json_extract(baseurl, headrs=None, params=None):
    '''
    Helper function to download and parse JSON data.
    Takes optional headers and query parameters and returns a json dict.
    '''
    if params is not None:
        url = baseurl + urllib.urlencode(params)
    else:
        url = baseurl
    if headrs is not None:
        request = urllib2.Request(url, headers=headrs)
    else:
        request = urllib2.Request(url)
    return json.loads(urllib2.urlopen(request).read())

def return_grams(sentence, n=[1, 3], minlength=3):
    '''
    Tokenizes a sentence, drops English stopwords and tokens shorter than
    minlength, and returns all n-grams for n in the inclusive range n[0]..n[1].
    '''
    gramslist = []
    mysentencetokens_sw = nltk.word_tokenize(sentence)
    mysentencetokens = [token for token in mysentencetokens_sw
                        if (token not in stopwords.words('english'))
                        and len(token) >= minlength]
    for j in range(n[0], n[1]+1):
        for grams in ngrams(mysentencetokens, j):
            gramslist.append(' '.join(grams))
    return gramslist
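# A quick sanity check of return_grams on a made-up title (the expected output
# below is illustrative and assumes NLTK's punkt tokenizer and stopword corpus
# have been downloaded, so treat it as a sketch rather than a guarantee):
example_title = "what is the airspeed velocity of an unladen swallow"
print return_grams(example_title, n=[2, 2])
# Expected form: ['airspeed velocity', 'velocity unladen', 'unladen swallow']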
def search_keyword(title, maxscores, user_agent, n=[3, 3], postid='bhjfb',
                   sort_call='relevance', t='all', subreddit=None,
                   api_call_limit=100):
    '''
    Searches reddit for each n-gram in a post title and returns the scores of
    the posts found, excluding the post (postid) the title came from.
    '''
    scores = {}
    header = {'User-agent': user_agent}
    for term in return_grams(title, [n[0], n[1]]):
        post_params = {'q': term, 'sort': sort_call, 't': t,
                       'limit': api_call_limit}
        if subreddit is None:
            # If we want to search all of reddit
            reddit_base = 'http://www.reddit.com/search.json?'
        else:
            reddit_base = 'http://www.reddit.com/r/%s/search.json?' % subreddit
            post_params.update({'restrict_sr': 'on'})
        # Makes sure maxscores isn't a float!
        maxscores = int(maxscores)
        api_call_limit = int(api_call_limit)
        # Since reddit only returns <= 100 results at a time, split the
        # maxscores requested entries across multiple requests
        if maxscores % api_call_limit != 0:
            remainder = maxscores % api_call_limit
            num = (maxscores // api_call_limit) + 1
        else:
            num = maxscores // api_call_limit
            remainder = api_call_limit
        # Page through the results, api_call_limit entries per request
        for i in range(num):
            if i == 0:
                jsondata = json_extract(reddit_base, header, post_params)
                tostartfrom = jsondata['data']['after']
                for item in jsondata['data']['children']:
                    if item['data']['score'] != 0:
                        scores.update({item['data']['id']: item['data']['score']})
            elif i == num - 1:
                # 'after' tells reddit which post to continue listing from
                post_params.update({'limit': remainder, 'after': tostartfrom})
                jsondata = json_extract(reddit_base, header, post_params)
                for item in jsondata['data']['children']:
                    if item['data']['score'] != 0:
                        scores.update({item['data']['id']: item['data']['score']})
            else:
                post_params.update({'after': tostartfrom})
                jsondata = json_extract(reddit_base, header, post_params)
                tostartfrom = jsondata['data']['after']
                for item in jsondata['data']['children']:
                    if item['data']['score'] != 0:
                        scores.update({item['data']['id']: item['data']['score']})
    # Don't let the post itself count toward its own title score
    try:
        scores.pop(postid)
    except KeyError:
        pass
    return scores.values()

df = pd.read_csv("Data/full.csv", encoding="utf-8")
df2 = df[df['titlescore'] == '[null]']
print len(df2)
dfids = list(df2.index)
for i in range(len(df2)):
    print df['id'][dfids[i]], i
    b = search_keyword(df['title'][dfids[i]], 100, user_agent, n=[2, 2],
                       postid=df['id'][dfids[i]],
                       subreddit=df['subreddit'][dfids[i]])
    if len(b) > 0:
        df['titlescore'][dfids[i]] = np.mean(b)
        df['titlestd'][dfids[i]] = np.std(b)
        print df['titlescore'][dfids[i]]
#df.to_csv("Data/titlescorebigram.csv", index=False, encoding='utf-8')

fulldf = pd.read_csv('Data/full.csv', encoding='utf-8')
fulldf['titlescore'] = ['null']*len(fulldf)
fulldf['titlestd'] = ['null']*len(fulldf)
fulldf.columns
# Copy the computed title scores over to the full dataset, matched on post id
for i in fulldf.index:
    fid = fulldf['id'][i]
    alc = df[df['id'] == fid]['titlescore']
    fulldf['titlescore'][i] = alc
    pop = df[df['id'] == fid]['titlestd']
    fulldf['titlestd'][i] = pop

def calc_cdf(mean, std):
    '''
    Assuming scores are normally distributed with the given mean and standard
    deviation, returns the probability that a score exceeds 600.
    '''
    cdf_dem = (1.0 + math.erf((600 - mean)/math.sqrt(2*(std**2))))/2.0
    return 1 - cdf_dem

df = fulldf
# Clean the data
df['titlescore'] = df['titlescore'].apply(lambda x: x[1:-1])
df['titlestd'] = df['titlestd'].apply(lambda x: x[1:-1])
df = df[df['titlescore'] != 'null']
df = df[df['titlescore'] != '#N/A']
df['titlescore'] = df['titlescore'].apply(lambda x: float(x))
df['titlestd'] = df['titlestd'].apply(lambda x: float(x))
# Get the probability of a score above 600 for each post
df['probprob'] = ['null'] * len(df)
for i in df.index:
    df['probprob'][i] = calc_cdf(df['titlescore'][i], df['titlestd'][i])
df['probprob'] = df['probprob'].astype(float)
pearsonr(df['probprob'], df['score'])

subs = list(df['subreddit'].unique())
types = list(df['type'].unique())

'''
df['titlescore'] = df['titlescore'].apply(lambda x: x[1:-1])
null_df = df[df['titlescore'] == 'null']
na_df = df[df['titlescore'] == '#N/A']
dfids = list(null_df.index)
for i in range(len(dfids)):
    df['titlescore'][dfids[i]] = '0'
dfids = list(na_df.index)
for i in range(len(dfids)):
    df['titlescore'][dfids[i]] = '0'
df['titlescore'] = df['titlescore'].apply(lambda x: float(x))
'''
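# The per-(subreddit, type) normalization done with explicit loops below can
# equivalently be written with a pandas groupby; a sketch under the same
# column assumptions ('normalized_titlescore_alt' is a hypothetical column
# used only for this illustration, not part of the original analysis):
group_mean_score = df.groupby(['subreddit', 'type'])['score'].transform(np.mean)
df['normalized_titlescore_alt'] = df['titlescore'] / group_mean_score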
# Normalize each title score by the mean score of its (subreddit, type) group
df['normalized_titlescore'] = ['null']*len(df)
for sub in subs:
    curr_df = df[df['subreddit'] == sub]
    for ctype in types:
        current_df = curr_df[curr_df['type'] == ctype]
        curr_mean = current_df['score'].mean()
        dfids = list(current_df.index)
        for i in range(len(dfids)):
            df['normalized_titlescore'][dfids[i]] = df['titlescore'][dfids[i]]/curr_mean

df2 = pd.read_csv('Data/full2.csv', encoding='utf-8')
df2 = df2[df2['normalized_titlescore'] != 0]
pearsonr(df2['normalized_titlescore'], df2['score'])

df2['titlestd'] = df2['titlestd'].apply(lambda x: x[1:-1])
df2 = df2[df2['titlestd'] != 'null']
df2 = df2[df2['titlestd'] != '#N/A']
df2['titlestd'] = df2['titlestd'].apply(lambda x: float(x))

plt.scatter(df2['titlestd'], df2['score'], c='g')
plt.title("Std versus Score")
plt.xlabel("Std")
plt.ylabel("Score")
plt.xlim(0, 2000)
plt.ylim(0, 8000)
plt.show()

r_row, p_value = pearsonr(df2['titlestd'], df2['score'])
print "Pearson coefficient is " + str(r_row) + " with a p-value of " + str(p_value)
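# pearsonr returns a (coefficient, p-value) pair. A tiny demo on made-up data
# (not the project's data) to illustrate the interpretation: a perfectly
# linear relationship gives r = 1.0 with p = 0.0.
r_demo, p_demo = pearsonr([1, 2, 3, 4], [2, 4, 6, 8])
print "Demo: Pearson r = " + str(r_demo) + ", p-value = " + str(p_demo)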