import pickle
# Load the pickled data set. The code below reads it as a dict that maps an
# integer key to an inner dict whose values are scores.
# The context manager closes the file even when unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code while loading; only
# open files from a trusted source.
with open('****', 'rb') as f:
    data = pickle.load(f)
# Collect every individual score into three lists: scores filed under an
# odd key, scores filed under an even key, and the complete population.
odds_s = []
evens_s = []
all_s = []
for length, scores_by_item in data.items():
    # All scores under one key land in the same parity bucket.
    bucket = evens_s if length % 2 == 0 else odds_s
    for score in scores_by_item.values():
        bucket.append(score)
        all_s.append(score)

# Number of scores seen in each parity bucket.
odds_cnt = len(odds_s)
evens_cnt = len(evens_s)
import numpy as np
from collections import defaultdict
# Mean score per key of `data`, rounded to two decimals.
avgs = defaultdict(int)
for length, scores_by_item in data.items():
    length_scores = np.array(list(scores_by_item.values()))
    avgs[length] = round(length_scores.mean(), 2)

# The same per-key means, split by the parity of the key.
avgs_odd = np.array([avg for length, avg in avgs.items() if length % 2 == 1])
avgs_even = np.array([avg for length, avg in avgs.items() if length % 2 == 0])
# Scatter plot: one point per key, x = number of entries stored under that
# key, y = its average score. Even keys are red, odd keys are blue.
rcParams['figure.figsize'] = (8, 5)
for length, avg in avgs.items():
    n_published = len(data[length])
    marker = 'ro' if length % 2 == 0 else 'bo'
    pylab.plot(n_published, avg, marker, alpha=0.8)
    # Label only the points whose average exceeds 20.
    if avg > 20:
        pylab.annotate(
            '{}'.format(length),
            xy=(n_published, avg),
            xytext=(-5, 5),
            ha='left',
            textcoords='offset points',
        )
pylab.xlabel('number of published listicles for a given digit')
pylab.ylabel('avg score')
pylab.title('listicles / score vs. size')
<matplotlib.text.Text at 0x4481dd0>
# Scatter plot: x = key (listicle length), y = average score.
# Even keys are red, odd keys are blue.
rcParams['figure.figsize'] = (16, 6)
for length, avg in avgs.items():
    # Only keys below 200 with a non-zero average are drawn.
    if length >= 200 or avg == 0:
        continue
    marker = 'ro' if length % 2 == 0 else 'bo'
    pylab.plot(length, avg, marker, alpha=0.8)

# Second pass: label every key whose average exceeds 20.
for length, avg in avgs.items():
    if avg > 20:
        pylab.annotate(
            '{}'.format(length),
            xy=(length, avg),
            xytext=(-5, 5),
            ha='left',
            textcoords='offset points',
        )
pylab.xlabel('listicle length (digit)')
pylab.ylabel('avg score')
pylab.title('listicles / by score')
<matplotlib.text.Text at 0x48f4150>
# Descriptive statistics for the odd and even score groups.
# Both lists are converted to arrays so .std() / .mean() are available.
odds_s = np.array(odds_s)
evens_s = np.array(evens_s)

members_report = 'members:\nodds: %s\nevens: %s\n' % (odds_cnt, evens_cnt)
print(members_report)

std_report = 'standard deviation:\nodds: %s\nevens: %s\n' % (odds_s.std(), evens_s.std())
print(std_report)

mean_report = 'mean:\nodds: %s\nevens: %s\n' % (odds_s.mean(), evens_s.mean())
print(mean_report)
members: odds: 5037 evens: 4886 standard deviation: odds: 87.6169551402 evens: 39.0589360888 mean: odds: 20.9217788366 evens: 14.1966844044
from scipy import stats
# Independent two-sample t-test over every individual score (odd vs. even).
# NOTE(review): ttest_ind assumes equal population variances by default;
# confirm that holds here, or consider Welch's variant (equal_var=False).
all_scores_test = stats.ttest_ind(odds_s, evens_s)
print(all_scores_test)
(4.9120672755595303, 9.1560218380972755e-07)
# Independent two-sample t-test over the per-length average scores
# (one value per listicle length, odd vs. even).
averages_test = stats.ttest_ind(avgs_odd, avgs_even)
print(averages_test)
(2.4086189971429741, 0.017825485089317184)
Here we try to identify the optimal listicle length across the board. There are some clear candidates from our plot above: 27, 29 and 35. These numbers had both a relatively high avg score and a high number of published posts. Numbers such as 71, 81 and 91 displayed high avg scores with a very small number of published posts. The high scores in those bins are heavily skewed by a single post that performed extraordinarily well, which is a typical issue with outliers. Let's compare the performance of 27, 29 and 35 with the top performing links (=> scores above 20, approximately 20% of our dataset).
# Scores stored under the keys 27, 29 and 35, each compared against the
# full score population with an independent two-sample t-test.
links_27 = list(data[27].values())
links_29 = list(data[29].values())
links_35 = list(data[35].values())
for candidate_scores in (links_27, links_29, links_35):
    print(stats.ttest_ind(candidate_scores, all_s))
(4.6711298195107691, 3.0338447258973221e-06) (8.0769462172810371, 7.4038388325404207e-16) (3.5244321399517857, 0.0004262834158488855)
# Mean over every score, then split each parity group into the scores
# strictly above and strictly below it (a score exactly equal to the mean
# falls into neither list).
population_mean = np.array(all_s).mean()

odds_score_above_mean = [score for score in odds_s if score > population_mean]
odds_score_below_mean = [score for score in odds_s if score < population_mean]
evens_score_above_mean = [score for score in evens_s if score > population_mean]
evens_score_below_mean = [score for score in evens_s if score < population_mean]

# Print the four group sizes, one per line.
for group in (
    odds_score_above_mean,
    odds_score_below_mean,
    evens_score_above_mean,
    evens_score_below_mean,
):
    print(len(group))
1210 3827 890 3996
# Fisher's exact test on the 2x2 table of (odd, even) x (above, below the
# population mean). The table is built from the computed groups instead of
# hand-copied counts, so it cannot go stale when the data changes.
# NOTE(review): the p-value recorded for this cell is negative, which is not
# a valid probability; presumably a numerical issue in the SciPy version
# used at the time. Re-run with a current SciPy to confirm.
contingency_table = [
    [len(odds_score_above_mean), len(odds_score_below_mean)],
    [len(evens_score_above_mean), len(evens_score_below_mean)],
]
oddsratio, pvalue = stats.fisher_exact(contingency_table)
print(oddsratio)
print(pvalue)
1.41958820093 -1.45436468458e-11
def primes(n):
    """Return a list of all primes strictly less than n, in ascending order.

    Uses a sieve of Eratosthenes that only ever inspects odd indices; 2 is
    added explicitly. Returns an empty list when n <= 2, since no prime is
    smaller than 2.
    """
    if n <= 2:
        return []
    sieve = [True] * n
    for i in range(3, int(n ** 0.5) + 1, 2):
        if sieve[i]:
            # Cross off the odd multiples of i starting at i*i. The
            # right-hand side must have exactly as many items as the
            # extended slice, hence the explicit floor division.
            sieve[i * i::2 * i] = [False] * ((n - i * i - 1) // (2 * i) + 1)
    return [2] + [i for i in range(3, n, 2) if sieve[i]]
# Primes below 200 that also occur as a key of `data`.
prime_nums = list(set(primes(200)) & set(data.keys()))
# Per-length average scores restricted to those prime lengths.
avgs_prime = [avg for length, avg in avgs.items() if length in prime_nums]

# Pairwise t-tests on the per-length averages:
# odd vs. even, prime vs. odd, prime vs. even.
odd_vs_even = stats.ttest_ind(avgs_odd, avgs_even)
prime_vs_odd = stats.ttest_ind(avgs_prime, avgs_odd)  # not statistically significant
prime_vs_even = stats.ttest_ind(avgs_prime, avgs_even)
print(odd_vs_even)
print(prime_vs_odd)
print(prime_vs_even)
(2.4086189971429741, 0.017825485089317184) (-0.029120310398447546, 0.97685393494459993) (2.0119545375529109, 0.04772277726894511)
# Every individual score stored under a prime key, tested against the
# individual scores of the even keys.
primes_s = [score for length in prime_nums for score in data[length].values()]
print(stats.ttest_ind(primes_s, evens_s))
(3.8825970347615471, 0.00010421007133924338)
The following plot highlights listicle length performance as a grid. The lower left corner represents the average performance of articles of length = 0 (there was one). The rectangle's color represents average performance (blue = poor, red = high). The bottom row from left to right represents listicle lengths 0, 1, 2, 3, ..., 9. The row above is 10, 11, 12, ..., 19, and so on.
# Plot the average score per listicle length as a 20 x 10 grid:
# row = length // 10, column = length % 10, colour = log(avg score + 1).
rcParams['figure.figsize'] = 10,5
# Start from zeros rather than random values so that cells for lengths with
# no data are deterministic; they render like an average score of 0.
# The grid has room for lengths 0..199 only; a larger key raises IndexError.
data_plot = np.zeros((20, 10))
for d in sorted(data.keys()):
    avg_score = np.mean(list(data[d].values()))
    # Explicit floor division keeps the row index an integer.
    row, col = divmod(d, 10)
    # log(avg + 1) compresses the colour range so outliers do not dominate.
    data_plot[row][col] = np.log(avg_score + 1)
pylab.pcolor(data_plot)
pylab.colorbar()
pylab.show()
%pylab inline
Populating the interactive namespace from numpy and matplotlib
WARNING: pylab import has clobbered these variables: ['f'] `%matplotlib` prevents importing * from pylab and numpy