%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
# Some nice default configuration for plots
plt.rcParams['figure.figsize'] = 10, 7.5
plt.rcParams['axes.grid'] = True
plt.gray()
<matplotlib.figure.Figure at 0x22c3ad0>
The problem with weighting the interactions in the curated edgelist we are using to build our protein-protein interaction network is that the classifier is not perfect and is supplying extremely low probabilities for interactions which we expect to exist. We can encode this prior into our model by considering inclusion in a database, inclusion in the edgelist and the result of classification to all be events that are dependent on an underlying interaction event. If we assume the events are also conditionally independent then we have a Naive Bayes model.
Specifically, if we have $I$ as the interaction, $c$ as the classifier result and $e$ as the edgelist inclusion result, the joint can be expressed:
$$ p(I,e,c) = p(I) p(e|I) p(c|I) $$Where the individual distributions are:
$$ p(I=1) = Bernoulli(I=1;\theta_{I}) = \theta_{I} $$$$ p(e|I=1) = Bernoulli(e;\theta_{e}) = \theta_{e}^{e}(1-\theta_{e})^{1-e} $$$$ p(c|I=1) = Beta(c;\alpha,\beta) = \frac{1}{B(\alpha,\beta)} c^{\alpha-1}(1-c)^{\beta-1} $$There are parameters we need to find to be able to use this model: $\alpha, \beta, \theta_{I}, \pmb{\theta_{e}}$. We can easily get two of these:
First we have to load the data:
import scipy.stats
cd ../../forGAVIN/mergecode/OUT/
/data/opencast/MRes/forGAVIN/mergecode/OUT
# Load the weighted edgelist (columns: protein A, protein B, classifier
# weight) as strings, then drop the header row.
interactions = np.loadtxt("edgelist_weighted.txt", dtype=str)
interactions = interactions[1:]
weights = interactions[:, 2]
# np.float was removed from NumPy (1.24+); the builtin float yields the
# same float64 dtype.
weights = weights.astype(float)
Now, finding the indexes of positive and negative examples:
import pickle

# Unpickle the classifier's per-class score samples; samples[cls][:, 1]
# is indexed below, so each entry is presumably an (N, 2) score array —
# confirm against the pickling code. Binary mode is required for pickle
# on Python 3, and the context manager guarantees the handle is closed.
with open("../../../features/random.forest.bayes.dist.samples.pickle", "rb") as f:
    samples = pickle.load(f)
# Class-conditional histograms of the classifier scores (column 1).
h = plt.hist(samples[1][:, 1], bins=50)
h = plt.hist(samples[0][:, 1], bins=100)
# Regularisation samples from the uniform Beta(1, 1) keep the fit away
# from degenerate parameter values.
posregsamp = scipy.stats.beta.rvs(1, 1, size=10)
negregsamp = scipy.stats.beta.rvs(1, 1, size=10)
# np.random.randint has an exclusive upper bound, replacing the removed
# random_integers (whose inclusive bound needed the explicit -1).
idx1 = np.random.randint(0, len(samples[1][:, 1]), size=1000)
idx0 = np.random.randint(0, len(samples[0][:, 1]), size=1000)
alpha_1, beta_1, _, _ = scipy.stats.beta.fit(np.hstack([samples[1][:, 1][idx1], posregsamp]))
alpha_0, beta_0, _, _ = scipy.stats.beta.fit(np.hstack([samples[0][:, 1][idx0], negregsamp]))
/usr/local/lib/python2.7/dist-packages/scipy/optimize/minpack.py:237: RuntimeWarning: The iteration is not making good progress, as measured by the improvement from the last ten iterations. warnings.warn(msg, RuntimeWarning)
# Evaluate the fitted class-conditional Beta densities on a grid over [0, 1]
x = np.linspace(0, 1, 500)
b1 = scipy.stats.beta.pdf(x, alpha_1, beta_1)
b0 = scipy.stats.beta.pdf(x, alpha_0, beta_0)
Have to use logarithmic scaling in the following plot to yield a readable graph:
# NOTE(review): the text above mentions logarithmic scaling, but no
# yscale('log') call appears here — confirm against the rendered notebook.
plt.plot(x, b1, label="interaction")
plt.plot(x, b0, label="non-interaction")
plt.legend(loc=0)
<matplotlib.legend.Legend at 0x5e4c490>
import os

# Persist the density curves so the plots can be regenerated elsewhere
plotpath = os.path.abspath("../../../plots/bayes/")
np.savez(os.path.join(plotpath, "rf.beta.npz"), x, b1, b0)
STRING is pickled, so we can load it and retrieve the internal database to obtain the probabilities.
import pickle
import sys  # sys was used below but never imported anywhere in this file

sys.path.append("../../../opencast-bio/")
import ocbio.extract

# Load the pickled STRING feature database; binary mode is required for
# pickle on Python 3 and the context manager closes the handle.
with open("../../../string/human.Entrez.string.pickle", "rb") as f:
    stringdb = pickle.load(f)
Similarly, we must do this for iRefIndex:
# Load the pickled one-of-k iRefIndex database the same way as STRING above
with open("../../../iRefIndex/human.iRefIndex.Entrez.1ofk.pickle", "rb") as f:
    irefdb = pickle.load(f)
As was done above with our classifier's results, we must build a beta distribution for the results from STRING. This will involve using iRefIndex as class labels, as before.
# Group STRING scores (last feature, rescaled from 0-1000 to [0, 1]) by
# the iRefIndex label of the same pair; max() collapses the one-of-k
# label vector to a single label value.
stringlabelled = {}
for k in stringdb.featuredict:
    label = max(irefdb[k])
    score = float(stringdb[k][-1]) / 1000
    # setdefault replaces the original try/except KeyError pattern, which
    # would also have caught a missing key in irefdb[k] and then crashed
    # inside the handler when irefdb[k] raised a second time.
    stringlabelled.setdefault(label, []).append(score)
# Class-conditional histograms of the STRING scores
h = plt.hist(stringlabelled['1'], bins=50, label='interaction')
h = plt.hist(stringlabelled['0'], bins=50, label='non-interaction')
l = plt.legend()
# Uniform Beta(1, 1) regularisation samples, as for the classifier fit above
posregsamp = scipy.stats.beta.rvs(1, 1, size=10)
negregsamp = scipy.stats.beta.rvs(1, 1, size=10)
stringlabelled['1'] = np.array(stringlabelled['1'])
stringlabelled['0'] = np.array(stringlabelled['0'])
N1 = len(stringlabelled['1'])
N2 = len(stringlabelled['0'])
# randint's upper bound is exclusive; the original random_integers(0, N1)
# was inclusive of N1 and could index one past the end of the array.
s1idx = np.random.randint(0, N1, size=1000)
s2idx = np.random.randint(0, N2, size=1000)
stralpha_1, strbeta_1, _, _ = scipy.stats.beta.fit(np.hstack([stringlabelled['1'][s1idx], posregsamp]))
stralpha_0, strbeta_0, _, _ = scipy.stats.beta.fit(np.hstack([stringlabelled['0'][s2idx], negregsamp]))
# Fitted STRING class-conditional densities on the same grid x, plotted
# and persisted alongside the classifier's curves.
b1 = scipy.stats.beta.pdf(x, stralpha_1, strbeta_1)
b0 = scipy.stats.beta.pdf(x, stralpha_0, strbeta_0)
plt.plot(x, b1, label="interaction")
plt.plot(x, b0, label="non-interaction")
l = plt.legend(loc=0)
np.savez(os.path.join(plotpath, "string.beta.npz"), x, b1, b0)
Using these distributions we can find the posterior probabilities for each of the interactions in this edgelist:
$$ p(I=1|e,c;\theta_{e},\alpha,\beta) = \frac{p(I=1,e,c;\theta_{e},\alpha,\beta)}{p(e,c;\theta_{e},\alpha,\beta)} = \frac{p(I=1)p(e|I=1;\theta_{e})p(c|I=1;\alpha,\beta)}{p(e,c;\theta_{e},\alpha,\beta)} $$Where we find the denominator by marginalising the joint:
$$ p(e,c;\theta_{e},\alpha,\beta) = \sum_{I} p(I,e,c;\theta_{e},\alpha,\beta) $$Which requires using the distributions of:
$$ p(I=0) = Bernoulli(I=0;\theta_{I}) = 1-\theta_{I} $$$$ p(e|I=0) = Bernoulli(e;\theta_{e0}) = \theta_{e0}^{e}(1-\theta_{e0})^{1-e} $$$$ p(c|I=0) = Beta(c;\alpha_{0},\beta_{0}) = \frac{1}{B(\alpha_{0},\beta_{0})} c^{\alpha_{0}-1}(1-c)^{\beta_{0}-1} $$In this case, as $e$ is always one (all examples are elements of the edgelist) $\theta_{e0}$ is simply $1 - \theta_{e}$. Therefore, we can solve for the posterior for arbitrary points:
def posterior(prior, logprobdict):
    """Return posterior probabilities p(I=1 | evidence) for each example.

    Parameters
    ----------
    prior : float
        Log prior probability of an interaction, log p(I=1).
    logprobdict : dict
        Keys 0 and 1 mapping to equal-length sequences of class-conditional
        log probabilities, one entry per example (each entry may be a
        scalar or a sequence of per-source terms).
    """
    post = []
    # log p(I=0) = log(1 - exp(log p(I=1)))
    logcomplement = np.log1p(-np.exp(prior))
    for l1, l0 in zip(logprobdict[1], logprobdict[0]):
        lognum = np.sum(l1) + prior
        # Marginal p(e,c) = p(I=1)p(e,c|I=1) + p(I=0)p(e,c|I=0); the
        # original omitted the p(I=0) prior on the second term. np.sum
        # (unlike builtin sum) also accepts the scalar entries stored by
        # the scoring loop.
        logdenom = np.logaddexp(lognum, np.sum(l0) + logcomplement)
        post.append(np.exp(lognum - logdenom))
    return post
# Fitted Beta parameters keyed by class label (0 = non-interaction,
# 1 = interaction) so the scoring loop below can index them with cls.
alpha = {0:alpha_0,1:alpha_1}
beta = {0:beta_0,1:beta_1}
stralpha = {0:stralpha_0,1:stralpha_1}
strbeta = {0:strbeta_0,1:strbeta_1}
In the below cell we will be explicitly defining various probabilities for the different data sources. The values used will be estimates, aiming to be conservative. For each database two different probabilities must be defined:
This only equates to two probabilities as these probabilities are related within each class:
$$ FNR = 1 - TPR, \qquad FPR = 1 - TNR $$We can define these probabilities for iRefIndex and edgelist inclusion before we begin.
# Hand-set conservative likelihoods for the binary evidence sources.
# Within each class the two outcome probabilities must be complementary:
# fnr = 1 - tpr (given I=1) and fpr = 1 - tnr (given I=0). The original
# values (fnr=0.5 with tpr=0.9, fpr=0.1 with tnr=0.5) did not sum to one.
irefprob = {'tpr': 0.9, 'fnr': 0.1, 'tnr': 0.5, 'fpr': 0.5}
edgelistprob = {'tpr': 0.9, 'fnr': 0.1, 'tnr': 0.5, 'fpr': 0.5}
# Build per-class total log-likelihoods for every pair in the edgelist,
# combining the classifier score, STRING score, iRefIndex membership and
# edgelist membership under the Naive Bayes factorisation.
logprobdict = {0: [], 1: []}
for cls in [0, 1]:
    for row in interactions:
        pair = frozenset(row[:2])
        # squash scores away from the {0, 1} endpoints, where the Beta
        # log-pdf can diverge
        weight = float(row[2]) * 0.9 + 0.01
        pclass = scipy.stats.beta.logpdf(weight, alpha[cls], beta[cls])
        wstring = (float(stringdb[pair][-1]) / 1000) * 0.9 + 0.09999
        pstring = scipy.stats.beta.logpdf(wstring, stralpha[cls], strbeta[cls])
        # collapse the one-of-k iRefIndex vector to a single 0/1 response
        irefresponse = max(map(float, irefdb[pair]))
        if cls == 1:
            piref = np.log(irefresponse * irefprob['tpr'] + (1.0 - irefresponse) * irefprob['fnr'])
            pedgelist = np.log(edgelistprob['tpr'])
        else:
            piref = np.log(irefresponse * irefprob['fpr'] + (1.0 - irefresponse) * irefprob['tnr'])
            pedgelist = np.log(edgelistprob['fpr'])
        logprobdict[cls].append(pclass + pstring + pedgelist + piref)
# posterior() documents its first argument as a log prior; the original
# passed the raw probability 1/600 (the assumed p(I=1) for a random
# pair), which skews every posterior.
postweightings = posterior(np.log(1.0 / 600), logprobdict)
h = plt.hist(postweightings, bins=100)
Quite a discrete set of weights, probably due to all these binary features we're using.
!git annex unlock ../../../plots/bayes/postweightings.npz
unlock ../../../plots/bayes/postweightings.npz (copying...) ok
savez(os.path.join(plotpath,"postweightings.npz"),postweightings)
We will now write these weights to a file so that the community detection code can be run.
import csv
!git annex unlock edgelist_update_weighted.txt
unlock edgelist_update_weighted.txt (copying...) ok
# Write protein pair + posterior weight as tab-separated rows; the
# context manager guarantees the file is flushed and closed even if a
# row write fails.
with open("edgelist_update_weighted.txt", "w") as outfile:
    writer = csv.writer(outfile, delimiter="\t")
    for pair, weight in zip(interactions[:, :2], postweightings):
        writer.writerow(list(pair) + [weight])