# Demo of thinkbayes.Beta: update with (heads, tails) = (3, 3) observed
# outcomes; with equal counts the maximum-likelihood estimate is 0.5.
beta = thinkbayes.Beta()
beta.Update((3, 3))
# Python 2 print statement (these listings predate Python 3).
print beta.MaximumLikelihood()
class Dirichlet(object):
    """Represents a Dirichlet distribution over n prevalences.

    Only the constructor and MarginalBeta appear in this listing; other
    methods of the class appear in later listings.
    """

    def __init__(self, n):
        """n: number of dimensions (candidate species)."""
        self.n = n
        # One pseudo-count per dimension.  Uses the builtin `int` rather
        # than `numpy.int`, a deprecated alias removed in NumPy 1.24.
        self.params = numpy.ones(n, dtype=int)

    def MarginalBeta(self, i):
        """Computes the marginal distribution of the i-th prevalence.

        The marginal of one component of a Dirichlet(alpha_1..alpha_n)
        is Beta(alpha_i, alpha0 - alpha_i) where alpha0 = sum(alpha).

        Returns: a Beta object.
        """
        alpha0 = self.params.sum()
        alpha = self.params[i]
        return Beta(alpha, alpha0 - alpha)
# With a uniform Dirichlet prior over 3 species, every marginal Beta has
# mean alpha_i / alpha0 = 1/3.
dirichlet = thinkbayes.Dirichlet(3)
for i in range(3):
    beta = dirichlet.MarginalBeta(i)
    print beta.Mean()
def Update(self, data):
    """Updates the Dirichlet with observed counts (conjugate update).

    data: sequence of observed counts, one per seen species; counts are
    added to the first len(data) pseudo-count parameters.
    """
    m = len(data)
    self.params[:m] += data
# Update the Dirichlet with observed counts for three species, then print
# each species' posterior mean prevalence.
data = [3, 2, 1]
dirichlet.Update(data)
for i in range(3):
    beta = dirichlet.MarginalBeta(i)
    pmf = beta.MakePmf()
    print i, pmf.Mean()
class Species(thinkbayes.Suite):
    """Suite of hypotheses about the number of species.

    Each hypothesis is a Dirichlet distribution with a different number
    of dimensions n.
    """

    def __init__(self, ns):
        """ns: sequence of candidate values for the number of species."""
        hypos = list(map(thinkbayes.Dirichlet, ns))
        thinkbayes.Suite.__init__(self, hypos)
# Consider hypotheses of 3 through 29 species (range upper bound is
# exclusive).
ns = range(3, 30)
suite = Species(ns)
#class Species

def Update(self, data):
    """Updates the suite, then each hypothesis.

    Performs the standard Bayesian update of the hypothesis weights,
    then folds the observed counts into every Dirichlet hypothesis.
    """
    thinkbayes.Suite.Update(self, data)
    for dirichlet in self.Values():
        dirichlet.Update(data)
# class Species

def Likelihood(self, data, hypo):
    """Monte Carlo estimate of the likelihood of data under a hypothesis.

    data: observed species counts
    hypo: a Dirichlet object

    Sums the likelihood over 1000 random prevalence draws; the constant
    factor of 1000 cancels when the suite normalizes.
    """
    dirichlet = hypo
    return sum(dirichlet.Likelihood(data) for _ in range(1000))
# class Dirichlet

def Likelihood(self, data):
    """Likelihood of observed counts under one random prevalence draw.

    data: sequence of observed counts

    Returns 0 if there are more observed species than dimensions;
    otherwise the product over seen species of prevalence**count for a
    single random sample of prevalences.
    """
    m = len(data)
    if self.n < m:
        return 0

    prevalences = self.Random()
    # elementwise power, then product over the m seen species
    return (prevalences[:m] ** data).prod()
# class Dirichlet

def Random(self):
    """Draws a random vector of prevalences from this Dirichlet.

    A Dirichlet sample is a vector of independent Gamma variates
    normalized to sum to 1.
    """
    gammas = numpy.random.gamma(self.params)
    return gammas / gammas.sum()
def DistOfN(self):
    """Returns the posterior Pmf of n, the number of species.

    Maps each hypothesis's dimension count to its posterior probability.
    """
    dist = thinkbayes.Pmf()
    for hypo, prob in self.Items():
        dist.Set(hypo.n, prob)
    return dist
class Species2(object):
    """Hypotheses about the number of species, with shared parameters.

    Instead of one Dirichlet object per hypothesis, keeps a single
    parameter array long enough for the largest n, plus one probability
    per candidate n.
    """

    def __init__(self, ns):
        """ns: sequence of candidate values for n (number of species)."""
        self.ns = ns
        # Bug fix: `self.high` was read below but never assigned in the
        # original listing.  SampleLikelihood indexes cumsum(params)[n-1]
        # for each n in ns, so the parameter array must have max(ns)
        # entries.
        self.high = max(ns)
        self.probs = numpy.ones(len(ns), dtype=numpy.double)
        # builtin `int` instead of the removed numpy.int alias
        self.params = numpy.ones(self.high, dtype=int)
# class Species2

def Update(self, data):
    """Updates the hypotheses with observed species counts.

    Accumulates Monte Carlo likelihood estimates over 1000 samples,
    reweights and renormalizes the hypothesis probabilities, then folds
    the observed counts into the shared Dirichlet parameters.

    data: sequence of observed counts, one per seen species
    """
    likes = numpy.zeros(len(self.ns), dtype=numpy.double)
    for _ in range(1000):
        likes += self.SampleLikelihood(data)

    self.probs *= likes
    self.probs /= self.probs.sum()

    m = len(data)
    self.params[:m] += data
# class Species2

def SampleLikelihood(self, data):
    """Computes the likelihood of the data for all n, for one sample.

    Draws one set of Gamma variates shared across hypotheses.  For each
    candidate n, the prevalences of the m seen species are the first m
    gammas divided by the sum of the first n, so larger n dilutes the
    seen species.

    data: sequence of observed counts

    Returns: numpy array of likelihoods, one per value in self.ns.
    """
    gammas = numpy.random.gamma(self.params)

    m = len(data)
    seen = gammas[:m]
    totals = numpy.cumsum(gammas)

    log_likes = []
    for n in self.ns:
        ps = seen / totals[n - 1]
        log_likes.append((data * numpy.log(ps)).sum())

    # Shift so the max log-likelihood is 0 before exponentiating, to
    # avoid underflow; the numpy scalar promotes the list to an array.
    log_likes -= numpy.max(log_likes)
    likes = numpy.exp(log_likes)

    # Correct for the number of ways the m seen species could be chosen
    # out of n.
    coefs = [thinkbayes.BinomialCoef(n, m) for n in self.ns]
    likes *= coefs
    return likes
class Species4(Species):
    """A variant of Species that processes one species per update."""

    def Update(self, data):
        """Updates the suite one observed species at a time.

        Each single-species update passes a vector with the count for
        the newly seen species in the last position and zeros elsewhere.
        """
        for i, count in enumerate(data):
            one = numpy.zeros(i + 1)
            one[i] = count
            Species.Update(self, one)
# class Species4

def Likelihood(self, data, hypo):
    """Monte Carlo likelihood of the data under a Dirichlet hypothesis.

    data: observed species counts
    hypo: a Dirichlet object

    Unlike Species.Likelihood, multiplies by the number of species the
    newly observed one could have been, to account for the
    one-species-at-a-time update scheme.
    """
    dirichlet = hypo
    total = 0
    for _ in range(self.iterations):
        total += dirichlet.Likelihood(data)

    # correct for the number of unseen species the new one
    # could have been
    num_unseen = dirichlet.n - len(data) + 1
    return total * num_unseen
class Species5(Species2):
    """A variant of Species2 that processes one species per update."""

    def Update(self, data):
        """Updates the suite one observed species at a time.

        For the i-th species (1-based), reweights the hypotheses given
        its count, then adds the count to the shared parameters.
        """
        for i, count in enumerate(data):
            self.UpdateOne(i + 1, count)
            self.params[i] += count
# class Species5

def UpdateOne(self, i, count):
    """Updates the hypotheses given the count for the i-th species.

    i: 1-based index of the observed species
    count: observed number of reads for that species

    Bug fix: the original reused `i` as the Monte Carlo loop variable,
    clobbering the species index -- after the first iteration the
    likelihood was sampled for the wrong species, and the unseen-species
    correction used the final loop counter instead of i.
    """
    likes = numpy.zeros(len(self.ns), dtype=numpy.double)
    for _ in range(self.iterations):
        likes += self.SampleLikelihood(i, count)

    # correct for the number of unseen species the new one could have been
    unseen_species = [n - i + 1 for n in self.ns]
    likes *= unseen_species

    self.probs *= likes
    self.probs /= self.probs.sum()
# class Species5

def SampleLikelihood(self, i, count):
    """Likelihood of count reads of species i for all n, for one sample.

    i: 1-based index of the observed species
    count: observed number of reads

    NOTE(review): the slice assumes self.ns is a contiguous ascending
    range starting at self.ns[0] -- confirm against the caller.

    Returns: numpy array of likelihoods (shifted so the largest is 1;
    the constant factor cancels when the probabilities are normalized).
    """
    gammas = numpy.random.gamma(self.params)
    # cumulative sums give the normalizing total for each candidate n
    sums = numpy.cumsum(gammas)[self.ns[0] - 1:]
    ps = gammas[i - 1] / sums
    log_likes = count * numpy.log(ps)
    # shift to avoid underflow before exponentiating
    log_likes -= numpy.max(log_likes)
    return numpy.exp(log_likes)
92, 53, 47, 38, 15, 14, 12, 10, 8, 7, 7, 5, 5,
4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
class Subject(object):
    """Represents one subject, identified by a code string."""

    def __init__(self, code):
        """code: unique identifier for this subject."""
        self.code = code
        # species observations; populated elsewhere
        self.species = []
# class Suite2

def DistN(self):
    """Returns the posterior Pmf of n, built from self.ns and self.probs."""
    return thinkbayes.MakePmfFromItems(zip(self.ns, self.probs))
# class Species2

def DistOfPrevalence(self, index):
    """Computes the distribution of prevalence for the indexed species.

    index: which species

    Returns: (metapmf, mix) where metapmf is a Pmf of Beta Pmfs (one per
    candidate n, weighted by its probability) and mix is their mixture.
    """
    metapmf = thinkbayes.Pmf()
    for n, prob in zip(self.ns, self.probs):
        pmf = self.MarginalBeta(n, index).MakePmf()
        metapmf.Set(pmf, prob)

    mix = thinkbayes.MakeMixture(metapmf)
    return metapmf, mix
# class Subject

def RunSimulation(self, num_reads):
    """Simulates num_reads additional reads and counts new species.

    num_reads: number of reads to simulate

    Returns: a rarefaction curve, a list of (number of reads, number of
    new species seen) pairs.
    """
    m, seen = self.GetSeenSpecies()
    n, observations = self.GenerateObservations(num_reads)

    curve = []
    for k, obs in enumerate(observations):
        seen.add(obs)
        curve.append((k + 1, len(seen) - m))
    return curve
#class Subject

def GetSeenSpecies(self):
    """Returns the number of species seen and a set of uniquified names.

    Returns: (m, seen) where m is the count of observed species and seen
    is a set of their names made unique by SpeciesGenerator.
    """
    names = self.GetNames()
    m = len(names)
    return m, set(SpeciesGenerator(names, m))
def SpeciesGenerator(names, num):
    """Yields uniquified species names, then placeholders up to num.

    names: sequence of species names (possibly containing duplicates)
    num: total number of names to generate

    Appending the running index makes duplicate names distinct; past the
    end of names, yields 'unseen-i' placeholders until num names have
    been produced.
    """
    for i, name in enumerate(names):
        yield '%s-%d' % (name, i)
    for i in range(len(names), num):
        yield 'unseen-%d' % i
# class Subject

def GenerateObservations(self, num_reads):
    """Generates a series of simulated observations.

    num_reads: number of reads to simulate

    Draws (n, prevalences) from the posterior, pairs uniquified names
    with prevalences, and samples num_reads observations from the
    resulting Cdf.

    Returns: (n, observations)
    """
    n, prevalences = self.suite.SamplePosterior()

    names = self.GetNames()
    named_prevalences = dict(zip(SpeciesGenerator(names, n), prevalences))

    cdf = thinkbayes.MakeCdfFromDict(named_prevalences)
    observations = cdf.Sample(num_reads)
    return n, observations
def SamplePosterior(self):
    """Draws (n, prevalences) from the joint posterior distribution.

    First samples n from its posterior, then samples prevalences
    conditioned on that n.
    """
    n = self.DistOfN().Random()
    prevalences = self.SamplePrevalences(n)
    return n, prevalences
# class Species2

def SamplePrevalences(self, n):
    """Draws a sample of prevalences conditioned on n species.

    n: number of species

    Returns: numpy array of n prevalences that sum to 1 (a Dirichlet
    draw via normalized Gamma variates).
    """
    gammas = numpy.random.gamma(self.params[:n])
    return gammas / gammas.sum()
def MakeJointPredictive(curves):
    """Aggregates simulated curves into a joint distribution.

    curves: list of curves, each a list of (k, num_new) pairs

    Returns: thinkbayes.Joint mapping (number of additional reads,
    number of new species) to probability.
    """
    joint = thinkbayes.Joint()
    for curve in curves:
        for pair in curve:
            joint.Incr(pair)
    joint.Normalize()
    return joint
def MakeConditionals(curves, ks):
    """Makes Cdfs of num_new conditioned on the number of reads.

    curves: list of (k, num_new) curves
    ks: values of k (number of additional reads) to condition on

    Returns: list of Cdfs, one per k.
    """
    joint = MakeJointPredictive(curves)

    cdfs = []
    for k in ks:
        pmf = joint.Conditional(1, 0, k)
        pmf.name = 'k=%d' % k
        cdfs.append(pmf.MakeCdf())
    return cdfs
# class Subject

def RunSimulation(self, num_reads):
    """Simulates num_reads reads and tracks the fraction of species seen.

    num_reads: number of reads to simulate

    Returns: curve, a list of (number of reads, fraction of the sampled
    n species seen so far) pairs.
    """
    m, seen = self.GetSeenSpecies()
    n, observations = self.GenerateObservations(num_reads)

    curve = []
    for k, obs in enumerate(observations):
        seen.add(obs)
        curve.append((k + 1, len(seen) / float(n)))
    return curve
def MakeFracCdfs(self, curves):
    """Makes Cdfs of the fraction of species seen, keyed by num reads.

    curves: list of curves, each a list of (k, frac) pairs

    Returns: dict mapping k (number of reads) to a Cdf of fractions.
    """
    d = {}
    for curve in curves:
        for k, frac in curve:
            d.setdefault(k, []).append(frac)

    cdfs = {}
    # items() instead of the Python-2-only iteritems(), so this listing
    # also runs under Python 3; on Python 2 the behavior is identical.
    for k, fracs in d.items():
        cdfs[k] = thinkbayes.MakeCdfFromList(fracs)
    return cdfs