from collections import Counter,defaultdict
import random
import gzip
import textwrap
def make_trigrams(filename):
    """Build a trigram table from a gzip-compressed text file.

    Parameters
    ----------
    filename : str
        Path to a gzip-compressed plain-text file.

    Returns
    -------
    trigrams : defaultdict(list)
        Maps each (word_i, word_i+1) bigram to the list of words that
        follow it in the text (kept with multiplicity, so frequent
        followers are proportionally more likely under random.choice).
    startwords : list of tuple
        Bigrams usable as sentence starts: the file's opening bigram,
        plus every (Capitalized, next) pair that follows a word ending
        in '.'.
    """
    # Open in text mode: the default binary mode yields bytes under
    # Python 3, which breaks the str-based endswith()/isupper() tests
    # below.  'rt' reads str on both Python 2 and 3.
    with gzip.open(filename, 'rt') as f:
        words = f.read().split()
    trigrams = defaultdict(list)
    bigram = tuple(words[:2])
    startwords = [bigram]
    # Wrap around to the beginning so every bigram seen in the text has
    # at least one recorded follower; random_text then never hits an
    # empty follower list.
    for w in words[2:] + words[:2]:
        trigrams[bigram].append(w)
        # A capitalized word following a '.'-terminated word marks a
        # plausible sentence opening.
        if bigram[0].endswith('.') and bigram[1][0].isupper():
            startwords.append((bigram[1], w))
        bigram = (bigram[1], w)
    return trigrams, startwords
def random_text(trigrams, startwords, num_words=100):
    """Generate pseudo-random text by walking the trigram chain.

    Parameters
    ----------
    trigrams : mapping
        Maps (w1, w2) bigrams to a list of candidate next words.
    startwords : list of tuple
        Sentence-starting bigrams; one is chosen at random to begin.
    num_words : int
        Minimum number of words to emit before stopping (default 100).

    Returns
    -------
    str
        The generated words joined and wrapped by textwrap.fill().  The
        walk continues past num_words until the last word ends in '.',
        so the output finishes on a sentence boundary.
    """
    # NOTE: renamed the original locals -- `next` shadowed the builtin
    # and `random_text` shadowed this function's own name.
    current_pair = random.choice(startwords)
    out_words = list(current_pair)
    # Continue until we have enough words AND end on a period.
    while len(out_words) < num_words or not out_words[-1].endswith('.'):
        follower = random.choice(trigrams[current_pair])
        out_words.append(follower)
        current_pair = (current_pair[1], follower)
        # Safety valve: if the training text has too few periods the
        # walk could run very long, so force a sentence end.
        if len(out_words) > 2 * num_words:
            out_words[-1] += '.'
    return textwrap.fill(' '.join(out_words))
The source text files for the examples below can be found here:
# Build the trigram model for "The Adventures of Sherlock Holmes".
trigrams_sh,startwords_sh=make_trigrams('sherlock.txt.gz')
# The ten bigrams with the most recorded followers (follower-list length).
sorted([(bi,len(trigrams_sh[bi])) for bi in trigrams_sh],key=lambda x: x[1],reverse=True)[:10]
[(('of', 'the'), 700), (('in', 'the'), 479), (('to', 'the'), 297), (('I', 'have'), 247), (('that', 'I'), 245), (('at', 'the'), 224), (('upon', 'the'), 195), (('and', 'I'), 191), (('to', 'be'), 189), (('and', 'the'), 186)]
Counter(startwords_sh).most_common(10)
[(('It', 'was'), 91), (('It', 'is'), 88), (('I', 'have'), 49), (('He', 'was'), 38), (('There', 'was'), 31), (('There', 'is'), 28), (('I', 'am'), 27), (('I', 'was'), 27), (('I', 'had'), 26), (('On', 'the'), 22)]
len(set(trigrams_sh[('of','the')]))
548
print random_text(trigrams_sh,startwords_sh)
Very long and very little exercise. I feel that no one else does; but he looked very hard man, would have the effect of calling the attention of whoever it was only a few minutes after I had not. Was there a half-pay major of marines, to whom she became engaged. My stepfather learned of the minute knowledge which you need tell me. Still, that little may as well as in Horace, and as noiselessly as she had divined that I had my note?" "Yes, the wine-cellar." "You seem to see which. They laid me on a visit, and will be taken up my arms to cover my face, and she to prove that it means?" "I have a very massive iron gate.
# Build models for the Declaration of Independence and The Wizard of Oz.
trigrams_di,startwords_di=make_trigrams('di.txt.gz')
trigrams_oz,startwords_oz=make_trigrams('oz.txt.gz')
# Sample from the Oz model.
print random_text(trigrams_oz,startwords_oz)
Dorothy looked at her and killed her," she replied. "Why, it is we never knew." Dorothy carried him food from the forest he seizes an animal with a green velvet counterpane. There was great rejoicing among the poppies too long they also knew that the tallest of them and thanked her for rescuing him, he was busy at this the enchanted axe cut off my left leg. "This at first that their journey for fear of getting home, and I'm sure I could do. Sit down, please, there are wild beasts in the lead, finally discovered a big pile of wood, and now the ruler of this Palace was built, I have brains, and very grateful.
print random_text(trigrams_di,startwords_di)
When in the Legislature, a right inestimable to them shall seem most likely to effect their Safety and Happiness. Prudence, indeed, will dictate that Governments long established should not be changed for light and transient causes; and accordingly all experience hath shewn that mankind are more disposed to suffer, while evils are sufferable than to right themselves by their Hands. He has forbidden his Governors to pass other Laws for the accommodation of large districts of people, unless those people would relinquish the right of Representation in the Legislature, a right inestimable to them shall seem most likely to effect their Safety and Happiness.
# Merge the two models.  Concatenating .items() lists is Python 2 only;
# for bigrams present in both texts the di entry wins here, so the loop
# below rebuilds those shared entries from both follower lists.
trigrams_oz_di= dict(trigrams_oz.items()+trigrams_di.items())
for bigram in set(trigrams_oz) & set (trigrams_di):
    # shared bigram: combine followers from both source texts
    trigrams_oz_di[bigram] = trigrams_oz[bigram] + trigrams_di[bigram]
startwords_oz_di = startwords_oz + startwords_di
# NOTE(review): this samples with startwords_di, not the merged
# startwords_oz_di just built -- possibly intentional, verify.
print random_text(trigrams_oz_di,startwords_di)
Prudence, indeed, will dictate that Governments long established should not be disturbed. The Scarecrow and the balloon with, to make him as he walked boldly through and found his own glasses and told them what Oz had not kept the promise he made of tin." And she scampered out of many wild animals. Toto whimpered a little, but she saw there was not a Wizard, and would gladly face an army or a dozen of her room so she took off her cap and balanced the point on the truck. Of course each one of us were rather frightened at first, for he didn't mention it.
Below is a simulation of this example:
# Display the Markov-chain diagram for the example below.
from IPython.display import Image
Image(open('m7.png').read())
# Sparse spec of the chain: mchain[m][n] = probability of transition m -> n.
# States 1 and 5 (empty dicts) receive no outgoing mass here and become
# absorbing once the diagonal is filled in below.
mchain={0:{1:.5, 7:.5}, 1:{}, 2:{3:.5}, 3:{1:.5},
        4:{6:.8}, 5:{}, 6:{5:.3}, 7:{2:.2, 4:.4}}
# Dense transition matrix: listed probabilities off the diagonal, and the
# leftover mass 1 - sum(row) on the diagonal as a self-loop probability.
# (array comes from the pylab namespace; rounding avoids float residue.)
T=array([[mchain[m][n] if n in mchain[m]
          else round(1.-sum(mchain[m].values()),4) if m==n
          else 0.
          for n in mchain ] for m in mchain])
T
array([[ 0. , 0.5, 0. , 0. , 0. , 0. , 0. , 0.5], [ 0. , 1. , 0. , 0. , 0. , 0. , 0. , 0. ], [ 0. , 0. , 0.5, 0.5, 0. , 0. , 0. , 0. ], [ 0. , 0.5, 0. , 0.5, 0. , 0. , 0. , 0. ], [ 0. , 0. , 0. , 0. , 0.2, 0. , 0.8, 0. ], [ 0. , 0. , 0. , 0. , 0. , 1. , 0. , 0. ], [ 0. , 0. , 0. , 0. , 0. , 0.3, 0.7, 0. ], [ 0. , 0. , 0.2, 0. , 0.4, 0. , 0. , 0.4]])
sum(T,axis=1)
array([ 1., 1., 1., 1., 1., 1., 1., 1.])
T[7].cumsum()
array([ 0. , 0. , 0.2, 0.2, 0.6, 0.6, 0.6, 1. ])
digitize(rand(10),T[7].cumsum())
array([4, 7, 4, 7, 4, 7, 4, 2, 4, 7])
from collections import Counter
# Empirical frequencies over 100000 draws from row 7's distribution;
# expect roughly 40% state 4, 40% state 7, 20% state 2 (row 7 of T).
Counter(digitize(rand(100000),T[7].cumsum()))
Counter({4: 40182, 7: 39752, 2: 20066})
# Monte Carlo: starting from state 0, record how many steps each trial
# takes to be absorbed in state 1 or state 5.
n=len(T)   # NOTE(review): n is not used below
results={1:[],5:[]}
for t in xrange(100000): #trials
    m=0 #start at 0
    for k in xrange(1,10000): #just much bigger than likely path
        #pull next state from probability distribution given by m'th row
        m=digitize([rand()],T[m].cumsum())[0]
        if m==5 or m==1:
            results[m].append(k)
            break
# counts of absorptions into 1 vs 5 (roughly 2:1, matching the 2/3 vs 1/3
# absorption probabilities seen later from matrix_power)
len(results[1]),len(results[5])
(66670, 33330)
[round(m,3) for m in mean(results[1]),mean(results[5])]
[2.41, 7.254]
Take the nth power of the matrix either with
linalg.matrix_power(T,n)
or
reduce(dot,[T]*n)
(both give the same result, but linalg.matrix_power()
is optimized, so it will be much faster on big matrices). The pylab inline option in the EPD python distribution seems to pull in the linalg
module by default, so it's possible to write just matrix_power()
, as below.
around(matrix_power(T,30),3)
array([[ 0. , 0.667, 0. , 0. , 0. , 0.333, 0. , 0. ], [ 0. , 1. , 0. , 0. , 0. , 0. , 0. , 0. ], [ 0. , 1. , 0. , 0. , 0. , 0. , 0. , 0. ], [ 0. , 1. , 0. , 0. , 0. , 0. , 0. , 0. ], [ 0. , 0. , 0. , 0. , 0. , 1. , 0. , 0. ], [ 0. , 0. , 0. , 0. , 0. , 1. , 0. , 0. ], [ 0. , 0. , 0. , 0. , 0. , 1. , 0. , 0. ], [ 0. , 0.333, 0. , 0. , 0. , 0.667, 0. , 0. ]])
Note that the first row shows that starting from state 0, ultimately state 1 is reached 2/3 of the time and state 5 is reached 1/3 of the time, as expected. From the next three rows we see that starting from any of the states 1, 2, 3, state 1 is always reached, and so on.
Finally, note that T**n
just takes the nth power of the entries of the matrix one by one (elementwise), which is usually (but not always) not what is wanted:
around(T**30,3)
array([[ 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ], [ 0. , 1. , 0. , 0. , 0. , 0. , 0. , 0. ], [ 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ], [ 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ], [ 0. , 0. , 0. , 0. , 0. , 0. , 0.001, 0. ], [ 0. , 0. , 0. , 0. , 0. , 1. , 0. , 0. ], [ 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ], [ 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ]])
To see the effect of taking powers of a matrix, consider a random 4x4 matrix:
T=rand(4,4) #random 4x4 matrix (entries uniform in [0,1); rows not yet normalized)
T
array([[ 0.37191866, 0.19514006, 0.59574904, 0.21311382], [ 0.22453282, 0.44280837, 0.32729572, 0.25502435], [ 0.06934085, 0.50455362, 0.09645712, 0.12621279], [ 0.47506432, 0.14128696, 0.3994524 , 0.22011466]])
sum(T,axis=1) #sum along the rows
array([ 1.37592158, 1.24966127, 0.79656438, 1.23591834])
# [:,None] broadcasts the row sums down each row for the division.
T /= sum(T,axis=1)[:,None] #divide by the sums along the rows so they sum to 1
T   # now a row-stochastic matrix
array([[ 0.27030513, 0.14182499, 0.43298182, 0.15488806], [ 0.17967495, 0.35434272, 0.26190755, 0.20407478], [ 0.0870499 , 0.63341223, 0.12109143, 0.15844644], [ 0.38438164, 0.11431739, 0.3232029 , 0.17809806]])
sum(T,axis=1) #verify
array([ 1., 1., 1., 1.])
T.dot(T) # this gives T^2
array([[ 0.19577441, 0.38055306, 0.2566729 , 0.16699963], [ 0.21347519, 0.3402659 , 0.26827334, 0.17798556], [ 0.20878324, 0.33160484, 0.26945995, 0.19015196], [ 0.22103271, 0.32010284, 0.29306977, 0.16579469]])
T.dot(T).dot(T) # this gives T^3
array([[ 0.20782962, 0.34428263, 0.26949214, 0.17839562], [ 0.21060825, 0.34112132, 0.27156014, 0.17671029], [ 0.21256365, 0.33952935, 0.27133612, 0.17657088], [ 0.20650086, 0.34936126, 0.26861406, 0.17552382]])
T.dot(T).dot(T).dot(T) # this gives T^4
array([[ 0.21006764, 0.34256281, 0.27044784, 0.17692171], [ 0.20978292, 0.34295394, 0.27052868, 0.17673446], [ 0.20995235, 0.34250933, 0.2708862 , 0.17665212], [ 0.20944067, 0.34328946, 0.27016814, 0.17710173]])
We see that the rows are already converging after a few iterations (recall that the largest eigenvalue of the matrix is always 1, and the convergence is governed by the size of the second eigenvalue; see these notes). The first row gives the probability of landing at the other states starting from the first state, the second row for starting at the second state, and so on. Equal rows mean that the probabilities of where one ends up become independent of where one started (that's what's meant by the steady state distribution). After twelve iterations, the probabilities are all the same to eight significant digits:
matrix_power(T,12)
array([[ 0.20982798, 0.34281091, 0.27054469, 0.17681641], [ 0.20982798, 0.34281091, 0.27054469, 0.17681641], [ 0.20982798, 0.34281091, 0.27054469, 0.17681641], [ 0.20982798, 0.34281091, 0.27054469, 0.17681641]])
From here it's possible to take any row as the stationary state v (corresponds to starting from a single state), or we could imagine starting from an equiprobability state w=[1./4,1./4,1./4,1./4] (gives the same result since all the rows are the same at this point):
w=ones(4)/4 #all components equal to 1/4.
# Propagate the uniform start 12 steps to obtain the stationary vector.
v=w.dot(matrix_power(T,12))
v
array([ 0.20982798, 0.34281091, 0.27054469, 0.17681641])
for n in range(20): #see how little it changes from one iteration to the next
    # Norm of the difference between successive powers of T.  At n=0 this
    # compares T^0 with T^-1 (numpy's matrix_power with a negative exponent
    # is the inverse), hence the large first value.
    print n,norm(matrix_power(T,n)-matrix_power(T,n-1))
0 25.1509253232 1 1.86295622688 2 0.543004876818 3 0.0616561134596 4 0.00908178780218 5 0.00110233172968 6 0.000208809494365 7 2.95128192228e-05 8 3.44895318113e-06 9 6.85237417189e-07 10 7.61096014459e-08 11 1.57132110039e-08 12 1.43155680143e-09 13 3.49483081912e-10 14 3.48567470089e-11 15 7.27357460449e-12 16 8.38966022033e-13 17 1.48519413403e-13 18 2.10786374484e-14 19 2.82984393717e-15
w=array([0.,1.,0.,0.]) # see the result of n steps starting from 2nd state
# Repeatedly multiplying on the right by T advances the distribution one
# step; it converges to the same stationary vector v found above.
for n in range(6):
    print 'step',n,w
    w = w.dot(T)
step 0 [ 0. 1. 0. 0.] step 1 [ 0.17967495 0.35434272 0.26190755 0.20407478] step 2 [ 0.21347519 0.3402659 0.26827334 0.17798556] step 3 [ 0.21060825 0.34112132 0.27156014 0.17671029] step 4 [ 0.20978292 0.34295394 0.27052868 0.17673446] step 5 [ 0.20980861 0.34283569 0.27053421 0.17682149]
w=rand(4) #eventually doesn't matter where started
w /= sum(w) #normalize as probability
w   # a random starting distribution
array([ 0.19934442, 0.50442549, 0.00408142, 0.29214866])
# The same iteration from a random start converges to the same stationary
# distribution within a few steps.
for n in range(6):
    print 'step',n,w
    w = w.dot(T)
step 0 [ 0.19934442 0.50442549 0.00408142 0.29214866] step 1 [ 0.25716831 0.24299442 0.31334288 0.18649439] step 2 [ 0.21213541 0.34237096 0.27320994 0.17228369] step 3 [ 0.20886236 0.3441523 0.27028629 0.17669905] step 4 [ 0.20974038 0.34297218 0.27040869 0.17687875] step 5 [ 0.2098454 0.34277661 0.27055268 0.17682531]