#!/usr/bin/env python # coding: utf-8 # In[1]: from collections import Counter,defaultdict import random import gzip import textwrap # In[2]: def make_trigrams(filename): #returns list of words in file with gzip.open(filename) as f: words = f.read().split() trigrams = defaultdict(list) bigram=tuple(words[:2]) startwords=[bigram] for w in words[2:] + words[:2]: #keys of trigram dict are tuples, values are lists trigrams[bigram].append(w) if bigram[0].endswith('.') and bigram[1][0].isupper(): startwords.append((bigram[1],w)) bigram=(bigram[1],w) return trigrams,startwords # In[3]: def random_text(trigrams, startwords, num_words=100): current_pair = random.choice(startwords) random_text = list(current_pair) # continue past num_words until ends in . while len(random_text)< num_words or not random_text[-1].endswith('.'): next = random.choice(trigrams[current_pair]) random_text.append(next) current_pair = (current_pair[1], next) # avoid long loops if too few periods in training text if len(random_text) > 2*num_words: random_text[-1] += '.' return textwrap.fill(' '.join(random_text)) # The source text files for the below can be found here: # # 1. sherlock.txt.gz # 2. oz.txt.gz # 3. di.txt.gz # 4. shakespeare.txt.gz # In[4]: trigrams_sh,startwords_sh=make_trigrams('sherlock.txt.gz') # In[5]: #most common bigram keys sorted([(bi,len(trigrams_sh[bi])) for bi in trigrams_sh],key=lambda x: x[1],reverse=True)[:10] # In[6]: Counter(startwords_sh).most_common(10) #starts of sentences # In[7]: len(set(trigrams_sh[('of','the')])) #distinct continuations from "of the ..." # In[25]: print random_text(trigrams_sh,startwords_sh) #random sherlock # In[10]: trigrams_di,startwords_di=make_trigrams('di.txt.gz') trigrams_oz,startwords_oz=make_trigrams('oz.txt.gz') # In[30]: print random_text(trigrams_oz,startwords_oz) #random Oz # In[27]: print random_text(trigrams_di,startwords_di) #random declaration of independence # In[32]: trigrams_oz_di= dict(trigrams_oz.items()+trigrams_di.items()) for bigram in set(trigrams_oz) & set (trigrams_di): trigrams_oz_di[bigram] = trigrams_oz[bigram] + trigrams_di[bigram] startwords_oz_di = startwords_oz + startwords_di print random_text(trigrams_oz_di,startwords_di) #mashup of declaration and oz # In[17]: trigrams_sk,startwords_sk=make_trigrams('shakespeare.txt.gz') # In[34]: print random_text(trigrams_sk,startwords_sk) #random shakespeare # In[19]: trigrams_sh_sk= dict(trigrams_sh.items()+trigrams_sk.items()) for bigram in set(trigrams_sh) & set (trigrams_sk): trigrams_sh_sk[bigram] = trigrams_sh[bigram] + trigrams_sk[bigram] startwords_sh_sk = startwords_sh + startwords_sk # In[35]: print random_text(trigrams_sh_sk,startwords_sh) #mashup of shakespeare and sherlock # Below the simulation of this example: # In[2]: from IPython.display import Image Image(open('m7.png').read()) # In[49]: mchain={0:{1:.5, 7:.5}, 1:{}, 2:{3:.5}, 3:{1:.5}, 4:{6:.8}, 5:{}, 6:{5:.3}, 7:{2:.2, 4:.4}} # In[50]: T=array([[mchain[m][n] if n in mchain[m] else round(1.-sum(mchain[m].values()),4) if m==n else 0. for n in mchain ] for m in mchain]) # In[51]: T # In[21]: sum(T,axis=1) # In[22]: T[7].cumsum() # In[23]: digitize(rand(10),T[7].cumsum()) # In[24]: from collections import Counter Counter(digitize(rand(100000),T[7].cumsum())) # In[25]: n=len(T) results={1:[],5:[]} for t in xrange(100000): #trials m=0 #start at 0 for k in xrange(1,10000): #just much bigger than likely path #pull next state from probability distribution given by m'th row m=digitize([rand()],T[m].cumsum())[0] if m==5 or m==1: results[m].append(k) break # In[26]: len(results[1]),len(results[5]) # In[27]: [round(m,3) for m in mean(results[1]),mean(results[5])] # ### Finally, some examples of powers of matrices # take nth power of matrix either with # # linalg.matrix_power(T,n) # or # # reduce(dot,[T]*n) # (gives same result, but `linalg.matrix_power()` is optimized so will be much faster on big matrices). The pylab inline option in the EPD python distribution seems to call in the `linalg` module by default, so it's possible to say just `matrix_power()`, as below. # In[52]: around(matrix_power(T,30),3) # Note that the first row shows that starting from state 0, ultimately state 1 is reached 2/3 of the time and state 5 is reached 1/3 of the time, as expected. From the next three rows we see that starting any of states 1,2,3 state 1 is always reached, and so on. # # Finally noticed that `T**n` just takes the nth power of the entries of the matrix one by one, which is usually (but not always) not what is wanted: # In[53]: around(T**30,3) # To see the effect of taking powers of a matrix, consider a random 4x4 matrix: # In[55]: T=rand(4,4) #random 4x4 matrix T # In[56]: sum(T,axis=1) #sum along the rows # In[57]: T /= sum(T,axis=1)[:,None] #divide by the sums along the rows so they sum to 1 T # In[58]: sum(T,axis=1) #verify # In[59]: T.dot(T) # this gives T^2 # In[60]: T.dot(T).dot(T) # this gives T^3 # In[61]: T.dot(T).dot(T).dot(T) # this gives T^4 # We see that the rows are already converging after a few iterations (recall the largest eigenvalue of the matrix is always 1, and the convergence is governed by the size of the second eigenvalue, see these notes. The first row gives the probability of landing at the other states starting from the first state, the second row for starting at the second state, and so on. Equal rows means that the probabilities of where one ends up become independent of where one started (that's what's meant by the steady state distribution). After twelve iterations, the probabilities are all the same to eight significant digits: # In[62]: matrix_power(T,12) # From here it's possible to take any row as the stationary state v (corresponds to starting from a single state), or we could imagine starting from an equiprobability state w=[1./4,1./4,1./4,1./4] (gives the same result since all the rows are the same at this point): # In[67]: w=ones(4)/4 #all components equal to 1/4. v=w.dot(matrix_power(T,12)) v # In[63]: for n in range(20): #see how little it changes from one iteration to the next print n,norm(matrix_power(T,n)-matrix_power(T,n-1)) # In[64]: w=array([0.,1.,0.,0.]) # see the result of n steps starting from 2nd state for n in range(6): print 'step',n,w w = w.dot(T) # In[65]: w=rand(4) #eventually doesn't matter where started w /= sum(w) #normalize as probability w # In[66]: for n in range(6): print 'step',n,w w = w.dot(T) # In[ ]: