from collections import Counter, defaultdict
import gzip
import random
import textwrap

import numpy as np

def make_trigrams(filename):
    """Build a word-level trigram table from a gzip-compressed text file.

    The text is split on whitespace and treated as circular: the first two
    words are appended after the last one, so every bigram that occurs in
    the text has at least one recorded follower.

    Args:
        filename: path of the gzip-compressed text file.

    Returns:
        A ``(trigrams, startwords)`` tuple.  ``trigrams`` is a
        ``defaultdict(list)`` mapping each bigram ``(w1, w2)`` to the list of
        words that followed it (duplicates are kept).  ``startwords`` is a
        list of bigrams: the opening bigram of the text plus every bigram
        whose first word begins with an upper-case character and comes right
        after a word ending in ``'.'``.

    Raises:
        ValueError: if the file contains fewer than two words.
    """
    with gzip.open(filename) as f:
        raw = f.read()

    # gzip.open() defaults to binary mode, so Python 3 hands back bytes and
    # the str-based tests below would raise TypeError.  Python 2 already
    # returns str and is left untouched.
    # NOTE(review): assumes the corpus is UTF-8; undecodable bytes are
    # replaced rather than aborting -- confirm against the data files.
    if not isinstance(raw, str):
        raw = raw.decode('utf-8', 'replace')

    words = raw.split()
    if len(words) < 2:
        raise ValueError('%r contains fewer than two words' % (filename,))

    # Wrapped-around views of the word list: position i holds the word one
    # (resp. two) places after words[i].
    seconds = words[1:] + words[:1]
    thirds = words[2:] + words[:2]

    # keys of trigram dict are tuples, values are lists
    trigrams = defaultdict(list)
    startwords = [(words[0], words[1])]

    for first, second, third in zip(words, seconds, thirds):
        trigrams[(first, second)].append(third)
        # A capitalised word right after a full stop opens a sentence.
        if first.endswith('.') and second[0].isupper():
            startwords.append((second, third))

    return trigrams, startwords

def random_text(trigrams, startwords, num_words=100):
    """Generate wrapped pseudo-random text from a trigram table.

    A starting bigram is drawn at random from ``startwords``; after that each
    new word is drawn at random from the followers recorded in ``trigrams``
    for the two most recent words.

    Generation stops once the text has at least ``num_words`` words and the
    last one ends with ``'.'``.  As soon as the text is longer than
    ``2*num_words`` words a ``'.'`` is appended to the newest word, which
    forces termination when the training text has too few periods.

    Returns the words joined by single spaces and passed through
    ``textwrap.fill``.
    """
    pair = random.choice(startwords)
    words = list(pair)
    hard_limit = 2 * num_words

    while True:
        # Done only when long enough AND ending on a sentence boundary.
        if len(words) >= num_words and words[-1].endswith('.'):
            break

        follower = random.choice(trigrams[pair])
        words.append(follower)
        pair = (pair[1], follower)

        # avoid long loops if too few periods in training text
        if len(words) > hard_limit:
            words[-1] = words[-1] + '.'

    return textwrap.fill(' '.join(words))

# --- Trigram text generation demo -------------------------------------------
# Bare expressions below are notebook display cells; they have no effect when
# the file runs as a plain script.  print(...) with a single argument behaves
# identically on Python 2 and Python 3.
trigrams_sh, startwords_sh = make_trigrams('sherlock.txt.gz')

# the ten bigrams with the most recorded followers
sorted(((bi, len(followers)) for bi, followers in trigrams_sh.items()),
       key=lambda pair: pair[1], reverse=True)[:10]

# the ten most frequent sentence-opening bigrams
Counter(startwords_sh).most_common(10)

# number of distinct words seen after "of the"
len(set(trigrams_sh[('of', 'the')]))

print(random_text(trigrams_sh, startwords_sh))

trigrams_di, startwords_di = make_trigrams('di.txt.gz')
trigrams_oz, startwords_oz = make_trigrams('oz.txt.gz')

print(random_text(trigrams_oz, startwords_oz))

print(random_text(trigrams_di, startwords_di))

# Merge the two models: a bigram present in both texts gets the followers
# from 'oz' followed by those from 'di'; the result is a plain dict.
# (dict(a.items() + b.items()) only works on Python 2, where items() is a list.)
trigrams_oz_di = dict(trigrams_oz)
for bigram, followers in trigrams_di.items():
    trigrams_oz_di[bigram] = trigrams_oz_di.get(bigram, []) + followers
startwords_oz_di = startwords_oz + startwords_di

# Sample the merged model from the merged start list.  The original passed
# startwords_di here, leaving startwords_oz_di computed but never used.
print(random_text(trigrams_oz_di, startwords_oz_di))

from IPython.display import Image
# Let IPython load the picture itself: the previous open('m7.png').read()
# never closed the file handle and read the binary PNG in text mode.
Image(filename='m7.png')

# Sparse description of the chain: mchain[m][n] is the probability of moving
# from state m to a *different* state n.  Whatever probability is left over
# stays on m itself (filled in on the diagonal below).
mchain = {0: {1: .5, 7: .5}, 1: {}, 2: {3: .5}, 3: {1: .5},
          4: {6: .8}, 5: {}, 6: {5: .3}, 7: {2: .2, 4: .4}}

# Fix the row/column order explicitly instead of relying on dict iteration
# order; later code indexes T by state number.
states = sorted(mchain)

# Dense transition matrix, T[m, n] = P(next state is n | current state is m).
T = np.array([[mchain[m][n] if n in mchain[m]
               else round(1. - sum(list(mchain[m].values())), 4) if m == n
               else 0.
               for n in states] for m in states])

T

T.sum(axis=1)  # every row should sum to 1

T[7].cumsum()  # cumulative distribution of the next state when in state 7

# Draw next states from row 7: digitize maps each uniform sample to the index
# of the cumulative bin it falls into.
np.digitize(np.random.rand(10), T[7].cumsum())

from collections import Counter
Counter(np.digitize(np.random.rand(100000), T[7].cumsum()))

# Monte-Carlo estimate of how often, and after how many steps, a walk started
# in state 0 first reaches state 1 or state 5.
n = len(T)
results = {1: [], 5: []}  # step counts, keyed by the state that ended the walk

# Row m of `cumulative` is the cumulative distribution of the next state
# given current state m; computed once instead of on every single step.
cumulative = T.cumsum(axis=1)

for t in range(100000):  # trials
    m = 0  # start at 0
    k = 0
    while k < 9999:  # just much bigger than likely path
        k += 1
        # pull next state from probability distribution given by m'th row
        m = np.digitize([np.random.rand()], cumulative[m])[0]
        if m == 5 or m == 1:
            results[m].append(k)
            break

len(results[1]), len(results[5])

# mean number of steps for walks ending in state 1 and in state 5
[round(avg, 3) for avg in (np.mean(results[1]), np.mean(results[5]))]

# 30-step transition probabilities via the true matrix power
np.around(np.linalg.matrix_power(T, 30), 3)

# NOTE(review): on an ndarray ** is the element-wise power, not the matrix
# power -- presumably kept to contrast with the line above; confirm.
np.around(T**30, 3)

# --- Powers of a random row-stochastic matrix --------------------------------
# Bare expressions are notebook display cells.  Output is formatted with '%'
# so the printed lines are the same on Python 2 and Python 3.
T = np.random.rand(4, 4)  # random 4x4 matrix
T

T.sum(axis=1)  # sum along the rows

T /= T.sum(axis=1)[:, None]  # divide by the sums along the rows so they sum to 1
T

T.sum(axis=1)  # verify

T.dot(T)  # this gives T^2

T.dot(T).dot(T)  # this gives T^3

T.dot(T).dot(T).dot(T)  # this gives T^4

np.linalg.matrix_power(T, 12)

w = np.ones(4) / 4  # all components equal to 1/4.
v = w.dot(np.linalg.matrix_power(T, 12))
v

# see how little it changes from one iteration to the next
# Start at n=1: with n=0 the second term is matrix_power(T, -1), i.e. the
# inverse of T, which is not the previous power.
for n in range(1, 20):
    change = np.linalg.norm(np.linalg.matrix_power(T, n)
                            - np.linalg.matrix_power(T, n - 1))
    print('%s %s' % (n, change))

w = np.array([0., 1., 0., 0.])  # see the result of n steps starting from 2nd state
for n in range(6):
    print('step %s %s' % (n, w))
    w = w.dot(T)

w = np.random.rand(4)  # eventually doesn't matter where started
w /= w.sum()  # normalize as probability
w

for n in range(6):
    print('step %s %s' % (n, w))
    w = w.dot(T)