"""Trigram text generation and small Markov-chain experiments.

The first part builds a trigram model from gzip-compressed text files and
samples random sentences from it.  The second part simulates an absorbing
Markov chain and looks at powers of random stochastic matrices.

The demo section is a flattened notebook: bare expressions in it are cell
outputs that display in an interactive session and are no-ops in a script.
"""
from collections import Counter, defaultdict
import gzip
import random
import textwrap


def make_trigrams(filename, encoding='utf-8'):
    """Build a trigram model from a gzip-compressed text file.

    Args:
        filename: path of the gzip file; its content is split on whitespace.
        encoding: codec used to decode the content when ``gzip`` returns
            bytes (Python 3).  Ignored when the content is already ``str``.

    Returns:
        A ``(trigrams, startwords)`` pair.  ``trigrams`` is a
        ``defaultdict(list)`` mapping each ``(word1, word2)`` tuple to the
        list of words that followed that pair (duplicates kept, so sampling
        from the list is frequency-weighted).  ``startwords`` is a list of
        pairs that begin the text or follow a word ending in ``'.'`` and
        start with an upper-case letter.

    Raises:
        ValueError: if the file contains fewer than two words.
    """
    with gzip.open(filename, 'rb') as f:
        raw = f.read()
    # Python 2 hands back ``str`` directly; Python 3 hands back ``bytes``,
    # which must be decoded before comparing against '.' below.
    text = raw if isinstance(raw, str) else raw.decode(encoding)
    words = text.split()
    if len(words) < 2:
        raise ValueError('%r contains fewer than two words' % (filename,))

    trigrams = defaultdict(list)
    bigram = tuple(words[:2])
    startwords = [bigram]
    # Appending the first two words wraps the text around, so every pair
    # that occurs has at least one recorded follower.
    for w in words[2:] + words[:2]:
        # keys of the trigram dict are tuples, values are lists
        trigrams[bigram].append(w)
        if bigram[0].endswith('.') and bigram[1][0].isupper():
            startwords.append((bigram[1], w))
        bigram = (bigram[1], w)
    return trigrams, startwords


def random_text(trigrams, startwords, num_words=100):
    """Generate random text from a trigram model.

    Args:
        trigrams: mapping from ``(word1, word2)`` to a list of follower words.
        startwords: list of ``(word1, word2)`` pairs to start from.
        num_words: minimum number of words to generate.

    Returns:
        The generated words joined by spaces and wrapped by
        ``textwrap.fill``.  Generation runs past ``num_words`` until the last
        word ends in ``'.'``.
    """
    current_pair = random.choice(startwords)
    words = list(current_pair)
    # continue past num_words until the text ends in '.'
    while len(words) < num_words or not words[-1].endswith('.'):
        following = random.choice(trigrams[current_pair])
        words.append(following)
        current_pair = (current_pair[1], following)
        # avoid long loops if too few periods in training text: force a
        # final '.' once the text is more than twice the requested length
        if len(words) > 2 * num_words:
            words[-1] += '.'
    return textwrap.fill(' '.join(words))


if __name__ == "__main__":
    # Imported here so the trigram helpers above stay importable without the
    # numerical stack.  The original code used these names without importing
    # them (presumably they were pre-loaded by the notebook environment).
    from IPython.display import Image
    from numpy import around, array, digitize, mean, ones
    from numpy.linalg import matrix_power, norm
    from numpy.random import rand

    # ---- Trigram text generation -------------------------------------
    trigrams_sh, startwords_sh = make_trigrams('sherlock.txt.gz')

    # the ten word pairs with the most recorded followers
    sorted(
        ((bi, len(followers)) for bi, followers in trigrams_sh.items()),
        key=lambda item: item[1],
        reverse=True,
    )[:10]
    Counter(startwords_sh).most_common(10)
    len(set(trigrams_sh[('of', 'the')]))
    print(random_text(trigrams_sh, startwords_sh))

    trigrams_di, startwords_di = make_trigrams('di.txt.gz')
    trigrams_oz, startwords_oz = make_trigrams('oz.txt.gz')
    print(random_text(trigrams_oz, startwords_oz))
    print(random_text(trigrams_di, startwords_di))

    # Merge the two models: pairs present in both get the concatenation of
    # both follower lists, all other pairs are carried over unchanged.
    trigrams_oz_di = dict(trigrams_oz)
    trigrams_oz_di.update(trigrams_di)
    for bigram in set(trigrams_oz) & set(trigrams_di):
        trigrams_oz_di[bigram] = trigrams_oz[bigram] + trigrams_di[bigram]
    startwords_oz_di = startwords_oz + startwords_di
    # Seed from the merged start list (the original passed startwords_di and
    # left startwords_oz_di unused).
    print(random_text(trigrams_oz_di, startwords_oz_di))

    # ---- Absorbing Markov chain --------------------------------------
    # Let IPython open the file itself: no leaked handle and no text-mode
    # read of binary data.
    Image(filename='m7.png')

    # mchain[m][n] is the probability of moving from state m to state n;
    # whatever probability is not listed stays on the diagonal (m -> m).
    mchain = {0: {1: .5, 7: .5}, 1: {}, 2: {3: .5}, 3: {1: .5},
              4: {6: .8}, 5: {}, 6: {5: .3}, 7: {2: .2, 4: .4}}
    # Sort the states so row/column i is state i regardless of dict order.
    states = sorted(mchain)
    T = array([[mchain[m][n] if n in mchain[m]
                else round(1. - sum(mchain[m].values()), 4) if m == n
                else 0.
                for n in states]
               for m in states])
    T
    T.sum(axis=1)
    T[7].cumsum()
    # digitize against a row's cumulative sums maps uniform samples to
    # next-state indices drawn from that row's distribution
    digitize(rand(10), T[7].cumsum())
    Counter(digitize(rand(100000), T[7].cumsum()))

    # Simulate walks from state 0 until absorbed in state 1 or 5 and record
    # the number of steps taken, keyed by the absorbing state.
    cumulative = T.cumsum(axis=1)  # row-wise cumulative sums, computed once
    results = {1: [], 5: []}
    for _ in range(100000):  # trials
        m = 0  # start at 0
        for k in range(1, 10000):  # just much bigger than likely path
            # pull next state from probability distribution given by m'th row
            m = digitize([rand()], cumulative[m])[0]
            if m == 5 or m == 1:
                results[m].append(k)
                break
    len(results[1]), len(results[5])
    [round(x, 3) for x in (mean(results[1]), mean(results[5]))]
    around(matrix_power(T, 30), 3)  # matrix power
    around(T ** 30, 3)              # element-wise power, for contrast

    # ---- Powers of a random stochastic matrix ------------------------
    T = rand(4, 4)  # random 4x4 matrix
    T
    T.sum(axis=1)  # sum along the rows
    T /= T.sum(axis=1)[:, None]  # divide by the row sums so rows sum to 1
    T
    T.sum(axis=1)  # verify
    T.dot(T)  # this gives T^2
    T.dot(T).dot(T)  # this gives T^3
    T.dot(T).dot(T).dot(T)  # this gives T^4
    matrix_power(T, 12)

    w = ones(4) / 4  # all components equal to 1/4.
    v = w.dot(matrix_power(T, 12))
    v

    # see how little it changes from one iteration to the next; start at 1
    # because n = 0 would compare T^0 with T^-1 (the matrix inverse)
    for n in range(1, 20):
        print('%d %s' % (n, norm(matrix_power(T, n) - matrix_power(T, n - 1))))

    w = array([0., 1., 0., 0.])  # see the result of n steps from 2nd state
    for n in range(6):
        print('step %d %s' % (n, w))
        w = w.dot(T)

    w = rand(4)  # eventually doesn't matter where started
    w /= w.sum()  # normalize as probability
    w
    for n in range(6):
        print('step %d %s' % (n, w))
        w = w.dot(T)