import random


def word_list(filename):
    """Return the whitespace-separated words of the file at ``filename``."""
    # The context manager closes the handle; the original left it open.
    with open(filename) as input_file:
        return input_file.read().split()


def make_trigrams(filename):
    """Build a trigram table from the words in ``filename``.

    Returns a dict whose keys are 2-tuples of consecutive words and whose
    values are lists of the words seen immediately after that pair, in
    order of appearance. Duplicates are kept, so ``random.choice`` on a
    value is weighted by how often each follower occurred.
    """
    words = word_list(filename)
    trigrams = {}
    # Slide a three-word window over the text: (first, second) -> follower.
    for first, second, follower in zip(words, words[1:], words[2:]):
        trigrams.setdefault((first, second), []).append(follower)
    return trigrams


def start_sentence(trigrams):
    """Pick a random bigram suitable for starting a sentence.

    A "start word" is a follower whose first character is uppercase and
    whose preceding word ends in a period. Any bigram whose first word is
    a start word is a candidate. If the table yields no candidate at all
    (for example, the text contains no periods), any bigram may be chosen
    so that generation can still proceed.

    Raises IndexError (from ``random.choice``) if ``trigrams`` is empty.
    """
    # A set keeps the membership test below O(1) per bigram.
    start_words = {
        follower
        for pair, followers in trigrams.items()
        if pair[1].endswith('.')
        for follower in followers
        if follower[0].isupper()
    }
    # find all bigrams starting with start words
    candidates = [pair for pair in trigrams if pair[0] in start_words]
    if not candidates:
        # No recognisable sentence start: fall back to every bigram
        # instead of failing on an empty candidate list.
        candidates = list(trigrams)
    return random.choice(candidates)


def random_text(filename, num_words=100):
    """Generate random text from the trigram statistics of ``filename``.

    Generation continues past ``num_words`` until the last word ends in a
    period. To avoid looping forever on texts that never produce one, the
    output is cut off once it exceeds ``5 * num_words`` words, and a
    period is added if the last word does not already end in one.

    Returns the generated words joined by single spaces.
    """
    trigrams = make_trigrams(filename)
    current_pair = start_sentence(trigrams)
    words = list(current_pair)
    max_words = 5 * num_words

    # continue past num_words until ends in .
    while len(words) < num_words or not words[-1].endswith('.'):
        # last two words in document may not have a suffix
        if current_pair not in trigrams:
            current_pair = start_sentence(trigrams)
            # Kept as separate words; the original concatenated the new
            # pair onto the previous word without a space.
            words.extend(current_pair)
        next_word = random.choice(trigrams[current_pair])
        words.append(next_word)
        current_pair = (current_pair[1], next_word)
        # avoid infinite loops if no periods in training text
        if len(words) > max_words:
            if not next_word.endswith('.'):
                words[-1] += '.'
            break
    return ' '.join(words)


if __name__ == "__main__":
    # Demo run. Results are printed because bare expressions show nothing
    # when the file is executed as a script.
    print(random_text('oz.txt'))

    # declaration of independence
    trigrams = make_trigrams('di.txt')

    # find bigram that appears most frequently
    # (the original hard-coded this count as 17)
    most_followers = max(map(len, trigrams.values()))
    print(most_followers)
    print([bigram for bigram in trigrams
           if len(trigrams[bigram]) == most_followers])
    print([(b, w) for b, w in trigrams.items()
           if len(w) == most_followers])

    # make trigrams global (have to comment out line in def random_text)
    trigrams = {}
    print(random_text('di.txt'))

    # now add in some of sherlock holmes
    print(random_text('sh.txt'))