import random
def word_list(filename):
    """Return the words of a text file as a list, in document order.

    Words are whatever ``str.split()`` yields: the file contents are split
    on runs of whitespace, so punctuation stays attached to its word.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list of strings; empty if the file has no non-whitespace content.
    """
    # The context manager closes the handle even if read() raises; the
    # previous version left the file open until garbage collection.
    with open(filename) as input_file:
        return input_file.read().split()
def make_trigrams(filename):
    """Build a bigram -> followers table from the words of a file.

    Args:
        filename: Path of the text file, read via ``word_list``.

    Returns:
        A dict whose keys are 2-tuples of consecutive words and whose values
        are lists of every word seen immediately after that pair, in order
        of appearance (duplicates kept, so frequent followers are more likely
        to be drawn at random later).
    """
    tokens = word_list(filename)
    followers = {}
    # Slide a three-word window over the text; zip stops at the shortest
    # slice, so the last window ends on the final word.
    for first, second, third in zip(tokens, tokens[1:], tokens[2:]):
        followers.setdefault((first, second), []).append(third)
    return followers
def start_sentence(trigrams):
    """Pick a random bigram suitable for opening a sentence.

    A "start word" is a capitalized word recorded as following a bigram whose
    second word ends in a period. Any bigram whose first word is a start word
    is a candidate opener.

    Args:
        trigrams: Dict mapping 2-tuples of words to lists of follower words.

    Returns:
        One key of ``trigrams``, chosen uniformly among the candidates. If
        the text offers no candidate (e.g. it contains no periods), any
        bigram may be returned instead.

    Raises:
        IndexError: If ``trigrams`` is empty (raised by ``random.choice``).
    """
    # A set gives O(1) membership tests below and drops duplicate words.
    start_words = set()
    for bigram, followers in trigrams.items():
        if bigram[1].endswith('.'):
            start_words.update(
                word for word in followers if word[0].isupper())
    # Find all bigrams that begin with a start word.
    candidates = [bigram for bigram in trigrams if bigram[0] in start_words]
    if not candidates:
        # No sentence boundary found in the text: fall back to any bigram
        # rather than letting random.choice fail on an empty list.
        candidates = list(trigrams)
    return random.choice(candidates)
def random_text(filename, num_words=100):
    """Generate random text from the trigram statistics of a file.

    Starts from a sentence-opening bigram, then repeatedly appends a word
    drawn at random from the followers of the current last two words.

    Args:
        filename: Path of the training text, passed to ``make_trigrams``.
        num_words: Minimum number of words to produce. Generation continues
            past this count until the last word ends in a period, but is cut
            off once the text exceeds ``5 * num_words`` words.

    Returns:
        The generated words joined by single spaces, ending in a period.
    """
    trigrams = make_trigrams(filename)
    current_pair = start_sentence(trigrams)
    # Collect words in a list and join once at the end, instead of growing a
    # string and re-splitting it on every iteration to count words.
    words = list(current_pair)
    # Continue past num_words until the text ends in a period.
    while len(words) < num_words or not words[-1].endswith('.'):
        # The last two words in the document may not have a follower.
        if current_pair not in trigrams:
            current_pair = start_sentence(trigrams)
            # Added as separate words; the old string concatenation omitted
            # the separating space and glued the pair onto the previous word.
            words.extend(current_pair)
        next_word = random.choice(trigrams[current_pair])
        words.append(next_word)
        current_pair = (current_pair[1], next_word)
        # Avoid an infinite loop if the training text has no periods.
        if len(words) > 5 * num_words:
            words[-1] += '.'
            break
    return ' '.join(words)
# --- Interactive session transcript -----------------------------------------
# The bare literals below (strings, an int, lists) appear to be the recorded
# results of the statement just above each of them. Text generation uses
# random.choice, so rerunning a random_text() call will give different text.

# Generate text from 'oz.txt'.
random_text('oz.txt')
'So we know you are really brighter than he needs." "And I should have lived a coward like me," continued the man, with surprise. "I don\'t know how to use her power." Then she sat up and had they not felt so gay. "I was born that way. All the other wild beasts. It seems to me as King of Beasts. I learned that if Dorothy had finished telling him everything that had handled the Scarecrow to the edge of the Forest?" inquired the Scarecrow. "The Lion was heavy, they managed to scramble to the Emerald City; and to Dorothy they presented a beautiful bracelet studded with diamonds; and to sleep.'
# Declaration of Independence
trigrams=make_trigrams('di.txt')
# Find the bigram that appears most frequently: every occurrence of a bigram
# that is followed by another word adds one entry to its follower list, so
# the longest list marks the most frequent bigram.
max(map(len,trigrams.values()))
17
# Which bigram(s) have that many followers?
[bigram for bigram in trigrams if len(trigrams[bigram]) == 17]
[('He', 'has')]
# Same query, but also show the follower list.
# NOTE: dict.iteritems() exists only in Python 2.
[(b,w) for b,w in trigrams.iteritems() if len(w)==17]
[(('He', 'has'), ['refuted', 'forbidden', 'refused', 'called', 'dissolved', 'refused', 'endeavoured', 'obstructed', 'made', 'erected', 'kept', 'affected', 'combined', 'abdicated', 'plundered', 'constrained', 'excited'])]
# Make trigrams global (have to comment out the line in def random_text that
# assigns trigrams). NOTE(review): as written in this file, random_text()
# builds its own local trigrams from the filename, so this module-level
# reset has no effect on it unless the function is edited as described.
trigrams={}
random_text('di.txt')
'We, therefore, the Representatives of the good People of these States: For cutting off our Trade with all parts of the benefit of Trial by Jury: For transporting us beyond Seas to bear Arms against their Country, to become the executioners of their friends and Brethren, or to abolish it, and to provide new Guards for their exercise; the State of Great Britain, is and ought to be Free and Independent States, that they are endowed by their Hands. He has made Judges dependent on his Will alone for the sole purpose of fatiguing them into compliance with his measures.'
# Now add in some of Sherlock Holmes.
# NOTE(review): the recorded sample mixes vocabulary from several texts,
# which presumably reflects the global-trigrams variant mentioned above
# rather than the function definitions shown in this file -- confirm.
random_text('sh.txt')
'Dorothy discovered something shining in a rainstorm, before I was ascertaining whether the Woodman nor the dog is asleep already." It was some time done manual labour, that he had drifted into the bargain, you no doubt suggested to Clay\'s ingenious mind by the collar. The other trees of the East. When you raise your cry of satisfaction. For a minute I shall be there in an office in Leadenhall Street--and--" "What office?" "That\'s the worst of it, and \'Letters, memoranda, receipts, and a bulge on the floor and keep the gas in it. It will end in such form, as to render it at your cottage to pass the night of May 2nd." "Thank you.'