'''
Most of this code is borrowed from: https://github.com/fchollet/keras/blob/master/examples/lstm_text_generation.py
'''
from __future__ import print_function
import theano
from keras.models import Sequential, model_from_json
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM
from keras.datasets.data_utils import get_file
import numpy as np
import random
import sys
Using Theano backend.
import json
with open("candidate_words_dict.json") as f:
    candidate_word_dict = json.load(f)
text = []
start_indexes = []
i = 0
for v in candidate_word_dict.values():
    # each candidate's word list opens with a <START> token
    text.append("<START>")
    start_indexes.append(i)
    i += 1
    for word in v:
        if word[-1] == '.':
            # sentence-ending word: strip the period, close the sentence,
            # and open the next one
            text.append(word[:-1].lower())
            text.append("<END>")
            text.append("<START>")
            i += 3
            # record the index of the <START> token just appended
            start_indexes.append(i - 1)
        else:
            text.append(word.lower())
            i += 1
print(len(text))
131792
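## Quick peek (my addition, not in the original notebook): inspect the first few
## tokens and start indexes to confirm the <START>/<END> structure built above.
print(text[:10])
print(start_indexes[:5])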
## Get the unique tokens (words) and create dictionaries mapping token to index and index to token.
## Variable names below keep the character-level names from the borrowed script, but this model is word-level.
chars = set(text)
print('total chars:', len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))
total chars: 9198
maxlen = 5  # number of tokens per training window
step = 3    # slide the window forward 3 tokens at a time
sentences = []
next_chars = []
## i only goes up to len(text) - maxlen so each window and its next-token target stay in range
for i in range(0, len(text) - maxlen, step):
    # grab maxlen tokens starting at i; each pass moves the window forward by step
    sentences.append(text[i: i + maxlen])
    # the token immediately following the window is the prediction target
    next_chars.append(text[i + maxlen])
print('nb sequences:', len(sentences))
nb sequences: 43929
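## Illustrative sketch (not from the original notebook): the same maxlen/step
## windowing on a toy token list, showing how each window maps to its target.
toy = ['<START>', 'the', 'senator', 'said', 'today', 'that', '<END>']
for j in range(0, len(toy) - maxlen, step):
    print(toy[j:j + maxlen], '->', toy[j + maxlen])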
print('Vectorization...')
# row for each sentence, column for each token in the sentence, depth for the vocabulary
X = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
# row for each sentence's prediction, column for each possible token
y = np.zeros((len(next_chars), len(chars)), dtype=np.bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        # char_indices returns the index for a given token
        X[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1
Vectorization...
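## Sanity check (my addition): each training example is a (maxlen, vocab) one-hot
## matrix and each target a one-hot vocab vector, so given the counts printed above
## the shapes should be (43929, 5, 9198) and (43929, 9198).
print(X.shape, y.shape)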
# build the model: 2 stacked LSTMs
print('Build model...')
# once the model has been built and weights saved, both can be re-loaded:
#model = model_from_json(open('candidate_model_architecture.json').read())
#model.load_weights('candidate_lstm_weights.h5')
model = Sequential()
model.add(LSTM(512, return_sequences=True, input_shape=(maxlen, len(chars))))
model.add(Dropout(0.2))
model.add(LSTM(512, return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(len(chars)))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
Build model...
json_string = model.to_json()
open('candidate_model_architecture.json', 'w').write(json_string)
def sample(a, temperature=1.0):
    # helper function to sample an index from a probability array
    # take the log of the probabilities and scale by the temperature
    a = np.log(a) / temperature
    # convert back to probabilities (re-normalize with a softmax)
    a = np.exp(a) / np.sum(np.exp(a))
    # draw one sample from the multinomial and return its index
    return np.argmax(np.random.multinomial(1, a, 1))
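## Illustration (my addition): temperature reshapes the distribution before sampling.
## Low temperature sharpens it toward the most likely token; high temperature
## flattens it toward uniform, producing more diverse but riskier samples.
probs = np.array([0.1, 0.2, 0.7])
for temp in [0.1, 1.0, 2.0]:
    scaled = np.exp(np.log(probs) / temp)
    print(temp, scaled / np.sum(scaled))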
# train the model, output generated text after each iteration
for iteration in range(1, 4):
    print()
    print('-' * 50)
    print('Iteration', iteration)
    model.fit(X, y, batch_size=64, nb_epoch=1)
--------------------------------------------------
Iteration 1
Epoch 1/1
43929/43929 [==============================] - 605s - loss: 4.9875
--------------------------------------------------
Iteration 2
Epoch 1/1
43929/43929 [==============================] - 2801s - loss: 4.9129
--------------------------------------------------
Iteration 3
Epoch 1/1
43929/43929 [==============================] - 644s - loss: 4.8380
model.save_weights('candidate_lstm_weights_v2.h5', overwrite=True)
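## To pick the model back up later (a sketch based on the commented reload lines
## near the model definition; the filenames are the ones written in this notebook):
# model = model_from_json(open('candidate_model_architecture.json').read())
# model.load_weights('candidate_lstm_weights_v2.h5')
# model.compile(loss='categorical_crossentropy', optimizer='rmsprop')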
start_index = random.choice(start_indexes)
for diversity in [.1, .5, .75, .9, 1.0]:
    print()
    print('----- diversity:', diversity)
    generated = []
    # get a random seed sentence of the correct length for the model
    sentence = text[start_index: start_index + maxlen]
    generated.extend(sentence)
    print('----- Generating with seed: {}\n'.format(u" ".join(sentence).encode('utf-8').strip()))
    next_char = ""
    while next_char != "<END>":
        # convert the sentence into the x format the model expects:
        # (# sentences, # tokens per sentence, vocabulary size)
        x = np.zeros((1, maxlen, len(chars)))
        for t, char in enumerate(sentence):
            x[0, t, char_indices[char]] = 1.
        preds = model.predict(x, verbose=0)[0]
        next_index = sample(preds, diversity)
        next_char = indices_char[next_index]
        # generated keeps track of the text generated so far
        generated.append(next_char)
        # shift the window forward by one token for the next prediction
        sentence = sentence[1:]
        sentence.append(next_char)
    print("Resulting Sentence: {}".format(u" ".join(generated).encode('utf-8').strip()))