'''
Most of this code is borrowed from: https://github.com/fchollet/keras/blob/master/examples/lstm_text_generation.py
'''
from __future__ import print_function
import theano
from keras.models import Sequential, model_from_json
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM
from keras.datasets.data_utils import get_file
import numpy as np
import random
import sys
Using Theano backend.
import json
with open("candidate_words_dict.json") as f:
    candidate_word_dict = json.load(f)
text = []
start_indexes = []
i = 0
for v in candidate_word_dict.values():
    # each candidate's word list opens with a <START> token
    text.append("<START>")
    start_indexes.append(i)
    i += 1
    for word in v:
        if word[-1] == '.':
            # sentence-ending word: strip the period, close the sentence,
            # and open the next one
            text.append(word[:-1].lower())
            text.append("<END>")
            text.append("<START>")
            i += 3
            # record the index of the <START> token just appended
            start_indexes.append(i - 1)
        else:
            text.append(word.lower())
            i += 1
print(len(text))
131792
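## Quick peek (my addition, not in the original notebook): inspect the first few
## tokens and start indexes to confirm the <START>/<END> structure built above.
print(text[:10])
print(start_indexes[:5])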
## Get the unique tokens (words) and create dictionaries mapping token to index and index to token.
## Variable names below keep the character-level names from the borrowed script, but this model is word-level.
chars = set(text)
print('total chars:', len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))
total chars: 9198
maxlen = 5  # number of tokens per training window
step = 3    # slide the window forward 3 tokens at a time
sentences = []
next_chars = []
## i only goes up to len(text) - maxlen so each window and its next-token target stay in range
for i in range(0, len(text) - maxlen, step):
    # grab maxlen tokens starting at i; each pass moves the window forward by step
    sentences.append(text[i: i + maxlen])
    # the token immediately following the window is the prediction target
    next_chars.append(text[i + maxlen])
print('nb sequences:', len(sentences))
nb sequences: 43929
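## Illustrative sketch (not from the original notebook): the same maxlen/step
## windowing on a toy token list, showing how each window maps to its target.
toy = ['<START>', 'the', 'senator', 'said', 'today', 'that', '<END>']
for j in range(0, len(toy) - maxlen, step):
    print(toy[j:j + maxlen], '->', toy[j + maxlen])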
print('Vectorization...')
# row for each sentence, column for each token in the sentence, depth for the vocabulary
X = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
# row for each sentence's prediction, column for each possible token
y = np.zeros((len(next_chars), len(chars)), dtype=np.bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        # char_indices returns the index for a given token
        X[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1
Vectorization...
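## Sanity check (my addition): each training example is a (maxlen, vocab) one-hot
## matrix and each target a one-hot vocab vector, so given the counts printed above
## the shapes should be (43929, 5, 9198) and (43929, 9198).
print(X.shape, y.shape)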
# build the model: 2 stacked LSTMs
print('Build model...')
# once the model has been built and weights saved, both can be re-loaded:
#model = model_from_json(open('candidate_model_architecture.json').read())
#model.load_weights('candidate_lstm_weights.h5')
model = Sequential()
model.add(LSTM(512, return_sequences=True, input_shape=(maxlen, len(chars))))
model.add(Dropout(0.2))
model.add(LSTM(512, return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(len(chars)))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
Build model...
json_string = model.to_json()
open('candidate_model_architecture.json', 'w').write(json_string)
def sample(a, temperature=1.0):
    # helper function to sample an index from a probability array
    # take the log of the probabilities and scale by the temperature
    a = np.log(a) / temperature
    # convert back to probabilities (re-normalize with a softmax)
    a = np.exp(a) / np.sum(np.exp(a))
    # draw one sample from the multinomial and return its index
    return np.argmax(np.random.multinomial(1, a, 1))
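## Illustration (my addition): temperature reshapes the distribution before sampling.
## Low temperature sharpens it toward the most likely token; high temperature
## flattens it toward uniform, producing more diverse but riskier samples.
probs = np.array([0.1, 0.2, 0.7])
for temp in [0.1, 1.0, 2.0]:
    scaled = np.exp(np.log(probs) / temp)
    print(temp, scaled / np.sum(scaled))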
# train the model, output generated text after each iteration
for iteration in range(1, 4):
    print()
    print('-' * 50)
    print('Iteration', iteration)
    model.fit(X, y, batch_size=64, nb_epoch=1)
--------------------------------------------------
Iteration 1
Epoch 1/1
43929/43929 [==============================] - 605s - loss: 4.9875
--------------------------------------------------
Iteration 2
Epoch 1/1
43929/43929 [==============================] - 2801s - loss: 4.9129
--------------------------------------------------
Iteration 3
Epoch 1/1
43929/43929 [==============================] - 644s - loss: 4.8380
model.save_weights('candidate_lstm_weights_v2.h5', overwrite=True)
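## To pick the model back up later (a sketch based on the commented reload lines
## near the model definition; the filenames are the ones written in this notebook):
# model = model_from_json(open('candidate_model_architecture.json').read())
# model.load_weights('candidate_lstm_weights_v2.h5')
# model.compile(loss='categorical_crossentropy', optimizer='rmsprop')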
start_index = random.choice(start_indexes)
for diversity in [.1, .5, .75, .9, 1.0]:
    print()
    print('----- diversity:', diversity)
    generated = []
    # get a random seed sentence of the correct length for the model
    sentence = text[start_index: start_index + maxlen]
    generated.extend(sentence)
    print('----- Generating with seed: {}\n'.format(u" ".join(sentence).encode('utf-8').strip()))
    next_char = ""
    while next_char != "<END>":
        # convert the sentence into the x format the model expects:
        # (# sentences, # tokens per sentence, vocabulary size)
        x = np.zeros((1, maxlen, len(chars)))
        for t, char in enumerate(sentence):
            x[0, t, char_indices[char]] = 1.
        preds = model.predict(x, verbose=0)[0]
        next_index = sample(preds, diversity)
        next_char = indices_char[next_index]
        # generated keeps track of the text generated so far
        generated.append(next_char)
        # shift the window forward by one token for the next prediction
        sentence = sentence[1:]
        sentence.append(next_char)
    print("Resulting Sentence: {}".format(u" ".join(generated).encode('utf-8').strip()))