This code was written as part of a project to satisfy the Capstone Course requirement of the UC Berkeley Master of Information and Data Science program. Our project was completed in April 2016. For more details and to see our results, please visit our website. We look forward to your feedback!
For questions and comments about this code, please contact the author, Marguerite Oneto, by email at marguerite.oneto@ischool.berkeley.edu.
This code is based on an excellent tutorial by Denny Britz that introduces Gated Recurrent Units (GRUs). Please see the links below for more information.
Reference: http://www.wildml.com/2015/10/recurrent-neural-network-tutorial-part-4-implementing-a-grulstm-rnn-with-python-and-theano/
Reference: https://github.com/dennybritz/rnn-tutorial-gru-lstm/blob/master/gru_theano.py
Data for Unit Test: https://github.com/maoneto/W210/blob/master/Code/data/reddit-comments-2015-trunc.csv
Data for Yield Prediction Test: https://github.com/maoneto/W210/blob/master/Code/data/train_trajectories_11_images_max_mv_trunc10000.csv
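For orientation before the Theano code below: each GRU layer computes an update gate z_t, a reset gate r_t, and a candidate state c_t, then interpolates between the previous state and the candidate, s_t = (1 - z_t) * c_t + z_t * s_(t-1). Here is a minimal NumPy sketch of a single step of one layer (illustrative only; the names U, W, b mirror the per-gate parameter slices in the class below, and hard_sigmoid is the piecewise-linear approximation Theano uses):
import numpy as np
def hard_sigmoid(a):
    # Piecewise-linear approximation of the logistic sigmoid, as in T.nnet.hard_sigmoid
    return np.clip(0.2 * a + 0.5, 0.0, 1.0)
def gru_step(x_e, s_prev, U, W, b):
    z = hard_sigmoid(U[0].dot(x_e) + W[0].dot(s_prev) + b[0])  # update gate
    r = hard_sigmoid(U[1].dot(x_e) + W[1].dot(s_prev) + b[1])  # reset gate
    c = np.tanh(U[2].dot(x_e) + W[2].dot(s_prev * r) + b[2])   # candidate state
    return (1.0 - z) * c + z * s_prev                          # interpolate old and new
# Tiny smoke test with hidden_dim = 4
h = 4
U = np.random.uniform(-0.5, 0.5, (3, h, h))
W = np.random.uniform(-0.5, 0.5, (3, h, h))
b = np.zeros((3, h))
print gru_step(np.random.randn(h), np.zeros(h), U, W, b).shape  # (4,)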
%%writefile gru_theano_ut.py
import numpy as np
import theano as theano
import theano.tensor as T
from theano.gradient import grad_clip
import time
import operator
class GRUTheanoUT:
def __init__(self, word_dim, hidden_dim=128, bptt_truncate=-1):
# Assign instance variables
self.word_dim = word_dim
self.hidden_dim = hidden_dim
self.bptt_truncate = bptt_truncate
# Initialize the network parameters
E = np.random.uniform(-np.sqrt(1./word_dim), np.sqrt(1./word_dim), (hidden_dim, word_dim))
U = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (6, hidden_dim, hidden_dim))
W = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (6, hidden_dim, hidden_dim))
V = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (word_dim, hidden_dim))
b = np.zeros((6, hidden_dim))
c = np.zeros(word_dim)
# Theano: Create shared variables
self.E = theano.shared(name='E', value=E.astype(theano.config.floatX))
self.U = theano.shared(name='U', value=U.astype(theano.config.floatX))
self.W = theano.shared(name='W', value=W.astype(theano.config.floatX))
self.V = theano.shared(name='V', value=V.astype(theano.config.floatX))
self.b = theano.shared(name='b', value=b.astype(theano.config.floatX))
self.c = theano.shared(name='c', value=c.astype(theano.config.floatX))
# SGD / rmsprop: Initialize the rmsprop cache variables
self.mE = theano.shared(name='mE', value=np.zeros(E.shape).astype(theano.config.floatX))
self.mU = theano.shared(name='mU', value=np.zeros(U.shape).astype(theano.config.floatX))
self.mV = theano.shared(name='mV', value=np.zeros(V.shape).astype(theano.config.floatX))
self.mW = theano.shared(name='mW', value=np.zeros(W.shape).astype(theano.config.floatX))
self.mb = theano.shared(name='mb', value=np.zeros(b.shape).astype(theano.config.floatX))
self.mc = theano.shared(name='mc', value=np.zeros(c.shape).astype(theano.config.floatX))
# We store the Theano graph here
self.theano = {}
self.__theano_build__()
def __theano_build__(self):
E, V, U, W, b, c = self.E, self.V, self.U, self.W, self.b, self.c
x = T.ivector('x')
y = T.ivector('y')
def forward_prop_step(x_t, s_t1_prev, s_t2_prev):
# This is how we calculated the hidden state in a simple RNN. No longer!
# s_t = T.tanh(U[:,x_t] + W.dot(s_t1_prev))
# Word embedding layer
x_e = E[:,x_t]
# GRU Layer 1
z_t1 = T.nnet.hard_sigmoid(U[0].dot(x_e) + W[0].dot(s_t1_prev) + b[0])
r_t1 = T.nnet.hard_sigmoid(U[1].dot(x_e) + W[1].dot(s_t1_prev) + b[1])
c_t1 = T.tanh(U[2].dot(x_e) + W[2].dot(s_t1_prev * r_t1) + b[2])
s_t1 = (T.ones_like(z_t1) - z_t1) * c_t1 + z_t1 * s_t1_prev
# GRU Layer 2
z_t2 = T.nnet.hard_sigmoid(U[3].dot(s_t1) + W[3].dot(s_t2_prev) + b[3])
r_t2 = T.nnet.hard_sigmoid(U[4].dot(s_t1) + W[4].dot(s_t2_prev) + b[4])
c_t2 = T.tanh(U[5].dot(s_t1) + W[5].dot(s_t2_prev * r_t2) + b[5])
s_t2 = (T.ones_like(z_t2) - z_t2) * c_t2 + z_t2 * s_t2_prev
# Final output calculation
# Theano's softmax returns a matrix with one row; we only need that row
o_t = T.nnet.softmax(V.dot(s_t2) + c)[0]
return [o_t, s_t1, s_t2]
[o, s, s2], updates = theano.scan(
forward_prop_step,
sequences=x,
truncate_gradient=self.bptt_truncate,
outputs_info=[None,
dict(initial=T.zeros(self.hidden_dim)),
dict(initial=T.zeros(self.hidden_dim))])
prediction = T.argmax(o, axis=1)
o_error = T.sum(T.nnet.categorical_crossentropy(o, y))
# Total cost (could add regularization here)
cost = o_error
# Gradients
dE = T.grad(cost, E)
dU = T.grad(cost, U)
dW = T.grad(cost, W)
db = T.grad(cost, b)
dV = T.grad(cost, V)
dc = T.grad(cost, c)
# Assign functions
self.predict = theano.function([x], o)
self.predict_class = theano.function([x], prediction)
self.ce_error = theano.function([x, y], cost)
self.bptt = theano.function([x, y], [dE, dU, dW, db, dV, dc])
# SGD parameters
learning_rate = T.scalar('learning_rate')
decay = T.scalar('decay')
# rmsprop cache updates
mE = decay * self.mE + (1 - decay) * dE ** 2
mU = decay * self.mU + (1 - decay) * dU ** 2
mW = decay * self.mW + (1 - decay) * dW ** 2
mV = decay * self.mV + (1 - decay) * dV ** 2
mb = decay * self.mb + (1 - decay) * db ** 2
mc = decay * self.mc + (1 - decay) * dc ** 2
self.sgd_step = theano.function(
[x, y, learning_rate, theano.Param(decay, default=0.9)],
[],
updates=[(E, E - learning_rate * dE / T.sqrt(mE + 1e-6)),
(U, U - learning_rate * dU / T.sqrt(mU + 1e-6)),
(W, W - learning_rate * dW / T.sqrt(mW + 1e-6)),
(V, V - learning_rate * dV / T.sqrt(mV + 1e-6)),
(b, b - learning_rate * db / T.sqrt(mb + 1e-6)),
(c, c - learning_rate * dc / T.sqrt(mc + 1e-6)),
(self.mE, mE),
(self.mU, mU),
(self.mW, mW),
(self.mV, mV),
(self.mb, mb),
(self.mc, mc)
])
def calculate_total_loss(self, X, Y):
return np.sum([self.ce_error(x,y) for x,y in zip(X,Y)])
def calculate_loss(self, X, Y):
# Divide the total loss by the number of words
num_words = np.sum([len(y) for y in Y])
return self.calculate_total_loss(X,Y)/float(num_words)
Overwriting gru_theano_ut.py
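As a quick smoke test of the class (a minimal sketch, separate from the training script below; the toy vocabulary size, hidden size, and index sequences are made up):
import numpy as np
from gru_theano_ut import GRUTheanoUT
np.random.seed(10)
model = GRUTheanoUT(10, hidden_dim=5)   # tiny model so compilation and stepping are fast
x = [0, 1, 2, 3]                        # input word indices
y = [1, 2, 3, 4]                        # targets: the inputs shifted by one
print model.predict(x).shape            # (4, 10): one distribution over the vocabulary per step
print model.calculate_loss([x], [y])    # average cross-entropy per word
model.sgd_step(x, y, 0.005)             # one rmsprop/SGD update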
%%writefile utils_ut.py
import csv
import itertools
import numpy as np
import nltk
import time
import sys
import operator
import io
import array
from datetime import datetime
from gru_theano_ut import GRUTheanoUT
SENTENCE_START_TOKEN = "SENTENCE_START"
SENTENCE_END_TOKEN = "SENTENCE_END"
UNKNOWN_TOKEN = "UNKNOWN_TOKEN"
def load_data(filename, vocabulary_size=2000, min_sent_characters=0):
word_to_index = []
index_to_word = []
# Read the data and append SENTENCE_START and SENTENCE_END tokens
print("Reading CSV file...")
with open(filename, 'rt') as f:
reader = csv.reader(f, skipinitialspace=True)
reader.next()  # skip the CSV header row
# Split full comments into sentences
sentences = itertools.chain(*[nltk.sent_tokenize(x[0].decode("utf-8").lower()) for x in reader])
# Filter sentences
sentences = [s for s in sentences if len(s) >= min_sent_characters]
sentences = [s for s in sentences if "http" not in s]
# Append SENTENCE_START and SENTENCE_END
sentences = ["%s %s %s" % (SENTENCE_START_TOKEN, x, SENTENCE_END_TOKEN) for x in sentences]
print("Parsed %d sentences." % (len(sentences)))
# Tokenize the sentences into words
tokenized_sentences = [nltk.word_tokenize(sent) for sent in sentences]
# Count the word frequencies
word_freq = nltk.FreqDist(itertools.chain(*tokenized_sentences))
print("Found %d unique words tokens." % len(word_freq.items()))
# Get the most common words and build index_to_word and word_to_index vectors
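# Two vocabulary slots are reserved for the <MASK/> and UNKNOWN_TOKEN entries added below, hence vocabulary_size-2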
vocab = sorted(word_freq.items(), key=lambda x: (x[1], x[0]), reverse=True)[:vocabulary_size-2]
print("Using vocabulary size %d." % vocabulary_size)
print("The least frequent word in our vocabulary is '%s' and appeared %d times." % (vocab[-1][0], vocab[-1][1]))
sorted_vocab = sorted(vocab, key=operator.itemgetter(1))
index_to_word = ["<MASK/>", UNKNOWN_TOKEN] + [x[0] for x in sorted_vocab]
word_to_index = dict([(w, i) for i, w in enumerate(index_to_word)])
# Replace all words not in our vocabulary with the unknown token
for i, sent in enumerate(tokenized_sentences):
tokenized_sentences[i] = [w if w in word_to_index else UNKNOWN_TOKEN for w in sent]
# Create the training data
X_train = np.asarray([[word_to_index[w] for w in sent[:-1]] for sent in tokenized_sentences])
y_train = np.asarray([[word_to_index[w] for w in sent[1:]] for sent in tokenized_sentences])
return X_train, y_train, word_to_index, index_to_word
def train_with_sgd(model, X_train, y_train, learning_rate=0.001, nepoch=20, decay=0.9,
callback_every=10000, callback=None):
num_examples_seen = 0
for epoch in range(nepoch):
# For each training example...
for i in np.random.permutation(len(y_train)):
# One SGD step
model.sgd_step(X_train[i], y_train[i], learning_rate, decay)
num_examples_seen += 1
# Optionally do callback
if (callback and callback_every and num_examples_seen % callback_every == 0):
callback(model, num_examples_seen)
return model
def save_model_parameters_theano(model, outfile):
np.savez(outfile,
E=model.E.get_value(),
U=model.U.get_value(),
W=model.W.get_value(),
V=model.V.get_value(),
b=model.b.get_value(),
c=model.c.get_value())
print "Saved model parameters to %s." % outfile
def load_model_parameters_theano(path, modelClass=GRUTheanoUT):
npzfile = np.load(path)
E, U, W, V, b, c = npzfile["E"], npzfile["U"], npzfile["W"], npzfile["V"], npzfile["b"], npzfile["c"]
hidden_dim, word_dim = E.shape[0], E.shape[1]
print "Building model model from %s with hidden_dim=%d word_dim=%d" % (path, hidden_dim, word_dim)
sys.stdout.flush()
model = modelClass(word_dim, hidden_dim=hidden_dim)
model.E.set_value(E)
model.U.set_value(U)
model.W.set_value(W)
model.V.set_value(V)
model.b.set_value(b)
model.c.set_value(c)
return model
def gradient_check_theano(model, x, y, h=0.001, error_threshold=0.01):
# Overwrite the bptt_truncate attribute. We need to backpropagate all the way to get the correct gradient.
# (The Theano graph was already compiled in __init__, so this only documents intent; the default of -1 already means no truncation.)
model.bptt_truncate = 1000
# Calculate the gradients using backprop
bptt_gradients = model.bptt(x, y)
# List of all parameters we want to check.
model_parameters = ['E', 'U', 'W', 'b', 'V', 'c']
# Gradient check for each parameter
for pidx, pname in enumerate(model_parameters):
# Get the actual parameter value from the model, e.g. model.W
parameter_T = operator.attrgetter(pname)(model)
parameter = parameter_T.get_value()
print "Performing gradient check for parameter %s with size %d." % (pname, np.prod(parameter.shape))
# Iterate over each element of the parameter matrix, e.g. (0,0), (0,1), ...
it = np.nditer(parameter, flags=['multi_index'], op_flags=['readwrite'])
while not it.finished:
ix = it.multi_index
# Save the original value so we can reset it later
original_value = parameter[ix]
# Estimate the gradient using (f(x+h) - f(x-h))/(2*h)
parameter[ix] = original_value + h
parameter_T.set_value(parameter)
gradplus = model.calculate_total_loss([x],[y])
parameter[ix] = original_value - h
parameter_T.set_value(parameter)
gradminus = model.calculate_total_loss([x],[y])
estimated_gradient = (gradplus - gradminus)/(2*h)
parameter[ix] = original_value
parameter_T.set_value(parameter)
# The gradient for this parameter calculated using backpropagation
backprop_gradient = bptt_gradients[pidx][ix]
# Calculate the relative error: |x - y| / (|x| + |y|)
relative_error = np.abs(backprop_gradient - estimated_gradient)/(np.abs(backprop_gradient) + np.abs(estimated_gradient))
# If the error is too large, fail the gradient check
if relative_error > error_threshold:
print "Gradient Check ERROR: parameter=%s ix=%s" % (pname, ix)
print "+h Loss: %f" % gradplus
print "-h Loss: %f" % gradminus
print "Estimated_gradient: %f" % estimated_gradient
print "Backpropagation gradient: %f" % backprop_gradient
print "Relative Error: %f" % relative_error
return
it.iternext()
print "Gradient check for parameter %s passed." % (pname)
def print_sentence(s, index_to_word):
sentence_str = [index_to_word[x] for x in s[1:-1]]
print(" ".join(sentence_str))
sys.stdout.flush()
def generate_sentence(model, index_to_word, word_to_index, min_length=5):
# We start the sentence with the start token
new_sentence = [word_to_index[SENTENCE_START_TOKEN]]
# Repeat until we get an end token
while not new_sentence[-1] == word_to_index[SENTENCE_END_TOKEN]:
next_word_probs = model.predict(new_sentence)[-1]
samples = np.random.multinomial(1, next_word_probs)
sampled_word = np.argmax(samples)
new_sentence.append(sampled_word)
# Sometimes we get stuck if the sentence becomes too long, e.g. "........" :(
# Also, we don't want sentences with UNKNOWN_TOKEN's
if len(new_sentence) > 100 or sampled_word == word_to_index[UNKNOWN_TOKEN]:
return None
if len(new_sentence) < min_length:
return None
return new_sentence
def generate_sentences(model, n, index_to_word, word_to_index):
for i in range(n):
sent = None
while not sent:
sent = generate_sentence(model, index_to_word, word_to_index)
print_sentence(sent, index_to_word)
Overwriting utils_ut.py
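The gradient check above can be exercised on a tiny model, as in Britz's tutorial (gradient checking is very slow, so the vocabulary and hidden layer are kept small; the index sequences are illustrative):
import numpy as np
from gru_theano_ut import GRUTheanoUT
from utils_ut import gradient_check_theano
np.random.seed(10)
model = GRUTheanoUT(10, hidden_dim=5)
gradient_check_theano(model, [0, 1, 2, 3], [1, 2, 3, 4])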
#! /usr/bin/env python
import sys
import os
import time
import numpy as np
from utils_ut import *
from datetime import datetime
from gru_theano_ut import GRUTheanoUT
LEARNING_RATE = float(os.environ.get("LEARNING_RATE", "0.001"))
VOCABULARY_SIZE = int(os.environ.get("VOCABULARY_SIZE", "200"))
EMBEDDING_DIM = int(os.environ.get("EMBEDDING_DIM", "48"))
HIDDEN_DIM = int(os.environ.get("HIDDEN_DIM", "128"))
NEPOCH = int(os.environ.get("NEPOCH", "1"))
MODEL_OUTPUT_FILE = os.environ.get("MODEL_OUTPUT_FILE")
INPUT_DATA_FILE = os.environ.get("INPUT_DATA_FILE", "./data/reddit-comments-2015-trunc.csv")
PRINT_EVERY = int(os.environ.get("PRINT_EVERY", "25"))
if not MODEL_OUTPUT_FILE:
ts = datetime.now().strftime("%Y-%m-%d-%H-%M")
MODEL_OUTPUT_FILE = "GRU-%s-%s-%s-%s.dat" % (ts, VOCABULARY_SIZE, EMBEDDING_DIM, HIDDEN_DIM)
# Load data
x_train, y_train, word_to_index, index_to_word = load_data(INPUT_DATA_FILE, VOCABULARY_SIZE)
# Build model
model = GRUTheanoUT(VOCABULARY_SIZE, hidden_dim=HIDDEN_DIM, bptt_truncate=-1)
# Print SGD step time
t1 = time.time()
model.sgd_step(x_train[10], y_train[10], LEARNING_RATE)
t2 = time.time()
print "SGD Step time: %f milliseconds" % ((t2 - t1) * 1000.)
sys.stdout.flush()
# We do this every few examples to understand what's going on
def sgd_callback(model, num_examples_seen):
dt = datetime.now().isoformat()
loss = model.calculate_loss(x_train[:10000], y_train[:10000])
print("\n%s (%d)" % (dt, num_examples_seen))
print("--------------------------------------------------")
print("Loss: %f" % loss)
generate_sentences(model, 10, index_to_word, word_to_index)
save_model_parameters_theano(model, MODEL_OUTPUT_FILE)
print("\n")
sys.stdout.flush()
for epoch in range(NEPOCH):
train_with_sgd(model, x_train, y_train, learning_rate=LEARNING_RATE, nepoch=1, decay=0.9,
callback_every=PRINT_EVERY, callback=sgd_callback)
Reading CSV file...
Parsed 94 sentences.
Found 632 unique words tokens.
Using vocabulary size 200.
The least frequent word in our vocabulary is 'we' and appeared 1 times.
SGD Step time: 58.140039 milliseconds

2016-04-25T18:34:47.266062 (25)
--------------------------------------------------
Loss: 4.135619
myself SENTENCE_START not while much in fact now that there similar % second in too all 're anyone however connections think someone tone overlap to when SENTENCE_START cost que more connections in the with with que é from who . s : from his right wrong n't anyone can did n't same no lot be its so in . it back % character . work . then of each much welcome down see ? by , leia season n't like last $ . 's a $ the
Saved model parameters to GRU-2016-04-25-18-33-200-48-128.dat.

2016-04-25T18:34:48.804555 (50)
--------------------------------------------------
Loss: 3.976999
for have n't é ) from yeah leia a how where with get from : lidstrom multiply they right with award . so though v. still is yt good on writing but up on stream was season tone 2 guys on be fact . result but while riven that the similar with have or leia guys other , with end someone
Saved model parameters to GRU-2016-04-25-18-33-200-48-128.dat.

2016-04-25T18:34:50.175742 (75)
--------------------------------------------------
Loss: 3.921327
more what just the similar happens they result to get `` have but define but seeing connections yellow paul s in the would for : . other happens think it the he if SENTENCE_START good leia hit have said ?
Saved model parameters to GRU-2016-04-25-18-33-200-48-128.dat.
import math
import csv
import numpy as np
import pandas as pd
from sklearn.cross_validation import train_test_split  # renamed to sklearn.model_selection in scikit-learn >= 0.18
X = []
Y = []
with open('train_trajectories_11_images_max_mv_trunc10000.csv', 'r') as csvfile:
datareader = csv.reader(csvfile, delimiter=',')
for row in datareader:
label = row.pop() # pop the last element in the list which is the label (yield)
if float(label) != 0.0:
X.append(row)
Y.append(len(X[0])*[label]) # output at each t, o_t, is the yield
X = np.array(X).astype(np.float)
Y = np.array(Y).astype(np.float)
# Break labeled examples into train and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, random_state=22)
log_Y_train = np.log(Y_train)
log_Y_test = np.log(Y_test)
print "X_train and X_test Shape:"
print X_train.shape, X_test.shape
print "Y_train, Y_test, log_Y_train, log_Y_test, log_Y_bar Shape:"
print Y_train.shape, Y_test.shape, log_Y_train.shape, log_Y_test.shape
X_train and X_test Shape:
(8000, 7) (2000, 7)
Y_train, Y_test, log_Y_train, log_Y_test Shape:
(8000, 7) (2000, 7) (8000, 7) (2000, 7)
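Each row of X holds the seven NDVI values of one pixel trajectory, and the matching row of Y repeats that pixel's yield at every time step (see the comment in the loading loop above), so the network is asked for a yield estimate after each image. A quick way to eyeball one training pair:
print 'First trajectory:', X_train[0]
print 'Matching targets:', Y_train[0]  # the same yield repeated seven times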
%%writefile gru_theano_yp.py
#!/usr/bin/env python
import numpy as np
import theano as theano
import theano.tensor as T
from theano.gradient import grad_clip
import time
import operator
# Theano debugging / compilation flags; they must be set on theano.config (not on the theano module) to take effect
theano.config.exception_verbosity = 'high'
theano.config.mode = 'FAST_COMPILE'
theano.config.allow_gc = False
theano.config.optimizer = 'fast_compile'
theano.config.compute_test_value = 'off'
class GRUTheanoYP:
def __init__(self, x_dim, hidden_dim=128, bptt_truncate=-1):
# Assign instance variables
self.x_dim = x_dim
self.hidden_dim = hidden_dim
self.bptt_truncate = bptt_truncate
# Initialize the network parameters
E = np.random.uniform(-np.sqrt(1./x_dim), np.sqrt(1./x_dim), (hidden_dim, x_dim))
U = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (6, hidden_dim, hidden_dim))
W = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (6, hidden_dim, hidden_dim))
V = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (x_dim, hidden_dim))
b = np.zeros((6, hidden_dim, 1))  # biases are column vectors to match the (hidden_dim, 1) hidden states below
c = np.zeros((x_dim, 1))
# Theano: Create shared variables
self.E = theano.shared(name='E', value=E.astype(theano.config.floatX))
self.U = theano.shared(name='U', value=U.astype(theano.config.floatX))
self.W = theano.shared(name='W', value=W.astype(theano.config.floatX))
self.V = theano.shared(name='V', value=V.astype(theano.config.floatX))
self.b = theano.shared(name='b', value=b.astype(theano.config.floatX))
self.c = theano.shared(name='c', value=c.astype(theano.config.floatX))
# SGD / rmsprop: Initialize the rmsprop cache variables
self.mE = theano.shared(name='mE', value=np.zeros(E.shape).astype(theano.config.floatX))
self.mU = theano.shared(name='mU', value=np.zeros(U.shape).astype(theano.config.floatX))
self.mV = theano.shared(name='mV', value=np.zeros(V.shape).astype(theano.config.floatX))
self.mW = theano.shared(name='mW', value=np.zeros(W.shape).astype(theano.config.floatX))
self.mb = theano.shared(name='mb', value=np.zeros(b.shape).astype(theano.config.floatX))
self.mc = theano.shared(name='mc', value=np.zeros(c.shape).astype(theano.config.floatX))
# We store the Theano graph here
self.theano = {}
self.__theano_build__()
def __theano_build__(self):
E, V, U, W, b, c = self.E, self.V, self.U, self.W, self.b, self.c
x = T.vector('x')
y = T.vector('y')
def forward_prop_step(x_t, s_t1_prev, s_t2_prev):
# This is how we calculated the hidden state in a simple RNN. No longer!
# s_t = T.tanh(U[:,x_t] + W.dot(s_t1_prev))
# Embedding layer: x_t is the NDVI value of the pixel
x_e = T.mul(E, x_t)
# GRU Layer 1
z_t1 = T.nnet.hard_sigmoid(T.dot(U[0], x_e) + T.dot(W[0], s_t1_prev) + b[0])
r_t1 = T.nnet.hard_sigmoid(T.dot(U[1], x_e) + T.dot(W[1], s_t1_prev) + b[1])
c_t1 = T.tanh(T.dot(U[2], x_e) + W[2].dot(s_t1_prev * r_t1) + b[2])
s_t1 = (T.ones_like(z_t1) - z_t1) * c_t1 + z_t1 * s_t1_prev
# For debugging ...
# print T.shape(s_t1).eval({x_t: 0.8888, s_t1_prev: np.asarray([[0], [0], [0], [0], [0]])})
# GRU Layer 2
z_t2 = T.nnet.hard_sigmoid(U[3].dot(s_t1) + W[3].dot(s_t2_prev) + b[3])
r_t2 = T.nnet.hard_sigmoid(U[4].dot(s_t1) + W[4].dot(s_t2_prev) + b[4])
c_t2 = T.tanh(U[5].dot(s_t1) + W[5].dot(s_t2_prev * r_t2) + b[5])
s_t2 = (T.ones_like(z_t2) - z_t2) * c_t2 + z_t2 * s_t2_prev
# Final output calculation
# Theano's softmax returns a matrix with one row; we would only need that row
# o_t = T.nnet.softmax(V.dot(s_t2) + c)[0]
# We do not use softmax because we are predicting a continuous variable, not doing classification:
o_t = (V.dot(s_t2) + c)[0]
return [o_t, s_t1, s_t2]
[o, s, s2], updates = theano.scan(
forward_prop_step,
sequences=[x],
truncate_gradient=self.bptt_truncate,
outputs_info=[None,
dict(initial=T.zeros_like(E)),
dict(initial=T.zeros_like(E))])
# prediction = T.argmax(o, axis=1)
# o_error = T.sum(T.nnet.categorical_crossentropy(o, y))
# Again, we are predicting a continuous variable, not doing classification.
# We use the sum of squared errors (SSE) as our cost function:
prediction = o
o_error = T.sum(T.sqr(o - T.reshape(y, [T.shape(y)[0],1,1], ndim=3)))
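# Shape note: o is (seq_len, 1) and the reshaped y is (seq_len, 1, 1), so the difference
# broadcasts to (seq_len, seq_len, 1). Because y repeats the same yield at every time
# step, the sum works out to the per-step SSE scaled by the sequence length.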
# Total cost (could add regularization here)
cost = o_error
# Gradients
dE = T.grad(cost, E)
dU = T.grad(cost, U)
dW = T.grad(cost, W)
db = T.grad(cost, b)
dV = T.grad(cost, V)
dc = T.grad(cost, c)
# Assign functions
self.predict = theano.function([x], o)
# We do not need this because we are not doing classification
# self.predict_class = theano.function([x], prediction)
self.sse_error = theano.function([x, y], cost)
self.bptt = theano.function([x, y], [dU, dW, db, dV, dc])
# SGD parameters
learning_rate = T.scalar('learning_rate')
decay = T.scalar('decay')
# rmsprop cache updates
mE = decay * self.mE + (1 - decay) * dE ** 2
mU = decay * self.mU + (1 - decay) * dU ** 2
mW = decay * self.mW + (1 - decay) * dW ** 2
mV = decay * self.mV + (1 - decay) * dV ** 2
mb = decay * self.mb + (1 - decay) * db ** 2
mc = decay * self.mc + (1 - decay) * dc ** 2
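# Note: dE and mE are computed above but E is not included in the updates below,
# so the embedding scaling E keeps its random initialization in this variant.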
self.sgd_step = theano.function(
[x, y, learning_rate, theano.Param(decay, default=0.9)],
[],
updates=[(U, U - learning_rate * dU / T.sqrt(mU + 1e-6)),
(W, W - learning_rate * dW / T.sqrt(mW + 1e-6)),
(V, V - learning_rate * dV / T.sqrt(mV + 1e-6)),
(b, b - learning_rate * db / T.sqrt(mb + 1e-6)),
(c, c - learning_rate * dc / T.sqrt(mc + 1e-6)),
(self.mU, mU),
(self.mW, mW),
(self.mV, mV),
(self.mb, mb),
(self.mc, mc)
])
def calculate_total_loss(self, X, Y):
return np.sum([self.sse_error(x,y) for x,y in zip(X,Y)])
def calculate_loss(self, X, Y):
# Divide the total loss by the total number of predicted values (time steps across all examples)
num_examples = np.sum([len(y) for y in Y])
return self.calculate_total_loss(X,Y)/float(num_examples)
Overwriting gru_theano_yp.py
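A quick shape check on the regression variant before committing to the full training run (a minimal sketch; the toy NDVI sequence and yield are made up, and the inputs assume Theano's default float64 floatX):
import numpy as np
from gru_theano_yp import GRUTheanoYP
model = GRUTheanoYP(x_dim=1, hidden_dim=8)
x = np.array([0.41, 0.52, 0.63, 0.71, 0.66, 0.58, 0.47])  # one NDVI trajectory
y = np.array(7 * [2.3])                                   # the yield, repeated at each step
print model.predict(x).shape  # (7, 1): one yield estimate per time step
print model.sse_error(x, y)
model.sgd_step(x, y, 0.01, 0.9)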
%%writefile utils_yp.py
import csv
import itertools
import numpy as np
import nltk
import time
import sys
import operator
import io
import array
from datetime import datetime
from gru_theano_yp import GRUTheanoYP
def shuffle_data(p, X, y):
# shuffle it
shuffle = np.random.permutation(np.arange(X.shape[0]))
X, y = X[shuffle], y[shuffle]
# divide
n_train = int(np.round(X.shape[0]*p))  # cast to int so it can be used as a slice index
return X[:n_train], y[:n_train], X[n_train:], y[n_train:]
# Used to shrink the size of the training and test datasets for debugging
def shuffle_data2(X, y, n_train, n_test):
# shuffle it
shuffle = np.random.permutation(np.arange(X.shape[0]))
X, y = X[shuffle], y[shuffle]
# divide
return X[:n_train], y[:n_train], X[n_train:n_train + n_test], y[n_train:n_train + n_test]
def save_model_parameters_theano(model, outfile):
np.savez(outfile,
E=model.E.get_value(),
U=model.U.get_value(),
W=model.W.get_value(),
V=model.V.get_value(),
b=model.b.get_value(),
c=model.c.get_value())
# print "Saved model parameters to %s." % outfile
def load_model_parameters_theano(path, modelClass=GRUTheanoYP):
npzfile = np.load(path)
E, U, W, V, b, c = npzfile["E"], npzfile["U"], npzfile["W"], npzfile["V"], npzfile["b"], npzfile["c"]
hidden_dim, x_dim = E.shape[0], E.shape[1]
print "Building model from %s with hidden_dim=%d x_dim=%d" % (path, hidden_dim, x_dim)
sys.stdout.flush()
model = modelClass(x_dim, hidden_dim=hidden_dim)
model.E.set_value(E)
model.U.set_value(U)
model.W.set_value(W)
model.V.set_value(V)
model.b.set_value(b)
model.c.set_value(c)
return model
Overwriting utils_yp.py
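A quick sanity check of shuffle_data on toy arrays (illustrative only):
import numpy as np
from utils_yp import shuffle_data
Xd = np.arange(20).reshape(10, 2)
yd = np.arange(10)
tr_X, tr_y, va_X, va_y = shuffle_data(0.9, Xd, yd)
print tr_X.shape, va_X.shape  # (9, 2) (1, 2)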
#! /usr/bin/env python
# Make sure the latest code updates are loaded
%reload_ext autoreload
%autoreload 2
import sys
import os
import time
import numpy as np
from utils_yp import *
from datetime import datetime
from gru_theano_yp import GRUTheanoYP
LEARNING_RATE = float(os.environ.get("LEARNING_RATE", "0.01"))
DECAY_RATE = float(os.environ.get("DECAY_RATE", "0.9"))
X_DIM = int(os.environ.get("X_DIM", "1")) # number of features in the x_t vector
HIDDEN_DIM = int(os.environ.get("HIDDEN_DIM", "128"))
NEPOCH = int(os.environ.get("NEPOCH", "100"))
def train_with_sgd(model, X_train, y_train, X_test, y_test, nepoch=20, learning_rate=0.001, decay=0.9):
# Set start of training time
start_time = time.time()
# Set model time
modeltime = datetime.now().strftime("%Y%m%d%H%M%S")
print 'Epochs: %d, TrainingSamples: %s, ModelTime: %s' %(nepoch, y_train.shape[0], modeltime)
# Set epoch variables
min_loss = 100000
train_losses = []
val_losses = []
num_examples_seen = 0
for epoch in range(nepoch):
epoch_start = time.time()
# Divide the data
train_data, train_labels, val_data, val_labels = shuffle_data(0.9, X_train, y_train) # Use this for full sample training
# train_data, train_labels, val_data, val_labels = shuffle_data2(X_train, y_train, 10000, 1000) # Use this for testing and debugging
# For each training example...
for i in range(len(train_data)):
# One SGD step
model.sgd_step(train_data[i], train_labels[i], learning_rate, decay)
num_examples_seen += 1
epoch_time = time.time() - epoch_start
# Calculate training loss
train_loss = model.calculate_loss(train_data, train_labels)
train_losses.append((num_examples_seen, train_loss))
# Calculate validation loss, R-squared, and RMSE
val_loss = model.calculate_loss(val_data, val_labels)
val_losses.append((num_examples_seen, val_loss))
val_predictions = []
for i in range(len(val_data)):
val_predictions.append(model.predict(val_data[i]))
val_predictions = np.reshape(val_predictions, (len(val_predictions), len(val_predictions[0])))
val_R_squared = 1 - np.sum(np.square(val_predictions - val_labels))/np.sum(np.square(val_labels - np.mean(val_labels)))
val_rmse = np.sqrt(np.mean(np.square(val_predictions - val_labels)))
# If validation loss is a new minimum, save predictions and model
is_saved = ''
if val_loss < min_loss:
min_loss = val_loss
# Make and save predictions
X_test_predictions = []
for i in range(len(X_test)):
X_test_predictions.append(np.append(model.predict(X_test[i]).reshape(len(X_test[i])), float(y_test[i][0])))
predictions_and_labels = np.asarray(X_test_predictions)
filename = "./predictions/GRUs/pred-%s.txt" %(modeltime)
np.savetxt(filename, predictions_and_labels, fmt='%.18f', delimiter=',')
# Save model parameters
filename = "./models/GRUs/GRU-%s.npz" % (modeltime)
save_model_parameters_theano(model, filename)
is_saved = '(saved)'
# Print epoch stats
print '%d) ExamplesSeen=%d, TrainLoss=%.4f, ValLoss=%.4f, ValR2=%.4f, ValRMSE=%.4f, TrainTime=%.2f min, EndTime=%s %s' %(epoch+1, num_examples_seen, train_loss, val_loss, val_R_squared, val_rmse, epoch_time/60, time.strftime("%I:%M:%S"), is_saved)
print 'Finished. Total train time = %.2f hours' %((time.time() - start_time)/3600)
return train_losses, val_losses
# Start Time
print 'Start Time: %s' %(datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
# Build model
model = GRUTheanoYP(x_dim=X_DIM, hidden_dim=HIDDEN_DIM, bptt_truncate=-1)
# Measure and Print SGD step time
t1 = time.time()
model.sgd_step(X_train[10], Y_train[10], LEARNING_RATE, DECAY_RATE)
t2 = time.time()
print "SGD Step time: %f milliseconds" % ((t2 - t1) * 1000.)
# Train the model
train_losses, val_losses = train_with_sgd(model, X_train, log_Y_train, X_test, log_Y_test, nepoch=NEPOCH, learning_rate=LEARNING_RATE, decay=DECAY_RATE)
#End Time
print 'End Time: %s' %(datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
Start Time: 2016-04-25 18:40:06
SGD Step time: 25.578022 milliseconds
Epochs: 100, TrainingSamples: 8000, ModelTime: 20160425184114
1) ExamplesSeen=7200, TrainLoss=0.9803, ValLoss=0.9588, ValR2=-0.6773, ValRMSE=0.3701, TrainTime=2.59 min, EndTime=06:43:57 (saved)
2) ExamplesSeen=14400, TrainLoss=0.6720, ValLoss=0.6408, ValR2=-0.0658, ValRMSE=0.3026, TrainTime=2.99 min, EndTime=06:47:04 (saved)
3) ExamplesSeen=21600, TrainLoss=0.6137, ValLoss=0.5375, ValR2=-0.0164, ValRMSE=0.2771, TrainTime=3.05 min, EndTime=06:50:14 (saved)
4) ExamplesSeen=28800, TrainLoss=0.6190, ValLoss=0.6038, ValR2=-0.0367, ValRMSE=0.2937, TrainTime=2.97 min, EndTime=06:53:19
5) ExamplesSeen=36000, TrainLoss=0.6061, ValLoss=0.6145, ValR2=-0.0263, ValRMSE=0.2963, TrainTime=3.05 min, EndTime=06:56:28
6) ExamplesSeen=43200, TrainLoss=0.6232, ValLoss=0.6342, ValR2=-0.0574, ValRMSE=0.3010, TrainTime=2.99 min, EndTime=06:59:33
7) ExamplesSeen=50400, TrainLoss=0.6122, ValLoss=0.5663, ValR2=-0.0163, ValRMSE=0.2844, TrainTime=3.16 min, EndTime=07:02:49
8) ExamplesSeen=57600, TrainLoss=1.3032, ValLoss=1.4270, ValR2=-1.0802, ValRMSE=0.4515, TrainTime=3.20 min, EndTime=07:06:07
9) ExamplesSeen=64800, TrainLoss=0.7038, ValLoss=0.7538, ValR2=-0.2333, ValRMSE=0.3282, TrainTime=3.17 min, EndTime=07:09:23
10) ExamplesSeen=72000, TrainLoss=2.8250, ValLoss=2.8096, ValR2=-3.7463, ValRMSE=0.6335, TrainTime=3.17 min, EndTime=07:12:39
11) ExamplesSeen=79200, TrainLoss=0.6055, ValLoss=0.6124, ValR2=-0.0283, ValRMSE=0.2958, TrainTime=3.16 min, EndTime=07:15:55
12) ExamplesSeen=86400, TrainLoss=0.6043, ValLoss=0.6098, ValR2=-0.0069, ValRMSE=0.2951, TrainTime=3.16 min, EndTime=07:19:11
13) ExamplesSeen=93600, TrainLoss=0.6268, ValLoss=0.6937, ValR2=-0.0511, ValRMSE=0.3148, TrainTime=3.17 min, EndTime=07:22:27
14) ExamplesSeen=100800, TrainLoss=0.6992, ValLoss=0.6789, ValR2=-0.2196, ValRMSE=0.3114, TrainTime=3.16 min, EndTime=07:25:43
15) ExamplesSeen=108000, TrainLoss=2.2451, ValLoss=2.2353, ValR2=-2.9077, ValRMSE=0.5651, TrainTime=3.16 min, EndTime=07:28:59
16) ExamplesSeen=115200, TrainLoss=1.1983, ValLoss=1.2674, ValR2=-1.0014, ValRMSE=0.4255, TrainTime=3.29 min, EndTime=07:32:23
17) ExamplesSeen=122400, TrainLoss=0.6704, ValLoss=0.6520, ValR2=-0.1228, ValRMSE=0.3052, TrainTime=3.40 min, EndTime=07:35:54
18) ExamplesSeen=129600, TrainLoss=1.0103, ValLoss=0.9660, ValR2=-0.6862, ValRMSE=0.3715, TrainTime=3.44 min, EndTime=07:39:27
19) ExamplesSeen=136800, TrainLoss=1.3847, ValLoss=1.4458, ValR2=-1.4236, ValRMSE=0.4545, TrainTime=3.43 min, EndTime=07:43:00
20) ExamplesSeen=144000, TrainLoss=0.7284, ValLoss=0.6904, ValR2=-0.1877, ValRMSE=0.3140, TrainTime=3.47 min, EndTime=07:46:34
21) ExamplesSeen=151200, TrainLoss=0.6332, ValLoss=0.6367, ValR2=-0.0754, ValRMSE=0.3016, TrainTime=3.47 min, EndTime=07:50:09
22) ExamplesSeen=158400, TrainLoss=0.6326, ValLoss=0.6213, ValR2=-0.0619, ValRMSE=0.2979, TrainTime=3.44 min, EndTime=07:53:43
23) ExamplesSeen=165600, TrainLoss=0.6200, ValLoss=0.6160, ValR2=-0.0414, ValRMSE=0.2966, TrainTime=3.38 min, EndTime=07:57:12
24) ExamplesSeen=172800, TrainLoss=0.7124, ValLoss=0.7279, ValR2=-0.2801, ValRMSE=0.3225, TrainTime=3.35 min, EndTime=08:00:40
25) ExamplesSeen=180000, TrainLoss=0.6362, ValLoss=0.6619, ValR2=-0.0590, ValRMSE=0.3075, TrainTime=3.32 min, EndTime=08:04:05
26) ExamplesSeen=187200, TrainLoss=0.7171, ValLoss=0.6968, ValR2=-0.1671, ValRMSE=0.3155, TrainTime=3.37 min, EndTime=08:07:34
27) ExamplesSeen=194400, TrainLoss=0.8331, ValLoss=0.8048, ValR2=-0.3041, ValRMSE=0.3391, TrainTime=3.39 min, EndTime=08:11:04
28) ExamplesSeen=201600, TrainLoss=0.6768, ValLoss=0.6649, ValR2=-0.1226, ValRMSE=0.3082, TrainTime=3.40 min, EndTime=08:14:35
29) ExamplesSeen=208800, TrainLoss=0.7954, ValLoss=0.8067, ValR2=-0.3481, ValRMSE=0.3395, TrainTime=3.27 min, EndTime=08:17:57
30) ExamplesSeen=216000, TrainLoss=0.6135, ValLoss=0.6470, ValR2=-0.0390, ValRMSE=0.3040, TrainTime=3.25 min, EndTime=08:21:18
31) ExamplesSeen=223200, TrainLoss=0.6334, ValLoss=0.5841, ValR2=-0.0458, ValRMSE=0.2889, TrainTime=3.28 min, EndTime=08:24:41
32) ExamplesSeen=230400, TrainLoss=0.6509, ValLoss=0.5885, ValR2=-0.0696, ValRMSE=0.2899, TrainTime=3.28 min, EndTime=08:28:04
33) ExamplesSeen=237600, TrainLoss=1.0427, ValLoss=1.0250, ValR2=-0.7392, ValRMSE=0.3827, TrainTime=3.23 min, EndTime=08:31:23
34) ExamplesSeen=244800, TrainLoss=0.6964, ValLoss=0.7240, ValR2=-0.1709, ValRMSE=0.3216, TrainTime=3.18 min, EndTime=08:34:40
35) ExamplesSeen=252000, TrainLoss=0.6814, ValLoss=0.6839, ValR2=-0.1253, ValRMSE=0.3126, TrainTime=3.18 min, EndTime=08:37:57
36) ExamplesSeen=259200, TrainLoss=0.8927, ValLoss=0.8359, ValR2=-0.4416, ValRMSE=0.3456, TrainTime=3.17 min, EndTime=08:41:14
37) ExamplesSeen=266400, TrainLoss=0.6187, ValLoss=0.6161, ValR2=-0.0509, ValRMSE=0.2967, TrainTime=3.16 min, EndTime=08:44:29
38) ExamplesSeen=273600, TrainLoss=0.6687, ValLoss=0.6684, ValR2=-0.0980, ValRMSE=0.3090, TrainTime=3.19 min, EndTime=08:47:46
39) ExamplesSeen=280800, TrainLoss=1.2828, ValLoss=1.2485, ValR2=-1.1082, ValRMSE=0.4223, TrainTime=3.26 min, EndTime=08:51:08
40) ExamplesSeen=288000, TrainLoss=0.5997, ValLoss=0.6388, ValR2=-0.0104, ValRMSE=0.3021, TrainTime=3.25 min, EndTime=08:54:29
41) ExamplesSeen=295200, TrainLoss=0.6340, ValLoss=0.6536, ValR2=-0.0714, ValRMSE=0.3056, TrainTime=3.26 min, EndTime=08:57:50
42) ExamplesSeen=302400, TrainLoss=0.6020, ValLoss=0.6100, ValR2=-0.0068, ValRMSE=0.2952, TrainTime=3.24 min, EndTime=09:01:11
43) ExamplesSeen=309600, TrainLoss=0.6860, ValLoss=0.6976, ValR2=-0.2293, ValRMSE=0.3157, TrainTime=3.27 min, EndTime=09:04:33
44) ExamplesSeen=316800, TrainLoss=1.0441, ValLoss=1.0488, ValR2=-0.7068, ValRMSE=0.3871, TrainTime=3.21 min, EndTime=09:07:52
45) ExamplesSeen=324000, TrainLoss=0.7382, ValLoss=0.7020, ValR2=-0.2234, ValRMSE=0.3167, TrainTime=3.21 min, EndTime=09:11:11
46) ExamplesSeen=331200, TrainLoss=0.7713, ValLoss=0.7931, ValR2=-0.3101, ValRMSE=0.3366, TrainTime=3.31 min, EndTime=09:14:35
47) ExamplesSeen=338400, TrainLoss=0.6280, ValLoss=0.6366, ValR2=-0.0459, ValRMSE=0.3016, TrainTime=3.26 min, EndTime=09:17:57
48) ExamplesSeen=345600, TrainLoss=0.6144, ValLoss=0.6033, ValR2=-0.0279, ValRMSE=0.2936, TrainTime=3.26 min, EndTime=09:21:18
49) ExamplesSeen=352800, TrainLoss=0.6724, ValLoss=0.6476, ValR2=-0.1247, ValRMSE=0.3042, TrainTime=3.27 min, EndTime=09:24:40
50) ExamplesSeen=360000, TrainLoss=0.6977, ValLoss=0.6236, ValR2=-0.1735, ValRMSE=0.2985, TrainTime=3.27 min, EndTime=09:28:03
51) ExamplesSeen=367200, TrainLoss=0.9264, ValLoss=0.9168, ValR2=-0.5670, ValRMSE=0.3619, TrainTime=3.26 min, EndTime=09:31:24
52) ExamplesSeen=374400, TrainLoss=0.7697, ValLoss=0.7399, ValR2=-0.2403, ValRMSE=0.3251, TrainTime=3.27 min, EndTime=09:34:46
53) ExamplesSeen=381600, TrainLoss=0.6423, ValLoss=0.6454, ValR2=-0.0717, ValRMSE=0.3037, TrainTime=3.24 min, EndTime=09:38:06
54) ExamplesSeen=388800, TrainLoss=0.6948, ValLoss=0.6729, ValR2=-0.1693, ValRMSE=0.3100, TrainTime=3.24 min, EndTime=09:41:27
55) ExamplesSeen=396000, TrainLoss=0.6071, ValLoss=0.5962, ValR2=-0.0138, ValRMSE=0.2919, TrainTime=3.21 min, EndTime=09:44:45
56) ExamplesSeen=403200, TrainLoss=1.0773, ValLoss=1.1789, ValR2=-0.8432, ValRMSE=0.4104, TrainTime=3.24 min, EndTime=09:48:06
57) ExamplesSeen=410400, TrainLoss=0.6226, ValLoss=0.6389, ValR2=-0.0411, ValRMSE=0.3021, TrainTime=3.24 min, EndTime=09:51:26
58) ExamplesSeen=417600, TrainLoss=0.6177, ValLoss=0.6190, ValR2=-0.0320, ValRMSE=0.2974, TrainTime=3.23 min, EndTime=09:54:46
59) ExamplesSeen=424800, TrainLoss=0.6126, ValLoss=0.5951, ValR2=-0.0216, ValRMSE=0.2916, TrainTime=3.27 min, EndTime=09:58:08
60) ExamplesSeen=432000, TrainLoss=0.8821, ValLoss=0.8633, ValR2=-0.4670, ValRMSE=0.3512, TrainTime=3.28 min, EndTime=10:01:31
61) ExamplesSeen=439200, TrainLoss=0.7365, ValLoss=0.7636, ValR2=-0.2666, ValRMSE=0.3303, TrainTime=3.29 min, EndTime=10:04:54
62) ExamplesSeen=446400, TrainLoss=0.6013, ValLoss=0.6393, ValR2=-0.0102, ValRMSE=0.3022, TrainTime=3.29 min, EndTime=10:08:18
63) ExamplesSeen=453600, TrainLoss=0.6398, ValLoss=0.6732, ValR2=-0.0812, ValRMSE=0.3101, TrainTime=3.27 min, EndTime=10:11:40
64) ExamplesSeen=460800, TrainLoss=0.6538, ValLoss=0.6511, ValR2=-0.0647, ValRMSE=0.3050, TrainTime=3.29 min, EndTime=10:15:03
65) ExamplesSeen=468000, TrainLoss=0.6681, ValLoss=0.6425, ValR2=-0.1089, ValRMSE=0.3030, TrainTime=3.30 min, EndTime=10:18:27
66) ExamplesSeen=475200, TrainLoss=0.7962, ValLoss=0.8180, ValR2=-0.3240, ValRMSE=0.3418, TrainTime=3.13 min, EndTime=10:21:40
67) ExamplesSeen=482400, TrainLoss=1.0521, ValLoss=1.1469, ValR2=-0.6872, ValRMSE=0.4048, TrainTime=3.13 min, EndTime=10:24:54
68) ExamplesSeen=489600, TrainLoss=0.6255, ValLoss=0.6121, ValR2=-0.0292, ValRMSE=0.2957, TrainTime=3.24 min, EndTime=10:28:15
69) ExamplesSeen=496800, TrainLoss=0.6116, ValLoss=0.6102, ValR2=-0.0250, ValRMSE=0.2953, TrainTime=3.28 min, EndTime=10:31:38
70) ExamplesSeen=504000, TrainLoss=0.6812, ValLoss=0.6912, ValR2=-0.1381, ValRMSE=0.3142, TrainTime=3.28 min, EndTime=10:35:00
71) ExamplesSeen=511200, TrainLoss=0.6169, ValLoss=0.6332, ValR2=-0.0259, ValRMSE=0.3008, TrainTime=3.23 min, EndTime=10:38:20
72) ExamplesSeen=518400, TrainLoss=0.6727, ValLoss=0.6522, ValR2=-0.1497, ValRMSE=0.3052, TrainTime=3.28 min, EndTime=10:41:43
73) ExamplesSeen=525600, TrainLoss=0.6210, ValLoss=0.5830, ValR2=-0.0539, ValRMSE=0.2886, TrainTime=3.26 min, EndTime=10:45:04
74) ExamplesSeen=532800, TrainLoss=0.7657, ValLoss=0.7392, ValR2=-0.1842, ValRMSE=0.3250, TrainTime=3.27 min, EndTime=10:48:27
75) ExamplesSeen=540000, TrainLoss=0.6036, ValLoss=0.6139, ValR2=-0.0132, ValRMSE=0.2961, TrainTime=3.25 min, EndTime=10:51:48
76) ExamplesSeen=547200, TrainLoss=0.6061, ValLoss=0.5781, ValR2=-0.0124, ValRMSE=0.2874, TrainTime=3.24 min, EndTime=10:55:08
77) ExamplesSeen=554400, TrainLoss=0.6695, ValLoss=0.6484, ValR2=-0.0948, ValRMSE=0.3043, TrainTime=3.23 min, EndTime=10:58:28
78) ExamplesSeen=561600, TrainLoss=0.7782, ValLoss=0.7547, ValR2=-0.2326, ValRMSE=0.3283, TrainTime=3.31 min, EndTime=11:01:53
79) ExamplesSeen=568800, TrainLoss=0.6107, ValLoss=0.5860, ValR2=-0.0168, ValRMSE=0.2893, TrainTime=3.26 min, EndTime=11:05:15
80) ExamplesSeen=576000, TrainLoss=0.7498, ValLoss=0.7782, ValR2=-0.2770, ValRMSE=0.3334, TrainTime=3.27 min, EndTime=11:08:37
81) ExamplesSeen=583200, TrainLoss=0.6121, ValLoss=0.6348, ValR2=-0.0235, ValRMSE=0.3011, TrainTime=3.29 min, EndTime=11:12:00
82) ExamplesSeen=590400, TrainLoss=0.8285, ValLoss=0.8519, ValR2=-0.4161, ValRMSE=0.3488, TrainTime=3.27 min, EndTime=11:15:23
83) ExamplesSeen=597600, TrainLoss=0.6011, ValLoss=0.6011, ValR2=-0.0111, ValRMSE=0.2930, TrainTime=3.27 min, EndTime=11:18:45
84) ExamplesSeen=604800, TrainLoss=0.6761, ValLoss=0.6469, ValR2=-0.1276, ValRMSE=0.3040, TrainTime=3.28 min, EndTime=11:22:08
85) ExamplesSeen=612000, TrainLoss=0.6434, ValLoss=0.6398, ValR2=-0.0678, ValRMSE=0.3023, TrainTime=3.27 min, EndTime=11:25:30
86) ExamplesSeen=619200, TrainLoss=0.6087, ValLoss=0.5794, ValR2=-0.0126, ValRMSE=0.2877, TrainTime=3.29 min, EndTime=11:28:54
87) ExamplesSeen=626400, TrainLoss=0.6064, ValLoss=0.6047, ValR2=-0.0084, ValRMSE=0.2939, TrainTime=3.26 min, EndTime=11:32:16
88) ExamplesSeen=633600, TrainLoss=0.8032, ValLoss=0.7420, ValR2=-0.2693, ValRMSE=0.3256, TrainTime=3.23 min, EndTime=11:35:36
89) ExamplesSeen=640800, TrainLoss=0.6477, ValLoss=0.6773, ValR2=-0.1558, ValRMSE=0.3110, TrainTime=3.27 min, EndTime=11:38:58
90) ExamplesSeen=648000, TrainLoss=0.6927, ValLoss=0.7621, ValR2=-0.2232, ValRMSE=0.3300, TrainTime=3.26 min, EndTime=11:42:19
91) ExamplesSeen=655200, TrainLoss=0.6946, ValLoss=0.6593, ValR2=-0.1533, ValRMSE=0.3069, TrainTime=3.20 min, EndTime=11:45:37
92) ExamplesSeen=662400, TrainLoss=0.6913, ValLoss=0.6071, ValR2=-0.1556, ValRMSE=0.2945, TrainTime=3.27 min, EndTime=11:49:00
93) ExamplesSeen=669600, TrainLoss=0.6099, ValLoss=0.6568, ValR2=-0.0314, ValRMSE=0.3063, TrainTime=3.30 min, EndTime=11:52:23
94) ExamplesSeen=676800, TrainLoss=0.6166, ValLoss=0.6084, ValR2=-0.0296, ValRMSE=0.2948, TrainTime=3.26 min, EndTime=11:55:45
95) ExamplesSeen=684000, TrainLoss=0.6232, ValLoss=0.6107, ValR2=-0.0323, ValRMSE=0.2954, TrainTime=3.26 min, EndTime=11:59:07
96) ExamplesSeen=691200, TrainLoss=0.6043, ValLoss=0.5982, ValR2=-0.0079, ValRMSE=0.2923, TrainTime=3.29 min, EndTime=12:02:30
97) ExamplesSeen=698400, TrainLoss=0.6155, ValLoss=0.5462, ValR2=-0.0102, ValRMSE=0.2793, TrainTime=3.28 min, EndTime=12:05:53
98) ExamplesSeen=705600, TrainLoss=0.6149, ValLoss=0.6126, ValR2=-0.0152, ValRMSE=0.2958, TrainTime=3.28 min, EndTime=12:09:16
99) ExamplesSeen=712800, TrainLoss=0.7542, ValLoss=0.7707, ValR2=-0.2946, ValRMSE=0.3318, TrainTime=3.29 min, EndTime=12:12:40
100) ExamplesSeen=720000, TrainLoss=0.6116, ValLoss=0.5889, ValR2=-0.0234, ValRMSE=0.2900, TrainTime=3.33 min, EndTime=12:16:06
Finished. Total train time = 5.58 hours
End Time: 2016-04-26 00:16:06
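# Baseline: predict the mean training yield for every test example and score it with the same metrics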
Y_train_1d = Y_train[:,0]
y_hat = len(Y_test)*[np.mean(Y_train_1d)]
y_test = Y_test[:,0]
y_hat = np.array(y_hat).astype(np.float)
y_test = np.array(y_test).astype(np.float)
y_bar = np.mean(y_test)
R_squared = 1 - np.sum(np.square(y_hat - y_test))/np.sum(np.square(y_test - y_bar))
mae = np.mean(np.abs(y_hat - y_test))
rmse = np.sqrt(np.mean(np.square(y_hat - y_test)))
mape = np.mean(abs(np.divide((y_hat - y_test), y_test)))
print 'R-Squared: %.4f, MAE: %.4f, RMSE: %.4f, MAPE: %.2f%s' %(R_squared, mae, rmse, mape*100, '%')
R-Squared: -0.0002, MAE: 0.7594, RMSE: 0.9684, MAPE: 26.20%
These are the predictions on a holdout sample (X_test above) using the model with the lowest validation loss.
y_hat = []
y_test = []
with open('./predictions/GRUs/pred-20160425154620.txt', 'r') as csvfile:
datareader = csv.reader(csvfile, delimiter=',')
for row in datareader:
label = row.pop()
y_hat.append(row)
y_test.append(label)
# Results keeping predictions as logs of yield
y_hat = np.array(y_hat).astype(np.float)
y_test = np.array(y_test).astype(np.float)
y_bar = np.mean(y_test)
R_squared_list_tr = []
mae_list_tr = []
rmse_list_tr = []
mape_list_tr = []
print 'Log-Transformed Yield Results:'
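# Column j of y_hat holds the prediction made at time step j; the labels count back from
# the final step, so t-0 is the prediction after the last image in the trajectory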
for j in range(len(y_hat[0])):
y_hat_last = y_hat[:, j]
R_squared = 1 - np.sum(np.square(y_hat_last - y_test))/np.sum(np.square(y_test - y_bar))
mae = np.mean(np.abs(y_hat_last - y_test))
rmse = np.sqrt(np.mean(np.square(y_hat_last - y_test)))
count = 0
sums = 0
for i in range(len(y_test)):
if y_test[i] != 0:
error = np.divide(abs(y_hat_last[i] - y_test[i]), abs(y_test[i]))
count += 1
sums += error
mean_abs_pct_error = sums/count
R_squared_list_tr.append(R_squared)
mae_list_tr.append(mae)
rmse_list_tr.append(rmse)
mape_list_tr.append(mean_abs_pct_error)
print 't-%d) R-Squared: %.4f, MAE: %.4f, RMSE: %.4f, MAPE: %.2f%s' %(len(y_hat[0])-1-j, R_squared, mae, rmse, mean_abs_pct_error*100, '%')
# Results transforming predictions and yields back to original units
y_hat = np.array(y_hat).astype(np.float)
y_test = np.array(y_test).astype(np.float)
y_hat_exp = np.exp(y_hat)
y_test_exp = np.exp(y_test)
y_bar_exp = np.mean(y_test_exp)
R_squared_list_exp = []
mae_list_exp = []
rmse_list_exp = []
mape_list_exp = []
print '\nValues Transformed Back to Original Units Results:'
for j in range(len(y_hat_exp[0])):
y_hat_last = y_hat_exp[:, j]
R_squared = 1 - np.sum(np.square(y_hat_last - y_test_exp))/np.sum(np.square(y_test_exp - y_bar_exp))
mae = np.mean(np.abs(y_hat_last - y_test_exp))
rmse = np.sqrt(np.mean(np.square(y_hat_last - y_test_exp)))
count = 0
sums = 0
for i in range(len(y_test_exp)):
if y_test_exp[i] != 0:
error = np.divide(abs(y_hat_last[i] - y_test_exp[i]), abs(y_test_exp[i]))
count += 1
sums += error
mean_abs_pct_error = sums/count
R_squared_list_exp.append(R_squared)
mae_list_exp.append(mae)
rmse_list_exp.append(rmse)
mape_list_exp.append(mean_abs_pct_error)
print 't-%d) R-Squared: %.4f, MAE: %.4f, RMSE: %.4f, MAPE: %.2f%s' %(len(y_hat_exp[0])-1-j, R_squared, mae, rmse, mean_abs_pct_error*100, '%')
Log-Transformed Yield Results:
t-6) R-Squared: 0.0000, MAE: 0.2453, RMSE: 0.2978, MAPE: 26.09%
t-5) R-Squared: -0.0001, MAE: 0.2454, RMSE: 0.2978, MAPE: 26.00%
t-4) R-Squared: 0.0000, MAE: 0.2454, RMSE: 0.2978, MAPE: 26.06%
t-3) R-Squared: -0.0002, MAE: 0.2454, RMSE: 0.2978, MAPE: 26.00%
t-2) R-Squared: -0.0000, MAE: 0.2454, RMSE: 0.2978, MAPE: 26.06%
t-1) R-Squared: -0.0002, MAE: 0.2454, RMSE: 0.2978, MAPE: 26.00%
t-0) R-Squared: -0.0000, MAE: 0.2454, RMSE: 0.2978, MAPE: 26.06%

Values Transformed Back to Original Units Results:
t-6) R-Squared: -0.0196, MAE: 0.7597, RMSE: 0.9778, MAPE: 25.18%
t-5) R-Squared: -0.0235, MAE: 0.7600, RMSE: 0.9797, MAPE: 25.08%
t-4) R-Squared: -0.0209, MAE: 0.7597, RMSE: 0.9784, MAPE: 25.14%
t-3) R-Squared: -0.0235, MAE: 0.7600, RMSE: 0.9797, MAPE: 25.08%
t-2) R-Squared: -0.0209, MAE: 0.7597, RMSE: 0.9785, MAPE: 25.14%
t-1) R-Squared: -0.0235, MAE: 0.7600, RMSE: 0.9797, MAPE: 25.08%
t-0) R-Squared: -0.0209, MAE: 0.7597, RMSE: 0.9785, MAPE: 25.14%
%matplotlib inline
import matplotlib.pyplot as plt
def plot_losses(train_losses, val_losses, epoch_examples, epochs, min_y, max_y):
x1 = [a/epoch_examples for (a,b) in train_losses]
y1 = [b for (a,b) in train_losses]
x2 = [a/epoch_examples for (a,b) in val_losses]
y2 = [b for (a,b) in val_losses]
plt.figure(figsize=(12, 6))
plt.axis((1, epochs, min_y, max_y))
plt.plot(x1, y1, "-m", x2, y2, "-b")
plt.legend(('Train Loss', 'Validation Loss'))
xlabel = 'Epochs (%d Examples Seen Per Epoch)' %(epoch_examples)
plt.xlabel(xlabel)
plt.show()
plot_losses(train_losses, val_losses, 7200, 100, 0, 3.0)
model = load_model_parameters_theano("./models/GRUs/GRU-20160425154620.npz")
Building model from ./models/GRUs/GRU-20160425154620.npz with hidden_dim=128 x_dim=1