%matplotlib inline
"""
A deep neural network with or w/o dropout in one notebook.
To train the networks you just need to call the run() function
You need to run each cell before doing so.
To do so, you can go to "Cell -> Run All"
Feel free to edit individually each cell and rerun modified ones.
You can change global parameters here before running
"""
# Which experiment to run
MNIST = True
DIGITS = False
FACES = False
TWENTYNEWSGROUPS = False
BATCH_SIZE = 100 # default batch size
L2_LAMBDA = 1. # default L2 regularization parameter
INIT_LR = 0.01 # initial learning rate, try making it larger
import numpy
import theano
import sys
import math
from theano import tensor as T
from theano import shared
from theano.tensor.shared_randomstreams import RandomStreams
from collections import OrderedDict
#Activation and helper functions
def relu_f(vec):
""" Wrapper to quickly change the rectified linear unit function """
return (vec + abs(vec)) / 2.
def softplus_f(v):
    """ Softplus activation: log(1 + exp(v)) """
    return T.log(1 + T.exp(v))
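# Note: log(1 + exp(v)) overflows for large positive v; if that ever becomes a
# problem, Theano's built-in T.nnet.softplus(v) is a numerically safer drop-in
# replacement for softplus_f.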
def dropout(rng, x, p=0.5):
""" Zero-out random values in x with probability p using rng """
if p > 0. and p < 1.:
seed = rng.randint(2 ** 30)
srng = theano.tensor.shared_randomstreams.RandomStreams(seed)
mask = srng.binomial(n=1, p=1.-p, size=x.shape,
dtype=theano.config.floatX)
return x * mask
return x
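# Minimal sanity check for dropout(), added for illustration only (the helper
# name below is hypothetical and is not used elsewhere in this notebook): with
# p=0.5, roughly half of the entries should be zeroed out on each call.
def _dropout_sanity_check(p=0.5):
    """ Compile dropout() on a matrix of ones and return the fraction kept. """
    rng = numpy.random.RandomState(0)
    v = T.matrix('v')
    f = theano.function([v], dropout(rng, v, p=p))
    out = f(numpy.ones((100, 100), dtype=theano.config.floatX))
    return (out != 0.).mean()  # should be close to 1. - p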
def build_shared_zeros(shape, name):
""" Builds a theano shared variable filled with a zeros numpy array """
return shared(value=numpy.zeros(shape, dtype=theano.config.floatX),
name=name, borrow=True)
class Linear(object):
""" Basic linear transformation layer (W.X + b) """
def __init__(self, rng, input, n_in, n_out, W=None, b=None):
if W is None:
W_values = numpy.asarray(rng.uniform(
low=-numpy.sqrt(6. / (n_in + n_out)),
high=numpy.sqrt(6. / (n_in + n_out)),
size=(n_in, n_out)), dtype=theano.config.floatX)
            W_values *= 4  # 4x the Glorot uniform init: suited to sigmoid-activated networks
W = theano.shared(value=W_values, name='W', borrow=True)
if b is None:
b = build_shared_zeros((n_out,), 'b')
self.input = input
self.W = W
self.b = b
self.params = [self.W, self.b]
self.output = T.dot(self.input, self.W) + self.b
def __repr__(self):
return "Linear"
class SigmoidLayer(Linear):
""" Sigmoid activation layer (sigmoid(W.X + b)) """
def __init__(self, rng, input, n_in, n_out, W=None, b=None):
super(SigmoidLayer, self).__init__(rng, input, n_in, n_out, W, b)
self.pre_activation = self.output
self.output = T.nnet.sigmoid(self.pre_activation)
class ReLU(Linear):
""" Rectified Linear Unit activation layer (max(0, W.X + b)) """
def __init__(self, rng, input, n_in, n_out, W=None, b=None):
if b is None:
b = build_shared_zeros((n_out,), 'b')
super(ReLU, self).__init__(rng, input, n_in, n_out, W, b)
self.pre_activation = self.output
self.output = relu_f(self.pre_activation)
class SoftPlus(Linear):
def __init__(self, rng, input, n_in, n_out, W=None, b=None):
if b is None:
b_values = numpy.zeros((n_out,), dtype=theano.config.floatX)
b = theano.shared(value=b_values, name='b', borrow=True)
super(SoftPlus, self).__init__(rng, input, n_in, n_out, W, b)
self.pre_activation = self.output
self.output = softplus_f(self.pre_activation)
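# Note: every layer class above exposes .output (a symbolic expression of its
# input) and .params (the shared variables to train); NeuralNet below chains
# layers by feeding each layer's .output in as the next layer's input.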
class DatasetMiniBatchIterator(object):
""" Basic mini-batch iterator """
def __init__(self, x, y, batch_size=BATCH_SIZE, randomize=False):
self.x = x
self.y = y
self.batch_size = batch_size
self.randomize = randomize
from sklearn.utils import check_random_state
self.rng = check_random_state(42)
def __iter__(self):
n_samples = self.x.shape[0]
        if self.randomize:
            for _ in xrange(n_samples / self.batch_size):
                if self.batch_size > 1:
                    i = int(self.rng.rand(1)
                            * ((n_samples + self.batch_size - 1)
                               / self.batch_size))
                else:
                    i = int(math.floor(self.rng.rand(1) * n_samples))
                yield (self.x[i*self.batch_size:(i+1)*self.batch_size],
                       self.y[i*self.batch_size:(i+1)*self.batch_size])
else:
for i in xrange((n_samples + self.batch_size - 1)
/ self.batch_size):
yield (self.x[i*self.batch_size:(i+1)*self.batch_size],
self.y[i*self.batch_size:(i+1)*self.batch_size])
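# Usage example for DatasetMiniBatchIterator, added for illustration on
# hypothetical data (the helper name below is not used elsewhere): with the
# default BATCH_SIZE of 100 and 250 samples, iteration yields (x, y) slices of
# 100, 100 and 50 rows, in order.
def _demo_minibatch_iterator():
    x = numpy.zeros((250, 3), dtype=theano.config.floatX)
    y = numpy.zeros(250, dtype='int32')
    return [bx.shape[0] for bx, _ in DatasetMiniBatchIterator(x, y)]  # [100, 100, 50]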
class LogisticRegression(object):
""" _Multi-class_ Logistic Regression """
def __init__(self, rng, input, n_in, n_out, W=None, b=None):
        if W is not None:
            self.W = W
        else:
            self.W = build_shared_zeros((n_in, n_out), 'W')
        if b is not None:
            self.b = b
        else:
            self.b = build_shared_zeros((n_out,), 'b')
self.input = input
self.p_y_given_x = T.nnet.softmax(T.dot(self.input, self.W) + self.b)
self.y_pred = T.argmax(self.p_y_given_x, axis=1)
self.output = self.y_pred
self.params = [self.W, self.b]
def negative_log_likelihood(self, y):
return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y])
def negative_log_likelihood_sum(self, y):
return -T.sum(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y])
def training_cost(self, y):
""" Wrapper for standard name """
return self.negative_log_likelihood(y)
def errors(self, y):
if y.ndim != self.y_pred.ndim:
raise TypeError("!!! 'y' should have the same shape as 'self.y_pred'",
("y", y.type, "y_pred", self.y_pred.type))
if y.dtype.startswith('int'):
return T.mean(T.neq(self.y_pred, y))
else:
print("!!! y should be of int type")
return T.mean(T.neq(self.y_pred, numpy.asarray(y, dtype='int')))
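# Note: LogisticRegression implements multinomial (softmax) regression,
# p(y | x) = softmax(x.W + b), trained by minimizing the mean negative
# log-likelihood -1/N * sum_i log p(y_i | x_i); errors() reports the mean 0/1
# misclassification rate of y_pred = argmax_y p(y | x) over a batch.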
#class PerceptronLoss: # TODO
class NeuralNet(object):
""" Neural network (not regularized, without dropout) """
def __init__(self, numpy_rng, theano_rng=None,
n_ins=40*3,
layers_types=[ReLU, ReLU, ReLU, ReLU, LogisticRegression],
layers_sizes=[1024, 1024, 1024, 1024],
n_outs=62*3,
rho=0.95, eps=1.E-6,
momentum=0.9, step_adapt_alpha=1.E-4,
b1=0.1, b2=0.001,
debugprint=False):
"""
Basic Neural Net class
"""
self.layers = []
self.params = []
self.n_layers = len(layers_types)
self.layers_types = layers_types
assert self.n_layers > 0
self._rho = rho # ``momentum'' for adadelta (and discount/decay for RMSprop), alpha for Adam
self._eps = eps # epsilon for adadelta (and for RMSprop and Adam)
self._momentum = momentum # for RMSProp
self._b1 = b1
self._b2 = b2
self._accugrads = [] # for adadelta
self._accudeltas = [] # for adadelta
        self._avggrads = []  # for RMSprop (Alex Graves' variant)
self._stepadapts = [] # for RMSprop with step adaptations
self._stepadapt_alpha = step_adapt_alpha
        if theano_rng is None:
theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))
self.x = T.fmatrix('x')
self.y = T.ivector('y')
self.layers_ins = [n_ins] + layers_sizes
self.layers_outs = layers_sizes + [n_outs]
layer_input = self.x
for layer_type, n_in, n_out in zip(layers_types,
self.layers_ins, self.layers_outs):
this_layer = layer_type(rng=numpy_rng,
input=layer_input, n_in=n_in, n_out=n_out)
assert hasattr(this_layer, 'output')
self.params.extend(this_layer.params)
self._accugrads.extend([build_shared_zeros(t.shape.eval(),
'accugrad') for t in this_layer.params])
self._accudeltas.extend([build_shared_zeros(t.shape.eval(),
'accudelta') for t in this_layer.params])
self._avggrads.extend([build_shared_zeros(t.shape.eval(),
'avggrad') for t in this_layer.params])
self._stepadapts.extend([shared(value=numpy.ones(t.shape.eval(),
dtype=theano.config.floatX),
name='stepadapt', borrow=True) for t in this_layer.params])
self.layers.append(this_layer)
layer_input = this_layer.output
assert hasattr(self.layers[-1], 'training_cost')
assert hasattr(self.layers[-1], 'errors')
self.mean_cost = self.layers[-1].negative_log_likelihood(self.y)
self.cost = self.layers[-1].training_cost(self.y)
if debugprint:
theano.printing.debugprint(self.cost)
self.errors = self.layers[-1].errors(self.y)
def __repr__(self):
dimensions_layers_str = map(lambda x: "x".join(map(str, x)),
zip(self.layers_ins, self.layers_outs))
return "_".join(map(lambda x: "_".join((x[0].__name__, x[1])),
zip(self.layers_types, dimensions_layers_str)))
def get_SGD_trainer(self):
""" Returns a plain SGD minibatch trainer with learning rate as param. """
batch_x = T.fmatrix('batch_x')
batch_y = T.ivector('batch_y')
learning_rate = T.fscalar('lr') # learning rate
gparams = T.grad(self.mean_cost, self.params) # all the gradients
updates = OrderedDict()
for param, gparam in zip(self.params, gparams):
updates[param] = param - gparam * learning_rate
train_fn = theano.function(inputs=[theano.Param(batch_x),
theano.Param(batch_y),
theano.Param(learning_rate)],
outputs=self.mean_cost,
updates=updates,
givens={self.x: batch_x, self.y: batch_y})
return train_fn
def get_adagrad_trainer(self):
""" Returns an Adagrad (Duchi et al. 2010) trainer using a learning rate.
"""
batch_x = T.fmatrix('batch_x')
batch_y = T.ivector('batch_y')
learning_rate = T.fscalar('lr') # learning rate
gparams = T.grad(self.mean_cost, self.params) # all the gradients
updates = OrderedDict()
for accugrad, param, gparam in zip(self._accugrads, self.params, gparams):
# c.f. Algorithm 1 in the Adadelta paper (Zeiler 2012)
agrad = accugrad + gparam * gparam
dx = - (learning_rate / T.sqrt(agrad + self._eps)) * gparam
updates[param] = param + dx
updates[accugrad] = agrad
train_fn = theano.function(inputs=[theano.Param(batch_x),
theano.Param(batch_y),
theano.Param(learning_rate)],
outputs=self.mean_cost,
updates=updates,
givens={self.x: batch_x, self.y: batch_y})
return train_fn
def get_adadelta_trainer(self):
""" Returns an Adadelta (Zeiler 2012) trainer using self._rho and
self._eps params. """
batch_x = T.fmatrix('batch_x')
batch_y = T.ivector('batch_y')
gparams = T.grad(self.mean_cost, self.params)
updates = OrderedDict()
for accugrad, accudelta, param, gparam in zip(self._accugrads,
self._accudeltas, self.params, gparams):
# c.f. Algorithm 1 in the Adadelta paper (Zeiler 2012)
agrad = self._rho * accugrad + (1 - self._rho) * gparam * gparam
dx = - T.sqrt((accudelta + self._eps)
/ (agrad + self._eps)) * gparam
updates[accudelta] = (self._rho * accudelta
+ (1 - self._rho) * dx * dx)
updates[param] = param + dx
updates[accugrad] = agrad
train_fn = theano.function(inputs=[theano.Param(batch_x),
theano.Param(batch_y)],
outputs=self.mean_cost,
updates=updates,
givens={self.x: batch_x, self.y: batch_y})
return train_fn
def get_rmsprop_trainer(self, with_step_adapt=True, nesterov=False): # TODO Nesterov momentum
""" Returns an RmsProp (possibly Nesterov) (Sutskever 2013) trainer
using self._rho, self._eps and self._momentum params. """
batch_x = T.fmatrix('batch_x')
batch_y = T.ivector('batch_y')
learning_rate = T.fscalar('lr') # learning rate
gparams = T.grad(self.mean_cost, self.params)
updates = OrderedDict()
for accugrad, avggrad, accudelta, sa, param, gparam in zip(
self._accugrads, self._avggrads, self._accudeltas,
self._stepadapts, self.params, gparams):
acc_grad = self._rho * accugrad + (1 - self._rho) * gparam * gparam
            avg_grad = self._rho * avggrad + (1 - self._rho) * gparam  # ideally this decay (self._rho) would differ from the one on the line above
###scaled_grad = gparam / T.sqrt(acc_grad + self._eps) # original RMSprop gradient scaling
scaled_grad = gparam / T.sqrt(acc_grad - avg_grad**2 + self._eps) # Alex Graves' RMSprop variant (divide by a "running stddev" of the updates)
if with_step_adapt:
incr = sa * (1. + self._stepadapt_alpha)
#decr = sa * (1. - self._stepadapt_alpha)
decr = sa * (1. - 2*self._stepadapt_alpha)
###steps = sa * T.switch(accudelta * -gparam >= 0, incr, decr)
steps = T.clip(T.switch(accudelta * -gparam >= 0, incr, decr), self._eps, 1./self._eps) # bad overloading of self._eps!
scaled_grad = steps * scaled_grad
updates[sa] = steps
dx = self._momentum * accudelta - learning_rate * scaled_grad
updates[param] = param + dx
updates[accugrad] = acc_grad
updates[avggrad] = avg_grad
updates[accudelta] = dx
train_fn = theano.function(inputs=[theano.Param(batch_x),
theano.Param(batch_y),
theano.Param(learning_rate)],
outputs=self.mean_cost,
updates=updates,
givens={self.x: batch_x, self.y: batch_y})
return train_fn
def get_adam_trainer(self):
""" Returns an Adam trainer as described in
ADAM: A METHOD FOR STOCHASTIC OPTIMIZATION, Kingma and Ba 2015 """
batch_x = T.fmatrix('batch_x')
batch_y = T.ivector('batch_y')
self._i_t = shared(numpy.float32(0.))
self.lr = 0.0002
i_t = self._i_t + 1.
fix1 = 1. - (1. - self._b1)**i_t
fix2 = 1. - (1. - self._b2)**i_t
lr_t = self.lr * (T.sqrt(fix2) / fix1)
gparams = T.grad(self.mean_cost, self.params)
updates = OrderedDict()
for accugrad, accudelta, param, gparam in zip(self._accugrads, self._accudeltas, self.params, gparams):
m_t = (self._b1 * gparam) + ((1. - self._b1) * accugrad)
v_t = (self._b2 * T.sqr(gparam)) + ((1. - self._b2) * accudelta)
#m_t_h = m_t / (1 - (1-self._b1)**i_t)
#v_t_h = v_t / (1 - (1-self._b2)**i_t)
#g_t = m_t_h / (T.sqrt(v_t_h) + self._eps)
g_t = m_t / (T.sqrt(v_t) + self._eps)
updates[param] = param - (lr_t * g_t)
updates[accugrad] = m_t
updates[accudelta] = v_t
updates[self._i_t] = i_t
train_fn = theano.function(inputs=[theano.Param(batch_x),
theano.Param(batch_y)],
outputs=self.mean_cost,
updates=updates,
givens={self.x: batch_x, self.y: batch_y})
return train_fn
def score_classif(self, given_set):
""" Returns functions to get current classification errors. """
batch_x = T.fmatrix('batch_x')
batch_y = T.ivector('batch_y')
score = theano.function(inputs=[theano.Param(batch_x),
theano.Param(batch_y)],
outputs=self.errors,
givens={self.x: batch_x, self.y: batch_y})
def scoref():
""" returned function that scans the entire set given as input """
return [score(batch_x, batch_y) for batch_x, batch_y in given_set]
return scoref
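# Summary of the update rules implemented by the trainers above, in the
# notation of the code (g: gradient, r: self._rho, e: self._eps):
#   SGD:      param -= lr * g
#   Adagrad:  acc += g**2;                      param -= lr * g / sqrt(acc + e)
#   Adadelta: acc = r*acc + (1-r)*g**2;         dx = -sqrt((acc_dx + e) / (acc + e)) * g
#             acc_dx = r*acc_dx + (1-r)*dx**2;  param += dx
#   RMSprop:  Alex Graves' variant, dividing g by sqrt(E[g**2] - E[g]**2 + e),
#             with optional per-parameter step adaptation and momentum on the
#             previous update dx
#   Adam:     m = b1*g + (1-b1)*m;  v = b2*g**2 + (1-b2)*v;
#             param -= lr_t * m / (sqrt(v) + e), with lr_t bias-corrected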
class RegularizedNet(NeuralNet):
""" Neural net with L1 and L2 regularization """
def __init__(self, numpy_rng, theano_rng=None,
n_ins=100,
layers_types=[ReLU, ReLU, ReLU, LogisticRegression],
layers_sizes=[1024, 1024, 1024],
n_outs=2,
rho=0.95, eps=1.E-8,
L1_reg=0.1,
L2_reg=0.1,
debugprint=False):
"""
A deep neural net with possible L1 and/or L2 regularization.
"""
        super(RegularizedNet, self).__init__(numpy_rng, theano_rng, n_ins,
                layers_types, layers_sizes, n_outs, rho=rho, eps=eps,
                debugprint=debugprint)
self.L1_reg = L1_reg
self.L2_reg = L2_reg
L1 = shared(0.)
for param in self.params:
L1 += T.sum(abs(param))
if L1_reg > 0.:
self.cost = self.cost + L1_reg * L1
L2 = shared(0.)
for param in self.params:
L2 += T.sum(param ** 2)
if L2_reg > 0.:
self.cost = self.cost + L2_reg * L2
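# Note: the L1/L2 penalties above are added to self.cost only, while the
# trainers defined in NeuralNet differentiate self.mean_cost; as written, the
# penalties therefore do not influence the gradient updates. To make them
# effective they would also have to be added (suitably scaled) to
# self.mean_cost.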
class DropoutNet(NeuralNet):
""" Neural net with dropout (see Hinton's et al. paper) """
def __init__(self, numpy_rng, theano_rng=None,
n_ins=40*3,
layers_types=[ReLU, ReLU, ReLU, ReLU, LogisticRegression],
layers_sizes=[4000, 4000, 4000, 4000],
dropout_rates=[0.2, 0.5, 0.5, 0.5, 0.5],
n_outs=62 * 3,
rho=0.98, eps=1.E-6,
debugprint=False):
"""
A dropout-regularized neural net.
"""
        super(DropoutNet, self).__init__(numpy_rng, theano_rng, n_ins,
                layers_types, layers_sizes, n_outs, rho=rho, eps=eps,
                debugprint=debugprint)
self.dropout_rates = dropout_rates
dropout_layer_input = dropout(numpy_rng, self.x, p=dropout_rates[0])
self.dropout_layers = []
for layer, layer_type, n_in, n_out, dr in zip(self.layers,
layers_types, self.layers_ins, self.layers_outs,
dropout_rates[1:] + [0]): # !!! we do not dropout anything
# from the last layer !!!
if dr:
this_layer = layer_type(rng=numpy_rng,
input=dropout_layer_input, n_in=n_in, n_out=n_out,
W=layer.W * 1. / (1. - dr),
b=layer.b * 1. / (1. - dr))
                # N.B. dropout() with dr == 1. does not drop anything (it returns x unchanged)
this_layer.output = dropout(numpy_rng, this_layer.output, dr)
else:
this_layer = layer_type(rng=numpy_rng,
input=dropout_layer_input, n_in=n_in, n_out=n_out,
W=layer.W, b=layer.b)
assert hasattr(this_layer, 'output')
self.dropout_layers.append(this_layer)
dropout_layer_input = this_layer.output
assert hasattr(self.layers[-1], 'training_cost')
assert hasattr(self.layers[-1], 'errors')
# TODO standardize cost
# these are the dropout costs
self.mean_cost = self.dropout_layers[-1].negative_log_likelihood(self.y)
self.cost = self.dropout_layers[-1].training_cost(self.y)
        # these are the non-dropout errors, used for evaluation
self.errors = self.layers[-1].errors(self.y)
def __repr__(self):
return super(DropoutNet, self).__repr__() + "\n"\
+ "dropout rates: " + str(self.dropout_rates)
def add_fit_and_score(class_to_chg):
""" Mutates a class to add the fit() and score() functions to a NeuralNet.
"""
from types import MethodType
def fit(self, x_train, y_train, x_dev=None, y_dev=None,
max_epochs=20, early_stopping=True, split_ratio=0.1, # TODO 100+ epochs
method='adadelta', verbose=False, plot=False):
"""
TODO
"""
import time, copy
        if x_dev is None or y_dev is None:
from sklearn.cross_validation import train_test_split
x_train, x_dev, y_train, y_dev = train_test_split(x_train, y_train,
test_size=split_ratio, random_state=42)
if method == 'sgd':
train_fn = self.get_SGD_trainer()
elif method == 'adagrad':
train_fn = self.get_adagrad_trainer()
elif method == 'adadelta':
train_fn = self.get_adadelta_trainer()
elif method == 'adam':
train_fn = self.get_adam_trainer()
        elif method == 'rmsprop':
            train_fn = self.get_rmsprop_trainer(with_step_adapt=True,
                                                nesterov=False)
        else:
            raise ValueError("unknown training method: %s" % method)
train_set_iterator = DatasetMiniBatchIterator(x_train, y_train)
dev_set_iterator = DatasetMiniBatchIterator(x_dev, y_dev)
train_scoref = self.score_classif(train_set_iterator)
dev_scoref = self.score_classif(dev_set_iterator)
best_dev_loss = numpy.inf
epoch = 0
# TODO early stopping (not just cross val, also stop training)
if plot:
verbose = True
self._costs = []
self._train_errors = []
self._dev_errors = []
self._updates = []
init_lr = INIT_LR
if method == 'rmsprop':
init_lr = 1.E-6 # TODO REMOVE HACK
n_seen = 0
while epoch < max_epochs:
#first TODO: update learning rates when not using adadelta!
#lr = init_lr / (1 + init_lr * L2_LAMBDA * math.log(1+n_seen))
#lr = init_lr / math.sqrt(1 + init_lr * L2_LAMBDA * n_seen/BATCH_SIZE) # try these
lr = init_lr
if not verbose:
sys.stdout.write("\r%0.2f%%" % (epoch * 100./ max_epochs))
sys.stdout.flush()
avg_costs = []
timer = time.time()
for x, y in train_set_iterator:
if method == 'sgd' or method == 'adagrad' or method == 'rmsprop':
#avg_cost = train_fn(x, y, lr=1.E-2)
avg_cost = train_fn(x, y, lr=lr)
elif method == 'adadelta' or method == 'adam':
avg_cost = train_fn(x, y)
if type(avg_cost) == list:
avg_costs.append(avg_cost[0])
else:
avg_costs.append(avg_cost)
if verbose:
mean_costs = numpy.mean(avg_costs)
mean_train_errors = numpy.mean(train_scoref())
print(' epoch %i took %f seconds' %
(epoch, time.time() - timer))
print(' epoch %i, avg costs %f' %
(epoch, mean_costs))
print(' epoch %i, training error %f' %
(epoch, mean_train_errors))
if plot:
self._costs.append(mean_costs)
self._train_errors.append(mean_train_errors)
dev_errors = numpy.mean(dev_scoref())
if plot:
self._dev_errors.append(dev_errors)
if dev_errors < best_dev_loss:
best_dev_loss = dev_errors
                best_params = [p.get_value(borrow=False) for p in self.params]
if verbose:
print('!!! epoch %i, validation error of best model %f' %
(epoch, dev_errors))
epoch += 1
n_seen += x_train.shape[0]
if not verbose:
print("")
        # restore the parameters of the best model seen on the dev set
        for i, best_value in enumerate(best_params):
            self.params[i].set_value(best_value)
def score(self, x, y):
""" error rates """
iterator = DatasetMiniBatchIterator(x, y)
scoref = self.score_classif(iterator)
return numpy.mean(scoref())
class_to_chg.fit = MethodType(fit, None, class_to_chg)
class_to_chg.score = MethodType(score, None, class_to_chg)
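# End-to-end example, added for illustration on hypothetical random data (a
# sketch of the intended API, not one of the original experiments):
def _demo_fit_and_score():
    add_fit_and_score(RegularizedNet)
    x = numpy.random.rand(500, 20).astype('float32')
    y = numpy.random.randint(0, 2, 500).astype('int32')
    net = RegularizedNet(numpy_rng=numpy.random.RandomState(0), n_ins=20,
                         layers_types=[ReLU, LogisticRegression],
                         layers_sizes=[32], n_outs=2,
                         L1_reg=0., L2_reg=1.E-4)
    net.fit(x, y, max_epochs=5, method='adadelta', verbose=True)
    return net.score(x, y)  # error rate on the training data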
def train_models(x_train, y_train, x_test, y_test, n_features, n_outs,
x_dev=None, y_dev=None,
use_dropout=False, n_epochs=100, numpy_rng=None,
svms=False, nb=False, deepnn=True,
verbose=False, plot=False, name=''):
    from sklearn import svm, naive_bayes  # needed here: otherwise only imported inside run()
    if svms:
        print("Linear SVM")
        classifier = svm.SVC(kernel='linear')
print(classifier)
classifier.fit(x_train, y_train)
print("score: %f" % classifier.score(x_test, y_test))
print("RBF-kernel SVM")
classifier = svm.SVC(kernel='rbf', class_weight='auto')
print(classifier)
classifier.fit(x_train, y_train)
print("score: %f" % classifier.score(x_test, y_test))
if nb:
print("Multinomial Naive Bayes")
classifier = naive_bayes.MultinomialNB()
print(classifier)
classifier.fit(x_train, y_train)
print("score: %f" % classifier.score(x_test, y_test))
if deepnn:
import warnings
warnings.filterwarnings("ignore") # TODO remove
if use_dropout:
            n_epochs *= 4  # dropout nets typically need more epochs to converge
def new_dnn(dropout=False):
if dropout:
print("Dropout DNN")
return DropoutNet(numpy_rng=numpy_rng, n_ins=n_features,
layers_types=[ReLU, ReLU, ReLU, ReLU, LogisticRegression],
layers_sizes=[2000, 2000, 2000, 2000],
dropout_rates=[0.2, 0.5, 0.5, 0.5, 0.5],
n_outs=n_outs,
debugprint=0)
else:
print("Simple (regularized) DNN")
return RegularizedNet(numpy_rng=numpy_rng, n_ins=n_features,
layers_types=[LogisticRegression],
layers_sizes=[],
#layers_types=[ReLU, ReLU, ReLU, LogisticRegression],
#layers_sizes=[1000, 1000, 1000],
#layers_types=[ReLU, LogisticRegression],
#layers_sizes=[200],
n_outs=n_outs,
L1_reg=0.,
L2_reg=L2_LAMBDA,
debugprint=1)
import matplotlib.pyplot as plt
fig = plt.gcf() #plt.figure()
fig.set_size_inches(16,10)
ax1 = plt.subplot(221)
ax2 = plt.subplot(222)
ax3 = plt.subplot(223)
ax4 = plt.subplot(224) # TODO updates of the weights
#methods = ['sgd', 'adagrad', 'adadelta']
methods = ['adam', 'adadelta']
#methods = ['rmsprop', 'adadelta', 'adagrad']
for method in methods:
dnn = new_dnn(use_dropout)
            print(dnn)
dnn.fit(x_train, y_train, x_dev, y_dev, max_epochs=n_epochs,
method=method, verbose=verbose, plot=plot)
test_error = dnn.score(x_test, y_test)
print("score: %f" % (1. - test_error))
ax1.plot(numpy.log10(dnn._costs), label=method)
#ax2.plot(numpy.log10(dnn._train_errors), label=method)
#ax3.plot(numpy.log10(dnn._dev_errors), label=method)
ax2.plot(dnn._train_errors, label=method)
ax3.plot(dnn._dev_errors, label=method)
#ax4.plot(dnn._updates, label=method) TODO
ax4.plot([test_error for _ in range(10)], label=method)
ax1.set_xlabel('epoch')
ax1.set_ylabel('cost (log10)')
ax2.set_xlabel('epoch')
ax2.set_ylabel('train error')
ax3.set_xlabel('epoch')
ax3.set_ylabel('dev error')
ax4.set_ylabel('test error')
plt.legend()
plt.tight_layout()
plt.show()
# Main function
def run():
add_fit_and_score(DropoutNet)
add_fit_and_score(RegularizedNet)
from sklearn import datasets, svm, naive_bayes
from sklearn import cross_validation, preprocessing
if MNIST:
from sklearn.datasets import fetch_mldata
mnist = fetch_mldata('MNIST original')
X = numpy.asarray(mnist.data, dtype='float32')
#X = preprocessing.scale(X)
X /= 255.
y = numpy.asarray(mnist.target, dtype='int32')
print("Total dataset size:")
print("n samples: %d" % X.shape[0])
print("n features: %d" % X.shape[1])
print("n classes: %d" % len(set(y)))
x_train, x_test = X[:-10000], X[-10000:]
y_train, y_test = y[:-10000], y[-10000:]
train_models(x_train, y_train, x_test, y_test, X.shape[1],
len(set(y)), numpy_rng=numpy.random.RandomState(123),
use_dropout=False, n_epochs=30,
verbose=True, plot=True, name='mnist_L2')
#train_models(x_train, y_train, x_test, y_test, X.shape[1],
# len(set(y)), numpy_rng=numpy.random.RandomState(123),
# use_dropout=True,
# verbose=True, plot=True, name='mnist_dropout')
if DIGITS:
digits = datasets.load_digits()
data = numpy.asarray(digits.data, dtype='float32')
target = numpy.asarray(digits.target, dtype='int32')
x = preprocessing.scale(data)
x_train, x_test, y_train, y_test = cross_validation.train_test_split(
x, target, test_size=0.2, random_state=42)
train_models(x_train, y_train, x_test, y_test, x.shape[1],
len(set(target)), numpy_rng=numpy.random.RandomState(123),
verbose=True, plot=True, name='digits')
if FACES:
import logging
logging.basicConfig(level=logging.INFO,
format='%(asctime)s %(message)s')
lfw_people = datasets.fetch_lfw_people(min_faces_per_person=70,
resize=0.4)
X = numpy.asarray(lfw_people.data, dtype='float32')
X = preprocessing.scale(X)
y = numpy.asarray(lfw_people.target, dtype='int32')
target_names = lfw_people.target_names
print("Total dataset size:")
print("n samples: %d" % X.shape[0])
print("n features: %d" % X.shape[1])
print("n classes: %d" % target_names.shape[0])
x_train, x_test, y_train, y_test = cross_validation.train_test_split(
X, y, test_size=0.2, random_state=42)
train_models(x_train, y_train, x_test, y_test, X.shape[1],
len(set(y)), numpy_rng=numpy.random.RandomState(123),
verbose=True, plot=True, name='faces')
if TWENTYNEWSGROUPS:
from sklearn.feature_extraction.text import TfidfVectorizer
newsgroups_train = datasets.fetch_20newsgroups(subset='train')
vectorizer = TfidfVectorizer(encoding='latin-1', max_features=10000)
#vectorizer = HashingVectorizer(encoding='latin-1')
x_train = vectorizer.fit_transform(newsgroups_train.data)
x_train = numpy.asarray(x_train.todense(), dtype='float32')
y_train = numpy.asarray(newsgroups_train.target, dtype='int32')
newsgroups_test = datasets.fetch_20newsgroups(subset='test')
x_test = vectorizer.transform(newsgroups_test.data)
x_test = numpy.asarray(x_test.todense(), dtype='float32')
y_test = numpy.asarray(newsgroups_test.target, dtype='int32')
train_models(x_train, y_train, x_test, y_test, x_train.shape[1],
len(set(y_train)),
numpy_rng=numpy.random.RandomState(123),
svms=False, nb=True, deepnn=True,
verbose=True, plot=True, name='20newsgroups')
run()
Output (abridged). Total dataset size: 70,000 samples, 784 features, 10 classes. Two "Simple (regularized) DNN" models (LogisticRegression_784x10) are trained for 30 epochs each, the first with adam, the second with adadelta. With adam, the validation error of the best model drops from 0.153500 (epoch 0) to 0.082667 (epoch 29), for a final test score of 0.924200; with adadelta it drops from 0.130167 (epoch 0) to 0.083167 (epoch 26), for a final test score of 0.924300.