docker pull songorithm/theano_basic
docker run -d -p 8888:8888 -e GRANT_SUDO=yes --name run_theano songorithm/theano_basic
B 과정은 A.2의 상세 버전이다. 이렇게 직접 구축해도 되지만, A.2의 songorithm/theano_basic 이미지에 다 반영되어 있음.
B.1은 A.1과 동일
docker pull jupyter/scipy-notebook
# 슈도유저 권한주고(권장),
# 별명도 붙여줬음(이건 옵션)
# docker run -d -p [접속_포트]:8888 -e GRANT_SUDO=yes --name [컨테이너별명] jupyter/scipy-notebook
docker run -d -p 8888:8888 -e GRANT_SUDO=yes --name run_theano jupyter/scipy-notebook
# http://docker_vm_ip:port/
# docker_vm_ip는 도커툴박스의 경우는 고래 화면 뜰때 나오는 ip
# 혹은 리눅스&맥의 경우 터미널에서 다음 명령어로 확인할 수 있다 : docker-machine ls
http://192.168.99.100:8888
source activate python2
0.5에서 생성한 터미널로 접속해서 설치해보자.
# (파이썬 2와 파이썬 3에 각각 해주자 - 안쓸 버전엔 안해도 되긴함)
conda install theano
!ls ~/work/DeepLearningTutorials
!ls ~/work/DeepLearningTutorials/code
!ls
!wget http://deeplearning.net/data/mnist/mnist.pkl.gz
import cPickle, gzip, numpy
# Load the dataset. The context manager closes the archive even when
# unpickling raises, so the file handle is never leaked.
# NOTE(review): unpickling can execute arbitrary code; only load an archive
# obtained from a source you trust.
with gzip.open('mnist.pkl.gz', 'rb') as f:
    train_set, valid_set, test_set = cPickle.load(f)
train_set
(array([[ 0., 0., 0., ..., 0., 0., 0.], [ 0., 0., 0., ..., 0., 0., 0.], [ 0., 0., 0., ..., 0., 0., 0.], ..., [ 0., 0., 0., ..., 0., 0., 0.], [ 0., 0., 0., ..., 0., 0., 0.], [ 0., 0., 0., ..., 0., 0., 0.]], dtype=float32), array([5, 0, 4, ..., 8, 4, 8]))
valid_set
(array([[ 0., 0., 0., ..., 0., 0., 0.], [ 0., 0., 0., ..., 0., 0., 0.], [ 0., 0., 0., ..., 0., 0., 0.], ..., [ 0., 0., 0., ..., 0., 0., 0.], [ 0., 0., 0., ..., 0., 0., 0.], [ 0., 0., 0., ..., 0., 0., 0.]], dtype=float32), array([3, 8, 6, ..., 5, 6, 8]))
test_set
(array([[ 0., 0., 0., ..., 0., 0., 0.], [ 0., 0., 0., ..., 0., 0., 0.], [ 0., 0., 0., ..., 0., 0., 0.], ..., [ 0., 0., 0., ..., 0., 0., 0.], [ 0., 0., 0., ..., 0., 0., 0.], [ 0., 0., 0., ..., 0., 0., 0.]], dtype=float32), array([7, 2, 1, ..., 4, 5, 6]))
type(train_set)
tuple
len(train_set)
2
train_set[0]
array([[ 0., 0., 0., ..., 0., 0., 0.], [ 0., 0., 0., ..., 0., 0., 0.], [ 0., 0., 0., ..., 0., 0., 0.], ..., [ 0., 0., 0., ..., 0., 0., 0.], [ 0., 0., 0., ..., 0., 0., 0.], [ 0., 0., 0., ..., 0., 0., 0.]], dtype=float32)
len(train_set[0])
50000
type(train_set[0])
numpy.ndarray
len(train_set[1])
50000
train_set[1]
array([5, 0, 4, ..., 8, 4, 8])
%matplotlib inline
import seaborn as sns
sns.distplot(train_set[1])
<matplotlib.axes._subplots.AxesSubplot at 0x7f446558da10>
x_tr = train_set[0]
y_tr = train_set[1]
x_tr.shape
(50000, 784)
28*28
784
x = x_tr[0]
type(x)
numpy.ndarray
x.shape
(784,)
x.shape = (28,28)
x.shape
(28, 28)
import pylab as plt
import PIL
plt.imshow(x)
<matplotlib.image.AxesImage at 0x7f4462a88ed0>
y = y_tr[0]
y
5
def shared_dataset(data_xy):
    """Wrap an ``(inputs, labels)`` pair in Theano shared variables.

    Storing the dataset in shared variables allows Theano to copy it into
    GPU memory (when the code runs on a GPU). Copying a minibatch to the
    GPU every time it is needed -- the default behaviour when the data is
    not in a shared variable -- is slow and would greatly reduce
    performance.

    Returns a 2-tuple: the shared inputs, and the shared labels cast to
    ``'int32'``.
    """
    inputs, labels = data_xy
    float_type = theano.config.floatX
    shared_x = theano.shared(numpy.asarray(inputs, dtype=float_type))
    # Data stored on the GPU has to be stored as floats, so the labels are
    # kept as ``floatX`` as well.
    shared_y = theano.shared(numpy.asarray(labels, dtype=float_type))
    # The computations use the labels as indices, which only makes sense
    # for integers, so a cast of ``shared_y`` is returned instead of
    # ``shared_y`` itself.
    labels_as_int = T.cast(shared_y, 'int32')
    return shared_x, labels_as_int
import theano
import theano.tensor as T
test_set_x, test_set_y = shared_dataset(test_set)
valid_set_x, valid_set_y = shared_dataset(valid_set)
train_set_x, train_set_y = shared_dataset(train_set)
batch_size = 500  # size of the minibatch
# accessing the third minibatch of the training set; the bounds are derived
# from batch_size so that changing it above changes the slices as well
data = train_set_x[2 * batch_size: 3 * batch_size]
label = train_set_y[2 * batch_size: 3 * batch_size]
import theano
import theano.tensor as T
import numpy
This chapter reviews the basics of supervised learning for classification models, and covers the minibatch stochastic gradient descent algorithm that is used to fine-tune many of the models in the Deep Learning Tutorials.
# zero_one_loss is a Theano variable representing a symbolic
# expression of the zero one loss; to get the actual value this
# symbolic expression has to be compiled into a Theano function (see
# the Theano tutorial for more details)
# axis=1 picks the most probable class for each row; without an axis,
# argmax reduces over the flattened tensor and yields a single index.
# NOTE(review): assumes p_y_given_x has one row of class probabilities per
# example -- confirm against where it is defined.
zero_one_loss = T.sum(T.neq(T.argmax(p_y_given_x, axis=1), y))
Since the zero-one loss is not differentiable, optimizing it for large models (thousands or millions of parameters) is prohibitively expensive (computationally). We thus maximize the log-likelihood of our classifier given all the labels in a training set.
# NLL is a symbolic variable; to get the actual value of NLL, this symbolic
# expression has to be compiled into a Theano function (see the Theano
# tutorial for more details)
NLL = -T.sum(T.log(p_y_given_x)[T.arange(y.shape[0]), y])
# note on syntax: T.arange(y.shape[0]) is a vector of integers [0,1,2,...,len(y)-1].
# Indexing a matrix M by the two vectors [0,1,...,K], [a,b,...,k] returns the
# elements M[0,a], M[1,b], ..., M[K,k] as a vector. Here, we use this
# syntax to retrieve the log-probability of the correct labels, y.
# GRADIENT DESCENT
while True:
loss = f(params)
d_loss_wrt_params = ... # compute gradient
params -= learning_rate * d_loss_wrt_params
if <stopping condition is met>:
return params
# STOCHASTIC GRADIENT DESCENT
for (x_i,y_i) in training_set:
# imagine an infinite generator
# that may repeat examples (if there is only a finite training set)
loss = f(params, x_i, y_i)
d_loss_wrt_params = ... # compute gradient
params -= learning_rate * d_loss_wrt_params
if <stopping condition is met>:
return params
for (x_batch,y_batch) in train_batches:
# imagine an infinite generator
# that may repeat examples
loss = f(params, x_batch, y_batch)
d_loss_wrt_params = ... # compute gradient using theano
params -= learning_rate * d_loss_wrt_params
if <stopping condition is met>:
return params
All code blocks above show pseudocode of what the algorithm looks like. Implementing such an algorithm in Theano can be done as follows:
# Minibatch Stochastic Gradient Descent
# assume loss is a symbolic description of the loss function given
# the symbolic variables params (shared variable), x_batch, y_batch;
# compute gradient of loss with respect to params
d_loss_wrt_params = T.grad(loss, params)
# compile the MSGD step into a theano function; each call returns the loss
# and applies the update below to params
updates = [(params, params - learning_rate * d_loss_wrt_params)]
MSGD = theano.function([x_batch,y_batch], loss, updates=updates)
for (x_batch, y_batch) in train_batches:
    # here x_batch and y_batch are elements of train_batches and
    # therefore numpy arrays; function MSGD also updates the params
    print('Current loss is ', MSGD(x_batch, y_batch))
    # NOTE(review): `return` is only valid inside a function; this snippet
    # is presumably meant to be the body of a training function -- confirm.
    if stopping_condition_is_met:
        return params
The code block below shows how to compute the loss in Python when it contains both an L1 regularization term weighted by λ1 and an L2 regularization term weighted by λ2.
# symbolic Theano variable that represents the L1 regularization term
L1 = T.sum(abs(param))
# symbolic Theano variable that represents the squared L2 term
L2_sqr = T.sum(param ** 2)
# the loss: negative log-likelihood plus both weighted penalties. The L2
# penalty is the squared term defined above (no variable named L2 exists).
loss = NLL + lambda_1 * L1 + lambda_2 * L2_sqr
The choice of when to stop is a judgement call and a few heuristics exist, but these tutorials will make use of a strategy based on a geometrically increasing amount of patience.
# early-stopping parameters
patience = 5000  # look at this many examples regardless
patience_increase = 2  # wait this much longer when a new best is found
improvement_threshold = 0.995  # a relative improvement of this much is
                               # considered significant
# go through this many minibatches before checking the network on the
# validation set; in this case we check every epoch.
# Floor division keeps the frequency an integer even under true division.
validation_frequency = min(n_train_batches, patience // 2)
best_params = None
best_validation_loss = numpy.inf
test_score = 0.
# NOTE(review): time.clock() exists on Python 2 (which this snippet targets,
# see xrange) but was removed in Python 3.8.
start_time = time.clock()
done_looping = False
epoch = 0
while (epoch < n_epochs) and (not done_looping):
    # Report "1" for first epoch, "n_epochs" for last epoch
    epoch = epoch + 1
    for minibatch_index in xrange(n_train_batches):
        d_loss_wrt_params = ...  # compute gradient
        params -= learning_rate * d_loss_wrt_params  # gradient descent
        # iteration number. We want it to start at 0.
        iter = (epoch - 1) * n_train_batches + minibatch_index
        # note that if we do `iter % validation_frequency` it will be
        # true for iter = 0 which we do not want. We want it true for
        # iter = validation_frequency - 1.
        if (iter + 1) % validation_frequency == 0:
            this_validation_loss = ...  # compute zero-one loss on validation set
            if this_validation_loss < best_validation_loss:
                # improve patience if loss improvement is good enough
                if this_validation_loss < best_validation_loss * improvement_threshold:
                    patience = max(patience, iter * patience_increase)
                # any improvement, significant or not, becomes the new best
                best_params = copy.deepcopy(params)
                best_validation_loss = this_validation_loss
        # out of patience: leave the minibatch loop and, through
        # done_looping, the epoch loop as well
        if patience <= iter:
            done_looping = True
            break
# POSTCONDITION :
# best_params refers to the best out-of-sample parameters observed during the optimization
import cPickle
# 'wb' truncates the file: this will overwrite current contents. The context
# manager closes the file even if one of the dumps raises.
with open('path', 'wb') as save_file:
    # the -1 is for HIGHEST_PROTOCOL, and it triggers much more efficient
    # storage than numpy's default
    cPickle.dump(w.get_value(borrow=True), save_file, -1)
    cPickle.dump(v.get_value(borrow=True), save_file, -1)
    cPickle.dump(u.get_value(borrow=True), save_file, -1)
# Pickles written with a binary protocol must be read in binary mode.
# Successive loads return the objects in the order they were dumped, so the
# order of the three calls below matters.
with open('path', 'rb') as save_file:
    w.set_value(cPickle.load(save_file), borrow=True)
    v.set_value(cPickle.load(save_file), borrow=True)
    u.set_value(cPickle.load(save_file), borrow=True)