Introduction to Theano

Credits: Forked from summerschool2015 by mila-udem

Slides

Refer to the associated Introduction to Theano slides and use this notebook for hands-on practice of the concepts.

Basic usage

Defining an expression

In [1]:
import theano
from theano import tensor as T
x = T.vector('x')
W = T.matrix('W')
b = T.vector('b')
In [3]:
dot = T.dot(x, W)
out = T.nnet.sigmoid(dot + b)
out
Out[3]:
sigmoid.0
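
Each symbolic variable carries static type information that you can inspect directly (output assumes the default floatX=float64):

print(x.type)    # TensorType(float64, vector)
print(x.ndim)    # 1
print(out.ndim)  # 1: elementwise ops preserve the number of dimensions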

Graph visualization

In [4]:
from theano.printing import debugprint
debugprint(dot)
dot [id A] ''   
 |x [id B]
 |W [id C]
In [5]:
debugprint(out)
sigmoid [id A] ''   
 |Elemwise{add,no_inplace} [id B] ''   
   |dot [id C] ''   
   | |x [id D]
   | |W [id E]
   |b [id F]
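
debugprint shows the graph op by op; for a compact, math-like rendering of the same expression, theano.printing also provides pp (exact formatting varies across versions):

from theano.printing import pp
print(pp(out))  # prints something like: sigmoid(((x \dot W) + b))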

Compiling a Theano function

In [6]:
f = theano.function(inputs=[x, W], outputs=dot)
g = theano.function([x, W, b], out)
h = theano.function([x, W, b], [dot, out])
i = theano.function([x, W, b], [dot + b, out])
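
Beyond bare variables, inputs to theano.function can be wrapped in theano.In to attach options such as a default value. A small sketch (g_def is a hypothetical name; assumes a zero default for the bias):

import numpy as np
from theano import In
g_def = theano.function([x, W, In(b, value=np.zeros(3))], out)
# b can now be omitted when calling g_def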

Graph visualization

In [7]:
debugprint(f)
CGemv{inplace} [id A] ''   3
 |AllocEmpty{dtype='float64'} [id B] ''   2
 | |Shape_i{1} [id C] ''   1
 |   |W [id D]
 |TensorConstant{1.0} [id E]
 |InplaceDimShuffle{1,0} [id F] 'W.T'   0
 | |W [id D]
 |x [id G]
 |TensorConstant{0.0} [id H]
In [8]:
debugprint(g)
Elemwise{ScalarSigmoid}[(0, 0)] [id A] ''   2
 |CGemv{no_inplace} [id B] ''   1
   |b [id C]
   |TensorConstant{1.0} [id D]
   |InplaceDimShuffle{1,0} [id E] 'W.T'   0
   | |W [id F]
   |x [id G]
   |TensorConstant{1.0} [id D]
In [9]:
from theano.printing import pydotprint
pydotprint(f, outfile='pydotprint_f.png')
The output file is available at pydotprint_f.png
In [10]:
from IPython.display import Image
Image('pydotprint_f.png', width=1000)
Out[10]:
[image: pydotprint_f.png]
In [11]:
pydotprint(g, outfile='pydotprint_g.png')
Image('pydotprint_g.png', width=1000)
The output file is available at pydotprint_g.png
Out[11]:
[image: pydotprint_g.png]
In [12]:
pydotprint(h, outfile='pydotprint_h.png')
Image('pydotprint_h.png', width=1000)
The output file is available at pydotprint_h.png
Out[12]:
[image: pydotprint_h.png]

Executing a Theano function

In [13]:
import numpy as np
np.random.seed(42)
W_val = np.random.randn(4, 3)
x_val = np.random.rand(4)
b_val = np.ones(3)

f(x_val, W_val)
Out[13]:
array([ 1.79048354,  0.03158954, -0.26423186])
In [14]:
g(x_val, W_val, b_val)
Out[14]:
array([ 0.9421594 ,  0.73722395,  0.67606977])
In [15]:
h(x_val, W_val, b_val)
Out[15]:
[array([ 1.79048354,  0.03158954, -0.26423186]),
 array([ 0.9421594 ,  0.73722395,  0.67606977])]
In [16]:
i(x_val, W_val, b_val)
Out[16]:
[array([ 2.79048354,  1.03158954,  0.73576814]),
 array([ 0.9421594 ,  0.73722395,  0.67606977])]
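
As a sanity check (not in the original notebook), the compiled functions agree with the equivalent NumPy computations:

assert np.allclose(f(x_val, W_val), x_val.dot(W_val))
assert np.allclose(g(x_val, W_val, b_val),
                   1. / (1. + np.exp(-(x_val.dot(W_val) + b_val))))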

Graph definition and syntax

Graph structure

In [17]:
pydotprint(f, compact=False, outfile='pydotprint_f_notcompact.png')
Image('pydotprint_f_notcompact.png', width=1000)
The output file is available at pydotprint_f_notcompact.png
Out[17]:
[image: pydotprint_f_notcompact.png]

Strong typing

Broadcasting tensors

In [18]:
r = T.row('r')
print(r.broadcastable)
(True, False)
In [19]:
c = T.col('c')
print(c.broadcastable)
(False, True)
In [20]:
f = theano.function([r, c], r + c)
print(f([[1, 2, 3]], [[.1], [.2]]))
[[ 1.1  2.1  3.1]
 [ 1.2  2.2  3.2]]
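
The same broadcast patterns can be built from an ordinary vector with dimshuffle, where 'x' inserts a new broadcastable axis:

v = T.vector('v')
print(v.dimshuffle('x', 0).broadcastable)  # (True, False), like a row
print(v.dimshuffle(0, 'x').broadcastable)  # (False, True), like a col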

Graph transformations

Substitution and cloning

The givens keyword

In [21]:
x_ = T.vector('x_')
x_n = (x_ - x_.mean()) / x_.std()
f_n = theano.function([x_, W], dot, givens={x: x_n})
f_n(x_val, W_val)
Out[21]:
array([ 1.90651511,  0.60431744, -0.64253361])

Cloning with replacement

In [22]:
dot_n, out_n = theano.clone([dot, out], replace={x: (x - x.mean()) / x.std()})
f_n = theano.function([x, W], dot_n)
f_n(x_val, W_val)
Out[22]:
array([ 1.90651511,  0.60431744, -0.64253361])
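
Both approaches build the same computation; a quick NumPy cross-check (not part of the original notebook):

x_norm = (x_val - x_val.mean()) / x_val.std()
assert np.allclose(f_n(x_val, W_val), x_norm.dot(W_val))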

Gradient

Using theano.grad

In [23]:
y = T.vector('y')
C = ((out - y) ** 2).sum()
dC_dW = theano.grad(C, W)
dC_db = theano.grad(C, b)
# dC_dW, dC_db = theano.grad(C, [W, b])
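
theano.grad differentiates any scalar expression with respect to its inputs. As a minimal self-contained illustration (not from the original notebook):

a = T.scalar('a')
da = theano.grad(a ** 2, a)           # symbolic expression for 2 * a
print(theano.function([a], da)(3.0))  # 6.0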

Using the gradients

In [24]:
cost_and_grads = theano.function([x, W, b, y], [C, dC_dW, dC_db])
y_val = np.random.uniform(size=3)
print(cost_and_grads(x_val, W_val, b_val, y_val))
[array(0.6137821438190066), array([[ 0.01095277,  0.07045955,  0.051161  ],
       [ 0.01889131,  0.12152849,  0.0882424 ],
       [ 0.01555008,  0.10003427,  0.07263534],
       [ 0.01048429,  0.06744584,  0.04897273]]), array([ 0.03600015,  0.23159028,  0.16815877])]
In [25]:
upd_W = W - 0.1 * dC_dW
upd_b = b - 0.1 * dC_db
cost_and_upd = theano.function([x, W, b, y], [C, upd_W, upd_b])
print(cost_and_upd(x_val, W_val, b_val, y_val))
[array(0.6137821438190066), array([[ 0.49561888, -0.14531026,  0.64257244],
       [ 1.52114073, -0.24630622, -0.2429612 ],
       [ 1.57765781,  0.7574313 , -0.47673792],
       [ 0.54151161, -0.47016228, -0.47062703]]), array([ 0.99639999,  0.97684097,  0.98318412])]
In [26]:
pydotprint(cost_and_upd, outfile='pydotprint_cost_and_upd.png')
Image('pydotprint_cost_and_upd.png', width=1000)
The output file is available at pydotprint_cost_and_upd.png
Out[26]:
[image: pydotprint_cost_and_upd.png]

Shared variables

Update values

In [27]:
C_val, dC_dW_val, dC_db_val = cost_and_grads(x_val, W_val, b_val, y_val)
W_val -= 0.1 * dC_dW_val
b_val -= 0.1 * dC_db_val

C_val, W_val, b_val = cost_and_upd(x_val, W_val, b_val, y_val)
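
Iterating this by hand gives a crude gradient-descent loop. A sketch (temporary names keep W_val and b_val intact for the cells below; note that every call copies the parameters into and out of the function, which is exactly what shared variables avoid):

W_tmp, b_tmp = W_val, b_val
for step in range(5):
    C_tmp, W_tmp, b_tmp = cost_and_upd(x_val, W_tmp, b_tmp, y_val)
    print(C_tmp)  # should decrease from step to step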

Using shared variables

In [28]:
x = T.vector('x')
y = T.vector('y')
W = theano.shared(W_val)
b = theano.shared(b_val)
dot = T.dot(x, W)
out = T.nnet.sigmoid(dot + b)
f = theano.function([x], dot)  # W is an implicit input
g = theano.function([x], out)  # W and b are implicit inputs
print(f(x_val))
[ 1.78587062  0.00189954 -0.28566499]
In [29]:
print(g(x_val))
[ 0.94151144  0.72221187  0.66391952]
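
Shared variables hold their state inside Theano; get_value and set_value read and write it from Python:

print(W.get_value().shape)  # (4, 3)
b.set_value(np.zeros(3))    # overwrite the stored bias...
b.set_value(b_val)          # ...and restore it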

Updating shared variables

In [30]:
C = ((out - y) ** 2).sum()
dC_dW, dC_db = theano.grad(C, [W, b])
upd_W = W - 0.1 * dC_dW
upd_b = b - 0.1 * dC_db

cost_and_perform_updates = theano.function(
    inputs=[x, y],
    outputs=C,
    updates=[(W, upd_W),
             (b, upd_b)])
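
Because the updates list is applied on every call, calling this function repeatedly performs gradient descent on the shared W and b directly; a usage sketch:

for step in range(5):
    print(cost_and_perform_updates(x_val, y_val))
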
In [31]:
pydotprint(cost_and_perform_updates, outfile='pydotprint_cost_and_perform_updates.png')
Image('pydotprint_cost_and_perform_updates.png', width=1000)
The output file is available at pydotprint_cost_and_perform_updates.png
Out[31]:
[image: pydotprint_cost_and_perform_updates.png]