# --- Colab setup: install PyTorch 0.4.0 with a wheel matching this runtime ---
# http://pytorch.org/
from os import path
# NOTE(review): wheel.pep425tags was removed in wheel >= 0.35; modern code
# should use packaging.tags instead. Left as-is: this transcript targets the
# wheel version shipped with 2018-era Colab.
from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag
# Assemble the wheel platform tag, e.g. 'cp36-cp36m' (impl+version-abi).
platform = '{}{}-{}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag())
# Choose the CUDA 8.0 wheel when an NVIDIA driver is present on the VM,
# otherwise fall back to the CPU-only build.
accelerator = 'cu80' if path.exists('/opt/bin/nvidia-smi') else 'cpu'
# IPython shell magic: install the selected torch wheel plus torchvision.
!pip install -q http://download.pytorch.org/whl/{accelerator}/torch-0.4.0-{platform}-linux_x86_64.whl torchvision
# Captured installer output (tcmalloc allocation log from pip):
tcmalloc: large alloc 1073750016 bytes == 0x5bc2e000 @ 0x7f16d1ff21c4 0x46d6a4 0x5fcbcc 0x4c494d 0x54f3c4 0x553aaf 0x54e4c8 0x54f4f6 0x553aaf 0x54efc1 0x54f24d 0x553aaf 0x54efc1 0x54f24d 0x553aaf 0x54efc1 0x54f24d 0x551ee0 0x54e4c8 0x54f4f6 0x553aaf 0x54efc1 0x54f24d 0x551ee0 0x54efc1 0x54f24d 0x551ee0 0x54e4c8 0x54f4f6 0x553aaf 0x54e4c8
# --- Verify the install: torch version and CUDA/GPU visibility ---
import torch
torch.__version__
'0.4.0'
# Confirm a CUDA-capable GPU is usable by this torch build.
torch.cuda.is_available()
True
torch.cuda.device_count()
1
# Index of the currently selected CUDA device (0 = the only GPU here).
dev = torch.cuda.current_device()
dev
0
torch.cuda.get_device_name(dev)
'Tesla K80'
# --- Baseline: 1000x1000 float32 matrix multiply on the CPU with NumPy ---
import numpy as np
np.__version__
'1.14.3'
n = 1000
# float32 so the NumPy arrays match the CUDA tensors created from them below.
dtype = 'float32'
x = np.random.rand(n, n).astype(dtype)
y = np.random.rand(n, n).astype(dtype)
print(x)
[[0.64993143 0.54426557 0.4271511 ... 0.63773596 0.35977545 0.47273144] [0.10438471 0.4131443 0.4897633 ... 0.25540075 0.650905 0.9069924 ] [0.2869643 0.6781323 0.76321834 ... 0.55297124 0.2495685 0.347869 ] ... [0.65938854 0.9135049 0.55126035 ... 0.3452831 0.37183738 0.70037305] [0.75186837 0.8833698 0.67125374 ... 0.15030158 0.11645979 0.15519282] [0.8473455 0.7309286 0.38008705 ... 0.7080732 0.81976247 0.6249228 ]]
# @ is the matrix-multiplication operator (PEP 465).
z = x @ y
print(z)
print(type(z))
[[262.6432 268.94064 249.42052 ... 257.21527 254.57413 257.5146 ] [254.75009 264.89435 244.59337 ... 255.7371 248.91745 254.29904] [259.03632 264.5193 246.06383 ... 254.04399 246.53589 254.04001] ... [266.98166 266.02438 251.76076 ... 258.45422 256.73703 264.81305] [249.16425 253.55284 240.94025 ... 245.26656 239.66714 249.19298] [253.71408 257.41022 238.84543 ... 248.72 242.68701 252.66364]] <class 'numpy.ndarray'>
# --- Same multiply on the GPU: copy the NumPy arrays to CUDA tensors ---
# from_numpy shares memory with the ndarray; .cuda() then copies to the GPU.
a = torch.from_numpy(x).cuda()
b = torch.from_numpy(y).cuda()
print(a)
6.4993e-01 5.4427e-01 4.2715e-01 ... 6.3774e-01 3.5978e-01 4.7273e-01 1.0438e-01 4.1314e-01 4.8976e-01 ... 2.5540e-01 6.5091e-01 9.0699e-01 2.8696e-01 6.7813e-01 7.6322e-01 ... 5.5297e-01 2.4957e-01 3.4787e-01 ... ⋱ ... 6.5939e-01 9.1350e-01 5.5126e-01 ... 3.4528e-01 3.7184e-01 7.0037e-01 7.5187e-01 8.8337e-01 6.7125e-01 ... 1.5030e-01 1.1646e-01 1.5519e-01 8.4735e-01 7.3093e-01 3.8009e-01 ... 7.0807e-01 8.1976e-01 6.2492e-01 [torch.cuda.FloatTensor of size 1000x1000 (GPU 0)]
# torch tensors support the same @ operator as NumPy.
c = a @ b
print(c)
print(type(c))
262.6434 268.9405 249.4204 ... 257.2153 254.5742 257.5147 254.7500 264.8945 244.5935 ... 255.7372 248.9176 254.2992 259.0364 264.5194 246.0638 ... 254.0441 246.5359 254.0401 ... ⋱ ... 266.9815 266.0244 251.7610 ... 258.4541 256.7370 264.8131 249.1642 253.5528 240.9402 ... 245.2666 239.6672 249.1931 253.7141 257.4101 238.8456 ... 248.7199 242.6871 252.6635 [torch.cuda.FloatTensor of size 1000x1000 (GPU 0)] <class 'torch.cuda.FloatTensor'>
# Copy the GPU result back to host memory and check it matches the CPU result
# to within float32 tolerance (exact equality is not expected).
np.allclose(c.cpu().numpy(), z)
True
# --- Benchmark CPU vs GPU. The trailing `z.sum()` / `c.sum()` forces a
# reduction so the GPU kernel's result is actually consumed (CUDA calls are
# asynchronous; without a sync point the timing would be misleading). ---
%timeit z = x @ y; z /= z.sum()
%timeit c = a @ b; c /= c.sum()
100 loops, best of 3: 29.7 ms per loop 1000 loops, best of 3: 1.22 ms per loop
# Speedup ratio: CPU time / GPU time from the two %timeit runs above.
(29.7 * 1e-3) / (1.22 * 1e-3)
24.34426229508197
# --- Multinomial sampling: draw N=1e6 trials over n=10 categories ---
N = int(1e6)
n = 10
dtype = 'float32'
# Unnormalized category weights on the CPU, and a GPU copy for later.
x = np.random.random(n).astype(dtype)
y = torch.from_numpy(x).cuda()
# Time NumPy's multinomial on the CPU (cell magic times the whole cell body).
%%timeit
# NOTE(review): `i = 0` appears to be a leftover from an earlier edit of this
# cell; it is never used and only adds trivial overhead to the timed body.
i = 0
# multinomial requires probabilities summing to 1, hence x/x.sum().
np.random.multinomial(N, x/x.sum())
The slowest run took 118.90 times longer than the fastest. This could mean that an intermediate result is being cached. 100000 loops, best of 3: 8.22 µs per loop
# Sanity check: the sample is a vector of counts, one per category.
np.random.multinomial(N, x/x.sum()).shape
(10,)
# --- Time the equivalent draw on the GPU: 10 loops, 3 repeats ---
%%timeit -n 10 -r 3
# torch.distributions.Multinomial normalizes the weights internally, so the
# raw tensor y can be passed without dividing by y.sum().
m = torch.distributions.multinomial.Multinomial(N, y)
# sample() returns per-category counts; dividing by N yields proportions.
# (Cell output not captured in this view of the transcript.)
m.sample() / N