# --- Colab setup: install PyTorch 0.4.0 with a wheel matching this runtime ---
# http://pytorch.org/
from os import path
# NOTE(review): wheel.pep425tags was removed in wheel >= 0.35; modern code
# should use packaging.tags instead. Left as-is: this transcript targets the
# wheel version shipped with 2018-era Colab.
from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag
# Assemble the wheel platform tag, e.g. 'cp36-cp36m' (impl+version-abi).
platform = '{}{}-{}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag())
# Choose the CUDA 8.0 wheel when an NVIDIA driver is present on the VM,
# otherwise fall back to the CPU-only build.
accelerator = 'cu80' if path.exists('/opt/bin/nvidia-smi') else 'cpu'
# IPython shell magic: install the selected torch wheel plus torchvision.
!pip install -q http://download.pytorch.org/whl/{accelerator}/torch-0.4.0-{platform}-linux_x86_64.whl torchvision
# Captured installer output (tcmalloc allocation log from pip):
tcmalloc: large alloc 1073750016 bytes == 0x5bc2e000 @ 0x7f16d1ff21c4 0x46d6a4 0x5fcbcc 0x4c494d 0x54f3c4 0x553aaf 0x54e4c8 0x54f4f6 0x553aaf 0x54efc1 0x54f24d 0x553aaf 0x54efc1 0x54f24d 0x553aaf 0x54efc1 0x54f24d 0x551ee0 0x54e4c8 0x54f4f6 0x553aaf 0x54efc1 0x54f24d 0x551ee0 0x54efc1 0x54f24d 0x551ee0 0x54e4c8 0x54f4f6 0x553aaf 0x54e4c8
# --- Verify the install: torch version and CUDA/GPU visibility ---
import torch
torch.__version__
'0.4.0'
# Confirm a CUDA-capable GPU is usable by this torch build.
torch.cuda.is_available()
True
torch.cuda.device_count()
1
# Index of the currently selected CUDA device (0 = the only GPU here).
dev = torch.cuda.current_device()
dev
0
torch.cuda.get_device_name(dev)
'Tesla K80'
# --- Baseline: 1000x1000 float32 matrix multiply on the CPU with NumPy ---
import numpy as np
np.__version__
'1.14.3'
n = 1000
# float32 so the NumPy arrays match the CUDA tensors created from them below.
dtype = 'float32'
x = np.random.rand(n, n).astype(dtype)
y = np.random.rand(n, n).astype(dtype)
print(x)
[[0.64993143 0.54426557 0.4271511 ... 0.63773596 0.35977545 0.47273144] [0.10438471 0.4131443 0.4897633 ... 0.25540075 0.650905 0.9069924 ] [0.2869643 0.6781323 0.76321834 ... 0.55297124 0.2495685 0.347869 ] ... [0.65938854 0.9135049 0.55126035 ... 0.3452831 0.37183738 0.70037305] [0.75186837 0.8833698 0.67125374 ... 0.15030158 0.11645979 0.15519282] [0.8473455 0.7309286 0.38008705 ... 0.7080732 0.81976247 0.6249228 ]]
# @ is the matrix-multiplication operator (PEP 465).
z = x @ y
print(z)
print(type(z))
[[262.6432 268.94064 249.42052 ... 257.21527 254.57413 257.5146 ] [254.75009 264.89435 244.59337 ... 255.7371 248.91745 254.29904] [259.03632 264.5193 246.06383 ... 254.04399 246.53589 254.04001] ... [266.98166 266.02438 251.76076 ... 258.45422 256.73703 264.81305] [249.16425 253.55284 240.94025 ... 245.26656 239.66714 249.19298] [253.71408 257.41022 238.84543 ... 248.72 242.68701 252.66364]] <class 'numpy.ndarray'>
# --- Same multiply on the GPU: copy the NumPy arrays to CUDA tensors ---
# from_numpy shares memory with the ndarray; .cuda() then copies to the GPU.
a = torch.from_numpy(x).cuda()
b = torch.from_numpy(y).cuda()
print(a)
6.4993e-01 5.4427e-01 4.2715e-01 ... 6.3774e-01 3.5978e-01 4.7273e-01 1.0438e-01 4.1314e-01 4.8976e-01 ... 2.5540e-01 6.5091e-01 9.0699e-01 2.8696e-01 6.7813e-01 7.6322e-01 ... 5.5297e-01 2.4957e-01 3.4787e-01 ... ⋱ ... 6.5939e-01 9.1350e-01 5.5126e-01 ... 3.4528e-01 3.7184e-01 7.0037e-01 7.5187e-01 8.8337e-01 6.7125e-01 ... 1.5030e-01 1.1646e-01 1.5519e-01 8.4735e-01 7.3093e-01 3.8009e-01 ... 7.0807e-01 8.1976e-01 6.2492e-01 [torch.cuda.FloatTensor of size 1000x1000 (GPU 0)]
# torch tensors support the same @ operator as NumPy.
c = a @ b
print(c)
print(type(c))
262.6434 268.9405 249.4204 ... 257.2153 254.5742 257.5147 254.7500 264.8945 244.5935 ... 255.7372 248.9176 254.2992 259.0364 264.5194 246.0638 ... 254.0441 246.5359 254.0401 ... ⋱ ... 266.9815 266.0244 251.7610 ... 258.4541 256.7370 264.8131 249.1642 253.5528 240.9402 ... 245.2666 239.6672 249.1931 253.7141 257.4101 238.8456 ... 248.7199 242.6871 252.6635 [torch.cuda.FloatTensor of size 1000x1000 (GPU 0)] <class 'torch.cuda.FloatTensor'>
# Copy the GPU result back to host memory and check it matches the CPU result
# to within float32 tolerance (exact equality is not expected).
np.allclose(c.cpu().numpy(), z)
True
# --- Benchmark CPU vs GPU. The trailing `z.sum()` / `c.sum()` forces a
# reduction so the GPU kernel's result is actually consumed (CUDA calls are
# asynchronous; without a sync point the timing would be misleading). ---
%timeit z = x @ y; z /= z.sum()
%timeit c = a @ b; c /= c.sum()
100 loops, best of 3: 29.7 ms per loop 1000 loops, best of 3: 1.22 ms per loop
# Speedup ratio: CPU time / GPU time from the two %timeit runs above.
(29.7 * 1e-3) / (1.22 * 1e-3)
24.34426229508197
# --- Multinomial sampling: draw N=1e6 trials over n=10 categories ---
N = int(1e6)
n = 10
dtype = 'float32'
# Unnormalized category weights on the CPU, and a GPU copy for later.
x = np.random.random(n).astype(dtype)
y = torch.from_numpy(x).cuda()
# Time NumPy's multinomial on the CPU (cell magic times the whole cell body).
%%timeit
# NOTE(review): `i = 0` appears to be a leftover from an earlier edit of this
# cell; it is never used and only adds trivial overhead to the timed body.
i = 0
# multinomial requires probabilities summing to 1, hence x/x.sum().
np.random.multinomial(N, x/x.sum())
The slowest run took 118.90 times longer than the fastest. This could mean that an intermediate result is being cached. 100000 loops, best of 3: 8.22 µs per loop
# Sanity check: the sample is a vector of counts, one per category.
np.random.multinomial(N, x/x.sum()).shape
(10,)
# --- Time the equivalent draw on the GPU: 10 loops, 3 repeats ---
%%timeit -n 10 -r 3
# torch.distributions.Multinomial normalizes the weights internally, so the
# raw tensor y can be passed without dividing by y.sum().
m = torch.distributions.multinomial.Multinomial(N, y)
# sample() returns per-category counts; dividing by N yields proportions.
# (Cell output not captured in this view of the transcript.)
m.sample() / N