This contains the implementations of the benchmarks described at http://jakevdp.github.com/blog/2012/08/08/memoryview-benchmarks.
Here we'll use ipython's cython magic to compile and run the benchmarks.
%load_ext cythonmagic
# Define our test array
import numpy as np
X = np.random.random((500, 3))
import numpy as np
def euclidean_distance(x1, x2):
x1 = np.asarray(x1)
x2 = np.asarray(x2)
return np.sqrt(np.sum((x1 - x2) ** 2))
def pairwise_v1(X, metric=euclidean_distance):
X = np.asarray(X)
n_samples, n_dim = X.shape
D = np.empty((n_samples, n_samples))
for i in range(n_samples):
for j in range(n_samples):
D[i, j] = metric(X[i], X[j])
return D
%timeit pairwise_v1(X)
1 loops, best of 3: 6.6 s per loop
%%cython
import numpy as np
cimport numpy as np
from libc.math cimport sqrt
cimport cython
# define a function pointer to a metric
ctypedef double (*metric_ptr)(np.ndarray, np.ndarray)
@cython.boundscheck(False)
@cython.wraparound(False)
cdef double euclidean_distance(np.ndarray[double, ndim=1, mode='c'] x1,
np.ndarray[double, ndim=1, mode='c'] x2):
cdef double tmp, d
cdef np.intp_t i, N
d = 0
N = x1.shape[0]
# assume x2 has the same shape as x1. This could be dangerous!
for i in range(N):
tmp = x1[i] - x2[i]
d += tmp * tmp
return sqrt(d)
@cython.boundscheck(False)
@cython.wraparound(False)
def pairwise_v2(np.ndarray[double, ndim=2, mode='c'] X not None,
metric = 'euclidean'):
cdef metric_ptr dist_func
if metric == 'euclidean':
dist_func = &euclidean_distance
else:
raise ValueError("unrecognized metric")
cdef np.intp_t i, j, n_samples
n_samples = X.shape[0]
cdef np.ndarray[double, ndim=2, mode='c'] D = np.empty((n_samples,
n_samples))
for i in range(n_samples):
for j in range(n_samples):
D[i, j] = dist_func(X[i], X[j])
return D
%timeit pairwise_v2(X)
1 loops, best of 3: 668 ms per loop
%%cython
import numpy as np
cimport numpy as np
from libc.math cimport sqrt
cimport cython
# define a function pointer to a metric
ctypedef double (*metric_ptr)(double[::1], double[::1])
@cython.boundscheck(False)
@cython.wraparound(False)
cdef double euclidean_distance(double[::1] x1,
double[::1] x2):
cdef double tmp, d
cdef np.intp_t i, N
d = 0
N = x1.shape[0]
# assume x2 has the same shape as x1. This could be dangerous!
for i in range(N):
tmp = x1[i] - x2[i]
d += tmp * tmp
return sqrt(d)
@cython.boundscheck(False)
@cython.wraparound(False)
def pairwise_v3(double[:, ::1] X not None,
metric = 'euclidean'):
cdef metric_ptr dist_func
if metric == 'euclidean':
dist_func = &euclidean_distance
else:
raise ValueError("unrecognized metric")
cdef np.intp_t i, j, n_samples
n_samples = X.shape[0]
cdef double[:, ::1] D = np.empty((n_samples, n_samples))
for i in range(n_samples):
for j in range(n_samples):
D[i, j] = dist_func(X[i], X[j])
return D
%timeit pairwise_v3(X)
10 loops, best of 3: 22 ms per loop
%%cython
import numpy as np
cimport numpy as np
from libc.math cimport sqrt
cimport cython
# define a function pointer to a metric
ctypedef double (*metric_ptr)(double*, double*, int)
@cython.boundscheck(False)
@cython.wraparound(False)
cdef double euclidean_distance(double* x1,
double* x2,
int N):
cdef double tmp, d
cdef np.intp_t i
d = 0
for i in range(N):
tmp = x1[i] - x2[i]
d += tmp * tmp
return sqrt(d)
@cython.boundscheck(False)
@cython.wraparound(False)
def pairwise_v4(double[:, ::1] X not None,
metric = 'euclidean'):
cdef metric_ptr dist_func
if metric == 'euclidean':
dist_func = &euclidean_distance
else:
raise ValueError("unrecognized metric")
cdef np.intp_t i, j, n_samples, n_dim
n_samples = X.shape[0]
n_dim = X.shape[1]
cdef double[:, ::1] D = np.empty((n_samples, n_samples))
cdef double* Dptr = &D[0, 0]
cdef double* Xptr = &X[0, 0]
for i in range(n_samples):
for j in range(n_samples):
Dptr[i * n_samples + j] = dist_func(Xptr + i * n_dim,
Xptr + j * n_dim,
n_dim)
return D
%timeit pairwise_v4(X)
100 loops, best of 3: 2.44 ms per loop
%%cython
import numpy as np
cimport numpy as np
from libc.math cimport sqrt
cimport cython
# define a function pointer to a metric
ctypedef double (*metric_ptr)(double[:, ::1], np.intp_t, np.intp_t)
@cython.boundscheck(False)
@cython.wraparound(False)
cdef double euclidean_distance(double[:, ::1] X,
np.intp_t i1, np.intp_t i2):
cdef double tmp, d
cdef np.intp_t j
d = 0
for j in range(X.shape[1]):
tmp = X[i1, j] - X[i2, j]
d += tmp * tmp
return sqrt(d)
@cython.boundscheck(False)
@cython.wraparound(False)
def pairwise_v5(double[:, ::1] X not None,
metric = 'euclidean'):
cdef metric_ptr dist_func
if metric == 'euclidean':
dist_func = &euclidean_distance
else:
raise ValueError("unrecognized metric")
cdef np.intp_t i, j, n_samples, n_dim
n_samples = X.shape[0]
n_dim = X.shape[1]
cdef double[:, ::1] D = np.empty((n_samples, n_samples))
for i in range(n_samples):
for j in range(n_samples):
D[i, j] = dist_func(X, i, j)
return D
missing cimport: /home/vanderplas/.config/ipython/cython/_cython_magic_1c925a99ed742e8928654e6eeb078ef5.pyx cython
%timeit pairwise_v5(X)
100 loops, best of 3: 2.48 ms per loop
Here we'll compare the benchmark to two similar routines from scipy
and scikit-learn
from scipy.spatial.distance import cdist
%timeit cdist(X, X)
100 loops, best of 3: 3.27 ms per loop
from sklearn.metrics import pairwise_distances
%timeit pairwise_distances(X, X)
100 loops, best of 3: 13 ms per loop