import numpy as np
%reload_ext cythonmagic
%reload_ext memory_profiler
pos1 = np.random.randint(0, 10000000, size=10000000)
pos1 = np.unique(pos1)
pos1.shape
(6321007,)
pos2 = np.random.randint(0, 10000000, size=10000000)
pos2 = np.unique(pos2)
pos2.shape
(6321583,)
pos1
array([ 1, 4, 6, ..., 9999996, 9999998, 9999999])
pos2
array([ 0, 1, 2, ..., 9999993, 9999996, 9999998])
def intersect_positions_impl1(pos1, pos2):
cond1 = np.in1d(pos1, pos2)
cond2 = np.in1d(pos2, pos1)
return cond1, cond2
cond1, cond2 = intersect_positions_impl1(pos1, pos2)
pos1[cond1]
array([ 1, 4, 6, ..., 9999993, 9999996, 9999998])
pos2[cond2]
array([ 1, 4, 6, ..., 9999993, 9999996, 9999998])
np.count_nonzero(cond1)
3996265
%timeit intersect_positions_impl1(pos1, pos2)
1 loops, best of 3: 3.28 s per loop
%memit intersect_positions_impl1(pos1, pos2)
peak memory: 812.40 MiB, increment: 555.64 MiB
%%cython
import numpy as np
cimport numpy as np
def intersect_positions_impl2(np.ndarray[np.int64_t, ndim=1] pos1,
np.ndarray[np.int64_t, ndim=1] pos2):
cdef int i = 0
cdef int j = 0
cdef np.ndarray[np.uint8_t, ndim=1] cond1
cdef np.ndarray[np.uint8_t, ndim=1] cond2
cond1 = np.zeros((pos1.size, ), dtype=np.uint8)
cond2 = np.zeros((pos2.size, ), dtype=np.uint8)
cdef int m = pos1.size
cdef int n = pos2.size
while i < m and j < n:
if pos1[i] < pos2[j]:
# advance left
i += 1
elif pos1[i] > pos2[j]:
# advance right
j += 1
else:
# match
cond1[i] = 1
cond2[j] = 1
# advance both
i += 1
j += 1
return cond1.astype('b1'), cond2.astype('b1')
cond1, cond2 = intersect_positions_impl2(pos1, pos2)
pos1[cond1]
array([ 1, 4, 6, ..., 9999993, 9999996, 9999998])
pos2[cond2]
array([ 1, 4, 6, ..., 9999993, 9999996, 9999998])
np.count_nonzero(cond1)
3996265
%timeit intersect_positions_impl2(pos1, pos2)
10 loops, best of 3: 94.5 ms per loop
%memit intersect_positions_impl2(pos1, pos2)
peak memory: 300.15 MiB, increment: 0.17 MiB
pos1.nbytes / 1.e6
50.568056