import numpy as np
import pandas as pd
# Record the library versions so the benchmark timings below are reproducible.
print("numpy", np.__version__)
print("pandas", pd.__version__)
numpy 1.21.2 pandas 1.4.1
# number of integers (rows) generated for every benchmark below
N = 1_000_000
10 groups over N rows.
# Synthesize the benchmark table: four data columns A-D plus a group key M.
# np.random.randint excludes the upper bound, so M takes the 10 keys 100..109.
group_min, group_max = 100, 110
group_values = np.random.randint(group_min, group_max, size=N)
compo = np.random.randint(1, 100, size=(N, 4))
values = np.column_stack((compo, group_values))
df = pd.DataFrame(values, columns=list("ABCDM"))
df.head()
A | B | C | D | M | |
---|---|---|---|---|---|
0 | 46 | 2 | 91 | 90 | 109 |
1 | 28 | 19 | 25 | 70 | 104 |
2 | 49 | 14 | 84 | 85 | 104 |
3 | 66 | 27 | 61 | 30 | 102 |
4 | 59 | 69 | 49 | 24 | 100 |
%%timeit
# Creating the GroupBy object only; pandas does not materialize the groups here.
df = pd.DataFrame(values, columns=list("ABCDM"))
groups = df.groupby("M")
219 µs ± 12.5 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
%%timeit
# Baseline: cost of sorting the 1M group keys alone.
sorted_M = np.sort(group_values)
21.2 ms ± 365 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
sorted_M = np.sort(group_values)
%%timeit
# use already sorted
# return_index gives the first occurrence of each key in the sorted array;
# splitting at those offsets yields one contiguous sub-array per group.
keys, sections = np.unique(sorted_M, return_index=True)
group_list = np.split(
sorted_M,
sections[1:]
)
groups = {keys[i]: group_list[i] for i in range(len(keys))}
8.33 ms ± 222 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%%prun -s cumulative -l 20
# Profile the full numpy pipeline; the report below shows the sort dominating.
sorted_M = np.sort(group_values)
keys, sections = np.unique(sorted_M, return_index=True)
group_list = np.split(
sorted_M,
sections[1:]
)
groups = {keys[i]: group_list[i] for i in range(len(keys))}
121 function calls (109 primitive calls) in 0.034 seconds Ordered by: cumulative time List reduced from 33 to 20 due to restriction <20> ncalls tottime percall cumtime percall filename:lineno(function) 1 0.000 0.000 0.034 0.034 {built-in method builtins.exec} 1 0.001 0.001 0.034 0.034 <string>:1(<module>) 15/3 0.000 0.000 0.033 0.011 {built-in method numpy.core._multiarray_umath.implement_array_function} 1 0.000 0.000 0.024 0.024 <__array_function__ internals>:2(sort) 1 0.000 0.000 0.024 0.024 fromnumeric.py:846(sort) 1 0.023 0.023 0.023 0.023 {method 'sort' of 'numpy.ndarray' objects} 1 0.000 0.000 0.009 0.009 <__array_function__ internals>:2(unique) 1 0.002 0.002 0.009 0.009 arraysetops.py:138(unique) 1 0.003 0.003 0.007 0.007 arraysetops.py:320(_unique1d) 1 0.002 0.002 0.002 0.002 {method 'argsort' of 'numpy.ndarray' objects} 1 0.002 0.002 0.002 0.002 {method 'copy' of 'numpy.ndarray' objects} 1 0.001 0.001 0.001 0.001 {method 'flatten' of 'numpy.ndarray' objects} 1 0.000 0.000 0.000 0.000 <__array_function__ internals>:2(split) 1 0.000 0.000 0.000 0.000 shape_base.py:799(split) 1 0.000 0.000 0.000 0.000 <__array_function__ internals>:2(array_split) 1 0.000 0.000 0.000 0.000 shape_base.py:739(array_split) 11 0.000 0.000 0.000 0.000 <__array_function__ internals>:2(swapaxes) 11 0.000 0.000 0.000 0.000 fromnumeric.py:550(swapaxes) 11 0.000 0.000 0.000 0.000 fromnumeric.py:51(_wrapfunc) 1 0.000 0.000 0.000 0.000 arraysetops.py:125(_unpack_tuple)
Above, it is clear that the sort is the most time-consuming part of the pipeline.
1100 groups over N rows (keys 100 to 1199, since `randint` excludes the upper bound).
# Same construction with a wider key range: randint's upper bound is exclusive,
# so M spans the 1100 keys 100..1199.
group_min, group_max = 100, 1200
group_values = np.random.randint(group_min, group_max, size=N)
compo = np.random.randint(1, 100, size=(N, 4))
values = np.column_stack((compo, group_values))
df = pd.DataFrame(values, columns=list("ABCDM"))
df.head()
A | B | C | D | M | |
---|---|---|---|---|---|
0 | 19 | 29 | 91 | 63 | 193 |
1 | 73 | 26 | 72 | 78 | 300 |
2 | 29 | 25 | 43 | 53 | 282 |
3 | 24 | 13 | 95 | 8 | 1118 |
4 | 54 | 37 | 75 | 11 | 464 |
%%timeit
# GroupBy creation with ~1100 distinct keys: still lazy, still ~200 µs.
df = pd.DataFrame(values, columns=list("ABCDM"))
groups = df.groupby("M")
210 µs ± 6.12 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
%%timeit
# Sorting cost grows with the key spread (40.9 ms here vs 21.2 ms above).
sorted_M = np.sort(group_values)
40.9 ms ± 946 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
%%timeit
# Same sorted result via argsort + fancy indexing: slower, since it first
# allocates an index array and then gathers.
group_values[group_values.argsort()]
70.3 ms ± 1.82 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
Here, in the case of a 1-D array, np.sort
is more efficient than fancy indexing through argsort()
.
sorted_M = np.sort(group_values)
%%timeit
# run here using already sorted
# Grouping step only, sort excluded.
keys, sections = np.unique(sorted_M, return_index=True)
group_list = np.split(
sorted_M,
sections[1:]
)
groups = {keys[i]: group_list[i] for i in range(len(keys))}
9.61 ms ± 145 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%%prun -s cumulative -l 20
# Profile again with ~1100 groups: the report below shows np.split now
# triggering ~1100 swapaxes calls (one per section).
sorted_M = np.sort(group_values)
keys, sections = np.unique(sorted_M, return_index=True)
group_list = np.split(
sorted_M,
sections[1:]
)
groups = {keys[i]: group_list[i] for i in range(len(keys))}
8841 function calls (7739 primitive calls) in 0.063 seconds Ordered by: cumulative time List reduced from 33 to 20 due to restriction <20> ncalls tottime percall cumtime percall filename:lineno(function) 1 0.000 0.000 0.063 0.063 {built-in method builtins.exec} 1 0.001 0.001 0.063 0.063 <string>:1(<module>) 1105/3 0.000 0.000 0.061 0.020 {built-in method numpy.core._multiarray_umath.implement_array_function} 1 0.000 0.000 0.046 0.046 <__array_function__ internals>:2(sort) 1 0.000 0.000 0.046 0.046 fromnumeric.py:846(sort) 1 0.045 0.045 0.045 0.045 {method 'sort' of 'numpy.ndarray' objects} 1 0.000 0.000 0.011 0.011 <__array_function__ internals>:2(unique) 1 0.003 0.003 0.011 0.011 arraysetops.py:138(unique) 1 0.004 0.004 0.008 0.008 arraysetops.py:320(_unique1d) 1 0.000 0.000 0.004 0.004 <__array_function__ internals>:2(split) 1 0.000 0.000 0.004 0.004 shape_base.py:799(split) 1 0.000 0.000 0.004 0.004 <__array_function__ internals>:2(array_split) 1 0.001 0.001 0.004 0.004 shape_base.py:739(array_split) 1 0.003 0.003 0.003 0.003 {method 'argsort' of 'numpy.ndarray' objects} 1101 0.001 0.000 0.003 0.000 <__array_function__ internals>:2(swapaxes) 1101 0.000 0.000 0.002 0.000 fromnumeric.py:550(swapaxes) 1 0.001 0.001 0.001 0.001 {method 'flatten' of 'numpy.ndarray' objects} 1 0.001 0.001 0.001 0.001 {method 'copy' of 'numpy.ndarray' objects} 1101 0.001 0.000 0.001 0.000 fromnumeric.py:51(_wrapfunc) 1101 0.000 0.000 0.000 0.000 {method 'swapaxes' of 'numpy.ndarray' objects}
https://stackoverflow.com/questions/38013778/
>>> a = np.random.randint(5, size=(10000, 2)) # 5 different "groups"
# Only the sort
>>> a = a[a[:, 0].argsort()]
⏱ 116.9 ms
# Group by on the already sorted table
>>> np.split(a[:, 1], np.unique(a[:, 0], return_index=True)[1][1:])
⏱ 35.5 ms
# Total sort + groupby
>>> a = a[a[:, 0].argsort()]
>>> np.split(a[:, 1], np.unique(a[:, 0], return_index=True)[1][1:])
⏱ 153.0 ms 👑
# With pandas (cf Piotr answer)
>>> df = pd.DataFrame(a, columns=["key", "val"]) # no timer for this line
>>> df.groupby("key").val.apply(pd.Series.tolist)
⏱ 362.3 ms
# 10,000 rows; column 0 is the key (values 0..4), column 1 the value.
a = np.random.randint(5, size=(10000, 2))  # 5 different "groups"
# Sort the rows by the key column.
%timeit a[a[:, 0].argsort()]
298 µs ± 1.19 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
a = a[a[:, 0].argsort()]
# Group-by on the already-sorted table: split the values at the first index
# of each new key.
%timeit np.split(a[:, 1], np.unique(a[:, 0], return_index=True)[1][1:])
54.2 µs ± 570 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
a = np.random.randint(5, size=(10000, 2))  # 5 different "groups"
%%timeit
# pandas equivalent that materializes each group as a Python list.
df = pd.DataFrame(a, columns=["key", "val"])
df.groupby("key").val.apply(pd.Series.tolist)
1.11 ms ± 67.6 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
%%timeit
# GroupBy creation only — lazy, hence far cheaper than the apply above.
df = pd.DataFrame(a, columns=["key", "val"])
df.groupby("key")
211 µs ± 9.29 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
The apply
and tolist
operations are quite expensive. Removing them, the
sort-plus-group operation in numpy is slower than pandas — note, however,
that df.groupby alone is lazy and does not yet materialize the groups.
Using the groupby object of Pandas, you can access a given group using the get_group
method, which is fast.
df = pd.DataFrame(a, columns=["key", "val"])
groupby = df.groupby("key")
# Cost of extracting one group's rows from the GroupBy object.
%timeit groupby.get_group(4)
99.7 µs ± 3.88 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
The get_group
is quite slow, but in the case of only 5 groups, you call it
at most 5 times.
# Same experiment with 500 distinct keys.
a = np.random.randint(500, size=(10000, 2))  # 500 different "groups"
%timeit a[a[:, 0].argsort()]
544 µs ± 3.2 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
a = a[a[:, 0].argsort()]
# With 500 keys the split step dominates (667 µs vs 54 µs for 5 groups).
%timeit np.split(a[:, 1], np.unique(a[:, 0], return_index=True)[1][1:])
667 µs ± 2.33 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
a = np.random.randint(500, size=(10000, 2))  # 500 different "groups"
%%timeit
df = pd.DataFrame(a, columns=["key", "val"])
df.groupby("key").val.apply(pd.Series.tolist)
7.79 ms ± 305 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%%timeit
# GroupBy creation stays ~200 µs regardless of the number of distinct keys.
df = pd.DataFrame(a, columns=["key", "val"])
df.groupby("key")
225 µs ± 18.9 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
df = pd.DataFrame(a, columns=["key", "val"])
groupby = df.groupby("key")
# Per-group extraction cost, now with 500 groups available.
%timeit groupby.get_group(4)
75.3 µs ± 1.33 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
The get_group
call is quite slow, and with 500 groups it may be called
up to 500 times.
print(N)
1000000
# Scale up: N rows with 500 distinct keys.
a = np.random.randint(500, size=(N, 2))  # 500 different "groups"
%timeit a[a[:, 0].argsort()]
109 ms ± 1.07 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
a = a[a[:, 0].argsort()]
# Grouping step on the sorted table at N rows.
%timeit np.split(a[:, 1], np.unique(a[:, 0], return_index=True)[1][1:])
8.22 ms ± 98.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
a = np.random.randint(500, size=(N, 2))  # 500 different "groups"
%%timeit
df = pd.DataFrame(a, columns=["key", "val"])
df.groupby("key").val.apply(pd.Series.tolist)
128 ms ± 4.09 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
%%timeit
# GroupBy creation is still ~200 µs even at N = 1,000,000 rows (lazy).
df = pd.DataFrame(a, columns=["key", "val"])
df.groupby("key")
210 µs ± 5.22 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
df = pd.DataFrame(a, columns=["key", "val"])
groupby = df.groupby("key")
# Extraction of one group out of 500, at N rows.
%timeit groupby.get_group(40)
114 µs ± 670 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)