Follow-up on http://stackoverflow.com/questions/15062205/finding-start-and-stops-of-consecutive-values-block-in-python-numpy-pandas

In [1]:
import numpy as np
def find_nans_blocks_length_np(a):
    """
    My numpy solution based on mask of nans and np.where.

    Returns the length of each run of consecutive NaNs *within a row*
    of the 2D array `a`, in row-major order (runs never merge across
    row boundaries, unlike the pandas variant below).

    Parameters
    ----------
    a : 2D float ndarray

    Returns
    -------
    1D int ndarray, one entry per NaN run.
    """
    nans_mask = np.isnan(a)
    # One all-False column of padding on each side so runs touching the
    # array edges still produce a +1 / -1 transition in the diff below.
    pad = np.zeros((a.shape[0], 1), dtype=bool)
    # Column of the first NaN of each run: 0 -> 1 transition.
    start_row_idx, start_col_idx = np.where(np.diff(
        np.hstack((pad, nans_mask)).astype(int)
    ) == 1)
    # Column of the last NaN of each run (inclusive): 1 -> 0 transition.
    stop_row_idx, stop_col_idx = np.where(np.diff(
        np.hstack((nans_mask, pad)).astype(int)
    ) == -1)

    return stop_col_idx - start_col_idx + 1

In [2]:
import pandas as pd
def find_nans_blocks_length_pd(df):
    """
    Zelazny7's solution based on pandas isnull/notnull and groupby magic.

    Flattens the frame in row-major order, labels each position with the
    running count of non-null values (so every NaN run shares one label),
    then sums the NaN indicator per label.  NB: a run ending one row and
    a run starting the next are merged into a single block.
    """
    flat = df.T.unstack()
    run_labels = flat.notnull().astype(int).cumsum()
    run_lengths = flat.isnull().astype(int).groupby(run_labels).sum()
    return run_lengths[run_lengths > 0]


Unless I am wrong, Zelazny7's solution merges consecutive NaN runs from different rows:

In [3]:
# Demo array: per-row NaN runs of lengths 2 | 1, 2 | 4.
a = np.array([
[1, np.nan, np.nan, 3],
[np.nan, 1, np.nan, np.nan],
[np.nan, np.nan, np.nan,np.nan]
])
# Last expression of the cell -> displayed as Out[3]: array([2, 1, 2, 4]).
find_nans_blocks_length_np(a)

Out[3]:
array([2, 1, 2, 4], dtype=int64)

In [4]:
# Same data wrapped as a DataFrame for the pandas solution.
df = pd.DataFrame(a)
df

Out[4]:
0 1 2 3
0 1 NaN NaN 3
1 NaN 1 NaN NaN
2 NaN NaN NaN NaN
In [5]:
# Note the 6.: the trailing 2-NaN run of row 1 and the all-NaN row 2
# are merged into one block because the frame is flattened row-major.
find_nans_blocks_length_pd(df).values

Out[5]:
array([ 2.,  1.,  6.])


Setting up a small benchmark to evaluate performance:

In [6]:
# Benchmark fixture: a 100x100 random array with ~50% of positions
# (with replacement, so fewer unique cells) set to NaN.
p_nans = .5
a_shape = (100,100)
a = np.random.rand(*a_shape)
# The original passed a float as randint's `size` (rejected by modern
# numpy) and used a_shape[1]*a_shape[1] for the column count — an axis
# typo that only worked because the shape is square.  Both fixed here.
n_nans = int(p_nans * a_shape[0] * a_shape[1])
a[np.random.randint(0, a_shape[0], n_nans),
  np.random.randint(0, a_shape[1], n_nans)] = np.nan
df = pd.DataFrame(a)

In [7]:
# matplotlib was never imported earlier in the notebook, so this cell
# failed on Restart & Run All; import it here before plotting.
import matplotlib.pyplot as plt

# Overlay the run-length distributions produced by the two solutions.
plt.hist([find_nans_blocks_length_np(a),find_nans_blocks_length_pd(df)])
leg = plt.legend(['numpy way','pandas way'])

In [8]:
# IPython line magics: time each implementation on the 100x100 fixture.
%timeit find_nans_blocks_length_np(a)
%timeit find_nans_blocks_length_pd(df)

1000 loops, best of 3: 846 us per loop
100 loops, best of 3: 3.54 ms per loop