#!/usr/bin/env python
# coding: utf-8

# In[1]:

get_ipython().run_line_magic('matplotlib', 'inline')

# # Kernel Density Estimation
#
# This notebook presents and compares several ways to compute the Kernel Density Estimation (KDE) of the probability density function (PDF) of a random variable. KDE plots are available in the usual Python data analysis and visualization packages such as [pandas](https://pandas.pydata.org/) or [seaborn](https://seaborn.pydata.org/). These packages rely on statistics packages to compute the KDE, and this notebook shows how to compute the KDE either by hand or using [scipy](https://docs.scipy.org).
#
# For a more complete treatment of KDE, you should read [this article](https://jakevdp.github.io/PythonDataScienceHandbook/05.13-kernel-density-estimation.html).

# In[2]:

import numpy as np
import scipy as sp
import scipy.integrate  # make sp.integrate available (used in my_kde below)
from scipy.stats import gaussian_kde
from scipy.stats import norm
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

plt.rcParams["figure.figsize"] = (9, 6)
plt.rcParams["font.size"] = 18

# just for fun
# plt.xkcd(scale=.5, length=50);

# The results strongly depend on the versions of the packages used. For example, `seaborn` computes the KDE using either `scipy` or `statsmodels`, and depending on the `scipy` version, the meaning of the KDE width is not the same. Hereafter are the package versions used in this notebook.

# In[3]:

print(f"Numpy: {np.__version__}")
print(f"Scipy: {sp.__version__}")
print(f"Pandas: {pd.__version__}")
print(f"Seaborn: {sns.__version__}")

# ## Sample
#
# This is the data sample that will be used in this notebook.

# In[4]:

data = [-20.31275116, -18.3594738, -18.3553103, -14.18406452, -11.67305,
        -11.38179997, -11.3761126, -10.6904519, -10.68305023, -10.34148,
        -8.75222277, -8.7498553, -6.00130727, 1.45761078, 1.77479,
        1.78314794, 2.6612791]

# `seaborn` is a good option to get a quick view of the data. You can plot the data as a *rug plot* together with the KDE. In `seaborn`, the default kernel is a Gaussian kernel.

# In[5]:

sns.distplot(data, hist=False, rug=True, axlabel="Something ?",
             kde_kws=dict(label="kde"),
             rug_kws=dict(height=.2, linewidth=2, color="C1", label="data"))
plt.legend();

# ## Compute the Gaussian KDE by hand
#
# A Gaussian KDE is a sum of normal functions of a given width, centered on the sample values:
#
# $$\hat{f}(x) = \frac{1}{N} \sum_{i=1}^{N} \mathcal{N}(x \,;\, x_i, h^2)$$
#
# where the $x_i$ are the $N$ sample values and $h$ is the width of the normal functions. It can be computed as follows:

# In[6]:

def my_kde(data, width=1, gridsize=100, normalized=True, bounds=None):
    """ Compute the Gaussian KDE from the given sample.

    Args:
        data (array or list): sample of values
        width (float): width of the normal functions
        gridsize (int): number of grid points on which the KDE is computed
        normalized (bool): if True the KDE is normalized (default)
        bounds (tuple): min and max values of the KDE grid

    Returns:
        The grid and the KDE
    """
    # boundaries
    if bounds:
        xmin, xmax = bounds
    else:
        xmin = min(data) - 3 * width
        xmax = max(data) + 3 * width

    # grid points
    x = np.linspace(xmin, xmax, gridsize)

    # compute the KDE as a sum of normal functions centered on the data
    kde = np.zeros(gridsize)
    for val in data:
        kde += norm.pdf(x, loc=val, scale=width)

    # normalize the KDE
    if normalized:
        kde /= sp.integrate.simps(kde, x)

    return x, kde

# Using our function, the KDE is computed as:

# In[7]:

x, kde = my_kde(data, gridsize=200)
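# As a quick sanity check (an addition to the original analysis), the normalized KDE should integrate to one over its grid. A minimal sketch, reusing the same `simps` routine as `my_kde`:

# In[ ]:

# the integral of the normalized KDE over its grid should be ~1
print(sp.integrate.simps(kde, x))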
# Now we can compare with the KDE obtained from `seaborn`. The Gaussian kernel being the default, we only have to give `seaborn` the same bandwidth, that is `1.` here, using the `bw` keyword.

# In[8]:

ax = sns.distplot(data, hist=False, axlabel="Something ?",
                  kde_kws=dict(gridsize=200, bw=1, label="seaborn"))
ax.plot(x, kde, "o", label="my_kde")
plt.legend();

# ## Compute the Gaussian KDE with scipy
#
# Using available methods from reliable packages such as `scipy` is always a better option. In addition to saving time, the functions are generally more efficient and less buggy.
#
# Here we will use the [`gaussian_kde`](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.gaussian_kde.html) class from the `scipy.stats` module.

# In[9]:

# KDE instance
bw = 1. / np.std(data)
g_kde = gaussian_kde(dataset=data, bw_method=bw)

# compute the KDE on a grid
gridsize = 200
g_x = np.linspace(-24, 6, gridsize)
g_kde_values = g_kde(g_x)

# Now we can compare with our previous KDE computed by hand.
#
# **Remark:** In `scipy`, the bandwidth factor multiplies the square root of the covariance of the input data, that is the standard deviation here. Thus, for comparison, the bandwidth is divided by the standard deviation.

# In[10]:

plt.plot(g_x, g_kde_values, label="scipy")
plt.plot(x, kde, "o", label="by hand")
plt.legend();

# There are several methods embedded in the `gaussian_kde` class to integrate or sample using the fitted kernel (a short illustration is given below).
#
# Depending on the `bw_method` argument, the class can also compute an estimate of the bandwidth, which is available as a class attribute.

# In[11]:

g_kde = gaussian_kde(dataset=data, bw_method="scott")
print(f"Bandwidth: {g_kde.factor}")

g_kde = gaussian_kde(dataset=data, bw_method="silverman")
print(f"Bandwidth: {g_kde.factor}")
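# For reference, these two rules of thumb have simple closed forms (as given in the scipy documentation), so the printed factors can be checked by hand, with `n` the sample size and `d` the number of dimensions:

# In[ ]:

# closed forms of the bandwidth rules of thumb, per the scipy documentation
n, d = len(data), 1
print(f"Scott:     {n ** (-1. / (d + 4)):.4f}")
print(f"Silverman: {(n * (d + 2) / 4.) ** (-1. / (d + 4)):.4f}")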
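# As for the integration and sampling methods mentioned above, here is a short illustration using two of them, `integrate_box_1d` and `resample` (method names taken from the scipy documentation):

# In[ ]:

# probability mass below zero, and five new points drawn from the fitted kernel
print(g_kde.integrate_box_1d(-np.inf, 0))
print(g_kde.resample(size=5))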
# If you want to use one of these estimated factors in our `my_kde()` function, don't forget to scale the bandwidth by the standard deviation of the input data.

# In[12]:

# compute the KDE values from the gaussian_kde instance
g_kde = gaussian_kde(dataset=data, bw_method="silverman")
g_kde_values = g_kde(g_x)

# our function with the estimated bandwidth
x, kde = my_kde(data, width=g_kde.factor * np.std(data), gridsize=200)

# plot
plt.plot(g_x, g_kde_values, label="scipy")
plt.plot(x, kde, label="by hand")
plt.legend();

# ### About performance
#
# Hereafter is a comparison of the performance of our function and of scipy's `gaussian_kde` class. You can see that scipy is about three times faster.

# In[13]:

get_ipython().run_line_magic('timeit', 'x, kde = my_kde(data, width=g_kde.factor, gridsize=200)')

# In[14]:

get_ipython().run_cell_magic('timeit', '', '# KDE instance\ng_kde = gaussian_kde(dataset=data, bw_method=0.14)\n\n# compute KDE\ngridsize = 200\ng_x = np.linspace(-24, 6, gridsize)\ng_kde_values = g_kde(g_x)\n')

# ## Comparisons
#
# Now, let us compare the KDEs obtained from our function, `scipy`, `seaborn` and `pandas`.

# In[15]:

df = pd.DataFrame(data, columns=["data"])
df.head()

# Note in the following cell that in `seaborn` (with a Gaussian kernel) the meaning of the bandwidth is the same as in our function: the width of the normal functions summed to obtain the KDE. As `pandas` uses `scipy`, the meaning of the bandwidth is different there, and for comparison, with `scipy` or `pandas`, you have to scale the bandwidth by the standard deviation.
#
# In the plot below, the KDEs computed by hand and using `scipy` are shifted vertically for a better comparison.

# In[16]:

# bandwidth
bw = 1.
bw_sp = bw / np.std(data)

# KDE from our function
ax = plt.subplot()
x, kde = my_kde(data, width=bw, gridsize=200)
ax.plot(x, kde + 0.05, label="by hand")

# scipy
g_kde = gaussian_kde(dataset=data, bw_method=bw_sp)
g_kde_values = g_kde(x)
ax.plot(x, g_kde_values + 0.025, label="scipy")

# pandas
df.data.plot.kde(bw_method=bw_sp, ax=ax, linewidth=3, label="pandas")

# seaborn
sns.distplot(data, hist=False, rug=True, ax=ax, axlabel="Something ?",
             kde_kws=dict(gridsize=200, shade=True, bw=bw, linewidth=0),
             rug_kws=dict(height=.2, linewidth=2, color="C7", label="data"))

# build the legend, adding a patch for the shaded seaborn KDE
lgd = plt.legend(fontsize=16)
lines = lgd.get_lines()
lines.append(mpatches.Patch(color="C3", label="seaborn", alpha=.4))
plt.legend(lines, ["by hand", "scipy", "pandas", "data", "seaborn"], fontsize=16);
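# To summarize the bandwidth conventions compared above: `seaborn` (with a Gaussian kernel) and `my_kde` take the width of the summed normal functions directly, while `scipy` (and therefore `pandas`) expects that width divided by the standard deviation of the data. A small hypothetical helper makes the conversion explicit:

# In[ ]:

def to_scipy_bw(width, data):
    """Convert a kernel width (the seaborn/`my_kde` convention) to the
    factor expected by scipy's `bw_method` (illustrative helper)."""
    return width / np.std(data)

# the value used as `bw_sp` in the comparison above
print(to_scipy_bw(1., data))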