We will use the LOF method to mark a percentage of examples of the Iris dataset as outliers. This method computes an outlierness factor for each example and flags the given percentage of examples with the highest values.
from sklearn import datasets
from sklearn.neighbors import LocalOutlierFactor
from pylab import *
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
You can play with the number of neighbors used to compute the LOF.
# Fit Local Outlier Factor on the iris measurements and flag roughly
# OUTLIERS (10%) of the examples as outliers.
iris = datasets.load_iris()
NEIGHBORS = 25   # neighborhood size used to compute the LOF score
OUTLIERS = 0.1   # fraction of examples to flag as outliers

detector = LocalOutlierFactor(n_neighbors=NEIGHBORS, contamination=OUTLIERS)
labels = detector.fit_predict(iris['data'])

# Left panel: inlier (+1) / outlier (-1) labels.
# Right panel: the raw (negative) LOF score of every example.
fig = plt.figure(figsize=(16, 6))
ax_labels = plt.subplot(1, 2, 1)
ax_labels.scatter(iris['data'][:, 2], iris['data'][:, 1], c=labels, s=100)
ax_labels.set_title('Outliers/Inliers')
ax_scores = plt.subplot(1, 2, 2)
ax_scores.scatter(iris['data'][:, 2], iris['data'][:, 1],
                  c=detector.negative_outlier_factor_, s=100)
ax_scores.set_title('LOF')
<matplotlib.text.Text at 0x7f972d152160>
from amltlearn.preprocessing import Discretizer
We obtain a corrupted copy of the iris dataset by adding some missing values: 75 randomly drawn (row, column) positions — repeats are possible — distributed over the four dimensions.
# Prepare the corruption: draw 75 random (row, column) positions (repeats
# possible) that will later be turned into missing values.
# NOTE(review): sklearn.preprocessing.Imputer was removed in scikit-learn
# 0.22 — on modern versions use sklearn.impute.SimpleImputer instead.
from sklearn.preprocessing import Imputer
from numpy.random import randint

iris = datasets.load_iris()
dimX, dimY = iris['data'].shape
lrandX = randint(dimX, size=75)  # row indices of the values to corrupt
lrandY = randint(dimY, size=75)  # column indices of the values to corrupt

# Color each example by its class (r/g/b) and mark in yellow the examples
# that will receive at least one missing value.
lcols = [['r', 'g', 'b'][i] for i in iris['target']]
for i in lrandX:
    lcols[i] = 'y'  # original lost this indent: it must be inside the loop

fig = plt.figure(figsize=(8, 8))
plt.scatter(iris['data'][:, 2], iris['data'][:, 1], c=lcols, s=100)
<matplotlib.collections.PathCollection at 0x7f2a9c0539b0>
The graphic shows the original data marking in yellow the examples that are going to be corrupted.
# Kernel density estimate (with a rug of the data points) of the values of
# dimension `vshow` in the original, still-uncorrupted data.
# seaborn.distplot was deprecated in 0.11 and removed in 0.14;
# kdeplot(fill=True) + rugplot reproduces the previous plot.
vshow = 1  # which of the four iris dimensions to display
sns.kdeplot(iris['data'][:, vshow], color="g", fill=True)
sns.rugplot(iris['data'][:, vshow], color="g")
plt.show()
This is a kernel density estimation of the distribution of the values of dimension 1 for the original data (no missing values).
Now we corrupt the data and we apply a missing-values imputer algorithm to complete it; in this case we substitute the missing values using the mean of the attribute.
# Keep an intact copy of the data so the distributions can be compared later.
orig = iris['data'].copy()

# Inject the missing values at the randomly chosen (row, column) positions.
for x, y in zip(lrandX, lrandY):
    iris['data'][x, y] = float('NaN')  # original lost this indent

# Impute every missing value with the mean of its attribute (column).
# sklearn.preprocessing.Imputer was removed in scikit-learn 0.22;
# SimpleImputer is its drop-in replacement for this use.
from sklearn.impute import SimpleImputer
imp = SimpleImputer(missing_values=float('NaN'), strategy='mean')
imp_iris = imp.fit_transform(iris['data'])

fig = plt.figure(figsize=(8, 8))
plt.scatter(imp_iris[:, 2], imp_iris[:, 1], c=lcols, s=100)
<matplotlib.collections.PathCollection at 0x7f2a9667f358>
As we can see, all the examples with missing values for the dimensions 1 and 2 appear aligned on the mean of the attributes. It can be seen that the distribution of the dimension 1 has changed, the variance has been reduced.
# Compare the attribute's distribution before (red) and after (green) mean
# imputation: the imputed values pile up at the mean, shrinking the variance.
# (distplot was removed in recent seaborn; kdeplot + rugplot replaces it.)
sns.kdeplot(orig[:, vshow], color="r", fill=True)
sns.rugplot(orig[:, vshow], color="r")
sns.kdeplot(imp_iris[:, vshow], color="g", fill=True)
sns.rugplot(imp_iris[:, vshow], color="g")
plt.show()
Now we use the most frequent value of the attribute to impute the missing values
# Impute every missing value with the most frequent value (mode) of its
# attribute. sklearn.preprocessing.Imputer was removed in scikit-learn 0.22;
# SimpleImputer is its drop-in replacement for this use.
from sklearn.impute import SimpleImputer
imp = SimpleImputer(missing_values=float('NaN'), strategy='most_frequent')
imp_iris = imp.fit_transform(iris['data'])

fig = plt.figure(figsize=(8, 8))
plt.scatter(imp_iris[:, 2], imp_iris[:, 1], c=lcols, s=100)
<matplotlib.collections.PathCollection at 0x7f2a96e7a780>
As expected, the imputed examples now appear aligned on the most frequent value and the variance is also reduced
# Compare the distribution before (red) and after (green) most-frequent-value
# imputation: mass concentrates on the mode, again reducing variance.
# (distplot was removed in recent seaborn; kdeplot + rugplot replaces it.)
sns.kdeplot(orig[:, vshow], color="r", fill=True)
sns.rugplot(orig[:, vshow], color="r")
sns.kdeplot(imp_iris[:, vshow], color="g", fill=True)
sns.rugplot(imp_iris[:, vshow], color="g")
plt.show()
Now we are going to use the Euclidean distance to determine the closest examples and substitute each missing value with the mean of the values of its 3 nearest neighbors.
# Impute each missing value with the mean of the values of the 3 nearest
# neighbors (Euclidean distance over the observed attributes).
from amltlearn.preprocessing import KnnImputer

knn_imputer = KnnImputer(missing_values='NaN', n_neighbors=3)
imp_iris = knn_imputer.fit_transform(iris['data'])

fig = plt.figure(figsize=(8, 8))
plt.scatter(imp_iris[:, 2], imp_iris[:, 1], c=lcols, s=100)
<matplotlib.collections.PathCollection at 0x7f2a9c08c1d0>
As we can see the examples look more naturally distributed and now the distribution of the attributes looks more similar to the original one.
# Compare the distribution before (red) and after (green) k-NN imputation:
# the imputed values follow the local structure, so the green curve stays
# much closer to the original one.
# (distplot was removed in recent seaborn; kdeplot + rugplot replaces it.)
sns.kdeplot(orig[:, vshow], color="r", fill=True)
sns.rugplot(orig[:, vshow], color="r")
sns.kdeplot(imp_iris[:, vshow], color="g", fill=True)
sns.rugplot(imp_iris[:, vshow], color="g")
plt.show()
Change the vshow variable from 1 to 2 to see what happens on the other dimension