import numpy as np # as column vectors x = np.random.poisson(5,10) y = np.random.poisson(5,10) # examine the shape of the numpy arrays print x.shape print y.shape covariance_xy = np.cov(x,y, rowvar=0) inv_covariance_xy = np.linalg.inv(covariance_xy) # Examine inverse covariance matrix and shape print inv_covariance_xy print inv_covariance_xy.shape xy_mean = np.mean(x),np.mean(y) x_diff = np.array([x_i - xy_mean[0] for x_i in x]) y_diff = np.array([y_i - xy_mean[1] for y_i in y]) print x_diff.shape diff_xy = np.transpose([x_diff, y_diff]) diff_xy.shape, diff_xy print np.transpose(diff_xy[1]).shape print inv_covariance_xy.shape print diff_xy.shape md = [] for i in range(len(diff_xy)): md.append(np.sqrt(np.dot(np.dot(np.transpose(diff_xy[i]),inv_covariance_xy),diff_xy[i]))) md def MahalanobisDist(x, y): covariance_xy = np.cov(x,y, rowvar=0) inv_covariance_xy = np.linalg.inv(covariance_xy) xy_mean = np.mean(x),np.mean(y) x_diff = np.array([x_i - xy_mean[0] for x_i in x]) y_diff = np.array([y_i - xy_mean[1] for y_i in y]) diff_xy = np.transpose([x_diff, y_diff]) md = [] for i in range(len(diff_xy)): md.append(np.sqrt(np.dot(np.dot(np.transpose(diff_xy[i]),inv_covariance_xy),diff_xy[i]))) return md MahalanobisDist(x,y) def MD_removeOutliers(x, y): MD = MahalanobisDist(x, y) threshold = np.mean(MD) * 1.5 # adjust 1.5 accordingly nx, ny, outliers = [], [], [] for i in range(len(MD)): if MD[i] <= threshold: nx.append(x[i]) ny.append(y[i]) else: outliers.append(i) # position of removed pair return (np.array(nx), np.array(ny), np.array(outliers)) print 'x:', x print 'y:', y MD_removeOutliers(x,y) %load_ext rmagic #%reload_ext rmagic %%R -i DF_diff_xy # list object to be transferred to python here install.packages("ggplot2") # Had to add this for some reason, shouldn't be necessary library(ggplot2) df = data.frame(DF_diff_xy) plot = ggplot(df, aes(x = X, y = Y)) + geom_point(alpha = .8, color = 'dodgerblue',size = 5) + geom_point(data=subset(df, Y >= 6.7 | X >= 4), color = 'red',size = 6) + theme(axis.text.x = element_text(size= rel(1.5),angle=90, hjust=1)) + ggtitle('Distance Pairs with outliers highlighted in red') print(plot) import pandas as pd DF_diff_xy = pd.DataFrame(diff_xy) DF_diff_xy.rename(columns = lambda x: str(x), inplace=True) DF_diff_xy.rename(columns={"0": "X"}, inplace=True) # rename a dfcolumn DF_diff_xy.rename(columns={"1": "Y"}, inplace=True) # rename a dfcolumn DF_diff_xy from ggplot import * ggplot(DF_diff_xy, aes(x = 'X', y ='Y')) + \ geom_point(alpha=1, size=100, color='dodgerblue') + \ geom_point(data = DF_diff_xy[:1],alpha=1, size = 100, color='red') plt.show()