%load_ext watermark %watermark -d -v -u -t -z -p numpy,scikit-learn import numpy as np np.random.seed(123) # A random 2D-array ranging from 0-100 X = np.random.rand(100,2) X.dtype = np.float64 X *= 100 def numpy_minmax(X): xmin = X.min(axis=0) return (X - xmin) / (X.max(axis=0) - xmin) from sklearn import preprocessing def sci_minmax(X): minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1), copy=True) return minmax_scale.fit_transform(X) %matplotlib inline from matplotlib import pyplot as plt sci_mm = sci_minmax(X) numpy_mm = numpy_minmax(X) fig, (ax1, ax2) = plt.subplots(1,2, figsize=(12, 5)) ax1.scatter(X[:,0], X[:,1], color='r', alpha=0.5, marker='o', s=50 ) ax2.scatter(numpy_mm[:,0], numpy_mm[:,1], color='g', label='NumPy bottom-up', alpha=0.5, marker='o', s=50 ) ax2.scatter(sci_mm[:,0], sci_mm[:,1], color='b', label='scikit-learn', alpha=0.5, marker='x', s=50 ) ax1.set_title('before Min-Max scaling') ax2.set_title('Min-Max scaling (min=0, max=1)') ax1.grid() ax2.grid() ax2.legend() plt.show() def numpy_zscore(X): return (X - X.mean(axis=0)) / X.std(axis=0) def sci_zscore(X): std_scale = preprocessing.StandardScaler(copy=True) return std_scale.fit_transform(X) from matplotlib import pyplot as plt sci_z = sci_zscore(X) numpy_z = numpy_zscore(X) fig, (ax1, ax2) = plt.subplots(1,2, figsize=(12, 5)) ax1.scatter(X[:,0], X[:,1], color='r', alpha=0.5, marker='o', s=50 ) ax2.scatter(numpy_z[:,0], numpy_z[:,1], color='g', label='NumPy bottom-up', alpha=0.5, marker='o', s=50 ) ax2.scatter(sci_z[:,0], sci_z[:,1], color='b', label='scikit-learn', alpha=0.5, marker='x', s=50 ) ax1.set_title('before Z-score standardization') ax2.set_title('Z-score standardization ($\mu=0$, $\sigma=1$)') ax1.grid() ax2.grid() ax2.legend() plt.show() import timeit funcs = (numpy_minmax, sci_minmax, sci_zscore, numpy_zscore) timings = {f.__name__:[] for f in funcs} orders = [10**i for i in range(1, 5)] for n in orders: print('n=%s (%s of %s)' %(n, orders.index(n)+1, len(orders))) X = np.random.rand(n,n) X.dtype = np.float64 X *= 100 for f in timings.keys(): timings[f].append(min(timeit.Timer('%s(X)' %f, 'from __main__ import %s, X' %f).repeat(repeat=5, number=1))) print('finished') %matplotlib inline size = np.asarray(orders)**2 from matplotlib import pyplot as plt def plot(): def settings(): plt.xlim([min(size) / 10, max(size)* 10]) plt.grid() plt.xticks(fontsize=16) plt.yticks(fontsize=16) plt.xscale('log') plt.yscale('log') plt.legend(loc='upper left', fontsize=14) plt.xlabel('number of matrix elements (log-scale)', fontsize=16) plt.ylabel('time in seconds (log-scale)', fontsize=16) fig = plt.figure(figsize=(14,6)) plt.subplot(1,2,1) plt.plot(size, timings['numpy_minmax'], label='NumPy') plt.plot(size, timings['sci_minmax'], label='scikit-learn') plt.title('Min-Max scaling (min=0, max=1)', fontsize=22) settings() plt.subplot(1,2,2) plt.plot(size, timings['numpy_zscore'], label='NumPy') plt.plot(size, timings['sci_zscore'], label='scikit-learn') plt.title('Z-score scaling ($\mu=0$, $\sigma=1$)', fontsize=22) settings() plt.tight_layout() plt.show() plot() %watermark -d -v -m -p numpy,scikit-learn