from scipy.stats import norm, laplace sigma2=1 mu=0 b=sqrt(0.5) Xs=linspace(-2, 2, 500) plot(Xs, norm.pdf(Xs, loc=mu, scale=sigma2)) plot(Xs, laplace.pdf(Xs, loc=mu, scale=b)) _=legend([ 'Gaussian','Laplace']) n=220 X=norm.rvs(size=n, loc=mu, scale=sigma2) Y=laplace.rvs(size=n, loc=mu, scale=b) subplot(1,2,1) hist(X) xlim([-5,5]) ylim([0,100]) title('Gaussian') subplot(1,2,2) title('Laplace') hist(Y) xlim([-5,5]) _=ylim([0,100]) print "Gaussian sample mean", mean(X) print "Laplacian sample mean", mean(Y) print "Gaussian sample variance", var(X) print "Laplacian sample variance", var(Y) from shogun.Features import RealFeatures from shogun.Kernel import GaussianKernel, CustomKernel from shogun.Statistics import QuadraticTimeMMD from shogun.Statistics import BOOTSTRAP, UNBIASED feat_p=RealFeatures(reshape(X, (1,len(X)))) feat_q=RealFeatures(reshape(Y, (1,len(Y)))) kernel_width=1 kernel=GaussianKernel(10, kernel_width) mmd=QuadraticTimeMMD(kernel, feat_p, feat_q) # precompute kernel to be faster for null sampling p_and_q=mmd.get_p_and_q() kernel.init(p_and_q, p_and_q); precomputed_kernel=CustomKernel(kernel); mmd.set_kernel(precomputed_kernel); alpha=0.05 mmd.set_null_approximation_method(BOOTSTRAP); mmd.set_bootstrap_iterations(250); p_value_boot=mmd.perform_test(); print p_value_boot, alpha num_samples=250 # sample null distribution null_samples=mmd.bootstrap_null(num_samples) # sample alternative distribution, generate new data for that alt_samples=zeros(num_samples) for i in range(num_samples): X=norm.rvs(size=n, loc=mu, scale=sigma2) Y=laplace.rvs(size=n, loc=mu, scale=b) feat_p=RealFeatures(reshape(X, (1,len(X)))) feat_q=RealFeatures(reshape(Y, (1,len(Y)))) mmd=QuadraticTimeMMD(kernel, feat_p, feat_q) alt_samples[i]=mmd.compute_statistic() subplot(1,2,1) hist(null_samples, color='blue') title('Null distribution') subplot(1,2,2) title('Alternative distribution') hist(alt_samples, color='green') figure() hist(null_samples, color='blue') hist(alt_samples, color='green', alpha=0.5) title('Null and alternative distriution') # find (1-alpha) element of null distribution null_samples=sort(null_samples) quantile=null_samples[round(num_samples*(1-alpha))] axvline(x=quantile, ymin=0, ymax=100, color='red', label=str(int(round((1-alpha)*100))) + '% quantile') _=legend() from shogun.Kernel import CombinedKernel from shogun.Features import GaussianBlobsDataGenerator from shogun.Statistics import LinearTimeMMD from shogun.Statistics import MMDKernelSelectionOpt from shogun.Statistics import MMD1_GAUSSIAN m=20000 distance=10 stretch=5 num_blobs=3 angle=pi/4 # these are streaming features gen_p=GaussianBlobsDataGenerator(num_blobs, distance, 1, 0) gen_q=GaussianBlobsDataGenerator(num_blobs, distance, stretch, angle) # stream some data and plot num_plot=1000 features=gen_p.get_streamed_features(num_plot) features=features.create_merged_copy(gen_q.get_streamed_features(num_plot)) data=features.get_feature_matrix() figure(figsize=(8,5)) subplot(2,2,1) grid(True) plot(data[0][0:num_plot], data[1][0:num_plot], 'r.', label='$x$') title('$X\sim p$') subplot(2,2,2) grid(True) plot(data[0][num_plot+1:2*num_plot], data[1][num_plot+1:2*num_plot], 'b.', label='$x$', alpha=0.5) _=title('$Y\sim q$') sigmas=[2**x for x in linspace(-5,5, 10)] print "choosing kernel width from", ["{0:.2f}".format(sigma) for sigma in sigmas] combined=CombinedKernel() for i in range(len(sigmas)): combined.append_kernel(GaussianKernel(10, sigmas[i])) # mmd instance using streaming features, blocksize of 10000 block_size=1000 mmd=LinearTimeMMD(combined, gen_p, gen_q, m, block_size) # optmal kernel choice is possible for linear time MMD selection=MMDKernelSelectionOpt(mmd) # select best kernel kernel=selection.select_kernel() kernel=GaussianKernel.obtain_from_generic(kernel) print "best single kernel", kernel.get_width() alpha=0.05 mmd.set_null_approximation_method(MMD1_GAUSSIAN); p_value_boot=mmd.perform_test(); print p_value_boot, alpha mmd=LinearTimeMMD(combined, gen_p, gen_q, 5000, block_size) num_samples=100 # sample null and alternative distribution, implicitly generate new data for that null_samples=zeros(num_samples) alt_samples=zeros(num_samples) for i in range(num_samples): alt_samples[i]=mmd.compute_statistic() # tell MMD to merge data internally while streaming mmd.set_simulate_h0(True) null_samples[i]=mmd.compute_statistic() mmd.set_simulate_h0(False) subplot(1,2,1) hist(null_samples, color='blue') title('Null distribution') subplot(1,2,2) title('Alternative distribution') hist(alt_samples, color='green') figure() hist(null_samples, color='blue') hist(alt_samples, color='green', alpha=0.5) title('Null and alternative distriution') # find (1-alpha) element of null distribution null_samples=sort(null_samples) quantile=null_samples[round(num_samples*(1-alpha))] axvline(x=quantile, ymin=0, ymax=100, color='red', label=str(int(round((1-alpha)*100))) + '% quantile') _=legend()