#!/usr/bin/env python # coding: utf-8 # # Practical 7.2: Kolmogorov-Smirnov Test # # * Use the triolbite data above and perform a two-sided Kolmogorov-Smirnov test (two samples) on whether they come from the same distribution, or not. # * Test whether each trilobite data comes from an exponential distribution or not. (You will need to generate an exponential distribution for this question, and normalise all the distributions first) # # **NB** : The necessary function CAN be found in scipy.stats [here](http://docs.scipy.org/doc/scipy-0.15.1/reference/generated/scipy.stats.ks_2samp.html) # # ## BONUS (Harder) # # * Plot the cumulative distribution functions (CDF) for the trilobite data in one figure as linegraphs. # * Plot an exponential CDF on top of this as well (with a dashed line) # * Was there ever any suspicion the data might be exponential? # # **HINT** numpy has a function 'cumsum' which should make this easier. # In[3]: get_ipython().run_line_magic('matplotlib', 'inline') import numpy as np from scipy import stats import matplotlib.pyplot as plt # Read in the records record1 = np.recfromtxt("./triloshape1.csv") record2 = np.recfromtxt("./triloshape2.csv") triloshape1 = np.array(record1, dtype=float) triloshape2 = np.array(record2, dtype=float) D, p = stats.ks_2samp(triloshape1, triloshape2) print "p-value = {}".format(p) if p <= 0.05: print "Reject", else: print "Accept", print "the hypothesis that the samples comes from the same distribution." plt.figure() # Generate a histogram of the data and overlay the normalised exponential # distribution n, bins, patches = plt.hist(triloshape1, 10, normed=True) plt.plot(bins,stats.expon.pdf(bins),color = 'k', lw = 2., linestyle='--', label = 'Exponential Distribution') plt.xlim(0,2) plt.xlabel("Samples") plt.ylabel("Mean length:width ratio") plt.legend(loc = 'best') plt.show() D1, p1 = stats.ks_2samp(triloshape1, stats.expon.pdf(bins)) # As before we compare triloshape1 with another dataset. This time # it is with an exponential distribution. print "H0: Trilobyte dataset 1 is an exponential distribution" print "p-value = {}".format(p1) if p1 <= 0.05: print "Reject H0 the samples do not", else: print "Accept H0 the samples do", print "come from an exponential distribution." # Repeat as just done, for the second set of data plt.figure() n, bins, patches = plt.hist(triloshape2, 10, normed=True) plt.plot(bins,stats.expon.pdf(bins),color = 'k', lw = 2., linestyle='--', label = 'Exponential Distribution') plt.xlim(0,2) plt.xlabel("Samples") plt.ylabel("Mean length:width ratio") plt.legend(loc = 'best') plt.show() D2, p2 = stats.ks_2samp(triloshape2, stats.expon.pdf(bins)) print "H0: Trilobyte dataset 2 is an exponential distribution" print "p-value = {}".format(p2) if p2 <= 0.05: print "Reject H0 the samples do not", else: print "Accept H0 the samples do", print "come from an exponential distribution." """ BONUS """ plt.figure() n1, bins1, patches1 = plt.hist(triloshape2, 10, normed=True, cumulative=True, histtype = 'step', label = 'Trilobyte CDF 2') n2, bins2, patches2 = plt.hist(triloshape1, 10, normed=True, cumulative=True, histtype = 'step', label = 'Trilobyte CDF 1') plt.plot(bins2, stats.expon.cdf(bins2), color='r', lw=2., linestyle='--', label = 'Exponential Distribution') plt.xlabel("Samples") plt.ylabel("Mean length:width ratio") plt.legend(loc = 'best') plt.show() # There was not really any reason to suspect exponetiality however this is a useful exercise # on manipulating distributions other than normal.