NB : The necessary function CAN be found in scipy.stats here
HINT numpy has a function 'cumsum' which should make this easier.
%matplotlib inline
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
# Read in the records
record1 = np.recfromtxt("./triloshape1.csv")
record2 = np.recfromtxt("./triloshape2.csv")
triloshape1 = np.array(record1, dtype=float)
triloshape2 = np.array(record2, dtype=float)
D, p = stats.ks_2samp(triloshape1, triloshape2)
print "p-value = {}".format(p)
if p <= 0.05:
print "Reject",
else:
print "Accept",
print "the hypothesis that the samples comes from the same distribution."
plt.figure()
# Generate a histogram of the data and overlay the normalised exponential
# distribution
n, bins, patches = plt.hist(triloshape1, 10, normed=True)
plt.plot(bins,stats.expon.pdf(bins),color = 'k', lw = 2.,
linestyle='--', label = 'Exponential Distribution')
plt.xlim(0,2)
plt.xlabel("Samples")
plt.ylabel("Mean length:width ratio")
plt.legend(loc = 'best')
plt.show()
D1, p1 = stats.ks_2samp(triloshape1, stats.expon.pdf(bins))
# As before we compare triloshape1 with another dataset. This time
# it is with an exponential distribution.
print "H0: Trilobyte dataset 1 is an exponential distribution"
print "p-value = {}".format(p1)
if p1 <= 0.05:
print "Reject H0 the samples do not",
else:
print "Accept H0 the samples do",
print "come from an exponential distribution."
# Repeat as just done, for the second set of data
plt.figure()
n, bins, patches = plt.hist(triloshape2, 10, normed=True)
plt.plot(bins,stats.expon.pdf(bins),color = 'k', lw = 2.,
linestyle='--', label = 'Exponential Distribution')
plt.xlim(0,2)
plt.xlabel("Samples")
plt.ylabel("Mean length:width ratio")
plt.legend(loc = 'best')
plt.show()
D2, p2 = stats.ks_2samp(triloshape2, stats.expon.pdf(bins))
print "H0: Trilobyte dataset 2 is an exponential distribution"
print "p-value = {}".format(p2)
if p2 <= 0.05:
print "Reject H0 the samples do not",
else:
print "Accept H0 the samples do",
print "come from an exponential distribution."
""" BONUS """
plt.figure()
n1, bins1, patches1 = plt.hist(triloshape2, 10, normed=True,
cumulative=True, histtype = 'step', label = 'Trilobyte CDF 2')
n2, bins2, patches2 = plt.hist(triloshape1, 10, normed=True,
cumulative=True, histtype = 'step', label = 'Trilobyte CDF 1')
plt.plot(bins2, stats.expon.cdf(bins2), color='r', lw=2.,
linestyle='--', label = 'Exponential Distribution')
plt.xlabel("Samples")
plt.ylabel("Mean length:width ratio")
plt.legend(loc = 'best')
plt.show()
# There was not really any reason to suspect exponetiality however this is a useful exercise
# on manipulating distributions other than normal.
p-value = 0.153861546495 Accept the hypothesis that the samples comes from the same distribution.
H0: Trilobyte dataset 1 is an exponential distribution p-value = 2.91067367394e-06 Reject H0 the samples do not come from an exponential distribution.
H0: Trilobyte dataset 2 is an exponential distribution p-value = 6.18818081532e-10 Reject H0 the samples do not come from an exponential distribution.