%pylab inline

# Read in data
import numpy as np

# Read in the records.
record1 = np.recfromcsv("../data/triloshape1.csv") 
record2 = np.recfromcsv("../data/triloshape2.csv") 

print record1.dtype.names
print record2.dtype.names

# Read in the records - take II.
record1 = np.recfromtxt("../data/triloshape1.csv") 
record2 = np.recfromtxt("../data/triloshape2.csv") 

triloshape1 = np.array(record1, dtype=float)
triloshape2 = np.array(record2, dtype=float)

# Plot distribution - triloshape1
n, bins, patches = pylab.hist(triloshape1, 10, normed=1)

# Add a 'best fit' line
sigma = np.std(triloshape1)
mu = np.mean(triloshape1)

y = pylab.normpdf(bins, mu, sigma)
l = pylab.plot(bins, y, 'r--', linewidth=1)

pylab.xlabel("Samples")
pylab.ylabel("Mean length:width ratio")

pylab.show()


# Plot distribution - triloshape2
n, bins, patches = pylab.hist(triloshape2, 10, normed=1)

# Add a 'best fit' line
sigma = np.std(triloshape2)
mu = np.mean(triloshape2)

y = pylab.normpdf(bins, mu, sigma)
l = pylab.plot(bins, y, 'r--', linewidth=1)

pylab.xlabel("Samples")
pylab.ylabel("Mean length:width ratio")

pylab.show()

# The plots don't look convincing. However, it is always better to use a quantative test.
# Use the D’Agostino & Pearson test to test the null hypothesis that a samples come from
# a normal distribution.
from scipy import stats

k2_1, p_1 = stats.normaltest(triloshape1)
k2_2, p_2 = stats.normaltest(triloshape2)

print "p-values = ", p_1, p_2
if p_1 <= 0.05 or p_2 <= 0.05:
    print "Reject",
else:
    print "Accept",
print "the hypothesis that the sample comes from a normal distribution."

t, p = stats.ttest_ind(triloshape1, triloshape2)

print "T-test - (t, p) = %g, %g"%(t, p)
if p<=0.05:
    print "Reject",
else:
    print "Accept"
print "the hypothesis that the populations are the same."


# Read in data
import numpy as np

# Read in the records.
record1 = np.recfromcsv("../data/micapercent1.csv") 
record2 = np.recfromcsv("../data/micapercent2.csv") 

print record1.dtype.names
print record2.dtype.names

# Read in the records - take II.
record1 = np.recfromtxt("../data/micapercent1.csv") 
record2 = np.recfromtxt("../data/micapercent2.csv") 

micapercent1 = np.array(record1, dtype=float)
micapercent2 = np.array(record2, dtype=float)

# Plot distribution - micapercent1
n, bins, patches = pylab.hist(micapercent1, 10, normed=1)

# Add a 'best fit' line
sigma = np.std(micapercent1)
mu = np.mean(micapercent1)

y = pylab.normpdf(bins, mu, sigma)
l = pylab.plot(bins, y, 'r--', linewidth=1)

pylab.xlabel("Samples")
pylab.ylabel("Mean length:width ratio")

pylab.show()

# Plot distribution - micapercent2
n, bins, patches = pylab.hist(micapercent2, 10, normed=1)

# Add a 'best fit' line
sigma = np.std(micapercent2)
mu = np.mean(micapercent2)

y = pylab.normpdf(bins, mu, sigma)
l = pylab.plot(bins, y, 'r--', linewidth=1)

pylab.xlabel("Samples")
pylab.ylabel("Mean length:width ratio")

pylab.show()

# Looking at the first of these two plots it would appear unlikely that the sample comes
# from a normal distribution. However, it is always better to use a quantative test.
# Use the D’Agostino & Pearson test to test the null hypothesis that a samples come from
# a normal distribution.
from scipy import stats

k2_1, p_1 = stats.normaltest(micapercent1)
k2_2, p_2 = stats.normaltest(micapercent2)

print "p-values = ", p_1, p_2
if p_1 <= 0.05 or p_2 <= 0.05:
    print "Reject",
else:
    print "Accept",
print "the hypothesis that the sample comes from a normal distribution."

u, p = stats.mannwhitneyu(micapercent1, micapercent2)

# This is a one-tailed test. We want to consider both tails therefore we multiply the p value by 2.
p = p*2

print "Mann-Whitney rank test two-tailed p-value = ", p, " and u value = ", u
if p<=0.05:
    print "Reject",
else:
    print "Accept"
print "the hypothesis that the populations are the same."