# Number of allocations to 8, 11, 12, 12
choose(43,8)*choose(35,11)*choose(24,12)  # big number!

# Random sampling using random permutations

prompt <- 1:43;  # dummy data for illustration
x <- runif(43);
i <- order(x);
prompt[i]

# the data are in a .csv file called "Macnell-RatingsData.csv" in the directory Data
ratings <- read.csv("Data/Macnell-RatingsData.csv", as.is=T);  # reads a .csv file into a DataFrame
ratings[1:5,]
summary(ratings) # summary statistics for the data. Note the issue with "age"

# we're going to want to format the printed output. The relevant function is "sprintf". 
# Here's the documentation.
help(sprintf)

# Let's try to reproduce the MacNell et al. results
character <- setdiff(names(ratings),c("group","gender","tagender","taidgender","age"));
for (ch in character) {
    print(sprintf('%s: mean difference %f', 
                   ch,
                  mean(ratings[ratings$taidgender==0,ch]) - mean(ratings[ratings$taidgender==1,ch])), quote=F)
}

grades <- read.csv("Data/Macnell-GradeData.csv",as.is = T);  # reads a .csv file into a DataFrame
grades[1:5,]
summary(grades) # summary statistics for the data. Note the issue with "age"

# simulate the distribution of the mean difference
simPermu <- function(m1, f1, m2, f2, iter) {
    n1f <- length(f1);              # number of students assigned to instructor 1 when instructor 1
                                    # was identified as female
    n2f <- length(f2);              # number of students assigned to instructor 2 when instructor 2
                                    # was identified as female
    z1 <- c(m1, f1);                # pooled responses for instructor 1
    z2 <- c(m2, f2);                # pooled responses for instructor 1
    ts <- abs(mean(c(m1,m2)) - mean(c(f1,f2)))     # test statistic
    sum(replicate(iter, {          # replicate() repeats the 2nd argument
                         zp1 <- sample(z1);
                         zp2 <- sample(z2);
                         abs(mean(c(zp1[1:n1],zp2[1:n2])) - mean(c(zp1[-(1:n1)],zp2[-(1:n2)]))) > ts
                        }))/iter
}

# It's good practice to set the seed of the random number generator, so that your work will be
# reproducible. I'm using the date of this lecture as the seed.  
# Don't reset the seed repeatedly in your analysis! That compromises the pseudorandom behavior of the PRNG.
# R uses the Mersenne Twister PRNG, which is good enough for general statistical purposes, but not for cryptography
#
set.seed(20150630);  # set the seed so that the analysis is reproducible
iter <- 10^4;        # iterations to estimate p-value

characteristics <- setdiff(names(ratings),c("group","gender","tagender","taidgender","age"));
characteristics

male1 <- ratings[ratings$taidgender == 1 & ratings$tagender == 1,][characteristics];
female1 <- ratings[ratings$taidgender == 0 & ratings$tagender == 1,][characteristics];

male2 <- ratings[ratings$taidgender == 1 & ratings$tagender == 0,][characteristics];
female2 <- ratings[ratings$taidgender == 0 & ratings$tagender == 0,][characteristics];

for (ch in characteristics) {
    sp <- simPermu(unlist(male1[ch]), unlist(female1[ch]), unlist(male2[ch]), unlist(female2[ch]), iter);
    print(sprintf("%s: diff of means=%f, est. p=%f", 
            ch, 
            mean(c(unlist(male1[ch]),unlist(male2[ch])))- mean(c(unlist(female1[ch]),unlist(female2[ch]))), 
            sp), quote = F);
}

binoUpperCL <- function(n, x, cl = 0.975, inc=0.000001, p=x/n) {
    if (x < n) {
            f <- pbinom(x, n, p, lower.tail = TRUE);
            while (f >= 1-cl) {  # this could be sped up using Brent's method, e.g.
                p <- p + inc;
                f <- pbinom(x, n, p, lower.tail = TRUE)
            }
            p
    } else {
            1.0
    }
}

## Example: 10**4 samples give 13 successes, so the estimated p is 0.0013. 
## What's an upper 95% confidence interval for the true p?
binoUpperCL(10**4, 13, cl=0.95)

help("sd")