# Number of allocations to 8, 11, 12, 12 choose(43,8)*choose(35,11)*choose(24,12) # big number! # Random sampling using random permutations prompt <- 1:43; # dummy data for illustration x <- runif(43); i <- order(x); prompt[i] # the data are in a .csv file called "Macnell-RatingsData.csv" in the directory Data ratings <- read.csv("Data/Macnell-RatingsData.csv", as.is=T); # reads a .csv file into a DataFrame ratings[1:5,] summary(ratings) # summary statistics for the data. Note the issue with "age" # we're going to want to format the printed output. The relevant function is "sprintf". # Here's the documentation. help(sprintf) # Let's try to reproduce the MacNell et al. results character <- setdiff(names(ratings),c("group","gender","tagender","taidgender","age")); for (ch in character) { print(sprintf('%s: mean difference %f', ch, mean(ratings[ratings$taidgender==0,ch]) - mean(ratings[ratings$taidgender==1,ch])), quote=F) } grades <- read.csv("Data/Macnell-GradeData.csv",as.is = T); # reads a .csv file into a DataFrame grades[1:5,] summary(grades) # summary statistics for the data. Note the issue with "age" # simulate the distribution of the mean difference simPermu <- function(m1, f1, m2, f2, iter) { n1f <- length(f1); # number of students assigned to instructor 1 when instructor 1 # was identified as female n2f <- length(f2); # number of students assigned to instructor 2 when instructor 2 # was identified as female z1 <- c(m1, f1); # pooled responses for instructor 1 z2 <- c(m2, f2); # pooled responses for instructor 1 ts <- abs(mean(c(m1,m2)) - mean(c(f1,f2))) # test statistic sum(replicate(iter, { # replicate() repeats the 2nd argument zp1 <- sample(z1); zp2 <- sample(z2); abs(mean(c(zp1[1:n1],zp2[1:n2])) - mean(c(zp1[-(1:n1)],zp2[-(1:n2)]))) > ts }))/iter } # It's good practice to set the seed of the random number generator, so that your work will be # reproducible. I'm using the date of this lecture as the seed. # Don't reset the seed repeatedly in your analysis! That compromises the pseudorandom behavior of the PRNG. # R uses the Mersenne Twister PRNG, which is good enough for general statistical purposes, but not for cryptography # set.seed(20150630); # set the seed so that the analysis is reproducible iter <- 10^4; # iterations to estimate p-value characteristics <- setdiff(names(ratings),c("group","gender","tagender","taidgender","age")); characteristics male1 <- ratings[ratings$taidgender == 1 & ratings$tagender == 1,][characteristics]; female1 <- ratings[ratings$taidgender == 0 & ratings$tagender == 1,][characteristics]; male2 <- ratings[ratings$taidgender == 1 & ratings$tagender == 0,][characteristics]; female2 <- ratings[ratings$taidgender == 0 & ratings$tagender == 0,][characteristics]; for (ch in characteristics) { sp <- simPermu(unlist(male1[ch]), unlist(female1[ch]), unlist(male2[ch]), unlist(female2[ch]), iter); print(sprintf("%s: diff of means=%f, est. p=%f", ch, mean(c(unlist(male1[ch]),unlist(male2[ch])))- mean(c(unlist(female1[ch]),unlist(female2[ch]))), sp), quote = F); } binoUpperCL <- function(n, x, cl = 0.975, inc=0.000001, p=x/n) { if (x < n) { f <- pbinom(x, n, p, lower.tail = TRUE); while (f >= 1-cl) { # this could be sped up using Brent's method, e.g. p <- p + inc; f <- pbinom(x, n, p, lower.tail = TRUE) } p } else { 1.0 } } ## Example: 10**4 samples give 13 successes, so the estimated p is 0.0013. ## What's an upper 95% confidence interval for the true p? binoUpperCL(10**4, 13, cl=0.95) help("sd")