### QUESTION 4
# (Grad students only) Suppose k=4, n1=n2=n3=1 and n4=2. Obtain the form of the exact null (H0) distribution of H for the case of no tied data.

## Formulas needed for exact null distribution of H:
## R (overall mean rank) = sum of ranks/n
## Ri (individual mean rank)
## V = 2∑(Ri-R)^2 
## H Statistic = 12V/n(n+1)

n=5
r <- c(1,2,3,4,5)
r_mu <- sum(r)/length(r)
h.stat <- 12/(n*(n+1))

# 10 unique sequences: 
# D1, D2 ranks = (1, 2) (1,3) (1,4) (1,5) (2,3) (2,4) (2,5) (3,4) (3,5) (4,5)
# sum of D ranks = (3)   (4)   (5)   (6)   (5)   (6)   (7)   (7)   (8)   (9)
# A-C ranks = (3,4,5) (2,4,5) (2,3,5) (2,3,4) (1,4,5) (1,3,5) (1,3,4) (1,2,5) (1,2,4) (1,2,3)
# Switching A-C rank placements will have no impact on the H statistsic because V will not change (mean ranks of A-C will just be the value of that rank)

dat4<-data.frame(A=c(3, 2, 2, 2, 1, 1, 1, 1, 1, 1),
                 B=c(4, 4, 3, 3, 4, 3, 3, 2, 2, 2),
                 C=c(5, 5, 5, 4, 5, 5, 4, 5, 4, 3),
                 D1.D2=c(3, 4, 5, 6, 5, 6, 7, 7, 8, 9))

# Mean rank for D1 and D2
dat4$D <- (dat4$D1.D2/2)
# Calculate (Ri-R)^2 for A-D
dat4$Va <- ((dat4$A - r_mu)^2) 
dat4$Vb <- ((dat4$B - r_mu)^2) 
dat4$Vc <- ((dat4$C - r_mu)^2) 
dat4$Vd <- ((dat4$D - r_mu)^2) 
# Calculate V
dat4$V.stat <- 2*(dat4$Va + dat4$Vb + dat4$Vc +dat4$Vd)
# Calculate H
dat4$h.stat <- dat4$V.stat*h.stat

h.stat.list <- dat4[c(1:10), c(11)]
h.stat.list
##  [1] 5.8 5.6 4.2 1.6 7.4 6.4 4.2 7.4 5.6 5.8
# Probability function
dat4.function <- data.frame(h=c(1.6, 4.2, 5.6, 5.8, 6.4, 7.4), p.of.h=c(1/10, 2/10, 2/10, 2/10, 1/10, 2/10))

## Exact Null Distribution
## P(H=1.6) = P(H=6.4) = 1/10
## P(H=4.2) = P(H=5.6) = P(H=5.8) = P(H=7.4)=2/10
## P(H)=0 for all other x




### QUESTION 5
# (Grad students only) The Bonferroni correction is known to be conservative. This means that we are guaranteed to control family-wise error rate at less than or equal to α, but often the true family-wise error rate is much less than this. Perform a simulation study to compute the true family-wise error rate when performing multiple comparisons under the null hypothesis that the means of all groups are the same. Perform the simulation study using the following parameters:

# α = 0.2
# k=6
# nj=20 for j=1,2,3,4,5,6,
# Data in each group is generated from a normal distribution with variance 1
# Use the Wilcoxon rank-sum test for multiple comparisons. table(detected)


set.seed(1234)
dat5<-data.frame(score=c(rnorm(20,0,1),rnorm(20,0,1),rnorm(20,0,1),rnorm(20,0,1),rnorm(20,0,1),rnorm(20,0,1))
                 ,group=factor(c(rep(1,20),rep(2,20),rep(3,20),rep(4,20),rep(5,20),rep(6,20))))



# Testing comparisons with/without adjustment
k <- 5
alpha <- 0.2
alpha/(k*(k-1)/2)
## [1] 0.02
# α'=0.02
pairwise.wilcox.test(dat5$score,dat5$group)
## 
##  Pairwise comparisons using Wilcoxon rank sum test 
## 
## data:  dat5$score and dat5$group 
## 
##   1    2    3    4    5   
## 2 1.00 -    -    -    -   
## 3 1.00 1.00 -    -    -   
## 4 0.82 0.10 0.16 -    -   
## 5 0.86 0.10 0.29 1.00 -   
## 6 1.00 0.58 0.58 1.00 1.00
## 
## P value adjustment method: holm
pairwise.wilcox.test(dat5$score,dat5$group, p.adj="bonf")
## 
##  Pairwise comparisons using Wilcoxon rank sum test 
## 
## data:  dat5$score and dat5$group 
## 
##   1    2    3    4    5   
## 2 1.00 -    -    -    -   
## 3 1.00 1.00 -    -    -   
## 4 1.00 0.10 0.18 -    -   
## 5 1.00 0.11 0.37 1.00 -   
## 6 1.00 0.79 0.79 1.00 1.00
## 
## P value adjustment method: bonferroni
# Error Rates:
test <- function(n, k, mu) {
  stats <- rnorm(n, mean = 0)
  stats[1:k] <- stats[1:k] + mu
  rejections <- which(stats > qnorm(.8)) 
  # family-wise error rate
  fwer <- any(rejections > k)
  # Type 1 error
  typ.1 <- sum(rejections > k)/max(1,length(rejections))
  return(c(fwer, typ.1))
}

dat.sim <- replicate(1000, test(20, 6, 0))
row.names(dat.sim) <- c("Family-wise Error Rate", "Type 1 Error Proportion")
rowMeans(dat.sim)
##  Family-wise Error Rate Type 1 Error Proportion 
##               0.9600000               0.6907991
# 0.9600000