Simulation

Why simulate?

  • We know and control the truth
  • We repeatedly draw samples from a known distribution (not necessarily normal)

    ** This help us to do some statistical analysis on an artificial population with known, but random, properties.

Sampling in R

Fair coin

N <- 10
headProb<-0.5
tailProb<-0.5

set.seed(6)
coinSample1<-sample(c("Head","Tail"), size = N
                    #  replace=TRUE to over ride the default sample without replacement
                    ,replace = TRUE
                    # prob= to sample elements with different probabilities
                    ,prob=c(headProb,tailProb)) 
coinSample1
##  [1] "Head" "Head" "Tail" "Tail" "Head" "Head" "Head" "Head" "Head" "Tail"
prop.table(table(coinSample1))
## coinSample1
## Head Tail 
##  0.7  0.3

Let’s repeat this for 20 times.

N<-10
headProb<-0.5
tailProb<-0.5
nReplications<-20

resultHold<-NULL
for(which.replication in 1:nReplications){
coinSample1<-sample(c("Head","Tail")
                   ,size = N
                   ,replace = TRUE
                   ,prob=c(headProb,tailProb)) 

proportions<-prop.table(table(coinSample1))

resultHold<-rbind(resultHold, proportions)

}

resultHold<-as.data.frame(resultHold)
row.names(resultHold)<-NULL
resultHold
##    Head Tail
## 1   0.6  0.4
## 2   0.4  0.6
## 3   0.6  0.4
## 4   0.7  0.3
## 5   0.4  0.6
## 6   0.6  0.4
## 7   0.4  0.6
## 8   0.4  0.6
## 9   0.4  0.6
## 10  0.6  0.4
## 11  0.4  0.6
## 12  0.6  0.4
## 13  0.3  0.7
## 14  0.2  0.8
## 15  0.6  0.4
## 16  0.6  0.4
## 17  0.4  0.6
## 18  0.7  0.3
## 19  0.3  0.7
## 20  0.3  0.7

Your turn - Exercise 1

Change N to

  • 100
  • 1000
  • 10000
  • even more?

Your turn - Exercise 2

Select an N and make the coin not fair by changing the probabilities (headProb and tailProb)

  • 0.4 and 0.6
  • 0.25 and .75
  • Your own choice

Chenge N

N<-seq(from=5, to=100,by=5)
headProb<-0.5
tailProb<-0.5
nReplications<-20

resultHoldChangeN<-NULL
for(which.N in 1:length(N)){
  #which.N <-1
resultHold<-NULL
  for(which.replication in 1:nReplications){
    #which.replication=1
  coinSample1<-sample(c("Head","Tail")
                     ,size = N[which.N]
                     ,replace = TRUE
                     ,prob=c(headProb,tailProb)) 
  
  proportions<-prop.table(table(coinSample1))
  
  resultHold<-rbind(resultHold, proportions)
  }
resultHold<-as.data.frame(resultHold)
row.names(resultHold)<-NULL
resultHoldChangeN<-rbind(resultHoldChangeN
                         ,cbind(N[which.N],mean(resultHold[["Head"]])))
}

resultHoldChangeN<-as.data.frame(resultHoldChangeN)
names(resultHoldChangeN)<-c("N","Head Ave. Prob.")
resultHoldChangeN
##      N Head Ave. Prob.
## 1    5       0.5400000
## 2   10       0.5550000
## 3   15       0.4566667
## 4   20       0.4850000
## 5   25       0.4660000
## 6   30       0.5350000
## 7   35       0.5185714
## 8   40       0.5025000
## 9   45       0.5111111
## 10  50       0.5100000
## 11  55       0.4763636
## 12  60       0.4958333
## 13  65       0.5100000
## 14  70       0.5092857
## 15  75       0.4973333
## 16  80       0.4906250
## 17  85       0.5088235
## 18  90       0.4866667
## 19  95       0.5121053
## 20 100       0.5095000

Power

  • Power is defined as the probability to reject a false null hypothesis.
  • In plain English, power is the likelihood that a study will detect an effect when there is an effect there to be detected.

Independent Sample t-test

nReplications<-100
N=10

meanControl=0.10
meanTreatment=0.10

pHold<-NULL
for(which.rep in 1:nReplications){
  
  treatmentGroup<-rnorm(N, mean=meanTreatment,sd=1)
  controlGroup  <-rnorm(N, mean=meanControl,sd=1)
  dataToSave<-data.frame(group=c(rep("Treatment",N)
                                ,rep("Control",N))
                         ,score=c(treatmentGroup,controlGroup))
  #write.csv(dataToSave,file=paste("data ",which.rep,".csv",sep=""),row.names=FALSE)
  
  tResult<-t.test(treatmentGroup, controlGroup,alternative = c("two.sided"),paired=FALSE)
  pHold<-c(pHold,tResult$p.value)
}

sum(pHold<0.05)/nReplications
## [1] 0.09
#round(pHold,digits=3)

Your turn - Exercise 3

Try new means for control and treatment

  • small effect
  • medium effect
  • large effect
  • no effect

Dependent Sample t-test

  • Can we use the same code for the dependent sample t-test?
  • Why?
# Correlation
cor(treatmentGroup,controlGroup)
## [1] -0.06321852
library("MASS")

corBetweenPrePost<-0.3
Covariance<- matrix(c(1,corBetweenPrePost,
                     corBetweenPrePost,1)
                    ,nrow=2
                    ,ncol=2)
# cov2cor(Covariance)
preMean<-0
postMean<-0
N=10

dataDependent<-as.data.frame(mvrnorm(n = N, mu=c(preMean,postMean), Sigma=Covariance))
names(dataDependent)<-c("pre","post")
head(dataDependent)
##          pre        post
## 1 -0.1642676  1.05107767
## 2 -1.2177751 -0.06242311
## 3 -1.0041020 -0.51739361
## 4 -1.5245561 -1.32871782
## 5  1.2195065  1.12262317
## 6  0.3473573  0.85328703
cor(dataDependent)
##            pre      post
## pre  1.0000000 0.2267753
## post 0.2267753 1.0000000
colMeans(dataDependent)
##         pre        post 
##  0.05773755 -0.20239969
t.test(dataDependent$pre
            ,dataDependent$post
            ,alternative = c("two.sided")
            ,paired=TRUE)
## 
##  Paired t-test
## 
## data:  dataDependent$pre and dataDependent$post
## t = 0.57871, df = 9, p-value = 0.577
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.7567235  1.2769980
## sample estimates:
## mean of the differences 
##               0.2601372

Let’s repeat it

corBetweenPrePost<-0.45
Covariance<- matrix(c(1,corBetweenPrePost,
                     corBetweenPrePost,1)
                    ,nrow=2
                    ,ncol=2)
# cov2cor(Covariance)
preMean<-0
postMean<-0.2
N=10
nReplications<-20
pHold<-NULL
for(which.rep in 1:nReplications){
  
    dataDependent<-as.data.frame(mvrnorm(n = N, mu=c(preMean,postMean), Sigma=Covariance))
    names(dataDependent)<-c("pre","post")
  #write.csv(dataToSave,file=paste("data ",which.rep,".csv",sep=""),row.names=FALSE)
  tResult<-t.test(dataDependent$pre
            ,dataDependent$post
            ,alternative = c("two.sided")
            ,paired=TRUE)

  
  pHold<-c(pHold,tResult$p.value)

}

sum(pHold<0.05)/nReplications
## [1] 0.1

Your turn - Exercise 4

Try some new correlations and means for pre and post

  • low correlation and small mean difference
  • high correlation and small mean difference
  • high correlation and meddium mean difference
  • …..

Beyond t-test

# https://gist.github.com/gjkerns/1608265 (Code modified)
# Simulation study for sample size 1 between/ 2within
# Counseled Status is a between-subjects variable, with 4 levels (Both Uncounseled, Both Counseled, Wife Only Counseled, Husband Only Counseled
# Target is a within-subjects variable, with 2 levels (Wife and Husband)
# Dispute System is also within-subjects, with 2 levels (Trial and Mediation)

library(MASS)
library(tidyverse)
## -- Attaching packages -------------------------------------- tidyverse 1.2.1 --
## v ggplot2 2.2.1     v purrr   0.2.4
## v tibble  1.4.1     v dplyr   0.7.4
## v tidyr   0.7.2     v stringr 1.2.0
## v readr   1.1.1     v forcats 0.2.0
## -- Conflicts ----------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## x dplyr::select() masks MASS::select()
nPerGroup <- 50
nBetween<-4


nReplications  <- 100

# set up the indep var data
target <- factor(1:2, labels = c("Wife", "Husband"))
disputeSystem <- factor(1:2, labels = c("Trial","Mediation"))
Subject <- factor(1:(nPerGroup*nBetween))


theData <- expand.grid(target,disputeSystem, Subject)
names(theData) <- c("target","disputeSystem", "Subject")

between <- rep(c("bothUnCou", "husbandOnly","wifeOnly","bothCou"), each = nPerGroup * nBetween)
theData$betweenVar <- factor(between)


#order of the means
expand.grid(target,disputeSystem)
##      Var1      Var2
## 1    Wife     Trial
## 2 Husband     Trial
## 3    Wife Mediation
## 4 Husband Mediation
# Set the 
muBothUnCou   <- c(2, 2.0, 2.0, 2.0)
muHusbandOnly <- c(2, 2.0, 2.0, 2.0)
muWifeOnly    <- c(2, 2.0, 2.0, 2.0)
muBothCou     <- c(2, 2.0, 2.0, 2.0)

# to set up variance-covariance matrix
#stdevs    <- c(0.6, 0.8, 1.0, 1.2)
#stdiff    <- 1
#ones <- rep(1, nBetween)
#A <- stdevs^2 %o% ones
#B <- (A + t(A) + (stdiff^2)*(diag(nBetween) - ones %o% ones))/2

B <-matrix(c(1,0.3,0.3,0.3,
             0.3,1.0,0.3,0.3,
             0.3,0.3,1.0,0.3,
             0.3,0.3,0.3,1.0),nrow=4,ncol=4)

# can run it through once to check that it works
out1 <- mvrnorm(nPerGroup, mu = muBothUnCou, Sigma = B)
out2 <- mvrnorm(nPerGroup, mu = muHusbandOnly, Sigma = B)
out3 <- mvrnorm(nPerGroup, mu = muWifeOnly, Sigma = B)
out4 <- mvrnorm(nPerGroup, mu = muBothCou, Sigma = B)

theData$OutCome <- c(as.vector(t(out1)), as.vector(t(out2)), as.vector(t(out3)), as.vector(t(out4)))
#theData


aovComp <- aov(OutCome ~ target*disputeSystem*betweenVar + Error(Subject/(target*disputeSystem)), theData)
summary(aovComp)
## 
## Error: Subject
##             Df Sum Sq Mean Sq F value Pr(>F)
## betweenVar   3    3.9   1.284   0.651  0.583
## Residuals  196  386.6   1.972               
## 
## Error: Subject:target
##                    Df Sum Sq Mean Sq F value Pr(>F)
## target              1   0.15  0.1512   0.220  0.639
## target:betweenVar   3   1.46  0.4871   0.709  0.548
## Residuals         196 134.60  0.6868               
## 
## Error: Subject:disputeSystem
##                           Df Sum Sq Mean Sq F value Pr(>F)
## disputeSystem              1    0.2  0.1954   0.272  0.603
## disputeSystem:betweenVar   3    0.4  0.1329   0.185  0.907
## Residuals                196  140.8  0.7186               
## 
## Error: Subject:target:disputeSystem
##                                  Df Sum Sq Mean Sq F value Pr(>F)  
## target:disputeSystem              1   0.92  0.9197   1.225 0.2697  
## target:disputeSystem:betweenVar   3   5.03  1.6777   2.235 0.0854 .
## Residuals                       196 147.11  0.7506                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# some descriptive statistics and graphs
print(model.tables(aovComp, "means"), digits = 3) 
## Tables of means
## Grand mean
##          
## 1.980874 
## 
##  target 
## target
##    Wife Husband 
##   1.967   1.995 
## 
##  disputeSystem 
## disputeSystem
##     Trial Mediation 
##     1.965     1.997 
## 
##  betweenVar 
## betweenVar
##     bothCou   bothUnCou husbandOnly    wifeOnly 
##       2.014       1.905       1.926       2.078 
## 
##  target:disputeSystem 
##          disputeSystem
## target    Trial Mediation
##   Wife    1.985 1.949    
##   Husband 1.945 2.044    
## 
##  target:betweenVar 
##          betweenVar
## target    bothCou bothUnCou husbandOnly wifeOnly
##   Wife    1.991   1.936     1.846       2.095   
##   Husband 2.037   1.875     2.005       2.061   
## 
##  disputeSystem:betweenVar 
##              betweenVar
## disputeSystem bothCou bothUnCou husbandOnly wifeOnly
##     Trial     1.963   1.886     1.930       2.082   
##     Mediation 2.065   1.925     1.922       2.074   
## 
##  target:disputeSystem:betweenVar 
## , , betweenVar = bothCou
## 
##          disputeSystem
## target    Trial Mediation
##   Wife    1.895 2.087    
##   Husband 2.032 2.042    
## 
## , , betweenVar = bothUnCou
## 
##          disputeSystem
## target    Trial Mediation
##   Wife    2.036 1.836    
##   Husband 1.737 2.013    
## 
## , , betweenVar = husbandOnly
## 
##          disputeSystem
## target    Trial Mediation
##   Wife    1.957 1.736    
##   Husband 1.902 2.109    
## 
## , , betweenVar = wifeOnly
## 
##          disputeSystem
## target    Trial Mediation
##   Wife    2.053 2.137    
##   Husband 2.110 2.012
theData %>% group_by(target) %>% summarise(ave=mean(OutCome)) %>% ungroup()
## # A tibble: 2 x 2
##   target    ave
##   <fct>   <dbl>
## 1 Wife     1.97
## 2 Husband  1.99
theData %>% group_by(disputeSystem) %>% summarise(ave=mean(OutCome)) %>% ungroup()
## # A tibble: 2 x 2
##   disputeSystem   ave
##   <fct>         <dbl>
## 1 Trial          1.97
## 2 Mediation      2.00
theData %>% group_by(betweenVar ) %>% summarise(ave=mean(OutCome)) %>% ungroup()
## # A tibble: 4 x 2
##   betweenVar    ave
##   <fct>       <dbl>
## 1 bothCou      2.01
## 2 bothUnCou    1.91
## 3 husbandOnly  1.93
## 4 wifeOnly     2.08
theData %>% group_by(target,disputeSystem) %>% summarise(ave=mean(OutCome)) %>% ungroup()
## # A tibble: 4 x 3
##   target  disputeSystem   ave
##   <fct>   <fct>         <dbl>
## 1 Wife    Trial          1.99
## 2 Wife    Mediation      1.95
## 3 Husband Trial          1.95
## 4 Husband Mediation      2.04
theData %>% group_by(target,betweenVar) %>% summarise(ave=mean(OutCome)) %>% ungroup()
## # A tibble: 8 x 3
##   target  betweenVar    ave
##   <fct>   <fct>       <dbl>
## 1 Wife    bothCou      1.99
## 2 Wife    bothUnCou    1.94
## 3 Wife    husbandOnly  1.85
## 4 Wife    wifeOnly     2.10
## 5 Husband bothCou      2.04
## 6 Husband bothUnCou    1.88
## 7 Husband husbandOnly  2.01
## 8 Husband wifeOnly     2.06
theData %>% group_by(disputeSystem,betweenVar) %>% summarise(ave=mean(OutCome)) %>% ungroup()
## # A tibble: 8 x 3
##   disputeSystem betweenVar    ave
##   <fct>         <fct>       <dbl>
## 1 Trial         bothCou      1.96
## 2 Trial         bothUnCou    1.89
## 3 Trial         husbandOnly  1.93
## 4 Trial         wifeOnly     2.08
## 5 Mediation     bothCou      2.06
## 6 Mediation     bothUnCou    1.92
## 7 Mediation     husbandOnly  1.92
## 8 Mediation     wifeOnly     2.07
theData %>% group_by(target,disputeSystem,betweenVar) %>% summarise(ave=mean(OutCome)) %>% ungroup()
## # A tibble: 16 x 4
##    target  disputeSystem betweenVar    ave
##    <fct>   <fct>         <fct>       <dbl>
##  1 Wife    Trial         bothCou      1.90
##  2 Wife    Trial         bothUnCou    2.04
##  3 Wife    Trial         husbandOnly  1.96
##  4 Wife    Trial         wifeOnly     2.05
##  5 Wife    Mediation     bothCou      2.09
##  6 Wife    Mediation     bothUnCou    1.84
##  7 Wife    Mediation     husbandOnly  1.74
##  8 Wife    Mediation     wifeOnly     2.14
##  9 Husband Trial         bothCou      2.03
## 10 Husband Trial         bothUnCou    1.74
## 11 Husband Trial         husbandOnly  1.90
## 12 Husband Trial         wifeOnly     2.11
## 13 Husband Mediation     bothCou      2.04
## 14 Husband Mediation     bothUnCou    2.01
## 15 Husband Mediation     husbandOnly  2.11
## 16 Husband Mediation     wifeOnly     2.01
boxplot(OutCome ~ target, data = theData)              

boxplot(OutCome ~ disputeSystem, data = theData)  

boxplot(OutCome ~ betweenVar, data = theData)            

boxplot(OutCome ~ target*disputeSystem, data = theData)       

boxplot(OutCome ~ target*betweenVar, data = theData)    

boxplot(OutCome ~ betweenVar*disputeSystem, data = theData) 

boxplot(OutCome ~ target*disputeSystem*betweenVar, data = theData)    

###############################################
# for power estimate run the below
# don't forget to set up theData and var-cov
###############################################
library(MASS)

runTest <- function(){
  out1 <- mvrnorm(nPerGroup, mu = muBothUnCou, Sigma = B)
  out2 <- mvrnorm(nPerGroup, mu = muHusbandOnly, Sigma = B)
  out3 <- mvrnorm(nPerGroup, mu = muWifeOnly, Sigma = B)
  out4 <- mvrnorm(nPerGroup, mu = muBothCou, Sigma = B)
  
  theData$OutCome <- c(as.vector(t(out1)), as.vector(t(out2)), as.vector(t(out3)), as.vector(t(out4)))
  aovComp <- aov(OutCome ~ (target*disputeSystem*betweenVar) + Error(Subject/(target*disputeSystem)), theData)
  
  b <- summary(aovComp)$'Error: Subject:target:disputeSystem'[[1]][2,5]
  b < 0.05
}

# here is estimate of power for given nPerGroup
mean(replicate(nReplications, runTest()))
## [1] 0.04