Why simulate?
We repeatedly draw samples from a known distribution (not necessarily normal)
** This help us to do some statistical analysis on an artificial population with known, but random, properties.
Fair coin
N <- 10
headProb<-0.5
tailProb<-0.5
set.seed(6)
coinSample1<-sample(c("Head","Tail"), size = N
# replace=TRUE to over ride the default sample without replacement
,replace = TRUE
# prob= to sample elements with different probabilities
,prob=c(headProb,tailProb))
coinSample1
## [1] "Head" "Head" "Tail" "Tail" "Head" "Head" "Head" "Head" "Head" "Tail"
prop.table(table(coinSample1))
## coinSample1
## Head Tail
## 0.7 0.3
Let’s repeat this for 20 times.
N<-10
headProb<-0.5
tailProb<-0.5
nReplications<-20
resultHold<-NULL
for(which.replication in 1:nReplications){
coinSample1<-sample(c("Head","Tail")
,size = N
,replace = TRUE
,prob=c(headProb,tailProb))
proportions<-prop.table(table(coinSample1))
resultHold<-rbind(resultHold, proportions)
}
resultHold<-as.data.frame(resultHold)
row.names(resultHold)<-NULL
resultHold
## Head Tail
## 1 0.6 0.4
## 2 0.4 0.6
## 3 0.6 0.4
## 4 0.7 0.3
## 5 0.4 0.6
## 6 0.6 0.4
## 7 0.4 0.6
## 8 0.4 0.6
## 9 0.4 0.6
## 10 0.6 0.4
## 11 0.4 0.6
## 12 0.6 0.4
## 13 0.3 0.7
## 14 0.2 0.8
## 15 0.6 0.4
## 16 0.6 0.4
## 17 0.4 0.6
## 18 0.7 0.3
## 19 0.3 0.7
## 20 0.3 0.7
Change N to
Select an N and make the coin not fair by changing the probabilities (headProb and tailProb)
N<-seq(from=5, to=100,by=5)
headProb<-0.5
tailProb<-0.5
nReplications<-20
resultHoldChangeN<-NULL
for(which.N in 1:length(N)){
#which.N <-1
resultHold<-NULL
for(which.replication in 1:nReplications){
#which.replication=1
coinSample1<-sample(c("Head","Tail")
,size = N[which.N]
,replace = TRUE
,prob=c(headProb,tailProb))
proportions<-prop.table(table(coinSample1))
resultHold<-rbind(resultHold, proportions)
}
resultHold<-as.data.frame(resultHold)
row.names(resultHold)<-NULL
resultHoldChangeN<-rbind(resultHoldChangeN
,cbind(N[which.N],mean(resultHold[["Head"]])))
}
resultHoldChangeN<-as.data.frame(resultHoldChangeN)
names(resultHoldChangeN)<-c("N","Head Ave. Prob.")
resultHoldChangeN
## N Head Ave. Prob.
## 1 5 0.5400000
## 2 10 0.5550000
## 3 15 0.4566667
## 4 20 0.4850000
## 5 25 0.4660000
## 6 30 0.5350000
## 7 35 0.5185714
## 8 40 0.5025000
## 9 45 0.5111111
## 10 50 0.5100000
## 11 55 0.4763636
## 12 60 0.4958333
## 13 65 0.5100000
## 14 70 0.5092857
## 15 75 0.4973333
## 16 80 0.4906250
## 17 85 0.5088235
## 18 90 0.4866667
## 19 95 0.5121053
## 20 100 0.5095000
nReplications<-100
N=10
meanControl=0.10
meanTreatment=0.10
pHold<-NULL
for(which.rep in 1:nReplications){
treatmentGroup<-rnorm(N, mean=meanTreatment,sd=1)
controlGroup <-rnorm(N, mean=meanControl,sd=1)
dataToSave<-data.frame(group=c(rep("Treatment",N)
,rep("Control",N))
,score=c(treatmentGroup,controlGroup))
#write.csv(dataToSave,file=paste("data ",which.rep,".csv",sep=""),row.names=FALSE)
tResult<-t.test(treatmentGroup, controlGroup,alternative = c("two.sided"),paired=FALSE)
pHold<-c(pHold,tResult$p.value)
}
sum(pHold<0.05)/nReplications
## [1] 0.09
#round(pHold,digits=3)
Try new means for control and treatment
# Correlation
cor(treatmentGroup,controlGroup)
## [1] -0.06321852
library("MASS")
corBetweenPrePost<-0.3
Covariance<- matrix(c(1,corBetweenPrePost,
corBetweenPrePost,1)
,nrow=2
,ncol=2)
# cov2cor(Covariance)
preMean<-0
postMean<-0
N=10
dataDependent<-as.data.frame(mvrnorm(n = N, mu=c(preMean,postMean), Sigma=Covariance))
names(dataDependent)<-c("pre","post")
head(dataDependent)
## pre post
## 1 -0.1642676 1.05107767
## 2 -1.2177751 -0.06242311
## 3 -1.0041020 -0.51739361
## 4 -1.5245561 -1.32871782
## 5 1.2195065 1.12262317
## 6 0.3473573 0.85328703
cor(dataDependent)
## pre post
## pre 1.0000000 0.2267753
## post 0.2267753 1.0000000
colMeans(dataDependent)
## pre post
## 0.05773755 -0.20239969
t.test(dataDependent$pre
,dataDependent$post
,alternative = c("two.sided")
,paired=TRUE)
##
## Paired t-test
##
## data: dataDependent$pre and dataDependent$post
## t = 0.57871, df = 9, p-value = 0.577
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.7567235 1.2769980
## sample estimates:
## mean of the differences
## 0.2601372
Let’s repeat it
corBetweenPrePost<-0.45
Covariance<- matrix(c(1,corBetweenPrePost,
corBetweenPrePost,1)
,nrow=2
,ncol=2)
# cov2cor(Covariance)
preMean<-0
postMean<-0.2
N=10
nReplications<-20
pHold<-NULL
for(which.rep in 1:nReplications){
dataDependent<-as.data.frame(mvrnorm(n = N, mu=c(preMean,postMean), Sigma=Covariance))
names(dataDependent)<-c("pre","post")
#write.csv(dataToSave,file=paste("data ",which.rep,".csv",sep=""),row.names=FALSE)
tResult<-t.test(dataDependent$pre
,dataDependent$post
,alternative = c("two.sided")
,paired=TRUE)
pHold<-c(pHold,tResult$p.value)
}
sum(pHold<0.05)/nReplications
## [1] 0.1
Try some new correlations and means for pre and post
# https://gist.github.com/gjkerns/1608265 (Code modified)
# Simulation study for sample size 1 between/ 2within
# Counseled Status is a between-subjects variable, with 4 levels (Both Uncounseled, Both Counseled, Wife Only Counseled, Husband Only Counseled
# Target is a within-subjects variable, with 2 levels (Wife and Husband)
# Dispute System is also within-subjects, with 2 levels (Trial and Mediation)
library(MASS)
library(tidyverse)
## -- Attaching packages -------------------------------------- tidyverse 1.2.1 --
## v ggplot2 2.2.1 v purrr 0.2.4
## v tibble 1.4.1 v dplyr 0.7.4
## v tidyr 0.7.2 v stringr 1.2.0
## v readr 1.1.1 v forcats 0.2.0
## -- Conflicts ----------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x dplyr::select() masks MASS::select()
nPerGroup <- 50
nBetween<-4
nReplications <- 100
# set up the indep var data
target <- factor(1:2, labels = c("Wife", "Husband"))
disputeSystem <- factor(1:2, labels = c("Trial","Mediation"))
Subject <- factor(1:(nPerGroup*nBetween))
theData <- expand.grid(target,disputeSystem, Subject)
names(theData) <- c("target","disputeSystem", "Subject")
between <- rep(c("bothUnCou", "husbandOnly","wifeOnly","bothCou"), each = nPerGroup * nBetween)
theData$betweenVar <- factor(between)
#order of the means
expand.grid(target,disputeSystem)
## Var1 Var2
## 1 Wife Trial
## 2 Husband Trial
## 3 Wife Mediation
## 4 Husband Mediation
# Set the
muBothUnCou <- c(2, 2.0, 2.0, 2.0)
muHusbandOnly <- c(2, 2.0, 2.0, 2.0)
muWifeOnly <- c(2, 2.0, 2.0, 2.0)
muBothCou <- c(2, 2.0, 2.0, 2.0)
# to set up variance-covariance matrix
#stdevs <- c(0.6, 0.8, 1.0, 1.2)
#stdiff <- 1
#ones <- rep(1, nBetween)
#A <- stdevs^2 %o% ones
#B <- (A + t(A) + (stdiff^2)*(diag(nBetween) - ones %o% ones))/2
B <-matrix(c(1,0.3,0.3,0.3,
0.3,1.0,0.3,0.3,
0.3,0.3,1.0,0.3,
0.3,0.3,0.3,1.0),nrow=4,ncol=4)
# can run it through once to check that it works
out1 <- mvrnorm(nPerGroup, mu = muBothUnCou, Sigma = B)
out2 <- mvrnorm(nPerGroup, mu = muHusbandOnly, Sigma = B)
out3 <- mvrnorm(nPerGroup, mu = muWifeOnly, Sigma = B)
out4 <- mvrnorm(nPerGroup, mu = muBothCou, Sigma = B)
theData$OutCome <- c(as.vector(t(out1)), as.vector(t(out2)), as.vector(t(out3)), as.vector(t(out4)))
#theData
aovComp <- aov(OutCome ~ target*disputeSystem*betweenVar + Error(Subject/(target*disputeSystem)), theData)
summary(aovComp)
##
## Error: Subject
## Df Sum Sq Mean Sq F value Pr(>F)
## betweenVar 3 3.9 1.284 0.651 0.583
## Residuals 196 386.6 1.972
##
## Error: Subject:target
## Df Sum Sq Mean Sq F value Pr(>F)
## target 1 0.15 0.1512 0.220 0.639
## target:betweenVar 3 1.46 0.4871 0.709 0.548
## Residuals 196 134.60 0.6868
##
## Error: Subject:disputeSystem
## Df Sum Sq Mean Sq F value Pr(>F)
## disputeSystem 1 0.2 0.1954 0.272 0.603
## disputeSystem:betweenVar 3 0.4 0.1329 0.185 0.907
## Residuals 196 140.8 0.7186
##
## Error: Subject:target:disputeSystem
## Df Sum Sq Mean Sq F value Pr(>F)
## target:disputeSystem 1 0.92 0.9197 1.225 0.2697
## target:disputeSystem:betweenVar 3 5.03 1.6777 2.235 0.0854 .
## Residuals 196 147.11 0.7506
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# some descriptive statistics and graphs
print(model.tables(aovComp, "means"), digits = 3)
## Tables of means
## Grand mean
##
## 1.980874
##
## target
## target
## Wife Husband
## 1.967 1.995
##
## disputeSystem
## disputeSystem
## Trial Mediation
## 1.965 1.997
##
## betweenVar
## betweenVar
## bothCou bothUnCou husbandOnly wifeOnly
## 2.014 1.905 1.926 2.078
##
## target:disputeSystem
## disputeSystem
## target Trial Mediation
## Wife 1.985 1.949
## Husband 1.945 2.044
##
## target:betweenVar
## betweenVar
## target bothCou bothUnCou husbandOnly wifeOnly
## Wife 1.991 1.936 1.846 2.095
## Husband 2.037 1.875 2.005 2.061
##
## disputeSystem:betweenVar
## betweenVar
## disputeSystem bothCou bothUnCou husbandOnly wifeOnly
## Trial 1.963 1.886 1.930 2.082
## Mediation 2.065 1.925 1.922 2.074
##
## target:disputeSystem:betweenVar
## , , betweenVar = bothCou
##
## disputeSystem
## target Trial Mediation
## Wife 1.895 2.087
## Husband 2.032 2.042
##
## , , betweenVar = bothUnCou
##
## disputeSystem
## target Trial Mediation
## Wife 2.036 1.836
## Husband 1.737 2.013
##
## , , betweenVar = husbandOnly
##
## disputeSystem
## target Trial Mediation
## Wife 1.957 1.736
## Husband 1.902 2.109
##
## , , betweenVar = wifeOnly
##
## disputeSystem
## target Trial Mediation
## Wife 2.053 2.137
## Husband 2.110 2.012
theData %>% group_by(target) %>% summarise(ave=mean(OutCome)) %>% ungroup()
## # A tibble: 2 x 2
## target ave
## <fct> <dbl>
## 1 Wife 1.97
## 2 Husband 1.99
theData %>% group_by(disputeSystem) %>% summarise(ave=mean(OutCome)) %>% ungroup()
## # A tibble: 2 x 2
## disputeSystem ave
## <fct> <dbl>
## 1 Trial 1.97
## 2 Mediation 2.00
theData %>% group_by(betweenVar ) %>% summarise(ave=mean(OutCome)) %>% ungroup()
## # A tibble: 4 x 2
## betweenVar ave
## <fct> <dbl>
## 1 bothCou 2.01
## 2 bothUnCou 1.91
## 3 husbandOnly 1.93
## 4 wifeOnly 2.08
theData %>% group_by(target,disputeSystem) %>% summarise(ave=mean(OutCome)) %>% ungroup()
## # A tibble: 4 x 3
## target disputeSystem ave
## <fct> <fct> <dbl>
## 1 Wife Trial 1.99
## 2 Wife Mediation 1.95
## 3 Husband Trial 1.95
## 4 Husband Mediation 2.04
theData %>% group_by(target,betweenVar) %>% summarise(ave=mean(OutCome)) %>% ungroup()
## # A tibble: 8 x 3
## target betweenVar ave
## <fct> <fct> <dbl>
## 1 Wife bothCou 1.99
## 2 Wife bothUnCou 1.94
## 3 Wife husbandOnly 1.85
## 4 Wife wifeOnly 2.10
## 5 Husband bothCou 2.04
## 6 Husband bothUnCou 1.88
## 7 Husband husbandOnly 2.01
## 8 Husband wifeOnly 2.06
theData %>% group_by(disputeSystem,betweenVar) %>% summarise(ave=mean(OutCome)) %>% ungroup()
## # A tibble: 8 x 3
## disputeSystem betweenVar ave
## <fct> <fct> <dbl>
## 1 Trial bothCou 1.96
## 2 Trial bothUnCou 1.89
## 3 Trial husbandOnly 1.93
## 4 Trial wifeOnly 2.08
## 5 Mediation bothCou 2.06
## 6 Mediation bothUnCou 1.92
## 7 Mediation husbandOnly 1.92
## 8 Mediation wifeOnly 2.07
theData %>% group_by(target,disputeSystem,betweenVar) %>% summarise(ave=mean(OutCome)) %>% ungroup()
## # A tibble: 16 x 4
## target disputeSystem betweenVar ave
## <fct> <fct> <fct> <dbl>
## 1 Wife Trial bothCou 1.90
## 2 Wife Trial bothUnCou 2.04
## 3 Wife Trial husbandOnly 1.96
## 4 Wife Trial wifeOnly 2.05
## 5 Wife Mediation bothCou 2.09
## 6 Wife Mediation bothUnCou 1.84
## 7 Wife Mediation husbandOnly 1.74
## 8 Wife Mediation wifeOnly 2.14
## 9 Husband Trial bothCou 2.03
## 10 Husband Trial bothUnCou 1.74
## 11 Husband Trial husbandOnly 1.90
## 12 Husband Trial wifeOnly 2.11
## 13 Husband Mediation bothCou 2.04
## 14 Husband Mediation bothUnCou 2.01
## 15 Husband Mediation husbandOnly 2.11
## 16 Husband Mediation wifeOnly 2.01
boxplot(OutCome ~ target, data = theData)
boxplot(OutCome ~ disputeSystem, data = theData)
boxplot(OutCome ~ betweenVar, data = theData)
boxplot(OutCome ~ target*disputeSystem, data = theData)
boxplot(OutCome ~ target*betweenVar, data = theData)
boxplot(OutCome ~ betweenVar*disputeSystem, data = theData)
boxplot(OutCome ~ target*disputeSystem*betweenVar, data = theData)
###############################################
# for power estimate run the below
# don't forget to set up theData and var-cov
###############################################
library(MASS)
runTest <- function(){
out1 <- mvrnorm(nPerGroup, mu = muBothUnCou, Sigma = B)
out2 <- mvrnorm(nPerGroup, mu = muHusbandOnly, Sigma = B)
out3 <- mvrnorm(nPerGroup, mu = muWifeOnly, Sigma = B)
out4 <- mvrnorm(nPerGroup, mu = muBothCou, Sigma = B)
theData$OutCome <- c(as.vector(t(out1)), as.vector(t(out2)), as.vector(t(out3)), as.vector(t(out4)))
aovComp <- aov(OutCome ~ (target*disputeSystem*betweenVar) + Error(Subject/(target*disputeSystem)), theData)
b <- summary(aovComp)$'Error: Subject:target:disputeSystem'[[1]][2,5]
b < 0.05
}
# here is estimate of power for given nPerGroup
mean(replicate(nReplications, runTest()))
## [1] 0.04