Do new "directed reading activities" improve the reading ability of elementary school students, as measured by their Degree of Reading Power (DRP) scores? A study assigns students at random to either the new method (treatment group, 21 students) or traditional teaching methods (control group, 23 students).
#Data subset:
#Treatment Group 57 61
#Control Group 42 62 41 28
TGroup<-c(57,61)
CGroup<-c(42,62,41,28)
TGMean<-mean(TGroup)
TGMean
## [1] 59
CGMean<-mean(CGroup)
CGMean
## [1] 43.25
Meandif<-TGMean-CGMean
Meandif
## [1] 15.75
permHyp2<-function(data1,data2,nsim,alternative){
  testStat<-mean(data1)-mean(data2)   # observed difference in means
  n1<-length(data1)
  n2<-length(data2)
  allDat<-c(data1,data2)              # pool the two samples under the null
  permNull<-c()
  for(i in 1:nsim){
    # randomly reassign n1 of the pooled observations to the "treatment" group
    permSamp<-sample(1:(n1+n2),n1,replace=FALSE)
    thisXbar<-mean(allDat[permSamp])-mean(allDat[-permSamp])
    permNull<-c(permNull,thisXbar)
  }
  return(permNull)   # note: testStat and alternative are accepted but not used here
}
TConcepermHyp2<- permHyp2(TGroup,CGroup,nsim=1,alternative = "greater")
TConcepermHyp2
## [1] -20.25
TCpermHyp2<- permHyp2(TGroup,CGroup,nsim=20,alternative = "greater")
TCpermHyp2
## [1] -6.00 5.25 -20.25 -10.50 16.50 16.50 1.50 4.50 15.75 3.75
## [11] 5.25 -10.50 4.50 -6.00 -21.00 15.75 15.75 19.50 -5.25 15.75
hist(TCpermHyp2)
abline(v=15.75, col="blue", lwd=2, lty=2)
sum(TCpermHyp2>=15.75)/20
## [1] 0.35
#Here 7 of the 20 permutation differences are at least as large as the observed 15.75, giving an estimated p-value of 7/20 = 0.35; with only 20 simulations this estimate bounces around quite a bit from run to run.
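#A larger simulation gives a steadier estimate. A minimal sketch reusing permHyp2 ("bigNull" is a new name introduced here; the result still varies run to run):
bigNull<-permHyp2(TGroup,CGroup,nsim=10000,alternative="greater")
mean(bigNull>=Meandif)   # should settle near the exact p-value of 0.2 computed below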
#Possible permutations for 57,61,42,62,41,28
mean(c(57,61)) - mean(c(41,28,42,62))
## [1] 15.75
mean(c(61,41)) - mean(c(28,42,62,57))
## [1] 3.75
mean(c(62,41)) - mean(c(28,61,57,42))
## [1] 4.5
mean(c(28,61)) - mean(c(42,41,57,62))
## [1] -6
mean(c(42,57)) - mean(c(41,62,28,61))
## [1] 1.5
mean(c(62,28)) - mean(c(61,42,41,57))
## [1] -5.25
mean(c(41,42)) - mean(c(28,57,61,62))
## [1] -10.5
mean(c(41,28)) - mean(c(57,61,42,62))
## [1] -21
mean(c(61,62)) - mean(c(57,41,42,28))
## [1] 19.5
mean(c(28,42)) - mean(c(57,61,62,41))
## [1] -20.25
mean(c(57,41)) - mean(c(61,42,62,28))
## [1] 0.75
mean(c(57,28)) - mean(c(61,42,62,41))
## [1] -9
mean(c(42,61)) - mean(c(41,62,28,57))
## [1] 4.5
mean(c(42,62)) - mean(c(61,41,28,57))
## [1] 5.25
mean(c(62,57)) - mean(c(61,41,42,28))
## [1] 16.5
#Mean differences for all possible permutations: 15.75,3.75,4.5,-6,1.5,-5.25,-10.5,-21,19.5,-20.25,0.75,-9,4.5,5.25,16.5
#Three of the fifteen differences (15.75, 16.5, 19.5) are at least as large as the observed 15.75:
3/15
## [1] 0.2
#The exact p-value is 0.2 --> My simulated estimate of 0.35 is in the right neighborhood, but noisy with only 20 simulations.
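#As a check, the same enumeration can be done programmatically with base R's combn() (a sketch; "allDRP" and "permDiffs" are new names introduced here):
allDRP<-c(TGroup,CGroup)
splits<-combn(6,2)   # each column picks which 2 of the 6 scores form the treatment group
permDiffs<-apply(splits,2,function(idx) mean(allDRP[idx])-mean(allDRP[-idx]))
mean(permDiffs>=Meandif)   # exact p-value: 3/15 = 0.2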
calls<-read.csv("calls80.csv",
header=TRUE,
na.strings = "?")
hist(calls$length)
#Most of the call lengths pile up on the left side of the graph with a long tail stretching to the right --> The distribution is skewed to the right.
bootStrapCI1<-function(data,nsim){
  n<-length(data)
  bootCI<-c()
  for(i in 1:nsim){
    # resample n observations with replacement and record the sample mean
    bootSamp<-sample(1:n,n,replace=TRUE)
    thisXbar<-mean(data[bootSamp])
    bootCI<-c(bootCI, thisXbar)
  }
  return(bootCI)   # vector of nsim bootstrap sample means
}
callsBootCI<-bootStrapCI1(calls$length, nsim=1000)
hist(callsBootCI)
#While the bootstrap distribution of the mean looks relatively normal, it is still slightly skewed to the right.
#ggplot2 would not install in my setup, but from the base histogram I can discern that the left tail is rather short, whereas the right tail is elongated.
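#A quick numeric check of that asymmetry. A minimal sketch using a moment-based skewness so no extra packages are needed ("skew" is a helper defined here, not a built-in):
skew<-function(x) mean((x-mean(x))^3)/sd(x)^3
skew(calls$length)   # should come out clearly positive --> right skew in the raw data
skew(callsBootCI)    # expected to be much closer to 0, but still positive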
callsSRS<-c(104,102,35,211,56,325,67,9,179,59)
callsSRSBootCI<-bootStrapCI1(callsSRS, nsim=1000)
hist(callsSRSBootCI)
#This histogram appears to be slightly more centered, but it is still skewed.
sd(callsBootCI)   # the SD of the bootstrap means is itself the SE of the sample mean
## [1] 36.72020
#Bootstrap standard error for calls$length: about 36.7 (dividing by sqrt(80) again would shrink it twice).
sd(callsSRSBootCI)
## [1] 28.08315
#Bootstrap standard error for callsSRS: about 28.1.
#All else equal I would expect the SRS standard error to be greater, since it comes from only 10 calls instead of 80 and standard error shrinks with sqrt(n). Here, though, the SRS bootstrap SE is actually smaller, because this particular 10-call sample is far less spread out (sd about 97) than the full 80 calls (sd about 330), whose big right-tail values inflate the spread.
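#As a sanity check, each bootstrap SE should land near the plug-in formula sd(x)/sqrt(n) applied to the original sample (a sketch; exact values depend on the data):
sd(calls$length)/sqrt(length(calls$length))
sd(callsSRS)/sqrt(length(callsSRS))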
trees<-read.csv("nspines.csv",
header=TRUE,
na.strings = "?")
ntrees<-trees$dbh[1:30]
strees<-trees$dbh[31:60]
hist(ntrees)
hist(strees)
#I would choose bootstrapping in this particular case because neither ntrees nor strees looks normally distributed, and 30 is not an especially large sample size; the normal quantile plots sketched below make this easier to judge than the histograms.
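#A quick base-R normality check (qqnorm/qqline are standard graphics functions; points bending away from the reference line indicate non-normality):
qqnorm(ntrees); qqline(ntrees)
qqnorm(strees); qqline(strees)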
Obsstat<-mean(ntrees) - mean(strees)
Obsstat
## [1] -10.83333
bootStrapHyp2<-function(data1,data2,nsim,alternative){
  testStat<-mean(data1)-mean(data2)   # observed difference in means
  n1<-length(data1)
  n2<-length(data2)
  allDat<-c(data1,data2)              # pool the samples under the null of no difference
  bootNull<-c()
  for(i in 1:nsim){
    # resample n1+n2 indices from the pooled data with replacement,
    # then split them back into two groups of the original sizes
    bootSamp<-sample(1:(n1+n2),(n1+n2),replace=TRUE)
    boot1<-bootSamp[1:n1]
    boot2<-bootSamp[(n1+1):(n1+n2)]
    thisXbar<-mean(allDat[boot1])-mean(allDat[boot2])
    bootNull<-c(bootNull,thisXbar)
  }
  return(bootNull)   # note: testStat and alternative are accepted but not used here
}
treesbSH2<-bootStrapHyp2(ntrees,strees,1000,alternative = "not equal")
hist(treesbSH2)
#Quantile method: the middle 95% of the bootstrap null distribution
quantile(treesbSH2, c(0.025,0.975))
## 2.5% 97.5%
## -8.60375 8.89025
#Hybrid method: bootstrap mean +/- t* times the SD of the bootstrap statistics
n<-length(treesbSH2)
se<-sd(treesbSH2)   # the SD of the bootstrap statistics serves as the SE
mean(treesbSH2)+c(-1,1)*qt(0.975, df=n-1)*se
## [1] -8.467526 8.586893
#Because the bootstrap null distribution came back looking relatively normal, and the quantile and hybrid intervals agree closely, I feel a bootstrap two-sample interval can be used here with reliable results. Note that the observed difference of -10.83 falls outside both intervals, so a gap that large would be unusual if the north and south stands truly had the same mean dbh.
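#The same bootstrap null distribution also gives a two-sided p-value directly (a sketch; the value varies from run to run):
mean(abs(treesbSH2)>=abs(Obsstat))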
#t confidence interval test
t.test(treesbSH2)
##
## One Sample t-test
##
## data: treesbSH2
## t = 0.43433, df = 999, p-value = 0.6641
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## -0.2099707 0.3293374
## sample estimates:
## mean of x
## 0.05968333
#The quantile and hybrid intervals describe the spread of the bootstrap null distribution, whereas this t.test treats the 1000 bootstrap statistics as raw data and divides by sqrt(1000), so its interval only pins down the center of the bootstrap distribution and is far too narrow to compare with them. The bootstrap quantile and hybrid methods are the appropriate ones here.
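#For a fairer comparison, a t interval should be computed from the raw samples rather than from the bootstrap replicates. A minimal sketch (Welch two-sample t by default; output depends on the data in nspines.csv):
t.test(ntrees,strees)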