getwd()
## [1] "C:/Users/Caleb/Documents/Civil Engineering degree coursework/Applied Statistical Methods/Labs/Lab 12"
set.seed(55);x1=rnorm(30,mean=25,sd=5)
test1=t.test(x1,mu=22)
ci1=test1$conf.int
ci1
## [1] 23.30198 27.27320
## attr(,"conf.level")
## [1] 0.95
pval1=test1$p.value
pval1
## [1] 0.002052426
boxplot(x1, main="Sample x1")
abline(h=c(ci1,mean(x1)),col=c("Red","Red","Green"))
Reject the Null Hypothesis in favor of the alternative.
test2=t.test(x1,mu=23)
ci2=test2$conf.int
ci2
## [1] 23.30198 27.27320
## attr(,"conf.level")
## [1] 0.95
pval2=test2$p.value
pval2
## [1] 0.02542929
boxplot(x1, main="Sample x1")
abline(h=c(ci2,mean(x1)),col=c("Red","Red","Green"))
Reject the Null Hypothesis in favor of the alternative.
test3=t.test(x1,mu=24)
ci3=test3$conf.int
ci3
## [1] 23.30198 27.27320
## attr(,"conf.level")
## [1] 0.95
pval3=test3$p.value
pval3
## [1] 0.1951074
boxplot(x1, main="Sample x1")
abline(h=c(ci3,mean(x1)),col=c("Red","Red","Green"))
test4=t.test(x1,mu=25)
ci4=test4$conf.int
ci4
## [1] 23.30198 27.27320
## attr(,"conf.level")
## [1] 0.95
pval4=test4$p.value
pval4
## [1] 0.7691681
boxplot(x1, main="Sample x1")
abline(h=c(ci4,mean(x1)),col=c("Red","Red","Green"))
test5=t.test(x1,mu=26)
ci5=test5$conf.int
ci5
## [1] 23.30198 27.27320
## attr(,"conf.level")
## [1] 0.95
pval5=test5$p.value
pval5
## [1] 0.468963
boxplot(x1, main="Sample x1")
abline(h=c(ci5,mean(x1)),col=c("Red","Red","Green"))
Do not reject the Null on the last three.
tcalc=(mean(x1)-24)/(sd(x1)/sqrt(30))
tcalc
## [1] 1.326252
mypvalue=function(t0,xmax=4,n=20, alpha=0.05){
#calculate alpha/2
va=round(pt(-t0,df=n-1),4)
pv=2*va
# plot the t dist
curve(dt(x,df=n-1),xlim=c(-xmax,xmax),ylab="T Density",xlab=expression(t),
main=substitute(paste("P-value=", pv, " alpha=", alpha)))
# set up points on the polygon to the right
xcurve=seq(t0,xmax,length=1000)
ycurve=dt(xcurve,df=n-1)
# set up points to the left
xlcurve=seq(-t0,-xmax,length=1000)
ylcurve=dt(xcurve,df=n-1)
# Shade in the polygon defined by the line segments
polygon(c(t0,xcurve,xmax),c(0,ycurve,0),col="green")
polygon(c(-t0,xlcurve,-xmax),c(0,ylcurve,0),col="green")
# make quantiles
q=qt(1-alpha/2,n-1)
abline( v=c(q,-q),lwd=2) # plot the cut off t value
axis(3,c(q,-q),c(expression(abs(t[alpha/2])),expression(-abs(t[alpha/2]))))
# Annotation
text(0.5*(t0+xmax),max(ycurve),substitute(paste(area, "=",va)))
text(-0.5*(t0+xmax),max(ycurve),expression(area))
return(list(q=q,pvalue=pv))
}
##### END of FUNCTION for P VALUES #####
mypvalue(tcalc,n=30,alpha=0.05)
## $q
## [1] 2.04523
##
## $pvalue
## [1] 0.1952
t.test(x1,mu=23)
##
## One Sample t-test
##
## data: x1
## t = 2.3563, df = 29, p-value = 0.02543
## alternative hypothesis: true mean is not equal to 23
## 95 percent confidence interval:
## 23.30198 27.27320
## sample estimates:
## mean of x
## 25.28759
The rejection region is below -2.04523 and above 2.04523. We should reject \(H_0\) if our P value goes below \(0.05\). No, \(t_{calc}\) is not within the rejection region.
bootpval<-function(x,conf.level=0.95,iter=3000,mu0=0, test="two"){
n=length(x)
y=x-mean(x)+mu0 # transform the data so that it is centered at the NULL
rs.mat<-c() #rs.mat will become a resample matrix -- now it is an empty vector
xrs.mat<-c()
for(i in 1:iter){ # for loop - the loop will go around iter times
rs.mat<-cbind(rs.mat,sample(y,n,replace=TRUE)) #sampling from y cbind -- column bind -- binds the vectors together by columns
xrs.mat<-cbind(xrs.mat,sample(x,n,replace=TRUE)) #sampling from x cbind -- column bind -- binds the vectors together by columns
}
tstat<-function(z){ # The value of t when the NULL is assumed true (xbar-muo)/z/sqrt(n)
sqrt(n)*(mean(z)-mu0)/sd(z)
}
tcalc=tstat(x) # t for the data collected
ytstat=apply(rs.mat,2,tstat) # tstat of resampled y's, ytstat is a vector and will have iter values in it
xstat=apply(xrs.mat,2,mean) # mean of resampled x's
alpha=1-conf.level # calculating alpha
ci=quantile(xstat,c(alpha/2,1-alpha/2))# Nice way to form a confidence interval
pvalue=ifelse(test=="two",length(ytstat[ytstat>abs(tcalc) | ytstat < -abs(tcalc)])/iter,
ifelse(test=="upper",length(ytstat[ytstat>tcalc])/iter,
length(ytstat[ytstat<xstat])/iter))
h=hist(ytstat,plot=FALSE)
mid=h$mid
if(test=="two"){
ncoll=length(mid[mid<= -abs(tcalc)])
ncolr=length(mid[mid>= abs(tcalc)])
col=c(rep("Green",ncoll),rep("Gray",length(mid)-ncoll-ncolr),rep("Green",ncolr))
}
if(test=="upper"){
ncolr=length(mid[mid>= abs(tcalc)])
col=c(rep("Gray",length(mid)-ncolr),rep("Green",ncolr))
}
if(test=="lower"){
ncoll=length(mid[mid<= -abs(tcalc)])
col=c(rep("Green",ncoll),rep("Gray",length(mid)-ncoll))
}
hist(ytstat,col=col,freq=FALSE,las=1,main="",xlab=expression(T[stat]))
#segments(ci[1],0,ci[2],0,lwd=2)
pround=round(pvalue,4)
title(substitute(paste(P[value],"=",pround)))
return(list(pvalue=pvalue,tcalc=tcalc,n=n,x=x,test=test,ci=ci))
}
bootpval(x=x1,mu0=22,test="two")
## $pvalue
## [1] 0.002666667
##
## $tcalc
## [1] 3.386302
##
## $n
## [1] 30
##
## $x
## [1] 25.60070 15.93812 25.75791 19.40389 25.00954 30.94259 22.47328 24.50383
## [9] 26.52677 25.99205 24.75545 20.78383 14.62365 23.19618 21.81155 23.16861
## [17] 36.77682 30.46689 26.42921 29.96829 17.40364 32.48559 29.09808 30.33025
## [25] 28.66878 29.80172 21.53910 32.02999 16.83231 26.30915
##
## $test
## [1] "two"
##
## $ci
## 2.5% 97.5%
## 23.43182 27.12862
bootpval(x=x1,mu0=23,test="two")
## $pvalue
## [1] 0.02233333
##
## $tcalc
## [1] 2.356277
##
## $n
## [1] 30
##
## $x
## [1] 25.60070 15.93812 25.75791 19.40389 25.00954 30.94259 22.47328 24.50383
## [9] 26.52677 25.99205 24.75545 20.78383 14.62365 23.19618 21.81155 23.16861
## [17] 36.77682 30.46689 26.42921 29.96829 17.40364 32.48559 29.09808 30.33025
## [25] 28.66878 29.80172 21.53910 32.02999 16.83231 26.30915
##
## $test
## [1] "two"
##
## $ci
## 2.5% 97.5%
## 23.37327 27.18672
bootpval(x=x1,mu0=24,test="two")
## $pvalue
## [1] 0.19
##
## $tcalc
## [1] 1.326252
##
## $n
## [1] 30
##
## $x
## [1] 25.60070 15.93812 25.75791 19.40389 25.00954 30.94259 22.47328 24.50383
## [9] 26.52677 25.99205 24.75545 20.78383 14.62365 23.19618 21.81155 23.16861
## [17] 36.77682 30.46689 26.42921 29.96829 17.40364 32.48559 29.09808 30.33025
## [25] 28.66878 29.80172 21.53910 32.02999 16.83231 26.30915
##
## $test
## [1] "two"
##
## $ci
## 2.5% 97.5%
## 23.44065 27.12708
bootpval(x=x1,mu0=25,test="two")
## $pvalue
## [1] 0.7696667
##
## $tcalc
## [1] 0.2962266
##
## $n
## [1] 30
##
## $x
## [1] 25.60070 15.93812 25.75791 19.40389 25.00954 30.94259 22.47328 24.50383
## [9] 26.52677 25.99205 24.75545 20.78383 14.62365 23.19618 21.81155 23.16861
## [17] 36.77682 30.46689 26.42921 29.96829 17.40364 32.48559 29.09808 30.33025
## [25] 28.66878 29.80172 21.53910 32.02999 16.83231 26.30915
##
## $test
## [1] "two"
##
## $ci
## 2.5% 97.5%
## 23.42376 27.18891
bootpval(x=x1,mu0=26,test="two")
## $pvalue
## [1] 0.457
##
## $tcalc
## [1] -0.7337985
##
## $n
## [1] 30
##
## $x
## [1] 25.60070 15.93812 25.75791 19.40389 25.00954 30.94259 22.47328 24.50383
## [9] 26.52677 25.99205 24.75545 20.78383 14.62365 23.19618 21.81155 23.16861
## [17] 36.77682 30.46689 26.42921 29.96829 17.40364 32.48559 29.09808 30.33025
## [25] 28.66878 29.80172 21.53910 32.02999 16.83231 26.30915
##
## $test
## [1] "two"
##
## $ci
## 2.5% 97.5%
## 23.41850 27.13104
These agree with the previous result that we should reject \(\mu=22\) and \(\mu=23\) and accept \(\mu=24\), \(\mu=25\), and \(\mu=26\).
set.seed(30);x=rnorm(15,mean=10,sd=7)
set.seed(40);y=rnorm(20,mean=12,sd=4)
var.test(x,y)
##
## F test to compare two variances
##
## data: x and y
## F = 3.3363, num df = 14, denom df = 19, p-value = 0.01596
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 1.260446 9.544255
## sample estimates:
## ratio of variances
## 3.33631
I conclude that the standard deviation of x is about 3 times larger than the variance of y. Thus, I will put var.equal=FALSE in t.test().
t.test(y,x,var.eqaul=FALSE,mu=0)
##
## Welch Two Sample t-test
##
## data: y and x
## t = 1.7945, df = 20.248, p-value = 0.08768
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.5950525 7.9632156
## sample estimates:
## mean of x mean of y
## 11.674425 7.990343
t.test(y,x,var.eqaul=FALSE,mu=2)
##
## Welch Two Sample t-test
##
## data: y and x
## t = 0.8203, df = 20.248, p-value = 0.4216
## alternative hypothesis: true difference in means is not equal to 2
## 95 percent confidence interval:
## -0.5950525 7.9632156
## sample estimates:
## mean of x mean of y
## 11.674425 7.990343
According to the results of the tests we should accept the Null Hypothesis of both of them because our null value lies with the confidence interval and both P values are larger than \(\alpha\).
set.seed(30);x=rnorm(15,mean=10,sd=4)
set.seed(40);y=rnorm(20,mean=12,sd=4)
var.test(x,y)
##
## F test to compare two variances
##
## data: x and y
## F = 1.0894, num df = 14, denom df = 19, p-value = 0.8454
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.4115743 3.1164915
## sample estimates:
## ratio of variances
## 1.089407
t.test(y,x,var.equal=TRUE,mu=0)
##
## Two Sample t-test
##
## data: y and x
## t = 2.0623, df = 33, p-value = 0.04712
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.03803598 5.60756393
## sample estimates:
## mean of x mean of y
## 11.674425 8.851625
t.test(y,x,var.equal=TRUE,mu=2)
##
## Two Sample t-test
##
## data: y and x
## t = 0.60113, df = 33, p-value = 0.5519
## alternative hypothesis: true difference in means is not equal to 2
## 95 percent confidence interval:
## 0.03803598 5.60756393
## sample estimates:
## mean of x mean of y
## 11.674425 8.851625
According to the tests we should reject the Null Hypothesis that the difference is zero. But we should accept the Null Hypothesis that the difference is 2.
boot2pval<-function(x1,x2,conf.level=0.95,iter=3000,mudiff=0, test="two"){
n1=length(x1)
n2=length(x2)
y1=x1-mean(x1)+mean(c(x1,x2)) # transform the data so that it is centered at the NULL
y2=x2-mean(x2)+mean(c(x1,x2))
y1rs.mat<-c() #rs.mat will be come a resample matrix -- now it is an empty vector
x1rs.mat<-c()
y2rs.mat<-c()
x2rs.mat<-c()
for(i in 1:iter){ # for loop - the loop will go around iter times
y1rs.mat<-cbind(y1rs.mat,sample(y1,n1,replace=TRUE)) #sampling from y cbind -- column bind -- binds the vectors together by columns
y2rs.mat<-cbind(y2rs.mat,sample(y2,n2,replace=TRUE))
}
x1rs.mat<-y1rs.mat+mean(x1)-mean(c(x1,x2))
x2rs.mat<-y2rs.mat+mean(x2)-mean(c(x1,x2))
xbar1=mean(x1)
xbar2=mean(x2)
sx1sq=var(x1)
sx2sq=var(x2)
tcalc=(xbar1-xbar2-mudiff)/sqrt(sx1sq/n1+sx2sq/n2)
sy1sq=apply(y1rs.mat,2,var)
sy2sq=apply(y2rs.mat,2,var)
y1bar=apply(y1rs.mat,2,mean)
y2bar=apply(y2rs.mat,2,mean)
tstat=(y1bar-y2bar-mudiff)/sqrt(sy1sq/n1+sy2sq/n2)
alpha=1-conf.level # calculating alpha
#ci=quantile(xstat,c(alpha/2,1-alpha/2))# Nice way to form a confidence interval
pvalue=ifelse(test=="two",length(tstat[tstat>abs(tcalc) | tstat < -abs(tcalc)])/iter,
ifelse(test=="upper",length(tstat[tstat>tcalc])/iter,
length(ytstat[tstat<tcalc])/iter))
h=hist(tstat,plot=FALSE)
mid=h$mid
if(test=="two"){
ncoll=length(mid[mid<= -abs(tcalc)])
ncolr=length(mid[mid>= abs(tcalc)])
col=c(rep("Green",ncoll),rep("Gray",length(mid)-ncoll-ncolr),rep("Green",ncolr))
}
hist(tstat,col=col,freq=FALSE)
#segments(ci[1],0,ci[2],0,lwd=2)
return(list(pvalue=pvalue))
#return(list(pvalue=pvalue,tcalc=tcalc,n=n,x=x,test=test,ci=ci))
}
set.seed(30);x=rnorm(15,mean=10,sd=7)
set.seed(40);y=rnorm(20,mean=12,sd=4)
boot2pval(x1=y,x2=x,mudiff=0)
## $pvalue
## [1] 0.08366667
boot2pval(x1=y,x2=x,mudiff=2)
## $pvalue
## [1] 0.5896667
set.seed(30);x=rnorm(15,mean=10,sd=4)
set.seed(40);y=rnorm(20,mean=12,sd=4)
boot2pval(x1=y,x2=x,mudiff=0)
## $pvalue
## [1] 0.048
boot2pval(x1=y,x2=x,mudiff=2)
## $pvalue
## [1] 0.8336667
Line A is running a T test on the data named x1 with the null hypothesis that \(\mu=23\).
Line B is the title of the results of the T-test, and it says that this test was done on one sample.
Line C contains the \(T_{calc}\) value, degrees of freedom, and the p-value. Calculated from the T test.
Line D says what the alternate hypothesis is for this test.
Line E is the confidence of the interval that it is calculating, while line F gives the confidence interval.
Line G gives an estimate of the mean of x1.
set.seed(55);x1=rnorm(30,mean=25,sd=5)
MATH4753GRAY::bootpval(x1,mu=25)
## $pvalue
## [1] 0.7716667
##
## $tcalc
## [1] 0.2962266
##
## $n
## [1] 30
##
## $x
## [1] 25.60070 15.93812 25.75791 19.40389 25.00954 30.94259 22.47328 24.50383
## [9] 26.52677 25.99205 24.75545 20.78383 14.62365 23.19618 21.81155 23.16861
## [17] 36.77682 30.46689 26.42921 29.96829 17.40364 32.48559 29.09808 30.33025
## [25] 28.66878 29.80172 21.53910 32.02999 16.83231 26.30915
##
## $test
## [1] "two"
##
## $ci
## 2.5% 97.5%
## 23.43182 27.12862