Lab 12-2021

Task 1

getwd()

## [1] "C:/Users/Caleb/Documents/Civil Engineering degree coursework/Applied Statistical Methods/Labs/Lab 12"

Task 2

set.seed(55);x1=rnorm(30,mean=25,sd=5)

test1=t.test(x1,mu=22)
ci1=test1$conf.int
ci1

## [1] 23.30198 27.27320
## attr(,"conf.level")
## [1] 0.95

pval1=test1$p.value
pval1

## [1] 0.002052426

boxplot(x1, main="Sample x1")
abline(h=c(ci1,mean(x1)),col=c("Red","Red","Green"))

Reject the Null Hypothesis in favor of the alternative.

test2=t.test(x1,mu=23)
ci2=test2$conf.int
ci2

## [1] 23.30198 27.27320
## attr(,"conf.level")
## [1] 0.95

pval2=test2$p.value
pval2

## [1] 0.02542929

boxplot(x1, main="Sample x1")
abline(h=c(ci2,mean(x1)),col=c("Red","Red","Green"))

Reject the Null Hypothesis in favor of the alternative.

test3=t.test(x1,mu=24)
ci3=test3$conf.int
ci3

## [1] 23.30198 27.27320
## attr(,"conf.level")
## [1] 0.95

pval3=test3$p.value
pval3

## [1] 0.1951074

boxplot(x1, main="Sample x1")
abline(h=c(ci3,mean(x1)),col=c("Red","Red","Green"))

test4=t.test(x1,mu=25)
ci4=test4$conf.int
ci4

## [1] 23.30198 27.27320
## attr(,"conf.level")
## [1] 0.95

pval4=test4$p.value
pval4

## [1] 0.7691681

boxplot(x1, main="Sample x1")
abline(h=c(ci4,mean(x1)),col=c("Red","Red","Green"))

test5=t.test(x1,mu=26)
ci5=test5$conf.int
ci5

## [1] 23.30198 27.27320
## attr(,"conf.level")
## [1] 0.95

pval5=test5$p.value
pval5

## [1] 0.468963

boxplot(x1, main="Sample x1")
abline(h=c(ci5,mean(x1)),col=c("Red","Red","Green"))

Do not reject the Null on the last three.

tcalc=(mean(x1)-24)/(sd(x1)/sqrt(30))
tcalc

## [1] 1.326252

mypvalue=function(t0,xmax=4,n=20, alpha=0.05){
#calculate alpha/2
va=round(pt(-t0,df=n-1),4)
pv=2*va

# plot the t dist
curve(dt(x,df=n-1),xlim=c(-xmax,xmax),ylab="T Density",xlab=expression(t),
main=substitute(paste("P-value=", pv, " alpha=", alpha)))


# set up points on the polygon to the right
xcurve=seq(t0,xmax,length=1000)
ycurve=dt(xcurve,df=n-1)

# set up points to the left
xlcurve=seq(-t0,-xmax,length=1000)
ylcurve=dt(xcurve,df=n-1)

# Shade in the polygon defined by the line segments
polygon(c(t0,xcurve,xmax),c(0,ycurve,0),col="green")
polygon(c(-t0,xlcurve,-xmax),c(0,ylcurve,0),col="green")

# make quantiles
q=qt(1-alpha/2,n-1)
abline( v=c(q,-q),lwd=2) # plot the cut off t value 
axis(3,c(q,-q),c(expression(abs(t[alpha/2])),expression(-abs(t[alpha/2]))))


# Annotation
text(0.5*(t0+xmax),max(ycurve),substitute(paste(area, "=",va)))
text(-0.5*(t0+xmax),max(ycurve),expression(area))

return(list(q=q,pvalue=pv))
}
##### END of FUNCTION for P VALUES    #####

mypvalue(tcalc,n=30,alpha=0.05)

## $q
## [1] 2.04523
## 
## $pvalue
## [1] 0.1952

t.test(x1,mu=23)

## 
##  One Sample t-test
## 
## data:  x1
## t = 2.3563, df = 29, p-value = 0.02543
## alternative hypothesis: true mean is not equal to 23
## 95 percent confidence interval:
##  23.30198 27.27320
## sample estimates:
## mean of x 
##  25.28759

The rejection region is below -2.04523 and above 2.04523. We should reject \(H_0\) if our P value goes below \(0.05\). No, \(t_{calc}\) is not within the rejection region.

bootpval<-function(x,conf.level=0.95,iter=3000,mu0=0, test="two"){
n=length(x)
y=x-mean(x)+mu0  # transform the data so that it is centered at the NULL
rs.mat<-c()    #rs.mat will become a resample matrix -- now it is an empty vector
xrs.mat<-c()
for(i in 1:iter){ # for loop - the loop will go around iter times
rs.mat<-cbind(rs.mat,sample(y,n,replace=TRUE)) #sampling from y cbind -- column bind -- binds the vectors together by columns
xrs.mat<-cbind(xrs.mat,sample(x,n,replace=TRUE)) #sampling from x cbind -- column bind -- binds the vectors together by columns

}

tstat<-function(z){ # The value of t when the NULL is assumed true (xbar-muo)/z/sqrt(n)
sqrt(n)*(mean(z)-mu0)/sd(z)
}

tcalc=tstat(x) # t for the data collected
ytstat=apply(rs.mat,2,tstat) # tstat of resampled y's, ytstat is a vector and will have iter values in it
xstat=apply(xrs.mat,2,mean)  # mean of resampled x's
alpha=1-conf.level # calculating alpha
ci=quantile(xstat,c(alpha/2,1-alpha/2))# Nice way to form a confidence interval
pvalue=ifelse(test=="two",length(ytstat[ytstat>abs(tcalc) | ytstat < -abs(tcalc)])/iter,
ifelse(test=="upper",length(ytstat[ytstat>tcalc])/iter,
length(ytstat[ytstat<xstat])/iter))

h=hist(ytstat,plot=FALSE)
mid=h$mid
if(test=="two"){
ncoll=length(mid[mid<= -abs(tcalc)])
ncolr=length(mid[mid>=  abs(tcalc)])
col=c(rep("Green",ncoll),rep("Gray",length(mid)-ncoll-ncolr),rep("Green",ncolr))
}
if(test=="upper"){
ncolr=length(mid[mid>=  abs(tcalc)])
col=c(rep("Gray",length(mid)-ncolr),rep("Green",ncolr))
}

if(test=="lower"){
ncoll=length(mid[mid<=  -abs(tcalc)])
col=c(rep("Green",ncoll),rep("Gray",length(mid)-ncoll))
}
hist(ytstat,col=col,freq=FALSE,las=1,main="",xlab=expression(T[stat]))
#segments(ci[1],0,ci[2],0,lwd=2)
pround=round(pvalue,4)
title(substitute(paste(P[value],"=",pround)))
return(list(pvalue=pvalue,tcalc=tcalc,n=n,x=x,test=test,ci=ci))
}


bootpval(x=x1,mu0=22,test="two")

## $pvalue
## [1] 0.002666667
## 
## $tcalc
## [1] 3.386302
## 
## $n
## [1] 30
## 
## $x
##  [1] 25.60070 15.93812 25.75791 19.40389 25.00954 30.94259 22.47328 24.50383
##  [9] 26.52677 25.99205 24.75545 20.78383 14.62365 23.19618 21.81155 23.16861
## [17] 36.77682 30.46689 26.42921 29.96829 17.40364 32.48559 29.09808 30.33025
## [25] 28.66878 29.80172 21.53910 32.02999 16.83231 26.30915
## 
## $test
## [1] "two"
## 
## $ci
##     2.5%    97.5% 
## 23.43182 27.12862

bootpval(x=x1,mu0=23,test="two")

## $pvalue
## [1] 0.02233333
## 
## $tcalc
## [1] 2.356277
## 
## $n
## [1] 30
## 
## $x
##  [1] 25.60070 15.93812 25.75791 19.40389 25.00954 30.94259 22.47328 24.50383
##  [9] 26.52677 25.99205 24.75545 20.78383 14.62365 23.19618 21.81155 23.16861
## [17] 36.77682 30.46689 26.42921 29.96829 17.40364 32.48559 29.09808 30.33025
## [25] 28.66878 29.80172 21.53910 32.02999 16.83231 26.30915
## 
## $test
## [1] "two"
## 
## $ci
##     2.5%    97.5% 
## 23.37327 27.18672

bootpval(x=x1,mu0=24,test="two")

## $pvalue
## [1] 0.19
## 
## $tcalc
## [1] 1.326252
## 
## $n
## [1] 30
## 
## $x
##  [1] 25.60070 15.93812 25.75791 19.40389 25.00954 30.94259 22.47328 24.50383
##  [9] 26.52677 25.99205 24.75545 20.78383 14.62365 23.19618 21.81155 23.16861
## [17] 36.77682 30.46689 26.42921 29.96829 17.40364 32.48559 29.09808 30.33025
## [25] 28.66878 29.80172 21.53910 32.02999 16.83231 26.30915
## 
## $test
## [1] "two"
## 
## $ci
##     2.5%    97.5% 
## 23.44065 27.12708

bootpval(x=x1,mu0=25,test="two")

## $pvalue
## [1] 0.7696667
## 
## $tcalc
## [1] 0.2962266
## 
## $n
## [1] 30
## 
## $x
##  [1] 25.60070 15.93812 25.75791 19.40389 25.00954 30.94259 22.47328 24.50383
##  [9] 26.52677 25.99205 24.75545 20.78383 14.62365 23.19618 21.81155 23.16861
## [17] 36.77682 30.46689 26.42921 29.96829 17.40364 32.48559 29.09808 30.33025
## [25] 28.66878 29.80172 21.53910 32.02999 16.83231 26.30915
## 
## $test
## [1] "two"
## 
## $ci
##     2.5%    97.5% 
## 23.42376 27.18891

bootpval(x=x1,mu0=26,test="two")

## $pvalue
## [1] 0.457
## 
## $tcalc
## [1] -0.7337985
## 
## $n
## [1] 30
## 
## $x
##  [1] 25.60070 15.93812 25.75791 19.40389 25.00954 30.94259 22.47328 24.50383
##  [9] 26.52677 25.99205 24.75545 20.78383 14.62365 23.19618 21.81155 23.16861
## [17] 36.77682 30.46689 26.42921 29.96829 17.40364 32.48559 29.09808 30.33025
## [25] 28.66878 29.80172 21.53910 32.02999 16.83231 26.30915
## 
## $test
## [1] "two"
## 
## $ci
##     2.5%    97.5% 
## 23.41850 27.13104

These agree with the previous result that we should reject \(\mu=22\) and \(\mu=23\) and accept \(\mu=24\), \(\mu=25\), and \(\mu=26\).

Task 3

set.seed(30);x=rnorm(15,mean=10,sd=7)   
set.seed(40);y=rnorm(20,mean=12,sd=4)
var.test(x,y)

## 
##  F test to compare two variances
## 
## data:  x and y
## F = 3.3363, num df = 14, denom df = 19, p-value = 0.01596
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  1.260446 9.544255
## sample estimates:
## ratio of variances 
##            3.33631

I conclude that the standard deviation of x is about 3 times larger than the variance of y. Thus, I will put var.equal=FALSE in t.test().

t.test(y,x,var.eqaul=FALSE,mu=0)

## 
##  Welch Two Sample t-test
## 
## data:  y and x
## t = 1.7945, df = 20.248, p-value = 0.08768
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.5950525  7.9632156
## sample estimates:
## mean of x mean of y 
## 11.674425  7.990343

t.test(y,x,var.eqaul=FALSE,mu=2)

## 
##  Welch Two Sample t-test
## 
## data:  y and x
## t = 0.8203, df = 20.248, p-value = 0.4216
## alternative hypothesis: true difference in means is not equal to 2
## 95 percent confidence interval:
##  -0.5950525  7.9632156
## sample estimates:
## mean of x mean of y 
## 11.674425  7.990343

According to the results of the tests we should accept the Null Hypothesis of both of them because our null value lies with the confidence interval and both P values are larger than \(\alpha\).

Task 4

set.seed(30);x=rnorm(15,mean=10,sd=4)   
set.seed(40);y=rnorm(20,mean=12,sd=4)
var.test(x,y)

## 
##  F test to compare two variances
## 
## data:  x and y
## F = 1.0894, num df = 14, denom df = 19, p-value = 0.8454
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.4115743 3.1164915
## sample estimates:
## ratio of variances 
##           1.089407

t.test(y,x,var.equal=TRUE,mu=0)

## 
##  Two Sample t-test
## 
## data:  y and x
## t = 2.0623, df = 33, p-value = 0.04712
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.03803598 5.60756393
## sample estimates:
## mean of x mean of y 
## 11.674425  8.851625

t.test(y,x,var.equal=TRUE,mu=2)

## 
##  Two Sample t-test
## 
## data:  y and x
## t = 0.60113, df = 33, p-value = 0.5519
## alternative hypothesis: true difference in means is not equal to 2
## 95 percent confidence interval:
##  0.03803598 5.60756393
## sample estimates:
## mean of x mean of y 
## 11.674425  8.851625

According to the tests we should reject the Null Hypothesis that the difference is zero. But we should accept the Null Hypothesis that the difference is 2.

Task 5

boot2pval<-function(x1,x2,conf.level=0.95,iter=3000,mudiff=0, test="two"){
n1=length(x1)
n2=length(x2)
y1=x1-mean(x1)+mean(c(x1,x2))  # transform the data so that it is centered at the NULL
y2=x2-mean(x2)+mean(c(x1,x2))
y1rs.mat<-c()    #rs.mat will be come a resample matrix -- now it is an empty vector
x1rs.mat<-c()
y2rs.mat<-c()
x2rs.mat<-c()
for(i in 1:iter){ # for loop - the loop will go around iter times
y1rs.mat<-cbind(y1rs.mat,sample(y1,n1,replace=TRUE)) #sampling from y cbind -- column bind -- binds the vectors together by columns
y2rs.mat<-cbind(y2rs.mat,sample(y2,n2,replace=TRUE))

}
x1rs.mat<-y1rs.mat+mean(x1)-mean(c(x1,x2))
x2rs.mat<-y2rs.mat+mean(x2)-mean(c(x1,x2))

xbar1=mean(x1)
xbar2=mean(x2)
sx1sq=var(x1)
sx2sq=var(x2)

tcalc=(xbar1-xbar2-mudiff)/sqrt(sx1sq/n1+sx2sq/n2)

sy1sq=apply(y1rs.mat,2,var)
sy2sq=apply(y2rs.mat,2,var) 
y1bar=apply(y1rs.mat,2,mean)
y2bar=apply(y2rs.mat,2,mean)

tstat=(y1bar-y2bar-mudiff)/sqrt(sy1sq/n1+sy2sq/n2)


alpha=1-conf.level # calculating alpha
#ci=quantile(xstat,c(alpha/2,1-alpha/2))# Nice way to form a confidence interval
pvalue=ifelse(test=="two",length(tstat[tstat>abs(tcalc) | tstat < -abs(tcalc)])/iter,
ifelse(test=="upper",length(tstat[tstat>tcalc])/iter,
length(ytstat[tstat<tcalc])/iter))

h=hist(tstat,plot=FALSE)
mid=h$mid
if(test=="two"){
ncoll=length(mid[mid<= -abs(tcalc)])
ncolr=length(mid[mid>=  abs(tcalc)])
col=c(rep("Green",ncoll),rep("Gray",length(mid)-ncoll-ncolr),rep("Green",ncolr))
}
hist(tstat,col=col,freq=FALSE)
#segments(ci[1],0,ci[2],0,lwd=2)

return(list(pvalue=pvalue))
#return(list(pvalue=pvalue,tcalc=tcalc,n=n,x=x,test=test,ci=ci))
}

set.seed(30);x=rnorm(15,mean=10,sd=7)   
set.seed(40);y=rnorm(20,mean=12,sd=4)

boot2pval(x1=y,x2=x,mudiff=0)

## $pvalue
## [1] 0.08366667

boot2pval(x1=y,x2=x,mudiff=2)

## $pvalue
## [1] 0.5896667

Task 6

set.seed(30);x=rnorm(15,mean=10,sd=4)   
set.seed(40);y=rnorm(20,mean=12,sd=4)

boot2pval(x1=y,x2=x,mudiff=0)

## $pvalue
## [1] 0.048

boot2pval(x1=y,x2=x,mudiff=2)

## $pvalue
## [1] 0.8336667

Task 7

Line A is running a T test on the data named x1 with the null hypothesis that \(\mu=23\).

Line B is the title of the results of the T-test, and it says that this test was done on one sample.

Line C contains the \(T_{calc}\) value, degrees of freedom, and the p-value. Calculated from the T test.

Line D says what the alternate hypothesis is for this test.

Line E is the confidence of the interval that it is calculating, while line F gives the confidence interval.

Line G gives an estimate of the mean of x1.

Task 8

set.seed(55);x1=rnorm(30,mean=25,sd=5)
MATH4753GRAY::bootpval(x1,mu=25)

## $pvalue
## [1] 0.7716667
## 
## $tcalc
## [1] 0.2962266
## 
## $n
## [1] 30
## 
## $x
##  [1] 25.60070 15.93812 25.75791 19.40389 25.00954 30.94259 22.47328 24.50383
##  [9] 26.52677 25.99205 24.75545 20.78383 14.62365 23.19618 21.81155 23.16861
## [17] 36.77682 30.46689 26.42921 29.96829 17.40364 32.48559 29.09808 30.33025
## [25] 28.66878 29.80172 21.53910 32.02999 16.83231 26.30915
## 
## $test
## [1] "two"
## 
## $ci
##     2.5%    97.5% 
## 23.43182 27.12862