We write a function which takes in two arguments (numbers here), runs some computations (basic inequality) on them and prints an output based on the computation result -
# Decision rule for a hypothesis test: compare a p-value with the
# significance level and print the verdict.
#   p     - p-value of the test
#   alpha - significance level
# Prints (and invisibly returns) "REJECT Ho" when p < alpha, otherwise
# "FAIL 2 REJECT".
myp <- function(p, alpha) {
  verdict <- if (p < alpha) "REJECT Ho" else "FAIL 2 REJECT"
  print(verdict)
}
Test our function to make sure it is performing as intended -
myp(p = .01, alpha = .05) # p is less than alpha
## [1] "REJECT Ho"
myp(p = .1, alpha = .05) # p is greater than alpha
## [1] "FAIL 2 REJECT"
Now, let's write a slightly more complex function (it takes many arguments) that shades the standard normal distribution, by default for a 5% two-sided hypothesis test, and can be adapted for other purposes too. You can change the arguments mu, sig, pcts, color,…
# Draw a normal density curve and hatch-shade regions under it.
#
# Arguments:
#   below, above - x values bounding the lower and upper shaded tails; when
#                  NULL (and `between` is NULL) they default to the normal
#                  quantiles at pcts[1] and pcts[2]
#   pcts         - probabilities that give the default tail cut points
#   mu, sig      - mean and standard deviation of the distribution
#   numpts       - number of grid points on [mu - 4*sig, mu + 4*sig]
#   color, dens  - passed to polygon() as `col` and `density`
#   justabove    - TRUE skips the lower-tail shading
#   justbelow    - TRUE skips the upper-tail shading
#   lines        - TRUE adds the curve to the current plot instead of
#                  starting a new one
#   between      - if given, the area between min(between) and max(between)
#                  is shaded
#   outside      - if given, min(outside)/max(outside) replace below/above
shadenorm <- function(below=NULL, above=NULL, pcts = c(0.025,0.975), mu=0, sig=1, numpts = 500, color = "gray", dens = 40, justabove= FALSE, justbelow = FALSE, lines=FALSE,between=NULL,outside=NULL){
  # Tail cut points are only resolved when no `between` region is requested.
  if (is.null(between)) {
    below <- ifelse(is.null(below), qnorm(pcts[1], mu, sig), below)
    above <- ifelse(is.null(above), qnorm(pcts[2], mu, sig), above)
  }
  # An explicit `outside` range overrides both cut points.
  if (!is.null(outside)) {
    below <- min(outside)
    above <- max(outside)
  }

  # Evaluate the density on a grid spanning four standard deviations each way.
  grid_x <- seq(mu - 4 * sig, mu + 4 * sig, length.out = numpts)
  grid_y <- dnorm(grid_x, mean = mu, sd = sig)

  if (lines) {
    lines(grid_x, grid_y)
  } else {
    plot(grid_x, grid_y, type = "l", xlab = "X", ylab = "Density")
  }

  # Hatch the area between the x axis and the curve over the selected points.
  shade_region <- function(keep) {
    xs <- grid_x[keep]
    ys <- grid_y[keep]
    polygon(c(xs, rev(xs)), c(rep(0, length(xs)), rev(ys)),
            col = color, density = dens)
  }

  if (!justabove) {
    shade_region(grid_x < below)
  }
  if (!justbelow) {
    shade_region(grid_x > above)
  }
  if (!is.null(between)) {
    shade_region(grid_x > min(between) & grid_x < max(between))
  }
}
# TEST THE FUNCTION
shadenorm(mu = 0, sig = 1, pcts = c(0.025,0.975))
# shadenorm(mu = 20, sig = 6, pcts = c(0.025,0.975))
# Draw a Student t density curve on [-4, 4] and hatch-shade regions under it.
#
# Arguments:
#   below, above - x values bounding the lower and upper shaded tails; when
#                  NULL (and `between` is NULL) they default to the t
#                  quantiles at pcts[1] and pcts[2]
#   pcts         - probabilities that give the default tail cut points
#   df           - degrees of freedom of the t distribution
#   numpts       - number of grid points between -4 and 4
#   color, dens  - passed to polygon() as `col` and `density`
#   justabove    - TRUE skips the lower-tail shading
#   justbelow    - TRUE skips the upper-tail shading
#   lines        - TRUE adds the curve to the current plot instead of
#                  starting a new one
#   between      - if given, the area between min(between) and max(between)
#                  is shaded
#   outside      - if given, min(outside)/max(outside) replace below/above
shadet <- function(below=NULL, above=NULL, pcts = c(0.025,0.975), df=1, numpts = 500, color = "gray", dens = 40, justabove= FALSE, justbelow = FALSE, lines=FALSE,between=NULL,outside=NULL){
  # Tail cut points are only resolved when no `between` region is requested.
  if (is.null(between)) {
    below <- ifelse(is.null(below), qt(pcts[1], df), below)
    above <- ifelse(is.null(above), qt(pcts[2], df), above)
  }
  # An explicit `outside` range overrides both cut points.
  if (!is.null(outside)) {
    below <- min(outside)
    above <- max(outside)
  }

  # The plotting window is fixed at [-4, 4] regardless of df.
  grid_x <- seq(-4, 4, length.out = numpts)
  grid_y <- dt(grid_x, df)

  if (lines) {
    lines(grid_x, grid_y)
  } else {
    plot(grid_x, grid_y, type = "l", xlab = "X", ylab = "Density")
  }

  # Hatch the area between the x axis and the curve over the selected points.
  shade_region <- function(keep) {
    xs <- grid_x[keep]
    ys <- grid_y[keep]
    polygon(c(xs, rev(xs)), c(rep(0, length(xs)), rev(ys)),
            col = color, density = dens)
  }

  if (!justabove) {
    shade_region(grid_x < below)
  }
  if (!justbelow) {
    shade_region(grid_x > above)
  }
  if (!is.null(between)) {
    shade_region(grid_x > min(between) & grid_x < max(between))
  }
}
# TEST THE FUNCTION
shadet(df = 4, pcts = c(0.025,0.975)) # see the area under the tails are further away from the mean 0..
# shadet(df = 120, pcts = c(0.025,0.975)) # t dist converges to normal when we have high degrees of freedom..
Inference
Using traditional methods, it takes 109 hours to receive a basic driving license. A new license training method using Computer Aided Instruction (CAI) has been proposed. A researcher used the technique with 190 students and observed that they had a mean of 110 hours. Assume the standard deviation is known to be 6. A level of significance of 0.05 will be used to determine if the technique performs differently than the traditional method. Make a decision to reject or fail to reject the null hypothesis. Show all work in R. ### i. Null and Alternative Hypothesis Ho: \(\mu=109\), Ha: \(\mu \neq 109\) ## Set up Hypothesis ## Ho: the technique does not performs differently than the traditional method. ## Ha:the technique performs differently than the traditional method. x<-110 x1<-109 # t value
t<-(110-109)/6 t Z<-pnorm(t)
n<-190
sd<-6 ## Degree of freedom n-1. How many independent pieces of information
df<-n-1 df
se<-sd/sqrt(n) se
pi = 2
\(pi=2\)
\(\pi=2\) ## Decide Alpha \(\alpha=.05\) alpha<-0.05
pvalue<-0.025+0.95 pvalue
# this is my calculation
x<-110
x1<-109
n<-190
sd<-6
df<-n-1
df
## [1] 189
# Z= (value – mean)/ (Standard Deviation)
Ztesttval<-(x-x1)/(sd/sqrt(n))
Ztesttval
## [1] 2.297341
pval<-pnorm(Ztesttval)
pval
## [1] 0.9892004
pvalue<-2*(1-pval)
pvalue
## [1] 0.0215993
alpha<-0.05
alpha
## [1] 0.05
# compute critical value (split alpha since we have double sided hypothesis)
?pnorm
## starting httpd help server ... done
critical_value <- qnorm(p = .975,
mean = 0,
sd = 1
)
critical_value
## [1] 1.959964
#find Z critical value. Another way to find critical value
Zcriric<-qnorm(p=.05/2, lower.tail=FALSE)
Zcriric
## [1] 1.959964
tdf<-qt((p=alpha/2),df,lower.tail=FALSE)
tdf
## [1] 1.972595
Finding the p value using a t distribution is similar to using the Z-score. The difference is that you have to specify the number of degrees of freedom.
# One-sample t test of Ho: mu = 109 against Ha: mu != 109.
sd<-6     # standard deviation given in the problem
x1<-109   # hypothesized mean under Ho
n<-190    # sample size
x<-110    # observed sample mean
# Test statistic: (sample mean - hypothesized mean) / standard error.
t<-(x-x1)/(sd/sqrt(n))
t
## [1] 2.297341
# Two-sided p-value = twice the tail area beyond |t|. Doubling the lower-tail
# CDF pt(t, df) instead returns 1.977305, which is not a valid probability.
tdis<-2*pt(abs(t),df=n-1,lower.tail=FALSE)
tdis
## [1] 0.022695
Since the test statistic is more extreme than the critical value here, or alternatively the test statistic lies in the rejection region, we will reject the null.
# shades significance level gates
shadenorm( mu = 109,
sig = 6/sqrt(190),
pcts = c(0.025,0.975),
color = "orange")
# mark point estimate from sample
lines(x = rep(x = 110,10),
y = seq(from = 0,
to = 1,
length.out=10),
col ='purple')
Alpha, the significance level, is the probability that you will make the mistake of rejecting the null hypothesis when in fact it is true. The p-value measures the probability, assuming the null hypothesis is true, of getting a value at least as extreme as the one you got from the experiment. If the p-value is greater than alpha, you fail to reject the null hypothesis.
# p value associated with the two sided hypothesis test
pval<-pnorm(Ztesttval)
pval
## [1] 0.9892004
pvalue<-2*(1-pval)
pvalue
## [1] 0.0215993
alpha<-0.05
alpha
## [1] 0.05
### Result: A p-value less than 0.05 (typically ≤ 0.05) is statistically significant. the p value is smaller than alpha, reject the null.
*Since the pvalue is less than alpha, we reject the null hypothesis - unlikely to see the sample mean we saw if the null is true. **
We could also use our function to reach the same conclusion (it operationalizes the reject/do not reject rule by comparing the p value with alpha, the level of significance) -
myp(pvalue, alpha)
## [1] "REJECT Ho"
?qnorm
qnorm(p = Ztesttval)
## Warning in qnorm(p = Ztesttval): NaNs produced
## [1] NaN
# shades p-values here
shadenorm( mu = 109,
sig = 6/sqrt(190),
pcts = c(1-pnorm(Ztesttval),pnorm(Ztesttval)),
color = "green")
# mark point estimate from sample
lines(x = rep(x = 110,10),
y = seq(from = 0,
to = 1,
length.out=10),
col ='blue')
rnorm(n, # Number of observations to be generated mean = 110, # Integer or vector of means sd = 6) # Integer or vector of standard deviations Simulated p value could have resulted in the same conclusion. NOT REQUIRED/WILL NOT BE TESTED. The idea below is based on the assumption
# NOT REQUIRED - Simulated p value gives us pretty much the same result...
temp = rnorm(n = 10000,
mean = 110,
sd = 6/sqrt(190)) # 100000 obs generated from mean 110 and sd 6/sqrt(190)
p_value_sim <- 2 * length(temp[temp<=109]) / length(temp)
p_value_sim
## [1] 0.0186
CI give us the same conclusion, as the hypothesized population parameter is not within the 95% CI constructed from sample. ** confidence interval is an interval that contains the population parameter with probability 1−α . Calculate the mean Calculate the standard error of the mean Find the t-score that corresponds to the confidence level Calculate the margin of error and construct the confidence interval
alpha <-0.05
n<-190
df<-n - 1
tdf<-qt((p=alpha/2),df,lower.tail=FALSE)
tdf
## [1] 1.972595
x <- 110
Se <- 6/sqrt(190)
z <- qnorm(p = .975)
interval = c( x - z * Se, x + z * Se)
interval
## [1] 109.1469 110.8531
upper<- x + (tdf*Se)
upper
## [1] 110.8586
lower<- x - (tdf*Se)
lower
## [1] 109.1414
##Conclusion : Since 109 falls outside of the interval, we can reject the null hypothesis and assume that the technique performs differently than the traditional method.
Our environment is very sensitive to the amount of ozone in the upper
atmosphere.
The level of ozone normally found is 5.3 parts/million (ppm). A
researcher believes that the current ozone level is at an insufficient
level. The mean of 5 samples is 5.0 ppm with a standard deviation of
1.1. Does the data support the claim at the 0.05 level?
Assume the population distribution is approximately normal.
t<-(5.3-5)/1.1 t Z<-pnorm(t)
n<-5
sd<-1.1
df<-n-1 df
se<-sd/sqrt(n) se
pi = 2
\(pi=2\)
\(\pi=2\) ## Decide Alpha \(\alpha=.05\) alpha<-0.05
pvalue<-0.025+0.95 pvalue
x<-5.3
x1<-5
n<-5
sd<-1.1
df<-n-1
df
## [1] 4
# Z= (value – mean)/ (Standard Deviation)
Ztesttval2<-(x-x1)/(sd/sqrt(n))
Ztesttval2
## [1] 0.6098367
pval2<-pnorm(Ztesttval2)
pval2
## [1] 0.729015
pvalue<-2*(1-pval2)
pvalue
## [1] 0.54197
alpha<-0.05
alpha
## [1] 0.05
# compute critical value (split alpha since we have double sided hypothesis)
?pnorm
critical_value2 <- qnorm(p = .975,
mean = 0,
sd = 1
)
critical_value2
## [1] 1.959964
#find Z critical value. Another way to find critical value
Zcriric2<-qnorm(p=.05/2, lower.tail=FALSE)
Zcriric2
## [1] 1.959964
tdf2<-qt((p=alpha/2),df,lower.tail=FALSE)
tdf2
## [1] 2.776445
## p value using a t distribution Finding the p value using a t distribution and similar to using the Z-score. The difference is that you have to specify the number of degrees of freedom.
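** In this case the Z distribution does not match the t distribution closely: with only n = 5 observations (4 degrees of freedom) the t critical value is 2.776 versus 1.960 for Z, so the t-based result should be used.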
# One-sample t statistic and two-sided p-value for the ozone sample (n = 5).
# NOTE(review): the problem states a normal level of 5.3 and a sample mean of
# 5.0, so here x holds the hypothesized value and x1 the sample mean; the sign
# of t does not affect the two-sided p-value below.
sd<-1.1   # sample standard deviation
x1<-5     # sample mean
n<-5      # sample size
x<-5.3    # hypothesized ozone level
t<-(x-x1)/(sd/sqrt(n))
t
## [1] 0.6098367
# Two-sided p-value = twice the tail area beyond |t|. Doubling the lower-tail
# CDF pt(t, df) instead returns 1.425086, which is not a valid probability.
tdis2<-2*pt(abs(t),df=n-1,lower.tail=FALSE)
tdis2
## [1] 0.574914
Since the test statistic (0.61) is less extreme than the critical value (2.776 for a t distribution with 4 degrees of freedom), the test statistic does not lie in the rejection region, so we fail to reject the null.
# shades significance level gates
shadenorm( mu = 5,
sig = 1.1/sqrt(5),
pcts = c(0.025,0.975),
color = "orange")
# mark point estimate from sample
lines(x = rep(x = 5.3,10),
y = seq(from = 0,
to = 1,
length.out=10),
col ='purple')
Alpha, the significance level, is the probability that you will make the mistake of rejecting the null hypothesis when in fact it is true. The p-value measures the probability, assuming the null hypothesis is true, of getting a value at least as extreme as the one you got from the experiment. If the p-value is greater than alpha, you fail to reject the null hypothesis.
** The smaller the p-value, the stronger the evidence that you should reject the null hypothesis. ** A p-value less than 0.05 (typically ≤ 0.05) is statistically significant. … ** A p-value higher than 0.05 (> 0.05) is not statistically significant: the data do not provide enough evidence to reject the null hypothesis (this is not the same as evidence that the null hypothesis is true).
# p value associated with the two sided hypothesis test.
# Uses this question's statistic (Ztesttval2). Ztesttval and pval still hold
# the previous question's values at this point and produced 0.0215993 and
# "REJECT Ho" here.
pval2<-pnorm(Ztesttval2)
pval2
## [1] 0.729015
pvalue<-2*(1-pval2)
pvalue
## [1] 0.54197
alpha<-0.05
alpha
## [1] 0.05
## Result: A p-value greater than 0.05 is not statistically significant. The p value is larger than alpha, so we fail to reject the null.
We could also use our function to reach the same conclusion (it operationalizes the reject/do not reject rule by comparing the p value with alpha, the level of significance) -
myp(pvalue, alpha)
## [1] "FAIL 2 REJECT"
?qnorm
qnorm(p = Ztesttval2)
## [1] 0.2788935
# shades p-values here
shadenorm( mu = 5,
sig = 1.1/sqrt(5),
pcts = c(1-pnorm(Ztesttval2),pnorm(Ztesttval2)),
color = "maroon")
# mark point estimate from sample
lines(x = rep(x = 5.3,10),
y = seq(from = 0,
to = 1,
length.out=10),
col ='yellow')
CI give us the same conclusion, as the hypothesized population parameter is not within the 95% CI constructed from sample. ** confidence interval is an interval that contains the population parameter with probability 1−α . Calculate the mean Calculate the standard error of the mean Find the t-score that corresponds to the confidence level Calculate the margin of error and construct the confidence interval
alpha <-0.05
n<-5
df<-n - 1
tdf<-qt((p=alpha/2),df,lower.tail=FALSE)
tdf
## [1] 2.776445
x <- 5.3
Se <- 1.1/sqrt(5)
z <- qnorm(p = .975)
interval = c( x - z * Se, x + z * Se)
interval
## [1] 4.335825 6.264175
upper<- x + (tdf*Se)
upper
## [1] 6.66583
lower<- x - (tdf*Se)
lower
## [1] 3.93417
Q3 # 3. Our environment is very sensitive to the amount of ozone in
the upper atmosphere.
The level of ozone normally found is 7.3 parts/million (ppm). A
researcher believes that the current ozone level is not at a normal
level. The mean of 51 samples is 7.1 ppm with a variance of 0.49. Assume
the population is normally distributed. A level of significance of 0.01
will be used. Show all work and hypothesis testing steps.
### i. Null and Alternative Hypothesis Ho: \(\mu=7.3\), Ha: \(\mu \neq 7.3\) ## Set up Hypothesis ## Ho: The current ozone level is at the normal level of 7.3 ppm; the data do not support the researcher's claim. ## Ha: The current ozone level is not at the normal level; the data support the researcher's claim. (Normality of the population is an assumption of the test, not one of the hypotheses.) x<-7.3 x1<-7.1 # t value
t<-(7.3-7)/0.49 t Z<-pnorm(t)
n<-51
sd<-0.49 ## Degree of freedom n-1. How many independent pieces of information
df<-n-1 df
se<-sd/sqrt(n) se
pi = 2
\(pi=2\)
\(\pi=2\) ## Decide Alpha \(\alpha=0.01\) alpha<-0.01
pvalue<-0.025+0.95 pvalue
x<-7.3
x1<-7.1
n<-51
sd<-0.49
df<-n-1
df
## [1] 50
# Z= (value – mean)/ (Standard Deviation)
Ztesttval<-(x-x1)/(sd/sqrt(n))
Ztesttval
## [1] 2.914869
pval<-pnorm(Ztesttval)
pval
## [1] 0.9982208
pvalue<-2*(1-pval)
pvalue
## [1] 0.003558382
alpha<-0.01
alpha
## [1] 0.01
# compute critical value (split alpha since we have double sided hypothesis)
?pnorm
critical_value <- qnorm(p = .975,
mean = 0,
sd = 1
)
critical_value
## [1] 1.959964
#find Z critical value. Another way to find critical value
Zcriric<-qnorm(p=0.01/2, lower.tail=FALSE)
Zcriric
## [1] 2.575829
tdf<-qt((p=alpha/2),df,lower.tail=FALSE)
tdf
## [1] 2.677793
## p value using a t distribution Finding the p value using a t distribution and similar to using the Z-score. The difference is that you have to specify the number of degrees of freedom.
# One-sample t statistic and two-sided p-value for the ozone sample (n = 51).
# NOTE(review): the problem statement gives a variance of 0.49, which would
# make the standard deviation sqrt(0.49) = 0.7; 0.49 is kept here to stay
# consistent with the surrounding calculations - confirm and fix throughout.
sd<-0.49
x1<-7.1   # sample mean
n<-51     # sample size
x<-7.3    # hypothesized ozone level
t<-(x-x1)/(sd/sqrt(n))
t
## [1] 2.914869
# Two-sided p-value = twice the tail area beyond |t|. Doubling the lower-tail
# CDF pt(t, df) instead returns 1.994688, which is not a valid probability.
tdis<-2*pt(abs(t),df=n-1,lower.tail=FALSE)
tdis
## [1] 0.005312
Since the test statistic is more extreme than the critical value here, or alternatively the test statistic lies in the rejection region, we will reject the null.
# shades significance level gates
shadenorm( mu = 7.1,
sig = 0.49/sqrt(51),
pcts = c(0.025,0.975),
color = "orange")
# mark point estimate from sample
lines(x = rep(x = 7.3,10),
y = seq(from = 0,
to = 1,
length.out=10),
col ='purple')
Alpha, the significance level, is the probability that you will make the
mistake of rejecting the null hypothesis when in fact it is true. The
p-value measures the probability, assuming the null hypothesis is true, of
getting a value at least as extreme as the one you got from the experiment.
If the p-value is greater than alpha, you fail to reject the null hypothesis.
# p value associated with the two sided hypothesis test
pval<-pnorm(Ztesttval)
pval
## [1] 0.9982208
pvalue<-2*(1-pval)
pvalue
## [1] 0.003558382
alpha<-0.01
alpha
## [1] 0.01
### Result: A p-value less than 0.05 (typically ≤ 0.05) is statistically significant. the p value is smaller than alpha, reject the null.
*Since the pvalue is less than alpha, we reject the null hypothesis - unlikely to see the sample mean we saw if the null is true. **
We could also use our function to reach the same conclusion (it operationalizes the reject/do not reject rule by comparing the p value with alpha, the level of significance) -
myp(pvalue, alpha)
## [1] "REJECT Ho"
?qnorm
qnorm(p = Ztesttval)
## Warning in qnorm(p = Ztesttval): NaNs produced
## [1] NaN
# shades p-values here
shadenorm( mu = 7.1,
sig = 0.49/sqrt(51),
pcts = c(1-pnorm(Ztesttval),pnorm(Ztesttval)),
color = "brown")
# mark point estimate from sample
lines(x = rep(x = 7.3,10),
y = seq(from = 0,
to = 1,
length.out=10),
col ='cyan')
rnorm(n, # Number of observations to be generated mean = 7.1, # Integer or vector of means sd = 0.49) # Integer or vector of standard deviations Simulated p value could have resulted in the same conclusion. NOT REQUIRED/WILL NOT BE TESTED. The idea below is based on the assumption
# NOT REQUIRED - Simulated p value gives us pretty much the same result...
temp = rnorm(n = 10000,
mean = 7.3,
sd = 0.49/sqrt(51)) # 100000 obs generated from mean 7.3 and sd 0.49/sqrt(51)
p_value_sim <- 2 * length(temp[temp<=7.1]) / length(temp)
p_value_sim
## [1] 0.003
CI give us the same conclusion, as the hypothesized population parameter is not within the 95% CI constructed from sample. ** confidence interval is an interval that contains the population parameter with probability 1−α . Calculate the mean Calculate the standard error of the mean Find the t-score that corresponds to the confidence level Calculate the margin of error and construct the confidence interval
alpha <-0.01
n<-51
df<-n - 1
tdf<-qt((p=alpha/2),df,lower.tail=FALSE)
tdf
## [1] 2.677793
x <- 7.1
Se <- 0.49/sqrt(51)
z <- qnorm(p = .975)
interval = c( x - z * Se, x + z * Se)
interval
## [1] 6.96552 7.23448
upper<- x + (tdf*Se)
upper
## [1] 7.283733
lower<- x - (tdf*Se)
lower
## [1] 6.916267
##Conclusion : With a 99% confidence interval (t-based), the ozone level is between 6.916267 and 7.283733 ppm. ## Because 7.3 is outside of that interval, we can reject the null hypothesis and assume that the data supports the researcher’s claim.
A publisher reports that 36% of their readers own a laptop. A
marketing executive wants to test the claim that the percentage is
actually less than the reported percentage. A random sample of 100 found
that 29% of the readers owned a laptop.
Is there sufficient evidence at the 0.02 level to support the
executive’s claim? Show all work and hypothesis testing steps. ## State
the null hypothesis H0 and alternative hypothesis Ha
\(H_a = \pi <.36\)
\(H_0 = \pi \geq.36\)
\(n=100\) n<-100
p<-.36
phat<-.29
q<-1-p
df<-n-1 df tstat=(phat-p)/Se tstat
alpha<-0.02
\(\alpha=.02\)
Se = sqrt( p * q / n) Se
Se<- sqrt( p * q / n) Se
ZScore=(phat-p)/Se ZScore
#pnorm(ZTest)
p_value<-pnorm(ZScore) p_value
# don't have sd, so finding Se using Z value
n<-100
p<-.36
phat<-.29
q<-1-p
Se = sqrt( p * q / n)
Se
## [1] 0.048
#p value and the zscore, also called the standard score,
tstat=(phat-p)/Se
tstat
## [1] -1.458333
# A p-value falls between 0 and 1 just like the likelihood of an event happening, it gives the probability of a null hypothesis
# Calculatr the CDF for z-score
p_value<-pt(tstat,df=99)
p_value
## [1] 0.07395698
p_value2<-pnorm(tstat)
p_value2
## [1] 0.07237434
Finding the p value using a t distribution and similar to using the Z-score. The difference is that you have to specify the number of degrees of freedom.
# t statistic for the sample proportion: (phat - p) / standard error.
# Dividing by p/sqrt(n) treated the hypothesized proportion as if it were a
# standard deviation and gave 1.944444.
t<-(phat-p)/Se
t
## [1] -1.458333
# Ha is one-sided (pi < .36), so the p-value is the lower-tail area only;
# 2*pt(t, df) on the old statistic gave 1.94532, which is not a probability.
tdis<-pt(t,df=n-1)
tdis
## [1] 0.07395698
# t-score that corresponds to the confidence level
alpha = 0.02
df = n - 1
t.score = qt(p=alpha/2, df,lower.tail=F)
print(t.score)
## [1] 2.364606
# Calculate the margin of error and construct the confidence interval
me <- t.score * Se
me
## [1] 0.1135011
#confidence interval is the mean +/- margin of error
lower <- p - me
upper <- p + me
print(c(lower,upper))
## [1] 0.2464989 0.4735011
#Since the p value (0.072) is larger than alpha (0.02), the test statistic does not lie in the rejection region and we fail to reject the null.**
Since the p value (0.072) is larger than alpha (0.02), the test statistic does not lie in the rejection region and we fail to reject the null.
# shades significance level gates
shadenorm( mu = .36,
sig =Se,
pcts = c(.05),
color = "orange")
# mark point estimate from sample
lines(x = rep(phat,10),
y = seq(0.20,
length.out=10),
col ='purple')
Alpha, the significance level, is the probability that you will make the mistake of rejecting the null hypothesis when in fact it is true. The p-value measures the probability, assuming the null hypothesis is true, of getting a value at least as extreme as the one you got from the experiment. If the p-value is greater than alpha, you fail to reject the null hypothesis.
# p value associated with the two sided hypothesis test
p_value2
## [1] 0.07237434
alpha<-0.02
alpha
## [1] 0.02
## A p-value higher than 0.05 (> 0.05) is not statistically significant and indicates strong evidence for the null hypothesis.
We could also use our function to reach the same conclusion (it operationalizes the reject/do not reject rule by comparing the p value with alpha, the level of significance) -
myp(p_value2, alpha)
## [1] "FAIL 2 REJECT"
?qnorm
#qnorm(p = tsat)
# shades p-values here
shadenorm( mu = .36,
sig = Se,
pcts = c(pnorm(tstat),1-pnorm(tstat)),
color = "green")
# mark point estimate from sample
lines(x = rep(p,10),
y = seq(from = 0,
to = 1,
length.out=10),
col ='blue')
CI give us the same conclusion, as the hypothesized population parameter is not within the 95% CI constructed from sample. ** confidence interval is an interval that contains the population parameter with probability 1−α . Calculate the mean Calculate the standard error of the mean Find the t-score that corresponds to the confidence level Calculate the margin of error and construct the confidence interval
# t-score that corresponds to the confidence level
alpha = 0.02
df = n - 1
t.score = qt(p=alpha/2, df,lower.tail=FALSE)
print(t.score)
## [1] 2.364606
# Calculate the margin of error and construct the confidence interval
me <- t.score * Se
me
## [1] 0.1135011
#confidence interval is the mean +/- margin of error
##interval
lower <- p - me
upper <- p + me
print(c(lower,upper))
## [1] 0.2464989 0.4735011
## The 98% interval around the reported proportion of 36 percent of readers owning a laptop runs from 24.64989 to 47.35011 percent.
## Because the sample proportion of 29 percent lies inside this interval, we cannot reject the null hypothesis. The data does not support the claim.
A hospital director is told that 31% of the treated patients are uninsured. The director wants to test the claim that the percentage of uninsured patients is less than the expected percentage. A sample of 380 patients found that 95 were uninsured. Make the decision to reject or fail to reject the null hypothesis at the 0.05 level. Show all work and hypothesis testing steps.
#State the null hypothesis H0 and alternative hypothesis Ha
n<-380
p<-.31
phat<-95/380 phat
q<-1-p
df<-n-1 df
alpha<-0.05
Se = sqrt( p * q / n) Se
Se<- sqrt( p * q / n) Se
ZScore=(phat-p)/Se ZScore
#p_value = pnorm(ZTest) #pnorm(ZTest)
p_value<-pnorm(ZScore) p_value
# don't have sd, so finding Se using Z value
n<-380
p<-.31
phat<-95/380
q<-1-p
Se = sqrt( p * q / n)
Se
## [1] 0.0237254
#p value and the zscore, also called the standard score,
tstat=(phat-p)/Se
tstat
## [1] -2.528935
# A p-value falls between 0 and 1 just like the likelihood of an event happening, it gives the probability of a null hypothesis
# Calculatr the CDF for z-score
p_value<-pnorm(tstat)
p_value
## [1] 0.005720462
Finding the p value using a t distribution and similar to using the Z-score. The difference is that you have to specify the number of degrees of freedom.
phat<-95/380
phat
## [1] 0.25
t<-(phat-p)/sqrt(p*q/(n))
t
## [1] -2.528935
# tdis<-pt(tstat,df=n-1)
# tdis
# t-score that corresponds to the confidence level
alpha = 0.05
df = n - 1
t.score = qt(p=alpha, df,lower.tail=TRUE)
print(t.score)
## [1] -1.648884
# Calculate the margin of error and construct the confidence interval
me <- t.score * Se
me
## [1] -0.03912044
#confidence interval is the mean +/- margin of error
lower <- phat + me
##upper <- p + me
print(c(lower,phat))
## [1] 0.2108796 0.2500000
#Because the hypothesized 31 percent lies above the interval printed above (21.08796 to 25 percent), we reject the null hypothesis.**
Since the test statistic is more extreme than the critical value here, or alternatively the test statistic lies in the rejection region, we will reject the null.
# shades significance level gates
shadenorm( mu = .31,
sig =Se,
pcts = c(.05),
color = "orange")
# mark point estimate from sample
lines(x = rep(phat,10),
y = seq(0.20,
length.out=10),
col ='red')
Alpha, the significance level, is the probability that you will make the mistake of rejecting the null hypothesis when in fact it is true. The p-value measures the probability, assuming the null hypothesis is true, of getting a value at least as extreme as the one you got from the experiment. If the p-value is greater than alpha, you fail to reject the null hypothesis.
# p value associated with the one sided (lower tail) hypothesis test.
# Ha says the proportion is below 31%, so the p value is the lower-tail area
# pnorm(tstat) already stored in p_value. 2*(1-p_value) gave 1.988559, which
# is not a probability and flipped the decision.
pvalue<-p_value
pvalue
## [1] 0.005720462
alpha<-0.05
alpha
## [1] 0.05
## A p-value lower than 0.05 (< 0.05) is statistically significant: the p value is smaller than alpha, so we reject the null hypothesis.
We could also use our function to reach the same conclusion (it operationalizes the reject/do not reject rule by comparing the p value with alpha, the level of significance) -
myp(pvalue, alpha)
## [1] "REJECT Ho"
?qnorm
qnorm(p = tstat)
## Warning in qnorm(p = tstat): NaNs produced
## [1] NaN
# shades p-values here
shadenorm( mu = .31,
sig = Se,
pcts = c(1-pnorm(tstat),pnorm(tstat)),
color = "green")
# mark point estimate from sample
lines(x = rep(p,10),
y = seq(0.20,
length.out=10),
col ='blue')
CI give us the same conclusion, as the hypothesized population parameter is not within the 95% CI constructed from sample. ** confidence interval is an interval that contains the population parameter with probability 1−α . Calculate the mean Calculate the standard error of the mean Find the t-score that corresponds to the confidence level Calculate the margin of error and construct the confidence interval
# t-score that corresponds to the confidence level
alpha = 0.05
df = n - 1
t.score = qt(p=alpha/2, df,lower.tail=F)
print(t.score)
## [1] 1.966243
# Calculate the margin of error and construct the confidence interval
me <- t.score * Se
me
## [1] 0.04664991
#confidence interval is the mean +/- margin of error
##interval
lower <- p - me
upper <- p + me
print(c(lower,upper))
## [1] 0.2633501 0.3566499
## The 95% interval around the expected proportion of 31 percent of uninsured patients runs from 26.33501 to 35.66499 percent.
## Because the sample proportion of 25 percent falls below the lower bound of that interval, we reject the null hypothesis. The data supports the claim.
## confidence interval with qnorm
# Two-sided 95% critical value: alpha is split between the tails *inside*
# qnorm. qnorm(1-.05)/2 halves the one-sided quantile instead and gave
# 0.8224268.
ZCI<-qnorm(1-.05/2)
ZCI
## [1] 1.959964
# Margin of error.
# NOTE(review): Se here is the standard error under Ho, sqrt(p*q/n); an
# interval for the true proportion would normally use phat - confirm intent.
CI<-ZCI*Se
CI
## [1] 0.04650093
upperCI<-phat+CI
upperCI
## [1] 0.2965009
lowerCI<-phat-CI
lowerCI
## [1] 0.2034991
print(c(lowerCI,upperCI))
## [1] 0.2034991 0.2965009
## With a 95% confidence interval, the percent of uninsured patients is between 20.34991 and 29.65009 percent.
Find the minimum sample size needed to be 99% confident that the sample’s variance is within 1% of the population’s variance.
A standardized test is given to a sixth-grade class. Historically the mean score has been 112 with a standard deviation of 24. The superintendent believes that the standard deviation of performance may have recently decreased. She randomly sampled 22 students and found a mean of 102 with a standard deviation of 15.4387. Is there evidence that the standard deviation has decreased at the α = 0.1 level? Show all work and hypothesis testing steps.
#Ho: sigma>=24, Ha: sigma<24
# Chi-square test for a single variance: chi = df*s^2/sigma^2 with df = n-1.
# The claim is about the standard deviation, so the sample and historical
# means play no part in the test statistic; they are kept only for reference.
x<-112          # historical mean (not used by the variance test)
sd1<-24         # hypothesized (historical) standard deviation, sigma
n<-22           # sample size
x1<-102         # sample mean (not used by the variance test)
sd2<-15.4387    # sample standard deviation, s
alpha<-0.1
df<-n-1
df
## [1] 21
# Test statistic: (n-1) * s^2 / sigma^2
chi_stat<-df*sd2^2/sd1^2
chi_stat
## [1] 8.68997
# Lower-tail critical value: reject Ho when chi_stat falls below it,
# because Ha says the standard deviation has decreased.
chi_crit<-qchisq(alpha,df)
chi_crit
## [1] 13.2396
chi_stat<chi_crit
## [1] TRUE
# p value = lower-tail area of the chi-square distribution at the statistic
# (roughly 0.0085, i.e. between 0.005 and 0.01).
p_value<-pchisq(chi_stat,df)
p_value<alpha
## [1] TRUE
## The test statistic 8.68997 is below the critical value and the p value is smaller than alpha = 0.1.
## # We therefore reject the null hypothesis: there is evidence that the standard deviation has decreased below 24.