# Clear the workspace
# The three calls below act on the running session, in this order: drop the
# objects listed by ls(), run the garbage collector, write a form feed.
rm(list = ls()) # Clear environment: removes every object that ls() lists
gc() # Clear unused memory; the table printed below is gc()'s own report
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 525004 28.1 1169818 62.5 660385 35.3
## Vcells 964354 7.4 8388608 64.0 1769489 13.6
cat("\f") # Clear the console: writes a form-feed character ("\f") to it
We write a function which takes in two arguments (numbers here), runs some computations (basic inequality) on them and prints an output based on the computation result -
# Decide a hypothesis test by comparing a p-value with a significance level.
#
# Arguments:
#   p     - p-value of the test (a single, non-missing number).
#   alpha - significance level (a single, non-missing number).
#
# Prints "REJECT Ho" when p < alpha and "FAIL 2 REJECT" otherwise (a tie counts
# as a failure to reject). The printed string is also returned invisibly.
myp <- function(p, alpha) {
  # Fail early with a readable message instead of the cryptic error that `if`
  # raises on NA or vector conditions.
  stopifnot(
    is.numeric(p), length(p) == 1L, !is.na(p),
    is.numeric(alpha), length(alpha) == 1L, !is.na(alpha)
  )
  decision <- if (p < alpha) "REJECT Ho" else "FAIL 2 REJECT"
  print(decision)
}
Test our function to make sure it is performing as intended -
myp(p = 0.01, alpha = 0.05) # p-value below alpha, so the null should be rejected
## [1] "REJECT Ho"
myp(p = 0.10, alpha = 0.05) # p-value above alpha, so the null should not be rejected
## [1] "FAIL 2 REJECT"
Now, let's write a slightly more complex function (it takes many arguments) that shades the standard normal distribution; by default it shades the rejection region of a 5% two-sided hypothesis test, and it can be adapted for other purposes too. You can change the arguments mu, sig, pcts, color,…
# Plot a normal density curve and shade selected regions under it.
#
# With the defaults this draws the standard normal curve and hatches both 2.5%
# tails, i.e. the rejection region of a two-sided test at the 5% level.
#
# Arguments:
#   below, above - cut-offs on the x axis; the area left of `below` and right
#                  of `above` is shaded. When NULL (and `between` is NULL) they
#                  are taken from the normal quantiles at `pcts`.
#   pcts         - lower and upper probabilities used for the default cut-offs.
#                  A single value defines the lower cut-off only; the upper
#                  tail is then left unshaded.
#   mu, sig      - mean and standard deviation of the normal distribution.
#   numpts       - number of grid points used to draw the curve.
#   color, dens  - fill colour and hatching density handed to polygon().
#   justabove    - if TRUE, the area left of `below` is not shaded.
#   justbelow    - if TRUE, the area right of `above` is not shaded.
#   lines        - if TRUE, add the curve to the current plot instead of
#                  opening a new one.
#   between      - optional values; the area between their minimum and maximum
#                  is shaded and the quantile defaults for `below`/`above` are
#                  not applied.
#   outside      - optional values whose minimum and maximum replace `below`
#                  and `above`.
#
# Called for its drawing side effect; returns NULL invisibly.
shadenorm <- function(below = NULL, above = NULL, pcts = c(0.025, 0.975),
                      mu = 0, sig = 1, numpts = 500, color = "gray", dens = 40,
                      justabove = FALSE, justbelow = FALSE, lines = FALSE,
                      between = NULL, outside = NULL) {
  if (is.null(between)) {
    # Plain if/else instead of ifelse() on scalars. `[1]` keeps the earlier
    # behaviour of using only the first element of a supplied cut-off.
    below <- if (is.null(below)) qnorm(pcts[1], mu, sig) else below[1]
    above <- if (is.null(above)) qnorm(pcts[2], mu, sig) else above[1]
  }
  if (!is.null(outside)) {
    below <- min(outside)
    above <- max(outside)
  }

  # The curve is drawn over mu +/- 4 standard deviations.
  x_grid <- seq(mu - 4 * sig, mu + 4 * sig, length.out = numpts)
  dens_all <- dnorm(x_grid, mean = mu, sd = sig)
  if (lines) {
    lines(x_grid, dens_all)
  } else {
    plot(x_grid, dens_all, type = "l", xlab = "X", ylab = "Density")
  }

  # Hatch the area under the curve over the grid points flagged by `keep`.
  # which() discards NA and zero-length selections, so a missing cut-off
  # (NULL, or NA from a one-element `pcts`) simply shades nothing instead of
  # passing NA coordinates on to polygon().
  shade_region <- function(keep) {
    idx <- which(keep)
    if (length(idx) == 0L) {
      return(invisible(NULL))
    }
    xs <- x_grid[idx]
    polygon(c(xs, rev(xs)), c(rep(0, length(xs)), rev(dens_all[idx])),
            col = color, density = dens)
  }

  if (!justabove) shade_region(x_grid < below)
  if (!justbelow) shade_region(x_grid > above)
  if (!is.null(between)) {
    shade_region(x_grid > min(between) & x_grid < max(between))
  }
  invisible(NULL)
}
# Quick visual check of shadenorm() on the standard normal
shadenorm(pcts = c(0.025, 0.975), mu = 0, sig = 1)
# A non-standard normal to try as well:
# shadenorm(pcts = c(0.025, 0.975), mu = 20, sig = 6)
# Plot a Student t density curve and shade selected regions under it.
#
# With the defaults this draws the t curve with 1 degree of freedom on the
# fixed range [-4, 4] and hatches the area beyond the 2.5% and 97.5% quantiles.
#
# Arguments:
#   below, above - cut-offs on the x axis; the area left of `below` and right
#                  of `above` is shaded. When NULL (and `between` is NULL) they
#                  are taken from the t quantiles at `pcts`.
#   pcts         - lower and upper probabilities used for the default cut-offs.
#                  A single value defines the lower cut-off only; the upper
#                  tail is then left unshaded.
#   df           - degrees of freedom of the t distribution.
#   numpts       - number of grid points used to draw the curve.
#   color, dens  - fill colour and hatching density handed to polygon().
#   justabove    - if TRUE, the area left of `below` is not shaded.
#   justbelow    - if TRUE, the area right of `above` is not shaded.
#   lines        - if TRUE, add the curve to the current plot instead of
#                  opening a new one.
#   between      - optional values; the area between their minimum and maximum
#                  is shaded and the quantile defaults for `below`/`above` are
#                  not applied.
#   outside      - optional values whose minimum and maximum replace `below`
#                  and `above`.
#
# Called for its drawing side effect; returns NULL invisibly.
shadet <- function(below = NULL, above = NULL, pcts = c(0.025, 0.975), df = 1,
                   numpts = 500, color = "gray", dens = 40,
                   justabove = FALSE, justbelow = FALSE, lines = FALSE,
                   between = NULL, outside = NULL) {
  if (is.null(between)) {
    # Plain if/else instead of ifelse() on scalars. `[1]` keeps the earlier
    # behaviour of using only the first element of a supplied cut-off.
    below <- if (is.null(below)) qt(pcts[1], df) else below[1]
    above <- if (is.null(above)) qt(pcts[2], df) else above[1]
  }
  if (!is.null(outside)) {
    below <- min(outside)
    above <- max(outside)
  }

  # The x range is fixed at [-4, 4] regardless of df.
  x_grid <- seq(-4, 4, length.out = numpts)
  dens_all <- dt(x_grid, df)
  if (lines) {
    lines(x_grid, dens_all)
  } else {
    plot(x_grid, dens_all, type = "l", xlab = "X", ylab = "Density")
  }

  # Hatch the area under the curve over the grid points flagged by `keep`.
  # which() discards NA and zero-length selections, so a missing cut-off
  # (NULL, or NA from a one-element `pcts`) simply shades nothing instead of
  # passing NA coordinates on to polygon().
  shade_region <- function(keep) {
    idx <- which(keep)
    if (length(idx) == 0L) {
      return(invisible(NULL))
    }
    xs <- x_grid[idx]
    polygon(c(xs, rev(xs)), c(rep(0, length(xs)), rev(dens_all[idx])),
            col = color, density = dens)
  }

  if (!justabove) shade_region(x_grid < below)
  if (!justbelow) shade_region(x_grid > above)
  if (!is.null(between)) {
    shade_region(x_grid > min(between) & x_grid < max(between))
  }
  invisible(NULL)
}
# Quick visual check of shadet()
shadet(pcts = c(0.025, 0.975), df = 4) # with few degrees of freedom the shaded tails start further from the mean 0
# shadet(pcts = c(0.025, 0.975), df = 120) # with many degrees of freedom the t distribution approaches the normal
# Plot a chi-square density curve and shade selected regions under it.
#
# The curve is drawn from 0 up to the 99th percentile of the distribution.
# With the defaults the area beyond the 2.5% and 97.5% quantiles is hatched.
#
# Arguments:
#   below, above - cut-offs on the x axis; the area left of `below` and right
#                  of `above` is shaded. When NULL (and `between` is NULL) they
#                  are taken from the chi-square quantiles at `pcts`.
#   pcts         - lower and upper probabilities used for the default cut-offs.
#                  A single value (e.g. pcts = .05) defines the lower cut-off
#                  only; the upper tail is then left unshaded.
#   df           - degrees of freedom of the chi-square distribution.
#   numpts       - number of grid points used to draw the curve.
#   color, dens  - fill colour and hatching density handed to polygon().
#   justabove    - if TRUE, the area left of `below` is not shaded.
#   justbelow    - if TRUE, the area right of `above` is not shaded.
#   lines        - if TRUE, add the curve to the current plot instead of
#                  opening a new one.
#   between      - optional values; the area between their minimum and maximum
#                  is shaded and the quantile defaults for `below`/`above` are
#                  not applied.
#   outside      - optional values whose minimum and maximum replace `below`
#                  and `above`.
#
# Called for its drawing side effect; returns NULL invisibly.
shadechi <- function(below = NULL, above = NULL, pcts = c(0.025, 0.975),
                     df = 1, numpts = 500, color = "gray", dens = 40,
                     justabove = FALSE, justbelow = FALSE, lines = FALSE,
                     between = NULL, outside = NULL) {
  if (is.null(between)) {
    # Plain if/else instead of ifelse() on scalars. `[1]` keeps the earlier
    # behaviour of using only the first element of a supplied cut-off.
    below <- if (is.null(below)) qchisq(pcts[1], df) else below[1]
    above <- if (is.null(above)) qchisq(pcts[2], df) else above[1]
  }
  if (!is.null(outside)) {
    below <- min(outside)
    above <- max(outside)
  }

  # Grid from 0 to the 99th percentile of the distribution.
  x_grid <- seq(0, qchisq(.99, df), length.out = numpts)
  dens_all <- dchisq(x_grid, df)
  if (lines) {
    lines(x_grid, dens_all)
  } else {
    plot(x_grid, dens_all, type = "l", xlab = "X", ylab = "Density")
  }

  # Hatch the area under the curve over the grid points flagged by `keep`.
  # which() discards NA and zero-length selections, so a missing cut-off
  # (NULL, or NA from a one-element `pcts`) simply shades nothing instead of
  # passing NA coordinates on to polygon().
  shade_region <- function(keep) {
    idx <- which(keep)
    if (length(idx) == 0L) {
      return(invisible(NULL))
    }
    xs <- x_grid[idx]
    polygon(c(xs, rev(xs)), c(rep(0, length(xs)), rev(dens_all[idx])),
            col = color, density = dens)
  }

  if (!justabove) shade_region(x_grid < below)
  if (!justbelow) shade_region(x_grid > above)
  if (!is.null(between)) {
    shade_region(x_grid > min(between) & x_grid < max(between))
  }
  invisible(NULL)
}
# Quick visual check of shadechi()
shadechi(pcts = 0.05, df = 2) # vary pcts and watch the shaded area change
shadechi(pcts = 0.05, df = 18) # vary df and watch the shape of the curve change
Using traditional methods, it takes 109 hours to receive a basic driving license. A new license training method using Computer Aided Instruction (CAI) has been proposed. A researcher used the technique with 190 students and observed that they had a mean of 110 hours. Assume the standard deviation is known to be 6. A level of significance of 0.05 will be used to determine if the technique performs differently than the traditional method. Make a decision to reject or fail to reject the null hypothesis. Show all work in R.
Given: \(\mu= 109, n= 190, \bar{x}= 110, \sigma (population)= 6, \alpha= .05\).
To Do: Determine if the technique performs differently than the traditional method. Burden of proof falls on alternative hypothesis -
Ho: \(\bar{x}= \mu\)
Ha: \(\bar{x} \neq \mu\)
Ho: There is no difference in mean time to obtain a basic driving license between methods.
Ha: There is a difference in the mean time to obtain a basic driving license between methods.
Two sided test (look at alternative hypothesis). Two sided because we are testing to see if the methods are different. There is no language about less than or more than.
\(\alpha = .05\)
Distribution: Z (known SD), large sample size
Method 1: Absolute Value of Test Stat vs Critical Value – if \(test stat > critical value\) … REJECT THE NULL
Method 2: P-value vs alpha – if \(alpha > p-value\) … REJECT THE NULL
Method 3: Confidence Interval … if CI does not contain hypothesized value… REJECT THE NULL
# z statistic: (sample mean - hypothesized mean) / standard error
z_1 <- (110 - 109) / (6 / sqrt(190))
z_1
## [1] 2.297341
# Two-sided p-value: twice the upper-tail area beyond z_1, i.e. the
# probability, under Ho, of a mean licensing time at least as extreme as the
# one the sample suggests.
p_value_1 <- 2 * (1 - pnorm(q = z_1, mean = 0, sd = 1))
p_value_1
## [1] 0.0215993
alpha_1 <- .05 # significance level given in the problem
# Sampling distribution under Ho with both rejection tails hatched in red
# (drawn with the helper defined earlier); the blue vertical line marks the
# observed sample mean.
shadenorm(mu = 109, sig = 6 / sqrt(190), pcts = c(.025, .975), color = "red")
lines(x = rep(110, 10), y = seq(0, 1, length.out = 10), col = "blue")
# Decision
# Method 1: |test statistic| against |critical value|
test_stat_1 <- z_1
critical_value_1 <- qnorm(p = .975, mean = 0, sd = 1)
abs(test_stat_1) > abs(critical_value_1)
## [1] TRUE
# Method 2: p-value against alpha through the helper myp()
myp(p = p_value_1, alpha = alpha_1)
## [1] "REJECT Ho"
# Method 3: confidence interval around the sample mean
se_1 <- 6 / sqrt(190)
se_1
## [1] 0.4352858
CI1_upper_bound <- 110 + abs(critical_value_1 * se_1)
CI1_lower_bound <- 110 - abs(critical_value_1 * se_1)
CI1 <- c(CI1_lower_bound, CI1_upper_bound)
CI1
## [1] 109.1469 110.8531
Since the p-value (.0216) is less than alpha (.05), we reject the null hypothesis. The absolute value of the test statistic is also greater than the absolute value of the critical value, which is likewise a basis to reject the null.
Finally, our 95% confidence interval is predicting the true mean time of the new method to be in the interval (109.1469, 110.8531). This interval does not contain the hypothesized status quo mean time of 109.0 hours. Another method that tells us to reject the null.
In other words, if the null value of 109 hours were the true mean, a sample mean as extreme as 110 hours would be unlikely (it would occur less than 5% of the time).
Our environment is very sensitive to the amount of ozone in the upper atmosphere. The level of ozone normally found is 5.3 parts/million (ppm). A researcher believes that the current ozone level is at an insufficient level. The mean of 5 samples is 5.0 parts per million (ppm) with a standard deviation of 1.1. Does the data support the claim at the 0.05 level? Assume the population distribution is approximately normal.
Given: \(\mu= 5.3 ppm, n= 5, \bar{x}= 5.0ppm, \sigma (samples)= 1.1ppm, \alpha= .05\).
To Do: Researcher believes that the current ozone level is at an insufficient level - does the data support the claim at the 0.05 level ?
Ho: \(\bar{x} \ge \mu\)
Ha: \(\bar{x} < \mu\)
Ho: Ozone levels are sufficient (there is enough) - current levels are greater than or equal to \(\mu\) (5.3 ppm).
Ha: Ozone levels are insufficient (lower levels than normally expected) - current levels are less than \(\mu\) (5.3 ppm)
This is going to be a one-tailed (left) test. We are testing for insufficient levels, meaning testing to see if there is enough ozone… not too much. We are only looking at the left-hand side.
\(\alpha = .05\)
Distribution: Student-T distribution with 4 degrees of freedom. This is because the population standard deviation is not known, we only have a standard deviation of the sample, and we have a small number of samples (less than 30).
Method 1: Absolute Value of Test Stat vs Critical Value – if \(test stat > critical value\) … REJECT THE NULL
Method 2: P-value vs alpha – if \(alpha > p-value\) … REJECT THE NULL
Method 3: Confidence Interval … if CI does not contain hypothesized value… REJECT THE NULL
# t statistic for the ozone sample
alpha_2 <- .05
df_2 <- 5 - 1 # degrees of freedom: n - 1
t_2 <- (5.0 - 5.3) / (1.1 / sqrt(5)) # (sample mean - hypothesized mean) / standard error
t_2
## [1] -0.6098367
# p-value
# This is a one-tailed (left) test, so the p-value is simply the lower-tail
# area below t_2; unlike problem 1 it is not multiplied by 2.
p_value_2 <- pt(q = t_2, df = df_2)
p_value_2
## [1] 0.2874568
# Critical value: the quantile that cuts off the lower alpha_2 tail
critical_value_2 <- qt(p = alpha_2, df = df_2)
critical_value_2
## [1] -2.131847
# Plot: left rejection tail in red, observed t statistic as a blue line
shadet(pcts = alpha_2, df = df_2, color = "red")
lines(x = rep(t_2, 10), y = seq(0, 1, length.out = 10), col = "blue")
MAKE A DECISION
# Method 1: |test statistic| against |critical value|
abs(t_2) > abs(critical_value_2)
## [1] FALSE
# Method 2: p-value against alpha through the helper myp()
myp(p = p_value_2, alpha = alpha_2)
## [1] "FAIL 2 REJECT"
# Method 3: interval around the sample mean, using the critical value above
se_2 <- 1.1 / sqrt(5)
se_2
## [1] 0.491935
CI2_upper_bound <- 5.0 + abs(critical_value_2 * se_2)
CI2_lower_bound <- 5.0 - abs(critical_value_2 * se_2)
CI2 <- c(CI2_lower_bound, CI2_upper_bound)
CI2
## [1] 3.95127 6.04873
Since the absolute value of the test statistic is less than the absolute value of the critical value, we FAIL TO REJECT THE NULL. This is confirmed by method 2, where the p-value is greater than the significance (alpha level)… so again we will FAIL TO REJECT THE NULL.
The interval (3.95, 6.05) is built with the one-sided 5% critical value, so it is a 90% two-sided confidence interval for the true mean ozone level (each of its limits is a 95% one-sided bound). The Null value of 5.3ppm is contained in the interval, therefore we FAIL TO REJECT THE NULL.
In this case we do not have enough evidence at the .05 alpha level to say that there is insufficient ozone in the atmosphere.
Our environment is very sensitive to the amount of ozone in the upper atmosphere. The level of ozone normally found is 7.3 parts/million (ppm). A researcher believes that the current ozone level is not at a normal level. The mean of 51 samples is 7.1 ppm with a variance of 0.49. Assume the population is normally distributed. A level of significance of 0.01 will be used. Show all work and hypothesis testing steps.
Given: \(\mu= 7.3 ppm, n= 51, \bar{x}= 7.1ppm, \sigma (samples)= .7ppm, \alpha= .01\).
To Do: Researcher believes that the current ozone level is not at normal level. Thus, set a double sided hypothesis. This is because we will have to test both greater and less than the normal level.
Ho: \(\bar{x} = \mu\)
Ha: \(\bar{x} \neq \mu\)
Ho: Ozone is in the atmosphere is at a normal level.
Ha: Ozone in the atmosphere is not at a normal level.
\(\alpha = .01\)
Distribution: t (unknown SD)
Method 1: Absolute Value of Test Stat vs Critical Value – if \(test stat > critical value\) … REJECT THE NULL
Method 2: P-value vs alpha – if \(alpha > p-value\) … REJECT THE NULL
Method 3: Confidence Interval … if CI does not contain hypothesized value… REJECT THE NULL
# t statistic for the second ozone sample
alpha_3 <- .01
df_3 <- 51 - 1 # degrees of freedom: n - 1
t_3 <- (7.1 - 7.3) / (.7 / sqrt(51)) # (sample mean - hypothesized mean) / standard error
t_3
## [1] -2.040408
# Two-sided p-value: twice the lower-tail area below the (negative) t_3
p_value_3 <- 2 * pt(q = t_3, df = df_3)
p_value_3
## [1] 0.04660827
# Critical value: alpha_3 is split evenly between the two tails
critical_value_3 <- qt(p = alpha_3 / 2, df = df_3)
critical_value_3
## [1] -2.677793
# Plot: both rejection tails in red, observed t statistic as a blue line
shadet(pcts = c(.005, .995), df = df_3, color = "red")
lines(x = rep(t_3, 10), y = seq(0, 1, length.out = 10), col = "blue")
MAKE A DECISION
# Method 1: |test statistic| against |critical value|
abs(t_3) > abs(critical_value_3)
## [1] FALSE
# Method 2: p-value against alpha through the helper myp()
myp(p = p_value_3, alpha = alpha_3)
## [1] "FAIL 2 REJECT"
# Method 3: confidence interval around the sample mean
se_3 <- .7 / sqrt(51)
se_3
## [1] 0.09801961
CI3_upper_bound <- 7.1 + abs(critical_value_3 * se_3)
CI3_lower_bound <- 7.1 - abs(critical_value_3 * se_3)
CI3 <- c(CI3_lower_bound, CI3_upper_bound)
CI3
## [1] 6.837524 7.362476
FAIL TO REJECT THE NULL.
Method 1: The absolute value of the critical value is larger than the abs of the test statistic, meaning we FAIL TO REJECT THE NULL.
Method 2: The p-value is larger than the alpha level, so we FAIL TO REJECT THE NULL
Method 3: The hypothesized mean of 7.3 is contained in the 99% confidence interval of (6.83 - 7.36). This means we are 99% confident that the true mean lies in that range, which contains the hypothesized mean… FAIL TO REJECT THE NULL.
In this context, failing to reject the null means at the .01 alpha level (99% confidence level) there is not enough statistical evidence to reject the null and say ozone levels are not normal based on the data we have.
A publisher reports that 36% of their readers own a laptop. A marketing executive wants to test the claim that the percentage is actually less than the reported percentage. A random sample of 100 found that 29% of the readers owned a laptop. Is there sufficient evidence at the 0.02 level to support the executive’s claim? Show all work and hypothesis testing steps.
Given: \(\pi= .36 , n = 100 , \hat{p}= .29 ,\alpha= .02\)
To Do: Executive wants to test the claim that the percentage is actually less than the reported percentage. Thus, set a single sided hypothesis.
Ho: \(\hat{p} \ge \pi\)
Ha: \(\hat{p} < \pi\)
Ho: Mean proportion of readers that own a laptop is greater than or equal to .36 (pi)
Ha: Mean proportion of readers that own a laptop is less than .36 (pi)
This will be a single sided test. We are only testing to see if the true proportion is lower than the hypothesized value.
\(\alpha = .02\)
Distribution: Z (proportion) - since we have a large enough sample size (100) we can assume standard normal distribution. Usually after n = 30 the T-distribution becomes largely similar to the Z distribution.
We will perform both tests here and get the same result for showing purposes.
Method 1: Absolute Value of Test Stat vs Critical Value – if \(test stat > critical value\) … REJECT THE NULL
Method 2: P-value vs alpha – if \(alpha > p-value\) … REJECT THE NULL
Method 3: Confidence Interval … if CI does not contain hypothesized value… REJECT THE NULL
# Z distribution first
# Test statistic; the standard error uses the hypothesized proportion .36
alpha_4 <- .02
p_4 <- .36
p_bar_4 <- .29
se_4_z <- sqrt((.36 * .64) / (100))
z_4 <- (p_bar_4 - p_4) / se_4_z
z_4
## [1] -1.458333
# Critical value: the quantile that cuts off the lower alpha_4 tail
critical_value_4_z <- qnorm(p = alpha_4, mean = 0, sd = 1)
critical_value_4_z
## [1] -2.053749
# p-value: lower-tail area below z_4 (left-tailed test)
p_value_4_z <- pnorm(q = z_4, mean = 0, sd = 1)
p_value_4_z
## [1] 0.07237434
# Same quantities from the t distribution; here the standard error uses the
# sample proportion .29
se_4_t <- sqrt((.29 * .71) / (100))
t_4 <- (p_bar_4 - p_4) / se_4_t
critical_value_4_t <- qt(p = alpha_4, df = 99)
p_value_4_t <- pt(q = t_4, df = 99)
# Print both sets of values (z first, then t) to show how close they are at
# this sample size
z_4
## [1] -1.458333
p_value_4_z
## [1] 0.07237434
critical_value_4_z
## [1] -2.053749
t_4
## [1] -1.542659
p_value_4_t
## [1] 0.06305207
critical_value_4_t
## [1] -2.081162
# Plot the two distributions side by side. par() returns the previous
# settings, which are restored afterwards so that later plots are not squeezed
# into half-width panels.
old_par_4 <- par(mfrow = c(1, 2))
shadenorm(mu = p_4, sig = se_4_z, pcts = alpha_4, color = "red")
lines(x = rep(p_bar_4, 10), y = seq(0, 1, length.out = 10), col = "blue")
shadet(pcts = alpha_4, df = 99, color = "red")
lines(x = rep(t_4, 10), y = seq(0, 1, length.out = 10), col = "blue")
par(old_par_4)
# Method 1: |test statistic| against |critical value|
abs(z_4) > abs(critical_value_4_z)
## [1] FALSE
abs(t_4) > abs(critical_value_4_t)
## [1] FALSE
# Method 2: p-value against alpha through the helper myp()
myp(p = p_value_4_z, alpha = alpha_4)
## [1] "FAIL 2 REJECT"
myp(p = p_value_4_t, alpha = alpha_4)
## [1] "FAIL 2 REJECT"
# Method 3: interval around the sample proportion, using the z critical value
CI4_upper_bound <- p_bar_4 + abs(critical_value_4_z * se_4_z)
CI4_lower_bound <- p_bar_4 - abs(critical_value_4_z * se_4_z)
CI4 <- c(CI4_lower_bound, CI4_upper_bound)
CI4
## [1] 0.1914201 0.3885799
We FAIL TO REJECT THE NULL.
Method 1: The absolute value of the test-stat (in both cases) is less than the absolute value of the critical value. This is basis to FAIL TO REJECT THE NULL.
Method 2: The p-value is greater than the significance value, basis to FAIL TO REJECT THE NULL. This is saying, under null is true conditions the probability of getting a proportion that we got in our sample (.29) is greater than the alpha value.
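Method 3: The interval (.191, .3885) is built with the one-sided 2% critical value, so it is a 96% two-sided confidence interval for the true population proportion (its upper limit is the 98% one-sided bound). The null value (.36) is included in this interval, therefore we FAIL TO REJECT THE NULL.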
A hospital director is told that 31% of the treated patients are uninsured. The director wants to test the claim that the percentage of uninsured patients is less than the expected percentage. A sample of 380 patients found that 95 were uninsured. Make the decision to reject or fail to reject the null hypothesis at the 0.05 level. Show all work and hypothesis testing steps.
To Do: Researcher believes that the percentage of uninsured patients is less than the expected percentage.
\(\hat{p} = .25 , \pi = .31, n = 380, \alpha = .05\)
Ho: \(\pi \ge .31\)
Ha: \(\pi < .31\)
Ho: The proportion of treated patients who are uninsured is equal to or greater than .31.
Ha: The proportion of treated patients who are uninsured is less than .31.
This will be a single sided test. We are only testing to see if the true proportion is lower than the hypothesized value.
\(\alpha = .05\)
Distribution: Z (proportion) - since we have a large enough sample size (380) we can assume standard normal distribution. Usually after n = 30 the T-distribution becomes largely similar to the Z distribution.
Method 1: Absolute Value of Test Stat vs Critical Value – if \(test stat > critical value\) … REJECT THE NULL
Method 2: P-value vs alpha – if \(alpha > p-value\) … REJECT THE NULL
Method 3: Confidence Interval … if CI does not contain hypothesized value… REJECT THE NULL
# z statistic for the uninsured proportion
p_5 <- .31 # hypothesized proportion
p_hat5 <- 95 / 380 # sample proportion
alpha5 <- .05
n_5 <- 380
se5 <- sqrt(p_5 * (1 - p_5) / n_5) # standard error under the hypothesized proportion
z5 <- (p_hat5 - p_5) / se5
z5
## [1] -2.528935
# Critical value: the quantile that cuts off the lower alpha5 tail
critical_value_5 <- qnorm(p = alpha5, mean = 0, sd = 1)
critical_value_5
## [1] -1.644854
# p-value: lower-tail area below z5 (left-tailed test)
p_value_5 <- pnorm(q = z5, mean = 0, sd = 1)
p_value_5
## [1] 0.005720462
# Plot: left rejection tail in red, observed sample proportion as a blue line
shadenorm(mu = p_5, sig = se5, pcts = alpha5, color = "red")
lines(x = rep(p_hat5, 10), y = seq(0, 1, length.out = 10), col = "blue")
MAKE A DECISION
# Method 1: |test statistic| against |critical value|
abs(z5) > abs(critical_value_5)
## [1] TRUE
# Method 2: p-value against alpha through the helper myp()
myp(p = p_value_5, alpha = alpha5)
## [1] "REJECT Ho"
# Method 3: interval around the sample proportion, using the critical value above
CI5_upper_bound <- p_hat5 + abs(critical_value_5 * se5)
CI5_lower_bound <- p_hat5 - abs(critical_value_5 * se5)
CI5_upper_bound
## [1] 0.2890248
CI5_lower_bound
## [1] 0.2109752
We REJECT THE NULL. All methods point to this decision. The absolute value of the z-score (test stat) is greater than the absolute value of the critical value. The p-value is also less than alpha, meaning that, assuming the null is true, the probability of a sample proportion this low is smaller than the alpha level. Lastly, the confidence interval (.211, .289) — a 90% two-sided interval, because it uses the one-sided 5% critical value — does not contain the null hypothesized value for the population proportion.
In this context, we REJECT THE NULL that the proportion of uninsured patients is greater than or equal to .31.
A standardized test is given to a sixth-grade class. Historically, the mean score has been 112 with a standard deviation of 24. The superintendent believes that the standard deviation performance may have recently decreased. She randomly sampled 22 students and found a mean of 102 with a standard deviation of 15.4387. Is there evidence that the standard deviation has decreased at the .10 significance level?
Given: \(n = 22, \sigma = 24, s = 15.4387\)
Ho: \(s \ge \sigma\)
Ha: \(s < \sigma\)
where \(\sigma = 24\)
Ho: The standard deviation of scores on the standardized test is at least 24.
Ha: The standard deviation of scores on the standardized test is less than 24.
This will be a single sided test. We are only testing to see if the true standard deviation is lower than the hypothesized value.
\(\alpha = .10\)
Chi Squared. Used to test if the variance of a sample is significantly different from a historical variance (which is the squared standard deviation).
Method 1: Absolute Value of Test Stat vs Critical Value – if \(test stat > critical value\) … REJECT THE NULL
Method 2: P-value vs alpha – if \(alpha > p-value\) … REJECT THE NULL
Method 3: Confidence Interval … if CI does not contain hypothesized value… REJECT THE NULL
# Given values
n6 <- 22
sigma6 <- 24
var6 <- sigma6^2 # historical (population) variance
s6 <- 15.4387
var_sample_6 <- s6^2 # sample variance
df6 <- n6 - 1
# Test statistic: df * sample variance / hypothesized variance
chi_square6 <- (df6 * var_sample_6) / var6
chi_square6
## [1] 8.68997
# Critical value: the quantile that cuts off the lower 10% tail
critical_value_6 <- qchisq(p = .10, df = df6)
critical_value_6
## [1] 13.2396
# p-value: lower-tail area below the test statistic
p_value_6 <- pchisq(q = chi_square6, df = df6, lower.tail = TRUE)
p_value_6
## [1] 0.008549436
# Plot: lower tail shaded, observed test statistic as a short blue line
shadechi(pcts = .10, df = df6)
lines(x = rep(chi_square6, 10), y = seq(0, .01, length.out = 10), col = "blue")
Make a Decision:
# Method 1: critical-value rule for a LEFT-tailed chi-square test.
# The rejection region is the lower tail, so Ho is rejected when the test
# statistic falls BELOW the critical value. Comparing absolute values with ">"
# only makes sense for the symmetric z and t distributions; applied here it
# returned FALSE and contradicted the p-value method below.
chi_square6 < critical_value_6
## [1] TRUE
# Method 2: p-value against alpha through the helper myp()
myp(p = p_value_6, alpha = .10)
## [1] "REJECT Ho"
A medical researcher wants to compare the pulse rates of smokers and non-smokers. He believes that the pulse rate for smokers and non-smokers is different and wants to test this claim at the 0.1 level of significance. The researcher checks 32 smokers and finds that they have a mean pulse rate of 87, and 31 non-smokers have a mean pulse rate of 84. The standard deviation of the pulse rates is found to be 9 for smokers and 10 for non-smokers. Let \(\mu_1\) be the true mean pulse rate for smokers and \(\mu_2\) be the true mean pulse rate for non-smokers. Show all work and hypothesis testing steps.
Let smoker group be indexed by 1, non-smoker group by 2.
Given: \(n_1 = 32, \bar{x}_1 = 87, n_2 = 31, \bar{x}_2 = 84, s_1 = 9, s_2 = 10, \alpha = .10\)
To Do: Test if the pulse rate for smokers and non-smokers is different at the 0.1 level of significance. Thus, double sided test.
Ho: \(\mu_1 = \mu_2\) or \(\mu_1 - \mu_2 = 0\)
Ha: \(\mu_1 \neq \mu_2\) or \(\mu_1 - \mu_2 \neq 0\)
Ho: There is no difference between the mean pulse rate of smokers and non-smokers.
Ha: There is a difference between the mean pulse rate of smokers and non-smokers.
\(\alpha = .10\)
Distribution: t. Sample size is relatively small and we do not know the population standard deviations.
Decision rule: p-value vs alpha, using the helper myp() – if \(alpha > p-value\) … REJECT THE NULL
# Ho: mu1 - mu2 = 0, Ha: mu1 - mu2 != 0
mu7_1 <- 87 # mean pulse rate, smokers
mu7_2 <- 84 # mean pulse rate, non-smokers
alpha7 <- .10
# t distribution: sample sizes, degrees of freedom and spreads per group
n7_1 <- 32
n7_2 <- 31
df7_1 <- n7_1 - 1 # degrees of freedom, group 1
df7_2 <- n7_2 - 1 # degrees of freedom, group 2
sd7_1 <- 9
sd7_2 <- 10
var7_1 <- sd7_1^2 # variances
var7_2 <- sd7_2^2
num_point_estimate_diff_7 <- mu7_1 - mu7_2 # point estimate of the difference
Se_7 <- sqrt(var7_1 / n7_1 + var7_2 / n7_2) # standard error built from the sample standard deviations, not population ones
t_7 <- num_point_estimate_diff_7 / Se_7
# Satterthwaite degrees of freedom (the smaller of df7_1 and df7_2 could be
# used instead)
numdf_7 <- (var7_1 / n7_1 + var7_2 / n7_2)^2
dendf_7 <- (var7_1 / n7_1)^2 / df7_1 + (var7_2 / n7_2)^2 / df7_2
df7 <- numdf_7 / dendf_7
shadet(pcts = c(.05, .95), df = df7)
lines(rep(t_7, 10), seq(0, 1, length.out = 10), col = "red")
p_value_7 <- 2 * (1 - pt(q = t_7, df = df7)) # two-sided, Satterthwaite df; value: 0.2160473
p_value_7
## [1] 0.2160473
myp(p_value_7, alpha7)
## [1] "FAIL 2 REJECT"
# Variant using the smaller of the two per-group degrees of freedom; the
# p-value differs slightly (0.220848) but the decision is the same:
#p_value_robust <- 2 * ( 1 - pt(q = t_7, df = min(df7_1, df7_2)))
#p_value_robust
#myp(p_value_robust,alpha)
# t statistic (recomputed)
t_7 <- num_point_estimate_diff_7 / Se_7
# p-value (recomputed with the Satterthwaite df written out as a ratio)
p_value_7 <- 2 * (1 - pt(q = t_7, df = numdf_7 / dendf_7)) # value: 0.2160473
p_value_7
## [1] 0.2160473
myp(p_value_7, alpha7)
## [1] "FAIL 2 REJECT"
# Critical value: lower 5% quantile, i.e. alpha7 split over two tails
critical_value_7 <- qt(p = .05, df = df7)
critical_value_7
## [1] -1.670703
abs(t_7) > abs(critical_value_7)
## [1] FALSE
# Confidence interval for the difference in means
CI7_upper_bound <- num_point_estimate_diff_7 + abs(critical_value_7 * Se_7)
CI7_lower_bound <- num_point_estimate_diff_7 - abs(critical_value_7 * Se_7)
CI7_upper_bound
## [1] 7.008664
CI7_lower_bound
## [1] -1.008664
FAIL TO REJECT THE NULL.
As we can see, the absolute value of the test statistic is less than the absolute value of the critical value, meaning we fail to reject the null. The p-value is also larger than alpha, which also means we fail to reject the null. Finally, a difference of 0 (the null value) is contained in our 90% confidence interval (-1.01, 7.01), which again means we fail to reject the null.
In this context, failing to reject the null means that there is not enough evidence to say there is a difference between smoker and non-smoker pulse rate at the 90% confidence level.
Given two independent random samples with the following results: \(n_1 = 11, \bar{x}_1 = 127, \sigma_1= 33, n_2 = 18, \bar{x}_2 = 157, \sigma_2 = 27\)
Use this data to find the 95% confidence interval for the true difference between the population means. Assume that the population variances are not equal and that the two populations are normally distributed.
To Do: Create a 95% confidence interval for true difference between the population means.
Ho: \(\bar{x}_1 - \bar{x}_2 = 0\)
Ha: \(\bar{x}_1 - \bar{x}_2 \neq 0\)
\(\alpha = .05\)
Distribution: t. We have small sample sizes and do not know the population standard deviations.
myp()
# Problem inputs
alpha8 <- .05
xbar8_1 <- 127 # sample mean, group 1
xbar8_2 <- 157 # sample mean, group 2
n8_1 <- 11 # sample size, group 1
n8_2 <- 18 # sample size, group 2
df8_1 <- n8_1 - 1 # degrees of freedom, group 1
df8_2 <- n8_2 - 1 # degrees of freedom, group 2
s8_1 <- 33 # standard deviation, group 1
s8_2 <- 27 # standard deviation, group 2
var8_1 <- s8_1^2 # variance, group 1
var8_2 <- s8_2^2 # variance, group 2
# Satterthwaite degrees of freedom (the smaller of df8_1 and df8_2 could be
# used instead)
numdf_8 <- (var8_1 / n8_1 + var8_2 / n8_2)^2
dendf_8 <- (var8_1 / n8_1)^2 / df8_1 + (var8_2 / n8_2)^2 / df8_2
df_8 <- numdf_8 / dendf_8
df_8
## [1] 18.0759
delta_8 <- xbar8_1 - xbar8_2 # point estimate of the difference
delta_8
## [1] -30
t_8 <- qt(p = .975, df = df_8) # upper 2.5% quantile: two-sided interval at the 5% level
t_8
## [1] 2.10029
Se_8 <- sqrt(var8_1 / n8_1 + var8_2 / n8_2) # standard error built from the sample standard deviations, not population ones
Se_8
## [1] 11.81101
interval8 <- c(delta_8 - t_8 * Se_8, delta_8 + t_8 * Se_8)
interval8
## [1] -54.806548 -5.193452
The 95% confidence interval is (-54.806, -5.193). This means that we are 95% confident the true difference between the population means lies within that interval. Because the interval does not contain 0, this would also give us reason to reject the null that the means are the same (i.e. no difference between the means).
Two men, A and B, who usually commute to work together decide to conduct an experiment to see whether one route is faster than the other. The men feel that their driving habits are approximately the same, so each morning for two weeks one driver is assigned to route I and the other to route II. The times, recorded to the nearest minute, are shown in the following table. Using this data, find the 98% confidence interval for the true mean difference between the average travel time for route I and the average travel time for route II.
Let \(d1 =\) (route I travel time) − (route II travel time).
Assume that the populations of travel times are normally distributed for both routes. Show all work and hypothesis testing steps.
To Do: Find the 98% confidence interval for the true mean difference between the average travel time for route I and the average travel time for route II.
Notation: Let \(d1 =\) (route I travel time) − (route II travel time).
Small Sample -> T distribution (Standard normal with fat tails)
Ho: \(\mu = 0\) - Mean true difference between routes is 0, where \(\mu\) is the mean difference between routes.
Ha: \(\mu \neq 0\) - Mean true difference between routes is not 0, where \(\mu\) is the mean difference between routes.
r1 <- c(32, 27, 34, 24, 31, 25, 30, 23, 27, 35) # route I times
r2 <- c(28, 28, 33, 25, 26, 29, 33, 27, 25, 33) # route II times
delta_9 <- r1 - r2 # paired differences, one per trip
# Interval has the form: xbar +/- t * Se
xbar_9 <- mean(delta_9) # point estimate: average of the paired differences
xbar_9
## [1] 0.1
n_9 <- 10
df_9 <- n_9 - 1
t_9 <- qt(p = .99, df = df_9) # upper 1% quantile: two-sided interval at the 98% level
# Standard error computed directly from the observed differences
Se_9 <- sd(delta_9) / sqrt(length(delta_9))
# Confidence interval
interval_9 <- c(xbar_9 - t_9 * Se_9, xbar_9 + t_9 * Se_9)
interval_9
## [1] -2.766534 2.966534
The 98% confidence interval for the mean difference in times between routes 1 and 2 is (-2.766534, 2.966534). This means we are 98% confident the true average difference in times between routes lies somewhere between those 2 values (in minutes).
This also means we FAIL TO REJECT THE NULL, because 0 minute difference is contained in our confidence interval, we cannot say at .02 alpha level that there is a difference between the two routes.
You could solve this differently by performing a test on the difference between two population means. This would change the standard error formula (sqrt((s1^2 / n1) + (s2 ^2 / n2)). And the point estimate would be the mean of trip 1 - the mean of trip 2. Slightly different calculations that should lead to similar answers.
The U.S. Census Bureau conducts annual surveys to obtain information on the percentage of the voting-age population that is registered to vote. Suppose that 391 employed persons and 510 unemployed persons are independently and randomly selected, and that 195 of the employed persons and 193 of the unemployed persons have registered to vote. Can we conclude that the percentage of employed workers (p1) who have registered to vote, exceeds the percentage of unemployed workers (p2) who have registered to vote? Use a significance level of 0.05 for the test. Show all work and hypothesis testing steps.
Q: Can we conclude that the percentage of employed workers (p1) who have registered to vote, exceeds the percentage of unemployed workers (p2) who have registered to vote?
Notation: Employed group is group 1 ; Unemployed group is group 2.
Given: \(n_1 = 391, \hat{p}_1 = 195/391 = .499, n_2 = 510, \hat{p}_2 = 193/510 = .378, \alpha = .05\)
Ho: \(\pi 1 - \pi 2 \le 0\)
Ha: \(\pi1 - \pi2 > 0\)
\(\alpha = .05\)
Distribution: z. Large sample sizes will approximate standard normal dist.
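1-sided test, we are testing if the proportion of employed workers who have registered to vote is greater than the proportion of unemployed workers who have registered to vote.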
Decision rule: p-value vs alpha, using the helper myp() – if \(alpha > p-value\) … REJECT THE NULL
# Ho: pi1 - pi2 <= 0, Ha: pi1 - pi2 > 0 (the percentage of employed workers p1
# who have registered to vote exceeds the percentage of unemployed workers p2
# who have registered to vote)
# alpha = .05
# Distribution: Z
n1_10 <- 391
n2_10 <- 510
x1_10 <- 195
x2_10 <- 193
p1_10 <- x1_10 / n1_10 # sample proportion, employed
p2_10 <- x2_10 / n2_10 # sample proportion, unemployed
point_estimate_10 <- p1_10 - p2_10
# Standard error, textbook formula with separate sample proportions
Se_10 <- sqrt((p1_10 * (1 - p1_10) / n1_10) + (p2_10 * (1 - p2_10) / n2_10))
Se_10
## [1] 0.03317529
# Alternative based on the pooled proportion; the result is nearly the same
# (0.03328424 versus 0.03317529):
#pbar_10 = (x1_10 + x2_10) / (n1_10 + n2_10)
#qbar_10 = 1 - pbar_10
#correction_10 = 1 / n1_10 + 1 / n2_10
#Se_robust_10 = sqrt( pbar_10 * qbar_10 * correction_10 )
#Se_robust_10
Se_10 # value: 0.03317529
## [1] 0.03317529
Z_10 <- point_estimate_10 / Se_10 # textbook formula
Z_10
## [1] 3.625887
shadenorm(mu = 0, sig = Se_10, pcts = c(0.0, 0.95))
lines(rep(point_estimate_10, 20), seq(0, 20, length.out = 20), col = "red")
# Test statistic, critical value and p-value
Z_10
## [1] 3.625887
critical_value_10 <- qnorm(p = .95, mean = 0, sd = 1)
critical_value_10
## [1] 1.644854
p_value_10 <- 1 - pnorm(q = Z_10, mean = 0, sd = 1) # upper-tail area (right-tailed test)
p_value_10
## [1] 0.0001439855
myp(p = p_value_10, alpha = .05)
## [1] "REJECT Ho"
abs(Z_10) > abs(critical_value_10)
## [1] TRUE
# Interval around the difference in sample proportions
CI10_upper_bound <- point_estimate_10 + abs(critical_value_10 * Se_10)
CI10_lower_bound <- point_estimate_10 - abs(critical_value_10 * Se_10)
CI10 <- c(CI10_lower_bound, CI10_upper_bound)
CI10
## [1] 0.06572136 0.17485835
We REJECT THE NULL. The absolute value of the test statistic is greater than the absolute value of the critical value. Also, the alpha is greater than the p-value … also basis to reject the null.
The confidence interval (.066, .175) — a 90% two-sided interval, because it uses the one-sided 5% critical value; its lower limit is the 95% one-sided lower bound — also does not contain 0 as a difference in proportions. This is also basis to REJECT THE NULL that there is less or equal proportion of employed people that are registered to vote compared to unemployed. It is statistically likely that there is a greater proportion of people that are employed registered to vote than unemployed.