Setting up the Environment

# Clear the workspace
  rm(list = ls()) # Clear environment
  gc()            # Clear unused memory
##          used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 525004 28.1    1169818 62.5   660385 35.3
## Vcells 964354  7.4    8388608 64.0  1769489 13.6
  cat("\f")       # Clear the console

I. Function to Reject or Not

We write a function which takes in two arguments (numbers here), runs some computations (basic inequality) on them and prints an output based on the computation result -

# Report the outcome of a hypothesis test from its p-value.
#
# Prints "REJECT Ho" when `p` is strictly smaller than `alpha`, and
# "FAIL 2 REJECT" otherwise. The printed string is also the (invisible)
# return value, because print() returns its argument.
myp <- function(p, alpha) {
  verdict <- if (p < alpha) "REJECT Ho" else "FAIL 2 REJECT"
  print(verdict)
}

Test our function to make sure it is performing as intended -

myp(.01, .05) # p is less than alpha
## [1] "REJECT Ho"
myp(.1,  .05) # p is greater than alpha
## [1] "FAIL 2 REJECT"

Now, let's write a slightly more complex function (it takes many arguments) that shades the standard normal distribution, by default for a 5% two-sided hypothesis test, and can be adapted for other purposes too. You can change the arguments mu, sig, pcts, color,…

II. Function for Shading Normal

# Draw a normal density curve and shade selected regions under it.
#
# With the defaults, the two tails of a 5% two-sided test on the standard
# normal distribution are shaded.
#
# Arguments:
#   below, above  x values bounding the lower / upper shaded tail. When NULL
#                 (and `between` is NULL) they default to the `pcts[1]` /
#                 `pcts[2]` quantiles of N(mu, sig); when supplied in that
#                 case, only their first element is used.
#   pcts          Lower and upper tail probabilities. If only one value is
#                 given, the upper cut-off is NA and no upper tail is shaded.
#   mu, sig       Mean and standard deviation of the distribution. The curve
#                 is drawn from mu - 4 * sig to mu + 4 * sig.
#   numpts        Number of grid points used to draw the curve.
#   color, dens   Passed to polygon() as `col` and `density`.
#   justabove     If TRUE, the lower tail is not shaded.
#   justbelow     If TRUE, the upper tail is not shaded.
#   lines         If TRUE, the curve is added to the current plot with
#                 lines(); otherwise a new plot is started.
#   between       If given, the region from min(between) to max(between) is
#                 shaded, and the default tail cut-offs are not computed.
#   outside       If given, overrides the cut-offs with min(outside) and
#                 max(outside).
#
# Returns NULL invisibly; the function is called for its plotting side effect.
shadenorm <- function(below = NULL, above = NULL, pcts = c(0.025, 0.975),
                      mu = 0, sig = 1, numpts = 500, color = "gray",
                      dens = 40, justabove = FALSE, justbelow = FALSE,
                      lines = FALSE, between = NULL, outside = NULL) {
  # Resolve the tail cut-offs (plain if/else: these are scalars).
  if (is.null(between)) {
    below <- if (is.null(below)) qnorm(pcts[1], mu, sig) else below[1]
    above <- if (is.null(above)) qnorm(pcts[2], mu, sig) else above[1]
  }
  if (!is.null(outside)) {
    below <- min(outside)
    above <- max(outside)
  }

  # Grid of x values and the density evaluated on it.
  x_grid <- seq(mu - 4 * sig, mu + 4 * sig, length.out = numpts)
  dens_all <- dnorm(x_grid, mean = mu, sd = sig)

  if (lines) {
    # `lines` the argument shadows lines() the function, so be explicit.
    graphics::lines(x_grid, dens_all)
  } else {
    plot(x_grid, dens_all, type = "l", xlab = "X", ylab = "Density")
  }

  # Shade between the x axis and the curve over the grid points selected by
  # `keep`. which() drops NA and zero-length selections, so a missing cut-off
  # (NULL or NA) simply shades nothing.
  shade_region <- function(keep) {
    idx <- which(keep)
    if (length(idx) > 0) {
      xs <- x_grid[idx]
      polygon(c(xs, rev(xs)),
              c(rep(0, length(xs)), rev(dens_all[idx])),
              col = color, density = dens)
    }
  }

  if (!justabove) {
    shade_region(x_grid < below)
  }
  if (!justbelow) {
    shade_region(x_grid > above)
  }
  if (!is.null(between)) {
    shade_region(x_grid > min(between) & x_grid < max(between))
  }

  invisible(NULL)
}

# TEST THE FUNCTION
shadenorm(mu = 0, sig = 1, pcts = c(0.025,0.975))

# shadenorm(mu = 20, sig = 6, pcts = c(0.025,0.975))

III. Function for Shading t

# Draw a Student t density curve and shade selected regions under it.
#
# With the defaults, the two tails of a 5% two-sided test are shaded.
#
# Arguments:
#   below, above  x values bounding the lower / upper shaded tail. When NULL
#                 (and `between` is NULL) they default to the `pcts[1]` /
#                 `pcts[2]` quantiles of the t distribution; when supplied in
#                 that case, only their first element is used.
#   pcts          Lower and upper tail probabilities. If only one value is
#                 given, the upper cut-off is NA and no upper tail is shaded.
#   df            Degrees of freedom of the t distribution.
#   numpts        Number of grid points used to draw the curve.
#   color, dens   Passed to polygon() as `col` and `density`.
#   justabove     If TRUE, the lower tail is not shaded.
#   justbelow     If TRUE, the upper tail is not shaded.
#   lines         If TRUE, the curve is added to the current plot with
#                 lines(); otherwise a new plot is started.
#   between       If given, the region from min(between) to max(between) is
#                 shaded, and the default tail cut-offs are not computed.
#   outside       If given, overrides the cut-offs with min(outside) and
#                 max(outside).
#
# The x axis is fixed to [-4, 4]; cut-offs beyond that range fall outside the
# grid, so nothing is shaded for them.
#
# Returns NULL invisibly; the function is called for its plotting side effect.
shadet <- function(below = NULL, above = NULL, pcts = c(0.025, 0.975),
                   df = 1, numpts = 500, color = "gray", dens = 40,
                   justabove = FALSE, justbelow = FALSE, lines = FALSE,
                   between = NULL, outside = NULL) {
  # Resolve the tail cut-offs (plain if/else: these are scalars).
  if (is.null(between)) {
    below <- if (is.null(below)) qt(pcts[1], df) else below[1]
    above <- if (is.null(above)) qt(pcts[2], df) else above[1]
  }
  if (!is.null(outside)) {
    below <- min(outside)
    above <- max(outside)
  }

  # Grid of x values and the density evaluated on it.
  x_grid <- seq(-4, 4, length.out = numpts)
  dens_all <- dt(x_grid, df)

  if (lines) {
    # `lines` the argument shadows lines() the function, so be explicit.
    graphics::lines(x_grid, dens_all)
  } else {
    plot(x_grid, dens_all, type = "l", xlab = "X", ylab = "Density")
  }

  # Shade between the x axis and the curve over the grid points selected by
  # `keep`. which() drops NA and zero-length selections, so a missing cut-off
  # (NULL or NA) simply shades nothing.
  shade_region <- function(keep) {
    idx <- which(keep)
    if (length(idx) > 0) {
      xs <- x_grid[idx]
      polygon(c(xs, rev(xs)),
              c(rep(0, length(xs)), rev(dens_all[idx])),
              col = color, density = dens)
    }
  }

  if (!justabove) {
    shade_region(x_grid < below)
  }
  if (!justbelow) {
    shade_region(x_grid > above)
  }
  if (!is.null(between)) {
    shade_region(x_grid > min(between) & x_grid < max(between))
  }

  invisible(NULL)
}

# TEST THE FUNCTION
shadet(df = 4, pcts = c(0.025,0.975))     # see the area under the tails are further away from the mean 0..

# shadet(df = 120, pcts = c(0.025,0.975))   # t dist converges to normal when we have high degrees o freedom..

IV. Function for Shading Chi Square

# Draw a chi-square density curve and shade selected regions under it.
#
# With the defaults, the lower 2.5% and upper 2.5% tails are shaded.
#
# Arguments:
#   below, above  x values bounding the lower / upper shaded tail. When NULL
#                 (and `between` is NULL) they default to the `pcts[1]` /
#                 `pcts[2]` quantiles of the chi-square distribution; when
#                 supplied in that case, only their first element is used.
#   pcts          Lower and upper tail probabilities. If only one value is
#                 given, the upper cut-off is NA and no upper tail is shaded.
#   df            Degrees of freedom of the chi-square distribution.
#   numpts        Number of grid points used to draw the curve.
#   color, dens   Passed to polygon() as `col` and `density`.
#   justabove     If TRUE, the lower tail is not shaded.
#   justbelow     If TRUE, the upper tail is not shaded.
#   lines         If TRUE, the curve is added to the current plot with
#                 lines(); otherwise a new plot is started.
#   between       If given, the region from min(between) to max(between) is
#                 shaded, and the default tail cut-offs are not computed.
#   outside       If given, overrides the cut-offs with min(outside) and
#                 max(outside).
#
# The curve is drawn from 0 to the 99th percentile of the distribution.
#
# Returns NULL invisibly; the function is called for its plotting side effect.
shadechi <- function(below = NULL, above = NULL, pcts = c(0.025, 0.975),
                     df = 1, numpts = 500, color = "gray", dens = 40,
                     justabove = FALSE, justbelow = FALSE, lines = FALSE,
                     between = NULL, outside = NULL) {
  # Resolve the tail cut-offs (plain if/else: these are scalars).
  if (is.null(between)) {
    below <- if (is.null(below)) qchisq(pcts[1], df) else below[1]
    above <- if (is.null(above)) qchisq(pcts[2], df) else above[1]
  }
  if (!is.null(outside)) {
    below <- min(outside)
    above <- max(outside)
  }

  # Grid of x values and the density evaluated on it.
  x_grid <- seq(0, qchisq(.99, df), length.out = numpts)
  dens_all <- dchisq(x_grid, df)

  if (lines) {
    # `lines` the argument shadows lines() the function, so be explicit.
    graphics::lines(x_grid, dens_all)
  } else {
    plot(x_grid, dens_all, type = "l", xlab = "X", ylab = "Density")
  }

  # Shade between the x axis and the curve over the grid points selected by
  # `keep`. which() drops NA and zero-length selections, so a missing cut-off
  # (NULL or NA) simply shades nothing.
  shade_region <- function(keep) {
    idx <- which(keep)
    if (length(idx) > 0) {
      xs <- x_grid[idx]
      polygon(c(xs, rev(xs)),
              c(rep(0, length(xs)), rev(dens_all[idx])),
              col = color, density = dens)
    }
  }

  if (!justabove) {
    shade_region(x_grid < below)
  }
  if (!justbelow) {
    shade_region(x_grid > above)
  }
  if (!is.null(between)) {
    shade_region(x_grid > min(between) & x_grid < max(between))
  }

  invisible(NULL)
}

# TEST THE FUNCTION
shadechi(df = 2, pcts=c(.05))   # change pcts and see what happen

shadechi(df = 18, pcts=c(.05))  # change df and see what happens  

Question 1

Using traditional methods, it takes 109 hours to receive a basic driving license. A new license training method using Computer Aided Instruction (CAI) has been proposed. A researcher used the technique with 190 students and observed that they had a mean of 110 hours. Assume the standard deviation is known to be 6. A level of significance of 0.05 will be used to determine if the technique performs differently than the traditional method. Make a decision to reject or fail to reject the null hypothesis. Show all work in R.

Given: \(\mu= 109, n= 190, \bar{x}= 110, \sigma (population)= 6, \alpha= .05\).

To Do: Determine if the technique performs differently than the traditional method. Burden of proof falls on alternative hypothesis -

i. Null and Alternative Hypothesis

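Ho: \(\mu = 109\) (hypotheses are statements about the true mean \(\mu\) of the new method, not about the sample mean \(\bar{x}\))

Ha: \(\mu \neq 109\)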

Ho: There is no difference in mean time to obtain a basic driving license between methods.

Ha: There is a difference in the mean time to obtain a basic driving license between methods.

Two sided test (look at alternative hypothesis). Two sided because we are testing to see if the methods are different. There is no language about less than or more than.

ii. Choose level of significance

\(\alpha = .05\)

iii. Test Statistic

Distribution: Z (known SD), large sample size

iv. Decision Rule

Method 1: Absolute Value of Test Stat vs Critical Value – if \(test stat > critical value\) … REJECT THE NULL

Method 2: P-value vs alpha – if \(alpha > p-value\) … REJECT THE NULL

Method 3: Confidence Interval … if CI does not contain hypothesized value… REJECT THE NULL

v. Take sample and decide

#Compute the z-statistic

z_1 <- (110 - 109) / (6 / sqrt(190))
z_1
## [1] 2.297341
#Compute the p-value

#Multiply by 2 for a 2-sided test. The p-value is the probability, assuming the null is true, of observing a sample mean time to get a driver's license as or more extreme than the one our sample suggests.

# Two-sided p-value: twice the tail area beyond |z|. Using -abs(z_1) keeps
# the result valid whichever side of the null mean the sample mean falls on.
p_value_1 <- 2 * pnorm(q    = -abs(z_1),
                       mean = 0,
                       sd   = 1)

p_value_1
## [1] 0.0215993
alpha_1 <- .05    #given information
#plot it out using the pre-defined functions

shadenorm( mu = 109, sig = 6/sqrt(190), pcts = c(.025, .975), color = 'red' )
lines(x = rep(110,10), y = seq(0,1,length.out = 10), col = 'blue')

#Make a Decision

#method 1 - test stat vs. critical value
test_stat_1 <- z_1
critical_value_1 <- qnorm(p = .975, mean = 0, sd = 1)
abs(test_stat_1)  > abs(critical_value_1)
## [1] TRUE
#Use pre-defined function 'myp' (method 2)
myp(p = p_value_1, alpha = alpha_1)
## [1] "REJECT Ho"
#method 3: create a confidence interval
se_1 <- 6 / sqrt(190)
se_1
## [1] 0.4352858
CI1_upper_bound <- 110 + abs(critical_value_1 * se_1)
CI1_lower_bound <- 110 - abs(critical_value_1 * se_1)

CI1 <- c(CI1_lower_bound,CI1_upper_bound)
CI1
## [1] 109.1469 110.8531

Since the p-value (.0216) is less than alpha (.05), we reject the null hypothesis. The absolute value of the test statistic is also greater than the absolute value of the critical value, which is likewise a basis to reject the null.

Finally, our 95% confidence interval is predicting the true mean time of the new method to be in the interval (109.1469, 110.8531). This interval does not contain the hypothesized status quo mean time of 109.0 hours. Another method that tells us to reject the null.

In other words, if the null were true (a true mean of 109 hours), a sample mean as extreme as the one we observed (110 hours) would be unlikely: its probability (p ≈ .022) is below the 5% significance level.

Question 2

Our environment is very sensitive to the amount of ozone in the upper atmosphere. The level of ozone normally found is 5.3 parts/million (ppm). A researcher believes that the current ozone level is at an insufficient level. The mean of 5 samples is 5.0 parts per million (ppm) with a standard deviation of 1.1. Does the data support the claim at the 0.05 level? Assume the population distribution is approximately normal.

Given: \(\mu= 5.3 ppm, n= 5, \bar{x}= 5.0ppm, s (sample)= 1.1ppm, \alpha= .05\).

To Do: Researcher believes that the current ozone level is at an insufficient level - does the data support the claim at the 0.05 level ?

i. Null and Alternative Hypothesis

Ho: \(\mu \ge 5.3\)

Ha: \(\mu < 5.3\)

Ho: Ozone levels are sufficient (there is enough) - current levels are greater than or equal to \(\mu\) (5.3 ppm).

Ha: Ozone levels are insufficient (lower levels than normally expected) - current levels are less than \(\mu\) (5.3 ppm)

This is going to be a one-tailed (left) test. We are testing for insufficient levels, meaning testing to see if there is enough ozone… not too much. We are only looking at the left-hand side.

ii. Choose level of significance

\(\alpha = .05\)

iii. Test Statistic

Distribution: Student-T distribution with 4 degrees of freedom. This is because the population standard deviation is not known, we only have a standard deviation of the sample, and we have a small number of samples (less than 30).

iv. Decision Rule

Method 1: Absolute Value of Test Stat vs Critical Value – if \(test stat > critical value\) … REJECT THE NULL

Method 2: P-value vs alpha – if \(alpha > p-value\) … REJECT THE NULL

Method 3: Confidence Interval … if CI does not contain hypothesized value… REJECT THE NULL

v. Take sample and decide

#Compute the T-Statistic

alpha_2 <- .05

df_2 <- 5-1     #degrees of freedom

t_2 <- (5.0 - 5.3) / (1.1 / sqrt(5))  #T-stat formula
t_2
## [1] -0.6098367
#Compute the p-value
#Unlike problem 1, we do not multiply by 2 because this is a one-tailed (left) test. The p-value is the probability, assuming the null is true, of getting a t statistic as low as or lower than the one we observed.

p_value_2 <- pt(q    = t_2,
                df   = df_2)

p_value_2
## [1] 0.2874568
#Compute a Critical Value

critical_value_2 <- qt(p  = .05,
                       df = 4)

critical_value_2
## [1] -2.131847
#Plot it out

shadet(pcts = .05, df = 4, color = 'red' )
lines(x = rep(t_2,10), y = seq(0,1,length.out = 10), col = 'blue')

MAKE A DECISION

#Method 1
abs(t_2) > abs(critical_value_2)
## [1] FALSE
#Method 2
myp(p = p_value_2, alpha = alpha_2)
## [1] "FAIL 2 REJECT"
#Method 3
se_2 <- 1.1 / sqrt(5)
se_2
## [1] 0.491935
CI2_upper_bound <- 5.0 + abs(critical_value_2 * se_2)
CI2_lower_bound <- 5.0 - abs(critical_value_2 * se_2)

CI2 <- c(CI2_lower_bound, CI2_upper_bound)
CI2
## [1] 3.95127 6.04873

Since the absolute value of the test statistic is less than the absolute value of the critical value, we FAIL TO REJECT THE NULL. This is confirmed by method 2, where the p-value is greater than the significance (alpha level)… so again we will FAIL TO REJECT THE NULL.

The confidence interval is (3.95, 6.05). Because it was built with the one-tailed 5% critical value (t = 2.132), it is a 90% two-sided interval (equivalently, 6.05 is a 95% upper confidence bound for the true mean). The null value of 5.3 ppm is contained in the interval, therefore we FAIL TO REJECT THE NULL.

In this case we do not have enough evidence at the .05 alpha level to say that there is insufficient ozone in the atmosphere.

Question 3

Our environment is very sensitive to the amount of ozone in the upper atmosphere. The level of ozone normally found is 7.3 parts/million (ppm). A researcher believes that the current ozone level is not at a normal level. The mean of 51 samples is 7.1 ppm with a variance of 0.49. Assume the population is normally distributed. A level of significance of 0.01 will be used. Show all work and hypothesis testing steps.

Given: \(\mu= 7.3 ppm, n= 51, \bar{x}= 7.1ppm, s (sample)= \sqrt{0.49} = .7ppm, \alpha= .01\).

To Do: Researcher believes that the current ozone level is not at normal level. Thus, set a double sided hypothesis. This is because we will have to test both greater and less than the normal level.

i. Null and Alternative Hypothesis

Ho: \(\mu = 7.3\)

Ha: \(\mu \neq 7.3\)

Ho: Ozone in the atmosphere is at a normal level.

Ha: Ozone in the atmosphere is not at a normal level.

ii. Choose level of significance

\(\alpha = .01\)

iii. Test Statistic

Distribution: t (unknown SD)

iv. Decision Rule

Method 1: Absolute Value of Test Stat vs Critical Value – if \(test stat > critical value\) … REJECT THE NULL

Method 2: P-value vs alpha – if \(alpha > p-value\) … REJECT THE NULL

Method 3: Confidence Interval … if CI does not contain hypothesized value… REJECT THE NULL

v. Take sample and decide

#compute the test-statistic

alpha_3 <- .01

df_3 <- 51-1     #degrees of freedom

t_3 <- (7.1 - 7.3) / ( .7/ sqrt(51))  #T-stat formula
t_3
## [1] -2.040408
#Compute p-value

# Two-sided p-value: twice the tail area beyond |t|. Using -abs(t_3) keeps
# the result valid for either sign of the statistic, and the degrees of
# freedom come from df_3 instead of a repeated literal.
p_value_3 <- 2 * pt(q  = -abs(t_3),
                    df = df_3)

p_value_3
## [1] 0.04660827
#Compute the Critical Value

critical_value_3 <- qt(p  = .005,
                       df = 50)

critical_value_3
## [1] -2.677793
#plot
shadet(pcts = c(.005, .995), df = 50, color = 'red' )
lines(x = rep(t_3,10), y = seq(0,1,length.out = 10), col = 'blue')

MAKE A DECISION

#Method 1

abs(t_3) > abs(critical_value_3)
## [1] FALSE
#Method 2
myp(p = p_value_3, alpha = .01)
## [1] "FAIL 2 REJECT"
#Method 3

se_3 <- .7 / sqrt(51)
se_3
## [1] 0.09801961
CI3_upper_bound <- 7.1 + abs(critical_value_3 * se_3)
CI3_lower_bound <- 7.1 - abs(critical_value_3 * se_3)

CI3 <- c(CI3_lower_bound, CI3_upper_bound)
CI3
## [1] 6.837524 7.362476

FAIL TO REJECT THE NULL.

Method 1: The absolute value of the critical value is larger than the abs of the test statistic, meaning we FAIL TO REJECT THE NULL.

Method 2: The p-value is larger than the alpha level, so we FAIL TO REJECT THE NULL

Method 3: The hypothesized mean of 7.3 is contained in the 99% confidence interval of (6.83 - 7.36). This means we are 99% confident that the true mean lies in that range, which contains the hypothesized mean… FAIL TO REJECT THE NULL.

In this context, failing to reject the null means at the .01 alpha level (99% confidence level) there is not enough statistical evidence to reject the null and say ozone levels are not normal based on the data we have.

Question 4 (See Open Stats Textbook - Chapter 5 Section 5.2: Confidence intervals for a proportion)

A publisher reports that 36% of their readers own a laptop. A marketing executive wants to test the claim that the percentage is actually less than the reported percentage. A random sample of 100 found that 29% of the readers owned a laptop. Is there sufficient evidence at the 0.02 level to support the executive’s claim? Show all work and hypothesis testing steps.

Given: \(\pi= .36 , n = 100 , \hat{p}= .29 ,\alpha= .02\)

To Do: Executive wants to test the claim that the percentage is actually less than the reported percentage. Thus, set a single sided hypothesis.

i. Null and Alternative Hypothesis

Ho: \(\pi \ge .36\)

Ha: \(\pi < .36\)

Ho: Mean proportion of readers that own a laptop is greater than or equal to .36 (pi)

Ha: Mean proportion of readers that own a laptop is less than .36 (pi)

This will be a single sided test. We are only testing to see if the true proportion is lower than the hypothesized value.

ii. Choose level of significance

\(\alpha = .02\)

iii. Test Statistic

Distribution: Z (proportion) - since we have a large enough sample size (100) we can assume standard normal distribution. Usually after n = 30 the T-distribution becomes largely similar to the Z distribution.

We will perform both tests here and get the same result for showing purposes.

iv. Decision Rule

Method 1: Absolute Value of Test Stat vs Critical Value – if \(test stat > critical value\) … REJECT THE NULL

Method 2: P-value vs alpha – if \(alpha > p-value\) … REJECT THE NULL

Method 3: Confidence Interval … if CI does not contain hypothesized value… REJECT THE NULL

v. Take sample and decide

#Use the Z-Dist first

#Calculate the Test Stat
alpha_4 <- .02
p_4 <- .36

p_bar_4 <- .29

se_4_z <- sqrt((.36 * .64) / (100))

z_4 <- (.29 - .36) / (se_4_z)

z_4
## [1] -1.458333
#Calc Critical Value

critical_value_4_z <- qnorm(p = .02, mean = 0, sd = 1)

critical_value_4_z
## [1] -2.053749
#compute the p-value

p_value_4_z <- pnorm(q = z_4, mean = 0, sd = 1)

p_value_4_z
## [1] 0.07237434
#Compute the values for the T Distribution

se_4_t  <- sqrt((.29 * .71) / (100))

t_4 <- (.29 - .36) / (se_4_t)

critical_value_4_t <- qt(p = .02, df = 99)

p_value_4_t <- pt(q = t_4, df = 99)
#compare the values from the two distributions to show they are very similar due to the large sample size. Listed first is the z and then the t

z_4
## [1] -1.458333
p_value_4_z
## [1] 0.07237434
critical_value_4_z
## [1] -2.053749
t_4
## [1] -1.542659
p_value_4_t
## [1] 0.06305207
critical_value_4_t
## [1] -2.081162
#plot it out

par(mfrow = c(1,2))

shadenorm( mu = .36, sig = se_4_z, pcts = .02, color = 'red' )
lines(x = rep(.29,10), y = seq(0,1,length.out = 10), col = 'blue')

shadet(pcts = .02, df = 99, color = 'red' )
lines(x = rep(t_4,10), y = seq(0,1,length.out = 10), col = 'blue')

#Method 1

abs(z_4) > abs(critical_value_4_z)
## [1] FALSE
abs(t_4) > abs(critical_value_4_t)
## [1] FALSE
#Method 2
myp(p = p_value_4_z, alpha = .02)
## [1] "FAIL 2 REJECT"
myp(p = p_value_4_t, alpha = .02)
## [1] "FAIL 2 REJECT"
#Method 3


CI4_upper_bound <- .29 + abs(critical_value_4_z * se_4_z)
CI4_lower_bound <- .29 - abs(critical_value_4_z * se_4_z)

CI4 <- c(CI4_lower_bound, CI4_upper_bound)
CI4
## [1] 0.1914201 0.3885799

We FAIL TO REJECT THE NULL.

Method 1: The absolute value of the test-stat (in both cases) is less than the absolute value of the critical value. This is basis to FAIL TO REJECT THE NULL.

Method 2: The p-value is greater than the significance value, basis to FAIL TO REJECT THE NULL. This is saying, under null is true conditions the probability of getting a proportion that we got in our sample (.29) is greater than the alpha value.

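Method 3: The confidence interval for the true population proportion is (.191, .389). Because it was built with the one-tailed 2% critical value (z = 2.054), it is a 96% two-sided interval (equivalently, .389 is a 98% upper confidence bound). The null value (.36) is included in this interval, therefore we FAIL TO REJECT THE NULL.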

Question 5

A hospital director is told that 31% of the treated patients are uninsured. The director wants to test the claim that the percentage of uninsured patients is less than the expected percentage. A sample of 380 patients found that 95 were uninsured. Make the decision to reject or fail to reject the null hypothesis at the 0.05 level. Show all work and hypothesis testing steps.

To Do: Researcher believes that the percentage of uninsured patients is less than the expected percentage.

\(\hat{p} = .25 , \pi = .31, n = 380, \alpha = .05\)

i. Null and Alternative Hypothesis

Ho: \(\pi \ge .31\)

Ha: \(\pi < .31\)

Ho: The true proportion of treated patients who are uninsured is equal to or greater than .31.

Ha: The true proportion of treated patients who are uninsured is less than .31.

This will be a single sided test. We are only testing to see if the true proportion is lower than the hypothesized value.

ii. Choose level of significance

\(\alpha = .05\)

iii. Test Statistic

Distribution: Z (proportion) - since we have a large enough sample size (380) we can assume standard normal distribution. Usually after n = 30 the T-distribution becomes largely similar to the Z distribution.

iv. Decision Rule

Method 1: Absolute Value of Test Stat vs Critical Value – if \(test stat > critical value\) … REJECT THE NULL

Method 2: P-value vs alpha – if \(alpha > p-value\) … REJECT THE NULL

Method 3: Confidence Interval … if CI does not contain hypothesized value… REJECT THE NULL

v. Take sample and decide

#Compute the z-score

p_5 <- .31

p_hat5 <- 95/380

alpha5 <- .05

n_5 <- 380

se5 <- sqrt(p_5 * (1 - p_5) / n_5)

z5 <- (p_hat5 - p_5) / (se5)

z5
## [1] -2.528935
#Calculate the Critical Value

critical_value_5 <- qnorm(p = .05, mean = 0, sd = 1)

critical_value_5
## [1] -1.644854
#calculate the p-value

p_value_5 <- pnorm(q = z5, mean = 0, sd = 1)
p_value_5
## [1] 0.005720462
#plot 

shadenorm( mu = .31, sig = se5, pcts = .05, color = 'red' )
lines(x = rep(.25,10), y = seq(0,1,length.out = 10), col = 'blue')

MAKE A DECISION

#method 1
abs(z5) > abs(critical_value_5)
## [1] TRUE
#Method 2
myp(p = p_value_5, alpha = .05)
## [1] "REJECT Ho"
#Method 3: Confidence Interval
CI5_upper_bound <- p_hat5 + abs(critical_value_5 * se5)
CI5_lower_bound <- p_hat5 - abs(critical_value_5 * se5)

CI5_upper_bound
## [1] 0.2890248
CI5_lower_bound
## [1] 0.2109752

We REJECT THE NULL. All methods point to this decision. The absolute value of the z-score (test stat) is greater than the absolute value of the critical value. The p-value is also less than alpha, meaning that, assuming the null is true, a sample proportion as low as ours is less probable than the alpha level. Lastly, the confidence interval (.211, .289) does not contain the null hypothesized value for the population proportion (.31); because it was built with the one-tailed 5% critical value, it is a 90% two-sided interval (equivalently, .289 is a 95% upper confidence bound).

In this context, we REJECT THE NULL that the proportion of treated patients who are uninsured is greater than or equal to .31.

Question 6

A standardized test is given to a sixth-grade class. Historically, the mean score has been 112 with a standard deviation of 24. The superintendent believes that the standard deviation performance may have recently decreased. She randomly sampled 22 students and found a mean of 102 with a standard deviation of 15.4387. Is there evidence that the standard deviation has decreased at the .10 significance level?

Given: \(n = 22, \sigma = 24, s = 15.4387\)

Null and Alternate Hypothesis

Ho: \(\sigma \ge 24\)

Ha: \(\sigma < 24\)

where \(\sigma\) is the true standard deviation of scores and 24 is its historical value

Ho: The standard deviation of scores on the standardized test is at least 24.

Ha: The standard deviation of scores on the standardized test is less than 24.

This will be a single sided test. We are only testing to see if the true standard deviation is lower than the hypothesized value.

ii. Choose level of significance

\(\alpha = .10\)

iii. Test Statistic

Chi Squared. Used to test if the variance of a sample is significantly different from a historical variance (which is the squared standard deviation).

iv. Decision Rule

Method 1: Test Stat vs Critical Value – this is a lower-tailed chi-square test, so if \(test stat < critical value\) … REJECT THE NULL

Method 2: P-value vs alpha – if \(alpha > p-value\) … REJECT THE NULL

Method 3: Confidence Interval … if CI does not contain hypothesized value… REJECT THE NULL

V. Take sample and decide

#Given

n6 <- 22

sigma6 <- 24

#variance of population
var6 <- sigma6^2

s6 <- 15.4387
#variance of the sample
var_sample_6 <- s6^2

df6 <- n6 -1
#Compute the test stat
chi_square6 <- (df6 * var_sample_6) / (var6)
chi_square6
## [1] 8.68997
#compute the critical value

critical_value_6 <- qchisq(p = .10, df = df6)
critical_value_6
## [1] 13.2396
#compute the p_value

p_value_6 <- pchisq(q = chi_square6, df = df6, lower.tail = TRUE)
p_value_6
## [1] 0.008549436
#graph

shadechi(df = df6, pcts=c(.10)) 
lines(x = rep(chi_square6,10), y = seq(0,.01,length.out = 10), col = 'blue')

Make a Decision:

#Method 1
# Lower-tailed chi-square test: reject Ho when the test statistic falls
# below the critical value (the lower 10% quantile).
chi_square6 < critical_value_6
## [1] TRUE
#Method 2
myp(p = p_value_6, alpha = .10)
## [1] "REJECT Ho"

Question 7

A medical researcher wants to compare the pulse rates of smokers and non-smokers. He believes that the pulse rate for smokers and non-smokers is different and wants to test this claim at the 0.1 level of significance. The researcher checks 32 smokers and finds that they have a mean pulse rate of 87, and 31 non-smokers have a mean pulse rate of 84. The standard deviation of the pulse rates is found to be 9 for smokers and 10 for non-smokers. Let \(\mu_1\) be the true mean pulse rate for smokers and \(\mu_2\) be the true mean pulse rate for non-smokers. Show all work and hypothesis testing steps.

Let smoker group be indexed by 1, non-smoker group by 2.
Given: \(n_1 =32 , \bar{x}_1 = 87 , n_2 = 31, \bar{x}_2 = 84, s_1 = 9 , s_2 = 10 , \alpha = .10\)

To Do: Test if the pulse rate for smokers and non-smokers is different at the 0.1 level of significance. Thus, double sided test.

i. Null and Alternative Hypothesis

Ho: \(\mu_1 = \mu_2\) or \(\mu_1 - \mu_2 = 0\)

Ha: \(\mu_1 \neq \mu_2\) or \(\mu_1 - \mu_2 \neq 0\)

Ho: There is no difference between the mean pulse rate of smokers and non-smokers.

Ha: There is a difference between the mean pulse rate of smokers and non-smokers.

ii. Choose level of significance

\(\alpha = .10\)

iii. Test Statistic

Distribution: t. Sample size is relatively small and we do not know the population standard deviations.

iv. Decision Rule

myp()

v. Take sample and decide

# Ho: Mu1-mu2=0, Ha:  Mu1-Mu2<>0
mu7_1 <- 87
mu7_2 <- 84

alpha7  <-  .10

# dist = t, set up the problem
n7_1    <-   32
n7_2   <-   31

df7_1   <-   n7_1-1  #DOF
df7_2   <-   n7_2-1  #DOF

sd7_1  <- 9
sd7_2  <- 10

var7_1  <- sd7_1^2   #Variances
var7_2 <- sd7_2^2

  
num_point_estimate_diff_7 <- (mu7_1 - mu7_2 )  # point estimate difference 

Se_7 <- sqrt( var7_1/n7_1 + var7_2/n7_2 )  # Se formula - Standard Error using sample standard deviations rather than population standard deviations

t_7   <- num_point_estimate_diff_7 / Se_7

numdf_7 <- (var7_1/n7_1 + var7_2/n7_2)^2                       # Satterthwaite
dendf_7 <- (var7_1/n7_1)^2 / df7_1 + (var7_2/n7_2)^2 / df7_2   # Satterthwaite

df7 <- numdf_7 / dendf_7     # Satterthwaite - can be replaced with smaller of df1 or df2

shadet(df = df7, pcts = c(.05,.95))
lines(rep(t_7,10), seq(0,1,length.out=10),col='red')

p_value_7 = 2 * ( 1 - pt(q = t_7, df = df7))    # Satterthwaite         ## [1] 0.2160473
p_value_7
## [1] 0.2160473
myp(p_value_7,alpha7)
## [1] "FAIL 2 REJECT"
#p_value_robust <- 2 * ( 1 - pt(q = t_7, df = min(df7_1, df7_2))) # smaller of the numerator and denominator degree of freedom
#p_value_robust # a bit different p value, but the same end decision rule !!!   ## [1] 0.220848
#myp(p_value_robust,alpha7)
#T-stat
t_7   <- num_point_estimate_diff_7 / Se_7
#p-value
p_value_7 = 2 * ( 1 - pt(q = t_7, df = numdf_7/dendf_7))    # Satterthwaite         ## [1] 0.2160473
p_value_7
## [1] 0.2160473
myp(p_value_7,alpha7)
## [1] "FAIL 2 REJECT"
#Critical value:

critical_value_7 <- qt(p = .05, df7)
critical_value_7
## [1] -1.670703
abs(t_7) > abs(critical_value_7)
## [1] FALSE
#Confidence Interval

CI7_upper_bound <- num_point_estimate_diff_7 + abs(critical_value_7 * Se_7)
CI7_lower_bound <- num_point_estimate_diff_7 - abs(critical_value_7 * Se_7)

CI7_upper_bound
## [1] 7.008664
CI7_lower_bound
## [1] -1.008664

FAIL TO REJECT THE NULL.

As we can see, the absolute value of the test statistic is less than the absolute value of the critical value, meaning we fail to reject the null. The p-value is also larger than alpha, which also means we fail to reject the null. Finally, the hypothesized difference of 0 is contained in our 90% confidence interval for the difference of the means, which again means we fail to reject the null.

In this context, failing to reject the null means that there is not enough evidence to say there is a difference between smoker and non-smoker pulse rate at the 90% confidence level.

Question 8

Given two independent random samples with the following results: \(n_1 = 11, \bar{x}_1 = 127, \sigma_1= 33, n_2 = 18, \bar{x}_2 = 157, \sigma_2 = 27\)

Use this data to find the 95% confidence interval for the true difference between the population means. Assume that the population variances are not equal and that the two populations are normally distributed.

To Do: Create a 95% confidence interval for true difference between the population means.

i. Null and Alternative Hypothesis

Ho: \(\mu_1 - \mu_2 = 0\)

Ha: \(\mu_1 - \mu_2 \neq 0\)

ii. Choose level of significance

\(\alpha = .05\)

iii. Test Statistic

Distribution: t. We have small sample sizes and do not know the population standard deviations.

iv. Decision Rule

myp()

v. Take sample difference and construct 95% CI around it

#set up the problem

alpha8 <- .05

xbar8_1   <-   127  # mean
xbar8_2   <-   157  # mean

n8_1    <-   11    # sample size
n8_2    <-    18    # sample size

df8_1   <-   n8_1-1    # degrees of freedom
df8_2   <-   n8_2-1    # degrees of freedom

s8_1    <-    33    # sd
s8_2    <-    27    # sd

var8_1    <-   s8_1^2    # variance
var8_2    <-   s8_2^2    # variance

# Satterthwaite DF - can be replaced with smaller of df1 or df2
numdf_8 <-   ( var8_1 / n8_1   +   var8_2 / n8_2 )^2
dendf_8 <-   ( var8_1 / n8_1 )^2 / df8_1  +  (var8_2 / n8_2 )^2 / df8_2
df_8    <-   numdf_8 / dendf_8
df_8
## [1] 18.0759
delta_8   <- xbar8_1   -   xbar8_2               # point estimate difference 
delta_8
## [1] -30
t_8   <-  qt(p = .975, df = df_8)             # two sided hypothesis test at 5% level of significance, p = vector of probabilities

t_8
## [1] 2.10029
Se_8 <- sqrt( var8_1/n8_1 + var8_2/n8_2 )  # Se formula - Standard Error using sample standard deviations rather than population standard deviations
Se_8
## [1] 11.81101
interval8 <- c( delta_8 - t_8 * Se_8 , delta_8 + t_8 * Se_8 )
interval8
## [1] -54.806548  -5.193452

The 95% confidence interval is (-54.807, -5.193). This means that we are 95% confident the true difference between the population means is within that interval. Because the interval does not contain 0, this also gives us reason to reject the null that the means are the same (i.e. no difference between the means).

Question 9.

Two men, A and B, who usually commute to work together decide to conduct an experiment to see whether one route is faster than the other. The men feel that their driving habits are approximately the same, so each morning for two weeks one driver is assigned to route I and the other to route II. The times, recorded to the nearest minute, are shown in the following table. Using this data, find the 98% confidence interval for the true mean difference between the average travel time for route I and the average travel time for route II.

Let \(d1 =\) (route I travel time) − (route II travel time).

Assume that the populations of travel times are normally distributed for both routes. Show all work and hypothesis testing steps.

To Do: Find the 98% confidence interval for the true mean difference between the average travel time for route I and the average travel time for route II.

Notation: Let \(d1 =\) (route I travel time) − (route II travel time).

Small Sample -> T distribution (Standard normal with fat tails)

I. Set up Hypothesis

Ho: \(\mu = 0\) - Mean true difference between routes is 0, where \(\mu\) is the mean difference between routes.

Ha: \(\mu \neq 0\) - Mean true difference between routes is not 0, where \(\mu\) is the mean difference between routes.

r1 <- c(32, 27, 34, 24, 31, 25, 30, 23, 27, 35)   # route I times (minutes)
r2 <- c(28, 28, 33, 25, 26, 29, 33, 27, 25, 33)   # route II times (minutes)
delta_9 <- r1 - r2   #contains the differences of all the trip times

# xbar +/- t * Se
xbar_9  <- mean(delta_9) #average of the difference of times - point estimate
xbar_9
## [1] 0.1
n_9  <- length(delta_9)  #number of paired trips, taken from the data rather than hard-coded
df_9 <- n_9 - 1
t_9  <- qt(p = .99, df = df_9)        #98% two-sided interval leaves 1% in each tail
Se_9 <- sd(delta_9) / sqrt(n_9)       #standard error of the mean difference
#Construct confidence interval
interval_9 <- c(xbar_9 - t_9 * Se_9, xbar_9 + t_9 * Se_9)
interval_9
## [1] -2.766534  2.966534

The 98% confidence interval for the mean difference in times between routes 1 and 2 is (-2.766534, 2.966534). This means we are 98% confident the true average difference in times between routes lies somewhere between those 2 values (in minutes).

This also means we FAIL TO REJECT THE NULL: because a difference of 0 minutes is contained in our confidence interval, we cannot say at the .02 alpha level that there is a difference between the two routes.

You could solve this differently by performing a test on the difference between two population means. This would change the standard error formula to \(\sqrt{s_1^2 / n_1 + s_2^2 / n_2}\), and the point estimate would be the mean of route I minus the mean of route II. These are slightly different calculations: the point estimate is the same, but the standard error (and so the width of the interval) can differ because the pairing by day is ignored.

Question 10

The U.S. Census Bureau conducts annual surveys to obtain information on the percentage of the voting-age population that is registered to vote. Suppose that 391 employed persons and 510 unemployed persons are independently and randomly selected, and that 195 of the employed persons and 193 of the unemployed persons have registered to vote. Can we conclude that the percentage of employed workers (p1) who have registered to vote, exceeds the percentage of unemployed workers (p2) who have registered to vote? Use a significance level of 0.05 for the test. Show all work and hypothesis testing steps.

Q: Can we conclude that the percentage of employed workers (p1) who have registered to vote, exceeds the percentage of unemployed workers (p2) who have registered to vote?

Notation: Employed group is group 1 ; Unemployed group is group 2.

Given (sample proportions): \(p_1 = 195/391 \approx .499, \; p_2 = 193/510 \approx .378, \; \alpha = .05\)

i. Null and Alternative Hypothesis

Ho: \(\pi_1 - \pi_2 \le 0\)

Ha: \(\pi_1 - \pi_2 > 0\)

ii. Choose level of significance

\(\alpha = .05\)

iii. Test Statistic

Distribution: z. Large sample sizes will approximate standard normal dist.

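1-sided (upper-tail) test: we are testing whether the proportion of employed workers who have registered to vote is greater than the proportion of unemployed workers who have registered to vote.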

iv. Decision Rule

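Reject Ho if the p-value is less than \(\alpha\); otherwise fail to reject Ho. This is the rule implemented by myp().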

v. Take sample difference and construct 95% CI around it

# Ho: pi1 - pi2 <= 0, Ha: pi1 - pi2 > 0 (the percentage of employed workers p1
# who have registered to vote exceeds the percentage of unemployed workers p2
# who have registered to vote)
# alpha = .05
# Upper-tailed Z test

n1_10  <- 391   # employed persons sampled
n2_10  <- 510   # unemployed persons sampled

x1_10  <- 195   # employed persons registered to vote
x2_10  <- 193   # unemployed persons registered to vote

p1_10  <- x1_10 / n1_10   # sample proportion, employed
p2_10  <- x2_10 / n2_10   # sample proportion, unemployed
point_estimate_10 <- p1_10 - p2_10

# Unpooled standard error of the difference in sample proportions
# (textbook formula)
Se_10 <- sqrt((p1_10 * (1 - p1_10) / n1_10) + (p2_10 * (1 - p2_10) / n2_10))
Se_10
## [1] 0.03317529
# Alternative (kept for reference, not run): pooled standard error, which
# estimates a single common proportion from both samples combined.
#pbar_10 = (x1_10 + x2_10) / (n1_10 + n2_10)
#qbar_10 = 1 - pbar_10
#correction_10 = 1 / n1_10 + 1 / n2_10
#Se_robust_10 = sqrt( pbar_10 * qbar_10 * correction_10 )

# Pretty much the same
#Se_robust_10     # [1] 0.03328424
Se_10            # [1] 0.03317529
## [1] 0.03317529
Z_10  <- point_estimate_10 / Se_10                            # textbook formula
Z_10
## [1] 3.625887
# Plot a normal curve centred at 0 with sd = Se_10 and mark the observed
# difference in red. NOTE(review): the shaded region is presumably the upper 5%
# rejection region; shadenorm is defined earlier in the file, confirm there.
shadenorm(mu = 0, sig = Se_10, pcts = c(0.0, 0.95))
lines(rep(point_estimate_10, 20), seq(0, 20, length.out = 20), col = "red")

# Compute test stat, critical value, and p-value

Z_10
## [1] 3.625887
# One-sided cutoff: 5% of the standard normal lies above this value.
critical_value_10 <- qnorm(p = .95, mean = 0, sd = 1)
critical_value_10
## [1] 1.644854
# Upper-tail p-value; lower.tail = FALSE avoids the rounding loss of 1 - pnorm().
p_value_10 <- pnorm(q = Z_10, mean = 0, sd = 1, lower.tail = FALSE)
p_value_10
## [1] 0.0001439855
myp(p = p_value_10, alpha = .05)
## [1] "REJECT Ho"
# Upper-tailed test: reject only when Z lies above the critical value. Taking
# absolute values would also reject for a large negative Z, which is evidence
# in favour of Ho here.
Z_10 > critical_value_10
## [1] TRUE

# CI
# A two-sided 95% interval leaves 2.5% in each tail, so it needs the 0.975
# quantile (about 1.96). Reusing the one-sided test cutoff (about 1.645) would
# produce a 90% interval instead.
ci_critical_value_10 <- qnorm(p = .975, mean = 0, sd = 1)
CI10_upper_bound <- point_estimate_10 + ci_critical_value_10 * Se_10
CI10_lower_bound <- point_estimate_10 - ci_critical_value_10 * Se_10

CI10 <- c(CI10_lower_bound, CI10_upper_bound)
CI10
## [1] 0.05526748 0.18531223

We REJECT THE NULL. The test statistic is greater than the critical value. Also, the p-value is less than alpha, which is also a basis to reject the null.

The 95% confidence interval also does not contain 0 as a difference in proportions. This is also a basis to REJECT THE NULL that the proportion of employed people who are registered to vote is less than or equal to the proportion of unemployed people who are registered to vote. There is statistically significant evidence that a greater proportion of employed people than unemployed people are registered to vote.