Setting up our functions

# Start from a clean session before defining the helpers below.
# NOTE(review): rm(list = ls()) also deletes the caller's objects if this
# script is source()d into an existing session -- confirm that is intended.
rm(list = ls()) # Remove every object that ls() lists in the current environment
  gc()          # Run garbage collection; at top level its memory summary is printed (shown below)

##          used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 524301 28.1    1167757 62.4   660491 35.3
## Vcells 955822  7.3    8388608 64.0  1769514 13.6

cat("\f")       # Write a form-feed character to the console (used here to clear it)

Function to Reject or Not

# Report the decision of a hypothesis test.
#   p     : p-value of the test
#   alpha : significance level
# Prints 'REJECT Ho' when p is strictly below alpha and 'FAIL 2 REJECT'
# otherwise; the printed string is also returned invisibly.
myp <- function(p, alpha) {
  decision <- if (p < alpha) 'REJECT Ho' else 'FAIL 2 REJECT'
  print(decision)
}

# Check 1: a p-value below alpha must lead to rejection
myp(p = 0.01, alpha = 0.05)

## [1] "REJECT Ho"

# Check 2: a p-value above alpha must not
myp(p = 0.1, alpha = 0.05)

## [1] "FAIL 2 REJECT"

Function for Shading Normal

#' Plot a normal density and shade selected regions under the curve
#'
#' Draws (or overlays) the N(mu, sig) density on mu +/- 4 * sig and hatches
#' the lower tail, the upper tail and/or a central band.
#'
#' @param below Right edge of the shaded lower tail. When `between` is NULL
#'   and `below` is NULL, the `pcts[1]` quantile is used.
#' @param above Left edge of the shaded upper tail. When `between` is NULL
#'   and `above` is NULL, the `pcts[2]` quantile is used.
#' @param pcts Cumulative probabilities for the default cutoffs. With a
#'   length-1 vector the upper cutoff is missing, so only the lower tail is
#'   shaded.
#' @param mu,sig Mean and standard deviation of the distribution.
#' @param numpts Number of grid points used to draw the curve.
#' @param color,dens Passed to `polygon()` as `col` and `density`.
#' @param justabove If TRUE, the lower tail is not shaded.
#' @param justbelow If TRUE, the upper tail is not shaded.
#' @param lines If TRUE, add the curve to the current plot with `lines()`
#'   instead of opening a new plot.
#' @param between Values whose min and max bound a central band to shade.
#' @param outside Values whose min and max replace `below` and `above`.
#' @return `NULL`, invisibly; called for its plotting side effect.
shadenorm <- function(below = NULL, above = NULL, pcts = c(0.025, 0.975),
                      mu = 0, sig = 1, numpts = 500, color = "gray",
                      dens = 40, justabove = FALSE, justbelow = FALSE,
                      lines = FALSE, between = NULL, outside = NULL) {
  # Default cutoffs come from the requested quantiles. The condition is a
  # scalar, so plain if/else replaces ifelse(); as before, only the first
  # element of a user-supplied cutoff is used.
  if (is.null(between)) {
    below <- if (is.null(below)) qnorm(pcts[1], mu, sig) else below[1]
    above <- if (is.null(above)) qnorm(pcts[2], mu, sig) else above[1]
  }
  if (!is.null(outside)) {
    below <- min(outside)
    above <- max(outside)
  }

  # Curve is evaluated on mu +/- 4 standard deviations
  x.grid <- seq(mu - 4 * sig, mu + 4 * sig, length.out = numpts)
  dens.all <- dnorm(x.grid, mean = mu, sd = sig)

  if (lines) {
    lines(x.grid, dens.all)
  } else {
    plot(x.grid, dens.all, type = "l", xlab = "X", ylab = "Density")
  }

  # Hatch the area under the curve over the grid points selected by `mask`.
  # which() drops NA and zero-length masks, so a missing cutoff (NULL or NA)
  # is skipped explicitly instead of relying on how polygon() treats
  # NA/empty coordinates.
  shade <- function(mask) {
    idx <- which(mask)
    if (length(idx) == 0) {
      return(invisible(NULL))
    }
    xs <- x.grid[idx]
    polygon(c(xs, rev(xs)), c(rep(0, length(xs)), rev(dens.all[idx])),
            col = color, density = dens)
  }

  if (!justabove) shade(x.grid < below)
  if (!justbelow) shade(x.grid > above)
  if (!is.null(between)) {
    shade(x.grid > min(between) & x.grid < max(between))
  }
  invisible(NULL)
}

# Smoke test: standard normal with both 2.5% tails shaded
shadenorm(pcts = c(0.025, 0.975), mu = 0, sig = 1)

Function for Shading T

#' Plot a Student t density and shade selected regions under the curve
#'
#' Draws (or overlays) the t density with `df` degrees of freedom on [-4, 4]
#' and hatches the lower tail, the upper tail and/or a central band.
#'
#' @param below Right edge of the shaded lower tail. When `between` is NULL
#'   and `below` is NULL, the `pcts[1]` quantile is used.
#' @param above Left edge of the shaded upper tail. When `between` is NULL
#'   and `above` is NULL, the `pcts[2]` quantile is used.
#' @param pcts Cumulative probabilities for the default cutoffs. With a
#'   length-1 vector the upper cutoff is missing, so only the lower tail is
#'   shaded.
#' @param df Degrees of freedom.
#' @param numpts Number of grid points used to draw the curve.
#' @param color,dens Passed to `polygon()` as `col` and `density`.
#' @param justabove If TRUE, the lower tail is not shaded.
#' @param justbelow If TRUE, the upper tail is not shaded.
#' @param lines If TRUE, add the curve to the current plot with `lines()`
#'   instead of opening a new plot.
#' @param between Values whose min and max bound a central band to shade.
#' @param outside Values whose min and max replace `below` and `above`.
#' @return `NULL`, invisibly; called for its plotting side effect.
shadet <- function(below = NULL, above = NULL, pcts = c(0.025, 0.975),
                   df = 1, numpts = 500, color = "gray", dens = 40,
                   justabove = FALSE, justbelow = FALSE, lines = FALSE,
                   between = NULL, outside = NULL) {
  # Default cutoffs come from the requested quantiles. The condition is a
  # scalar, so plain if/else replaces ifelse(); as before, only the first
  # element of a user-supplied cutoff is used.
  if (is.null(between)) {
    below <- if (is.null(below)) qt(pcts[1], df) else below[1]
    above <- if (is.null(above)) qt(pcts[2], df) else above[1]
  }
  if (!is.null(outside)) {
    below <- min(outside)
    above <- max(outside)
  }

  # NOTE(review): the plotting window is fixed at [-4, 4]; cutoffs that fall
  # outside it (e.g. the 2.5% quantiles for df = 1 or 2) produce no shading.
  x.grid <- seq(-4, 4, length.out = numpts)
  dens.all <- dt(x.grid, df)

  if (lines) {
    lines(x.grid, dens.all)
  } else {
    plot(x.grid, dens.all, type = "l", xlab = "X", ylab = "Density")
  }

  # Hatch the area under the curve over the grid points selected by `mask`.
  # which() drops NA and zero-length masks, so a missing cutoff (NULL or NA)
  # is skipped explicitly instead of relying on how polygon() treats
  # NA/empty coordinates.
  shade <- function(mask) {
    idx <- which(mask)
    if (length(idx) == 0) {
      return(invisible(NULL))
    }
    xs <- x.grid[idx]
    polygon(c(xs, rev(xs)), c(rep(0, length(xs)), rev(dens.all[idx])),
            col = color, density = dens)
  }

  if (!justabove) shade(x.grid < below)
  if (!justbelow) shade(x.grid > above)
  if (!is.null(between)) {
    shade(x.grid > min(between) & x.grid < max(between))
  }
  invisible(NULL)
}

# Smoke test: with 4 df the shaded 2.5% tails start further from 0 than for the normal curve
shadet(pcts = c(0.025, 0.975), df = 4)

Function for Shading Chi Square

#' Plot a chi-square density and shade selected regions under the curve
#'
#' Draws (or overlays) the chi-square density with `df` degrees of freedom
#' from 0 to its 99th percentile and hatches the lower tail, the upper tail
#' and/or a central band.
#'
#' @param below Right edge of the shaded lower tail. When `between` is NULL
#'   and `below` is NULL, the `pcts[1]` quantile is used.
#' @param above Left edge of the shaded upper tail. When `between` is NULL
#'   and `above` is NULL, the `pcts[2]` quantile is used.
#' @param pcts Cumulative probabilities for the default cutoffs. With a
#'   length-1 vector the upper cutoff is missing, so only the lower tail is
#'   shaded.
#' @param df Degrees of freedom.
#' @param numpts Number of grid points used to draw the curve.
#' @param color,dens Passed to `polygon()` as `col` and `density`.
#' @param justabove If TRUE, the lower tail is not shaded.
#' @param justbelow If TRUE, the upper tail is not shaded.
#' @param lines If TRUE, add the curve to the current plot with `lines()`
#'   instead of opening a new plot.
#' @param between Values whose min and max bound a central band to shade.
#' @param outside Values whose min and max replace `below` and `above`.
#' @return `NULL`, invisibly; called for its plotting side effect.
shadechi <- function(below = NULL, above = NULL, pcts = c(0.025, 0.975),
                     df = 1, numpts = 500, color = "gray", dens = 40,
                     justabove = FALSE, justbelow = FALSE, lines = FALSE,
                     between = NULL, outside = NULL) {
  # Default cutoffs come from the requested quantiles. The condition is a
  # scalar, so plain if/else replaces ifelse(); as before, only the first
  # element of a user-supplied cutoff is used.
  if (is.null(between)) {
    below <- if (is.null(below)) qchisq(pcts[1], df) else below[1]
    above <- if (is.null(above)) qchisq(pcts[2], df) else above[1]
  }
  if (!is.null(outside)) {
    below <- min(outside)
    above <- max(outside)
  }

  # Curve is evaluated from 0 up to the 99th percentile of the distribution
  x.grid <- seq(0, qchisq(.99, df), length.out = numpts)
  dens.all <- dchisq(x.grid, df)

  if (lines) {
    lines(x.grid, dens.all)
  } else {
    plot(x.grid, dens.all, type = "l", xlab = "X", ylab = "Density")
  }

  # Hatch the area under the curve over the grid points selected by `mask`.
  # which() drops NA and zero-length masks, so a missing cutoff (NULL or NA)
  # is skipped explicitly instead of relying on how polygon() treats
  # NA/empty coordinates.
  shade <- function(mask) {
    idx <- which(mask)
    if (length(idx) == 0) {
      return(invisible(NULL))
    }
    xs <- x.grid[idx]
    polygon(c(xs, rev(xs)), c(rep(0, length(xs)), rev(dens.all[idx])),
            col = color, density = dens)
  }

  if (!justabove) shade(x.grid < below)
  if (!justbelow) shade(x.grid > above)
  if (!is.null(between)) {
    shade(x.grid > min(between) & x.grid < max(between))
  }
  invisible(NULL)
}

# Smoke test: 2 df with the lower 5% shaded (try other pcts values to see the region move)
shadechi(pcts = 0.05, df = 2)

1 Using traditional methods, it takes 109 hours to receive a basic driving license. A new license training method using Computer Aided Instruction (CAI) has been proposed. A researcher used the technique with 190 students and observed that they had a mean of 110 hours. Assume the standard deviation is known to be 6. A level of significance of 0.05 will be used to determine if the technique performs differently than the traditional method. Make a decision to reject or fail to reject the null hypothesis. Show all work in R.

Null and Alternative Hypothesis

Ho (Null): Our Null hypothesis is that the mean number of hours to obtain a driver’s license using the new training method using computer aided instruction (CAI) IS equal to the mean for the number of hours to obtain a driver’s license with the original traditional method.

Ha (Alternative): Our alternative hypothesis is that the mean number of hours to obtain a driver’s license using the new training method (CAI) is NOT equal to the mean number of hours to obtain a drivers license using the original traditional method.

Level of Significance: 0.05

Test Statistic: Z, since the population standard deviation is known; the test is two-sided.

# Setting our Parameters
n1 <- 190   # Sample size
pm1 <- 109  # Hypothesised population mean (traditional method, hours)
sm1 <- 110  # Sample mean (CAI method, hours)
sd1 <- 6    # Known population standard deviation
a1 <- 0.05  # Alpha

# Computing Z Score: distance of the sample mean from the hypothesised mean,
# measured in standard errors
Z1 <- (sm1 - pm1) / (sd1 / sqrt(n1))
Z1

## [1] 2.297341

# Computing P Value
# Two-sided test: double the upper-tail area beyond |Z|. Using abs() keeps the
# formula valid when the sample mean falls below the hypothesised mean.
p.value <- 2 * pnorm(q = abs(Z1), mean = 0, sd = 1, lower.tail = FALSE)
round(p.value, digits = 4)

## [1] 0.0216

As we can see here, this p-value is less than our alpha, but just to double-check, let's pass it to the rejection function we set up.

# Decision: compare the two-sided p-value with alpha
myp(p.value, a1)

## [1] "REJECT Ho"

As we can see above, we would reject our NULL hypothesis. This tells us there is sufficient evidence at the 0.05 level that the mean time under the CAI method differs from the traditional 109 hours.

# Sampling distribution of the mean under Ho; both 2.5% rejection tails are hatched in blue
shadenorm(pcts = c(0.025, 0.975), color = "blue", mu = 109, sig = 6 / sqrt(190))

# Vertical green marker at the observed sample mean
lines(x = rep(110, times = 10), y = seq(from = 0, to = 1, length.out = 10), col = "green")

As we can see from this graph, the highlighted line is in our region to reject as well.

2 Our environment is very sensitive to the amount of ozone in the upper atmosphere. The level of ozone normally found is 5.3 parts/million (ppm). A researcher believes that the current ozone level is at an insufficient level. The mean of 5 observations in a sample is 5.0 parts per million (ppm) with a standard deviation of 1.1. Does the data support the claim at the 0.05 level? Assume the population distribution is approximately normal.

Null and Alternative Hypothesis:

Ho (Null Hypothesis): The level of the Ozone is equal to 5.3 parts/million (ppm).

Ha (Alternative Hypothesis): The level of the Ozone is less than 5.3 parts/million (ppm), i.e. it is at an insufficient level.

Significance Level: 0.05

Test Statistic: T, since the population standard deviation is not known and there are fewer than 30 observations. This is also a one-sided test.

# Setting our Parameters
n2 <- 5      # Sample size
pm2 <- 5.3   # Hypothesised population mean (normal ozone level, ppm)
sm2 <- 5.0   # Sample mean (ppm)
sd2 <- 1.1   # Sample Standard Deviation
a2 <- 0.05   # Alpha

# Computing the test statistic. The population SD is unknown, so this is a
# t statistic with n2 - 1 degrees of freedom (the name Z2 is kept as-is).
Z2 <- (sm2 - pm2) / (sd2 / sqrt(n2))
Z2

## [1] -0.6098367

# Computing P Value
# Lower-tail area for the left-tailed alternative (mean below 5.3).
# The interactive `?pt` help lookup that used to sit here was removed: it only
# launched the help browser and played no part in the analysis.
P2 <- pt(q = Z2, df = n2 - 1, lower.tail = TRUE)

round(P2, digits = 4)

## [1] 0.2875

# Decision: compare the one-sided p-value with alpha
myp(P2, a2)

## [1] "FAIL 2 REJECT"

As seen above, at the 0.05 significance level our P-value of .2875 is larger than 0.05. To confirm, we can plug this into our rejection function, which confirms that we fail to reject our null hypothesis. This means we do not have sufficient evidence that the Ozone level is below 5.3 parts/million, so the data do not support the researcher's claim that the level is insufficient.

# Plotting This
# The test is left-tailed at alpha = 0.05, so only the lower 5% rejection
# region is shaded (the previous call shaded two-sided 2.5% gates).
shadet(df = n2 - 1,
       pcts = c(a2, 1 - a2),
       justbelow = TRUE)

# Mark the observed t statistic with a vertical green line
lines(x = rep(Z2, 10),
      y = seq(0, 1, length.out = 10),
      col = 'green')

3 Our environment is very sensitive to the amount of ozone in the upper atmosphere. The level of ozone normally found is 7.3 parts/million (ppm). A researcher believes that the current ozone level is not at a normal level. The mean of 51 observations in a sample is 7.1 ppm with a variance of 0.49. Assume the population is normally distributed. A level of significance of 0.01 will be used.

Null and Alternative Hypothesis:

Ho: The level of the Ozone is 7.3 parts/million (ppm)

Ha: The level of the Ozone is not at a normal level, i.e. it is not equal to 7.3 ppm

Significance Level: 0.01

Test Statistic: T distribution since we do not know the standard deviation.

# setting our Parameters
n3 <- 51     # Sample size
pm3 <- 7.3   # Hypothesised population mean (ppm)
sm3 <- 7.1   # Sample mean (ppm)
var3 <- 0.49 # Sample variance as given in the problem statement (was entered as 1.1)
a3 <- 0.01   # Alpha

# Computing the sample standard deviation from the variance
sd3 <- sqrt(var3)
sd3

## [1] 0.7

# Computing Standard Error
se3 <- sd3/sqrt(n3)
se3

## [1] 0.09801961

# Computing the test statistic (a t statistic with n3 - 1 df; the name z3 is kept)
z3 <- (sm3-pm3)/se3
z3

## [1] -2.040408

# Calculating P Value
# Two-sided test: double the lower-tail area at -|t| so the sign of the
# statistic does not matter.
p3 <- 2 * pt(q = -abs(z3),
             df = n3-1,
             lower.tail = TRUE)
round(p3, digits = 4)

## [1] 0.0466

# Decision: compare the two-sided p-value with alpha
myp(p3, a3)

## [1] "FAIL 2 REJECT"

We fail to reject our NULL hypothesis after doing our calculations: the two-sided P value computed above is greater than our significance level of 0.01. We therefore do not have sufficient evidence that the Ozone level differs from 7.3 ppm.

4 A publisher reports that 36% of their readers own a laptop. A marketing executive wants to test the claim that the percentage is actually less than the reported percentage. A random sample of 100 found that 29% of the readers owned a laptop. Is there sufficient evidence at the 0.02 level to support the executive’s claim? Show all work and hypothesis testing steps.

Null and Alternative Hypothesis

Ho: Our null hypothesis is 36% or more of readers own a laptop

Ha: Our alternative hypothesis is that less than 36% of readers own a laptop

Level of significance: 0.02

Test Statistic: Z distribution but need to find the standard deviation/error on our own

# Inputs for the one-proportion test
n4 <- 100      # Number of readers sampled
pm4 <- .36     # Proportion claimed by the publisher (value under Ho)
sm4 <- .29     # Proportion observed in the sample
a4 <- 0.02     # Significance level

# Standard error of the sample proportion, evaluated at the Ho proportion
se4 <- sqrt(pm4 * (1 - pm4) / n4)
se4

## [1] 0.048

# Z statistic: observed minus hypothesised proportion, in standard errors
z4 <- (sm4 - pm4) / se4
z4

## [1] -1.458333

# Left-tailed p-value (area of the standard normal below z4)
p4 <- pnorm(q = z4, mean = 0, sd = 1)
p4

## [1] 0.07237434

# Decision: compare the left-tailed p-value with alpha
myp(p4, a4)

## [1] "FAIL 2 REJECT"

We fail to reject our null hypothesis again. Our P value ended up being 0.072, which is clearly above our significance level of 0.02. There is therefore not sufficient evidence to support the executive's claim that fewer than 36% of the publisher's readers own a laptop.

# Sampling distribution of the proportion under Ho, with the lower 2% tail shaded
shadenorm(pcts = c(.02), color = 'lightblue', mu = .36, sig = se4)

# Vertical green marker at the observed sample proportion
lines(x = rep(.29, times = 10), y = seq(from = 0, to = 20, length.out = 10), col = 'green')

5 A hospital director is told that 31% of the treated patients are uninsured. The director wants to test the claim that the percentage of uninsured patients is less than the expected percentage. A sample of 380 patients found that 95 were uninsured. Make the decision to reject or fail to reject the null hypothesis at the 0.05 level. Show all work and hypothesis testing steps.

Null and Alternative Hypothesis

Ho: The null hypothesis is that 31% or more of patients are uninsured.

Ha: The alternative hypothesis is that less than 31% of patients are uninsured.

Level of Significance: 0.05

Test Statistic: Z distribution

# Inputs for the one-proportion test
n5 <- 380         # Number of patients sampled
pm5 <- .31        # Proportion the director was told (value under Ho)
amt5 <- 95        # Uninsured patients found in the sample
sm5 <- amt5 / n5  # Observed sample proportion
a5 <- 0.05        # Significance level

# Standard error of the sample proportion, evaluated at the Ho proportion
se5 <- sqrt(pm5 * (1 - pm5) / n5)
se5

## [1] 0.0237254

# Z statistic: observed minus hypothesised proportion, in standard errors
z5 <- (sm5 - pm5) / se5
z5

## [1] -2.528935

# Left-tailed p-value (area of the standard normal below z5)
p5 <- pnorm(q = z5, mean = 0, sd = 1)
p5

## [1] 0.005720462

# Decision: compare the left-tailed p-value with alpha
myp(p5, a5)

## [1] "REJECT Ho"

As we can see from the above, we can reject our NULL hypothesis, meaning that we can conclude that less than 31% of patients are uninsured. We confirmed this by plugging the p-value into our rejection function, which told us we can reject Ho. In this problem, we were not given the sample % but could easily calculate it from our sample statistics. I included this in the initial parameters for this question.

# Sampling distribution of the proportion under Ho, with the lower 5% tail shaded
shadenorm(pcts = c(.05), color = 'lightblue', mu = .31, sig = se5)

# Vertical green marker at the observed sample proportion
lines(x = rep(sm5, times = 10), y = seq(from = 0, to = 20, length.out = 10), col = 'green')

6 A standardized test is given to a sixth-grade class. Historically the mean score has been 112 with a standard deviation of 24. The superintendent believes that the standard deviation of performance may have recently decreased. She randomly sampled 22 students and found a mean of 102 with a standard deviation of 15.4387. Is there evidence that the standard deviation has decreased at the 𝛼 = 0.1 level? Show all work and hypothesis testing steps.

Null and Alternative Hypothesis

Ho: The null hypothesis is that standard deviation of test scores did not decrease from 24

Ha: The alternative hypothesis is that the standard deviation of test scores did decrease from 24

Level of significance: 0.1

Test Statistic: Chi-Squared

# Setting up Parameters
hist.mean <- 112     # Historical Mean
hist.sd <- 24        # Historical standard deviation (value under Ho)
n6 <- 22             # Sample size
s.mean <- 102        # Sample mean
s.std <- 15.4387     # Sample Standard Deviation
a6 <- 0.1            # Alpha

# Compute Test Statistic: (n - 1) * s^2 / sigma0^2, chi-square with n - 1 df under Ho
chi_square6 <- ((n6 - 1) * s.std^2) / hist.sd^2
chi_square6

## [1] 8.68997

# Determining Critical Value
# Ha says the standard deviation DECREASED, so the rejection region is the
# LOWER tail: the cutoff is the a6 quantile, not the (1 - a6) quantile.
critical_value6 <- qchisq(a6,
                          df = n6 - 1
                          )

round(critical_value6, digits = 2)

## [1] 13.24

# Decision: reject Ho when the statistic falls below the lower-tail cutoff
chi_square6 < critical_value6

## [1] TRUE

As we can see above, we calculated both the test statistic and a critical value. Because Ha says the standard deviation decreased, this is a left-tailed test: we reject Ho when the test statistic falls below the lower-tail critical value qchisq(0.10, df = 21), which is about 13.24. Our test statistic of 8.68997 is below that cutoff, so we REJECT our null hypothesis. There is sufficient evidence at the 0.1 level that the standard deviation of test scores has decreased from 24.

# Chi-square density with n6 - 1 df; the lower 10% tail is shaded
shadechi(pcts = c(.10), color = 'lightblue', df = n6 - 1)

7 A medical researcher wants to compare the pulse rates of smokers and non-smokers. He believes that the pulse rate for smokers and non-smokers is different and wants to test this claim at the 0.1 level of significance. The researcher checks 32 smokers and finds that they have a mean pulse rate of 87, and 31 non-smokers have a mean pulse rate of 84. The standard deviation of the pulse rates is found to be 9 for smokers and 10 for non-smokers. Let 𝜇1 be the true mean pulse rate for smokers and 𝜇2 be the true mean pulse rate for non- smokers. Show all work and hypothesis testing steps.

Null and Alternative Hypothesis

Ho: The pulse rate for smokers and non-smokers is not different

Ha: The pulse rate for smokers and non-smokers is different

Level of Significance: 0.1

Test Statistic: T

# Setting our Parameters

# Smokers
n.smoke <- 32     # Sample size, smokers
m.smoke <- 87     # Sample Mean Smokers
sd.smoke <- 9     # Sample Standard Deviation Smokers

# Non Smokers
n.nonsmoke <- 31  # Sample size, non-smokers
m.nonsmoke <- 84  # Sample Mean Non Smokers
sd.nonsmoke <- 10 # Sample standard Deviation non smokers

# Total
n7 <- n.smoke+n.nonsmoke   # Total Sample
a7 <- 0.1                  # Alpha

# Calculating Variance
var.smoke <- sd.smoke^2
var.nonsmoke <- sd.nonsmoke^2

# Standard Error of the difference in means (variances not pooled)
se7 <- sqrt((var.smoke/n.smoke)+(var.nonsmoke/n.nonsmoke))
se7

## [1] 2.399387

# Test statistic. NOTE: the name `t` masks base::t() for value lookups; it is
# kept unchanged here.
t <- (m.smoke-m.nonsmoke)/se7
t

## [1] 1.25032

# P Value
# Two-sided test: double the upper-tail area beyond |t| so the result does not
# depend on the sign of the difference. The conservative degrees of freedom
# are those of the SMALLER sample, taken with min() instead of being
# hard-wired to the non-smoker group.
p7 <- 2*pt(abs(t),
          df = min(n.smoke, n.nonsmoke)-1,
          lower.tail = FALSE)
p7

## [1] 0.220848

# Decision: compare the two-sided p-value with alpha
myp(p7, a7)

## [1] "FAIL 2 REJECT"

Per our calculations, we fail to reject our NULL hypothesis. At the 0.1 level there is not sufficient evidence that the mean pulse rate differs between smokers and non-smokers.

8 Given two independent random samples with the following results:

𝑛1 = 11 𝑥̅1 = 127 𝑠1 = 33 𝑛2 = 18 𝑥̅ 2 = 157 𝑠2 = 27

Use this data to find the 95% confidence interval for the true difference between the population means. Assume that the population variances are not equal and that the two populations are normally distributed.

Null and Alternative Hypothesis

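Ho: The two population means are equal (𝜇1 − 𝜇2 = 0)

Ha: The two population means are not equal (𝜇1 − 𝜇2 ≠ 0). Note that "the population variances are not equal" is an assumption of the problem, not a hypothesis; it is the reason the standard error below is computed without pooling the variances.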

Level of Significance: 0.05

Test Statistic: T distribution

# Setting Up Parameters (all listed in our question)

n8.1 <- 11      # Size of sample 1
xbar1 <- 127    # Mean of sample 1
sigma1 <- 33    # Standard deviation of sample 1 (a sample SD, despite the name)
n8.2 <- 18      # Size of sample 2
xbar2 <- 157    # Mean of sample 2
sigma2 <- 27    # Standard deviation of sample 2
alpha <- 0.05   # 1 - confidence level
var1 <- sigma1^2
var2 <- sigma2^2
df <- min(n8.1-1,n8.2-1)   # Conservative df: the smaller sample's n - 1

# Point estimate and unpooled standard error of the difference in means
mdiff <- xbar1 - xbar2
se8 <- sqrt((var1/n8.1)+(var2/n8.2))
tstat <- mdiff/se8

# Critical t value. The tail area is derived from `alpha` rather than being
# hard-coded, so changing the confidence level above changes the interval.
t8 <- qt(alpha / 2,
         df,
        lower.tail = FALSE
        )
margin.error <- t8 * se8

low.b <- mdiff - margin.error
up.b <- mdiff + margin.error

# CI
cat("95% Confidence interval:", round(low.b, 4), ",", round(up.b, 4))

## 95% Confidence interval: -56.3166 , -3.6834

We can see here that our 95% confidence interval is -56.31657,-3.683426

9 Two men, A and B, who usually commute to work together decide to conduct an experiment to see whether one route is faster than the other. The men feel that their driving habits are approximately the same, so each morning for two weeks one driver is assigned to route I and the other to route II. The times, recorded to the nearest minute, are shown in the following table.

Using this data, find the 98% confidence interval for the true mean difference between the average travel time for route I and the average travel time for route II. Let 𝑑 = (𝑟𝑜𝑢𝑡𝑒 𝐼 𝑡𝑟𝑎𝑣𝑒𝑙 𝑡𝑖𝑚𝑒) − (𝑟𝑜𝑢𝑡𝑒 𝐼𝐼 𝑡𝑟𝑎𝑣𝑒𝑙 𝑡𝑖𝑚𝑒). Assume that the populations of travel times are normally distributed for both routes. Show all work and hypothesis testing steps.

# Create data sets for each route (one observation per morning, same mornings)
route1 <- c(32, 27, 34, 24, 31, 25, 30, 23, 27, 35)
route2 <- c(28, 28, 33, 25, 26, 29, 33, 27, 25, 33)

# Setting up parameters
# The two routes are timed on the SAME mornings, so the observations are
# paired. The interval is therefore built from the per-morning differences
# d = route I - route II, not from two independent samples.
d9 <- route1 - route2
n9.1 <- length(route1)
n9.2 <- length(route2)
stopifnot(n9.1 == n9.2)    # pairing requires equally long vectors
xbar1.9 <- mean(route1)
xbar2.9 <- mean(route2)
df <- n9.1-1               # n - 1 degrees of freedom for n pairs
a9 <- 0.02                 # 1 - confidence level (98% interval)

# Mean difference and its standard error: sd of the differences / sqrt(n)
mdiff9 <- mean(d9)
se9 <- sd(d9)/sqrt(n9.1)

t9 <- mdiff9/se9
# Critical t value for a two-sided interval: upper a9 / 2 tail
tdf <- qt(p=a9/2,
          df,
          lower.tail=FALSE
          )

up.b9 <- mdiff9 + (tdf*se9)
low.b9 <- mdiff9 - (tdf*se9)
cat("(Lower: ",round(low.b9, 4),", Upper:",round(up.b9, 4),")")

## (Lower:  -2.7665 , Upper: 2.9665 )

10 The U.S. Census Bureau conducts annual surveys to obtain information on the percentage of the voting-age population that is registered to vote. Suppose that 391 employed persons and 510 unemployed persons are independently and randomly selected, and that 195 of the employed persons and 193 of the unemployed persons have registered to vote. Can we conclude that the percentage of employed workers ( 𝑝1 ), who have registered to vote, exceeds the percentage of unemployed workers ( 𝑝2 ), who have registered to vote? Use a significance level of 𝛼 = 0.05 for the test. Show all work and hypothesis testing steps.

Null and Alternative Hypothesis

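Ho: The Null hypothesis is that the percentage of employed workers who have registered to vote is less than or equal to the percentage of unemployed workers who have registered to vote (𝑝1 ≤ 𝑝2).

Ha: The alternate hypothesis is that the percentage of employed workers who have registered to vote is greater than the percentage of unemployed workers who have registered to vote (𝑝1 > 𝑝2).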

# Setting the Parameters
# 195 and 193 are COUNTS of registered voters, not sample means, so this is a
# comparison of two proportions (z test), not of two means with an SD of 1.
n1.10 <- 391         # Employed: sample size
x1.10 <- 195         # Employed: number registered to vote
n2.10 <- 510         # Unemployed: sample size
x2.10 <- 193         # Unemployed: number registered to vote
a.10 <- 0.05         # Alpha

# Sample proportions and their difference (p1 - p2)
phat1.10 <- x1.10/n1.10
phat2.10 <- x2.10/n2.10
mdiff.10 <- phat1.10 - phat2.10

# Two-sided 95% confidence interval for p1 - p2 (unpooled standard error)
se.ci.10 <- sqrt(phat1.10*(1-phat1.10)/n1.10 + phat2.10*(1-phat2.10)/n2.10)
zcrit.10 <- qnorm(1 - a.10/2)

up.b10<- mdiff.10 + (zcrit.10*se.ci.10)
low.b10<- mdiff.10 - (zcrit.10*se.ci.10)
cat("(Lower: ",round(low.b10, 3),", Upper:",round(up.b10, 3),")")

## (Lower:  0.055 , Upper: 0.185 )

# Test statistic: under Ho the two proportions are equal, so the standard
# error uses the pooled proportion
ppool.10 <- (x1.10 + x2.10)/(n1.10 + n2.10)
se.10 <- sqrt(ppool.10*(1-ppool.10)*(1/n1.10 + 1/n2.10))
z.10 <- mdiff.10/se.10
round(z.10, digits = 2)

## [1] 3.61

# Right-tailed p-value, because Ha is p1 > p2
p.10<-pnorm(z.10,
        lower.tail = FALSE)
signif(p.10, digits = 2)

## [1] 0.00015

# Decision: compare the p-value with alpha
myp(p.10, a.10)

## [1] "REJECT Ho"

DrewBaker-HW5_Inference

2024-04-21

Setting up our functions

Function to Reject or Not

Function for Shading Normal

Function for Shading T

Function for Shading Chi Square

8 Given two independent random samples with the following results:

Use this data to find the 95% confidence interval for the true difference between the population means. Assume that the population variances are not equal and that the two populations are normally distributed.