Homework 5, Data Analysis

Resources :

Skimming over Chapter 5, 6 and 7 in the Open Intro Statistics textbook may be helpful to see the standard error formulas for some of the questions. I will explicitly redirect you to the textbook for some questions.
I have 4 user defined functions below - you do not have to use them, but may find them useful to graphically draw out what is happening in the question.

Set Up (4 functions) to better answer the questions.

I. Function to Reject or Not

We write a function which takes in two arguments (numbers here), runs some computations (basic inequality) on them and prints an output based on the computation result -

# Report the decision of a hypothesis test.
#   p     : p-value of the test
#   alpha : significance level
# Prints "REJECT Ho" when p < alpha and "FAIL 2 REJECT" otherwise; the printed
# string is also handed back invisibly, because that is what print() returns.
myp <- function(p, alpha) {
  verdict <- if (p < alpha) "REJECT Ho" else "FAIL 2 REJECT"
  print(verdict)
}

Test our function to make sure it is performing as intended -

myp(p = 0.01, alpha = 0.05) # p-value below alpha, so Ho is rejected

## [1] "REJECT Ho"

myp(p = 0.1, alpha = 0.05) # p-value above alpha, so Ho is not rejected

## [1] "FAIL 2 REJECT"

Now, let's write a more complex function (one that takes many arguments) designed to shade regions under the standard normal distribution. By default it shades the rejection regions of a 5% two-sided hypothesis test, and it can be adapted for other purposes too. You can change the arguments mu, sig, pcts, color, …

II. Function for Shading Normal

# Draw a normal density curve and hatch selected regions under it.
#
#   below, above : cut-offs of the lower / upper shaded tails. When `between`
#                  is NULL, a missing cut-off is taken from qnorm() at
#                  pcts[1] / pcts[2].
#   pcts         : tail probabilities used for the default cut-offs.
#   mu, sig      : mean and standard deviation of the normal curve.
#   numpts       : number of grid points on the x axis.
#   color, dens  : handed to polygon() as `col` and `density`.
#   justabove    : when FALSE the lower tail is shaded.
#   justbelow    : when FALSE the upper tail is shaded.
#   lines        : FALSE starts a new plot; TRUE adds the curve with lines().
#   between      : if given, the area between min(between) and max(between)
#                  is shaded as well.
#   outside      : if given, replaces below/above by min(outside)/max(outside).
shadenorm <- function(below = NULL, above = NULL, pcts = c(0.025, 0.975),
                      mu = 0, sig = 1, numpts = 500, color = "gray",
                      dens = 40, justabove = FALSE, justbelow = FALSE,
                      lines = FALSE, between = NULL, outside = NULL) {

  # Default cut-offs are only filled in when no `between` region is requested.
  if (is.null(between)) {
    below <- ifelse(is.null(below), qnorm(pcts[1], mu, sig), below)
    above <- ifelse(is.null(above), qnorm(pcts[2], mu, sig), above)
  }
  if (!is.null(outside)) {
    below <- min(outside)
    above <- max(outside)
  }

  # The curve is evaluated on mu +/- 4 standard deviations.
  grid_x <- seq(mu - 4 * sig, mu + 4 * sig, length.out = numpts)
  grid_y <- dnorm(grid_x, mean = mu, sd = sig)

  if (lines == FALSE) {
    plot(grid_x, grid_y, type = "l", xlab = "X", ylab = "Density")
  } else if (lines == TRUE) {
    lines(grid_x, grid_y)
  }

  # Hatch the area under the curve over the grid points picked by `pick`:
  # along the x axis first, then back along the density curve.
  hatch <- function(pick) {
    xs <- grid_x[pick]
    ys <- grid_y[pick]
    polygon(c(xs, rev(xs)), c(rep(0, length(xs)), rev(ys)),
            col = color, density = dens)
  }

  if (justabove == FALSE) {
    hatch(grid_x < below)
  }
  if (justbelow == FALSE) {
    hatch(grid_x > above)
  }

  if (!is.null(between)) {
    lo <- min(between)
    hi <- max(between)
    hatch(grid_x > lo & grid_x < hi)
  }
}

# Try the function: standard normal curve with 2.5% shaded in each tail
shadenorm(pcts = c(0.025, 0.975), mu = 0, sig = 1)

# shadenorm(pcts = c(0.025, 0.975), mu = 20, sig = 6)

III. Function for Shading t

# Draw a Student t density curve and hatch selected regions under it.
#
#   below, above : cut-offs of the lower / upper shaded tails. When `between`
#                  is NULL, a missing cut-off is taken from qt() at
#                  pcts[1] / pcts[2].
#   pcts         : tail probabilities used for the default cut-offs. If only
#                  one value is given the upper cut-off is NA and the upper
#                  tail is simply not shaded.
#   df           : degrees of freedom.
#   numpts       : number of grid points on the x axis.
#   color, dens  : handed to polygon() as `col` and `density`.
#   justabove    : when FALSE the lower tail is shaded.
#   justbelow    : when FALSE the upper tail is shaded.
#   lines        : FALSE starts a new plot; TRUE adds the curve with lines().
#   between      : if given, the area between min(between) and max(between)
#                  is shaded as well.
#   outside      : if given, replaces below/above by min(outside)/max(outside).
#
# Called for its plotting side effect; returns NULL invisibly.
shadet <- function(below = NULL, above = NULL, pcts = c(0.025, 0.975), df = 1,
                   numpts = 500, color = "gray", dens = 40,
                   justabove = FALSE, justbelow = FALSE, lines = FALSE,
                   between = NULL, outside = NULL) {

  if (is.null(between)) {
    if (is.null(below)) below <- qt(pcts[1], df)
    if (is.null(above)) above <- qt(pcts[2], df)
  }
  if (!is.null(outside)) {
    below <- min(outside)
    above <- max(outside)
  }

  # The curve used to be drawn on a fixed [-4, 4] grid. With few degrees of
  # freedom the cut-offs lie outside that window (qt(0.975, 1) is about 12.7),
  # so nothing was shaded. Widen the window symmetrically until every finite
  # cut-off is inside it, with 25% head-room so the shaded tail is visible.
  # When all cut-offs are within +/-4 the window is unchanged.
  cuts <- c(below, above, between)
  cuts <- abs(cuts[is.finite(cuts)])
  reach <- max(4, 1.25 * cuts)
  x.grid <- seq(-reach, reach, length.out = numpts)
  dens.all <- dt(x.grid, df)

  if (lines) {
    lines(x.grid, dens.all)
  } else {
    plot(x.grid, dens.all, type = "l", xlab = "X", ylab = "Density")
  }

  # Hatch the area under the curve over the grid points flagged in `pick`.
  # which() drops NA flags (NA cut-off) and an empty selection is skipped,
  # so polygon() is only ever called with real coordinates.
  shade <- function(pick) {
    idx <- which(pick & is.finite(dens.all))
    if (length(idx) == 0) {
      return(invisible(NULL))
    }
    xs <- x.grid[idx]
    polygon(c(xs, rev(xs)), c(rep(0, length(xs)), rev(dens.all[idx])),
            col = color, density = dens)
  }

  if (!justabove) shade(x.grid < below)
  if (!justbelow) shade(x.grid > above)
  if (!is.null(between)) {
    shade(x.grid > min(between) & x.grid < max(between))
  }

  invisible(NULL)
}

# Try the function
shadet(pcts = c(0.025, 0.975), df = 4)     # with few df the shaded tails start further away from the mean 0

# shadet(pcts = c(0.025, 0.975), df = 120)   # the t distribution converges to the normal for high degrees of freedom

IV. Function for Shading Chi Square

# Draw a chi-square density curve and hatch selected regions under it.
#
#   below, above : cut-offs of the lower / upper shaded tails. When `between`
#                  is NULL, a missing cut-off is taken from qchisq() at
#                  pcts[1] / pcts[2].
#   pcts         : tail probabilities used for the default cut-offs. If only
#                  one value is given the upper cut-off is NA and the upper
#                  tail is simply not shaded.
#   df           : degrees of freedom.
#   numpts       : number of grid points on the x axis.
#   color, dens  : handed to polygon() as `col` and `density`.
#   justabove    : when FALSE the lower tail is shaded.
#   justbelow    : when FALSE the upper tail is shaded.
#   lines        : FALSE starts a new plot; TRUE adds the curve with lines().
#   between      : if given, the area between min(between) and max(between)
#                  is shaded as well.
#   outside      : if given, replaces below/above by min(outside)/max(outside).
#
# Called for its plotting side effect; returns NULL invisibly.
shadechi <- function(below = NULL, above = NULL, pcts = c(0.025, 0.975), df = 1,
                     numpts = 500, color = "gray", dens = 40,
                     justabove = FALSE, justbelow = FALSE, lines = FALSE,
                     between = NULL, outside = NULL) {

  if (is.null(between)) {
    if (is.null(below)) below <- qchisq(pcts[1], df)
    if (is.null(above)) above <- qchisq(pcts[2], df)
  }
  if (!is.null(outside)) {
    below <- min(outside)
    above <- max(outside)
  }

  # The x axis normally ends at the 99th percentile. A cut-off beyond that
  # (for example an observed test statistic passed as `above`) used to fall
  # off the plot, so nothing was shaded; extend the axis to 1.25 times the
  # largest finite cut-off whenever that is further out.
  cuts <- c(below, above, between)
  cuts <- cuts[is.finite(cuts)]
  uplim <- max(qchisq(0.99, df), 1.25 * cuts)
  x.grid <- seq(0, uplim, length.out = numpts)
  dens.all <- dchisq(x.grid, df)

  if (lines) {
    lines(x.grid, dens.all)
  } else {
    plot(x.grid, dens.all, type = "l", xlab = "X", ylab = "Density")
  }

  # Hatch the area under the curve over the grid points flagged in `pick`.
  # dchisq(0, df) is Inf when df < 2, so non-finite heights are left out of
  # the outline; which() drops NA flags (NA cut-off) and an empty selection
  # is skipped, so polygon() is only ever called with finite coordinates.
  shade <- function(pick) {
    idx <- which(pick & is.finite(dens.all))
    if (length(idx) == 0) {
      return(invisible(NULL))
    }
    xs <- x.grid[idx]
    polygon(c(xs, rev(xs)), c(rep(0, length(xs)), rev(dens.all[idx])),
            col = color, density = dens)
  }

  if (!justabove) shade(x.grid < below)
  if (!justbelow) shade(x.grid > above)
  if (!is.null(between)) {
    shade(x.grid > min(between) & x.grid < max(between))
  }

  invisible(NULL)
}

# Try the function
shadechi(pcts = 0.05, df = 2)   # change pcts and see what happens

shadechi(pcts = 0.05, df = 18)  # change df and see what happens

Question 1 (In class Lecture notes)

Using traditional methods, it takes 109 hours to receive a basic driving license. A new license training method using Computer Aided Instruction (CAI) has been proposed. A researcher used the technique with 190 students and observed that they had a mean of 110 hours. Assume the standard deviation is known to be 6. A level of significance of 0.05 will be used to determine if the technique performs differently than the traditional method. Make a decision to reject or fail to reject the null hypothesis. Show all work in R.

Given: \(\mu=109, n=190, \bar{x}=110, \sigma=6, \alpha=.05\).

To Do: Determine if the technique performs differently than the traditional method. Burden of proof falls on alternative hypothesis.

# Question 1: two-sided one-sample z-test (population sd is known)
mu0 <- 109     # mean under Ho: hours needed with the traditional method
n <- 190       # number of students trained with CAI
xbar <- 110    # observed sample mean (hours)
sigma <- 6     # known population standard deviation
alpha <- 0.05  # significance level

# Hypotheses
# Ho: mu = 109
# Ha: mu != 109 (Two-sided test)

# Standard error of the sample mean
SE <- sigma / sqrt(n)

# Z-statistic
Z <- (xbar - mu0) / SE

# Two-sided p-value. The tail area is read directly with pnorm(-abs(Z))
# rather than as 1 - pnorm(abs(Z)): the subtraction loses precision for
# large |Z| and eventually rounds the p-value to exactly 0.
p_value <- 2 * pnorm(-abs(Z))

# results
cat("Z-statistic:", Z, "\n")

## Z-statistic: 2.297341

cat("P-value:", p_value, "\n")

## P-value: 0.0215993

# Conclusion: reject Ho when the p-value is below alpha
if (p_value < alpha) {
  cat("Reject Ho: There is sufficient evidence to suggest the technique performs differently.\n")
} else {
  cat("Fail to reject Ho: There is insufficient evidence to suggest the technique performs differently.\n")
}

## Reject Ho: There is sufficient evidence to suggest the technique performs differently.

Here the p-value (0.0216) is below the significance level (0.05), so we reject the null hypothesis: there is sufficient evidence that the CAI technique performs differently from the traditional method.

Question 2 (Lecture notes)

Our environment is very sensitive to the amount of ozone in the upper atmosphere. The level of ozone normally found is 5.3 parts/million (ppm). A researcher believes that the current ozone level is at an insufficient level. The mean of 5 samples is 5.0 parts per million (ppm) with a standard deviation of 1.1. Does the data support the claim at the 0.05 level? Assume the population distribution is approximately normal.

Given: \(\mu=5.3, n=5, \bar{x}=5, \sigma=1.1, \alpha=.05\).

To Do: Researcher believes that the current ozone level is at an insufficient level - does the data support the claim at the 0.05 level?

# Question 2: lower-tailed one-sample t-test (sd estimated from the sample)
mu0   <- 5.3   # normal ozone level under Ho (ppm)
n     <- 5     # number of samples
xbar  <- 5.0   # sample mean (ppm)
s     <- 1.1   # sample standard deviation
alpha <- 0.05  # significance level

# Hypotheses
# Ho: mu = 5.3
# Ha: mu < 5.3 (Insufficient level)

# Degrees of freedom and standard error of the sample mean
df <- n - 1
SE <- s / sqrt(n)

# t-statistic
t_stat <- (xbar - mu0) / SE

# Lower-tail p-value: area to the left of the observed t
p_value <- pt(t_stat, df = df)

# Report the statistic and the p-value
cat("T-statistic:", t_stat, "\n")

## T-statistic: -0.6098367

cat("P-value:", p_value, "\n")

## P-value: 0.2874568

# Decision: Ho is rejected only when the p-value is below alpha
cat(
  if (p_value < alpha) {
    "Reject Ho: Data supports the claim that the ozone level is insufficient.\n"
  } else {
    "Fail to reject Ho: Data does not support the claim that the ozone level is insufficient.\n"
  }
)

## Fail to reject Ho: Data does not support the claim that the ozone level is insufficient.

This time the p-value (0.2875) is above the significance level (0.05), so the data do not provide sufficient evidence to reject the null hypothesis.

Question 3 (Lecture notes)

Our environment is very sensitive to the amount of ozone in the upper atmosphere. The level of ozone normally found is 7.3 parts/million (ppm). A researcher believes that the current ozone level is not at a normal level. The mean of 51 samples is 7.1 ppm with a variance of 0.49. Assume the population is normally distributed. A level of significance of 0.01 will be used. Show all work and hypothesis testing steps.

Given: \(\mu=7.3, n=51, \bar{x}=7.1, \sigma^2=0.49, \alpha=.01\).

To Do: Researcher believes that the current ozone level is not at normal level. Thus, set a double sided hypothesis.

# Question 3: two-sided one-sample t-test at the 1% level
mu0      <- 7.3             # normal ozone level under Ho (ppm)
n        <- 51              # number of samples
xbar     <- 7.1             # sample mean (ppm)
variance <- 0.49            # sample variance
s        <- sqrt(variance)  # the standard deviation is the root of the variance
alpha    <- 0.01            # significance level

# Hypotheses
# Ho: mu = 7.3
# Ha: mu != 7.3 (Not at normal level)

# Degrees of freedom and standard error of the sample mean
df <- n - 1
SE <- s / sqrt(n)

# t-statistic
t_stat <- (xbar - mu0) / SE

# Two-sided p-value: twice the area beyond |t| in one tail
p_value <- 2 * pt(-abs(t_stat), df = df)

# Report the intermediate results
cat("Standard Deviation:", s, "\n")

## Standard Deviation: 0.7

cat("T-statistic:", t_stat, "\n")

## T-statistic: -2.040408

cat("P-value:", p_value, "\n")

## P-value: 0.04660827

# Decision: Ho is rejected only when the p-value is below alpha
cat(
  if (p_value < alpha) {
    "Reject Ho: There is evidence the ozone level is not at a normal level.\n"
  } else {
    "Fail to reject Ho: There is insufficient evidence to say the ozone level is not at a normal level.\n"
  }
)

## Fail to reject Ho: There is insufficient evidence to say the ozone level is not at a normal level.

With the much smaller significance level of 0.01, the p-value (0.0466) is not below alpha, so we fail to reject the null hypothesis: there is insufficient evidence that the ozone level differs from the normal level.

Question 4 (See Open Stats Textbook - Chapter 5 Section 5.2: Confidence intervals for a proportion)

A publisher reports that 36% of their readers own a laptop. A marketing executive wants to test the claim that the percentage is actually less than the reported percentage. A random sample of 100 found that 29% of the readers owned a laptop. Is there sufficient evidence at the 0.02 level to support the executive’s claim? Show all work and hypothesis testing steps.

Given: \(\pi=.36, n=100, \hat{p}=.29,\alpha=.02\).

To Do: Executive wants to test the claim that the percentage is actually less than the reported percentage. Thus, set a single sided hypothesis.

# Question 4: lower-tailed one-sample z-test for a proportion
p0    <- 0.36  # proportion reported by the publisher (value under Ho)
n     <- 100   # sample size
phat  <- 0.29  # sample proportion
alpha <- 0.02  # significance level

# Hypotheses
# Ho: p = 0.36
# Ha: p < 0.36

# Standard error, computed with the null proportion p0
SE <- sqrt((p0 * (1 - p0)) / n)

# Z-statistic
Z <- (phat - p0) / SE

# Lower-tail p-value: area to the left of the observed Z
p_value <- pnorm(Z)

# Report the statistic and the p-value
cat("Z-statistic:", Z, "\n")

## Z-statistic: -1.458333

cat("P-value:", p_value, "\n")

## P-value: 0.07237434

# Decision: Ho is rejected only when the p-value is below alpha
cat(
  if (p_value < alpha) {
    "Reject Ho: There is sufficient evidence to support the executive's claim.\n"
  } else {
    "Fail to reject Ho: There is insufficient evidence to support the executive's claim.\n"
  }
)

## Fail to reject Ho: There is insufficient evidence to support the executive's claim.

Our p-value (0.0724) is above the significance level (0.02), so we fail to reject the null hypothesis: there is insufficient evidence to support the executive's claim.

Question 5 (See Open Stats Textbook - Chapter 5)

A hospital director is told that 31% of the treated patients are uninsured. The director wants to test the claim that the percentage of uninsured patients is less than the expected percentage. A sample of 380 patients found that 95 were uninsured. Make the decision to reject or fail to reject the null hypothesis at the 0.05 level. Show all work and hypothesis testing steps.

Given: \(\pi=.31, n=380, \hat{p}=\dfrac{95}{380}=.25,\alpha=.05\).

# Question 5: lower-tailed one-sample z-test for a proportion
p0    <- 0.31      # proportion of uninsured patients under Ho
n     <- 380       # number of patients sampled
phat  <- 95 / 380  # sample proportion of uninsured patients
alpha <- 0.05      # significance level

# Hypotheses
# Ho: p = 0.31
# Ha: p < 0.31

# Standard error, computed with the null proportion p0
SE <- sqrt((p0 * (1 - p0)) / n)

# Z-statistic
Z <- (phat - p0) / SE

# Lower-tail p-value: area to the left of the observed Z
p_value <- pnorm(Z)

# Report the intermediate results
cat("Sample Proportion (phat):", phat, "\n")

## Sample Proportion (phat): 0.25

cat("Z-statistic:", Z, "\n")

## Z-statistic: -2.528935

cat("P-value:", p_value, "\n")

## P-value: 0.005720462

# Decision: Ho is rejected only when the p-value is below alpha
cat(
  if (p_value < alpha) {
    "Reject Ho: The percentage of uninsured patients is significantly less than expected.\n"
  } else {
    "Fail to reject Ho: The percentage is not significantly less than expected.\n"
  }
)

## Reject Ho: The percentage of uninsured patients is significantly less than expected.

The p-value falls below the significance level which provides sufficient evidence to reject the null hypothesis.

Question 6 (See Open Stats Section 7.3, Example 7.25 in particular)

A medical researcher wants to compare the pulse rates of smokers and non-smokers. He believes that the pulse rate for smokers and non-smokers is different and wants to test this claim at the 0.1 level of significance. The researcher checks 32 smokers and finds that they have a mean pulse rate of 87, and 31 non-smokers have a mean pulse rate of 84. The standard deviation of the pulse rates is found to be 9 for smokers and 10 for non-smokers. Let \(\mu_1\) be the true mean pulse rate for smokers and \(\mu_2\) be the true mean pulse rate for non-smokers. Show all work and hypothesis testing steps.

Let smoker group be indexed by 1, non-smoker group by 2.
Given: \(n_1 = 32, \bar{x}_1 = 87, n_2 = 31, \bar{x}_2 = 84, s_1 = 9, s_2 = 10 , \alpha = 10\%\).

To Do: Test if the pulse rate for smokers and non-smokers is different at the 0.1 level of significance. Thus, double sided test.

# Question 6: two-sided two-sample t-test with unequal variances (Welch)

# Smokers (group 1); mu1 holds the observed sample mean
n1  <- 32
mu1 <- 87
s1  <- 9

# Non-smokers (group 2); mu2 holds the observed sample mean
n2  <- 31
mu2 <- 84
s2  <- 10

alpha <- 0.10

# Hypotheses
# Ho: mu1 - mu2 = 0
# Ha: mu1 - mu2 != 0

# Standard error of the difference between the two sample means
SE <- sqrt((s1^2 / n1) + (s2^2 / n2))

# t-statistic
t_stat <- (mu1 - mu2) / SE

# Degrees of freedom from the Welch-Satterthwaite formula
# (not the conservative min(n1 - 1, n2 - 1) rule)
num   <- ((s1^2 / n1) + (s2^2 / n2))^2
denom <- ((s1^2 / n1)^2 / (n1 - 1)) + ((s2^2 / n2)^2 / (n2 - 1))
df    <- num / denom

# Two-sided p-value: twice the area beyond |t| in one tail
p_value <- 2 * pt(-abs(t_stat), df = df)

# Report the intermediate results
cat("T-statistic:", t_stat, "\n")

## T-statistic: 1.25032

cat("Degrees of Freedom:", df, "\n")

## Degrees of Freedom: 59.87528

cat("P-value:", p_value, "\n")

## P-value: 0.2160473

# Decision: Ho is rejected only when the p-value is below alpha
cat(
  if (p_value < alpha) {
    "Reject Ho: There is a significant difference in pulse rates.\n"
  } else {
    "Fail to reject Ho: There is no significant difference in pulse rates.\n"
  }
)

## Fail to reject Ho: There is no significant difference in pulse rates.

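Our p-value (0.216) is above the significance level (0.10), so we fail to reject the null hypothesis: there is no significant difference in mean pulse rate between smokers and non-smokers.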

Question 7 (See Open Stats Section 7.3, Example 7.22 in particular)

Given two independent random samples with the following results: \(n_1=11, \bar{x}_1=127, \sigma_1=33, n_2=18, \bar{x}_2=157, \sigma_2=27\)

Use this data to find the 95% confidence interval for the true difference between the population means. Assume that the population variances are not equal and that the two populations are normally distributed.

To Do: Create a 95% confidence interval for true difference between the population means.

# Question 7: confidence interval for a difference of two means
# (independent samples, unequal variances)

# Sample 1
n1    <- 11
xbar1 <- 127
s1    <- 33

# Sample 2
n2    <- 18
xbar2 <- 157
s2    <- 27

confidence_level <- 0.95

# Point estimate of mu1 - mu2
point_est <- xbar1 - xbar2

# Standard error of the difference between the two sample means
SE <- sqrt((s1^2 / n1) + (s2^2 / n2))

# Degrees of freedom from the Welch-Satterthwaite formula
num   <- ((s1^2 / n1) + (s2^2 / n2))^2
denom <- ((s1^2 / n1)^2 / (n1 - 1)) + ((s2^2 / n2)^2 / (n2 - 1))
df    <- num / denom

# Critical t-value leaving (1 - confidence_level) / 2 in each tail
t_crit <- qt(1 - (1 - confidence_level) / 2, df = df)

# Interval: point estimate plus / minus the margin of error
ci_lower <- point_est - t_crit * SE
ci_upper <- point_est + t_crit * SE

cat("95% Confidence Interval: [", ci_lower, ",", ci_upper, "]\n")

## 95% Confidence Interval: [ -54.80655 , -5.193452 ]

Question 8 (See Open Stats Section 6.2 Difference of two proportions, Example 6.2.2 in particular)

Two men, A and B, who usually commute to work together decide to conduct an experiment to see whether one route is faster than the other. The men feel that their driving habits are approximately the same, so each morning for two weeks one driver is assigned to route I and the other to route II. The times, recorded to the nearest minute, are shown in the following table. Using this data, find the 98% confidence interval for the true mean difference between the average travel time for route I and the average travel time for route II.

# Travel times in minutes, one value per morning
r1 <- c(32, 27, 34, 24, 31, 25, 30, 23, 27, 35)  # route I
r2 <- c(28, 28, 33, 25, 26, 29, 33, 27, 25, 33)  # route II

Let \(d1 =\) (route I travel time) − (route II travel time).

Assume that the populations of travel times are normally distributed for both routes. Show all work and hypothesis testing steps.

To Do: Find the 98% confidence interval for the true mean difference between the average travel time for route I and the average travel time for route II.

# Question 8: confidence interval for the mean of paired differences

# Travel times in minutes, paired by morning
r1 <- c(32, 27, 34, 24, 31, 25, 30, 23, 27, 35)  # route I
r2 <- c(28, 28, 33, 25, 26, 29, 33, 27, 25, 33)  # route II

# Per-morning differences (route I minus route II) and their summary
d      <- r1 - r2
n      <- length(d)
mean_d <- mean(d)
sd_d   <- sd(d)

confidence_level <- 0.98

# Standard error of the mean difference
SE <- sd_d / sqrt(n)

# Critical t-value leaving (1 - confidence_level) / 2 in each tail
df     <- n - 1
t_crit <- qt(1 - (1 - confidence_level) / 2, df = df)

# Interval: mean difference plus / minus the margin of error
ci_lower <- mean_d - t_crit * SE
ci_upper <- mean_d + t_crit * SE

cat("Mean Difference:", mean_d, "\n")

## Mean Difference: 0.1

cat("98% Confidence Interval: [", ci_lower, ",", ci_upper, "]\n")

## 98% Confidence Interval: [ -2.766534 , 2.966534 ]

Question 9 (See Open Stats Textbook - Chapter 5 Section 5.2-5.33: Confidence intervals/Hypothesis testing for a proportion)

The U.S. Census Bureau conducts annual surveys to obtain information on the percentage of the voting-age population that is registered to vote. Suppose that 391 employed persons and 510 unemployed persons are independently and randomly selected, and that 195 of the employed persons and 193 of the unemployed persons have registered to vote. Can we conclude that the percentage of employed workers (p1) who have registered to vote exceeds the percentage of unemployed workers (p2) who have registered to vote? Use a significance level of 0.05 for the test. Show all work and hypothesis testing steps.

Q: Can we conclude that the percentage of employed workers (p1) who have registered to vote, exceeds the percentage of unemployed workers (p2) who have registered to vote?

# Question 9: upper-tailed two-sample z-test for a difference of proportions
n1 <- 391 # Employed total
x1 <- 195 # Employed registered

n2 <- 510 # Unemployed total
x2 <- 193 # Unemployed registered

alpha <- 0.05

# Sample proportions
p1_hat <- x1 / n1
p2_hat <- x2 / n2

# Hypotheses
# Ho: p1 = p2
# Ha: p1 > p2 (Exceeds)

# Pooled proportion: under Ho both groups share one registration rate
p_pool <- (x1 + x2) / (n1 + n2)

# Standard error of p1_hat - p2_hat under Ho
SE <- sqrt(p_pool * (1 - p_pool) * (1 / n1 + 1 / n2))

# Z-statistic
Z <- (p1_hat - p2_hat) / SE

# Upper-tail p-value. lower.tail = FALSE reads the tail area directly rather
# than as 1 - pnorm(Z): the subtraction loses precision for large Z and
# eventually rounds the p-value to exactly 0.
p_value <- pnorm(Z, lower.tail = FALSE)


cat("p1_hat:", p1_hat, " p2_hat:", p2_hat, "\n")

## p1_hat: 0.4987212  p2_hat: 0.3784314

cat("Z-statistic:", Z, "\n")

## Z-statistic: 3.614018

cat("P-value:", p_value, "\n")

## P-value: 0.000150744

# Conclusion: reject Ho when the p-value is below alpha
if (p_value < alpha) {
  cat("Reject Ho: We can conclude the percentage of employed workers registered to vote exceeds unemployed workers.\n")
} else {
  cat("Fail to reject Ho: We cannot conclude the percentage of employed workers exceeds unemployed workers.\n")
}

## Reject Ho: We can conclude the percentage of employed workers registered to vote exceeds unemployed workers.

Week-5

Doru Cojoc

2026-02-18