Homework 5, Data Analysis

Resources :

Skimming over Chapter 5, 6 and 7 in the Open Intro Statistics textbook may be helpful to see the standard error formulas for some of the questions. I will explicitly redirect you to the textbook for some questions.
I have 4 user defined functions below - you do not have to use them, but may find them useful to graphically draw out what is happening in the question.

Set Up (4 functions) to better answer the questions.

I. Function to Reject or Not

We write a function which takes in two arguments (numbers here), runs some computations (basic inequality) on them and prints an output based on the computation result -

# Decision rule for a hypothesis test.
#   p      p-value of the test
#   alpha  level of significance
# Prints the verdict and (invisibly) returns it: "REJECT Ho" when p is
# strictly below alpha, "FAIL 2 REJECT" otherwise.
myp <- function(p, alpha) {
  verdict <- if (p < alpha) "REJECT Ho" else "FAIL 2 REJECT"
  print(verdict)
}

Test our function to make sure it is performing as intended -

# p-value below alpha: the null hypothesis is rejected
myp(p = 0.01, alpha = 0.05)

## [1] "REJECT Ho"

# p-value above alpha: the null hypothesis is not rejected
myp(p = 0.10, alpha = 0.05)

## [1] "FAIL 2 REJECT"

Now, let's write a slightly more complex function (one that takes many arguments). By default it shades the rejection region of a 5% two-sided hypothesis test on the standard normal distribution, and it can be adapted for other purposes too. You can change the arguments mu, sig, pcts, color, …

II. Function for Shading Normal

# Draw a normal density curve and hatch selected regions beneath it.
#
# Arguments:
#   below, above  cut-offs of the lower / upper shaded tail; when `between`
#                 is NULL, a missing cut-off defaults to the pcts[1] /
#                 pcts[2] quantile of the normal(mu, sig) distribution
#   pcts          percentiles used for the default cut-offs
#   mu, sig       mean and standard deviation of the curve
#   numpts        number of grid points between mu - 4*sig and mu + 4*sig
#   color, dens   handed to polygon() as `col` and `density`
#   justabove     TRUE skips shading of the lower tail
#   justbelow     TRUE skips shading of the upper tail
#   lines         FALSE opens a new plot, TRUE adds the curve to the current one
#   between       when given, the area between min(between) and max(between)
#                 is shaded as well
#   outside       when given, overrides below/above with its min and max
shadenorm <- function(below = NULL, above = NULL, pcts = c(0.025, 0.975),
                      mu = 0, sig = 1, numpts = 500, color = "gray",
                      dens = 40, justabove = FALSE, justbelow = FALSE,
                      lines = FALSE, between = NULL, outside = NULL) {
  # Resolve the tail cut-offs
  if (is.null(between)) {
    below <- ifelse(is.null(below), qnorm(pcts[1], mu, sig), below)
    above <- ifelse(is.null(above), qnorm(pcts[2], mu, sig), above)
  }
  if (!is.null(outside)) {
    below <- min(outside)
    above <- max(outside)
  }

  # Density evaluated on a grid spanning four standard deviations each side
  xs <- seq(mu - 4 * sig, mu + 4 * sig, length.out = numpts)
  ys <- dnorm(xs, mean = mu, sd = sig)

  # Either start a fresh plot or overlay the curve on the existing one
  if (lines == FALSE) {
    plot(xs, ys, type = "l", xlab = "X", ylab = "Density")
  }
  if (lines == TRUE) {
    lines(xs, ys)
  }

  # Hatch the area under the curve over the grid points selected by `keep`
  shade <- function(keep) {
    xk <- xs[keep]
    yk <- ys[keep]
    polygon(c(xk, rev(xk)), c(rep(0, length(xk)), rev(yk)),
            col = color, density = dens)
  }

  if (justabove == FALSE) {
    shade(xs < below)
  }
  if (justbelow == FALSE) {
    shade(xs > above)
  }
  if (!is.null(between)) {
    shade(xs > min(between) & xs < max(between))
  }
}

# Try the function: standard normal curve, cut-offs at the 2.5% and 97.5% points
shadenorm(pcts = c(0.025, 0.975), mu = 0, sig = 1)

# Variation to try: a normal curve with mean 20 and standard deviation 6
# shadenorm(pcts = c(0.025, 0.975), mu = 20, sig = 6)

III. Function for Shading t

# Draw a Student t density curve and hatch selected regions beneath it.
#
# Arguments:
#   below, above  cut-offs of the lower / upper shaded tail; when `between`
#                 is NULL, a missing cut-off defaults to the pcts[1] /
#                 pcts[2] quantile of the t distribution with `df` degrees
#                 of freedom
#   pcts          percentiles used for the default cut-offs
#   df            degrees of freedom of the t distribution
#   numpts        number of grid points between -4 and 4
#   color, dens   handed to polygon() as `col` and `density`
#   justabove     TRUE skips shading of the lower tail
#   justbelow     TRUE skips shading of the upper tail
#   lines         FALSE opens a new plot, TRUE adds the curve to the current one
#   between       when given, the area between min(between) and max(between)
#                 is shaded as well
#   outside       when given, overrides below/above with its min and max
shadet <- function(below = NULL, above = NULL, pcts = c(0.025, 0.975),
                   df = 1, numpts = 500, color = "gray", dens = 40,
                   justabove = FALSE, justbelow = FALSE, lines = FALSE,
                   between = NULL, outside = NULL) {
  # Resolve the tail cut-offs
  if (is.null(between)) {
    below <- ifelse(is.null(below), qt(pcts[1], df), below)
    above <- ifelse(is.null(above), qt(pcts[2], df), above)
  }
  if (!is.null(outside)) {
    below <- min(outside)
    above <- max(outside)
  }

  # Density evaluated on a fixed grid from -4 to 4
  xs <- seq(-4, 4, length.out = numpts)
  ys <- dt(xs, df)

  # Either start a fresh plot or overlay the curve on the existing one
  if (lines == FALSE) {
    plot(xs, ys, type = "l", xlab = "X", ylab = "Density")
  }
  if (lines == TRUE) {
    lines(xs, ys)
  }

  # Hatch the area under the curve over the grid points selected by `keep`
  shade <- function(keep) {
    xk <- xs[keep]
    yk <- ys[keep]
    polygon(c(xk, rev(xk)), c(rep(0, length(xk)), rev(yk)),
            col = color, density = dens)
  }

  if (justabove == FALSE) {
    shade(xs < below)
  }
  if (justbelow == FALSE) {
    shade(xs > above)
  }
  if (!is.null(between)) {
    shade(xs > min(between) & xs < max(between))
  }
}

# Try the function: with only 4 degrees of freedom the shaded tails start
# further away from the mean 0
shadet(pcts = c(0.025, 0.975), df = 4)

# Variation to try: the t distribution converges to the normal when the
# degrees of freedom are high
# shadet(pcts = c(0.025, 0.975), df = 120)

IV. Function for Shading Chi Square

# Draw a chi-square density curve and hatch selected regions beneath it.
#
# Arguments:
#   below, above  cut-offs of the lower / upper shaded tail; when `between`
#                 is NULL, a missing cut-off defaults to the pcts[1] /
#                 pcts[2] quantile of the chi-square distribution with `df`
#                 degrees of freedom
#   pcts          percentiles used for the default cut-offs
#   df            degrees of freedom of the chi-square distribution
#   numpts        number of grid points between 0 and the 99th percentile
#   color, dens   handed to polygon() as `col` and `density`
#   justabove     TRUE skips shading of the lower tail
#   justbelow     TRUE skips shading of the upper tail
#   lines         FALSE opens a new plot, TRUE adds the curve to the current one
#   between       when given, the area between min(between) and max(between)
#                 is shaded as well
#   outside       when given, overrides below/above with its min and max
shadechi <- function(below = NULL, above = NULL, pcts = c(0.025, 0.975),
                     df = 1, numpts = 500, color = "gray", dens = 40,
                     justabove = FALSE, justbelow = FALSE, lines = FALSE,
                     between = NULL, outside = NULL) {
  # Resolve the tail cut-offs
  if (is.null(between)) {
    below <- ifelse(is.null(below), qchisq(pcts[1], df), below)
    above <- ifelse(is.null(above), qchisq(pcts[2], df), above)
  }
  if (!is.null(outside)) {
    below <- min(outside)
    above <- max(outside)
  }

  # Density evaluated from 0 up to the 99th percentile of the distribution
  xs <- seq(0, qchisq(.99, df), length.out = numpts)
  ys <- dchisq(xs, df)

  # Either start a fresh plot or overlay the curve on the existing one
  if (lines == FALSE) {
    plot(xs, ys, type = "l", xlab = "X", ylab = "Density")
  }
  if (lines == TRUE) {
    lines(xs, ys)
  }

  # Hatch the area under the curve over the grid points selected by `keep`
  shade <- function(keep) {
    xk <- xs[keep]
    yk <- ys[keep]
    polygon(c(xk, rev(xk)), c(rep(0, length(xk)), rev(yk)),
            col = color, density = dens)
  }

  if (justabove == FALSE) {
    shade(xs < below)
  }
  if (justbelow == FALSE) {
    shade(xs > above)
  }
  if (!is.null(between)) {
    shade(xs > min(between) & xs < max(between))
  }
}

# Try the function with a single percentile in pcts;
# change pcts and see what happens
shadechi(pcts = 0.05, df = 2)

# Same percentile with more degrees of freedom;
# change df and see what happens
shadechi(pcts = 0.05, df = 18)

Question 1 (In class Lecture notes)

Using traditional methods, it takes 109 hours to receive a basic driving license. A new license training method using Computer Aided Instruction (CAI) has been proposed. A researcher used the technique with 190 students and observed that they had a mean of 110 hours. Assume the standard deviation is known to be 6. A level of significance of 0.05 will be used to determine if the technique performs differently than the traditional method. Make a decision to reject or fail to reject the null hypothesis. Show all work in R.

Given: \(\mu=109, n=190, \bar{x}=110, \sigma=6, \alpha=.05\).

To Do: Determine if the technique performs differently than the traditional method. Burden of proof falls on alternative hypothesis -

## Question 1: two-sided z test for a mean with known standard deviation

## define variables
mu <- 109      ## hypothesised population mean (traditional method)
n <- 190       ## sample size
bar_x <- 110   ## sample mean
sigma <- 6     ## known standard deviation
alpha <- 0.05  ## level of significance
se <- sigma / sqrt(n)  ## standard error of the sample mean

## calculate the test statistic (a z score, because sigma is known)
t <- (bar_x - mu) / se
t

## [1] 2.297341

## calculate the two-sided p-value; abs() keeps the formula valid when the
## sample mean lies below mu (2 * (1 - pnorm(t)) would then exceed 1)
p <- 2 * pnorm(abs(t), mean = 0, sd = 1, lower.tail = FALSE)
round(p, 4)

## [1] 0.0216

## decision helper: rejects the null hypothesis when the p-value is below
## the level of significance, otherwise fails to reject it
myp <- function(p, alpha) {
  if (p < alpha) {
    print("REJECT Ho")
  } else {
    print("FAIL 2 REJECT")
  }
}

## print result
myp(p, alpha)

## [1] "REJECT Ho"

This example acts as a hypothesis test where: \(H_0: (\mu = 109)\) “Null Hypothesis states that the average time to obtain a license using CAI is equal to the average time to obtain a license using the traditional method” \(H_a: (\mu \neq 109)\) “Alternate hypothesis states the average time to obtain a license using CAI is not equal to the average time to obtain a license using the traditional method”

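From this the test statistic (a z score, since the standard deviation is known; it is stored as t in the code) and the p-value of the sample were calculated, giving a test statistic of 2.297 and a p-value of 0.0216. Because the p-value is lower than the level of significance alpha = 0.05, the result is statistically significant and the null hypothesis can be rejected: the data indicate that the CAI method performs differently from the traditional method.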

Question 2 (Lecture notes)

Our environment is very sensitive to the amount of ozone in the upper atmosphere. The level of ozone normally found is 5.3 parts/million (ppm). A researcher believes that the current ozone level is at an insufficient level. The mean of 5 samples is 5.0 parts per million (ppm) with a standard deviation of 1.1. Does the data support the claim at the 0.05 level? Assume the population distribution is approximately normal.

Given: \(\mu=5.3, n=5, \bar{x}=5, \sigma=1.1, \alpha=.05\).

To Do: Researcher believes that the current ozone level is at an insufficient level - does the data support the claim at the 0.05 level ?

## Question 2: lower-tailed one-sample t test

## inputs taken from the problem statement
mu <- 5.3      ## ozone level normally found (ppm)
n <- 5         ## number of samples
bar_x <- 5     ## mean of the samples
sigma <- 1.1   ## standard deviation reported for the samples
alpha <- 0.05  ## level of significance
df <- n - 1    ## degrees of freedom of the t distribution
se <- sigma / sqrt(n)  ## standard error of the sample mean

## test statistic: distance of the sample mean from mu in standard errors
t <- (bar_x - mu) / se
t

## [1] -0.6098367

## p-value: lower-tail probability, because the claim is "less than"
p <- pt(q = t, df = df, lower.tail = TRUE)
p

## [1] 0.2874568

## decision helper: compares the p-value with the level of significance
myp <- function(p, alpha) {
  if (p < alpha) {
    print('REJECT Ho')
  } else {
    print('FAIL 2 REJECT')
  }
}

## print result
myp(p, alpha)

## [1] "FAIL 2 REJECT"

This example acts as a hypothesis test where: \(H_0: \mu \geq 5.3ppm\) “Average ppm of ozone is greater than or equal to 5.3ppm” \(H_a: \mu < 5.3ppm\) “Average ppm of ozone is less than 5.3ppm”

This example represents a one-tailed test because we want to know whether the current ozone level is insufficient, i.e. whether the mean ozone level is lower than the normal value, so \(H_a: \mu < 5.3ppm\). The test statistic (t ≈ −0.61) and the degrees of freedom (n − 1 = 4) were calculated and passed to the built-in function pt to obtain the lower-tail p-value. The p-value is about 0.29, which is far greater than the level of significance of 0.05, so the result is not statistically significant and we fail to reject the null hypothesis: the data do not support the claim that the ozone level is insufficient.

Question 3 (Lecture notes)

Our environment is very sensitive to the amount of ozone in the upper atmosphere. The level of ozone normally found is 7.3 parts/million (ppm). A researcher believes that the current ozone level is not at a normal level. The mean of 51 samples is 7.1 ppm with a variance of 0.49. Assume the population is normally distributed. A level of significance of 0.01 will be used. Show all work and hypothesis testing steps.

Given: \(\mu=7.3, n=51, \bar{x}=7.1, \sigma^2=0.49, \alpha=.01\).

To Do: Researcher believes that the current ozone level is not at normal level. Thus, set a double sided hypothesis.

## Question 3: two-sided test for a mean, variance given

## define variables
mu <- 7.3           ## ozone level normally found (ppm)
n <- 51             ## number of samples
bar_x <- 7.1        ## mean of the samples
var <- 0.49         ## variance
sigma <- sqrt(var)  ## standard deviation
alpha <- 0.01       ## level of significance
se <- sigma / sqrt(n)  ## standard error of the sample mean

## calculate the test statistic
t <- (bar_x - mu) / se
t

## [1] -2.040408

## calculate the p-value; the alternative is "not equal", so both tails
## count: twice the tail area beyond |t| (a single tail would halve it)
p <- 2 * pnorm(abs(t), mean = 0, sd = 1, lower.tail = FALSE)
round(p, 4)

## [1] 0.0413

## decision helper: rejects the null hypothesis when the p-value is below
## the level of significance, otherwise fails to reject it
myp <- function(p, alpha) {
  if (p < alpha) {
    print("REJECT Ho")
  } else {
    print("FAIL 2 REJECT")
  }
}

## print result
myp(p, alpha)

## [1] "FAIL 2 REJECT"

This example acts as a hypothesis test where: \(H_0: \mu = 7.3ppm\) “Average ppm of ozone is equal to 7.3ppm” \(H_a: \mu \neq 7.3ppm\) “Average ppm of ozone does not equal 7.3ppm”

This example represents a two-tailed test because the alternative hypothesis is that the mean ozone level differs from 7.3 ppm in either direction, so departures on both sides count as evidence against the null hypothesis. The test statistic is about −2.04; pnorm gives a lower-tail probability of about 0.021, so the two-sided p-value is about 0.041. This is not statistically significant at our significance level of 0.01, so at the 99% confidence level the null hypothesis cannot be rejected. However, if the level of significance were 0.05, as in the previous questions, the null hypothesis would be rejected because 0.041 < 0.05. So at the 95% confidence level the null hypothesis is rejected, while at the 99% confidence level it is not.

Question 4 (See Open Stats Textbook - Chapter 5 Section 5.2: Confidence intervals for a proportion)

A publisher reports that 36% of their readers own a laptop. A marketing executive wants to test the claim that the percentage is actually less than the reported percentage. A random sample of 100 found that 29% of the readers owned a laptop. Is there sufficient evidence at the 0.02 level to support the executive’s claim? Show all work and hypothesis testing steps.

Given: \(\pi=.36, n=100, \hat{p}=.29,\alpha=.02\).

To Do: Executive wants to test the claim that the percentage is actually less than the reported percentage. Thus, set a single sided hypothesis.

## Question 4: lower-tailed z test for a single proportion

## define variables
n <- 100       ## sample size
pi_0 <- 0.36   ## reported (null) proportion; named pi_0 so that R's
               ## built-in constant `pi` is not overwritten
p_hat <- 0.29  ## sample proportion
alpha <- 0.02  ## level of significance
se <- sqrt(pi_0 * (1 - pi_0) / n)  ## standard error under the null hypothesis

## calculate the test statistic
t <- (p_hat - pi_0) / se
t

## [1] -1.458333

## calculate the p-value: lower-tail probability under the normal model,
## which is the reference distribution for a proportion (no degrees of freedom)
p <- pnorm(t, mean = 0, sd = 1, lower.tail = TRUE)
round(p, 4)

## [1] 0.0724

## decision helper: rejects the null hypothesis when the p-value is below
## the level of significance, otherwise fails to reject it
myp <- function(p, alpha) {
  if (p < alpha) {
    print("REJECT Ho")
  } else {
    print("FAIL 2 REJECT")
  }
}

## print result
myp(p, alpha)

## [1] "FAIL 2 REJECT"

This example acts as a hypothesis test where: \(H_0: \pi \geq 36\%\) “percentage of readers who own laptops is greater than or equal to 36%” \(H_a: \pi < 36\%\) “percentage of readers who own laptops is less than 36%”

This example represents a one-tailed test because we want to know whether the percentage of readers who own a laptop is less than 36%. The test statistic was calculated as \(\frac{.29 - .36}{\sqrt{\frac{.36(1-.36)}{100}}} \approx -1.46\), and its lower-tail probability gives a p-value of about 0.07. This is higher than the level of significance of 0.02, so the result is not statistically significant and the null hypothesis cannot be rejected: there is not sufficient evidence to support the executive's claim.

Question 5 (See Open Stats Textbook - Chapter 5)

A hospital director is told that 31% of the treated patients are uninsured. The director wants to test the claim that the percentage of uninsured patients is less than the expected percentage. A sample of 380 patients found that 95 were uninsured. Make the decision to reject or fail to reject the null hypothesis at the 0.05 level. Show all work and hypothesis testing steps.

Given: \(\pi=.31, n=380, \hat{p}=\dfrac{95}{380}=.25,\alpha=.05\).

To Do: The director wants to test the claim that the percentage of uninsured patients is less than the expected percentage. Thus, set a single sided hypothesis.

## Question 5: lower-tailed z test for a single proportion

## define variables
n <- 380         ## sample size
pi_0 <- 0.31     ## expected (null) proportion; named pi_0 so that R's
                 ## built-in constant `pi` is not overwritten
p_hat <- 95 / n  ## sample proportion of uninsured patients
alpha <- 0.05    ## level of significance
se <- sqrt(pi_0 * (1 - pi_0) / n)  ## standard error under the null hypothesis

## calculate the test statistic
t <- (p_hat - pi_0) / se
t

## [1] -2.528935

## calculate the p-value: lower-tail probability under the normal model,
## which is the reference distribution for a proportion (no degrees of freedom)
p <- pnorm(t, mean = 0, sd = 1, lower.tail = TRUE)
round(p, 4)

## [1] 0.0057

## decision helper: rejects the null hypothesis when the p-value is below
## the level of significance, otherwise fails to reject it
myp <- function(p, alpha) {
  if (p < alpha) {
    print("REJECT Ho")
  } else {
    print("FAIL 2 REJECT")
  }
}

## print result
myp(p, alpha)

## [1] "REJECT Ho"

This example acts as a hypothesis test where: \(H_0: \pi \geq 31\%\) “percentage of uninsured patients is greater than or equal to 31%” \(H_a: \pi < 31\%\) “percentage of uninsured patients is less than 31%”

This example represents a one-tailed test because we want to know whether the percentage of uninsured patients is less than 31%. The test statistic was calculated in the same way as in the previous question, and its lower-tail probability gives a p-value of about 0.006. This is lower than the level of significance of 0.05, so the result is statistically significant and the null hypothesis is rejected. Therefore, at the 0.05 level of significance, there is sufficient evidence that the percentage of uninsured patients is less than 31%.

Question 6 (See Open Stats Section 7.3, Example 7.25 in particular)

A medical researcher wants to compare the pulse rates of smokers and non-smokers. He believes that the pulse rate for smokers and non-smokers is different and wants to test this claim at the 0.1 level of significance. The researcher checks 32 smokers and finds that they have a mean pulse rate of 87, and 31 non-smokers have a mean pulse rate of 84. The standard deviation of the pulse rates is found to be 9 for smokers and 10 for non-smokers. Let \(\mu_1\) be the true mean pulse rate for smokers and \(\mu_2\) be the true mean pulse rate for non-smokers. Show all work and hypothesis testing steps.

Let smoker group be indexed by 1, non-smoker group by 2.
Given: \(n_1 = 32, \bar{x}_1 = 87, n_2 = 31, \bar{x}_2 = 84, \sigma_1 = 9, \sigma_2 = 10 , \alpha = .1\).

To Do: Test if the pulse rate for smokers and non-smokers is different at the 0.1 level of significance. Thus, double sided test.

## Question 6: two-sided t test for a difference of two means

## define variables
n_1 <- 32      ## number of smokers
xbar_1 <- 87   ## mean pulse rate of smokers
sigma_1 <- 9   ## standard deviation of smokers
n_2 <- 31      ## number of nonsmokers
xbar_2 <- 84   ## mean pulse rate of nonsmokers
sigma_2 <- 10  ## standard deviation of nonsmokers
alpha <- 0.1   ## level of significance
## degrees of freedom: the smaller sample size minus one, computed from both
## sample sizes so it stays right if they are changed
df <- min(n_1, n_2) - 1

xbar_diff <- xbar_1 - xbar_2  ## difference of smoker mean and nonsmoker mean
var_1 <- sigma_1^2            ## variance of smokers
var_2 <- sigma_2^2            ## variance of nonsmokers
se <- sqrt((var_1 / n_1) + (var_2 / n_2))  ## standard error of the difference

## calculate the test statistic
t <- xbar_diff / se
t

## [1] 1.25032

## calculate the two-sided p-value; abs() keeps it valid for a negative
## test statistic as well
p <- 2 * pt(abs(t), df, lower.tail = FALSE)
p

## [1] 0.220848

## decision helper: rejects the null hypothesis when the p-value is below
## the level of significance, otherwise fails to reject it
myp <- function(p, alpha) {
  if (p < alpha) {
    print("REJECT Ho")
  } else {
    print("FAIL 2 REJECT")
  }
}

## print result
myp(p, alpha)

## [1] "FAIL 2 REJECT"

This example acts as a hypothesis test where: \(H_0: \mu_1 = \mu_2\) “Average pulse rate of smokers equals average pulse rate of non-smokers” \(H_a: \mu_1 \neq \mu_2\) “Average pulse rate of smokers does not equal average pulse rate of non-smokers”

This example represents a two-tailed test because the alternative hypothesis is that the two mean pulse rates differ in either direction, so the test statistic may be positive or negative. Additionally, there were two separate samples with different values of xbar, n, and sigma. For this reason the standard error was calculated as \(\sqrt{\frac{var_1}{n_1} + \frac{var_2}{n_2}}\) and the point estimate as \(\bar{x}_1 - \bar{x}_2\). The resulting test statistic was used in the pt function in R to yield a p-value of about 0.22, which is not statistically significant at our significance level of 0.1. So at the 90% confidence level the null hypothesis could not be rejected.

Question 7 (See Open Stats Section 7.3, Example 7.22 in particular)

Given two independent random samples with the following results: \(n_1=11, \bar{x}_1=127, \sigma_1=33, n_2=18, \bar{x}_2=157, \sigma_2=27\)

Use this data to find the 95% confidence interval for the true difference between the population means. Assume that the population variances are not equal and that the two populations are normally distributed.

To Do: Create a 95% confidence interval for true difference between the population means.

## Question 7: 95% confidence interval for a difference of means,
## unequal variances

## summary statistics of the two samples
n_1 <- 11       ## size of sample 1
xbar_1 <- 127   ## mean of sample 1
sigma_1 <- 33   ## standard deviation of sample 1
n_2 <- 18       ## size of sample 2
xbar_2 <- 157   ## mean of sample 2
sigma_2 <- 27   ## standard deviation of sample 2
alpha <- 0.05   ## 1 - confidence level
var_1 <- sigma_1^2  ## variance of sample 1
var_2 <- sigma_2^2  ## variance of sample 2

## degrees of freedom for unequal variances: squared sum of the per-sample
## variance terms over the sum of their squares scaled by (n - 1)
df <- (var_1 / n_1 + var_2 / n_2)^2 /
  ((var_1 / n_1)^2 / (n_1 - 1) + (var_2 / n_2)^2 / (n_2 - 1))

df

## [1] 18.0759

## point estimate and its standard error
xbar_diff <- xbar_1 - xbar_2
se <- sqrt(var_1 / n_1 + var_2 / n_2)

## ratio of the point estimate to its standard error
t_stat <- xbar_diff / se
t_stat

## [1] -2.540003

## critical t value leaving alpha / 2 in the upper tail
t_c <- qt(p = alpha / 2, df = df, lower.tail = FALSE)
t_c

## [1] 2.10029

## interval = point estimate -/+ margin of error
margin_error <- t_c * se

lower <- xbar_diff - margin_error
upper <- xbar_diff + margin_error

round(lower, 4)

## [1] -54.8065

round(upper, 4)

## [1] -5.1935

Question 8 (See Open Stats Section 6.2 Difference of two proportions, Example 6.2.2 in particular)

Two men, A and B, who usually commute to work together decide to conduct an experiment to see whether one route is faster than the other. The men feel that their driving habits are approximately the same, so each morning for two weeks one driver is assigned to route I and the other to route II. The times, recorded to the nearest minute, are shown in the following table. Using this data, find the 98% confidence interval for the true mean difference between the average travel time for route I and the average travel time for route II.

r1 <- c(32, 27, 34, 24, 31, 25, 30, 23, 27, 35)
r2 <- c(28, 28, 33, 25, 26, 29, 33, 27, 25, 33)

Let \(d1 =\) (route I travel time) − (route II travel time).

Assume that the populations of travel times are normally distributed for both routes. Show all work and hypothesis testing steps.

To Do: Find the 98% confidence interval for the true mean difference between the average travel time for route I and the average travel time for route II.

## Question 8: 98% confidence interval for a mean difference (paired data)

## create data set for each route
r1 <- c(32, 27, 34, 24, 31, 25, 30, 23, 27, 35) ## route I
r2 <- c(28, 28, 33, 25, 26, 29, 33, 27, 25, 33) ## route II

## Both routes are timed on the same mornings, so the observations are
## paired, not independent: the interval is built from the per-morning
## differences d = (route I) - (route II), as defined in the question.
stopifnot(length(r1) == length(r2))
d <- r1 - r2

## define variables
n <- length(d)         ## number of paired mornings
xbar_diff <- mean(d)   ## mean of the differences
sd_diff <- sd(d)       ## standard deviation of the differences
alpha <- 1 - 0.98      ## 1 - confidence level
df <- n - 1            ## degrees of freedom
se <- sd_diff / sqrt(n)  ## standard error of the mean difference

## ratio of the mean difference to its standard error
t_stat <- xbar_diff / se
round(t_stat, 4)

## [1] 0.0984

## critical t value leaving alpha / 2 in the upper tail
t_c <- qt(alpha / 2, df, lower.tail = FALSE)
t_c

## [1] 2.821438

margin_error <- t_c * se

lower <- xbar_diff - margin_error
upper <- xbar_diff + margin_error

round(lower, 4)

## [1] -2.7665

round(upper, 4)

## [1] 2.9665

Question 9 (See Open Stats Textbook - Chapter 5 Section 5.2-5.33: Confidence intervals/Hypothesis testing for a proportion)

The U.S. Census Bureau conducts annual surveys to obtain information on the percentage of the voting-age population that is registered to vote. Suppose that 391 employed persons and 510 unemployed persons are independently and randomly selected, and that 195 of the employed persons and 193 of the unemployed persons have registered to vote. Can we conclude that the percentage of employed workers (p1) who have registered to vote, exceeds the percentage of unemployed workers (p2) who have registered to vote? Use a significance level of 0.05 for the test. Show all work and hypothesis testing steps.

Q: Can we conclude that the percentage of employed workers (p1) who have registered to vote, exceeds the percentage of unemployed workers (p2) who have registered to vote?

## Question 9: upper-tailed z test for a difference of two proportions

## define variables
n_1 <- 391 ## number of employed persons
x_1 <- 195 ## number of employed persons registered to vote
n_2 <- 510 ## number of unemployed persons
x_2 <- 193 ## number of unemployed persons registered to vote

alpha <- 0.05 ## level of significance

p_1 <- x_1 / n_1 ## proportion of employed persons registered to vote
p_2 <- x_2 / n_2 ## proportion of unemployed persons registered to vote

perc_diff <- p_1 - p_2 ## difference between the two sample proportions

## standard error of the difference (unpooled)
## NOTE(review): for H0: p_1 = p_2 the OpenIntro text estimates the standard
## error from the pooled proportion; confirm which form the course expects
se <- sqrt(p_1 * (1 - p_1) / n_1 + p_2 * (1 - p_2) / n_2)
se

## [1] 0.03317529

## calculate the test statistic
t_stat <- perc_diff / se
t_stat

## [1] 3.625887

## p-value: upper-tail probability under the normal model; no degrees of
## freedom are involved, so the result does not depend on any `df` variable
## left in the workspace
p <- pnorm(t_stat, mean = 0, sd = 1, lower.tail = FALSE)
signif(p, 2)

## [1] 0.00014

## decision helper: rejects the null hypothesis when the p-value is below
## the level of significance, otherwise fails to reject it
myp <- function(p, alpha) {
  if (p < alpha) {
    print("REJECT Ho")
  } else {
    print("FAIL 2 REJECT")
  }
}

## print result
myp(p, alpha)

## [1] "REJECT Ho"

This example acts as a hypothesis test where: \(H_0: p_1 \leq p_2\) “percentage of employed persons registered to vote is less than or equal to the percentage of unemployed persons registered to vote” \(H_a: p_1 > p_2\) “percentage of employed persons registered to vote is greater than the percentage of unemployed persons registered to vote”

This example represents a one-tailed test because we want to know whether the percentage of employed persons registered to vote is greater than the percentage of unemployed persons who are registered to vote. The test statistic was calculated as the difference in sample percentages divided by the standard error (about 3.63), and its upper-tail probability gives a p-value well below 0.01. This is lower than the level of significance of 0.05, so the result is statistically significant and the null hypothesis can be rejected. It is concluded that the percentage of employed persons registered to vote is greater than the percentage of unemployed persons registered to vote.

Week-5

Justin Nevins

2024-02-18