Homework 5, Data Analysis

Resources :

  1. Please have a look at the review material re-emphasizing the core concepts of the CLT, standard error, hypothesis testing, confidence intervals and p-values. Please skim through it (~15 minutes) before attempting the assignment to refresh your memory.

  2. Please find the Open Intro Statistics textbook (OpenStat_textbook.pdf) in our Dropbox folder - skimming over Chapter 5, 6 and 7 may be helpful to see the standard error formulas for some of the questions. I will explicitly redirect you to the textbook for some questions.

  3. I have 4 user defined functions below - you do not have to use them, but may find them useful to graphically draw out what is happening in the question.

# Clear the workspace so the assignment starts from an empty session.
# NOTE(review): rm(list = ls()) removes every object in the calling
# environment, so run this only in a session dedicated to this homework.
  rm(list = ls()) # Remove all objects returned by ls() from the environment
  gc()            # Run garbage collection; the table below is its printed memory report
##           used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells  540909 28.9    1210744 64.7         NA   669420 35.8
## Vcells 1005399  7.7    8388608 64.0      16384  1851760 14.2
  cat("\f")       # Emit a form-feed character (presumably to clear the console in RStudio)

Set Up (4 functions) to better answer the questions.

I. Function to Reject or Not

We write a function which takes in two arguments (numbers here), runs some computations (basic inequality) on them and prints an output based on the computation result -

# Print (and invisibly return) the test decision for a p value.
#   p      p value of the test
#   alpha  significance level
# The null is rejected only when p is strictly smaller than alpha.
myp <- function(p, alpha) {
  decision <- if (p < alpha) "REJECT Ho" else "FAIL 2 REJECT"
  print(decision)
}

Test our function to make sure it is performing as intended -

myp(p = .01, alpha = .05) # p is below alpha, so the null is rejected
## [1] "REJECT Ho"
myp(p = .1, alpha = .05)  # p is above alpha, so the null is not rejected
## [1] "FAIL 2 REJECT"

Now, let's write a slightly more complex function (it takes many arguments) that by default shades the rejection regions of a 5% two-sided hypothesis test on the standard normal distribution and can be adapted for other purposes too. You can change the arguments mu, sig, pcts, color, …

II. Function for Shading Normal

# Draw a normal density curve and hatch selected regions under it.
#
# Arguments:
#   below, above  Cut-offs of the lower / upper shaded tail. When NULL (and
#                 `between` is not supplied) they default to the normal
#                 quantiles at pcts[1] and pcts[2].
#   pcts          Probabilities used for the default cut-offs.
#   mu, sig       Mean and standard deviation of the plotted distribution.
#   numpts        Number of grid points; the grid spans mu +/- 4 * sig.
#   color, dens   Passed to polygon() as `col` and `density` (hatching).
#   justabove     TRUE suppresses the lower tail.
#   justbelow     TRUE suppresses the upper tail.
#   lines         TRUE adds the curve to the current plot instead of opening
#                 a new one.
#   between       Optional range; the area strictly inside it is shaded.
#   outside       Optional range; overrides `below`/`above` with its min/max.
#
# A cut-off that is missing or NA (for example the upper cut-off when `pcts`
# has a single element) simply leaves that tail unshaded.
# Called for its plotting side effect; returns NULL invisibly.
shadenorm <- function(below = NULL, above = NULL, pcts = c(0.025, 0.975),
                      mu = 0, sig = 1, numpts = 500, color = "gray",
                      dens = 40, justabove = FALSE, justbelow = FALSE,
                      lines = FALSE, between = NULL, outside = NULL) {
  # Resolve the tail cut-offs (plain if/else instead of scalar ifelse()).
  if (is.null(between)) {
    below <- if (is.null(below)) qnorm(pcts[1], mu, sig) else below[1]
    above <- if (is.null(above)) qnorm(pcts[2], mu, sig) else above[1]
  }
  if (!is.null(outside)) {
    below <- min(outside)
    above <- max(outside)
  }

  # Evaluate the density on a grid covering four standard deviations each side.
  x.grid <- seq(mu - 4 * sig, mu + 4 * sig, length.out = numpts)
  dens.all <- dnorm(x.grid, mean = mu, sd = sig)

  # `lines` the argument is logical; R still resolves the call lines(...) to
  # the graphics function because non-function bindings are skipped for calls.
  if (lines) {
    lines(x.grid, dens.all)
  } else {
    plot(x.grid, dens.all, type = "l", xlab = "X", ylab = "Density")
  }

  # Hatch the area under the curve over the grid points flagged by `keep`.
  shade <- function(keep) {
    xs <- x.grid[keep]
    if (length(xs) == 0) {
      return(invisible(NULL))
    }
    polygon(c(xs, rev(xs)), c(rep(0, length(xs)), rev(dens.all[keep])),
            col = color, density = dens)
  }

  if (!justabove && !is.null(below) && !anyNA(below)) {
    shade(x.grid < below)
  }
  if (!justbelow && !is.null(above) && !anyNA(above)) {
    shade(x.grid > above)
  }
  if (!is.null(between)) {
    shade(x.grid > min(between) & x.grid < max(between))
  }
  invisible(NULL)
}

# TEST THE FUNCTION: standard normal with both 2.5% tails shaded
shadenorm(pcts = c(0.025, 0.975), mu = 0, sig = 1)

# shadenorm(pcts = c(0.025, 0.975), mu = 20, sig = 6)

III. Function for Shading t

# Draw a Student t density curve and hatch selected regions under it.
#
# Arguments:
#   below, above  Cut-offs of the lower / upper shaded tail. When NULL (and
#                 `between` is not supplied) they default to the t quantiles
#                 at pcts[1] and pcts[2].
#   pcts          Probabilities used for the default cut-offs.
#   df            Degrees of freedom of the t distribution.
#   numpts        Number of grid points; the grid spans the fixed range -4..4.
#   color, dens   Passed to polygon() as `col` and `density` (hatching).
#   justabove     TRUE suppresses the lower tail.
#   justbelow     TRUE suppresses the upper tail.
#   lines         TRUE adds the curve to the current plot instead of opening
#                 a new one.
#   between       Optional range; the area strictly inside it is shaded.
#   outside       Optional range; overrides `below`/`above` with its min/max.
#
# A cut-off that is missing or NA (for example the upper cut-off when `pcts`
# has a single element) simply leaves that tail unshaded.
# Called for its plotting side effect; returns NULL invisibly.
shadet <- function(below = NULL, above = NULL, pcts = c(0.025, 0.975),
                   df = 1, numpts = 500, color = "gray", dens = 40,
                   justabove = FALSE, justbelow = FALSE, lines = FALSE,
                   between = NULL, outside = NULL) {
  # Resolve the tail cut-offs (plain if/else instead of scalar ifelse()).
  if (is.null(between)) {
    below <- if (is.null(below)) qt(pcts[1], df) else below[1]
    above <- if (is.null(above)) qt(pcts[2], df) else above[1]
  }
  if (!is.null(outside)) {
    below <- min(outside)
    above <- max(outside)
  }

  # Evaluate the density on a fixed grid from -4 to 4.
  x.grid <- seq(-4, 4, length.out = numpts)
  dens.all <- dt(x.grid, df)

  # `lines` the argument is logical; R still resolves the call lines(...) to
  # the graphics function because non-function bindings are skipped for calls.
  if (lines) {
    lines(x.grid, dens.all)
  } else {
    plot(x.grid, dens.all, type = "l", xlab = "X", ylab = "Density")
  }

  # Hatch the area under the curve over the grid points flagged by `keep`.
  shade <- function(keep) {
    xs <- x.grid[keep]
    if (length(xs) == 0) {
      return(invisible(NULL))
    }
    polygon(c(xs, rev(xs)), c(rep(0, length(xs)), rev(dens.all[keep])),
            col = color, density = dens)
  }

  if (!justabove && !is.null(below) && !anyNA(below)) {
    shade(x.grid < below)
  }
  if (!justbelow && !is.null(above) && !anyNA(above)) {
    shade(x.grid > above)
  }
  if (!is.null(between)) {
    shade(x.grid > min(between) & x.grid < max(between))
  }
  invisible(NULL)
}

# TEST THE FUNCTION
shadet(pcts = c(0.025, 0.975), df = 4)     # with few degrees of freedom the shaded tails start further from the mean 0

# shadet(pcts = c(0.025, 0.975), df = 120)   # the t distribution converges to the normal for high degrees of freedom

IV. Function for Shading Chi Square

# Draw a chi-square density curve and hatch selected regions under it.
#
# Arguments:
#   below, above  Cut-offs of the lower / upper shaded tail. When NULL (and
#                 `between` is not supplied) they default to the chi-square
#                 quantiles at pcts[1] and pcts[2].
#   pcts          Probabilities used for the default cut-offs.
#   df            Degrees of freedom of the chi-square distribution.
#   numpts        Number of grid points; the grid runs from 0 to the 99th
#                 percentile of the distribution.
#   color, dens   Passed to polygon() as `col` and `density` (hatching).
#   justabove     TRUE suppresses the lower tail.
#   justbelow     TRUE suppresses the upper tail.
#   lines         TRUE adds the curve to the current plot instead of opening
#                 a new one.
#   between       Optional range; the area strictly inside it is shaded.
#   outside       Optional range; overrides `below`/`above` with its min/max.
#
# A cut-off that is missing or NA (for example the upper cut-off when `pcts`
# has a single element, as in shadechi(df = 2, pcts = .05)) simply leaves that
# tail unshaded.
# Called for its plotting side effect; returns NULL invisibly.
shadechi <- function(below = NULL, above = NULL, pcts = c(0.025, 0.975),
                     df = 1, numpts = 500, color = "gray", dens = 40,
                     justabove = FALSE, justbelow = FALSE, lines = FALSE,
                     between = NULL, outside = NULL) {
  # Resolve the tail cut-offs (plain if/else instead of scalar ifelse()).
  if (is.null(between)) {
    below <- if (is.null(below)) qchisq(pcts[1], df) else below[1]
    above <- if (is.null(above)) qchisq(pcts[2], df) else above[1]
  }
  if (!is.null(outside)) {
    below <- min(outside)
    above <- max(outside)
  }

  # Evaluate the density from 0 up to the 99th percentile.
  x.grid <- seq(0, qchisq(.99, df), length.out = numpts)
  dens.all <- dchisq(x.grid, df)

  # `lines` the argument is logical; R still resolves the call lines(...) to
  # the graphics function because non-function bindings are skipped for calls.
  if (lines) {
    lines(x.grid, dens.all)
  } else {
    plot(x.grid, dens.all, type = "l", xlab = "X", ylab = "Density")
  }

  # Hatch the area under the curve over the grid points flagged by `keep`.
  shade <- function(keep) {
    xs <- x.grid[keep]
    if (length(xs) == 0) {
      return(invisible(NULL))
    }
    polygon(c(xs, rev(xs)), c(rep(0, length(xs)), rev(dens.all[keep])),
            col = color, density = dens)
  }

  if (!justabove && !is.null(below) && !anyNA(below)) {
    shade(x.grid < below)
  }
  if (!justbelow && !is.null(above) && !anyNA(above)) {
    shade(x.grid > above)
  }
  if (!is.null(between)) {
    shade(x.grid > min(between) & x.grid < max(between))
  }
  invisible(NULL)
}

# TEST THE FUNCTION
shadechi(pcts = .05, df = 2)   # change pcts and see what happens

shadechi(pcts = .05, df = 18)  # change df and see what happens

Question 1 (In class Lecture notes)

Using traditional methods, it takes 109 hours to receive a basic driving license. A new license training method using Computer Aided Instruction (CAI) has been proposed. A researcher used the technique with 190 students and observed that they had a mean of 110 hours. Assume the standard deviation is known to be 6. A level of significance of 0.05 will be used to determine if the technique performs differently than the traditional method. Make a decision to reject or fail to reject the null hypothesis. Show all work in R.

Given: $\mu_0 = 109, n = 190, \bar{x} = 110, \sigma = 6, \alpha = 0.05$.

To Do: Determine if the technique performs differently than the traditional method. Burden of proof falls on alternative hypothesis -

i. Null and Alternative Hypothesis

Ho: $\mu = 109$,

Ha: $\mu \neq 109$

Ho: The CAI technique does not perform differently than traditional methods.

Ha: The CAI technique does perform differently than traditional methods.

Two sided test (look at alternative hypothesis).

ii. Choose level of significance

$\alpha = .05$

iii. Test Statistic

Distribution: Z (known SD)

iv. Decision Rule

Test statistic vs. critical value - if the test statistic is larger than the critical value in absolute terms/more extreme, reject the null

$\alpha$ (significance level) vs. p. value - If the p value is smaller than alpha, reject the null

Confidence intervals - Does the confidence interval from the sample point estimate contain the hypothesized values? If not, reject the null

v. Take sample and decide

se1 <- 6 / sqrt(190)                              # standard error of the mean: sigma / sqrt(n)
Z <- (110 - 109) / se1                            # compute test statistic
Z                                                 # the sample mean 110 lies about 2.297 standard errors to the right of the hypothesised mean 109
## [1] 2.297341
# p value of the two sided test: probability, under Ho, of an outcome at least
# as extreme as the sample. abs() keeps the formula valid whichever side of
# the hypothesised mean the sample falls on.
p_value <- 2 * pnorm(q = abs(Z), mean = 0, sd = 1, lower.tail = FALSE)

p_value
## [1] 0.0215993
alpha <- .05                          # given to us in question


# visually, this is happening

shadenorm(mu = 109, sig = se1, pcts = c(0.025, 0.975), color = "red")   # shades the rejection regions of the 5% two sided test
lines(x = rep(110, 10), y = seq(0, 1, length.out = 10), col = "blue")   # mark point estimate from sample

# Method 1 - compare the p value with alpha using the myp function
myp(p = p_value, alpha = alpha)
## [1] "REJECT Ho"
# The other methods below should give us the same result.


# Method 2 - compare the test statistic with the critical value on the standard normal scale
# (we expect the statistic to be more extreme than the critical value, since method 1 rejects)
test_value <- Z
critical_value <- qnorm(p = .975, mean = 0, sd = 1)
test_value
## [1] 2.297341
critical_value
## [1] 1.959964

# Method 3 - 95% confidence interval around the sample mean; reject Ho when it excludes 109
ci1 <- 110 + c(-1, 1) * critical_value * se1
ci1                                   # roughly (109.15, 110.85), which excludes 109

Since the p-value is less than alpha, we reject the null hypothesis - we’re unlikely to see the sample mean we saw if the null is true. Also, the Z-score (test statistic) is greater than the critical value, so we reject the null. Last, we can see in our picture of the curve that the mean falls beyond the critical values, so we reject the null.

Question 2 (Lecture notes)

Our environment is very sensitive to the amount of ozone in the upper atmosphere. The level of ozone normally found is 5.3 parts/million (ppm). A researcher believes that the current ozone level is at an insufficient level. The mean of 5 samples is 5.0 parts per million (ppm) with a standard deviation of 1.1. Does the data support the claim at the 0.05 level? Assume the population distribution is approximately normal.

Given: $\mu_0 = 5.3, n = 5, \bar{x} = 5.0, s = 1.1, \alpha = 0.05$

To Do: Researcher believes that the current ozone level is at an insufficient level - does the data support the claim at the 0.05 level ?

i. Null and Alternative Hypothesis

Ho: $\mu = 5.3$, Ha: $\mu \neq 5.3$

Ho: Current ozone layer is at a sufficient level at 5.3ppm

Ha: Current ozone layer is not 5.3ppm, thus at an insufficient level.

Not assuming that more or less ozone is a good thing, we just know that it needs to be 5.3ppm. So, we are looking at whether the mean is significantly less or higher than 5.3ppm.

Two sided test

$\alpha = .05$

iii. Test Statistic

Distribution: Normal, given in problem, becomes standard normal, so use Z score as test statistic

iv. Decision Rule

Test statistic vs. critical value - if the test statistic is larger than the critical value in absolute terms/more extreme, reject the null

$\alpha$ (significance level) vs. p. value - If the p value is smaller than alpha, reject the null

Confidence intervals - Does the confidence interval from the sample point estimate contain the hypothesized values? If not, reject the null

v. Take sample and decide

se2 <- 1.1 / sqrt(5)                            # standard error of the mean
Z2 <- (5 - 5.3) / se2                           # compute test statistic
Z2                                              # the sample mean 5 lies about .61 standard errors to the left of 5.3
## [1] -0.6098367
# p value of the two sided test. The statistic is negative here, so
# 2 * (1 - pnorm(Z2)) would exceed 1; taking both tails beyond |Z2| gives a
# valid probability whatever the sign.
p_value2 <- 2 * pnorm(q = -abs(Z2), mean = 0, sd = 1)

p_value2
## [1] 0.54197
alpha2 <- .05                          # given to us in question

# NOTE(review): with n = 5 and a sample standard deviation, a t distribution
# with 4 degrees of freedom is the more appropriate reference distribution;
# it yields an even larger p value, so the decision below is unchanged.

# visually, this is happening

shadenorm(mu = 5.3, sig = se2, pcts = c(0.025, 0.975), color = "red")   # shades the rejection regions of the 5% two sided test
lines(x = rep(5, 10), y = seq(0, 1, length.out = 10), col = "blue")     # mark point estimate from sample

# Method 1 - apply the myp function to determine if p < alpha
myp(p = p_value2, alpha = alpha2)
## [1] "FAIL 2 REJECT"
# The other methods below should give us the same result.


# Method 2 - compare the test statistic with the critical value on the standard normal scale
# (we expect |test statistic| to be smaller than the critical value, since method 1 fails to reject)
test_value2 <- Z2
critical_value2 <- qnorm(p = .975, mean = 0, sd = 1)
test_value2
## [1] -0.6098367
critical_value2
## [1] 1.959964

We fail to reject the null: the p-value is not less than alpha, the absolute value of the test statistic is not greater than the critical value, and the sample mean does not fall beyond the gates marked by the critical values.

Question 3 (Lecture notes)

Our environment is very sensitive to the amount of ozone in the upper atmosphere. The level of ozone normally found is 7.3 parts/million (ppm). A researcher believes that the current ozone level is not at a normal level. The mean of 51 samples is 7.1 ppm with a variance of 0.49. Assume the population is normally distributed. A level of significance of 0.01 will be used. Show all work and hypothesis testing steps.

Given: $\mu_0 = 7.3, n = 51, \bar{x} = 7.1, s^2 = 0.49\ (s = 0.7), \alpha = 0.01$, df = n-1 = 50

To Do: Researcher believes that the current ozone level is not at normal level. Thus, set a double sided hypothesis.

i. Null and Alternative Hypothesis

Ho: $\mu = 7.3$, Ha: $\mu \neq 7.3$

Ho: The current ozone level is at a normal level.

Ha: The current ozone level is not at a normal level.

ii. Choose level of significance

$\alpha = .01$

iii. Test Statistic

Distribution: t (unknown SD)

iv. Decision Rule

Test statistic vs. critical value - if the test statistic is larger than the critical value in absolute terms/more extreme, reject the null

$\alpha$ (significance level) vs. p. value - If the p value is smaller than alpha, reject the null

Confidence intervals - Does the confidence interval from the sample point estimate contain the hypothesized values? If not, reject the null

v. Take sample and decide

# define the sample mean, sample size and standard error, then compute the t statistic
# ?t.test   # interactive help lookup only; not needed to run the analysis
sample.mean <- 7.1
sample.n <- 51
sample.se <- sqrt(.49) / sqrt(sample.n)   # s / sqrt(n), with s = sqrt(sample variance)
t.stat3 <- (sample.mean - 7.3) / sample.se
t.stat3
## [1] -2.040408
# then we calculate the two sided p value from the t distribution with n - 1 degrees of freedom
degrees.freedom3 <- sample.n - 1
p_value3 <- 2 * pt(q = abs(t.stat3), df = degrees.freedom3, lower.tail = FALSE)
p_value3
## [1] 0.04660827
# define alpha
alpha3 <- .01
myp(p = p_value3, alpha = alpha3)
## [1] "FAIL 2 REJECT"
# compare the t statistic with the critical value; the critical value must come
# from the same t distribution as the test (not from the standard normal)

t.stat3
## [1] -2.040408
critical_value3 <- qt(p = 1 - alpha3 / 2, df = degrees.freedom3)
critical_value3
## [1] 2.677793

We fail to reject the null hypothesis that the current ozone layer is at a normal level.

Question 4 (See Open Stats Textbook - Chapter 5 Section 5.2: Confidence intervals for a proportion)

A publisher reports that 36% of their readers own a laptop. A marketing executive wants to test the claim that the percentage is actually less than the reported percentage. A random sample of 100 found that 29% of the readers owned a laptop. Is there sufficient evidence at the 0.02 level to support the executive’s claim? Show all work and hypothesis testing steps.

Given: $\pi_0 = .36, n = 100, \hat{p} = .29, \alpha = .02$

To Do: Executive wants to test the claim that the percentage is actually less than the reported percentage. Thus, set a single sided hypothesis.

i. Null and Alternative Hypothesis

Ho: $\pi \geq .36$, Ha: $\pi < .36$

Ho: Mean proportion of readers that own a laptop is greater than or equal to .36

Ha: Mean proportion of readers that own a laptop is less than .36

ii. Choose level of significance

$\alpha = .02$

iii. Test Statistic

Distribution: Z (proportion)

iv. Decision Rule

# One sided (lower tail) Z test for a proportion at alpha = .02.
# Under Ho the standard error uses the hypothesised proportion .36.
p0_4 <- .36                                # proportion claimed by the publisher
phat4 <- .29                               # sample proportion
n4 <- 100                                  # sample size
se4 <- sqrt(p0_4 * (1 - p0_4) / n4)        # 0.048

z4 <- (phat4 - p0_4) / se4                 # test statistic
z4                                         # about -1.458
critical_value4 <- qnorm(.02)              # lower tail critical value, about -2.054
critical_value4
p_value4 <- pnorm(z4)                      # lower tail p value, about 0.072
p_value4

# we also calculate a confidence interval around the sample proportion

# z multiplier for a 98% two sided interval (1% in each tail)
z4_mutiplier <- 2.326

# use formula p-hat +/- z multiplier * standard error
confidence_interval_lower <- phat4 - z4_mutiplier * se4
confidence_interval_lower
## [1] 0.178352
confidence_interval_higher <- phat4 + z4_mutiplier * se4
confidence_interval_higher
## [1] 0.401648

v. Take sample and decide

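Because the proportion of readers proposed in H0 (36%) lies within our 98% confidence interval from 17.8% to 40.2%, we cannot say the null value is implausible. Equivalently, the test statistic $z = (0.29 - 0.36)/0.048 \approx -1.46$ is not below the one-sided critical value $z_{0.02} \approx -2.05$. Thus we fail to reject the null.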

Question 5 (See Open Stats Textbook - Chapter 5)

A hospital director is told that 31% of the treated patients are uninsured. The director wants to test the claim that the percentage of uninsured patients is less than the expected percentage. A sample of 380 patients found that 95 were uninsured. Make the decision to reject or fail to reject the null hypothesis at the 0.05 level. Show all work and hypothesis testing steps.

Given: $\pi_0 = .31, n = 380, \hat{p} = 95/380 = .25, \alpha = .05$

To Do: Hospital director believes uninsured patients are less than .31, thus a one sided test, also a lower tailed test.

i. Null and Alternative Hypothesis

Ho: $\pi = .31$, Ha: $\pi < .31$

Ho: The proportion of uninsured patients is .31

Ha: The proportion of uninsured patients is less than .31

ii. Choose level of significance

$\alpha = 0.05$

iii. Test Statistic

Distribution: Z (proportion)

iv. Decision Rule

We need to find under what circumstances to reject the null. Where it’s a lower-tailed test, we reject the null if the Z test statistic is smaller than the critical value. The critical value is dictated by the level of significance, .05. With this case, our critical value is -1.645. Thus, we reject the null if our Z score < or = -1.645.

v. Take sample and decide

# use the z formula for proportions: z = (p-hat - p0) / sqrt(p0 * (1 - p0) / n)
# The numerator must be parenthesised; otherwise only .31 is divided by the standard error.

z5 <- (.25 - .31) / sqrt(.31 * (1 - .31) / 380)
z5
## [1] -2.528935
p_value5 <- pnorm(z5)   # lower tail p value, about 0.006

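Yes: the test statistic $z = (0.25 - 0.31)/\sqrt{0.31 \cdot 0.69/380} \approx -2.53$ is less than our critical value of -1.645 (p-value about 0.006), so we reject the null hypothesis that 31% of patients are uninsured in favor of the alternative that the proportion is lower.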

Question 6. Find the minimum sample size needed to be 99% confident that the sample’s variance is within 1% of the population’s variance.

This problem asks for the minimum sample size needed so that, with 99% confidence, the sample variance lies within 1% of the population variance. We know that the sample variance is the best estimate of the population's variance, and that the width of a confidence interval shrinks as the sample size grows, so a tight 1% tolerance at 99% confidence requires a large sample. Because the tolerance is stated relative to the population variance, the unknown variance cancels out: for a normal population the sample variance has a relative standard error of approximately $\sqrt{2/(n-1)}$, so we need $z_{\alpha/2}\sqrt{2/(n-1)} \leq 0.01$. Illustrative values such as a margin of error of 0.5 and a sample variance of 1.5 are therefore not required to answer the question.

# The margin is relative (s^2 within 1% of sigma^2), so sigma^2 cancels out.
# For a normal population and large n, s^2 is approximately normal with
# standard error sigma^2 * sqrt(2 / (n - 1)). We therefore need
#   z_(alpha/2) * sqrt(2 / (n - 1)) <= 0.01,  i.e.  n >= 1 + 2 * (z / 0.01)^2

z6 <- qnorm(1 - .01 / 2)      # z value for 99% confidence, about 2.576
rel_margin6 <- .01            # allowed relative error of the sample variance

n6 <- ceiling(1 + 2 * (z6 / rel_margin6)^2)
n6
## [1] 132699
# Our minimum sample size is roughly 133,000 under this normal approximation.
# A margin this tight needs a very large sample, far more than the usual n > 30 rule of thumb.

Question 7 (See W5 Dropbox “Lecture 21” file for similar question. OMIT submission for this week as not covered this class.)

A standardized test is given to a sixth-grade class. Historically the mean score has been 112 with a standard deviation of 24. The superintendent believes that the standard deviation of performance may have recently decreased. She randomly sampled 22 students and found a mean of 102 with a standard deviation of 15.4387. Is there evidence that the standard deviation has decreased at the $\alpha = 0.1$ level? Show all work and hypothesis testing steps.

Given: $n = 22, s = 15.4387, \sigma_0 = 24, \alpha = .1$

To Do: Is there evidence that the standard deviation has decreased?

i. Null and Alternative Hypothesis

Ho: $\sigma \geq 24$, Ha: $\sigma < 24$

Ho: There has been no decrease in the standard deviation of 6th grade standardize test performance.

Ha: The standard deviation of 6th grade standardize test performance has decreased.

ii. Choose level of significance

$\alpha = .1$

iii. Test Statistic

Distribution: Chi Squared (Random variable is the probability distribution of the sum of the squared errors, and it occurs naturally by squaring a normal distribution)

iv. Decision Rule

If p-value is less than alpha, we reject the null.

# Interval estimate for sigma based on the chi-square distribution with n - 1 = 21 degrees of freedom

# (the hypothesis test itself is one-sided)

# The chi-square distribution is right-skewed, so the two limits use different quantiles:
# each limit is sqrt((n - 1) * s^2 / chi-square quantile). The code uses the 10% and 90% quantiles.

ss7 <- 21 * 15.4387^2          # (n - 1) * s^2
sqrt((ss7 / qchisq(.1, 21)))
## [1] 19.44388
sqrt((ss7 / qchisq(.90, 21)))
## [1] 13.00061

v. Take sample and decide

# Ho: sigma >= 24, Ha: sigma < 24   (lower tailed test)


shadechi(df = 21, pcts = c(.1))  # chi-square density with n - 1 = 21 degrees of freedom; the lower 10% rejection region is shaded

# Test statistic (n - 1) * s^2 / sigma0^2 follows a chi-square(21) distribution under Ho.
chisq_stat7 <- 21 * 15.4387^2 / 24^2          # about 8.69
chisq_stat7
critical_value7 <- qchisq(.1, 21)             # about 13.24; reject Ho when the statistic falls below it
critical_value7

# Lower tail p value from the chi-square distribution. The historical mean
# (112) plays no role in a test about the standard deviation.
p_value7 <- pchisq(q = chisq_stat7, df = 21)
p_value7                                      # about 0.009, which is below alpha = .1

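The test statistic $\chi^2 = (n-1)s^2/\sigma_0^2 = 21 \cdot 15.4387^2 / 24^2 \approx 8.69$ falls below the lower-tail critical value $\chi^2_{0.10,\,21} \approx 13.24$, and the corresponding p-value (about 0.009) is smaller than alpha = 0.1, so we reject the null hypothesis: there is evidence that the standard deviation has decreased. The interval computed above, from 13.000 to 19.444, is a range of plausible values for the population standard deviation; the hypothesized value 24 lies above it, which agrees with rejecting the null. (The sample value 15.4387 necessarily lies inside an interval built around it, so that fact alone says nothing about the null.)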

For 8 and 9, I use the Satterthwaite approximation (see footnote 1).

The Satterthwaite formula for 2 sample t-test degrees of freedom is $\dfrac{(\dfrac{s_1^2}{n_1}+\dfrac{s_2^2}{n_2})^2}{\dfrac{1}{n_1-1}(\dfrac{s_1^2}{n_1})^2+\dfrac{1}{n_2-1}(\dfrac{s_2^2}{n_2})^2}$, where $s_1$ and $n_1$ is the standard deviation and sample size of group 1, and $s_2$ and $n_2$ is the standard deviation and sample size of group 2.

The official Satterthwaite formula for the degrees of freedom is quite complex and is generally computed using software, so instead you may use the smaller of n1 − 1 and n2 − 1 for the degrees of freedom if software isn’t readily available.

Thus, you may for 8 and 9 select the smaller of the two degrees of freedom corresponding to each group.

See 7.3.3.R - the R scripts for Case Study 7.3.3 in Open Stats textbook, available now in Dropbox W5 FOLDER, to see how to solve similar questions -

Question 8 (See Open Stats Section 7.3, Example 7.25 in particular)

A medical researcher wants to compare the pulse rates of smokers and non-smokers. He believes that the pulse rate for smokers and non-smokers is different and wants to test this claim at the 0.1 level of significance. The researcher checks 32 smokers and finds that they have a mean pulse rate of 87, and 31 non-smokers have a mean pulse rate of 84. The standard deviation of the pulse rates is found to be 9 for smokers and 10 for non-smokers. Let $\mu_1$ be the true mean pulse rate for smokers and $\mu_2$ be the true mean pulse rate for non-smokers. Show all work and hypothesis testing steps.

Let smoker group be indexed by 1, non-smoker group by 2.
Given: $n_1 = 32, \bar{x}_1 = 87, n_2 = 31, \bar{x}_2 = 84, s_1 = 9, s_2 = 10, \alpha = .1$

To Do: Test if the pulse rate for smokers and non-smokers is different at the 0.1 level of significance. Thus, double sided test.

i. Null and Alternative Hypothesis

Ho: $\mu_1 - \mu_2 = 0$, Ha: $\mu_1 - \mu_2 \neq 0$

Ho: There is no difference in the pulse rate of smokers vs non-smokers

Ha: There is in fact a difference in the pulse rate of smokers vs non-smokers

ii. Choose level of significance

$\alpha = .1$

iii. Test Statistic

Distribution: t

iv. Decision Rule

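Compare the p-value with alpha using the myp() function defined above: if the p-value is less than alpha, we reject the null.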

v. Take sample and decide

# Ho: mu1 - mu2 = 0, Ha: mu1 - mu2 != 0   (two sided test)
mu1 <- 87   # sample mean pulse rate of smokers (point estimate of mu1)
mu2 <- 84   # sample mean pulse rate of non-smokers (point estimate of mu2)

alpha <- .1

# dist = t
n1 <- 32
n2 <- 31

df1 <- n1 - 1
df2 <- n2 - 1

sd1 <- 9
sd2 <- 10

var1 <- sd1^2   # derive the variances from the standard deviations above
var2 <- sd2^2


num_point_estimate_diff <- (mu1 - mu2)   # point estimate difference
den_Se <- sqrt(var1 / n1 + var2 / n2)    # standard error built from the sample standard deviations (unpooled)
t <- num_point_estimate_diff / den_Se    # note: this name shadows base::t as a variable for the rest of the script

numdf <- (var1 / n1 + var2 / n2)^2                     # Satterthwaite
dendf <- (var1 / n1)^2 / df1 + (var2 / n2)^2 / df2     # Satterthwaite
df <- numdf / dendf                                    # Satterthwaite - can be replaced with smaller of df1 or df2

shadet(df = df, pcts = c(.05, .95))                    # 5% in each tail for the two sided test at alpha = .1
lines(rep(t, 10), seq(0, 1, length.out = 10), col = "red")

# ?pt   # interactive help: distribution function of the t distribution
# Two sided p values; abs() keeps the formula valid for a negative statistic as well.
p_value <- 2 * pt(q = abs(t), df = df, lower.tail = FALSE)    # Satterthwaite, about 0.216
p_value
myp(p_value, alpha)

p_value_robust <- 2 * pt(q = abs(t), df = min(df1, df2), lower.tail = FALSE) # conservative: smaller of the two groups' degrees of freedom
p_value_robust # a slightly different p value (about 0.221), but the same decision
myp(p_value_robust, alpha)

Since our p-value (about .2 in both cases) is not less than our significance level alpha = .1, we fail to reject our null hypothesis. The p-value is the probability of obtaining a test statistic as extreme as or more extreme than our t statistic, assuming the null is true. A p-value greater than alpha means such a statistic is not unusual under the null, so we cannot reject the statement that there is no difference in pulse rates between smokers and non-smokers.

Question 9 (See Open Stats Section 7.3, Example 7.22 in particular)

Given two independent random samples with the following results: $n_1 = 11, \bar{x}_1 = 127, s_1 = 33, n_2 = 18, \bar{x}_2 = 157, s_2 = 27$

Use this data to find the 95% confidence interval for the true difference between the population means. Assume that the population variances are not equal and that the two populations are normally distributed.

To Do: Create a 95% confidence interval for true difference between the population means. Unpooled variances.

i. Null and Alternative Hypothesis

Ho: $\mu_1 - \mu_2 = 0$, Ha: $\mu_1 - \mu_2 \neq 0$ (two-sided, matching the 95% confidence interval)

ii. Choose level of significance

$\alpha = .05$

iii. Test Statistic

Distribution: t

iv. Decision Rule

If p-value is less than alpha, we reject the null hypothesis.

v. Take sample difference and construct 95% CI around it

alpha <- .05

# Summary statistics of the two independent samples
xbar1 <- 127   # mean, group 1
xbar2 <- 157   # mean, group 2

n1 <- 11       # sample size, group 1
n2 <- 18       # sample size, group 2

df1 <- n1 - 1  # degrees of freedom, group 1
df2 <- n2 - 1  # degrees of freedom, group 2

s1 <- 33       # sd, group 1
s2 <- 27       # sd, group 2

var1 <- s1^2   # variance, group 1
var2 <- s2^2   # variance, group 2

# Variance of each sample mean; both the degrees of freedom and the standard error are built from these
vmean1 <- var1 / n1
vmean2 <- var2 / n2

# Satterthwaite DF - can be replaced with smaller of df1 or df2
numdf <- (vmean1 + vmean2)^2
dendf <- vmean1^2 / df1 + vmean2^2 / df2
df <- numdf / dendf
df

delta <- xbar1 - xbar2                  # point estimate of the difference
delta
t <- qt(p = .975, df = df)              # t multiplier leaving 2.5% in each tail (95% interval)
t

Se <- sqrt(vmean1 + vmean2)             # unpooled standard error from the sample standard deviations
Se

interval <- delta + c(-1, 1) * t * Se   # lower and upper confidence limits
interval

The 95% CI for the difference in means is between -54.807 and -5.193.

Question 10 (See Open Stats Section 6.2 Difference of two proportions, Example 6.2.2 in particular)

Two men, A and B, who usually commute to work together decide to conduct an experiment to see whether one route is faster than the other. The men feel that their driving habits are approximately the same, so each morning for two weeks one driver is assigned to route I and the other to route II. The times, recorded to the nearest minute, are shown in the following table. Using this data, find the 98% confidence interval for the true mean difference between the average travel time for route I and the average travel time for route II.

Let $d1 = $ (route I travel time) − (route II travel time).

Assume that the populations of travel times are normally distributed for both routes. Show all work and hypothesis testing steps.

To Do: Find the 98% confidence interval for the true mean difference between the average travel time for route I and the average travel time for route II.

Notation: Let $d =$ (route I travel time) − (route II travel time).

Small Sample -> T distribution (Standard normal with fat tails)

# Goal: 98% confidence interval for the true mean of d = (route I time) - (route II time).
# Both routes are driven on the same ten mornings, so the observations are
# paired by day and the interval is built from the daily differences.

r1 <- c(32, 27, 34, 24, 31, 25, 30, 23, 27, 35)
r2 <- c(28, 28, 33, 25, 26, 29, 33, 27, 25, 33)

# Descriptive statistics per route (kept for reference; not used by the paired interval)
meanr1 <- mean(r1)
meanr2 <- mean(r2)
sdr1   <- sd(r1)
sdr2   <- sd(r2)
var1   <- sdr1^2
var2   <- sdr2^2

d <- r1 - r2              # daily differences, route I minus route II
delta <- mean(d)          # point estimate of the mean difference (equals meanr1 - meanr2)

alpha <- 0.02

# dbar +/- t * Se

n  <- length(d)
df <- n - 1
t  <- qt(p = 1 - alpha / 2, df = df)   # t multiplier leaving 1% in each tail
Se <- sd(d) / sqrt(n)                  # standard error of the mean difference
interval <- c(delta - t * Se, delta + t * Se)
interval
## [1] -2.766534  2.966534
## [1] -4.637066  4.837066

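Treating the ten mornings as paired observations (mean difference 0.1, standard error about 1.016, $t_{0.01,\,9} \approx 2.821$), the 98% CI for the true mean difference is approximately (-2.767, 2.967). The interval contains 0, so the data do not show that one route is faster than the other.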

Question 11 (See Open Stats Textbook - Chapter 5 Section 5.2-5.33: Confidence intervals/Hypothesis testing for a proportion)

The U.S. Census Bureau conducts annual surveys to obtain information on the percentage of the voting-age population that is registered to vote. Suppose that 391 employed persons and 510 unemployed persons are independently and randomly selected, and that 195 of the employed persons and 193 of the unemployed persons have registered to vote. Can we conclude that the percentage of employed workers (p1) who have registered to vote, exceeds the percentage of unemployed workers (p2) who have registered to vote? Use a significance level of 0.05 for the test. Show all work and hypothesis testing steps.

Q: Can we conclude that the percentage of employed workers (p1) who have registered to vote, exceeds the percentage of unemployed workers (p2) who have registered to vote?

Notation: Employed group is group 1 ; Unemployed group is group 2.

Given mean, sample size of both groups, alpha, single sided test, use proportion formula (for Se) from framing of question.

# Ho: pi1 - pi2 <= 0, Ha: pi1 - pi2 > 0 (the percentage of employed workers p1 who have registered to vote exceeds the percentage of unemployed workers p2)
alpha <- .05
# Z

n1 <- 391
n2 <- 510

x1 <- 195
x2 <- 193

p1 <- x1 / n1   # Estimate
p2 <- x2 / n2   # Estimate
p1 - p2

Se <- sqrt(p1 * (1 - p1) / n1 + p2 * (1 - p2) / n2)   # unpooled standard error (the one used for confidence intervals)
Se

pbar <- (x1 + x2) / (n1 + n2)                    # pooled proportion: best estimate of the common proportion when Ho holds
qbar <- 1 - pbar
correction <- 1 / n1 + 1 / n2
Se_robust <- sqrt(pbar * qbar * correction)      # pooled standard error, appropriate for the test because Ho assumes pi1 = pi2

# Close to each other, but not identical
Se_robust     # [1] 0.03328424
Se            # [1] 0.03317529

Z <- (p1 - p2) / Se_robust                       # test statistic under Ho (pooled standard error)
Z

p_value <- pnorm(q = Z, mean = 0, sd = 1, lower.tail = FALSE)   # upper tail p value
p_value
myp(p_value, alpha)
# p-value is about .00015 < alpha = .05
shadenorm(mu = 0, sig = Se_robust, pcts = c(0.0, 0.95))        # shade the upper 5% rejection region under Ho
lines(rep(p1 - p2, 10), seq(0, 20, length.out = 10), col = "red")   # mark the observed difference

Our p-value is less than alpha so we reject the null. It is unlikely we would get this extreme value were the null true, so we reject the null.


  1. If we assume equal variance of the two groups, then we used the “pooled” method. If we do not assume equal variances of the groups, then we use the “Satterthwaite” method.↩︎