HW_5

Inference

# Clear the workspace
  rm(list = ls()) # Clear environment
  gc()            # Clear unused memory

##          used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 522973 28.0    1164015 62.2   660385 35.3
## Vcells 953924  7.3    8388608 64.0  1769723 13.6

  cat("\f")       # Clear the console

Function reject or not

myp=function(p, alpha){
  if(p<alpha){print('REJECT Ho')}else{print('FAIL 2 REJECT')}
}

#Shade norm
shadenorm = function(below=NULL, above=NULL, pcts = c(0.025,0.975), mu=0, sig=1, numpts = 500, color = "gray", dens = 40,                    justabove= FALSE, justbelow = FALSE, lines=FALSE,between=NULL,outside=NULL){

    if(is.null(between)){
         below = ifelse(is.null(below), qnorm(pcts[1],mu,sig), below)
         above = ifelse(is.null(above), qnorm(pcts[2],mu,sig), above)
    }
    if(is.null(outside)==FALSE){
         below = min(outside)
         above = max(outside)
    }
  
    lowlim = mu - 4*sig                         # min point plotted on x axis
    uplim  = mu + 4*sig                         # max point plotted on x axis
    x.grid = seq(lowlim,uplim, length= numpts)
    dens.all = dnorm(x.grid,mean=mu, sd = sig)
    
    if(lines==FALSE){
          plot(x.grid, dens.all, type="l", xlab="X", ylab="Density")    # label y and x axis
    }

    if(lines==TRUE){
          lines(x.grid,dens.all)
    }
    
    if(justabove==FALSE){
        x.below    = x.grid[x.grid<below]
        dens.below = dens.all[x.grid<below]
        polygon(c(x.below,rev(x.below)),c(rep(0,length(x.below)),rev(dens.below)),col=color,density=dens)
    }
    if(justbelow==FALSE){
        x.above    = x.grid[x.grid>above]
        dens.above = dens.all[x.grid>above]
        polygon(c(x.above,rev(x.above)),c(rep(0,length(x.above)),rev(dens.above)),col=color,density=dens)
    }
    
    if(is.null(between)==FALSE){
         from = min(between)
         to   = max(between)
         x.between    = x.grid[x.grid>from&x.grid<to]
         dens.between = dens.all[x.grid>from&x.grid<to]
         polygon(c(x.between,rev(x.between)),c(rep(0,length(x.between)),rev(dens.between)),col=color,density=dens)
    }
}

#Shade t
shadet = function(below=NULL, above=NULL, pcts = c(0.025,0.975), df=1, numpts = 500, color = "gray", dens = 40,   justabove= FALSE, justbelow = FALSE, lines=FALSE,between=NULL,outside=NULL){

    if(is.null(between)){
         below = ifelse(is.null(below), qt(pcts[1],df), below)
         above = ifelse(is.null(above), qt(pcts[2],df), above)
    }
    if(is.null(outside)==FALSE){
         below = min(outside)
         above = max(outside)
    }
  
    lowlim = -4
    uplim  = 4
    x.grid = seq(lowlim,uplim, length= numpts)
    dens.all = dt(x.grid,df)
    
    if(lines==FALSE){
          plot(x.grid, dens.all, type="l", xlab="X", ylab="Density")
    }

    if(lines==TRUE){
          lines(x.grid,dens.all)
    }
    
    if(justabove==FALSE){
        x.below    = x.grid[x.grid<below]
        dens.below = dens.all[x.grid<below]
        polygon(c(x.below,rev(x.below)),c(rep(0,length(x.below)),rev(dens.below)),col=color,density=dens)
    }
    if(justbelow==FALSE){
        x.above    = x.grid[x.grid>above]
        dens.above = dens.all[x.grid>above]
        polygon(c(x.above,rev(x.above)),c(rep(0,length(x.above)),rev(dens.above)),col=color,density=dens)
    }
    
    if(is.null(between)==FALSE){
         from = min(between)
         to   = max(between)
         x.between    = x.grid[x.grid>from&x.grid<to]
         dens.between = dens.all[x.grid>from&x.grid<to]
         polygon(c(x.between,rev(x.between)),c(rep(0,length(x.between)),rev(dens.between)),col=color,density=dens)
    }
}

#Shade Chisquare
shadechi = function(below=NULL, above=NULL, pcts = c(0.025,0.975), df=1, numpts = 500, color = "gray", dens = 40,   justabove= FALSE, justbelow = FALSE, lines=FALSE,between=NULL,outside=NULL){

    if(is.null(between)){
         below = ifelse(is.null(below), qchisq(pcts[1],df), below)
         above = ifelse(is.null(above), qchisq(pcts[2],df), above)
    }
    if(is.null(outside)==FALSE){
         below = min(outside)
         above = max(outside)
    }
  
    lowlim = 0
    uplim  = qchisq(.99,df)
    x.grid = seq(lowlim,uplim, length= numpts)
    dens.all = dchisq(x.grid,df)
    
    if(lines==FALSE){
          plot(x.grid, dens.all, type="l", xlab="X", ylab="Density")
    }
    if(lines==TRUE){
          lines(x.grid,dens.all)
    }
    
    if(justabove==FALSE){
        x.below    = x.grid[x.grid<below]
        dens.below = dens.all[x.grid<below]
        polygon(c(x.below,rev(x.below)),c(rep(0,length(x.below)),rev(dens.below)),col=color,density=dens)
    }
    if(justbelow==FALSE){
        x.above    = x.grid[x.grid>above]
        dens.above = dens.all[x.grid>above]
        polygon(c(x.above,rev(x.above)),c(rep(0,length(x.above)),rev(dens.above)),col=color,density=dens)
    }
    
    if(is.null(between)==FALSE){
         from = min(between)
         to   = max(between)
         x.between    = x.grid[x.grid>from&x.grid<to]
         dens.between = dens.all[x.grid>from&x.grid<to]
         polygon(c(x.between,rev(x.between)),c(rep(0,length(x.between)),rev(dens.between)),col=color,density=dens)
    }
}

Using traditional methods, it takes 109 hours to receive a basic driving license. A new license training method using Computer Aided Instruction (CAI) has been proposed. A researcher used the technique with 190 students and observed that they had a mean of 110 hours. Assume the standard deviation is known to be 6. A level of significance of 0.05 will be used to determine if the technique performs differently than the traditional method. Make a decision to reject or fail to reject the null hypothesis. Show all work in R.

# Ho: Mu=109, Ha: Mu<>109
n = 190 # Sample population
m = 109 # Population Mean
sm = 110 # Sample mean
sd = 6 # Standard deviation of population
se = sd/sqrt(n) #Standard error
alpha = 0.05 #Level of significance

z = (sm - m)/(se)
z

## [1] 2.297341

p = round(2*(1 - pnorm(z)),4)
p

## [1] 0.0216

myp(p,alpha)

## [1] "REJECT Ho"

p value < alpha, reject null

shadenorm( mu = 109, sig = 6/sqrt(190), pcts = c(0.025,0.975), color = "red")   # shades significance level gates 
lines(x=rep(110,10), y=seq(0,1,length.out=10), col='blue')

Our environment is very sensitive to the amount of ozone in the upper atmosphere. The level of ozone normally found is 5.3 parts/million (ppm). A researcher believes that the current ozone level is at an insufficient level. The mean of 5 samples is 5.0 ppm with a standard deviation of 1.1. Does the data support the claim at the 0.05 level? Assume the population distribution is approximately normal.

# Ho: Mu=5, Ha: Mu<>5
n = 5 # Sample population
m = 5.3 # Population Mean
smean = 5.0 # Sample mean
std = 1.1 # Standard deviation of population
se = std/sqrt(n) #standard error
alpha = 0.05 # Level of significance

z = (smean - m)/(se)
z

## [1] -0.6098367

df = n-1
p = qt(p=alpha/2, df, lower.tail=FALSE)
p

## [1] 2.776445

interval = c(smean - p * se, smean + p * se)
interval

## [1] 3.63417 6.36583

The level of ozone normally found is between 3.63417 and 6.36583 ppm. The interval is within 5.3, so we cannot reject the null hypothesis. And the data does not support the claim

myp(p,alpha)

## [1] "FAIL 2 REJECT"

Our environment is very sensitive to the amount of ozone in the upper atmosphere. The level of ozone normally found is 7.3 parts/million (ppm). A researcher believes that the current ozone level is not at a normal level. The mean of 51 samples is 7.1 ppm with a variance of 0.49. Assume the population is normally distributed. A level of significance of 0.01 will be used. Show all work and hypothesis testing steps.

# Ho: Mu=7.3,
#Ha: Mu<>7.3
n <- 51 # Sample population
m <- 7.3 # Population Mean
Samp <- 7.1 # Sample mean
var <- 0.49 # Variance of population
sd <- sqrt(var) # Standard deviation of population
se <- sd/sqrt(n) # Standard deviation of sample (standard error)
alpha <- 0.01 # Level of significance

t <- (7.1 - 7.3) / se
t

## [1] -2.040408

?pt

## starting httpd help server ... done

df = n-1
p =  2 * pt(q = t,
                   df)
p

## [1] 0.04660827

interval = c(Samp - p * se, Samp + p * se)
interval

## [1] 7.095431 7.104569

myp(p, alpha)

## [1] "FAIL 2 REJECT"

The level of ozone normally found is between 7.095431 and 7.104569 ppm. 7.3 is outside of the interval, so we can reject the null hypothesis and assume that the data supports the researcher’s claim.

shadet(df = n-1, 
       pcts = c(.005 , 0.995),
       color = "lightblue")
lines(x = rep(t,10), 
      y = seq(from = 0, 
              to = 1,
              length.out = 10
              ), 
      col='gray'
)

A publisher reports that 36% of their readers own a laptop. A marketing executive wants to test the claim that the percentage is actually less than the reported percentage. A random sample of 100 found that 29% of the readers owned a laptop. Is there sufficient evidence at the 0.02 level to support the executive’s claim? Show all work and hypothesis testing steps.

# Ho: pi>=0.36, Ha: pi<0.36
#No standard deviation so t distribution

n = 100
pr = 0.36
sam = 0.29
q = 1-pr
alpha <- 0.02 # Level of significance
se <- sqrt(pr*q/n)

Z = (sam - pr) / se
Z

## [1] -1.458333

p = pnorm(Z)
pnorm(Z)

## [1] 0.07237434

myp(p, alpha)

## [1] "FAIL 2 REJECT"

shadenorm(mu = .36, 
          sig = se, 
          pcts = c(.02),
          color = 'red'
          )
lines(x = rep(.29,10), 
      y = seq(from = 0, 
              to = 20,
              length.out=10), 
      col='blue')

A hospital director is told that 31% of the treated patients are uninsured. The director wants to test the claim that the percentage of uninsured patients is less than the expected percentage. A sample of 380 patients found that 95 were uninsured. Make the decision to reject or fail to reject the null hypothesis at the 0.05 level. Show all work and hypothesis testing steps.

# Ho: pi>=0.31
#Ha: pi<0.31

hd = .31
n = 380
unins = .95
df = n-1
lev = -0.05

SE<-sqrt((hd*(1-hd))/n)
SE

## [1] 0.0237254

z<-qnorm(1-.05/2)
z

## [1] 1.959964

myp(p,alpha)

## [1] "FAIL 2 REJECT"

upper<-unins+Z*SE
upper

## [1] 0.9154005

lower<-unins-Z*SE 
lower

## [1] 0.9845995

The percent of uninsured patients is between 90.34991 and 99.65009 percent.Because 31 is equal to or less than the upper bound, we cannot reject the null hypothesis. The data does not support the claim.

Act<-112
Lat<-102
sd1<-24
sd2<-15.4387
n=22
alpha<-0.1
meandifference<-Act-Lat
meandifference

## [1] 10

SE<-sqrt((sd1^2/n)+(sd2^2/n))
SE

## [1] 6.084083

df<-n-1
tvdf<-qt(p=alpha/2,df,lower.tail = FALSE)
tvdf

## [1] 1.720743

upper<-Lat+(tvdf*SE)
upper

## [1] 112.4691

lower<-Lat-(tvdf*SE)
lower

## [1] 91.53086

upper1<-sd2+(tvdf*SE)
upper1

## [1] 25.90784

lower1<-sd2-(tvdf*SE)
lower1

## [1] 4.969557

Fail to reject null.

A medical researcher wants to compare the pulse rates of smokers and non-smokers. He believes that the pulse rate for smokers and non-smokers is different and wants to test this claim at the 0.1 level of significance. The researcher checks 32 smokers and finds that they have a mean pulse rate of 87, and 31 non-smokers have a mean pulse rate of 84. The standard deviation of the pulse rates is found to be 9 for smokers and 10 for non-smokers. Let 𝜇1 be the true mean pulse rate for smokers and 𝜇2 be the true mean pulse rate for non-smokers. Show all work and hypothesis testing steps.

# Ho:  Mu1-Mu2=0
# Ha:  Mu1-Mu2<>0

#Let smoker group be indexed by 1, non-smoker group by 2.

mu1 <- 87
mu2 <- 84

alpha =   0.1

n1    =   32
n2    =   31

df1   =   n1 - 1
df2   =   n2 - 1
sd1   =   9
sd2   =   10

var1  = sd1^2
var2  = sd2^2

pointestdiff = (mu1-mu2)  
denSe        = sqrt(var1/n1 + var2/n2)  
t            = (pointestdiff - 0) / denSe
t

## [1] 1.25032

numdf = (var1/n1 + var2/n2)^2                       
dendf = (var1/n1)^2 / df1 + (var2/n2)^2 / df2       
df    = numdf / dendf

shadet(df = df, pcts = c(.05,.95))
lines(rep(t,10), seq(0,1,length.out=10),col='red')

p_value = 2 * ( 1 - pt(q = t, df = numdf/dendf))
p_value

## [1] 0.2160473

myp(p = p_value,alpha)

## [1] "FAIL 2 REJECT"

p_value_robust = 2 * ( 1 - pt(q = t, df = min(df1, df2)))
p_value_robust

## [1] 0.220848

myp(p = p_value_robust, alpha )

## [1] "FAIL 2 REJECT"

Fail to reject null

Given two independent random samples with the following results: 𝑛1 = 11 𝑥̅1 = 127 𝑠1 = 33 𝑛2 = 18 𝑥̅ 2 = 157 𝑠2 = 27 Use this data to find the 95% confidence interval for the true difference between the population means. Assume that the population variances are not equal and that the two populations are normally distributed.

# Ho: xbar1 - xbar2 =  0,
# Ha: xbar1 - xbar2 <> 0

alpha = 0.05

xbar1   =   127 
xbar2   =   157   

n1      =   11      
n2      =   18     

df1     =   n1-1    
df2     =   n2-1    

s1      =   33     
s2      =   27      

var1    =   s1^2    
var2    =   s2^2

numdf =   (var1 / n1 + var2 / n2)^2
dendf =   (var1 / n1)^2 / df1 +  
          (var2 / n2)^2 / df2
df    =   numdf / dendf
df

## [1] 18.0759

delta = xbar1 - xbar2               
delta

## [1] -30

t   =   qt(p = 0.975, df = df)   
t

## [1] 2.10029

Se = sqrt( var1/n1 + var2/n2 )
Se

## [1] 11.81101

interval = c( delta - t * Se , delta + t * Se )
interval

## [1] -54.806548  -5.193452

CI is -54.806548 and -5.193452.

pval = 2*(pt(q = delta/Se, df = numdf/dendf))   
pval

## [1] 0.02048034

myp(p = pval,alpha)

## [1] "REJECT Ho"

pval2 = 2*( pt(q = delta/Se, df = min(df1, df2))) 
pval2

## [1] 0.02936303

myp(p = pval2,alpha)

## [1] "REJECT Ho"

Two men, A and B, who usually commute to work together decide to conduct an experiment to see whether one route is faster than the other. The men feel that their driving habits are approximately the same, so each morning for two weeks one driver is assigned to route I and the other to route II. The times, recorded to the nearest minute, are shown in the following table.

Day

Mo Tu Wed Th F Mo Tu Wed Th F

Route I

32 27 34 24 31 25 30 23 27 35

Route II

28 28 33 25 26 29 33 27 25 33

Using this data, find the 98% confidence interval for the true mean difference between the average travel time for route I and the average travel time for route II.

Let 𝑑 = (𝑟𝑜𝑢𝑡𝑒 𝐼 𝑡𝑟𝑎𝑣𝑒𝑙 𝑡𝑖𝑚𝑒) − (𝑟𝑜𝑢𝑡𝑒 𝐼𝐼 𝑡𝑟𝑎𝑣𝑒𝑙 𝑡𝑖𝑚𝑒). Assume that the populations of travel times are normally distributed for both routes. Show all work and hypothesis testing steps.

r1    <- c(32, 27, 34, 24, 31, 25, 30, 23, 27, 35)
r2    <- c(28, 28, 33, 25, 26, 29, 33, 27, 25, 33)

alpha <- 0.02

xbar1 <- mean(r1) 
xbar2 <- mean(r2) 

n1    <-   10      
n2    <-   10    

df1   <-   n1 - 1    
df2   <-   n2 - 1   

?sd 
s1    <-   sd(r1)      
s2    <-   sd(r2)      

var1  <-   s1^2    
var2  <-   s2^2    

Se    = sqrt(var1/n1 + var2/n2)        
Se

## [1] 1.678955

delta <- xbar1-xbar2            
delta

## [1] 0.1

t     =   qt(p = alpha/2, df = min(df1,df2))      
t

## [1] -2.821438

t     =   qt(p = 1-alpha/2, df = min(df1,df2))      
t

## [1] 2.821438

interval = c( delta - t * Se , delta + t * Se )
interval

## [1] -4.637066  4.837066

numdf = (var1/n1 + var2/n2)^2                       
dendf = (var1/n1)^2 / df1 + (var2/n2)^2 / df2     
df    = numdf / dendf                                 

t1 <- qt(p = (1-alpha/2), df = df)  
t1

## [1] 2.568883

interval = c(delta - t1 * Se , 
             delta + t1 * Se )
interval

## [1] -4.213039  4.413039

The U.S. Census Bureau conducts annual surveys to obtain information on the percentage of the voting-age population that is registered to vote. Suppose that 391 employed persons and 510 unemployed persons are independently and randomly selected, and that 195 of the employed persons and 193 of the unemployed persons have registered to vote. Can we conclude that the percentage of employed workers ( 𝑝1 ), who have registered to vote, exceeds the percentage of unemployed workers ( 𝑝2 ), who have registered to vote? Use a significance level of 𝛼 = 0.05 for the test. Show all work and hypothesis testing steps

# Ho: p1 - p2 <= 0,
# Ha: p1 - p2 >  0

alpha=0.05

n1  = 391
n2  = 510

x1  = 195
x2  = 193

p1  = x1/n1  
p2  = x2/n2    
p1-p2

## [1] 0.1202899

Se  = sqrt (p1 * (1-p1) / n1 + p2 * (1-p2) / n2)
Se

## [1] 0.03317529

Z   = (p1 - p2) / Se
Z

## [1] 3.625887

p = pnorm(q = Z, 
      lower.tail = FALSE)
p

## [1] 0.0001439855

myp(p, alpha)

## [1] "REJECT Ho"

Reject null.

HW_5

Ganesh Kumar

2024-04-20

Inference