Assignment 5

Author

Allison Shrivastava

Create and store the shade norm function for later use

# Plot a normal density curve and shade selected regions under it.
#
# Arguments:
#   below, above  Cut points: the area left of `below` and right of `above`
#                 is shaded. When NULL (and `between` is NULL) they default
#                 to the `pcts` quantiles of the distribution.
#   pcts          Lower and upper probabilities used for the default cut points.
#   mu, sig       Mean and standard deviation of the normal distribution.
#   numpts        Number of grid points used to draw the curve.
#   color, dens   Fill colour and hatching density passed to polygon().
#   justabove     If TRUE, the lower tail is not shaded.
#   justbelow     If TRUE, the upper tail is not shaded.
#   lines         If TRUE, add the curve to the current plot instead of
#                 starting a new plot.
#   between       Optional vector; the area between its min and max is shaded.
#   outside       Optional vector; its min and max override below/above.
#
# Returns (invisibly) a list with the `below` and `above` cut points used.
shadenorm <- function(below = NULL, above = NULL, pcts = c(0.025, 0.975),
                      mu = 0, sig = 1, numpts = 500, color = "gray", dens = 40,
                      justabove = FALSE, justbelow = FALSE, lines = FALSE,
                      between = NULL, outside = NULL) {

  # Default the cut points to the pcts quantiles unless a `between` region
  # was requested. Plain if() is used because these are scalar decisions.
  if (is.null(between)) {
    if (is.null(below)) below <- qnorm(pcts[1], mu, sig)
    if (is.null(above)) above <- qnorm(pcts[2], mu, sig)
  }
  # `outside` takes precedence over below/above
  if (!is.null(outside)) {
    below <- min(outside)
    above <- max(outside)
  }

  # Evaluate the density on a grid covering mu +/- 4 standard deviations
  x.grid <- seq(mu - 4 * sig, mu + 4 * sig, length.out = numpts)
  dens.all <- dnorm(x.grid, mean = mu, sd = sig)

  if (lines) {
    # graphics:: is spelled out because the argument `lines` shares the name
    graphics::lines(x.grid, dens.all)
  } else {
    plot(x.grid, dens.all, type = "l", xlab = "X", ylab = "Density")
  }

  # Shade the area under the curve over the grid points flagged by `keep`.
  # which() discards NA and zero-length masks, so a NULL or NA cut point
  # simply shades nothing.
  shade <- function(keep) {
    idx <- which(keep)
    if (length(idx) == 0) {
      return(invisible(NULL))
    }
    xs <- x.grid[idx]
    polygon(c(xs, rev(xs)), c(rep(0, length(xs)), rev(dens.all[idx])),
            col = color, density = dens)
  }

  if (!justabove) shade(x.grid < below)
  if (!justbelow) shade(x.grid > above)
  if (!is.null(between)) {
    shade(x.grid > min(between) & x.grid < max(between))
  }

  invisible(list(below = below, above = above))
}

# Test the function: draw the standard normal curve with the default
# 2.5% and 97.5% quantiles as cut points, shading both tails
shadenorm(mu = 0, sig = 1, pcts = c(0.025,0.975))

Create and store the shade chi-square function for later use

# Plot a chi-square density curve and shade selected regions under it.
#
# Arguments:
#   below, above  Cut points: the area left of `below` and right of `above`
#                 is shaded. When NULL (and `between` is NULL) they default
#                 to the `pcts` quantiles of the distribution.
#   pcts          Lower and upper probabilities used for the default cut points.
#   df            Degrees of freedom of the chi-square distribution.
#   numpts        Number of grid points used to draw the curve.
#   color, dens   Fill colour and hatching density passed to polygon().
#   justabove     If TRUE, the lower tail is not shaded.
#   justbelow     If TRUE, the upper tail is not shaded.
#   lines         If TRUE, add the curve to the current plot instead of
#                 starting a new plot.
#   between       Optional vector; the area between its min and max is shaded.
#   outside       Optional vector; its min and max override below/above.
#
# Returns (invisibly) a list with the `below` and `above` cut points used.
shadechi <- function(below = NULL, above = NULL, pcts = c(0.025, 0.975),
                     df = 1, numpts = 500, color = "gray", dens = 40,
                     justabove = FALSE, justbelow = FALSE, lines = FALSE,
                     between = NULL, outside = NULL) {

  # Default the cut points to the pcts quantiles unless a `between` region
  # was requested. Plain if() is used because these are scalar decisions.
  if (is.null(between)) {
    if (is.null(below)) below <- qchisq(pcts[1], df)
    if (is.null(above)) above <- qchisq(pcts[2], df)
  }
  # `outside` takes precedence over below/above
  if (!is.null(outside)) {
    below <- min(outside)
    above <- max(outside)
  }

  # Evaluate the density on a grid from 0 up to the 99th percentile
  x.grid <- seq(0, qchisq(.99, df), length.out = numpts)
  dens.all <- dchisq(x.grid, df)

  if (lines) {
    # graphics:: is spelled out because the argument `lines` shares the name
    graphics::lines(x.grid, dens.all)
  } else {
    plot(x.grid, dens.all, type = "l", xlab = "X", ylab = "Density")
  }

  # Shade the area under the curve over the grid points flagged by `keep`.
  # which() discards NA and zero-length masks, so a NULL or NA cut point
  # (e.g. when pcts has a single element) simply shades nothing.
  shade <- function(keep) {
    idx <- which(keep)
    if (length(idx) == 0) {
      return(invisible(NULL))
    }
    xs <- x.grid[idx]
    polygon(c(xs, rev(xs)), c(rep(0, length(xs)), rev(dens.all[idx])),
            col = color, density = dens)
  }

  if (!justabove) shade(x.grid < below)
  if (!justbelow) shade(x.grid > above)
  if (!is.null(between)) {
    shade(x.grid > min(between) & x.grid < max(between))
  }

  invisible(list(below = below, above = above))
}

# Test the function with 2 degrees of freedom.
# pcts has a single element here, so pcts[2] is NA and only the lower
# (5%) cut point is defined.
shadechi(df = 2, pcts=c(.05))   # change pcts and see what happens

Create and store the shade t function for later use

# Plot a Student t density curve and shade selected regions under it.
#
# Arguments:
#   below, above  Cut points: the area left of `below` and right of `above`
#                 is shaded. When NULL (and `between` is NULL) they default
#                 to the `pcts` quantiles of the distribution.
#   pcts          Lower and upper probabilities used for the default cut points.
#   df            Degrees of freedom of the t distribution.
#   numpts        Number of grid points used to draw the curve.
#   color, dens   Fill colour and hatching density passed to polygon().
#   justabove     If TRUE, the lower tail is not shaded.
#   justbelow     If TRUE, the upper tail is not shaded.
#   lines         If TRUE, add the curve to the current plot instead of
#                 starting a new plot.
#   between       Optional vector; the area between its min and max is shaded.
#   outside       Optional vector; its min and max override below/above.
#
# The curve is always drawn on the fixed range [-4, 4]; cut points outside
# that range produce no visible shading.
#
# Returns (invisibly) a list with the `below` and `above` cut points used.
shadet <- function(below = NULL, above = NULL, pcts = c(0.025, 0.975),
                   df = 1, numpts = 500, color = "gray", dens = 40,
                   justabove = FALSE, justbelow = FALSE, lines = FALSE,
                   between = NULL, outside = NULL) {

  # Default the cut points to the pcts quantiles unless a `between` region
  # was requested. Plain if() is used because these are scalar decisions.
  if (is.null(between)) {
    if (is.null(below)) below <- qt(pcts[1], df)
    if (is.null(above)) above <- qt(pcts[2], df)
  }
  # `outside` takes precedence over below/above
  if (!is.null(outside)) {
    below <- min(outside)
    above <- max(outside)
  }

  # Evaluate the density on the fixed grid [-4, 4]
  x.grid <- seq(-4, 4, length.out = numpts)
  dens.all <- dt(x.grid, df)

  if (lines) {
    # graphics:: is spelled out because the argument `lines` shares the name
    graphics::lines(x.grid, dens.all)
  } else {
    plot(x.grid, dens.all, type = "l", xlab = "X", ylab = "Density")
  }

  # Shade the area under the curve over the grid points flagged by `keep`.
  # which() discards NA and zero-length masks, so a NULL or NA cut point
  # simply shades nothing.
  shade <- function(keep) {
    idx <- which(keep)
    if (length(idx) == 0) {
      return(invisible(NULL))
    }
    xs <- x.grid[idx]
    polygon(c(xs, rev(xs)), c(rep(0, length(xs)), rev(dens.all[idx])),
            col = color, density = dens)
  }

  if (!justabove) shade(x.grid < below)
  if (!justbelow) shade(x.grid > above)
  if (!is.null(between)) {
    shade(x.grid > min(between) & x.grid < max(between))
  }

  invisible(list(below = below, above = above))
}

# Test the function: draw the t curve with 4 degrees of freedom and the
# 2.5% and 97.5% quantiles as cut points, shading both tails
shadet(df = 4, pcts = c(0.025,0.975)) 

Using traditional methods, it takes 109 hours to receive a basic driving license. A new license training method using Computer Aided Instruction (CAI) has been proposed. A researcher used the technique with 190 students and observed that they had a mean of 110 hours. Assume the standard deviation is known to be 6. A level of significance of 0.05 will be used to determine if the technique performs differently than the traditional method. Make a decision to reject or fail to reject the null hypothesis. Show all work in R.

Given: $\mu = 109$, $n = 190$, $\bar{x} = 110$, $\sigma = 6$, $\alpha = 0.05$.

To Do: Determine if the technique performs differently than the traditional method. Burden of proof falls on alternative hypothesis.

Because our test statistic, z = 2.297, exceeds the two-tailed critical value of 1.96, it falls within the rejection region. We reject the null hypothesis and conclude that the CAI technique performs differently than the traditional method.

# Define the given values
mu <- 109
alpha <- 0.05
xbar <- 110
sigma <- 6
n <- 190

# Test statistic: z-test, since sigma is known and the sample is large.
# Use n rather than repeating the literal 190 so the values stay in sync.
z <- (xbar - mu) / (sigma / sqrt(n))

# Print the test statistic
z
[1] 2.297
# Two-tailed critical value, stored for the visualisation
z2 <- qnorm(1 - alpha / 2)

## Now visualise: shade both rejection regions of the standard normal
shadenorm(mu = 0,
          sig = 1,
          outside = c(-z2, z2),
          color = "pink")

Our environment is very sensitive to the amount of ozone in the upper atmosphere. The level of ozone normally found is 5.3 parts/million (ppm). A researcher believes that the current ozone level is at an insufficient level. The mean of 5 samples is 5.0 parts per million (ppm) with a standard deviation of 1.1. Does the data support the claim at the 0.05 level? Assume the population distribution is approximately normal.

Given: 𝜇=5.3,𝑛=5,𝑥¯=5,𝜎=1.1,𝛼=.05.

To Do: Researcher believes that the current ozone level is at an insufficient level - does the data support the claim at the 0.05 level?

Because our p-value of 0.2875 is greater than 0.05, we fail to reject the null hypothesis; there is not sufficient evidence to support the claim.

# Set the given values
mu <- 5.3
xbar <- 5
s <- 1.1
n <- 5
alpha <- 0.05

# 5 is a very small sample, so use a t-test
t_test <- (xbar - mu) / (s / sqrt(n))

# Print the test statistic
t_test
[1] -0.6098
# The belief is that the current level is insufficient: left-tailed test
p_value <- pt(t_test, df = n - 1)
# Print the p-value
p_value
[1] 0.2875
## Now visualise
# First find the left-tail critical value
t_crit <- qt(alpha, df = n - 1)
# justbelow = TRUE shades only the left rejection region (x < t_crit).
# justabove = TRUE would skip that region and shade the upper tail instead.
shadet(df = n - 1,
       below = t_crit,
       justbelow = TRUE,
       color = "blue")
# Print the critical value
t_crit
[1] -2.132
# Mark the observed test statistic (computed value, not a rounded literal)
abline(v = t_test, col = "black", lwd = 2)

Our environment is very sensitive to the amount of ozone in the upper atmosphere. The level of ozone normally found is 7.3 parts/million (ppm). A researcher believes that the current ozone level is not at a normal level. The mean of 51 samples is 7.1 ppm with a variance of 0.49. Assume the population is normally distributed. A level of significance of 0.01 will be used. Show all work and hypothesis testing steps.

Given: $\mu = 7.3$, $n = 51$, $\bar{x} = 7.1$, $\sigma^2 = 0.49$, $\alpha = 0.01$.

To Do: Researcher believes that the current ozone level is not at normal level. Thus, set a double sided hypothesis.

Since the p-value of 0.041 is greater than 0.01, we fail to reject the null hypothesis; there is not enough evidence to conclude that the ozone level is different from the normal level.

# Set the given values
mu <- 7.3
xbar <- 7.1
sigma <- sqrt(0.49)   # the variance is given, so take its square root
n <- 51
alpha <- 0.01

# z test statistic
zstat <- (xbar - mu) / (sigma / sqrt(n))
# Print the test statistic
zstat
[1] -2.04
# Two-tailed test, as the hypothesis is that the level is abnormal.
# -abs() keeps the p-value valid whichever side of zero the statistic falls on.
p_value <- 2 * pnorm(-abs(zstat))
# Print the p-value
p_value
[1] 0.04131
# Now calculate the critical value
z_val <- qnorm(1 - alpha / 2)
# Print the critical value
z_val
[1] 2.576
# Now visualise both rejection regions
shadenorm(mu = 0,
          sig = 1,
          outside = c(-z_val, z_val),
          color = "brown")
# Add the test statistic line
abline(v = zstat, col = "black", lwd = 2)

A publisher reports that 36% of their readers own a laptop. A marketing executive wants to test the claim that the percentage is actually less than the reported percentage. A random sample of 100 found that 29% of the readers owned a laptop. Is there sufficient evidence at the 0.02 level to support the executive’s claim? Show all work and hypothesis testing steps.

Given: $\pi = 0.36$, $n = 100$, $\hat{p} = 0.29$, $\alpha = 0.02$.

To Do: Executive wants to test the claim that the percentage is actually less than the reported percentage. Thus, set a single sided hypothesis.

Since the p-value of 0.072 is greater than 0.02, we fail to reject the null hypothesis; we don’t have sufficient evidence to support this claim.

# Set the given values
p <- 0.36
p_hat <- 0.29
n <- 100
alpha <- 0.02

# Standard error under the null hypothesis
se <- sqrt(p * (1 - p) / n)

## Larger sample, so a z-test is appropriate
z <- (p_hat - p) / se

# Print the test statistic
z
[1] -1.458
# Left-tailed p-value
p_val <- pnorm(z)

# Print the p-value
p_val
[1] 0.07237
# Left-tail critical value
z_cv <- qnorm(alpha)
z_cv
[1] -2.054
### Now visualise and check
# justbelow = TRUE shades only the left rejection region (x < z_cv).
# justabove = TRUE would skip that region and shade the upper tail instead.
shadenorm(mu = 0,
          sig = 1,
          below = z_cv,
          justbelow = TRUE,
          color = "purple")

## Add the test statistic and critical value lines
abline(v = z, col = "black", lwd = 2)
abline(v = z_cv, col = "blue", lty = 2)

A hospital director is told that 31% of the treated patients are uninsured. The director wants to test the claim that the percentage of uninsured patients is less than the expected percentage. A sample of 380 patients found that 95 were uninsured. Make the decision to reject or fail to reject the null hypothesis at the 0.05 level. Show all work and hypothesis testing steps.

Given: $\pi = 0.31$, $n = 380$, $\hat{p} = \frac{95}{380} = 0.25$, $\alpha = 0.05$.

To Do: The director wants to test the claim that the percentage of uninsured patients is less than the expected percentage. Thus, set a single sided (left-tailed) hypothesis.

Because the test statistic, -2.53, is less than the critical value of -1.645, we reject the null hypothesis; there is enough evidence to conclude that fewer than 31% of patients are uninsured.

# Left-tailed, one-proportion z-test (large sample)

# Set the given values
p <- 0.31
p_hat <- 95 / 380
n <- 380
alpha <- 0.05

# Standard error under the null hypothesis, then the test statistic
se <- sqrt(p * (1 - p) / n)
z <- (p_hat - p) / se

# Print the test statistic
z
[1] -2.529
# Left-tailed p-value
p_val <- pnorm(z)
# Print the p-value
p_val
[1] 0.00572
## Now calculate the critical value and visualise
zcv <- qnorm(alpha)
# Print the critical value
zcv
[1] -1.645
# justbelow = TRUE restricts the shading to the left rejection region;
# without it the upper 2.5% tail is shaded as well.
shadenorm(mu = 0,
          sig = 1,
          below = zcv,
          justbelow = TRUE,
          color = "blue")

## Add the test statistic and critical value lines
abline(v = z, col = "turquoise", lwd = 2)
# lty (line type) draws the critical value as a dashed line
abline(v = zcv, col = "darkblue", lty = 2)

A medical researcher wants to compare the pulse rates of smokers and non-smokers. He believes that the pulse rate for smokers and non-smokers is different and wants to test this claim at the 0.1 level of significance. The researcher checks 32 smokers and finds that they have a mean pulse rate of 87, and 31 non-smokers have a mean pulse rate of 84. The standard deviation of the pulse rates is found to be 9 for smokers and 10 for non-smokers. Let 𝜇1 be the true mean pulse rate for smokers and 𝜇2 be the true mean pulse rate for non-smokers. Show all work and hypothesis testing steps.

Let smoker group be indexed by 1, non-smoker group by 2.
Given: $n_1 = 32$, $\bar{x}_1 = 87$, $n_2 = 31$, $\bar{x}_2 = 84$, $\sigma_1 = 9$, $\sigma_2 = 10$, $\alpha = 0.10$.

To Do: Test if the pulse rate for smokers and non-smokers is different at the 0.1 level of significance. Thus, double sided test.

Since the p-value of 0.216 is greater than 0.10, we fail to reject the null hypothesis; there is not sufficient evidence to say the mean pulse rates are different.

# Given values: group 1 = smokers, group 2 = non-smokers
n1 <- 32
n2 <- 31
x1 <- 87
x2 <- 84
s1 <- 9
s2 <- 10
alpha <- 0.1

# Standard error of the difference in means (unpooled variances)
se <- sqrt((s1^2 / n1) + (s2^2 / n2))
# Test statistic
t <- (x1 - x2) / se
# Show the test statistic
t
[1] 1.25
## Welch-Satterthwaite degrees of freedom
degf <- ((s1^2 / n1 + s2^2 / n2)^2) /
  (((s1^2 / n1)^2) / (n1 - 1) + ((s2^2 / n2)^2) / (n2 - 1))
# Show the degrees of freedom
degf
[1] 59.88
# Two-sided p-value
p_val <- 2 * (1 - pt(abs(t), degf))
# Show the p-value
p_val
[1] 0.216
## Critical value for the two-tailed t test, used for the plot
t_cv <- qt(1 - alpha / 2, degf)
# Show the critical value
t_cv
[1] 1.671
## Test statistic for the plot (same value as t above)
t_stat <- t
shadet(
  df = degf,
  outside = c(-t_cv, t_cv),
  color = "darkgreen"
)

# Mark the test statistic and both critical values
abline(v = t_stat, col = "lightgreen", lwd = 2)
abline(v = c(-t_cv, t_cv), col = "black", lty = 2)

Given two independent random samples with the following results: $n_1 = 11$, $\bar{x}_1 = 127$, $\sigma_1 = 33$, $n_2 = 18$, $\bar{x}_2 = 157$, $\sigma_2 = 27$

Use this data to find the 95% confidence interval for the true difference between the population means. Assume that the population variances are not equal and that the two populations are normally distributed.

To Do: Create a 95% confidence interval for true difference between the population means.

The 95% confidence interval is (-54.807, -5.193).

## Variances are assumed unequal, so build a Welch t interval
# Given values
n1 <- 11
n2 <- 18
x1 <- 127
x2 <- 157
s1 <- 33
s2 <- 27
alpha <- 0.05

# Standard error of the difference in means (unpooled variances)
se <- sqrt(s1^2 / n1 + s2^2 / n2)

# Welch-Satterthwaite degrees of freedom
degf <- ((s1^2 / n1 + s2^2 / n2)^2) /
  ((s1^2 / n1)^2 / (n1 - 1) + (s2^2 / n2)^2 / (n2 - 1))

# Two-sided t critical value
t_cv <- qt(1 - alpha / 2, degf)

## Confidence interval for the difference in means
dif <- x1 - x2
lower <- dif - t_cv * se
upper <- dif + t_cv * se
c(lower, upper)
[1] -54.807  -5.193
## First attempt at a plot, using shadet
shadet(
  df = degf,
  between = c(lower, upper),
  color = "darkred"
)

## Line for the observed mean difference
abline(v = dif, col = "lightblue", lwd = 2)
## Lines for the interval limits
abline(v = c(lower, upper), col = "darkblue", lty = 2, lwd = 2)

### shadet draws on a fixed x range, and the interval limits fall outside
### it, so the interval is drawn again with base R graphics
plot(1, type = "n", xlim = c(lower - 10, upper + 10), ylim = c(0, 1))
rect(lower, 0, upper, 0.5, col = "green")
abline(v = dif, col = "darkgreen", lwd = 2)
## Boundary lines at the interval limits
abline(v = c(lower, upper), col = "pink", lty = 2, lwd = 2)

Two men, A and B, who usually commute to work together decide to conduct an experiment to see whether one route is faster than the other. The men feel that their driving habits are approximately the same, so each morning for two weeks one driver is assigned to route I and the other to route II. The times, recorded to the nearest minute, are shown in the following table. Using this data, find the 98% confidence interval for the true mean difference between the average travel time for route I and the average travel time for route II.

r1 = c (32, 27, 34, 24, 31, 25, 30, 23, 27, 35)
r2 = c (28, 28, 33, 25, 26, 29, 33, 27, 25, 33)

Let 𝑑1= (route I travel time) − (route II travel time).

Assume that the populations of travel times are normally distributed for both routes. Show all work and hypothesis testing steps.

To Do: Find the 98% confidence interval for the true mean difference between the average travel time for route I and the average travel time for route II.

The 98% confidence interval is (-2.767, 2.967).

## Paired experiment: both routes are timed on the same mornings
r1 <- c(32, 27, 34, 24, 31, 25, 30, 23, 27, 35)
r2 <- c(28, 28, 33, 25, 26, 29, 33, 27, 25, 33)

# Per-day difference (route I minus route II)
d <- r1 - r2
# Print the differences
d
 [1]  4 -1  1 -1  5 -4 -3 -4  2  2
## Compute the mean and SD of the differences
d_bar <- mean(d)
sd_d <- sd(d)
## Sample size
n <- length(d)
# Print the summary values
d_bar
[1] 0.1
sd_d
[1] 3.213
n
[1] 10
## t critical value for a 98% interval: 1% in each tail, n - 1 degrees of freedom
t_cv <- qt(0.99, df = n - 1)
## Standard error of the mean difference
SE <- sd_d / sqrt(n)
# Print the critical value and the standard error.
# SE is the value computed above; lowercase `se` belongs to an earlier problem.
t_cv
[1] 2.821
SE
[1] 1.016
lower <- d_bar - t_cv * SE
upper <- d_bar + t_cv * SE
c(lower, upper)
[1] -2.767  2.967
### Plot the interval on an empty base R plot
plot(1, type = "n", xlim = c(lower - 1, upper + 1), ylim = c(0, 1))

## Add the CI shading
rect(lower, 0, upper, 0.5, col = "grey", border = NA)

## Now the mean difference line and the CI boundary lines
abline(v = d_bar, col = "red", lwd = 2)
abline(v = c(lower, upper), col = "blue", lty = 2, lwd = 2)

The U.S. Census Bureau conducts annual surveys to obtain information on the percentage of the voting-age population that is registered to vote. Suppose that 391 employed persons and 510 unemployed persons are independently and randomly selected, and that 195 of the employed persons and 193 of the unemployed persons have registered to vote. Can we conclude that the percentage of employed workers (p1) who have registered to vote, exceeds the percentage of unemployed workers (p2) who have registered to vote? Use a significance level of 0.05 for the test. Show all work and hypothesis testing steps.

Q: Can we conclude that the percentage of employed workers (p1) who have registered to vote, exceeds the percentage of unemployed workers (p2) who have registered to vote?

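Because the test statistic, z = 3.614, exceeds the critical value of 1.645, we reject the null hypothesis. The evidence is sufficient to conclude that the percentage of employed workers who are registered to vote exceeds the percentage of unemployed workers who are registered to vote.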

### Two-proportion z-test from independent samples
# Set the given values
n1 <- 391
n2 <- 510
x1 <- 195
x2 <- 193
alpha <- 0.05

# Sample proportions
p1_hat <- x1 / n1
p2_hat <- x2 / n2

## Pooled proportion under the null hypothesis p1 = p2
p_hat <- (x1 + x2) / (n1 + n2)

## Standard error calculation
SE <- sqrt(p_hat * (1 - p_hat) * (1 / n1 + 1 / n2))

## Test statistic calculation
z <- (p1_hat - p2_hat) / SE

# Critical value for the right-tailed test
z_cv <- qnorm(1 - alpha)

## Right-tailed p-value
p_val <- pnorm(z, lower.tail = FALSE)

### Print the results
z
[1] 3.614
z_cv
[1] 1.645
p_val
[1] 0.0001507
### Now visualise using the shading function
# justabove = TRUE restricts the shading to the right rejection region;
# without it the lower 2.5% tail is shaded as well.
shadenorm(mu = 0,
          sig = 1,
          above = z_cv,
          justabove = TRUE,
          color = "brown")

## Add the critical value line and the test statistic line
abline(v = z_cv, col = "lightblue", lty = 2, lwd = 2)
abline(v = z, col = "darkblue", lwd = 2)