\(1.\) Consider the air pollution data on Canvas. In this problem we will evaluate the pair (\(X_5 = NO_2\), \(X_6 = O_3\)) for bivariate normality.

\((a)\) Evaluate the marginal distributions for univariate normality.

## First, import the data into R: ##
setwd("/Users/terrysitu/Documents/Data")
pollution <- read.table("Air_Pollution.txt", header=T)
head(pollution)
##   Wind Solar CO NO NO2 O3 HC
## 1    8    98  7  2  12  8  2
## 2    7   107  4  3   9  5  3
## 3    7   103  4  3   5  6  3
## 4   10    88  5  2   8 15  4
## 5    6    91  4  2   8 10  3
## 6    8    90  5  2  12 12  4
layout(matrix(c(1,1,2,3), 2, 2, byrow = F))
# scatterplot #
plot(pollution[,5], pollution[,6], main="Scatterplot", xlab="NO2", ylab="O3")

# qq-plots #  
for (i in 5:6){
  qqnorm(pollution[,i], main="Normal Q-Q Plot", asp=1)
  qqline(pollution[,i])
}

par(mfrow=c(1,1),pch=1)  

# histogram #
par(mfrow=c(1,2),pch=1) 
for (i in 5:6){
  hist(pollution[,i], main="Histogram", breaks=10)
}

par(mfrow=c(1,1),pch=1) 

The Q-Q plots show no serious violation of the normality assumption for either \(NO_2\) or \(O_3\).
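
As an optional formal complement to the graphical checks (output not shown here), base R's shapiro.test() can be applied to each marginal distribution:

for (i in 5:6) print(shapiro.test(pollution[,i]))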

\((b)\) Calculate the statistical distances.

## Subset the data to the variables of interest: ##

pollution2 <- as.matrix(pollution[,5:6]) ## the NO2 and O3 columns ##

# Find all statistical distances   
   d = NULL
   for (i in 1:nrow(pollution2)){
     d<-cbind(d, t(pollution2[i,] - apply(pollution2, 2, mean))
              %*% solve(var(pollution2))%*% 
              (pollution2[i,]-apply(pollution2,2,mean)))
   }
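
As a cross-check (not part of the original solution), base R's mahalanobis() returns the same squared statistical distances in a single call:

d2 <- mahalanobis(pollution2, colMeans(pollution2), var(pollution2))
all.equal(as.numeric(d), as.numeric(d2))  # should be TRUE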


a <- NULL
for (k in 1:nrow(pollution2)){
  a[k] <- paste("The statistical distance for Observation", k, "is", d[k])
}

a
##  [1] "The statistical distance for Observation 1 is 0.460652355265481" 
##  [2] "The statistical distance for Observation 2 is 0.659220634578521" 
##  [3] "The statistical distance for Observation 3 is 2.37706098824896"  
##  [4] "The statistical distance for Observation 4 is 1.6282902383679"   
##  [5] "The statistical distance for Observation 5 is 0.41353635656463"  
##  [6] "The statistical distance for Observation 6 is 0.476072630265241" 
##  [7] "The statistical distance for Observation 7 is 1.18488946119611"  
##  [8] "The statistical distance for Observation 8 is 10.6391791770288"  
##  [9] "The statistical distance for Observation 9 is 0.13883386298671"  
## [10] "The statistical distance for Observation 10 is 0.81624681520988" 
## [11] "The statistical distance for Observation 11 is 1.35663005916879" 
## [12] "The statistical distance for Observation 12 is 0.622809578106267"
## [13] "The statistical distance for Observation 13 is 5.64943915082732" 
## [14] "The statistical distance for Observation 14 is 0.315949838512122"
## [15] "The statistical distance for Observation 15 is 0.41353635656463" 
## [16] "The statistical distance for Observation 16 is 0.122497330449552"
## [17] "The statistical distance for Observation 17 is 0.898798225782979"
## [18] "The statistical distance for Observation 18 is 4.76468729853035" 
## [19] "The statistical distance for Observation 19 is 3.00891218603843" 
## [20] "The statistical distance for Observation 20 is 0.659220634578521"
## [21] "The statistical distance for Observation 21 is 2.77414155335887" 
## [22] "The statistical distance for Observation 22 is 1.03600609523507" 
## [23] "The statistical distance for Observation 23 is 0.787415244699765"
## [24] "The statistical distance for Observation 24 is 3.44377480038025" 
## [25] "The statistical distance for Observation 25 is 6.1488605925449"  
## [26] "The statistical distance for Observation 26 is 1.03600609523507" 
## [27] "The statistical distance for Observation 27 is 0.13883386298671" 
## [28] "The statistical distance for Observation 28 is 0.885604117244452"
## [29] "The statistical distance for Observation 29 is 0.137971902192269"
## [30] "The statistical distance for Observation 30 is 2.24888673940633" 
## [31] "The statistical distance for Observation 31 is 0.19011883348272" 
## [32] "The statistical distance for Observation 32 is 0.460652355265481"
## [33] "The statistical distance for Observation 33 is 1.14719394739828" 
## [34] "The statistical distance for Observation 34 is 7.08572374389475" 
## [35] "The statistical distance for Observation 35 is 1.45842287802724" 
## [36] "The statistical distance for Observation 36 is 0.122497330449552"
## [37] "The statistical distance for Observation 37 is 1.89847083132145" 
## [38] "The statistical distance for Observation 38 is 2.77825962195751" 
## [39] "The statistical distance for Observation 39 is 8.47306491350619" 
## [40] "The statistical distance for Observation 40 is 0.637021750575236"
## [41] "The statistical distance for Observation 41 is 0.703248506023699"
## [42] "The statistical distance for Observation 42 is 1.80136110654303"

\((c)\) Construct a chi-square plot of the ordered statistical distances from \((b)\).

#  Compute quantiles of a chi-square distribution  

  q1 <- NULL
  for (i in 1:nrow(pollution2)){
    q1 <- cbind(q1, qchisq((i-0.5) / (nrow(pollution2)), ncol(pollution2))) }
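
Equivalently (a one-line alternative using base R), ppoints(n) reduces to \((i-0.5)/n\) for \(n > 10\), so the loop above can be replaced by:

q1.alt <- qchisq(ppoints(nrow(pollution2)), df=ncol(pollution2))
all.equal(as.numeric(q1), q1.alt)  # should be TRUE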


#  Order the squared distances from smallest to largest

  d <- sort(d) 

#  Specify a (5 inch)x(5 inch) plot

   par(mfrow = c(1,1), fin = c(5,5))

#  Create the chi-squared probability plot
  
   d <- matrix(d, nrow = nrow(pollution2), ncol = 1)
   q1 <- matrix(q1, nrow = nrow(pollution2), ncol = 1)

  plot(q1, d, type="p", pch=1, xlab="Chi-square quantiles",
       ylab="Ordered distances", main="Chi-Square Probability Plot") 
  lines(q1, q1, lty=1)

\((d)\) Determine the proportion of observations that fall within the approximate \(50\)% probability contour of a bivariate normal distribution.

chi0.5 <- qchisq(0.5, df=2)
chi0.5
## [1] 1.386294
which(d<=chi0.5) ## 26 observations with d^2 less than or equal to chi-square(0.5) ##
##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
## [24] 24 25 26
26/42 
## [1] 0.6190476
## 61.90476% of the observations fall within the approximate 50% probability contour of a bivariate normal distribution ##
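
The same proportion can be computed in one line (a compact equivalent of the count above, using the sorted distances from part \((c)\)):

mean(d <= qchisq(0.5, df=2))  # proportion inside the 50% contour: 26/42 = 0.6190476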

\(2.\) Prove the following versions of Bonferroni Inequality:

\((a)\). \[P\left( \bigcup^n_{i=1} A^c_i \right) \ge 1- \sum^n_{i=1} P\left(A_i \right)\]

First prove: \[P\left(\bigcup^n_{i=1} A_i \right) \le \sum^n_{i=1} P\left(A_i \right)\]

We proceed by induction. For \(n=2\):

\[P\left(A_1 \bigcup A_2 \right)=P\left( A_1 \right) + P\left( A_2 \right)- P\left(A_1 \bigcap A_2 \right) \le P\left(A_1 \right)+P\left(A_2 \right)\]

Assume the bound holds for \(n-1\):

\[P\left(\bigcup^{n-1}_{i=1} A_i \right) \le \sum^{n-1}_{i=1} P\left(A_i \right)\]

Let \(B=\bigcup^{n-1}_{i=1} A_i\),

\[P\left(B \bigcup A_n \right) \le P\left( B \right)+P\left(A_n \right) \le \sum^{n-1}_{i=1} P\left(A_i \right)+P\left( A_n \right)=\sum^n_{i=1} P\left(A_i \right)\]

So the bound also holds for \(n\).

\[\implies P\left(\bigcup^n_{i=1} A_i \right) \le \sum^n_{i=1} P\left(A_i \right)\]

because, by De Morgan's law, \(1-P\left( \bigcup^n_{i=1} A_i \right) = P\left( \bigcap^n_{i=1} A^c_i \right)\), and \(\bigcap^n_{i=1} A^c_i \subseteq \bigcup^n_{i=1} A^c_i\) implies \(P\left( \bigcup^n_{i=1} A^c_i \right) \ge P\left( \bigcap^n_{i=1} A^c_i \right)\):

\[\begin{equation} \begin{split} P\left( \bigcup^n_{i=1} A_i \right) & \le \sum^n_{i=1} P\left(A_i \right) \\ 1-P\left( \bigcup^n_{i=1} A_i \right) & \ge 1-\sum^n_{i=1} P\left(A_i \right) \\ P\left( \bigcap^n_{i=1} A^c_i \right) & \ge 1-\sum^n_{i=1} P\left(A_i \right) \\ \implies \space P\left( \bigcup^n_{i=1} A^c_i \right) & \ge 1- \sum^n_{i=1} P\left(A_i \right) \end{split} \end{equation}\]

\((b)\). \[ P\left( \bigcap^n_{i=1} A^c_i \right) \ge 1- \sum^n_{i=1} P\left(A_i \right)\]

By De Morgan's law, \(\bigcap^n_{i=1} A^c_i = \left( \bigcup^n_{i=1} A_i \right)^c\), so the result follows directly from the union bound \(P\left(\bigcup^n_{i=1} A_i \right) \le \sum^n_{i=1} P\left(A_i \right)\) established in part \((a)\):

\[\begin{equation} \begin{split} P\left( \bigcap^n_{i=1} A^c_i \right) & = P\left( \left( \bigcup^n_{i=1} A_i \right)^c \right) \\ & = 1-P\left( \bigcup^n_{i=1} A_i \right) \\ & \ge 1- \sum^n_{i=1} P\left(A_i \right) \end{split} \end{equation}\]

\((c)\). Explain how this inequality justifies the Bonferroni method for multiple comparison discussed in class.

If each \(A_i\) is the event that the \(i\)-th calculated confidence interval fails to include the true value of its linear combination of treatments, then the left-hand side of the inequality in \((b)\) is the probability that all the confidence intervals simultaneously cover their respective true values, and the right-hand side is one minus the sum of the probabilities of the individual intervals missing their true values. Hence, if simultaneous multiple interval estimates are desired with an overall confidence coefficient \(1-\alpha\), you can construct each of the \(g\) intervals with confidence coefficient \(1-\alpha/g\), so that each \(P(A_i) = \alpha/g\), and the Bonferroni inequality ensures that the overall confidence coefficient is at least \(1 - g(\alpha/g) = 1-\alpha\).
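
As a quick numeric illustration (the choice of \(g = 5\) intervals here is hypothetical):

alpha <- 0.05
g <- 5              # hypothetical number of simultaneous intervals
1 - alpha/g         # per-interval confidence coefficient: 0.99
1 - g*(alpha/g)     # Bonferroni lower bound on the overall coefficient: 0.95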

\(3.\) Consider the bird data on Canvas. Here \(x_1\) (first column) is tail length and \(x_2\) (second column) is wing length both in millimeters of a sample of \(n = 45\) female hook-billed kites.

\((a)\) Find the 95% confidence ellipse for the population means \(μ_1\) and \(μ_2\).

setwd("/Users/terrysitu/Dropbox/Stat MS Classes/Regression Modeling/Data")
bird <- as.matrix(read.table("Bird.txt", header=F))
colnames(bird) <- c("tail", "wing")
head(bird)
##      tail wing
## [1,]  191  284
## [2,]  197  285
## [3,]  208  288
## [4,]  180  273
## [5,]  180  275
## [6,]  188  280
## calculate the mean for tail length and wing  length ##
mean_x <- apply(bird, 2, mean)
mean_x
##     tail     wing 
## 193.6222 279.7778
## compute the variance-covariance matrix S ##
S <- var(bird)

#create confidence ellipse based on chi-square distribution
library(ellipse)
conf.ellipse1=ellipse(S/nrow(bird), centre=mean_x, level=0.95)
plot(conf.ellipse1, type="l")

#create confidence ellipse based on scaled F distribution
c.sq=ncol(bird)*(nrow(bird)-1)/(nrow(bird)-ncol(bird))*qf(.95, ncol(bird), nrow(bird)-ncol(bird))
conf.ellipse2=ellipse(S/nrow(bird), centre=mean_x, t=sqrt(c.sq))
quartz()  # open a new plot window (macOS only)
plot(conf.ellipse2,type='l')
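
The two contours can also be compared directly (a sketch reusing the objects above; not required by the problem):

plot(conf.ellipse2, type="l")                 # F-based contour
lines(conf.ellipse1, lty=2)                   # chi-square-based contour
points(mean_x[1], mean_x[2], pch=3)           # sample mean vector
legend("topleft", lty=c(1,2), legend=c("F-based", "chi-square-based"))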

\((b)\) Suppose it is known that \(μ_1 = 190\) mm and \(μ_2 = 275\) mm for male hook-billed kites. Are these plausible values for the mean tail length and mean wing length for female birds? Justify your answer.

Based on a Hotelling \(T^2\) test using the same data, we can assess whether these male means are plausible values for the female birds. The steps are shown below:

p = ncol(S)
n = nrow(bird)
nullmean = c(190, 275) ## hypothesized mean tail and wing lengths (male hook-billed kites) ##
d = mean_x - nullmean
t2 <- n*t(d)%*%solve(S)%*%d
t2mod <- (n-p)*t2/(p*(n-1))
pval <- 1 - pf(t2mod, p, n-p)

ifelse(pval<0.05, "Reject", "Fail to Reject")
##      [,1]            
## [1,] "Fail to Reject"

As shown above, based on \(\alpha=0.05\), we fail to reject the hypothesis that the true mean tail length and wing length are \(190\) and \(275\), respectively.

Indeed, the confidence ellipse analysis in part \((a)\) gives the same conclusion: the point corresponding to tail length \(= 190\) and wing length \(= 275\) falls inside the ellipse, as the sketch below illustrates.
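
This can be checked directly on the plot (a minimal sketch, assuming the ellipse object from part \((a)\) is still available):

plot(conf.ellipse1, type="l")
points(190, 275, pch=19)  # hypothesized male means; the point lies inside the contour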

\((c)\) Construct the simultaneous \(95\)% Scheffe and Bonferroni confidence intervals for \(μ_1\) and \(μ_2\). Compare the two sets of intervals.

We already computed the sample mean vector and covariance matrix for tail length and wing length in the previous parts.

First, we compute the Scheffe simultaneous confidence intervals:

## "F" value ##

F <- ((ncol(bird)*(nrow(bird)-1))/(nrow(bird)-ncol(bird)))*qf(0.95, ncol(bird), nrow(bird)-ncol(bird))

sqrt(F)
## [1] 2.564853
upper <- NULL
lower <- NULL

Sch.ci <- function(mean_x, S){
  for (i in 1:2){
  upper[i] <- mean_x[i]+sqrt(F)*sqrt(S[i,i]/nrow(bird))
  lower[i] <- mean_x[i]-sqrt(F)*sqrt(S[i,i]/nrow(bird))
  } 
  tab <- cbind(upper, lower)
  rownames(tab) <- c('Tail Length', "Wing Length") 
  tab
}


Sch.ci(mean_x, S)
##                upper    lower
## Tail Length 197.8227 189.4217
## Wing Length 285.2992 274.2564

Second, we compute the \(95\)% Bonferroni confidence intervals as follows:

## compute the t value ##

T <- qt(1-(0.05/(2*ncol(bird))), nrow(bird)-1)
T
## [1] 2.320711
## construct the Bonferroni confidence interval for tail length and wing length ##

upper <- NULL
lower <- NULL

Bon.ci <- function(mean_x, S){
  for (i in 1:2){
  upper[i] <- mean_x[i]+T*sqrt(S[i,i]/nrow(bird))
  lower[i] <- mean_x[i]-T*sqrt(S[i,i]/nrow(bird))
  } 
  tab <- cbind(upper, lower)
  rownames(tab) <- c('Tail Length', "Wing Length") 
  tab
}


Bon.ci(mean_x, S)
##                upper    lower
## Tail Length 197.4229 189.8216
## Wing Length 284.7736 274.7819

These two types of simultaneous confidence intervals give very similar results. Compared with the Bonferroni intervals, the Scheffe intervals have larger upper bounds and smaller lower bounds for both tail length and wing length, i.e., they are wider. In practice we can simply compare \(\sqrt{F}\) and \(T\) and use the one with the smaller value.

ifelse(sqrt(F)<T, "Using Scheffe type", "Using Bonferroni type")
## [1] "Using Bonferroni type"

\(4.\) Consider the milk transportation data on Canvas. These data represent various costs associated with transporting milk from farms to dairy plants for gasoline trucks. Here \(x_1\) (second column) is fuel costs, \(x_2\) (third column) is repair costs and \(x_3\) (fourth column) is capital costs. You can ignore the first column consisting of \(1s\).

\((a)\) Do the data appear to follow a jointly normal distribution?

setwd("/Users/terrysitu/Dropbox/Stat MS Classes/Regression Modeling/Data")
milk <- read.table("Milk_Costs.txt")
milk <- as.matrix(milk[-1]) ## drop the first column of 1s ##
head(milk)
##         V2    V3    V4
## [1,] 16.44 12.43 11.23
## [2,]  7.19  2.70  3.92
## [3,]  9.92  1.35  9.75
## [4,]  4.24  5.78  7.78
## [5,] 11.20  5.05 10.67
## [6,] 14.25  5.78  9.88
par(mfrow=c(2,2),pch=1) 
for (i in 1:ncol(milk)){
  qqnorm(milk[,i],  main="Normal Q-Q Plot")
     qqline(milk[,i]) 
}
par(mfrow=c(1,1),pch=1) 

Q-Q plots show no serious violation of normality for the individual variables.

# Find all statistical distances   
   d = NULL
   for (i in 1:nrow(milk)){
     d<-cbind(d, t(milk[i,] - apply(milk, 2, mean))
              %*% solve(var(milk))%*% 
              (milk[i,]-apply(milk,2,mean)))
   }


a <- NULL
for (k in 1:nrow(milk)){
  a[k] <- paste("The statistical distance for Observation", k, "is", d[k])
}

a
## [1] "The statistical distance for Observation 1 is 1.17957637523565"
## [2] "The statistical distance for Observation 2 is 3.19166452877484"
## [3] "The statistical distance for Observation 3 is 3.37427179617113"
#  Compute quantiles of a chi-square distribution  

  q1 <- NULL
  for (i in 1:nrow(milk)){
    q1 <- cbind(q1, qchisq((i-0.5) / (nrow(milk)), ncol(milk))) }


#  Order the squared distances from smallest to largest

  d <- sort(d) 

#  Specify a (5 inch)x(5 inch) plot

   par(mfrow = c(1,1), fin = c(5,5))

#  Create the chi-squared probability plot
  
   d <- matrix(d, nrow = nrow(milk), ncol = 1)
   q1 <- matrix(q1, nrow = nrow(milk), ncol = 1)

  plot(q1, d, type="p", pch=1, xlab="Chi-square quantiles",
       ylab="Ordered distances", main="Chi-Square Probability Plot") 
  lines(q1, q1, lty=1)

chi0.5 <- qchisq(0.5, df=3)
chi0.5
## [1] 2.365974
which(d<=chi0.5) 
##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
20/nrow(milk)
## [1] 0.5555556

The \(\chi^2\) plot shows no serious violation of multivariate normality, and \(55.6\)% of the squared distances satisfy \(d^2 \le \chi^2_3 (0.5)\), close to the nominal \(50\)%, which suggests the multivariate normality assumption does hold.
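
As an additional informal diagnostic (not part of the original solution), the straightness of the chi-square plot can be summarized by the correlation between the ordered distances and the chi-square quantiles; a value near \(1\) supports multivariate normality:

cor(q1, d)  # correlation near 1 indicates a nearly straight chi-square plot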

\((b)\) Construct the simultaneous \(95\)% Scheffe and Bonferroni confidence intervals for \(μ_1\), \(μ_2\) and \(μ_3\). Compare the two sets of intervals.

First, we compute the Scheffe simultaneous confidence intervals:

mean_x <- apply(milk, 2, mean)
S <- var(milk)

## "F" value ##

F <- ((ncol(milk)*(nrow(milk)-1))/(nrow(milk)-ncol(milk)))*qf(0.95, ncol(milk), nrow(milk)-ncol(milk))

sqrt(F)
## [1] 3.033221
upper <- NULL
lower <- NULL

Sch.ci <- function(mean_x, S){
  for (i in 1:ncol(milk)){
  upper[i] <- mean_x[i]+sqrt(F)*sqrt(S[i,i]/nrow(milk))
  lower[i] <- mean_x[i]-sqrt(F)*sqrt(S[i,i]/nrow(milk))
  } 
  tab <- cbind(upper, lower)
  rownames(tab) <- c("x1", "x2", "x3") 
  tab
}


Sch.ci(mean_x, S)
##       upper    lower
## x1 14.64378 9.793438
## x2 10.22998 5.995024
## x3 11.47934 7.701211

Second, we compute the \(95\)% Bonferroni confidence intervals as follows:

## compute the t value (df = n - 1 for the milk data) ##

T <- qt(1-(0.05/(2*ncol(milk))), nrow(milk)-1)
T
## [1] 2.488968
## construct the Bonferroni confidence intervals for fuel, repair, and capital costs ##

upper <- NULL
lower <- NULL

Bon.ci <- function(mean_x, S){
  for (i in 1:ncol(milk)){
  upper[i] <- mean_x[i]+T*sqrt(S[i,i]/nrow(milk))
  lower[i] <- mean_x[i]-T*sqrt(S[i,i]/nrow(milk))
  } 
  tab <- cbind(upper, lower)
  rownames(tab) <- c("x1","x2","x3") 
  tab
}


Bon.ci(mean_x, S)
##        upper     lower
## x1 14.208634 10.228588
## x2  9.850036  6.374964
## x3 11.140388  8.040167

Compared with the Bonferroni intervals, the Scheffe upper bounds for all three true means are larger and the lower bounds are smaller. Overall the confidence intervals are quite similar, and we can compare the \(\sqrt{F}\) and \(T\) values to decide which one to use, as shown below:

ifelse(sqrt(F)<T, "Using Scheffe type", "Using Bonferroni type")
## [1] "Using Bonferroni type"

\(5.\) Carefully read Example 6.1 on the following pages. The data is available on Canvas:

\(Col. 1\): BOD1 (biochemical oxygen demand) commercial lab measurement

\(Col. 2\): SS1 (suspended solids) commercial lab measurement

\(Col. 3\): BOD2 (biochemical oxygen demand) state lab of hygiene measurement

\(Col. 4\): SS2 (suspended solids) state lab of hygiene measurement

\((a)\) Construct the \(95\)% confidence ellipse for the mean difference vector \(δ\) using the results in Example 6.1. Does the point \(δ = 0\) fall inside the contour? Is this consistent with the hypothesis test conducted in the example?

setwd("/Users/terrysitu/Dropbox/Stat MS Classes/Regression Modeling/Data")
effluent <- as.matrix(read.table("Effluent.txt", header=F))
colnames(effluent) <- c("x1","x2","x3","x4")
effluent <- rbind(c(6, 27, 25, 15), effluent) ## re-attach the first record, which was apparently dropped on import ##
head(effluent)
##      x1 x2 x3 x4
## [1,]  6 27 25 15
## [2,]  6 23 28 13
## [3,] 18 64 36 22
## [4,]  8 44 35 29
## [5,] 11 30 15 31
## [6,] 34 75 44 64
## paired-difference summary statistics taken from Example 6.1 ##
mean_x_d <- c(-9.36, 13.27)
S_d <- matrix(c(199.26, 88.38, 88.38, 418.61), 2, 2)
library(ellipse)
conf.ellipse1=ellipse(S_d/nrow(effluent), centre=mean_x_d, level=0.95)
plot(conf.ellipse1, type="l")

\(δ = 0\) does not fall inside the contour, so \(H_0\) is rejected; this is consistent with the hypothesis test conducted in Example \(6.1\).

\((b)\) Construct the \(95\)% Bonferroni simultaneous intervals for the components of the mean difference vector \(δ\). What do you find?

## compute the t value ##

T <- qt(1-(0.05/(2*2)), nrow(effluent)-1)
T
## [1] 2.633767
## construct the Bonferroni confidence intervals for the BOD and SS differences ##

upper <- NULL
lower <- NULL

Bon.ci <- function(mean_x_d, S_d){
  for (i in 1:2){
  upper[i] <- mean_x_d[i]+T*sqrt(S_d[i,i]/nrow(effluent))
  lower[i] <- mean_x_d[i]-T*sqrt(S_d[i,i]/nrow(effluent))
  } 
  tab <- cbind(upper, lower)
  rownames(tab) <- c("BOD Difference", "SS Difference") 
  tab
}


Bon.ci(mean_x_d, S_d)
##                    upper      lower
## BOD Difference  1.849624 -20.569624
## SS Difference  29.517472  -2.977472

\(δ = 0\) falls inside both of the individual confidence intervals, so based on the Bonferroni intervals alone we fail to reject \(H_0\). This is not contradictory: the rectangle formed by the two marginal intervals ignores the correlation between the components, so it can cover \(δ = 0\) even though the joint confidence ellipse in part \((a)\) excludes it, as sketched below.
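
A quick way to see this geometrically (a sketch reusing the objects defined above) is to overlay the Bonferroni rectangle on the confidence ellipse from part \((a)\):

plot(conf.ellipse1, type="l", xlim=c(-25, 5), ylim=c(-5, 32))  # limits wide enough for both regions
points(0, 0, pch=19)                                           # hypothesized delta = 0
ci <- Bon.ci(mean_x_d, S_d)
rect(ci[1,"lower"], ci[2,"lower"], ci[1,"upper"], ci[2,"upper"], border="grey")
# the rectangle covers (0, 0) even though the ellipse excludes it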

\((c)\) The data corresponding to sample \(8\) is unusually large. Remove this observation and construct the joint confidence region for the mean difference vector \(δ\) and the \(95\)% Bonferroni simultaneous intervals for the components of the mean difference vector. Now are the results consistent with a test of \(H_0 : δ = 0\)? Does this outlier make a difference in the analysis of these data?

effluent.new <- effluent[-8,]
effluent.d <- as.matrix(cbind(effluent.new[,1]-effluent.new[,3], effluent.new[,2]-effluent.new[,4]))

new_mean_x_d <- apply(effluent.d, 2, mean)
new_S_d <- var(effluent.d)

## joint confidence region ##

p = ncol(new_S_d)
n = nrow(effluent.d)
nullmean = c(0, 0)
d = new_mean_x_d - nullmean
t2 <- n*t(d)%*%solve(new_S_d)%*%d
t2mod <- (n-p)*t2/(p*(n-1))
pval <- 1 - pf(t2mod, p, n-p)

ifelse(pval<0.05, "Reject", "Fail to Reject")
##      [,1]    
## [1,] "Reject"
## 95% Bonferroni simultaneous intervals ##
upper <- NULL
lower <- NULL
T <- qt(1-(0.05/(2*2)), nrow(effluent.d)-1)
T
## [1] 2.685011
Bon.ci <- function(new_mean_x_d, new_S_d){
  for (i in 1:2){
  upper[i] <- new_mean_x_d[i]+T*sqrt(new_S_d[i,i]/nrow(effluent.d))
  lower[i] <- new_mean_x_d[i]-T*sqrt(new_S_d[i,i]/nrow(effluent.d))
  } 
  tab <- cbind(upper, lower)
  rownames(tab) <- c("BOD Difference", "SS Difference") 
  tab
}


Bon.ci(new_mean_x_d, new_S_d)
##                    upper      lower
## BOD Difference -2.082003 -21.917997
## SS Difference  20.555587  -3.355587

The joint confidence region now suggests that \(H_0: δ = 0\) is rejected. In addition, based on the Bonferroni simultaneous intervals, the BOD difference is nonzero while the SS difference may be zero. The outlier therefore does make a difference: once it is removed, the interval results agree with the joint test.

\(6\). A researcher considered three indices measuring the severity of heart attacks. The values of these indices for \(n = 40\) heart attack patients arriving at a hospital emergency room produced the following summary statistics: \(\bar{x}= [46.1, 57.3, 50.4]\) and

\[S = \begin{bmatrix} 101.3 & 63.0 & 71.0 \\ 63.0 & 80.2 & 55.6 \\ 71.0 & 55.6 & 97.4 \end{bmatrix}\]

\((a)\) All three indices are evaluated for each patient. Test for the equality of mean indices using \(α = 0.05\).

#  Compute sample mean vector and
#  sample covariance matrix 
mean_x <- c(46.1, 57.3, 50.4)
S <- matrix(c(101.3, 63, 71, 63, 80.2, 55.6, 71, 55.6, 97.4), 3, 3)

#  Apply the contrasts: row 1 of Cstar compares index 1 with index 3,
#  row 2 compares index 1 with index 2

Cstar <-matrix( c(1,0,-1,1,-1,0), 2, 3, byrow=T)
Cstar
##      [,1] [,2] [,3]
## [1,]    1    0   -1
## [2,]    1   -1    0
Cxbar <- Cstar %*% mean_x
Cxvar <- Cstar %*% S %*%t(Cstar)

# Compute Hotelling statistic

q <- ncol(Cstar)
n <- 40
t2 <- n*t(Cxbar)%*%solve(Cxvar)%*%Cxbar
t2mod <- (n-q+1)*t2/((q-1)*(n-1))
pval <- 1 - pf(t2mod, q-1, n-q+1)

ifelse(pval<0.05, "Reject", "Fail to Reject")
##      [,1]    
## [1,] "Reject"

\((b)\) Investigate the differences in pairs of mean indices using \(95\)% simultaneous confidence intervals.

n=40
F <- (((ncol(Cstar)-1)*(n-1))/(n-ncol(Cstar)+1))*qf(0.95, ncol(Cstar)-1, n-ncol(Cstar)+1)

sqrt(F)
## [1] 2.580778
upper <- NULL
lower <- NULL

Sch.ci <- function(Cxbar, Cxvar){
  for (i in 1:ncol(Cxvar)){
  upper[i] <- Cxbar[i]+sqrt(F)*sqrt(Cxvar[i,i]/n)
  lower[i] <- Cxbar[i]-sqrt(F)*sqrt(Cxvar[i,i]/n)
  } 
  tab <- cbind(upper, lower)
  rownames(tab) <- c("Cx1", "Cx2") 
  tab
}


Sch.ci(Cxbar, Cxvar)
##             upper      lower
## x1 - x3 -1.227356  -7.372644
## x1 - x2 -8.160045 -14.239955

Both intervals lie entirely below zero, so the mean of index \(1\) is significantly smaller than the means of indices \(2\) and \(3\), consistent with the rejection of equal mean indices in part \((a)\).

\(7.\) Complete problem 6.17 from the text. The problem is included at the end of this document. The data are available on Canvas:

\(Col. 1\): \(x_1\) = WordDiff

\(Col. 2\): \(x_2\) = WordSame

\(Col. 3\): \(x_3\) = ArabicDiff

\(Col. 4\): \(x_4\) = ArabicSame

\(a\).

setwd("/Users/terrysitu/Dropbox/Stat MS Classes/Regression Modeling/Data")
number <- as.matrix(read.table("Number_Parity.txt", header=F))
colnames(number) <- c("x1","x2", "x3", "x4")
number <- rbind(c(869.0, 860.5, 691.0, 601.0), number) ## re-attach the first record, which was apparently dropped on import ##
head(number)
##        x1    x2     x3  x4
## [1,]  869 860.5  691.0 601
## [2,]  995 875.0  678.0 659
## [3,] 1056 930.5  833.0 826
## [4,] 1126 954.0  888.0 728
## [5,] 1044 909.0  865.0 839
## [6,]  925 856.5 1059.5 797
#  Apply the contrasts: with columns x1 = WordDiff, x2 = WordSame,
#  x3 = ArabicDiff, x4 = ArabicSame, row 1 of Cstar measures the parity effect
#  (same - different), row 2 the format effect (Arabic - word), and row 3 the
#  parity-by-format interaction

Cstar <-matrix( c(-1, 1, -1, 1, -1, -1, 1, 1, -1, 1, 1, -1), 3, 4, byrow=T)
Cstar
##      [,1] [,2] [,3] [,4]
## [1,]   -1    1   -1    1
## [2,]   -1   -1    1    1
## [3,]   -1    1    1   -1
#  Compute sample mean vector and
#  sample covariance matrix 
mean_x <- apply(number, 2, mean)
S <- var(number)

Cxbar <- Cstar %*% mean_x
Cxvar <- Cstar %*% S %*%t(Cstar)

# Compute Hotelling statistic

q <- ncol(Cstar)
n <- nrow(number)
t2 <- n*t(Cxbar)%*%solve(Cxvar)%*%Cxbar
t2mod <- (n-q+1)*t2/((q-1)*(n-1))
pval <- 1 - pf(t2mod, q-1, n-q+1)

ifelse(pval<0.05, "Reject", "Fail to Reject")
##      [,1]    
## [1,] "Reject"

So we reject \(H_0: C\mu=0\).

\(b\).

n <- nrow(number)
F <- (((ncol(Cstar)-1)*(n-1))/(n-ncol(Cstar)+1))*qf(0.95, ncol(Cstar)-1, n-ncol(Cstar)+1)

sqrt(F)
## [1] 3.067431
upper <- NULL
lower <- NULL

Sch.ci <- function(Cxbar, Cxvar){
  for (i in 1:ncol(Cxvar)){
  upper[i] <- Cxbar[i]+sqrt(F)*sqrt(Cxvar[i,i]/n)
  lower[i] <- Cxbar[i]-sqrt(F)*sqrt(Cxvar[i,i]/n)
  } 
  tab <- cbind(upper, lower)
  rownames(tab) <- c("Parity Effect", "Format Effect", "Interaction Effect") 
  tab
}


Sch.ci(Cxbar, Cxvar)
##                         upper      lower
## Parity Effect      -124.97937 -280.36438
## Format Effect      -186.83888 -411.56737
## Interaction Effect   75.05883  -32.40258

Based on the intervals shown above, the parity and format intervals exclude zero while the interaction interval contains zero, so we conclude that there are parity and format effects but no interaction effect.

\(c\).

Based on part \((b)\) (parity and format effects, but no interaction), the \(M\) model of numerical cognition is a reasonable population model for the scores.

\(d\).

First, check whether each individual variable is univariate normal using Q-Q plots:

par(mfrow=c(2, 2), pch=1)
for (i in 1:ncol(number)){
   qqnorm(number[,i],  main="Normal Q-Q Plot")
     qqline(number[,i])
}

par(mfrow=c(1,1), pch=1)

There is no serious violation of normality found in the Q-Q plots.

Second, check the \(\chi^2\) plot:

# Find all statistical distances   
   d = NULL
   for (i in 1:nrow(number)){
     d<-cbind(d, t(number[i,] - apply(number, 2, mean))
              %*% solve(var(number))%*% 
              (number[i,]-apply(number,2,mean)))
   }

#  Compute quantiles of a chi-square distribution  

  q1 <- NULL
  for (i in 1:nrow(number)){
    q1 <- cbind(q1, qchisq((i-0.5) / (nrow(number)), ncol(number))) }


#  Order the squared distances from smallest to largest

  d <- sort(d) 

#  Specify a (5 inch)x(5 inch) plot

   par(mfrow = c(1,1), fin = c(5,5))

#  Create the chi-squared probability plot
  
   d <- matrix(d, nrow = nrow(number), ncol = 1)
   q1 <- matrix(q1, nrow = nrow(number), ncol = 1)

  plot(q1, d, type="p", pch=1, xlab="Chi-square quantiles",
       ylab="Ordered distances", main="Chi-Square Probability Plot") 
  lines(q1, q1, lty=1)

Combining these two steps, we conclude that a multivariate normal distribution is a reasonable population model for these data.