Problem 4.

Part A.

Based on the plots and Shapiro test below, I would argue that the variables do not seem to be normally distributed.

The histogram for the duration and clearing times seem to be skewed right. Additionally, the qq-plot would suggest that the data is not normal. The Shapiro test provides p-values < \(\alpha\) = 0.05, so I would reject the \(H_0\) that the variables are normally distributed. Lastly, we draw the same conclusion from the Chi-Squared plot. The data almost looks cubic with the turn upwards. A log transformation may fair well.

snow <- read.csv('T3.txt',header=FALSE, sep="")
colnames(snow) <-c("duration","clear")

##Histogram
par(mfrow=c(2,2))
hist(snow$duration)
hist(snow$clear)

#QQ Plots
qqnorm(snow$duration);qqline(snow$duration)  # Q-Q plot of duration
qqnorm(snow$clear);qqline(snow$clear)  # Q-Q plot of clearing

#Shapiro Test
shapiro.test(snow$duration);shapiro.test(snow$clear);
## 
##  Shapiro-Wilk normality test
## 
## data:  snow$duration
## W = 0.91701, p-value = 0.04382
## 
##  Shapiro-Wilk normality test
## 
## data:  snow$clear
## W = 0.88247, p-value = 0.007773
#Chi Squared Plot for Multivariate Case
q1 <- NULL
  for (i in 1:nrow(snow[,1:2])){
    q1 <- cbind(q1, qchisq((i-0.5) / (nrow(snow[,1:2])), ncol(snow[,1:2]))) }

m_dist12 <- mahalanobis(snow[,c(1,2)],colMeans(snow[,c(1,2)]),cov(snow[,c(1,2)])) 

par(mfrow=c(1,1))
plot(sort(m_dist12),q1,main ="Chi-squared Plot for Duration and Clearing Time", xlab=expression(d[j]^2),
     ylab=expression(paste(chi[2]^2,"( ",frac(j-.5,25)," )")),pch=20)
abline(a=0,b=1,lwd=2)

Part B.

Below, we have applied a log transformation and recalculated the tests of normality. We see that the log transformation has been successful as the data appears more “normal”.

The new histograms look fairly normally distributed. Additionally, the ellipse contains most of the data which fairs well for normality. Lastly, our shapiro wilks test now has a p-value > 0.05 so we fail to reject our \(H_0\). That is, we fail to reject normality. Lastly, our chi-squared data plot looks much better- the data follows a straighter line.

We will continue to use the transformed data moving forward.

library(car) 
## Loading required package: carData
trans = powerTransform(snow,family="yjPower")  # yjPower" family (Yeo and Johnson(2000)),
# another modifiation of the Box-Cox family that allows a few negative values.
summary(trans)
## yjPower Transformations to Multinormality 
##          Est Power Rounded Pwr Wald Lwr Bnd Wald Upr Bnd
## duration    0.1404           1      -0.7579       1.0388
## clear      -0.7196           0      -1.8971       0.4580
## 
##  Likelihood ratio test that all transformation parameters are equal to 0
##                              LRT df    pval
## LR test, lambda = (0 0) 1.621078  2 0.44462
logsnow=log(snow)

sqrt((24)*2/(25-2) *qf(0.95,2,25-2))
## [1] 2.672422
library(ellipse)
## 
## Attaching package: 'ellipse'
## The following object is masked from 'package:car':
## 
##     ellipse
## The following object is masked from 'package:graphics':
## 
##     pairs
par(par(mfrow=c(2,2)))
hist(log(snow$duration))

hist(log(snow$clear))

plot(log(snow$duration),log(snow$clear),pch=20)
quan_chisq1=ellipse(cor(logsnow[,c(1,2)]), scale = c(sd(log(snow$duration)),sd(log(snow$clear))), center=c(0,0), 
                   t = sqrt(qchisq(.5, 2)), which = c(1, 2), npoints = 100)
lines(quan_chisq1[,1]+mean(log(snow$duration)),quan_chisq1[,2]+mean(log(snow$clear)),col="blue",lwd=2)

#Shapiro Test
shapiro.test(log(snow$duration));shapiro.test(log(snow$clear));
## 
##  Shapiro-Wilk normality test
## 
## data:  log(snow$duration)
## W = 0.97795, p-value = 0.8416
## 
##  Shapiro-Wilk normality test
## 
## data:  log(snow$clear)
## W = 0.95898, p-value = 0.3945
#Chi Squared Plot for Multivariate Case
q1 <- NULL
  for (i in 1:nrow(snow[,1:2])){
    q1 <- cbind(q1, qchisq((i-0.5) / (nrow(snow[,1:2])), ncol(snow[,1:2]))) }

m_dist12_log <- mahalanobis(logsnow[,c(1,2)],colMeans(logsnow[,c(1,2)]),cov(logsnow[,c(1,2)])) 

par(mfrow=c(1,1))
plot(sort(m_dist12_log),q1,main ="Chi-squared Plot for Duration and Clearing Time Log Transform", xlab=expression(d[j]^2),
     ylab=expression(paste(chi[2]^2,"( ",frac(j-.5,25)," )")),pch=20)
abline(a=0,b=1,lwd=2)

Part C.

95% Confidence Ellipse for the population means is shown below on the log transformed data.

D = as.matrix(logsnow) 
dbar=as.matrix(colMeans(D),ncol=1);dbar   #est mu1-mu2
##              [,1]
## duration 2.170685
## clear    2.888439
Sd=cov(D);Sd
##            duration      clear
## duration 0.15130352 0.05170523
## clear    0.05170523 0.13883808
n1 = nrow(D);n1
## [1] 25
p1=2

##Using D
Tsq1 <- n1*t(dbar)%*%solve(Sd)%*%(dbar);Tsq1
##          [,1]
## [1,] 1729.305
cri1 = (n1-1)*p1/(n1-p1) *qf(0.95,p1,n1-p1);cri1
## [1] 7.141841
## Confidence region for delta
plot(D, main="95% confidence ellipse")
CR = ellipse(x=cor(D), scale=sqrt(diag(Sd)/n1), centre=as.vector(dbar),t=sqrt(cri1))
#n1*mahalanobis(as.vector(dbar),as.vector(CR[3,]),cov(D))
lines(CR,type="l",xlab=expression({delta}[1]),ylab=expression({delta}[2]),
     main="95% confidence ellipse")
points(0,0,pch=20)

Part D.

Here we are using the LOG TRANSFORMED DATA!

The simultaneous T^2 and Bonferroni Intervals are below.

The Bonferroni interval is a bit smaller. Since the sqrt(F) > T, we use the Bonferroni confidence interval.

#F-Value for Sheffe
F <- ((ncol(logsnow)*(nrow(logsnow)-1))/(nrow(logsnow)-ncol(logsnow)))*qf(0.95, ncol(logsnow), nrow(logsnow)-ncol(logsnow))

sqrt(F)
## [1] 2.672422
#Tsquared
n1 <-nrow(logsnow)
p1 <- 2
dbar<-as.matrix(colMeans(logsnow),ncol=1)
Sd <- cov(logsnow)
Tsq1 = matrix(NA,2,2)
c1 = sqrt(p1*(n1-1)/(n1-p1)*qf(.95,p1,n1-p1))
for (i in 1:2){
  Tsq1[i,1] = dbar[i] - c1*sqrt(Sd[i,i]/n1) #left
  Tsq1[i,2] = dbar[i] + c1*sqrt(Sd[i,i]/n1) #right
}

#T-value
T <- qt(1-(0.05/(2*ncol(logsnow))), nrow(logsnow)-1)
T
## [1] 2.390949
#Bonferroni
m=2
Bonf1 = matrix(NA,2,2)
for (i in 1:m){
  Bonf1[i,1] = dbar[i] - qt(1-0.05/(2*m),n1-1)*sqrt(Sd[i,i]/n1) #left
  Bonf1[i,2] = dbar[i] + qt(1-0.05/(2*m),n1-1)*sqrt(Sd[i,i]/n1) #right
}

Bonf1
##         [,1]     [,2]
## [1,] 1.98468 2.356690
## [2,] 2.71026 3.066617
colnames(Tsq1) =colnames(Bonf1) =c("lower","upper")
list(Tsq1, Bonf1)
## [[1]]
##         lower    upper
## [1,] 1.962783 2.378587
## [2,] 2.689284 3.087593
## 
## [[2]]
##        lower    upper
## [1,] 1.98468 2.356690
## [2,] 2.71026 3.066617

Problem 5.

Part A.

Hypothesis: \(H_0:\mu_1-\mu_2=\delta_0\) \(H_A:\mu_1-\mu_2\neq\delta_0\)

Decision Rule: Reject \(H_0\) if \(T^2 > c^2\)

Conclusion: As \(T^2 = 50.9127 > c^2 = 12.93\), we reject the \(H_0\). That is the difference in the mean cost vectors is not delta.

milk <- read.csv('T6.txt',header=FALSE, sep="")
colnames(milk) <-c("fuel","repair","capital", "type")

X_1=milk[milk$type=="gasoline",1:3]
X_2=milk[milk$type=="diesel",1:3]

n_1<-dim(X_1)[1]
n_2<-dim(X_2)[1]
p<-dim(X_1)[2]

delta0=rep(0,p)
xbar_1 = colMeans(X_1);xbar_1
##      fuel    repair   capital 
## 12.218611  8.112500  9.590278
xbar_2 = colMeans(X_2);xbar_2
##     fuel   repair  capital 
## 10.10565 10.76217 18.16783
S_1 = var(X_1);S_1
##              fuel    repair   capital
## fuel    23.013361 12.366395  2.906609
## repair  12.366395 17.544111  4.773082
## capital  2.906609  4.773082 13.963334
S_2 = var(X_2);S_2
##              fuel     repair   capital
## fuel    4.3623166  0.7598872  2.362099
## repair  0.7598872 25.8512360  7.685732
## capital 2.3620992  7.6857322 46.654400
S_p=((n_1-1)*S_1+(n_2-1)*S_2)/(n_1+n_2-2);S_p
##              fuel    repair   capital
## fuel    15.814712  7.886690  2.696447
## repair   7.886690 20.750370  5.897263
## capital  2.696447  5.897263 26.580938
(T2 = (n_1*n_2/(n_1+n_2))*t(xbar_1-xbar_2-delta0)%*%solve(S_p)%*%(xbar_1-xbar_2-delta0))
##          [,1]
## [1,] 50.91279
(crit = (n_1+n_2-2)*p/(n_1+n_2-p-1)*qf(.99,p,n_1+n_2-p-1))
## [1] 12.93096

Part B.

The mean components most responsible for the rejection are listed below.

# Linear combination of the mean components most responsiblefor rejection of H_0
(a<-solve(S_p)%*%(xbar_1-xbar_2))
##               [,1]
## fuel     0.2547452
## repair  -0.1339036
## capital -0.3188296
A = diag(p)
A
##      [,1] [,2] [,3]
## [1,]    1    0    0
## [2,]    0    1    0
## [3,]    0    0    1

Part C.

The cost for the capital seems to be quite different. All the values are completely negative.

# T^2-intervals
Clower = t(A)%*%(xbar_1-xbar_2)-sqrt(crit)*sqrt(diag((t(A)%*%S_p%*%A)*(1/n_1+1/n_2)))
Cupper = t(A)%*%(xbar_1-xbar_2)+ sqrt(crit)*sqrt(diag((t(A)%*%S_p%*%A)*(1/n_1+1/n_2)))
cbind(Clower,Cupper)
##            [,1]      [,2]
## [1,]  -1.704346  5.930264
## [2,]  -7.022268  1.722920
## [3,] -13.526479 -3.628618

Part D.

Hypothesis: \(H_0:\mu_1-\mu_2=\delta_0\) \(H_A:\mu_1-\mu_2\neq\delta_0\)

Decision Rule: Reject \(H_0\) if \(T^2 > c^2\)

Conclusion: As \(T^2 = 52.3836 > c^2 = 12.99521\), we reject the \(H_0\). That is the difference in the mean cost vectors is not delta.

The results are about the same, however the \(T^2\) has increased slightly.

milknew <- milk[-c(9,21),]

X_1=milknew[milknew$type=="gasoline",1:3]
X_2=milknew[milknew$type=="diesel",1:3]

n_1<-dim(X_1)[1]
n_2<-dim(X_2)[1]
p<-dim(X_1)[2]

delta0=rep(0,p)
xbar_1 = colMeans(X_1);xbar_1
##      fuel    repair   capital 
## 11.311765  7.632941  9.561176
xbar_2 = colMeans(X_2);xbar_2
##     fuel   repair  capital 
## 10.10565 10.76217 18.16783
S_1 = var(X_1);S_1
##             fuel    repair   capital
## fuel    9.025021  5.155749  3.201671
## repair  5.155749 14.258694  4.318945
## capital 3.201671  4.318945 11.987344
S_2 = var(X_2);S_2
##              fuel     repair   capital
## fuel    4.3623166  0.7598872  2.362099
## repair  0.7598872 25.8512360  7.685732
## capital 2.3620992  7.6857322 46.654400
S_p=((n_1-1)*S_1+(n_2-1)*S_2)/(n_1+n_2-2);S_p
##             fuel    repair   capital
## fuel    7.159939  3.397404  2.865842
## repair  3.397404 18.895711  5.665660
## capital 2.865842  5.665660 25.854166
(T2 = (n_1*n_2/(n_1+n_2))*t(xbar_1-xbar_2-delta0)%*%solve(S_p)%*%(xbar_1-xbar_2-delta0))
##         [,1]
## [1,] 52.3836
(crit = (n_1+n_2-2)*p/(n_1+n_2-p-1)*qf(.99,p,n_1+n_2-p-1))
## [1] 12.99521