Problem 1

air <- read.csv('air_polution.dat.txt',header=FALSE, sep="")

colnames(air) <-c("Wind",
"Solar", "CO", "NO", "NO2", "O3", "HC")

head(air)
##   Wind Solar CO NO NO2 O3 HC
## 1    8    98  7  2  12  8  2
## 2    7   107  4  3   9  5  3
## 3    7   103  4  3   5  6  3
## 4   10    88  5  2   8 15  4
## 5    6    91  4  2   8 10  3
## 6    8    90  5  2  12 12  4

Part 1.A.

Below, we have calculated the mahalanobis distance for each observation.

m_dist56 <- mahalanobis(air[,c(5,6)],colMeans(air[,c(5,6)]),cov(air[,c(5,6)]))
m_dist56
##  [1]  0.4606524  0.6592206  2.3770610  1.6282902  0.4135364  0.4760726
##  [7]  1.1848895 10.6391792  0.1388339  0.8162468  1.3566301  0.6228096
## [13]  5.6494392  0.3159498  0.4135364  0.1224973  0.8987982  4.7646873
## [19]  3.0089122  0.6592206  2.7741416  1.0360061  0.7874152  3.4437748
## [25]  6.1488606  1.0360061  0.1388339  0.8856041  0.1379719  2.2488867
## [31]  0.1901188  0.4606524  1.1471939  7.0857237  1.4584229  0.1224973
## [37]  1.8984708  2.7782596  8.4730649  0.6370218  0.7032485  1.8013611

Part 1.B.

76.19% of the observation fall within the approximate 70% probability contour of a bivariate normal distribution.

sum(m_dist56<=qchisq(.7,2))/42
## [1] 0.7619048

Part 1.C.

Based on the chi-square plot below, I would argue it is fairly non normal. About half way through, the slope changes and the function looks almost cubic.

q1 <- NULL
  for (i in 1:nrow(air[,5:6])){
    q1 <- cbind(q1, qchisq((i-0.5) / (nrow(air[,5:6])), ncol(air[,5:6]))) }

par(mar=c(5,5.5,4,2)+0.1)  # by hand calculation
plot(sort(m_dist56),q1,main ="Chi-squared Plot for NO2 and O3", xlab=expression(d[j]^2),
     ylab=expression(paste(chi[2]^2,"( ",frac(j-.5,42)," )")),pch=20)
abline(a=0,b=1,lwd=2)

Part 1.D.

Based on the chi-square plot below with all the variables, I would argue it is more normal than the first chi-squared plot from above. The points fall a bit more along the line.

m_dist_all <- mahalanobis(air,colMeans(air),cov(air)) # Mahalanobis distance

q1 <- NULL
  for (i in 1:nrow(air)){
    q1 <- cbind(q1, qchisq((i-0.5) / (nrow(air)), ncol(air))) }

par(mar=c(5,5.5,4,2)+0.1)  # by hand calculation
plot(sort(m_dist_all),q1,main ="Chi-squared Plot for all", xlab=expression(d[j]^2),
     ylab=expression(paste(chi[7]^2,"( ",frac(j-.5,42)," )")),pch=20)
abline(a=0,b=1,lwd=2)

Problem 2

Part 2.A.

pr2 <- matrix(c(2,12,8,9,6,9,8,10),4,2,byrow =TRUE)
n <- nrow(pr2)
p <- ncol(pr2)
mu0 <- matrix(c(7,11),ncol=1)

xbar <- as.matrix(colMeans(pr2), ncol=1)
S <- cov(pr2)

Tsq <- t(xbar-mu0)%*%solve(S)%*%(xbar-mu0)
Tsq*4
##          [,1]
## [1,] 13.63636

Part 2.B.

Under \(H_0\): \(\mu' = \mu_0'= [7 11]\), \(T^2\) is distributed as \(\frac{(n-1)p}{n-p}F_{p,n-p}\). So, \(T^2 \sim 3 F_{2,2}\)

Part 2.C.

Hypothesis Test: \(H_0\): \(\mu' = [7 11]\) \(H_1\): \(\mu' \ne [7 11]\)

Decision Rule: If \(T^2\) > F, reject the \(H_0\). If \(T^2\) < F, fail to reject the \(H_0\).

Conclusion: Since \(T2 = 13.636 < 3F_{2,2}(0.5) = 57\), we fail to reject the \(H_0\) at \(\alpha = 0.05\). We conclude that \(\mu' = [7 11]\)

F22 <- qf(1-0.05,df1=2,df2=2)
F22*3 #Critical value of F
## [1] 57

Problem 3

bird <- read.csv('bird_data.dat.txt', sep ="")
colnames(bird) <-c("Tail","Wing")

Part 3.A.

Hypothesis: \(H_0 : \mu_1 = 190mm\) and \(\mu_2 = 275mm\) \(H_a : \mu_1 \neq 190mm\) or \(\mu_2 \neq 275mm\)

Decision Rule: If p-value < \(\alpha\), we reject the \(H_0\).

Conclusion: \(p-value = 0.087 > \alpha = 0.05\), so we fail to reject our \(H_0\). That is, \(\mu_1 = 190mm\) and \(\mu_2 = 275mm\) are plausible values for the mean tail length and mean wing length for female birds.

S <- var(bird)
p <- ncol(S)
n <- nrow(bird)
testmean <- c(190, 275) ## the suggested true mean for tail and wing lengthes of female hook-billie kites ##
d <- apply(bird, 2, mean)-testmean
t2 <- n*t(d)%*%solve(S)%*%d;
t2mod <- (n-p)*t2/(p*(n-1))
pval <- 1- pf(t2mod,p,n-p)
pval
##           [,1]
## [1,] 0.0875292

Part 3.B.

Based on the ellipse below, we see values for \(\mu_1 and \mu_2\) are within our 95% confidence interval as we suggested in our hypothesis test in part a.

The green value is the tested values for \(\mu_1 and \mu_2\) and they are within the ellipse. The violet value is our mean for \(\mu_1 and \mu_2\) from our dataset.

library(ellipse)
## 
## Attaching package: 'ellipse'
## The following object is masked from 'package:graphics':
## 
##     pairs
c.sq=ncol(bird)*(nrow(bird)-1)/(nrow(bird)-ncol(bird))*qf(.95, ncol(bird), nrow(bird)-ncol(bird))

conf.ellipse=ellipse(S/nrow(bird), centre=apply(bird, 2, mean), t=sqrt(c.sq))
plot(bird)
lines(conf.ellipse,type="l",xlab=expression({delta}[1]),ylab=expression({delta}[2]),
     main=expression("95% confidence ellipse"))
points(190,275,pch=20, col='green')
points(193.6818, 279.6818, pch=20, col='violet')

Part 3.C.

Below, the first confidence interval output is the \(T^2\) interval. The second is the Bonferroni. We see that the Bonferroni 95% confidence interval is more conservative.

Both contain the \(\mu_1 and \mu_2\) values from part A.

Since the sqrt(F) > T, we use the Bonferroni confidence interval.

#F-Value for Sheffe
F <- ((ncol(bird)*(nrow(bird)-1))/(nrow(bird)-ncol(bird)))*qf(0.95, ncol(bird), nrow(bird)-ncol(bird))

sqrt(F)
## [1] 2.567726
#Tsquared
n1 <-nrow(bird)
p1 <- 2
dbar<-as.matrix(colMeans(bird),ncol=1)
Sd <- cov(bird)
Tsq1 = matrix(NA,2,2)
c1 = sqrt(p1*(n1-1)/(n1-p1)*qf(.95,p1,n1-p1))
for (i in 1:2){
  Tsq1[i,1] = dbar[i] - c1*sqrt(Sd[i,i]/n1) #left
  Tsq1[i,2] = dbar[i] + c1*sqrt(Sd[i,i]/n1) #right
}

#T-value
T <- qt(1-(0.05/(2*ncol(bird))), nrow(bird)-1)
T
## [1] 2.322618
#Bonferroni
m=2
Bonf1 = matrix(NA,2,2)
for (i in 1:m){
  Bonf1[i,1] = dbar[i] - qt(1-0.05/(2*m),n1-1)*sqrt(Sd[i,i]/n1) #left
  Bonf1[i,2] = dbar[i] + qt(1-0.05/(2*m),n1-1)*sqrt(Sd[i,i]/n1) #right
}

Bonf1
##          [,1]     [,2]
## [1,] 189.7932 197.5705
## [2,] 274.5720 284.7917
colnames(Tsq1) =colnames(Bonf1) =c("lower","upper")
list(Tsq1, Bonf1)
## [[1]]
##         lower    upper
## [1,] 189.3828 197.9809
## [2,] 274.0327 285.3309
## 
## [[2]]
##         lower    upper
## [1,] 189.7932 197.5705
## [2,] 274.5720 284.7917

Part 3.D.

Based on below, we see that from the QQ plot of tail length and wing length, along with the Shapiro test, both variables are seemingly normal. I don’t know if I would be able to tell that from the graphs alone. However, the null hypothesis is that the population is normall distributed. Both p-values in the shapiro test are greater than 0.05, so we fail to reject the null hypothesis and conclude that the tail length and wing lengths are in fact normally distributed.

par(mfrow=c(2,2))
hist(bird$Tail)
hist(bird$Wing)
qqnorm(bird$Tail);qqline(bird$Tail)  # Q-Q plot of Tail Length
qqnorm(bird$Wing);qqline(bird$Wing)  # Q-Q plot of Wing Length

shapiro.test(bird$Tail);shapiro.test(bird$Wing)
## 
##  Shapiro-Wilk normality test
## 
## data:  bird$Tail
## W = 0.96902, p-value = 0.2794
## 
##  Shapiro-Wilk normality test
## 
## data:  bird$Wing
## W = 0.98309, p-value = 0.7571
m_dist_bird <- mahalanobis(bird,colMeans(bird),cov(bird)) # Mahalanobis distance

q1 <- NULL
  for (i in 1:nrow(bird)){
    q1 <- cbind(q1, qchisq((i-0.5) / (nrow(bird)), ncol(bird))) }

par(mar=c(5,5.5,4,2)+0.1)  # by hand calculation
plot(sort(m_dist_bird),q1,main ="Chi-squared Plot for Bird", xlab=expression(d[j]^2),
     ylab=expression(paste(chi[7]^2,"( ",frac(j-.5,42)," )")),pch=20)
abline(a=0,b=1,lwd=2)

Problem 4.

mean_x <- c(46.1, 57.3, 50.4)
S <- matrix(c(101.3, 63, 71, 63, 80.2, 55.6, 71, 55.6, 97.4), 3, 3)

Part 4.A.

Hypothesis: \(H_0: \mu_1 = \mu_2 = \mu_3\) \(H_A\): At least 1 \(\mu_n\) does not equal the rest.

Decision Rule: If p-value < \(\alpha = 0.05\), we reject the \(H_0\).

Conclusion: \(p-value = 1.251548e-10 < \alpha = 0.05\), so we reject our \(H_0\). That is, at least 1 \(\mu_n\) does not equal the rest.

Cstar <-matrix( c(1,-1,0 ,1,0, -1), 2, 3, byrow=T)
Cstar
##      [,1] [,2] [,3]
## [1,]    1   -1    0
## [2,]    1    0   -1
Cxbar <- Cstar %*% mean_x
Cxvar <- Cstar %*% S %*% t(Cstar)
# Compute Hotelling statistic

q <- ncol(Cstar)
n <- 40
t2<-n*t(Cxbar)%*%solve(Cxvar)%*%Cxbar;
t2mod<- (n-q+1)*t2/((q-1)*(n-1))
pval <- 1- pf(t2mod,p,n-p)
pval
##              [,1]
## [1,] 1.251548e-10

Part 4.B.

Hypothesis: \(H_0: \begin{bmatrix} \mu_{1}-\mu_{2} \\ \mu_{1}-\mu_{3} \end{bmatrix} = 0\) or \(c_{\mu} = 0\)

\(H_A: \begin{bmatrix} \mu_{1}-\mu_{2} \\ \mu_{1}-\mu_{3} \end{bmatrix} \neq 0\) or \(c_{\mu} \neq 0\)

Decision Rule: p-value < \(\alpha\), we reject the \(H_0\).

Conclusion: Based on the F value of 6.660, we will reject the \(H_0\): c_n = 0. This F value indicates a difference between the two contrasts.

n=40
q=3
F <- (((q-1)*(n-1))/(n-q+1))*qf(0.95, q-1, n-q+1)

sqrt(F)
## [1] 2.580778
upper <- NULL
lower <- NULL

simultaneous.ci <- function(Cxbar, Cxvar){
  for (i in 1:ncol(Cxvar)){
  upper[i] <- Cxbar[i]+sqrt(F)*sqrt(Cxvar[i,i]/n)
  lower[i] <- Cxbar[i]-sqrt(F)*sqrt(Cxvar[i,i]/n)
  } 
  tab <- cbind(upper, lower)
  rownames(tab) <- c("Cx1", "Cx2") 
  tab
}


simultaneous.ci(Cxbar, Cxvar)
##         upper      lower
## Cx1 -8.160045 -14.239955
## Cx2 -1.227356  -7.372644

Problem 5.

Hypothesis: \(H_0:\mu_1-\mu_2=\delta_0\) \(H_A:\mu_1-\mu_2\neq\delta_0\)

Decision Rule: Reject \(H_0\) if \(T^2 > c^2\)

Conclusion: Based on our critical value of \(c^2 = 6.33459 < 82.33759\), we reject our \(H_0\). That is, there is a difference between the mean vectors between the two groups of noncarriers and carriers of hemophilia A gene.

Below we check for normality to make sure this test is viable.

Since n1-p and n2-p are large, we know that it is chi squared distributed based on the large sample theory.

noncarrier <- read.csv('noncarrier.txt', sep ="", header=FALSE)
colnames(noncarrier) <-c("ncAHF",
"ncAHFlike")

obligatory <- read.csv('obligatory.txt', sep ="", header=FALSE)
colnames(obligatory) <-c("AHF",
"AHFlike")
#######################
# Two Sample Independent
#######################

n_1<-dim(noncarrier)[1]
n_2<-dim(obligatory)[1]
p<-dim(noncarrier)[2]

# Testing H0:mu1-mu2=delta0
delta0=rep(0,p)
xbar_1 = colMeans(noncarrier);xbar_1
##       ncAHF   ncAHFlike 
## -0.13487000 -0.07785667
xbar_2 = colMeans(obligatory);xbar_2
##          AHF      AHFlike 
## -0.307946667 -0.005991111
S_1 = var(noncarrier);S_1
##                ncAHF  ncAHFlike
## ncAHF     0.02089726 0.01551495
## ncAHFlike 0.01551495 0.01792005
S_2 = var(obligatory);S_2
##                AHF    AHFlike
## AHF     0.02378380 0.01537617
## AHFlike 0.01537617 0.02403505
S_p=((n_1-1)*S_1+(n_2-1)*S_2)/(n_1+n_2-2);S_p
##                ncAHF ncAHFlike
## ncAHF     0.02263709 0.0154313
## ncAHFlike 0.01543130 0.0216058
(T2 = (n_1*n_2/(n_1+n_2))*t(xbar_1-xbar_2-delta0)%*%solve(S_p)%*%(xbar_1-xbar_2-delta0))
##          [,1]
## [1,] 82.33759
(crit = (n_1+n_2-2)*p/(n_1+n_2-p-1)*qf(.95,p,n_1+n_2-p-1))
## [1] 6.33459
T2
##          [,1]
## [1,] 82.33759
crit
## [1] 6.33459
# Linear combination of the mean components most responsible for rejection of H_0
(a<-solve(S_p)%*%(xbar_1-xbar_2))
##                [,1]
## ncAHF      19.31900
## ncAHFlike -17.12424
A = diag(p)

# T^2-intervals
Clower = t(A)%*%(xbar_1-xbar_2)-sqrt(crit)*sqrt(diag((t(A)%*%S_p%*%A)*(1/n_1+1/n_2)))
Cupper = t(A)%*%(xbar_1-xbar_2)+ sqrt(crit)*sqrt(diag((t(A)%*%S_p%*%A)*(1/n_1+1/n_2)))
cbind(Clower,Cupper)
##             [,1]       [,2]
## [1,]  0.08382151 0.26233183
## [2,] -0.15906389 0.01533278
# Bonferroni intervlas
crit.b = abs(qt(0.05/(2*p),(n_1+n_2-2)))
Clower.b = t(A)%*%(xbar_1-xbar_2)-crit.b*sqrt(diag((t(A)%*%S_p%*%A)*(1/n_1+1/n_2)))
Cupper.b = t(A)%*%(xbar_1-xbar_2)+crit.b*sqrt(diag((t(A)%*%S_p%*%A)*(1/n_1+1/n_2)))
cbind(Clower.b,Cupper.b)
##             [,1]        [,2]
## [1,]  0.09191721 0.254236127
## [2,] -0.15115475 0.007423642
# Assessing the normality and equal covariance assumptions 
par(mfrow=c(1,3))

# 3-d Scatterplot: Ellipsoidal? are the covariances the same?
library(scatterplot3d)
sp<-scatterplot3d(noncarrier,main="Red=Carrier")
sp$points3d(obligatory,col="red")

#Visually compare the sample variances and covariances between the two groups
cov(noncarrier) 
##                ncAHF  ncAHFlike
## ncAHF     0.02089726 0.01551495
## ncAHFlike 0.01551495 0.01792005
cov(obligatory)
##                AHF    AHFlike
## AHF     0.02378380 0.01537617
## AHFlike 0.01537617 0.02403505
# Chi-square plot for carrier and noncarrier separately.
distances <-function(D)
{
  dsq=c()
  M<-colMeans(D); S<-cov(D)
  n<-dim(D)[1]
  p<-dim(D)[2]
  for (i in 1:n) dsq<-c(dsq,as.matrix(D[i,]-M)%*%solve(S)%*%as.matrix(t(D[i,]-M)))
  as.numeric(dsq)
}

#Chi-square plot for noncarriers
qc<- qchisq( ((1:n_1)-1/2)/n_1,df=p)
par(mar=c(4,4,1,1))
plot(sort(distances(noncarrier)),qc,xlab=expression(d[j]^2),main="noncarrier")
abline(a=0,b=1)

#Chi-square plot for carrrier
qc<- qchisq( ((1:n_2)-1/2)/n_2,df=p)
par(mar=c(4,4,1,1))
plot(sort(distances(obligatory)),qc,xlab=expression(d[j]^2),main="carrier obligatory")
abline(a=0,b=1)

Problem 6.

H0 : µ1 + µ3 = µ2 + µ4 = µ5 + µ7 = µ6 + µ8

c = (1,-1,1,-1,-1,-1,-1,-1)

The procedure for testing the null hypothesis would involve the contrast matrix C. We would perform something similar to what we performed above in 4.B.

We have our C matrix. We would need to compute Cxbar and Cvbar which would be the mean and variance matrices of the C matrix.

Then, we would need to calculate the Hotelling statistic and the p-value.

Finally we can compute the F-statistic which can be calculated as follows:

F <- (((q-1)(n-1))/(n-q+1))qf(0.95, q-1, n-q+1)

Based on the results of the pvalue from the Hotelling statistic and the F value, we can decide to reject or fail to reject the \(H_0\).