Intro
Basic Sample Statistics
Mean; Var-Cov; Correlation Coefficient Matrix
| Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | Species |
|---|---|---|---|---|
| 5.1 | 3.5 | 1.4 | 0.2 | setosa |
| 4.9 | 3.0 | 1.4 | 0.2 | setosa |
| 4.7 | 3.2 | 1.3 | 0.2 | setosa |
| 4.6 | 3.1 | 1.5 | 0.2 | setosa |
| 5.0 | 3.6 | 1.4 | 0.2 | setosa |
| 5.4 | 3.9 | 1.7 | 0.4 | setosa |
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 5.843333 3.057333 3.758000 1.199333
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length 0.68112222 -0.04215111 1.2658200 0.5128289
## Sepal.Width -0.04215111 0.18871289 -0.3274587 -0.1208284
## Petal.Length 1.26582000 -0.32745867 3.0955027 1.2869720
## Petal.Width 0.51282889 -0.12082844 1.2869720 0.5771329
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length 0.6856935 -0.0424340 1.2743154 0.5162707
## Sepal.Width -0.0424340 0.1899794 -0.3296564 -0.1216394
## Petal.Length 1.2743154 -0.3296564 3.1162779 1.2956094
## Petal.Width 0.5162707 -0.1216394 1.2956094 0.5810063
Plots
# 2D Scatter Plot Matrix
colors <- brewer.pal(nlevels(iris$Species),"Dark2")
scatterplotMatrix(iris[1:4],
groups=iris[[5]],
smooth=FALSE,
regLine=FALSE,
legend=FALSE,
oma=c(0,0,8,0),
col=colors)
legend("top",
legend=levels(iris[[5]]),
pch=1:3,
col=colors,
horiz=TRUE,
bty="n",
cex=1.5,
xpd=TRUE)# 3D Plot
with(iris,
scatterplot3d(Sepal.Width,
Sepal.Length,
Petal.Length,
color=colors[Species],
type = "h",
pch=20))L1: Matrix
L2: Random Vectors & Moments
L3: Multivariate Normal Distribution
###########
## MVN Density function and Its Contours
###########
X <- seq(-1,5,length.out=100)
Y <- seq(2,8,length.out=100)
dens <- matrix(dmvnorm(expand.grid(X,Y),
mean=c(2,5),
sigma = rbind(c(1,.5),
c(.5, 1))),
ncol = 100)
persp(X,
Y,
dens,
main = "Density Surface",
box = FALSE)s3d <- scatterplot3d(X,
Y,
seq(min(dens),
max(dens),
length = length(X)),
type = "n",
grid = FALSE,
angle = 70,
zlab = "expression(f(x, y))",
xlab = expression(X), ylab = expression(Y),
main = "Density Surface")
for(i in length(X):1){
s3d$points3d(rep(X[i], length(Y)), Y, dens[i,], type = "l")
}
for(i in length(Y):1){
s3d$points3d(X, rep(Y[i], length(X)), dens[,i], type = "l")
}###########
## Simulated Data
###########
Xbvn <- rmvnorm(100,c(2,5),matrix(c(1,.5,.5,1),nrow=2))
dataEllipse(Xbvn,
xlim = c(-2,6),
ylim = c(2,10),
levels = c(0.5,0.75, 0.95),
pch = 20)Stiff Data
Testing Normality
Univariate Normality
| V1 | V2 | V3 | V4 |
|---|---|---|---|
| 1889 | 1651 | 1561 | 1778 |
| 2403 | 2048 | 2087 | 2197 |
| 2119 | 1700 | 1815 | 2222 |
| 1645 | 1627 | 1110 | 1533 |
| 1976 | 1916 | 1614 | 1883 |
| 1712 | 1712 | 1439 | 1546 |
colnames(stiff) <- c("x1","x2","x3","x4")
attach(stiff)
###########
## Univariate normality
###########
par(mfrow = c(2,2))
hist(x1)
hist(x4)
qqnorm(x1);qqline(x1) # Q-Q plot of x1
qqnorm(x4);qqline(x4) # Q-Q plot of x4##
## Shapiro-Wilk normality test
##
## data: x1
## W = 0.93068, p-value = 0.05118
##
## Shapiro-Wilk normality test
##
## data: x2
## W = 0.91274, p-value = 0.01746
##
## Shapiro-Wilk normality test
##
## data: x3
## W = 0.93258, p-value = 0.05751
##
## Shapiro-Wilk normality test
##
## data: x4
## W = 0.96127, p-value = 0.3337
Bivariate Normality
#Bivariate Percentage
par(mfrow = c(1,1))
plot(x1, x4, pch = 20)
quan_chisq = ellipse(cor(stiff[,c(1,4)]),
scale = c(sd(x1),sd(x4)),
center = c(0,0),
level = 0.5,
which = c(1, 2),
npoints = 100)
lines(quan_chisq[,1] + mean(x1),
quan_chisq[,2] + mean(x4),
col = "blue",
lwd = 2)# Mahalanobis distance between x1 & x4
m_dist14 <- mahalanobis(x = stiff[,c(1,4)],
center = colMeans(stiff[,c(1,4)]),
cov = cov(stiff[,c(1,4)]))
# 60% of Mahalanobis distance <= 50th quantile of chi-square
sum(m_dist14<=qchisq(.5,2))/30 ## [1] 0.6
Chi-Square Plot
# Calculate Mahalanobis distance
m_dist <- mahalanobis(x = stiff,
center = colMeans(stiff),
cov = cov(stiff))
sum (m_dist <= qchisq(.05,4))## [1] 3
par(mar=c(5,5.5,4,2)+0.1) # by hand calculation
plot(sort(m_dist),qchisq(((1:30)-0.5)/30,4),xlab=expression(d[j]^2),
ylab=expression(paste(chi[4]^2,"( ",frac(j-.5,30)," )")),pch=20)
abline(a=0,b=1,lwd=2)## $multivariateNormality
## Test H p value MVN
## 1 Royston 9.823858 0.009534665 NO
##
## $univariateNormality
## Test Variable Statistic p value Normality
## 1 Shapiro-Wilk x1 0.9307 0.0512 YES
## 2 Shapiro-Wilk x2 0.9127 0.0175 NO
## 3 Shapiro-Wilk x3 0.9326 0.0575 YES
## 4 Shapiro-Wilk x4 0.9613 0.3337 YES
##
## $Descriptives
## n Mean Std.Dev Median Min Max 25th 75th Skew
## x1 30 1906.100 324.9866 1863.0 1325 2983 1715.25 2057.25 1.0380842
## x2 30 1749.533 318.6065 1680.0 1170 2794 1595.50 1888.75 1.1435912
## x3 30 1509.133 303.1783 1466.0 1002 2412 1295.75 1623.75 0.9800274
## x4 30 1724.967 322.8436 1674.5 1176 2581 1520.25 1880.75 0.5978431
## Kurtosis
## x1 2.03586397
## x2 1.94986381
## x3 0.99683699
## x4 -0.04626509
Detecting Outliers
## scale.x1. scale.x2. scale.x3. scale.x4.
## 1 -0.05261755 -0.3092634 0.17107644 0.16426945
## 2 1.52898605 0.9367877 1.90602908 1.46211166
## 3 0.65510390 -0.1554687 1.00886726 1.53954854
## 4 -0.80341770 -0.3845914 -1.31649701 -0.59461204
## 5 0.21508578 0.5224835 0.34589106 0.48950437
## 6 -0.59725537 -0.1178047 -0.23132702 -0.55434486
## 7 0.11354314 -0.2025487 -0.78545638 -0.16716042
## 8 0.60894816 0.2211714 0.68562513 0.46162709
## 9 3.31367493 3.2782337 2.97800552 2.65154223
## 10 -0.49571272 -0.4693354 -0.41273841 -0.67204892
## 11 -0.60340947 -0.4975834 0.02924572 -0.17955033
## 12 0.43047927 0.4942355 0.38877012 0.53596650
## 13 -0.20339299 0.2870835 0.28322167 0.04966286
## 14 -0.12031265 -0.2025487 -0.05321401 -0.14547810
## 15 -0.14492905 -0.3155407 -0.39624647 -0.03396898
## 16 0.14739069 1.2537931 -1.08560978 -1.37517585
## 17 -1.78807364 -1.8189625 -1.67272303 -1.70041077
## 18 -1.49883096 -1.1880903 -0.84812577 -1.29154401
## 19 -0.24031759 -0.3626207 0.30631040 0.09302751
## 20 -0.55725372 -0.4881674 -0.64692404 -0.24459731
## 21 1.13820072 1.3793398 0.12489900 1.19572877
## 22 -0.02184705 -0.4253941 -0.28739963 -0.76807066
## 23 -0.84034230 -0.7423995 -0.72278698 -0.64726912
## 24 0.47663501 0.3686888 0.45143951 0.96651559
## 25 -0.15416020 -0.8051729 -0.50509331 -0.59461204
## 26 -0.55109962 -1.0594050 -0.89430321 -0.79285046
## 27 0.80587934 0.4597102 0.63285091 0.33772807
## 28 -0.77264721 -0.2339354 -0.31378674 -0.39637361
## 29 1.29205321 1.7308706 1.83346452 1.57671825
## 30 -1.28036042 -1.1535650 -0.97346455 -1.36588342
## [1] 3 25 27 24 8 12 22 10 29 4 11 2 16 1 6 30 19 20 7 9 28 23 5
## [24] 14 21 18 13 17 26 15
## m_dist
## [1,] 14 0.1295714
## [2,] 12 0.4635159
## [3,] 1 0.6000129
## [4,] 10 0.7665400
## [5,] 23 0.7962096
## [6,] 15 1.0792484
## [7,] 19 1.3632124
## [8,] 5 1.3980776
## [9,] 20 1.4649908
## [10,] 8 1.4876570
## [11,] 11 1.9307771
## [12,] 6 2.2191409
## [13,] 27 2.3816050
## [14,] 24 2.5385575
## [15,] 30 2.5838186
## [16,] 13 2.6959024
## [17,] 28 2.9951752
## [18,] 26 3.3979804
## [19,] 17 3.5018290
## [20,] 18 3.9900603
## [21,] 25 4.5767867
## [22,] 7 4.9883498
## [23,] 22 5.0557446
## [24,] 4 5.2076098
## [25,] 2 5.4770196
## [26,] 29 6.2837628
## [27,] 3 7.6166439
## [28,] 21 9.8980384
## [29,] 9 12.2647550
## [30,] 16 16.8474070
## $multivariateNormality
## Test H p value MVN
## 1 Royston 1.098338 0.6271166 YES
##
## $univariateNormality
## Test Variable Statistic p value Normality
## 1 Shapiro-Wilk x1 0.9847 0.9439 YES
## 2 Shapiro-Wilk x2 0.9664 0.4871 YES
## 3 Shapiro-Wilk x3 0.9672 0.5070 YES
## 4 Shapiro-Wilk x4 0.9660 0.4779 YES
##
## $Descriptives
## n Mean Std.Dev Median Min Max 25th 75th Skew
## x1 28 1865.929 262.1619 1857.5 1325 2403 1711.50 2049.75 0.08994538
## x2 28 1697.964 244.8618 1663.0 1170 2301 1593.25 1847.50 0.39091767
## x3 28 1488.643 253.1536 1466.0 1002 2087 1307.25 1617.25 0.49661284
## x4 28 1710.250 277.9986 1674.5 1176 2234 1528.75 1876.25 0.25921958
## Kurtosis
## x1 -0.5084972
## x2 0.1961808
## x3 0.0516768
## x4 -0.6484407
Transformations
# yjPower" family (Yeo and Johnson(2000)),
# another modifiation of the Box-Cox family that allows a few negative values.
trans <- powerTransform(stiff,
family = "yjPower")
summary(trans)## yjPower Transformations to Multinormality
## Est Power Rounded Pwr Wald Lwr Bnd Wald Upr Bnd
## x1 0.0939 1 -0.9911 1.1790
## x2 -0.2802 0 -1.5129 0.9525
## x3 0.1478 1 -0.9585 1.2542
## x4 0.7546 1 -0.5292 2.0385
##
## Likelihood ratio test that all transformation parameters are equal to 0
## LRT df pval
## LR test, lambda = (0 0 0 0) 1.980807 4 0.73929
# Scatterplot
plot(log(x1),log(x4),pch=20)
# Add Eclipse
quan_chisq1 <- ellipse(cor(logstiff[,c(1,4)]),
scale = c(sd(log(x1)),
sd(log(x4))),
center=c(0,0),
t = sqrt(qchisq(.5, 2)),
which = c(1, 2),
npoints = 100)
# Add Lines
lines(quan_chisq1[,1] + mean(log(x1)),
quan_chisq1[,2] + mean(log(x4)),
col="blue",
lwd=2)## $multivariateNormality
## Test H p value MVN
## 1 Royston 1.652091 0.5200565 YES
##
## $univariateNormality
## Test Variable Statistic p value Normality
## 1 Shapiro-Wilk x1 0.9742 0.6577 YES
## 2 Shapiro-Wilk x2 0.9626 0.3612 YES
## 3 Shapiro-Wilk x3 0.9795 0.8118 YES
## 4 Shapiro-Wilk x4 0.9831 0.9005 YES
##
## $Descriptives
## n Mean Std.Dev Median Min Max 25th 75th
## x1 30 7.539650 0.1631929 7.529941 7.189168 8.000685 7.447309 7.629120
## x2 30 7.452334 0.1721702 7.426545 7.064759 7.935230 7.374941 7.543648
## x3 30 7.301165 0.1910199 7.290123 6.909753 7.788212 7.166816 7.392488
## x4 30 7.436553 0.1834145 7.423268 7.069874 7.855932 7.326618 7.539424
## Skew Kurtosis
## x1 0.3660786 0.69728520
## x2 0.4814651 0.73170914
## x3 0.4107699 0.07436358
## x4 0.1581748 -0.44895116
L4: Inference for Mean Vectors
Sweat Data
Comparing Perspiration Between 2 Groups
Sample Statistics Sweat
We want to see if perspiration in Latin American healthy women in the age group 50-65 is the same as their counterparts in the North America. Perspiration is quantified in terms of the following variables
* X1: Sweat Rate
* X2: Sodium in Sweat
* X3: Potassium in Sweat
| V1 | V2 | V3 |
|---|---|---|
| 3.7 | 48.5 | 9.3 |
| 5.7 | 65.1 | 8.0 |
| 3.8 | 47.2 | 10.9 |
| 3.2 | 53.2 | 12.0 |
| 3.1 | 55.5 | 9.7 |
| 4.6 | 36.1 | 7.9 |
Hotelling Test
# Calculating Hotelling
## By hand
Tsq <- n*t(xbar-mu0)%*%solve(S)%*%(xbar-mu0)
## By Mahalanobis distance
Tsq <- n*mahalanobis(x = colMeans(sweat),
center = mu0,
cov = cov(sweat))
# Critical point at alpha = 0.05
cri <- (((n-1)*p)/(n-p))*qf((1-alpha),p,n-p)
# Tsq < cri Fail to reject H0
Tsq > cri## [1] FALSE
# Compute P value
Tsq.mod <- ((n-p)/p)/(n-1)*Tsq
p.val <- 1 - pf(Tsq.mod, p, (n-p))
test.frame <- data.frame(stat = Tsq,
mod.stat = Tsq.mod,
crit = cri,
reject = (Tsq > cri),
reject.mod = (Tsq.mod > cri),
p = p.val)
test.frame| stat | mod.stat | crit | reject | reject.mod | p |
|---|---|---|---|---|---|
| 9.738773 | 2.904546 | 10.7186 | FALSE | FALSE | 0.0649283 |
##
## Hotelling's one sample T2-test
##
## data: sweat
## T.2 = 2.9045, df1 = 3, df2 = 17, p-value = 0.06493
## alternative hypothesis: true location is not equal to c(4,50,10)
Confidence Intervals for \(\mu\)
# One at a time intervals of mu
One <- matrix(NA,3,2)
for (i in 1:3){
One[i,1] <- xbar[i] - qt(.975,n-1)*sqrt(S[i,i]/n) #left
One[i,2] <- xbar[i] + qt(.975,n-1)*sqrt(S[i,i]/n) #right
}
## T^2 interval mu
Tsq <- matrix(NA,3,2)
c <- sqrt(((p*(n-1))/(n-p))*qf(.95,p,n-p))
for (i in 1:3){
Tsq[i,1] <- xbar[i] - c*sqrt(S[i,i]/n)
Tsq[i,2] <- xbar[i] + c*sqrt(S[i,i]/n)
}
## Bonferroni interval mu when m = 3 (m is the number of CIs being estimated).
m <- 3
Bonf <- matrix(NA,3,2)
for (i in 1:m){
Bonf[i,1] <- xbar[i] - qt((1-alpha)/(2*m),n-1)*sqrt(S[i,i]/n) #left
Bonf[i,2] <- xbar[i] + qt((1-alpha)/(2*m),n-1)*sqrt(S[i,i]/n) #right
}
colnames(One) = colnames(Tsq) =colnames(Bonf) =c("lower","upper")
row.names(One)=row.names(Tsq)=row.names(Bonf)=c("SweatRate","Sodium","Potassium")
list(One, Tsq, Bonf)## [[1]]
## lower upper
## SweatRate 3.845840 5.43416
## Sodium 38.784779 52.01522
## Potassium 9.073601 10.85640
##
## [[2]]
## lower upper
## SweatRate 3.397768 5.882232
## Sodium 35.052408 55.747592
## Potassium 8.570664 11.359336
##
## [[3]]
## lower upper
## SweatRate 5.030216 4.249784
## Sodium 48.650435 42.149565
## Potassium 10.402995 9.527005
Waste Water Data
Municipal wastewater treatment plants are required by law to monitor their discharges into rivers and streams on a regular basis. Concern about the reliability of data from one of these self-monitoring program led to a study in which samples of effluent were divided and sent to two laboratories.
Compare two lab’s measurement by two variables:
* X1 & X3: Biochemical oxygen demand (BOD) from two labs
* X2 & X4: Suspended solids (SS) from two labs
Paired Comparison Tests
## V1 V2 V3 V4
## [1,] 6 27 25 15
## [2,] 6 23 28 13
## [3,] 18 64 36 22
## [4,] 8 44 35 29
## [5,] 11 30 15 31
## [6,] 34 75 44 64
n1 <- nrow(ww)
p1 <- 2
# Calculate differences mu1-mu3 & mu2-mu4
D <- ww[,1:2] - ww[,3:4]
## Estimate of mu1-mu2
dbar <- as.matrix(colMeans(D),ncol=1)
Sd <- cov(D)
## Using D
Tsq1 <- n1*t(dbar)%*%solve(Sd)%*%(dbar)
cri1 <- (((n1-1)*p1)/(n1-p1)) * qf((1-alpha), p1, n1 - p1)
pval1 <- 1 - pf(Tsq1*(n1 - p1)/p1/(n1-1), p1, n1 - p1)
# Results
test.frame <- data.frame(stat = Tsq1,
crit = cri1,
reject = (Tsq1 > cri1),
p = pval1)
test.frame| stat | crit | reject | p |
|---|---|---|---|
| 13.63931 | 9.458877 | TRUE | 0.0208278 |
Alternate Method Using Matrix
C <- cbind(diag(p1),-diag(p1))
D <- t(C%*%t(ww))
dbar <- as.matrix(colMeans(D), ncol=1)
Sd <- cov(D)
Tsq1 <- n1*t( C%*%colMeans(ww))%*%solve(C%*%cov(ww)%*%t(C))%*%(C%*%colMeans(ww))
test.frame <- data.frame(stat = Tsq1,
crit = cri1,
reject = (Tsq1 > cri1),
p = pval1)
test.frame| stat | crit | reject | p |
|---|---|---|---|
| 13.63931 | 9.458877 | TRUE | 0.0208278 |
Confidence Region/Interval for Delta
plot(D)
CR <- ellipse(x = cor(D),
scale = sqrt(diag(Sd)/n1),
centre = as.vector(dbar),
t = sqrt(cri1))
#n1*mahalanobis(as.vector(dbar),as.vector(CR[3,]),cov(D))
lines(CR,
type = "l",
xlab = expression({delta}[1]),
ylab = expression({delta}[2]),
main = expression("95% confidence ellipse"))
points(0,0,pch=20)## One at a time intervals of for delta1 and delta2
One1 <- matrix(NA,2,2)
for (i in 1:2){
One1[i,1] <- dbar[i] - qt(.975,n1-1)*sqrt(Sd[i,i]/n1)
One1[i,2] <- dbar[i] + qt(.975,n1-1)*sqrt(Sd[i,i]/n1)
}
## T^2 interval for delta1 and delta2
Tsq1 <- matrix(NA,2,2)
c1 <- sqrt(((p1*(n1-1))/(n1-p1))*qf(.95,p1,n1-p1))
for (i in 1:2){
Tsq1[i,1] <- dbar[i] - c1*sqrt(Sd[i,i]/n1)
Tsq1[i,2] <- dbar[i] + c1*sqrt(Sd[i,i]/n1)
}
## Bonferroni Interval for delta1 and delta2 when m=2 (m is the number of CIs).
m <- 2
Bonf1 <- matrix(NA,2,2)
for (i in 1:m){
Bonf1[i,1] <- dbar[i] - qt((1-0.05/(2*m)),n1-1)*sqrt(Sd[i,i]/n1)
Bonf1[i,2] <- dbar[i] + qt((1-0.05/(2*m)),n1-1)*sqrt(Sd[i,i]/n1)
}
cols <- c("lower","upper")
rows <- c("BOD","SS")
colnames(One1) = colnames(Tsq1) = colnames(Bonf1) = cols
row.names(One1) = row.names(Tsq1) = row.names(Bonf1) = rows
out <- list(One1, Tsq1, Bonf1)
names(out) <- c("One-at-a-time Intervals", "T^2 Intervals", "Bonferroni Simultaneous Intervals")
out## $`One-at-a-time Intervals`
## lower upper
## BOD -18.8467298 0.119457
## SS -0.4725958 27.018050
##
## $`T^2 Intervals`
## lower upper
## BOD -22.453272 3.72600
## SS -5.700119 32.24557
##
## $`Bonferroni Simultaneous Intervals`
## lower upper
## BOD -20.573107 1.845835
## SS -2.974903 29.520358
Turtles Dataset
Two Sample Independent Tests
Measurements on the carapaces of 24 female and 24 male painted turtles:
* X1: Length
* X2: Width
* X3: Height
* X4: Sex (female and male)
rm(list=ls())
alpha <- .05
X <- read.table("resources/data/T6-9.dat",
header = FALSE,
col.names = c("Length", "Width", "Height", "Sex"))
# log transformation improves Normality.
X[,1:3] <- log(X[,1:3])
X1 <- X[X$Sex=="male",1:3]
X2 <- X[X$Sex=="female",1:3]
n1 <- dim(X1)[1]
n2 <- dim(X2)[1]
p <- dim(X1)[2]
# Testing H0: mu1-mu2 = delta0
delta0 <- rep(0,p)
xbar1 <- colMeans(X1)
xbar2 <- colMeans(X2)
S1 <- var(X1)
S2 <- var(X2)
Sp <- ((n1-1)*S1+(n2-1)*S2)/(n1+n2-2)
T2 <- (n1*n2/(n1+n2))*t(xbar1-xbar2-delta0)%*%solve(Sp)%*%(xbar1-xbar2-delta0)
crit <- (n1 + n2 - 2)*p/(n1 + n2 - p - 1)*qf(.95, p, n1 + n2 - p - 1)
# Linear combination of the mean components most responsiblefor rejection of H_0
a <- solve(Sp)%*%(xbar1 - xbar2)
A <- diag(p) 2-Sample Independent Confidence Intervals
# T^2-intervals
Clower <- t(A)%*%(xbar1 - xbar2) - sqrt(crit) * sqrt(diag((t(A)%*%Sp%*%A)*(1/n1 + 1/n2)))
Cupper <- t(A)%*%(xbar1 - xbar2) + sqrt(crit) * sqrt(diag((t(A)%*%Sp%*%A)*(1/n1 + 1/n2)))
cbind(Clower,Cupper)## [,1] [,2]
## [1,] -0.2926638 -0.05776762
## [2,] -0.2365537 -0.05411666
## [3,] -0.3451377 -0.12906223
# Bonferroni Intervals
crit.b <- abs(qt(alpha/(2*p), (n1 + n2-2)))
Clower.b <- t(A)%*%(xbar1 - xbar2) - crit.b*sqrt(diag((t(A)%*%Sp%*%A)*(1/n1+1/n2)))
Cupper.b <- t(A)%*%(xbar1 - xbar2) + crit.b*sqrt(diag((t(A)%*%Sp%*%A)*(1/n1+1/n2)))
cbind(Clower.b,Cupper.b)## [,1] [,2]
## [1,] -0.2734025 -0.07702893
## [2,] -0.2215940 -0.06907636
## [3,] -0.3274197 -0.14678026
# Assessing the normality and equal covariance assumptions
par(mfrow = c(1,3))
# 3-d Scatterplot: Ellipsoidal? are the covariances the same?
sp <- scatterplot3d(X1,
xlim = c(4.4,5.3),
ylim = c(4.2,5),
zlim = c(3.4,4.4),
main = "Red = FEMALE")
sp$points3d(X2, col = "red")
#Visually compare the sample variances and covariances between the two groups
cov(X1) ## Length Width Height
## Length 0.011072004 0.008019142 0.008159648
## Width 0.008019142 0.006416726 0.006005271
## Height 0.008159648 0.006005271 0.006772758
## Length Width Height
## Length 0.02640563 0.02011195 0.02491758
## Width 0.02011195 0.01619045 0.01942430
## Height 0.02491758 0.01942430 0.02493980
Chi-Square Plots
# Chi-square plot for males and females separately.
distances <-function(D){
dsq <- c()
M <- colMeans(D)
S <- cov(D)
n <- dim(D)[1]
p <- dim(D)[2]
for(i in 1:n){
dsq <- c(dsq, as.matrix(D[i,]-M)%*%solve(S)%*%as.matrix(t(D[i,]-M)))
}
as.numeric(dsq)
}
#Chi-square plot for males
qc <- qchisq( ((1:n1)-1/2)/n1, df = p)
par(mar = c(4,4,1,1))
plot(sort(distances(X1)),
qc,
xlab = expression(d[j]^2),
main="Male")
abline(a = 0, b = 1)#Chi-square plot for females
qc <- qchisq( ((1:n2)-1/2)/n2, df = p)
par(mar = c(4,4,1,1))
plot(sort(distances(X2)),
qc,
xlab = expression(d[j]^2),
main = "Female")
abline(a = 0, b = 1)Tests For Additional Information
X11 <- X[X$Sex=="male",1:2]
X12 <- X[X$Sex=="female",1:2]
# Testing H0: mu1-mu2 = delta0
xbar11 <- colMeans(X11)
xbar12 <- colMeans(X12)
S11 <- var(X11)
S12 <- var(X12)
S1p <- ((n1-1)*S11+(n2-1)*S12)/(n1+n2-2)
T12 <- (n1*n2/(n1+n2))*t(xbar11-xbar12)%*%solve(S1p)%*%(xbar11 - xbar12)
U <- (T2-T12)/(n1+n2-2+T12)
p <- 3
p1 <- 2
p2 <- 1
crit2 <- p2/(n1+n2-p-1)*qf((1-alpha), p2, n1+n2-p-1)Iris Data
One-Way MANOVA Manually
| V1 | V2 | V3 | V4 | V5 |
|---|---|---|---|---|
| 5.1 | 3.5 | 1.4 | 0.2 | 1 |
| 4.9 | 3.0 | 1.4 | 0.2 | 1 |
| 4.7 | 3.2 | 1.3 | 0.2 | 1 |
| 4.6 | 3.1 | 1.5 | 0.2 | 1 |
| 5.0 | 3.6 | 1.4 | 0.2 | 1 |
| 5.4 | 3.9 | 1.7 | 0.4 | 1 |
## [1] 1 2 3
## [1] 3
## G
## 1 2 3
## 50 50 50
# Total sample size
n <- sum(N)
X <- as.matrix(D[,c(2,4)])
# dimension
p <- dim(X)[2]
X1 <- X[G==1,]
X2 <- X[G==2,]
X3 <- X[G==3,]
###
# Getting a feel for the data
##
plot(X1,
xlim=c(1,5),
ylim=c(0,3),
col="red",
ylab="Petal",
xlab="Sepal",
main="Width Measurements from Three Species of Iris Plants")
points(X2,
col="blue")
points(X3,
col="green")
text(4,0.5,
"Species 1",
col="red")
text(4,1.5,
"Species 2",
col="blue")
text(4,2.25,
"Species 3",
col="green")# Matrix for B
C1 <- c(1)
for (i in 1:g){
C1 <- bdiag(C1, (1/N[i])*matrix(1,N[i],N[i]))
}
C1 <- as.matrix(C1[-1,-1]-(1/n)*matrix(1,n,n))
# Matrix for W
C2 <- c(1)
for (i in 1:g){
C2 <- bdiag(C2, diag(1,N[i])-(1/N[i])*matrix(1,N[i],N[i]))
}
C2 <- as.matrix(C2[-1,-1])
# SS
B <- t(X)%*%C1%*%X
W <- t(X)%*%C2%*%X
# Test Stat
Lambda <- det(W)/det(B+W)
# Lambda has the same distributions as U(2,2,147)
m1 <- g-1; m1## [1] 2
## [1] 147
## [1] 299.936
## [1] 2.402562
One-Way Manova Function
test1 <- manova(cbind(D[,2],D[,4]) ~ as.character(G))
#c("Pillai", "Wilks", "Hotelling-Lawley", "Roy")
summary(test1,'Wilks') ## Df Wilks approx F num Df den Df Pr(>F)
## as.character(G) 2 0.038316 299.94 4 292 < 2.2e-16 ***
## Residuals 147
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Response 1 :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.character(G) 2 11.345 5.6725 49.16 < 2.2e-16 ***
## Residuals 147 16.962 0.1154
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response 2 :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.character(G) 2 80.413 40.207 960.01 < 2.2e-16 ***
## Residuals 147 6.157 0.042
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Pairwise Comparison With Hotelling’s T2
# Testing H0:mui-muj=delta0
delta0 <- rep(0,p)
xbar1 <- colMeans(X1)
xbar2 <- colMeans(X2)
xbar3 <- colMeans(X3)
Sp <- W/(n-g)
# Number of pairs of mu you are comparing
m <- g*(g-1)/2
crit <- ((n-g)*p/(n-g-p+1))*qf(1- alpha/m,p,n-g-p+1)
# Between Species 1 and 2
T2 <- (N[1]*N[2]/(N[1]+N[2]))*t(xbar1-xbar2-delta0)%*%solve(Sp)%*%(xbar1-xbar2-delta0)
# Between Species 1 and 3
T2 <- (N[1]*N[3]/(N[1]+N[3]))*t(xbar1-xbar3-delta0)%*%solve(Sp)%*%(xbar1-xbar3-delta0)
# Between Species 2 and 3
T2 <- (N[2]*N[3]/(N[2]+N[3]))*t(xbar2-xbar3-delta0)%*%solve(Sp)%*%(xbar2-xbar3-delta0)
# Bonferoni Intervals
m <- p*g*(g-1)/2
crit.b <- qt(1- alpha/(2*m), n-g)
# Comparing Species 1 and 2
xbar1-xbar2 - crit.b*sqrt((1/N[1]+1/N[2])*diag(Sp))## V2 V4
## 0.4763056 -1.1894645
## V2 V4
## 0.8396944 -0.9705355
## V2 V4
## 0.2723056 -1.8894645
## V2 V4
## 0.6356944 -1.6705355
## V2 V4
## -0.3856944 -0.8094645
## V2 V4
## -0.02230564 -0.59053548
Plastic Data
Factor 1: Change in the rate of extrusion 0 for Low 10% 1 for High 10%
Factor 2: Amount of additive 0 for Low 1% 1 for High 1.5%
* X1: Tear resistance
* X2: Gloss
* X3: Opacity
Two-Way MANOVA
| V1 | V2 | V3 | V4 | V5 |
|---|---|---|---|---|
| 0 | 0 | 6.5 | 9.5 | 4.4 |
| 0 | 0 | 6.2 | 9.9 | 6.4 |
| 0 | 0 | 5.8 | 9.6 | 3.0 |
| 0 | 0 | 6.5 | 9.6 | 4.1 |
| 0 | 0 | 6.5 | 9.2 | 0.8 |
| 0 | 1 | 6.9 | 9.1 | 5.7 |
plastic <- as.list(plastic)
test2 <- manova(cbind(V3,V4,V5)~V1*V2, data = plastic)
summary.aov(test2)## Response V3 :
## Df Sum Sq Mean Sq F value Pr(>F)
## V1 1 1.7405 1.74050 15.7868 0.001092 **
## V2 1 0.7605 0.76050 6.8980 0.018330 *
## V1:V2 1 0.0005 0.00050 0.0045 0.947143
## Residuals 16 1.7640 0.11025
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response V4 :
## Df Sum Sq Mean Sq F value Pr(>F)
## V1 1 1.3005 1.30050 7.9178 0.01248 *
## V2 1 0.6125 0.61250 3.7291 0.07139 .
## V1:V2 1 0.5445 0.54450 3.3151 0.08740 .
## Residuals 16 2.6280 0.16425
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response V5 :
## Df Sum Sq Mean Sq F value Pr(>F)
## V1 1 0.421 0.4205 0.1036 0.7517
## V2 1 4.901 4.9005 1.2077 0.2881
## V1:V2 1 3.960 3.9605 0.9760 0.3379
## Residuals 16 64.924 4.0578
## Df Wilks approx F num Df den Df Pr(>F)
## V1 1 0.38186 7.5543 3 14 0.003034 **
## V2 1 0.52303 4.2556 3 14 0.024745 *
## V1:V2 1 0.77711 1.3385 3 14 0.301782
## Residuals 16
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Box’s M Test for Equality of Covariance
##
## Box's M-test for Homogeneity of Covariance Matrices
##
## data: cbind(plastic$V3, plastic$V4, plastic$V5)
## Chi-Sq (approx.) = 7.1269, df = 6, p-value = 0.3093
boxM <-function(data, grouping) {
if (!inherits(data, c("data.frame", "matrix")))
stop("'data' must be a numeric data.frame or matrix!")
if (length(grouping) != nrow(data))
stop("incompatible dimensions!")
dname <- deparse(substitute(data))
data <- as.matrix(data)
grouping <- as.factor(as.character(grouping))
p <- ncol(data)
nlev <- nlevels(grouping)
lev <- levels(grouping)
dfs <- tapply(grouping, grouping, length) - 1
if (any(dfs < p))
warning("there are one or more levels with less observations than variables!")
mats <- aux <- list()
for(i in 1:nlev) {
mats[[i]] <- cov(data[grouping == lev[i], ])
aux[[i]] <- mats[[i]] * dfs[i]
}
names(mats) <- lev
pooled <- Reduce("+", aux) / sum(dfs)
logdet <- log(unlist(lapply(mats, det)))
minus2logM <- sum(dfs) * log(det(pooled)) - sum(logdet * dfs)
sum1 <- sum(1 / dfs)
Co <- (((2 * p^2) + (3 * p) - 1) / (6 * (p + 1) *
(nlev - 1))) * (sum1 - (1 / sum(dfs)))
X2 <- minus2logM * (1 - Co)
dfchi <- (choose(p, 2) + p) * (nlev - 1)
pval <- pchisq(X2, dfchi, lower.tail = FALSE)
out <- structure(
list(statistic = c("Chi-Sq (approx.)" = X2),
parameter = c(df = dfchi),
p.value = pval,
cov = mats, pooled = pooled, logDet = logdet,
data.name = dname,
method = " Box's M-test for Homogeneity of Covariance Matrices"
),
class = c("htest", "boxM")
)
return(out)
}