Intro

Basic Sample Statistics

Mean; Var-Cov; Correlation Coefficient Matrix

kable(head(iris))

Sepal.Length	Sepal.Width	Petal.Length	Petal.Width	Species
5.1	3.5	1.4	0.2	setosa
4.9	3.0	1.4	0.2	setosa
4.7	3.2	1.3	0.2	setosa
4.6	3.1	1.5	0.2	setosa
5.0	3.6	1.4	0.2	setosa
5.4	3.9	1.7	0.4	setosa

n <- nrow(iris)
colMeans(iris[1:4]) # sample mean

## Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
##     5.843333     3.057333     3.758000     1.199333

cov(iris[1:4])*(n-1)/n #sample variance-covariance matrix

##              Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length   0.68112222 -0.04215111    1.2658200   0.5128289
## Sepal.Width   -0.04215111  0.18871289   -0.3274587  -0.1208284
## Petal.Length   1.26582000 -0.32745867    3.0955027   1.2869720
## Petal.Width    0.51282889 -0.12082844    1.2869720   0.5771329

cov(iris[1:4]) #sample correlation coefficient matrix

##              Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length    0.6856935  -0.0424340    1.2743154   0.5162707
## Sepal.Width    -0.0424340   0.1899794   -0.3296564  -0.1216394
## Petal.Length    1.2743154  -0.3296564    3.1162779   1.2956094
## Petal.Width     0.5162707  -0.1216394    1.2956094   0.5810063

Plots

# Iris Data
# Marginal Histogram
hist(iris$Sepal.Length); rug(iris$Sepal.Length)

# 2D Scatter Plot Matrix
colors <- brewer.pal(nlevels(iris$Species),"Dark2")
scatterplotMatrix(iris[1:4],
                  groups=iris[[5]],
                  smooth=FALSE,
                  regLine=FALSE,
                  legend=FALSE,
                  oma=c(0,0,8,0),
                  col=colors)
legend("top",
       legend=levels(iris[[5]]),
       pch=1:3,
       col=colors,
       horiz=TRUE,
       bty="n",
       cex=1.5,
       xpd=TRUE)

# 3D Plot
with(iris,
     scatterplot3d(Sepal.Width,
                   Sepal.Length,
                   Petal.Length,
                   color=colors[Species],
                   type = "h",
                   pch=20))

# Star Plot
stars(iris[-5],
      col.lines=colors[iris$Species],
      locations=0:1,
      key.loc=0:1)

L1: Matrix

L2: Random Vectors & Moments

L3: Multivariate Normal Distribution

###########
## MVN Density function and Its Contours
###########
X <- seq(-1,5,length.out=100)
Y <- seq(2,8,length.out=100)
dens <- matrix(dmvnorm(expand.grid(X,Y), 
                       mean=c(2,5),
                       sigma = rbind(c(1,.5), 
                                     c(.5, 1))), 
               ncol = 100)

persp(X, 
      Y, 
      dens,  
      main = "Density Surface", 
      box = FALSE)

s3d <- scatterplot3d(X,
                     Y,
                     seq(min(dens), 
                         max(dens), 
                         length = length(X)),
                     type = "n", 
                     grid = FALSE, 
                     angle = 70,
                     zlab = "expression(f(x, y))",
                     xlab = expression(X), ylab = expression(Y),
                     main = "Density Surface")
for(i in length(X):1){
  s3d$points3d(rep(X[i], length(Y)), Y, dens[i,], type = "l")
}
for(i in length(Y):1){
  s3d$points3d(X, rep(Y[i], length(X)), dens[,i], type = "l")
}

contour(X,Y,dens)

###########
## Simulated Data
###########
Xbvn <- rmvnorm(100,c(2,5),matrix(c(1,.5,.5,1),nrow=2))
dataEllipse(Xbvn, 
            xlim = c(-2,6), 
            ylim = c(2,10), 
            levels = c(0.5,0.75, 0.95), 
            pch = 20)

Stiff Data

Testing Normality

Univariate Normality

stiff <- read.table("resources/data/T4-3.DAT", header = FALSE)[,1:4]
head(stiff)

V1	V2	V3	V4
1889	1651	1561	1778
2403	2048	2087	2197
2119	1700	1815	2222
1645	1627	1110	1533
1976	1916	1614	1883
1712	1712	1439	1546

colnames(stiff) <- c("x1","x2","x3","x4")
attach(stiff)

###########
## Univariate normality
###########
par(mfrow = c(2,2))
hist(x1)
hist(x4)
qqnorm(x1);qqline(x1)  # Q-Q plot of x1
qqnorm(x4);qqline(x4)  # Q-Q plot of x4

# Shapiro Test for Univariate Normality
shapiro.test(x1)

## 
##  Shapiro-Wilk normality test
## 
## data:  x1
## W = 0.93068, p-value = 0.05118

shapiro.test(x2)

## 
##  Shapiro-Wilk normality test
## 
## data:  x2
## W = 0.91274, p-value = 0.01746

shapiro.test(x3)

## 
##  Shapiro-Wilk normality test
## 
## data:  x3
## W = 0.93258, p-value = 0.05751

shapiro.test(x4)

## 
##  Shapiro-Wilk normality test
## 
## data:  x4
## W = 0.96127, p-value = 0.3337

Bivariate Normality

#Scatter Plot Matrix
pairs(stiff)

#Bivariate Percentage
par(mfrow = c(1,1))  
plot(x1, x4, pch = 20)
quan_chisq = ellipse(cor(stiff[,c(1,4)]), 
                     scale = c(sd(x1),sd(x4)), 
                     center = c(0,0), 
                     level = 0.5, 
                     which = c(1, 2), 
                     npoints = 100)
lines(quan_chisq[,1] + mean(x1),
      quan_chisq[,2] + mean(x4),
      col = "blue",
      lwd = 2)

# Mahalanobis distance between x1 & x4
m_dist14 <- mahalanobis(x = stiff[,c(1,4)],
                        center = colMeans(stiff[,c(1,4)]),
                        cov = cov(stiff[,c(1,4)]))

# 60% of Mahalanobis distance <= 50th quantile of chi-square
sum(m_dist14<=qchisq(.5,2))/30

## [1] 0.6

Chi-Square Plot

# Calculate Mahalanobis distance
m_dist <- mahalanobis(x = stiff, 
                      center = colMeans(stiff),
                      cov = cov(stiff))

sum (m_dist <= qchisq(.05,4))

## [1] 3

par(mar=c(5,5.5,4,2)+0.1)  # by hand calculation
plot(sort(m_dist),qchisq(((1:30)-0.5)/30,4),xlab=expression(d[j]^2),
     ylab=expression(paste(chi[4]^2,"( ",frac(j-.5,30)," )")),pch=20)
abline(a=0,b=1,lwd=2)

# Directly by function mvn()
mvn(stiff, 
    mvnTest = "royston", 
    multivariatePlot="qq")

## $multivariateNormality
##      Test        H     p value MVN
## 1 Royston 9.823858 0.009534665  NO
## 
## $univariateNormality
##           Test  Variable Statistic   p value Normality
## 1 Shapiro-Wilk    x1        0.9307    0.0512    YES   
## 2 Shapiro-Wilk    x2        0.9127    0.0175    NO    
## 3 Shapiro-Wilk    x3        0.9326    0.0575    YES   
## 4 Shapiro-Wilk    x4        0.9613    0.3337    YES   
## 
## $Descriptives
##     n     Mean  Std.Dev Median  Min  Max    25th    75th      Skew
## x1 30 1906.100 324.9866 1863.0 1325 2983 1715.25 2057.25 1.0380842
## x2 30 1749.533 318.6065 1680.0 1170 2794 1595.50 1888.75 1.1435912
## x3 30 1509.133 303.1783 1466.0 1002 2412 1295.75 1623.75 0.9800274
## x4 30 1724.967 322.8436 1674.5 1176 2581 1520.25 1880.75 0.5978431
##       Kurtosis
## x1  2.03586397
## x2  1.94986381
## x3  0.99683699
## x4 -0.04626509

Detecting Outliers

# Standardized Value
print(data.frame(scale(x1),scale(x2),scale(x3),scale(x4)))

##      scale.x1.  scale.x2.   scale.x3.   scale.x4.
## 1  -0.05261755 -0.3092634  0.17107644  0.16426945
## 2   1.52898605  0.9367877  1.90602908  1.46211166
## 3   0.65510390 -0.1554687  1.00886726  1.53954854
## 4  -0.80341770 -0.3845914 -1.31649701 -0.59461204
## 5   0.21508578  0.5224835  0.34589106  0.48950437
## 6  -0.59725537 -0.1178047 -0.23132702 -0.55434486
## 7   0.11354314 -0.2025487 -0.78545638 -0.16716042
## 8   0.60894816  0.2211714  0.68562513  0.46162709
## 9   3.31367493  3.2782337  2.97800552  2.65154223
## 10 -0.49571272 -0.4693354 -0.41273841 -0.67204892
## 11 -0.60340947 -0.4975834  0.02924572 -0.17955033
## 12  0.43047927  0.4942355  0.38877012  0.53596650
## 13 -0.20339299  0.2870835  0.28322167  0.04966286
## 14 -0.12031265 -0.2025487 -0.05321401 -0.14547810
## 15 -0.14492905 -0.3155407 -0.39624647 -0.03396898
## 16  0.14739069  1.2537931 -1.08560978 -1.37517585
## 17 -1.78807364 -1.8189625 -1.67272303 -1.70041077
## 18 -1.49883096 -1.1880903 -0.84812577 -1.29154401
## 19 -0.24031759 -0.3626207  0.30631040  0.09302751
## 20 -0.55725372 -0.4881674 -0.64692404 -0.24459731
## 21  1.13820072  1.3793398  0.12489900  1.19572877
## 22 -0.02184705 -0.4253941 -0.28739963 -0.76807066
## 23 -0.84034230 -0.7423995 -0.72278698 -0.64726912
## 24  0.47663501  0.3686888  0.45143951  0.96651559
## 25 -0.15416020 -0.8051729 -0.50509331 -0.59461204
## 26 -0.55109962 -1.0594050 -0.89430321 -0.79285046
## 27  0.80587934  0.4597102  0.63285091  0.33772807
## 28 -0.77264721 -0.2339354 -0.31378674 -0.39637361
## 29  1.29205321  1.7308706  1.83346452  1.57671825
## 30 -1.28036042 -1.1535650 -0.97346455 -1.36588342

# Generalized Squared Distance--Mahalanobis distance
rank(m_dist)

##  [1]  3 25 27 24  8 12 22 10 29  4 11  2 16  1  6 30 19 20  7  9 28 23  5
## [24] 14 21 18 13 17 26 15

cbind(1:30,m_dist)[order(m_dist),]

##              m_dist
##  [1,] 14  0.1295714
##  [2,] 12  0.4635159
##  [3,]  1  0.6000129
##  [4,] 10  0.7665400
##  [5,] 23  0.7962096
##  [6,] 15  1.0792484
##  [7,] 19  1.3632124
##  [8,]  5  1.3980776
##  [9,] 20  1.4649908
## [10,]  8  1.4876570
## [11,] 11  1.9307771
## [12,]  6  2.2191409
## [13,] 27  2.3816050
## [14,] 24  2.5385575
## [15,] 30  2.5838186
## [16,] 13  2.6959024
## [17,] 28  2.9951752
## [18,] 26  3.3979804
## [19,] 17  3.5018290
## [20,] 18  3.9900603
## [21,] 25  4.5767867
## [22,]  7  4.9883498
## [23,] 22  5.0557446
## [24,]  4  5.2076098
## [25,]  2  5.4770196
## [26,] 29  6.2837628
## [27,]  3  7.6166439
## [28,] 21  9.8980384
## [29,]  9 12.2647550
## [30,] 16 16.8474070

# By MVN Function
mvn(stiff[-c(16,9),], mvnTest = "royston", multivariatePlot="qq")

## $multivariateNormality
##      Test        H   p value MVN
## 1 Royston 1.098338 0.6271166 YES
## 
## $univariateNormality
##           Test  Variable Statistic   p value Normality
## 1 Shapiro-Wilk    x1        0.9847    0.9439    YES   
## 2 Shapiro-Wilk    x2        0.9664    0.4871    YES   
## 3 Shapiro-Wilk    x3        0.9672    0.5070    YES   
## 4 Shapiro-Wilk    x4        0.9660    0.4779    YES   
## 
## $Descriptives
##     n     Mean  Std.Dev Median  Min  Max    25th    75th       Skew
## x1 28 1865.929 262.1619 1857.5 1325 2403 1711.50 2049.75 0.08994538
## x2 28 1697.964 244.8618 1663.0 1170 2301 1593.25 1847.50 0.39091767
## x3 28 1488.643 253.1536 1466.0 1002 2087 1307.25 1617.25 0.49661284
## x4 28 1710.250 277.9986 1674.5 1176 2234 1528.75 1876.25 0.25921958
##      Kurtosis
## x1 -0.5084972
## x2  0.1961808
## x3  0.0516768
## x4 -0.6484407

Transformations

# yjPower" family (Yeo and Johnson(2000)),
# another modifiation of the Box-Cox family that allows a few negative values.
trans <- powerTransform(stiff, 
                        family = "yjPower")  
summary(trans)

## yjPower Transformations to Multinormality 
##    Est Power Rounded Pwr Wald Lwr Bnd Wald Upr Bnd
## x1    0.0939           1      -0.9911       1.1790
## x2   -0.2802           0      -1.5129       0.9525
## x3    0.1478           1      -0.9585       1.2542
## x4    0.7546           1      -0.5292       2.0385
## 
##  Likelihood ratio test that all transformation parameters are equal to 0
##                                  LRT df    pval
## LR test, lambda = (0 0 0 0) 1.980807  4 0.73929

logstiff = log(stiff)

# Histogram to assess normality
hist(log(x1))

# Scatterplot
plot(log(x1),log(x4),pch=20)
# Add Eclipse
quan_chisq1 <- ellipse(cor(logstiff[,c(1,4)]), 
                       scale = c(sd(log(x1)),
                                 sd(log(x4))), 
                       center=c(0,0), 
                       t = sqrt(qchisq(.5, 2)), 
                       which = c(1, 2), 
                       npoints = 100)
# Add Lines
lines(quan_chisq1[,1] + mean(log(x1)), 
      quan_chisq1[,2] + mean(log(x4)), 
      col="blue", 
      lwd=2)

# Compare to test on untransformed stiff
mvn(logstiff, mvnTest = "royston", multivariatePlot="qq")

## $multivariateNormality
##      Test        H   p value MVN
## 1 Royston 1.652091 0.5200565 YES
## 
## $univariateNormality
##           Test  Variable Statistic   p value Normality
## 1 Shapiro-Wilk    x1        0.9742    0.6577    YES   
## 2 Shapiro-Wilk    x2        0.9626    0.3612    YES   
## 3 Shapiro-Wilk    x3        0.9795    0.8118    YES   
## 4 Shapiro-Wilk    x4        0.9831    0.9005    YES   
## 
## $Descriptives
##     n     Mean   Std.Dev   Median      Min      Max     25th     75th
## x1 30 7.539650 0.1631929 7.529941 7.189168 8.000685 7.447309 7.629120
## x2 30 7.452334 0.1721702 7.426545 7.064759 7.935230 7.374941 7.543648
## x3 30 7.301165 0.1910199 7.290123 6.909753 7.788212 7.166816 7.392488
## x4 30 7.436553 0.1834145 7.423268 7.069874 7.855932 7.326618 7.539424
##         Skew    Kurtosis
## x1 0.3660786  0.69728520
## x2 0.4814651  0.73170914
## x3 0.4107699  0.07436358
## x4 0.1581748 -0.44895116

L4: Inference for Mean Vectors

Sweat Data

Comparing Perspiration Between 2 Groups

Sample Statistics Sweat

We want to see if perspiration in Latin American healthy women in the age group 50-65 is the same as their counterparts in the North America. Perspiration is quantified in terms of the following variables
* X1: Sweat Rate
* X2: Sodium in Sweat
* X3: Potassium in Sweat

rm(list = ls())

sweat <- read.table("resources/data/T5-1.DAT")
head(sweat)

V1	V2	V3
3.7	48.5	9.3
5.7	65.1	8.0
3.8	47.2	10.9
3.2	53.2	12.0
3.1	55.5	9.7
4.6	36.1	7.9

n <- nrow(sweat)
p <- ncol(sweat)
alpha <- 0.05

# Null Hypothesis Mean Values
mu0 <- matrix(c(4,50,10),ncol=1)

# Sample Means
xbar <- as.matrix(colMeans(sweat), ncol = 1)

# Sample Covariance Matrix
S <- cov(sweat)

Hotelling Test

# Calculating Hotelling
## By hand
Tsq <- n*t(xbar-mu0)%*%solve(S)%*%(xbar-mu0)

## By Mahalanobis distance
Tsq <- n*mahalanobis(x = colMeans(sweat), 
                     center = mu0, 
                     cov = cov(sweat))

# Critical point at alpha = 0.05
cri <- (((n-1)*p)/(n-p))*qf((1-alpha),p,n-p) 

# Tsq < cri Fail to reject H0
Tsq > cri

## [1] FALSE

# Compute P value
Tsq.mod <- ((n-p)/p)/(n-1)*Tsq
p.val <- 1 - pf(Tsq.mod, p, (n-p))

test.frame <- data.frame(stat = Tsq,
                         mod.stat = Tsq.mod,
                         crit = cri,
                         reject = (Tsq > cri),
                         reject.mod = (Tsq.mod > cri),
                         p = p.val)
test.frame

stat	mod.stat	crit	reject	reject.mod	p
9.738773	2.904546	10.7186	FALSE	FALSE	0.0649283

# Hotellings T^2 with Function
HotellingsT2(sweat, 
             mu = mu0)

## 
##  Hotelling's one sample T2-test
## 
## data:  sweat
## T.2 = 2.9045, df1 = 3, df2 = 17, p-value = 0.06493
## alternative hypothesis: true location is not equal to c(4,50,10)

Confidence Intervals for \(\mu\)

# One at a time intervals of mu
One <- matrix(NA,3,2)
for (i in 1:3){
  One[i,1] <- xbar[i] - qt(.975,n-1)*sqrt(S[i,i]/n) #left
  One[i,2] <- xbar[i] + qt(.975,n-1)*sqrt(S[i,i]/n) #right
}

## T^2 interval mu
Tsq <- matrix(NA,3,2)
c <- sqrt(((p*(n-1))/(n-p))*qf(.95,p,n-p))
for (i in 1:3){
  Tsq[i,1] <- xbar[i] - c*sqrt(S[i,i]/n)
  Tsq[i,2] <- xbar[i] + c*sqrt(S[i,i]/n)
}

## Bonferroni interval mu when m = 3 (m is the number of CIs being estimated).
m <- 3
Bonf <- matrix(NA,3,2)
for (i in 1:m){
  Bonf[i,1] <- xbar[i] - qt((1-alpha)/(2*m),n-1)*sqrt(S[i,i]/n) #left
  Bonf[i,2] <- xbar[i] + qt((1-alpha)/(2*m),n-1)*sqrt(S[i,i]/n) #right
}

colnames(One) = colnames(Tsq) =colnames(Bonf) =c("lower","upper")
row.names(One)=row.names(Tsq)=row.names(Bonf)=c("SweatRate","Sodium","Potassium")
list(One, Tsq, Bonf)

## [[1]]
##               lower    upper
## SweatRate  3.845840  5.43416
## Sodium    38.784779 52.01522
## Potassium  9.073601 10.85640
## 
## [[2]]
##               lower     upper
## SweatRate  3.397768  5.882232
## Sodium    35.052408 55.747592
## Potassium  8.570664 11.359336
## 
## [[3]]
##               lower     upper
## SweatRate  5.030216  4.249784
## Sodium    48.650435 42.149565
## Potassium 10.402995  9.527005

# For large sample inference, only need to change the critical point or multiplier using Z (replace to t) and chi-square (replace to F).

Waste Water Data

Municipal wastewater treatment plants are required by law to monitor their discharges into rivers and streams on a regular basis. Concern about the reliability of data from one of these self-monitoring program led to a study in which samples of effluent were divided and sent to two laboratories.

Compare two lab’s measurement by two variables:
* X1 & X3: Biochemical oxygen demand (BOD) from two labs
* X2 & X4: Suspended solids (SS) from two labs

Paired Comparison Tests

rm(list = ls())
alpha <- 0.05
ww <- as.matrix(read.table("resources/data/T6-1.dat"))
head(ww)

##      V1 V2 V3 V4
## [1,]  6 27 25 15
## [2,]  6 23 28 13
## [3,] 18 64 36 22
## [4,]  8 44 35 29
## [5,] 11 30 15 31
## [6,] 34 75 44 64

n1 <- nrow(ww)
p1 <- 2

# Calculate differences mu1-mu3 & mu2-mu4 
D <- ww[,1:2] - ww[,3:4] 

## Estimate of mu1-mu2
dbar <- as.matrix(colMeans(D),ncol=1)
Sd <- cov(D)

## Using D
Tsq1 <- n1*t(dbar)%*%solve(Sd)%*%(dbar)
cri1 <- (((n1-1)*p1)/(n1-p1)) * qf((1-alpha), p1, n1 - p1)
pval1 <- 1 - pf(Tsq1*(n1 - p1)/p1/(n1-1), p1, n1 - p1)

# Results
test.frame <- data.frame(stat = Tsq1,
                         crit = cri1,
                         reject = (Tsq1 > cri1),
                         p = pval1)
test.frame

stat	crit	reject	p
13.63931	9.458877	TRUE	0.0208278

Alternate Method Using Matrix

C <- cbind(diag(p1),-diag(p1))
D <- t(C%*%t(ww)) 
dbar <- as.matrix(colMeans(D), ncol=1)
Sd <- cov(D)
Tsq1 <- n1*t( C%*%colMeans(ww))%*%solve(C%*%cov(ww)%*%t(C))%*%(C%*%colMeans(ww)) 

test.frame <- data.frame(stat = Tsq1,
                         crit = cri1,
                         reject = (Tsq1 > cri1),
                         p = pval1)
test.frame

stat	crit	reject	p
13.63931	9.458877	TRUE	0.0208278

Confidence Region/Interval for Delta

plot(D)
CR <- ellipse(x = cor(D), 
              scale = sqrt(diag(Sd)/n1), 
              centre = as.vector(dbar),
              t = sqrt(cri1))
#n1*mahalanobis(as.vector(dbar),as.vector(CR[3,]),cov(D))
lines(CR, 
      type = "l",
      xlab = expression({delta}[1]),
      ylab = expression({delta}[2]), 
      main = expression("95% confidence ellipse"))
points(0,0,pch=20)

## One at a time intervals of for delta1 and delta2
One1 <- matrix(NA,2,2)
for (i in 1:2){
  One1[i,1] <- dbar[i] - qt(.975,n1-1)*sqrt(Sd[i,i]/n1) 
  One1[i,2] <- dbar[i] + qt(.975,n1-1)*sqrt(Sd[i,i]/n1) 
}

## T^2 interval for delta1 and delta2
Tsq1 <- matrix(NA,2,2)
c1 <- sqrt(((p1*(n1-1))/(n1-p1))*qf(.95,p1,n1-p1))
for (i in 1:2){
  Tsq1[i,1] <- dbar[i] - c1*sqrt(Sd[i,i]/n1) 
  Tsq1[i,2] <- dbar[i] + c1*sqrt(Sd[i,i]/n1) 
}

## Bonferroni Interval for delta1 and delta2 when m=2 (m is the number of CIs).
m <- 2
Bonf1 <- matrix(NA,2,2)
for (i in 1:m){
  Bonf1[i,1] <- dbar[i] - qt((1-0.05/(2*m)),n1-1)*sqrt(Sd[i,i]/n1)
  Bonf1[i,2] <- dbar[i] + qt((1-0.05/(2*m)),n1-1)*sqrt(Sd[i,i]/n1)
}

cols <- c("lower","upper")
rows <- c("BOD","SS")

colnames(One1) = colnames(Tsq1) = colnames(Bonf1) = cols
row.names(One1) = row.names(Tsq1) = row.names(Bonf1) = rows
out <- list(One1, Tsq1, Bonf1)
names(out) <- c("One-at-a-time Intervals", "T^2 Intervals", "Bonferroni Simultaneous Intervals")
out

## $`One-at-a-time Intervals`
##           lower     upper
## BOD -18.8467298  0.119457
## SS   -0.4725958 27.018050
## 
## $`T^2 Intervals`
##          lower    upper
## BOD -22.453272  3.72600
## SS   -5.700119 32.24557
## 
## $`Bonferroni Simultaneous Intervals`
##          lower     upper
## BOD -20.573107  1.845835
## SS   -2.974903 29.520358

Turtles Dataset

Two Sample Independent Tests

Measurements on the carapaces of 24 female and 24 male painted turtles:
* X1: Length
* X2: Width
* X3: Height
* X4: Sex (female and male)

rm(list=ls())

alpha <- .05
X <- read.table("resources/data/T6-9.dat", 
                header = FALSE, 
                col.names = c("Length", "Width", "Height", "Sex"))

# log transformation improves Normality.
X[,1:3] <- log(X[,1:3])

X1 <- X[X$Sex=="male",1:3]
X2 <- X[X$Sex=="female",1:3]

n1 <- dim(X1)[1]
n2 <- dim(X2)[1]
p <- dim(X1)[2]

# Testing H0: mu1-mu2 = delta0
delta0 <- rep(0,p)
xbar1 <- colMeans(X1)
xbar2 <- colMeans(X2)
S1 <- var(X1)
S2 <- var(X2)
Sp <- ((n1-1)*S1+(n2-1)*S2)/(n1+n2-2)
T2 <- (n1*n2/(n1+n2))*t(xbar1-xbar2-delta0)%*%solve(Sp)%*%(xbar1-xbar2-delta0)
crit <- (n1 + n2 - 2)*p/(n1 + n2 - p - 1)*qf(.95, p, n1 + n2 - p - 1)

# Linear combination of the mean components most responsiblefor rejection of H_0
a <- solve(Sp)%*%(xbar1 - xbar2)
A <- diag(p)

2-Sample Independent Confidence Intervals

# T^2-intervals
Clower <- t(A)%*%(xbar1 - xbar2) - sqrt(crit) * sqrt(diag((t(A)%*%Sp%*%A)*(1/n1 + 1/n2)))
Cupper <- t(A)%*%(xbar1 - xbar2) + sqrt(crit) * sqrt(diag((t(A)%*%Sp%*%A)*(1/n1 + 1/n2)))
cbind(Clower,Cupper)

##            [,1]        [,2]
## [1,] -0.2926638 -0.05776762
## [2,] -0.2365537 -0.05411666
## [3,] -0.3451377 -0.12906223

# Bonferroni Intervals
crit.b <- abs(qt(alpha/(2*p), (n1 + n2-2)))
Clower.b <- t(A)%*%(xbar1 - xbar2) - crit.b*sqrt(diag((t(A)%*%Sp%*%A)*(1/n1+1/n2)))
Cupper.b <- t(A)%*%(xbar1 - xbar2) + crit.b*sqrt(diag((t(A)%*%Sp%*%A)*(1/n1+1/n2)))
cbind(Clower.b,Cupper.b)

##            [,1]        [,2]
## [1,] -0.2734025 -0.07702893
## [2,] -0.2215940 -0.06907636
## [3,] -0.3274197 -0.14678026

# Assessing the normality and equal covariance assumptions 
par(mfrow = c(1,3))

# 3-d Scatterplot: Ellipsoidal? are the covariances the same?
sp <- scatterplot3d(X1, 
                    xlim = c(4.4,5.3), 
                    ylim = c(4.2,5), 
                    zlim = c(3.4,4.4), 
                    main = "Red = FEMALE")
sp$points3d(X2, col = "red")

#Visually compare the sample variances and covariances between the two groups
cov(X1)

##             Length       Width      Height
## Length 0.011072004 0.008019142 0.008159648
## Width  0.008019142 0.006416726 0.006005271
## Height 0.008159648 0.006005271 0.006772758

cov(X2)

##            Length      Width     Height
## Length 0.02640563 0.02011195 0.02491758
## Width  0.02011195 0.01619045 0.01942430
## Height 0.02491758 0.01942430 0.02493980

Chi-Square Plots

# Chi-square plot for males and females separately.
distances <-function(D){
  dsq <- c()
  M <- colMeans(D) 
  S <- cov(D)
  n <- dim(D)[1]
  p <- dim(D)[2]
  for(i in 1:n){
    dsq <- c(dsq, as.matrix(D[i,]-M)%*%solve(S)%*%as.matrix(t(D[i,]-M)))
  }
  as.numeric(dsq)
}

#Chi-square plot for males
qc <- qchisq( ((1:n1)-1/2)/n1, df = p)
par(mar = c(4,4,1,1))
plot(sort(distances(X1)),
     qc,
     xlab = expression(d[j]^2),
     main="Male")
abline(a = 0, b = 1)

#Chi-square plot for females
qc <- qchisq( ((1:n2)-1/2)/n2, df = p)
par(mar = c(4,4,1,1))
plot(sort(distances(X2)), 
     qc,
     xlab = expression(d[j]^2),
     main = "Female")
abline(a = 0, b = 1)

Tests For Additional Information

X11 <- X[X$Sex=="male",1:2]
X12 <- X[X$Sex=="female",1:2]

# Testing H0: mu1-mu2 = delta0
xbar11 <- colMeans(X11)
xbar12 <- colMeans(X12)
S11 <- var(X11)
S12 <- var(X12)
S1p <- ((n1-1)*S11+(n2-1)*S12)/(n1+n2-2)
T12 <- (n1*n2/(n1+n2))*t(xbar11-xbar12)%*%solve(S1p)%*%(xbar11 - xbar12)

U <- (T2-T12)/(n1+n2-2+T12)
p <- 3
p1 <- 2
p2 <- 1
crit2 <- p2/(n1+n2-p-1)*qf((1-alpha), p2, n1+n2-p-1)

Iris Data

One-Way MANOVA Manually

rm(list=ls())
alpha <- 0.05
D <- read.table("resources/data/T11-5.DAT", header = FALSE)
head(D)

V1	V2	V3	V4	V5
5.1	3.5	1.4	0.2	1
4.9	3.0	1.4	0.2	1
4.7	3.2	1.3	0.2	1
4.6	3.1	1.5	0.2	1
5.0	3.6	1.4	0.2	1
5.4	3.9	1.7	0.4	1

# data must be sorted by the grouping variable
G <- D$V5

# 1,2,3
u <- unique(G); u

## [1] 1 2 3

# number of species (groups)
g <- length(u); g

## [1] 3

# n1,n2,n3
N <- table(G); N

## G
##  1  2  3 
## 50 50 50

# Total sample size 
n <- sum(N)       
X <- as.matrix(D[,c(2,4)])
# dimension
p <- dim(X)[2]    


X1 <- X[G==1,]
X2 <- X[G==2,]
X3 <- X[G==3,]


###
# Getting a feel for the data
##
plot(X1, 
     xlim=c(1,5), 
     ylim=c(0,3), 
     col="red", 
     ylab="Petal", 
     xlab="Sepal",
     main="Width Measurements from Three Species of Iris Plants")
points(X2,
       col="blue")
points(X3,
       col="green")
text(4,0.5,
     "Species 1", 
     col="red")
text(4,1.5,
     "Species 2", 
     col="blue")
text(4,2.25,
     "Species 3", 
     col="green")

# Matrix for B
C1 <- c(1)
for (i in 1:g){
  C1 <- bdiag(C1, (1/N[i])*matrix(1,N[i],N[i]))
}
C1 <- as.matrix(C1[-1,-1]-(1/n)*matrix(1,n,n))

# Matrix for W
C2 <- c(1)
for (i in 1:g){
  C2 <- bdiag(C2, diag(1,N[i])-(1/N[i])*matrix(1,N[i],N[i]))
}
C2 <- as.matrix(C2[-1,-1])


# SS
B <- t(X)%*%C1%*%X
W <- t(X)%*%C2%*%X

# Test Stat
Lambda <- det(W)/det(B+W)

# Lambda has the same distributions as U(2,2,147)
m1 <- g-1; m1

## [1] 2

m2 <- n-g; m2

## [1] 147

((1-sqrt(Lambda))/sqrt(Lambda))* (m2-1)/m1

## [1] 299.936

#Reject H0
qf((1-alpha),2*m1,2*(m2-1))

## [1] 2.402562

One-Way Manova Function

test1 <- manova(cbind(D[,2],D[,4]) ~ as.character(G))
#c("Pillai", "Wilks", "Hotelling-Lawley", "Roy")
summary(test1,'Wilks')

##                  Df    Wilks approx F num Df den Df    Pr(>F)    
## as.character(G)   2 0.038316   299.94      4    292 < 2.2e-16 ***
## Residuals       147                                              
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

summary.aov(test1)

##  Response 1 :
##                  Df Sum Sq Mean Sq F value    Pr(>F)    
## as.character(G)   2 11.345  5.6725   49.16 < 2.2e-16 ***
## Residuals       147 16.962  0.1154                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response 2 :
##                  Df Sum Sq Mean Sq F value    Pr(>F)    
## as.character(G)   2 80.413  40.207  960.01 < 2.2e-16 ***
## Residuals       147  6.157   0.042                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Pairwise Comparison With Hotelling’s T²

# Testing H0:mui-muj=delta0
delta0 <- rep(0,p)
xbar1 <- colMeans(X1) 
xbar2 <- colMeans(X2) 
xbar3 <- colMeans(X3)
Sp <- W/(n-g)

# Number of pairs of mu you are comparing
m <- g*(g-1)/2  
crit <- ((n-g)*p/(n-g-p+1))*qf(1- alpha/m,p,n-g-p+1)

# Between Species 1 and 2
T2 <- (N[1]*N[2]/(N[1]+N[2]))*t(xbar1-xbar2-delta0)%*%solve(Sp)%*%(xbar1-xbar2-delta0)
# Between Species 1 and 3
T2 <- (N[1]*N[3]/(N[1]+N[3]))*t(xbar1-xbar3-delta0)%*%solve(Sp)%*%(xbar1-xbar3-delta0)
# Between Species 2 and 3
T2 <- (N[2]*N[3]/(N[2]+N[3]))*t(xbar2-xbar3-delta0)%*%solve(Sp)%*%(xbar2-xbar3-delta0)

# Bonferoni Intervals

m <- p*g*(g-1)/2
crit.b <- qt(1- alpha/(2*m), n-g)

# Comparing Species 1 and 2
xbar1-xbar2 - crit.b*sqrt((1/N[1]+1/N[2])*diag(Sp))

##         V2         V4 
##  0.4763056 -1.1894645

xbar1-xbar2 + crit.b*sqrt((1/N[1]+1/N[2])*diag(Sp))

##         V2         V4 
##  0.8396944 -0.9705355

#Comparing Species 1 and 3
xbar1-xbar3 - crit.b*sqrt((1/N[1]+1/N[3])*diag(Sp))

##         V2         V4 
##  0.2723056 -1.8894645

xbar1-xbar3 + crit.b*sqrt((1/N[1]+1/N[3])*diag(Sp))

##         V2         V4 
##  0.6356944 -1.6705355

#Comparing Species 2 and 3
xbar2-xbar3 - crit.b*sqrt((1/N[2]+1/N[3])*diag(Sp))

##         V2         V4 
## -0.3856944 -0.8094645

xbar2-xbar3 + crit.b*sqrt((1/N[2]+1/N[3])*diag(Sp))

##          V2          V4 
## -0.02230564 -0.59053548

Plastic Data

Factor 1: Change in the rate of extrusion 0 for Low 10% 1 for High 10%
Factor 2: Amount of additive 0 for Low 1% 1 for High 1.5%
* X1: Tear resistance
* X2: Gloss
* X3: Opacity

Two-Way MANOVA

rm(list=ls())
plastic <- read.table("resources/data/T6-4.dat")
head(plastic)

V2	V3	V4	V5
0	6.5	9.5	4.4
0	6.2	9.9	6.4
0	5.8	9.6	3.0
0	6.5	9.6	4.1
0	6.5	9.2	0.8
1	6.9	9.1	5.7

plastic <- as.list(plastic)
test2 <- manova(cbind(V3,V4,V5)~V1*V2, data = plastic)
summary.aov(test2)

##  Response V3 :
##             Df Sum Sq Mean Sq F value   Pr(>F)   
## V1           1 1.7405 1.74050 15.7868 0.001092 **
## V2           1 0.7605 0.76050  6.8980 0.018330 * 
## V1:V2        1 0.0005 0.00050  0.0045 0.947143   
## Residuals   16 1.7640 0.11025                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response V4 :
##             Df Sum Sq Mean Sq F value  Pr(>F)  
## V1           1 1.3005 1.30050  7.9178 0.01248 *
## V2           1 0.6125 0.61250  3.7291 0.07139 .
## V1:V2        1 0.5445 0.54450  3.3151 0.08740 .
## Residuals   16 2.6280 0.16425                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response V5 :
##             Df Sum Sq Mean Sq F value Pr(>F)
## V1           1  0.421  0.4205  0.1036 0.7517
## V2           1  4.901  4.9005  1.2077 0.2881
## V1:V2        1  3.960  3.9605  0.9760 0.3379
## Residuals   16 64.924  4.0578

#c("Pillai", "Wilks", "Hotelling-Lawley", "Roy")
summary(test2,"Wilks")

##           Df   Wilks approx F num Df den Df   Pr(>F)   
## V1         1 0.38186   7.5543      3     14 0.003034 **
## V2         1 0.52303   4.2556      3     14 0.024745 * 
## V1:V2      1 0.77711   1.3385      3     14 0.301782   
## Residuals 16                                           
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Box’s M Test for Equality of Covariance

boxM(cbind(plastic$V3,plastic$V4,plastic$V5),plastic$V1*plastic$V2)

## 
##  Box's M-test for Homogeneity of Covariance Matrices
## 
## data:  cbind(plastic$V3, plastic$V4, plastic$V5)
## Chi-Sq (approx.) = 7.1269, df = 6, p-value = 0.3093

boxM <-function(data, grouping)  {
    if (!inherits(data, c("data.frame", "matrix")))
      stop("'data' must be a numeric data.frame or matrix!")
    if (length(grouping) != nrow(data))
      stop("incompatible dimensions!")
    dname <- deparse(substitute(data))
    data <- as.matrix(data)
    grouping <- as.factor(as.character(grouping))
    p <- ncol(data)
    nlev <- nlevels(grouping)
    lev <- levels(grouping)
    dfs <- tapply(grouping, grouping, length) - 1
    if (any(dfs < p)) 
      warning("there are one or more levels with less observations than variables!")
    mats <- aux <- list()
    for(i in 1:nlev) {
      mats[[i]] <- cov(data[grouping == lev[i], ])
      aux[[i]] <- mats[[i]] * dfs[i]
    }
    names(mats) <- lev
    pooled <- Reduce("+", aux) / sum(dfs)
    logdet <- log(unlist(lapply(mats, det)))
    minus2logM <- sum(dfs) * log(det(pooled)) - sum(logdet * dfs)
    sum1 <- sum(1 / dfs) 
    Co <- (((2 * p^2) + (3 * p) - 1) / (6 * (p + 1) *
                                          (nlev - 1))) * (sum1 - (1 / sum(dfs)))
    X2 <- minus2logM * (1 - Co)
    dfchi <- (choose(p, 2) + p) * (nlev - 1)
    pval <- pchisq(X2, dfchi, lower.tail = FALSE)
    out <- structure(
      list(statistic = c("Chi-Sq (approx.)" = X2),
           parameter = c(df = dfchi),
           p.value = pval,
           cov = mats, pooled = pooled, logDet = logdet,
           data.name = dname,
           method = " Box's M-test for Homogeneity of Covariance Matrices"
      ),
      class = c("htest", "boxM")
    )
    return(out)
}

Multivariate Analysis Notes

Patrick Oster, Dr. Xaoli Kong

Spring Semester, 2019

Intro

Basic Sample Statistics

Mean; Var-Cov; Correlation Coefficient Matrix

Plots

L1: Matrix

L2: Random Vectors & Moments

L3: Multivariate Normal Distribution

Stiff Data

Testing Normality

Univariate Normality

Bivariate Normality

Chi-Square Plot

Detecting Outliers

Transformations

L4: Inference for Mean Vectors

Sweat Data

Comparing Perspiration Between 2 Groups

Sample Statistics Sweat

Hotelling Test

Confidence Intervals for \(\mu\)

Waste Water Data

Paired Comparison Tests

Alternate Method Using Matrix

Confidence Region/Interval for Delta

Turtles Dataset

Two Sample Independent Tests

2-Sample Independent Confidence Intervals

Chi-Square Plots

Tests For Additional Information

Iris Data

One-Way MANOVA Manually

One-Way Manova Function

Pairwise Comparison With Hotelling’s T²

Plastic Data

Two-Way MANOVA

Box’s M Test for Equality of Covariance

V1	V2	V3	V4	V5
5.1	3.5	1.4	0.2	1
4.9	3.0	1.4	0.2	1
4.7	3.2	1.3	0.2	1
4.6	3.1	1.5	0.2	1
5.0	3.6	1.4	0.2	1
5.4	3.9	1.7	0.4	1

V2	V3	V4	V5
0	6.5	9.5	4.4
0	6.2	9.9	6.4
0	5.8	9.6	3.0
0	6.5	9.6	4.1
0	6.5	9.2	0.8
1	6.9	9.1	5.7

V1	V2	V3	V4	V5
5.1	3.5	1.4	0.2	1
4.9	3.0	1.4	0.2	1
4.7	3.2	1.3	0.2	1
4.6	3.1	1.5	0.2	1
5.0	3.6	1.4	0.2	1
5.4	3.9	1.7	0.4	1

V2	V3	V4	V5
0	6.5	9.5	4.4
0	6.2	9.9	6.4
0	5.8	9.6	3.0
0	6.5	9.6	4.1
0	6.5	9.2	0.8
1	6.9	9.1	5.7

Intro

Basic Sample Statistics

Mean; Var-Cov; Correlation Coefficient Matrix

Plots

L1: Matrix

L2: Random Vectors & Moments

L3: Multivariate Normal Distribution

Stiff Data

Testing Normality

Univariate Normality

Bivariate Normality

Chi-Square Plot

Detecting Outliers

Transformations

L4: Inference for Mean Vectors

Sweat Data

Comparing Perspiration Between 2 Groups

Sample Statistics Sweat

Hotelling Test

Confidence Intervals for \(\mu\)

Waste Water Data

Paired Comparison Tests

Alternate Method Using Matrix

Confidence Region/Interval for Delta

Turtles Dataset

Two Sample Independent Tests

2-Sample Independent Confidence Intervals

Chi-Square Plots

Tests For Additional Information

Iris Data

One-Way MANOVA Manually

One-Way Manova Function

Pairwise Comparison With Hotelling’s T2

Plastic Data

Two-Way MANOVA

Box’s M Test for Equality of Covariance

Pairwise Comparison With Hotelling’s T²

V1	V2	V3	V4	V5
5.1	3.5	1.4	0.2	1
4.9	3.0	1.4	0.2	1
4.7	3.2	1.3	0.2	1
4.6	3.1	1.5	0.2	1
5.0	3.6	1.4	0.2	1
5.4	3.9	1.7	0.4	1

V2	V3	V4	V5
0	6.5	9.5	4.4
0	6.2	9.9	6.4
0	5.8	9.6	3.0
0	6.5	9.6	4.1
0	6.5	9.2	0.8
1	6.9	9.1	5.7