CCA.t <- read.csv("~/Documents/多變量分析/CCA/new_col2000.csv",fileEncoding = "big5")
CCA <- CCA.t[2:7]
rownames(CCA) <- CCA.t[,1]
col <- CCA
head(col ,6)
## Academic Grad_rate SAT_P25 SAT_P75 HS_P10 Accept_rate
## Caltech 4.7 0.85 1420 1570 1.00 0.18
## Harvard 4.9 0.96 1400 1580 0.90 0.12
## MIT 4.9 0.92 1400 1560 0.95 0.22
## Princeton 4.8 0.95 1360 1540 0.93 0.13
## Yale 4.8 0.94 1360 1540 0.95 0.18
## Stanford 4.9 0.92 1360 1540 0.87 0.13
# Then separate the 6 variables into two groups.
# The first group has 3 variables Academic, Grad_rate, and Accept_rate.
# The second group has 3 variables SAT_P25, SAT_P75, and HS_P10.
x<-cbind(col[,1],col[,2],col[,6])
y<-cbind(col[,3],col[,4],col[,5])
# Standardize all variables and apply CCA to the two groups using the R function "cancor".
cxy<-cancor(scale(x,scale=T,center=T),scale(y,scale=T,center=T)) #scale:T & center:T 表示有標準化
cxy
## $cor
## [1] 0.9138012 0.4853202 0.1060088
##
## $xcoef
## [,1] [,2] [,3]
## [1,] 0.08974032 -0.19011912 -0.06046737
## [2,] 0.03330972 0.16560726 -0.22826530
## [3,] -0.04986535 -0.04022887 -0.28403564
##
## $ycoef
## [,1] [,2] [,3]
## [1,] -0.17690301 0.85320991 -0.14228895
## [2,] 0.30493077 -0.84344730 0.01596496
## [3,] 0.03635201 0.05630199 0.16833153
##
## $xcenter
## [1] -5.815454e-16 4.150119e-16 1.902000e-17
##
## $ycenter
## [1] 1.060412e-15 -4.753142e-16 -1.752897e-16
# In order to determine the number of canonical variates so as to provide a meaningful
# interpretation of CCA, one can perform the Wilk’s test for checking the
# “significance of canonical correlations”.
# However, we then have need to check the assumption of “multivariate normality” for all variables.
# This can be done by using the package “MVN”:
##please update your r version to R 3.4.4(or latest)
library(lme4)
## Loading required package: Matrix
library(MVN)
## sROC 0.1-2 loaded
# Running (i) Mardia's; (ii) Henze-Zirkler’s and (iii) Royston's Multivariate Normality Test:
mvn(col,mvnTest = "mardia")
## $multivariateNormality
## Test Statistic p value Result
## 1 Mardia Skewness 90.4959189840481 0.00239757245324619 NO
## 2 Mardia Kurtosis 0.231402031484855 0.817002487092545 YES
## 3 MVN <NA> <NA> NO
##
## $univariateNormality
## Test Variable Statistic p value Normality
## 1 Shapiro-Wilk Academic 0.9459 0.0462 NO
## 2 Shapiro-Wilk Grad_rate 0.9236 0.0079 NO
## 3 Shapiro-Wilk SAT_P25 0.9733 0.4238 YES
## 4 Shapiro-Wilk SAT_P75 0.9775 0.5659 YES
## 5 Shapiro-Wilk HS_P10 0.9147 0.0041 NO
## 6 Shapiro-Wilk Accept_rate 0.9598 0.1453 YES
##
## $Descriptives
## n Mean Std.Dev Median Min Max
## Academic 42 4.1690476 0.46721181 4.150 3.30 4.90
## Grad_rate 42 0.8361905 0.08883947 0.845 0.66 0.96
## SAT_P25 42 1230.9761905 103.29934906 1245.000 1000.00 1420.00
## SAT_P75 42 1423.8095238 89.38686489 1420.000 1230.00 1580.00
## HS_P10 42 0.7811905 0.16300834 0.830 0.37 1.00
## Accept_rate 42 0.4009524 0.18537779 0.385 0.12 0.79
## 25th 75th Skew Kurtosis
## Academic 3.8000 4.6000 0.01057444 -1.3068589
## Grad_rate 0.7725 0.9175 -0.44068064 -1.1407772
## SAT_P25 1170.0000 1290.0000 -0.32704835 -0.5237529
## SAT_P75 1355.0000 1487.5000 -0.18436202 -0.8640461
## HS_P10 0.6900 0.9000 -0.75691767 -0.4339846
## Accept_rate 0.2500 0.5475 0.28337625 -0.9868371
mvn(col,mvnTest = "hz")
## $multivariateNormality
## Test HZ p value MVN
## 1 Henze-Zirkler 1.277839 1.017881e-09 NO
##
## $univariateNormality
## Test Variable Statistic p value Normality
## 1 Shapiro-Wilk Academic 0.9459 0.0462 NO
## 2 Shapiro-Wilk Grad_rate 0.9236 0.0079 NO
## 3 Shapiro-Wilk SAT_P25 0.9733 0.4238 YES
## 4 Shapiro-Wilk SAT_P75 0.9775 0.5659 YES
## 5 Shapiro-Wilk HS_P10 0.9147 0.0041 NO
## 6 Shapiro-Wilk Accept_rate 0.9598 0.1453 YES
##
## $Descriptives
## n Mean Std.Dev Median Min Max
## Academic 42 4.1690476 0.46721181 4.150 3.30 4.90
## Grad_rate 42 0.8361905 0.08883947 0.845 0.66 0.96
## SAT_P25 42 1230.9761905 103.29934906 1245.000 1000.00 1420.00
## SAT_P75 42 1423.8095238 89.38686489 1420.000 1230.00 1580.00
## HS_P10 42 0.7811905 0.16300834 0.830 0.37 1.00
## Accept_rate 42 0.4009524 0.18537779 0.385 0.12 0.79
## 25th 75th Skew Kurtosis
## Academic 3.8000 4.6000 0.01057444 -1.3068589
## Grad_rate 0.7725 0.9175 -0.44068064 -1.1407772
## SAT_P25 1170.0000 1290.0000 -0.32704835 -0.5237529
## SAT_P75 1355.0000 1487.5000 -0.18436202 -0.8640461
## HS_P10 0.6900 0.9000 -0.75691767 -0.4339846
## Accept_rate 0.2500 0.5475 0.28337625 -0.9868371
mvn(col,mvnTest = "royston" ,multivariatePlot = "qq")

## $multivariateNormality
## Test H p value MVN
## 1 Royston 10.06986 0.01368027 NO
##
## $univariateNormality
## Test Variable Statistic p value Normality
## 1 Shapiro-Wilk Academic 0.9459 0.0462 NO
## 2 Shapiro-Wilk Grad_rate 0.9236 0.0079 NO
## 3 Shapiro-Wilk SAT_P25 0.9733 0.4238 YES
## 4 Shapiro-Wilk SAT_P75 0.9775 0.5659 YES
## 5 Shapiro-Wilk HS_P10 0.9147 0.0041 NO
## 6 Shapiro-Wilk Accept_rate 0.9598 0.1453 YES
##
## $Descriptives
## n Mean Std.Dev Median Min Max
## Academic 42 4.1690476 0.46721181 4.150 3.30 4.90
## Grad_rate 42 0.8361905 0.08883947 0.845 0.66 0.96
## SAT_P25 42 1230.9761905 103.29934906 1245.000 1000.00 1420.00
## SAT_P75 42 1423.8095238 89.38686489 1420.000 1230.00 1580.00
## HS_P10 42 0.7811905 0.16300834 0.830 0.37 1.00
## Accept_rate 42 0.4009524 0.18537779 0.385 0.12 0.79
## 25th 75th Skew Kurtosis
## Academic 3.8000 4.6000 0.01057444 -1.3068589
## Grad_rate 0.7725 0.9175 -0.44068064 -1.1407772
## SAT_P25 1170.0000 1290.0000 -0.32704835 -0.5237529
## SAT_P75 1355.0000 1487.5000 -0.18436202 -0.8640461
## HS_P10 0.6900 0.9000 -0.75691767 -0.4339846
## Accept_rate 0.2500 0.5475 0.28337625 -0.9868371
## ==> The above three tests all deny the assumption of multivariate normality!! (what to do?)
# Next, we can do the following: (i) check outliers and/or (ii) check normality
# for each individual variable (then see how to deal with that).
# Let us check outliers first by using the “adjusted robust Mahalanobis distance”:
result <- mvn(data = col ,multivariateOutlierMethod = "adj" ,showNewData = T)

newcol <- result$newData
# Let us remove the 14 identified outliers (although you may not be happy about that)
# and run the Royston's Multivariate Normality Test again:
newcol<-result$newData
mvn(newcol,mvnTest = "royston" ,multivariatePlot = "qq")

## $multivariateNormality
## Test H p value MVN
## 1 Royston 6.127891 0.05244425 YES
##
## $univariateNormality
## Test Variable Statistic p value Normality
## 1 Shapiro-Wilk Academic 0.9409 0.1164 YES
## 2 Shapiro-Wilk Grad_rate 0.8799 0.0040 NO
## 3 Shapiro-Wilk SAT_P25 0.9703 0.5896 YES
## 4 Shapiro-Wilk SAT_P75 0.9821 0.8982 YES
## 5 Shapiro-Wilk HS_P10 0.9227 0.0405 NO
## 6 Shapiro-Wilk Accept_rate 0.9441 0.1403 YES
##
## $Descriptives
## n Mean Std.Dev Median Min Max 25th
## Academic 28 4.2678571 0.47923826 4.350 3.30 4.90 3.9000
## Grad_rate 28 0.8700000 0.07586538 0.900 0.66 0.96 0.8200
## SAT_P25 28 1266.0714286 74.35347439 1260.000 1090.00 1400.00 1227.5000
## SAT_P75 28 1451.4285714 69.32020633 1455.000 1310.00 1580.00 1400.0000
## HS_P10 28 0.7728571 0.13692001 0.805 0.44 0.95 0.6900
## Accept_rate 28 0.3578571 0.17758372 0.340 0.12 0.79 0.2175
## 75th Skew Kurtosis
## Academic 4.7000 -0.36345284 -1.1435883
## Grad_rate 0.9200 -1.06885851 0.2811316
## SAT_P25 1300.0000 -0.14979388 -0.2344259
## SAT_P75 1492.5000 -0.05405034 -0.8724510
## HS_P10 0.8800 -0.73872128 -0.3026430
## Accept_rate 0.4275 0.62317089 -0.3518986
# ==> The new data now agree with the assumption of multivariate normality (by choosing Type I error rate = 0.05).
# If you don’t feel comfortable with excluding 14 outliers (which is about 33.333% of the original data),
# you may check out the normality for each individual variable by looking at its Q-Q plot (or histogram) and see what we can do next.
# Note that the MVN package provides different types of choices for normality test on each variable
# (e.g. Shapiro-Wilk, Cramer-von Mises, Lilliefors (Kolmogorov-Smirnov), Shapiro-Francia and Anderson-Darling).
# Here we choose the popular KS test:
mvn(col ,univariateTest = "Lillie" ,desc = T)
## $multivariateNormality
## Test Statistic p value Result
## 1 Mardia Skewness 90.4959189840481 0.00239757245324619 NO
## 2 Mardia Kurtosis 0.231402031484855 0.817002487092545 YES
## 3 MVN <NA> <NA> NO
##
## $univariateNormality
## Test Variable Statistic p value
## 1 Lilliefors (Kolmogorov-Smirnov) Academic 0.1102 0.2250
## 2 Lilliefors (Kolmogorov-Smirnov) Grad_rate 0.1562 0.0115
## 3 Lilliefors (Kolmogorov-Smirnov) SAT_P25 0.1100 0.2271
## 4 Lilliefors (Kolmogorov-Smirnov) SAT_P75 0.0914 0.5073
## 5 Lilliefors (Kolmogorov-Smirnov) HS_P10 0.1618 0.0074
## 6 Lilliefors (Kolmogorov-Smirnov) Accept_rate 0.0846 0.6330
## Normality
## 1 YES
## 2 NO
## 3 YES
## 4 YES
## 5 NO
## 6 YES
##
## $Descriptives
## n Mean Std.Dev Median Min Max
## Academic 42 4.1690476 0.46721181 4.150 3.30 4.90
## Grad_rate 42 0.8361905 0.08883947 0.845 0.66 0.96
## SAT_P25 42 1230.9761905 103.29934906 1245.000 1000.00 1420.00
## SAT_P75 42 1423.8095238 89.38686489 1420.000 1230.00 1580.00
## HS_P10 42 0.7811905 0.16300834 0.830 0.37 1.00
## Accept_rate 42 0.4009524 0.18537779 0.385 0.12 0.79
## 25th 75th Skew Kurtosis
## Academic 3.8000 4.6000 0.01057444 -1.3068589
## Grad_rate 0.7725 0.9175 -0.44068064 -1.1407772
## SAT_P25 1170.0000 1290.0000 -0.32704835 -0.5237529
## SAT_P75 1355.0000 1487.5000 -0.18436202 -0.8640461
## HS_P10 0.6900 0.9000 -0.75691767 -0.4339846
## Accept_rate 0.2500 0.5475 0.28337625 -0.9868371
library(CCP)
newx<-cbind(newcol[,1], newcol[,2], newcol[,6])
newy<-cbind(newcol[,3], newcol[,4], newcol[,5])
newcxy<-cancor(scale(newx, scale=T, center=T),scale(newy, scale=T, center=T))
rho<-newcxy$cor
p.asym(rho,28,3,3,tstat="Wilks") #/*Here N=28, p=q=3 */
## Wilks' Lambda, using F-approximation (Rao's F):
## stat approx df1 df2 p.value
## 1 to 3: 0.02265883 22.314909 9 53.69282 3.774758e-15
## 2 to 3: 0.48470754 5.018019 4 46.00000 1.929350e-03
## 3 to 3: 0.99555919 0.107055 1 24.00000 7.463590e-01
# If the significance level is chosen to be 5%, then the first two canonical correlations are significant.
# Check out the result again:
newcxy
## $cor
## [1] 0.97634654 0.71633118 0.06663944
##
## $xcoef
## [,1] [,2] [,3]
## [1,] -0.07542668 0.21667817 0.09637717
## [2,] -0.08301614 -0.22577702 0.24275370
## [3,] 0.05916659 -0.02854145 0.33801594
##
## $ycoef
## [,1] [,2] [,3]
## [1,] 0.1404316 -0.6790589 -0.6334100
## [2,] -0.1706895 0.8781599 0.3215675
## [3,] -0.1628499 -0.2073210 0.2585776
##
## $xcenter
## [1] -7.657565e-16 4.658972e-17 -1.028443e-16
##
## $ycenter
## [1] -1.309221e-15 1.420491e-15 4.308310e-16
# Let us visualize the result in the first canonical variate space:
xx<-scale(newx,scale=T,center=T)
yy<-scale(newy,scale=T,center=T)
scorex<-xx%*%newcxy$xcoef[,1]
scorey<-yy%*%newcxy$ycoef[,1]
plot(scorex,scorey,type="n")
text(scorex,scorey,row.names(newcol),cex=.6)
