多變量分析

CCA.t <- read.csv("~/Documents/多變量分析/CCA/new_col2000.csv",fileEncoding = "big5")
CCA <- CCA.t[2:7]
rownames(CCA) <- CCA.t[,1] 
col <- CCA

head(col ,6)

##           Academic Grad_rate SAT_P25 SAT_P75 HS_P10 Accept_rate
## Caltech        4.7      0.85    1420    1570   1.00        0.18
## Harvard        4.9      0.96    1400    1580   0.90        0.12
## MIT            4.9      0.92    1400    1560   0.95        0.22
## Princeton      4.8      0.95    1360    1540   0.93        0.13
## Yale           4.8      0.94    1360    1540   0.95        0.18
## Stanford       4.9      0.92    1360    1540   0.87        0.13

# Then separate the 6 variables into two groups.
# The first group has 3 variables Academic, Grad_rate, and Accept_rate. 
# The second group has 3 variables SAT_P25, SAT_P75, and HS_P10.


x<-cbind(col[,1],col[,2],col[,6]) 
y<-cbind(col[,3],col[,4],col[,5])

# Standardize all variables and apply CCA to the two groups using the R function "cancor".

cxy<-cancor(scale(x,scale=T,center=T),scale(y,scale=T,center=T)) #scale:T & center:T 表示有標準化

cxy

## $cor
## [1] 0.9138012 0.4853202 0.1060088
## 
## $xcoef
##             [,1]        [,2]        [,3]
## [1,]  0.08974032 -0.19011912 -0.06046737
## [2,]  0.03330972  0.16560726 -0.22826530
## [3,] -0.04986535 -0.04022887 -0.28403564
## 
## $ycoef
##             [,1]        [,2]        [,3]
## [1,] -0.17690301  0.85320991 -0.14228895
## [2,]  0.30493077 -0.84344730  0.01596496
## [3,]  0.03635201  0.05630199  0.16833153
## 
## $xcenter
## [1] -5.815454e-16  4.150119e-16  1.902000e-17
## 
## $ycenter
## [1]  1.060412e-15 -4.753142e-16 -1.752897e-16

# In order to determine the number of canonical variates so as to provide a meaningful 
# interpretation of CCA, one can perform the Wilk’s test for checking the 
# “significance of canonical correlations”.
# However, we then have need to check the assumption of “multivariate normality” for all variables.
# This can be done by using the package “MVN”:

##please update your r version to R 3.4.4(or latest)
library(lme4)

## Loading required package: Matrix

library(MVN)

## sROC 0.1-2 loaded

# Running (i) Mardia's; (ii) Henze-Zirkler’s and (iii) Royston's Multivariate Normality Test:
  

mvn(col,mvnTest = "mardia")

## $multivariateNormality
##              Test         Statistic             p value Result
## 1 Mardia Skewness  90.4959189840481 0.00239757245324619     NO
## 2 Mardia Kurtosis 0.231402031484855   0.817002487092545    YES
## 3             MVN              <NA>                <NA>     NO
## 
## $univariateNormality
##           Test    Variable Statistic   p value Normality
## 1 Shapiro-Wilk  Academic      0.9459    0.0462    NO    
## 2 Shapiro-Wilk  Grad_rate     0.9236    0.0079    NO    
## 3 Shapiro-Wilk   SAT_P25      0.9733    0.4238    YES   
## 4 Shapiro-Wilk   SAT_P75      0.9775    0.5659    YES   
## 5 Shapiro-Wilk   HS_P10       0.9147    0.0041    NO    
## 6 Shapiro-Wilk Accept_rate    0.9598    0.1453    YES   
## 
## $Descriptives
##              n         Mean      Std.Dev   Median     Min     Max
## Academic    42    4.1690476   0.46721181    4.150    3.30    4.90
## Grad_rate   42    0.8361905   0.08883947    0.845    0.66    0.96
## SAT_P25     42 1230.9761905 103.29934906 1245.000 1000.00 1420.00
## SAT_P75     42 1423.8095238  89.38686489 1420.000 1230.00 1580.00
## HS_P10      42    0.7811905   0.16300834    0.830    0.37    1.00
## Accept_rate 42    0.4009524   0.18537779    0.385    0.12    0.79
##                  25th      75th        Skew   Kurtosis
## Academic       3.8000    4.6000  0.01057444 -1.3068589
## Grad_rate      0.7725    0.9175 -0.44068064 -1.1407772
## SAT_P25     1170.0000 1290.0000 -0.32704835 -0.5237529
## SAT_P75     1355.0000 1487.5000 -0.18436202 -0.8640461
## HS_P10         0.6900    0.9000 -0.75691767 -0.4339846
## Accept_rate    0.2500    0.5475  0.28337625 -0.9868371

mvn(col,mvnTest = "hz")

## $multivariateNormality
##            Test       HZ      p value MVN
## 1 Henze-Zirkler 1.277839 1.017881e-09  NO
## 
## $univariateNormality
##           Test    Variable Statistic   p value Normality
## 1 Shapiro-Wilk  Academic      0.9459    0.0462    NO    
## 2 Shapiro-Wilk  Grad_rate     0.9236    0.0079    NO    
## 3 Shapiro-Wilk   SAT_P25      0.9733    0.4238    YES   
## 4 Shapiro-Wilk   SAT_P75      0.9775    0.5659    YES   
## 5 Shapiro-Wilk   HS_P10       0.9147    0.0041    NO    
## 6 Shapiro-Wilk Accept_rate    0.9598    0.1453    YES   
## 
## $Descriptives
##              n         Mean      Std.Dev   Median     Min     Max
## Academic    42    4.1690476   0.46721181    4.150    3.30    4.90
## Grad_rate   42    0.8361905   0.08883947    0.845    0.66    0.96
## SAT_P25     42 1230.9761905 103.29934906 1245.000 1000.00 1420.00
## SAT_P75     42 1423.8095238  89.38686489 1420.000 1230.00 1580.00
## HS_P10      42    0.7811905   0.16300834    0.830    0.37    1.00
## Accept_rate 42    0.4009524   0.18537779    0.385    0.12    0.79
##                  25th      75th        Skew   Kurtosis
## Academic       3.8000    4.6000  0.01057444 -1.3068589
## Grad_rate      0.7725    0.9175 -0.44068064 -1.1407772
## SAT_P25     1170.0000 1290.0000 -0.32704835 -0.5237529
## SAT_P75     1355.0000 1487.5000 -0.18436202 -0.8640461
## HS_P10         0.6900    0.9000 -0.75691767 -0.4339846
## Accept_rate    0.2500    0.5475  0.28337625 -0.9868371

mvn(col,mvnTest = "royston" ,multivariatePlot = "qq")

## $multivariateNormality
##      Test        H    p value MVN
## 1 Royston 10.06986 0.01368027  NO
## 
## $univariateNormality
##           Test    Variable Statistic   p value Normality
## 1 Shapiro-Wilk  Academic      0.9459    0.0462    NO    
## 2 Shapiro-Wilk  Grad_rate     0.9236    0.0079    NO    
## 3 Shapiro-Wilk   SAT_P25      0.9733    0.4238    YES   
## 4 Shapiro-Wilk   SAT_P75      0.9775    0.5659    YES   
## 5 Shapiro-Wilk   HS_P10       0.9147    0.0041    NO    
## 6 Shapiro-Wilk Accept_rate    0.9598    0.1453    YES   
## 
## $Descriptives
##              n         Mean      Std.Dev   Median     Min     Max
## Academic    42    4.1690476   0.46721181    4.150    3.30    4.90
## Grad_rate   42    0.8361905   0.08883947    0.845    0.66    0.96
## SAT_P25     42 1230.9761905 103.29934906 1245.000 1000.00 1420.00
## SAT_P75     42 1423.8095238  89.38686489 1420.000 1230.00 1580.00
## HS_P10      42    0.7811905   0.16300834    0.830    0.37    1.00
## Accept_rate 42    0.4009524   0.18537779    0.385    0.12    0.79
##                  25th      75th        Skew   Kurtosis
## Academic       3.8000    4.6000  0.01057444 -1.3068589
## Grad_rate      0.7725    0.9175 -0.44068064 -1.1407772
## SAT_P25     1170.0000 1290.0000 -0.32704835 -0.5237529
## SAT_P75     1355.0000 1487.5000 -0.18436202 -0.8640461
## HS_P10         0.6900    0.9000 -0.75691767 -0.4339846
## Accept_rate    0.2500    0.5475  0.28337625 -0.9868371

## ==> The above three tests all deny the assumption of multivariate normality!! (what to do?)


# Next, we can do the following: (i) check outliers and/or (ii) check normality 
# for each individual variable (then see how to deal with that).
# Let us check outliers first by using the “adjusted robust Mahalanobis distance”:
  
result <- mvn(data = col ,multivariateOutlierMethod = "adj" ,showNewData = T)

newcol <- result$newData

# Let us remove the 14 identified outliers (although you may not be happy about that) 
# and run the Royston's Multivariate Normality Test again:

newcol<-result$newData

mvn(newcol,mvnTest = "royston" ,multivariatePlot = "qq")

## $multivariateNormality
##      Test        H    p value MVN
## 1 Royston 6.127891 0.05244425 YES
## 
## $univariateNormality
##           Test    Variable Statistic   p value Normality
## 1 Shapiro-Wilk  Academic      0.9409    0.1164    YES   
## 2 Shapiro-Wilk  Grad_rate     0.8799    0.0040    NO    
## 3 Shapiro-Wilk   SAT_P25      0.9703    0.5896    YES   
## 4 Shapiro-Wilk   SAT_P75      0.9821    0.8982    YES   
## 5 Shapiro-Wilk   HS_P10       0.9227    0.0405    NO    
## 6 Shapiro-Wilk Accept_rate    0.9441    0.1403    YES   
## 
## $Descriptives
##              n         Mean     Std.Dev   Median     Min     Max      25th
## Academic    28    4.2678571  0.47923826    4.350    3.30    4.90    3.9000
## Grad_rate   28    0.8700000  0.07586538    0.900    0.66    0.96    0.8200
## SAT_P25     28 1266.0714286 74.35347439 1260.000 1090.00 1400.00 1227.5000
## SAT_P75     28 1451.4285714 69.32020633 1455.000 1310.00 1580.00 1400.0000
## HS_P10      28    0.7728571  0.13692001    0.805    0.44    0.95    0.6900
## Accept_rate 28    0.3578571  0.17758372    0.340    0.12    0.79    0.2175
##                  75th        Skew   Kurtosis
## Academic       4.7000 -0.36345284 -1.1435883
## Grad_rate      0.9200 -1.06885851  0.2811316
## SAT_P25     1300.0000 -0.14979388 -0.2344259
## SAT_P75     1492.5000 -0.05405034 -0.8724510
## HS_P10         0.8800 -0.73872128 -0.3026430
## Accept_rate    0.4275  0.62317089 -0.3518986

# ==> The new data now agree with the assumption of multivariate normality (by choosing Type I error rate = 0.05).
# If you don’t feel comfortable with excluding 14 outliers (which is about 33.333% of the original data), 
# you may check out the normality for each individual variable by looking at its Q-Q plot (or histogram) and see what we can do next.
# Note that the MVN package provides different types of choices for normality test on each variable 
# (e.g. Shapiro-Wilk, Cramer-von Mises, Lilliefors (Kolmogorov-Smirnov), Shapiro-Francia and Anderson-Darling).
# Here we choose the popular KS test:

mvn(col ,univariateTest = "Lillie" ,desc = T)

## $multivariateNormality
##              Test         Statistic             p value Result
## 1 Mardia Skewness  90.4959189840481 0.00239757245324619     NO
## 2 Mardia Kurtosis 0.231402031484855   0.817002487092545    YES
## 3             MVN              <NA>                <NA>     NO
## 
## $univariateNormality
##                              Test    Variable Statistic   p value
## 1 Lilliefors (Kolmogorov-Smirnov)  Academic      0.1102    0.2250
## 2 Lilliefors (Kolmogorov-Smirnov)  Grad_rate     0.1562    0.0115
## 3 Lilliefors (Kolmogorov-Smirnov)   SAT_P25      0.1100    0.2271
## 4 Lilliefors (Kolmogorov-Smirnov)   SAT_P75      0.0914    0.5073
## 5 Lilliefors (Kolmogorov-Smirnov)   HS_P10       0.1618    0.0074
## 6 Lilliefors (Kolmogorov-Smirnov) Accept_rate    0.0846    0.6330
##   Normality
## 1    YES   
## 2    NO    
## 3    YES   
## 4    YES   
## 5    NO    
## 6    YES   
## 
## $Descriptives
##              n         Mean      Std.Dev   Median     Min     Max
## Academic    42    4.1690476   0.46721181    4.150    3.30    4.90
## Grad_rate   42    0.8361905   0.08883947    0.845    0.66    0.96
## SAT_P25     42 1230.9761905 103.29934906 1245.000 1000.00 1420.00
## SAT_P75     42 1423.8095238  89.38686489 1420.000 1230.00 1580.00
## HS_P10      42    0.7811905   0.16300834    0.830    0.37    1.00
## Accept_rate 42    0.4009524   0.18537779    0.385    0.12    0.79
##                  25th      75th        Skew   Kurtosis
## Academic       3.8000    4.6000  0.01057444 -1.3068589
## Grad_rate      0.7725    0.9175 -0.44068064 -1.1407772
## SAT_P25     1170.0000 1290.0000 -0.32704835 -0.5237529
## SAT_P75     1355.0000 1487.5000 -0.18436202 -0.8640461
## HS_P10         0.6900    0.9000 -0.75691767 -0.4339846
## Accept_rate    0.2500    0.5475  0.28337625 -0.9868371

library(CCP)


newx<-cbind(newcol[,1], newcol[,2], newcol[,6])
newy<-cbind(newcol[,3], newcol[,4], newcol[,5])

newcxy<-cancor(scale(newx, scale=T, center=T),scale(newy, scale=T, center=T))

rho<-newcxy$cor

p.asym(rho,28,3,3,tstat="Wilks") #/*Here N=28, p=q=3 */

## Wilks' Lambda, using F-approximation (Rao's F):
##                stat    approx df1      df2      p.value
## 1 to 3:  0.02265883 22.314909   9 53.69282 3.774758e-15
## 2 to 3:  0.48470754  5.018019   4 46.00000 1.929350e-03
## 3 to 3:  0.99555919  0.107055   1 24.00000 7.463590e-01

# If the significance level is chosen to be 5%, then the first two canonical correlations are significant.
# Check out the result again:


newcxy

## $cor
## [1] 0.97634654 0.71633118 0.06663944
## 
## $xcoef
##             [,1]        [,2]       [,3]
## [1,] -0.07542668  0.21667817 0.09637717
## [2,] -0.08301614 -0.22577702 0.24275370
## [3,]  0.05916659 -0.02854145 0.33801594
## 
## $ycoef
##            [,1]       [,2]       [,3]
## [1,]  0.1404316 -0.6790589 -0.6334100
## [2,] -0.1706895  0.8781599  0.3215675
## [3,] -0.1628499 -0.2073210  0.2585776
## 
## $xcenter
## [1] -7.657565e-16  4.658972e-17 -1.028443e-16
## 
## $ycenter
## [1] -1.309221e-15  1.420491e-15  4.308310e-16

# Let us visualize the result in the first canonical variate space:


xx<-scale(newx,scale=T,center=T) 
yy<-scale(newy,scale=T,center=T) 
scorex<-xx%*%newcxy$xcoef[,1] 
scorey<-yy%*%newcxy$ycoef[,1] 
plot(scorex,scorey,type="n")
text(scorex,scorey,row.names(newcol),cex=.6)

多變量分析

Yen

2018/3/29