library(knitr)
library(GGally)
## Warning: package 'GGally' was built under R version 4.4.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.4.3
library(RVAideMemoire)
## Warning: package 'RVAideMemoire' was built under R version 4.4.3
## *** Package RVAideMemoire v 0.9-83-12 ***
library(car)
## Warning: package 'car' was built under R version 4.4.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.4.3
library(CCA)
## Warning: package 'CCA' was built under R version 4.4.3
## Loading required package: fda
## Warning: package 'fda' was built under R version 4.4.3
## Loading required package: splines
## Loading required package: fds
## Warning: package 'fds' was built under R version 4.4.3
## Loading required package: rainbow
## Warning: package 'rainbow' was built under R version 4.4.3
## Loading required package: MASS
## Loading required package: pcaPP
## Warning: package 'pcaPP' was built under R version 4.4.3
## Loading required package: RCurl
## Warning: package 'RCurl' was built under R version 4.4.3
## Loading required package: deSolve
## Warning: package 'deSolve' was built under R version 4.4.3
##
## Attaching package: 'fda'
## The following object is masked from 'package:graphics':
##
## matplot
## Loading required package: fields
## Warning: package 'fields' was built under R version 4.4.3
## Loading required package: spam
## Warning: package 'spam' was built under R version 4.4.3
## Spam version 2.11-1 (2025-01-20) is loaded.
## Type 'help( Spam)' or 'demo( spam)' for a short introduction
## and overview of this package.
## Help for individual functions is also obtained by adding the
## suffix '.spam' to the function name, e.g. 'help( chol.spam)'.
##
## Attaching package: 'spam'
## The following objects are masked from 'package:base':
##
## backsolve, forwardsolve
## Loading required package: viridisLite
## Loading required package: RColorBrewer
##
## Try help(fields) to get started.
library(corrplot)
## corrplot 0.95 loaded
library(candisc)
## Warning: package 'candisc' was built under R version 4.4.3
## Loading required package: heplots
## Warning: package 'heplots' was built under R version 4.4.3
## Loading required package: broom
##
## Attaching package: 'broom'
## The following object is masked from 'package:RVAideMemoire':
##
## bootstrap
##
## Attaching package: 'candisc'
## The following object is masked from 'package:stats':
##
## cancor
library(yacca)
## Warning: package 'yacca' was built under R version 4.4.3
## yacca: Yet Another Canonical Correlation Analysis Package
## Version 1.4-2 created on 2022-03-08.
## copyright (c) 2008, Carter T. Butts, University of California-Irvine
## For citation information, type citation("yacca").
## Type help("yacca-package") to get started.
library(CCP)
data_CCA <- as.data.frame(state.x77)
CCA_X <- data_CCA[, c("Income", "Illiteracy", "HS Grad")]
CCA_Y <- data_CCA[, c("Life Exp", "Murder", "Population")]
data_CCA
sum(is.na(data_CCA))
## [1] 0
Terbukti bahwa tidak ada missing value
ggpairs(CCA_X)
det(cor(CCA_X))
## [1] 0.3488921
eigen(cor(CCA_X))$values
## [1] 2.1478324 0.5643164 0.2878512
Variabel dalam set X menunjukkan korelasi sedang hingga kuat, namun tidak ekstrem. Nilai determinan matriks korelasi sebesar 0,3489 (> 0,01) serta seluruh eigenvalue yang bernilai positif dan tidak mendekati nol menunjukkan tidak adanya multikolinearitas maupun singularitas.
ggpairs(CCA_Y)
det(cor(CCA_Y))
## [1] 0.3040795
eigen(cor(CCA_Y))$values
## [1] 1.8797469 0.9499670 0.1702861
Variabel dalam set Y memiliki pola korelasi yang bervariasi. Nilai determinan matriks korelasi sebesar 0,3041 (> 0,01) dan eigenvalue yang seluruhnya lebih besar dari nol mengindikasikan matriks korelasi tidak singular dan bebas dari multikolinearitas.
ggpairs(data_CCA)
mqqnorm(CCA_X, main = "Multi-normal Q-Q Plot X")
## Alaska New Mexico
## 2 31
Berdasarkan QQ-plot normalitas ganda peubah X, terdapat beberapa titik yang tidak menyebar di sekitar garis lurus, khususnya Alaska dan New Mexico.
mqqnorm(CCA_Y, main = "Multi-normal Q-Q Plot Y")
## California Hawaii
## 5 11
Berdasarkan QQ-plot normalitas ganda peubah X, terdapat beberapa titik yang tidak menyebar di sekitar garis lurus, serta dengan beberapa observasi ekstrem, yaitu California dan Hawaii.
mqqnorm(cbind(CCA_X, CCA_Y), main = "Multi-normal Q-Q Plot X dan Y")
## Alaska California
## 2 5
Berdasarkan QQ-plot normalitas ganda peubah X dan Y, diperoleh bahwa titik-titik menyebar di sekitar garis diagonal, namun terdapat titik-titik yang tidak menyebar.
correls_XY <- matcor(CCA_X, CCA_Y)
kable(correls_XY$Xcor)
| Income | Illiteracy | HS Grad | |
|---|---|---|---|
| Income | 1.0000000 | -0.4370752 | 0.6199323 |
| Illiteracy | -0.4370752 | 1.0000000 | -0.6571886 |
| HS Grad | 0.6199323 | -0.6571886 | 1.0000000 |
corrplot(correls_XY$Xcor, method = "color", addCoef.col ='white', is.cor = T, type = "lower", diag = F)
kable(correls_XY$Ycor)
| Life Exp | Murder | Population | |
|---|---|---|---|
| Life Exp | 1.0000000 | -0.7808458 | -0.0680520 |
| Murder | -0.7808458 | 1.0000000 | 0.3436428 |
| Population | -0.0680520 | 0.3436428 | 1.0000000 |
corrplot(correls_XY$Ycor, method = "color", addCoef.col ='black', is.cor = T, type = "lower", diag = F)
###Korelasi XY
kable(correls_XY$XYcor)
| Income | Illiteracy | HS Grad | Life Exp | Murder | Population | |
|---|---|---|---|---|---|---|
| Income | 1.0000000 | -0.4370752 | 0.6199323 | 0.3402553 | -0.2300776 | 0.2082276 |
| Illiteracy | -0.4370752 | 1.0000000 | -0.6571886 | -0.5884779 | 0.7029752 | 0.1076224 |
| HS Grad | 0.6199323 | -0.6571886 | 1.0000000 | 0.5822162 | -0.4879710 | -0.0984897 |
| Life Exp | 0.3402553 | -0.5884779 | 0.5822162 | 1.0000000 | -0.7808458 | -0.0680520 |
| Murder | -0.2300776 | 0.7029752 | -0.4879710 | -0.7808458 | 1.0000000 | 0.3436428 |
| Population | 0.2082276 | 0.1076224 | -0.0984897 | -0.0680520 | 0.3436428 | 1.0000000 |
corrplot(correls_XY$XYcor, method = "color", addCoef.col ='white', is.cor = T, type = "lower", diag = F)
img.matcor(correls_XY, type = 2)
#Korelasi Kanonik
cca1 <- cancor(CCA_X, CCA_Y)
summary(cca1)
##
## Canonical correlation analysis of:
## 3 X variables: Income, Illiteracy, HS Grad
## with 3 Y variables: Life Exp, Murder, Population
##
## CanR CanRSQ Eigen percent cum scree
## 1 0.7215 0.5206 1.0857 76.646 76.65 ******************************
## 2 0.4157 0.1728 0.2090 14.752 91.40 ******
## 3 0.3296 0.1086 0.1219 8.602 100.00 ***
##
## Test of H0: The canonical correlations in the
## current row and all that follow are zero
##
## CanR LR test stat approx F numDF denDF Pr(> F)
## 1 0.72150 0.35350 6.3516 9 107.23 3.527e-07 ***
## 2 0.41575 0.73731 3.7034 4 90.00 0.007745 **
## 3 0.32957 0.89138 5.6052 1 46.00 0.022169 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Raw canonical coefficients
##
## X variables:
## Xcan1 Xcan2 Xcan3
## Income 0.00020061 0.00075453 -0.0019243
## Illiteracy 1.52112426 -1.46642271 -0.5343688
## HS Grad -0.02235599 -0.18555039 0.0247592
##
## Y variables:
## Ycan1 Ycan2 Ycan3
## Life Exp -9.0472e-02 -1.17227258 -0.47648787
## Murder 2.5371e-01 -0.40642558 -0.10322327
## Population -2.9473e-05 0.00017121 -0.00018497
cc1 <- cc(CCA_X, CCA_Y)
cc1$cor
## [1] 0.7214954 0.4157492 0.3295704
Dengan demikian, meskipun secara statistik seluruh fungsi kanonik signifikan, fungsi kanonik pertama merupakan yang paling relevan dan dominan dalam menjelaskan hubungan antara gugus variabel X dan gugus variabel Y.
cc1[3:4]
## $xcoef
## [,1] [,2] [,3]
## Income 0.0002006131 0.0007545267 -0.001924318
## Illiteracy 1.5211242579 -1.4664227122 -0.534368841
## HS Grad -0.0223559907 -0.1855503878 0.024759154
##
## $ycoef
## [,1] [,2] [,3]
## Life Exp -9.047223e-02 -1.1722725752 -0.4764878738
## Murder 2.537086e-01 -0.4064255818 -0.1032232656
## Population -2.947339e-05 0.0001712075 -0.0001849682
Koefisien kanonik mentah menunjukkan bahwa pada fungsi kanonik pertama, variabel Illiteracy dan HS Grad merupakan kontributor utama dalam pembentukan peubah kanonik pada set X, sedangkan pada set Y didominasi oleh variabel Murder dan Life Expectancy.
#Menghitung loading kanonik
cc1_comp <- comput(CCA_X, CCA_Y, cc1)
cc1_comp$corr.X.xscores
## [,1] [,2] [,3]
## Income -0.3939155 -0.07478012 -0.91609963
## Illiteracy 0.9919651 -0.11155406 0.05967344
## HS Grad -0.7134790 -0.62385139 -0.31899392
corrplot(cc1_comp$corr.X.xscores, method = "color", addCoef.col ='white', is.cor = T)
cc1_comp$corr.Y.yscores
## [,1] [,2] [,3]
## Life Exp -0.8438158 -0.454135838 -0.2858942
## Murder 0.9861909 -0.008892474 -0.1653735
## Population 0.1985285 0.355864876 -0.9132068
corrplot(cc1_comp$corr.Y.yscores, method = "color", addCoef.col ='yellow', is.cor = T)
##Korelasi silang X dengan fungsi kanonik Y:
cc1_comp$corr.X.yscores
## [,1] [,2] [,3]
## Income -0.2842082 -0.03108978 -0.3019193
## Illiteracy 0.7156983 -0.04637851 0.0196666
## HS Grad -0.5147718 -0.25936570 -0.1051309
corrplot(cc1_comp$corr.X.yscores, method = "color", addCoef.col ='white', is.cor = T)
cc1_comp$corr.Y.xscores
## [,1] [,2] [,3]
## Life Exp -0.6088092 -0.188806603 -0.09422227
## Murder 0.7115322 -0.003697039 -0.05450222
## Population 0.1432374 0.147950531 -0.30096590
corrplot(cc1_comp$corr.Y.xscores, method = "color", addCoef.col ='green', is.cor = T)
rho <- cc1$cor
n <- dim(CCA_X)[1]
p <- length(CCA_X)
q <- length(CCA_Y)
p.asym(rho, n, p, q, tstat = "Wilks")
## Wilks' Lambda, using F-approximation (Rao's F):
## stat approx df1 df2 p.value
## 1 to 3: 0.3534992 6.351574 9 107.235 3.527369e-07
## 2 to 3: 0.7373101 3.703387 4 90.000 7.744693e-03
## 3 to 3: 0.8913834 5.605181 1 46.000 2.216881e-02
p.asym(rho, n, p, q, tstat = "Hotelling")
## Hotelling-Lawley Trace, using F-approximation:
## stat approx df1 df2 p.value
## 1 to 3: 1.4165661 6.715573 9 128 7.600366e-08
## 2 to 3: 0.3308185 3.694140 4 134 6.920293e-03
## 3 to 3: 0.1218518 5.686416 1 140 1.843680e-02
p.asym(rho, n, p, q, tstat = "Pillai")
## Pillai-Bartlett Trace, using F-approximation:
## stat approx df1 df2 p.value
## 1 to 3: 0.8020196 5.594970 9 138 1.422475e-06
## 2 to 3: 0.2814640 3.727265 4 144 6.441474e-03
## 3 to 3: 0.1086166 5.634844 1 150 1.887313e-02
p.asym(rho, n, p, q, tstat = "Roy")
## Roy's Largest Root, using F-approximation:
## stat approx df1 df2 p.value
## 1 to 1: 0.5205556 16.64813 3 46 1.834946e-07
##
## F statistic for Roy's Greatest Root is an upper bound.
Hasil pengujian signifikansi menggunakan Wilks’ Lambda, Hotelling–Lawley Trace, Pillai–Bartlett Trace, dan Roy’s Largest Root secara konsisten menunjukkan bahwa korelasi kanonik pertama berbeda nyata dengan nol (p < 0,01). Selain itu, pengujian parsial menunjukkan bahwa korelasi kanonik kedua dan ketiga juga signifikan pada taraf nyata 5%, yang mengindikasikan bahwa terdapat lebih dari satu hubungan kanonik yang bermakna antara gugus variabel X dan gugus variabel Y.
cca1
##
## Canonical correlation analysis of:
## 3 X variables: Income, Illiteracy, HS Grad
## with 3 Y variables: Life Exp, Murder, Population
##
## CanR CanRSQ Eigen percent cum scree
## 1 0.7215 0.5206 1.0857 76.646 76.65 ******************************
## 2 0.4157 0.1728 0.2090 14.752 91.40 ******
## 3 0.3296 0.1086 0.1219 8.602 100.00 ***
##
## Test of H0: The canonical correlations in the
## current row and all that follow are zero
##
## CanR LR test stat approx F numDF denDF Pr(> F)
## 1 0.72150 0.35350 6.3516 9 107.23 3.527e-07 ***
## 2 0.41575 0.73731 3.7034 4 90.00 0.007745 **
## 3 0.32957 0.89138 5.6052 1 46.00 0.022169 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Berdasarkan LR test, diperoleh bahwa korelasi kanonik pertama dan kedua berbeda nyata dengan nol pada taraf nyata 1% (Pr < α, dengan α = 0,01), sedangkan korelasi kanonik ketiga tidak berbeda nyata pada taraf nyata 1% namun signifikan pada taraf nyata 5%. Hal ini menunjukkan bahwa hubungan utama antara gugus variabel X dan gugus variabel Y terutama dijelaskan oleh dua fungsi kanonik pertama, dengan fungsi kanonik pertama sebagai yang paling dominan karena menjelaskan 76,65% dari total keragaman kanonik.
coef_X <- diag(sqrt(diag(cov(CCA_X))))
coef_X %*% cc1$xcoef
## [,1] [,2] [,3]
## [1,] 0.1232707 0.4636340 -1.1824356
## [2,] 0.9271756 -0.8938332 -0.3257155
## [3,] -0.1805693 -1.4986901 0.1999796
coef_Y <- diag(sqrt(diag(cov(CCA_Y))))
coef_Y %*% cc1$ycoef
## [,1] [,2] [,3]
## [1,] -0.1214493 -1.5736511 -0.6396342
## [2,] 0.9365755 -1.5003362 -0.3810528
## [3,] -0.1315837 0.7643545 -0.8257891
$$ U1 = 0.1233X_{1} +0.92717X_2 -0.1806X_3 \ U2 = 0.4636X_{1} -0.8938X_2 -1.4987X_3 \ U3 = -1.1824X_{1} -0.3257X_2 +0.2000X_3 \
$$
\[ V1 = 0.1214X_{1} +0.9366X_2 -0.1316X_3 \\ V2 = -1.5736X_1 -1.5003X_2 + 0.7643X_3 \\ V3 = -0.6396X_1 -0.3810X_2 -0.8258X_3 \]