P12. Canonical Correlation Analysis

Soal 2

The data are measurements done on middle-aged men in a health fitness club (Dr. A. C. Linnerud, NC State University). There are two sets of data about the men. The physiological data: Weight, Waist, Pulse. The exercises data: Chins, Situps, Jumps.

library(readxl) 
library(dplyr)

## Warning: package 'dplyr' was built under R version 4.4.3

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(psych)

## Warning: package 'psych' was built under R version 4.4.3

library(CCA)

## Warning: package 'CCA' was built under R version 4.4.3

## Loading required package: fda

## Warning: package 'fda' was built under R version 4.4.3

## Loading required package: splines

## Loading required package: fds

## Warning: package 'fds' was built under R version 4.4.3

## Loading required package: rainbow

## Warning: package 'rainbow' was built under R version 4.4.3

## Loading required package: MASS

## Warning: package 'MASS' was built under R version 4.4.3

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

## Loading required package: pcaPP

## Warning: package 'pcaPP' was built under R version 4.4.3

## Loading required package: RCurl

## Warning: package 'RCurl' was built under R version 4.4.3

## Loading required package: deSolve

## Warning: package 'deSolve' was built under R version 4.4.3

## 
## Attaching package: 'fda'

## The following object is masked from 'package:graphics':
## 
##     matplot

## Loading required package: fields

## Warning: package 'fields' was built under R version 4.4.3

## Loading required package: spam

## Warning: package 'spam' was built under R version 4.4.3

## Spam version 2.11-1 (2025-01-20) is loaded.
## Type 'help( Spam)' or 'demo( spam)' for a short introduction 
## and overview of this package.
## Help for individual functions is also obtained by adding the
## suffix '.spam' to the function name, e.g. 'help( chol.spam)'.

## 
## Attaching package: 'spam'

## The following objects are masked from 'package:base':
## 
##     backsolve, forwardsolve

## Loading required package: viridisLite

## Loading required package: RColorBrewer

## 
## Try help(fields) to get started.

## 
## Attaching package: 'fields'

## The following object is masked from 'package:psych':
## 
##     describe

library(CCP)
library(GGally)

## Warning: package 'GGally' was built under R version 4.4.3

## Loading required package: ggplot2

## Warning: package 'ggplot2' was built under R version 4.4.3

## 
## Attaching package: 'ggplot2'

## The following objects are masked from 'package:psych':
## 
##     %+%, alpha

library(ggplot2)
library(gridExtra)

## 
## Attaching package: 'gridExtra'

## The following object is masked from 'package:dplyr':
## 
##     combine

dat2 <- read_xlsx(path = "D:/UNY/MySta/SEM 5/StatMul/dataset StatMul/fitness.xlsx", sheet = 1)   
dat2 <- dplyr::select(dat2, -ID)
str(dat2)

## tibble [20 × 6] (S3: tbl_df/tbl/data.frame)
##  $ Weight: num [1:20] 86.6 85.7 87.5 73.5 85.7 ...
##  $ Waist : num [1:20] 36 37 38 35 35 36 38 34 31 33 ...
##  $ Pulse : num [1:20] 50 52 58 62 46 56 56 60 74 56 ...
##  $ Chins : num [1:20] 5 2 12 12 13 4 8 6 15 17 ...
##  $ Situps: num [1:20] 162 110 101 105 155 101 101 125 200 251 ...
##  $ Jumps : num [1:20] 60 60 101 37 58 42 38 40 40 250 ...

(a) Test for independence between two sets of variables

# Inspect structure
str(dat2)

## tibble [20 × 6] (S3: tbl_df/tbl/data.frame)
##  $ Weight: num [1:20] 86.6 85.7 87.5 73.5 85.7 ...
##  $ Waist : num [1:20] 36 37 38 35 35 36 38 34 31 33 ...
##  $ Pulse : num [1:20] 50 52 58 62 46 56 56 60 74 56 ...
##  $ Chins : num [1:20] 5 2 12 12 13 4 8 6 15 17 ...
##  $ Situps: num [1:20] 162 110 101 105 155 101 101 125 200 251 ...
##  $ Jumps : num [1:20] 60 60 101 37 58 42 38 40 40 250 ...

# Exploratory pairwise relationships
pairs.panels(dat2,
             method = "pearson",
             hist.col = "#00AFBB",
             density = TRUE,
             ellipses = FALSE)

# Define variable sets and scale them
physio_raw   <- dat2[, c("Weight", "Waist")]   # physiological variables
exercise_raw <- dat2[, c("Chins", "Situps", "Jumps")]   # exercise performance variables

# Standardize (mean = 0, sd = 1)
physio   <- scale(physio_raw)
exercise <- scale(exercise_raw)

# Perform Canonical Correlation Analysis
cc_fit <- cc(physio, exercise)

# Display canonical correlations
cat("\nCanonical correlations:\n")

## 
## Canonical correlations:

print(cc_fit$cor)

## [1] 0.7944229 0.1964931

(b) Determine the number of significant canonical variate pairs

n <- nrow(dat2)
p <- ncol(physio)
q <- ncol(exercise)

cat("\nSignificance test using Wilks’ Lambda:\n")

## 
## Significance test using Wilks’ Lambda:

wilks_test <- p.asym(cc_fit$cor, n, p, q, tstat = "Wilks")

## Wilks' Lambda, using F-approximation (Rao's F):
##               stat    approx df1 df2   p.value
## 1 to 2:  0.3546495 3.3959599   6  30 0.0112200
## 2 to 2:  0.9613905 0.3212809   2  16 0.7297909

print(wilks_test)

## $id
## [1] "Wilks"
## 
## $stat
## [1] 0.3546495 0.9613905
## 
## $approx
## [1] 3.3959599 0.3212809
## 
## $df1
## [1] 6 2
## 
## $df2
## [1] 30 16
## 
## $p.value
## [1] 0.0112200 0.7297909

# Squared canonical correlations (shared variance)
cat("\nSquared canonical correlations:\n")

## 
## Squared canonical correlations:

print((cc_fit$cor)^2)

## [1] 0.63110778 0.03860955

# Decision (alpha = 0.05)
cat("\nInterpretation:\n")

## 
## Interpretation:

cat("At α = 0.05, the first canonical correlation is borderline significant (p =",
    round(wilks_test$p.value[1], 4),
    "). Others are not significant.\n\n")

## At α = 0.05, the first canonical correlation is borderline significant (p = 0.0112 ). Others are not significant.

(c) Compute the canonical variates from the data

# Extract canonical variate scores (U and V)
xs <- as.data.frame(scale(physio) %*% cc_fit$xcoef)  # U variates
ys <- as.data.frame(scale(exercise) %*% cc_fit$ycoef)  # V variates

# Name the columns
colnames(xs) <- paste0("U", 1:ncol(xs))
colnames(ys) <- paste0("V", 1:ncol(ys))

# Combine into one data frame
newdat <- data.frame(xs, ys)
head(newdat)

##            U1          U2          V1         V2
## 1 -0.08436379  0.70849414 -0.11991692 -0.2535522
## 2  0.47363126  0.16165387  0.95263778 -0.3434656
## 3  0.84670242  0.07213638  1.00108971 -0.2290561
## 4  0.31308127 -1.10749996  0.04661993  1.0394822
## 5 -0.51907626  0.95045256 -0.56705591  0.5346354
## 6  0.19302205  0.02250997  0.71967015  0.2295648

# Visualization of canonical variate pairs
library(ggplot2)
library(gridExtra)

p1 <- ggplot(newdat, aes(x = U1, y = V1)) +
  geom_point(color = "#0072B2") +
  geom_smooth(method = "lm", se = FALSE, color = "black") +
  theme_bw() +
  ggtitle("Canonical Variate Pair 1 (U1 vs V1)")

p2 <- ggplot(newdat, aes(x = U2, y = V2)) +
  geom_point(color = "#E69F00") +
  geom_smooth(method = "lm", se = FALSE, color = "black") +
  theme_bw() +
  ggtitle("Canonical Variate Pair 2 (U2 vs V2)")

grid.arrange(p1, p2, ncol = 2)

## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

(d) Interpret each member of a canonical variate pair

# Correlation between original variables and their canonical variates (loadings)
corr.X.U <- cor(physio_raw, xs)
corr.Y.V <- cor(exercise_raw, ys)

cat("\nCanonical loadings (X set – physiological variables):\n")

## 
## Canonical loadings (X set – physiological variables):

print(round(corr.X.U, 3))

##           U1    U2
## Weight 0.622 0.783
## Waist  0.927 0.375

cat("\nCanonical loadings (Y set – exercise variables):\n")

## 
## Canonical loadings (Y set – exercise variables):

print(round(corr.Y.V, 3))

##            V1     V2
## Chins  -0.732 -0.183
## Situps -0.819 -0.573
## Jumps  -0.166 -0.937

(e) Describe relationships between the two sets

corr.X.V <- cor(physio_raw, ys)
corr.Y.U <- cor(exercise_raw, xs)

cat("\nCross-loadings (X variables with Y canonical variates):\n")

## 
## Cross-loadings (X variables with Y canonical variates):

print(round(corr.X.V, 3))

##           V1    V2
## Weight 0.494 0.154
## Waist  0.736 0.074

cat("\nCross-loadings (Y variables with X canonical variates):\n")

## 
## Cross-loadings (Y variables with X canonical variates):

print(round(corr.Y.U, 3))

##            U1     U2
## Chins  -0.581 -0.036
## Situps -0.651 -0.113
## Jumps  -0.132 -0.184