Project of Multivariate Statistics

DATA

Library

library(ggplot2)
library(GGally)
library(CCA)
library(CCP)
library(dplyr)
library(psych)
library(readxl)
library(tseries)
library(plm)
library(forecast)
library(cluster)
library(factoextra)
library(corrplot)
library(lmtest)

Load Dataset

project <- read_xlsx("project.xlsx")

EXPLORATORY DATA ANALYSIS

str(project)

## tibble [170 × 8] (S3: tbl_df/tbl/data.frame)
##  $ PROVINSI: chr [1:170] "Aceh" "Aceh" "Aceh" "Aceh" ...
##  $ TAHUN   : num [1:170] 2017 2018 2019 2020 2021 ...
##  $ Y1_UHH  : num [1:170] 69.5 69.6 69.9 69.9 70 ...
##  $ Y2_HSL  : num [1:170] 14.1 14.3 14.3 14.3 14.4 ...
##  $ Y3_PNB  : num [1:170] 8957 9186 9603 9492 9572 ...
##  $ X1_HEAL : num [1:170] 366 373 384 384 386 605 615 637 646 653 ...
##  $ X2_EDU  : num [1:170] 4182 4211 4224 4233 4231 ...
##  $ X3_ROAD : num [1:170] 24019 23915 23897 23632 23632 ...

summary(project)

##    PROVINSI             TAHUN          Y1_UHH          Y2_HSL     
##  Length:170         Min.   :2017   Min.   :64.34   Min.   :10.54  
##  Class :character   1st Qu.:2018   1st Qu.:68.25   1st Qu.:12.66  
##  Mode  :character   Median :2019   Median :69.92   Median :13.04  
##                     Mean   :2019   Mean   :69.82   Mean   :13.09  
##                     3rd Qu.:2020   3rd Qu.:71.29   3rd Qu.:13.48  
##                     Max.   :2021   Max.   :75.04   Max.   :15.64  
##      Y3_PNB         X1_HEAL           X2_EDU         X3_ROAD     
##  Min.   : 6954   Min.   :  57.0   Min.   :  566   Min.   : 3183  
##  1st Qu.: 9274   1st Qu.: 181.8   1st Qu.: 1681   1st Qu.: 7354  
##  Median :10488   Median : 236.5   Median : 2944   Median :13078  
##  Mean   :10652   Mean   : 318.0   Mean   : 4456   Mean   :15988  
##  3rd Qu.:11545   3rd Qu.: 364.2   3rd Qu.: 4652   3rd Qu.:22351  
##  Max.   :18527   Max.   :1141.0   Max.   :20093   Max.   :42766

describe(project)

##           vars   n     mean      sd   median  trimmed      mad     min      max
## PROVINSI*    1 170    17.50    9.84    17.50    17.50    12.60    1.00    34.00
## TAHUN        2 170  2019.00    1.42  2019.00  2019.00     1.48 2017.00  2021.00
## Y1_UHH       3 170    69.82    2.55    69.93    69.83     2.29   64.34    75.04
## Y2_HSL       4 170    13.09    0.76    13.04    13.06     0.60   10.54    15.64
## Y3_PNB       5 170 10651.68 2164.33 10488.00 10458.03  1712.40 6954.00 18527.00
## X1_HEAL      6 170   317.98  254.23   236.50   264.37   146.04   57.00  1141.00
## X2_EDU       7 170  4455.53 4911.01  2943.50  3249.22  1994.84  566.00 20093.00
## X3_ROAD      8 170 15987.53 9994.48 13078.00 14778.73 10777.76 3183.00 42766.00
##             range  skew kurtosis     se
## PROVINSI*    33.0  0.00    -1.22   0.75
## TAHUN         4.0  0.00    -1.32   0.11
## Y1_UHH       10.7 -0.08    -0.53   0.20
## Y2_HSL        5.1  0.39     2.98   0.06
## Y3_PNB    11573.0  1.22     2.55 166.00
## X1_HEAL    1084.0  1.90     2.99  19.50
## X2_EDU    19527.0  2.31     4.40 376.66
## X3_ROAD   39583.0  0.89     0.07 766.54

# Sample size information
N_provinces <- length(unique(project$PROVINSI))
T_years <- length(unique(project$TAHUN))
n_total <- nrow(project)

data.frame(
  Description = c("Number of Provinces (N)", "Number of Years (T)", "Total Observations"),
  Value = c(N_provinces, T_years, n_total)
)

##               Description Value
## 1 Number of Provinces (N)    34
## 2     Number of Years (T)     5
## 3      Total Observations   170

PREPROCESSING

Set Data Panel

pdata <- pdata.frame(project, index = c("PROVINSI", "TAHUN"))
Y <- pdata[, 3:5]
X <- pdata[, 6:8]

Stasionerity Test

apply(X, 2, adf.test)

## $X1_HEAL
## 
##  Augmented Dickey-Fuller Test
## 
## data:  newX[, i]
## Dickey-Fuller = -2.456, Lag order = 5, p-value = 0.3859
## alternative hypothesis: stationary
## 
## 
## $X2_EDU
## 
##  Augmented Dickey-Fuller Test
## 
## data:  newX[, i]
## Dickey-Fuller = -2.4243, Lag order = 5, p-value = 0.3991
## alternative hypothesis: stationary
## 
## 
## $X3_ROAD
## 
##  Augmented Dickey-Fuller Test
## 
## data:  newX[, i]
## Dickey-Fuller = -2.3074, Lag order = 5, p-value = 0.4479
## alternative hypothesis: stationary

apply(Y, 2, adf.test)

## $Y1_UHH
## 
##  Augmented Dickey-Fuller Test
## 
## data:  newX[, i]
## Dickey-Fuller = -3.1013, Lag order = 5, p-value = 0.1165
## alternative hypothesis: stationary
## 
## 
## $Y2_HSL
## 
##  Augmented Dickey-Fuller Test
## 
## data:  newX[, i]
## Dickey-Fuller = -3.7945, Lag order = 5, p-value = 0.02093
## alternative hypothesis: stationary
## 
## 
## $Y3_PNB
## 
##  Augmented Dickey-Fuller Test
## 
## data:  newX[, i]
## Dickey-Fuller = -3.7265, Lag order = 5, p-value = 0.02427
## alternative hypothesis: stationary

Within Transformation

pdata <- project %>%
  group_by(PROVINSI) %>%
  mutate(
    X1 = X1_HEAL - mean(X1_HEAL, na.rm = TRUE),
    X2  = X2_EDU  - mean(X2_EDU, na.rm = TRUE),
    X3 = X3_ROAD - mean(X3_ROAD, na.rm = TRUE),
    Y1  = Y1_UHH  - mean(Y1_UHH, na.rm = TRUE),
    Y2  = Y2_HSL  - mean(Y2_HSL, na.rm = TRUE),
    Y3  = Y3_PNB  - mean(Y3_PNB, na.rm = TRUE) 
  ) %>%
  ungroup()

Xw <- as.matrix(pdata[, c("X1","X2","X3")])
Yw <- as.matrix(pdata[, c("Y1","Y2","Y3")])

Autocorrelation Diagnostics

dw_X1 <- dwtest(Xw[,1] ~ 1)
dw_X2 <- dwtest(Xw[,2] ~ 1)
dw_X3 <- dwtest(Xw[,3] ~ 1)
dw_Y1 <- dwtest(Yw[,1] ~ 1)
dw_Y2 <- dwtest(Yw[,2] ~ 1)
dw_Y3 <- dwtest(Yw[,3] ~ 1)

dw_summary <- data.frame(
  Variable = c("X1_HEAL", "X2_EDU", "X3_ROAD", "Y1_UHH", "Y2_HSL", "Y3_PNB"),
  DW_Statistic = round(c(dw_X1$statistic, dw_X2$statistic, dw_X3$statistic,
                         dw_Y1$statistic, dw_Y2$statistic, dw_Y3$statistic), 3),
  P_Value = format(c(dw_X1$p.value, dw_X2$p.value, dw_X3$p.value,
                     dw_Y1$p.value, dw_Y2$p.value, dw_Y3$p.value), scientific = TRUE, digits = 3),
  Interpretation = ifelse(
    c(dw_X1$p.value, dw_X2$p.value, dw_X3$p.value,
      dw_Y1$p.value, dw_Y2$p.value, dw_Y3$p.value) < 0.05,
    "Significant autocorrelation",
    "No significant autocorrelation"
  )
)

dw_summary

##   Variable DW_Statistic  P_Value                 Interpretation
## 1  X1_HEAL        1.653 1.13e-02    Significant autocorrelation
## 2   X2_EDU        2.117 7.78e-01 No significant autocorrelation
## 3  X3_ROAD        1.379 2.32e-05    Significant autocorrelation
## 4   Y1_UHH        1.781 7.51e-02 No significant autocorrelation
## 5   Y2_HSL        1.945 3.59e-01 No significant autocorrelation
## 6   Y3_PNB        2.139 8.19e-01 No significant autocorrelation

Handling Autocorrelation

pdata_diff <- pdata %>%
  group_by(PROVINSI) %>%
  arrange(TAHUN) %>%
  mutate(
    X1_diff = c(NA, diff(X1)),
    X2_diff = c(NA, diff(X2)),
    X3_diff = c(NA, diff(X3)),
    Y1_diff = c(NA, diff(Y1)),
    Y2_diff = c(NA, diff(Y2)),
    Y3_diff = c(NA, diff(Y3))
  ) %>%
  ungroup() %>%
  na.omit()

Xw_diff <- as.matrix(pdata_diff[, c("X1_diff","X2_diff","X3_diff")])
Yw_diff <- as.matrix(pdata_diff[, c("Y1_diff","Y2_diff","Y3_diff")])

dwtest(Xw_diff[,1] ~ 1)

## 
##  Durbin-Watson test
## 
## data:  Xw_diff[, 1] ~ 1
## DW = 1.8694, p-value = 0.2214
## alternative hypothesis: true autocorrelation is greater than 0

dwtest(Xw_diff[,3] ~ 1)

## 
##  Durbin-Watson test
## 
## data:  Xw_diff[, 3] ~ 1
## DW = 2.1022, p-value = 0.7259
## alternative hypothesis: true autocorrelation is greater than 0

Standardized Data

Xs <- scale(Xw_diff)
Ys <- scale(Yw_diff)

n <- nrow(Xs)
p <- ncol(Xs)
q <- ncol(Ys)
n; p; q

## [1] 136

## [1] 3

## [1] 3

Correlations

matcor(Xs, Ys)

## $Xcor
##            X1_diff     X2_diff     X3_diff
## X1_diff 1.00000000  0.03773114  0.11889535
## X2_diff 0.03773114  1.00000000 -0.01756292
## X3_diff 0.11889535 -0.01756292  1.00000000
## 
## $Ycor
##            Y1_diff    Y2_diff    Y3_diff
## Y1_diff  1.0000000 -0.0593192  0.2632207
## Y2_diff -0.0593192  1.0000000 -0.1173469
## Y3_diff  0.2632207 -0.1173469  1.0000000
## 
## $XYcor
##             X1_diff      X2_diff     X3_diff      Y1_diff     Y2_diff
## X1_diff  1.00000000  0.037731138  0.11889535  0.282261358 -0.03041802
## X2_diff  0.03773114  1.000000000 -0.01756292 -0.002229886 -0.07883433
## X3_diff  0.11889535 -0.017562919  1.00000000 -0.074860910 -0.07311742
## Y1_diff  0.28226136 -0.002229886 -0.07486091  1.000000000 -0.05931920
## Y2_diff -0.03041802 -0.078834331 -0.07311742 -0.059319202  1.00000000
## Y3_diff  0.18703003  0.096192329 -0.07553553  0.263220673 -0.11734687
##             Y3_diff
## X1_diff  0.18703003
## X2_diff  0.09619233
## X3_diff -0.07553553
## Y1_diff  0.26322067
## Y2_diff -0.11734687
## Y3_diff  1.00000000

par(mfrow = c(1, 2))
corrplot(cor(Xs), method = "color", type = "upper", 
         title = "Correlation within X", mar = c(0,0,2,0))
corrplot(cor(Ys), method = "color", type = "upper", 
         title = "Correlation within Y", mar = c(0,0,2,0))

par(mfrow = c(1, 1))

Visualization

ggpairs(Xs)

ggpairs(Ys)

MODELLING (CANONICAL CORRELATION ANALYSIS)

CCA Model

Sx <- cov(Xs)
Sy <- cov(Ys)
Sxy <- cov(Xs,Ys)
Sxeig <- eigen(Sx, symmetric = TRUE)
Sxisqrt <- Sxeig$vectors %*% diag(1/sqrt(Sxeig$values)) %*% t(Sxeig$vectors)
Syeig <- eigen(Sy, symmetric=TRUE)
Syisqrt <- Syeig$vectors %*% diag(1/sqrt(Syeig$values)) %*% t(Syeig$vectors)
Xmat <- Sxisqrt %*% Sxy %*% solve(Sy) %*% t(Sxy) %*% Sxisqrt
Ymat <- Syisqrt %*% t(Sxy) %*% solve(Sx) %*% Sxy %*% Syisqrt
Xeig <- eigen(Xmat, symmetric=TRUE)
Yeig <- eigen(Ymat, symmetric=TRUE)

Canonical Correlations

rho <- sqrt(Xeig$values)
rho

## [1] 0.33371213 0.12359955 0.07189337

sqrt(Yeig$values)[1:p]

## [1] 0.33371213 0.12359955 0.07189337

Compare Linear Combinations

Ahat<-Sxisqrt%*% Xeig$vectors
Ahat

##             [,1]       [,2]       [,3]
## [1,]  0.95709340 0.02399717  0.3152295
## [2,]  0.06021102 0.92345200 -0.3814859
## [3,] -0.41057477 0.39437311  0.8311066

Bhat <-Syisqrt%*% Yeig$vectors
Bhat

##            [,1]       [,2]       [,3]
## [1,] 0.78811563 -0.3851478  0.5531048
## [2,] 0.08800129 -0.7897110 -0.6192484
## [3,] 0.44957354  0.5226913 -0.7818220

Bartlett’s Test

BT <--(n-1-(1/2)*(p+q+1))*sum(log(1-rho^2))
df_BT <- p * q
p_BT <- pchisq(q = BT, df = df_BT, lower.tail = FALSE)
bartlett_result <- data.frame(
  Test = "Bartlett",
  Statistic = BT,
  DF = df_BT,
  P_Value = p_BT
)
bartlett_result

##       Test Statistic DF    P_Value
## 1 Bartlett   18.2317  9 0.03257856

Multivariate Statistics Test

p.asym(rho, n, p, q, tstat = "Wilks")

## Wilks' Lambda, using F-approximation (Rao's F):
##               stat    approx df1      df2    p.value
## 1 to 3:  0.8705378 2.0617600   9 316.5364 0.03259496
## 2 to 3:  0.9796335 0.6773688   4 262.0000 0.60819638
## 3 to 3:  0.9948313 0.6858075   1 132.0000 0.40908888

p.asym(rho, n, p, q, tstat = "Hotelling")

##  Hotelling-Lawley Trace, using F-approximation:
##                 stat    approx df1 df2    p.value
## 1 to 3:  0.146029244 2.0876773   9 386 0.02968296
## 2 to 3:  0.020709362 0.6765058   4 392 0.60858607
## 3 to 3:  0.005195511 0.6892711   1 398 0.40691012

p.asym(rho, n, p, q, tstat = "Pillai")

##  Pillai-Bartlett Trace, using F-approximation:
##                 stat    approx df1 df2    p.value
## 1 to 3:  0.131809291 2.0220443   9 396 0.03577064
## 2 to 3:  0.020445505 0.6896243   4 402 0.59946849
## 3 to 3:  0.005168657 0.7041506   1 408 0.40188393

p.asym(rho, n, p, q, tstat = "Roy")

##  Roy's Largest Root, using F-approximation:
##               stat   approx df1 df2     p.value
## 1 to 1:  0.1113638 5.514075   3 132 0.001341907
## 
##  F statistic for Roy's Greatest Root is an upper bound.

Canonical Scores

U_scores <- Xs %*% Ahat
V_scores <- Ys %*% Bhat

canonical_data <- data.frame(U1 = U_scores[,1],
                             V1 = V_scores[,1])
head(canonical_data)

##          U1         V1
## 1 0.6278671 -0.1562521
## 2 1.5114141  0.8084030
## 3 0.6999946  0.6448435
## 4 0.2103761  0.4587145
## 5 0.9522456  0.1805180
## 6 2.1627151  0.8210725

Canonical Loadings

loadings_X <- cor(Xs, U_scores)
loadings_Y <- cor(Ys, V_scores)

rownames(loadings_X) <- c("X1_HEAL", "X2_EDU", "X3_ROAD")
rownames(loadings_Y) <- c("Y1_UHH", "Y2_HSL", "Y3_PNB")

list(
  Loadings_X = loadings_X,
  Loadings_Y = loadings_Y
)

## $Loadings_X
##               [,1]      [,2]       [,3]
## X1_HEAL  0.9105498 0.1057292  0.3996503
## X2_EDU   0.1035341 0.9174311 -0.3841886
## X3_ROAD -0.2978383 0.3810078  0.8752859
## 
## $Loadings_Y
##               [,1]       [,2]       [,3]
## Y1_UHH  0.90123251 -0.2007196  0.3840464
## Y2_HSL -0.01150515 -0.8282006 -0.5603137
## Y3_PNB  0.64669519  0.5139825 -0.5635666

MODELLING (K-MEANS)

Summary and Visualization

summary(canonical_data)

##        U1                V1         
##  Min.   :-3.9698   Min.   :-2.2003  
##  1st Qu.:-0.5818   1st Qu.:-0.7798  
##  Median :-0.1799   Median :-0.1237  
##  Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.6071   3rd Qu.: 0.7754  
##  Max.   : 3.9858   Max.   : 2.3870

describe(canonical_data)

##    vars   n mean sd median trimmed  mad   min  max range skew kurtosis   se
## U1    1 136    0  1  -0.18   -0.06 0.77 -3.97 3.99  7.96 0.53     3.12 0.09
## V1    2 136    0  1  -0.12   -0.04 1.13 -2.20 2.39  4.59 0.31    -0.78 0.09

ggpairs(canonical_data)

Elbow method

fviz_nbclust(canonical_data, kmeans, method = "wss") +
  geom_vline(xintercept = 3, linetype = 2) + 
  labs(subtitle = "Elbow method")

K-Means Final Model

set.seed(2025)
km_final <- kmeans(canonical_data, centers = 3, nstart = 50)
km_final

## K-means clustering with 3 clusters of sizes 39, 26, 71
## 
## Cluster means:
##           U1         V1
## 1 -0.1856117  1.0412927
## 2  1.4953626  0.5791403
## 3 -0.4456419 -0.7840572
## 
## Clustering vector:
##   [1] 2 2 2 1 2 2 1 1 1 3 3 2 2 3 2 2 1 1 1 1 3 1 2 3 1 1 2 2 1 1 2 1 2 1 2 2 1
##  [38] 2 2 2 1 1 1 1 3 1 3 3 1 1 1 1 2 1 3 1 1 3 1 1 1 1 1 2 1 2 2 1 3 3 3 3 3 3
##  [75] 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 3 3 1 3 3 3 1 3 3 3 3 3 3 3 3 3 3 3 3 3
## [112] 3 3 1 3 3 3 3 3 3 2 3 3 3 1 3 3 3 3 3 3 3 3 3 2 3
## 
## Within cluster sum of squares by cluster:
## [1] 24.64165 29.17948 47.94127
##  (between_SS / total_SS =  62.3 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

Model Evaluation

evaluation <- data.frame(
  Metric = c("Total Sum of Squares (TSS)", 
             "Within-Cluster Sum of Squares (WCSS)", 
             "Between-Cluster Sum of Squares (BSS)",
             "BSS/TSS Ratio",
             "Average Silhouette Width"),
  Value = c(km_final$totss,
            km_final$tot.withinss,
            km_final$betweenss,
            km_final$betweenss / km_final$totss,
            mean(silhouette(km_final$cluster, dist(canonical_data))[, 3]))
)
evaluation

##                                 Metric       Value
## 1           Total Sum of Squares (TSS) 270.0000000
## 2 Within-Cluster Sum of Squares (WCSS) 101.7624047
## 3 Between-Cluster Sum of Squares (BSS) 168.2375953
## 4                        BSS/TSS Ratio   0.6231022
## 5             Average Silhouette Width   0.4388055

Visualization

fviz_cluster(
  km_final,
  data = canonical_data,
  ellipse.type = "euclid",
  star.plot = TRUE,
  repel = TRUE,
  ggtheme = theme_minimal()
)

fviz_cluster(
  km_final,
  data = canonical_data,
  geom = "point",
  ellipse.type = "norm",
  show.clust.cent = TRUE,
  main = "Cluster Plot (K-Means on Canonical Variates)"
)