Project of Multivariate Statistics

DATA

Library

library(ggplot2)
library(GGally)
## Warning: package 'GGally' was built under R version 4.5.1
library(CCA)
## Warning: package 'CCA' was built under R version 4.5.2
## Loading required package: fda
## Warning: package 'fda' was built under R version 4.5.2
## Loading required package: splines
## Loading required package: fds
## Warning: package 'fds' was built under R version 4.5.2
## Loading required package: rainbow
## Warning: package 'rainbow' was built under R version 4.5.2
## Loading required package: MASS
## Loading required package: pcaPP
## Warning: package 'pcaPP' was built under R version 4.5.2
## Loading required package: RCurl
## Warning: package 'RCurl' was built under R version 4.5.2
## Loading required package: deSolve
## Warning: package 'deSolve' was built under R version 4.5.2
## 
## Attaching package: 'fda'
## The following object is masked from 'package:graphics':
## 
##     matplot
## The following object is masked from 'package:datasets':
## 
##     gait
## Loading required package: fields
## Warning: package 'fields' was built under R version 4.5.2
## Loading required package: spam
## Warning: package 'spam' was built under R version 4.5.2
## Spam version 2.11-1 (2025-01-20) is loaded.
## Type 'help( Spam)' or 'demo( spam)' for a short introduction 
## and overview of this package.
## Help for individual functions is also obtained by adding the
## suffix '.spam' to the function name, e.g. 'help( chol.spam)'.
## 
## Attaching package: 'spam'
## The following objects are masked from 'package:base':
## 
##     backsolve, forwardsolve
## Loading required package: viridisLite
## Loading required package: RColorBrewer
## 
## Try help(fields) to get started.
library(CCP)
## Warning: package 'CCP' was built under R version 4.5.2
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
## 
##     select
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(psych)
## Warning: package 'psych' was built under R version 4.5.1
## 
## Attaching package: 'psych'
## The following object is masked from 'package:fields':
## 
##     describe
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(readxl)
library(tseries)
## Warning: package 'tseries' was built under R version 4.5.1
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(plm)
## Warning: package 'plm' was built under R version 4.5.1
## 
## Attaching package: 'plm'
## The following objects are masked from 'package:dplyr':
## 
##     between, lag, lead
library(forecast)
## Warning: package 'forecast' was built under R version 4.5.1
## 
## Attaching package: 'forecast'
## The following object is masked from 'package:fda':
## 
##     fourier
library(cluster)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

Load Dataset

project <- read_xlsx("project.xlsx")

EXPLORATORY DATA ANALYSIS

str(project)
## tibble [170 × 8] (S3: tbl_df/tbl/data.frame)
##  $ PROVINSI: chr [1:170] "Aceh" "Aceh" "Aceh" "Aceh" ...
##  $ TAHUN   : num [1:170] 2017 2018 2019 2020 2021 ...
##  $ Y1_UHH  : num [1:170] 69.5 69.6 69.9 69.9 70 ...
##  $ Y2_HSL  : num [1:170] 14.1 14.3 14.3 14.3 14.4 ...
##  $ Y3_PNB  : num [1:170] 8957 9186 9603 9492 9572 ...
##  $ X1_HEAL : num [1:170] 366 373 384 384 386 605 615 637 646 653 ...
##  $ X2_EDU  : num [1:170] 4182 4211 4224 4233 4231 ...
##  $ X3_ROAD : num [1:170] 24019 23915 23897 23632 23632 ...
summary(project)
##    PROVINSI             TAHUN          Y1_UHH          Y2_HSL     
##  Length:170         Min.   :2017   Min.   :64.34   Min.   :10.54  
##  Class :character   1st Qu.:2018   1st Qu.:68.25   1st Qu.:12.66  
##  Mode  :character   Median :2019   Median :69.92   Median :13.04  
##                     Mean   :2019   Mean   :69.82   Mean   :13.09  
##                     3rd Qu.:2020   3rd Qu.:71.29   3rd Qu.:13.48  
##                     Max.   :2021   Max.   :75.04   Max.   :15.64  
##      Y3_PNB         X1_HEAL           X2_EDU         X3_ROAD     
##  Min.   : 6954   Min.   :  57.0   Min.   :  566   Min.   : 3183  
##  1st Qu.: 9274   1st Qu.: 181.8   1st Qu.: 1681   1st Qu.: 7354  
##  Median :10488   Median : 236.5   Median : 2944   Median :13078  
##  Mean   :10652   Mean   : 318.0   Mean   : 4456   Mean   :15988  
##  3rd Qu.:11545   3rd Qu.: 364.2   3rd Qu.: 4652   3rd Qu.:22351  
##  Max.   :18527   Max.   :1141.0   Max.   :20093   Max.   :42766
describe(project)
##           vars   n     mean      sd   median  trimmed      mad     min      max
## PROVINSI*    1 170    17.50    9.84    17.50    17.50    12.60    1.00    34.00
## TAHUN        2 170  2019.00    1.42  2019.00  2019.00     1.48 2017.00  2021.00
## Y1_UHH       3 170    69.82    2.55    69.93    69.83     2.29   64.34    75.04
## Y2_HSL       4 170    13.09    0.76    13.04    13.06     0.60   10.54    15.64
## Y3_PNB       5 170 10651.68 2164.33 10488.00 10458.03  1712.40 6954.00 18527.00
## X1_HEAL      6 170   317.98  254.23   236.50   264.37   146.04   57.00  1141.00
## X2_EDU       7 170  4455.53 4911.01  2943.50  3249.22  1994.84  566.00 20093.00
## X3_ROAD      8 170 15987.53 9994.48 13078.00 14778.73 10777.76 3183.00 42766.00
##             range  skew kurtosis     se
## PROVINSI*    33.0  0.00    -1.22   0.75
## TAHUN         4.0  0.00    -1.32   0.11
## Y1_UHH       10.7 -0.08    -0.53   0.20
## Y2_HSL        5.1  0.39     2.98   0.06
## Y3_PNB    11573.0  1.22     2.55 166.00
## X1_HEAL    1084.0  1.90     2.99  19.50
## X2_EDU    19527.0  2.31     4.40 376.66
## X3_ROAD   39583.0  0.89     0.07 766.54

PREPROCESSING

Set Data Panel

pdata <- pdata.frame(project, index = c("PROVINSI", "TAHUN"))
Y <- pdata[, 3:5]
X <- pdata[, 6:8]

Stasionerity Test

apply(X, 2, adf.test)
## $X1_HEAL
## 
##  Augmented Dickey-Fuller Test
## 
## data:  newX[, i]
## Dickey-Fuller = -2.456, Lag order = 5, p-value = 0.3859
## alternative hypothesis: stationary
## 
## 
## $X2_EDU
## 
##  Augmented Dickey-Fuller Test
## 
## data:  newX[, i]
## Dickey-Fuller = -2.4243, Lag order = 5, p-value = 0.3991
## alternative hypothesis: stationary
## 
## 
## $X3_ROAD
## 
##  Augmented Dickey-Fuller Test
## 
## data:  newX[, i]
## Dickey-Fuller = -2.3074, Lag order = 5, p-value = 0.4479
## alternative hypothesis: stationary
apply(Y, 2, adf.test)
## $Y1_UHH
## 
##  Augmented Dickey-Fuller Test
## 
## data:  newX[, i]
## Dickey-Fuller = -3.1013, Lag order = 5, p-value = 0.1165
## alternative hypothesis: stationary
## 
## 
## $Y2_HSL
## 
##  Augmented Dickey-Fuller Test
## 
## data:  newX[, i]
## Dickey-Fuller = -3.7945, Lag order = 5, p-value = 0.02093
## alternative hypothesis: stationary
## 
## 
## $Y3_PNB
## 
##  Augmented Dickey-Fuller Test
## 
## data:  newX[, i]
## Dickey-Fuller = -3.7265, Lag order = 5, p-value = 0.02427
## alternative hypothesis: stationary
pdata <- project %>%
  group_by(PROVINSI) %>%
  mutate(
    X1_HEAL_w = X1_HEAL - mean(X1_HEAL, na.rm = TRUE),
    X2_EDU_w  = X2_EDU  - mean(X2_EDU, na.rm = TRUE),
    X3_ROAD_w = X3_ROAD - mean(X3_ROAD, na.rm = TRUE),
    Y1_UHH_w  = Y1_UHH  - mean(Y1_UHH, na.rm = TRUE),
    Y2_HSL_w  = Y2_HSL  - mean(Y2_HSL, na.rm = TRUE),
    Y3_PNB_w  = Y3_PNB  - mean(Y3_PNB, na.rm = TRUE) 
  ) %>%
  ungroup()

X_within <- as.matrix(pdata[, c("X1_HEAL_w","X2_EDU_w","X3_ROAD_w")])
Y_within <- as.matrix(pdata[, c("Y1_UHH_w","Y2_HSL_w","Y3_PNB_w")])
prewhiten <- function(ts_data){
  res <- apply(ts_data, 2, function(x){
    fit <- auto.arima(x)
    residuals(fit)
  })
  return(res)
}

X_pw <- prewhiten(X_within)
Y_pw <- prewhiten(Y_within)

Correlations

matcor(X_pw, Y_pw)
## $Xcor
##            X1_HEAL_w   X2_EDU_w X3_ROAD_w
## X1_HEAL_w 1.00000000 0.09447012 0.3269296
## X2_EDU_w  0.09447012 1.00000000 0.2460449
## X3_ROAD_w 0.32692963 0.24604491 1.0000000
## 
## $Ycor
##            Y1_UHH_w     Y2_HSL_w     Y3_PNB_w
## Y1_UHH_w  1.0000000  0.186733106 -0.144282471
## Y2_HSL_w  0.1867331  1.000000000 -0.005896542
## Y3_PNB_w -0.1442825 -0.005896542  1.000000000
## 
## $XYcor
##               X1_HEAL_w   X2_EDU_w   X3_ROAD_w    Y1_UHH_w     Y2_HSL_w
## X1_HEAL_w  1.0000000000 0.09447012  0.32692963  0.28594811  0.501221111
## X2_EDU_w   0.0944701167 1.00000000  0.24604491  0.12465350  0.089652323
## X3_ROAD_w  0.3269296304 0.24604491  1.00000000 -0.02870857  0.181186450
## Y1_UHH_w   0.2859481126 0.12465350 -0.02870857  1.00000000  0.186733106
## Y2_HSL_w   0.5012211108 0.08965232  0.18118645  0.18673311  1.000000000
## Y3_PNB_w  -0.0002921211 0.01882841  0.09167208 -0.14428247 -0.005896542
##                Y3_PNB_w
## X1_HEAL_w -0.0002921211
## X2_EDU_w   0.0188284140
## X3_ROAD_w  0.0916720770
## Y1_UHH_w  -0.1442824708
## Y2_HSL_w  -0.0058965424
## Y3_PNB_w   1.0000000000
n <- nrow(X_pw)
p <- ncol(X_pw)
q <- ncol(Y_pw)
n; p; q
## [1] 170
## [1] 3
## [1] 3

Standardized data

Xs <- scale(X_pw)
Ys <- scale(Y_pw)

MODELLING (CANONICAL CORRELATION ANALYSIS)

Model

# CCA
Sx <- cov(Xs)
Sy <- cov(Ys)
Sxy <- cov(Xs,Ys)
Sxeig <- eigen(Sx, symmetric = TRUE)
Sxisqrt <- Sxeig$vectors %*% diag(1/sqrt(Sxeig$values)) %*% t(Sxeig$vectors)
Syeig <- eigen(Sy, symmetric=TRUE)
Syisqrt <- Syeig$vectors %*% diag(1/sqrt(Syeig$values)) %*% t(Syeig$vectors)
Xmat <- Sxisqrt %*% Sxy %*% solve(Sy) %*% t(Sxy) %*% Sxisqrt
Ymat <- Syisqrt %*% t(Sxy) %*% solve(Sx) %*% Sxy %*% Syisqrt
Xeig <- eigen(Xmat, symmetric=TRUE)
Yeig <- eigen(Ymat, symmetric=TRUE)

The canonical correlations

rho <- sqrt(Xeig$values)
rho
## [1] 0.54662547 0.17675328 0.05384874
sqrt(Yeig$values)[1:p]
## [1] 0.54662547 0.17675328 0.05384874

Compare linear combinations

Ahat<-Sxisqrt%*% Xeig$vectors
Ahat
##            [,1]       [,2]       [,3]
## [1,]  1.0039256 -0.1518951  0.2983256
## [2,]  0.1656744 -0.5175822 -0.8771258
## [3,] -0.1055309  1.0287786 -0.3345794
Bhat <-Syisqrt%*% Yeig$vectors
Bhat
##             [,1]       [,2]       [,3]
## [1,] -0.42030458  0.8219876  0.4541992
## [2,] -0.83455714 -0.5169581 -0.2699697
## [3,] -0.05303559 -0.3631363  0.9418376

Bartlett’s test

BT <--(n-1-(1/2)*(p+q+1))*sum(log(1-rho^2))
BT
## [1] 64.47968
pchisq(q=BT, df = p*q, lower.tail = FALSE)
## [1] 1.82115e-10

Statistics Test

p.asym(rho, n, p, q, tstat = "Wilks")
## Wilks' Lambda, using F-approximation (Rao's F):
##               stat    approx df1      df2      p.value
## 1 to 3:  0.6773241 7.7022174   9 399.2835 1.828293e-10
## 2 to 3:  0.9659492 1.4415160   4 330.0000 2.199511e-01
## 3 to 3:  0.9971003 0.4827478   1 166.0000 4.881517e-01
p.asym(rho, n, p, q, tstat = "Hotelling")
##  Hotelling-Lawley Trace, using F-approximation:
##                 stat    approx df1 df2      p.value
## 1 to 3:  0.461282792 8.3372594   9 488 1.335265e-11
## 2 to 3:  0.035157361 1.4473114   4 494 2.172205e-01
## 3 to 3:  0.002908119 0.4846865   1 500 4.866306e-01
p.asym(rho, n, p, q, tstat = "Pillai")
##  Pillai-Bartlett Trace, using F-approximation:
##                 stat    approx df1 df2      p.value
## 1 to 3:  0.332940812 6.9075051   9 498 2.098095e-09
## 2 to 3:  0.034141407 1.4504458   4 504 2.161861e-01
## 3 to 3:  0.002899687 0.4934236   1 510 4.827245e-01
p.asym(rho, n, p, q, tstat = "Roy")
##  Roy's Largest Root, using F-approximation:
##               stat   approx df1 df2      p.value
## 1 to 1:  0.2987994 23.57894   3 166 9.172663e-13
## 
##  F statistic for Roy's Greatest Root is an upper bound.
U_scores <- Xs %*% Ahat
V_scores <- Ys %*% Bhat

canonical_data <- data.frame(U1 = U_scores[,1],
                             V1 = V_scores[,1])
canonical_data
##               U1          V1
## 1   -1.544952432  1.69145999
## 2   -0.421513333  0.07254563
## 3    0.304730168 -0.04175898
## 4   -0.076539219 -0.25169492
## 5    0.773401681 -0.82042413
## 6   -3.068647835  1.67135769
## 7   -1.081136220  0.29533726
## 8    0.182146660  0.44470370
## 9   -0.038256187 -0.72072779
## 10   1.819396500 -1.07682436
## 11  -0.647953801  0.86315976
## 12   1.553079533  0.12749765
## 13   1.029874303 -0.04490404
## 14   0.118724429 -0.27968614
## 15   0.837682128 -0.69989591
## 16  -1.616279316  1.04341233
## 17  -0.713086479 -0.14829775
## 18   0.627953946  0.01016018
## 19   0.229852665 -0.59896663
## 20   0.414641511 -0.62079418
## 21  -1.670300252  0.22729736
## 22   0.151095648  0.20807284
## 23   0.636423689  0.17432263
## 24   0.248813524 -0.35169623
## 25   0.736969711 -0.69309074
## 26  -2.458072209  1.50138182
## 27   0.197121383  0.42653121
## 28   0.705551255  0.46695836
## 29  -0.090312217 -0.72883042
## 30   1.107377901 -1.12681552
## 31   0.472223078  0.93379177
## 32   1.001238529  0.08159488
## 33   0.619370546 -0.08938207
## 34   0.162333524 -0.14551411
## 35   0.039033335 -0.56234477
## 36  -1.621599399  1.50785439
## 37  -0.592162169 -0.41810527
## 38  -0.024564784  0.07108520
## 39  -0.022168379 -0.55939779
## 40   0.833902263 -0.71077511
## 41  -0.222068556  1.43586214
## 42   0.698652931  0.57730031
## 43   0.450733214  0.20794752
## 44   0.059043638 -0.81129632
## 45   0.227736984 -1.55781654
## 46  -1.605908671 -0.39019711
## 47  -0.081670107  0.31799647
## 48  -0.218668444  0.34669812
## 49  -0.111139524 -0.12626119
## 50   0.986730089 -0.83130924
## 51   0.497637774  1.38916772
## 52   0.956415161 -0.52111039
## 53   0.325331744  0.19919355
## 54   0.144116313 -0.37957703
## 55  -0.125304497 -0.68156196
## 56  -4.634203529  1.32876108
## 57  -0.536076261  0.34108323
## 58  -1.180558341  0.25680244
## 59   0.105149580 -0.30254522
## 60   2.029117604 -1.29891233
## 61  -1.056373919  0.56221787
## 62   1.702688885 -0.32382758
## 63   0.592245970  0.07554916
## 64   0.505592286 -0.50357689
## 65   0.796857265 -0.27202674
## 66   0.002322130  0.99423015
## 67   0.470544874 -0.18465038
## 68   0.359314161 -0.38508041
## 69   0.158470371 -0.32059044
## 70   0.062110298 -0.41229890
## 71  -2.933401969  1.45325460
## 72   0.462611945  0.74045006
## 73   0.075206574  0.22045046
## 74   0.332884925 -0.29339159
## 75   0.947999074 -1.84040070
## 76  -1.689207637  1.35064541
## 77   1.057433034 -0.37682148
## 78   0.134763722  0.33084774
## 79   0.167767866 -0.46488130
## 80   0.691006809 -0.76854922
## 81  -0.119566862  1.27266675
## 82   0.292835567  0.52600447
## 83   0.250084636  0.08725388
## 84   0.094490771 -0.51862291
## 85   0.111052931 -1.12056626
## 86  -1.499863544  2.54285492
## 87  -0.325458728  0.82317759
## 88  -0.290704243  1.31126878
## 89   0.154294246 -1.40243778
## 90   0.854712042 -2.55756520
## 91  -3.479978348  1.10209936
## 92  -1.623336358 -0.07045737
## 93   0.233450132 -0.21245278
## 94   0.017226874 -0.51899171
## 95   2.394489178 -0.24043962
## 96  -0.756318710  0.39610095
## 97   1.248676968 -0.07049419
## 98   1.423807100 -0.18124914
## 99   0.483496413 -0.25794815
## 100  0.469058381 -0.36216966
## 101 -0.607964154 -0.17218701
## 102  0.290445113 -0.28104103
## 103  0.033667482  0.31802133
## 104 -0.014674114 -0.59757693
## 105  0.446841215 -0.50369921
## 106 -0.674373753  2.24734768
## 107  0.092255020  1.01304084
## 108  0.318725774  0.77887675
## 109  0.064145090 -1.08109003
## 110  0.370030801 -2.27585057
## 111 -1.210176087  2.43733569
## 112  0.293396460 -0.72818462
## 113  0.172623137  0.26127383
## 114  0.005942913 -0.54813383
## 115  0.613695272 -1.11454527
## 116 -0.285752226 -0.71550827
## 117  0.532676185 -0.01098644
## 118 -0.022279142 -0.15099596
## 119 -0.094687806 -0.61179275
## 120  0.201970485  0.33114598
## 121 -0.574247983  2.02087619
## 122 -0.059996642  1.12662572
## 123  0.132964600  0.21549491
## 124 -0.120756548 -1.05367288
## 125  0.473577249 -1.61407850
## 126 -1.830037341  3.01338997
## 127 -0.175119425 -0.10694332
## 128 -0.101296307  1.06997238
## 129 -0.209191163 -1.37347357
## 130  1.332317265 -1.03303948
## 131 -0.760868208  0.17303385
## 132  1.187959384 -0.13438498
## 133  1.089603592 -0.41468456
## 134  0.917795083 -0.09458615
## 135 -1.261644197 -0.57834584
## 136 -1.124287571  1.57128451
## 137  0.054693696 -0.26834047
## 138 -0.273183615  0.48673781
## 139 -0.153073661 -1.45024692
## 140  0.664247137 -0.48309107
## 141  0.082657653  1.07153403
## 142  0.497515632  0.56979359
## 143  0.325912659 -0.58948788
## 144  0.140449626  0.12636661
## 145  0.090684398 -0.98865436
## 146 -0.013133006  1.63748835
## 147 -0.572403981 -0.23277663
## 148  0.113032318  1.09887292
## 149  0.082895079 -1.50695228
## 150  0.247780340 -1.13922717
## 151 -1.750677901 -0.08772327
## 152 -0.133984195  0.08189968
## 153 -0.555129831 -0.42654465
## 154 -0.124415846 -0.05095719
## 155  1.296922021 -0.06169067
## 156 -1.692339122  1.30905490
## 157  0.171227169  0.14192142
## 158  1.014250251 -0.11638423
## 159  0.080292116 -0.49434905
## 160  0.659344345 -0.46042207
## 161 -2.943215066  2.55580239
## 162 -0.746708017  1.32706439
## 163 -0.224173013  0.16607992
## 164 -0.096831539 -1.51499091
## 165  2.235655922 -2.88409166
## 166 -0.740402423  3.63930952
## 167  1.094236142 -0.49813165
## 168  1.179742444 -0.31333017
## 169  0.401684746 -1.87851059
## 170  0.489422178 -0.90734590

MODELLING (K-MEANS)

Model

set.seed(0017)
km_final <- kmeans(canonical_data, centers = 3, nstart = 50)
km_final
## K-means clustering with 3 clusters of sizes 87, 65, 18
## 
## Cluster means:
##           U1         V1
## 1  0.5603551 -0.6773583
## 2 -0.2063940  0.3837336
## 3 -1.9630714  1.8881935
## 
## Clustering vector:
##   [1] 3 2 2 2 1 3 2 2 1 1 2 1 1 1 1 3 2 1 1 1 2 2 1 1 1 3 2 2 1 1 2 1 1 2 1 3 2
##  [38] 2 1 1 2 2 2 1 1 2 2 2 2 1 2 1 2 1 1 3 2 2 1 1 2 1 1 1 1 2 1 1 1 1 3 2 2 1
##  [75] 1 3 1 2 1 1 2 2 2 1 1 3 2 2 1 1 3 2 1 1 1 2 1 1 1 1 2 1 2 1 1 3 2 2 1 1 3
## [112] 1 2 1 1 1 1 2 1 2 3 2 2 1 1 3 2 2 1 1 2 1 1 1 2 3 1 2 1 1 2 2 1 2 1 2 2 2
## [149] 1 1 2 2 2 2 1 3 2 1 1 1 3 2 2 1 1 3 1 1 1 1
## 
## Within cluster sum of squares by cluster:
## [1] 57.84815 38.49022 28.54609
##  (between_SS / total_SS =  63.1 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

Evaluasi Model

wcss_final <- km_final$tot.withinss
wcss_final
## [1] 124.8845
tss_final  <- km_final$totss
tss_final
## [1] 338
bss_final  <- km_final$betweenss
bss_final
## [1] 213.1155
bss_ratio  <- bss_final / tss_final
bss_ratio
## [1] 0.6305193

Sillhouette k final

dist_mat <- dist(canonical_data)
sil_final <- silhouette(km_final$cluster, dist_mat)
avg_sil <- mean(sil_final[, "sil_width"])
avg_sil
## [1] 0.3500704

Visualization

fviz_cluster(
  km_final,
  data = canonical_data,
  geom = "point",
  ellipse.type = "euclid",
  show.clust.cent = TRUE,
  star.plot = TRUE,
  repel = TRUE,
  main = "Cluster Plot (K-Means on Canonical Variates)"
)