Project of Multivariate Statistics
DATA
Library
EXPLORATORY DATA ANALYSIS
## tibble [170 × 8] (S3: tbl_df/tbl/data.frame)
## $ PROVINSI: chr [1:170] "Aceh" "Aceh" "Aceh" "Aceh" ...
## $ TAHUN : num [1:170] 2017 2018 2019 2020 2021 ...
## $ Y1_UHH : num [1:170] 69.5 69.6 69.9 69.9 70 ...
## $ Y2_HSL : num [1:170] 14.1 14.3 14.3 14.3 14.4 ...
## $ Y3_PNB : num [1:170] 8957 9186 9603 9492 9572 ...
## $ X1_HEAL : num [1:170] 366 373 384 384 386 605 615 637 646 653 ...
## $ X2_EDU : num [1:170] 4182 4211 4224 4233 4231 ...
## $ X3_ROAD : num [1:170] 24019 23915 23897 23632 23632 ...
## PROVINSI TAHUN Y1_UHH Y2_HSL
## Length:170 Min. :2017 Min. :64.34 Min. :10.54
## Class :character 1st Qu.:2018 1st Qu.:68.25 1st Qu.:12.66
## Mode :character Median :2019 Median :69.92 Median :13.04
## Mean :2019 Mean :69.82 Mean :13.09
## 3rd Qu.:2020 3rd Qu.:71.29 3rd Qu.:13.48
## Max. :2021 Max. :75.04 Max. :15.64
## Y3_PNB X1_HEAL X2_EDU X3_ROAD
## Min. : 6954 Min. : 57.0 Min. : 566 Min. : 3183
## 1st Qu.: 9274 1st Qu.: 181.8 1st Qu.: 1681 1st Qu.: 7354
## Median :10488 Median : 236.5 Median : 2944 Median :13078
## Mean :10652 Mean : 318.0 Mean : 4456 Mean :15988
## 3rd Qu.:11545 3rd Qu.: 364.2 3rd Qu.: 4652 3rd Qu.:22351
## Max. :18527 Max. :1141.0 Max. :20093 Max. :42766
## vars n mean sd median trimmed mad min max
## PROVINSI* 1 170 17.50 9.84 17.50 17.50 12.60 1.00 34.00
## TAHUN 2 170 2019.00 1.42 2019.00 2019.00 1.48 2017.00 2021.00
## Y1_UHH 3 170 69.82 2.55 69.93 69.83 2.29 64.34 75.04
## Y2_HSL 4 170 13.09 0.76 13.04 13.06 0.60 10.54 15.64
## Y3_PNB 5 170 10651.68 2164.33 10488.00 10458.03 1712.40 6954.00 18527.00
## X1_HEAL 6 170 317.98 254.23 236.50 264.37 146.04 57.00 1141.00
## X2_EDU 7 170 4455.53 4911.01 2943.50 3249.22 1994.84 566.00 20093.00
## X3_ROAD 8 170 15987.53 9994.48 13078.00 14778.73 10777.76 3183.00 42766.00
## range skew kurtosis se
## PROVINSI* 33.0 0.00 -1.22 0.75
## TAHUN 4.0 0.00 -1.32 0.11
## Y1_UHH 10.7 -0.08 -0.53 0.20
## Y2_HSL 5.1 0.39 2.98 0.06
## Y3_PNB 11573.0 1.22 2.55 166.00
## X1_HEAL 1084.0 1.90 2.99 19.50
## X2_EDU 19527.0 2.31 4.40 376.66
## X3_ROAD 39583.0 0.89 0.07 766.54
# Sample size information
N_provinces <- length(unique(project$PROVINSI))
T_years <- length(unique(project$TAHUN))
n_total <- nrow(project)
data.frame(
Description = c("Number of Provinces (N)", "Number of Years (T)", "Total Observations"),
Value = c(N_provinces, T_years, n_total)
)## Description Value
## 1 Number of Provinces (N) 34
## 2 Number of Years (T) 5
## 3 Total Observations 170
PREPROCESSING
Set Data Panel
Stasionerity Test
## $X1_HEAL
##
## Augmented Dickey-Fuller Test
##
## data: newX[, i]
## Dickey-Fuller = -2.456, Lag order = 5, p-value = 0.3859
## alternative hypothesis: stationary
##
##
## $X2_EDU
##
## Augmented Dickey-Fuller Test
##
## data: newX[, i]
## Dickey-Fuller = -2.4243, Lag order = 5, p-value = 0.3991
## alternative hypothesis: stationary
##
##
## $X3_ROAD
##
## Augmented Dickey-Fuller Test
##
## data: newX[, i]
## Dickey-Fuller = -2.3074, Lag order = 5, p-value = 0.4479
## alternative hypothesis: stationary
## $Y1_UHH
##
## Augmented Dickey-Fuller Test
##
## data: newX[, i]
## Dickey-Fuller = -3.1013, Lag order = 5, p-value = 0.1165
## alternative hypothesis: stationary
##
##
## $Y2_HSL
##
## Augmented Dickey-Fuller Test
##
## data: newX[, i]
## Dickey-Fuller = -3.7945, Lag order = 5, p-value = 0.02093
## alternative hypothesis: stationary
##
##
## $Y3_PNB
##
## Augmented Dickey-Fuller Test
##
## data: newX[, i]
## Dickey-Fuller = -3.7265, Lag order = 5, p-value = 0.02427
## alternative hypothesis: stationary
Within Transformation
pdata <- project %>%
group_by(PROVINSI) %>%
mutate(
X1 = X1_HEAL - mean(X1_HEAL, na.rm = TRUE),
X2 = X2_EDU - mean(X2_EDU, na.rm = TRUE),
X3 = X3_ROAD - mean(X3_ROAD, na.rm = TRUE),
Y1 = Y1_UHH - mean(Y1_UHH, na.rm = TRUE),
Y2 = Y2_HSL - mean(Y2_HSL, na.rm = TRUE),
Y3 = Y3_PNB - mean(Y3_PNB, na.rm = TRUE)
) %>%
ungroup()
Xw <- as.matrix(pdata[, c("X1","X2","X3")])
Yw <- as.matrix(pdata[, c("Y1","Y2","Y3")])Autocorrelation Diagnostics
dw_X1 <- dwtest(Xw[,1] ~ 1)
dw_X2 <- dwtest(Xw[,2] ~ 1)
dw_X3 <- dwtest(Xw[,3] ~ 1)
dw_Y1 <- dwtest(Yw[,1] ~ 1)
dw_Y2 <- dwtest(Yw[,2] ~ 1)
dw_Y3 <- dwtest(Yw[,3] ~ 1)
dw_summary <- data.frame(
Variable = c("X1_HEAL", "X2_EDU", "X3_ROAD", "Y1_UHH", "Y2_HSL", "Y3_PNB"),
DW_Statistic = round(c(dw_X1$statistic, dw_X2$statistic, dw_X3$statistic,
dw_Y1$statistic, dw_Y2$statistic, dw_Y3$statistic), 3),
P_Value = format(c(dw_X1$p.value, dw_X2$p.value, dw_X3$p.value,
dw_Y1$p.value, dw_Y2$p.value, dw_Y3$p.value), scientific = TRUE, digits = 3),
Interpretation = ifelse(
c(dw_X1$p.value, dw_X2$p.value, dw_X3$p.value,
dw_Y1$p.value, dw_Y2$p.value, dw_Y3$p.value) < 0.05,
"Significant autocorrelation",
"No significant autocorrelation"
)
)
dw_summary## Variable DW_Statistic P_Value Interpretation
## 1 X1_HEAL 1.653 1.13e-02 Significant autocorrelation
## 2 X2_EDU 2.117 7.78e-01 No significant autocorrelation
## 3 X3_ROAD 1.379 2.32e-05 Significant autocorrelation
## 4 Y1_UHH 1.781 7.51e-02 No significant autocorrelation
## 5 Y2_HSL 1.945 3.59e-01 No significant autocorrelation
## 6 Y3_PNB 2.139 8.19e-01 No significant autocorrelation
Handling Autocorrelation
pdata_diff <- pdata %>%
group_by(PROVINSI) %>%
arrange(TAHUN) %>%
mutate(
X1_diff = c(NA, diff(X1)),
X2_diff = c(NA, diff(X2)),
X3_diff = c(NA, diff(X3)),
Y1_diff = c(NA, diff(Y1)),
Y2_diff = c(NA, diff(Y2)),
Y3_diff = c(NA, diff(Y3))
) %>%
ungroup() %>%
na.omit()
Xw_diff <- as.matrix(pdata_diff[, c("X1_diff","X2_diff","X3_diff")])
Yw_diff <- as.matrix(pdata_diff[, c("Y1_diff","Y2_diff","Y3_diff")])
dwtest(Xw_diff[,1] ~ 1)##
## Durbin-Watson test
##
## data: Xw_diff[, 1] ~ 1
## DW = 1.8694, p-value = 0.2214
## alternative hypothesis: true autocorrelation is greater than 0
##
## Durbin-Watson test
##
## data: Xw_diff[, 3] ~ 1
## DW = 2.1022, p-value = 0.7259
## alternative hypothesis: true autocorrelation is greater than 0
Standardized Data
## [1] 136
## [1] 3
## [1] 3
Correlations
## $Xcor
## X1_diff X2_diff X3_diff
## X1_diff 1.00000000 0.03773114 0.11889535
## X2_diff 0.03773114 1.00000000 -0.01756292
## X3_diff 0.11889535 -0.01756292 1.00000000
##
## $Ycor
## Y1_diff Y2_diff Y3_diff
## Y1_diff 1.0000000 -0.0593192 0.2632207
## Y2_diff -0.0593192 1.0000000 -0.1173469
## Y3_diff 0.2632207 -0.1173469 1.0000000
##
## $XYcor
## X1_diff X2_diff X3_diff Y1_diff Y2_diff
## X1_diff 1.00000000 0.037731138 0.11889535 0.282261358 -0.03041802
## X2_diff 0.03773114 1.000000000 -0.01756292 -0.002229886 -0.07883433
## X3_diff 0.11889535 -0.017562919 1.00000000 -0.074860910 -0.07311742
## Y1_diff 0.28226136 -0.002229886 -0.07486091 1.000000000 -0.05931920
## Y2_diff -0.03041802 -0.078834331 -0.07311742 -0.059319202 1.00000000
## Y3_diff 0.18703003 0.096192329 -0.07553553 0.263220673 -0.11734687
## Y3_diff
## X1_diff 0.18703003
## X2_diff 0.09619233
## X3_diff -0.07553553
## Y1_diff 0.26322067
## Y2_diff -0.11734687
## Y3_diff 1.00000000
par(mfrow = c(1, 2))
corrplot(cor(Xs), method = "color", type = "upper",
title = "Correlation within X", mar = c(0,0,2,0))
corrplot(cor(Ys), method = "color", type = "upper",
title = "Correlation within Y", mar = c(0,0,2,0))MODELLING (CANONICAL CORRELATION ANALYSIS)
CCA Model
Sx <- cov(Xs)
Sy <- cov(Ys)
Sxy <- cov(Xs,Ys)
Sxeig <- eigen(Sx, symmetric = TRUE)
Sxisqrt <- Sxeig$vectors %*% diag(1/sqrt(Sxeig$values)) %*% t(Sxeig$vectors)
Syeig <- eigen(Sy, symmetric=TRUE)
Syisqrt <- Syeig$vectors %*% diag(1/sqrt(Syeig$values)) %*% t(Syeig$vectors)
Xmat <- Sxisqrt %*% Sxy %*% solve(Sy) %*% t(Sxy) %*% Sxisqrt
Ymat <- Syisqrt %*% t(Sxy) %*% solve(Sx) %*% Sxy %*% Syisqrt
Xeig <- eigen(Xmat, symmetric=TRUE)
Yeig <- eigen(Ymat, symmetric=TRUE)Canonical Correlations
## [1] 0.33371213 0.12359955 0.07189337
## [1] 0.33371213 0.12359955 0.07189337
Compare Linear Combinations
## [,1] [,2] [,3]
## [1,] 0.95709340 0.02399717 0.3152295
## [2,] 0.06021102 0.92345200 -0.3814859
## [3,] -0.41057477 0.39437311 0.8311066
## [,1] [,2] [,3]
## [1,] 0.78811563 -0.3851478 0.5531048
## [2,] 0.08800129 -0.7897110 -0.6192484
## [3,] 0.44957354 0.5226913 -0.7818220
Bartlett’s Test
BT <--(n-1-(1/2)*(p+q+1))*sum(log(1-rho^2))
df_BT <- p * q
p_BT <- pchisq(q = BT, df = df_BT, lower.tail = FALSE)
bartlett_result <- data.frame(
Test = "Bartlett",
Statistic = BT,
DF = df_BT,
P_Value = p_BT
)
bartlett_result## Test Statistic DF P_Value
## 1 Bartlett 18.2317 9 0.03257856
Multivariate Statistics Test
## Wilks' Lambda, using F-approximation (Rao's F):
## stat approx df1 df2 p.value
## 1 to 3: 0.8705378 2.0617600 9 316.5364 0.03259496
## 2 to 3: 0.9796335 0.6773688 4 262.0000 0.60819638
## 3 to 3: 0.9948313 0.6858075 1 132.0000 0.40908888
## Hotelling-Lawley Trace, using F-approximation:
## stat approx df1 df2 p.value
## 1 to 3: 0.146029244 2.0876773 9 386 0.02968296
## 2 to 3: 0.020709362 0.6765058 4 392 0.60858607
## 3 to 3: 0.005195511 0.6892711 1 398 0.40691012
## Pillai-Bartlett Trace, using F-approximation:
## stat approx df1 df2 p.value
## 1 to 3: 0.131809291 2.0220443 9 396 0.03577064
## 2 to 3: 0.020445505 0.6896243 4 402 0.59946849
## 3 to 3: 0.005168657 0.7041506 1 408 0.40188393
## Roy's Largest Root, using F-approximation:
## stat approx df1 df2 p.value
## 1 to 1: 0.1113638 5.514075 3 132 0.001341907
##
## F statistic for Roy's Greatest Root is an upper bound.
Canonical Scores
U_scores <- Xs %*% Ahat
V_scores <- Ys %*% Bhat
canonical_data <- data.frame(U1 = U_scores[,1],
V1 = V_scores[,1])
head(canonical_data)## U1 V1
## 1 0.6278671 -0.1562521
## 2 1.5114141 0.8084030
## 3 0.6999946 0.6448435
## 4 0.2103761 0.4587145
## 5 0.9522456 0.1805180
## 6 2.1627151 0.8210725
Canonical Loadings
loadings_X <- cor(Xs, U_scores)
loadings_Y <- cor(Ys, V_scores)
rownames(loadings_X) <- c("X1_HEAL", "X2_EDU", "X3_ROAD")
rownames(loadings_Y) <- c("Y1_UHH", "Y2_HSL", "Y3_PNB")
list(
Loadings_X = loadings_X,
Loadings_Y = loadings_Y
)## $Loadings_X
## [,1] [,2] [,3]
## X1_HEAL 0.9105498 0.1057292 0.3996503
## X2_EDU 0.1035341 0.9174311 -0.3841886
## X3_ROAD -0.2978383 0.3810078 0.8752859
##
## $Loadings_Y
## [,1] [,2] [,3]
## Y1_UHH 0.90123251 -0.2007196 0.3840464
## Y2_HSL -0.01150515 -0.8282006 -0.5603137
## Y3_PNB 0.64669519 0.5139825 -0.5635666
MODELLING (K-MEANS)
Summary and Visualization
## U1 V1
## Min. :-3.9698 Min. :-2.2003
## 1st Qu.:-0.5818 1st Qu.:-0.7798
## Median :-0.1799 Median :-0.1237
## Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.6071 3rd Qu.: 0.7754
## Max. : 3.9858 Max. : 2.3870
## vars n mean sd median trimmed mad min max range skew kurtosis se
## U1 1 136 0 1 -0.18 -0.06 0.77 -3.97 3.99 7.96 0.53 3.12 0.09
## V1 2 136 0 1 -0.12 -0.04 1.13 -2.20 2.39 4.59 0.31 -0.78 0.09
Elbow method
fviz_nbclust(canonical_data, kmeans, method = "wss") +
geom_vline(xintercept = 3, linetype = 2) +
labs(subtitle = "Elbow method") K-Means Final Model
## K-means clustering with 3 clusters of sizes 39, 26, 71
##
## Cluster means:
## U1 V1
## 1 -0.1856117 1.0412927
## 2 1.4953626 0.5791403
## 3 -0.4456419 -0.7840572
##
## Clustering vector:
## [1] 2 2 2 1 2 2 1 1 1 3 3 2 2 3 2 2 1 1 1 1 3 1 2 3 1 1 2 2 1 1 2 1 2 1 2 2 1
## [38] 2 2 2 1 1 1 1 3 1 3 3 1 1 1 1 2 1 3 1 1 3 1 1 1 1 1 2 1 2 2 1 3 3 3 3 3 3
## [75] 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 3 3 1 3 3 3 1 3 3 3 3 3 3 3 3 3 3 3 3 3
## [112] 3 3 1 3 3 3 3 3 3 2 3 3 3 1 3 3 3 3 3 3 3 3 3 2 3
##
## Within cluster sum of squares by cluster:
## [1] 24.64165 29.17948 47.94127
## (between_SS / total_SS = 62.3 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
Model Evaluation
evaluation <- data.frame(
Metric = c("Total Sum of Squares (TSS)",
"Within-Cluster Sum of Squares (WCSS)",
"Between-Cluster Sum of Squares (BSS)",
"BSS/TSS Ratio",
"Average Silhouette Width"),
Value = c(km_final$totss,
km_final$tot.withinss,
km_final$betweenss,
km_final$betweenss / km_final$totss,
mean(silhouette(km_final$cluster, dist(canonical_data))[, 3]))
)
evaluation## Metric Value
## 1 Total Sum of Squares (TSS) 270.0000000
## 2 Within-Cluster Sum of Squares (WCSS) 101.7624047
## 3 Between-Cluster Sum of Squares (BSS) 168.2375953
## 4 BSS/TSS Ratio 0.6231022
## 5 Average Silhouette Width 0.4388055
Visualization
fviz_cluster(
km_final,
data = canonical_data,
ellipse.type = "euclid",
star.plot = TRUE,
repel = TRUE,
ggtheme = theme_minimal()
)fviz_cluster(
km_final,
data = canonical_data,
geom = "point",
ellipse.type = "norm",
show.clust.cent = TRUE,
main = "Cluster Plot (K-Means on Canonical Variates)"
)