Clustering
library(mclust)
## Package 'mclust' version 6.1.2
## Type 'citation("mclust")' for citing this R package in publications.
##
## Attaching package: 'mclust'
## The following object is masked from 'package:dplyr':
##
## count
# Fit a Gaussian mixture model (mclust) separately for every region.
#
# `regions` is split by `region_name`; within each region the indicators are
# averaged per country and the resulting country-level table is clustered.
# The value is a list with one element per region, each a list holding:
#   region              - the region name
#   mod1                - the fitted Mclust model
#   BIC                 - the mclustBIC table the model was selected from
#   classification      - the cluster label of every country
#   cluster_assignments - tibble of country_name / cluster, sorted by cluster
# For regions with fewer than three usable countries every element except
# `region` is NULL.
results <- regions %>%
  group_by(region_name) %>%
  group_split() %>%
  lapply(function(region_df) {
    region <- unique(region_df$region_name)

    # Candidate numbers of mixture components. The same range is passed to
    # mclustBIC() and Mclust() so that the BIC plot/summary describes exactly
    # the models the final fit may choose from. (Previously the BIC table was
    # computed over mclust's default range while the fit was limited to 1:5,
    # so the reported "best BIC" could name a model that was never eligible.)
    g_range <- 1:5

    cat("\n====================================================\n")
    cat("Region:", region, "\n")
    cat("====================================================\n")

    # Average across all years per country, then drop countries that lack a
    # value for any indicator
    X_full <- region_df %>%
      group_by(country_name) %>%
      summarise(
        democratic_stability = mean(democratic_stability, na.rm = TRUE),
        Tscore1519_ML = mean(Tscore1519_ML, na.rm = TRUE),
        secondary_enrolment = mean(secondary_enrolment, na.rm = TRUE),
        tertiary_enrolment = mean(tertiary_enrolment, na.rm = TRUE),
        primary_enrolment = mean(primary_enrolment, na.rm = TRUE)
      ) %>%
      na.omit()

    # Numeric only for Mclust; row order still matches X_full, which is what
    # lets the classification be attached to the country names further down
    X <- X_full %>% select(-country_name) %>% as.data.frame()

    if (nrow(X) < 3) {
      cat("Skipping region - not enough countries to cluster.\n")
      return(list(
        region = region,
        mod1 = NULL,
        BIC = NULL,
        classification = NULL,
        cluster_assignments = NULL
      ))
    }

    cat("Sample size after removing NAs:", nrow(X), "\n")

    # Compute and plot BIC over the candidate component range
    BIC <- mclustBIC(X, G = g_range)
    plot(BIC, main = paste("BIC -", region))
    cat("\nBIC Summary:\n")
    print(summary(BIC))

    # Fit model using the BIC results computed above (same component range)
    mod1 <- Mclust(X, x = BIC, G = g_range)
    cat("\nModel Summary:\n")
    print(summary(mod1, parameters = TRUE))

    # Plot clPairs using mod1 classification
    clPairs(mod1$data, classification = mod1$classification,
            main = paste("clPairs -", region))

    # Plot classification and uncertainty
    plot(mod1, what = "classification",
         main = paste("Classification -", region))
    plot(mod1, what = "uncertainty",
         main = paste("Uncertainty -", region))

    # Classification table
    cat("\nClassification Table:\n")
    print(table(mod1$classification))

    # Country cluster assignments
    cluster_assignments <- X_full %>%
      mutate(cluster = mod1$classification) %>%
      arrange(cluster) %>%
      select(country_name, cluster)
    cat("\nCountry Cluster Assignments:\n")
    print(cluster_assignments)

    list(
      region = region,
      mod1 = mod1,
      BIC = BIC,
      classification = mod1$classification,
      cluster_assignments = cluster_assignments
    )
  })
##
## ====================================================
## Region: Asia and Oceania
## ====================================================
## Sample size after removing NAs: 6
##
## BIC Summary:
## Best BIC values:
## EEI,5 EEI,4 EEI,3
## BIC -145.6369 -205.04567 -239.60404
## BIC diff 0.0000 -59.40878 -93.96715
##
## Model Summary:
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust EEI (diagonal, equal volume and shape) model with 5 components:
##
## log-likelihood n df BIC ICL
## -42.35853 6 34 -145.6369 -145.6369
##
## Clustering table:
## 1 2 3 4 5
## 1 2 1 1 1
##
## Mixing probabilities:
## 1 2 3 4 5
## 0.1666667 0.3333333 0.1666667 0.1666667 0.1666667
##
## Means:
## [,1] [,2] [,3] [,4] [,5]
## democratic_stability 1.944444 6.638889 2.722222 2.285714 3.00000
## Tscore1519_ML 312.500000 389.082540 462.577778 602.657143 442.64444
## secondary_enrolment 48.109504 82.989224 77.641026 106.734276 101.34951
## tertiary_enrolment 11.788177 31.048868 37.545810 90.996107 47.24416
## primary_enrolment 114.588223 102.567130 96.247872 101.435440 102.72289
##
## Variances:
## [,,1]
## democratic_stability Tscore1519_ML secondary_enrolment
## democratic_stability 0.006430041 0.0000 0.00000000
## Tscore1519_ML 0.000000000 122.1091 0.00000000
## secondary_enrolment 0.000000000 0.0000 0.02762234
## tertiary_enrolment 0.000000000 0.0000 0.00000000
## primary_enrolment 0.000000000 0.0000 0.00000000
## tertiary_enrolment primary_enrolment
## democratic_stability 0.0000000 0.000000
## Tscore1519_ML 0.0000000 0.000000
## secondary_enrolment 0.0000000 0.000000
## tertiary_enrolment 0.2788292 0.000000
## primary_enrolment 0.0000000 6.799506
## [,,2]
## democratic_stability Tscore1519_ML secondary_enrolment
## democratic_stability 0.006430041 0.0000 0.00000000
## Tscore1519_ML 0.000000000 122.1091 0.00000000
## secondary_enrolment 0.000000000 0.0000 0.02762234
## tertiary_enrolment 0.000000000 0.0000 0.00000000
## primary_enrolment 0.000000000 0.0000 0.00000000
## tertiary_enrolment primary_enrolment
## democratic_stability 0.0000000 0.000000
## Tscore1519_ML 0.0000000 0.000000
## secondary_enrolment 0.0000000 0.000000
## tertiary_enrolment 0.2788292 0.000000
## primary_enrolment 0.0000000 6.799506
## [,,3]
## democratic_stability Tscore1519_ML secondary_enrolment
## democratic_stability 0.006430041 0.0000 0.00000000
## Tscore1519_ML 0.000000000 122.1091 0.00000000
## secondary_enrolment 0.000000000 0.0000 0.02762234
## tertiary_enrolment 0.000000000 0.0000 0.00000000
## primary_enrolment 0.000000000 0.0000 0.00000000
## tertiary_enrolment primary_enrolment
## democratic_stability 0.0000000 0.000000
## Tscore1519_ML 0.0000000 0.000000
## secondary_enrolment 0.0000000 0.000000
## tertiary_enrolment 0.2788292 0.000000
## primary_enrolment 0.0000000 6.799506
## [,,4]
## democratic_stability Tscore1519_ML secondary_enrolment
## democratic_stability 0.006430041 0.0000 0.00000000
## Tscore1519_ML 0.000000000 122.1091 0.00000000
## secondary_enrolment 0.000000000 0.0000 0.02762234
## tertiary_enrolment 0.000000000 0.0000 0.00000000
## primary_enrolment 0.000000000 0.0000 0.00000000
## tertiary_enrolment primary_enrolment
## democratic_stability 0.0000000 0.000000
## Tscore1519_ML 0.0000000 0.000000
## secondary_enrolment 0.0000000 0.000000
## tertiary_enrolment 0.2788292 0.000000
## primary_enrolment 0.0000000 6.799506
## [,,5]
## democratic_stability Tscore1519_ML secondary_enrolment
## democratic_stability 0.006430041 0.0000 0.00000000
## Tscore1519_ML 0.000000000 122.1091 0.00000000
## secondary_enrolment 0.000000000 0.0000 0.02762234
## tertiary_enrolment 0.000000000 0.0000 0.00000000
## primary_enrolment 0.000000000 0.0000 0.00000000
## tertiary_enrolment primary_enrolment
## democratic_stability 0.0000000 0.000000
## Tscore1519_ML 0.0000000 0.000000
## secondary_enrolment 0.0000000 0.000000
## tertiary_enrolment 0.2788292 0.000000
## primary_enrolment 0.0000000 6.799506
##
## Classification Table:
##
## 1 2 3 4 5
## 1 2 1 1 1
##
## Country Cluster Assignments:
## # A tibble: 6 × 2
## country_name cluster
## <chr> <dbl>
## 1 Cambodia 1
## 2 Indonesia 2
## 3 Philippines 2
## 4 Malaysia 3
## 5 Singapore 4
## 6 Thailand 5
##
## ====================================================
## Region: East-Central and Southeast Europe
## ====================================================
## Sample size after removing NAs: 15
##
## BIC Summary:
## Best BIC values:
## EEV,2 EEE,1 EEV,1
## BIC -478.5586 -481.538552 -481.538552
## BIC diff 0.0000 -2.979993 -2.979993
##
## Model Summary:
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust EEV (ellipsoidal, equal volume and shape) model with 2 components:
##
## log-likelihood n df BIC ICL
## -190.5344 15 36 -478.5586 -478.5586
##
## Clustering table:
## 1 2
## 5 10
##
## Mixing probabilities:
## 1 2
## 0.3333313 0.6666687
##
## Means:
## [,1] [,2]
## democratic_stability 6.978385 8.877071
## Tscore1519_ML 436.336377 507.323405
## secondary_enrolment 90.160655 102.974701
## tertiary_enrolment 50.677004 69.812412
## primary_enrolment 98.664639 99.759735
##
## Variances:
## [,,1]
## democratic_stability Tscore1519_ML secondary_enrolment
## democratic_stability 1.4862558 7.268880 1.678104
## Tscore1519_ML 7.2688802 480.422892 5.322323
## secondary_enrolment 1.6781041 5.322323 16.493748
## tertiary_enrolment 4.7066624 124.504493 20.918179
## primary_enrolment 0.4092304 -47.037136 14.964136
## tertiary_enrolment primary_enrolment
## democratic_stability 4.706662 0.4092304
## Tscore1519_ML 124.504493 -47.0371360
## secondary_enrolment 20.918179 14.9641362
## tertiary_enrolment 58.533882 9.1202454
## primary_enrolment 9.120245 24.5899404
## [,,2]
## democratic_stability Tscore1519_ML secondary_enrolment
## democratic_stability 0.7837510 11.021888 2.7998395
## Tscore1519_ML 11.0218878 501.060439 75.3354086
## secondary_enrolment 2.7998395 75.335409 17.6902949
## tertiary_enrolment 5.4374488 48.094533 20.9619090
## primary_enrolment 0.2057687 -5.318066 -0.9159219
## tertiary_enrolment primary_enrolment
## democratic_stability 5.4374488 0.2057687
## Tscore1519_ML 48.0945332 -5.3180659
## secondary_enrolment 20.9619090 -0.9159219
## tertiary_enrolment 60.2545373 0.3786191
## primary_enrolment 0.3786191 1.7376965
##
## Classification Table:
##
## 1 2
## 5 10
##
## Country Cluster Assignments:
## # A tibble: 15 × 2
## country_name cluster
## <chr> <dbl>
## 1 Albania 1
## 2 Bosnia and Herzegovina 1
## 3 Montenegro 1
## 4 North Macedonia 1
## 5 Romania 1
## 6 Bulgaria 2
## 7 Croatia 2
## 8 Czechia 2
## 9 Estonia 2
## 10 Hungary 2
## 11 Latvia 2
## 12 Lithuania 2
## 13 Poland 2
## 14 Serbia 2
## 15 Slovenia 2
##
## ====================================================
## Region: Latin America and the Caribbean
## ====================================================
## Sample size after removing NAs: 14
##
## BIC Summary:
## Best BIC values:
## VEV,3 EEV,3 VVV,2
## BIC -367.5386 -390.70649 -466.63477
## BIC diff 0.0000 -23.16786 -99.09614
##
## Model Summary:
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust VEV (ellipsoidal, equal shape) model with 3 components:
##
## log-likelihood n df BIC ICL
## -112.5148 14 54 -367.5386 -367.5386
##
## Clustering table:
## 1 2 3
## 5 3 6
##
## Mixing probabilities:
## 1 2 3
## 0.3571429 0.2142857 0.4285714
##
## Means:
## [,1] [,2] [,3]
## democratic_stability 7.133333 9.666667 7.232639
## Tscore1519_ML 415.033333 441.677778 352.013657
## secondary_enrolment 99.339624 105.287939 72.898819
## tertiary_enrolment 54.356727 62.098267 34.520233
## primary_enrolment 110.649654 106.216315 103.587744
##
## Variances:
## [,,1]
## democratic_stability Tscore1519_ML secondary_enrolment
## democratic_stability 0.1860756 0.5391588 1.150384
## Tscore1519_ML 0.5391588 146.4734368 -20.980199
## secondary_enrolment 1.1503837 -20.9801990 13.320491
## tertiary_enrolment 3.4514795 -124.7452995 60.862202
## primary_enrolment -0.6216770 -15.0779348 2.956047
## tertiary_enrolment primary_enrolment
## democratic_stability 3.451479 -0.621677
## Tscore1519_ML -124.745300 -15.077935
## secondary_enrolment 60.862202 2.956047
## tertiary_enrolment 333.992995 32.854791
## primary_enrolment 32.854791 14.897652
## [,,2]
## democratic_stability Tscore1519_ML secondary_enrolment
## democratic_stability 0.2929897 0.5747473 -0.5917133
## Tscore1519_ML 0.5747473 12.7829092 2.3478377
## secondary_enrolment -0.5917133 2.3478377 11.5179949
## tertiary_enrolment 1.2855994 -4.1890180 -23.5776681
## primary_enrolment 0.1105297 3.4769737 4.5997456
## tertiary_enrolment primary_enrolment
## democratic_stability 1.285599 0.1105297
## Tscore1519_ML -4.189018 3.4769737
## secondary_enrolment -23.577668 4.5997456
## tertiary_enrolment 48.367076 -8.9177814
## primary_enrolment -8.917781 4.2703932
## [,,3]
## democratic_stability Tscore1519_ML secondary_enrolment
## democratic_stability 2.980059 56.8040 22.27877
## Tscore1519_ML 56.803996 1503.2093 285.22843
## secondary_enrolment 22.278769 285.2284 245.45262
## tertiary_enrolment 21.540774 371.2609 245.09230
## primary_enrolment -5.934520 -130.1132 -38.02314
## tertiary_enrolment primary_enrolment
## democratic_stability 21.54077 -5.93452
## Tscore1519_ML 371.26094 -130.11324
## secondary_enrolment 245.09230 -38.02314
## tertiary_enrolment 342.99238 -21.16743
## primary_enrolment -21.16743 24.36919
##
## Classification Table:
##
## 1 2 3
## 5 3 6
##
## Country Cluster Assignments:
## # A tibble: 14 × 2
## country_name cluster
## <chr> <dbl>
## 1 Argentina 1
## 2 Brazil 1
## 3 Colombia 1
## 4 Mexico 1
## 5 Peru 1
## 6 Chile 2
## 7 Costa Rica 2
## 8 Uruguay 2
## 9 Dominican Republic 3
## 10 El Salvador 3
## 11 Guatemala 3
## 12 Jamaica 3
## 13 Panama 3
## 14 Paraguay 3
##
## ====================================================
## Region: Middle East and North Africa
## ====================================================
## Sample size after removing NAs: 10
##
## BIC Summary:
## Best BIC values:
## EEI,9 EEI,8 VEV,2
## BIC -283.2772 -320.49906 -331.5879
## BIC diff 0.0000 -37.22182 -48.3107
##
## Model Summary:
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust VEV (ellipsoidal, equal shape) model with 2 components:
##
## log-likelihood n df BIC ICL
## -123.1961 10 37 -331.5879 -331.5879
##
## Clustering table:
## 1 2
## 6 4
##
## Mixing probabilities:
## 1 2
## 0.6 0.4
##
## Means:
## [,1] [,2]
## democratic_stability 2.263889 1.943452
## Tscore1519_ML 411.689352 429.417262
## secondary_enrolment 92.544838 97.120312
## tertiary_enrolment 44.372298 34.508187
## primary_enrolment 107.330001 99.091208
##
## Variances:
## [,,1]
## democratic_stability Tscore1519_ML secondary_enrolment
## democratic_stability 0.6902561 7.567771 -4.48294
## Tscore1519_ML 7.5677715 1139.847357 177.50411
## secondary_enrolment -4.4829396 177.504107 213.24831
## tertiary_enrolment -6.6180085 183.056580 142.16844
## primary_enrolment -0.5786567 -54.429099 27.89758
## tertiary_enrolment primary_enrolment
## democratic_stability -6.618009 -0.5786567
## Tscore1519_ML 183.056580 -54.4290993
## secondary_enrolment 142.168437 27.8975774
## tertiary_enrolment 207.771675 -35.2173204
## primary_enrolment -35.217320 35.5804021
## [,,2]
## democratic_stability Tscore1519_ML secondary_enrolment
## democratic_stability 0.01427416 -0.6186224 -0.2540362
## Tscore1519_ML -0.61862238 87.6481279 -5.1021134
## secondary_enrolment -0.25403619 -5.1021134 13.1041154
## tertiary_enrolment -0.39882467 50.1450091 -11.4765717
## primary_enrolment -0.12689213 1.1032936 6.1636710
## tertiary_enrolment primary_enrolment
## democratic_stability -0.3988247 -0.1268921
## Tscore1519_ML 50.1450091 1.1032936
## secondary_enrolment -11.4765717 6.1636710
## tertiary_enrolment 63.5920104 -7.9975525
## primary_enrolment -7.9975525 4.1121541
##
## Classification Table:
##
## 1 2
## 6 4
##
## Country Cluster Assignments:
## # A tibble: 10 × 2
## country_name cluster
## <chr> <dbl>
## 1 Algeria 1
## 2 Kuwait 1
## 3 Morocco 1
## 4 Saudi Arabia 1
## 5 Tunisia 1
## 6 United Arab Emirates 1
## 7 Bahrain 2
## 8 Jordan 2
## 9 Oman 2
## 10 Qatar 2
##
## ====================================================
## Region: Post-Soviet Eurasia
## ====================================================
## Sample size after removing NAs: 9
##
## BIC Summary:
## Best BIC values:
## EEI,8 EEI,7 EII,8
## BIC -240.2108 -249.109441 -270.82139
## BIC diff 0.0000 -8.898595 -30.61055
##
## Model Summary:
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust EEE (ellipsoidal, equal volume, shape and orientation) model with 4
## components:
##
## log-likelihood n df BIC ICL
## -102.2422 9 38 -287.9789 -287.9789
##
## Clustering table:
## 1 2 3 4
## 3 3 2 1
##
## Mixing probabilities:
## 1 2 3 4
## 0.3333333 0.3333333 0.2222222 0.1111111
##
## Means:
## [,1] [,2] [,3] [,4]
## democratic_stability 2.648148 5.407407 6.772222 1.444444
## Tscore1519_ML 464.629630 465.588889 437.633333 362.444444
## secondary_enrolment 93.000514 97.316690 103.330735 91.802652
## tertiary_enrolment 43.201334 74.343612 54.310713 12.281159
## primary_enrolment 98.537390 98.525838 105.894496 96.631519
##
## Variances:
## [,,1]
## democratic_stability Tscore1519_ML secondary_enrolment
## democratic_stability 2.11473022 -9.462492 -4.314540
## Tscore1519_ML -9.46249200 215.614248 36.792105
## secondary_enrolment -4.31454039 36.792105 15.847939
## tertiary_enrolment -3.54380694 127.905949 20.281237
## primary_enrolment 0.08503333 -14.520186 -0.157502
## tertiary_enrolment primary_enrolment
## democratic_stability -3.543807 0.08503333
## Tscore1519_ML 127.905949 -14.52018590
## secondary_enrolment 20.281237 -0.15750201
## tertiary_enrolment 82.231571 -7.53107421
## primary_enrolment -7.531074 2.11146786
## [,,2]
## democratic_stability Tscore1519_ML secondary_enrolment
## democratic_stability 2.11473022 -9.462492 -4.314540
## Tscore1519_ML -9.46249200 215.614248 36.792105
## secondary_enrolment -4.31454039 36.792105 15.847939
## tertiary_enrolment -3.54380694 127.905949 20.281237
## primary_enrolment 0.08503333 -14.520186 -0.157502
## tertiary_enrolment primary_enrolment
## democratic_stability -3.543807 0.08503333
## Tscore1519_ML 127.905949 -14.52018590
## secondary_enrolment 20.281237 -0.15750201
## tertiary_enrolment 82.231571 -7.53107421
## primary_enrolment -7.531074 2.11146786
## [,,3]
## democratic_stability Tscore1519_ML secondary_enrolment
## democratic_stability 2.11473022 -9.462492 -4.314540
## Tscore1519_ML -9.46249200 215.614248 36.792105
## secondary_enrolment -4.31454039 36.792105 15.847939
## tertiary_enrolment -3.54380694 127.905949 20.281237
## primary_enrolment 0.08503333 -14.520186 -0.157502
## tertiary_enrolment primary_enrolment
## democratic_stability -3.543807 0.08503333
## Tscore1519_ML 127.905949 -14.52018590
## secondary_enrolment 20.281237 -0.15750201
## tertiary_enrolment 82.231571 -7.53107421
## primary_enrolment -7.531074 2.11146786
## [,,4]
## democratic_stability Tscore1519_ML secondary_enrolment
## democratic_stability 2.11473022 -9.462492 -4.314540
## Tscore1519_ML -9.46249200 215.614248 36.792105
## secondary_enrolment -4.31454039 36.792105 15.847939
## tertiary_enrolment -3.54380694 127.905949 20.281237
## primary_enrolment 0.08503333 -14.520186 -0.157502
## tertiary_enrolment primary_enrolment
## democratic_stability -3.543807 0.08503333
## Tscore1519_ML 127.905949 -14.52018590
## secondary_enrolment 20.281237 -0.15750201
## tertiary_enrolment 82.231571 -7.53107421
## primary_enrolment -7.531074 2.11146786
##
## Classification Table:
##
## 1 2 3 4
## 3 3 2 1
##
## Country Cluster Assignments:
## # A tibble: 9 × 2
## country_name cluster
## <chr> <dbl>
## 1 Armenia 1
## 2 Azerbaijan 1
## 3 Kazakhstan 1
## 4 Belarus 2
## 5 Mongolia 2
## 6 Ukraine 2
## 7 Georgia 3
## 8 Moldova 3
## 9 Uzbekistan 4
##
## ====================================================
## Region: Southern Africa and Eastern Africa
## ====================================================
## Sample size after removing NAs: 3
##
## BIC Summary:
## Best BIC values:
## EEI,2 EEI,1 EVI,1
## BIC -94.67926 -107.7686 -107.7686
## BIC diff 0.00000 -13.0893 -13.0893
##
## Model Summary:
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust EEI (diagonal, equal volume and shape) model with 2 components:
##
## log-likelihood n df BIC ICL
## -38.55073 3 16 -94.67926 -94.67926
##
## Clustering table:
## 1 2
## 2 1
##
## Mixing probabilities:
## 1 2
## 0.6666667 0.3333333
##
## Means:
## [,1] [,2]
## democratic_stability 8.361111 9.18750
## Tscore1519_ML 317.461111 444.10000
## secondary_enrolment 87.710918 90.54548
## tertiary_enrolment 19.341295 35.68528
## primary_enrolment 105.579727 98.52711
##
## Variances:
## [,,1]
## democratic_stability Tscore1519_ML secondary_enrolment
## democratic_stability 0.1486626 0.000 0.00000
## Tscore1519_ML 0.0000000 1460.507 0.00000
## secondary_enrolment 0.0000000 0.000 42.24365
## tertiary_enrolment 0.0000000 0.000 0.00000
## primary_enrolment 0.0000000 0.000 0.00000
## tertiary_enrolment primary_enrolment
## democratic_stability 0.0000000 0.000000
## Tscore1519_ML 0.0000000 0.000000
## secondary_enrolment 0.0000000 0.000000
## tertiary_enrolment 0.4557051 0.000000
## primary_enrolment 0.0000000 6.686333
## [,,2]
## democratic_stability Tscore1519_ML secondary_enrolment
## democratic_stability 0.1486626 0.000 0.00000
## Tscore1519_ML 0.0000000 1460.507 0.00000
## secondary_enrolment 0.0000000 0.000 42.24365
## tertiary_enrolment 0.0000000 0.000 0.00000
## primary_enrolment 0.0000000 0.000 0.00000
## tertiary_enrolment primary_enrolment
## democratic_stability 0.0000000 0.000000
## Tscore1519_ML 0.0000000 0.000000
## secondary_enrolment 0.0000000 0.000000
## tertiary_enrolment 0.4557051 0.000000
## primary_enrolment 0.0000000 6.686333
##
## Classification Table:
##
## 1 2
## 2 1
##
## Country Cluster Assignments:
## # A tibble: 3 × 2
## country_name cluster
## <chr> <dbl>
## 1 Botswana 1
## 2 South Africa 1
## 3 Mauritius 2
##
## ====================================================
## Region: West and Central Africa
## ====================================================
## Skipping region - not enough countries to cluster.
# Name the list by region.
# vapply() guarantees a character vector with exactly one name per entry;
# sapply() would silently change its return type if an entry ever lacked a
# single region value.
names(results) <- vapply(
  results,
  function(res) as.character(res[["region"]]),
  character(1)
)
# Print country assignments for all regions
for (r in names(results)) {
  cat("\n====================================================\n")
  cat("Country assignments for:", r, "\n")
  cat("====================================================\n")
  assignments <- results[[r]]$cluster_assignments
  if (is.null(assignments)) {
    # Entries without assignments come from regions that were not clustered;
    # say so explicitly instead of printing a bare NULL
    cat("No cluster assignments available (region was not clustered).\n")
  } else {
    print(assignments)
  }
}
##
## ====================================================
## Country assignments for: Asia and Oceania
## ====================================================
## # A tibble: 6 × 2
## country_name cluster
## <chr> <dbl>
## 1 Cambodia 1
## 2 Indonesia 2
## 3 Philippines 2
## 4 Malaysia 3
## 5 Singapore 4
## 6 Thailand 5
##
## ====================================================
## Country assignments for: East-Central and Southeast Europe
## ====================================================
## # A tibble: 15 × 2
## country_name cluster
## <chr> <dbl>
## 1 Albania 1
## 2 Bosnia and Herzegovina 1
## 3 Montenegro 1
## 4 North Macedonia 1
## 5 Romania 1
## 6 Bulgaria 2
## 7 Croatia 2
## 8 Czechia 2
## 9 Estonia 2
## 10 Hungary 2
## 11 Latvia 2
## 12 Lithuania 2
## 13 Poland 2
## 14 Serbia 2
## 15 Slovenia 2
##
## ====================================================
## Country assignments for: Latin America and the Caribbean
## ====================================================
## # A tibble: 14 × 2
## country_name cluster
## <chr> <dbl>
## 1 Argentina 1
## 2 Brazil 1
## 3 Colombia 1
## 4 Mexico 1
## 5 Peru 1
## 6 Chile 2
## 7 Costa Rica 2
## 8 Uruguay 2
## 9 Dominican Republic 3
## 10 El Salvador 3
## 11 Guatemala 3
## 12 Jamaica 3
## 13 Panama 3
## 14 Paraguay 3
##
## ====================================================
## Country assignments for: Middle East and North Africa
## ====================================================
## # A tibble: 10 × 2
## country_name cluster
## <chr> <dbl>
## 1 Algeria 1
## 2 Kuwait 1
## 3 Morocco 1
## 4 Saudi Arabia 1
## 5 Tunisia 1
## 6 United Arab Emirates 1
## 7 Bahrain 2
## 8 Jordan 2
## 9 Oman 2
## 10 Qatar 2
##
## ====================================================
## Country assignments for: Post-Soviet Eurasia
## ====================================================
## # A tibble: 9 × 2
## country_name cluster
## <chr> <dbl>
## 1 Armenia 1
## 2 Azerbaijan 1
## 3 Kazakhstan 1
## 4 Belarus 2
## 5 Mongolia 2
## 6 Ukraine 2
## 7 Georgia 3
## 8 Moldova 3
## 9 Uzbekistan 4
##
## ====================================================
## Country assignments for: Southern Africa and Eastern Africa
## ====================================================
## # A tibble: 3 × 2
## country_name cluster
## <chr> <dbl>
## 1 Botswana 1
## 2 South Africa 1
## 3 Mauritius 2
##
## ====================================================
## Country assignments for: West and Central Africa
## ====================================================
## NULL
Not by region
library(mclust)
library(dplyr)
# Collapse the data to one row per country (no region grouping): every
# indicator is averaged with missing values ignored, and countries that still
# lack a value for any indicator are dropped.
indicator_cols <- c(
  "democratic_stability",
  "Tscore1519_ML",
  "secondary_enrolment",
  "tertiary_enrolment",
  "primary_enrolment"
)
X_full <- regions %>%
  group_by(country_name) %>%
  summarise(
    across(all_of(indicator_cols), function(v) mean(v, na.rm = TRUE))
  ) %>%
  na.omit()
n_countries <- nrow(X_full)
cat("Total countries after removing NAs:", n_countries, "\n")
## Total countries after removing NAs: 58
# Numeric only for Mclust: drop the country identifier, keep the indicators
X <- X_full %>% select(-country_name) %>% as.data.frame()
# Compute and plot BIC.
# The component range is set to 1:5 explicitly so the BIC table covers the
# same candidate models that the Mclust() fit below is restricted to;
# otherwise the BIC summary could report a "best" model the fit cannot select.
BIC <- mclustBIC(X, G = 1:5)
plot(BIC, main = "BIC - All Countries")
cat("\nBIC Summary:\n")
##
## BIC Summary:
# Show the top-ranked models from the BIC table
bic_summary <- summary(BIC)
print(bic_summary)
## Best BIC values:
## VEE,2 EVE,4 EEE,1
## BIC -2226.789 -2227.62134 -2234.996403
## BIC diff 0.000 -0.83247 -8.207531
# Fit model using BIC results.
# `x = BIC` hands Mclust() the BIC table computed above, and `G = 1:5`
# limits the candidate models to one through five mixture components.
mod1 <- Mclust(X, x = BIC, G = 1:5)
# Header for the model summary that is printed next
cat("\nModel Summary:\n")
##
## Model Summary:
# Summarise the selected model, including its estimated parameters
mod1_summary <- summary(mod1, parameters = TRUE)
print(mod1_summary)
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust VEE (ellipsoidal, equal shape and orientation) model with 2 components:
##
## log-likelihood n df BIC ICL
## -1058.578 58 27 -2226.789 -2227.353
##
## Clustering table:
## 1 2
## 8 50
##
## Mixing probabilities:
## 1 2
## 0.1345786 0.8654214
##
## Means:
## [,1] [,2]
## democratic_stability 9.181349 5.543935
## Tscore1519_ML 517.719070 416.883353
## secondary_enrolment 104.733705 90.719466
## tertiary_enrolment 71.415145 45.316843
## primary_enrolment 99.667550 103.097316
##
## Variances:
## [,,1]
## democratic_stability Tscore1519_ML secondary_enrolment
## democratic_stability 0.8430578 -2.300508 0.1110185
## Tscore1519_ML -2.3005084 327.744739 47.7813781
## secondary_enrolment 0.1110185 47.781378 19.3148109
## tertiary_enrolment 1.2510193 73.999068 17.7805399
## primary_enrolment 0.1376422 -8.876891 -0.2431787
## tertiary_enrolment primary_enrolment
## democratic_stability 1.251019305 0.137642242
## Tscore1519_ML 73.999068310 -8.876890862
## secondary_enrolment 17.780539858 -0.243178737
## tertiary_enrolment 43.969898130 0.005527502
## primary_enrolment 0.005527502 3.258092427
## [,,2]
## democratic_stability Tscore1519_ML secondary_enrolment
## democratic_stability 7.3989372 -20.18998 0.9743325
## Tscore1519_ML -20.1899752 2876.38951 419.3441987
## secondary_enrolment 0.9743325 419.34420 169.5127732
## tertiary_enrolment 10.9793336 649.43878 156.0475343
## primary_enrolment 1.2079910 -77.90635 -2.1342120
## tertiary_enrolment primary_enrolment
## democratic_stability 10.97933356 1.20799102
## Tscore1519_ML 649.43878222 -77.90634833
## secondary_enrolment 156.04753433 -2.13421205
## tertiary_enrolment 385.89346796 0.04851107
## primary_enrolment 0.04851107 28.59402998
# Shared suffix for every plot title in the pooled (all-country) analysis
panel_label <- "All Countries"

# Pairwise scatterplots coloured by the fitted classification
clPairs(
  mod1$data,
  classification = mod1$classification,
  main = paste("clPairs -", panel_label)
)

# Classification and uncertainty plots of the fitted model
classification_title <- paste("Classification -", panel_label)
uncertainty_title <- paste("Uncertainty -", panel_label)
plot(mod1, what = "classification", main = classification_title)
plot(mod1, what = "uncertainty", main = uncertainty_title)

# Header for the classification table
cat("\nClassification Table:\n")
##
## Classification Table:
# Number of countries assigned to each cluster
cluster_sizes <- table(mod1$classification)
print(cluster_sizes)
##
## 1 2
## 8 50
# Pair every country with its cluster label and list them cluster by cluster.
# The classification vector is in the row order of X_full, so it can be
# attached directly as a new column.
country_names <- X_full %>% select(country_name)
cluster_assignments <- country_names %>%
  mutate(cluster = mod1$classification) %>%
  arrange(cluster)
cat("\nCountry Cluster Assignments:\n")
##
## Country Cluster Assignments:
# Print every row instead of the default truncated tibble preview
cluster_assignments %>% print(n = Inf)
## # A tibble: 58 × 2
## country_name cluster
## <chr> <dbl>
## 1 Croatia 1
## 2 Czechia 1
## 3 Estonia 1
## 4 Hungary 1
## 5 Latvia 1
## 6 Lithuania 1
## 7 Poland 1
## 8 Slovenia 1
## 9 Albania 2
## 10 Algeria 2
## 11 Argentina 2
## 12 Armenia 2
## 13 Azerbaijan 2
## 14 Bahrain 2
## 15 Belarus 2
## 16 Bosnia and Herzegovina 2
## 17 Botswana 2
## 18 Brazil 2
## 19 Bulgaria 2
## 20 Cambodia 2
## 21 Chile 2
## 22 Colombia 2
## 23 Costa Rica 2
## 24 Dominican Republic 2
## 25 El Salvador 2
## 26 Georgia 2
## 27 Ghana 2
## 28 Guatemala 2
## 29 Indonesia 2
## 30 Jamaica 2
## 31 Jordan 2
## 32 Kazakhstan 2
## 33 Kuwait 2
## 34 Malaysia 2
## 35 Mauritius 2
## 36 Mexico 2
## 37 Moldova 2
## 38 Mongolia 2
## 39 Montenegro 2
## 40 Morocco 2
## 41 North Macedonia 2
## 42 Oman 2
## 43 Panama 2
## 44 Paraguay 2
## 45 Peru 2
## 46 Philippines 2
## 47 Qatar 2
## 48 Romania 2
## 49 Saudi Arabia 2
## 50 Serbia 2
## 51 Singapore 2
## 52 South Africa 2
## 53 Thailand 2
## 54 Tunisia 2
## 55 Ukraine 2
## 56 United Arab Emirates 2
## 57 Uruguay 2
## 58 Uzbekistan 2
Regression
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.1 ✔ readr 2.2.0
## ✔ ggplot2 4.0.2 ✔ stringr 1.6.0
## ✔ lubridate 1.9.5 ✔ tibble 3.3.1
## ✔ purrr 1.2.1 ✔ tidyr 1.3.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ mclust::count() masks dplyr::count()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::map() masks mclust::map()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(broom)
library(rpart)
library(rpart.plot)
# ── 1. Average per country (collapse across years) ──────────────────────────
# One row per country/region pair; each indicator is the mean of its
# non-missing values.
averaged_cols <- c(
  "democratic_stability",
  "primary_enrolment",
  "secondary_enrolment",
  "tertiary_enrolment",
  "Tscore1519_ML"
)
regions_avg <- regions %>%
  group_by(country_name, region_name) %>%
  summarise(
    across(all_of(averaged_cols), function(v) mean(v, na.rm = TRUE)),
    .groups = "drop"
  )
# ── 2. Define predictors ─────────────────────────────────────────────────────
predictors <- c(
  "primary_enrolment",
  "secondary_enrolment",
  "tertiary_enrolment",
  "Tscore1519_ML"
)
# ── 3. Linear regression per region × predictor ──────────────────────────────
# For each predictor, regress democratic_stability on that single predictor
# within every region and keep the slope row plus n and R². Regions with fewer
# than 10 complete observations contribute no rows.
lm_results <- map_dfr(predictors, function(pred) {
  fit_region <- function(region_rows, region_key) {
    model_df <- region_rows %>%
      select(democratic_stability, all_of(pred)) %>%
      drop_na()
    n_obs <- nrow(model_df)
    if (n_obs < 10) {
      return(tibble()) # skip small regions
    }
    fit <- lm(democratic_stability ~ ., data = model_df)
    slope_rows <- tidy(fit) %>%
      filter(term != "(Intercept)")
    slope_rows %>%
      mutate(n = n_obs, predictor = pred, r_squared = glance(fit)$r.squared)
  }

  regions_avg %>%
    group_by(region_name) %>%
    group_modify(fit_region) %>%
    ungroup()
})
# ── 4. Print linear regression summary ───────────────────────────────────────
# Keep the reporting columns, round all numeric columns to three decimals and
# print every row.
lm_report_cols <- c(
  "region_name", "predictor", "estimate", "std.error",
  "statistic", "p.value", "r_squared", "n"
)
lm_report <- lm_results %>%
  select(all_of(lm_report_cols)) %>%
  mutate(across(where(is.numeric), function(v) round(v, 3)))
print(lm_report, n = Inf)
## # A tibble: 12 × 8
## region_name predictor estimate std.error statistic p.value r_squared n
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 East-Central … primary_… 0.079 0.126 0.624 0.543 0.029 15
## 2 Latin America… primary_… -0.095 0.073 -1.3 0.218 0.123 14
## 3 Middle East a… primary_… -0.018 0.05 -0.35 0.735 0.015 10
## 4 East-Central … secondar… 0.149 0.03 5.04 0 0.662 15
## 5 Latin America… secondar… 0.047 0.019 2.55 0.025 0.352 14
## 6 Middle East a… secondar… -0.027 0.023 -1.18 0.272 0.148 10
## 7 East-Central … tertiary… 0.095 0.018 5.40 0 0.691 15
## 8 Latin America… tertiary… 0.034 0.019 1.84 0.091 0.22 14
## 9 Middle East a… tertiary… -0.017 0.02 -0.839 0.426 0.081 10
## 10 East-Central … Tscore15… 0.025 0.006 3.96 0.002 0.546 15
## 11 Latin America… Tscore15… 0.019 0.007 2.61 0.023 0.362 14
## 12 Middle East a… Tscore15… 0.003 0.01 0.263 0.799 0.009 10
# ── 5. Regression trees per predictor ────────────────────────────────────────
# (using full dataset across all regions, one tree per predictor)
# The four trees are drawn on a 2 x 2 grid. par() returns the previous
# settings, which are restored afterwards so later plots are not left on the
# 2 x 2 layout.
old_par <- par(mfrow = c(2, 2))
walk(predictors, function(pred) {
  # Outcome plus the single predictor, complete cases only
  df <- regions_avg %>%
    select(democratic_stability, all_of(pred)) %>%
    drop_na()
  fit_tree <- rpart(democratic_stability ~ ., data = df,
                    control = rpart.control(minsplit = 10, cp = 0.01))
  rpart.plot(fit_tree, main = paste("Regression Tree:", pred),
             type = 4, extra = 101, digits = 2)
})
par(old_par)
# ── 6. Compare R² from LM vs tree per predictor ──────────────────────────────
# Both R² values are computed on the same data the models were fitted to
# (all countries pooled, complete cases for the outcome and one predictor).

# Outcome plus a single predictor, complete cases only
predictor_frame <- function(pred) {
  regions_avg %>%
    select(democratic_stability, all_of(pred)) %>%
    drop_na()
}

tree_r2 <- map_dfr(predictors, function(pred) {
  model_df <- predictor_frame(pred)
  tree_fit <- rpart(
    democratic_stability ~ .,
    data = model_df,
    control = rpart.control(minsplit = 10, cp = 0.01)
  )
  observed <- model_df$democratic_stability
  residual_ss <- sum((observed - predict(tree_fit))^2)
  total_ss <- sum((observed - mean(observed))^2)
  tibble(
    predictor = pred,
    method = "Regression Tree",
    r_squared = round(1 - residual_ss / total_ss, 3)
  )
})

lm_global_r2 <- map_dfr(predictors, function(pred) {
  model_df <- predictor_frame(pred)
  linear_fit <- lm(democratic_stability ~ ., data = model_df)
  tibble(
    predictor = pred,
    method = "Linear Regression",
    r_squared = round(glance(linear_fit)$r.squared, 3)
  )
})

r2_comparison <- bind_rows(lm_global_r2, tree_r2)
r2_comparison %>%
  arrange(predictor, method) %>%
  print()
## # A tibble: 8 × 3
## predictor method r_squared
## <chr> <chr> <dbl>
## 1 Tscore1519_ML Linear Regression 0.012
## 2 Tscore1519_ML Regression Tree 0.434
## 3 primary_enrolment Linear Regression 0.002
## 4 primary_enrolment Regression Tree 0.539
## 5 secondary_enrolment Linear Regression 0.02
## 6 secondary_enrolment Regression Tree 0.441
## 7 tertiary_enrolment Linear Regression 0.092
## 8 tertiary_enrolment Regression Tree 0.437