Clustering

library(mclust)
## Package 'mclust' version 6.1.2
## Type 'citation("mclust")' for citing this R package in publications.
## 
## Attaching package: 'mclust'
## The following object is masked from 'package:dplyr':
## 
##     count
# Fit a Gaussian mixture clustering (mclust) separately for each region.
#
# Per region: average every indicator across years for each country, drop
# rows with any missing value, then search models by BIC and fit the best one.
# Summaries are printed and diagnostic plots drawn as side effects.
#
# `results` is a list with one entry per region; each entry is a list with
# `region`, `mod1`, `BIC`, `classification` and `cluster_assignments`. Regions
# with fewer than 3 complete countries keep `region` and get NULL elsewhere.
results <- regions %>%
  group_by(region_name) %>%
  group_split() %>%
  lapply(function(region_df) {
    region <- unique(region_df$region_name)

    # One range of component counts shared by the BIC search and the final
    # fit, so the BIC plot and summary describe exactly the candidates the
    # model is chosen from. Previously the BIC search used the mclustBIC
    # default range while Mclust() was limited to 1:5, so the "best" model
    # reported by the BIC summary could differ from the model actually fitted.
    # NOTE(review): with very few countries the fit can end up with almost one
    # component per country; consider capping this range by nrow(X) - confirm
    # the intended behaviour.
    component_range <- 1:5

    cat("\n====================================================\n")
    cat("Region:", region, "\n")
    cat("====================================================\n")

    # Average across all years per country; na.rm = TRUE ignores missing
    # years, and na.omit() then drops rows that still have a missing value
    X_full <- region_df %>%
      group_by(country_name) %>%
      summarise(
        democratic_stability = mean(democratic_stability, na.rm = TRUE),
        Tscore1519_ML        = mean(Tscore1519_ML, na.rm = TRUE),
        secondary_enrolment  = mean(secondary_enrolment, na.rm = TRUE),
        tertiary_enrolment   = mean(tertiary_enrolment, na.rm = TRUE),
        primary_enrolment    = mean(primary_enrolment, na.rm = TRUE)
      ) %>%
      na.omit()

    # Numeric only for Mclust (rows stay in the same order as X_full)
    X <- X_full %>% select(-country_name) %>% as.data.frame()

    # Early exit: too few countries to cluster; keep the result shape stable
    if (nrow(X) < 3) {
      cat("Skipping region - not enough countries to cluster.\n")
      return(list(
        region = region,
        mod1 = NULL,
        BIC = NULL,
        classification = NULL,
        cluster_assignments = NULL
      ))
    }

    cat("Sample size after removing NAs:", nrow(X), "\n")

    # Compute and plot BIC over the shared component range
    BIC <- mclustBIC(X, G = component_range)
    plot(BIC, main = paste("BIC -", region))
    cat("\nBIC Summary:\n")
    print(summary(BIC))

    # Fit model using the BIC results computed above, over the same range
    mod1 <- Mclust(X, x = BIC, G = component_range)
    cat("\nModel Summary:\n")
    print(summary(mod1, parameters = TRUE))

    # Plot clPairs using mod1 classification
    clPairs(mod1$data, classification = mod1$classification,
            main = paste("clPairs -", region))

    # Plot classification and uncertainty
    plot(mod1, what = "classification",
         main = paste("Classification -", region))
    plot(mod1, what = "uncertainty",
         main = paste("Uncertainty -", region))

    # Classification table (number of countries per cluster)
    cat("\nClassification Table:\n")
    print(table(mod1$classification))

    # Country cluster assignments; relies on X_full and X sharing row order
    cluster_assignments <- X_full %>%
      mutate(cluster = mod1$classification) %>%
      arrange(cluster) %>%
      select(country_name, cluster)

    cat("\nCountry Cluster Assignments:\n")
    print(cluster_assignments)

    list(
      region = region,
      mod1 = mod1,
      BIC = BIC,
      classification = mod1$classification,
      cluster_assignments = cluster_assignments
    )
  })
## 
## ====================================================
## Region: Asia and Oceania 
## ====================================================
## Sample size after removing NAs: 6

## 
## BIC Summary:
## Best BIC values:
##              EEI,5      EEI,4      EEI,3
## BIC      -145.6369 -205.04567 -239.60404
## BIC diff    0.0000  -59.40878  -93.96715
## 
## Model Summary:
## ---------------------------------------------------- 
## Gaussian finite mixture model fitted by EM algorithm 
## ---------------------------------------------------- 
## 
## Mclust EEI (diagonal, equal volume and shape) model with 5 components: 
## 
##  log-likelihood n df       BIC       ICL
##       -42.35853 6 34 -145.6369 -145.6369
## 
## Clustering table:
## 1 2 3 4 5 
## 1 2 1 1 1 
## 
## Mixing probabilities:
##         1         2         3         4         5 
## 0.1666667 0.3333333 0.1666667 0.1666667 0.1666667 
## 
## Means:
##                            [,1]       [,2]       [,3]       [,4]      [,5]
## democratic_stability   1.944444   6.638889   2.722222   2.285714   3.00000
## Tscore1519_ML        312.500000 389.082540 462.577778 602.657143 442.64444
## secondary_enrolment   48.109504  82.989224  77.641026 106.734276 101.34951
## tertiary_enrolment    11.788177  31.048868  37.545810  90.996107  47.24416
## primary_enrolment    114.588223 102.567130  96.247872 101.435440 102.72289
## 
## Variances:
## [,,1]
##                      democratic_stability Tscore1519_ML secondary_enrolment
## democratic_stability          0.006430041        0.0000          0.00000000
## Tscore1519_ML                 0.000000000      122.1091          0.00000000
## secondary_enrolment           0.000000000        0.0000          0.02762234
## tertiary_enrolment            0.000000000        0.0000          0.00000000
## primary_enrolment             0.000000000        0.0000          0.00000000
##                      tertiary_enrolment primary_enrolment
## democratic_stability          0.0000000          0.000000
## Tscore1519_ML                 0.0000000          0.000000
## secondary_enrolment           0.0000000          0.000000
## tertiary_enrolment            0.2788292          0.000000
## primary_enrolment             0.0000000          6.799506
## [,,2]
##                      democratic_stability Tscore1519_ML secondary_enrolment
## democratic_stability          0.006430041        0.0000          0.00000000
## Tscore1519_ML                 0.000000000      122.1091          0.00000000
## secondary_enrolment           0.000000000        0.0000          0.02762234
## tertiary_enrolment            0.000000000        0.0000          0.00000000
## primary_enrolment             0.000000000        0.0000          0.00000000
##                      tertiary_enrolment primary_enrolment
## democratic_stability          0.0000000          0.000000
## Tscore1519_ML                 0.0000000          0.000000
## secondary_enrolment           0.0000000          0.000000
## tertiary_enrolment            0.2788292          0.000000
## primary_enrolment             0.0000000          6.799506
## [,,3]
##                      democratic_stability Tscore1519_ML secondary_enrolment
## democratic_stability          0.006430041        0.0000          0.00000000
## Tscore1519_ML                 0.000000000      122.1091          0.00000000
## secondary_enrolment           0.000000000        0.0000          0.02762234
## tertiary_enrolment            0.000000000        0.0000          0.00000000
## primary_enrolment             0.000000000        0.0000          0.00000000
##                      tertiary_enrolment primary_enrolment
## democratic_stability          0.0000000          0.000000
## Tscore1519_ML                 0.0000000          0.000000
## secondary_enrolment           0.0000000          0.000000
## tertiary_enrolment            0.2788292          0.000000
## primary_enrolment             0.0000000          6.799506
## [,,4]
##                      democratic_stability Tscore1519_ML secondary_enrolment
## democratic_stability          0.006430041        0.0000          0.00000000
## Tscore1519_ML                 0.000000000      122.1091          0.00000000
## secondary_enrolment           0.000000000        0.0000          0.02762234
## tertiary_enrolment            0.000000000        0.0000          0.00000000
## primary_enrolment             0.000000000        0.0000          0.00000000
##                      tertiary_enrolment primary_enrolment
## democratic_stability          0.0000000          0.000000
## Tscore1519_ML                 0.0000000          0.000000
## secondary_enrolment           0.0000000          0.000000
## tertiary_enrolment            0.2788292          0.000000
## primary_enrolment             0.0000000          6.799506
## [,,5]
##                      democratic_stability Tscore1519_ML secondary_enrolment
## democratic_stability          0.006430041        0.0000          0.00000000
## Tscore1519_ML                 0.000000000      122.1091          0.00000000
## secondary_enrolment           0.000000000        0.0000          0.02762234
## tertiary_enrolment            0.000000000        0.0000          0.00000000
## primary_enrolment             0.000000000        0.0000          0.00000000
##                      tertiary_enrolment primary_enrolment
## democratic_stability          0.0000000          0.000000
## Tscore1519_ML                 0.0000000          0.000000
## secondary_enrolment           0.0000000          0.000000
## tertiary_enrolment            0.2788292          0.000000
## primary_enrolment             0.0000000          6.799506

## 
## Classification Table:
## 
## 1 2 3 4 5 
## 1 2 1 1 1 
## 
## Country Cluster Assignments:
## # A tibble: 6 × 2
##   country_name cluster
##   <chr>          <dbl>
## 1 Cambodia           1
## 2 Indonesia          2
## 3 Philippines        2
## 4 Malaysia           3
## 5 Singapore          4
## 6 Thailand           5
## 
## ====================================================
## Region: East-Central and Southeast Europe 
## ====================================================
## Sample size after removing NAs: 15

## 
## BIC Summary:
## Best BIC values:
##              EEV,2       EEE,1       EEV,1
## BIC      -478.5586 -481.538552 -481.538552
## BIC diff    0.0000   -2.979993   -2.979993
## 
## Model Summary:
## ---------------------------------------------------- 
## Gaussian finite mixture model fitted by EM algorithm 
## ---------------------------------------------------- 
## 
## Mclust EEV (ellipsoidal, equal volume and shape) model with 2 components: 
## 
##  log-likelihood  n df       BIC       ICL
##       -190.5344 15 36 -478.5586 -478.5586
## 
## Clustering table:
##  1  2 
##  5 10 
## 
## Mixing probabilities:
##         1         2 
## 0.3333313 0.6666687 
## 
## Means:
##                            [,1]       [,2]
## democratic_stability   6.978385   8.877071
## Tscore1519_ML        436.336377 507.323405
## secondary_enrolment   90.160655 102.974701
## tertiary_enrolment    50.677004  69.812412
## primary_enrolment     98.664639  99.759735
## 
## Variances:
## [,,1]
##                      democratic_stability Tscore1519_ML secondary_enrolment
## democratic_stability            1.4862558      7.268880            1.678104
## Tscore1519_ML                   7.2688802    480.422892            5.322323
## secondary_enrolment             1.6781041      5.322323           16.493748
## tertiary_enrolment              4.7066624    124.504493           20.918179
## primary_enrolment               0.4092304    -47.037136           14.964136
##                      tertiary_enrolment primary_enrolment
## democratic_stability           4.706662         0.4092304
## Tscore1519_ML                124.504493       -47.0371360
## secondary_enrolment           20.918179        14.9641362
## tertiary_enrolment            58.533882         9.1202454
## primary_enrolment              9.120245        24.5899404
## [,,2]
##                      democratic_stability Tscore1519_ML secondary_enrolment
## democratic_stability            0.7837510     11.021888           2.7998395
## Tscore1519_ML                  11.0218878    501.060439          75.3354086
## secondary_enrolment             2.7998395     75.335409          17.6902949
## tertiary_enrolment              5.4374488     48.094533          20.9619090
## primary_enrolment               0.2057687     -5.318066          -0.9159219
##                      tertiary_enrolment primary_enrolment
## democratic_stability          5.4374488         0.2057687
## Tscore1519_ML                48.0945332        -5.3180659
## secondary_enrolment          20.9619090        -0.9159219
## tertiary_enrolment           60.2545373         0.3786191
## primary_enrolment             0.3786191         1.7376965

## 
## Classification Table:
## 
##  1  2 
##  5 10 
## 
## Country Cluster Assignments:
## # A tibble: 15 × 2
##    country_name           cluster
##    <chr>                    <dbl>
##  1 Albania                      1
##  2 Bosnia and Herzegovina       1
##  3 Montenegro                   1
##  4 North Macedonia              1
##  5 Romania                      1
##  6 Bulgaria                     2
##  7 Croatia                      2
##  8 Czechia                      2
##  9 Estonia                      2
## 10 Hungary                      2
## 11 Latvia                       2
## 12 Lithuania                    2
## 13 Poland                       2
## 14 Serbia                       2
## 15 Slovenia                     2
## 
## ====================================================
## Region: Latin America and the Caribbean 
## ====================================================
## Sample size after removing NAs: 14

## 
## BIC Summary:
## Best BIC values:
##              VEV,3      EEV,3      VVV,2
## BIC      -367.5386 -390.70649 -466.63477
## BIC diff    0.0000  -23.16786  -99.09614
## 
## Model Summary:
## ---------------------------------------------------- 
## Gaussian finite mixture model fitted by EM algorithm 
## ---------------------------------------------------- 
## 
## Mclust VEV (ellipsoidal, equal shape) model with 3 components: 
## 
##  log-likelihood  n df       BIC       ICL
##       -112.5148 14 54 -367.5386 -367.5386
## 
## Clustering table:
## 1 2 3 
## 5 3 6 
## 
## Mixing probabilities:
##         1         2         3 
## 0.3571429 0.2142857 0.4285714 
## 
## Means:
##                            [,1]       [,2]       [,3]
## democratic_stability   7.133333   9.666667   7.232639
## Tscore1519_ML        415.033333 441.677778 352.013657
## secondary_enrolment   99.339624 105.287939  72.898819
## tertiary_enrolment    54.356727  62.098267  34.520233
## primary_enrolment    110.649654 106.216315 103.587744
## 
## Variances:
## [,,1]
##                      democratic_stability Tscore1519_ML secondary_enrolment
## democratic_stability            0.1860756     0.5391588            1.150384
## Tscore1519_ML                   0.5391588   146.4734368          -20.980199
## secondary_enrolment             1.1503837   -20.9801990           13.320491
## tertiary_enrolment              3.4514795  -124.7452995           60.862202
## primary_enrolment              -0.6216770   -15.0779348            2.956047
##                      tertiary_enrolment primary_enrolment
## democratic_stability           3.451479         -0.621677
## Tscore1519_ML               -124.745300        -15.077935
## secondary_enrolment           60.862202          2.956047
## tertiary_enrolment           333.992995         32.854791
## primary_enrolment             32.854791         14.897652
## [,,2]
##                      democratic_stability Tscore1519_ML secondary_enrolment
## democratic_stability            0.2929897     0.5747473          -0.5917133
## Tscore1519_ML                   0.5747473    12.7829092           2.3478377
## secondary_enrolment            -0.5917133     2.3478377          11.5179949
## tertiary_enrolment              1.2855994    -4.1890180         -23.5776681
## primary_enrolment               0.1105297     3.4769737           4.5997456
##                      tertiary_enrolment primary_enrolment
## democratic_stability           1.285599         0.1105297
## Tscore1519_ML                 -4.189018         3.4769737
## secondary_enrolment          -23.577668         4.5997456
## tertiary_enrolment            48.367076        -8.9177814
## primary_enrolment             -8.917781         4.2703932
## [,,3]
##                      democratic_stability Tscore1519_ML secondary_enrolment
## democratic_stability             2.980059       56.8040            22.27877
## Tscore1519_ML                   56.803996     1503.2093           285.22843
## secondary_enrolment             22.278769      285.2284           245.45262
## tertiary_enrolment              21.540774      371.2609           245.09230
## primary_enrolment               -5.934520     -130.1132           -38.02314
##                      tertiary_enrolment primary_enrolment
## democratic_stability           21.54077          -5.93452
## Tscore1519_ML                 371.26094        -130.11324
## secondary_enrolment           245.09230         -38.02314
## tertiary_enrolment            342.99238         -21.16743
## primary_enrolment             -21.16743          24.36919

## 
## Classification Table:
## 
## 1 2 3 
## 5 3 6 
## 
## Country Cluster Assignments:
## # A tibble: 14 × 2
##    country_name       cluster
##    <chr>                <dbl>
##  1 Argentina                1
##  2 Brazil                   1
##  3 Colombia                 1
##  4 Mexico                   1
##  5 Peru                     1
##  6 Chile                    2
##  7 Costa Rica               2
##  8 Uruguay                  2
##  9 Dominican Republic       3
## 10 El Salvador              3
## 11 Guatemala                3
## 12 Jamaica                  3
## 13 Panama                   3
## 14 Paraguay                 3
## 
## ====================================================
## Region: Middle East and North Africa 
## ====================================================
## Sample size after removing NAs: 10

## 
## BIC Summary:
## Best BIC values:
##              EEI,9      EEI,8     VEV,2
## BIC      -283.2772 -320.49906 -331.5879
## BIC diff    0.0000  -37.22182  -48.3107
## 
## Model Summary:
## ---------------------------------------------------- 
## Gaussian finite mixture model fitted by EM algorithm 
## ---------------------------------------------------- 
## 
## Mclust VEV (ellipsoidal, equal shape) model with 2 components: 
## 
##  log-likelihood  n df       BIC       ICL
##       -123.1961 10 37 -331.5879 -331.5879
## 
## Clustering table:
## 1 2 
## 6 4 
## 
## Mixing probabilities:
##   1   2 
## 0.6 0.4 
## 
## Means:
##                            [,1]       [,2]
## democratic_stability   2.263889   1.943452
## Tscore1519_ML        411.689352 429.417262
## secondary_enrolment   92.544838  97.120312
## tertiary_enrolment    44.372298  34.508187
## primary_enrolment    107.330001  99.091208
## 
## Variances:
## [,,1]
##                      democratic_stability Tscore1519_ML secondary_enrolment
## democratic_stability            0.6902561      7.567771            -4.48294
## Tscore1519_ML                   7.5677715   1139.847357           177.50411
## secondary_enrolment            -4.4829396    177.504107           213.24831
## tertiary_enrolment             -6.6180085    183.056580           142.16844
## primary_enrolment              -0.5786567    -54.429099            27.89758
##                      tertiary_enrolment primary_enrolment
## democratic_stability          -6.618009        -0.5786567
## Tscore1519_ML                183.056580       -54.4290993
## secondary_enrolment          142.168437        27.8975774
## tertiary_enrolment           207.771675       -35.2173204
## primary_enrolment            -35.217320        35.5804021
## [,,2]
##                      democratic_stability Tscore1519_ML secondary_enrolment
## democratic_stability           0.01427416    -0.6186224          -0.2540362
## Tscore1519_ML                 -0.61862238    87.6481279          -5.1021134
## secondary_enrolment           -0.25403619    -5.1021134          13.1041154
## tertiary_enrolment            -0.39882467    50.1450091         -11.4765717
## primary_enrolment             -0.12689213     1.1032936           6.1636710
##                      tertiary_enrolment primary_enrolment
## democratic_stability         -0.3988247        -0.1268921
## Tscore1519_ML                50.1450091         1.1032936
## secondary_enrolment         -11.4765717         6.1636710
## tertiary_enrolment           63.5920104        -7.9975525
## primary_enrolment            -7.9975525         4.1121541

## 
## Classification Table:
## 
## 1 2 
## 6 4 
## 
## Country Cluster Assignments:
## # A tibble: 10 × 2
##    country_name         cluster
##    <chr>                  <dbl>
##  1 Algeria                    1
##  2 Kuwait                     1
##  3 Morocco                    1
##  4 Saudi Arabia               1
##  5 Tunisia                    1
##  6 United Arab Emirates       1
##  7 Bahrain                    2
##  8 Jordan                     2
##  9 Oman                       2
## 10 Qatar                      2
## 
## ====================================================
## Region: Post-Soviet Eurasia 
## ====================================================
## Sample size after removing NAs: 9

## 
## BIC Summary:
## Best BIC values:
##              EEI,8       EEI,7      EII,8
## BIC      -240.2108 -249.109441 -270.82139
## BIC diff    0.0000   -8.898595  -30.61055
## 
## Model Summary:
## ---------------------------------------------------- 
## Gaussian finite mixture model fitted by EM algorithm 
## ---------------------------------------------------- 
## 
## Mclust EEE (ellipsoidal, equal volume, shape and orientation) model with 4
## components: 
## 
##  log-likelihood n df       BIC       ICL
##       -102.2422 9 38 -287.9789 -287.9789
## 
## Clustering table:
## 1 2 3 4 
## 3 3 2 1 
## 
## Mixing probabilities:
##         1         2         3         4 
## 0.3333333 0.3333333 0.2222222 0.1111111 
## 
## Means:
##                            [,1]       [,2]       [,3]       [,4]
## democratic_stability   2.648148   5.407407   6.772222   1.444444
## Tscore1519_ML        464.629630 465.588889 437.633333 362.444444
## secondary_enrolment   93.000514  97.316690 103.330735  91.802652
## tertiary_enrolment    43.201334  74.343612  54.310713  12.281159
## primary_enrolment     98.537390  98.525838 105.894496  96.631519
## 
## Variances:
## [,,1]
##                      democratic_stability Tscore1519_ML secondary_enrolment
## democratic_stability           2.11473022     -9.462492           -4.314540
## Tscore1519_ML                 -9.46249200    215.614248           36.792105
## secondary_enrolment           -4.31454039     36.792105           15.847939
## tertiary_enrolment            -3.54380694    127.905949           20.281237
## primary_enrolment              0.08503333    -14.520186           -0.157502
##                      tertiary_enrolment primary_enrolment
## democratic_stability          -3.543807        0.08503333
## Tscore1519_ML                127.905949      -14.52018590
## secondary_enrolment           20.281237       -0.15750201
## tertiary_enrolment            82.231571       -7.53107421
## primary_enrolment             -7.531074        2.11146786
## [,,2]
##                      democratic_stability Tscore1519_ML secondary_enrolment
## democratic_stability           2.11473022     -9.462492           -4.314540
## Tscore1519_ML                 -9.46249200    215.614248           36.792105
## secondary_enrolment           -4.31454039     36.792105           15.847939
## tertiary_enrolment            -3.54380694    127.905949           20.281237
## primary_enrolment              0.08503333    -14.520186           -0.157502
##                      tertiary_enrolment primary_enrolment
## democratic_stability          -3.543807        0.08503333
## Tscore1519_ML                127.905949      -14.52018590
## secondary_enrolment           20.281237       -0.15750201
## tertiary_enrolment            82.231571       -7.53107421
## primary_enrolment             -7.531074        2.11146786
## [,,3]
##                      democratic_stability Tscore1519_ML secondary_enrolment
## democratic_stability           2.11473022     -9.462492           -4.314540
## Tscore1519_ML                 -9.46249200    215.614248           36.792105
## secondary_enrolment           -4.31454039     36.792105           15.847939
## tertiary_enrolment            -3.54380694    127.905949           20.281237
## primary_enrolment              0.08503333    -14.520186           -0.157502
##                      tertiary_enrolment primary_enrolment
## democratic_stability          -3.543807        0.08503333
## Tscore1519_ML                127.905949      -14.52018590
## secondary_enrolment           20.281237       -0.15750201
## tertiary_enrolment            82.231571       -7.53107421
## primary_enrolment             -7.531074        2.11146786
## [,,4]
##                      democratic_stability Tscore1519_ML secondary_enrolment
## democratic_stability           2.11473022     -9.462492           -4.314540
## Tscore1519_ML                 -9.46249200    215.614248           36.792105
## secondary_enrolment           -4.31454039     36.792105           15.847939
## tertiary_enrolment            -3.54380694    127.905949           20.281237
## primary_enrolment              0.08503333    -14.520186           -0.157502
##                      tertiary_enrolment primary_enrolment
## democratic_stability          -3.543807        0.08503333
## Tscore1519_ML                127.905949      -14.52018590
## secondary_enrolment           20.281237       -0.15750201
## tertiary_enrolment            82.231571       -7.53107421
## primary_enrolment             -7.531074        2.11146786

## 
## Classification Table:
## 
## 1 2 3 4 
## 3 3 2 1 
## 
## Country Cluster Assignments:
## # A tibble: 9 × 2
##   country_name cluster
##   <chr>          <dbl>
## 1 Armenia            1
## 2 Azerbaijan         1
## 3 Kazakhstan         1
## 4 Belarus            2
## 5 Mongolia           2
## 6 Ukraine            2
## 7 Georgia            3
## 8 Moldova            3
## 9 Uzbekistan         4
## 
## ====================================================
## Region: Southern Africa and Eastern Africa 
## ====================================================
## Sample size after removing NAs: 3

## 
## BIC Summary:
## Best BIC values:
##              EEI,2     EEI,1     EVI,1
## BIC      -94.67926 -107.7686 -107.7686
## BIC diff   0.00000  -13.0893  -13.0893
## 
## Model Summary:
## ---------------------------------------------------- 
## Gaussian finite mixture model fitted by EM algorithm 
## ---------------------------------------------------- 
## 
## Mclust EEI (diagonal, equal volume and shape) model with 2 components: 
## 
##  log-likelihood n df       BIC       ICL
##       -38.55073 3 16 -94.67926 -94.67926
## 
## Clustering table:
## 1 2 
## 2 1 
## 
## Mixing probabilities:
##         1         2 
## 0.6666667 0.3333333 
## 
## Means:
##                            [,1]      [,2]
## democratic_stability   8.361111   9.18750
## Tscore1519_ML        317.461111 444.10000
## secondary_enrolment   87.710918  90.54548
## tertiary_enrolment    19.341295  35.68528
## primary_enrolment    105.579727  98.52711
## 
## Variances:
## [,,1]
##                      democratic_stability Tscore1519_ML secondary_enrolment
## democratic_stability            0.1486626         0.000             0.00000
## Tscore1519_ML                   0.0000000      1460.507             0.00000
## secondary_enrolment             0.0000000         0.000            42.24365
## tertiary_enrolment              0.0000000         0.000             0.00000
## primary_enrolment               0.0000000         0.000             0.00000
##                      tertiary_enrolment primary_enrolment
## democratic_stability          0.0000000          0.000000
## Tscore1519_ML                 0.0000000          0.000000
## secondary_enrolment           0.0000000          0.000000
## tertiary_enrolment            0.4557051          0.000000
## primary_enrolment             0.0000000          6.686333
## [,,2]
##                      democratic_stability Tscore1519_ML secondary_enrolment
## democratic_stability            0.1486626         0.000             0.00000
## Tscore1519_ML                   0.0000000      1460.507             0.00000
## secondary_enrolment             0.0000000         0.000            42.24365
## tertiary_enrolment              0.0000000         0.000             0.00000
## primary_enrolment               0.0000000         0.000             0.00000
##                      tertiary_enrolment primary_enrolment
## democratic_stability          0.0000000          0.000000
## Tscore1519_ML                 0.0000000          0.000000
## secondary_enrolment           0.0000000          0.000000
## tertiary_enrolment            0.4557051          0.000000
## primary_enrolment             0.0000000          6.686333

## 
## Classification Table:
## 
## 1 2 
## 2 1 
## 
## Country Cluster Assignments:
## # A tibble: 3 × 2
##   country_name cluster
##   <chr>          <dbl>
## 1 Botswana           1
## 2 South Africa       1
## 3 Mauritius          2
## 
## ====================================================
## Region: West and Central Africa 
## ====================================================
## Skipping region - not enough countries to cluster.
# Name the list by region.
# vapply() guarantees a character vector with exactly one name per result
# (sapply() silently changes its return type on empty or irregular input);
# as.character() keeps this working if region_name is stored as a factor.
names(results) <- vapply(
  results,
  function(res) as.character(res[["region"]]),
  character(1)
)

# Print each region's country-to-cluster table under a banner.
# Regions that were skipped have no table, so NULL is printed for them.
for (region_key in names(results)) {
  region_result <- results[[region_key]]
  cat("\n====================================================\n")
  cat("Country assignments for:", region_key, "\n")
  cat("====================================================\n")
  print(region_result[["cluster_assignments"]])
}
## 
## ====================================================
## Country assignments for: Asia and Oceania 
## ====================================================
## # A tibble: 6 × 2
##   country_name cluster
##   <chr>          <dbl>
## 1 Cambodia           1
## 2 Indonesia          2
## 3 Philippines        2
## 4 Malaysia           3
## 5 Singapore          4
## 6 Thailand           5
## 
## ====================================================
## Country assignments for: East-Central and Southeast Europe 
## ====================================================
## # A tibble: 15 × 2
##    country_name           cluster
##    <chr>                    <dbl>
##  1 Albania                      1
##  2 Bosnia and Herzegovina       1
##  3 Montenegro                   1
##  4 North Macedonia              1
##  5 Romania                      1
##  6 Bulgaria                     2
##  7 Croatia                      2
##  8 Czechia                      2
##  9 Estonia                      2
## 10 Hungary                      2
## 11 Latvia                       2
## 12 Lithuania                    2
## 13 Poland                       2
## 14 Serbia                       2
## 15 Slovenia                     2
## 
## ====================================================
## Country assignments for: Latin America and the Caribbean 
## ====================================================
## # A tibble: 14 × 2
##    country_name       cluster
##    <chr>                <dbl>
##  1 Argentina                1
##  2 Brazil                   1
##  3 Colombia                 1
##  4 Mexico                   1
##  5 Peru                     1
##  6 Chile                    2
##  7 Costa Rica               2
##  8 Uruguay                  2
##  9 Dominican Republic       3
## 10 El Salvador              3
## 11 Guatemala                3
## 12 Jamaica                  3
## 13 Panama                   3
## 14 Paraguay                 3
## 
## ====================================================
## Country assignments for: Middle East and North Africa 
## ====================================================
## # A tibble: 10 × 2
##    country_name         cluster
##    <chr>                  <dbl>
##  1 Algeria                    1
##  2 Kuwait                     1
##  3 Morocco                    1
##  4 Saudi Arabia               1
##  5 Tunisia                    1
##  6 United Arab Emirates       1
##  7 Bahrain                    2
##  8 Jordan                     2
##  9 Oman                       2
## 10 Qatar                      2
## 
## ====================================================
## Country assignments for: Post-Soviet Eurasia 
## ====================================================
## # A tibble: 9 × 2
##   country_name cluster
##   <chr>          <dbl>
## 1 Armenia            1
## 2 Azerbaijan         1
## 3 Kazakhstan         1
## 4 Belarus            2
## 5 Mongolia           2
## 6 Ukraine            2
## 7 Georgia            3
## 8 Moldova            3
## 9 Uzbekistan         4
## 
## ====================================================
## Country assignments for: Southern Africa and Eastern Africa 
## ====================================================
## # A tibble: 3 × 2
##   country_name cluster
##   <chr>          <dbl>
## 1 Botswana           1
## 2 South Africa       1
## 3 Mauritius          2
## 
## ====================================================
## Country assignments for: West and Central Africa 
## ====================================================
## NULL

Clustering across all countries (not by region)

library(mclust)
library(dplyr)

# Collapse the yearly panel to one row per country, ignoring regions: each
# indicator becomes its mean over the years in which it was observed.
# na.omit() then drops every country left without a value for any indicator.
X_full <- regions %>%
  group_by(country_name) %>%
  summarise(
    across(
      all_of(c(
        "democratic_stability",
        "Tscore1519_ML",
        "secondary_enrolment",
        "tertiary_enrolment",
        "primary_enrolment"
      )),
      ~ mean(.x, na.rm = TRUE)
    )
  ) %>%
  na.omit()

cat("Total countries after removing NAs:", nrow(X_full), "\n", sep = " ")
## Total countries after removing NAs: 58
# Mclust input: the indicator columns only (country label removed), as a
# plain data frame
X <- as.data.frame(select(X_full, -country_name))

# Evaluate BIC over the candidate mixture models, then plot the BIC curves
BIC <- mclustBIC(X)
plot(BIC, main = "BIC - All Countries")

cat("\n", "BIC Summary:", "\n", sep = "")
## 
## BIC Summary:
# Print the best BIC values found among the evaluated models
print(summary(BIC))
## Best BIC values:
##              VEE,2       EVE,4        EEE,1
## BIC      -2226.789 -2227.62134 -2234.996403
## BIC diff     0.000    -0.83247    -8.207531
# Fit the Gaussian mixture model. The BIC object computed above is passed as
# `x`, and the candidate number of components is limited to 1 through 5.
# NOTE(review): per the mclust docs, supplying `x` lets Mclust reuse models
# that were already evaluated instead of refitting them - confirm.
mod1 <- Mclust(X, G = seq_len(5), x = BIC)
cat("\n", "Model Summary:", "\n", sep = "")
## 
## Model Summary:
# Print the fitted model's summary; parameters = TRUE also lists the mixing
# probabilities, component means and covariance matrices
print(summary(mod1, parameters = TRUE))
## ---------------------------------------------------- 
## Gaussian finite mixture model fitted by EM algorithm 
## ---------------------------------------------------- 
## 
## Mclust VEE (ellipsoidal, equal shape and orientation) model with 2 components: 
## 
##  log-likelihood  n df       BIC       ICL
##       -1058.578 58 27 -2226.789 -2227.353
## 
## Clustering table:
##  1  2 
##  8 50 
## 
## Mixing probabilities:
##         1         2 
## 0.1345786 0.8654214 
## 
## Means:
##                            [,1]       [,2]
## democratic_stability   9.181349   5.543935
## Tscore1519_ML        517.719070 416.883353
## secondary_enrolment  104.733705  90.719466
## tertiary_enrolment    71.415145  45.316843
## primary_enrolment     99.667550 103.097316
## 
## Variances:
## [,,1]
##                      democratic_stability Tscore1519_ML secondary_enrolment
## democratic_stability            0.8430578     -2.300508           0.1110185
## Tscore1519_ML                  -2.3005084    327.744739          47.7813781
## secondary_enrolment             0.1110185     47.781378          19.3148109
## tertiary_enrolment              1.2510193     73.999068          17.7805399
## primary_enrolment               0.1376422     -8.876891          -0.2431787
##                      tertiary_enrolment primary_enrolment
## democratic_stability        1.251019305       0.137642242
## Tscore1519_ML              73.999068310      -8.876890862
## secondary_enrolment        17.780539858      -0.243178737
## tertiary_enrolment         43.969898130       0.005527502
## primary_enrolment           0.005527502       3.258092427
## [,,2]
##                      democratic_stability Tscore1519_ML secondary_enrolment
## democratic_stability            7.3989372     -20.18998           0.9743325
## Tscore1519_ML                 -20.1899752    2876.38951         419.3441987
## secondary_enrolment             0.9743325     419.34420         169.5127732
## tertiary_enrolment             10.9793336     649.43878         156.0475343
## primary_enrolment               1.2079910     -77.90635          -2.1342120
##                      tertiary_enrolment primary_enrolment
## democratic_stability        10.97933356        1.20799102
## Tscore1519_ML              649.43878222      -77.90634833
## secondary_enrolment        156.04753433       -2.13421205
## tertiary_enrolment         385.89346796        0.04851107
## primary_enrolment            0.04851107       28.59402998
# Pairwise scatterplots of the indicators, marked by assigned cluster
clPairs(
  mod1[["data"]],
  classification = mod1[["classification"]],
  main = "clPairs - All Countries"
)

# Model-based views of the fit: cluster membership, then how uncertain each
# assignment is
plot(
  mod1,
  what = "classification",
  main = "Classification - All Countries"
)

plot(
  mod1,
  what = "uncertainty",
  main = "Uncertainty - All Countries"
)

# Header for the cluster-size table printed next
cat("\n", "Classification Table:", "\n", sep = "")
## 
## Classification Table:
# Number of countries assigned to each cluster
print(table(mod1$classification))
## 
##  1  2 
##  8 50
# Pair each country with the cluster it was assigned to, sorted by cluster.
# X_full rows are in the same order as the rows of X passed to Mclust, so the
# classification vector lines up with country_name.
cluster_assignments <- X_full %>%
  select(country_name) %>%
  mutate(cluster = mod1[["classification"]]) %>%
  arrange(cluster)

cat("\n", "Country Cluster Assignments:", "\n", sep = "")
## 
## Country Cluster Assignments:
# n = Inf prints every row of the tibble instead of a truncated preview
print(cluster_assignments, n = Inf)
## # A tibble: 58 × 2
##    country_name           cluster
##    <chr>                    <dbl>
##  1 Croatia                      1
##  2 Czechia                      1
##  3 Estonia                      1
##  4 Hungary                      1
##  5 Latvia                       1
##  6 Lithuania                    1
##  7 Poland                       1
##  8 Slovenia                     1
##  9 Albania                      2
## 10 Algeria                      2
## 11 Argentina                    2
## 12 Armenia                      2
## 13 Azerbaijan                   2
## 14 Bahrain                      2
## 15 Belarus                      2
## 16 Bosnia and Herzegovina       2
## 17 Botswana                     2
## 18 Brazil                       2
## 19 Bulgaria                     2
## 20 Cambodia                     2
## 21 Chile                        2
## 22 Colombia                     2
## 23 Costa Rica                   2
## 24 Dominican Republic           2
## 25 El Salvador                  2
## 26 Georgia                      2
## 27 Ghana                        2
## 28 Guatemala                    2
## 29 Indonesia                    2
## 30 Jamaica                      2
## 31 Jordan                       2
## 32 Kazakhstan                   2
## 33 Kuwait                       2
## 34 Malaysia                     2
## 35 Mauritius                    2
## 36 Mexico                       2
## 37 Moldova                      2
## 38 Mongolia                     2
## 39 Montenegro                   2
## 40 Morocco                      2
## 41 North Macedonia              2
## 42 Oman                         2
## 43 Panama                       2
## 44 Paraguay                     2
## 45 Peru                         2
## 46 Philippines                  2
## 47 Qatar                        2
## 48 Romania                      2
## 49 Saudi Arabia                 2
## 50 Serbia                       2
## 51 Singapore                    2
## 52 South Africa                 2
## 53 Thailand                     2
## 54 Tunisia                      2
## 55 Ukraine                      2
## 56 United Arab Emirates         2
## 57 Uruguay                      2
## 58 Uzbekistan                   2

Regression

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.1     ✔ readr     2.2.0
## ✔ ggplot2   4.0.2     ✔ stringr   1.6.0
## ✔ lubridate 1.9.5     ✔ tibble    3.3.1
## ✔ purrr     1.2.1     ✔ tidyr     1.3.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ mclust::count() masks dplyr::count()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ purrr::map()    masks mclust::map()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(broom)
library(rpart)
library(rpart.plot)

# ── 1. Average per country (collapse across years) ──────────────────────────
# One row per country/region pair; each indicator is its mean over the years
# with a recorded value. Countries with no value at all for an indicator are
# kept here and dropped later, model by model.
regions_avg <- regions %>%
  group_by(country_name, region_name) %>%
  summarise(
    across(
      all_of(c(
        "democratic_stability",
        "primary_enrolment",
        "secondary_enrolment",
        "tertiary_enrolment",
        "Tscore1519_ML"
      )),
      ~ mean(.x, na.rm = TRUE)
    ),
    .groups = "drop"
  )

# ── 2. Define predictors ─────────────────────────────────────────────────────
predictors <- c(
  "primary_enrolment",
  "secondary_enrolment",
  "tertiary_enrolment",
  "Tscore1519_ML"
)

# ── 3. Linear regression per region × predictor ──────────────────────────────
# For every predictor, fit democratic_stability ~ predictor separately within
# each region and keep the slope row plus sample size and R². Regions with
# fewer than 10 complete countries contribute no row.
lm_results <- map_dfr(predictors, function(pred) {
  by_region <- group_by(regions_avg, region_name)

  slopes <- group_modify(by_region, function(region_rows, region_key) {
    complete_rows <- drop_na(
      select(region_rows, democratic_stability, all_of(pred))
    )
    n_obs <- nrow(complete_rows)

    # skip small regions
    if (n_obs < 10) {
      return(tibble())
    }

    fit <- lm(democratic_stability ~ ., data = complete_rows)
    slope_row <- filter(tidy(fit), term != "(Intercept)")
    mutate(
      slope_row,
      n = n_obs,
      predictor = pred,
      r_squared = glance(fit)$r.squared
    )
  })

  ungroup(slopes)
})

# ── 4. Print linear regression summary ───────────────────────────────────────
# Round every numeric column to three decimals, keep the reporting columns in
# display order, and print all rows.
lm_results %>%
  mutate(across(where(is.numeric), function(col) round(col, digits = 3))) %>%
  select(
    region_name, predictor,
    estimate, std.error, statistic, p.value,
    r_squared, n
  ) %>%
  print(n = Inf)
## # A tibble: 12 × 8
##    region_name    predictor estimate std.error statistic p.value r_squared     n
##    <chr>          <chr>        <dbl>     <dbl>     <dbl>   <dbl>     <dbl> <dbl>
##  1 East-Central … primary_…    0.079     0.126     0.624   0.543     0.029    15
##  2 Latin America… primary_…   -0.095     0.073    -1.3     0.218     0.123    14
##  3 Middle East a… primary_…   -0.018     0.05     -0.35    0.735     0.015    10
##  4 East-Central … secondar…    0.149     0.03      5.04    0         0.662    15
##  5 Latin America… secondar…    0.047     0.019     2.55    0.025     0.352    14
##  6 Middle East a… secondar…   -0.027     0.023    -1.18    0.272     0.148    10
##  7 East-Central … tertiary…    0.095     0.018     5.40    0         0.691    15
##  8 Latin America… tertiary…    0.034     0.019     1.84    0.091     0.22     14
##  9 Middle East a… tertiary…   -0.017     0.02     -0.839   0.426     0.081    10
## 10 East-Central … Tscore15…    0.025     0.006     3.96    0.002     0.546    15
## 11 Latin America… Tscore15…    0.019     0.007     2.61    0.023     0.362    14
## 12 Middle East a… Tscore15…    0.003     0.01      0.263   0.799     0.009    10
# ── 5. Regression trees per predictor ────────────────────────────────────────
# (using full dataset across all regions, one tree per predictor)

# Fit, plot and score the single-predictor models for `pred`.
#
# The complete-case data and the regression tree are built once and reused for
# both the plot and the R² comparison, so the plotted tree is guaranteed to be
# the one that is scored.
#
# pred:    name of one predictor column in `regions_avg`.
# returns: a two-row tibble (predictor, method, r_squared) holding the R² of
#          the linear model and of the regression tree, rounded to 3 decimals.
#          Both values are in-sample: predict() is called without new data.
compare_single_predictor <- function(pred) {
  df <- regions_avg %>%
    select(democratic_stability, all_of(pred)) %>%
    drop_na()

  fit_tree <- rpart(democratic_stability ~ ., data = df,
                    control = rpart.control(minsplit = 10, cp = 0.01))

  # Plot inside the function that built the tree so rpart.plot can still see
  # the training data frame `df`.
  rpart.plot(fit_tree, main = paste("Regression Tree:", pred),
             type = 4, extra = 101, digits = 2)

  fit_lm <- lm(democratic_stability ~ ., data = df)

  # Tree R² computed by hand: 1 - residual SS / total SS on the training data
  ss_res <- sum((df$democratic_stability - predict(fit_tree))^2)
  ss_tot <- sum((df$democratic_stability - mean(df$democratic_stability))^2)

  tibble(
    predictor = pred,
    method = c("Linear Regression", "Regression Tree"),
    r_squared = round(c(glance(fit_lm)$r.squared, 1 - ss_res / ss_tot), 3)
  )
}

# Draw the four trees on a 2 x 2 grid, then put the previous graphics
# settings back so later plots are not forced into the grid.
old_par <- par(mfrow = c(2, 2))
r2_by_method <- map_dfr(predictors, compare_single_predictor)
par(old_par)

# ── 6. Compare R² from LM vs tree per predictor ──────────────────────────────
# Keep the two per-method tables under their original names.
tree_r2 <- filter(r2_by_method, method == "Regression Tree")
lm_global_r2 <- filter(r2_by_method, method == "Linear Regression")

bind_rows(lm_global_r2, tree_r2) %>%
  arrange(predictor, method) %>%
  print()
## # A tibble: 8 × 3
##   predictor           method            r_squared
##   <chr>               <chr>                 <dbl>
## 1 Tscore1519_ML       Linear Regression     0.012
## 2 Tscore1519_ML       Regression Tree       0.434
## 3 primary_enrolment   Linear Regression     0.002
## 4 primary_enrolment   Regression Tree       0.539
## 5 secondary_enrolment Linear Regression     0.02 
## 6 secondary_enrolment Regression Tree       0.441
## 7 tertiary_enrolment  Linear Regression     0.092
## 8 tertiary_enrolment  Regression Tree       0.437