Load Library

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.0     ✔ readr     2.2.0
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.2     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(flexclust)

## Warning: package 'flexclust' was built under R version 4.5.3

library(dbscan)

## Warning: package 'dbscan' was built under R version 4.5.3

## 
## Attaching package: 'dbscan'
## 
## The following object is masked from 'package:stats':
## 
##     as.dendrogram

library(meanShiftR)
library(e1071)

## Warning: package 'e1071' was built under R version 4.5.3

## 
## Attaching package: 'e1071'
## 
## The following object is masked from 'package:flexclust':
## 
##     bclust
## 
## The following object is masked from 'package:ggplot2':
## 
##     element

library(cluster)

## Warning: package 'cluster' was built under R version 4.5.3

library(fpc)

## Warning: package 'fpc' was built under R version 4.5.3

## 
## Attaching package: 'fpc'
## 
## The following object is masked from 'package:dbscan':
## 
##     dbscan

library(mclust)

## Warning: package 'mclust' was built under R version 4.5.3

## Package 'mclust' version 6.1.2
## Type 'citation("mclust")' for citing this R package in publications.
## 
## Attaching package: 'mclust'
## 
## The following object is masked from 'package:dplyr':
## 
##     count
## 
## The following object is masked from 'package:purrr':
## 
##     map

library(moments)

## 
## Attaching package: 'moments'
## 
## The following objects are masked from 'package:e1071':
## 
##     kurtosis, moment, skewness

library(tidyr)
library(ggplot2)
library(clusterSim)

## Warning: package 'clusterSim' was built under R version 4.5.3

## Loading required package: MASS
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select

library(dplyr)

Import & Prepare data

df <- read_csv("C:/Users/ARIMBY/Downloads/Global_Development_Indicators_2000_2020.csv")

## Rows: 5556 Columns: 47
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (5): country_code, country_name, region, income_group, currency_unit
## dbl (42): year, gdp_usd, population, gdp_per_capita, inflation_rate, unemplo...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

df_asli <- df

head(df_asli)

## # A tibble: 6 × 47
##    year country_code country_name      region income_group currency_unit gdp_usd
##   <dbl> <chr>        <chr>             <chr>  <chr>        <chr>           <dbl>
## 1  2000 AFE          Africa Eastern a… <NA>   <NA>         <NA>          2.84e11
## 2  2001 AFE          Africa Eastern a… <NA>   <NA>         <NA>          2.59e11
## 3  2002 AFE          Africa Eastern a… <NA>   <NA>         <NA>          2.65e11
## 4  2003 AFE          Africa Eastern a… <NA>   <NA>         <NA>          3.53e11
## 5  2004 AFE          Africa Eastern a… <NA>   <NA>         <NA>          4.39e11
## 6  2005 AFE          Africa Eastern a… <NA>   <NA>         <NA>          5.12e11
## # ℹ 40 more variables: population <dbl>, gdp_per_capita <dbl>,
## #   inflation_rate <dbl>, unemployment_rate <dbl>, fdi_pct_gdp <dbl>,
## #   co2_emissions_kt <dbl>, energy_use_per_capita <dbl>,
## #   renewable_energy_pct <dbl>, forest_area_pct <dbl>,
## #   electricity_access_pct <dbl>, life_expectancy <dbl>, child_mortality <dbl>,
## #   school_enrollment_secondary <dbl>, health_expenditure_pct_gdp <dbl>,
## #   hospital_beds_per_1000 <dbl>, physicians_per_1000 <dbl>, …

Filterring

df_2020 <- df_asli %>%
  filter(year == 2020)

df_numeric <- df_2020 %>%
  dplyr::select(where(is.numeric))

colSums(is.na(df_numeric))

##                                year                             gdp_usd 
##                                   0                                  22 
##                          population                      gdp_per_capita 
##                                   0                                  22 
##                      inflation_rate                   unemployment_rate 
##                                  58                                  30 
##                         fdi_pct_gdp                    co2_emissions_kt 
##                                  30                                 264 
##               energy_use_per_capita                renewable_energy_pct 
##                                 264                                 264 
##                     forest_area_pct              electricity_access_pct 
##                                   8                                 264 
##                     life_expectancy                     child_mortality 
##                                 264                                  24 
##         school_enrollment_secondary          health_expenditure_pct_gdp 
##                                 207                                 264 
##              hospital_beds_per_1000                 physicians_per_1000 
##                                 264                                 264 
##                  internet_usage_pct        mobile_subscriptions_per_100 
##                                 189                                  76 
##           calculated_gdp_per_capita      real_economic_growth_indicator 
##                                  22                                  59 
##              econ_opportunity_index       co2_emissions_per_capita_tons 
##                                  40                                 264 
##       co2_intensity_per_million_gdp              green_transition_score 
##                                 264                                 264 
##       ecological_preservation_index         renewable_energy_efficiency 
##                                   8                                 264 
##         human_development_composite           healthcare_capacity_index 
##                                 264                                 264 
##          digital_connectivity_index            health_development_ratio 
##                                   0                                 264 
##              education_health_ratio                    years_since_2000 
##                                 213                                   0 
##                 years_since_century                  is_pandemic_period 
##                                   0                                   0 
##             human_development_index         climate_vulnerability_index 
##                                 264                                   0 
##             digital_readiness_score            governance_quality_index 
##                                   0                                   0 
##             global_resilience_score global_development_resilience_index 
##                                 264                                   0

Cleaning

threshold <- nrow(df_numeric) * 0.5

df_numeric2 <- df_numeric[, colSums(is.na(df_numeric)) < threshold]

df_clean <- na.omit(df_numeric2)

dim(df_numeric)

## [1] 264  42

dim(df_clean)

## [1] 164  22

print(colnames(df_clean))

##  [1] "year"                                "gdp_usd"                            
##  [3] "population"                          "gdp_per_capita"                     
##  [5] "inflation_rate"                      "unemployment_rate"                  
##  [7] "fdi_pct_gdp"                         "forest_area_pct"                    
##  [9] "child_mortality"                     "mobile_subscriptions_per_100"       
## [11] "calculated_gdp_per_capita"           "real_economic_growth_indicator"     
## [13] "econ_opportunity_index"              "ecological_preservation_index"      
## [15] "digital_connectivity_index"          "years_since_2000"                   
## [17] "years_since_century"                 "is_pandemic_period"                 
## [19] "climate_vulnerability_index"         "digital_readiness_score"            
## [21] "governance_quality_index"            "global_development_resilience_index"

head(df_clean)

## # A tibble: 6 × 22
##    year gdp_usd population gdp_per_capita inflation_rate unemployment_rate
##   <dbl>   <dbl>      <dbl>          <dbl>          <dbl>             <dbl>
## 1  2020 9.21e11  677243299          1360.         5.19                7.56
## 2  2020 7.85e11  458803476          1710.         2.43                6.77
## 3  2020 2.45e12  436080728          5613.         1.78               11.5 
## 4  2020 1.65e12  102253057         16168.         1.67                4.37
## 5  2020 6.60e10    7442291          8874.         0.0385              9.83
## 6  2020 1.74e13 2113820753          8255.         2.53                4.39
## # ℹ 16 more variables: fdi_pct_gdp <dbl>, forest_area_pct <dbl>,
## #   child_mortality <dbl>, mobile_subscriptions_per_100 <dbl>,
## #   calculated_gdp_per_capita <dbl>, real_economic_growth_indicator <dbl>,
## #   econ_opportunity_index <dbl>, ecological_preservation_index <dbl>,
## #   digital_connectivity_index <dbl>, years_since_2000 <dbl>,
## #   years_since_century <dbl>, is_pandemic_period <dbl>,
## #   climate_vulnerability_index <dbl>, digital_readiness_score <dbl>, …

cols_to_remove <- c("year", "years_since_2000", "years_since_century", "ecological_preservation_index",
                    "is_pandemic_period", "governance_quality_index")

existing_cols_to_remove <- intersect(cols_to_remove, names(df_clean))

df_clean <- df_clean %>%
  dplyr::select(-dplyr::all_of(existing_cols_to_remove))

ncol(df_clean)

## [1] 16

head(df_clean)

## # A tibble: 6 × 16
##   gdp_usd population gdp_per_capita inflation_rate unemployment_rate fdi_pct_gdp
##     <dbl>      <dbl>          <dbl>          <dbl>             <dbl>       <dbl>
## 1 9.21e11  677243299          1360.         5.19                7.56        1.40
## 2 7.85e11  458803476          1710.         2.43                6.77        2.20
## 3 2.45e12  436080728          5613.         1.78               11.5         1.60
## 4 1.65e12  102253057         16168.         1.67                4.37       12.7 
## 5 6.60e10    7442291          8874.         0.0385              9.83        3.60
## 6 1.74e13 2113820753          8255.         2.53                4.39        1.50
## # ℹ 10 more variables: forest_area_pct <dbl>, child_mortality <dbl>,
## #   mobile_subscriptions_per_100 <dbl>, calculated_gdp_per_capita <dbl>,
## #   real_economic_growth_indicator <dbl>, econ_opportunity_index <dbl>,
## #   digital_connectivity_index <dbl>, climate_vulnerability_index <dbl>,
## #   digital_readiness_score <dbl>, global_development_resilience_index <dbl>

Deteksi Outlier

df_long <- pivot_longer(df_clean, cols = everything(), names_to = "Variable", values_to = "Value")

ggplot(df_long, aes(x = Variable, y = Value)) +
  geom_boxplot(fill = "skyblue", outlier.color = "red") +
  coord_flip() +
  theme_minimal() +
  labs(title = "Deteksi Outlier", x = "Variabel", y = "Z-Score Value")

Berdasarkan hasil visualisasi boxplot, ditemukan adanya beberapa nilai ekstrem (outlier) pada sejumlah variabel. Outlier merupakan data yang memiliki nilai jauh dari sebagian besar data lainnya

IQR

count_outliers <- function(x) {
  qnt <- quantile(x, probs=c(.25, .75), na.rm = TRUE)
  H <- 1.5 * IQR(x, na.rm = TRUE)
  sum(x < (qnt[1] - H) | x > (qnt[2] + H), na.rm = TRUE)
}

outlier_report <- sapply(df_clean, count_outliers)
print(outlier_report)

##                             gdp_usd                          population 
##                                  24                                  30 
##                      gdp_per_capita                      inflation_rate 
##                                  16                                  11 
##                   unemployment_rate                         fdi_pct_gdp 
##                                  12                                  26 
##                     forest_area_pct                     child_mortality 
##                                   3                                   5 
##        mobile_subscriptions_per_100           calculated_gdp_per_capita 
##                                   2                                  16 
##      real_economic_growth_indicator              econ_opportunity_index 
##                                  50                                  18 
##          digital_connectivity_index         climate_vulnerability_index 
##                                   0                                   3 
##             digital_readiness_score global_development_resilience_index 
##                                   0                                   1

Install package untuk winsorizing (Handling Outlier)

library(DescTools)

## Warning: package 'DescTools' was built under R version 4.5.3

## 
## Attaching package: 'DescTools'

## The following object is masked from 'package:mclust':
## 
##     BrierScore

df_winsorized <- as.data.frame(lapply(df_clean, function(x) {
  lim <- quantile(x, probs = c(0.05, 0.95), na.rm = TRUE)

  x[x < lim[1]] <- lim[1]
  x[x > lim[2]] <- lim[2]
  return(x)
}))

head(df_winsorized)

##        gdp_usd population gdp_per_capita inflation_rate unemployment_rate
## 1 9.207923e+11  677243299       1359.618      5.1914557          7.563187
## 2 7.845876e+11  458803476       1710.073      2.4313077          6.774914
## 3 2.447584e+12  436080728       5612.687      1.7774078         11.487321
## 4 1.653248e+12  102253057      16168.197      1.6724410          4.372293
## 5 6.603933e+10    7442291       8873.522      0.0385211          9.827410
## 6 1.744889e+13 2113820753       8254.671      2.5273254          4.388021
##   fdi_pct_gdp forest_area_pct child_mortality mobile_subscriptions_per_100
## 1    1.403911       30.251786       58.158650                     84.22978
## 2    2.196867       19.816762       77.713581                    104.51582
## 3    1.601804        2.812929       34.086399                    103.17840
## 4   11.487633       34.519877        4.745719                    123.70580
## 5    3.603610       67.817786       16.240576                    115.80447
## 6    1.497914       30.431156       14.437461                    122.93366
##   calculated_gdp_per_capita real_economic_growth_indicator
## 1                  1359.618                       261.8954
## 2                  1710.073                       703.3554
## 3                  5612.687                      3157.7933
## 4                 16168.197                      9667.4244
## 5                  8873.522                     83943.2248
## 6                  8254.671                      3266.1686
##   econ_opportunity_index digital_connectivity_index climate_vulnerability_index
## 1               494.3842                   0.000000                   0.4243705
## 2               656.9962                   0.000000                   0.4504581
## 3              1656.0041                   0.000000                   0.4929677
## 4              7732.2690                  10.140854                   0.4137003
## 5              2830.5868                   0.000000                   0.3304555
## 6              3940.6276                   9.200905                   0.4239221
##   digital_readiness_score global_development_resilience_index
## 1               0.2807659                            4.592021
## 2               0.3483861                            3.122205
## 3               0.3439280                            8.275166
## 4               0.8280038                           18.532399
## 5               0.3860149                           11.037999
## 6               0.7540973                           14.475839

Normalisasi (Scaling)

zero_var_cols <- apply(df_winsorized, 2, function(x) var(x, na.rm = TRUE) == 0)
df_winsorized_no_zero_var <- df_winsorized[, !zero_var_cols]

df_scaled <- scale(df_winsorized_no_zero_var)

df_final <- as.data.frame(df_scaled)

head(df_final)

##       gdp_usd population gdp_per_capita inflation_rate unemployment_rate
## 1 -0.28777097  0.5485440     -0.7839713     1.00715539        -0.1259477
## 2 -0.30984001  0.2082160     -0.7619064    -0.02393701        -0.2876703
## 3 -0.04038727  0.1728141     -0.5161955    -0.26821056         0.6791301
## 4 -0.16909242 -0.3472876      0.1483859    -0.30742242        -0.7805934
## 5 -0.42626524 -0.4950022     -0.3108913    -0.91779634         0.3385818
## 6  2.39025116  2.7867236     -0.3498545     0.01193176        -0.7773666
##   fdi_pct_gdp forest_area_pct child_mortality mobile_subscriptions_per_100
## 1 -0.34339774      0.04814058       1.4900856                   -1.3187311
## 2 -0.09589221     -0.50140160       2.3292276                   -0.3899178
## 3 -0.28162945     -1.39687857       0.4570963                   -0.4511526
## 4  2.80403892      0.27291209      -0.8019721                    0.4887115
## 5  0.34319533      2.02648793      -0.3087044                    0.1269423
## 6 -0.31405672      0.05758677      -0.3860797                    0.4533581
##   calculated_gdp_per_capita real_economic_growth_indicator
## 1                -0.7825812                    -0.19384169
## 2                -0.7604780                    -0.17991456
## 3                -0.5143399                    -0.10248222
## 4                 0.1513967                     0.10288289
## 5                -0.3086788                     2.44612751
## 6                -0.3477098                    -0.09906321
##   econ_opportunity_index digital_connectivity_index climate_vulnerability_index
## 1             -0.7275010                 -0.8356546                 -0.04814058
## 2             -0.7053245                 -0.8356546                  0.50140160
## 3             -0.5690832                 -0.8356546                  1.39687857
## 4              0.2595777                  1.1886520                 -0.27291209
## 5             -0.4088975                 -0.8356546                 -2.02648793
## 6             -0.2575138                  1.0010203                 -0.05758677
##   digital_readiness_score global_development_resilience_index
## 1              -1.0734950                         -1.28182061
## 2              -0.8007295                         -1.45989838
## 3              -0.8187124                         -0.83558331
## 4               1.1339482                          0.40714787
## 5              -0.6489427                         -0.50084794
## 6               0.8358246                         -0.08433096

Statistika Deskriptif

data_summary <- data.frame(
  n = nrow(df_final),
  Mean = apply(df_final, 2, mean),
  Median = apply(df_final, 2, median),
  SD = apply(df_final, 2, sd),
  Min = apply(df_final, 2, min),
  Max = apply(df_final, 2, max),
  Skewness = apply(df_final, 2, skewness),
  Kurtosis = apply(df_final, 2, kurtosis)
)

round(data_summary, 3)

##                                       n Mean Median SD    Min   Max Skewness
## gdp_usd                             164    0 -0.403  1 -0.436 3.297    2.638
## population                          164    0 -0.477  1 -0.506 3.055    2.155
## gdp_per_capita                      164    0 -0.442  1 -0.816 2.416    1.360
## inflation_rate                      164    0 -0.208  1 -1.267 2.741    1.235
## unemployment_rate                   164    0 -0.286  1 -1.158 2.590    1.245
## fdi_pct_gdp                         164    0 -0.271  1 -1.301 2.804    1.553
## forest_area_pct                     164    0  0.045  1 -1.501 2.026    0.247
## child_mortality                     164    0 -0.394  1 -0.893 2.329    1.153
## mobile_subscriptions_per_100        164    0  0.057  1 -1.796 1.835    0.017
## calculated_gdp_per_capita           164    0 -0.440  1 -0.814 2.423    1.372
## real_economic_growth_indicator      164    0 -0.155  1 -2.460 2.446    0.349
## econ_opportunity_index              164    0 -0.490  1 -0.741 2.631    1.518
## digital_connectivity_index          164    0 -0.836  1 -0.836 1.437    0.396
## climate_vulnerability_index         164    0 -0.045  1 -2.026 1.501   -0.247
## digital_readiness_score             164    0 -0.471  1 -1.214 1.571    0.338
## global_development_resilience_index 164    0 -0.034  1 -1.460 1.911    0.304
##                                     Kurtosis
## gdp_usd                                8.500
## population                             6.375
## gdp_per_capita                         3.487
## inflation_rate                         4.273
## unemployment_rate                      3.754
## fdi_pct_gdp                            4.994
## forest_area_pct                        2.298
## child_mortality                        3.067
## mobile_subscriptions_per_100           2.145
## calculated_gdp_per_capita              3.523
## real_economic_growth_indicator         5.031
## econ_opportunity_index                 4.054
## digital_connectivity_index             1.222
## climate_vulnerability_index            2.298
## digital_readiness_score                1.424
## global_development_resilience_index    2.040

df_2020 <- df_asli[df_asli$year == 2020, ]

kolom_dipakai <- c("country_name", "gdp_usd", "population", "gdp_per_capita",
                   "inflation_rate", "unemployment_rate", "fdi_pct_gdp",
                   "forest_area_pct", "child_mortality", "mobile_subscriptions_per_100",
                   "calculated_gdp_per_capita", "real_economic_growth_indicator",
                   "econ_opportunity_index", "digital_connectivity_index",
                   "climate_vulnerability_index", "digital_readiness_score",
                   "global_development_resilience_index")

df_temp <- df_2020[, kolom_dipakai]

df_countryname <- na.omit(df_temp)

nrow(df_countryname)

## [1] 164

PCA

Uji Asumsi

mat_corr <- round(cor(df_final), 3)
mat_corr

##                                     gdp_usd population gdp_per_capita
## gdp_usd                               1.000      0.811          0.100
## population                            0.811      1.000         -0.155
## gdp_per_capita                        0.100     -0.155          1.000
## inflation_rate                       -0.107      0.045         -0.474
## unemployment_rate                    -0.144     -0.147         -0.246
## fdi_pct_gdp                          -0.130     -0.125         -0.036
## forest_area_pct                       0.075      0.000          0.070
## child_mortality                      -0.105      0.155         -0.571
## mobile_subscriptions_per_100          0.062     -0.134          0.391
## calculated_gdp_per_capita             0.101     -0.154          0.999
## real_economic_growth_indicator        0.188      0.037          0.277
## econ_opportunity_index                0.070     -0.156          0.974
## digital_connectivity_index           -0.006     -0.129          0.399
## climate_vulnerability_index          -0.075      0.000         -0.070
## digital_readiness_score               0.006     -0.157          0.472
## global_development_resilience_index   0.001     -0.253          0.797
##                                     inflation_rate unemployment_rate
## gdp_usd                                     -0.107            -0.144
## population                                   0.045            -0.147
## gdp_per_capita                              -0.474            -0.246
## inflation_rate                               1.000            -0.034
## unemployment_rate                           -0.034             1.000
## fdi_pct_gdp                                 -0.031             0.035
## forest_area_pct                             -0.088            -0.012
## child_mortality                              0.426             0.015
## mobile_subscriptions_per_100                -0.373            -0.095
## calculated_gdp_per_capita                   -0.471            -0.246
## real_economic_growth_indicator              -0.048            -0.043
## econ_opportunity_index                      -0.447            -0.343
## digital_connectivity_index                  -0.311            -0.226
## climate_vulnerability_index                  0.088             0.012
## digital_readiness_score                     -0.376            -0.205
## global_development_resilience_index         -0.492            -0.154
##                                     fdi_pct_gdp forest_area_pct child_mortality
## gdp_usd                                  -0.130           0.075          -0.105
## population                               -0.125           0.000           0.155
## gdp_per_capita                           -0.036           0.070          -0.571
## inflation_rate                           -0.031          -0.088           0.426
## unemployment_rate                         0.035          -0.012           0.015
## fdi_pct_gdp                               1.000          -0.013          -0.012
## forest_area_pct                          -0.013           1.000          -0.200
## child_mortality                          -0.012          -0.200           1.000
## mobile_subscriptions_per_100              0.111           0.190          -0.482
## calculated_gdp_per_capita                -0.032           0.072          -0.570
## real_economic_growth_indicator           -0.101           0.098          -0.119
## econ_opportunity_index                   -0.024           0.014          -0.539
## digital_connectivity_index                0.144           0.109          -0.572
## climate_vulnerability_index               0.013          -1.000           0.200
## digital_readiness_score                   0.134           0.140          -0.623
## global_development_resilience_index       0.022           0.110          -0.791
##                                     mobile_subscriptions_per_100
## gdp_usd                                                    0.062
## population                                                -0.134
## gdp_per_capita                                             0.391
## inflation_rate                                            -0.373
## unemployment_rate                                         -0.095
## fdi_pct_gdp                                                0.111
## forest_area_pct                                            0.190
## child_mortality                                           -0.482
## mobile_subscriptions_per_100                               1.000
## calculated_gdp_per_capita                                  0.388
## real_economic_growth_indicator                             0.003
## econ_opportunity_index                                     0.394
## digital_connectivity_index                                 0.499
## climate_vulnerability_index                               -0.190
## digital_readiness_score                                    0.679
## global_development_resilience_index                        0.495
##                                     calculated_gdp_per_capita
## gdp_usd                                                 0.101
## population                                             -0.154
## gdp_per_capita                                          0.999
## inflation_rate                                         -0.471
## unemployment_rate                                      -0.246
## fdi_pct_gdp                                            -0.032
## forest_area_pct                                         0.072
## child_mortality                                        -0.570
## mobile_subscriptions_per_100                            0.388
## calculated_gdp_per_capita                               1.000
## real_economic_growth_indicator                          0.282
## econ_opportunity_index                                  0.975
## digital_connectivity_index                              0.396
## climate_vulnerability_index                            -0.072
## digital_readiness_score                                 0.469
## global_development_resilience_index                     0.796
##                                     real_economic_growth_indicator
## gdp_usd                                                      0.188
## population                                                   0.037
## gdp_per_capita                                               0.277
## inflation_rate                                              -0.048
## unemployment_rate                                           -0.043
## fdi_pct_gdp                                                 -0.101
## forest_area_pct                                              0.098
## child_mortality                                             -0.119
## mobile_subscriptions_per_100                                 0.003
## calculated_gdp_per_capita                                    0.282
## real_economic_growth_indicator                               1.000
## econ_opportunity_index                                       0.215
## digital_connectivity_index                                   0.073
## climate_vulnerability_index                                 -0.098
## digital_readiness_score                                      0.076
## global_development_resilience_index                          0.163
##                                     econ_opportunity_index
## gdp_usd                                              0.070
## population                                          -0.156
## gdp_per_capita                                       0.974
## inflation_rate                                      -0.447
## unemployment_rate                                   -0.343
## fdi_pct_gdp                                         -0.024
## forest_area_pct                                      0.014
## child_mortality                                     -0.539
## mobile_subscriptions_per_100                         0.394
## calculated_gdp_per_capita                            0.975
## real_economic_growth_indicator                       0.215
## econ_opportunity_index                               1.000
## digital_connectivity_index                           0.421
## climate_vulnerability_index                         -0.014
## digital_readiness_score                              0.490
## global_development_resilience_index                  0.784
##                                     digital_connectivity_index
## gdp_usd                                                 -0.006
## population                                              -0.129
## gdp_per_capita                                           0.399
## inflation_rate                                          -0.311
## unemployment_rate                                       -0.226
## fdi_pct_gdp                                              0.144
## forest_area_pct                                          0.109
## child_mortality                                         -0.572
## mobile_subscriptions_per_100                             0.499
## calculated_gdp_per_capita                                0.396
## real_economic_growth_indicator                           0.073
## econ_opportunity_index                                   0.421
## digital_connectivity_index                               1.000
## climate_vulnerability_index                             -0.109
## digital_readiness_score                                  0.969
## global_development_resilience_index                      0.650
##                                     climate_vulnerability_index
## gdp_usd                                                  -0.075
## population                                                0.000
## gdp_per_capita                                           -0.070
## inflation_rate                                            0.088
## unemployment_rate                                         0.012
## fdi_pct_gdp                                               0.013
## forest_area_pct                                          -1.000
## child_mortality                                           0.200
## mobile_subscriptions_per_100                             -0.190
## calculated_gdp_per_capita                                -0.072
## real_economic_growth_indicator                           -0.098
## econ_opportunity_index                                   -0.014
## digital_connectivity_index                               -0.109
## climate_vulnerability_index                               1.000
## digital_readiness_score                                  -0.140
## global_development_resilience_index                      -0.110
##                                     digital_readiness_score
## gdp_usd                                               0.006
## population                                           -0.157
## gdp_per_capita                                        0.472
## inflation_rate                                       -0.376
## unemployment_rate                                    -0.205
## fdi_pct_gdp                                           0.134
## forest_area_pct                                       0.140
## child_mortality                                      -0.623
## mobile_subscriptions_per_100                          0.679
## calculated_gdp_per_capita                             0.469
## real_economic_growth_indicator                        0.076
## econ_opportunity_index                                0.490
## digital_connectivity_index                            0.969
## climate_vulnerability_index                          -0.140
## digital_readiness_score                               1.000
## global_development_resilience_index                   0.700
##                                     global_development_resilience_index
## gdp_usd                                                           0.001
## population                                                       -0.253
## gdp_per_capita                                                    0.797
## inflation_rate                                                   -0.492
## unemployment_rate                                                -0.154
## fdi_pct_gdp                                                       0.022
## forest_area_pct                                                   0.110
## child_mortality                                                  -0.791
## mobile_subscriptions_per_100                                      0.495
## calculated_gdp_per_capita                                         0.796
## real_economic_growth_indicator                                    0.163
## econ_opportunity_index                                            0.784
## digital_connectivity_index                                        0.650
## climate_vulnerability_index                                      -0.110
## digital_readiness_score                                           0.700
## global_development_resilience_index                               1.000

library(psych)

## 
## Attaching package: 'psych'

## The following objects are masked from 'package:DescTools':
## 
##     AUC, ICC, SD

## The following object is masked from 'package:mclust':
## 
##     sim

## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha

Bartlett test

n = nrow(df_final)
p = ncol(df_final)
cortest.bartlett(mat_corr,n=n, diag = TRUE)

## $chisq
## [1] 8885.943
## 
## $p.value
## [1] 0
## 
## $df
## [1] 120

KMO

KMO(df_final)

## Error in solve.default(r) : 
##   system is computationally singular: reciprocal condition number = 3.54288e-32

## matrix is not invertible, image not found

## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = df_final)
## Overall MSA =  0.5
## MSA for each item = 
##                             gdp_usd                          population 
##                                 0.5                                 0.5 
##                      gdp_per_capita                      inflation_rate 
##                                 0.5                                 0.5 
##                   unemployment_rate                         fdi_pct_gdp 
##                                 0.5                                 0.5 
##                     forest_area_pct                     child_mortality 
##                                 0.5                                 0.5 
##        mobile_subscriptions_per_100           calculated_gdp_per_capita 
##                                 0.5                                 0.5 
##      real_economic_growth_indicator              econ_opportunity_index 
##                                 0.5                                 0.5 
##          digital_connectivity_index         climate_vulnerability_index 
##                                 0.5                                 0.5 
##             digital_readiness_score global_development_resilience_index 
##                                 0.5                                 0.5

r = cov(df_scaled)

pc <- eigen(r)
print(pc$values)

##  [1]  5.881641e+00  2.072467e+00  1.973670e+00  1.457538e+00  1.074829e+00
##  [6]  9.053951e-01  8.856607e-01  5.868344e-01  5.246235e-01  3.642125e-01
## [11]  1.329389e-01  1.144941e-01  2.102871e-02  4.087793e-03  5.783456e-04
## [16] -5.561957e-17

print(pc$vectors)

##              [,1]         [,2]        [,3]        [,4]        [,5]         [,6]
##  [1,] -0.03056085  0.360221457 -0.50543412  0.26559444 -0.17511779  0.065875049
##  [2,]  0.08344257  0.335008111 -0.48896394  0.32951102 -0.08326466  0.098411150
##  [3,] -0.36113465 -0.081085222 -0.18604124 -0.26289700  0.02014114  0.154007332
##  [4,]  0.23809018 -0.008044023  0.02670188 -0.01698028  0.46247113 -0.252619334
##  [5,]  0.10493651 -0.034684395  0.22880990 -0.10431510 -0.78783446 -0.111648961
##  [6,] -0.01911378 -0.096118007  0.23518854  0.25887555  0.05893360  0.686869613
##  [7,] -0.08355977  0.588525734  0.28547787 -0.20532244  0.08607598  0.088274997
##  [8,]  0.32167470 -0.050194041 -0.07372739 -0.06874150  0.23320883  0.139583035
##  [9,] -0.26270502  0.067908634  0.15816997  0.27583589 -0.08027271 -0.005309919
## [10,] -0.36059895 -0.079618939 -0.18704688 -0.26544743  0.02136821  0.156146397
## [11,] -0.09403192  0.120795492 -0.20633023 -0.28540215  0.01472699 -0.434269382
## [12,] -0.35719217 -0.120638831 -0.19289316 -0.21638098  0.11222666  0.174389370
## [13,] -0.29791515 -0.007948178  0.15931049  0.40151854  0.14964020 -0.275862763
## [14,]  0.08355977 -0.588525734 -0.28547787  0.20532244 -0.08607598 -0.088274997
## [15,] -0.32874675  0.003483157  0.16872321  0.38407537  0.09165676 -0.238324406
## [16,] -0.37931579 -0.074249616  0.01990343 -0.03141345 -0.06518864 -0.052980948
##               [,7]        [,8]         [,9]       [,10]        [,11]
##  [1,] -0.064894218 -0.11633814  0.109181223  0.04512354  0.684419621
##  [2,]  0.010134119 -0.10169555 -0.037183628 -0.16364150 -0.690844682
##  [3,]  0.005257744 -0.07155379  0.099999767 -0.19610918  0.008961346
##  [4,] -0.151433878 -0.59699961  0.522630989 -0.10635484 -0.010729396
##  [5,] -0.162337316 -0.24006704  0.176400337 -0.41068731 -0.019182689
##  [6,] -0.611695087 -0.11365152 -0.009578578  0.07263299 -0.010020576
##  [7,]  0.038384651 -0.02431296 -0.037115376 -0.06200708 -0.002341138
##  [8,] -0.034784583  0.43008106  0.044776064 -0.66475006  0.120177009
##  [9,]  0.083276995  0.44063213  0.741653965  0.13576343 -0.105510768
## [10,] -0.001186484 -0.07300246  0.100878700 -0.19431183  0.007476034
## [11,] -0.739559498  0.29663826 -0.027535885  0.13240394 -0.092308729
## [12,]  0.055978606 -0.04814883  0.089173298 -0.18056967 -0.041680254
## [13,] -0.087718802 -0.07934020 -0.297186568 -0.32721371  0.084035409
## [14,] -0.038384651  0.02431296  0.037115376  0.06200708  0.002341138
## [15,] -0.048249981  0.04403420 -0.058598840 -0.25756336  0.038803351
## [16,]  0.031689007 -0.24466648 -0.083030173  0.13476068 -0.095770094
##              [,12]        [,13]         [,14]         [,15]         [,16]
##  [1,]  0.077852159  0.045633310 -0.0044230063  0.0010395194  0.000000e+00
##  [2,]  0.004892117 -0.024980853 -0.0023597829  0.0003618998  1.463025e-16
##  [3,] -0.136722332 -0.415149840 -0.0048805780  0.7002884992  4.195654e-14
##  [4,]  0.024167053 -0.006381329 -0.0036431201  0.0015940550 -1.394830e-16
##  [5,] -0.016240435  0.081672889  0.0181951927  0.0010473581  4.800271e-17
##  [6,]  0.010715358  0.001134787 -0.0109361197  0.0034334148  3.641384e-16
##  [7,]  0.009073156  0.020554921  0.0005973863  0.0010786852  7.071068e-01
##  [8,]  0.406624592 -0.010511763 -0.0009768425  0.0005431079 -1.535024e-16
##  [9,]  0.051828786 -0.020617634  0.1870274606 -0.0015037135 -8.249122e-16
## [10,] -0.132258958 -0.391282248 -0.0087809969 -0.7136879267 -4.524303e-14
## [11,]  0.015093299  0.041321407  0.0053078901  0.0041522777  3.153116e-16
## [12,] -0.143987819  0.812231207  0.0494643356  0.0134114658  2.889166e-15
## [13,] -0.109837819 -0.041021991  0.6291969980 -0.0042287616 -2.044333e-15
## [14,] -0.009073156 -0.020554921 -0.0005973863 -0.0010786852  7.071068e-01
## [15,] -0.092726822  0.023307340 -0.7523596478  0.0007315901  1.970305e-15
## [16,]  0.864262534  0.009812166 -0.0045023610  0.0029178031  4.156694e-16

sumvar <- sum(pc$values)
proportionvar <- sapply(pc$values, function(x) x/sumvar)*100
cumulativevar <- data.frame(cbind(pc$values, proportionvar)) %>% mutate(cum = cumsum(proportionvar))
colnames(cumulativevar)[1] <- "eigen_value"
row.names(cumulativevar) <- paste0("X",c(1:ncol(df_final)))
options(scipen = 999)
print(cumulativevar)

##                    eigen_value             proportionvar       cum
## X1   5.88164113157723189573289 36.7602570723577031230889  36.76026
## X2   2.07246731746274859276014 12.9529207341421823684868  49.71318
## X3   1.97367048019837088013162 12.3354405012398196106460  62.04862
## X4   1.45753789573587044792191  9.1096118483491910211569  71.15823
## X5   1.07482902866871943281524  6.7176814291794979538963  77.87591
## X6   0.90539514593259340369258  5.6587196620787096890126  83.53463
## X7   0.88566073593222538473668  5.5353795995764096815606  89.07001
## X8   0.58683440279191823041316  3.6677150174494896894828  92.73773
## X9   0.52462352473839268807154  3.2788970296149551053588  96.01662
## X10  0.36421247185998995021805  2.2763279491249375219297  98.29295
## X11  0.13293892668357251696243  0.8308682917723284599987  99.12382
## X12  0.11449409052195343161618  0.7155880657622090135206  99.83941
## X13  0.02102870930241857935772  0.1314294331401161530781  99.97084
## X14  0.00408779298385878505501  0.0255487061491174102801  99.99639
## X15  0.00057834561013329899293  0.0036146600633331195732 100.00000
## X16 -0.00000000000000005561957 -0.0000000000000003476223 100.00000

ev <- cumulativevar$eigen_value

plot(ev, type = "b", pch = 19, col = "blue",
     xlab = "Principal Component",
     ylab = "Eigenvalue",
     main = "Scree Plot",
     xaxt = "n")

axis(1, at = 1:length(ev), labels = paste0("PC", 1:length(ev)))
abline(h = 1, col = "red", lty = 2)
text(length(ev), 1.2, "Threshold (Eigenvalue = 1)", col = "red", adj = 1)

Berdasarkan grafik scree plot, terlihat bahwa nilai eigenvalue mengalami penurunan yang sangat tajam dari komponen pertama (PC1) ke komponen kedua (PC2), kemudian menurun secara bertahap pada komponen berikutnya. PC1 memiliki eigenvalue paling tinggi (sekitar >5), yang menunjukkan bahwa komponen ini menyimpan variasi data paling besar. Setelah itu, kontribusi masing-masing komponen mulai menurun secara signifikan.

scores <- as.matrix(df_final) %*% pc$vectors
scores_PC <- scores[,1:5]
colnames(scores_PC) <- paste0("PC", 1:ncol(scores_PC))
rownames(scores_PC) <- paste0("X", 1:nrow(scores_PC))
head(scores_PC)

##           PC1        PC2        PC3         PC4         PC5
## X1  3.0369013  0.2900002 -0.3636239 -0.55548198  0.75424216
## X2  2.8124569 -0.4581032 -0.3940214 -0.06093578  0.51834295
## X3  1.9472949 -1.4438478 -0.8494465  0.12567112 -1.01303755
## X4 -1.7563948 -0.1012544  1.2817542  1.46633153  0.80944274
## X5  0.1027531  2.4888334  1.2281909 -1.99816542 -0.46303153
## X6 -0.3541429  2.0459649 -2.1900278  2.67449014  0.01938291

scores_df <- as.data.frame(scores_PC)

Clustering with 5 Methode

K-Means

Menentukan Jumlah Cluster(Elbow Method)

wss <- sapply(1:10, function(k){
  kmeans(df_final, centers = k, nstart = 25)$tot.withinss
})

plot(1:10, wss, type="b", pch=19,
     xlab="Jumlah Cluster",
     ylab="WSS",
     main="Elbow Method")

Berdasarkan Visualisasi Diata, terlihat bahwa nilai WSS (Within Sum of Squares) mengalami penurunan yang cukup tajam dari k = 1 hingga k = 3. Setelah k = 3, penurunan nilai WSS cenderung melandai dan tidak signifikan. Hal ini menunjukkan bahwa penambahan jumlah cluster setelah k = 3 tidak memberikan peningkatan yang berarti dalam mengurangi variasi dalam cluster.

set.seed(123)
km_res <- kmeans(scores_df, centers = 3, nstart = 25)

km_res$cluster

##   X1   X2   X3   X4   X5   X6   X7   X8   X9  X10  X11  X12  X13  X14  X15  X16 
##    1    1    1    2    2    2    1    2    2    2    3    3    3    1    1    1 
##  X17  X18  X19  X20  X21  X22  X23  X24  X25  X26  X27  X28  X29  X30  X31  X32 
##    1    1    1    1    1    1    2    1    1    1    3    1    1    3    1    1 
##  X33  X34  X35  X36  X37  X38  X39  X40  X41  X42  X43  X44  X45  X46  X47  X48 
##    1    1    2    2    1    1    1    1    2    3    2    2    2    3    3    2 
##  X49  X50  X51  X52  X53  X54  X55  X56  X57  X58  X59  X60  X61  X62  X63  X64 
##    2    3    2    2    2    1    3    1    3    2    2    2    3    2    2    3 
##  X65  X66  X67  X68  X69  X70  X71  X72  X73  X74  X75  X76  X77  X78  X79  X80 
##    3    2    2    3    3    3    2    2    2    2    3    3    2    2    2    3 
##  X81  X82  X83  X84  X85  X86  X87  X88  X89  X90  X91  X92  X93  X94  X95  X96 
##    2    1    2    2    3    3    2    2    2    2    2    2    2    3    2    1 
##  X97  X98  X99 X100 X101 X102 X103 X104 X105 X106 X107 X108 X109 X110 X111 X112 
##    2    1    2    2    2    1    1    1    1    1    2    1    2    2    2    1 
## X113 X114 X115 X116 X117 X118 X119 X120 X121 X122 X123 X124 X125 X126 X127 X128 
##    3    3    1    1    1    1    3    1    3    1    2    3    2    1    3    2 
## X129 X130 X131 X132 X133 X134 X135 X136 X137 X138 X139 X140 X141 X142 X143 X144 
##    1    3    1    1    1    2    2    1    1    1    1    1    1    1    1    2 
## X145 X146 X147 X148 X149 X150 X151 X152 X153 X154 X155 X156 X157 X158 X159 X160 
##    1    1    1    1    1    1    2    1    1    1    1    1    1    1    1    1 
## X161 X162 X163 X164 
##    1    1    1    1

table(km_res$cluster)

## 
##  1  2  3 
## 76 58 30

plot(scores_df$PC1, scores_df$PC2,
     col = km_res$cluster,
     pch = 19,
     main = "K-Means Clustering (k=3) - Basis 5 PC",
     xlab = "PC1", ylab = "PC2")

Hasil visualisasi menunjukkan bahwa metode clustering mampu membagi data menjadi tiga kelompok utama, namun pemisahan antar cluster belum sepenuhnya jelas. Hal ini ditandai dengan masih adanya kedekatan dan sedikit tumpang tindih antar beberapa cluster. Dengan demikian, struktur cluster dalam data tergolong cukup terbentuk, tetapi belum optimal.

df_final$country_name <- df_countryname$country_name

df_final$cluster_kmeans <- km_res$cluster

head(df_final[, c("country_name", "cluster_kmeans")])

##                                  country_name cluster_kmeans
## 1                 Africa Eastern and Southern              1
## 2                  Africa Western and Central              1
## 3                                  Arab World              1
## 4              Central Europe and the Baltics              2
## 5                      Caribbean small states              2
## 6 East Asia & Pacific (excluding high income)              2

K-Median

set.seed(123)

kmed_res <- kcca(scores_df, k = 3, family = kccaFamily("kmedians"))

## Found more than one class "kcca" in cache; using the first, from namespace 'flexclust'

## Also defined by 'kernlab'

## Found more than one class "kcca" in cache; using the first, from namespace 'flexclust'

## Also defined by 'kernlab'

clusters(kmed_res)

##   X1   X2   X3   X4   X5   X6   X7   X8   X9  X10  X11  X12  X13  X14  X15  X16 
##    1    1    1    3    2    2    2    2    3    3    2    3    2    1    2    2 
##  X17  X18  X19  X20  X21  X22  X23  X24  X25  X26  X27  X28  X29  X30  X31  X32 
##    1    1    2    2    1    2    2    1    2    1    2    2    1    2    1    1 
##  X33  X34  X35  X36  X37  X38  X39  X40  X41  X42  X43  X44  X45  X46  X47  X48 
##    1    2    2    3    2    1    1    1    2    3    2    3    3    3    3    3 
##  X49  X50  X51  X52  X53  X54  X55  X56  X57  X58  X59  X60  X61  X62  X63  X64 
##    3    3    3    3    3    2    3    1    3    3    2    3    3    3    3    3 
##  X65  X66  X67  X68  X69  X70  X71  X72  X73  X74  X75  X76  X77  X78  X79  X80 
##    3    3    3    3    3    3    3    3    3    3    3    3    3    3    3    3 
##  X81  X82  X83  X84  X85  X86  X87  X88  X89  X90  X91  X92  X93  X94  X95  X96 
##    3    1    2    3    3    3    3    3    3    3    3    3    3    3    3    2 
##  X97  X98  X99 X100 X101 X102 X103 X104 X105 X106 X107 X108 X109 X110 X111 X112 
##    2    2    2    2    3    2    2    2    2    2    3    1    2    2    2    2 
## X113 X114 X115 X116 X117 X118 X119 X120 X121 X122 X123 X124 X125 X126 X127 X128 
##    3    3    1    1    1    1    3    1    3    1    3    3    3    1    3    3 
## X129 X130 X131 X132 X133 X134 X135 X136 X137 X138 X139 X140 X141 X142 X143 X144 
##    1    3    1    2    1    3    3    1    1    1    1    2    1    1    1    2 
## X145 X146 X147 X148 X149 X150 X151 X152 X153 X154 X155 X156 X157 X158 X159 X160 
##    1    1    1    1    1    1    3    1    1    1    1    2    1    1    1    2 
## X161 X162 X163 X164 
##    1    2    2    2

table(clusters(kmed_res))

## 
##  1  2  3 
## 50 46 68

scores_df$kmedian_cluster <- clusters(kmed_res)

head(scores_df)

##           PC1        PC2        PC3         PC4         PC5 kmedian_cluster
## X1  3.0369013  0.2900002 -0.3636239 -0.55548198  0.75424216               1
## X2  2.8124569 -0.4581032 -0.3940214 -0.06093578  0.51834295               1
## X3  1.9472949 -1.4438478 -0.8494465  0.12567112 -1.01303755               1
## X4 -1.7563948 -0.1012544  1.2817542  1.46633153  0.80944274               3
## X5  0.1027531  2.4888334  1.2281909 -1.99816542 -0.46303153               2
## X6 -0.3541429  2.0459649 -2.1900278  2.67449014  0.01938291               2

plot(scores_PC[,1:2], col = clusters(kmed_res), pch = 19,
     main = "K-Medians (Median-based)", xlab = "PC1", ylab = "PC2")

Metode K-Median menghasilkan pola pengelompokan yang serupa dengan K-Means. Meskipun metode ini lebih tahan terhadap pengaruh outlier, hasil yang diperoleh tetap menunjukkan adanya overlap antar cluster. Hal ini memperkuat indikasi bahwa data tidak memiliki pemisahan cluster yang kuat.

df_final$country_name <- df_countryname$country_name

df_final$cluster_kmedians <- clusters(kmed_res)

head(df_final[, c("country_name", "cluster_kmedians")])

##                                  country_name cluster_kmedians
## 1                 Africa Eastern and Southern                1
## 2                  Africa Western and Central                1
## 3                                  Arab World                1
## 4              Central Europe and the Baltics                3
## 5                      Caribbean small states                2
## 6 East Asia & Pacific (excluding high income)                2

DBSCAN (Density-Based)

set.seed(123)
db_res <- dbscan(scores_df, eps = 2.0, MinPts = 5)
table(db_res$cluster)

## 
##   0   1   2 
##   9 149   6

plot(scores_df$PC1, scores_df$PC2,
     col = db_res$cluster + 1L,
     pch = 19,
     main = "DBSCAN Result (2 Clusters + Noise)",
     xlab = "PC1", ylab = "PC2")

DBSCAN mengelompokkan data berdasarkan kepadatan. Hasil yang diperoleh menunjukkan danya beberapa data yang dikategorikan sebagai noise(tidak termasuk dalam cluster manapun). Keberadaan noise ini menunjukkan bahwa terdapat data yang tidak memiliki kesamaan yang cukup dengan kelompok lainnya, serta mengindikasikan bahwa struktur kepadatan data tidak terlalu jelas.

df_final$country_name <- df_countryname$country_name

df_final$cluster_DBSCAN <- db_res$cluster

head(df_final[, c("country_name", "cluster_DBSCAN")])

##                                  country_name cluster_DBSCAN
## 1                 Africa Eastern and Southern              1
## 2                  Africa Western and Central              1
## 3                                  Arab World              1
## 4              Central Europe and the Baltics              1
## 5                      Caribbean small states              1
## 6 East Asia & Pacific (excluding high income)              2

Mean Shift

set.seed(123)
ms_res <- meanShift(as.matrix(scores_df))

table(ms_res$assignment)

## 
##  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 
## 28  1 36  2  7  1  1  1  3  4  7  2  3  1  1  6  1  1  1  1  2  3  2  1  2  1 
## 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 
##  2  1  8  1  1  1  1  3  1  1  1  2  1  1  2  1  2  1  1  1  2  1  1  2  1  1 
## 53 54 55 56 
##  1  1  1  1

my_colors <- rainbow(max(ms_res$assignment))

plot(scores_df$PC1, scores_df$PC2,
     col = my_colors[ms_res$assignment],
     pch = 19,
     main = "Mean Shift Clustering Result",
     xlab = "PC1 (Indikator Utama)",
     ylab = "PC2 (Indikator Kedua)")

legend("topright", legend = unique(ms_res$assignment),
       col = my_colors[unique(ms_res$assignment)], pch = 19, cex = 0.8)

Metode Mean Shift secara otomatis menentukan jumlah cluster berdasarkan distribusi data. Hasil yang diperoleh menunjukkan bahwa cluster yang terbentuk masih saling berdekatan dan tidak memiliki batas yang tegas. Hal ini kembali menunjukkan bahwa distribusi data cenderung menyebar dan tidak membentuk kelompok yang jelas.

df_final$country_name <- df_countryname$country_name

df_final$cluster_MeanShift <- ms_res$assignment

head(df_final[, c("country_name", "cluster_MeanShift")])

##                                  country_name cluster_MeanShift
## 1                 Africa Eastern and Southern                 1
## 2                  Africa Western and Central                 1
## 3                                  Arab World                 2
## 4              Central Europe and the Baltics                 3
## 5                      Caribbean small states                 4
## 6 East Asia & Pacific (excluding high income)                 5

Fuzzy C-Means (FCM)

fpc_values <- numeric(9)
k_range <- 2:10

for (k in k_range) {
  tmp_fcm <- cmeans(scores_df, centers = k, m = 2)
  fpc_values[k-1] <- tmp_fcm$withinerror
}

wcss <- numeric(9)
for (k in k_range) {
  tmp_fcm <- cmeans(scores_df, centers = k, m = 2)
  wcss[k-1] <- sum(tmp_fcm$withinerror)
}

# Plot Elbow Method
plot(k_range, wcss, type = "b", pch = 19, frame = FALSE,
     main = "Elbow Method untuk FCM",
     xlab = "Jumlah Cluster (k)",
     ylab = "Total Within-Cluster Sum of Squares")

set.seed(123)
fcm_res <- cmeans(scores_df, centers = 3, m = 2)
table(fcm_res$cluster)

## 
##  1  2  3 
## 62 67 35

head(fcm_res$membership)

##             1         2          3
## X1 0.07756831 0.8880133 0.03441835
## X2 0.03090479 0.9556264 0.01346885
## X3 0.15725306 0.7624006 0.08034636
## X4 0.62417777 0.1058745 0.26994776
## X5 0.49475742 0.2866761 0.21856644
## X6 0.43474757 0.2908191 0.27443336

plot(scores_df$PC1, scores_df$PC2,
     col = fcm_res$cluster,
     pch = 19,
     main = "Fuzzy C-Means Clustering (k=3)",
     xlab = "PC1", ylab = "PC2")

# Pusat cluster (Centroids) a
points(fcm_res$centers[,1:2], col = 1:3, pch = 8, cex = 2, lwd = 3)

Pada metode Fuzzy C-Means, setiap data memiliki derajat keanggotaan pada lebih dari satu cluster. Hasil yang diperoleh menunjukkan bahwa banyak data memiliki keanggotaan pada beberapa cluster sekaligus.Hal ini menandakan bahwa batas antar cluster tidak jelas dan data memiliki karakteristik yang saling tumpang tindih.

df_final$country_name <- df_countryname$country_name

df_final$cluster_FCM <- fcm_res$cluster

head(df_final[, c("country_name", "cluster_FCM")])

##                                  country_name cluster_FCM
## 1                 Africa Eastern and Southern           2
## 2                  Africa Western and Central           2
## 3                                  Arab World           2
## 4              Central Europe and the Baltics           1
## 5                      Caribbean small states           1
## 6 East Asia & Pacific (excluding high income)           1

Perbandingan hasil clustering

options(repr.plot.width = 10, repr.plot.height = 10)
par(mfrow = c(3, 3), mar = c(4, 4, 3, 1))

# K-Means
plot(scores_df$PC1, scores_df$PC2, col = km_res$cluster,
     pch = 19, main = "K-Means (k=3)", xlab = "PC1", ylab = "PC2")

# K-Medians
plot(scores_df$PC1, scores_df$PC2, col = clusters(kmed_res),
     pch = 19, main = "K-Medians (k=3)", xlab = "PC1", ylab = "PC2")

# DBSCAN
# +1L digunakan agar cluster 0 (Noise) tidak berwarna putih/transparan
plot(scores_df$PC1, scores_df$PC2, col = db_res$cluster + 1L,
     pch = 19, main = "DBSCAN (Black = Noise)", xlab = "PC1", ylab = "PC2")

# Mean Shift
plot(scores_df$PC1, scores_df$PC2, col = ms_res$assignment,
     pch = 19, main = "Mean Shift", xlab = "PC1", ylab = "PC2")

# Fuzzy C-Means
plot(scores_df$PC1, scores_df$PC2, col = fcm_res$cluster,
     pch = 19, main = "Fuzzy C-Means (k=3)", xlab = "PC1", ylab = "PC2")

# Data Asli untuk melihat sebaran asli data sebelum dikelompokkan
plot(scores_df$PC1, scores_df$PC2, col = "grey",
     pch = 19, main = "Original Data (No Cluster)", xlab = "PC1", ylab = "PC2")

par(mfrow = c(1, 1))

Berdasarkan seluruh metode clustering yang telah diterapkan, dapat disimpulkan bahwa dataset tidak memiliki struktur cluster yang kuat. Hal ini ditunjukkan oleh hasil visualisasi yang masih memperlihatkan overlap antar cluster. Dengan demikian, metode clustering berbasis jarak seperti K-Means dan K-Median kurang optimal untuk dataset ini, karena data yang digunakan memiliki karakteristik yang relatif homogen.

EVALUASI

## EVALUASI
calc_dbi <- function(data, clusters) {
  clusters <- as.numeric(clusters)
  if(length(unique(clusters[clusters > 0])) < 2) return(NA)

  res <- try(clusterSim::index.DB(data, clusters)$db, silent = TRUE)
  if(inherits(res, "try-error") || is.null(res)) return(NA) else return(res)
}

# helper biar aman dari NULL / kosong
safe_val <- function(x) {
  if(is.null(x) || length(x) == 0 || inherits(x, "try-error")) return(NA)
  return(as.numeric(x))
}

evaluasi <- data.frame(
  Metode = c("K-Means", "K-Medians", "DBSCAN", "Mean Shift", "Fuzzy C-Means"),
  Silhouette_Score = NA,
  DB_Index = NA,
  Jumlah_Cluster = NA,
  Jumlah_Noise = NA
)

# K-MEANS
sil_km <- try(silhouette(km_res$cluster, dist(scores_df)), silent = TRUE)
sil_km_val <- if(inherits(sil_km, "try-error")) NA else mean(sil_km[,3])

dbi_km_val <- safe_val(calc_dbi(scores_df, km_res$cluster))

evaluasi[1, 2:5] <- c(
  safe_val(sil_km_val),
  dbi_km_val,
  length(unique(km_res$cluster)),
  0
)

# K-MEDIANS
kmed_clusters <- clusters(kmed_res)

sil_kmed <- try(silhouette(kmed_clusters, dist(scores_df)), silent = TRUE)
sil_kmed_val <- if(inherits(sil_kmed, "try-error")) NA else mean(sil_kmed[,3])

dbi_kmed_val <- safe_val(calc_dbi(scores_df, kmed_clusters))

evaluasi[2, 2:5] <- c(
  safe_val(sil_kmed_val),
  dbi_kmed_val,
  length(unique(kmed_clusters)),
  0
)

# DBSCAN
db_clusters <- db_res$cluster

sil_db_val <- NA
if(length(unique(db_clusters[db_clusters > 0])) > 1) {
  sil_db <- try(
    silhouette(db_clusters[db_clusters > 0],
               dist(scores_df[db_clusters > 0, ])),
    silent = TRUE
  )
  if(!inherits(sil_db, "try-error")) {
    sil_db_val <- mean(sil_db[,3])
  }
}

dbi_db_val <- safe_val(calc_dbi(scores_df, db_clusters))

evaluasi[3, 2:5] <- c(
  safe_val(sil_db_val),
  dbi_db_val,
  length(unique(db_clusters[db_clusters > 0])),
  sum(db_clusters == 0)
)

# MEAN SHIFT
ms_clusters <- ms_res$assignment

sil_ms <- try(silhouette(ms_clusters, dist(scores_df)), silent = TRUE)
sil_ms_val <- if(inherits(sil_ms, "try-error")) NA else mean(sil_ms[,3])

dbi_ms_val <- safe_val(calc_dbi(scores_df, ms_clusters))

evaluasi[4, 2:5] <- c(
  safe_val(sil_ms_val),
  dbi_ms_val,
  length(unique(ms_clusters)),
  0
)

# FUZZY C-MEANS

fcm_clusters <- fcm_res$cluster

sil_fcm <- try(silhouette(fcm_clusters, dist(scores_df)), silent = TRUE)
sil_fcm_val <- if(inherits(sil_fcm, "try-error")) NA else mean(sil_fcm[,3])

dbi_fcm_val <- safe_val(calc_dbi(scores_df, fcm_clusters))

evaluasi[5, 2:5] <- c(
  safe_val(sil_fcm_val),
  dbi_fcm_val,
  length(unique(fcm_clusters)),
  0
)

# HASIL
print(evaluasi)

##          Metode Silhouette_Score DB_Index Jumlah_Cluster Jumlah_Noise
## 1       K-Means      0.304143819       NA              3            0
## 2     K-Medians      0.260370885       NA              3            0
## 3        DBSCAN      0.201105359       NA              2            9
## 4    Mean Shift     -0.001105504       NA             56            0
## 5 Fuzzy C-Means      0.289359770       NA              3            0

profil_cluster <- aggregate(df_final[, 1:16],
                            by = list(Cluster = km_res$cluster),
                            FUN = mean)

profil_cluster

##   Cluster     gdp_usd  population gdp_per_capita inflation_rate
## 1       1 -0.10009635  0.16097739    -0.66613373      0.4636676
## 2       2  0.01517265 -0.08495999    -0.09699187     -0.2188290
## 3       3  0.22424363 -0.24355341     1.87505640     -0.7515553
##   unemployment_rate fdi_pct_gdp forest_area_pct child_mortality
## 1        0.21202541  -0.1103036      -0.2782146       0.7748809
## 2        0.01648077   0.1998048       0.4175677      -0.5894810
## 3       -0.56899387  -0.1068535      -0.1024871      -0.8233684
##   mobile_subscriptions_per_100 calculated_gdp_per_capita
## 1                   -0.6823330                -0.6647325
## 2                    0.6061840                -0.1019379
## 3                    0.5566211                 1.8810688
##   real_economic_growth_indicator econ_opportunity_index
## 1                     -0.1617530             -0.6442123
## 2                     -0.1972637             -0.1366234
## 3                      0.7911508              1.8961430
##   digital_connectivity_index climate_vulnerability_index
## 1                 -0.8079934                   0.2782146
## 2                  0.7904180                  -0.4175677
## 3                  0.5187753                   0.1024871
##   digital_readiness_score global_development_resilience_index
## 1              -0.8643400                          -0.7919436
## 2               0.7995215                           0.3450369
## 3               0.6439196                           1.3391856

Anmul_Clustering

1. Arimbi Deby S.(24031554186) 2. Mutiara Nur Fadhilatun N.(24031554116)

2026-04-05