library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.0 ✔ readr 2.2.0
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.2 ✔ tibble 3.3.1
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(flexclust)
## Warning: package 'flexclust' was built under R version 4.5.3
library(dbscan)
## Warning: package 'dbscan' was built under R version 4.5.3
##
## Attaching package: 'dbscan'
##
## The following object is masked from 'package:stats':
##
## as.dendrogram
library(meanShiftR)
library(e1071)
## Warning: package 'e1071' was built under R version 4.5.3
##
## Attaching package: 'e1071'
##
## The following object is masked from 'package:flexclust':
##
## bclust
##
## The following object is masked from 'package:ggplot2':
##
## element
library(cluster)
## Warning: package 'cluster' was built under R version 4.5.3
library(fpc)
## Warning: package 'fpc' was built under R version 4.5.3
##
## Attaching package: 'fpc'
##
## The following object is masked from 'package:dbscan':
##
## dbscan
library(mclust)
## Warning: package 'mclust' was built under R version 4.5.3
## Package 'mclust' version 6.1.2
## Type 'citation("mclust")' for citing this R package in publications.
##
## Attaching package: 'mclust'
##
## The following object is masked from 'package:dplyr':
##
## count
##
## The following object is masked from 'package:purrr':
##
## map
library(moments)
##
## Attaching package: 'moments'
##
## The following objects are masked from 'package:e1071':
##
## kurtosis, moment, skewness
library(tidyr)
library(ggplot2)
library(clusterSim)
## Warning: package 'clusterSim' was built under R version 4.5.3
## Loading required package: MASS
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
library(dplyr)
df <- read_csv("C:/Users/ARIMBY/Downloads/Global_Development_Indicators_2000_2020.csv")
## Rows: 5556 Columns: 47
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): country_code, country_name, region, income_group, currency_unit
## dbl (42): year, gdp_usd, population, gdp_per_capita, inflation_rate, unemplo...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df_asli <- df
head(df_asli)
## # A tibble: 6 × 47
## year country_code country_name region income_group currency_unit gdp_usd
## <dbl> <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 2000 AFE Africa Eastern a… <NA> <NA> <NA> 2.84e11
## 2 2001 AFE Africa Eastern a… <NA> <NA> <NA> 2.59e11
## 3 2002 AFE Africa Eastern a… <NA> <NA> <NA> 2.65e11
## 4 2003 AFE Africa Eastern a… <NA> <NA> <NA> 3.53e11
## 5 2004 AFE Africa Eastern a… <NA> <NA> <NA> 4.39e11
## 6 2005 AFE Africa Eastern a… <NA> <NA> <NA> 5.12e11
## # ℹ 40 more variables: population <dbl>, gdp_per_capita <dbl>,
## # inflation_rate <dbl>, unemployment_rate <dbl>, fdi_pct_gdp <dbl>,
## # co2_emissions_kt <dbl>, energy_use_per_capita <dbl>,
## # renewable_energy_pct <dbl>, forest_area_pct <dbl>,
## # electricity_access_pct <dbl>, life_expectancy <dbl>, child_mortality <dbl>,
## # school_enrollment_secondary <dbl>, health_expenditure_pct_gdp <dbl>,
## # hospital_beds_per_1000 <dbl>, physicians_per_1000 <dbl>, …
df_2020 <- df_asli %>%
filter(year == 2020)
df_numeric <- df_2020 %>%
dplyr::select(where(is.numeric))
colSums(is.na(df_numeric))
## year gdp_usd
## 0 22
## population gdp_per_capita
## 0 22
## inflation_rate unemployment_rate
## 58 30
## fdi_pct_gdp co2_emissions_kt
## 30 264
## energy_use_per_capita renewable_energy_pct
## 264 264
## forest_area_pct electricity_access_pct
## 8 264
## life_expectancy child_mortality
## 264 24
## school_enrollment_secondary health_expenditure_pct_gdp
## 207 264
## hospital_beds_per_1000 physicians_per_1000
## 264 264
## internet_usage_pct mobile_subscriptions_per_100
## 189 76
## calculated_gdp_per_capita real_economic_growth_indicator
## 22 59
## econ_opportunity_index co2_emissions_per_capita_tons
## 40 264
## co2_intensity_per_million_gdp green_transition_score
## 264 264
## ecological_preservation_index renewable_energy_efficiency
## 8 264
## human_development_composite healthcare_capacity_index
## 264 264
## digital_connectivity_index health_development_ratio
## 0 264
## education_health_ratio years_since_2000
## 213 0
## years_since_century is_pandemic_period
## 0 0
## human_development_index climate_vulnerability_index
## 264 0
## digital_readiness_score governance_quality_index
## 0 0
## global_resilience_score global_development_resilience_index
## 264 0
threshold <- nrow(df_numeric) * 0.5
df_numeric2 <- df_numeric[, colSums(is.na(df_numeric)) < threshold]
df_clean <- na.omit(df_numeric2)
dim(df_numeric)
## [1] 264 42
dim(df_clean)
## [1] 164 22
print(colnames(df_clean))
## [1] "year" "gdp_usd"
## [3] "population" "gdp_per_capita"
## [5] "inflation_rate" "unemployment_rate"
## [7] "fdi_pct_gdp" "forest_area_pct"
## [9] "child_mortality" "mobile_subscriptions_per_100"
## [11] "calculated_gdp_per_capita" "real_economic_growth_indicator"
## [13] "econ_opportunity_index" "ecological_preservation_index"
## [15] "digital_connectivity_index" "years_since_2000"
## [17] "years_since_century" "is_pandemic_period"
## [19] "climate_vulnerability_index" "digital_readiness_score"
## [21] "governance_quality_index" "global_development_resilience_index"
head(df_clean)
## # A tibble: 6 × 22
## year gdp_usd population gdp_per_capita inflation_rate unemployment_rate
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2020 9.21e11 677243299 1360. 5.19 7.56
## 2 2020 7.85e11 458803476 1710. 2.43 6.77
## 3 2020 2.45e12 436080728 5613. 1.78 11.5
## 4 2020 1.65e12 102253057 16168. 1.67 4.37
## 5 2020 6.60e10 7442291 8874. 0.0385 9.83
## 6 2020 1.74e13 2113820753 8255. 2.53 4.39
## # ℹ 16 more variables: fdi_pct_gdp <dbl>, forest_area_pct <dbl>,
## # child_mortality <dbl>, mobile_subscriptions_per_100 <dbl>,
## # calculated_gdp_per_capita <dbl>, real_economic_growth_indicator <dbl>,
## # econ_opportunity_index <dbl>, ecological_preservation_index <dbl>,
## # digital_connectivity_index <dbl>, years_since_2000 <dbl>,
## # years_since_century <dbl>, is_pandemic_period <dbl>,
## # climate_vulnerability_index <dbl>, digital_readiness_score <dbl>, …
cols_to_remove <- c("year", "years_since_2000", "years_since_century", "ecological_preservation_index",
"is_pandemic_period", "governance_quality_index")
existing_cols_to_remove <- intersect(cols_to_remove, names(df_clean))
df_clean <- df_clean %>%
dplyr::select(-dplyr::all_of(existing_cols_to_remove))
ncol(df_clean)
## [1] 16
head(df_clean)
## # A tibble: 6 × 16
## gdp_usd population gdp_per_capita inflation_rate unemployment_rate fdi_pct_gdp
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 9.21e11 677243299 1360. 5.19 7.56 1.40
## 2 7.85e11 458803476 1710. 2.43 6.77 2.20
## 3 2.45e12 436080728 5613. 1.78 11.5 1.60
## 4 1.65e12 102253057 16168. 1.67 4.37 12.7
## 5 6.60e10 7442291 8874. 0.0385 9.83 3.60
## 6 1.74e13 2113820753 8255. 2.53 4.39 1.50
## # ℹ 10 more variables: forest_area_pct <dbl>, child_mortality <dbl>,
## # mobile_subscriptions_per_100 <dbl>, calculated_gdp_per_capita <dbl>,
## # real_economic_growth_indicator <dbl>, econ_opportunity_index <dbl>,
## # digital_connectivity_index <dbl>, climate_vulnerability_index <dbl>,
## # digital_readiness_score <dbl>, global_development_resilience_index <dbl>
df_long <- pivot_longer(df_clean, cols = everything(), names_to = "Variable", values_to = "Value")
ggplot(df_long, aes(x = Variable, y = Value)) +
geom_boxplot(fill = "skyblue", outlier.color = "red") +
coord_flip() +
theme_minimal() +
labs(title = "Deteksi Outlier", x = "Variabel", y = "Z-Score Value")
Berdasarkan hasil visualisasi boxplot, ditemukan adanya beberapa nilai
ekstrem (outlier) pada sejumlah variabel. Outlier merupakan data yang
memiliki nilai jauh dari sebagian besar data lainnya
count_outliers <- function(x) {
qnt <- quantile(x, probs=c(.25, .75), na.rm = TRUE)
H <- 1.5 * IQR(x, na.rm = TRUE)
sum(x < (qnt[1] - H) | x > (qnt[2] + H), na.rm = TRUE)
}
outlier_report <- sapply(df_clean, count_outliers)
print(outlier_report)
## gdp_usd population
## 24 30
## gdp_per_capita inflation_rate
## 16 11
## unemployment_rate fdi_pct_gdp
## 12 26
## forest_area_pct child_mortality
## 3 5
## mobile_subscriptions_per_100 calculated_gdp_per_capita
## 2 16
## real_economic_growth_indicator econ_opportunity_index
## 50 18
## digital_connectivity_index climate_vulnerability_index
## 0 3
## digital_readiness_score global_development_resilience_index
## 0 1
library(DescTools)
## Warning: package 'DescTools' was built under R version 4.5.3
##
## Attaching package: 'DescTools'
## The following object is masked from 'package:mclust':
##
## BrierScore
df_winsorized <- as.data.frame(lapply(df_clean, function(x) {
lim <- quantile(x, probs = c(0.05, 0.95), na.rm = TRUE)
x[x < lim[1]] <- lim[1]
x[x > lim[2]] <- lim[2]
return(x)
}))
head(df_winsorized)
## gdp_usd population gdp_per_capita inflation_rate unemployment_rate
## 1 9.207923e+11 677243299 1359.618 5.1914557 7.563187
## 2 7.845876e+11 458803476 1710.073 2.4313077 6.774914
## 3 2.447584e+12 436080728 5612.687 1.7774078 11.487321
## 4 1.653248e+12 102253057 16168.197 1.6724410 4.372293
## 5 6.603933e+10 7442291 8873.522 0.0385211 9.827410
## 6 1.744889e+13 2113820753 8254.671 2.5273254 4.388021
## fdi_pct_gdp forest_area_pct child_mortality mobile_subscriptions_per_100
## 1 1.403911 30.251786 58.158650 84.22978
## 2 2.196867 19.816762 77.713581 104.51582
## 3 1.601804 2.812929 34.086399 103.17840
## 4 11.487633 34.519877 4.745719 123.70580
## 5 3.603610 67.817786 16.240576 115.80447
## 6 1.497914 30.431156 14.437461 122.93366
## calculated_gdp_per_capita real_economic_growth_indicator
## 1 1359.618 261.8954
## 2 1710.073 703.3554
## 3 5612.687 3157.7933
## 4 16168.197 9667.4244
## 5 8873.522 83943.2248
## 6 8254.671 3266.1686
## econ_opportunity_index digital_connectivity_index climate_vulnerability_index
## 1 494.3842 0.000000 0.4243705
## 2 656.9962 0.000000 0.4504581
## 3 1656.0041 0.000000 0.4929677
## 4 7732.2690 10.140854 0.4137003
## 5 2830.5868 0.000000 0.3304555
## 6 3940.6276 9.200905 0.4239221
## digital_readiness_score global_development_resilience_index
## 1 0.2807659 4.592021
## 2 0.3483861 3.122205
## 3 0.3439280 8.275166
## 4 0.8280038 18.532399
## 5 0.3860149 11.037999
## 6 0.7540973 14.475839
zero_var_cols <- apply(df_winsorized, 2, function(x) var(x, na.rm = TRUE) == 0)
df_winsorized_no_zero_var <- df_winsorized[, !zero_var_cols]
df_scaled <- scale(df_winsorized_no_zero_var)
df_final <- as.data.frame(df_scaled)
head(df_final)
## gdp_usd population gdp_per_capita inflation_rate unemployment_rate
## 1 -0.28777097 0.5485440 -0.7839713 1.00715539 -0.1259477
## 2 -0.30984001 0.2082160 -0.7619064 -0.02393701 -0.2876703
## 3 -0.04038727 0.1728141 -0.5161955 -0.26821056 0.6791301
## 4 -0.16909242 -0.3472876 0.1483859 -0.30742242 -0.7805934
## 5 -0.42626524 -0.4950022 -0.3108913 -0.91779634 0.3385818
## 6 2.39025116 2.7867236 -0.3498545 0.01193176 -0.7773666
## fdi_pct_gdp forest_area_pct child_mortality mobile_subscriptions_per_100
## 1 -0.34339774 0.04814058 1.4900856 -1.3187311
## 2 -0.09589221 -0.50140160 2.3292276 -0.3899178
## 3 -0.28162945 -1.39687857 0.4570963 -0.4511526
## 4 2.80403892 0.27291209 -0.8019721 0.4887115
## 5 0.34319533 2.02648793 -0.3087044 0.1269423
## 6 -0.31405672 0.05758677 -0.3860797 0.4533581
## calculated_gdp_per_capita real_economic_growth_indicator
## 1 -0.7825812 -0.19384169
## 2 -0.7604780 -0.17991456
## 3 -0.5143399 -0.10248222
## 4 0.1513967 0.10288289
## 5 -0.3086788 2.44612751
## 6 -0.3477098 -0.09906321
## econ_opportunity_index digital_connectivity_index climate_vulnerability_index
## 1 -0.7275010 -0.8356546 -0.04814058
## 2 -0.7053245 -0.8356546 0.50140160
## 3 -0.5690832 -0.8356546 1.39687857
## 4 0.2595777 1.1886520 -0.27291209
## 5 -0.4088975 -0.8356546 -2.02648793
## 6 -0.2575138 1.0010203 -0.05758677
## digital_readiness_score global_development_resilience_index
## 1 -1.0734950 -1.28182061
## 2 -0.8007295 -1.45989838
## 3 -0.8187124 -0.83558331
## 4 1.1339482 0.40714787
## 5 -0.6489427 -0.50084794
## 6 0.8358246 -0.08433096
data_summary <- data.frame(
n = nrow(df_final),
Mean = apply(df_final, 2, mean),
Median = apply(df_final, 2, median),
SD = apply(df_final, 2, sd),
Min = apply(df_final, 2, min),
Max = apply(df_final, 2, max),
Skewness = apply(df_final, 2, skewness),
Kurtosis = apply(df_final, 2, kurtosis)
)
round(data_summary, 3)
## n Mean Median SD Min Max Skewness
## gdp_usd 164 0 -0.403 1 -0.436 3.297 2.638
## population 164 0 -0.477 1 -0.506 3.055 2.155
## gdp_per_capita 164 0 -0.442 1 -0.816 2.416 1.360
## inflation_rate 164 0 -0.208 1 -1.267 2.741 1.235
## unemployment_rate 164 0 -0.286 1 -1.158 2.590 1.245
## fdi_pct_gdp 164 0 -0.271 1 -1.301 2.804 1.553
## forest_area_pct 164 0 0.045 1 -1.501 2.026 0.247
## child_mortality 164 0 -0.394 1 -0.893 2.329 1.153
## mobile_subscriptions_per_100 164 0 0.057 1 -1.796 1.835 0.017
## calculated_gdp_per_capita 164 0 -0.440 1 -0.814 2.423 1.372
## real_economic_growth_indicator 164 0 -0.155 1 -2.460 2.446 0.349
## econ_opportunity_index 164 0 -0.490 1 -0.741 2.631 1.518
## digital_connectivity_index 164 0 -0.836 1 -0.836 1.437 0.396
## climate_vulnerability_index 164 0 -0.045 1 -2.026 1.501 -0.247
## digital_readiness_score 164 0 -0.471 1 -1.214 1.571 0.338
## global_development_resilience_index 164 0 -0.034 1 -1.460 1.911 0.304
## Kurtosis
## gdp_usd 8.500
## population 6.375
## gdp_per_capita 3.487
## inflation_rate 4.273
## unemployment_rate 3.754
## fdi_pct_gdp 4.994
## forest_area_pct 2.298
## child_mortality 3.067
## mobile_subscriptions_per_100 2.145
## calculated_gdp_per_capita 3.523
## real_economic_growth_indicator 5.031
## econ_opportunity_index 4.054
## digital_connectivity_index 1.222
## climate_vulnerability_index 2.298
## digital_readiness_score 1.424
## global_development_resilience_index 2.040
df_2020 <- df_asli[df_asli$year == 2020, ]
kolom_dipakai <- c("country_name", "gdp_usd", "population", "gdp_per_capita",
"inflation_rate", "unemployment_rate", "fdi_pct_gdp",
"forest_area_pct", "child_mortality", "mobile_subscriptions_per_100",
"calculated_gdp_per_capita", "real_economic_growth_indicator",
"econ_opportunity_index", "digital_connectivity_index",
"climate_vulnerability_index", "digital_readiness_score",
"global_development_resilience_index")
df_temp <- df_2020[, kolom_dipakai]
df_countryname <- na.omit(df_temp)
nrow(df_countryname)
## [1] 164
mat_corr <- round(cor(df_final), 3)
mat_corr
## gdp_usd population gdp_per_capita
## gdp_usd 1.000 0.811 0.100
## population 0.811 1.000 -0.155
## gdp_per_capita 0.100 -0.155 1.000
## inflation_rate -0.107 0.045 -0.474
## unemployment_rate -0.144 -0.147 -0.246
## fdi_pct_gdp -0.130 -0.125 -0.036
## forest_area_pct 0.075 0.000 0.070
## child_mortality -0.105 0.155 -0.571
## mobile_subscriptions_per_100 0.062 -0.134 0.391
## calculated_gdp_per_capita 0.101 -0.154 0.999
## real_economic_growth_indicator 0.188 0.037 0.277
## econ_opportunity_index 0.070 -0.156 0.974
## digital_connectivity_index -0.006 -0.129 0.399
## climate_vulnerability_index -0.075 0.000 -0.070
## digital_readiness_score 0.006 -0.157 0.472
## global_development_resilience_index 0.001 -0.253 0.797
## inflation_rate unemployment_rate
## gdp_usd -0.107 -0.144
## population 0.045 -0.147
## gdp_per_capita -0.474 -0.246
## inflation_rate 1.000 -0.034
## unemployment_rate -0.034 1.000
## fdi_pct_gdp -0.031 0.035
## forest_area_pct -0.088 -0.012
## child_mortality 0.426 0.015
## mobile_subscriptions_per_100 -0.373 -0.095
## calculated_gdp_per_capita -0.471 -0.246
## real_economic_growth_indicator -0.048 -0.043
## econ_opportunity_index -0.447 -0.343
## digital_connectivity_index -0.311 -0.226
## climate_vulnerability_index 0.088 0.012
## digital_readiness_score -0.376 -0.205
## global_development_resilience_index -0.492 -0.154
## fdi_pct_gdp forest_area_pct child_mortality
## gdp_usd -0.130 0.075 -0.105
## population -0.125 0.000 0.155
## gdp_per_capita -0.036 0.070 -0.571
## inflation_rate -0.031 -0.088 0.426
## unemployment_rate 0.035 -0.012 0.015
## fdi_pct_gdp 1.000 -0.013 -0.012
## forest_area_pct -0.013 1.000 -0.200
## child_mortality -0.012 -0.200 1.000
## mobile_subscriptions_per_100 0.111 0.190 -0.482
## calculated_gdp_per_capita -0.032 0.072 -0.570
## real_economic_growth_indicator -0.101 0.098 -0.119
## econ_opportunity_index -0.024 0.014 -0.539
## digital_connectivity_index 0.144 0.109 -0.572
## climate_vulnerability_index 0.013 -1.000 0.200
## digital_readiness_score 0.134 0.140 -0.623
## global_development_resilience_index 0.022 0.110 -0.791
## mobile_subscriptions_per_100
## gdp_usd 0.062
## population -0.134
## gdp_per_capita 0.391
## inflation_rate -0.373
## unemployment_rate -0.095
## fdi_pct_gdp 0.111
## forest_area_pct 0.190
## child_mortality -0.482
## mobile_subscriptions_per_100 1.000
## calculated_gdp_per_capita 0.388
## real_economic_growth_indicator 0.003
## econ_opportunity_index 0.394
## digital_connectivity_index 0.499
## climate_vulnerability_index -0.190
## digital_readiness_score 0.679
## global_development_resilience_index 0.495
## calculated_gdp_per_capita
## gdp_usd 0.101
## population -0.154
## gdp_per_capita 0.999
## inflation_rate -0.471
## unemployment_rate -0.246
## fdi_pct_gdp -0.032
## forest_area_pct 0.072
## child_mortality -0.570
## mobile_subscriptions_per_100 0.388
## calculated_gdp_per_capita 1.000
## real_economic_growth_indicator 0.282
## econ_opportunity_index 0.975
## digital_connectivity_index 0.396
## climate_vulnerability_index -0.072
## digital_readiness_score 0.469
## global_development_resilience_index 0.796
## real_economic_growth_indicator
## gdp_usd 0.188
## population 0.037
## gdp_per_capita 0.277
## inflation_rate -0.048
## unemployment_rate -0.043
## fdi_pct_gdp -0.101
## forest_area_pct 0.098
## child_mortality -0.119
## mobile_subscriptions_per_100 0.003
## calculated_gdp_per_capita 0.282
## real_economic_growth_indicator 1.000
## econ_opportunity_index 0.215
## digital_connectivity_index 0.073
## climate_vulnerability_index -0.098
## digital_readiness_score 0.076
## global_development_resilience_index 0.163
## econ_opportunity_index
## gdp_usd 0.070
## population -0.156
## gdp_per_capita 0.974
## inflation_rate -0.447
## unemployment_rate -0.343
## fdi_pct_gdp -0.024
## forest_area_pct 0.014
## child_mortality -0.539
## mobile_subscriptions_per_100 0.394
## calculated_gdp_per_capita 0.975
## real_economic_growth_indicator 0.215
## econ_opportunity_index 1.000
## digital_connectivity_index 0.421
## climate_vulnerability_index -0.014
## digital_readiness_score 0.490
## global_development_resilience_index 0.784
## digital_connectivity_index
## gdp_usd -0.006
## population -0.129
## gdp_per_capita 0.399
## inflation_rate -0.311
## unemployment_rate -0.226
## fdi_pct_gdp 0.144
## forest_area_pct 0.109
## child_mortality -0.572
## mobile_subscriptions_per_100 0.499
## calculated_gdp_per_capita 0.396
## real_economic_growth_indicator 0.073
## econ_opportunity_index 0.421
## digital_connectivity_index 1.000
## climate_vulnerability_index -0.109
## digital_readiness_score 0.969
## global_development_resilience_index 0.650
## climate_vulnerability_index
## gdp_usd -0.075
## population 0.000
## gdp_per_capita -0.070
## inflation_rate 0.088
## unemployment_rate 0.012
## fdi_pct_gdp 0.013
## forest_area_pct -1.000
## child_mortality 0.200
## mobile_subscriptions_per_100 -0.190
## calculated_gdp_per_capita -0.072
## real_economic_growth_indicator -0.098
## econ_opportunity_index -0.014
## digital_connectivity_index -0.109
## climate_vulnerability_index 1.000
## digital_readiness_score -0.140
## global_development_resilience_index -0.110
## digital_readiness_score
## gdp_usd 0.006
## population -0.157
## gdp_per_capita 0.472
## inflation_rate -0.376
## unemployment_rate -0.205
## fdi_pct_gdp 0.134
## forest_area_pct 0.140
## child_mortality -0.623
## mobile_subscriptions_per_100 0.679
## calculated_gdp_per_capita 0.469
## real_economic_growth_indicator 0.076
## econ_opportunity_index 0.490
## digital_connectivity_index 0.969
## climate_vulnerability_index -0.140
## digital_readiness_score 1.000
## global_development_resilience_index 0.700
## global_development_resilience_index
## gdp_usd 0.001
## population -0.253
## gdp_per_capita 0.797
## inflation_rate -0.492
## unemployment_rate -0.154
## fdi_pct_gdp 0.022
## forest_area_pct 0.110
## child_mortality -0.791
## mobile_subscriptions_per_100 0.495
## calculated_gdp_per_capita 0.796
## real_economic_growth_indicator 0.163
## econ_opportunity_index 0.784
## digital_connectivity_index 0.650
## climate_vulnerability_index -0.110
## digital_readiness_score 0.700
## global_development_resilience_index 1.000
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:DescTools':
##
## AUC, ICC, SD
## The following object is masked from 'package:mclust':
##
## sim
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
n = nrow(df_final)
p = ncol(df_final)
cortest.bartlett(mat_corr,n=n, diag = TRUE)
## $chisq
## [1] 8885.943
##
## $p.value
## [1] 0
##
## $df
## [1] 120
KMO(df_final)
## Error in solve.default(r) :
## system is computationally singular: reciprocal condition number = 3.54288e-32
## matrix is not invertible, image not found
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = df_final)
## Overall MSA = 0.5
## MSA for each item =
## gdp_usd population
## 0.5 0.5
## gdp_per_capita inflation_rate
## 0.5 0.5
## unemployment_rate fdi_pct_gdp
## 0.5 0.5
## forest_area_pct child_mortality
## 0.5 0.5
## mobile_subscriptions_per_100 calculated_gdp_per_capita
## 0.5 0.5
## real_economic_growth_indicator econ_opportunity_index
## 0.5 0.5
## digital_connectivity_index climate_vulnerability_index
## 0.5 0.5
## digital_readiness_score global_development_resilience_index
## 0.5 0.5
r = cov(df_scaled)
pc <- eigen(r)
print(pc$values)
## [1] 5.881641e+00 2.072467e+00 1.973670e+00 1.457538e+00 1.074829e+00
## [6] 9.053951e-01 8.856607e-01 5.868344e-01 5.246235e-01 3.642125e-01
## [11] 1.329389e-01 1.144941e-01 2.102871e-02 4.087793e-03 5.783456e-04
## [16] -5.561957e-17
print(pc$vectors)
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] -0.03056085 0.360221457 -0.50543412 0.26559444 -0.17511779 0.065875049
## [2,] 0.08344257 0.335008111 -0.48896394 0.32951102 -0.08326466 0.098411150
## [3,] -0.36113465 -0.081085222 -0.18604124 -0.26289700 0.02014114 0.154007332
## [4,] 0.23809018 -0.008044023 0.02670188 -0.01698028 0.46247113 -0.252619334
## [5,] 0.10493651 -0.034684395 0.22880990 -0.10431510 -0.78783446 -0.111648961
## [6,] -0.01911378 -0.096118007 0.23518854 0.25887555 0.05893360 0.686869613
## [7,] -0.08355977 0.588525734 0.28547787 -0.20532244 0.08607598 0.088274997
## [8,] 0.32167470 -0.050194041 -0.07372739 -0.06874150 0.23320883 0.139583035
## [9,] -0.26270502 0.067908634 0.15816997 0.27583589 -0.08027271 -0.005309919
## [10,] -0.36059895 -0.079618939 -0.18704688 -0.26544743 0.02136821 0.156146397
## [11,] -0.09403192 0.120795492 -0.20633023 -0.28540215 0.01472699 -0.434269382
## [12,] -0.35719217 -0.120638831 -0.19289316 -0.21638098 0.11222666 0.174389370
## [13,] -0.29791515 -0.007948178 0.15931049 0.40151854 0.14964020 -0.275862763
## [14,] 0.08355977 -0.588525734 -0.28547787 0.20532244 -0.08607598 -0.088274997
## [15,] -0.32874675 0.003483157 0.16872321 0.38407537 0.09165676 -0.238324406
## [16,] -0.37931579 -0.074249616 0.01990343 -0.03141345 -0.06518864 -0.052980948
## [,7] [,8] [,9] [,10] [,11]
## [1,] -0.064894218 -0.11633814 0.109181223 0.04512354 0.684419621
## [2,] 0.010134119 -0.10169555 -0.037183628 -0.16364150 -0.690844682
## [3,] 0.005257744 -0.07155379 0.099999767 -0.19610918 0.008961346
## [4,] -0.151433878 -0.59699961 0.522630989 -0.10635484 -0.010729396
## [5,] -0.162337316 -0.24006704 0.176400337 -0.41068731 -0.019182689
## [6,] -0.611695087 -0.11365152 -0.009578578 0.07263299 -0.010020576
## [7,] 0.038384651 -0.02431296 -0.037115376 -0.06200708 -0.002341138
## [8,] -0.034784583 0.43008106 0.044776064 -0.66475006 0.120177009
## [9,] 0.083276995 0.44063213 0.741653965 0.13576343 -0.105510768
## [10,] -0.001186484 -0.07300246 0.100878700 -0.19431183 0.007476034
## [11,] -0.739559498 0.29663826 -0.027535885 0.13240394 -0.092308729
## [12,] 0.055978606 -0.04814883 0.089173298 -0.18056967 -0.041680254
## [13,] -0.087718802 -0.07934020 -0.297186568 -0.32721371 0.084035409
## [14,] -0.038384651 0.02431296 0.037115376 0.06200708 0.002341138
## [15,] -0.048249981 0.04403420 -0.058598840 -0.25756336 0.038803351
## [16,] 0.031689007 -0.24466648 -0.083030173 0.13476068 -0.095770094
## [,12] [,13] [,14] [,15] [,16]
## [1,] 0.077852159 0.045633310 -0.0044230063 0.0010395194 0.000000e+00
## [2,] 0.004892117 -0.024980853 -0.0023597829 0.0003618998 1.463025e-16
## [3,] -0.136722332 -0.415149840 -0.0048805780 0.7002884992 4.195654e-14
## [4,] 0.024167053 -0.006381329 -0.0036431201 0.0015940550 -1.394830e-16
## [5,] -0.016240435 0.081672889 0.0181951927 0.0010473581 4.800271e-17
## [6,] 0.010715358 0.001134787 -0.0109361197 0.0034334148 3.641384e-16
## [7,] 0.009073156 0.020554921 0.0005973863 0.0010786852 7.071068e-01
## [8,] 0.406624592 -0.010511763 -0.0009768425 0.0005431079 -1.535024e-16
## [9,] 0.051828786 -0.020617634 0.1870274606 -0.0015037135 -8.249122e-16
## [10,] -0.132258958 -0.391282248 -0.0087809969 -0.7136879267 -4.524303e-14
## [11,] 0.015093299 0.041321407 0.0053078901 0.0041522777 3.153116e-16
## [12,] -0.143987819 0.812231207 0.0494643356 0.0134114658 2.889166e-15
## [13,] -0.109837819 -0.041021991 0.6291969980 -0.0042287616 -2.044333e-15
## [14,] -0.009073156 -0.020554921 -0.0005973863 -0.0010786852 7.071068e-01
## [15,] -0.092726822 0.023307340 -0.7523596478 0.0007315901 1.970305e-15
## [16,] 0.864262534 0.009812166 -0.0045023610 0.0029178031 4.156694e-16
sumvar <- sum(pc$values)
proportionvar <- sapply(pc$values, function(x) x/sumvar)*100
cumulativevar <- data.frame(cbind(pc$values, proportionvar)) %>% mutate(cum = cumsum(proportionvar))
colnames(cumulativevar)[1] <- "eigen_value"
row.names(cumulativevar) <- paste0("X",c(1:ncol(df_final)))
options(scipen = 999)
print(cumulativevar)
## eigen_value proportionvar cum
## X1 5.88164113157723189573289 36.7602570723577031230889 36.76026
## X2 2.07246731746274859276014 12.9529207341421823684868 49.71318
## X3 1.97367048019837088013162 12.3354405012398196106460 62.04862
## X4 1.45753789573587044792191 9.1096118483491910211569 71.15823
## X5 1.07482902866871943281524 6.7176814291794979538963 77.87591
## X6 0.90539514593259340369258 5.6587196620787096890126 83.53463
## X7 0.88566073593222538473668 5.5353795995764096815606 89.07001
## X8 0.58683440279191823041316 3.6677150174494896894828 92.73773
## X9 0.52462352473839268807154 3.2788970296149551053588 96.01662
## X10 0.36421247185998995021805 2.2763279491249375219297 98.29295
## X11 0.13293892668357251696243 0.8308682917723284599987 99.12382
## X12 0.11449409052195343161618 0.7155880657622090135206 99.83941
## X13 0.02102870930241857935772 0.1314294331401161530781 99.97084
## X14 0.00408779298385878505501 0.0255487061491174102801 99.99639
## X15 0.00057834561013329899293 0.0036146600633331195732 100.00000
## X16 -0.00000000000000005561957 -0.0000000000000003476223 100.00000
ev <- cumulativevar$eigen_value
plot(ev, type = "b", pch = 19, col = "blue",
xlab = "Principal Component",
ylab = "Eigenvalue",
main = "Scree Plot",
xaxt = "n")
axis(1, at = 1:length(ev), labels = paste0("PC", 1:length(ev)))
abline(h = 1, col = "red", lty = 2)
text(length(ev), 1.2, "Threshold (Eigenvalue = 1)", col = "red", adj = 1)
Berdasarkan grafik scree plot, terlihat bahwa nilai eigenvalue mengalami penurunan yang sangat tajam dari komponen pertama (PC1) ke komponen kedua (PC2), kemudian menurun secara bertahap pada komponen berikutnya. PC1 memiliki eigenvalue paling tinggi (sekitar >5), yang menunjukkan bahwa komponen ini menyimpan variasi data paling besar. Setelah itu, kontribusi masing-masing komponen mulai menurun secara signifikan.
scores <- as.matrix(df_final) %*% pc$vectors
scores_PC <- scores[,1:5]
colnames(scores_PC) <- paste0("PC", 1:ncol(scores_PC))
rownames(scores_PC) <- paste0("X", 1:nrow(scores_PC))
head(scores_PC)
## PC1 PC2 PC3 PC4 PC5
## X1 3.0369013 0.2900002 -0.3636239 -0.55548198 0.75424216
## X2 2.8124569 -0.4581032 -0.3940214 -0.06093578 0.51834295
## X3 1.9472949 -1.4438478 -0.8494465 0.12567112 -1.01303755
## X4 -1.7563948 -0.1012544 1.2817542 1.46633153 0.80944274
## X5 0.1027531 2.4888334 1.2281909 -1.99816542 -0.46303153
## X6 -0.3541429 2.0459649 -2.1900278 2.67449014 0.01938291
scores_df <- as.data.frame(scores_PC)
Menentukan Jumlah Cluster(Elbow Method)
wss <- sapply(1:10, function(k){
kmeans(df_final, centers = k, nstart = 25)$tot.withinss
})
plot(1:10, wss, type="b", pch=19,
xlab="Jumlah Cluster",
ylab="WSS",
main="Elbow Method")
Berdasarkan Visualisasi Diata, terlihat bahwa nilai WSS (Within Sum of Squares) mengalami penurunan yang cukup tajam dari k = 1 hingga k = 3. Setelah k = 3, penurunan nilai WSS cenderung melandai dan tidak signifikan. Hal ini menunjukkan bahwa penambahan jumlah cluster setelah k = 3 tidak memberikan peningkatan yang berarti dalam mengurangi variasi dalam cluster.
set.seed(123)
km_res <- kmeans(scores_df, centers = 3, nstart = 25)
km_res$cluster
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16
## 1 1 1 2 2 2 1 2 2 2 3 3 3 1 1 1
## X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 X32
## 1 1 1 1 1 1 2 1 1 1 3 1 1 3 1 1
## X33 X34 X35 X36 X37 X38 X39 X40 X41 X42 X43 X44 X45 X46 X47 X48
## 1 1 2 2 1 1 1 1 2 3 2 2 2 3 3 2
## X49 X50 X51 X52 X53 X54 X55 X56 X57 X58 X59 X60 X61 X62 X63 X64
## 2 3 2 2 2 1 3 1 3 2 2 2 3 2 2 3
## X65 X66 X67 X68 X69 X70 X71 X72 X73 X74 X75 X76 X77 X78 X79 X80
## 3 2 2 3 3 3 2 2 2 2 3 3 2 2 2 3
## X81 X82 X83 X84 X85 X86 X87 X88 X89 X90 X91 X92 X93 X94 X95 X96
## 2 1 2 2 3 3 2 2 2 2 2 2 2 3 2 1
## X97 X98 X99 X100 X101 X102 X103 X104 X105 X106 X107 X108 X109 X110 X111 X112
## 2 1 2 2 2 1 1 1 1 1 2 1 2 2 2 1
## X113 X114 X115 X116 X117 X118 X119 X120 X121 X122 X123 X124 X125 X126 X127 X128
## 3 3 1 1 1 1 3 1 3 1 2 3 2 1 3 2
## X129 X130 X131 X132 X133 X134 X135 X136 X137 X138 X139 X140 X141 X142 X143 X144
## 1 3 1 1 1 2 2 1 1 1 1 1 1 1 1 2
## X145 X146 X147 X148 X149 X150 X151 X152 X153 X154 X155 X156 X157 X158 X159 X160
## 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1
## X161 X162 X163 X164
## 1 1 1 1
table(km_res$cluster)
##
## 1 2 3
## 76 58 30
plot(scores_df$PC1, scores_df$PC2,
col = km_res$cluster,
pch = 19,
main = "K-Means Clustering (k=3) - Basis 5 PC",
xlab = "PC1", ylab = "PC2")
Hasil visualisasi menunjukkan bahwa metode clustering mampu membagi data menjadi tiga kelompok utama, namun pemisahan antar cluster belum sepenuhnya jelas. Hal ini ditandai dengan masih adanya kedekatan dan sedikit tumpang tindih antar beberapa cluster. Dengan demikian, struktur cluster dalam data tergolong cukup terbentuk, tetapi belum optimal.
df_final$country_name <- df_countryname$country_name
df_final$cluster_kmeans <- km_res$cluster
head(df_final[, c("country_name", "cluster_kmeans")])
## country_name cluster_kmeans
## 1 Africa Eastern and Southern 1
## 2 Africa Western and Central 1
## 3 Arab World 1
## 4 Central Europe and the Baltics 2
## 5 Caribbean small states 2
## 6 East Asia & Pacific (excluding high income) 2
set.seed(123)
kmed_res <- kcca(scores_df, k = 3, family = kccaFamily("kmedians"))
## Found more than one class "kcca" in cache; using the first, from namespace 'flexclust'
## Also defined by 'kernlab'
## Found more than one class "kcca" in cache; using the first, from namespace 'flexclust'
## Also defined by 'kernlab'
clusters(kmed_res)
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16
## 1 1 1 3 2 2 2 2 3 3 2 3 2 1 2 2
## X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 X32
## 1 1 2 2 1 2 2 1 2 1 2 2 1 2 1 1
## X33 X34 X35 X36 X37 X38 X39 X40 X41 X42 X43 X44 X45 X46 X47 X48
## 1 2 2 3 2 1 1 1 2 3 2 3 3 3 3 3
## X49 X50 X51 X52 X53 X54 X55 X56 X57 X58 X59 X60 X61 X62 X63 X64
## 3 3 3 3 3 2 3 1 3 3 2 3 3 3 3 3
## X65 X66 X67 X68 X69 X70 X71 X72 X73 X74 X75 X76 X77 X78 X79 X80
## 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## X81 X82 X83 X84 X85 X86 X87 X88 X89 X90 X91 X92 X93 X94 X95 X96
## 3 1 2 3 3 3 3 3 3 3 3 3 3 3 3 2
## X97 X98 X99 X100 X101 X102 X103 X104 X105 X106 X107 X108 X109 X110 X111 X112
## 2 2 2 2 3 2 2 2 2 2 3 1 2 2 2 2
## X113 X114 X115 X116 X117 X118 X119 X120 X121 X122 X123 X124 X125 X126 X127 X128
## 3 3 1 1 1 1 3 1 3 1 3 3 3 1 3 3
## X129 X130 X131 X132 X133 X134 X135 X136 X137 X138 X139 X140 X141 X142 X143 X144
## 1 3 1 2 1 3 3 1 1 1 1 2 1 1 1 2
## X145 X146 X147 X148 X149 X150 X151 X152 X153 X154 X155 X156 X157 X158 X159 X160
## 1 1 1 1 1 1 3 1 1 1 1 2 1 1 1 2
## X161 X162 X163 X164
## 1 2 2 2
table(clusters(kmed_res))
##
## 1 2 3
## 50 46 68
scores_df$kmedian_cluster <- clusters(kmed_res)
head(scores_df)
## PC1 PC2 PC3 PC4 PC5 kmedian_cluster
## X1 3.0369013 0.2900002 -0.3636239 -0.55548198 0.75424216 1
## X2 2.8124569 -0.4581032 -0.3940214 -0.06093578 0.51834295 1
## X3 1.9472949 -1.4438478 -0.8494465 0.12567112 -1.01303755 1
## X4 -1.7563948 -0.1012544 1.2817542 1.46633153 0.80944274 3
## X5 0.1027531 2.4888334 1.2281909 -1.99816542 -0.46303153 2
## X6 -0.3541429 2.0459649 -2.1900278 2.67449014 0.01938291 2
plot(scores_PC[,1:2], col = clusters(kmed_res), pch = 19,
main = "K-Medians (Median-based)", xlab = "PC1", ylab = "PC2")
Metode K-Median menghasilkan pola pengelompokan yang serupa dengan K-Means. Meskipun metode ini lebih tahan terhadap pengaruh outlier, hasil yang diperoleh tetap menunjukkan adanya overlap antar cluster. Hal ini memperkuat indikasi bahwa data tidak memiliki pemisahan cluster yang kuat.
df_final$country_name <- df_countryname$country_name
df_final$cluster_kmedians <- clusters(kmed_res)
head(df_final[, c("country_name", "cluster_kmedians")])
## country_name cluster_kmedians
## 1 Africa Eastern and Southern 1
## 2 Africa Western and Central 1
## 3 Arab World 1
## 4 Central Europe and the Baltics 3
## 5 Caribbean small states 2
## 6 East Asia & Pacific (excluding high income) 2
set.seed(123)
db_res <- dbscan(scores_df, eps = 2.0, MinPts = 5)
table(db_res$cluster)
##
## 0 1 2
## 9 149 6
plot(scores_df$PC1, scores_df$PC2,
col = db_res$cluster + 1L,
pch = 19,
main = "DBSCAN Result (2 Clusters + Noise)",
xlab = "PC1", ylab = "PC2")
DBSCAN mengelompokkan data berdasarkan kepadatan. Hasil yang diperoleh
menunjukkan danya beberapa data yang dikategorikan sebagai noise(tidak
termasuk dalam cluster manapun). Keberadaan noise ini menunjukkan bahwa
terdapat data yang tidak memiliki kesamaan yang cukup dengan kelompok
lainnya, serta mengindikasikan bahwa struktur kepadatan data tidak
terlalu jelas.
df_final$country_name <- df_countryname$country_name
df_final$cluster_DBSCAN <- db_res$cluster
head(df_final[, c("country_name", "cluster_DBSCAN")])
## country_name cluster_DBSCAN
## 1 Africa Eastern and Southern 1
## 2 Africa Western and Central 1
## 3 Arab World 1
## 4 Central Europe and the Baltics 1
## 5 Caribbean small states 1
## 6 East Asia & Pacific (excluding high income) 2
set.seed(123)
ms_res <- meanShift(as.matrix(scores_df))
table(ms_res$assignment)
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
## 28 1 36 2 7 1 1 1 3 4 7 2 3 1 1 6 1 1 1 1 2 3 2 1 2 1
## 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52
## 2 1 8 1 1 1 1 3 1 1 1 2 1 1 2 1 2 1 1 1 2 1 1 2 1 1
## 53 54 55 56
## 1 1 1 1
my_colors <- rainbow(max(ms_res$assignment))
plot(scores_df$PC1, scores_df$PC2,
col = my_colors[ms_res$assignment],
pch = 19,
main = "Mean Shift Clustering Result",
xlab = "PC1 (Indikator Utama)",
ylab = "PC2 (Indikator Kedua)")
legend("topright", legend = unique(ms_res$assignment),
col = my_colors[unique(ms_res$assignment)], pch = 19, cex = 0.8)
Metode Mean Shift secara otomatis menentukan jumlah cluster berdasarkan
distribusi data. Hasil yang diperoleh menunjukkan bahwa cluster yang
terbentuk masih saling berdekatan dan tidak memiliki batas yang tegas.
Hal ini kembali menunjukkan bahwa distribusi data cenderung menyebar dan
tidak membentuk kelompok yang jelas.
df_final$country_name <- df_countryname$country_name
df_final$cluster_MeanShift <- ms_res$assignment
head(df_final[, c("country_name", "cluster_MeanShift")])
## country_name cluster_MeanShift
## 1 Africa Eastern and Southern 1
## 2 Africa Western and Central 1
## 3 Arab World 2
## 4 Central Europe and the Baltics 3
## 5 Caribbean small states 4
## 6 East Asia & Pacific (excluding high income) 5
fpc_values <- numeric(9)
k_range <- 2:10
for (k in k_range) {
tmp_fcm <- cmeans(scores_df, centers = k, m = 2)
fpc_values[k-1] <- tmp_fcm$withinerror
}
wcss <- numeric(9)
for (k in k_range) {
tmp_fcm <- cmeans(scores_df, centers = k, m = 2)
wcss[k-1] <- sum(tmp_fcm$withinerror)
}
# Plot Elbow Method
plot(k_range, wcss, type = "b", pch = 19, frame = FALSE,
main = "Elbow Method untuk FCM",
xlab = "Jumlah Cluster (k)",
ylab = "Total Within-Cluster Sum of Squares")
set.seed(123)
fcm_res <- cmeans(scores_df, centers = 3, m = 2)
table(fcm_res$cluster)
##
## 1 2 3
## 62 67 35
head(fcm_res$membership)
## 1 2 3
## X1 0.07756831 0.8880133 0.03441835
## X2 0.03090479 0.9556264 0.01346885
## X3 0.15725306 0.7624006 0.08034636
## X4 0.62417777 0.1058745 0.26994776
## X5 0.49475742 0.2866761 0.21856644
## X6 0.43474757 0.2908191 0.27443336
plot(scores_df$PC1, scores_df$PC2,
col = fcm_res$cluster,
pch = 19,
main = "Fuzzy C-Means Clustering (k=3)",
xlab = "PC1", ylab = "PC2")
# Pusat cluster (Centroids) a
points(fcm_res$centers[,1:2], col = 1:3, pch = 8, cex = 2, lwd = 3)
Pada metode Fuzzy C-Means, setiap data memiliki derajat keanggotaan pada lebih dari satu cluster. Hasil yang diperoleh menunjukkan bahwa banyak data memiliki keanggotaan pada beberapa cluster sekaligus.Hal ini menandakan bahwa batas antar cluster tidak jelas dan data memiliki karakteristik yang saling tumpang tindih.
df_final$country_name <- df_countryname$country_name
df_final$cluster_FCM <- fcm_res$cluster
head(df_final[, c("country_name", "cluster_FCM")])
## country_name cluster_FCM
## 1 Africa Eastern and Southern 2
## 2 Africa Western and Central 2
## 3 Arab World 2
## 4 Central Europe and the Baltics 1
## 5 Caribbean small states 1
## 6 East Asia & Pacific (excluding high income) 1
options(repr.plot.width = 10, repr.plot.height = 10)
par(mfrow = c(3, 3), mar = c(4, 4, 3, 1))
# K-Means
plot(scores_df$PC1, scores_df$PC2, col = km_res$cluster,
pch = 19, main = "K-Means (k=3)", xlab = "PC1", ylab = "PC2")
# K-Medians
plot(scores_df$PC1, scores_df$PC2, col = clusters(kmed_res),
pch = 19, main = "K-Medians (k=3)", xlab = "PC1", ylab = "PC2")
# DBSCAN
# +1L digunakan agar cluster 0 (Noise) tidak berwarna putih/transparan
plot(scores_df$PC1, scores_df$PC2, col = db_res$cluster + 1L,
pch = 19, main = "DBSCAN (Black = Noise)", xlab = "PC1", ylab = "PC2")
# Mean Shift
plot(scores_df$PC1, scores_df$PC2, col = ms_res$assignment,
pch = 19, main = "Mean Shift", xlab = "PC1", ylab = "PC2")
# Fuzzy C-Means
plot(scores_df$PC1, scores_df$PC2, col = fcm_res$cluster,
pch = 19, main = "Fuzzy C-Means (k=3)", xlab = "PC1", ylab = "PC2")
# Data Asli untuk melihat sebaran asli data sebelum dikelompokkan
plot(scores_df$PC1, scores_df$PC2, col = "grey",
pch = 19, main = "Original Data (No Cluster)", xlab = "PC1", ylab = "PC2")
par(mfrow = c(1, 1))
Berdasarkan seluruh metode clustering yang telah diterapkan, dapat disimpulkan bahwa dataset tidak memiliki struktur cluster yang kuat. Hal ini ditunjukkan oleh hasil visualisasi yang masih memperlihatkan overlap antar cluster. Dengan demikian, metode clustering berbasis jarak seperti K-Means dan K-Median kurang optimal untuk dataset ini, karena data yang digunakan memiliki karakteristik yang relatif homogen.
## EVALUASI
calc_dbi <- function(data, clusters) {
clusters <- as.numeric(clusters)
if(length(unique(clusters[clusters > 0])) < 2) return(NA)
res <- try(clusterSim::index.DB(data, clusters)$db, silent = TRUE)
if(inherits(res, "try-error") || is.null(res)) return(NA) else return(res)
}
# helper biar aman dari NULL / kosong
safe_val <- function(x) {
if(is.null(x) || length(x) == 0 || inherits(x, "try-error")) return(NA)
return(as.numeric(x))
}
evaluasi <- data.frame(
Metode = c("K-Means", "K-Medians", "DBSCAN", "Mean Shift", "Fuzzy C-Means"),
Silhouette_Score = NA,
DB_Index = NA,
Jumlah_Cluster = NA,
Jumlah_Noise = NA
)
# K-MEANS
sil_km <- try(silhouette(km_res$cluster, dist(scores_df)), silent = TRUE)
sil_km_val <- if(inherits(sil_km, "try-error")) NA else mean(sil_km[,3])
dbi_km_val <- safe_val(calc_dbi(scores_df, km_res$cluster))
evaluasi[1, 2:5] <- c(
safe_val(sil_km_val),
dbi_km_val,
length(unique(km_res$cluster)),
0
)
# K-MEDIANS
kmed_clusters <- clusters(kmed_res)
sil_kmed <- try(silhouette(kmed_clusters, dist(scores_df)), silent = TRUE)
sil_kmed_val <- if(inherits(sil_kmed, "try-error")) NA else mean(sil_kmed[,3])
dbi_kmed_val <- safe_val(calc_dbi(scores_df, kmed_clusters))
evaluasi[2, 2:5] <- c(
safe_val(sil_kmed_val),
dbi_kmed_val,
length(unique(kmed_clusters)),
0
)
# DBSCAN
db_clusters <- db_res$cluster
sil_db_val <- NA
if(length(unique(db_clusters[db_clusters > 0])) > 1) {
sil_db <- try(
silhouette(db_clusters[db_clusters > 0],
dist(scores_df[db_clusters > 0, ])),
silent = TRUE
)
if(!inherits(sil_db, "try-error")) {
sil_db_val <- mean(sil_db[,3])
}
}
dbi_db_val <- safe_val(calc_dbi(scores_df, db_clusters))
evaluasi[3, 2:5] <- c(
safe_val(sil_db_val),
dbi_db_val,
length(unique(db_clusters[db_clusters > 0])),
sum(db_clusters == 0)
)
# MEAN SHIFT
ms_clusters <- ms_res$assignment
sil_ms <- try(silhouette(ms_clusters, dist(scores_df)), silent = TRUE)
sil_ms_val <- if(inherits(sil_ms, "try-error")) NA else mean(sil_ms[,3])
dbi_ms_val <- safe_val(calc_dbi(scores_df, ms_clusters))
evaluasi[4, 2:5] <- c(
safe_val(sil_ms_val),
dbi_ms_val,
length(unique(ms_clusters)),
0
)
# FUZZY C-MEANS
fcm_clusters <- fcm_res$cluster
sil_fcm <- try(silhouette(fcm_clusters, dist(scores_df)), silent = TRUE)
sil_fcm_val <- if(inherits(sil_fcm, "try-error")) NA else mean(sil_fcm[,3])
dbi_fcm_val <- safe_val(calc_dbi(scores_df, fcm_clusters))
evaluasi[5, 2:5] <- c(
safe_val(sil_fcm_val),
dbi_fcm_val,
length(unique(fcm_clusters)),
0
)
# HASIL
print(evaluasi)
## Metode Silhouette_Score DB_Index Jumlah_Cluster Jumlah_Noise
## 1 K-Means 0.304143819 NA 3 0
## 2 K-Medians 0.260370885 NA 3 0
## 3 DBSCAN 0.201105359 NA 2 9
## 4 Mean Shift -0.001105504 NA 56 0
## 5 Fuzzy C-Means 0.289359770 NA 3 0
profil_cluster <- aggregate(df_final[, 1:16],
by = list(Cluster = km_res$cluster),
FUN = mean)
profil_cluster
## Cluster gdp_usd population gdp_per_capita inflation_rate
## 1 1 -0.10009635 0.16097739 -0.66613373 0.4636676
## 2 2 0.01517265 -0.08495999 -0.09699187 -0.2188290
## 3 3 0.22424363 -0.24355341 1.87505640 -0.7515553
## unemployment_rate fdi_pct_gdp forest_area_pct child_mortality
## 1 0.21202541 -0.1103036 -0.2782146 0.7748809
## 2 0.01648077 0.1998048 0.4175677 -0.5894810
## 3 -0.56899387 -0.1068535 -0.1024871 -0.8233684
## mobile_subscriptions_per_100 calculated_gdp_per_capita
## 1 -0.6823330 -0.6647325
## 2 0.6061840 -0.1019379
## 3 0.5566211 1.8810688
## real_economic_growth_indicator econ_opportunity_index
## 1 -0.1617530 -0.6442123
## 2 -0.1972637 -0.1366234
## 3 0.7911508 1.8961430
## digital_connectivity_index climate_vulnerability_index
## 1 -0.8079934 0.2782146
## 2 0.7904180 -0.4175677
## 3 0.5187753 0.1024871
## digital_readiness_score global_development_resilience_index
## 1 -0.8643400 -0.7919436
## 2 0.7995215 0.3450369
## 3 0.6439196 1.3391856