LOAD LIBRARY
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.3
## Warning: package 'ggplot2' was built under R version 4.5.3
## Warning: package 'forcats' was built under R version 4.5.3
## Warning: package 'lubridate' was built under R version 4.5.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.0 ✔ readr 2.2.0
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.2 ✔ tibble 3.3.1
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(cluster)
## Warning: package 'cluster' was built under R version 4.5.3
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.5.3
## Welcome to factoextra!
## Want to learn more? See two factoextra-related books at https://www.datanovia.com/en/product/practical-guide-to-principal-component-methods-in-r/
library(dbscan)
## Warning: package 'dbscan' was built under R version 4.5.3
##
## Attaching package: 'dbscan'
##
## The following object is masked from 'package:stats':
##
## as.dendrogram
library(fpc)
## Warning: package 'fpc' was built under R version 4.5.3
##
## Attaching package: 'fpc'
##
## The following object is masked from 'package:dbscan':
##
## dbscan
library(GGally)
## Warning: package 'GGally' was built under R version 4.5.3
library(lubridate)
library(e1071)
## Warning: package 'e1071' was built under R version 4.5.3
##
## Attaching package: 'e1071'
##
## The following object is masked from 'package:ggplot2':
##
## element
library(clusterCrit)
## Warning: package 'clusterCrit' was built under R version 4.5.3
LOAD DATA
data <- read.csv("global_climate_energy_2020_2024.csv", stringsAsFactors = FALSE)
head(data)
## date country avg_temperature humidity co2_emission energy_consumption
## 1 2020-01-01 Germany 28.29 31.08 212.63 11348.75
## 2 2020-01-02 Germany 28.38 37.94 606.05 4166.64
## 3 2020-01-03 Germany 28.74 57.67 268.72 4503.80
## 4 2020-01-04 Germany 26.66 51.34 167.32 3259.13
## 5 2020-01-05 Germany 26.81 65.38 393.89 7023.72
## 6 2020-01-06 Germany 27.81 32.92 644.48 11955.99
## renewable_share urban_population industrial_activity_index energy_price
## 1 14.42 76.39 51.22 83.93
## 2 5.63 86.26 78.27 110.40
## 3 14.20 75.92 48.96 173.58
## 4 13.84 63.15 97.42 89.13
## 5 6.93 76.02 81.89 40.60
## 6 7.08 78.71 78.70 128.62
str(data)
## 'data.frame': 36540 obs. of 10 variables:
## $ date : chr "2020-01-01" "2020-01-02" "2020-01-03" "2020-01-04" ...
## $ country : chr "Germany" "Germany" "Germany" "Germany" ...
## $ avg_temperature : num 28.3 28.4 28.7 26.7 26.8 ...
## $ humidity : num 31.1 37.9 57.7 51.3 65.4 ...
## $ co2_emission : num 213 606 269 167 394 ...
## $ energy_consumption : num 11349 4167 4504 3259 7024 ...
## $ renewable_share : num 14.42 5.63 14.2 13.84 6.93 ...
## $ urban_population : num 76.4 86.3 75.9 63.1 76 ...
## $ industrial_activity_index: num 51.2 78.3 49 97.4 81.9 ...
## $ energy_price : num 83.9 110.4 173.6 89.1 40.6 ...
FILTER DATA TAHUN 2022
data_2022 <- data %>%
mutate(year = year(as.Date(date))) %>%
filter(year == 2022)
unique(data_2022$year)
## [1] 2022
nrow(data_2022)
## [1] 7300
PREPROCESSING
data_num <- data_2022 %>% select(where(is.numeric))
data_num <- na.omit(data_num)
data_num <- data_num[, sapply(data_num, var) != 0]
data_scaled <- scale(data_num)
any(is.na(data_scaled))
## [1] FALSE
any(is.nan(data_scaled))
## [1] FALSE
any(is.infinite(data_scaled))
## [1] FALSE
MENENTUKAN JUMLAH CLUSTER (K)
fviz_nbclust(data_scaled, kmeans, method = "wss") +
ggtitle("Elbow Method")
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 365000)
## Warning: did not converge in 10 iterations

fviz_nbclust(data_scaled, kmeans, method = "silhouette") +
ggtitle("Silhouette Method")
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations

k <- 3
CLUSTERING
K-MEANS
kmeans_res <- kmeans(data_scaled, centers = k, nstart = 25)
cluster_kmeans <- kmeans_res$cluster
DBSCAN
kNNdistplot(data_scaled, k = 5)
abline(h = 2, col = "red")

minPts <- ncol(data_scaled) + 1
dbscan_res <- dbscan::dbscan(data_scaled, eps = 2, minPts = minPts)
cluster_dbscan <- dbscan_res$cluster
MEAN SHIFT
bandwidth <- apply(data_scaled, 2, sd)
ms_res <- meanShiftR::meanShift(data_scaled, bandwidth = bandwidth)
cluster_ms <- ms_res$assignment
FUZZY C-MEANS
fcm_res <- cmeans(data_scaled, centers = k, m = 2)
cluster_fcm <- apply(fcm_res$membership, 1, which.max)
FUNGSI EVALUASI
evaluate_cluster <- function(data, cluster){
n_cluster <- length(unique(cluster))
if(n_cluster < 2){
return(c(Silhouette = NA, Dunn = NA, CH = NA))
}
sil <- tryCatch(silhouette(cluster, dist(data)), error = function(e) NULL)
if(is.null(sil)){
sil_score <- NA
} else {
sil_score <- mean(sil[,3])
}
dunn <- intCriteria(as.matrix(data), as.integer(cluster), "Dunn")$dunn
ch <- intCriteria(as.matrix(data), as.integer(cluster),
"Calinski_Harabasz")$calinski_harabasz
return(c(Silhouette = sil_score, Dunn = dunn, CH = ch))
}
EVALUASI SEMUA METODE
eval_kmeans <- evaluate_cluster(data_scaled, cluster_kmeans)
eval_kmedian <- evaluate_cluster(data_scaled, cluster_kmedian)
valid_idx <- cluster_dbscan != 0
if(length(unique(cluster_dbscan[valid_idx])) > 1){
eval_dbscan <- evaluate_cluster(data_scaled[valid_idx,], cluster_dbscan[valid_idx])
} else {
eval_dbscan <- c(Silhouette = NA, Dunn = NA, CH = NA)
}
eval_ms <- evaluate_cluster(data_scaled, cluster_ms)
eval_fcm <- evaluate_cluster(data_scaled, cluster_fcm)
RINGKAS HASIL
results <- rbind(
KMeans = eval_kmeans,
KMedian = eval_kmedian,
DBSCAN = eval_dbscan,
MeanShift = eval_ms,
FCM = eval_fcm
)
round(results, 3)
## Silhouette Dunn CH
## KMeans 0.089 0.068 717.317
## KMedian 0.069 0.041 543.767
## DBSCAN NA NA NA
## MeanShift -0.017 0.104 8.008
## FCM 0.076 0.042 605.449
PILIH METODE TERBAIK
best_method <- rownames(results)[which.max(ifelse(is.na(results[, "Silhouette"]), -Inf, results[, "Silhouette"]))
]
print(paste("Metode terbaik:", best_method))
## [1] "Metode terbaik: KMeans"
EDA
if(best_method == "KMeans"){
best_cluster <- cluster_kmeans
} else if(best_method == "KMedian"){
best_cluster <- cluster_kmedian
} else if(best_method == "DBSCAN"){
best_cluster <- cluster_dbscan
} else if(best_method == "MeanShift"){
best_cluster <- cluster_ms
} else if(best_method == "FCM"){
best_cluster <- cluster_fcm
}
data_eda <- data_num
data_eda$Cluster <- best_cluster
cluster_summary <- data_eda %>%
group_by(Cluster) %>%
summarise(across(everything(), mean))
cluster_summary
## # A tibble: 3 × 9
## Cluster avg_temperature humidity co2_emission energy_consumption
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 1 13.7 58.6 368. 5570.
## 2 2 13.5 60.2 630. 10980.
## 3 3 13.5 61.1 361. 5677.
## # ℹ 4 more variables: renewable_share <dbl>, urban_population <dbl>,
## # industrial_activity_index <dbl>, energy_price <dbl>
Visualisasi Cluster
fviz_cluster(list(data = data_scaled, cluster = best_cluster),
geom = "point",
ellipse.type = "convex")

Boxplot
data_eda %>%
pivot_longer(-Cluster) %>%
ggplot(aes(x = factor(Cluster), y = value, fill = factor(Cluster))) +
geom_boxplot() +
facet_wrap(~name, scales = "free") +
scale_fill_brewer(palette = "Set2") +
labs(title = "Distribusi Variabel pada Setiap Cluster",
x = "Cluster",
y = "Nilai") +
theme_minimal()

Distribusi Cluster
table(data_eda$Cluster)
##
## 1 2 3
## 2429 2253 2618
prop.table(table(data_eda$Cluster))
##
## 1 2 3
## 0.3327397 0.3086301 0.3586301
Interpretasi Cluster
cat("Interpretasi Cluster:\n")
## Interpretasi Cluster:
for(i in unique(data_eda$Cluster)){
cat("\nCluster", i, ":\n")
cluster_mean <- colMeans(data_eda[data_eda$Cluster == i, -ncol(data_eda)])
print(round(cluster_mean, 3))
}
##
## Cluster 1 :
## avg_temperature humidity co2_emission
## 13.652 58.554 368.124
## energy_consumption renewable_share urban_population
## 5570.233 15.833 75.550
## industrial_activity_index energy_price
## 69.344 69.267
##
## Cluster 2 :
## avg_temperature humidity co2_emission
## 13.546 60.216 629.802
## energy_consumption renewable_share urban_population
## 10980.213 15.910 75.584
## industrial_activity_index energy_price
## 70.124 113.579
##
## Cluster 3 :
## avg_temperature humidity co2_emission
## 13.528 61.139 360.938
## energy_consumption renewable_share urban_population
## 5677.150 16.018 74.132
## industrial_activity_index energy_price
## 70.745 159.381
PCA Interpretatif
pca_res <- prcomp(data_scaled)
fviz_pca_ind(pca_res,
geom.ind = "point",
col.ind = factor(best_cluster),
palette = "jco",
addEllipses = TRUE) +
ggtitle(paste("PCA Visualization -", best_method))

Pair Plot (Insight Antar Variabel)
ggpairs(data_eda, aes(color = factor(Cluster))) +
ggtitle("Pair Plot Antar Variabel Berdasarkan Cluster") +
theme_minimal()
## Warning: There were 3 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `text = text_fn(.data$x, .data$y)`.
## ℹ In group 1: `color = 1`.
## Caused by warning in `cor()`:
## ! the standard deviation is zero
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 2 remaining warnings.
## There were 3 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `text = text_fn(.data$x, .data$y)`.
## ℹ In group 1: `color = 1`.
## Caused by warning in `cor()`:
## ! the standard deviation is zero
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 2 remaining warnings.
## There were 3 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `text = text_fn(.data$x, .data$y)`.
## ℹ In group 1: `color = 1`.
## Caused by warning in `cor()`:
## ! the standard deviation is zero
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 2 remaining warnings.
## There were 3 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `text = text_fn(.data$x, .data$y)`.
## ℹ In group 1: `color = 1`.
## Caused by warning in `cor()`:
## ! the standard deviation is zero
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 2 remaining warnings.
## There were 3 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `text = text_fn(.data$x, .data$y)`.
## ℹ In group 1: `color = 1`.
## Caused by warning in `cor()`:
## ! the standard deviation is zero
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 2 remaining warnings.
## There were 3 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `text = text_fn(.data$x, .data$y)`.
## ℹ In group 1: `color = 1`.
## Caused by warning in `cor()`:
## ! the standard deviation is zero
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 2 remaining warnings.
## There were 3 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `text = text_fn(.data$x, .data$y)`.
## ℹ In group 1: `color = 1`.
## Caused by warning in `cor()`:
## ! the standard deviation is zero
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 2 remaining warnings.
## There were 3 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `text = text_fn(.data$x, .data$y)`.
## ℹ In group 1: `color = 1`.
## Caused by warning in `cor()`:
## ! the standard deviation is zero
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 2 remaining warnings.
