LOAD LIBRARY

library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.3
## Warning: package 'ggplot2' was built under R version 4.5.3
## Warning: package 'forcats' was built under R version 4.5.3
## Warning: package 'lubridate' was built under R version 4.5.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.0     ✔ readr     2.2.0
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.2     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(cluster)
## Warning: package 'cluster' was built under R version 4.5.3
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.5.3
## Welcome to factoextra!
## Want to learn more? See two factoextra-related books at https://www.datanovia.com/en/product/practical-guide-to-principal-component-methods-in-r/
library(dbscan)
## Warning: package 'dbscan' was built under R version 4.5.3
## 
## Attaching package: 'dbscan'
## 
## The following object is masked from 'package:stats':
## 
##     as.dendrogram
library(fpc)
## Warning: package 'fpc' was built under R version 4.5.3
## 
## Attaching package: 'fpc'
## 
## The following object is masked from 'package:dbscan':
## 
##     dbscan
library(GGally)
## Warning: package 'GGally' was built under R version 4.5.3
library(lubridate)
library(e1071)
## Warning: package 'e1071' was built under R version 4.5.3
## 
## Attaching package: 'e1071'
## 
## The following object is masked from 'package:ggplot2':
## 
##     element
library(clusterCrit)
## Warning: package 'clusterCrit' was built under R version 4.5.3

LOAD DATA

data <- read.csv("global_climate_energy_2020_2024.csv", stringsAsFactors = FALSE)

head(data)
##         date country avg_temperature humidity co2_emission energy_consumption
## 1 2020-01-01 Germany           28.29    31.08       212.63           11348.75
## 2 2020-01-02 Germany           28.38    37.94       606.05            4166.64
## 3 2020-01-03 Germany           28.74    57.67       268.72            4503.80
## 4 2020-01-04 Germany           26.66    51.34       167.32            3259.13
## 5 2020-01-05 Germany           26.81    65.38       393.89            7023.72
## 6 2020-01-06 Germany           27.81    32.92       644.48           11955.99
##   renewable_share urban_population industrial_activity_index energy_price
## 1           14.42            76.39                     51.22        83.93
## 2            5.63            86.26                     78.27       110.40
## 3           14.20            75.92                     48.96       173.58
## 4           13.84            63.15                     97.42        89.13
## 5            6.93            76.02                     81.89        40.60
## 6            7.08            78.71                     78.70       128.62
str(data)
## 'data.frame':    36540 obs. of  10 variables:
##  $ date                     : chr  "2020-01-01" "2020-01-02" "2020-01-03" "2020-01-04" ...
##  $ country                  : chr  "Germany" "Germany" "Germany" "Germany" ...
##  $ avg_temperature          : num  28.3 28.4 28.7 26.7 26.8 ...
##  $ humidity                 : num  31.1 37.9 57.7 51.3 65.4 ...
##  $ co2_emission             : num  213 606 269 167 394 ...
##  $ energy_consumption       : num  11349 4167 4504 3259 7024 ...
##  $ renewable_share          : num  14.42 5.63 14.2 13.84 6.93 ...
##  $ urban_population         : num  76.4 86.3 75.9 63.1 76 ...
##  $ industrial_activity_index: num  51.2 78.3 49 97.4 81.9 ...
##  $ energy_price             : num  83.9 110.4 173.6 89.1 40.6 ...

FILTER DATA TAHUN 2022

data_2022 <- data %>% 
  mutate(year = year(as.Date(date))) %>% 
  filter(year == 2022)

unique(data_2022$year)
## [1] 2022
nrow(data_2022)
## [1] 7300

PREPROCESSING

data_num <- data_2022 %>% select(where(is.numeric))

data_num <- na.omit(data_num)

data_num <- data_num[, sapply(data_num, var) != 0]

data_scaled <- scale(data_num)

any(is.na(data_scaled))
## [1] FALSE
any(is.nan(data_scaled))
## [1] FALSE
any(is.infinite(data_scaled))
## [1] FALSE

MENENTUKAN JUMLAH CLUSTER (K)

fviz_nbclust(data_scaled, kmeans, method = "wss") +
  ggtitle("Elbow Method")
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 365000)
## Warning: did not converge in 10 iterations

fviz_nbclust(data_scaled, kmeans, method = "silhouette") +
  ggtitle("Silhouette Method")
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations

k <- 3

CLUSTERING

K-MEANS
kmeans_res <- kmeans(data_scaled, centers = k, nstart = 25)

cluster_kmeans <- kmeans_res$cluster
K-MEDIAN
kmedian_res <- pam(data_scaled, k)

cluster_kmedian <- kmedian_res$clustering
DBSCAN
kNNdistplot(data_scaled, k = 5)
abline(h = 2, col = "red")

minPts <- ncol(data_scaled) + 1

dbscan_res <- dbscan::dbscan(data_scaled, eps = 2, minPts = minPts)
cluster_dbscan <- dbscan_res$cluster
MEAN SHIFT
bandwidth <- apply(data_scaled, 2, sd)

ms_res <- meanShiftR::meanShift(data_scaled, bandwidth = bandwidth)
cluster_ms <- ms_res$assignment
FUZZY C-MEANS
fcm_res <- cmeans(data_scaled, centers = k, m = 2)
cluster_fcm <- apply(fcm_res$membership, 1, which.max)

FUNGSI EVALUASI

evaluate_cluster <- function(data, cluster){
  
  n_cluster <- length(unique(cluster))
  
  if(n_cluster < 2){
    return(c(Silhouette = NA, Dunn = NA, CH = NA))
  }

  sil <- tryCatch(silhouette(cluster, dist(data)), error = function(e) NULL)
  
  if(is.null(sil)){
    sil_score <- NA
  } else {
    sil_score <- mean(sil[,3])
  }
  
  dunn <- intCriteria(as.matrix(data), as.integer(cluster), "Dunn")$dunn
  
  ch <- intCriteria(as.matrix(data), as.integer(cluster), 
                    "Calinski_Harabasz")$calinski_harabasz
  
  return(c(Silhouette = sil_score, Dunn = dunn, CH = ch))
}

EVALUASI SEMUA METODE

eval_kmeans <- evaluate_cluster(data_scaled, cluster_kmeans)

eval_kmedian <- evaluate_cluster(data_scaled, cluster_kmedian)

valid_idx <- cluster_dbscan != 0

if(length(unique(cluster_dbscan[valid_idx])) > 1){
  eval_dbscan <- evaluate_cluster(data_scaled[valid_idx,], cluster_dbscan[valid_idx])
} else {
  eval_dbscan <- c(Silhouette = NA, Dunn = NA, CH = NA)
}

eval_ms <- evaluate_cluster(data_scaled, cluster_ms)

eval_fcm <- evaluate_cluster(data_scaled, cluster_fcm)

RINGKAS HASIL

results <- rbind(
  KMeans = eval_kmeans,
  KMedian = eval_kmedian,
  DBSCAN = eval_dbscan,
  MeanShift = eval_ms,
  FCM = eval_fcm
)

round(results, 3)
##           Silhouette  Dunn      CH
## KMeans         0.089 0.068 717.317
## KMedian        0.069 0.041 543.767
## DBSCAN            NA    NA      NA
## MeanShift     -0.017 0.104   8.008
## FCM            0.076 0.042 605.449

PILIH METODE TERBAIK

best_method <- rownames(results)[which.max(ifelse(is.na(results[, "Silhouette"]), -Inf, results[, "Silhouette"]))
]

print(paste("Metode terbaik:", best_method))
## [1] "Metode terbaik: KMeans"

EDA

if(best_method == "KMeans"){
  best_cluster <- cluster_kmeans
} else if(best_method == "KMedian"){
  best_cluster <- cluster_kmedian
} else if(best_method == "DBSCAN"){
  best_cluster <- cluster_dbscan
} else if(best_method == "MeanShift"){
  best_cluster <- cluster_ms
} else if(best_method == "FCM"){
  best_cluster <- cluster_fcm
}

data_eda <- data_num
data_eda$Cluster <- best_cluster

cluster_summary <- data_eda %>%
  group_by(Cluster) %>%
  summarise(across(everything(), mean))

cluster_summary
## # A tibble: 3 × 9
##   Cluster avg_temperature humidity co2_emission energy_consumption
##     <int>           <dbl>    <dbl>        <dbl>              <dbl>
## 1       1            13.7     58.6         368.              5570.
## 2       2            13.5     60.2         630.             10980.
## 3       3            13.5     61.1         361.              5677.
## # ℹ 4 more variables: renewable_share <dbl>, urban_population <dbl>,
## #   industrial_activity_index <dbl>, energy_price <dbl>
Visualisasi Cluster
fviz_cluster(list(data = data_scaled, cluster = best_cluster),
             geom = "point",
             ellipse.type = "convex")

Boxplot
data_eda %>%
  pivot_longer(-Cluster) %>%
  ggplot(aes(x = factor(Cluster), y = value, fill = factor(Cluster))) +
  geom_boxplot() +
  facet_wrap(~name, scales = "free") +
  scale_fill_brewer(palette = "Set2") +
  labs(title = "Distribusi Variabel pada Setiap Cluster",
       x = "Cluster",
       y = "Nilai") +
  theme_minimal()

Distribusi Cluster
table(data_eda$Cluster)
## 
##    1    2    3 
## 2429 2253 2618
prop.table(table(data_eda$Cluster))
## 
##         1         2         3 
## 0.3327397 0.3086301 0.3586301
Interpretasi Cluster
cat("Interpretasi Cluster:\n")
## Interpretasi Cluster:
for(i in unique(data_eda$Cluster)){
  cat("\nCluster", i, ":\n")
  
  cluster_mean <- colMeans(data_eda[data_eda$Cluster == i, -ncol(data_eda)])
  
  print(round(cluster_mean, 3))
}
## 
## Cluster 1 :
##           avg_temperature                  humidity              co2_emission 
##                    13.652                    58.554                   368.124 
##        energy_consumption           renewable_share          urban_population 
##                  5570.233                    15.833                    75.550 
## industrial_activity_index              energy_price 
##                    69.344                    69.267 
## 
## Cluster 2 :
##           avg_temperature                  humidity              co2_emission 
##                    13.546                    60.216                   629.802 
##        energy_consumption           renewable_share          urban_population 
##                 10980.213                    15.910                    75.584 
## industrial_activity_index              energy_price 
##                    70.124                   113.579 
## 
## Cluster 3 :
##           avg_temperature                  humidity              co2_emission 
##                    13.528                    61.139                   360.938 
##        energy_consumption           renewable_share          urban_population 
##                  5677.150                    16.018                    74.132 
## industrial_activity_index              energy_price 
##                    70.745                   159.381
PCA Interpretatif
pca_res <- prcomp(data_scaled)

fviz_pca_ind(pca_res,
             geom.ind = "point",
             col.ind = factor(best_cluster),
             palette = "jco",
             addEllipses = TRUE) +
  ggtitle(paste("PCA Visualization -", best_method))

Pair Plot (Insight Antar Variabel)
ggpairs(data_eda, aes(color = factor(Cluster))) +
  ggtitle("Pair Plot Antar Variabel Berdasarkan Cluster") +
  theme_minimal()
## Warning: There were 3 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `text = text_fn(.data$x, .data$y)`.
## ℹ In group 1: `color = 1`.
## Caused by warning in `cor()`:
## ! the standard deviation is zero
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 2 remaining warnings.
## There were 3 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `text = text_fn(.data$x, .data$y)`.
## ℹ In group 1: `color = 1`.
## Caused by warning in `cor()`:
## ! the standard deviation is zero
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 2 remaining warnings.
## There were 3 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `text = text_fn(.data$x, .data$y)`.
## ℹ In group 1: `color = 1`.
## Caused by warning in `cor()`:
## ! the standard deviation is zero
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 2 remaining warnings.
## There were 3 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `text = text_fn(.data$x, .data$y)`.
## ℹ In group 1: `color = 1`.
## Caused by warning in `cor()`:
## ! the standard deviation is zero
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 2 remaining warnings.
## There were 3 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `text = text_fn(.data$x, .data$y)`.
## ℹ In group 1: `color = 1`.
## Caused by warning in `cor()`:
## ! the standard deviation is zero
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 2 remaining warnings.
## There were 3 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `text = text_fn(.data$x, .data$y)`.
## ℹ In group 1: `color = 1`.
## Caused by warning in `cor()`:
## ! the standard deviation is zero
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 2 remaining warnings.
## There were 3 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `text = text_fn(.data$x, .data$y)`.
## ℹ In group 1: `color = 1`.
## Caused by warning in `cor()`:
## ! the standard deviation is zero
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 2 remaining warnings.
## There were 3 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `text = text_fn(.data$x, .data$y)`.
## ℹ In group 1: `color = 1`.
## Caused by warning in `cor()`:
## ! the standard deviation is zero
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 2 remaining warnings.