library(readr)
## Warning: package 'readr' was built under R version 4.5.3
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.3
## Warning: package 'forcats' was built under R version 4.5.3
## Warning: package 'lubridate' was built under R version 4.5.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ purrr 1.2.0
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.1 ✔ tibble 3.3.0
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(flexclust)
## Warning: package 'flexclust' was built under R version 4.5.3
library(dbscan)
## Warning: package 'dbscan' was built under R version 4.5.3
##
## Attaching package: 'dbscan'
##
## The following object is masked from 'package:stats':
##
## as.dendrogram
library(meanShiftR)
library(e1071)
## Warning: package 'e1071' was built under R version 4.5.3
##
## Attaching package: 'e1071'
##
## The following object is masked from 'package:flexclust':
##
## bclust
##
## The following object is masked from 'package:ggplot2':
##
## element
library(cluster)
## Warning: package 'cluster' was built under R version 4.5.3
library(fpc)
## Warning: package 'fpc' was built under R version 4.5.3
##
## Attaching package: 'fpc'
##
## The following object is masked from 'package:dbscan':
##
## dbscan
library(mclust)
## Warning: package 'mclust' was built under R version 4.5.3
## Package 'mclust' version 6.1.2
## Type 'citation("mclust")' for citing this R package in publications.
##
## Attaching package: 'mclust'
##
## The following object is masked from 'package:dplyr':
##
## count
##
## The following object is masked from 'package:purrr':
##
## map
df <- read_csv("SeoulBikeData.csv", locale = locale(encoding = "latin1"))
## Rows: 8760 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Date, Seasons, Holiday, Functioning Day
## dbl (10): Rented Bike Count, Hour, Temperature(°C), Humidity(%), Wind speed ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(df)
## # A tibble: 6 × 14
## Date `Rented Bike Count` Hour `Temperature(°C)` `Humidity(%)`
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 01/12/2017 254 0 -5.2 37
## 2 01/12/2017 204 1 -5.5 38
## 3 01/12/2017 173 2 -6 39
## 4 01/12/2017 107 3 -6.2 40
## 5 01/12/2017 78 4 -6 36
## 6 01/12/2017 100 5 -6.4 37
## # ℹ 9 more variables: `Wind speed (m/s)` <dbl>, `Visibility (10m)` <dbl>,
## # `Dew point temperature(°C)` <dbl>, `Solar Radiation (MJ/m2)` <dbl>,
## # `Rainfall(mm)` <dbl>, `Snowfall (cm)` <dbl>, Seasons <chr>, Holiday <chr>,
## # `Functioning Day` <chr>
Dipilih 10 fitur numerik utama (kolom 2 sampai 11) yang bersifat kontinu, yaitu mulai dari Rented Bike Count hingga Snowfall. Pemilihan ini dilakukan karena algoritma clustering berbasis jarak seperti K-Means dan K-Median bekerja lebih baik pada data yang memiliki nilai yang bertahap dan tidak diskrit. Fitur-fitur tersebut mewakili kondisi lingkungan seperti suhu, kelembaban, dan radiasi matahari, serta kondisi cuaca seperti hujan dan salju. Faktor-faktor ini secara logis memengaruhi keputusan seseorang dalam menyewa sepeda. Sementara itu, data kategorikal dan biner seperti Seasons, Holiday dan Functioning Day tidak digunakan agar tidak memengaruhi perhitungan jarak. Dengan demikian, hasil pengelompokan yang terbentuk benar-benar didasarkan pada pola aktivitas dan kondisi lingkungan yang dapat diukur.
df_selected <- df[, c("Rented Bike Count", "Hour", "Temperature(°C)", "Humidity(%)",
"Wind speed (m/s)", "Visibility (10m)", "Dew point temperature(°C)",
"Solar Radiation (MJ/m2)", "Rainfall(mm)", "Snowfall (cm)")]
str(df_selected)
## tibble [8,760 × 10] (S3: tbl_df/tbl/data.frame)
## $ Rented Bike Count : num [1:8760] 254 204 173 107 78 100 181 460 930 490 ...
## $ Hour : num [1:8760] 0 1 2 3 4 5 6 7 8 9 ...
## $ Temperature(°C) : num [1:8760] -5.2 -5.5 -6 -6.2 -6 -6.4 -6.6 -7.4 -7.6 -6.5 ...
## $ Humidity(%) : num [1:8760] 37 38 39 40 36 37 35 38 37 27 ...
## $ Wind speed (m/s) : num [1:8760] 2.2 0.8 1 0.9 2.3 1.5 1.3 0.9 1.1 0.5 ...
## $ Visibility (10m) : num [1:8760] 2000 2000 2000 2000 2000 ...
## $ Dew point temperature(°C): num [1:8760] -17.6 -17.6 -17.7 -17.6 -18.6 -18.7 -19.5 -19.3 -19.8 -22.4 ...
## $ Solar Radiation (MJ/m2) : num [1:8760] 0 0 0 0 0 0 0 0 0.01 0.23 ...
## $ Rainfall(mm) : num [1:8760] 0 0 0 0 0 0 0 0 0 0 ...
## $ Snowfall (cm) : num [1:8760] 0 0 0 0 0 0 0 0 0 0 ...
sum(is.na(df_selected))
## [1] 0
# Mengubah data menjadi skala yang sama (Mean = 0, SD = 1)
df_scaled <- scale(df_selected)
df_scaled <- as.data.frame(df_scaled)
head(df_scaled)
## Rented Bike Count Hour Temperature(°C) Humidity(%) Wind speed (m/s)
## 1 -0.6986106 -1.6612299 -1.513871 -1.0424234 0.4584496
## 2 -0.7761303 -1.5167752 -1.538986 -0.9933133 -0.8925105
## 3 -0.8241925 -1.3723204 -1.580845 -0.9442032 -0.6995162
## 4 -0.9265185 -1.2278656 -1.597589 -0.8950931 -0.7960134
## 5 -0.9714799 -1.0834108 -1.580845 -1.0915335 0.5549468
## 6 -0.9373712 -0.9389561 -1.614333 -1.0424234 -0.2170305
## Visibility (10m) Dew point temperature(°C) Solar Radiation (MJ/m2)
## 1 0.9258185 -1.659510 -0.6550943
## 2 0.9258185 -1.659510 -0.6550943
## 3 0.9258185 -1.667167 -0.6550943
## 4 0.9258185 -1.659510 -0.6550943
## 5 0.9258185 -1.736077 -0.6550943
## 6 0.9258185 -1.743734 -0.6550943
## Rainfall(mm) Snowfall (cm)
## 1 -0.1317924 -0.1718813
## 2 -0.1317924 -0.1718813
## 3 -0.1317924 -0.1718813
## 4 -0.1317924 -0.1718813
## 5 -0.1317924 -0.1718813
## 6 -0.1317924 -0.1718813
set.seed(123)
wss <- sapply(1:10, function(k){
kmeans(df_scaled, centers = k, nstart = 20)$tot.withinss
})
## Warning: did not converge in 10 iterations
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 438000)
par(mfrow = c(1, 1))
plot(1:10, wss, type = "b", pch = 19, frame = FALSE,
xlab = "Number of clusters K",
ylab = "Total within-clusters sum of squares",
main = "Elbow Method (Scaled Data)")
# Silhouette Analysis(dengan sample) Mengambil 2000 baris acak agar
perhitungan jarak tidak membebani RAM
set.seed(123)
df_sample <- df_scaled[sample(1:nrow(df_scaled), 2000), ]
avg_sil <- function(k) {
km_res <- kmeans(df_sample, centers = k, nstart = 25)
ss <- silhouette(km_res$cluster, dist(df_sample))
mean(ss[, 3])
}
k_values <- 2:10
avg_sil_values <- sapply(k_values, avg_sil)
plot(k_values, avg_sil_values, type = "b", pch = 19, frame = FALSE,
xlab = "Number of clusters K",
ylab = "Average Silhouette Width",
main = "Silhouette Analysis (Sample 2000 obs)")
Terdapat perbedaan penentuan jumlah klaster optimal antara Elbow Method
(k=3) dan Silhouette Analysis (k=6). Hal ini umum terjadi pada dataset
dengan dimensi tinggi dan sebaran data yang padat. Elbow Method
menunjukkan bahwa penurunan inersia paling signifikan terjadi hingga
k=3, sementara Silhouette memberikan skor validitas tertinggi pada k=6.
Untuk keseimbangan antara efisiensi model dan kemudahan interpretasi
(interpretability), maka dipilih k=3 sebagai jumlah klaster final
set.seed(123)
k_fix <- 3
km_res <- kmeans(df_scaled, centers = k_fix, nstart = 25)
kmed_res <- kcca(df_scaled, k = k_fix, family = kccaFamily("kmedians"))
## Found more than one class "kcca" in cache; using the first, from namespace 'flexclust'
## Also defined by 'kernlab'
## Found more than one class "kcca" in cache; using the first, from namespace 'flexclust'
## Also defined by 'kernlab'
Note: eps 1.5 - 2.0 biasanya pas untuk data yang sudah di-scale
db_res <- dbscan(df_scaled, eps = 1.5, MinPts = 10)
fcm_res <- cmeans(df_scaled, centers = k_fix, m = 2)
pam_res <- pam(df_scaled, k = k_fix)
# Pilih: Rented Bike, Hour, Temperature, Humidity
df_plot <- df_scaled[, c(1, 2, 3, 4)]
par(mfrow = c(2, 3), mar = c(4, 4, 2, 1))
plot(df_plot, col = km_res$cluster, main = "K-means")
plot(df_plot, col = clusters(kmed_res), main = "K-medians")
plot(df_plot, col = db_res$cluster + 1L, main = "DBSCAN (0 = Noise)")
plot(df_plot, col = fcm_res$cluster, main = "Fuzzy C-means")
plot(df_plot, col = pam_res$clustering, main = "K-Medoids (PAM)")
Berdasarkan visualisasi K-Medoids (PAM), data berhasil terbagi menjadi 3
kelompok yang memiliki karakteristik jelas:
Klaster Hijau (High Demand): Terkonsentrasi pada jam-jam sibuk dengan suhu udara yang hangat/nyaman.
Klaster Hitam (Low Demand): Mencakup periode dengan suhu rendah (dingin) di mana minat penyewaan sepeda menurun drastis.
Klaster Merah (Moderate): Mewakili kondisi rata-rata atau transisi di antara kedua kondisi ekstrem tersebut.
Penggunaan K-Medoids terlihat sangat efektif karena mampu memisahkan kelompok data meskipun terdapat tumpang tindih (overlap) pada variabel kelembapan (Humidity).
Karena kita tidak punya “label asli” (Ground Truth), kita tidak bisa pakai ARI. Kita fokus pada metrik internal: Silhouette dan Dunn Index.
# Hitung jarak sekali saja untuk efisiensi
dist_matrix <- dist(df_scaled)
sil_km <- silhouette(km_res$cluster, dist_matrix)
print(paste("Average Silhouette K-Means:", mean(sil_km[, 3])))
## [1] "Average Silhouette K-Means: 0.229233963480475"
fpc::cluster.stats sangat berguna di sini
stats_km <- cluster.stats(dist_matrix, km_res$cluster)
print(paste("Dunn Index K-Means:", stats_km$dunn))
## [1] "Dunn Index K-Means: 0.00453798826849493"
print(paste("Within-cluster SS:", km_res$tot.withinss))
## [1] "Within-cluster SS: 58631.8274803518"
Alasan ARI tidak dipakai karena Adjusted Rand Index (ARI) digunakan jika kita sudah tahu jawaban benarnya (misal: kita tahu baris 1-100 itu pasti ‘Musim Dingin’). Karena ini adalah Unsupervised Learning (kita sedang mencari kelompok yang belum diketahui), kita tidak punya pembandingnya.
Jadi, Evaluasi dilakukan menggunakan Silhouette Width untuk mengukur seberapa baik sebuah objek cocok dengan klasternya sendiri dibandingkan klaster lain, serta Dunn Index untuk melihat seberapa padat dan terpisah antar klaster yang terbentuk. Karena dataset ini tidak memiliki label kategori awal, metrik internal ini menjadi acuan utama dalam menentukan kualitas segmentasi.