library(readr)
## Warning: package 'readr' was built under R version 4.5.3
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.3
## Warning: package 'forcats' was built under R version 4.5.3
## Warning: package 'lubridate' was built under R version 4.5.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ purrr     1.2.0
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.1     ✔ tibble    3.3.0
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(flexclust)
## Warning: package 'flexclust' was built under R version 4.5.3
library(dbscan)
## Warning: package 'dbscan' was built under R version 4.5.3
## 
## Attaching package: 'dbscan'
## 
## The following object is masked from 'package:stats':
## 
##     as.dendrogram
library(meanShiftR)
library(e1071)
## Warning: package 'e1071' was built under R version 4.5.3
## 
## Attaching package: 'e1071'
## 
## The following object is masked from 'package:flexclust':
## 
##     bclust
## 
## The following object is masked from 'package:ggplot2':
## 
##     element
library(cluster)
## Warning: package 'cluster' was built under R version 4.5.3
library(fpc)
## Warning: package 'fpc' was built under R version 4.5.3
## 
## Attaching package: 'fpc'
## 
## The following object is masked from 'package:dbscan':
## 
##     dbscan
library(mclust)
## Warning: package 'mclust' was built under R version 4.5.3
## Package 'mclust' version 6.1.2
## Type 'citation("mclust")' for citing this R package in publications.
## 
## Attaching package: 'mclust'
## 
## The following object is masked from 'package:dplyr':
## 
##     count
## 
## The following object is masked from 'package:purrr':
## 
##     map
df <- read_csv("SeoulBikeData.csv", locale = locale(encoding = "latin1"))
## Rows: 8760 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): Date, Seasons, Holiday, Functioning Day
## dbl (10): Rented Bike Count, Hour, Temperature(°C), Humidity(%), Wind speed ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(df)
## # A tibble: 6 × 14
##   Date       `Rented Bike Count`  Hour `Temperature(°C)` `Humidity(%)`
##   <chr>                    <dbl> <dbl>             <dbl>         <dbl>
## 1 01/12/2017                 254     0              -5.2            37
## 2 01/12/2017                 204     1              -5.5            38
## 3 01/12/2017                 173     2              -6              39
## 4 01/12/2017                 107     3              -6.2            40
## 5 01/12/2017                  78     4              -6              36
## 6 01/12/2017                 100     5              -6.4            37
## # ℹ 9 more variables: `Wind speed (m/s)` <dbl>, `Visibility (10m)` <dbl>,
## #   `Dew point temperature(°C)` <dbl>, `Solar Radiation (MJ/m2)` <dbl>,
## #   `Rainfall(mm)` <dbl>, `Snowfall (cm)` <dbl>, Seasons <chr>, Holiday <chr>,
## #   `Functioning Day` <chr>

Seleksi Fitur

Dipilih 10 fitur numerik utama (kolom 2 sampai 11) yang bersifat kontinu, yaitu mulai dari Rented Bike Count hingga Snowfall. Pemilihan ini dilakukan karena algoritma clustering berbasis jarak seperti K-Means dan K-Median bekerja lebih baik pada data yang memiliki nilai yang bertahap dan tidak diskrit. Fitur-fitur tersebut mewakili kondisi lingkungan seperti suhu, kelembaban, dan radiasi matahari, serta kondisi cuaca seperti hujan dan salju. Faktor-faktor ini secara logis memengaruhi keputusan seseorang dalam menyewa sepeda. Sementara itu, data kategorikal dan biner seperti Seasons, Holiday dan Functioning Day tidak digunakan agar tidak memengaruhi perhitungan jarak. Dengan demikian, hasil pengelompokan yang terbentuk benar-benar didasarkan pada pola aktivitas dan kondisi lingkungan yang dapat diukur.

df_selected <- df[, c("Rented Bike Count", "Hour", "Temperature(°C)", "Humidity(%)", 
                      "Wind speed (m/s)", "Visibility (10m)", "Dew point temperature(°C)", 
                      "Solar Radiation (MJ/m2)", "Rainfall(mm)", "Snowfall (cm)")]

Cek Data

str(df_selected)
## tibble [8,760 × 10] (S3: tbl_df/tbl/data.frame)
##  $ Rented Bike Count        : num [1:8760] 254 204 173 107 78 100 181 460 930 490 ...
##  $ Hour                     : num [1:8760] 0 1 2 3 4 5 6 7 8 9 ...
##  $ Temperature(°C)          : num [1:8760] -5.2 -5.5 -6 -6.2 -6 -6.4 -6.6 -7.4 -7.6 -6.5 ...
##  $ Humidity(%)              : num [1:8760] 37 38 39 40 36 37 35 38 37 27 ...
##  $ Wind speed (m/s)         : num [1:8760] 2.2 0.8 1 0.9 2.3 1.5 1.3 0.9 1.1 0.5 ...
##  $ Visibility (10m)         : num [1:8760] 2000 2000 2000 2000 2000 ...
##  $ Dew point temperature(°C): num [1:8760] -17.6 -17.6 -17.7 -17.6 -18.6 -18.7 -19.5 -19.3 -19.8 -22.4 ...
##  $ Solar Radiation (MJ/m2)  : num [1:8760] 0 0 0 0 0 0 0 0 0.01 0.23 ...
##  $ Rainfall(mm)             : num [1:8760] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Snowfall (cm)            : num [1:8760] 0 0 0 0 0 0 0 0 0 0 ...

Cek missing value

sum(is.na(df_selected))
## [1] 0

Prepare data (Scale to ensure distance metrics aren’t biased by units)

# Mengubah data menjadi skala yang sama (Mean = 0, SD = 1)
df_scaled <- scale(df_selected)
df_scaled <- as.data.frame(df_scaled)

head(df_scaled)
##   Rented Bike Count       Hour Temperature(°C) Humidity(%) Wind speed (m/s)
## 1        -0.6986106 -1.6612299       -1.513871  -1.0424234        0.4584496
## 2        -0.7761303 -1.5167752       -1.538986  -0.9933133       -0.8925105
## 3        -0.8241925 -1.3723204       -1.580845  -0.9442032       -0.6995162
## 4        -0.9265185 -1.2278656       -1.597589  -0.8950931       -0.7960134
## 5        -0.9714799 -1.0834108       -1.580845  -1.0915335        0.5549468
## 6        -0.9373712 -0.9389561       -1.614333  -1.0424234       -0.2170305
##   Visibility (10m) Dew point temperature(°C) Solar Radiation (MJ/m2)
## 1        0.9258185                 -1.659510              -0.6550943
## 2        0.9258185                 -1.659510              -0.6550943
## 3        0.9258185                 -1.667167              -0.6550943
## 4        0.9258185                 -1.659510              -0.6550943
## 5        0.9258185                 -1.736077              -0.6550943
## 6        0.9258185                 -1.743734              -0.6550943
##   Rainfall(mm) Snowfall (cm)
## 1   -0.1317924    -0.1718813
## 2   -0.1317924    -0.1718813
## 3   -0.1317924    -0.1718813
## 4   -0.1317924    -0.1718813
## 5   -0.1317924    -0.1718813
## 6   -0.1317924    -0.1718813

number of K

Elbow Method using base R and a loop

set.seed(123)
wss <- sapply(1:10, function(k){
  kmeans(df_scaled, centers = k, nstart = 20)$tot.withinss
})
## Warning: did not converge in 10 iterations
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 438000)
par(mfrow = c(1, 1))
plot(1:10, wss, type = "b", pch = 19, frame = FALSE,
     xlab = "Number of clusters K",
     ylab = "Total within-clusters sum of squares",
     main = "Elbow Method (Scaled Data)")

# Silhouette Analysis(dengan sample) Mengambil 2000 baris acak agar perhitungan jarak tidak membebani RAM

set.seed(123)
df_sample <- df_scaled[sample(1:nrow(df_scaled), 2000), ]

avg_sil <- function(k) {
  km_res <- kmeans(df_sample, centers = k, nstart = 25)
  ss <- silhouette(km_res$cluster, dist(df_sample))
  mean(ss[, 3])
}

k_values <- 2:10
avg_sil_values <- sapply(k_values, avg_sil)

plot(k_values, avg_sil_values, type = "b", pch = 19, frame = FALSE,
     xlab = "Number of clusters K",
     ylab = "Average Silhouette Width",
     main = "Silhouette Analysis (Sample 2000 obs)")

Terdapat perbedaan penentuan jumlah klaster optimal antara Elbow Method (k=3) dan Silhouette Analysis (k=6). Hal ini umum terjadi pada dataset dengan dimensi tinggi dan sebaran data yang padat. Elbow Method menunjukkan bahwa penurunan inersia paling signifikan terjadi hingga k=3, sementara Silhouette memberikan skor validitas tertinggi pada k=6. Untuk keseimbangan antara efisiensi model dan kemudahan interpretasi (interpretability), maka dipilih k=3 sebagai jumlah klaster final

Metode

set.seed(123)
k_fix <- 3

1. Kmeans

km_res <- kmeans(df_scaled, centers = k_fix, nstart = 25)

2. K-Median

kmed_res <- kcca(df_scaled, k = k_fix, family = kccaFamily("kmedians"))
## Found more than one class "kcca" in cache; using the first, from namespace 'flexclust'
## Also defined by 'kernlab'
## Found more than one class "kcca" in cache; using the first, from namespace 'flexclust'
## Also defined by 'kernlab'

3. DBSCAN

Note: eps 1.5 - 2.0 biasanya pas untuk data yang sudah di-scale

db_res <- dbscan(df_scaled, eps = 1.5, MinPts = 10)

4. Fuzzy C-means

fcm_res <- cmeans(df_scaled, centers = k_fix, m = 2)

5. K-Medoids (PAM)

Ganti MeanShift ke PAM karena MeanShift sering crash di data 8rb baris

pam_res <- pam(df_scaled, k = k_fix)

Visualization Comparison

# Pilih: Rented Bike, Hour, Temperature, Humidity
df_plot <- df_scaled[, c(1, 2, 3, 4)]

par(mfrow = c(2, 3), mar = c(4, 4, 2, 1))
plot(df_plot, col = km_res$cluster, main = "K-means")

plot(df_plot, col = clusters(kmed_res), main = "K-medians")

plot(df_plot, col = db_res$cluster + 1L, main = "DBSCAN (0 = Noise)")

plot(df_plot, col = fcm_res$cluster, main = "Fuzzy C-means")

plot(df_plot, col = pam_res$clustering, main = "K-Medoids (PAM)")

Berdasarkan visualisasi K-Medoids (PAM), data berhasil terbagi menjadi 3 kelompok yang memiliki karakteristik jelas:

Penggunaan K-Medoids terlihat sangat efektif karena mampu memisahkan kelompok data meskipun terdapat tumpang tindih (overlap) pada variabel kelembapan (Humidity).

metrics

Karena kita tidak punya “label asli” (Ground Truth), kita tidak bisa pakai ARI. Kita fokus pada metrik internal: Silhouette dan Dunn Index.

# Hitung jarak sekali saja untuk efisiensi
dist_matrix <- dist(df_scaled)

1. Silhouette Score (K-Means)

sil_km <- silhouette(km_res$cluster, dist_matrix)
print(paste("Average Silhouette K-Means:", mean(sil_km[, 3])))
## [1] "Average Silhouette K-Means: 0.229233963480475"

2. Dunn Index (K-Means)

fpc::cluster.stats sangat berguna di sini

stats_km <- cluster.stats(dist_matrix, km_res$cluster)
print(paste("Dunn Index K-Means:", stats_km$dunn))
## [1] "Dunn Index K-Means: 0.00453798826849493"
print(paste("Within-cluster SS:", km_res$tot.withinss))
## [1] "Within-cluster SS: 58631.8274803518"

Alasan ARI tidak dipakai karena Adjusted Rand Index (ARI) digunakan jika kita sudah tahu jawaban benarnya (misal: kita tahu baris 1-100 itu pasti ‘Musim Dingin’). Karena ini adalah Unsupervised Learning (kita sedang mencari kelompok yang belum diketahui), kita tidak punya pembandingnya.

Jadi, Evaluasi dilakukan menggunakan Silhouette Width untuk mengukur seberapa baik sebuah objek cocok dengan klasternya sendiri dibandingkan klaster lain, serta Dunn Index untuk melihat seberapa padat dan terpisah antar klaster yang terbentuk. Karena dataset ini tidak memiliki label kategori awal, metrik internal ini menjadi acuan utama dalam menentukan kualitas segmentasi.