Clustering Kelompok 12

Implementasi Clustering

Implementasi clustering dilakukan menggunakan metode K-Means, K-Medians, DBSCAN, Mean Shift, dan Fuzzy C-Means, pada data yang telah dinormalisasi. Setiap metode digunakan untuk mengelompokkan data berdasarkan kemiripan karakteristik, kemudian hasilnya divisualisasikan untuk melihat pola cluster yang terbentuk.

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.5.3

## Warning: package 'ggplot2' was built under R version 4.5.3

## Warning: package 'tidyr' was built under R version 4.5.3

## Warning: package 'readr' was built under R version 4.5.3

## Warning: package 'purrr' was built under R version 4.5.3

## Warning: package 'forcats' was built under R version 4.5.3

## Warning: package 'lubridate' was built under R version 4.5.3

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.0     ✔ readr     2.2.0
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.2     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(flexclust)

## Warning: package 'flexclust' was built under R version 4.5.3

library(dbscan)

## Warning: package 'dbscan' was built under R version 4.5.3

## 
## Attaching package: 'dbscan'
## 
## The following object is masked from 'package:stats':
## 
##     as.dendrogram

library(meanShiftR)
library(e1071)

## Warning: package 'e1071' was built under R version 4.5.3

## 
## Attaching package: 'e1071'
## 
## The following object is masked from 'package:flexclust':
## 
##     bclust
## 
## The following object is masked from 'package:ggplot2':
## 
##     element

library(cluster)

## Warning: package 'cluster' was built under R version 4.5.3

library(fpc)

## Warning: package 'fpc' was built under R version 4.5.3

## 
## Attaching package: 'fpc'
## 
## The following object is masked from 'package:dbscan':
## 
##     dbscan

library(mclust)

## Warning: package 'mclust' was built under R version 4.5.3

## Package 'mclust' version 6.1.2
## Type 'citation("mclust")' for citing this R package in publications.
## 
## Attaching package: 'mclust'
## 
## The following object is masked from 'package:dplyr':
## 
##     count
## 
## The following object is masked from 'package:purrr':
## 
##     map

library(ggplot2)
library(GGally)

## Warning: package 'GGally' was built under R version 4.5.3

library(corrplot)

## corrplot 0.95 loaded

library(factoextra)

## Warning: package 'factoextra' was built under R version 4.5.3

## Welcome to factoextra!
## Want to learn more? See two factoextra-related books at https://www.datanovia.com/en/product/practical-guide-to-principal-component-methods-in-r/

DATA UNDERSTANDING

df_raw <- read.csv("D:/PERKULIAHAN/Semester 4/Analisis Multivariat/Modul 3/archive/gym_members_exercise_tracking.csv")

head(df_raw)

##   Age Gender Weight..kg. Height..m. Max_BPM Avg_BPM Resting_BPM
## 1  56   Male        88.3       1.71     180     157          60
## 2  46 Female        74.9       1.53     179     151          66
## 3  32 Female        68.1       1.66     167     122          54
## 4  25   Male        53.2       1.70     190     164          56
## 5  38   Male        46.1       1.79     188     158          68
## 6  56 Female        58.0       1.68     168     156          74
##   Session_Duration..hours. Calories_Burned Workout_Type Fat_Percentage
## 1                     1.69            1313         Yoga           12.6
## 2                     1.30             883         HIIT           33.9
## 3                     1.11             677       Cardio           33.4
## 4                     0.59             532     Strength           28.8
## 5                     0.64             556     Strength           29.2
## 6                     1.59            1116         HIIT           15.5
##   Water_Intake..liters. Workout_Frequency..days.week. Experience_Level   BMI
## 1                   3.5                             4                3 30.20
## 2                   2.1                             4                2 32.00
## 3                   2.3                             4                2 24.71
## 4                   2.1                             3                1 18.41
## 5                   2.8                             3                1 14.39
## 6                   2.7                             5                3 20.55

str(df_raw)

## 'data.frame':    973 obs. of  15 variables:
##  $ Age                          : int  56 46 32 25 38 56 36 40 28 28 ...
##  $ Gender                       : chr  "Male" "Female" "Female" "Male" ...
##  $ Weight..kg.                  : num  88.3 74.9 68.1 53.2 46.1 ...
##  $ Height..m.                   : num  1.71 1.53 1.66 1.7 1.79 1.68 1.72 1.51 1.94 1.84 ...
##  $ Max_BPM                      : int  180 179 167 190 188 168 174 189 185 169 ...
##  $ Avg_BPM                      : int  157 151 122 164 158 156 169 141 127 136 ...
##  $ Resting_BPM                  : int  60 66 54 56 68 74 73 64 52 64 ...
##  $ Session_Duration..hours.     : num  1.69 1.3 1.11 0.59 0.64 1.59 1.49 1.27 1.03 1.08 ...
##  $ Calories_Burned              : num  1313 883 677 532 556 ...
##  $ Workout_Type                 : chr  "Yoga" "HIIT" "Cardio" "Strength" ...
##  $ Fat_Percentage               : num  12.6 33.9 33.4 28.8 29.2 15.5 21.3 30.6 28.9 29.7 ...
##  $ Water_Intake..liters.        : num  3.5 2.1 2.3 2.1 2.8 2.7 2.3 1.9 2.6 2.7 ...
##  $ Workout_Frequency..days.week.: int  4 4 4 3 3 5 3 3 4 3 ...
##  $ Experience_Level             : int  3 2 2 1 1 3 2 2 2 1 ...
##  $ BMI                          : num  30.2 32 24.7 18.4 14.4 ...

dim (df_raw)

## [1] 973  15

summary(df_raw)

##       Age           Gender           Weight..kg.       Height..m.   
##  Min.   :18.00   Length:973         Min.   : 40.00   Min.   :1.500  
##  1st Qu.:28.00   Class :character   1st Qu.: 58.10   1st Qu.:1.620  
##  Median :40.00   Mode  :character   Median : 70.00   Median :1.710  
##  Mean   :38.68                      Mean   : 73.85   Mean   :1.723  
##  3rd Qu.:49.00                      3rd Qu.: 86.00   3rd Qu.:1.800  
##  Max.   :59.00                      Max.   :129.90   Max.   :2.000  
##     Max_BPM         Avg_BPM       Resting_BPM    Session_Duration..hours.
##  Min.   :160.0   Min.   :120.0   Min.   :50.00   Min.   :0.500           
##  1st Qu.:170.0   1st Qu.:131.0   1st Qu.:56.00   1st Qu.:1.040           
##  Median :180.0   Median :143.0   Median :62.00   Median :1.260           
##  Mean   :179.9   Mean   :143.8   Mean   :62.22   Mean   :1.256           
##  3rd Qu.:190.0   3rd Qu.:156.0   3rd Qu.:68.00   3rd Qu.:1.460           
##  Max.   :199.0   Max.   :169.0   Max.   :74.00   Max.   :2.000           
##  Calories_Burned  Workout_Type       Fat_Percentage  Water_Intake..liters.
##  Min.   : 303.0   Length:973         Min.   :10.00   Min.   :1.500        
##  1st Qu.: 720.0   Class :character   1st Qu.:21.30   1st Qu.:2.200        
##  Median : 893.0   Mode  :character   Median :26.20   Median :2.600        
##  Mean   : 905.4                      Mean   :24.98   Mean   :2.627        
##  3rd Qu.:1076.0                      3rd Qu.:29.30   3rd Qu.:3.100        
##  Max.   :1783.0                      Max.   :35.00   Max.   :3.700        
##  Workout_Frequency..days.week. Experience_Level      BMI       
##  Min.   :2.000                 Min.   :1.00     Min.   :12.32  
##  1st Qu.:3.000                 1st Qu.:1.00     1st Qu.:20.11  
##  Median :3.000                 Median :2.00     Median :24.16  
##  Mean   :3.322                 Mean   :1.81     Mean   :24.91  
##  3rd Qu.:4.000                 3rd Qu.:2.00     3rd Qu.:28.56  
##  Max.   :5.000                 Max.   :3.00     Max.   :49.84

Data Cleaning

# Cek missing Value
cat("\nJumlah Missing Value\n")

## 
## Jumlah Missing Value

colSums(is.na(df_raw))

##                           Age                        Gender 
##                             0                             0 
##                   Weight..kg.                    Height..m. 
##                             0                             0 
##                       Max_BPM                       Avg_BPM 
##                             0                             0 
##                   Resting_BPM      Session_Duration..hours. 
##                             0                             0 
##               Calories_Burned                  Workout_Type 
##                             0                             0 
##                Fat_Percentage         Water_Intake..liters. 
##                             0                             0 
## Workout_Frequency..days.week.              Experience_Level 
##                             0                             0 
##                           BMI 
##                             0

# Cek duplikat
cat("\nJumlah Baris Duplikat\n")

## 
## Jumlah Baris Duplikat

sum(duplicated(df_raw))

## [1] 0

# Hapus duplikat jika ada
df_clean <- df_raw[!duplicated(df_raw), ]
cat("\nDimensi setelah hapus duplikat:", dim(df_clean), "\n")

## 
## Dimensi setelah hapus duplikat: 973 15

# Rename kolom supaya lebih clean (hapus spasi & simbol)
colnames(df_clean) <- c("Age", "Weight_kg", "Height_m",
                       "Max_BPM", "Avg_BPM", "Resting_BPM",
                       "Session_Duration_hours", "Fat_Percentage",
                       "Water_Intake_liters", "Workout_Frequency_days")

# Cek nama kolom baru
cat("\n Nama Kolom Setelah Rename\n")

## 
##  Nama Kolom Setelah Rename

colnames(df_clean)

##  [1] "Age"                    "Weight_kg"              "Height_m"              
##  [4] "Max_BPM"                "Avg_BPM"                "Resting_BPM"           
##  [7] "Session_Duration_hours" "Fat_Percentage"         "Water_Intake_liters"   
## [10] "Workout_Frequency_days" NA                       NA                      
## [13] NA                       NA                       NA

AMBIL FITUR NUMERIK (SEBELUM SELEKSI)

colnames(df_clean) <- c("Age", "Weight_kg", "Height_m", "Max_BPM", 
                        "Avg_BPM", "Resting_BPM", "Session_Duration_hours", 
                        "Calories_Burned", "Fat_Percentage", "Water_Intake_liters", 
                        "Workout_Frequency_days", "Experience_Level", "BMI")

df_numeric <- df_clean %>%
  select(Age, Weight_kg, Height_m, Max_BPM, Avg_BPM, Resting_BPM,
         Session_Duration_hours, Calories_Burned, Fat_Percentage,
         Water_Intake_liters, Workout_Frequency_days,
         Experience_Level, BMI)
dim(df_numeric)

## [1] 973  13

str(df_numeric)

## 'data.frame':    973 obs. of  13 variables:
##  $ Age                   : int  56 46 32 25 38 56 36 40 28 28 ...
##  $ Weight_kg             : chr  "Male" "Female" "Female" "Male" ...
##  $ Height_m              : num  88.3 74.9 68.1 53.2 46.1 ...
##  $ Max_BPM               : num  1.71 1.53 1.66 1.7 1.79 1.68 1.72 1.51 1.94 1.84 ...
##  $ Avg_BPM               : int  180 179 167 190 188 168 174 189 185 169 ...
##  $ Resting_BPM           : int  157 151 122 164 158 156 169 141 127 136 ...
##  $ Session_Duration_hours: int  60 66 54 56 68 74 73 64 52 64 ...
##  $ Calories_Burned       : num  1.69 1.3 1.11 0.59 0.64 1.59 1.49 1.27 1.03 1.08 ...
##  $ Fat_Percentage        : num  1313 883 677 532 556 ...
##  $ Water_Intake_liters   : chr  "Yoga" "HIIT" "Cardio" "Strength" ...
##  $ Workout_Frequency_days: num  12.6 33.9 33.4 28.8 29.2 15.5 21.3 30.6 28.9 29.7 ...
##  $ Experience_Level      : num  3.5 2.1 2.3 2.1 2.8 2.7 2.3 1.9 2.6 2.7 ...
##  $ BMI                   : int  4 4 4 3 3 5 3 3 4 3 ...

CEK KORELASI (SEBELUM FEATURE SELECTION)

# Pastikan hanya kolom angka yang dihitung
df_numeric_fix <- df_numeric %>% select(where(is.numeric))

# Hitung correlation matrix
cor_matrix <- cor(df_numeric_fix, use = "complete.obs")
 
cat("Correlation Matrix\n")

## Correlation Matrix

print(round(cor_matrix, 2))

##                          Age Height_m Max_BPM Avg_BPM Resting_BPM
## Age                     1.00    -0.04   -0.03   -0.02        0.04
## Height_m               -0.04     1.00    0.37    0.06        0.01
## Max_BPM                -0.03     0.37    1.00   -0.02       -0.01
## Avg_BPM                -0.02     0.06   -0.02    1.00       -0.04
## Resting_BPM             0.04     0.01   -0.01   -0.04        1.00
## Session_Duration_hours  0.00    -0.03   -0.01    0.04        0.06
## Calories_Burned        -0.02    -0.01   -0.01    0.01        0.02
## Fat_Percentage         -0.15     0.10    0.09    0.00        0.34
## Workout_Frequency_days  0.00    -0.23   -0.24   -0.01       -0.01
## Experience_Level        0.04     0.39    0.39    0.03        0.00
## BMI                     0.01    -0.01   -0.01   -0.03       -0.01
##                        Session_Duration_hours Calories_Burned Fat_Percentage
## Age                                      0.00           -0.02          -0.15
## Height_m                                -0.03           -0.01           0.10
## Max_BPM                                 -0.01           -0.01           0.09
## Avg_BPM                                  0.04            0.01           0.00
## Resting_BPM                              0.06            0.02           0.34
## Session_Duration_hours                   1.00           -0.02           0.02
## Calories_Burned                         -0.02            1.00           0.91
## Fat_Percentage                           0.02            0.91           1.00
## Workout_Frequency_days                  -0.02           -0.58          -0.60
## Experience_Level                         0.01            0.28           0.36
## BMI                                     -0.01            0.64           0.58
##                        Workout_Frequency_days Experience_Level   BMI
## Age                                      0.00             0.04  0.01
## Height_m                                -0.23             0.39 -0.01
## Max_BPM                                 -0.24             0.39 -0.01
## Avg_BPM                                 -0.01             0.03 -0.03
## Resting_BPM                             -0.01             0.00 -0.01
## Session_Duration_hours                  -0.02             0.01 -0.01
## Calories_Burned                         -0.58             0.28  0.64
## Fat_Percentage                          -0.60             0.36  0.58
## Workout_Frequency_days                   1.00            -0.59 -0.54
## Experience_Level                        -0.59             1.00  0.24
## BMI                                     -0.54             0.24  1.00

cat("Correlation Matrix\n")

## Correlation Matrix

print(round(cor_matrix, 2))

##                          Age Height_m Max_BPM Avg_BPM Resting_BPM
## Age                     1.00    -0.04   -0.03   -0.02        0.04
## Height_m               -0.04     1.00    0.37    0.06        0.01
## Max_BPM                -0.03     0.37    1.00   -0.02       -0.01
## Avg_BPM                -0.02     0.06   -0.02    1.00       -0.04
## Resting_BPM             0.04     0.01   -0.01   -0.04        1.00
## Session_Duration_hours  0.00    -0.03   -0.01    0.04        0.06
## Calories_Burned        -0.02    -0.01   -0.01    0.01        0.02
## Fat_Percentage         -0.15     0.10    0.09    0.00        0.34
## Workout_Frequency_days  0.00    -0.23   -0.24   -0.01       -0.01
## Experience_Level        0.04     0.39    0.39    0.03        0.00
## BMI                     0.01    -0.01   -0.01   -0.03       -0.01
##                        Session_Duration_hours Calories_Burned Fat_Percentage
## Age                                      0.00           -0.02          -0.15
## Height_m                                -0.03           -0.01           0.10
## Max_BPM                                 -0.01           -0.01           0.09
## Avg_BPM                                  0.04            0.01           0.00
## Resting_BPM                              0.06            0.02           0.34
## Session_Duration_hours                   1.00           -0.02           0.02
## Calories_Burned                         -0.02            1.00           0.91
## Fat_Percentage                           0.02            0.91           1.00
## Workout_Frequency_days                  -0.02           -0.58          -0.60
## Experience_Level                         0.01            0.28           0.36
## BMI                                     -0.01            0.64           0.58
##                        Workout_Frequency_days Experience_Level   BMI
## Age                                      0.00             0.04  0.01
## Height_m                                -0.23             0.39 -0.01
## Max_BPM                                 -0.24             0.39 -0.01
## Avg_BPM                                 -0.01             0.03 -0.03
## Resting_BPM                             -0.01             0.00 -0.01
## Session_Duration_hours                  -0.02             0.01 -0.01
## Calories_Burned                         -0.58             0.28  0.64
## Fat_Percentage                          -0.60             0.36  0.58
## Workout_Frequency_days                   1.00            -0.59 -0.54
## Experience_Level                        -0.59             1.00  0.24
## BMI                                     -0.54             0.24  1.00

#Heatmap korelasi semua fitur numerik
corrplot(cor_matrix, method = "color", type = "upper")

#Identifikasi pasangan fitur HIGHLY CORRELATED (|r| > 0.8)
cat("\nPasangan Fitur dengan Korelasi Tinggi (|r| > 0.8)\n")

## 
## Pasangan Fitur dengan Korelasi Tinggi (|r| > 0.8)

cor_pairs <- which(abs(cor_matrix) > 0.8 & abs(cor_matrix) < 1,
                   arr.ind = TRUE)
if (nrow(cor_pairs) > 0) {
  for (i in 1:nrow(cor_pairs)) {
    r1 <- rownames(cor_matrix)[cor_pairs[i, 1]]
    r2 <- colnames(cor_matrix)[cor_pairs[i, 2]]
    if (cor_pairs[i, 1] < cor_pairs[i, 2]) {
      cat(sprintf("  %s <-> %s : r = %.3f\n",
                  r1, r2,
                  cor_matrix[cor_pairs[i, 1], cor_pairs[i, 2]]))
    }
  }
} else {
  cat("  Tidak ada pasangan dengan |r| > 0.8\n")
}

##   Calories_Burned <-> Fat_Percentage : r = 0.908

#FEATURE SELECTION (BERDASARKAN HASIL KORELASI)
features_selected <- c("Age", "Weight_kg", "Height_m",
                       "Max_BPM", "Avg_BPM", "Resting_BPM",
                       "Session_Duration_hours", "Fat_Percentage",
                       "Water_Intake_liters", "Workout_Frequency_days")

df_selected <- df_clean[, features_selected]

cat(" Fitur yang Digunakan (setelah seleksi) \n")

##  Fitur yang Digunakan (setelah seleksi)

print(features_selected)

##  [1] "Age"                    "Weight_kg"              "Height_m"              
##  [4] "Max_BPM"                "Avg_BPM"                "Resting_BPM"           
##  [7] "Session_Duration_hours" "Fat_Percentage"         "Water_Intake_liters"   
## [10] "Workout_Frequency_days"

cat("\nDimensi df_selected:", dim(df_selected), "\n")

## 
## Dimensi df_selected: 973 10

# Filter hanya yang angka sebelum korelasi
df_selected_numeric <- df_selected %>% select(where(is.numeric))

# Konfirmasi korelasi setelah seleksi
cor_after <- cor(df_selected_numeric, use = "complete.obs")

cat("\nCek Korelasi Tinggi Setelah Seleksi\n")

## 
## Cek Korelasi Tinggi Setelah Seleksi

cor_pairs_after <- which(abs(cor_after) > 0.8 & abs(cor_after) < 1,
                         arr.ind = TRUE)
if (nrow(cor_pairs_after) > 0) {
  for (i in 1:nrow(cor_pairs_after)) {
    r1 <- rownames(cor_after)[cor_pairs_after[i, 1]]
    r2 <- colnames(cor_after)[cor_pairs_after[i, 2]]
    if (cor_pairs_after[i, 1] < cor_pairs_after[i, 2]) {
      cat(sprintf("  %s <-> %s : r = %.3f\n",
                  r1, r2,
                  cor_after[cor_pairs_after[i, 1],
                             cor_pairs_after[i, 2]]))
    }
  }
} else {
  cat("  Tidak ada lagi pasangan dengan |r| > 0.8\n")
}

##   Tidak ada lagi pasangan dengan |r| > 0.8

# Heatmap korelasi setelah seleksi
corrplot(cor_after, method = "color", type = "upper")

DETEKSI & HANDLING OUTLIER (IQR Method)

cat("Deteksi Outlier per Fitur (IQR Method)\n")

## Deteksi Outlier per Fitur (IQR Method)

# Fungsi deteksi outlier menggunakan IQR
count_outliers <- function(x) {
  Q1 <- quantile(x, 0.25, na.rm = TRUE)
  Q3 <- quantile(x, 0.75, na.rm = TRUE)
  IQR_val <- Q3 - Q1
  lower <- Q1 - 1.5 * IQR_val
  upper <- Q3 + 1.5 * IQR_val
  sum(x < lower | x > upper, na.rm = TRUE)
}

# Pastikan hanya kolom numerik yang dihitung
outlier_counts <- sapply(df_selected[, sapply(df_selected, is.numeric)], count_outliers)

print(outlier_counts)

##                    Age               Height_m                Max_BPM 
##                      0                      9                      0 
##                Avg_BPM            Resting_BPM Session_Duration_hours 
##                      0                      0                      0 
##         Fat_Percentage Workout_Frequency_days 
##                     10                      0

# Visualisasi Boxplot untuk deteksi outlier

# Filter hanya fitur yang benar-benar numerik
numeric_features <- features_selected[sapply(df_selected[features_selected], is.numeric)]

# Sesuaikan layout (mfrow) agar tidak terlalu kosong jika jumlah kolom berkurang
par(mfrow = c(2, 5), mar = c(4, 3, 2, 1))

for (col in numeric_features) {
  boxplot(df_selected[[col]],
          main = col,
          col = "#4ECDC4",
          border = "#2C3E50",
          horizontal = FALSE)
}
par(mfrow = c(1, 1))

# Handling outlier: Capping (Winsorization) lebih aman dari hapus row
# Outlier diganti dengan batas bawah / atas IQR
winsorize <- function(x) {
  Q1 <- quantile(x, 0.25, na.rm = TRUE)
  Q3 <- quantile(x, 0.75, na.rm = TRUE)
  IQR_val <- Q3 - Q1
  lower <- Q1 - 1.5 * IQR_val
  upper <- Q3 + 1.5 * IQR_val
  x[x < lower] <- lower
  x[x > upper] <- upper
  return(x)
}

# Gunakan lapply yang mengecek tipe data terlebih dahulu
df_no_outlier <- as.data.frame(lapply(df_selected, function(x) {
  if(is.numeric(x)) {
    return(winsorize(x))
  } else {
    return(x) # Jika bukan angka, kembalikan apa adanya tanpa di-winsorize
  }
}))

cat("\nOutlier Setelah Winsorization\n")

## 
## Outlier Setelah Winsorization

# Cek outlier hanya pada kolom yang numerik saja agar tidak error lagi
numeric_cols <- sapply(df_no_outlier, is.numeric)
print(sapply(df_no_outlier[, numeric_cols], count_outliers))

##                    Age               Height_m                Max_BPM 
##                      0                      0                      0 
##                Avg_BPM            Resting_BPM Session_Duration_hours 
##                      0                      0                      0 
##         Fat_Percentage Workout_Frequency_days 
##                      0                      0

EXPLORATORY DATA ANALYSIS (EDA)

#Statistik Deskriptif
cat(" Statistik Deskriptif\n")

##  Statistik Deskriptif

print(summary(df_no_outlier))

##       Age         Weight_kg            Height_m         Max_BPM     
##  Min.   :18.00   Length:973         Min.   : 40.00   Min.   :1.500  
##  1st Qu.:28.00   Class :character   1st Qu.: 58.10   1st Qu.:1.620  
##  Median :40.00   Mode  :character   Median : 70.00   Median :1.710  
##  Mean   :38.68                      Mean   : 73.85   Mean   :1.723  
##  3rd Qu.:49.00                      3rd Qu.: 86.00   3rd Qu.:1.800  
##  Max.   :59.00                      Max.   :127.85   Max.   :2.000  
##     Avg_BPM       Resting_BPM    Session_Duration_hours Fat_Percentage  
##  Min.   :160.0   Min.   :120.0   Min.   :50.00          Min.   : 303.0  
##  1st Qu.:170.0   1st Qu.:131.0   1st Qu.:56.00          1st Qu.: 720.0  
##  Median :180.0   Median :143.0   Median :62.00          Median : 893.0  
##  Mean   :179.9   Mean   :143.8   Mean   :62.22          Mean   : 904.6  
##  3rd Qu.:190.0   3rd Qu.:156.0   3rd Qu.:68.00          3rd Qu.:1076.0  
##  Max.   :199.0   Max.   :169.0   Max.   :74.00          Max.   :1610.0  
##  Water_Intake_liters Workout_Frequency_days
##  Length:973          Min.   :10.00         
##  Class :character    1st Qu.:21.30         
##  Mode  :character    Median :26.20         
##                      Mean   :24.98         
##                      3rd Qu.:29.30         
##                      Max.   :35.00

# Distribusi Setiap Fitur (Histogram) - Hanya untuk kolom Angka
df_long <- df_no_outlier %>%
  select(where(is.numeric)) %>% 
  pivot_longer(everything(), names_to = "Variable", values_to = "Value")

# Lanjut ke ggplot
ggplot(df_long, aes(x = Value)) +
  geom_histogram(bins = 30, fill = "#4ECDC4", color = "white") +
  facet_wrap(~Variable, scales = "free") +
  theme_minimal() +
  labs(title = "Distribusi Fitur Numerik (Setelah Winsorization)")

ggplot(df_long, aes(x = Value, fill = Variable)) +
  geom_histogram(bins = 30, color = "white", show.legend = FALSE) +
  facet_wrap(~Variable, scales = "free") +
  scale_fill_brewer(palette = "Set3") +
  labs(title = "Distribusi Setiap Fitur (Setelah Outlier Handling)",
       x = "Nilai", y = "Frekuensi") +
  theme_minimal()

#Pairplot (subset 5 fitur agar tidak terlalu berat)
ggpairs(df_no_outlier[, c("Age", "Weight_kg", "Fat_Percentage",
                           "Session_Duration_hours", "Avg_BPM")],
        title = "Pairplot - Fitur Utama Gym Dataset",
        lower = list(continuous = "smooth"),
        diag = list(continuous = "densityDiag")) +
  theme_minimal()

## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

DATA PREPARATION - SCALING (df_scaled)

# Standarisasi Z-score hanya untuk fitur numerik
# Kita filter dulu kolom yang benar-benar angka
df_numeric_only <- df_no_outlier %>% select(where(is.numeric))

# Lakukan scaling
df_scaled <- as.data.frame(scale(df_numeric_only))

# Ubah nama Fitur menjadi X1 sampai X 10
colnames(df_scaled) <- paste0("x", 1:ncol(df_scaled))

cat("Cek df_scaled (mean harus ~0, sd harus ~1)\n")

## Cek df_scaled (mean harus ~0, sd harus ~1)

cat("\nMean per kolom:\n")

## 
## Mean per kolom:

print(round(colMeans(df_scaled), 4))

## x1 x2 x3 x4 x5 x6 x7 x8 
##  0  0  0  0  0  0  0  0

cat("\nSD per kolom:\n")

## 
## SD per kolom:

print(round(apply(df_scaled, 2, sd), 4))

## x1 x2 x3 x4 x5 x6 x7 x8 
##  1  1  1  1  1  1  1  1

cat("\nDimensi df_scaled:", dim(df_scaled), "\n")

## 
## Dimensi df_scaled: 973 8

head(df_scaled)

##            x1          x2          x3          x4         x5         x6
## 1  1.42161147  0.68224629 -0.09849406  0.01007625  0.9224960 -0.3033989
## 2  0.60065595  0.04974144 -1.50782814 -0.07668648  0.5042348  0.5154835
## 3 -0.54868178 -0.27123117 -0.48997575 -1.11783927 -1.5173612 -1.1222812
## 4 -1.12335065 -0.97453881 -0.17679039  0.87770357  1.4104675 -0.8493204
## 5 -0.05610847 -1.30967197  0.52787664  0.70417810  0.9922062  0.7884443
## 6  1.42161147 -0.74796990 -0.33338307 -1.03107653  0.8527858  1.6073266
##            x7         x8
## 1  1.51004885 -1.9773038
## 2 -0.08000631  1.4255680
## 3 -0.84175367  1.3456884
## 4 -1.37793506  0.6107959
## 5 -1.28918780  0.6746996
## 6  0.78158172 -1.5140020

cat("Cek df_scaled (mean harus ~0, sd harus ~1)\n")

## Cek df_scaled (mean harus ~0, sd harus ~1)

cat("\nMean per kolom:\n")

## 
## Mean per kolom:

print(round(colMeans(df_scaled), 4))

## x1 x2 x3 x4 x5 x6 x7 x8 
##  0  0  0  0  0  0  0  0

cat("\nSD per kolom:\n")

## 
## SD per kolom:

print(round(apply(df_scaled, 2, sd), 4))

## x1 x2 x3 x4 x5 x6 x7 x8 
##  1  1  1  1  1  1  1  1

PENENTUAN JUMLAH K ELBOW METHOD

set.seed(123)

# Hitung WSS
wss_values <- sapply(1:10, function(k) {
  kmeans(df_scaled, centers = k, nstart = 25, iter.max = 100)$tot.withinss
})

# Print tabel WSS + penurunan
cat("Tabel Elbow Method\n")

## Tabel Elbow Method

elbow_table <- data.frame(
  K             = 1:10,
  WSS           = round(wss_values, 2),
  Penurunan_WSS = c(NA, round(-diff(wss_values), 2))
)
print(elbow_table)

##     K     WSS Penurunan_WSS
## 1   1 7776.00            NA
## 2   2 6548.08       1227.92
## 3   3 5820.66        727.42
## 4   4 5324.85        495.81
## 5   5 5031.16        293.69
## 6   6 4780.81        250.35
## 7   7 4572.87        207.94
## 8   8 4388.75        184.12
## 9   9 4221.88        166.87
## 10 10 4070.08        151.80

# Titik penurunan paling drastis
K_OPTIMAL <- which.max(-diff(wss_values)) + 1
cat("\nK Optimal (Elbow):", K_OPTIMAL, "\n")

## 
## K Optimal (Elbow): 2

# Plot Elbow
plot(1:10, wss_values,
     type = "b", pch = 19,
     col = "#E74C3C",
     frame = FALSE,
     xlab = "Jumlah Cluster (K)",
     ylab = "Total Within-Cluster SS",
     main = "Elbow Method - Penentuan K Optimal")
abline(v = K_OPTIMAL, lty = 2, col = "gray40")
text(K_OPTIMAL, max(wss_values) * 0.95,
     paste("K =", K_OPTIMAL),
     col = "#E74C3C", font = 2)

CLUSTERING - 1. K-MEANS

## CLUSTERING - 1. K-MEANS
set.seed(123)

km_res <- kmeans(df_scaled, centers = K_OPTIMAL, nstart = 25, iter.max = 100)

cat("K-Means\n")

## K-Means

cat("Jumlah cluster:", K_OPTIMAL, "\n")

## Jumlah cluster: 2

cat("Cluster sizes:", km_res$size, "\n")

## Cluster sizes: 413 560

cat("Total Within-SS:", round(km_res$tot.withinss, 4), "\n")

## Total Within-SS: 6548.084

cat("Between-SS / Total-SS:",
    round(km_res$betweenss / km_res$totss * 100, 2), "%\n")

## Between-SS / Total-SS: 15.79 %

# Visualisasi
fviz_cluster(km_res, data = df_scaled,
             palette = "Set2",
             geom = "point",
             ellipse.type = "convex",
             ggtheme = theme_minimal(),
             main = paste("K-Means Clustering (K =", K_OPTIMAL, ")"))

CLUSTERING - 2. K-MEDIANS

## CLUSTERING - 2. K-MEDIANS
library(flexclust)

set.seed(123)
kmed_res <- kcca(df_scaled, k = K_OPTIMAL, family = kccaFamily("kmedians"))

## Found more than one class "kcca" in cache; using the first, from namespace 'flexclust'

## Also defined by 'kernlab'

## Found more than one class "kcca" in cache; using the first, from namespace 'flexclust'

## Also defined by 'kernlab'

cat("\nK-Medians\n")

## 
## K-Medians

cat("Jumlah cluster:", K_OPTIMAL, "\n")

## Jumlah cluster: 2

cat("Cluster sizes:\n")

## Cluster sizes:

print(table(clusters(kmed_res)))

## 
##   1   2 
## 634 339

# Visualisasi
fviz_cluster(list(data = df_scaled, cluster = clusters(kmed_res)),
             palette = "Set2",
             geom = "point",
             ggtheme = theme_minimal(),
             main = paste("K-Medians Clustering (K =", K_OPTIMAL, ")"))

CLUSTERING - 3. DBSCAN

# CLUSTERING - 3. DBSCAN
library(dbscan)

# k-distance plot untuk menentukan eps
kNNdistplot(df_scaled, k = ncol(df_scaled))
abline(h = 2.5, lty = 2, col = "red")
title(main = "k-NN Distance Plot (Penentuan eps DBSCAN)")

set.seed(123)
db_res <- dbscan::dbscan(df_scaled, eps = 2.5, minPts = ncol(df_scaled))

cat("\nDBSCAN\n")

## 
## DBSCAN

print(table(db_res$cluster))

## 
##   1 
## 973

cat("Jumlah cluster (tanpa noise):", length(unique(db_res$cluster)) - 1, "\n")

## Jumlah cluster (tanpa noise): 0

cat("Jumlah noise:", sum(db_res$cluster == 0), "\n")

## Jumlah noise: 0

# Visualisasi
fviz_cluster(list(data = df_scaled, cluster = db_res$cluster),
             palette = "Set2",
             geom = "point",
             ggtheme = theme_minimal(),
             main = "DBSCAN Clustering (0 = Noise)")

CLUSTERING - 4. Mean Shift

# Mean Shift
library(meanShiftR)

X <- data.matrix(df_scaled)

bw <- rep(1.4, ncol(X))
ms_result <- meanShift(X, bandwidth = bw)

labels_ms <- ms_result$assignment

cat("\nMean Shift\n")

## 
## Mean Shift

cat("Jumlah cluster:", length(unique(labels_ms)), "\n")

## Jumlah cluster: 3

cat("Cluster sizes:\n")

## Cluster sizes:

print(table(labels_ms))

## labels_ms
##   1   2   3 
## 195 761  17

# Visualisasi
fviz_cluster(
  list(data = df_scaled, cluster = labels_ms),
  palette = "Set2",
  geom = "point",
  ggtheme = theme_minimal(),
  main = paste("Mean Shift Clustering (K =", length(unique(labels_ms)), ")")
)

CLUSTERING - 5. Fuzzy C-means

# Fuzzy C-Means
library(e1071)
library(factoextra)

X <- df_scaled

fcm_result <- cmeans(X, centers = 2, m = 2)
labels_fcm <- fcm_result$cluster

cat("\nFuzzy C-Means\n")

## 
## Fuzzy C-Means

cat("Jumlah cluster:", length(unique(labels_fcm)), "\n")

## Jumlah cluster: 2

cat("Cluster sizes:\n")

## Cluster sizes:

print(table(labels_fcm))

## labels_fcm
##   1   2 
## 520 453

# Visualisasi
fviz_cluster(
  list(data = df_scaled, cluster = labels_fcm),
  palette = "Set2",
  geom = "point",
  ggtheme = theme_minimal(),
  main = "Fuzzy C-Means Clustering (K = 2)"
)

Clustering Kelompok 12

Yuli and Vincent

2026-04-05

Implementasi Clustering