Implementasi clustering dilakukan menggunakan metode K-Means, K-Medians, DBSCAN, Mean Shift, dan Fuzzy C-Means, pada data yang telah dinormalisasi. Setiap metode digunakan untuk mengelompokkan data berdasarkan kemiripan karakteristik, kemudian hasilnya divisualisasikan untuk melihat pola cluster yang terbentuk.
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.3
## Warning: package 'ggplot2' was built under R version 4.5.3
## Warning: package 'tidyr' was built under R version 4.5.3
## Warning: package 'readr' was built under R version 4.5.3
## Warning: package 'purrr' was built under R version 4.5.3
## Warning: package 'forcats' was built under R version 4.5.3
## Warning: package 'lubridate' was built under R version 4.5.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.0 ✔ readr 2.2.0
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.2 ✔ tibble 3.3.1
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(flexclust)
## Warning: package 'flexclust' was built under R version 4.5.3
library(dbscan)
## Warning: package 'dbscan' was built under R version 4.5.3
##
## Attaching package: 'dbscan'
##
## The following object is masked from 'package:stats':
##
## as.dendrogram
library(meanShiftR)
library(e1071)
## Warning: package 'e1071' was built under R version 4.5.3
##
## Attaching package: 'e1071'
##
## The following object is masked from 'package:flexclust':
##
## bclust
##
## The following object is masked from 'package:ggplot2':
##
## element
library(cluster)
## Warning: package 'cluster' was built under R version 4.5.3
library(fpc)
## Warning: package 'fpc' was built under R version 4.5.3
##
## Attaching package: 'fpc'
##
## The following object is masked from 'package:dbscan':
##
## dbscan
library(mclust)
## Warning: package 'mclust' was built under R version 4.5.3
## Package 'mclust' version 6.1.2
## Type 'citation("mclust")' for citing this R package in publications.
##
## Attaching package: 'mclust'
##
## The following object is masked from 'package:dplyr':
##
## count
##
## The following object is masked from 'package:purrr':
##
## map
library(ggplot2)
library(GGally)
## Warning: package 'GGally' was built under R version 4.5.3
library(corrplot)
## corrplot 0.95 loaded
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.5.3
## Welcome to factoextra!
## Want to learn more? See two factoextra-related books at https://www.datanovia.com/en/product/practical-guide-to-principal-component-methods-in-r/
DATA UNDERSTANDING
df_raw <- read.csv("D:/PERKULIAHAN/Semester 4/Analisis Multivariat/Modul 3/archive/gym_members_exercise_tracking.csv")
head(df_raw)
## Age Gender Weight..kg. Height..m. Max_BPM Avg_BPM Resting_BPM
## 1 56 Male 88.3 1.71 180 157 60
## 2 46 Female 74.9 1.53 179 151 66
## 3 32 Female 68.1 1.66 167 122 54
## 4 25 Male 53.2 1.70 190 164 56
## 5 38 Male 46.1 1.79 188 158 68
## 6 56 Female 58.0 1.68 168 156 74
## Session_Duration..hours. Calories_Burned Workout_Type Fat_Percentage
## 1 1.69 1313 Yoga 12.6
## 2 1.30 883 HIIT 33.9
## 3 1.11 677 Cardio 33.4
## 4 0.59 532 Strength 28.8
## 5 0.64 556 Strength 29.2
## 6 1.59 1116 HIIT 15.5
## Water_Intake..liters. Workout_Frequency..days.week. Experience_Level BMI
## 1 3.5 4 3 30.20
## 2 2.1 4 2 32.00
## 3 2.3 4 2 24.71
## 4 2.1 3 1 18.41
## 5 2.8 3 1 14.39
## 6 2.7 5 3 20.55
str(df_raw)
## 'data.frame': 973 obs. of 15 variables:
## $ Age : int 56 46 32 25 38 56 36 40 28 28 ...
## $ Gender : chr "Male" "Female" "Female" "Male" ...
## $ Weight..kg. : num 88.3 74.9 68.1 53.2 46.1 ...
## $ Height..m. : num 1.71 1.53 1.66 1.7 1.79 1.68 1.72 1.51 1.94 1.84 ...
## $ Max_BPM : int 180 179 167 190 188 168 174 189 185 169 ...
## $ Avg_BPM : int 157 151 122 164 158 156 169 141 127 136 ...
## $ Resting_BPM : int 60 66 54 56 68 74 73 64 52 64 ...
## $ Session_Duration..hours. : num 1.69 1.3 1.11 0.59 0.64 1.59 1.49 1.27 1.03 1.08 ...
## $ Calories_Burned : num 1313 883 677 532 556 ...
## $ Workout_Type : chr "Yoga" "HIIT" "Cardio" "Strength" ...
## $ Fat_Percentage : num 12.6 33.9 33.4 28.8 29.2 15.5 21.3 30.6 28.9 29.7 ...
## $ Water_Intake..liters. : num 3.5 2.1 2.3 2.1 2.8 2.7 2.3 1.9 2.6 2.7 ...
## $ Workout_Frequency..days.week.: int 4 4 4 3 3 5 3 3 4 3 ...
## $ Experience_Level : int 3 2 2 1 1 3 2 2 2 1 ...
## $ BMI : num 30.2 32 24.7 18.4 14.4 ...
dim (df_raw)
## [1] 973 15
summary(df_raw)
## Age Gender Weight..kg. Height..m.
## Min. :18.00 Length:973 Min. : 40.00 Min. :1.500
## 1st Qu.:28.00 Class :character 1st Qu.: 58.10 1st Qu.:1.620
## Median :40.00 Mode :character Median : 70.00 Median :1.710
## Mean :38.68 Mean : 73.85 Mean :1.723
## 3rd Qu.:49.00 3rd Qu.: 86.00 3rd Qu.:1.800
## Max. :59.00 Max. :129.90 Max. :2.000
## Max_BPM Avg_BPM Resting_BPM Session_Duration..hours.
## Min. :160.0 Min. :120.0 Min. :50.00 Min. :0.500
## 1st Qu.:170.0 1st Qu.:131.0 1st Qu.:56.00 1st Qu.:1.040
## Median :180.0 Median :143.0 Median :62.00 Median :1.260
## Mean :179.9 Mean :143.8 Mean :62.22 Mean :1.256
## 3rd Qu.:190.0 3rd Qu.:156.0 3rd Qu.:68.00 3rd Qu.:1.460
## Max. :199.0 Max. :169.0 Max. :74.00 Max. :2.000
## Calories_Burned Workout_Type Fat_Percentage Water_Intake..liters.
## Min. : 303.0 Length:973 Min. :10.00 Min. :1.500
## 1st Qu.: 720.0 Class :character 1st Qu.:21.30 1st Qu.:2.200
## Median : 893.0 Mode :character Median :26.20 Median :2.600
## Mean : 905.4 Mean :24.98 Mean :2.627
## 3rd Qu.:1076.0 3rd Qu.:29.30 3rd Qu.:3.100
## Max. :1783.0 Max. :35.00 Max. :3.700
## Workout_Frequency..days.week. Experience_Level BMI
## Min. :2.000 Min. :1.00 Min. :12.32
## 1st Qu.:3.000 1st Qu.:1.00 1st Qu.:20.11
## Median :3.000 Median :2.00 Median :24.16
## Mean :3.322 Mean :1.81 Mean :24.91
## 3rd Qu.:4.000 3rd Qu.:2.00 3rd Qu.:28.56
## Max. :5.000 Max. :3.00 Max. :49.84
Data Cleaning
# Cek missing Value
cat("\nJumlah Missing Value\n")
##
## Jumlah Missing Value
colSums(is.na(df_raw))
## Age Gender
## 0 0
## Weight..kg. Height..m.
## 0 0
## Max_BPM Avg_BPM
## 0 0
## Resting_BPM Session_Duration..hours.
## 0 0
## Calories_Burned Workout_Type
## 0 0
## Fat_Percentage Water_Intake..liters.
## 0 0
## Workout_Frequency..days.week. Experience_Level
## 0 0
## BMI
## 0
# Cek duplikat
cat("\nJumlah Baris Duplikat\n")
##
## Jumlah Baris Duplikat
sum(duplicated(df_raw))
## [1] 0
# Hapus duplikat jika ada
df_clean <- df_raw[!duplicated(df_raw), ]
cat("\nDimensi setelah hapus duplikat:", dim(df_clean), "\n")
##
## Dimensi setelah hapus duplikat: 973 15
# Rename kolom supaya lebih clean (hapus spasi & simbol)
colnames(df_clean) <- c("Age", "Weight_kg", "Height_m",
"Max_BPM", "Avg_BPM", "Resting_BPM",
"Session_Duration_hours", "Fat_Percentage",
"Water_Intake_liters", "Workout_Frequency_days")
# Cek nama kolom baru
cat("\n Nama Kolom Setelah Rename\n")
##
## Nama Kolom Setelah Rename
colnames(df_clean)
## [1] "Age" "Weight_kg" "Height_m"
## [4] "Max_BPM" "Avg_BPM" "Resting_BPM"
## [7] "Session_Duration_hours" "Fat_Percentage" "Water_Intake_liters"
## [10] "Workout_Frequency_days" NA NA
## [13] NA NA NA
AMBIL FITUR NUMERIK (SEBELUM SELEKSI)
colnames(df_clean) <- c("Age", "Weight_kg", "Height_m", "Max_BPM",
"Avg_BPM", "Resting_BPM", "Session_Duration_hours",
"Calories_Burned", "Fat_Percentage", "Water_Intake_liters",
"Workout_Frequency_days", "Experience_Level", "BMI")
df_numeric <- df_clean %>%
select(Age, Weight_kg, Height_m, Max_BPM, Avg_BPM, Resting_BPM,
Session_Duration_hours, Calories_Burned, Fat_Percentage,
Water_Intake_liters, Workout_Frequency_days,
Experience_Level, BMI)
dim(df_numeric)
## [1] 973 13
str(df_numeric)
## 'data.frame': 973 obs. of 13 variables:
## $ Age : int 56 46 32 25 38 56 36 40 28 28 ...
## $ Weight_kg : chr "Male" "Female" "Female" "Male" ...
## $ Height_m : num 88.3 74.9 68.1 53.2 46.1 ...
## $ Max_BPM : num 1.71 1.53 1.66 1.7 1.79 1.68 1.72 1.51 1.94 1.84 ...
## $ Avg_BPM : int 180 179 167 190 188 168 174 189 185 169 ...
## $ Resting_BPM : int 157 151 122 164 158 156 169 141 127 136 ...
## $ Session_Duration_hours: int 60 66 54 56 68 74 73 64 52 64 ...
## $ Calories_Burned : num 1.69 1.3 1.11 0.59 0.64 1.59 1.49 1.27 1.03 1.08 ...
## $ Fat_Percentage : num 1313 883 677 532 556 ...
## $ Water_Intake_liters : chr "Yoga" "HIIT" "Cardio" "Strength" ...
## $ Workout_Frequency_days: num 12.6 33.9 33.4 28.8 29.2 15.5 21.3 30.6 28.9 29.7 ...
## $ Experience_Level : num 3.5 2.1 2.3 2.1 2.8 2.7 2.3 1.9 2.6 2.7 ...
## $ BMI : int 4 4 4 3 3 5 3 3 4 3 ...
CEK KORELASI (SEBELUM FEATURE SELECTION)
# Pastikan hanya kolom angka yang dihitung
df_numeric_fix <- df_numeric %>% select(where(is.numeric))
# Hitung correlation matrix
cor_matrix <- cor(df_numeric_fix, use = "complete.obs")
cat("Correlation Matrix\n")
## Correlation Matrix
print(round(cor_matrix, 2))
## Age Height_m Max_BPM Avg_BPM Resting_BPM
## Age 1.00 -0.04 -0.03 -0.02 0.04
## Height_m -0.04 1.00 0.37 0.06 0.01
## Max_BPM -0.03 0.37 1.00 -0.02 -0.01
## Avg_BPM -0.02 0.06 -0.02 1.00 -0.04
## Resting_BPM 0.04 0.01 -0.01 -0.04 1.00
## Session_Duration_hours 0.00 -0.03 -0.01 0.04 0.06
## Calories_Burned -0.02 -0.01 -0.01 0.01 0.02
## Fat_Percentage -0.15 0.10 0.09 0.00 0.34
## Workout_Frequency_days 0.00 -0.23 -0.24 -0.01 -0.01
## Experience_Level 0.04 0.39 0.39 0.03 0.00
## BMI 0.01 -0.01 -0.01 -0.03 -0.01
## Session_Duration_hours Calories_Burned Fat_Percentage
## Age 0.00 -0.02 -0.15
## Height_m -0.03 -0.01 0.10
## Max_BPM -0.01 -0.01 0.09
## Avg_BPM 0.04 0.01 0.00
## Resting_BPM 0.06 0.02 0.34
## Session_Duration_hours 1.00 -0.02 0.02
## Calories_Burned -0.02 1.00 0.91
## Fat_Percentage 0.02 0.91 1.00
## Workout_Frequency_days -0.02 -0.58 -0.60
## Experience_Level 0.01 0.28 0.36
## BMI -0.01 0.64 0.58
## Workout_Frequency_days Experience_Level BMI
## Age 0.00 0.04 0.01
## Height_m -0.23 0.39 -0.01
## Max_BPM -0.24 0.39 -0.01
## Avg_BPM -0.01 0.03 -0.03
## Resting_BPM -0.01 0.00 -0.01
## Session_Duration_hours -0.02 0.01 -0.01
## Calories_Burned -0.58 0.28 0.64
## Fat_Percentage -0.60 0.36 0.58
## Workout_Frequency_days 1.00 -0.59 -0.54
## Experience_Level -0.59 1.00 0.24
## BMI -0.54 0.24 1.00
cat("Correlation Matrix\n")
## Correlation Matrix
print(round(cor_matrix, 2))
## Age Height_m Max_BPM Avg_BPM Resting_BPM
## Age 1.00 -0.04 -0.03 -0.02 0.04
## Height_m -0.04 1.00 0.37 0.06 0.01
## Max_BPM -0.03 0.37 1.00 -0.02 -0.01
## Avg_BPM -0.02 0.06 -0.02 1.00 -0.04
## Resting_BPM 0.04 0.01 -0.01 -0.04 1.00
## Session_Duration_hours 0.00 -0.03 -0.01 0.04 0.06
## Calories_Burned -0.02 -0.01 -0.01 0.01 0.02
## Fat_Percentage -0.15 0.10 0.09 0.00 0.34
## Workout_Frequency_days 0.00 -0.23 -0.24 -0.01 -0.01
## Experience_Level 0.04 0.39 0.39 0.03 0.00
## BMI 0.01 -0.01 -0.01 -0.03 -0.01
## Session_Duration_hours Calories_Burned Fat_Percentage
## Age 0.00 -0.02 -0.15
## Height_m -0.03 -0.01 0.10
## Max_BPM -0.01 -0.01 0.09
## Avg_BPM 0.04 0.01 0.00
## Resting_BPM 0.06 0.02 0.34
## Session_Duration_hours 1.00 -0.02 0.02
## Calories_Burned -0.02 1.00 0.91
## Fat_Percentage 0.02 0.91 1.00
## Workout_Frequency_days -0.02 -0.58 -0.60
## Experience_Level 0.01 0.28 0.36
## BMI -0.01 0.64 0.58
## Workout_Frequency_days Experience_Level BMI
## Age 0.00 0.04 0.01
## Height_m -0.23 0.39 -0.01
## Max_BPM -0.24 0.39 -0.01
## Avg_BPM -0.01 0.03 -0.03
## Resting_BPM -0.01 0.00 -0.01
## Session_Duration_hours -0.02 0.01 -0.01
## Calories_Burned -0.58 0.28 0.64
## Fat_Percentage -0.60 0.36 0.58
## Workout_Frequency_days 1.00 -0.59 -0.54
## Experience_Level -0.59 1.00 0.24
## BMI -0.54 0.24 1.00
#Heatmap korelasi semua fitur numerik
corrplot(cor_matrix, method = "color", type = "upper")
#Identifikasi pasangan fitur HIGHLY CORRELATED (|r| > 0.8)
cat("\nPasangan Fitur dengan Korelasi Tinggi (|r| > 0.8)\n")
##
## Pasangan Fitur dengan Korelasi Tinggi (|r| > 0.8)
cor_pairs <- which(abs(cor_matrix) > 0.8 & abs(cor_matrix) < 1,
arr.ind = TRUE)
if (nrow(cor_pairs) > 0) {
for (i in 1:nrow(cor_pairs)) {
r1 <- rownames(cor_matrix)[cor_pairs[i, 1]]
r2 <- colnames(cor_matrix)[cor_pairs[i, 2]]
if (cor_pairs[i, 1] < cor_pairs[i, 2]) {
cat(sprintf(" %s <-> %s : r = %.3f\n",
r1, r2,
cor_matrix[cor_pairs[i, 1], cor_pairs[i, 2]]))
}
}
} else {
cat(" Tidak ada pasangan dengan |r| > 0.8\n")
}
## Calories_Burned <-> Fat_Percentage : r = 0.908
#FEATURE SELECTION (BERDASARKAN HASIL KORELASI)
features_selected <- c("Age", "Weight_kg", "Height_m",
"Max_BPM", "Avg_BPM", "Resting_BPM",
"Session_Duration_hours", "Fat_Percentage",
"Water_Intake_liters", "Workout_Frequency_days")
df_selected <- df_clean[, features_selected]
cat(" Fitur yang Digunakan (setelah seleksi) \n")
## Fitur yang Digunakan (setelah seleksi)
print(features_selected)
## [1] "Age" "Weight_kg" "Height_m"
## [4] "Max_BPM" "Avg_BPM" "Resting_BPM"
## [7] "Session_Duration_hours" "Fat_Percentage" "Water_Intake_liters"
## [10] "Workout_Frequency_days"
cat("\nDimensi df_selected:", dim(df_selected), "\n")
##
## Dimensi df_selected: 973 10
# Filter hanya yang angka sebelum korelasi
df_selected_numeric <- df_selected %>% select(where(is.numeric))
# Konfirmasi korelasi setelah seleksi
cor_after <- cor(df_selected_numeric, use = "complete.obs")
cat("\nCek Korelasi Tinggi Setelah Seleksi\n")
##
## Cek Korelasi Tinggi Setelah Seleksi
cor_pairs_after <- which(abs(cor_after) > 0.8 & abs(cor_after) < 1,
arr.ind = TRUE)
if (nrow(cor_pairs_after) > 0) {
for (i in 1:nrow(cor_pairs_after)) {
r1 <- rownames(cor_after)[cor_pairs_after[i, 1]]
r2 <- colnames(cor_after)[cor_pairs_after[i, 2]]
if (cor_pairs_after[i, 1] < cor_pairs_after[i, 2]) {
cat(sprintf(" %s <-> %s : r = %.3f\n",
r1, r2,
cor_after[cor_pairs_after[i, 1],
cor_pairs_after[i, 2]]))
}
}
} else {
cat(" Tidak ada lagi pasangan dengan |r| > 0.8\n")
}
## Tidak ada lagi pasangan dengan |r| > 0.8
# Heatmap korelasi setelah seleksi
corrplot(cor_after, method = "color", type = "upper")
DETEKSI & HANDLING OUTLIER (IQR Method)
cat("Deteksi Outlier per Fitur (IQR Method)\n")
## Deteksi Outlier per Fitur (IQR Method)
# Fungsi deteksi outlier menggunakan IQR
count_outliers <- function(x) {
Q1 <- quantile(x, 0.25, na.rm = TRUE)
Q3 <- quantile(x, 0.75, na.rm = TRUE)
IQR_val <- Q3 - Q1
lower <- Q1 - 1.5 * IQR_val
upper <- Q3 + 1.5 * IQR_val
sum(x < lower | x > upper, na.rm = TRUE)
}
# Pastikan hanya kolom numerik yang dihitung
outlier_counts <- sapply(df_selected[, sapply(df_selected, is.numeric)], count_outliers)
print(outlier_counts)
## Age Height_m Max_BPM
## 0 9 0
## Avg_BPM Resting_BPM Session_Duration_hours
## 0 0 0
## Fat_Percentage Workout_Frequency_days
## 10 0
# Visualisasi Boxplot untuk deteksi outlier
# Filter hanya fitur yang benar-benar numerik
numeric_features <- features_selected[sapply(df_selected[features_selected], is.numeric)]
# Sesuaikan layout (mfrow) agar tidak terlalu kosong jika jumlah kolom berkurang
par(mfrow = c(2, 5), mar = c(4, 3, 2, 1))
for (col in numeric_features) {
boxplot(df_selected[[col]],
main = col,
col = "#4ECDC4",
border = "#2C3E50",
horizontal = FALSE)
}
par(mfrow = c(1, 1))
# Handling outlier: Capping (Winsorization) lebih aman dari hapus row
# Outlier diganti dengan batas bawah / atas IQR
winsorize <- function(x) {
Q1 <- quantile(x, 0.25, na.rm = TRUE)
Q3 <- quantile(x, 0.75, na.rm = TRUE)
IQR_val <- Q3 - Q1
lower <- Q1 - 1.5 * IQR_val
upper <- Q3 + 1.5 * IQR_val
x[x < lower] <- lower
x[x > upper] <- upper
return(x)
}
# Gunakan lapply yang mengecek tipe data terlebih dahulu
df_no_outlier <- as.data.frame(lapply(df_selected, function(x) {
if(is.numeric(x)) {
return(winsorize(x))
} else {
return(x) # Jika bukan angka, kembalikan apa adanya tanpa di-winsorize
}
}))
cat("\nOutlier Setelah Winsorization\n")
##
## Outlier Setelah Winsorization
# Cek outlier hanya pada kolom yang numerik saja agar tidak error lagi
numeric_cols <- sapply(df_no_outlier, is.numeric)
print(sapply(df_no_outlier[, numeric_cols], count_outliers))
## Age Height_m Max_BPM
## 0 0 0
## Avg_BPM Resting_BPM Session_Duration_hours
## 0 0 0
## Fat_Percentage Workout_Frequency_days
## 0 0
EXPLORATORY DATA ANALYSIS (EDA)
#Statistik Deskriptif
cat(" Statistik Deskriptif\n")
## Statistik Deskriptif
print(summary(df_no_outlier))
## Age Weight_kg Height_m Max_BPM
## Min. :18.00 Length:973 Min. : 40.00 Min. :1.500
## 1st Qu.:28.00 Class :character 1st Qu.: 58.10 1st Qu.:1.620
## Median :40.00 Mode :character Median : 70.00 Median :1.710
## Mean :38.68 Mean : 73.85 Mean :1.723
## 3rd Qu.:49.00 3rd Qu.: 86.00 3rd Qu.:1.800
## Max. :59.00 Max. :127.85 Max. :2.000
## Avg_BPM Resting_BPM Session_Duration_hours Fat_Percentage
## Min. :160.0 Min. :120.0 Min. :50.00 Min. : 303.0
## 1st Qu.:170.0 1st Qu.:131.0 1st Qu.:56.00 1st Qu.: 720.0
## Median :180.0 Median :143.0 Median :62.00 Median : 893.0
## Mean :179.9 Mean :143.8 Mean :62.22 Mean : 904.6
## 3rd Qu.:190.0 3rd Qu.:156.0 3rd Qu.:68.00 3rd Qu.:1076.0
## Max. :199.0 Max. :169.0 Max. :74.00 Max. :1610.0
## Water_Intake_liters Workout_Frequency_days
## Length:973 Min. :10.00
## Class :character 1st Qu.:21.30
## Mode :character Median :26.20
## Mean :24.98
## 3rd Qu.:29.30
## Max. :35.00
# Distribusi Setiap Fitur (Histogram) - Hanya untuk kolom Angka
df_long <- df_no_outlier %>%
select(where(is.numeric)) %>%
pivot_longer(everything(), names_to = "Variable", values_to = "Value")
# Lanjut ke ggplot
ggplot(df_long, aes(x = Value)) +
geom_histogram(bins = 30, fill = "#4ECDC4", color = "white") +
facet_wrap(~Variable, scales = "free") +
theme_minimal() +
labs(title = "Distribusi Fitur Numerik (Setelah Winsorization)")
ggplot(df_long, aes(x = Value, fill = Variable)) +
geom_histogram(bins = 30, color = "white", show.legend = FALSE) +
facet_wrap(~Variable, scales = "free") +
scale_fill_brewer(palette = "Set3") +
labs(title = "Distribusi Setiap Fitur (Setelah Outlier Handling)",
x = "Nilai", y = "Frekuensi") +
theme_minimal()
#Pairplot (subset 5 fitur agar tidak terlalu berat)
ggpairs(df_no_outlier[, c("Age", "Weight_kg", "Fat_Percentage",
"Session_Duration_hours", "Avg_BPM")],
title = "Pairplot - Fitur Utama Gym Dataset",
lower = list(continuous = "smooth"),
diag = list(continuous = "densityDiag")) +
theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
DATA PREPARATION - SCALING (df_scaled)
# Standarisasi Z-score hanya untuk fitur numerik
# Kita filter dulu kolom yang benar-benar angka
df_numeric_only <- df_no_outlier %>% select(where(is.numeric))
# Lakukan scaling
df_scaled <- as.data.frame(scale(df_numeric_only))
# Ubah nama Fitur menjadi X1 sampai X 10
colnames(df_scaled) <- paste0("x", 1:ncol(df_scaled))
cat("Cek df_scaled (mean harus ~0, sd harus ~1)\n")
## Cek df_scaled (mean harus ~0, sd harus ~1)
cat("\nMean per kolom:\n")
##
## Mean per kolom:
print(round(colMeans(df_scaled), 4))
## x1 x2 x3 x4 x5 x6 x7 x8
## 0 0 0 0 0 0 0 0
cat("\nSD per kolom:\n")
##
## SD per kolom:
print(round(apply(df_scaled, 2, sd), 4))
## x1 x2 x3 x4 x5 x6 x7 x8
## 1 1 1 1 1 1 1 1
cat("\nDimensi df_scaled:", dim(df_scaled), "\n")
##
## Dimensi df_scaled: 973 8
head(df_scaled)
## x1 x2 x3 x4 x5 x6
## 1 1.42161147 0.68224629 -0.09849406 0.01007625 0.9224960 -0.3033989
## 2 0.60065595 0.04974144 -1.50782814 -0.07668648 0.5042348 0.5154835
## 3 -0.54868178 -0.27123117 -0.48997575 -1.11783927 -1.5173612 -1.1222812
## 4 -1.12335065 -0.97453881 -0.17679039 0.87770357 1.4104675 -0.8493204
## 5 -0.05610847 -1.30967197 0.52787664 0.70417810 0.9922062 0.7884443
## 6 1.42161147 -0.74796990 -0.33338307 -1.03107653 0.8527858 1.6073266
## x7 x8
## 1 1.51004885 -1.9773038
## 2 -0.08000631 1.4255680
## 3 -0.84175367 1.3456884
## 4 -1.37793506 0.6107959
## 5 -1.28918780 0.6746996
## 6 0.78158172 -1.5140020
cat("Cek df_scaled (mean harus ~0, sd harus ~1)\n")
## Cek df_scaled (mean harus ~0, sd harus ~1)
cat("\nMean per kolom:\n")
##
## Mean per kolom:
print(round(colMeans(df_scaled), 4))
## x1 x2 x3 x4 x5 x6 x7 x8
## 0 0 0 0 0 0 0 0
cat("\nSD per kolom:\n")
##
## SD per kolom:
print(round(apply(df_scaled, 2, sd), 4))
## x1 x2 x3 x4 x5 x6 x7 x8
## 1 1 1 1 1 1 1 1
PENENTUAN JUMLAH K ELBOW METHOD
set.seed(123)
# Hitung WSS
wss_values <- sapply(1:10, function(k) {
kmeans(df_scaled, centers = k, nstart = 25, iter.max = 100)$tot.withinss
})
# Print tabel WSS + penurunan
cat("Tabel Elbow Method\n")
## Tabel Elbow Method
elbow_table <- data.frame(
K = 1:10,
WSS = round(wss_values, 2),
Penurunan_WSS = c(NA, round(-diff(wss_values), 2))
)
print(elbow_table)
## K WSS Penurunan_WSS
## 1 1 7776.00 NA
## 2 2 6548.08 1227.92
## 3 3 5820.66 727.42
## 4 4 5324.85 495.81
## 5 5 5031.16 293.69
## 6 6 4780.81 250.35
## 7 7 4572.87 207.94
## 8 8 4388.75 184.12
## 9 9 4221.88 166.87
## 10 10 4070.08 151.80
# Titik penurunan paling drastis
K_OPTIMAL <- which.max(-diff(wss_values)) + 1
cat("\nK Optimal (Elbow):", K_OPTIMAL, "\n")
##
## K Optimal (Elbow): 2
# Plot Elbow
plot(1:10, wss_values,
type = "b", pch = 19,
col = "#E74C3C",
frame = FALSE,
xlab = "Jumlah Cluster (K)",
ylab = "Total Within-Cluster SS",
main = "Elbow Method - Penentuan K Optimal")
abline(v = K_OPTIMAL, lty = 2, col = "gray40")
text(K_OPTIMAL, max(wss_values) * 0.95,
paste("K =", K_OPTIMAL),
col = "#E74C3C", font = 2)
CLUSTERING - 1. K-MEANS
## CLUSTERING - 1. K-MEANS
set.seed(123)
km_res <- kmeans(df_scaled, centers = K_OPTIMAL, nstart = 25, iter.max = 100)
cat("K-Means\n")
## K-Means
cat("Jumlah cluster:", K_OPTIMAL, "\n")
## Jumlah cluster: 2
cat("Cluster sizes:", km_res$size, "\n")
## Cluster sizes: 413 560
cat("Total Within-SS:", round(km_res$tot.withinss, 4), "\n")
## Total Within-SS: 6548.084
cat("Between-SS / Total-SS:",
round(km_res$betweenss / km_res$totss * 100, 2), "%\n")
## Between-SS / Total-SS: 15.79 %
# Visualisasi
fviz_cluster(km_res, data = df_scaled,
palette = "Set2",
geom = "point",
ellipse.type = "convex",
ggtheme = theme_minimal(),
main = paste("K-Means Clustering (K =", K_OPTIMAL, ")"))
CLUSTERING - 2. K-MEDIANS
## CLUSTERING - 2. K-MEDIANS
library(flexclust)
set.seed(123)
kmed_res <- kcca(df_scaled, k = K_OPTIMAL, family = kccaFamily("kmedians"))
## Found more than one class "kcca" in cache; using the first, from namespace 'flexclust'
## Also defined by 'kernlab'
## Found more than one class "kcca" in cache; using the first, from namespace 'flexclust'
## Also defined by 'kernlab'
cat("\nK-Medians\n")
##
## K-Medians
cat("Jumlah cluster:", K_OPTIMAL, "\n")
## Jumlah cluster: 2
cat("Cluster sizes:\n")
## Cluster sizes:
print(table(clusters(kmed_res)))
##
## 1 2
## 634 339
# Visualisasi
fviz_cluster(list(data = df_scaled, cluster = clusters(kmed_res)),
palette = "Set2",
geom = "point",
ggtheme = theme_minimal(),
main = paste("K-Medians Clustering (K =", K_OPTIMAL, ")"))
CLUSTERING - 3. DBSCAN
# CLUSTERING - 3. DBSCAN
library(dbscan)
# k-distance plot untuk menentukan eps
kNNdistplot(df_scaled, k = ncol(df_scaled))
abline(h = 2.5, lty = 2, col = "red")
title(main = "k-NN Distance Plot (Penentuan eps DBSCAN)")
set.seed(123)
db_res <- dbscan::dbscan(df_scaled, eps = 2.5, minPts = ncol(df_scaled))
cat("\nDBSCAN\n")
##
## DBSCAN
print(table(db_res$cluster))
##
## 1
## 973
cat("Jumlah cluster (tanpa noise):", length(unique(db_res$cluster)) - 1, "\n")
## Jumlah cluster (tanpa noise): 0
cat("Jumlah noise:", sum(db_res$cluster == 0), "\n")
## Jumlah noise: 0
# Visualisasi
fviz_cluster(list(data = df_scaled, cluster = db_res$cluster),
palette = "Set2",
geom = "point",
ggtheme = theme_minimal(),
main = "DBSCAN Clustering (0 = Noise)")
CLUSTERING - 4. Mean Shift
# Mean Shift
library(meanShiftR)
X <- data.matrix(df_scaled)
bw <- rep(1.4, ncol(X))
ms_result <- meanShift(X, bandwidth = bw)
labels_ms <- ms_result$assignment
cat("\nMean Shift\n")
##
## Mean Shift
cat("Jumlah cluster:", length(unique(labels_ms)), "\n")
## Jumlah cluster: 3
cat("Cluster sizes:\n")
## Cluster sizes:
print(table(labels_ms))
## labels_ms
## 1 2 3
## 195 761 17
# Visualisasi
fviz_cluster(
list(data = df_scaled, cluster = labels_ms),
palette = "Set2",
geom = "point",
ggtheme = theme_minimal(),
main = paste("Mean Shift Clustering (K =", length(unique(labels_ms)), ")")
)
CLUSTERING - 5. Fuzzy C-means
# Fuzzy C-Means
library(e1071)
library(factoextra)
X <- df_scaled
fcm_result <- cmeans(X, centers = 2, m = 2)
labels_fcm <- fcm_result$cluster
cat("\nFuzzy C-Means\n")
##
## Fuzzy C-Means
cat("Jumlah cluster:", length(unique(labels_fcm)), "\n")
## Jumlah cluster: 2
cat("Cluster sizes:\n")
## Cluster sizes:
print(table(labels_fcm))
## labels_fcm
## 1 2
## 520 453
# Visualisasi
fviz_cluster(
list(data = df_scaled, cluster = labels_fcm),
palette = "Set2",
geom = "point",
ggtheme = theme_minimal(),
main = "Fuzzy C-Means Clustering (K = 2)"
)