Metode: K-Means & Fuzzy C-Means (FCM) Dataset: Student Performance and Learning Behavior Sumber: https://www.kaggle.com/datasets/adilshamim8/student-performance-and-learning-style
packages <- c("tidyverse", "cluster", "factoextra", "ppclust",
"ggplot2", "corrplot", "clValid", "gridExtra", "reshape2")
install_if_missing <- function(pkg) {
if (!require(pkg, character.only = TRUE)) {
install.packages(pkg, dependencies = TRUE)
library(pkg, character.only = TRUE)
}
}
invisible(sapply(packages, install_if_missing))
## Loading required package: tidyverse
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.0 ✔ readr 2.2.0
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.2 ✔ tibble 3.3.1
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: cluster
##
## Loading required package: factoextra
##
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
##
## Loading required package: ppclust
## Warning: package 'ppclust' was built under R version 4.5.3
## Loading required package: corrplot
## corrplot 0.95 loaded
## Loading required package: clValid
## Warning: package 'clValid' was built under R version 4.5.3
## Loading required package: gridExtra
##
## Attaching package: 'gridExtra'
##
## The following object is masked from 'package:dplyr':
##
## combine
##
## Loading required package: reshape2
##
## Attaching package: 'reshape2'
##
## The following object is masked from 'package:tidyr':
##
## smiths
df_raw <- read.csv("C:/Users/Fadhila Zulfa/Downloads/student_performance.csv", stringsAsFactors = FALSE)
cat("STRUKTUR DATA\n")
## STRUKTUR DATA
str(df_raw)
## 'data.frame': 14003 obs. of 16 variables:
## $ StudyHours : int 19 19 19 19 19 19 19 19 19 19 ...
## $ Attendance : int 64 64 64 64 64 64 64 64 64 64 ...
## $ Resources : int 1 1 1 1 1 1 0 0 0 1 ...
## $ Extracurricular : int 0 0 0 1 1 1 1 1 1 1 ...
## $ Motivation : int 0 0 0 0 0 0 0 0 0 1 ...
## $ Internet : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Gender : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Age : int 19 23 28 19 23 28 19 23 28 19 ...
## $ LearningStyle : int 2 3 1 2 3 1 2 3 1 2 ...
## $ OnlineCourses : int 8 16 19 8 16 19 8 16 19 8 ...
## $ Discussions : int 1 0 0 1 0 0 1 0 0 1 ...
## $ AssignmentCompletion: int 59 90 67 59 90 67 59 90 67 59 ...
## $ ExamScore : int 40 66 99 40 66 99 40 66 99 40 ...
## $ EduTech : int 0 0 1 0 0 1 0 0 1 0 ...
## $ StressLevel : int 1 1 1 1 1 1 1 1 1 1 ...
## $ FinalGrade : int 3 2 0 3 2 0 3 2 0 3 ...
cat("\n 6 BARIS PERTAMA\n")
##
## 6 BARIS PERTAMA
print(head(df_raw))
## StudyHours Attendance Resources Extracurricular Motivation Internet Gender
## 1 19 64 1 0 0 1 0
## 2 19 64 1 0 0 1 0
## 3 19 64 1 0 0 1 0
## 4 19 64 1 1 0 1 0
## 5 19 64 1 1 0 1 0
## 6 19 64 1 1 0 1 0
## Age LearningStyle OnlineCourses Discussions AssignmentCompletion ExamScore
## 1 19 2 8 1 59 40
## 2 23 3 16 0 90 66
## 3 28 1 19 0 67 99
## 4 19 2 8 1 59 40
## 5 23 3 16 0 90 66
## 6 28 1 19 0 67 99
## EduTech StressLevel FinalGrade
## 1 0 1 3
## 2 0 1 2
## 3 1 1 0
## 4 0 1 3
## 5 0 1 2
## 6 1 1 0
cat("\n STATISTIK DESKRIPTIF\n")
##
## STATISTIK DESKRIPTIF
print(summary(df_raw))
## StudyHours Attendance Resources Extracurricular
## Min. : 5.00 Min. : 60.00 Min. :0.000 Min. :0.0000
## 1st Qu.:16.00 1st Qu.: 70.00 1st Qu.:1.000 1st Qu.:0.0000
## Median :20.00 Median : 80.00 Median :1.000 Median :1.0000
## Mean :19.99 Mean : 80.19 Mean :1.104 Mean :0.5942
## 3rd Qu.:24.00 3rd Qu.: 90.00 3rd Qu.:2.000 3rd Qu.:1.0000
## Max. :44.00 Max. :100.00 Max. :2.000 Max. :1.0000
## Motivation Internet Gender Age
## Min. :0.0000 Min. :0.0000 Min. :0.000 Min. :18.00
## 1st Qu.:0.0000 1st Qu.:1.0000 1st Qu.:0.000 1st Qu.:20.00
## Median :1.0000 Median :1.0000 Median :1.000 Median :24.00
## Mean :0.9058 Mean :0.9255 Mean :0.552 Mean :23.53
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.000 3rd Qu.:27.00
## Max. :2.0000 Max. :1.0000 Max. :1.000 Max. :29.00
## LearningStyle OnlineCourses Discussions AssignmentCompletion
## Min. :0.000 Min. : 0.000 Min. :0.0000 Min. : 50.0
## 1st Qu.:1.000 1st Qu.: 5.000 1st Qu.:0.0000 1st Qu.: 62.0
## Median :2.000 Median :10.000 Median :1.0000 Median : 74.0
## Mean :1.515 Mean : 9.892 Mean :0.6059 Mean : 74.5
## 3rd Qu.:3.000 3rd Qu.:15.000 3rd Qu.:1.0000 3rd Qu.: 87.0
## Max. :3.000 Max. :20.000 Max. :1.0000 Max. :100.0
## ExamScore EduTech StressLevel FinalGrade
## Min. : 40.00 Min. :0.0000 Min. :0.000 Min. :0.000
## 1st Qu.: 55.00 1st Qu.:0.0000 1st Qu.:1.000 1st Qu.:0.000
## Median : 70.00 Median :1.0000 Median :2.000 Median :1.000
## Mean : 70.35 Mean :0.7091 Mean :1.304 Mean :1.448
## 3rd Qu.: 86.00 3rd Qu.:1.0000 3rd Qu.:2.000 3rd Qu.:2.000
## Max. :100.00 Max. :1.0000 Max. :2.000 Max. :3.000
cat("\nMISSING VALUES\n")
##
## MISSING VALUES
missing_count <- colSums(is.na(df_raw))
if (sum(missing_count) == 0) {
cat("tidak ada missing values!\n")
} else {
print(missing_count[missing_count > 0])
}
## tidak ada missing values!
fitur_numerik <- c("Age", "StudyHours", "Attendance",
"AssignmentCompletion", "OnlineCourses",
"Discussions", "Resources",
"Motivation", "StressLevel", "ExamScore")
df_num <- na.omit(df_raw[, fitur_numerik])
df_scaled <- as.data.frame(scale(df_num))
cat("\n Fitur yang digunakan:\n")
##
## Fitur yang digunakan:
cat(paste(" •", fitur_numerik, collapse = "\n"), "\n")
## • Age
## • StudyHours
## • Attendance
## • AssignmentCompletion
## • OnlineCourses
## • Discussions
## • Resources
## • Motivation
## • StressLevel
## • ExamScore
cat("Data dinormalisasi dengan Z-score\n")
## Data dinormalisasi dengan Z-score
cor_matrix <- cor(df_num)
par(mar = c(2, 2, 3, 2))
corrplot(cor_matrix,
method = "color",
type = "upper",
addCoef.col = "black",
tl.col = "black",
tl.srt = 45,
number.cex = 0.7,
title = "Korelasi Antar Fitur",
mar = c(0, 0, 2, 0))
par(mar = c(5, 4, 4, 2))
# Elbow Method
wss_values <- sapply(1:10, function(k) {
kmeans(df_scaled, centers = k, nstart = 25, iter.max = 100)$tot.withinss
})
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 700150)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 700150)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 700150)
par(mar = c(5, 5, 4, 2))
plot(1:10, wss_values,
type = "b", pch = 19, col = "#E74C3C", lwd = 2,
xlab = "Jumlah Cluster (k)",
ylab = "Total Within-Cluster SS",
main = "Elbow Method")
abline(v = 3, lty = 2, col = "blue", lwd = 1.5)
text(3.3, max(wss_values) * 0.9, "k=3", col = "blue")
# Silhouette Score
sil_values <- sapply(2:10, function(k) {
km <- kmeans(df_scaled, centers = k, nstart = 25)
sil <- silhouette(km$cluster, dist(df_scaled))
mean(sil[, 3])
})
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 700150)
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
par(mar = c(5, 5, 4, 2))
plot(2:10, sil_values,
type = "b", pch = 19, col = "#2ECC71", lwd = 2,
xlab = "Jumlah Cluster (k)",
ylab = "Rata-rata Silhouette Score",
main = "Silhouette Method")
k_optimal <- which.max(sil_values) + 1
abline(v = k_optimal, lty = 2, col = "darkgreen", lwd = 1.5)
text(k_optimal + 0.2, max(sil_values) * 0.98,
paste0("k=", k_optimal), col = "darkgreen")
cat("k optimal =", k_optimal, "\n")
## k optimal = 2
set.seed(42)
kmeans_result <- kmeans(df_scaled,
centers = k_optimal,
nstart = 25,
iter.max = 300)
df_num$Cluster_KMeans <- as.factor(kmeans_result$cluster)
cat("K-Means selesai!\n")
## K-Means selesai!
cat(" Anggota per cluster:\n")
## Anggota per cluster:
print(table(kmeans_result$cluster))
##
## 1 2
## 8484 5519
cat(sprintf(" Rasio BSS/TSS : %.2f%%\n",
kmeans_result$betweenss / kmeans_result$totss * 100))
## Rasio BSS/TSS : 10.03%
# Cluster plot (PCA biplot)
print(
fviz_cluster(kmeans_result,
data = df_scaled,
geom = "point",
ellipse = TRUE,
ellipse.type = "convex",
palette = c("#E74C3C","#3498DB","#2ECC71","#F39C12","#9B59B6"),
ggtheme = theme_bw(),
main = paste("K-Means Clustering (k =", k_optimal, ")"))
)
# Silhouette plot
sil_km <- silhouette(kmeans_result$cluster, dist(df_scaled))
print(
fviz_silhouette(sil_km,
palette = c("#E74C3C","#3498DB","#2ECC71","#F39C12","#9B59B6"),
ggtheme = theme_bw(),
main = "Silhouette Plot - K-Means")
)
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## ℹ The deprecated feature was likely used in the factoextra package.
## Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## cluster size ave.sil.width
## 1 1 8484 0.11
## 2 2 5519 0.11
set.seed(42)
fcm_result <- ppclust::fcm(df_scaled, centers = k_optimal, nstart = 5)
df_num$Cluster_FCM <- as.factor(fcm_result$cluster)
cat(" Anggota per cluster:\n")
## Anggota per cluster:
print(table(fcm_result$cluster))
##
## 1 2
## 6979 7024
membership_df <- as.data.frame(fcm_result$u)
colnames(membership_df) <- paste0("Cluster_", 1:k_optimal)
cat("\n Derajat Keanggotaan (6 baris pertama):\n")
##
## Derajat Keanggotaan (6 baris pertama):
print(round(head(membership_df), 4))
## Cluster_1 Cluster_2
## 1 0.5 0.5
## 2 0.5 0.5
## 3 0.5 0.5
## 4 0.5 0.5
## 5 0.5 0.5
## 6 0.5 0.5
# Cluster plot FCM
print(
fviz_cluster(list(data = df_scaled, cluster = fcm_result$cluster),
geom = "point",
ellipse = TRUE,
ellipse.type = "convex",
palette = c("#E74C3C","#3498DB","#2ECC71","#F39C12","#9B59B6"),
ggtheme = theme_bw(),
main = paste("Fuzzy C-Means Clustering (c =", k_optimal, ")"))
)
# Heatmap derajat keanggotaan (50 sampel acak)
set.seed(42)
idx_sample <- sample(1:nrow(membership_df), min(50, nrow(membership_df)))
mem_melt <- melt(as.matrix(membership_df[idx_sample, ]))
colnames(mem_melt) <- c("Siswa", "Cluster", "Derajat")
print(
ggplot(mem_melt, aes(x = Cluster, y = factor(Siswa), fill = Derajat)) +
geom_tile(color = "white") +
scale_fill_gradient(low = "#EBF5FB", high = "#1A5276") +
labs(title = "Heatmap Derajat Keanggotaan FCM (50 Sampel)",
x = "Cluster",
y = "Siswa",
fill = "Derajat") +
theme_bw() +
theme(axis.text.y = element_blank(),
axis.ticks.y = element_blank(),
plot.title = element_text(face = "bold", hjust = 0.5))
)
# PROFIL KARAKTERISTIK CLUSTER
cat("\n PROFIL KARAKTERISTIK CLUSTER (K-Means):\n")
##
## PROFIL KARAKTERISTIK CLUSTER (K-Means):
cat(strrep("=", 60), "\n")
## ============================================================
profil_cluster <- df_num %>%
group_by(Cluster_KMeans) %>%
summarise(
N = n(),
Rata_StudyHours = round(mean(StudyHours), 2),
Rata_Attendance = round(mean(Attendance), 2),
Rata_Assignment = round(mean(AssignmentCompletion), 2),
Rata_Motivation = round(mean(Motivation), 2),
Rata_StressLevel = round(mean(StressLevel), 2),
Rata_ExamScore = round(mean(ExamScore), 2),
.groups = "drop"
)
print(profil_cluster)
## # A tibble: 2 × 8
## Cluster_KMeans N Rata_StudyHours Rata_Attendance Rata_Assignment
## <fct> <int> <dbl> <dbl> <dbl>
## 1 1 8484 20.1 80.1 74.5
## 2 2 5519 19.8 80.3 74.6
## # ℹ 3 more variables: Rata_Motivation <dbl>, Rata_StressLevel <dbl>,
## # Rata_ExamScore <dbl>
profil_sorted <- profil_cluster[order(profil_cluster$Rata_ExamScore,
decreasing = TRUE), ]
label_map <- c("Siswa Berprestasi Tinggi",
"Siswa Berprestasi Sedang",
"Siswa Berprestasi Rendah",
"Kelompok 4", "Kelompok 5")[1:k_optimal]
cat("\n🏷️ INTERPRETASI CLUSTER:\n")
##
## 🏷️ INTERPRETASI CLUSTER:
for (i in 1:k_optimal) {
cat(sprintf(" Cluster %s → %s (Rata ExamScore: %.1f)\n",
profil_sorted$Cluster_KMeans[i],
label_map[i],
profil_sorted$Rata_ExamScore[i]))
}
## Cluster 2 → Siswa Berprestasi Tinggi (Rata ExamScore: 71.0)
## Cluster 1 → Siswa Berprestasi Sedang (Rata ExamScore: 69.9)
# Boxplot ExamScore per cluster
print(
ggplot(df_num, aes(x = Cluster_KMeans, y = ExamScore,
fill = Cluster_KMeans)) +
geom_boxplot(alpha = 0.8, outlier.colour = "red", outlier.size = 1.5) +
scale_fill_brewer(palette = "Set2") +
labs(title = "Distribusi ExamScore per Cluster (K-Means)",
x = "Cluster",
y = "Exam Score") +
theme_bw() +
theme(legend.position = "none",
plot.title = element_text(face = "bold", hjust = 0.5))
)
# Barplot profil rata-rata per cluster
profil_long <- profil_cluster %>%
select(Cluster_KMeans, Rata_StudyHours, Rata_Attendance,
Rata_Motivation, Rata_StressLevel) %>%
pivot_longer(-Cluster_KMeans, names_to = "Variabel", values_to = "Nilai")
print(
ggplot(profil_long, aes(x = Cluster_KMeans, y = Nilai,
fill = Cluster_KMeans)) +
geom_col(width = 0.6, alpha = 0.85) +
facet_wrap(~ Variabel, scales = "free_y", nrow = 2) +
scale_fill_brewer(palette = "Set2") +
labs(title = "Profil Rata-rata Setiap Cluster",
x = "Cluster",
y = "Nilai Rata-rata") +
theme_bw() +
theme(legend.position = "bottom",
plot.title = element_text(face = "bold", hjust = 0.5),
strip.text = element_text(face = "bold"))
)
avg_sil_km <- mean(silhouette(kmeans_result$cluster, dist(df_scaled))[, 3])
avg_sil_fcm <- mean(silhouette(fcm_result$cluster, dist(df_scaled))[, 3])
dunn_km <- clValid::dunn(dist(df_scaled), kmeans_result$cluster)
dunn_fcm <- clValid::dunn(dist(df_scaled), fcm_result$cluster)
cat("\n EVALUASI CLUSTERING:\n")
##
## EVALUASI CLUSTERING:
cat(strrep("=", 60), "\n")
## ============================================================
cat(sprintf(" %-35s : %.4f\n", "Silhouette Score K-Means", avg_sil_km))
## Silhouette Score K-Means : 0.1081
cat(sprintf(" %-35s : %.4f\n", "Silhouette Score FCM", avg_sil_fcm))
## Silhouette Score FCM : 0.0703
cat(sprintf(" %-35s : %.4f\n", "Dunn Index K-Means", dunn_km))
## Dunn Index K-Means : 0.2323
cat(sprintf(" %-35s : %.4f\n", "Dunn Index FCM", dunn_fcm))
## Dunn Index FCM : 0.0479
best_method <- ifelse(avg_sil_km > avg_sil_fcm, "K-Means", "Fuzzy C-Means")
cat(sprintf("\n Metode terbaik (Silhouette): %s\n", best_method))
##
## Metode terbaik (Silhouette): K-Means
eval_df <- data.frame(
Metode = rep(c("K-Means", "FCM"), 2),
Metrik = c("Silhouette", "Silhouette", "Dunn Index", "Dunn Index"),
Nilai = c(avg_sil_km, avg_sil_fcm, dunn_km, dunn_fcm)
)
print(
ggplot(eval_df, aes(x = Metode, y = Nilai, fill = Metode)) +
geom_col(width = 0.5, alpha = 0.85) +
facet_wrap(~ Metrik, scales = "free_y") +
scale_fill_manual(values = c("#3498DB", "#E67E22")) +
labs(title = "Perbandingan Evaluasi: K-Means vs Fuzzy C-Means",
x = "Metode",
y = "Nilai Indeks") +
theme_bw() +
theme(legend.position = "none",
plot.title = element_text(face = "bold", hjust = 0.5),
strip.text = element_text(face = "bold"))
)
# RINGKASAN AKHIR
cat("\n")
cat(" RINGKASAN HASIL CLUSTERING\n")
## RINGKASAN HASIL CLUSTERING
cat(sprintf(" Dataset : Student Performance & Learning Behavior\n"))
## Dataset : Student Performance & Learning Behavior
cat(sprintf(" Total Observasi : %d\n", nrow(df_num)))
## Total Observasi : 14003
cat(sprintf(" Jumlah Fitur : %d\n", length(fitur_numerik)))
## Jumlah Fitur : 10
cat(sprintf(" Cluster Optimal : k = %d\n", k_optimal))
## Cluster Optimal : k = 2
cat(sprintf(" Sil. K-Means : %.4f\n", avg_sil_km))
## Sil. K-Means : 0.1081
cat(sprintf(" Sil. FCM : %.4f\n", avg_sil_fcm))
## Sil. FCM : 0.0703
cat(sprintf(" Metode Terbaik : %s\n", best_method))
## Metode Terbaik : K-Means
summary(cars)
## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00
Note that the echo = FALSE parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.