Dataset: https://www.kaggle.com/datasets/aljarah/xAPI-Edu-Data
Variabel Dependen (Y): Class — Ordinal
dengan tiga kategori: L < M < H (L = Low, M = Middle,
H = High)
Variabel Independen (X):
| Variabel | Tipe | Keterangan |
|---|---|---|
raisedhands |
Numerik | Jumlah kali siswa mengangkat tangan di kelas |
AnnouncementsView |
Numerik | Jumlah kali siswa melihat pengumuman |
Discussion |
Numerik | Jumlah kali siswa berpartisipasi dalam diskusi |
gender |
Kategorik (M / F) | Jenis kelamin siswa |
StageID |
Kategorik (lowerlevel / MiddleSchool / HighSchool) | Jenjang pendidikan siswa |
Semester |
Kategorik (F / S) | Semester (First / Second) |
ParentAnsweringSurvey |
Kategorik (Yes / No) | Orang tua mengisi survei |
ParentschoolSatisfaction |
Kategorik (Good / Bad) | Kepuasan orang tua terhadap sekolah |
StudentAbsenceDays |
Kategorik (Under-7 / Above-7) | Jumlah hari absen siswa |
Tujuan: Memodelkan faktor-faktor yang memengaruhi tingkat performa siswa menggunakan Ordinal Logistic Regression.
# install.packages(c("MASS", "brant", "car", "dplyr", "tidyr", "ggplot2"), quiet = TRUE)
library(MASS)
library(brant)
library(car)
library(dplyr)
library(tidyr)
library(ggplot2)
df <- read.csv("xAPI-Edu-Data.csv")
head(df, 10)
| gender | NationalITy | PlaceofBirth | StageID | GradeID | SectionID | Topic | Semester | Relation | raisedhands | VisITedResources | AnnouncementsView | Discussion | ParentAnsweringSurvey | ParentschoolSatisfaction | StudentAbsenceDays | Class |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| M | KW | KuwaIT | lowerlevel | G-04 | A | IT | F | Father | 15 | 16 | 2 | 20 | Yes | Good | Under-7 | M |
| M | KW | KuwaIT | lowerlevel | G-04 | A | IT | F | Father | 20 | 20 | 3 | 25 | Yes | Good | Under-7 | M |
| M | KW | KuwaIT | lowerlevel | G-04 | A | IT | F | Father | 10 | 7 | 0 | 30 | No | Bad | Above-7 | L |
| M | KW | KuwaIT | lowerlevel | G-04 | A | IT | F | Father | 30 | 25 | 5 | 35 | No | Bad | Above-7 | L |
| M | KW | KuwaIT | lowerlevel | G-04 | A | IT | F | Father | 40 | 50 | 12 | 50 | No | Bad | Above-7 | M |
| F | KW | KuwaIT | lowerlevel | G-04 | A | IT | F | Father | 42 | 30 | 13 | 70 | Yes | Bad | Above-7 | M |
| M | KW | KuwaIT | MiddleSchool | G-07 | A | Math | F | Father | 35 | 12 | 0 | 17 | No | Bad | Above-7 | L |
| M | KW | KuwaIT | MiddleSchool | G-07 | A | Math | F | Father | 50 | 10 | 15 | 22 | Yes | Good | Under-7 | M |
| F | KW | KuwaIT | MiddleSchool | G-07 | A | Math | F | Father | 12 | 21 | 16 | 50 | Yes | Good | Under-7 | M |
| F | KW | KuwaIT | MiddleSchool | G-07 | B | IT | F | Father | 70 | 80 | 25 | 70 | Yes | Good | Under-7 | M |
str(df)
## 'data.frame': 480 obs. of 17 variables:
## $ gender : chr "M" "M" "M" "M" ...
## $ NationalITy : chr "KW" "KW" "KW" "KW" ...
## $ PlaceofBirth : chr "KuwaIT" "KuwaIT" "KuwaIT" "KuwaIT" ...
## $ StageID : chr "lowerlevel" "lowerlevel" "lowerlevel" "lowerlevel" ...
## $ GradeID : chr "G-04" "G-04" "G-04" "G-04" ...
## $ SectionID : chr "A" "A" "A" "A" ...
## $ Topic : chr "IT" "IT" "IT" "IT" ...
## $ Semester : chr "F" "F" "F" "F" ...
## $ Relation : chr "Father" "Father" "Father" "Father" ...
## $ raisedhands : int 15 20 10 30 40 42 35 50 12 70 ...
## $ VisITedResources : int 16 20 7 25 50 30 12 10 21 80 ...
## $ AnnouncementsView : int 2 3 0 5 12 13 0 15 16 25 ...
## $ Discussion : int 20 25 30 35 50 70 17 22 50 70 ...
## $ ParentAnsweringSurvey : chr "Yes" "Yes" "No" "No" ...
## $ ParentschoolSatisfaction: chr "Good" "Good" "Bad" "Bad" ...
## $ StudentAbsenceDays : chr "Under-7" "Under-7" "Above-7" "Above-7" ...
## $ Class : chr "M" "M" "L" "L" ...
summary(df)
## gender NationalITy PlaceofBirth StageID
## Length:480 Length:480 Length:480 Length:480
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## GradeID SectionID Topic Semester
## Length:480 Length:480 Length:480 Length:480
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## Relation raisedhands VisITedResources AnnouncementsView
## Length:480 Min. : 0.00 Min. : 0.0 Min. : 0.00
## Class :character 1st Qu.: 15.75 1st Qu.:20.0 1st Qu.:14.00
## Mode :character Median : 50.00 Median :65.0 Median :33.00
## Mean : 46.77 Mean :54.8 Mean :37.92
## 3rd Qu.: 75.00 3rd Qu.:84.0 3rd Qu.:58.00
## Max. :100.00 Max. :99.0 Max. :98.00
## Discussion ParentAnsweringSurvey ParentschoolSatisfaction
## Min. : 1.00 Length:480 Length:480
## 1st Qu.:20.00 Class :character Class :character
## Median :39.00 Mode :character Mode :character
## Mean :43.28
## 3rd Qu.:70.00
## Max. :99.00
## StudentAbsenceDays Class
## Length:480 Length:480
## Class :character Class :character
## Mode :character Mode :character
##
##
##
colSums(is.na(df))
## gender NationalITy PlaceofBirth
## 0 0 0
## StageID GradeID SectionID
## 0 0 0
## Topic Semester Relation
## 0 0 0
## raisedhands VisITedResources AnnouncementsView
## 0 0 0
## Discussion ParentAnsweringSurvey ParentschoolSatisfaction
## 0 0 0
## StudentAbsenceDays Class
## 0 0
df %>%
select(gender, StageID, Semester, ParentAnsweringSurvey,
ParentschoolSatisfaction, StudentAbsenceDays, Class) %>%
pivot_longer(cols = everything()) %>%
ggplot(aes(x = value)) +
geom_bar(fill = "orange", color = "white") +
facet_wrap(~name, scales = "free") +
theme_minimal(base_size = 12) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Distribusi Variabel Kategorik", x = NULL, y = "Frekuensi")
df %>%
select(raisedhands, AnnouncementsView, Discussion) %>%
pivot_longer(cols = everything()) %>%
ggplot(aes(x = value)) +
geom_histogram(fill = "steelblue", color = "white", bins = 20) +
facet_wrap(~name, scales = "free") +
theme_minimal(base_size = 12) +
labs(title = "Distribusi Variabel Numerik", x = NULL, y = "Frekuensi")
df %>%
select(raisedhands, AnnouncementsView, Discussion, Class) %>%
mutate(Class = factor(Class, levels = c("L", "M", "H"))) %>%
pivot_longer(cols = -Class) %>%
ggplot(aes(x = Class, y = value, fill = Class)) +
geom_boxplot(alpha = 0.7) +
facet_wrap(~name, scales = "free_y") +
scale_fill_manual(values = c("L" = "#e74c3c", "M" = "#f39c12", "H" = "#2ecc71")) +
theme_minimal(base_size = 12) +
labs(title = "Distribusi Variabel Numerik per Kelas Performa",
x = "Kelas (L < M < H)", y = "Nilai", fill = "Kelas")
ordinaldf <- df[, c("Class", "raisedhands", "AnnouncementsView", "Discussion",
"gender", "StageID", "Semester",
"ParentAnsweringSurvey", "ParentschoolSatisfaction",
"StudentAbsenceDays")]
ordinaldf$Class <- factor(
ordinaldf$Class,
levels = c("L", "M", "H"),
ordered = TRUE
)
ordinaldf$gender <- factor(
ordinaldf$gender,
levels = c("M", "F")
)
ordinaldf$StageID <- factor(
ordinaldf$StageID,
levels = c("lowerlevel", "MiddleSchool", "HighSchool")
)
ordinaldf$Semester <- factor(
ordinaldf$Semester,
levels = c("F", "S")
)
ordinaldf$ParentAnsweringSurvey <- factor(
ordinaldf$ParentAnsweringSurvey,
levels = c("No", "Yes")
)
ordinaldf$ParentschoolSatisfaction <- factor(
ordinaldf$ParentschoolSatisfaction,
levels = c("Bad", "Good")
)
ordinaldf$StudentAbsenceDays <- factor(
ordinaldf$StudentAbsenceDays,
levels = c("Under-7", "Above-7")
)
cat("Dimensi data:", dim(ordinaldf), "\n")
## Dimensi data: 480 10
head(ordinaldf)
| Class | raisedhands | AnnouncementsView | Discussion | gender | StageID | Semester | ParentAnsweringSurvey | ParentschoolSatisfaction | StudentAbsenceDays |
|---|---|---|---|---|---|---|---|---|---|
| M | 15 | 2 | 20 | M | lowerlevel | F | Yes | Good | Under-7 |
| M | 20 | 3 | 25 | M | lowerlevel | F | Yes | Good | Under-7 |
| L | 10 | 0 | 30 | M | lowerlevel | F | No | Bad | Above-7 |
| L | 30 | 5 | 35 | M | lowerlevel | F | No | Bad | Above-7 |
| M | 40 | 12 | 50 | M | lowerlevel | F | No | Bad | Above-7 |
| M | 42 | 13 | 70 | F | lowerlevel | F | Yes | Bad | Above-7 |
Catatan: Dalam Ordinal Logistic Regression, asumsi yang wajib dipenuhi adalah Tidak Ada Multikolinearitas. Asumsi lainnya (variabel dependen ordinal, independensi observasi, dan tidak ada outlier) bersifat opsional diujikan sebagai kelengkapan analisis, namun tidak menjadi syarat mutlak.
Asumsi ini mensyaratkan bahwa variabel dependen harus berupa data ordinal, yaitu data kategorik yang memiliki urutan atau tingkatan yang bermakna antar kategorinya.
cat("Class:", class(ordinaldf$Class), "\n")
## Class: ordered factor
cat("Is ordered:", is.ordered(ordinaldf$Class), "\n")
## Is ordered: TRUE
cat("Levels:", paste(levels(ordinaldf$Class), collapse = " < "), "\n\n")
## Levels: L < M < H
tbl <- table(ordinaldf$Class)
data.frame(
Kategori = names(tbl),
Frekuensi = as.integer(tbl),
Proporsi = paste0(round(prop.table(tbl) * 100, 2), "%")
)
| Kategori | Frekuensi | Proporsi |
|---|---|---|
| L | 127 | 26.46% |
| M | 211 | 43.96% |
| H | 142 | 29.58% |
Terpenuhi —
Classmerupakan ordered factor dengan urutanL < M < H. Variabel ini mencerminkan tingkatan performa siswa, sehingga memenuhi syarat sebagai variabel dependen ordinal.
Multikolinearitas terjadi ketika dua atau lebih variabel independen memiliki korelasi yang tinggi satu sama lain. Kondisi ini dapat menyebabkan estimasi koefisien menjadi tidak stabil dan interpretasi menjadi sulit.
Karena model ini mengandung campuran variabel numerik dan
kategorik, fungsi vif() dari package
car secara otomatis menghitung Generalized VIF
(GVIF) untuk variabel kategorik dan VIF biasa untuk variabel
numerik. Nilai GVIF mentah tidak dapat dibandingkan
langsung dengan threshold 5 atau 10, karena nilainya dipengaruhi oleh
jumlah derajat bebas (Df) tiap variabel. Nilai yang tepat untuk
dievaluasi adalah kolom GVIF^(1/(2*Df))
yaitu nilai yang sudah disesuaikan dan setara dengan √VIF pada variabel
numerik.
Kriteria yang digunakan:
Nilai GVIF^(1/(2*Df)) |
Interpretasi |
|---|---|
| < √5 ≈ 2.236 | Aman, tidak ada multikolinearitas |
| √5 s.d. √10 (2.236 – 3.162) | Peringatan, perlu diperhatikan |
| > √10 ≈ 3.162 | Multikolinearitas serius |
ordinaldf$Y_num <- as.numeric(ordinaldf$Class)
model_ols <- lm(
Y_num ~ raisedhands + AnnouncementsView + Discussion +
gender + StageID + Semester +
ParentAnsweringSurvey + ParentschoolSatisfaction + StudentAbsenceDays,
data = ordinaldf
)
# Tampilkan tabel GVIF lengkap
gvif_full <- vif(model_ols)
print(round(gvif_full, 4))
## GVIF Df GVIF^(1/(2*Df))
## raisedhands 2.0596 1 1.4351
## AnnouncementsView 2.1379 1 1.4622
## Discussion 1.3036 1 1.1418
## gender 1.0844 1 1.0414
## StageID 1.1297 2 1.0310
## Semester 1.1410 1 1.0682
## ParentAnsweringSurvey 1.6065 1 1.2675
## ParentschoolSatisfaction 1.5261 1 1.2354
## StudentAbsenceDays 1.3427 1 1.1588
# Ekstrak kolom GVIF^(1/(2*Df)) — kolom ke-3 untuk kategorik, sqrt(VIF) untuk numerik
if (is.matrix(gvif_full)) {
gvif_adj <- gvif_full[, 3]
} else {
gvif_adj <- sqrt(gvif_full)
}
ordinaldf$Y_num <- NULL
ordinaldf$Y_num <- as.numeric(ordinaldf$Class)
model_ols_viz <- lm(
Y_num ~ raisedhands + AnnouncementsView + Discussion +
gender + StageID + Semester +
ParentAnsweringSurvey + ParentschoolSatisfaction + StudentAbsenceDays,
data = ordinaldf
)
gvif_vals <- vif(model_ols_viz)
ordinaldf$Y_num <- NULL
if (is.matrix(gvif_vals)) {
gvif_adj_vals <- gvif_vals[, 3]
vif_df <- data.frame(
Variabel = rownames(gvif_vals),
GVIF_adj = as.numeric(gvif_adj_vals)
)
} else {
vif_df <- data.frame(
Variabel = names(gvif_vals),
GVIF_adj = as.numeric(sqrt(gvif_vals))
)
}
thr_warn <- sqrt(5)
thr_crit <- sqrt(10)
ggplot(vif_df, aes(x = reorder(Variabel, GVIF_adj),
y = GVIF_adj,
fill = GVIF_adj > thr_warn)) +
geom_col(show.legend = FALSE) +
geom_hline(yintercept = thr_warn, linetype = "dashed", color = "orange", linewidth = 0.8) +
geom_hline(yintercept = thr_crit, linetype = "dashed", color = "red", linewidth = 0.8) +
annotate("text", x = 0.6, y = thr_warn + 0.03,
label = paste0("\u221a5 = ", round(thr_warn, 3), " (peringatan)"),
color = "orange", hjust = 0, size = 3.5, vjust = 0) +
annotate("text", x = 0.6, y = thr_crit + 0.03,
label = paste0("\u221a10 = ", round(thr_crit, 3), " (kritis)"),
color = "red", hjust = 0, size = 3.5, vjust = 0) +
scale_fill_manual(values = c("FALSE" = "steelblue", "TRUE" = "tomato")) +
coord_flip() +
theme_minimal(base_size = 12) +
labs(title = "Generalized VIF \u2014 GVIF\u00b9\u141f\u00b2\u1d30\u1da0 per Variabel",
subtitle = "Nilai yang tepat untuk variabel campuran numerik & kategorik (setara \u221aVIF)",
x = "Variabel", y = "GVIF^(1/(2*Df))")
Terpenuhi — Seluruh variabel berada di bawah threshold kritis √10 ≈ 3.162. Variabel numerik (
raisedhands,AnnouncementsView,Discussion) memiliki nilai VIF yang rendah, demikian pula seluruh variabel kategorik, sehingga tidak terdapat multikolinearitas yang serius. Ini adalah asumsi wajib dan terkonfirmasi terpenuhi.
Outlier ekstrem dapat memengaruhi estimasi model secara signifikan. Deteksi outlier dilakukan menggunakan Cook’s Distance pada model OLS bantu, observasi dengan Cook’s Distance > 4/n dianggap berpengaruh besar.
ordinaldf$Y_num <- as.numeric(ordinaldf$Class)
model_ols2 <- lm(
Y_num ~ raisedhands + AnnouncementsView + Discussion +
gender + StageID + Semester +
ParentAnsweringSurvey + ParentschoolSatisfaction + StudentAbsenceDays,
data = ordinaldf
)
n <- nrow(ordinaldf)
cooksd <- cooks.distance(model_ols2)
threshold <- 4 / n
data.frame(
Threshold = round(threshold, 6),
N_Berpengaruh = sum(cooksd > threshold, na.rm = TRUE),
Pct_Berpengaruh = paste0(round(mean(cooksd > threshold, na.rm = TRUE) * 100, 2), "%"),
Cook_Max = round(max(cooksd, na.rm = TRUE), 6)
)
| Threshold | N_Berpengaruh | Pct_Berpengaruh | Cook_Max |
|---|---|---|---|
| 0.008333 | 24 | 5% | 0.030143 |
ordinaldf$Y_num <- NULL
ordinaldf$Y_num <- as.numeric(ordinaldf$Class)
model_ols3 <- lm(
Y_num ~ raisedhands + AnnouncementsView + Discussion +
gender + StageID + Semester +
ParentAnsweringSurvey + ParentschoolSatisfaction + StudentAbsenceDays,
data = ordinaldf
)
n3 <- nrow(ordinaldf)
cooksd3 <- cooks.distance(model_ols3)
thresh3 <- 4 / n3
ordinaldf$Y_num <- NULL
plot(cooksd3, type = "h",
main = "Cook's Distance untuk Deteksi Outlier / Influential Observation",
ylab = "Cook's Distance",
xlab = "Indeks Observasi",
col = ifelse(cooksd3 > thresh3, "tomato", "steelblue"))
abline(h = thresh3, col = "red", lwd = 2, lty = 2)
legend("topright",
legend = c("Normal", "Berpengaruh (> 4/n)", "Threshold (4/n)"),
col = c("steelblue", "tomato", "red"),
lty = c(1, 1, 2),
lwd = 2,
cex = 0.85)
Perlu perhatian — Terdapat sejumlah observasi yang melampaui threshold Cook’s Distance (4/n). Hal ini wajar terjadi pada data kategorik dengan variasi nilai yang terbatas, karena observasi dalam kelompok kecil cenderung memiliki leverage lebih tinggi. Selama proporsinya tidak terlalu besar dan tidak ada satu titik pun yang nilainya sangat jauh melampaui yang lain, model masih dapat digunakan dan diinterpretasikan.
model_polr <- polr(
Class ~ raisedhands + AnnouncementsView + Discussion +
gender + StageID + Semester +
ParentAnsweringSurvey + ParentschoolSatisfaction + StudentAbsenceDays,
data = ordinaldf,
method = "logistic",
Hess = TRUE
)
summary(model_polr)
## Call:
## polr(formula = Class ~ raisedhands + AnnouncementsView + Discussion +
## gender + StageID + Semester + ParentAnsweringSurvey + ParentschoolSatisfaction +
## StudentAbsenceDays, data = ordinaldf, Hess = TRUE, method = "logistic")
##
## Coefficients:
## Value Std. Error t value
## raisedhands 0.035831 0.005276 6.7917
## AnnouncementsView 0.016207 0.005763 2.8123
## Discussion 0.003448 0.004413 0.7813
## genderF 0.852891 0.236393 3.6079
## StageIDMiddleSchool -0.636233 0.241628 -2.6331
## StageIDHighSchool -0.184949 0.453196 -0.4081
## SemesterS 0.181573 0.231126 0.7856
## ParentAnsweringSurveyYes 1.138743 0.271505 4.1942
## ParentschoolSatisfactionGood 0.556481 0.268851 2.0699
## StudentAbsenceDaysAbove-7 -3.136844 0.322074 -9.7395
##
## Intercepts:
## Value Std. Error t value
## L|M -0.2710 0.3950 -0.6860
## M|H 4.4919 0.4634 9.6932
##
## Residual Deviance: 550.9474
## AIC: 574.9474
model_null <- polr(
Class ~ 1,
data = ordinaldf,
method = "logistic",
Hess = TRUE
)
H₀: Semua koefisien prediktor = 0
H₁: Minimal satu koefisien ≠ 0
Tolak H₀ jika p-value < 0.05
LL_full <- as.numeric(logLik(model_polr))
LL_null <- as.numeric(logLik(model_null))
G2 <- -2 * (LL_null - LL_full)
df_lrt <- length(coef(model_polr))
p_lrt <- pchisq(G2, df = df_lrt, lower.tail = FALSE)
data.frame(
Statistik = round(G2, 4),
df = df_lrt,
p_value = format(p_lrt, scientific = TRUE, digits = 4),
Keputusan = ifelse(p_lrt < 0.05, "TOLAK H0", "GAGAL TOLAK H0"),
Kesimpulan = ifelse(p_lrt < 0.05,
"Model signifikan secara serentak",
"Model tidak signifikan secara serentak")
)
| Statistik | df | p_value | Keputusan | Kesimpulan |
|---|---|---|---|---|
| 479.5247 | 10 | 1.044e-96 | TOLAK H0 | Model signifikan secara serentak |
H₀: Koefisien variabel ke-j = 0
H₁: Koefisien variabel ke-j ≠ 0
Tolak H₀ jika p-value < 0.05
coef_tbl <- coef(summary(model_polr))
z_vals <- coef_tbl[, "t value"]
p_vals <- 2 * pnorm(abs(z_vals), lower.tail = FALSE)
wald_result <- data.frame(
Variabel = rownames(coef_tbl),
Koefisien = round(coef_tbl[, "Value"], 4),
Std_Error = round(coef_tbl[, "Std. Error"], 4),
z_value = round(z_vals, 4),
p_value = round(p_vals, 6),
Sig = case_when(
p_vals < 0.001 ~ "***",
p_vals < 0.01 ~ "**",
p_vals < 0.05 ~ "*",
p_vals < 0.1 ~ ".",
TRUE ~ "ns"
)
)
wald_result
| Variabel | Koefisien | Std_Error | z_value | p_value | Sig | |
|---|---|---|---|---|---|---|
| raisedhands | raisedhands | 0.0358 | 0.0053 | 6.7917 | 0.000000 | *** |
| AnnouncementsView | AnnouncementsView | 0.0162 | 0.0058 | 2.8123 | 0.004918 | ** |
| Discussion | Discussion | 0.0034 | 0.0044 | 0.7813 | 0.434654 | ns |
| genderF | genderF | 0.8529 | 0.2364 | 3.6079 | 0.000309 | *** |
| StageIDMiddleSchool | StageIDMiddleSchool | -0.6362 | 0.2416 | -2.6331 | 0.008461 | ** |
| StageIDHighSchool | StageIDHighSchool | -0.1849 | 0.4532 | -0.4081 | 0.683201 | ns |
| SemesterS | SemesterS | 0.1816 | 0.2311 | 0.7856 | 0.432100 | ns |
| ParentAnsweringSurveyYes | ParentAnsweringSurveyYes | 1.1387 | 0.2715 | 4.1942 | 0.000027 | *** |
| ParentschoolSatisfactionGood | ParentschoolSatisfactionGood | 0.5565 | 0.2689 | 2.0699 | 0.038466 | * |
| StudentAbsenceDaysAbove-7 | StudentAbsenceDaysAbove-7 | -3.1368 | 0.3221 | -9.7395 | 0.000000 | *** |
| L|M | L|M | -0.2710 | 0.3950 | -0.6860 | 0.492687 | ns |
| M|H | M|H | 4.4919 | 0.4634 | 9.6932 | 0.000000 | *** |
| McFadden R² | Interpretasi |
|---|---|
| 0.00 – 0.10 | Lemah |
| 0.10 – 0.20 | Cukup |
| 0.20 – 0.40 | Baik |
| > 0.40 | Sangat Baik |
n <- nrow(ordinaldf)
r2_mcfadden <- 1 - (LL_full / LL_null)
r2_cox <- 1 - exp((2 / n) * (LL_null - LL_full))
r2_nag <- r2_cox / (1 - exp((2 / n) * LL_null))
data.frame(
Metrik = c("McFadden Pseudo R²", "Cox & Snell R²", "Nagelkerke R²",
"AIC Full Model", "AIC Null Model", "Residual Deviance"),
Nilai = round(c(r2_mcfadden, r2_cox, r2_nag,
AIC(model_polr), AIC(model_null),
model_polr$deviance), 4)
)
| Metrik | Nilai |
|---|---|
| McFadden Pseudo R² | 0.4653 |
| Cox & Snell R² | 0.6318 |
| Nagelkerke R² | 0.7153 |
| AIC Full Model | 574.9474 |
| AIC Null Model | 1034.4721 |
| Residual Deviance | 550.9474 |
ordinaldf$pred_class <- predict(model_polr, newdata = ordinaldf, type = "class")
head(data.frame(
Aktual = ordinaldf$Class,
Prediksi = ordinaldf$pred_class
), 10)
| Aktual | Prediksi |
|---|---|
| M | M |
| M | M |
| L | L |
| L | L |
| M | L |
| M | M |
| L | L |
| M | M |
| M | M |
| M | H |
prob_pred <- predict(model_polr, newdata = ordinaldf, type = "probs")
head(round(prob_pred, 4), 10)
## L M H
## 1 0.0688 0.8276 0.1036
## 2 0.0564 0.8185 0.1251
## 3 0.9171 0.0821 0.0008
## 4 0.8305 0.1678 0.0017
## 5 0.7438 0.2533 0.0029
## 6 0.2530 0.7224 0.0246
## 7 0.8993 0.0997 0.0010
## 8 0.0311 0.7585 0.2104
## 9 0.0455 0.8025 0.1521
## 10 0.0048 0.3554 0.6398
cm <- table(Aktual = ordinaldf$Class, Prediksi = ordinaldf$pred_class)
print(cm)
## Prediksi
## Aktual L M H
## L 105 22 0
## M 26 143 42
## H 0 38 104
akurasi <- sum(diag(cm)) / sum(cm)
cat(sprintf("Akurasi Model: %.2f%%\n\n", akurasi * 100))
## Akurasi Model: 73.33%
metrics <- do.call(rbind, lapply(rownames(cm), function(kelas) {
tp <- cm[kelas, kelas]
fp <- sum(cm[, kelas]) - tp
fn <- sum(cm[kelas, ]) - tp
prec <- ifelse((tp + fp) > 0, tp / (tp + fp), NA)
rec <- ifelse((tp + fn) > 0, tp / (tp + fn), NA)
data.frame(Kelas = kelas, Precision = round(prec, 3), Recall = round(rec, 3))
}))
print(metrics)
## Kelas Precision Recall
## 1 L 0.802 0.827
## 2 M 0.704 0.678
## 3 H 0.712 0.732
cm_df <- as.data.frame(cm)
ggplot(cm_df, aes(x = Prediksi, y = Aktual, fill = Freq)) +
geom_tile(color = "white") +
geom_text(aes(label = Freq), size = 7, fontface = "bold") +
scale_fill_gradient(low = "#f0f7ff", high = "#2171b5") +
theme_minimal(base_size = 13) +
labs(title = "Confusion Matrix — Ordinal Logistic Regression",
x = "Prediksi", y = "Aktual", fill = "Jumlah")
or_vals <- exp(coef(model_polr))
se_vals <- sqrt(diag(vcov(model_polr))[names(coef(model_polr))])
or_tbl <- data.frame(
Variabel = names(or_vals),
OR = round(or_vals, 4),
CI_Lower = round(exp(coef(model_polr) - 1.96 * se_vals), 4),
CI_Upper = round(exp(coef(model_polr) + 1.96 * se_vals), 4)
)
or_tbl
| Variabel | OR | CI_Lower | CI_Upper | |
|---|---|---|---|---|
| raisedhands | raisedhands | 1.0365 | 1.0258 | 1.0473 |
| AnnouncementsView | AnnouncementsView | 1.0163 | 1.0049 | 1.0279 |
| Discussion | Discussion | 1.0035 | 0.9948 | 1.0122 |
| genderF | genderF | 2.3464 | 1.4763 | 3.7293 |
| StageIDMiddleSchool | StageIDMiddleSchool | 0.5293 | 0.3296 | 0.8499 |
| StageIDHighSchool | StageIDHighSchool | 0.8311 | 0.3419 | 2.0204 |
| SemesterS | SemesterS | 1.1991 | 0.7623 | 1.8862 |
| ParentAnsweringSurveyYes | ParentAnsweringSurveyYes | 3.1228 | 1.8342 | 5.3169 |
| ParentschoolSatisfactionGood | ParentschoolSatisfactionGood | 1.7445 | 1.0300 | 2.9548 |
| StudentAbsenceDaysAbove-7 | StudentAbsenceDaysAbove-7 | 0.0434 | 0.0231 | 0.0816 |
ggplot(or_tbl, aes(x = OR, y = reorder(Variabel, OR))) +
geom_point(size = 3.5, color = "steelblue") +
geom_errorbarh(aes(xmin = CI_Lower, xmax = CI_Upper), height = 0.25, color = "steelblue") +
geom_vline(xintercept = 1, linetype = "dashed", color = "red", linewidth = 0.8) +
theme_minimal(base_size = 12) +
labs(title = "Odds Ratio dengan 95% Confidence Interval",
subtitle = "Garis merah = OR 1 (tidak ada efek)",
x = "Odds Ratio", y = "Variabel")
# Buat grid nilai StudentAbsenceDays
grid <- data.frame(
StudentAbsenceDays = factor(c("Under-7", "Above-7"),
levels = c("Under-7", "Above-7")),
gender = factor("M", levels = levels(ordinaldf$gender)),
StageID = factor("MiddleSchool", levels = levels(ordinaldf$StageID)),
Semester = factor("F", levels = levels(ordinaldf$Semester)),
ParentAnsweringSurvey = factor("Yes", levels = levels(ordinaldf$ParentAnsweringSurvey)),
ParentschoolSatisfaction= factor("Good", levels = levels(ordinaldf$ParentschoolSatisfaction)),
raisedhands = 50,
AnnouncementsView = 38,
Discussion = 43
)
# Prediksi probabilitas
prob <- predict(model_polr, newdata = grid, type = "probs")
# Gabungkan ke data
prob_df <- cbind(grid, prob)
# Ubah ke long format
prob_long <- prob_df %>%
pivot_longer(cols = c("L", "M", "H"),
names_to = "Kategori",
values_to = "Probabilitas") %>%
mutate(Kategori = factor(Kategori, levels = c("L", "M", "H")))
# Plot bar
ggplot(prob_long, aes(x = StudentAbsenceDays, y = Probabilitas, fill = Kategori)) +
geom_bar(stat = "identity", position = "dodge") +
scale_fill_manual(values = c("L" = "#e74c3c", "M" = "#f39c12", "H" = "#2ecc71")) +
theme_minimal(base_size = 13) +
labs(
title = "Pengaruh Kehadiran Siswa terhadap Probabilitas Kelas Performa",
x = "Student Absence Days",
y = "Probabilitas",
fill = "Kategori"
)
# Buat grid nilai raisedhands
grid_rh <- data.frame(
raisedhands = seq(0, 100, by = 10),
AnnouncementsView = 38,
Discussion = 43,
gender = factor("M", levels = levels(ordinaldf$gender)),
StageID = factor("MiddleSchool", levels = levels(ordinaldf$StageID)),
Semester = factor("F", levels = levels(ordinaldf$Semester)),
ParentAnsweringSurvey = factor("Yes", levels = levels(ordinaldf$ParentAnsweringSurvey)),
ParentschoolSatisfaction= factor("Good", levels = levels(ordinaldf$ParentschoolSatisfaction)),
StudentAbsenceDays = factor("Under-7", levels = levels(ordinaldf$StudentAbsenceDays))
)
# Prediksi probabilitas
prob_rh <- predict(model_polr, newdata = grid_rh, type = "probs")
prob_rh_df <- cbind(grid_rh, prob_rh) %>%
pivot_longer(cols = c("L", "M", "H"),
names_to = "Kategori",
values_to = "Probabilitas") %>%
mutate(Kategori = factor(Kategori, levels = c("L", "M", "H")))
# Plot garis
ggplot(prob_rh_df, aes(x = raisedhands, y = Probabilitas,
group = Kategori, color = Kategori)) +
geom_line(linewidth = 1.5) +
geom_point(size = 3) +
scale_color_manual(values = c("L" = "#e74c3c", "M" = "#f39c12", "H" = "#2ecc71")) +
theme_minimal(base_size = 13) +
labs(
title = "Kurva Probabilitas Ordinal Logistic Regression",
subtitle = "Pengaruh Raised Hands terhadap Kelas Performa",
x = "Raised Hands",
y = "Probabilitas",
color = "Kategori"
)
Linear Discriminant Analysis (LDA) adalah metode klasifikasi dan reduksi dimensi yang bertujuan menemukan kombinasi linear dari variabel prediktor yang terbaik memisahkan dua atau lebih kelompok (kelas). LDA mengasumsikan bahwa data dalam setiap kelas berdistribusi multivariat normal dengan matriks kovarians yang sama (homoscedasticity).
Variabel yang digunakan dalam LDA:
| Variabel | Tipe | Keterangan |
|---|---|---|
raisedhands |
Numerik | Jumlah kali siswa mengangkat tangan |
AnnouncementsView |
Numerik | Jumlah kali siswa melihat pengumuman |
Discussion |
Numerik | Jumlah kali siswa berpartisipasi dalam diskusi |
Catatan: LDA hanya menggunakan variabel numerik sebagai prediktor. Variabel kategorik tidak disertakan karena LDA mengasumsikan prediktor kontinu dan berdistribusi normal.
# install.packages(c("MVN", "heplots"), quiet = TRUE)
library(MASS) # lda()
library(MVN) # uji normalitas multivariat (Mardia, HZ)
library(heplots) # boxM() — uji homogenitas kovarians
library(ggplot2)
library(dplyr)
library(tidyr)
# Ambil variabel numerik + variabel kelas
ldadf <- df[, c("Class", "raisedhands", "AnnouncementsView", "Discussion")]
# Konversi Class ke factor (tidak ordered untuk LDA)
ldadf$Class <- factor(ldadf$Class, levels = c("L", "M", "H"))
cat("Dimensi data LDA:", dim(ldadf), "\n")
## Dimensi data LDA: 480 4
print(table(ldadf$Class))
##
## L M H
## 127 211 142
Uji normalitas multivariat dilakukan per kelas menggunakan dua metode:
Hipotesis:
vars_num <- c("raisedhands", "AnnouncementsView", "Discussion")
kelas_list <- levels(ldadf$Class)
sw_result <- do.call(rbind, lapply(kelas_list, function(k) {
sub_data <- ldadf[ldadf$Class == k, vars_num]
do.call(rbind, lapply(vars_num, function(v) {
sw <- shapiro.test(sub_data[[v]])
data.frame(
Kelas = k,
Variabel = v,
W = round(sw$statistic, 5),
p_value = round(sw$p.value, 6),
Keputusan = ifelse(sw$p.value < 0.05,
"TOLAK H0 (Tidak Normal)",
"GAGAL TOLAK H0 (Normal)")
)
}))
}))
row.names(sw_result) <- NULL
sw_result
| Kelas | Variabel | W | p_value | Keputusan |
|---|---|---|---|---|
| L | raisedhands | 0.78349 | 0.000000 | TOLAK H0 (Tidak Normal) |
| L | AnnouncementsView | 0.83685 | 0.000000 | TOLAK H0 (Tidak Normal) |
| L | Discussion | 0.88962 | 0.000000 | TOLAK H0 (Tidak Normal) |
| M | raisedhands | 0.93278 | 0.000000 | TOLAK H0 (Tidak Normal) |
| M | AnnouncementsView | 0.95819 | 0.000007 | TOLAK H0 (Tidak Normal) |
| M | Discussion | 0.93510 | 0.000000 | TOLAK H0 (Tidak Normal) |
| H | raisedhands | 0.87795 | 0.000000 | TOLAK H0 (Tidak Normal) |
| H | AnnouncementsView | 0.96101 | 0.000462 | TOLAK H0 (Tidak Normal) |
| H | Discussion | 0.92846 | 0.000001 | TOLAK H0 (Tidak Normal) |
plot_data <- ldadf %>%
pivot_longer(cols = all_of(vars_num), names_to = "Variabel", values_to = "Nilai")
ggplot(plot_data, aes(sample = Nilai, color = Class)) +
stat_qq() +
stat_qq_line(linewidth = 0.8) +
facet_grid(Class ~ Variabel, scales = "free") +
scale_color_manual(values = c("L" = "#e74c3c", "M" = "#f39c12", "H" = "#2ecc71")) +
theme_minimal(base_size = 12) +
labs(
title = "Q-Q Plot Normalitas — Per Variabel Per Kelas",
subtitle = "Titik yang mengikuti garis diagonal mengindikasikan distribusi normal",
x = "Theoretical Quantiles",
y = "Sample Quantiles",
color = "Kelas"
)
cat(" UJI NORMALITAS MULTIVARIAT PER KELAS \n")
## UJI NORMALITAS MULTIVARIAT PER KELAS
for (k in kelas_list) {
cat(paste0("Kelas: ", k, " (n=", sum(ldadf$Class == k), ")\n"))
sub_data <- ldadf[ldadf$Class == k, vars_num]
# Mardia
mvn_mardia <- tryCatch(
mvn(data = sub_data, mvnTest = "mardia", desc = FALSE),
error = function(e) NULL
)
if (!is.null(mvn_mardia)) {
cat("Mardia Test:\n")
print(mvn_mardia$multivariateNormality)
}
# Henze-Zirkler
mvn_hz <- tryCatch(
mvn(data = sub_data, mvnTest = "hz", desc = FALSE),
error = function(e) NULL
)
if (!is.null(mvn_hz)) {
cat("Henze-Zirkler Test:\n")
print(mvn_hz$multivariateNormality)
}
cat("\n")
}
## Kelas: L (n=127)
##
## Kelas: M (n=211)
##
## Kelas: H (n=142)
# Tabel ringkasan normalitas multivariat
mvn_summary <- do.call(rbind, lapply(kelas_list, function(k) {
sub_data <- ldadf[ldadf$Class == k, vars_num]
m_out <- tryCatch(mvn(data = sub_data, mvnTest = "mardia", desc = FALSE)$multivariateNormality,
error = function(e) NULL)
hz_out <- tryCatch(mvn(data = sub_data, mvnTest = "hz", desc = FALSE)$multivariateNormality,
error = function(e) NULL)
mardia_skew <- if (!is.null(m_out)) as.numeric(m_out$`p value`[m_out$Test == "Mardia Skewness"]) else NA
mardia_kurt <- if (!is.null(m_out)) as.numeric(m_out$`p value`[m_out$Test == "Mardia Kurtosis"]) else NA
hz_p <- if (!is.null(hz_out)) as.numeric(hz_out$`p value`[1]) else NA
data.frame(
Kelas = k,
n = nrow(sub_data),
Mardia_Skew_p = round(mardia_skew, 5),
Mardia_Kurt_p = round(mardia_kurt, 5),
HZ_p = round(hz_p, 5),
Status_Mardia = ifelse(!is.na(mardia_skew) & mardia_skew > 0.05 &
!is.na(mardia_kurt) & mardia_kurt > 0.05,
"Normal", "Tidak Normal"),
Status_HZ = ifelse(!is.na(hz_p) & hz_p > 0.05, "Normal", "Tidak Normal")
)
}))
row.names(mvn_summary) <- NULL
mvn_summary
| Kelas | n | Mardia_Skew_p | Mardia_Kurt_p | HZ_p | Status_Mardia | Status_HZ |
|---|---|---|---|---|---|---|
| L | 127 | NA | NA | NA | Tidak Normal | Tidak Normal |
| M | 211 | NA | NA | NA | Tidak Normal | Tidak Normal |
| H | 142 | NA | NA | NA | Tidak Normal | Tidak Normal |
Catatan: Pada dataset nyata (real-world data), terutama data survei perilaku siswa, normalitas multivariat yang sempurna jarang terpenuhi. LDA tetap cukup robust terhadap pelanggaran normalitas ringan hingga sedang, khususnya jika ukuran sampel setiap kelas cukup besar (n > 30 per kelas).
Asumsi ini mensyaratkan bahwa matriks kovarians antar kelas bersifat sama (equal covariance matrices).
boxm_test <- heplots::boxM(
cbind(raisedhands, AnnouncementsView, Discussion) ~ Class,
data = ldadf
)
print(boxm_test)
##
## Box's M-test for Homogeneity of Covariance Matrices
##
## data: ldadf
## Chi-Sq (approx.) = 64.7291, df = 12, p-value = 3.059e-09
cat("\n--- Ringkasan ---\n")
##
## --- Ringkasan ---
cat(sprintf("Chi-square : %.4f\n", boxm_test$statistic))
## Chi-square : 64.7291
cat(sprintf("df : %d\n", boxm_test$parameter))
## df : 12
cat(sprintf("p-value : %.6f\n", boxm_test$p.value))
## p-value : 0.000000
cat(sprintf("Keputusan : %s\n",
ifelse(boxm_test$p.value < 0.05,
"TOLAK H0 — Matriks kovarians TIDAK homogen",
"GAGAL TOLAK H0 — Matriks kovarians homogen")))
## Keputusan : TOLAK H0 — Matriks kovarians TIDAK homogen
ggplot(ldadf, aes(x = raisedhands, y = AnnouncementsView, color = Class)) +
geom_point(alpha = 0.5, size = 2) +
stat_ellipse(level = 0.95, linewidth = 1) +
scale_color_manual(values = c("L" = "#e74c3c", "M" = "#f39c12", "H" = "#2ecc71")) +
theme_minimal(base_size = 13) +
labs(
title = "Sebaran Titik & Elips Konsentrasi 95% per Kelas",
subtitle = "Elips yang berbeda ukuran/bentuk mengindikasikan kovarians tidak homogen",
x = "Raised Hands",
y = "Announcements View",
color = "Kelas"
)
cat("Matriks Kovarians per Kelas\n\n")
## Matriks Kovarians per Kelas
for (k in kelas_list) {
cat(paste0("Kelas ", k, " (n=", sum(ldadf$Class == k), "):\n"))
print(round(cov(ldadf[ldadf$Class == k, vars_num]), 3))
cat("\n")
}
## Kelas L (n=127):
## raisedhands AnnouncementsView Discussion
## raisedhands 296.162 76.778 -30.749
## AnnouncementsView 76.778 234.532 49.826
## Discussion -30.749 49.826 661.012
##
## Kelas M (n=211):
## raisedhands AnnouncementsView Discussion
## raisedhands 723.268 340.993 158.363
## AnnouncementsView 340.993 580.180 198.163
## Discussion 158.363 198.163 682.271
##
## Kelas H (n=142):
## raisedhands AnnouncementsView Discussion
## raisedhands 508.207 237.804 189.886
## AnnouncementsView 237.804 627.755 288.491
## Discussion 189.886 288.491 739.615
Interpretasi: Box’s M Test sangat sensitif terhadap pelanggaran normalitas dan ukuran sampel besar. Jika p-value < 0.05. Namun jika sampel cukup besar dan perbedaan kovarians tidak terlalu ekstrem, LDA masih dapat digunakan.
prior_vec <- as.vector(table(ldadf$Class) / nrow(ldadf))
model_lda <- lda(
Class ~ raisedhands + AnnouncementsView + Discussion,
data = ldadf,
prior = prior_vec,
CV = FALSE
)
cat("HASIL MODEL LDA\n\n")
## HASIL MODEL LDA
print(model_lda)
## Call:
## lda(Class ~ raisedhands + AnnouncementsView + Discussion, data = ldadf,
## prior = prior_vec, CV = FALSE)
##
## Prior probabilities of groups:
## L M H
## 0.2645833 0.4395833 0.2958333
##
## Group means:
## raisedhands AnnouncementsView Discussion
## L 16.88976 15.57480 30.83465
## M 48.93839 40.96209 43.79147
## H 70.28873 53.38028 53.66197
##
## Coefficients of linear discriminants:
## LD1 LD2
## raisedhands 0.033430141 0.02704799
## AnnouncementsView 0.013953021 -0.04912908
## Discussion 0.004163116 0.01906428
##
## Proportion of trace:
## LD1 LD2
## 0.9936 0.0064
data.frame(
Kelas = names(model_lda$prior),
Prior_Prob = round(model_lda$prior, 4),
Frekuensi = as.integer(table(ldadf$Class))
)
| Kelas | Prior_Prob | Frekuensi | |
|---|---|---|---|
| L | L | 0.2646 | 127 |
| M | M | 0.4396 | 211 |
| H | H | 0.2958 | 142 |
round(model_lda$means, 4)
## raisedhands AnnouncementsView Discussion
## L 16.8898 15.5748 30.8346
## M 48.9384 40.9621 43.7915
## H 70.2887 53.3803 53.6620
cat("Koefisien LD (Linear Discriminants):\n")
## Koefisien LD (Linear Discriminants):
round(model_lda$scaling, 4)
## LD1 LD2
## raisedhands 0.0334 0.0270
## AnnouncementsView 0.0140 -0.0491
## Discussion 0.0042 0.0191
svd_vals <- model_lda$svd
prop_var <- svd_vals^2 / sum(svd_vals^2)
cum_var <- cumsum(prop_var)
data.frame(
Fungsi_Diskriminan = paste0("LD", seq_along(svd_vals)),
Eigenvalue = round(svd_vals^2, 4),
Proporsi_Varians = paste0(round(prop_var * 100, 2), "%"),
Kumulatif = paste0(round(cum_var * 100, 2), "%")
)
| Fungsi_Diskriminan | Eigenvalue | Proporsi_Varians | Kumulatif |
|---|---|---|---|
| LD1 | 196.8862 | 99.36% | 99.36% |
| LD2 | 1.2612 | 0.64% | 100% |
Uji Wilks’ Lambda menguji apakah fungsi diskriminan secara keseluruhan signifikan memisahkan kelompok.
manova_test <- manova(
cbind(raisedhands, AnnouncementsView, Discussion) ~ Class,
data = ldadf
)
wilks_result <- summary(manova_test, test = "Wilks")
print(wilks_result)
## Df Wilks approx F num Df den Df Pr(>F)
## Class 2 0.54491 56.159 6 950 < 2.2e-16 ***
## Residuals 477
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
wl <- wilks_result$stats
p_val <- wl["Class", "Pr(>F)"]
cat(sprintf("\nWilks' Lambda : %.6f\n", wl["Class", "Wilks"]))
##
## Wilks' Lambda : 0.544908
cat(sprintf("F-approx : %.4f\n", wl["Class", "approx F"]))
## F-approx : 56.1585
cat(sprintf("p-value : %s\n", format(p_val, scientific = TRUE, digits = 4)))
## p-value : 1.875e-59
cat(sprintf("Keputusan : %s\n",
ifelse(p_val < 0.05,
"TOLAK H0 — Fungsi diskriminan signifikan memisahkan kelas",
"GAGAL TOLAK H0 — Tidak ada perbedaan signifikan antar kelas")))
## Keputusan : TOLAK H0 — Fungsi diskriminan signifikan memisahkan kelas
pred_lda <- predict(model_lda, newdata = ldadf)
ldadf$pred_class_lda <- pred_lda$class
head(data.frame(Aktual = ldadf$Class, Prediksi = ldadf$pred_class_lda), 10)
| Aktual | Prediksi |
|---|---|
| M | L |
| M | L |
| L | L |
| L | L |
| M | M |
| M | M |
| L | L |
| M | M |
| M | L |
| M | M |
ld_scores <- as.data.frame(pred_lda$x)
ld_scores$Class <- ldadf$Class
# round hanya kolom numerik (LD1, LD2), Class tetap factor
ld_num <- ld_scores[, c("LD1", "LD2")]
head(data.frame(round(ld_num, 4), Class = ld_scores$Class), 10)
| LD1 | LD2 | Class |
|---|---|---|
| -1.6603 | 0.4613 | M |
| -1.4584 | 0.6428 | M |
| -1.8138 | 0.6150 | L |
| -1.0546 | 1.0056 | L |
| -0.5602 | 1.2182 | M |
| -0.3961 | 1.6044 | M |
| -1.0321 | 1.0434 | L |
| -0.3006 | 0.8075 | M |
| -1.4404 | 0.2643 | M |
| 0.7074 | 1.7722 | M |
post_probs <- as.data.frame(pred_lda$posterior)
head(round(post_probs, 4), 10)
| L | M | H |
|---|---|---|
| 0.7319 | 0.2459 | 0.0222 |
| 0.6704 | 0.2964 | 0.0332 |
| 0.7792 | 0.2044 | 0.0164 |
| 0.5270 | 0.4034 | 0.0696 |
| 0.3352 | 0.5184 | 0.1463 |
| 0.2836 | 0.5311 | 0.1853 |
| 0.5189 | 0.4086 | 0.0724 |
| 0.2377 | 0.5702 | 0.1921 |
| 0.6541 | 0.3123 | 0.0336 |
| 0.0507 | 0.4752 | 0.4741 |
ggplot(ld_scores, aes(x = LD1, y = LD2, color = Class)) +
geom_point(alpha = 0.6, size = 2.5) +
stat_ellipse(level = 0.95, linewidth = 1) +
scale_color_manual(values = c("L" = "#e74c3c", "M" = "#f39c12", "H" = "#2ecc71")) +
theme_minimal(base_size = 13) +
labs(
title = "Skor Diskriminan LDA: LD1 vs LD2",
subtitle = "Elips konsentrasi 95% per kelas",
x = paste0("LD1 (", round(prop_var[1] * 100, 1), "% varians)"),
y = paste0("LD2 (", round(prop_var[2] * 100, 1), "% varians)"),
color = "Kelas"
)
ggplot(ld_scores, aes(x = LD1, fill = Class, color = Class)) +
geom_density(alpha = 0.4, linewidth = 1) +
scale_fill_manual(values = c("L" = "#e74c3c", "M" = "#f39c12", "H" = "#2ecc71")) +
scale_color_manual(values = c("L" = "#e74c3c", "M" = "#f39c12", "H" = "#2ecc71")) +
theme_minimal(base_size = 13) +
labs(
title = "Density Plot Skor LD1 per Kelas",
subtitle = "Pemisahan distribusi yang baik mengindikasikan diskriminan yang efektif",
x = "Skor LD1", y = "Densitas", fill = "Kelas", color = "Kelas"
)
coef_df <- as.data.frame(model_lda$scaling)
coef_df$Variabel <- rownames(coef_df)
coef_long <- pivot_longer(coef_df, cols = starts_with("LD"),
names_to = "LD", values_to = "Koefisien")
ggplot(coef_long, aes(x = reorder(Variabel, abs(Koefisien)), y = Koefisien, fill = LD)) +
geom_bar(stat = "identity", position = "dodge") +
coord_flip() +
scale_fill_manual(values = c("LD1" = "#2171b5", "LD2" = "#fd8d3c")) +
theme_minimal(base_size = 13) +
labs(
title = "Koefisien Fungsi Diskriminan per Variabel",
subtitle = "Koefisien besar (absolut) = kontribusi tinggi terhadap diskriminasi kelas",
x = "Variabel", y = "Koefisien LD", fill = "Fungsi Diskriminan"
)
cm_lda <- table(Aktual = ldadf$Class, Prediksi = ldadf$pred_class_lda)
print(cm_lda)
## Prediksi
## Aktual L M H
## L 100 25 2
## M 44 106 61
## H 10 40 92
akurasi_lda <- sum(diag(cm_lda)) / sum(cm_lda)
cat(sprintf("Akurasi Model LDA: %.2f%%\n\n", akurasi_lda * 100))
## Akurasi Model LDA: 62.08%
metrics_lda <- do.call(rbind, lapply(rownames(cm_lda), function(kelas) {
tp <- cm_lda[kelas, kelas]
fp <- sum(cm_lda[, kelas]) - tp
fn <- sum(cm_lda[kelas, ]) - tp
prec <- ifelse((tp + fp) > 0, tp / (tp + fp), NA)
rec <- ifelse((tp + fn) > 0, tp / (tp + fn), NA)
f1 <- ifelse(!is.na(prec) & !is.na(rec) & (prec + rec) > 0,
2 * prec * rec / (prec + rec), NA)
data.frame(Kelas = kelas,
Precision = round(prec, 3),
Recall = round(rec, 3),
F1_Score = round(f1, 3))
}))
print(metrics_lda)
## Kelas Precision Recall F1_Score
## 1 L 0.649 0.787 0.712
## 2 M 0.620 0.502 0.555
## 3 H 0.594 0.648 0.620
cm_lda_df <- as.data.frame(cm_lda)
ggplot(cm_lda_df, aes(x = Prediksi, y = Aktual, fill = Freq)) +
geom_tile(color = "white") +
geom_text(aes(label = Freq), size = 7, fontface = "bold") +
scale_fill_gradient(low = "#f0f7ff", high = "#2171b5") +
theme_minimal(base_size = 13) +
labs(
title = "Confusion Matrix — Linear Discriminant Analysis (LDA)",
x = "Prediksi", y = "Aktual", fill = "Jumlah"
)
akurasi_olr <- sum(diag(cm)) / sum(cm)
data.frame(
Metode = c("Ordinal Logistic Regression (OLR)",
"Linear Discriminant Analysis (LDA)"),
Akurasi = paste0(round(c(akurasi_olr, akurasi_lda) * 100, 2), "%"),
Variabel = c("9 variabel (numerik + kategorik)",
"3 variabel numerik")
)
| Metode | Akurasi | Variabel |
|---|---|---|
| Ordinal Logistic Regression (OLR) | 73.33% | 9 variabel (numerik + kategorik) |
| Linear Discriminant Analysis (LDA) | 62.08% | 3 variabel numerik |