Hepatitis C Virus (HCV) merupakan penyakit yang menyerang organ hati dan dapat berkembang menjadi fibrosis hingga sirosis apabila tidak ditangani. Tingkat keparahan penyakit ini dinilai melalui Baselinehistological Staging (Staging 1–4) yang mencerminkan derajat kerusakan jaringan hati pasien.
Dataset yang digunakan adalah HCV for Egyptian Patients dari UCI Machine Learning Repository, yang memuat 28 variabel klinis dan biomarker dari 1.385 pasien. Dengan jumlah variabel yang besar, analisis diskriminan dimanfaatkan tidak hanya sebagai alat klasifikasi, tetapi sekaligus sebagai teknik reduksi dimensi berbasis kelas melalui fungsi diskriminan (LD1, LD2, LD3).
LDA menghasilkan fungsi diskriminan (LD1, LD2, …) yang merupakan kombinasi linear dari prediktor. Berbeda dengan PCA yang memaksimalkan keragaman total (unsupervised), LDA memaksimalkan jarak antar kelas sambil meminimalkan keragaman dalam kelas (supervised):
\[D = b_1X_1 + b_2X_2 + \cdots + b_pX_p\]
Untuk \(g\) kelas, LDA menghasilkan maksimal \(g-1\) fungsi diskriminan. Dengan 4 kelas Staging, terdapat 3 fungsi diskriminan (LD1, LD2, LD3) yang memproyeksikan data dari 28 dimensi ke ruang 3 dimensi. Proporsi trace menunjukkan seberapa besar setiap LD menjelaskan pemisahan antar kelas.
library(tidyverse)
library(caret)
library(MASS) # lda(), qda()
library(klaR) # partimat()
library(ggplot2)
library(dplyr)
library(reshape2)
library(gridExtra)# Sesuaikan path file dengan lokasi di komputer Anda
path <- "D:/02. UNS/01. Kuliah/Sem 4/PDM/Diskriminan/HCV-Egy-Data.csv"
df <- read.csv(path, check.names = TRUE)
# Hapus kolom dengan nama kosong (mencegah error zero-length variable name)
df <- df[, nzchar(trimws(colnames(df)))]
# Bersihkan spasi di nama kolom
colnames(df) <- trimws(colnames(df))## 'data.frame': 1385 obs. of 29 variables:
## $ Age : int 56 46 57 49 59 58 42 48 44 45 ...
## $ Gender : int 1 1 1 2 1 2 2 2 1 1 ...
## $ BMI : int 35 29 33 33 32 22 26 30 23 30 ...
## $ Fever : int 2 1 2 1 1 2 1 1 1 2 ...
## $ Nausea.Vomting : int 1 2 2 2 1 2 1 1 1 1 ...
## $ Headache : int 1 2 2 1 2 2 2 2 2 2 ...
## $ Diarrhea : int 1 1 2 2 1 1 2 2 2 2 ...
## $ Fatigue...generalized.bone.ache: int 2 2 1 1 2 2 2 1 2 1 ...
## $ Jaundice : int 2 2 1 2 2 2 2 1 1 1 ...
## $ Epigastric.pain : int 2 1 1 1 2 1 2 2 2 2 ...
## $ WBC : int 7425 12101 4178 6490 3661 11785 11620 7335 10480 6681 ...
## $ RBC : num 4248807 4429425 4621191 4794631 4606375 ...
## $ HGB : int 14 10 12 10 11 15 12 11 12 12 ...
## $ Plat : num 112132 129367 151522 146457 187684 ...
## $ AST.1 : int 99 91 113 43 99 66 78 119 93 55 ...
## $ ALT.1 : int 84 123 49 64 104 104 57 112 83 68 ...
## $ ALT4 : num 52 95 95 109 67 121 113 80 55 72 ...
## $ ALT.12 : int 109 75 107 80 48 96 118 127 102 127 ...
## $ ALT.24 : int 81 113 116 88 120 65 107 45 97 81 ...
## $ ALT.36 : int 5 57 5 48 94 73 84 96 122 125 ...
## $ ALT.48 : int 5 123 5 77 90 114 80 53 39 43 ...
## $ ALT.after.24.w : int 5 44 5 33 30 29 28 39 45 30 ...
## $ RNA.Base : int 655330 40620 571148 1041941 660410 1157452 325694 641129 591441 1151206 ...
## $ RNA.4 : int 634536 538635 661346 449939 738756 1086852 1034008 72050 757361 230488 ...
## $ RNA.12 : int 288194 637056 5 585688 3731527 5 275095 787295 5 267320 ...
## $ RNA.EOT : int 5 336804 735945 744463 338946 5 214566 370605 371090 275295 ...
## $ RNA.EF : int 5 31085 558829 582301 242861 5 635157 506296 203042 555516 ...
## $ Baseline.histological.Grading : int 13 4 4 10 11 4 12 12 5 4 ...
## $ Baselinehistological.staging : int 2 2 4 3 1 4 4 3 2 2 ...
## Jumlah baris : 1385
## Jumlah kolom : 29
## Jumlah fitur : 28 (exclude target)
##
## === CEK MISSING VALUE ===
## Age Gender
## 0 0
## BMI Fever
## 0 0
## Nausea.Vomting Headache
## 0 0
## Diarrhea Fatigue...generalized.bone.ache
## 0 0
## Jaundice Epigastric.pain
## 0 0
## WBC RBC
## 0 0
## HGB Plat
## 0 0
## AST.1 ALT.1
## 0 0
## ALT4 ALT.12
## 0 0
## ALT.24 ALT.36
## 0 0
## ALT.48 ALT.after.24.w
## 0 0
## RNA.Base RNA.4
## 0 0
## RNA.12 RNA.EOT
## 0 0
## RNA.EF Baseline.histological.Grading
## 0 0
## Baselinehistological.staging
## 0
##
## === DISTRIBUSI VARIABEL TARGET ===
##
## 1 2 3 4
## 336 332 355 362
##
## 1 2 3 4
## 0.2425993 0.2397112 0.2563177 0.2613718
# Rename kolom agar mudah dirujuk
colnames(df)[colnames(df) == "Baselinehistological.staging"] <- "Staging"
colnames(df)[colnames(df) == "Fatigue...generalized.bone.ache"] <- "Fatigue"
colnames(df)[colnames(df) == "Nausea.Vomting"] <- "Nausea"
colnames(df)[colnames(df) == "Epigastric.pain"] <- "EpigastricPain"
colnames(df)[colnames(df) == "Baseline.histological.Grading"] <- "Grading"
colnames(df)[colnames(df) == "ALT.after.24.w"] <- "ALT_after24w"
colnames(df)[colnames(df) == "RNA.Base"] <- "RNA_Base"
colnames(df)[colnames(df) == "RNA.EOT"] <- "RNA_EOT"
colnames(df)[colnames(df) == "RNA.EF"] <- "RNA_EF"
colnames(df)[colnames(df) == "AST.1"] <- "AST1"
colnames(df)[colnames(df) == "ALT.1"] <- "ALT1"
colnames(df)[colnames(df) == "ALT4"] <- "ALT4"
colnames(df)[colnames(df) == "ALT.12"] <- "ALT12"
colnames(df)[colnames(df) == "ALT.24"] <- "ALT24"
colnames(df)[colnames(df) == "ALT.36"] <- "ALT36"
colnames(df)[colnames(df) == "ALT.48"] <- "ALT48"
colnames(df)[colnames(df) == "RNA.4"] <- "RNA4"
colnames(df)[colnames(df) == "RNA.12"] <- "RNA12"
# Target sebagai faktor
df$Staging <- as.factor(df$Staging)
levels(df$Staging) <- c("Staging1", "Staging2", "Staging3", "Staging4")
# Daftar fitur prediktor
features <- setdiff(colnames(df), "Staging")
cat("Nama kolom setelah cleaning:\n")## Nama kolom setelah cleaning:
## [1] "Age" "Gender" "BMI" "Fever"
## [5] "Nausea" "Headache" "Diarrhea" "Fatigue"
## [9] "Jaundice" "EpigastricPain" "WBC" "RBC"
## [13] "HGB" "Plat" "AST1" "ALT1"
## [17] "ALT4" "ALT12" "ALT24" "ALT36"
## [21] "ALT48" "ALT_after24w" "RNA_Base" "RNA4"
## [25] "RNA12" "RNA_EOT" "RNA_EF" "Grading"
## [29] "Staging"
##
## Total fitur prediktor: 28
set.seed(123)
train_idx <- createDataPartition(df_scaled$Staging, p = 0.7, list = FALSE)
data_train <- df_scaled[ train_idx, ]
data_test <- df_scaled[-train_idx, ]
cat("Data latih :", nrow(data_train), "observasi\n")## Data latih : 972 observasi
## Data uji : 413 observasi
##
## Distribusi kelas — Data Latih:
##
## Staging1 Staging2 Staging3 Staging4
## 236 233 249 254
##
## Distribusi kelas — Data Uji:
##
## Staging1 Staging2 Staging3 Staging4
## 100 99 106 108
## Call:
## lda(Staging ~ ., data = data_train)
##
## Prior probabilities of groups:
## Staging1 Staging2 Staging3 Staging4
## 0.2427984 0.2397119 0.2561728 0.2613169
##
## Group means:
## Age Gender BMI Fever Nausea
## Staging1 -0.04116680 0.01246228 -0.006907335 0.07913214 -0.03893889
## Staging2 0.07509045 -0.09492773 0.192871439 0.03333479 -0.04366555
## Staging3 0.03271559 0.10526104 -0.008431122 0.03723067 0.01502094
## Staging4 -0.08745112 0.02880856 -0.147389230 -0.07042553 0.02643270
## Headache Diarrhea Fatigue Jaundice EpigastricPain
## Staging1 0.04182675 0.06272060 -0.05713547 -0.01063682 0.08525000
## Staging2 -0.03067426 -0.02650414 -0.01928624 0.03644746 -0.08087695
## Staging3 -0.04425201 0.05516759 -0.04199557 -0.08647238 -0.09224918
## Staging4 0.01581104 0.05004652 0.05726363 0.06867543 0.03928890
## WBC RBC HGB Plat AST1
## Staging1 -0.01277460 -0.004632153 0.05513693 -0.047917098 0.011600782
## Staging2 -0.04539041 -0.023528199 0.03772072 0.099416138 0.002887526
## Staging3 -0.03541789 -0.017863048 -0.03127443 -0.008423181 0.003413371
## Staging4 0.04323104 -0.016507172 0.04070862 -0.042663762 -0.097660336
## ALT1 ALT4 ALT12 ALT24 ALT36
## Staging1 -0.08503638 0.015051362 0.001874303 0.048777499 0.003246343
## Staging2 0.03336331 0.002985419 -0.064867090 -0.005929071 -0.035835244
## Staging3 0.01051235 0.065238886 -0.076132923 -0.018320603 -0.021496574
## Staging4 0.06762564 -0.083855879 0.061226237 0.012004764 0.030290252
## ALT48 ALT_after24w RNA_Base RNA4 RNA12
## Staging1 -0.040328356 -0.03979426 0.07645703 0.07828545 0.050878930
## Staging2 0.017070283 -0.05953144 -0.11574169 0.05099877 -0.074352120
## Staging3 0.049653991 0.02717934 0.04280821 0.01317952 0.005110087
## Staging4 -0.009746323 0.04880102 0.03611084 -0.04085885 0.106641632
## RNA_EOT RNA_EF Grading
## Staging1 0.14305299 -0.006916713 0.04341757
## Staging2 -0.10623974 -0.101179568 0.04854715
## Staging3 -0.02516523 0.086308247 -0.01564098
## Staging4 -0.01384450 -0.004377552 -0.02982198
##
## Coefficients of linear discriminants:
## LD1 LD2 LD3
## Age 0.304420344 0.0355057710 -0.085461654
## Gender -0.208235926 0.0001945808 -0.441128599
## BMI 0.585538083 -0.0830247788 0.159771418
## Fever 0.194403392 -0.2822427589 -0.120373630
## Nausea -0.112853340 0.1105997544 -0.128933904
## Headache -0.085905745 -0.1376034198 0.124515948
## Diarrhea -0.148928492 -0.1084406725 -0.061721460
## Fatigue -0.099764161 0.1735850191 0.116059830
## Jaundice -0.008305686 0.0954049366 0.416129304
## EpigastricPain -0.222476591 -0.2843435532 0.243163217
## WBC -0.187742217 0.0866761356 0.061365590
## RBC -0.041890082 -0.0723032061 -0.009364629
## HGB -0.101204917 -0.0761733349 0.217646974
## Plat 0.242742608 0.1117464695 0.117477970
## AST1 0.153025587 -0.2258156349 -0.132581774
## ALT1 -0.050658402 0.3760997031 0.106962592
## ALT4 0.168810309 -0.0946687954 -0.269205915
## ALT12 -0.240016123 -0.0109706770 0.188114335
## ALT24 -0.070844936 -0.1347833180 0.083067167
## ALT36 -0.112686363 0.0030455409 0.046868449
## ALT48 0.058017418 0.1190836389 -0.139282079
## ALT_after24w -0.154272838 0.1774843339 -0.123382266
## RNA_Base -0.234142964 -0.2245889625 -0.221706072
## RNA4 0.121282058 -0.2167854796 0.001079667
## RNA12 -0.355495002 0.1020991500 0.140437498
## RNA_EOT -0.043404873 -0.6803202110 0.110198155
## RNA_EF 0.129047273 0.2493186847 -0.573723369
## Grading 0.132994252 -0.2087344773 0.073705341
##
## Proportion of trace:
## LD1 LD2 LD3
## 0.5140 0.2508 0.2351
Proporsi trace menunjukkan seberapa besar setiap fungsi diskriminan berkontribusi dalam memisahkan antar kelas. LDA mereduksi 28 dimensi fitur menjadi maksimal 3 dimensi (LD1, LD2, LD3) untuk 4 kelas.
ev <- model_lda$svd^2 / sum(model_lda$svd^2)
ev_pct <- round(ev * 100, 2)
ev_tbl <- data.frame(
Fungsi = paste0("LD", 1:3),
Proporsi = paste0(ev_pct, "%"),
Kumulatif = paste0(round(cumsum(ev_pct), 2), "%")
)
print(ev_tbl)## Fungsi Proporsi Kumulatif
## 1 LD1 51.4% 51.4%
## 2 LD2 25.08% 76.48%
## 3 LD3 23.51% 99.99%
##
## LD1 menjelaskan 51.40% pemisahan antar kelas.
## LD1 + LD2 menjelaskan 76.48% pemisahan antar kelas.
Koefisien ini membentuk persamaan fungsi diskriminan yang memproyeksikan data ke ruang berdimensi lebih rendah. Variabel dengan nilai absolut koefisien terbesar paling berpengaruh dalam pemisahan kelas.
## LD1 LD2 LD3
## Age 0.3044 0.0355 -0.0855
## Gender -0.2082 0.0002 -0.4411
## BMI 0.5855 -0.0830 0.1598
## Fever 0.1944 -0.2822 -0.1204
## Nausea -0.1129 0.1106 -0.1289
## Headache -0.0859 -0.1376 0.1245
## Diarrhea -0.1489 -0.1084 -0.0617
## Fatigue -0.0998 0.1736 0.1161
## Jaundice -0.0083 0.0954 0.4161
## EpigastricPain -0.2225 -0.2843 0.2432
## WBC -0.1877 0.0867 0.0614
## RBC -0.0419 -0.0723 -0.0094
## HGB -0.1012 -0.0762 0.2176
## Plat 0.2427 0.1117 0.1175
## AST1 0.1530 -0.2258 -0.1326
## ALT1 -0.0507 0.3761 0.1070
## ALT4 0.1688 -0.0947 -0.2692
## ALT12 -0.2400 -0.0110 0.1881
## ALT24 -0.0708 -0.1348 0.0831
## ALT36 -0.1127 0.0030 0.0469
## ALT48 0.0580 0.1191 -0.1393
## ALT_after24w -0.1543 0.1775 -0.1234
## RNA_Base -0.2341 -0.2246 -0.2217
## RNA4 0.1213 -0.2168 0.0011
## RNA12 -0.3555 0.1021 0.1404
## RNA_EOT -0.0434 -0.6803 0.1102
## RNA_EF 0.1290 0.2493 -0.5737
## Grading 0.1330 -0.2087 0.0737
## Staging1 Staging2 Staging3 Staging4
## 5 0.1735 0.1212 0.0763 0.6291
## 6 0.1848 0.1670 0.2942 0.3541
## 7 0.2767 0.1656 0.2271 0.3307
## 8 0.2354 0.1638 0.2642 0.3365
## 9 0.2728 0.1157 0.1577 0.4538
## 10 0.3381 0.1339 0.2243 0.3037
## Reference
## Prediction Staging1 Staging2 Staging3 Staging4
## Staging1 67 40 48 34
## Staging2 45 84 52 49
## Staging3 46 57 77 54
## Staging4 78 52 72 117
##
## Akurasi LDA (train): 35.49%
pred_test_lda <- predict(model_lda, data_test)
cm_test_lda <- confusionMatrix(pred_test_lda$class, data_test$Staging)
print(cm_test_lda$table)## Reference
## Prediction Staging1 Staging2 Staging3 Staging4
## Staging1 12 25 18 12
## Staging2 28 23 20 32
## Staging3 28 20 33 27
## Staging4 32 31 35 37
##
## Akurasi LDA (test): 25.42%
## Sensitivity Specificity Precision F1
## Class: Staging1 0.1200000 0.8242812 0.1791045 0.1437126
## Class: Staging2 0.2323232 0.7452229 0.2233010 0.2277228
## Class: Staging3 0.3113208 0.7557003 0.3055556 0.3084112
## Class: Staging4 0.3425926 0.6786885 0.2740741 0.3045267
## Call:
## qda(Staging ~ ., data = data_train)
##
## Prior probabilities of groups:
## Staging1 Staging2 Staging3 Staging4
## 0.2427984 0.2397119 0.2561728 0.2613169
##
## Group means:
## Age Gender BMI Fever Nausea
## Staging1 -0.04116680 0.01246228 -0.006907335 0.07913214 -0.03893889
## Staging2 0.07509045 -0.09492773 0.192871439 0.03333479 -0.04366555
## Staging3 0.03271559 0.10526104 -0.008431122 0.03723067 0.01502094
## Staging4 -0.08745112 0.02880856 -0.147389230 -0.07042553 0.02643270
## Headache Diarrhea Fatigue Jaundice EpigastricPain
## Staging1 0.04182675 0.06272060 -0.05713547 -0.01063682 0.08525000
## Staging2 -0.03067426 -0.02650414 -0.01928624 0.03644746 -0.08087695
## Staging3 -0.04425201 0.05516759 -0.04199557 -0.08647238 -0.09224918
## Staging4 0.01581104 0.05004652 0.05726363 0.06867543 0.03928890
## WBC RBC HGB Plat AST1
## Staging1 -0.01277460 -0.004632153 0.05513693 -0.047917098 0.011600782
## Staging2 -0.04539041 -0.023528199 0.03772072 0.099416138 0.002887526
## Staging3 -0.03541789 -0.017863048 -0.03127443 -0.008423181 0.003413371
## Staging4 0.04323104 -0.016507172 0.04070862 -0.042663762 -0.097660336
## ALT1 ALT4 ALT12 ALT24 ALT36
## Staging1 -0.08503638 0.015051362 0.001874303 0.048777499 0.003246343
## Staging2 0.03336331 0.002985419 -0.064867090 -0.005929071 -0.035835244
## Staging3 0.01051235 0.065238886 -0.076132923 -0.018320603 -0.021496574
## Staging4 0.06762564 -0.083855879 0.061226237 0.012004764 0.030290252
## ALT48 ALT_after24w RNA_Base RNA4 RNA12
## Staging1 -0.040328356 -0.03979426 0.07645703 0.07828545 0.050878930
## Staging2 0.017070283 -0.05953144 -0.11574169 0.05099877 -0.074352120
## Staging3 0.049653991 0.02717934 0.04280821 0.01317952 0.005110087
## Staging4 -0.009746323 0.04880102 0.03611084 -0.04085885 0.106641632
## RNA_EOT RNA_EF Grading
## Staging1 0.14305299 -0.006916713 0.04341757
## Staging2 -0.10623974 -0.101179568 0.04854715
## Staging3 -0.02516523 0.086308247 -0.01564098
## Staging4 -0.01384450 -0.004377552 -0.02982198
pred_train_qda <- predict(model_qda, data_train)
cm_train_qda <- confusionMatrix(pred_train_qda$class, data_train$Staging)
print(cm_train_qda$table)## Reference
## Prediction Staging1 Staging2 Staging3 Staging4
## Staging1 129 24 33 21
## Staging2 37 152 26 24
## Staging3 39 29 158 32
## Staging4 31 28 32 177
##
## Akurasi QDA (train): 63.37%
pred_test_qda <- predict(model_qda, data_test)
cm_test_qda <- confusionMatrix(pred_test_qda$class, data_test$Staging)
print(cm_test_qda$table)## Reference
## Prediction Staging1 Staging2 Staging3 Staging4
## Staging1 16 27 29 22
## Staging2 24 24 17 22
## Staging3 31 28 33 28
## Staging4 29 20 27 36
##
## Akurasi QDA (test): 26.39%
## Sensitivity Specificity Precision F1
## Class: Staging1 0.1600000 0.7507987 0.1702128 0.1649485
## Class: Staging2 0.2424242 0.7993631 0.2758621 0.2580645
## Class: Staging3 0.3113208 0.7166124 0.2750000 0.2920354
## Class: Staging4 0.3333333 0.7508197 0.3214286 0.3272727
summary_tbl <- data.frame(
Model = c("LDA", "QDA"),
Akurasi_Train = c(round(cm_train_lda$overall["Accuracy"] * 100, 2),
round(cm_train_qda$overall["Accuracy"] * 100, 2)),
Akurasi_Test = c(round(cm_test_lda$overall["Accuracy"] * 100, 2),
round(cm_test_qda$overall["Accuracy"] * 100, 2))
)
summary_tbl$Gap_Overfit <- summary_tbl$Akurasi_Train - summary_tbl$Akurasi_Test
print(summary_tbl)## Model Akurasi_Train Akurasi_Test Gap_Overfit
## 1 LDA 35.49 25.42 10.07
## 2 QDA 63.37 26.39 36.98
Plot ini menampilkan data yang telah direduksi dari 28 dimensi ke ruang LD1 dan LD2. Semakin terpisah klaster antar warna, semakin baik kemampuan LDA memisahkan kelas.
lda_scores <- as.data.frame(pred_train_lda$x)
lda_scores$Staging <- data_train$Staging
ggplot(lda_scores, aes(x = LD1, y = LD2, color = Staging)) +
geom_point(alpha = 0.5, size = 1.5) +
stat_ellipse(level = 0.68, linewidth = 0.8) +
labs(
title = "Proyeksi Data pada Ruang LD1 dan LD2 (Data Latih)",
subtitle = sprintf("LD1: %.2f%% | LD2: %.2f%% | LD3: %.2f%%",
ev_pct[1], ev_pct[2], ev_pct[3]),
x = sprintf("LD1 (%.2f%%)", ev_pct[1]),
y = sprintf("LD2 (%.2f%%)", ev_pct[2]),
color = "Staging"
) +
scale_color_manual(values = c("#e74c3c", "#3498db", "#2ecc71", "#f39c12")) +
theme_bw(base_size = 11) +
theme(plot.title = element_text(face = "bold"))ev_df <- data.frame(
LD = paste0("LD", 1:3),
Var = ev_pct,
Cum = cumsum(ev_pct)
)
ggplot(ev_df, aes(x = LD, y = Var)) +
geom_bar(stat = "identity",
fill = c("#2c3e50", "#7f8c8d", "#bdc3c7"), width = 0.5) +
geom_line(aes(y = Cum, group = 1), color = "red", linewidth = 0.8) +
geom_point(aes(y = Cum), color = "red", size = 2.5) +
geom_text(aes(label = paste0(Var, "%")), vjust = -0.5, size = 3.5) +
ylim(0, 120) +
labs(
title = "Proporsi Varian Antar Kelas — Fungsi Diskriminan LDA",
subtitle = "Garis merah = varian kumulatif",
x = "Fungsi Diskriminan",
y = "Explained Variance (%)"
) +
theme_bw(base_size = 11) +
theme(plot.title = element_text(face = "bold"))Koefisien LD1 menunjukkan seberapa besar setiap variabel asli berkontribusi pada fungsi diskriminan pertama. Ini merupakan interpretasi reduksi dimensi LDA: variabel mana yang paling membedakan antar kelas Staging.
scaling_df <- as.data.frame(model_lda$scaling)
scaling_df$Variabel <- rownames(scaling_df)
scaling_df <- scaling_df %>%
arrange(abs(LD1)) %>%
mutate(
Variabel = factor(Variabel, levels = Variabel),
Warna = ifelse(LD1 >= 0, "Positif", "Negatif")
)
ggplot(scaling_df, aes(x = LD1, y = Variabel, fill = Warna)) +
geom_bar(stat = "identity") +
geom_vline(xintercept = 0, linewidth = 0.6) +
scale_fill_manual(values = c("Positif" = "#3498db", "Negatif" = "#e74c3c")) +
labs(
title = "Koefisien Fungsi Diskriminan LD1",
subtitle = "Variabel dengan |koefisien| besar = paling berpengaruh dalam pemisahan kelas",
x = "Koefisien LD1",
y = "Variabel",
fill = "Arah"
) +
theme_bw(base_size = 10) +
theme(
plot.title = element_text(face = "bold"),
axis.text.y = element_text(size = 8)
)scaling_df2 <- as.data.frame(model_lda$scaling)
scaling_df2$Variabel <- rownames(scaling_df2)
scaling_df2 <- scaling_df2 %>%
arrange(abs(LD2)) %>%
mutate(
Variabel = factor(Variabel, levels = Variabel),
Warna = ifelse(LD2 >= 0, "Positif", "Negatif")
)
ggplot(scaling_df2, aes(x = LD2, y = Variabel, fill = Warna)) +
geom_bar(stat = "identity") +
geom_vline(xintercept = 0, linewidth = 0.6) +
scale_fill_manual(values = c("Positif" = "#3498db", "Negatif" = "#e74c3c")) +
labs(
title = "Koefisien Fungsi Diskriminan LD2",
subtitle = "Variabel dengan |koefisien| besar = paling berpengaruh pada LD2",
x = "Koefisien LD2",
y = "Variabel",
fill = "Arah"
) +
theme_bw(base_size = 10) +
theme(
plot.title = element_text(face = "bold"),
axis.text.y = element_text(size = 8)
)cm_lda_df <- as.data.frame(cm_test_lda$table)
colnames(cm_lda_df) <- c("Prediksi", "Aktual", "Freq")
ggplot(cm_lda_df, aes(x = Prediksi, y = Aktual, fill = Freq)) +
geom_tile(color = "white") +
geom_text(aes(label = Freq), size = 4.5, fontface = "bold") +
scale_fill_gradient(low = "#dceefb", high = "#2980b9") +
labs(
title = sprintf("Confusion Matrix LDA (Data Uji) — Akurasi: %.2f%%",
cm_test_lda$overall["Accuracy"] * 100),
x = "Prediksi", y = "Aktual"
) +
theme_bw(base_size = 11) +
theme(plot.title = element_text(face = "bold"))cm_qda_df <- as.data.frame(cm_test_qda$table)
colnames(cm_qda_df) <- c("Prediksi", "Aktual", "Freq")
ggplot(cm_qda_df, aes(x = Prediksi, y = Aktual, fill = Freq)) +
geom_tile(color = "white") +
geom_text(aes(label = Freq), size = 4.5, fontface = "bold") +
scale_fill_gradient(low = "#fdebd0", high = "#e67e22") +
labs(
title = sprintf("Confusion Matrix QDA (Data Uji) — Akurasi: %.2f%%",
cm_test_qda$overall["Accuracy"] * 100),
x = "Prediksi", y = "Aktual"
) +
theme_bw(base_size = 11) +
theme(plot.title = element_text(face = "bold"))acc_df <- data.frame(
Model = c("LDA Train", "LDA Test", "QDA Train", "QDA Test"),
Akurasi = c(
cm_train_lda$overall["Accuracy"] * 100,
cm_test_lda$overall["Accuracy"] * 100,
cm_train_qda$overall["Accuracy"] * 100,
cm_test_qda$overall["Accuracy"] * 100
),
Jenis = c("Train", "Test", "Train", "Test")
)
acc_df$Model <- factor(acc_df$Model, levels = acc_df$Model)
ggplot(acc_df, aes(x = Model, y = Akurasi, fill = Jenis)) +
geom_bar(stat = "identity", width = 0.5) +
geom_text(aes(label = paste0(round(Akurasi, 2), "%")),
vjust = -0.4, size = 3.5) +
scale_fill_manual(values = c("Train" = "#2c3e50", "Test" = "#3498db")) +
ylim(0, 100) +
labs(
title = "Perbandingan Akurasi LDA vs QDA",
x = "",
y = "Akurasi (%)",
fill = "Data"
) +
theme_bw(base_size = 11) +
theme(plot.title = element_text(face = "bold"))Plot densitas ini menunjukkan sejauh mana keempat kelas Staging terpisah pada sumbu LD1 setelah reduksi dimensi.
lda_scores_test <- as.data.frame(pred_test_lda$x)
lda_scores_test$Staging <- data_test$Staging
ggplot(lda_scores_test, aes(x = LD1, fill = Staging)) +
geom_density(alpha = 0.5) +
scale_fill_manual(values = c("#e74c3c", "#3498db", "#2ecc71", "#f39c12")) +
labs(
title = "Distribusi Skor LD1 per Staging (Data Uji)",
x = "Skor LD1",
y = "Densitas",
fill = "Staging"
) +
theme_bw(base_size = 11) +
theme(plot.title = element_text(face = "bold"))ggplot(lda_scores_test, aes(x = LD2, fill = Staging)) +
geom_density(alpha = 0.5) +
scale_fill_manual(values = c("#e74c3c", "#3498db", "#2ecc71", "#f39c12")) +
labs(
title = "Distribusi Skor LD2 per Staging (Data Uji)",
x = "Skor LD2",
y = "Densitas",
fill = "Staging"
) +
theme_bw(base_size = 11) +
theme(plot.title = element_text(face = "bold"))Partition plot menggunakan dua variabel paling berpengaruh pada LD1: RNA_EF dan RNA_EOT. Proses ini membutuhkan waktu komputasi lebih lama.
partimat(
Staging ~ RNA_EF + RNA_EOT,
data = df_scaled,
method = "lda",
main = "Partition Plot LDA: RNA_EF vs RNA_EOT"
)partimat(
Staging ~ RNA_EF + RNA_EOT,
data = df_scaled,
method = "qda",
main = "Partition Plot QDA: RNA_EF vs RNA_EOT"
)Analisis diskriminan pada dataset HCV Egyptian Patients menghasilkan temuan sebagai berikut:
Reduksi Dimensi via LDA: LDA berhasil mereduksi 28 variabel prediktor menjadi 3 fungsi diskriminan (LD1, LD2, LD3). LD1 menjelaskan porsi terbesar pemisahan antar kelas Staging, yang berarti sebagian besar informasi pembeda kelas dapat dirangkum hanya dalam satu dimensi baru.
Perbandingan LDA vs QDA:
## === SESSION INFO ===
## R version 4.4.1 (2024-06-14 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26200)
##
## Matrix products: default
##
##
## locale:
## [1] LC_COLLATE=Indonesian_Indonesia.utf8 LC_CTYPE=Indonesian_Indonesia.utf8
## [3] LC_MONETARY=Indonesian_Indonesia.utf8 LC_NUMERIC=C
## [5] LC_TIME=Indonesian_Indonesia.utf8
##
## time zone: Asia/Jakarta
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] gridExtra_2.3 reshape2_1.4.5 klaR_1.7-4 MASS_7.3-60.2
## [5] caret_7.0-1 lattice_0.22-6 lubridate_1.9.4 forcats_1.0.0
## [9] stringr_1.5.1 dplyr_1.1.4 purrr_1.0.2 readr_2.1.5
## [13] tidyr_1.3.1 tibble_3.2.1 ggplot2_4.0.2 tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] tidyselect_1.2.1 timeDate_4052.112 farver_2.1.2
## [4] S7_0.2.1 fastmap_1.2.0 combinat_0.0-8
## [7] promises_1.3.3 pROC_1.19.0.1 labelled_2.16.0
## [10] digest_0.6.37 rpart_4.1.23 mime_0.12
## [13] timechange_0.3.0 lifecycle_1.0.4 survival_3.6-4
## [16] magrittr_2.0.3 compiler_4.4.1 rlang_1.1.4
## [19] sass_0.4.9 tools_4.4.1 utf8_1.2.4
## [22] yaml_2.3.10 data.table_1.16.4 knitr_1.49
## [25] labeling_0.4.3 plyr_1.8.9 RColorBrewer_1.1-3
## [28] miniUI_0.1.2 withr_3.0.1 nnet_7.3-19
## [31] grid_4.4.1 stats4_4.4.1 fansi_1.0.6
## [34] e1071_1.7-17 xtable_1.8-4 future_1.69.0
## [37] globals_0.19.0 scales_1.4.0 iterators_1.0.14
## [40] cli_3.6.3 rmarkdown_2.30 generics_0.1.3
## [43] rstudioapi_0.17.1 future.apply_1.20.2 tzdb_0.4.0
## [46] proxy_0.4-29 cachem_1.1.0 splines_4.4.1
## [49] parallel_4.4.1 vctrs_0.6.5 hardhat_1.4.2
## [52] Matrix_1.7-0 jsonlite_1.8.9 hms_1.1.3
## [55] listenv_0.10.0 foreach_1.5.2 gower_1.0.2
## [58] jquerylib_0.1.4 recipes_1.3.1 glue_1.7.0
## [61] parallelly_1.46.1 codetools_0.2-20 stringi_1.8.4
## [64] gtable_0.3.6 later_1.4.2 questionr_0.8.2
## [67] pillar_1.9.0 htmltools_0.5.8.1 ipred_0.9-15
## [70] lava_1.8.2 R6_2.5.1 evaluate_1.0.1
## [73] shiny_1.10.0 haven_2.5.4 highr_0.11
## [76] httpuv_1.6.16 bslib_0.8.0 class_7.3-22
## [79] Rcpp_1.0.13 nlme_3.1-164 prodlim_2025.04.28
## [82] xfun_0.49 pkgconfig_2.0.3 ModelMetrics_1.2.2.2