library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.4.3
## corrplot 0.95 loaded
library(MVN)
## Warning: package 'MVN' was built under R version 4.4.3
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
library(heplots)
## Warning: package 'heplots' was built under R version 4.4.3
## Loading required package: broom
library(pROC)
## Warning: package 'pROC' was built under R version 4.4.3
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

Load Data

data <- read.csv("C:/Kuliah/semester 4/Analisis Multivariat/Projek Uas/data/retina_word2vec_1_balanced_scaled.csv", header = TRUE)

Exploratory Data Analysis (EDA)

Distribusi setiap variabel

fitur <- data[, 1:ncol(data)-1]
barplot(table(data$class),
        main = "Distribusi Kelas",
        xlab = "Kelas",
        ylab = "Jumlah",
        col = "lightblue")

Persebaran semua kelas sudah seimbang.

par(mfrow = c(2, 5))
for (i in 1:50) {
  hist(data[[i]], main = paste("Hist", names(data)[i]), ylab='', xlab = "", col = "skyblue")
}

par(mfrow = c(1, 1))  # reset layout
par(mfrow = c(2, 5))
for (i in 1:50) {
  boxplot(data[, i] ~ data$class, main = paste("Box", names(data)[i]), ylab='', xlab='')
}

par(mfrow = c(1, 1))  # reset layout

Asumsi

Uji Normalitas Multivariat

Uji Menggunakan Mardia Test (H0 = data mengikuti distribusi normal multivariat)

library(QuantPsyc)
## Warning: package 'QuantPsyc' was built under R version 4.4.3
## Loading required package: boot
## Loading required package: purrr
## 
## Attaching package: 'QuantPsyc'
## The following object is masked from 'package:base':
## 
##     norm
mult.norm(data)$mult.test
##          Beta-hat      kappa p-val
## Skewness 1078.086 81395.4841     0
## Kurtosis 4041.260   193.6964     0

Terlihat p-value yang bernilai 0 yang menunjukkan bahwa data tidak berdistribusi Normal. Selain itu jika dilihat dari nilai kurtosis dan skewness, distribusinya sangat jauh dari nilai distribusi normal.

Kami mencoba melakukan uji normalitas univariat pada masing-masing fitur menggunakan shapiro-wilk test dengan H0 = data mengikuti distribusi Normal.

cat("\n== Uji Normalitas Multivariat ==\n")
## 
## == Uji Normalitas Multivariat ==
shapiro_results <- apply(fitur, 2, function(x) shapiro.test(x)$p.value)
shapiro_results
##           X0           X1           X2           X3           X4           X5 
## 6.963906e-07 1.604687e-14 1.640520e-05 3.148654e-15 1.003173e-14 1.478672e-16 
##           X6           X7           X8           X9          X10          X11 
## 4.427764e-04 4.972329e-05 5.035307e-10 1.294207e-04 9.740120e-13 9.155725e-15 
##          X12          X13          X14          X15          X16          X17 
## 1.099835e-10 2.527609e-13 3.753918e-06 2.523177e-13 3.063433e-09 3.533561e-08 
##          X18          X19          X20          X21          X22          X23 
## 9.434544e-10 6.863818e-12 1.188802e-09 8.663981e-16 3.961241e-17 6.595248e-07 
##          X24          X25          X26          X27          X28          X29 
## 2.551322e-10 3.780256e-16 7.166485e-06 2.910042e-13 1.792330e-03 5.935991e-16 
##          X30          X31          X32          X33          X34          X35 
## 3.439895e-11 9.732520e-16 3.090199e-09 4.226251e-16 1.193515e-17 8.768592e-16 
##          X36          X37          X38          X39          X40          X41 
## 1.297957e-12 1.988906e-15 5.437568e-17 3.662327e-15 2.360782e-09 7.779772e-09 
##          X42          X43          X44          X45          X46          X47 
## 5.150693e-04 1.198235e-12 1.037975e-09 9.667379e-16 3.717822e-16 2.934634e-07 
##          X48          X49 
## 7.512918e-16 2.565761e-04
print('Fitur yang berdistribusi normal (p-value > 0.05)')
## [1] "Fitur yang berdistribusi normal (p-value > 0.05)"
shapiro_results[shapiro_results > 0.05]
## named numeric(0)

dari hasil diatas, bahkan tidak ada satupun fitur yang mengikuti distribusi normal.

Homogenitas Matriks Kovarian

Kami menggunakan metode Box’s M untuk melakukan uji homogenitas dengan H0 = data homogen.

cat("\n== Uji Homogenitas Matriks Kovarian (Box's M Test) ==\n")
## 
## == Uji Homogenitas Matriks Kovarian (Box's M Test) ==
boxm_result <- boxM(data[, 1:50], data$class)
print(boxm_result)
## 
##  Box's M-test for Homogeneity of Covariance Matrices
## 
## data:  data[, 1:50]
## Chi-Sq (approx.) = 9523.1, df = 2550, p-value < 2.2e-16

Berdasarkan nilai p-value yang sangat mendekati 0, menunjukkan bahwa data tidak homogen.

Usaha Pemenuhan Asumsi

Hapus outlier

Kami melakukan penghapusan outlier univariat dengan pendekatan Z-score. Kami menghapus data yang > 3 atau <-3, karena jika menggunakan threshold 2 akan banyak data yang terbuang. referensi: https://medium.com/@akashsri306/detecting-anomalies-with-z-scores-a-practical-approach-2f9a0f27458d

z_scores <- scale(fitur)
threshold <- 3
data_normal <- cbind(z_scores, class = data$class)
clean_data <- as.data.frame(data_normal[!rowSums(data_normal[, 1:50] > threshold | data_normal[, 1:50] < -threshold) > 0, ])
dim(clean_data)
## [1] 366  51
fitur_cleaned <- clean_data[, 1:ncol(clean_data)-1]
cat("\n== Uji Normalitas Multivariat ==\n")
## 
## == Uji Normalitas Multivariat ==
shapiro_results <- apply(fitur_cleaned, 2, function(x) shapiro.test(x)$p.value)
shapiro_results
##           X0           X1           X2           X3           X4           X5 
## 6.253015e-03 2.387879e-08 8.462104e-02 4.704573e-07 8.698872e-09 6.324212e-13 
##           X6           X7           X8           X9          X10          X11 
## 4.972782e-01 6.877321e-04 2.712752e-05 1.535620e-02 5.739744e-05 6.331434e-11 
##          X12          X13          X14          X15          X16          X17 
## 9.576251e-04 2.355566e-05 2.892646e-02 7.864390e-08 1.103860e-01 6.844607e-07 
##          X18          X19          X20          X21          X22          X23 
## 3.195697e-02 2.290572e-05 1.407034e-02 4.028478e-10 1.340838e-13 7.687334e-02 
##          X24          X25          X26          X27          X28          X29 
## 9.455401e-02 3.703135e-10 1.869998e-01 3.632614e-08 1.996327e-01 2.187539e-09 
##          X30          X31          X32          X33          X34          X35 
## 2.618588e-08 2.051325e-12 6.639302e-05 5.495392e-08 9.897382e-08 9.964600e-05 
##          X36          X37          X38          X39          X40          X41 
## 1.228714e-03 1.917413e-10 4.797918e-11 3.265734e-09 6.859460e-04 3.617990e-04 
##          X42          X43          X44          X45          X46          X47 
## 1.976524e-01 3.116289e-06 1.281897e-05 4.526407e-11 6.668533e-11 2.191283e-03 
##          X48          X49 
## 7.650051e-10 8.302312e-03
print('Fitur yang berdistribusi normal (p-value > 0.05)')
## [1] "Fitur yang berdistribusi normal (p-value > 0.05)"
shapiro_results[shapiro_results > 0.05]
##         X2         X6        X16        X23        X24        X26        X28 
## 0.08462104 0.49727818 0.11038599 0.07687334 0.09455401 0.18699978 0.19963266 
##        X42 
## 0.19765238

Secara uji normalitas univariat, beberapa fitur sudah berdistribusi normal berdasarkan hasil shapiro-wilk test.

mult.norm(fitur)$mult.test
##          Beta-hat      kappa p-val
## Skewness 1035.282 78163.7613     0
## Kurtosis 3930.907   196.4107     0

Namun dari Uji Normalitas Multiveriat, menunjukkan hasil yang kurang lebih sama seperti sebelumnya, yaitu masih jauh dari kata Normal.

Model Linear Discriminant Analysis (LDA)

cat("\n== Model LDA ==\n")
## 
## == Model LDA ==
lda_model <- lda(class ~ ., data = clean_data)
summary(lda_model)
##         Length Class  Mode     
## prior     3    -none- numeric  
## counts    3    -none- numeric  
## means   150    -none- numeric  
## scaling 100    -none- numeric  
## lev       3    -none- character
## svd       2    -none- numeric  
## N         1    -none- numeric  
## call      3    -none- call     
## terms     3    terms  call     
## xlevels   0    -none- list
plot(lda_model)

# Mapping Biplot

library(MASS)
lda_model <- lda(class ~ ., data = data)
biplot(prcomp(predict(lda_model)$x))

# Evaluasi Model
## Akurasi Model

lda_model <- lda(class ~ ., data = clean_data)
prediksi <- predict(lda_model)$class
conf_matrix <- table(Prediksi = prediksi, Aktual = clean_data$class)
print(conf_matrix)
##         Aktual
## Prediksi   0   1   2
##        0  66  11   7
##        1  18 106  14
##        2  15   9 120
akurasi <- mean(prediksi == clean_data$class)
cat("Akurasi LDA:", round(akurasi * 100, 2), "%\n")
## Akurasi LDA: 79.78 %
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.4.2
## Loading required package: lattice
## 
## Attaching package: 'lattice'
## The following object is masked from 'package:boot':
## 
##     melanoma
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
actual <- factor(clean_data$class)
predicted <- factor(prediksi, levels = levels(actual))

cm <- confusionMatrix(prediksi, actual)

# TPR per kelas = sensitivity
cm$byClass[, "Sensitivity"]
##  Class: 0  Class: 1  Class: 2 
## 0.6666667 0.8412698 0.8510638
mean(cm$byClass[, "Sensitivity"])
## [1] 0.7863334
#FPR
mat <- as.matrix(cm$table)  # ambil confusion matrix
classes <- colnames(mat)

# fungsi untuk menghitung FPR per kelas
fpr_per_class <- function(cm) {
  sapply(1:ncol(cm), function(i) {
    FP <- sum(cm[i, -i])        # prediksi kelas i padahal bukan (baris i, kolom ≠ i)
    TN <- sum(cm[-i, -i])       # semua yang bukan kelas i dan diprediksi juga bukan
    FP / (FP + TN)
  })
}
mean(fpr_per_class(mat))
## [1] 0.1024719

Receiver Operating Characteristic (ROC)

true_labels <- as.factor(clean_data$class)
posterior <- predict(lda_model)$posterior  # Probabilitas per kelas

# ROC untuk setiap kelas (One vs Rest)
roc_list <- lapply(levels(true_labels), function(class) {
  roc(response = as.numeric(true_labels == class), predictor = posterior[, class])
})
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Tampilkan AUC per kelas
auc_values <- sapply(roc_list, auc)
names(auc_values) <- levels(true_labels)
print(auc_values)
##         0         1         2 
## 0.8848409 0.9103671 0.9335067
print(mean(auc_values))
## [1] 0.9095716
plot(roc_list[[1]], col = "red", main = "Multiclass ROC")
for (i in 2:length(roc_list)) {
  plot(roc_list[[i]], col = i + 1, add = TRUE)
}
legend("bottomright", legend = levels(true_labels), col = 2:(length(roc_list) + 1), lwd = 2)

Dari hasil evaluasi ROC, berdasarkan AUC bisa dilihat bahwa model tergolong sangat baik dalam menjelaskan data yaitu dengan nilai AUC tiap kelas berada di antara 90%.