library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.4.3
## corrplot 0.95 loaded
library(MVN)
## Warning: package 'MVN' was built under R version 4.4.3
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
library(heplots)
## Warning: package 'heplots' was built under R version 4.4.3
## Loading required package: broom
library(pROC)
## Warning: package 'pROC' was built under R version 4.4.3
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

Load Data

data <- read.csv("C:/Kuliah/semester 4/Analisis Multivariat/Projek Uas/data/retina_word2vec_1_balanced_scaled.csv", header = TRUE)

Principal Component Analysis (PCA)

pca_result <- prcomp(data[1:50], center = TRUE, scale. = TRUE)
summary(pca_result)
## Importance of components:
##                           PC1     PC2     PC3     PC4     PC5     PC6     PC7
## Standard deviation     4.0057 2.13472 1.76298 1.72025 1.48890 1.28805 1.23219
## Proportion of Variance 0.3209 0.09114 0.06216 0.05919 0.04434 0.03318 0.03037
## Cumulative Proportion  0.3209 0.41205 0.47421 0.53340 0.57774 0.61092 0.64128
##                            PC8     PC9    PC10   PC11    PC12    PC13    PC14
## Standard deviation     1.19404 1.14844 1.08272 1.0001 0.98263 0.94960 0.91939
## Proportion of Variance 0.02851 0.02638 0.02345 0.0200 0.01931 0.01803 0.01691
## Cumulative Proportion  0.66980 0.69618 0.71962 0.7396 0.75894 0.77697 0.79388
##                           PC15    PC16    PC17    PC18    PC19    PC20    PC21
## Standard deviation     0.87107 0.83091 0.78691 0.76556 0.75516 0.73786 0.71016
## Proportion of Variance 0.01518 0.01381 0.01238 0.01172 0.01141 0.01089 0.01009
## Cumulative Proportion  0.80905 0.82286 0.83525 0.84697 0.85837 0.86926 0.87935
##                           PC22    PC23    PC24    PC25    PC26    PC27    PC28
## Standard deviation     0.69172 0.65283 0.64192 0.62581 0.59520 0.58501 0.56174
## Proportion of Variance 0.00957 0.00852 0.00824 0.00783 0.00709 0.00684 0.00631
## Cumulative Proportion  0.88892 0.89744 0.90568 0.91351 0.92060 0.92745 0.93376
##                           PC29    PC30    PC31    PC32    PC33    PC34   PC35
## Standard deviation     0.54643 0.53709 0.51267 0.49925 0.46784 0.46289 0.4527
## Proportion of Variance 0.00597 0.00577 0.00526 0.00498 0.00438 0.00429 0.0041
## Cumulative Proportion  0.93973 0.94550 0.95075 0.95574 0.96012 0.96440 0.9685
##                           PC36   PC37    PC38    PC39    PC40   PC41    PC42
## Standard deviation     0.43684 0.4243 0.40068 0.39182 0.37132 0.3534 0.33384
## Proportion of Variance 0.00382 0.0036 0.00321 0.00307 0.00276 0.0025 0.00223
## Cumulative Proportion  0.97232 0.9759 0.97913 0.98220 0.98496 0.9875 0.98968
##                           PC43    PC44    PC45    PC46    PC47    PC48    PC49
## Standard deviation     0.32294 0.31042 0.29438 0.25675 0.24058 0.23098 0.19421
## Proportion of Variance 0.00209 0.00193 0.00173 0.00132 0.00116 0.00107 0.00075
## Cumulative Proportion  0.99177 0.99370 0.99543 0.99675 0.99791 0.99897 0.99973
##                           PC50
## Standard deviation     0.11679
## Proportion of Variance 0.00027
## Cumulative Proportion  1.00000

Berdasarkan nilai varians kumulatif, kami mengambil 10 Komponen Utama karena 10 komponen tersebut sudah mampu menjelaskan >70% variansi dari data.

pca_data <- as.data.frame(pca_result$x[, 1:10])
colnames(pca_data) <- paste0("PC", 1:10)
final_data <- cbind(pca_data, class = as.factor(data$class))

Exploratory Data Analysis (EDA)

Distribusi setiap variabel

fitur <- final_data[, 1:ncol(final_data)-1]
barplot(table(final_data$class),
        main = "Distribusi Kelas",
        xlab = "Kelas",
        ylab = "Jumlah",
        col = "lightblue")

Persebaran semua kelas sudah seimbang.

par(mfrow = c(2, 5))
for (i in 1:50) {
  hist(data[[i]], main = paste("Hist", names(data)[i]), ylab='', xlab = "", col = "skyblue")
}

par(mfrow = c(1, 1))  # reset layout
par(mfrow = c(2, 5))
for (i in 1:50) {
  boxplot(data[, i] ~ data$class, main = paste("Box", names(data)[i]), ylab='', xlab='')
}

par(mfrow = c(1, 1))  # reset layout

Asumsi

Uji Normalitas Multivariat

Uji Menggunakan Mardia Test (H0 = data mengikuti distribusi normal multivariat)

library(QuantPsyc)
## Warning: package 'QuantPsyc' was built under R version 4.4.3
## Loading required package: boot
## Loading required package: purrr
## 
## Attaching package: 'QuantPsyc'
## The following object is masked from 'package:base':
## 
##     norm
mult.norm(data)$mult.test
##          Beta-hat      kappa p-val
## Skewness 1078.086 81395.4841     0
## Kurtosis 4041.260   193.6964     0

Terlihat p-value yang bernilai 0 yang menunjukkan bahwa data tidak berdistribusi Normal. Selain itu jika dilihat dari nilai kurtosis dan skewness, distribusinya sangat jauh dari nilai distribusi normal.

Kami mencoba melakukan uji normalitas univariat pada masing-masing fitur menggunakan shapiro-wilk test dengan H0 = data mengikuti distribusi Normal.

cat("\n== Uji Normalitas Multivariat ==\n")
## 
## == Uji Normalitas Multivariat ==
shapiro_results <- apply(final_data[, 1:10], 2, function(x) shapiro.test(x)$p.value)
shapiro_results
##          PC1          PC2          PC3          PC4          PC5          PC6 
## 8.255248e-17 1.017185e-10 2.963185e-08 7.917934e-07 1.370670e-11 2.630571e-07 
##          PC7          PC8          PC9         PC10 
## 4.310177e-03 1.677905e-07 6.504445e-02 2.582335e-07
print('Fitur yang berdistribusi normal (p-value > 0.05)')
## [1] "Fitur yang berdistribusi normal (p-value > 0.05)"
shapiro_results[shapiro_results > 0.05]
##        PC9 
## 0.06504445

Dari hasil uji normalitas univariat, hanya PC9 yang berdistribusi Normal

Homogenitas Matriks Kovarian

Kami menggunakan metode Box’s M untuk melakukan uji homogenitas dengan H0 = data homogen.

cat("\n== Uji Homogenitas Matriks Kovarian (Box's M Test) ==\n")
## 
## == Uji Homogenitas Matriks Kovarian (Box's M Test) ==
boxm_result <- boxM(final_data[, 1:10], final_data$class)
print(boxm_result)
## 
##  Box's M-test for Homogeneity of Covariance Matrices
## 
## data:  final_data[, 1:10]
## Chi-Sq (approx.) = 690.37, df = 110, p-value < 2.2e-16

Berdasarkan nilai p-value yang sangat mendekati 0, menunjukkan bahwa data tidak homogen.

Usaha Pemenuhan Asumsi

Hapus outlier

Kami melakukan penghapusan outlier univariat dengan pendekatan Z-score. Kami menghapus data yang > 3 atau <-3, karena jika menggunakan threshold 2 akan banyak data yang terbuang. referensi: https://medium.com/@akashsri306/detecting-anomalies-with-z-scores-a-practical-approach-2f9a0f27458d

z_scores <- scale(fitur)
threshold <- 3
data_normal <- cbind(z_scores, class = final_data$class)
clean_data <- as.data.frame(data_normal[!rowSums(data_normal[, 1:ncol(data_normal)] > threshold | data_normal[, 1:ncol(data_normal)] < -threshold) > 0, ])
dim(clean_data)
## [1] 406  11
fitur_cleaned <- clean_data[, 1:ncol(clean_data)-1]
cat("\n== Uji Normalitas Multivariat ==\n")
## 
## == Uji Normalitas Multivariat ==
shapiro_results <- apply(fitur_cleaned, 2, function(x) shapiro.test(x)$p.value)
shapiro_results
##          PC1          PC2          PC3          PC4          PC5          PC6 
## 2.460240e-15 6.897091e-05 1.055899e-03 1.012391e-01 6.745121e-01 1.256420e-02 
##          PC7          PC8          PC9         PC10 
## 3.365758e-02 1.500229e-02 5.429327e-02 1.110877e-04
print('Fitur yang berdistribusi normal (p-value > 0.05)')
## [1] "Fitur yang berdistribusi normal (p-value > 0.05)"
shapiro_results[shapiro_results > 0.05]
##        PC4        PC5        PC9 
## 0.10123909 0.67451207 0.05429327

Secara uji normalitas univariat, beberapa fitur sudah berdistribusi normal berdasarkan hasil shapiro-wilk test.

mult.norm(fitur)$mult.test
##           Beta-hat      kappa p-val
## Skewness  35.38211 2671.34894     0
## Kurtosis 197.55743   53.27665     0

Namun dari Uji Normalitas Multiveriat, menunjukkan hasil yang kurang lebih sama seperti sebelumnya, yaitu masih jauh dari kata Normal.

Model Linear Discriminant Analysis (LDA)

cat("\n== Model LDA ==\n")
## 
## == Model LDA ==
lda_model <- lda(class ~ ., data = clean_data)
summary(lda_model)
##         Length Class  Mode     
## prior    3     -none- numeric  
## counts   3     -none- numeric  
## means   30     -none- numeric  
## scaling 20     -none- numeric  
## lev      3     -none- character
## svd      2     -none- numeric  
## N        1     -none- numeric  
## call     3     -none- call     
## terms    3     terms  call     
## xlevels  0     -none- list
plot(lda_model)

Mapping Biplot

library(factoextra)
## Warning: package 'factoextra' was built under R version 4.4.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.4.2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
fviz_pca_biplot(pca_result, 
                label = "var",                
                habillage = data$class, 
                addEllipses = TRUE,           
                palette = "jco")               

Evaluasi Model

Akurasi Model

prediksi <- predict(lda_model)$class
conf_matrix <- table(Prediksi = prediksi, Aktual = clean_data$class)
print(conf_matrix)
##         Aktual
## Prediksi   1   2   3
##        1  55  21  13
##        2  43  85  26
##        3  26  32 105
akurasi <- mean(prediksi == clean_data$class)
cat("Akurasi LDA:", round(akurasi * 100, 2), "%\n")
## Akurasi LDA: 60.34 %
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: lattice
## 
## Attaching package: 'lattice'
## The following object is masked from 'package:boot':
## 
##     melanoma
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
actual <- factor(clean_data$class)
predicted <- factor(prediksi, levels = levels(actual))

cm <- confusionMatrix(prediksi, actual)

# TPR per kelas = sensitivity
cm$byClass[, "Sensitivity"]
##  Class: 1  Class: 2  Class: 3 
## 0.4435484 0.6159420 0.7291667
mean(cm$byClass[, "Sensitivity"])
## [1] 0.596219
#FPR
mat <- as.matrix(cm$table)  # ambil confusion matrix
classes <- colnames(mat)

# fungsi untuk menghitung FPR per kelas
fpr_per_class <- function(cm) {
  sapply(1:ncol(cm), function(i) {
    FP <- sum(cm[i, -i])        # prediksi kelas i padahal bukan (baris i, kolom ≠ i)
    TN <- sum(cm[-i, -i])       # semua yang bukan kelas i dan diprediksi juga bukan
    FP / (FP + TN)
  })
}
mean(fpr_per_class(mat))
## [1] 0.1998014

Receiver Operating Characteristic (ROC)

true_labels <- as.factor(clean_data$class)
posterior <- predict(lda_model)$posterior  # Probabilitas per kelas

# ROC untuk setiap kelas (One vs Rest)
roc_list <- lapply(levels(true_labels), function(class) {
  roc(response = as.numeric(true_labels == class), predictor = posterior[, class])
})
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Tampilkan AUC per kelas
auc_values <- sapply(roc_list, auc)
names(auc_values) <- levels(true_labels)
print(auc_values)
##         1         2         3 
## 0.7644990 0.7437946 0.7985714
print(mean(auc_values))
## [1] 0.768955
plot(roc_list[[1]], col = "red", main = "Multiclass ROC")
for (i in 2:length(roc_list)) {
  plot(roc_list[[i]], col = i + 1, add = TRUE)
}
legend("bottomright", legend = levels(true_labels), col = 2:(length(roc_list) + 1), lwd = 2)

Dari hasil evaluasi ROC, berdasarkan AUC bisa dilihat bahwa model tergolong lumayan dalam menjelaskan data yaitu dengan nilai AUC tiap kelas berada di antara >70%. Namu dari hasil akurasi keseluruhan model tergolong kurang karena hanya memiliki akurasi 60% dalam menjelaskan data. Hal ini disebabkan karena dari hasil confussion matrix model sulit membedakan kelas 0(negatif) dan 1(netral).