Prepration package

library(nnet)
library(ggplot2)
library(corrplot)
## corrplot 0.95 loaded
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

Load Data

data <- read.csv("../../data/retina_word2vec_1_balanced_scaled.csv", header = TRUE)

PCA Data

numeric_data <- data[, 1:(ncol(data)-1)]
pca_result <- prcomp(numeric_data, scale. = TRUE)
summary(pca_result)
## Importance of components:
##                           PC1     PC2     PC3     PC4     PC5     PC6     PC7
## Standard deviation     4.0057 2.13472 1.76298 1.72025 1.48890 1.28805 1.23219
## Proportion of Variance 0.3209 0.09114 0.06216 0.05919 0.04434 0.03318 0.03037
## Cumulative Proportion  0.3209 0.41205 0.47421 0.53340 0.57774 0.61092 0.64128
##                            PC8     PC9    PC10   PC11    PC12    PC13    PC14
## Standard deviation     1.19404 1.14844 1.08272 1.0001 0.98263 0.94960 0.91939
## Proportion of Variance 0.02851 0.02638 0.02345 0.0200 0.01931 0.01803 0.01691
## Cumulative Proportion  0.66980 0.69618 0.71962 0.7396 0.75894 0.77697 0.79388
##                           PC15    PC16    PC17    PC18    PC19    PC20    PC21
## Standard deviation     0.87107 0.83091 0.78691 0.76556 0.75516 0.73786 0.71016
## Proportion of Variance 0.01518 0.01381 0.01238 0.01172 0.01141 0.01089 0.01009
## Cumulative Proportion  0.80905 0.82286 0.83525 0.84697 0.85837 0.86926 0.87935
##                           PC22    PC23    PC24    PC25    PC26    PC27    PC28
## Standard deviation     0.69172 0.65283 0.64192 0.62581 0.59520 0.58501 0.56174
## Proportion of Variance 0.00957 0.00852 0.00824 0.00783 0.00709 0.00684 0.00631
## Cumulative Proportion  0.88892 0.89744 0.90568 0.91351 0.92060 0.92745 0.93376
##                           PC29    PC30    PC31    PC32    PC33    PC34   PC35
## Standard deviation     0.54643 0.53709 0.51267 0.49925 0.46784 0.46289 0.4527
## Proportion of Variance 0.00597 0.00577 0.00526 0.00498 0.00438 0.00429 0.0041
## Cumulative Proportion  0.93973 0.94550 0.95075 0.95574 0.96012 0.96440 0.9685
##                           PC36   PC37    PC38    PC39    PC40   PC41    PC42
## Standard deviation     0.43684 0.4243 0.40068 0.39182 0.37132 0.3534 0.33384
## Proportion of Variance 0.00382 0.0036 0.00321 0.00307 0.00276 0.0025 0.00223
## Cumulative Proportion  0.97232 0.9759 0.97913 0.98220 0.98496 0.9875 0.98968
##                           PC43    PC44    PC45    PC46    PC47    PC48    PC49
## Standard deviation     0.32294 0.31042 0.29438 0.25675 0.24058 0.23098 0.19421
## Proportion of Variance 0.00209 0.00193 0.00173 0.00132 0.00116 0.00107 0.00075
## Cumulative Proportion  0.99177 0.99370 0.99543 0.99675 0.99791 0.99897 0.99973
##                           PC50
## Standard deviation     0.11679
## Proportion of Variance 0.00027
## Cumulative Proportion  1.00000
pca_transformed <- pca_result$x
pca_combined <- cbind(pca_transformed[, 1:10], class = data$class)
data <- as.data.frame(pca_combined)

EDA

Distribusi setiap variabel

barplot(table(data$class),
        main = "Distribusi Kelas",
        xlab = "Kelas",
        ylab = "Jumlah",
        col = "lightblue")

par(mfrow = c(2, 5))
for (i in 1:10) {
  hist(data[[i]], main = paste("Hist", names(data)[i]), ylab='', xlab = "", col = "skyblue")
}

par(mfrow = c(1, 1))  # reset layout
par(mfrow = c(2, 5))
for (i in 1:10) {
  boxplot(data[, i] ~ data$class, main = paste("Box", names(data)[i]), ylab='', xlab='')
}

par(mfrow = c(1, 1))  # reset layout

Asumsi

Multikolinearitas

# 4. Hitung matriks korelasi
correlation_matrix <- cor(data[,1:ncol(data)-1])

# 5. Visualisasi matriks korelasi (heatmap)
corrplot(correlation_matrix, method = "color", col = colorRampPalette(c("red", "white", "blue"))(200),
         tl.cex = 0.7, number.cex = 0.7, addrect = 2,
         title = "Matriks Korelasi antar Fitur", mar = c(0,0,1,0))

library(car)
## Loading required package: carData
data_cleaned <- data
vif_model <- lm(class ~ ., data = data_cleaned)
vif(vif_model)
##  PC1  PC2  PC3  PC4  PC5  PC6  PC7  PC8  PC9 PC10 
##    1    1    1    1    1    1    1    1    1    1

Tidak ada multikolinearitas antar PC

Outliers Multivariate Handle

Kami melakukan penanganan outliers dengan menghitung jarak mahalanobis terhadap rata-rata data. lalu menggunakan persebaran chi-square dengan tingkat kepercayaan sebesar 95% untuk mengindikasikan data kedalam outliers.

# Hitung Mahalanobis Distance
median_vector <- sapply(data, median)
d <- mahalanobis(data_cleaned, 
                 center = colMeans(data_cleaned), 
                 cov = cov(data_cleaned))


# Threshold chi-square
cutoff <- qchisq(0.95, df = ncol(data_cleaned))  # 95% confidence

# Tandai outlier
outliers <- which(d > cutoff)
plot(d, type = "h", main = "Mahalanobis Distance", ylab = "Distance")
abline(h = cutoff, col = "red")

non_outlierss <- data_cleaned[-outliers, ]

data_cleaned <- cbind(non_outlierss)

Linearitas Log-odds

feature_names <- names(data_cleaned)
formula_str <- paste("class ~", paste(feature_names, collapse = " + "))
model <- multinom(class ~ ., data = data_cleaned)
## # weights:  36 (22 variable)
## initial  value 418.571282 
## iter  10 value 343.949268
## iter  20 value 325.540058
## iter  30 value 325.283216
## iter  30 value 325.283213
## iter  30 value 325.283213
## final  value 325.283213 
## converged
lintest_data <- data_cleaned[, 1:ncol(data_cleaned)-1]
prob_pred <- predict(model, type = "prob")
# Class 1 sebagai Reference class
lintest_data$logit1 <- log(prob_pred[, 1]/prob_pred[, 2])
lintest_data$logit2 <- log(prob_pred[, 3]/prob_pred[, 2])

# Plot logit terhadap feature1
ggplot(lintest_data, aes(x = PC1, y = logit1)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "loess", color = "red") +
  labs(title = "Logit vs Mean Feature", x = "Mean Data", y = "Logit1") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

ggplot(lintest_data, aes(x = PC1, y = logit2)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "loess", color = "green") +
  labs(title = "Logit vs Mean Feature", x = "Mean Data", y = "Logit2") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

Untuk melihat linearitas dari logit multinom, kami menggunakan class 1 (Netral) sebagai kelas referensi. Dan dari hasil visualisasi tersebut kami berasumsi bahwa rata-rata data yang kami pakai merupakan data yang bersifat linear terhadap log-odds.

Modelling

data_final <- data_cleaned
# Fit model: prediksi class berdasarkan feature1 dan feature2
model <- multinom(class ~ ., data = data_final)
## # weights:  36 (22 variable)
## initial  value 418.571282 
## iter  10 value 343.949268
## iter  20 value 325.540058
## iter  30 value 325.283216
## iter  30 value 325.283213
## iter  30 value 325.283213
## final  value 325.283213 
## converged
# Ringkasan model
summary(model)
## Call:
## multinom(formula = class ~ ., data = data_final)
## 
## Coefficients:
##   (Intercept)        PC1          PC2        PC3        PC4       PC5
## 1  -0.1287636 -0.2343002 -0.315760743  0.2667005 0.09036364 0.2200210
## 2  -0.6435981 -0.4487465 -0.003191817 -0.1770911 0.60633537 0.6279564
##           PC6         PC7       PC8        PC9       PC10
## 1  0.09023505 -0.08568616 -0.201845 -0.1533926  0.2035451
## 2 -0.88156810  0.40996365 -0.163569  0.5593798 -0.3300155
## 
## Std. Errors:
##   (Intercept)        PC1        PC2       PC3       PC4       PC5       PC6
## 1   0.1631299 0.05161835 0.08725778 0.1083841 0.1041503 0.1255710 0.1477708
## 2   0.2137617 0.07128573 0.09781288 0.1206482 0.1339655 0.1464158 0.1836445
##         PC7       PC8       PC9      PC10
## 1 0.1328167 0.1497165 0.1399499 0.1668201
## 2 0.1518185 0.1739280 0.1728432 0.1977730
## 
## Residual Deviance: 650.5664 
## AIC: 694.5664
z <- summary(model)$coefficients / summary(model)$standard.errors
p <- 2 * (1 - pnorm(abs(z)))
print(p)
##   (Intercept)          PC1          PC2        PC3          PC4          PC5
## 1 0.429918281 5.649839e-06 0.0002960733 0.01386672 3.855983e-01 7.974550e-02
## 2 0.002605428 3.073222e-10 0.9739681521 0.14215041 6.009507e-06 1.795945e-05
##            PC6         PC7       PC8         PC9       PC10
## 1 5.414367e-01 0.518832494 0.1776002 0.273055342 0.22240895
## 2 1.583436e-06 0.006926574 0.3469916 0.001210722 0.09518517

Evalusi Model

Akurasi & Conf Matriks

clean_data <- data_final
prediksi <- predict(model, newdata = data_cleaned)
conf_matrix <- table(Prediksi = prediksi, Aktual = clean_data$class)
print(conf_matrix)
##         Aktual
## Prediksi   0   1   2
##        0  52  24  11
##        1  38  81  21
##        2  23  24 107
akurasi <- mean(prediksi == clean_data$class)
cat("Akurasi LDA:", round(akurasi * 100, 2), "%\n")
## Akurasi LDA: 62.99 %
library(caret)
## Loading required package: lattice
actual <- factor(clean_data$class)
predicted <- factor(prediksi, levels = levels(actual))

cm <- confusionMatrix(prediksi, actual)

# TPR per kelas = sensitivity
cm$byClass[, "Sensitivity"]
##  Class: 0  Class: 1  Class: 2 
## 0.4601770 0.6279070 0.7697842
mean(cm$byClass[, "Sensitivity"])
## [1] 0.6192894
#FPR
mat <- as.matrix(cm$table)  # ambil confusion matrix
classes <- colnames(mat)

# fungsi untuk menghitung FPR per kelas
fpr_per_class <- function(cm) {
  sapply(1:ncol(cm), function(i) {
    FP <- sum(cm[i, -i])        # prediksi kelas i padahal bukan (baris i, kolom ≠ i)
    TN <- sum(cm[-i, -i])       # semua yang bukan kelas i dan diprediksi juga bukan
    FP / (FP + TN)
  })
}
mean(fpr_per_class(mat))
## [1] 0.186313
predict_proba <- predict(model, type = "prob")
y_true <- as.factor(data_final$class)

roc_list <- lapply(levels(y_true), function(k) {
  roc(response = as.numeric(y_true == k), predictor = predict_proba[, k])
})
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc_values <- sapply(roc_list, auc)
names(auc_values) <- levels(y_true)
print('AUC per Class of PCA MNLR')
## [1] "AUC per Class of PCA MNLR"
print(auc_values)
##         0         1         2 
## 0.7742042 0.7801926 0.8528896
print('Mean AUC')
## [1] "Mean AUC"
print(mean(auc_values))
## [1] 0.8024288
plot(roc_list[[1]], col = "red", main = "Multiclass ROC PCA MNLR")
for (i in 2:length(roc_list)) {
  plot(roc_list[[i]], col = i + 1, add = TRUE)
}
legend("bottomright", legend = levels(y_true), col = 2:(length(roc_list) + 1), lwd = 2)

Berdasarkan hasil dari ROC, nilai AUC tergolong cukup baik yakni dengan rata-rata 80% pada setiap kelas.