M. Raihan Firdaus (23031554164)
Mata Kuliah:
Analisis Multivariat
Dosen: Ike Fitriyaningsih, M.Si
NIDN :
0109049001
library(nnet)
library(ggplot2)
library(corrplot)
## corrplot 0.95 loaded
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
data <- read.csv("../../data/retina_word2vec_1_balanced_scaled.csv", header = TRUE)
numeric_data <- data[, 1:(ncol(data)-1)]
pca_result <- prcomp(numeric_data, scale. = TRUE)
summary(pca_result)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 4.0057 2.13472 1.76298 1.72025 1.48890 1.28805 1.23219
## Proportion of Variance 0.3209 0.09114 0.06216 0.05919 0.04434 0.03318 0.03037
## Cumulative Proportion 0.3209 0.41205 0.47421 0.53340 0.57774 0.61092 0.64128
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 1.19404 1.14844 1.08272 1.0001 0.98263 0.94960 0.91939
## Proportion of Variance 0.02851 0.02638 0.02345 0.0200 0.01931 0.01803 0.01691
## Cumulative Proportion 0.66980 0.69618 0.71962 0.7396 0.75894 0.77697 0.79388
## PC15 PC16 PC17 PC18 PC19 PC20 PC21
## Standard deviation 0.87107 0.83091 0.78691 0.76556 0.75516 0.73786 0.71016
## Proportion of Variance 0.01518 0.01381 0.01238 0.01172 0.01141 0.01089 0.01009
## Cumulative Proportion 0.80905 0.82286 0.83525 0.84697 0.85837 0.86926 0.87935
## PC22 PC23 PC24 PC25 PC26 PC27 PC28
## Standard deviation 0.69172 0.65283 0.64192 0.62581 0.59520 0.58501 0.56174
## Proportion of Variance 0.00957 0.00852 0.00824 0.00783 0.00709 0.00684 0.00631
## Cumulative Proportion 0.88892 0.89744 0.90568 0.91351 0.92060 0.92745 0.93376
## PC29 PC30 PC31 PC32 PC33 PC34 PC35
## Standard deviation 0.54643 0.53709 0.51267 0.49925 0.46784 0.46289 0.4527
## Proportion of Variance 0.00597 0.00577 0.00526 0.00498 0.00438 0.00429 0.0041
## Cumulative Proportion 0.93973 0.94550 0.95075 0.95574 0.96012 0.96440 0.9685
## PC36 PC37 PC38 PC39 PC40 PC41 PC42
## Standard deviation 0.43684 0.4243 0.40068 0.39182 0.37132 0.3534 0.33384
## Proportion of Variance 0.00382 0.0036 0.00321 0.00307 0.00276 0.0025 0.00223
## Cumulative Proportion 0.97232 0.9759 0.97913 0.98220 0.98496 0.9875 0.98968
## PC43 PC44 PC45 PC46 PC47 PC48 PC49
## Standard deviation 0.32294 0.31042 0.29438 0.25675 0.24058 0.23098 0.19421
## Proportion of Variance 0.00209 0.00193 0.00173 0.00132 0.00116 0.00107 0.00075
## Cumulative Proportion 0.99177 0.99370 0.99543 0.99675 0.99791 0.99897 0.99973
## PC50
## Standard deviation 0.11679
## Proportion of Variance 0.00027
## Cumulative Proportion 1.00000
pca_transformed <- pca_result$x
pca_combined <- cbind(pca_transformed[, 1:10], class = data$class)
data <- as.data.frame(pca_combined)
barplot(table(data$class),
main = "Distribusi Kelas",
xlab = "Kelas",
ylab = "Jumlah",
col = "lightblue")
par(mfrow = c(2, 5))
for (i in 1:10) {
hist(data[[i]], main = paste("Hist", names(data)[i]), ylab='', xlab = "", col = "skyblue")
}
par(mfrow = c(1, 1)) # reset layout
par(mfrow = c(2, 5))
for (i in 1:10) {
boxplot(data[, i] ~ data$class, main = paste("Box", names(data)[i]), ylab='', xlab='')
}
par(mfrow = c(1, 1)) # reset layout
# 4. Hitung matriks korelasi
correlation_matrix <- cor(data[,1:ncol(data)-1])
# 5. Visualisasi matriks korelasi (heatmap)
corrplot(correlation_matrix, method = "color", col = colorRampPalette(c("red", "white", "blue"))(200),
tl.cex = 0.7, number.cex = 0.7, addrect = 2,
title = "Matriks Korelasi antar Fitur", mar = c(0,0,1,0))
library(car)
## Loading required package: carData
data_cleaned <- data
vif_model <- lm(class ~ ., data = data_cleaned)
vif(vif_model)
## PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10
## 1 1 1 1 1 1 1 1 1 1
Tidak ada multikolinearitas antar PC
Kami melakukan penanganan outliers dengan menghitung jarak mahalanobis terhadap rata-rata data. lalu menggunakan persebaran chi-square dengan tingkat kepercayaan sebesar 95% untuk mengindikasikan data kedalam outliers.
# Hitung Mahalanobis Distance
median_vector <- sapply(data, median)
d <- mahalanobis(data_cleaned,
center = colMeans(data_cleaned),
cov = cov(data_cleaned))
# Threshold chi-square
cutoff <- qchisq(0.95, df = ncol(data_cleaned)) # 95% confidence
# Tandai outlier
outliers <- which(d > cutoff)
plot(d, type = "h", main = "Mahalanobis Distance", ylab = "Distance")
abline(h = cutoff, col = "red")
non_outlierss <- data_cleaned[-outliers, ]
data_cleaned <- cbind(non_outlierss)
feature_names <- names(data_cleaned)
formula_str <- paste("class ~", paste(feature_names, collapse = " + "))
model <- multinom(class ~ ., data = data_cleaned)
## # weights: 36 (22 variable)
## initial value 418.571282
## iter 10 value 343.949268
## iter 20 value 325.540058
## iter 30 value 325.283216
## iter 30 value 325.283213
## iter 30 value 325.283213
## final value 325.283213
## converged
lintest_data <- data_cleaned[, 1:ncol(data_cleaned)-1]
prob_pred <- predict(model, type = "prob")
# Class 1 sebagai Reference class
lintest_data$logit1 <- log(prob_pred[, 1]/prob_pred[, 2])
lintest_data$logit2 <- log(prob_pred[, 3]/prob_pred[, 2])
# Plot logit terhadap feature1
ggplot(lintest_data, aes(x = PC1, y = logit1)) +
geom_point(alpha = 0.5) +
geom_smooth(method = "loess", color = "red") +
labs(title = "Logit vs Mean Feature", x = "Mean Data", y = "Logit1") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
ggplot(lintest_data, aes(x = PC1, y = logit2)) +
geom_point(alpha = 0.5) +
geom_smooth(method = "loess", color = "green") +
labs(title = "Logit vs Mean Feature", x = "Mean Data", y = "Logit2") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
Untuk melihat linearitas dari logit multinom, kami menggunakan class 1 (Netral) sebagai kelas referensi. Dan dari hasil visualisasi tersebut kami berasumsi bahwa rata-rata data yang kami pakai merupakan data yang bersifat linear terhadap log-odds.
data_final <- data_cleaned
# Fit model: prediksi class berdasarkan feature1 dan feature2
model <- multinom(class ~ ., data = data_final)
## # weights: 36 (22 variable)
## initial value 418.571282
## iter 10 value 343.949268
## iter 20 value 325.540058
## iter 30 value 325.283216
## iter 30 value 325.283213
## iter 30 value 325.283213
## final value 325.283213
## converged
# Ringkasan model
summary(model)
## Call:
## multinom(formula = class ~ ., data = data_final)
##
## Coefficients:
## (Intercept) PC1 PC2 PC3 PC4 PC5
## 1 -0.1287636 -0.2343002 -0.315760743 0.2667005 0.09036364 0.2200210
## 2 -0.6435981 -0.4487465 -0.003191817 -0.1770911 0.60633537 0.6279564
## PC6 PC7 PC8 PC9 PC10
## 1 0.09023505 -0.08568616 -0.201845 -0.1533926 0.2035451
## 2 -0.88156810 0.40996365 -0.163569 0.5593798 -0.3300155
##
## Std. Errors:
## (Intercept) PC1 PC2 PC3 PC4 PC5 PC6
## 1 0.1631299 0.05161835 0.08725778 0.1083841 0.1041503 0.1255710 0.1477708
## 2 0.2137617 0.07128573 0.09781288 0.1206482 0.1339655 0.1464158 0.1836445
## PC7 PC8 PC9 PC10
## 1 0.1328167 0.1497165 0.1399499 0.1668201
## 2 0.1518185 0.1739280 0.1728432 0.1977730
##
## Residual Deviance: 650.5664
## AIC: 694.5664
z <- summary(model)$coefficients / summary(model)$standard.errors
p <- 2 * (1 - pnorm(abs(z)))
print(p)
## (Intercept) PC1 PC2 PC3 PC4 PC5
## 1 0.429918281 5.649839e-06 0.0002960733 0.01386672 3.855983e-01 7.974550e-02
## 2 0.002605428 3.073222e-10 0.9739681521 0.14215041 6.009507e-06 1.795945e-05
## PC6 PC7 PC8 PC9 PC10
## 1 5.414367e-01 0.518832494 0.1776002 0.273055342 0.22240895
## 2 1.583436e-06 0.006926574 0.3469916 0.001210722 0.09518517
clean_data <- data_final
prediksi <- predict(model, newdata = data_cleaned)
conf_matrix <- table(Prediksi = prediksi, Aktual = clean_data$class)
print(conf_matrix)
## Aktual
## Prediksi 0 1 2
## 0 52 24 11
## 1 38 81 21
## 2 23 24 107
akurasi <- mean(prediksi == clean_data$class)
cat("Akurasi LDA:", round(akurasi * 100, 2), "%\n")
## Akurasi LDA: 62.99 %
library(caret)
## Loading required package: lattice
actual <- factor(clean_data$class)
predicted <- factor(prediksi, levels = levels(actual))
cm <- confusionMatrix(prediksi, actual)
# TPR per kelas = sensitivity
cm$byClass[, "Sensitivity"]
## Class: 0 Class: 1 Class: 2
## 0.4601770 0.6279070 0.7697842
mean(cm$byClass[, "Sensitivity"])
## [1] 0.6192894
#FPR
mat <- as.matrix(cm$table) # ambil confusion matrix
classes <- colnames(mat)
# fungsi untuk menghitung FPR per kelas
fpr_per_class <- function(cm) {
sapply(1:ncol(cm), function(i) {
FP <- sum(cm[i, -i]) # prediksi kelas i padahal bukan (baris i, kolom ≠ i)
TN <- sum(cm[-i, -i]) # semua yang bukan kelas i dan diprediksi juga bukan
FP / (FP + TN)
})
}
mean(fpr_per_class(mat))
## [1] 0.186313
predict_proba <- predict(model, type = "prob")
y_true <- as.factor(data_final$class)
roc_list <- lapply(levels(y_true), function(k) {
roc(response = as.numeric(y_true == k), predictor = predict_proba[, k])
})
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc_values <- sapply(roc_list, auc)
names(auc_values) <- levels(y_true)
print('AUC per Class of PCA MNLR')
## [1] "AUC per Class of PCA MNLR"
print(auc_values)
## 0 1 2
## 0.7742042 0.7801926 0.8528896
print('Mean AUC')
## [1] "Mean AUC"
print(mean(auc_values))
## [1] 0.8024288
plot(roc_list[[1]], col = "red", main = "Multiclass ROC PCA MNLR")
for (i in 2:length(roc_list)) {
plot(roc_list[[i]], col = i + 1, add = TRUE)
}
legend("bottomright", legend = levels(y_true), col = 2:(length(roc_list) + 1), lwd = 2)
Berdasarkan hasil dari ROC, nilai AUC tergolong cukup baik yakni dengan
rata-rata 80% pada setiap kelas.