Lofian Rafi Qurrotul Ain (23031554221)
Mata
Kuliah: Analisis Multivariat
Dosen: Ike Fitriyaningsih, M.Si
NIDN : 0109049001
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.4.3
## corrplot 0.95 loaded
library(MVN)
## Warning: package 'MVN' was built under R version 4.4.3
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(heplots)
## Warning: package 'heplots' was built under R version 4.4.3
## Loading required package: broom
library(pROC)
## Warning: package 'pROC' was built under R version 4.4.3
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
data <- read.csv("C:/Kuliah/semester 4/Analisis Multivariat/Projek Uas/data/retina_word2vec_1_balanced_scaled.csv", header = TRUE)
fitur <- data[, 1:ncol(data)-1]
barplot(table(data$class),
main = "Distribusi Kelas",
xlab = "Kelas",
ylab = "Jumlah",
col = "lightblue")
Persebaran semua kelas sudah seimbang.
par(mfrow = c(2, 5))
for (i in 1:50) {
hist(data[[i]], main = paste("Hist", names(data)[i]), ylab='', xlab = "", col = "skyblue")
}
par(mfrow = c(1, 1)) # reset layout
par(mfrow = c(2, 5))
for (i in 1:50) {
boxplot(data[, i] ~ data$class, main = paste("Box", names(data)[i]), ylab='', xlab='')
}
par(mfrow = c(1, 1)) # reset layout
Uji Menggunakan Mardia Test (H0 = data mengikuti distribusi normal multivariat)
library(QuantPsyc)
## Warning: package 'QuantPsyc' was built under R version 4.4.3
## Loading required package: boot
## Loading required package: purrr
##
## Attaching package: 'QuantPsyc'
## The following object is masked from 'package:base':
##
## norm
mult.norm(data)$mult.test
## Beta-hat kappa p-val
## Skewness 1078.086 81395.4841 0
## Kurtosis 4041.260 193.6964 0
Terlihat p-value yang bernilai 0 yang menunjukkan bahwa data tidak berdistribusi Normal. Selain itu jika dilihat dari nilai kurtosis dan skewness, distribusinya sangat jauh dari nilai distribusi normal.
Kami mencoba melakukan uji normalitas univariat pada masing-masing fitur menggunakan shapiro-wilk test dengan H0 = data mengikuti distribusi Normal.
cat("\n== Uji Normalitas Multivariat ==\n")
##
## == Uji Normalitas Multivariat ==
shapiro_results <- apply(fitur, 2, function(x) shapiro.test(x)$p.value)
shapiro_results
## X0 X1 X2 X3 X4 X5
## 6.963906e-07 1.604687e-14 1.640520e-05 3.148654e-15 1.003173e-14 1.478672e-16
## X6 X7 X8 X9 X10 X11
## 4.427764e-04 4.972329e-05 5.035307e-10 1.294207e-04 9.740120e-13 9.155725e-15
## X12 X13 X14 X15 X16 X17
## 1.099835e-10 2.527609e-13 3.753918e-06 2.523177e-13 3.063433e-09 3.533561e-08
## X18 X19 X20 X21 X22 X23
## 9.434544e-10 6.863818e-12 1.188802e-09 8.663981e-16 3.961241e-17 6.595248e-07
## X24 X25 X26 X27 X28 X29
## 2.551322e-10 3.780256e-16 7.166485e-06 2.910042e-13 1.792330e-03 5.935991e-16
## X30 X31 X32 X33 X34 X35
## 3.439895e-11 9.732520e-16 3.090199e-09 4.226251e-16 1.193515e-17 8.768592e-16
## X36 X37 X38 X39 X40 X41
## 1.297957e-12 1.988906e-15 5.437568e-17 3.662327e-15 2.360782e-09 7.779772e-09
## X42 X43 X44 X45 X46 X47
## 5.150693e-04 1.198235e-12 1.037975e-09 9.667379e-16 3.717822e-16 2.934634e-07
## X48 X49
## 7.512918e-16 2.565761e-04
print('Fitur yang berdistribusi normal (p-value > 0.05)')
## [1] "Fitur yang berdistribusi normal (p-value > 0.05)"
shapiro_results[shapiro_results > 0.05]
## named numeric(0)
dari hasil diatas, bahkan tidak ada satupun fitur yang mengikuti distribusi normal.
Kami menggunakan metode Box’s M untuk melakukan uji homogenitas dengan H0 = data homogen.
cat("\n== Uji Homogenitas Matriks Kovarian (Box's M Test) ==\n")
##
## == Uji Homogenitas Matriks Kovarian (Box's M Test) ==
boxm_result <- boxM(data[, 1:50], data$class)
print(boxm_result)
##
## Box's M-test for Homogeneity of Covariance Matrices
##
## data: data[, 1:50]
## Chi-Sq (approx.) = 9523.1, df = 2550, p-value < 2.2e-16
Berdasarkan nilai p-value yang sangat mendekati 0, menunjukkan bahwa data tidak homogen.
cat("\n== Model LDA ==\n")
##
## == Model LDA ==
lda_model <- lda(class ~ ., data = clean_data)
summary(lda_model)
## Length Class Mode
## prior 3 -none- numeric
## counts 3 -none- numeric
## means 150 -none- numeric
## scaling 100 -none- numeric
## lev 3 -none- character
## svd 2 -none- numeric
## N 1 -none- numeric
## call 3 -none- call
## terms 3 terms call
## xlevels 0 -none- list
plot(lda_model)
# Mapping Biplot
library(MASS)
lda_model <- lda(class ~ ., data = data)
biplot(prcomp(predict(lda_model)$x))
# Evaluasi Model
## Akurasi Model
lda_model <- lda(class ~ ., data = clean_data)
prediksi <- predict(lda_model)$class
conf_matrix <- table(Prediksi = prediksi, Aktual = clean_data$class)
print(conf_matrix)
## Aktual
## Prediksi 0 1 2
## 0 66 11 7
## 1 18 106 14
## 2 15 9 120
akurasi <- mean(prediksi == clean_data$class)
cat("Akurasi LDA:", round(akurasi * 100, 2), "%\n")
## Akurasi LDA: 79.78 %
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.4.2
## Loading required package: lattice
##
## Attaching package: 'lattice'
## The following object is masked from 'package:boot':
##
## melanoma
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
actual <- factor(clean_data$class)
predicted <- factor(prediksi, levels = levels(actual))
cm <- confusionMatrix(prediksi, actual)
# TPR per kelas = sensitivity
cm$byClass[, "Sensitivity"]
## Class: 0 Class: 1 Class: 2
## 0.6666667 0.8412698 0.8510638
mean(cm$byClass[, "Sensitivity"])
## [1] 0.7863334
#FPR
mat <- as.matrix(cm$table) # ambil confusion matrix
classes <- colnames(mat)
# fungsi untuk menghitung FPR per kelas
fpr_per_class <- function(cm) {
sapply(1:ncol(cm), function(i) {
FP <- sum(cm[i, -i]) # prediksi kelas i padahal bukan (baris i, kolom ≠ i)
TN <- sum(cm[-i, -i]) # semua yang bukan kelas i dan diprediksi juga bukan
FP / (FP + TN)
})
}
mean(fpr_per_class(mat))
## [1] 0.1024719
true_labels <- as.factor(clean_data$class)
posterior <- predict(lda_model)$posterior # Probabilitas per kelas
# ROC untuk setiap kelas (One vs Rest)
roc_list <- lapply(levels(true_labels), function(class) {
roc(response = as.numeric(true_labels == class), predictor = posterior[, class])
})
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Tampilkan AUC per kelas
auc_values <- sapply(roc_list, auc)
names(auc_values) <- levels(true_labels)
print(auc_values)
## 0 1 2
## 0.8848409 0.9103671 0.9335067
print(mean(auc_values))
## [1] 0.9095716
plot(roc_list[[1]], col = "red", main = "Multiclass ROC")
for (i in 2:length(roc_list)) {
plot(roc_list[[i]], col = i + 1, add = TRUE)
}
legend("bottomright", legend = levels(true_labels), col = 2:(length(roc_list) + 1), lwd = 2)
Dari hasil evaluasi ROC, berdasarkan AUC bisa dilihat bahwa model
tergolong sangat baik dalam menjelaskan data yaitu dengan nilai AUC tiap
kelas berada di antara 90%.