Moch Faiz Febriawan (23031554068)
Mata
Kuliah: Analisis Multivariat
Dosen: Ike Fitriyaningsih, M.Si
NIDN : 0109049001
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.4.3
## corrplot 0.95 loaded
library(MVN)
## Warning: package 'MVN' was built under R version 4.4.3
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(heplots)
## Warning: package 'heplots' was built under R version 4.4.3
## Loading required package: broom
library(pROC)
## Warning: package 'pROC' was built under R version 4.4.3
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
data <- read.csv("C:/Kuliah/semester 4/Analisis Multivariat/Projek Uas/data/retina_word2vec_1_balanced_scaled.csv", header = TRUE)
pca_result <- prcomp(data[1:50], center = TRUE, scale. = TRUE)
summary(pca_result)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 4.0057 2.13472 1.76298 1.72025 1.48890 1.28805 1.23219
## Proportion of Variance 0.3209 0.09114 0.06216 0.05919 0.04434 0.03318 0.03037
## Cumulative Proportion 0.3209 0.41205 0.47421 0.53340 0.57774 0.61092 0.64128
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 1.19404 1.14844 1.08272 1.0001 0.98263 0.94960 0.91939
## Proportion of Variance 0.02851 0.02638 0.02345 0.0200 0.01931 0.01803 0.01691
## Cumulative Proportion 0.66980 0.69618 0.71962 0.7396 0.75894 0.77697 0.79388
## PC15 PC16 PC17 PC18 PC19 PC20 PC21
## Standard deviation 0.87107 0.83091 0.78691 0.76556 0.75516 0.73786 0.71016
## Proportion of Variance 0.01518 0.01381 0.01238 0.01172 0.01141 0.01089 0.01009
## Cumulative Proportion 0.80905 0.82286 0.83525 0.84697 0.85837 0.86926 0.87935
## PC22 PC23 PC24 PC25 PC26 PC27 PC28
## Standard deviation 0.69172 0.65283 0.64192 0.62581 0.59520 0.58501 0.56174
## Proportion of Variance 0.00957 0.00852 0.00824 0.00783 0.00709 0.00684 0.00631
## Cumulative Proportion 0.88892 0.89744 0.90568 0.91351 0.92060 0.92745 0.93376
## PC29 PC30 PC31 PC32 PC33 PC34 PC35
## Standard deviation 0.54643 0.53709 0.51267 0.49925 0.46784 0.46289 0.4527
## Proportion of Variance 0.00597 0.00577 0.00526 0.00498 0.00438 0.00429 0.0041
## Cumulative Proportion 0.93973 0.94550 0.95075 0.95574 0.96012 0.96440 0.9685
## PC36 PC37 PC38 PC39 PC40 PC41 PC42
## Standard deviation 0.43684 0.4243 0.40068 0.39182 0.37132 0.3534 0.33384
## Proportion of Variance 0.00382 0.0036 0.00321 0.00307 0.00276 0.0025 0.00223
## Cumulative Proportion 0.97232 0.9759 0.97913 0.98220 0.98496 0.9875 0.98968
## PC43 PC44 PC45 PC46 PC47 PC48 PC49
## Standard deviation 0.32294 0.31042 0.29438 0.25675 0.24058 0.23098 0.19421
## Proportion of Variance 0.00209 0.00193 0.00173 0.00132 0.00116 0.00107 0.00075
## Cumulative Proportion 0.99177 0.99370 0.99543 0.99675 0.99791 0.99897 0.99973
## PC50
## Standard deviation 0.11679
## Proportion of Variance 0.00027
## Cumulative Proportion 1.00000
Berdasarkan nilai varians kumulatif, kami mengambil 10 Komponen Utama karena 10 komponen tersebut sudah mampu menjelaskan >70% variansi dari data.
pca_data <- as.data.frame(pca_result$x[, 1:10])
colnames(pca_data) <- paste0("PC", 1:10)
final_data <- cbind(pca_data, class = as.factor(data$class))
fitur <- final_data[, 1:ncol(final_data)-1]
barplot(table(final_data$class),
main = "Distribusi Kelas",
xlab = "Kelas",
ylab = "Jumlah",
col = "lightblue")
Persebaran semua kelas sudah seimbang.
par(mfrow = c(2, 5))
for (i in 1:50) {
hist(data[[i]], main = paste("Hist", names(data)[i]), ylab='', xlab = "", col = "skyblue")
}
par(mfrow = c(1, 1)) # reset layout
par(mfrow = c(2, 5))
for (i in 1:50) {
boxplot(data[, i] ~ data$class, main = paste("Box", names(data)[i]), ylab='', xlab='')
}
par(mfrow = c(1, 1)) # reset layout
Uji Menggunakan Mardia Test (H0 = data mengikuti distribusi normal multivariat)
library(QuantPsyc)
## Warning: package 'QuantPsyc' was built under R version 4.4.3
## Loading required package: boot
## Loading required package: purrr
##
## Attaching package: 'QuantPsyc'
## The following object is masked from 'package:base':
##
## norm
mult.norm(data)$mult.test
## Beta-hat kappa p-val
## Skewness 1078.086 81395.4841 0
## Kurtosis 4041.260 193.6964 0
Terlihat p-value yang bernilai 0 yang menunjukkan bahwa data tidak berdistribusi Normal. Selain itu jika dilihat dari nilai kurtosis dan skewness, distribusinya sangat jauh dari nilai distribusi normal.
Kami mencoba melakukan uji normalitas univariat pada masing-masing fitur menggunakan shapiro-wilk test dengan H0 = data mengikuti distribusi Normal.
cat("\n== Uji Normalitas Multivariat ==\n")
##
## == Uji Normalitas Multivariat ==
shapiro_results <- apply(final_data[, 1:10], 2, function(x) shapiro.test(x)$p.value)
shapiro_results
## PC1 PC2 PC3 PC4 PC5 PC6
## 8.255248e-17 1.017185e-10 2.963185e-08 7.917934e-07 1.370670e-11 2.630571e-07
## PC7 PC8 PC9 PC10
## 4.310177e-03 1.677905e-07 6.504445e-02 2.582335e-07
print('Fitur yang berdistribusi normal (p-value > 0.05)')
## [1] "Fitur yang berdistribusi normal (p-value > 0.05)"
shapiro_results[shapiro_results > 0.05]
## PC9
## 0.06504445
Dari hasil uji normalitas univariat, hanya PC9 yang berdistribusi Normal
Kami menggunakan metode Box’s M untuk melakukan uji homogenitas dengan H0 = data homogen.
cat("\n== Uji Homogenitas Matriks Kovarian (Box's M Test) ==\n")
##
## == Uji Homogenitas Matriks Kovarian (Box's M Test) ==
boxm_result <- boxM(final_data[, 1:10], final_data$class)
print(boxm_result)
##
## Box's M-test for Homogeneity of Covariance Matrices
##
## data: final_data[, 1:10]
## Chi-Sq (approx.) = 690.37, df = 110, p-value < 2.2e-16
Berdasarkan nilai p-value yang sangat mendekati 0, menunjukkan bahwa data tidak homogen.
cat("\n== Model LDA ==\n")
##
## == Model LDA ==
lda_model <- lda(class ~ ., data = clean_data)
summary(lda_model)
## Length Class Mode
## prior 3 -none- numeric
## counts 3 -none- numeric
## means 30 -none- numeric
## scaling 20 -none- numeric
## lev 3 -none- character
## svd 2 -none- numeric
## N 1 -none- numeric
## call 3 -none- call
## terms 3 terms call
## xlevels 0 -none- list
plot(lda_model)
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.4.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.4.2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
fviz_pca_biplot(pca_result,
label = "var",
habillage = data$class,
addEllipses = TRUE,
palette = "jco")
prediksi <- predict(lda_model)$class
conf_matrix <- table(Prediksi = prediksi, Aktual = clean_data$class)
print(conf_matrix)
## Aktual
## Prediksi 1 2 3
## 1 55 21 13
## 2 43 85 26
## 3 26 32 105
akurasi <- mean(prediksi == clean_data$class)
cat("Akurasi LDA:", round(akurasi * 100, 2), "%\n")
## Akurasi LDA: 60.34 %
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: lattice
##
## Attaching package: 'lattice'
## The following object is masked from 'package:boot':
##
## melanoma
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
actual <- factor(clean_data$class)
predicted <- factor(prediksi, levels = levels(actual))
cm <- confusionMatrix(prediksi, actual)
# TPR per kelas = sensitivity
cm$byClass[, "Sensitivity"]
## Class: 1 Class: 2 Class: 3
## 0.4435484 0.6159420 0.7291667
mean(cm$byClass[, "Sensitivity"])
## [1] 0.596219
#FPR
mat <- as.matrix(cm$table) # ambil confusion matrix
classes <- colnames(mat)
# fungsi untuk menghitung FPR per kelas
fpr_per_class <- function(cm) {
sapply(1:ncol(cm), function(i) {
FP <- sum(cm[i, -i]) # prediksi kelas i padahal bukan (baris i, kolom ≠ i)
TN <- sum(cm[-i, -i]) # semua yang bukan kelas i dan diprediksi juga bukan
FP / (FP + TN)
})
}
mean(fpr_per_class(mat))
## [1] 0.1998014
true_labels <- as.factor(clean_data$class)
posterior <- predict(lda_model)$posterior # Probabilitas per kelas
# ROC untuk setiap kelas (One vs Rest)
roc_list <- lapply(levels(true_labels), function(class) {
roc(response = as.numeric(true_labels == class), predictor = posterior[, class])
})
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Tampilkan AUC per kelas
auc_values <- sapply(roc_list, auc)
names(auc_values) <- levels(true_labels)
print(auc_values)
## 1 2 3
## 0.7644990 0.7437946 0.7985714
print(mean(auc_values))
## [1] 0.768955
plot(roc_list[[1]], col = "red", main = "Multiclass ROC")
for (i in 2:length(roc_list)) {
plot(roc_list[[i]], col = i + 1, add = TRUE)
}
legend("bottomright", legend = levels(true_labels), col = 2:(length(roc_list) + 1), lwd = 2)
Dari hasil evaluasi ROC, berdasarkan AUC bisa dilihat bahwa model
tergolong lumayan dalam menjelaskan data yaitu dengan nilai AUC tiap
kelas berada di antara >70%. Namu dari hasil akurasi keseluruhan
model tergolong kurang karena hanya memiliki akurasi 60% dalam
menjelaskan data. Hal ini disebabkan karena dari hasil confussion matrix
model sulit membedakan kelas 0(negatif) dan 1(netral).