Rakan Refaya Dewangga (23031554108)
Course: Analisis Multivariat (Multivariate Analysis)
Lecturer: Ike Fitriyaningsih, M.Si
NIDN: 0109049001
library(nnet)
library(ggplot2)
library(corrplot)
## corrplot 0.95 loaded
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.4 ✔ tibble 3.2.1
## ✔ purrr 1.0.4 ✔ tidyr 1.3.1
## ✔ readr 2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(broom)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
The data are the result of feature extraction with Word2Vec, using a third-party Word2Vec model trained on the Indonesian-language Wikipedia corpus. The vector length determines the number of independent variables, 50 in total, plus one dependent variable, class. Model source: https://github.com/irzaip/ID_WORD2VEC
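For illustration only, one common way to obtain such fixed-length features is to average the word vectors of a document's tokens. The sketch below uses a toy embedding matrix as a stand-in for the pretrained ID_WORD2VEC model; it is not the actual extraction pipeline used for this dataset.
# Minimal sketch (assumption): document features as the mean of its word vectors
toy_embedding <- matrix(rnorm(3 * 50), nrow = 3,
                        dimnames = list(c("layanan", "sangat", "baik"), NULL))
tokens <- c("layanan", "sangat", "baik")
doc_vector <- colMeans(toy_embedding[tokens, , drop = FALSE])
length(doc_vector) # 50, matching the number of independent variables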
data <- read.csv("../../data/retina_word2vec_1_balanced_scaled.csv", header = TRUE)
n <- ncol(data)
barplot(table(data$class),
main = "Distribusi Kelas",
xlab = "Kelas",
ylab = "Jumlah",
col = "lightblue")
The distribution of the classes is already balanced.
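A quick numeric check of this balance, as a sketch assuming the data object loaded above:
# class proportions; roughly equal values confirm the balanced distribution
prop.table(table(data$class))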
par(mfrow = c(2, 5))
for (i in 1:50) {
hist(data[[i]], main = paste("Hist", names(data)[i]), ylab='', xlab = "", col = "skyblue")
}
par(mfrow = c(1, 1)) # reset layout
Univariately, many of the variables do not appear to be normally distributed.
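To complement the visual impression, a Shapiro-Wilk test per feature could serve as a rough check; this is a sketch assuming the data and n objects from the chunks above.
# p-values below 0.05 indicate departure from normality for that feature
shapiro_p <- sapply(data[1:(n - 1)], function(x) shapiro.test(x)$p.value)
sum(shapiro_p < 0.05) # number of features flagged as non-normal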
par(mfrow = c(2, 5))
for (i in 1:50) {
boxplot(data[, i] ~ data$class, main = paste("Box", names(data)[i]), ylab='', xlab='')
}
par(mfrow = c(1, 1)) # reset layout
We remove univariate outliers using the Z-score approach, dropping observations with a Z-score > 3 or < -3; with a threshold of 2, too many observations would be discarded. Reference: https://medium.com/@akashsri306/detecting-anomalies-with-z-scores-a-practical-approach-2f9a0f27458d
z_scores <- scale(data[1:(n - 1)])
threshold <- 3
normal <- cbind(z_scores, class = data$class)
# keep only rows whose features all satisfy |Z| <= threshold
dt_cl_nonpca_mnlr <- as.data.frame(
  normal[rowSums(abs(normal[, 1:(n - 1)]) > threshold) == 0, ]
)
dim(dt_cl_nonpca_mnlr)
## [1] 366 51
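As a sanity check, the number of observations dropped by the Z-score filter can be computed directly (sketch, using the objects above):
# rows removed by the |Z| > 3 filter
nrow(data) - nrow(dt_cl_nonpca_mnlr)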
model <- multinom(class ~ ., data = dt_cl_nonpca_mnlr_2)
## # weights: 117 (76 variable)
## initial value 325.189237
## iter 10 value 169.162446
## iter 20 value 146.522750
## iter 30 value 140.316082
## iter 40 value 140.046003
## iter 50 value 140.043816
## final value 140.043812
## converged
summary(model)
## Call:
## multinom(formula = class ~ ., data = dt_cl_nonpca_mnlr_2)
##
## Coefficients:
## (Intercept) X0 X1 X2 X4 X6 X7
## 1 -1.0973004 -1.2851495 0.06391762 -0.9699541 -0.78171906 2.552815 0.3635663
## 2 0.2772397 -0.4669279 1.56471573 -1.1450252 0.01502977 2.756663 -0.5560533
## X8 X9 X10 X12 X13 X14 X15
## 1 -0.2575804 0.709568 0.01978805 0.8628414 0.2517499 -0.6679638 -0.1310947
## 2 -0.2367564 2.103442 1.33769604 -0.7669557 1.4271128 0.2462147 0.5409799
## X16 X17 X18 X19 X20 X23 X24
## 1 -0.1531428 -0.3297117 -1.0495311 0.6807206 -1.727101 0.48793069 0.2787209
## 2 -1.1211925 -0.3437721 0.3204709 -0.6893699 1.638769 -0.01219288 1.9360328
## X26 X28 X29 X30 X31 X32 X33
## 1 -1.2453400 -0.1304579 0.05241874 1.462745 -2.400006 -0.85426907 -1.681092
## 2 -0.7454733 -0.4481219 -0.00555644 -2.041453 1.405267 0.02677555 -1.201950
## X35 X36 X40 X41 X42 X43 X44
## 1 3.508672 -0.7796774 0.008383296 0.9224601 0.9398443 -0.9611800 0.07848536
## 2 1.089108 -1.8642341 -1.126808181 -1.2650878 0.3706618 -0.5671306 -1.98523819
## X45 X47 X49
## 1 -0.77764704 -0.02641634 1.07269417
## 2 0.02935223 -0.35427171 0.06908327
##
## Std. Errors:
## (Intercept) X0 X1 X2 X4 X6 X7
## 1 0.6005150 0.5844438 0.8062785 0.5570908 0.7363879 0.5894788 0.6212573
## 2 0.4620165 0.5976448 0.8775416 0.5267326 0.7406985 0.6066712 0.6483467
## X8 X9 X10 X12 X13 X14 X15
## 1 0.6339709 0.6186350 0.7018316 0.5880041 0.8082507 0.6193789 0.7006126
## 2 0.6269424 0.7263509 0.7755321 0.6632907 0.8443257 0.5970788 0.6152887
## X16 X17 X18 X19 X20 X23 X24
## 1 0.6634964 0.5641803 0.6561384 0.5836335 0.6679786 0.5179175 0.6094785
## 2 0.7884812 0.6367104 0.6977091 0.6843595 0.7318177 0.5282360 0.6928024
## X26 X28 X29 X30 X31 X32 X33
## 1 0.5694557 0.6427710 0.6929771 0.7453055 0.9516314 0.6938016 0.7493486
## 2 0.5814700 0.6746977 0.7645654 0.7775255 0.9357541 0.7219559 0.7846047
## X35 X36 X40 X41 X42 X43 X44
## 1 0.9232066 0.5984943 0.6767603 0.5940385 0.6188962 0.4845506 0.5735067
## 2 0.9894700 0.6476076 0.6618066 0.6589600 0.6412913 0.5240367 0.6319952
## X45 X47 X49
## 1 0.5845673 0.5183204 0.5592713
## 2 0.5738903 0.5981178 0.5939494
##
## Residual Deviance: 280.0876
## AIC: 432.0876
z <- summary(model)$coefficients / summary(model)$standard.errors
p <- 2 * (1 - pnorm(abs(z)))
print(p)
## (Intercept) X0 X1 X2 X4 X6 X7
## 1 0.06765982 0.02788308 0.93681399 0.08166504 0.288436 1.486825e-05 0.5584062
## 2 0.54846331 0.43463790 0.07457536 0.02971818 0.983811 5.521856e-06 0.3910869
## X8 X9 X10 X12 X13 X14 X15
## 1 0.6845245 0.251385907 0.97750673 0.1422654 0.7554395 0.2808369 0.8515710
## 2 0.7057006 0.003780679 0.08454998 0.2475629 0.0909821 0.6800715 0.3792769
## X16 X17 X18 X19 X20 X23 X24
## 1 0.8174610 0.5589456 0.1096968 0.2434732 0.009722015 0.3461409 0.647447934
## 2 0.1550365 0.5892529 0.6460052 0.3137804 0.025135514 0.9815847 0.005198084
## X26 X28 X29 X30 X31 X32 X33
## 1 0.02875016 0.8391649 0.9397033 0.049691368 0.01166927 0.2182154 0.02487072
## 2 0.19982529 0.5065740 0.9942015 0.008650238 0.13316227 0.9704152 0.12554256
## X35 X36 X40 X41 X42 X43 X44
## 1 0.0001443884 0.192666389 0.99011654 0.120456 0.1288679 0.0472946 0.891148016
## 2 0.2710281076 0.003993842 0.08863832 0.054880 0.5632690 0.2791484 0.001682435
## X45 X47 X49
## 1 0.1834210 0.9593532 0.05510835
## 2 0.9592091 0.5536424 0.90740550
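To summarise the Wald test, the predictors with a p-value below 0.05 in at least one logit can be listed; this is a sketch based on the p-value matrix p computed above.
# predictors significant at the 5% level in either logit equation
signif_vars <- colnames(p)[apply(p < 0.05, 2, any)]
signif_vars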
prediksi <- predict(model, newdata = dt_cl_nonpca_mnlr_2)
conf_matrix <- table(Prediksi = prediksi, Aktual = dt_cl_nonpca_mnlr_2$class)
print(conf_matrix)
## Aktual
## Prediksi 0 1 2
## 0 40 13 5
## 1 12 83 10
## 2 10 7 116
akurasi <- mean(prediksi == dt_cl_nonpca_mnlr_2$class)
cat("Akurasi MNLR:", round(akurasi * 100, 2), "%\n")
## Akurasi MNLR: 80.74 %
actual <- factor(dt_cl_nonpca_mnlr_2$class)
predicted <- factor(prediksi, levels = levels(actual))
cm <- confusionMatrix(predicted, actual)
# TPR per class = sensitivity
cm$byClass[, "Sensitivity"]
## Class: 0 Class: 1 Class: 2
## 0.6451613 0.8058252 0.8854962
mean(cm$byClass[, "Sensitivity"])
## [1] 0.7788276
# FPR
mat <- as.matrix(cm$table) # extract the confusion matrix
classes <- colnames(mat)
# function to compute the false positive rate for each class
fpr_per_class <- function(cm) {
  sapply(1:ncol(cm), function(i) {
    FP <- sum(cm[i, -i]) # predicted as class i but actually another class (row i, columns != i)
    TN <- sum(cm[-i, -i]) # neither actual class i nor predicted as class i
    FP / (FP + TN)
  })
}
mean(fpr_per_class(mat))
## [1] 0.09798101
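The same macro-averaged FPR can be cross-checked from the caret object, since in a one-vs-all breakdown the FPR equals one minus the specificity (sketch using the cm object above):
# mean FPR per class via specificity; should match mean(fpr_per_class(mat))
mean(1 - cm$byClass[, "Specificity"])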
predict_proba <- predict(model, type = "prob")
y_true <- as.factor(dt_cl_nonpca_mnlr_2$class)
roc_list <- lapply(levels(y_true), function(k) {
roc(response = as.numeric(y_true == k), predictor = predict_proba[, k])
})
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc_values <- sapply(roc_list, auc)
names(auc_values) <- levels(y_true)
print('AUC per Class of Non-PCA MNLR')
## [1] "AUC per Class of Non-PCA MNLR"
print(auc_values)
## 0 1 2
## 0.9019851 0.9391569 0.9540828
print('Mean AUC')
## [1] "Mean AUC"
print(mean(auc_values))
## [1] 0.9317416
plot(roc_list[[1]], col = "red", main = "Multiclass ROC Non-PCA MNLR")
for (i in 2:length(roc_list)) {
plot(roc_list[[i]], col = i + 1, add = TRUE)
}
legend("bottomright", legend = levels(y_true), col = 2:(length(roc_list) + 1), lwd = 2)
The ROC results show that the model is very good at predicting classes 1 and 2, and overall the model performs very well, with a mean AUC of about 93%.
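As an optional cross-check, pROC also offers a Hand-Till style multiclass AUC computed directly from the probability matrix; this sketch assumes a recent pROC version that accepts a matrix predictor.
# multiclass AUC from the class-probability matrix
mroc <- multiclass.roc(y_true, predict_proba)
mroc$auc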
library(VGAM)
## Loading required package: stats4
## Loading required package: splines
##
## Attaching package: 'VGAM'
## The following object is masked from 'package:car':
##
## logit
## The following object is masked from 'package:caret':
##
## predictors
dt_cl_nonpca_mnlr_3 <- dt_cl_nonpca_mnlr_2
dt_cl_nonpca_mnlr_3$class <- factor(dt_cl_nonpca_mnlr_3$class, levels = c(0, 2, 1))
dt_cl_nonpca_mnlr_3$class
## [1] 2 1 1 1 0 1 2 1 1 1 2 1 2 1 2 1 0 0 2 2 1 1 0 1 0 1 1 1 0 2 0 1 0 2 2 0 0
## [38] 1 1 0 1 0 2 0 2 1 0 2 1 0 0 2 1 0 0 1 2 0 0 2 2 2 0 0 0 0 2 1 2 0 0 0 2 0
## [75] 1 0 1 0 1 1 0 2 2 1 1 2 1 1 0 1 1 1 1 1 1 0 0 1 1 2 0 0 1 0 1 2 2 2 2 1 1
## [112] 1 0 2 1 0 1 0 1 1 1 1 0 1 2 1 1 1 0 0 1 0 1 1 1 0 1 2 2 2 1 2 0 1 2 0 2 1
## [149] 1 1 2 1 1 2 0 0 2 0 2 1 1 1 2 1 1 0 0 0 0 1 1 2 1 0 0 0 1 2 1 0 1 0 1 0 2
## [186] 1 2 1 0 2 0 2 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2
## [223] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [260] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## Levels: 0 2 1
levels(dt_cl_nonpca_mnlr_3$class)
## [1] "0" "2" "1"
fit <- vglm(class ~ ., family = multinomial(), data = dt_cl_nonpca_mnlr_3)
# Deviance
deviance(fit)
## [1] 280.0876
# Pearson test
summary(fit) # see the coefficient table and residuals
##
## Call:
## vglm(formula = class ~ ., family = multinomial(), data = dt_cl_nonpca_mnlr_3)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept):1 1.097116 0.600467 1.827 0.067684 .
## (Intercept):2 1.374284 0.601671 2.284 0.022365 *
## X0:1 1.284907 0.584418 2.199 0.027906 *
## X0:2 0.818122 0.602120 1.359 0.174230
## X1:1 -0.063847 0.806239 -0.079 0.936881
## X1:2 1.500491 0.808320 1.856 0.063410 .
## X2:1 0.969805 0.557058 1.741 0.081694 .
## X2:2 -0.175199 0.480386 -0.365 0.715333
## X4:1 0.781750 0.736348 1.062 0.288391
## X4:2 0.796705 0.713184 1.117 0.263947
## X6:1 -2.552771 0.589463 -4.331 1.49e-05 ***
## X6:2 0.203942 0.575517 0.354 0.723067
## X7:1 -0.363438 0.621230 -0.585 0.558528
## X7:2 -0.919556 0.673950 -1.364 0.172433
## X8:1 0.257432 0.633950 0.406 0.684686
## X8:2 0.020839 0.616825 0.034 0.973049
## X9:1 -0.709402 0.618618 -1.147 0.251483
## X9:2 1.393731 0.710401 1.962 0.049775 *
## X10:1 -0.019806 0.701803 -0.028 0.977486
## X10:2 1.317803 0.787819 1.673 0.094382 .
## X12:1 -0.862739 0.587986 -1.467 0.142300
## X12:2 -1.629705 0.698696 -2.332 0.019675 *
## X13:1 -0.251628 0.808216 -0.311 0.755544
## X13:2 1.175452 0.781048 1.505 0.132332
## X14:1 0.667709 0.619351 1.078 0.280998
## X14:2 0.914122 0.640497 1.427 0.153520
## X15:1 0.131075 0.700579 0.187 0.851586
## X15:2 0.672110 0.745908 0.901 0.367554
## X16:1 0.153290 0.663481 0.231 0.817284
## X16:2 -0.967918 0.728268 -1.329 0.183825
## X17:1 0.329707 0.564159 0.584 0.558936
## X17:2 -0.014246 0.630794 -0.023 0.981982
## X18:1 1.049525 0.656118 1.600 0.109688
## X18:2 1.369946 0.652736 2.099 0.035837 *
## X19:1 -0.680688 0.583610 -1.166 0.243477
## X19:2 -1.370012 0.658561 -2.080 0.037497 *
## X20:1 1.726980 0.667953 2.585 0.009724 **
## X20:2 3.365442 0.751408 4.479 7.50e-06 ***
## X23:1 -0.487813 0.517901 -0.942 0.346241
## X23:2 -0.500068 0.528019 -0.947 0.343606
## X24:1 -0.278753 0.609456 -0.457 0.647398
## X24:2 1.657124 0.692261 2.394 0.016676 *
## X26:1 1.245087 0.569426 2.187 0.028774 *
## X26:2 0.499860 0.584096 0.856 0.392118
## X28:1 0.130215 0.642737 0.203 0.839452
## X28:2 -0.317634 0.676556 -0.469 0.638722
## X29:1 -0.052582 0.692947 -0.076 0.939513
## X29:2 -0.057895 0.789164 -0.073 0.941518
## X30:1 -1.462281 0.745241 -1.962 0.049744 *
## X30:2 -3.503729 0.833482 -4.204 2.63e-05 ***
## X31:1 2.399466 0.951575 2.522 0.011683 *
## X31:2 3.805160 1.017154 3.741 0.000183 ***
## X32:1 0.853962 0.693776 1.231 0.218364
## X32:2 0.881085 0.658555 1.338 0.180927
## X33:1 1.680988 0.749314 2.243 0.024873 *
## X33:2 0.478964 0.788787 0.607 0.543707
## X35:1 -3.508178 0.923143 -3.800 0.000145 ***
## X35:2 -2.419400 0.878950 -2.753 0.005912 **
## X36:1 0.779682 0.598475 1.303 0.192650
## X36:2 -1.084340 0.662606 -1.636 0.101740
## X40:1 -0.008147 0.676736 -0.012 0.990395
## X40:2 -1.135038 0.715121 -1.587 0.112468
## X41:1 -0.922348 0.594012 -1.553 0.120484
## X41:2 -2.187382 0.668946 -3.270 0.001076 **
## X42:1 -0.940007 0.618885 -1.519 0.128795
## X42:2 -0.569089 0.661577 -0.860 0.389679
## X43:1 0.961030 0.484533 1.983 0.047321 *
## X43:2 0.393967 0.504570 0.781 0.434922
## X44:1 -0.078399 0.573484 -0.137 0.891262
## X44:2 -2.063561 0.664226 -3.107 0.001892 **
## X45:1 0.777695 0.584542 1.330 0.183375
## X45:2 0.806946 0.603787 1.336 0.181394
## X47:1 0.026293 0.518300 0.051 0.959541
## X47:2 -0.327926 0.596007 -0.550 0.582178
## X49:1 -1.072718 0.559254 -1.918 0.055096 .
## X49:2 -1.003554 0.587927 -1.707 0.087834 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Names of linear predictors: log(mu[,1]/mu[,3]), log(mu[,2]/mu[,3])
##
## Residual deviance: 280.0876 on 516 degrees of freedom
##
## Log-likelihood: -140.0438 on 516 degrees of freedom
##
## Number of Fisher scoring iterations: 7
##
## No Hauck-Donner effect found in any of the estimates
##
##
## Reference group is level 3 of the response
Here level 3 is the Neutral class, level 1 the Negative class, and level 2 the Positive class. This is because VGAM by default treats the last level of the response as the reference class; by reordering the levels I made (the original) class 1 the reference class.
Some of the strongest and most significant variables:
- X6:1 = large negative value, a strong predictor for separating class 3 from class 1
- X20:2 = very large, significant coefficient; a high X20 almost certainly means class 2
- X30:2, X35, X41:2, X44:2 = significant negative values → lean toward class 3
- X31 = very strong influence toward classes 1 and 2 (depending on the sub-coefficient)
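For interpretation, the coefficients can also be converted to odds ratios; this sketch uses the fitted vglm object (fit) above, where each value is a multiplicative effect on the odds relative to the reference class.
# odds ratios for each coefficient of the vglm fit
round(exp(coef(fit)), 3)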
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
fitur_signif <- dt_cl_nonpca_mnlr_2[, c("X0", "X12", "X18", "X19", "X20",
"X24", "X26", "X30", "X31", "X33",
"X35", "X41", "X43", "X44")]
pca_result <- prcomp(fitur_signif, scale. = TRUE)
fviz_pca_biplot(pca_result,
label = "var",
habillage = dt_cl_nonpca_mnlr_2$class,
addEllipses = TRUE)
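Since the biplot only displays the first two components, a quick check of how much variance they capture may help gauge how faithful the 2-D view is (sketch using the pca_result object above):
# proportion of variance explained by PC1 and PC2
summary(pca_result)$importance[2, 1:2]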
Here we compare the null model (without predictor variables) with the full model, to find out whether the presence of the predictor variables improves the adequacy of the model.
# Null model (no predictors)
model_null <- multinom(class ~ 1, data = dt_cl_nonpca_mnlr_3)
## # weights: 6 (2 variable)
## initial value 325.189237
## final value 312.436131
## converged
# Full model
model_full <- multinom(class ~ ., data = dt_cl_nonpca_mnlr_3)
## # weights: 117 (76 variable)
## initial value 325.189237
## iter 10 value 169.162446
## iter 20 value 146.522750
## iter 30 value 140.316082
## iter 40 value 140.046003
## iter 50 value 140.043816
## final value 140.043812
## converged
# Model with significant variables only
model_signif <- multinom(
class ~ X0 + X12 + X18 + X19 + X20 + X24 + X26 + X30 +
X31 + X33 + X35 + X41 + X43 + X44,
data = dt_cl_nonpca_mnlr_3
)
## # weights: 48 (30 variable)
## initial value 325.189237
## iter 10 value 240.037488
## iter 20 value 211.523065
## iter 30 value 210.864834
## final value 210.864226
## converged
# Likelihood ratio test (deviance test)
anova(model_null, model_full, test = "Chisq")
## Likelihood ratio tests of Multinomial Models
##
## Response: class
## Model
## 1 1
## 2 X0 + X1 + X2 + X4 + X6 + X7 + X8 + X9 + X10 + X12 + X13 + X14 + X15 + X16 + X17 + X18 + X19 + X20 + X23 + X24 + X26 + X28 + X29 + X30 + X31 + X32 + X33 + X35 + X36 + X40 + X41 + X42 + X43 + X44 + X45 + X47 + X49
## Resid. df Resid. Dev Test Df LR stat. Pr(Chi)
## 1 590 624.8723
## 2 516 280.0876 1 vs 2 74 344.7846 0
anova(model_null, model_signif, test = "Chisq")
## Likelihood ratio tests of Multinomial Models
##
## Response: class
## Model
## 1 1
## 2 X0 + X12 + X18 + X19 + X20 + X24 + X26 + X30 + X31 + X33 + X35 + X41 + X43 + X44
## Resid. df Resid. Dev Test Df LR stat. Pr(Chi)
## 1 590 624.8723
## 2 562 421.7285 1 vs 2 28 203.1438 0
anova(model_full, model_signif, test = "Chisq")
## Likelihood ratio tests of Multinomial Models
##
## Response: class
## Model
## 1 X0 + X12 + X18 + X19 + X20 + X24 + X26 + X30 + X31 + X33 + X35 + X41 + X43 + X44
## 2 X0 + X1 + X2 + X4 + X6 + X7 + X8 + X9 + X10 + X12 + X13 + X14 + X15 + X16 + X17 + X18 + X19 + X20 + X23 + X24 + X26 + X28 + X29 + X30 + X31 + X32 + X33 + X35 + X36 + X40 + X41 + X42 + X43 + X44 + X45 + X47 + X49
## Resid. df Resid. Dev Test Df LR stat. Pr(Chi)
## 1 562 421.7285
## 2 516 280.0876 1 vs 2 46 141.6408 1.131317e-11
From the results above, we can see that the residual deviance of the full model is smaller than that of both the null model and the significant-variables model. This shows that including the remaining predictors (even the non-significant ones) yields a better model than having no predictors at all, and the likelihood ratio tests confirm that these differences are statistically significant.
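As an additional summary of the same comparison, McFadden's pseudo-R² can be computed from the log-likelihoods; this is a sketch assuming the model_null and model_full objects fitted above.
# McFadden pseudo-R^2: 1 - logLik(full) / logLik(null)
1 - as.numeric(logLik(model_full)) / as.numeric(logLik(model_null))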