Nama: Ratu Arum Rahma Gati
Dosen Pengampu: Ike Fitriyaningsih, M.Si
Mata Kuliah: Analisis Multivariat
Universitas Negeri Surabaya
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
library(ggpubr)
## Warning: package 'ggpubr' was built under R version 4.4.3
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.4.3
library(ggcorrplot)
## Warning: package 'ggcorrplot' was built under R version 4.4.3
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.4.3
## Warning: package 'tibble' was built under R version 4.4.3
## Warning: package 'readr' was built under R version 4.4.3
## Warning: package 'purrr' was built under R version 4.4.3
## Warning: package 'forcats' was built under R version 4.4.3
## Warning: package 'lubridate' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ lubridate 1.9.4 ✔ stringr 1.5.1
## ✔ purrr 1.0.4 ✔ tibble 3.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(MASS)
## Warning: package 'MASS' was built under R version 4.4.3
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.4.3
## corrplot 0.95 loaded
library(psych)
## Warning: package 'psych' was built under R version 4.4.3
##
## Attaching package: 'psych'
##
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(ROSE)
## Warning: package 'ROSE' was built under R version 4.4.3
## Loaded ROSE 0.0-4
library(parsnip)
## Warning: package 'parsnip' was built under R version 4.4.3
library(glmnet)
## Warning: package 'glmnet' was built under R version 4.4.3
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Loaded glmnet 4.1-8
library(broom)
## Warning: package 'broom' was built under R version 4.4.3
library(car) # Untuk uji VIF
## Warning: package 'car' was built under R version 4.4.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.4.3
##
## Attaching package: 'car'
##
## The following object is masked from 'package:psych':
##
## logit
##
## The following object is masked from 'package:purrr':
##
## some
##
## The following object is masked from 'package:dplyr':
##
## recode
library(ResourceSelection) # Untuk Hosmer-Lemeshow
## Warning: package 'ResourceSelection' was built under R version 4.4.3
## ResourceSelection 0.3-6 2023-06-27
df <- read.csv("C:\\Users\\dell\\Downloads\\alzheimers_disease_data (1).csv")
str(df)
## 'data.frame': 2149 obs. of 35 variables:
## $ PatientID : int 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 ...
## $ Age : int 73 89 73 74 89 86 68 75 72 87 ...
## $ Gender : int 0 0 0 1 0 1 0 0 1 0 ...
## $ Ethnicity : int 0 0 3 0 0 1 3 0 1 0 ...
## $ EducationLevel : int 2 0 1 1 0 1 2 1 0 0 ...
## $ BMI : num 22.9 26.8 17.8 33.8 20.7 ...
## $ Smoking : int 0 0 0 1 0 0 1 0 0 1 ...
## $ AlcoholConsumption : num 13.3 4.54 19.56 12.21 18.45 ...
## $ PhysicalActivity : num 6.33 7.62 7.84 8.43 6.31 ...
## $ DietQuality : num 1.347 0.519 1.826 7.436 0.795 ...
## $ SleepQuality : num 9.03 7.15 9.67 8.39 5.6 ...
## $ FamilyHistoryAlzheimers : int 0 0 1 0 0 0 0 0 0 0 ...
## $ CardiovascularDisease : int 0 0 0 0 0 0 0 0 0 1 ...
## $ Diabetes : int 1 0 0 0 0 1 0 0 0 0 ...
## $ Depression : int 1 0 0 0 0 0 0 0 0 0 ...
## $ HeadInjury : int 0 0 0 0 0 0 1 0 0 0 ...
## $ Hypertension : int 0 0 0 0 0 0 0 0 1 0 ...
## $ SystolicBP : int 142 115 99 118 94 168 143 117 117 130 ...
## $ DiastolicBP : int 72 64 116 115 117 62 88 63 119 78 ...
## $ CholesterolTotal : num 242 231 284 160 238 ...
## $ CholesterolLDL : num 56.2 193.4 153.3 65.4 92.9 ...
## $ CholesterolHDL : num 33.7 79 69.8 68.5 56.9 ...
## $ CholesterolTriglycerides : num 162.2 294.6 83.6 277.6 291.2 ...
## $ MMSE : num 21.46 20.61 7.36 13.99 13.52 ...
## $ FunctionalAssessment : num 6.52 7.12 5.9 8.97 6.05 ...
## $ MemoryComplaints : int 0 0 0 0 0 0 0 0 0 0 ...
## $ BehavioralProblems : int 0 0 0 1 0 0 0 0 1 1 ...
## $ ADL : num 1.7259 2.5924 7.1195 6.4812 0.0147 ...
## $ Confusion : int 0 0 0 0 0 1 0 1 0 0 ...
## $ Disorientation : int 0 0 1 0 0 0 0 0 0 0 ...
## $ PersonalityChanges : int 0 0 0 0 1 0 0 0 1 0 ...
## $ DifficultyCompletingTasks: int 1 0 1 0 1 0 0 0 0 0 ...
## $ Forgetfulness : int 0 1 0 0 0 0 1 1 0 0 ...
## $ Diagnosis : int 0 0 0 0 0 0 0 1 0 0 ...
## $ DoctorInCharge : chr "XXXConfid" "XXXConfid" "XXXConfid" "XXXConfid" ...
cat("Jumlah duplikasi:", sum(duplicated(df)), "\n")
## Jumlah duplikasi: 0
print(colSums(is.na(df)))
## PatientID Age Gender
## 0 0 0
## Ethnicity EducationLevel BMI
## 0 0 0
## Smoking AlcoholConsumption PhysicalActivity
## 0 0 0
## DietQuality SleepQuality FamilyHistoryAlzheimers
## 0 0 0
## CardiovascularDisease Diabetes Depression
## 0 0 0
## HeadInjury Hypertension SystolicBP
## 0 0 0
## DiastolicBP CholesterolTotal CholesterolLDL
## 0 0 0
## CholesterolHDL CholesterolTriglycerides MMSE
## 0 0 0
## FunctionalAssessment MemoryComplaints BehavioralProblems
## 0 0 0
## ADL Confusion Disorientation
## 0 0 0
## PersonalityChanges DifficultyCompletingTasks Forgetfulness
## 0 0 0
## Diagnosis DoctorInCharge
## 0 0
df <- df %>% dplyr::select(-PatientID, -DoctorInCharge)
str(df)
## 'data.frame': 2149 obs. of 33 variables:
## $ Age : int 73 89 73 74 89 86 68 75 72 87 ...
## $ Gender : int 0 0 0 1 0 1 0 0 1 0 ...
## $ Ethnicity : int 0 0 3 0 0 1 3 0 1 0 ...
## $ EducationLevel : int 2 0 1 1 0 1 2 1 0 0 ...
## $ BMI : num 22.9 26.8 17.8 33.8 20.7 ...
## $ Smoking : int 0 0 0 1 0 0 1 0 0 1 ...
## $ AlcoholConsumption : num 13.3 4.54 19.56 12.21 18.45 ...
## $ PhysicalActivity : num 6.33 7.62 7.84 8.43 6.31 ...
## $ DietQuality : num 1.347 0.519 1.826 7.436 0.795 ...
## $ SleepQuality : num 9.03 7.15 9.67 8.39 5.6 ...
## $ FamilyHistoryAlzheimers : int 0 0 1 0 0 0 0 0 0 0 ...
## $ CardiovascularDisease : int 0 0 0 0 0 0 0 0 0 1 ...
## $ Diabetes : int 1 0 0 0 0 1 0 0 0 0 ...
## $ Depression : int 1 0 0 0 0 0 0 0 0 0 ...
## $ HeadInjury : int 0 0 0 0 0 0 1 0 0 0 ...
## $ Hypertension : int 0 0 0 0 0 0 0 0 1 0 ...
## $ SystolicBP : int 142 115 99 118 94 168 143 117 117 130 ...
## $ DiastolicBP : int 72 64 116 115 117 62 88 63 119 78 ...
## $ CholesterolTotal : num 242 231 284 160 238 ...
## $ CholesterolLDL : num 56.2 193.4 153.3 65.4 92.9 ...
## $ CholesterolHDL : num 33.7 79 69.8 68.5 56.9 ...
## $ CholesterolTriglycerides : num 162.2 294.6 83.6 277.6 291.2 ...
## $ MMSE : num 21.46 20.61 7.36 13.99 13.52 ...
## $ FunctionalAssessment : num 6.52 7.12 5.9 8.97 6.05 ...
## $ MemoryComplaints : int 0 0 0 0 0 0 0 0 0 0 ...
## $ BehavioralProblems : int 0 0 0 1 0 0 0 0 1 1 ...
## $ ADL : num 1.7259 2.5924 7.1195 6.4812 0.0147 ...
## $ Confusion : int 0 0 0 0 0 1 0 1 0 0 ...
## $ Disorientation : int 0 0 1 0 0 0 0 0 0 0 ...
## $ PersonalityChanges : int 0 0 0 0 1 0 0 0 1 0 ...
## $ DifficultyCompletingTasks: int 1 0 1 0 1 0 0 0 0 0 ...
## $ Forgetfulness : int 0 1 0 0 0 0 1 1 0 0 ...
## $ Diagnosis : int 0 0 0 0 0 0 0 1 0 0 ...
Pada kode di atas data dimuat, lalu dilakukan proses pembersihan data seperti pengecekan nilai kosong (NA) dan transformasi variabel jika diperlukan, misalnya mengubah tipe data variabel menjadi faktor atau numerik sesuai kebutuhan model.
df$Diagnosis <- as.factor(df$Diagnosis)
df$Diagnosis <- factor(df$Diagnosis, levels = c(0,1), labels = c("Tidak Alzheimer", "Alzheimer"))
num_cols <- sapply(df, is.numeric)
num_cols["Diagnosis"] <- FALSE
num_cols["Gender"] <- FALSE
df[num_cols] <- scale(df[num_cols])
ggplot(df, aes(x = Diagnosis)) +
geom_bar(fill = "lightblue") +
ggtitle("Distribusi Diagnosis Alzheimer") +
xlab("Status Diagnosis") + ylab("Jumlah Pasien")
cat("Jumlah tiap kategori:\n")
## Jumlah tiap kategori:
print(table(df$Diagnosis))
##
## Tidak Alzheimer Alzheimer
## 1389 760
label_percentages <- prop.table(table(df$Diagnosis)) * 100
cat("\nPersentase Tiap Label Diagnosis:\n")
##
## Persentase Tiap Label Diagnosis:
print(round(label_percentages, 2))
##
## Tidak Alzheimer Alzheimer
## 64.63 35.37
valid_vars <- names(df)[sapply(df, function(x) is.numeric(x) && length(unique(x)) > 1)]
anova_pvals <- sapply(valid_vars, function(var) {
formula <- as.formula(paste0("`", var, "` ~ Diagnosis"))
result <- tryCatch({
aov_model <- aov(formula, data = df)
summary(aov_model)[[1]][["Pr(>F)"]][1]
}, error = function(e) NA)
return(result)
})
selected_features <- names(anova_pvals[anova_pvals < 0.05])
cat("Fitur terpilih (p < 0.05):\n")
## Fitur terpilih (p < 0.05):
print(selected_features)
## [1] "EducationLevel" "SleepQuality" "CholesterolHDL"
## [4] "MMSE" "FunctionalAssessment" "MemoryComplaints"
## [7] "BehavioralProblems" "ADL"
final_data <- df[, c("Diagnosis", selected_features)]
Bagian ini digunakan untuk memahami struktur data dan melihat distribusi masing-masing variabel. Proses ini mencakup visualisasi distribusi variabel numerik dan kategorikal, serta identifikasi kemungkinan adanya outlier atau ketidakseimbangan kelas (class imbalance).
model_vif <- glm(Diagnosis ~ ., data = final_data, family = binomial)
vif_values <- vif(model_vif)
print("VIF Tiap Variabel:")
## [1] "VIF Tiap Variabel:"
print(vif_values)
## EducationLevel SleepQuality CholesterolHDL
## 1.003401 1.002325 1.005188
## MMSE FunctionalAssessment MemoryComplaints
## 1.192906 1.293479 1.250640
## BehavioralProblems ADL
## 1.259085 1.302034
plot(model_vif, which = 4, caption = "Influence plot")
hoslem <- hoslem.test(as.integer(final_data$Diagnosis == "Alzheimer"), fitted(model_vif), g = 10)
print(hoslem)
##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: as.integer(final_data$Diagnosis == "Alzheimer"), fitted(model_vif)
## X-squared = 62.119, df = 8, p-value = 1.787e-10
plot(fitted(model_vif), residuals(model_vif, type = "deviance"),
xlab = "Fitted values", ylab = "Deviance Residuals",
main = "Residuals vs Fitted")
abline(h = 0, col = "red")
set.seed(123)
train_idx <- createDataPartition(final_data$Diagnosis, p = 0.7, list = FALSE)
train_data <- final_data[train_idx, ]
test_data <- final_data[-train_idx, ]
write.csv(train_data, "C:\\Users\\dell\\latihan\\SEMESTER 4\\ANMUL\\PROJECT\\train_data.csv", row.names = FALSE)
write.csv(test_data, "C:\\Users\\dell\\latihan\\SEMESTER 4\\ANMUL\\PROJECT\\test_data.csv", row.names = FALSE)
cat("Distribusi sebelum oversampling:\n")
## Distribusi sebelum oversampling:
print(table(train_data$Diagnosis))
##
## Tidak Alzheimer Alzheimer
## 973 532
set.seed(123)
data_balanced <- ovun.sample(Diagnosis ~ ., data = train_data, method = "over",
N = max(table(train_data$Diagnosis)) * 2)$data
cat("Distribusi setelah oversampling:\n")
## Distribusi setelah oversampling:
print(table(data_balanced$Diagnosis))
##
## Tidak Alzheimer Alzheimer
## 973 973
write.csv(data_balanced, "C:\\Users\\dell\\latihan\\SEMESTER 4\\ANMUL\\PROJECT\\train_data_oversampled.csv", row.names = FALSE)
Lalu data dibagi menjadi dua bagian yaitu data latih (training) dan data uji (testing) menggunakan fungsi createDataPartition() dari package caret. Pembagian ini bertujuan agar model dapat dilatih menggunakan sebagian data, lalu dievaluasi pada data yang belum pernah dilihat sebelumnya untuk mengukur performa secara objektif.
train_data <- read.csv("train_data_oversampled.csv")
train_data$Diagnosis <- factor(train_data$Diagnosis, levels = c("Tidak Alzheimer", "Alzheimer"))
model <- logistic_reg(mixture = 1, penalty = 0.1) %>%
set_engine("glmnet") %>%
set_mode("classification") %>%
fit(Diagnosis ~ ., data = train_data)
tidy(model)
## # A tibble: 9 × 3
## term estimate penalty
## <chr> <dbl> <dbl>
## 1 (Intercept) -0.136 0.1
## 2 EducationLevel 0 0.1
## 3 SleepQuality 0 0.1
## 4 CholesterolHDL 0 0.1
## 5 MMSE -0.0758 0.1
## 6 FunctionalAssessment -0.457 0.1
## 7 MemoryComplaints 0.172 0.1
## 8 BehavioralProblems 0.0147 0.1
## 9 ADL -0.305 0.1
head(train_data, 5)
## Diagnosis EducationLevel SleepQuality CholesterolHDL MMSE
## 1 Tidak Alzheimer -1.4224508 0.05682309 0.8455334 0.6801384
## 2 Tidak Alzheimer -1.4224508 -0.82437383 -0.1118981 -0.1436783
## 3 Tidak Alzheimer -0.3169004 0.11957058 0.8477818 1.4817338
## 4 Tidak Alzheimer -1.4224508 -0.73876003 -0.7082206 1.2847330
## 5 Tidak Alzheimer -1.4224508 0.28390795 0.6408057 1.5828443
## FunctionalAssessment MemoryComplaints BehavioralProblems ADL
## 1 0.7047429 -0.5123573 -0.4311563 -0.8104125
## 2 0.3335878 -0.5123573 -0.4311563 -1.6842869
## 3 0.1486786 -0.5123573 -0.4311563 1.3671308
## 4 0.8006261 -0.5123573 2.3182650 -1.4328981
## 5 -1.3589699 -0.5123573 2.3182650 -0.1452872
table(train_data$Diagnosis)
##
## Tidak Alzheimer Alzheimer
## 973 973
test <- read.csv("test_data.csv")
test$Diagnosis <- factor(test$Diagnosis, levels = c("Tidak Alzheimer", "Alzheimer"))
pred_class <- predict(model, new_data = test, type = "class")
pred_class <- factor(pred_class$.pred_class, levels = levels(test$Diagnosis))
conf_matrix <- confusionMatrix(pred_class, test$Diagnosis)
print(conf_matrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Tidak Alzheimer Alzheimer
## Tidak Alzheimer 329 59
## Alzheimer 87 169
##
## Accuracy : 0.7733
## 95% CI : (0.739, 0.8051)
## No Information Rate : 0.646
## P-Value [Acc > NIR] : 1.787e-12
##
## Kappa : 0.5177
##
## Mcnemar's Test P-Value : 0.02545
##
## Sensitivity : 0.7909
## Specificity : 0.7412
## Pos Pred Value : 0.8479
## Neg Pred Value : 0.6602
## Prevalence : 0.6460
## Detection Rate : 0.5109
## Detection Prevalence : 0.6025
## Balanced Accuracy : 0.7660
##
## 'Positive' Class : Tidak Alzheimer
##
#MODEL LOGISTIC REGRESSION
library(tidymodels)
## Warning: package 'tidymodels' was built under R version 4.4.3
## ── Attaching packages ────────────────────────────────────── tidymodels 1.3.0 ──
## ✔ dials 1.4.0 ✔ tune 1.3.0
## ✔ infer 1.0.8 ✔ workflows 1.2.0
## ✔ modeldata 1.4.0 ✔ workflowsets 1.1.0
## ✔ recipes 1.3.0 ✔ yardstick 1.3.2
## ✔ rsample 1.3.0
## Warning: package 'dials' was built under R version 4.4.3
## Warning: package 'scales' was built under R version 4.4.3
## Warning: package 'infer' was built under R version 4.4.3
## Warning: package 'modeldata' was built under R version 4.4.3
## Warning: package 'recipes' was built under R version 4.4.3
## Warning: package 'rsample' was built under R version 4.4.3
## Warning: package 'tune' was built under R version 4.4.3
## Warning: package 'workflows' was built under R version 4.4.3
## Warning: package 'workflowsets' was built under R version 4.4.3
## Warning: package 'yardstick' was built under R version 4.4.3
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ psych::%+%() masks ggplot2::%+%()
## ✖ scales::alpha() masks psych::alpha(), ggplot2::alpha()
## ✖ scales::discard() masks purrr::discard()
## ✖ Matrix::expand() masks tidyr::expand()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ caret::lift() masks purrr::lift()
## ✖ Matrix::pack() masks tidyr::pack()
## ✖ yardstick::precision() masks caret::precision()
## ✖ yardstick::recall() masks caret::recall()
## ✖ car::recode() masks dplyr::recode()
## ✖ MASS::select() masks dplyr::select()
## ✖ yardstick::sensitivity() masks caret::sensitivity()
## ✖ car::some() masks purrr::some()
## ✖ yardstick::spec() masks readr::spec()
## ✖ yardstick::specificity() masks caret::specificity()
## ✖ recipes::step() masks stats::step()
## ✖ Matrix::unpack() masks tidyr::unpack()
## ✖ recipes::update() masks Matrix::update(), stats::update()
train_data$Diagnosis <- factor(train_data$Diagnosis, levels = c("Tidak Alzheimer", "Alzheimer"))
test_data$Diagnosis <- factor(test_data$Diagnosis, levels = c("Tidak Alzheimer", "Alzheimer"))
log_reg <- logistic_reg(penalty = tune(), mixture = tune()) %>%
set_engine("glmnet") %>%
set_mode("classification")
rec <- recipe(Diagnosis ~ ., data = train_data)
wf <- workflow() %>%
add_model(log_reg) %>%
add_recipe(rec)
grid_vals <- grid_regular(
mixture(range = c(0, 1)),
penalty(range = c(-3, 0)),
levels = 4
)
set.seed(123)
folds <- vfold_cv(train_data, v = 5)
log_reg_tuned <- tune_grid(
wf,
resamples = folds,
grid = grid_vals,
metrics = metric_set(roc_auc),
control = control_grid(save_pred = TRUE)
)
best_model <- select_best(log_reg_tuned, metric = "roc_auc")
final_wf <- finalize_workflow(wf, best_model)
final_model <- fit(final_wf, data = train_data)
pred_class <- predict(final_model, test_data, type = "class")
pred_prob <- predict(final_model, test_data, type = "prob")
results <- bind_cols(test_data, pred_class, pred_prob) %>%
rename(.pred_class = .pred_class)
roc_data <- roc_curve(results, truth = Diagnosis, .pred_Alzheimer)
autoplot(roc_data) + labs(title = "ROC Curve")
results <- results %>%
mutate(final_pred = factor(
if_else(.pred_Alzheimer >= 0.5, "Alzheimer", "Tidak Alzheimer"),
levels = c("Tidak Alzheimer", "Alzheimer")
))
conf_mat_custom <- conf_mat(results, truth = Diagnosis, estimate = final_pred)
autoplot(conf_mat_custom)
metrics_all <- metric_set(accuracy, precision, recall, f_meas, sens, spec)
all_metrics <- metrics_all(results, truth = Diagnosis, estimate = final_pred)
print(all_metrics)
## # A tibble: 6 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.832
## 2 precision binary 0.893
## 3 recall binary 0.841
## 4 f_meas binary 0.866
## 5 sens binary 0.841
## 6 spec binary 0.816
Model regresi logistik dibangun menggunakan fungsi glm() dengan family = binomial. Variabel dependen biasanya berupa kategori biner (misalnya: sakit vs tidak sakit), dan variabel independen terdiri dari sejumlah prediktor. Model ini bertujuan mempelajari hubungan antara variabel input dengan probabilitas kejadian dari target. Setelah dilakukan hyperparameter ternyata akurasi naik menjadi 83%