Klasifikasi Status Alzheimer Menggunakan Regresi Logistik
Nama : Jesika Trisyanda
Dosen Pengampu :Ike Fitriyaningsih, M.Si
Mata Kuliah : Analisis Multivariat
Universitas Negeri Surabaya
suppressPackageStartupMessages({
library(ROSE)
library(dplyr)
library(ggplot2)
library(ggpubr)
library(tidyr)
library(ggcorrplot)
library(tidyverse)
library(ResourceSelection)
library(tidymodels)
library(MASS)
library(corrplot)
library(psych)
library(car)
library(caret)
library(parsnip)
})
PRE PROCESSING
df <- read.csv("alzheimers_disease_data.csv")
# Info struktur data
str(df)
## 'data.frame': 2149 obs. of 35 variables:
## $ PatientID : int 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 ...
## $ Age : int 73 89 73 74 89 86 68 75 72 87 ...
## $ Gender : int 0 0 0 1 0 1 0 0 1 0 ...
## $ Ethnicity : int 0 0 3 0 0 1 3 0 1 0 ...
## $ EducationLevel : int 2 0 1 1 0 1 2 1 0 0 ...
## $ BMI : num 22.9 26.8 17.8 33.8 20.7 ...
## $ Smoking : int 0 0 0 1 0 0 1 0 0 1 ...
## $ AlcoholConsumption : num 13.3 4.54 19.56 12.21 18.45 ...
## $ PhysicalActivity : num 6.33 7.62 7.84 8.43 6.31 ...
## $ DietQuality : num 1.347 0.519 1.826 7.436 0.795 ...
## $ SleepQuality : num 9.03 7.15 9.67 8.39 5.6 ...
## $ FamilyHistoryAlzheimers : int 0 0 1 0 0 0 0 0 0 0 ...
## $ CardiovascularDisease : int 0 0 0 0 0 0 0 0 0 1 ...
## $ Diabetes : int 1 0 0 0 0 1 0 0 0 0 ...
## $ Depression : int 1 0 0 0 0 0 0 0 0 0 ...
## $ HeadInjury : int 0 0 0 0 0 0 1 0 0 0 ...
## $ Hypertension : int 0 0 0 0 0 0 0 0 1 0 ...
## $ SystolicBP : int 142 115 99 118 94 168 143 117 117 130 ...
## $ DiastolicBP : int 72 64 116 115 117 62 88 63 119 78 ...
## $ CholesterolTotal : num 242 231 284 160 238 ...
## $ CholesterolLDL : num 56.2 193.4 153.3 65.4 92.9 ...
## $ CholesterolHDL : num 33.7 79 69.8 68.5 56.9 ...
## $ CholesterolTriglycerides : num 162.2 294.6 83.6 277.6 291.2 ...
## $ MMSE : num 21.46 20.61 7.36 13.99 13.52 ...
## $ FunctionalAssessment : num 6.52 7.12 5.9 8.97 6.05 ...
## $ MemoryComplaints : int 0 0 0 0 0 0 0 0 0 0 ...
## $ BehavioralProblems : int 0 0 0 1 0 0 0 0 1 1 ...
## $ ADL : num 1.7259 2.5924 7.1195 6.4812 0.0147 ...
## $ Confusion : int 0 0 0 0 0 1 0 1 0 0 ...
## $ Disorientation : int 0 0 1 0 0 0 0 0 0 0 ...
## $ PersonalityChanges : int 0 0 0 0 1 0 0 0 1 0 ...
## $ DifficultyCompletingTasks: int 1 0 1 0 1 0 0 0 0 0 ...
## $ Forgetfulness : int 0 1 0 0 0 0 1 1 0 0 ...
## $ Diagnosis : int 0 0 0 0 0 0 0 1 0 0 ...
## $ DoctorInCharge : chr "XXXConfid" "XXXConfid" "XXXConfid" "XXXConfid" ...
# Cek data duplikat
duplicates <- sum(duplicated(df))
cat("Jumlah duplikasi:", duplicates, "\n")
## Jumlah duplikasi: 0
# Cek kolom kosong
colSums(is.na(df))
## PatientID Age Gender
## 0 0 0
## Ethnicity EducationLevel BMI
## 0 0 0
## Smoking AlcoholConsumption PhysicalActivity
## 0 0 0
## DietQuality SleepQuality FamilyHistoryAlzheimers
## 0 0 0
## CardiovascularDisease Diabetes Depression
## 0 0 0
## HeadInjury Hypertension SystolicBP
## 0 0 0
## DiastolicBP CholesterolTotal CholesterolLDL
## 0 0 0
## CholesterolHDL CholesterolTriglycerides MMSE
## 0 0 0
## FunctionalAssessment MemoryComplaints BehavioralProblems
## 0 0 0
## ADL Confusion Disorientation
## 0 0 0
## PersonalityChanges DifficultyCompletingTasks Forgetfulness
## 0 0 0
## Diagnosis DoctorInCharge
## 0 0
# Drop kolom tidak diperlukan
df <- df %>% dplyr::select(-PatientID, -DoctorInCharge)
# Konversi Diagnosis jadi faktor
df$Diagnosis <- as.factor(df$Diagnosis)
# Ubah nilai Diagnosis jadi faktor dengan label
df$Diagnosis <- factor(df$Diagnosis, levels = c(0, 1), labels = c("Tidak Alzheimer", "Alzheimer"))
EDA
names(df)
## [1] "Age" "Gender"
## [3] "Ethnicity" "EducationLevel"
## [5] "BMI" "Smoking"
## [7] "AlcoholConsumption" "PhysicalActivity"
## [9] "DietQuality" "SleepQuality"
## [11] "FamilyHistoryAlzheimers" "CardiovascularDisease"
## [13] "Diabetes" "Depression"
## [15] "HeadInjury" "Hypertension"
## [17] "SystolicBP" "DiastolicBP"
## [19] "CholesterolTotal" "CholesterolLDL"
## [21] "CholesterolHDL" "CholesterolTriglycerides"
## [23] "MMSE" "FunctionalAssessment"
## [25] "MemoryComplaints" "BehavioralProblems"
## [27] "ADL" "Confusion"
## [29] "Disorientation" "PersonalityChanges"
## [31] "DifficultyCompletingTasks" "Forgetfulness"
## [33] "Diagnosis"
summary(df)
## Age Gender Ethnicity EducationLevel
## Min. :60.00 Min. :0.0000 Min. :0.0000 Min. :0.000
## 1st Qu.:67.00 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:1.000
## Median :75.00 Median :1.0000 Median :0.0000 Median :1.000
## Mean :74.91 Mean :0.5063 Mean :0.6975 Mean :1.287
## 3rd Qu.:83.00 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:2.000
## Max. :90.00 Max. :1.0000 Max. :3.0000 Max. :3.000
## BMI Smoking AlcoholConsumption PhysicalActivity
## Min. :15.01 Min. :0.0000 Min. : 0.002003 Min. :0.003616
## 1st Qu.:21.61 1st Qu.:0.0000 1st Qu.: 5.139810 1st Qu.:2.570626
## Median :27.82 Median :0.0000 Median : 9.934412 Median :4.766424
## Mean :27.66 Mean :0.2885 Mean :10.039442 Mean :4.920202
## 3rd Qu.:33.87 3rd Qu.:1.0000 3rd Qu.:15.157931 3rd Qu.:7.427899
## Max. :39.99 Max. :1.0000 Max. :19.989293 Max. :9.987429
## DietQuality SleepQuality FamilyHistoryAlzheimers
## Min. :0.009385 Min. : 4.003 Min. :0.0000
## 1st Qu.:2.458455 1st Qu.: 5.483 1st Qu.:0.0000
## Median :5.076087 Median : 7.116 Median :0.0000
## Mean :4.993138 Mean : 7.051 Mean :0.2522
## 3rd Qu.:7.558625 3rd Qu.: 8.563 3rd Qu.:1.0000
## Max. :9.998346 Max. :10.000 Max. :1.0000
## CardiovascularDisease Diabetes Depression HeadInjury
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.0000 Median :0.0000 Median :0.0000
## Mean :0.1443 Mean :0.1508 Mean :0.2006 Mean :0.0926
## 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## Hypertension SystolicBP DiastolicBP CholesterolTotal
## Min. :0.0000 Min. : 90.0 Min. : 60.00 Min. :150.1
## 1st Qu.:0.0000 1st Qu.:112.0 1st Qu.: 74.00 1st Qu.:190.3
## Median :0.0000 Median :134.0 Median : 91.00 Median :225.1
## Mean :0.1489 Mean :134.3 Mean : 89.85 Mean :225.2
## 3rd Qu.:0.0000 3rd Qu.:157.0 3rd Qu.:105.00 3rd Qu.:262.0
## Max. :1.0000 Max. :179.0 Max. :119.00 Max. :300.0
## CholesterolLDL CholesterolHDL CholesterolTriglycerides MMSE
## Min. : 50.23 Min. :20.00 Min. : 50.41 Min. : 0.005312
## 1st Qu.: 87.20 1st Qu.:39.10 1st Qu.:137.58 1st Qu.: 7.167602
## Median :123.34 Median :59.77 Median :230.30 Median :14.441660
## Mean :124.34 Mean :59.46 Mean :228.28 Mean :14.755132
## 3rd Qu.:161.73 3rd Qu.:78.94 3rd Qu.:314.84 3rd Qu.:22.161028
## Max. :199.97 Max. :99.98 Max. :399.94 Max. :29.991381
## FunctionalAssessment MemoryComplaints BehavioralProblems ADL
## Min. :0.0004596 Min. :0.000 Min. :0.0000 Min. : 0.001288
## 1st Qu.:2.5662809 1st Qu.:0.000 1st Qu.:0.0000 1st Qu.: 2.342836
## Median :5.0944387 Median :0.000 Median :0.0000 Median : 5.038973
## Mean :5.0800550 Mean :0.208 Mean :0.1568 Mean : 4.982958
## 3rd Qu.:7.5469813 3rd Qu.:0.000 3rd Qu.:0.0000 3rd Qu.: 7.581490
## Max. :9.9964671 Max. :1.000 Max. :1.0000 Max. : 9.999747
## Confusion Disorientation PersonalityChanges DifficultyCompletingTasks
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.0000 Median :0.0000 Median :0.0000
## Mean :0.2052 Mean :0.1582 Mean :0.1508 Mean :0.1587
## 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## Forgetfulness Diagnosis
## Min. :0.0000 Tidak Alzheimer:1389
## 1st Qu.:0.0000 Alzheimer : 760
## Median :0.0000
## Mean :0.3015
## 3rd Qu.:1.0000
## Max. :1.0000
# Histogram umur
ggplot(df, aes(x = Age)) + geom_histogram(binwidth = 5, fill = "skyblue", color = "black") + theme_minimal()

df$Gender <- factor(df$Gender, levels = c(0, 1), labels = c("Male", "Female"))
ggplot(df, aes(x = Gender)) +
geom_bar(fill = "skyblue", color = "black") +
labs(title = "Distribusi Gender", x = "Gender", y = "Jumlah") +
theme_minimal()

ggplot(df, aes(x = Diagnosis, fill = Gender)) +
geom_bar(position = "dodge") +
labs(title = "Distribusi Gender terhadap Diagnosis",
x = "Diagnosis", y = "Jumlah") +
theme_minimal()

ggplot(df, aes(x = Diagnosis, y = Age, fill = Diagnosis)) +
geom_boxplot() +
labs(title = "Distribusi Usia berdasarkan Diagnosis",
x = "Diagnosis", y = "Usia") +
theme_minimal()

ggplot(df, aes(x = Age)) +
geom_histogram(binwidth = 5, fill ="skyblue", color = "black") +
facet_wrap(~ Diagnosis) +
labs(title = "Histogram Usia terhadap Diagnosis",
x = "Usia", y = "Jumlah") +
theme_minimal()

ggplot(df, aes(x = Diagnosis)) +
geom_bar(fill = "skyblue", color = "black") +
ggtitle("Distribusi Diagnosis Alzheimer") +
xlab("Status Diagnosis") +
ylab("Jumlah Pasien")

cat("Jumlah tiap kategori:\n")
## Jumlah tiap kategori:
print(table(df$Diagnosis))
##
## Tidak Alzheimer Alzheimer
## 1389 760
label_percentages <- prop.table(table(df$Diagnosis)) * 100
cat("\nPersentase Tiap Label Diagnosis:\n")
##
## Persentase Tiap Label Diagnosis:
print(round(label_percentages, 2))
##
## Tidak Alzheimer Alzheimer
## 64.63 35.37
# Korelasi antar fitur numerik
numeric_df <- df[, sapply(df, is.numeric)]
library(corrplot)
corrplot(cor(numeric_df),
method = "color",
tl.cex = 0.6,
addCoef.col = "black",
number.cex = 0.3,
type = "full")

FITUR SELECTION MENGGUNAKAN ANOVA
# Standarisasi variabel numerik (kecuali Diagnosis)
num_cols <- sapply(df, is.numeric)
df[num_cols] <- scale(df[num_cols])
valid_vars <- names(df)[
sapply(df, function(x) is.numeric(x) && length(unique(x)) > 1)
]
# Hitung p-value ANOVA
anova_pvals <- sapply(valid_vars, function(var) {
formula <- as.formula(paste0("`", var, "` ~ Diagnosis"))
aov_model <- aov(formula, data = df)
summary(aov_model)[[1]][["Pr(>F)"]][1]
})
# Ambil fitur dengan p-value < 0.05
selected_features <- names(anova_pvals[anova_pvals < 0.05])
print(selected_features)
## [1] "EducationLevel" "SleepQuality" "CholesterolHDL"
## [4] "MMSE" "FunctionalAssessment" "MemoryComplaints"
## [7] "BehavioralProblems" "ADL"
final_data <- df[, c("Diagnosis", selected_features)]
# Cek hasil
head(final_data)
## Diagnosis EducationLevel SleepQuality CholesterolHDL MMSE
## 1 Tidak Alzheimer 0.7886499 1.11965745 -1.1141698 0.77885552
## 2 Tidak Alzheimer -1.4224508 0.05682309 0.8455334 0.68013845
## 3 Tidak Alzheimer -0.3169004 1.48703408 0.4455111 -0.85902164
## 4 Tidak Alzheimer -0.3169004 0.76065615 0.3886897 -0.08870211
## 5 Tidak Alzheimer -1.4224508 -0.82437383 -0.1118981 -0.14367832
## 6 Tidak Alzheimer -0.3169004 0.11957058 0.8477818 1.48173376
## FunctionalAssessment MemoryComplaints BehavioralProblems ADL
## 1 0.4973901 -0.5123573 -0.4311563 -1.1041775
## 2 0.7047429 -0.5123573 -0.4311563 -0.8104125
## 3 0.2817472 -0.5123573 -0.4311563 0.7243229
## 4 1.3430335 -0.5123573 2.3182650 0.5079260
## 5 0.3335878 -0.5123573 -0.4311563 -1.6842869
## 6 0.1486786 -0.5123573 -0.4311563 1.3671308
set.seed(123)
sample_size <- floor(0.7 * nrow(final_data))
train_indices <- sample(seq_len(nrow(final_data)), size = sample_size)
UJI ASUMSI SEBELUM MODELING
# (VIF)
model_vif <- glm(Diagnosis ~ ., data = final_data, family = binomial)
vif_values <- vif(model_vif)
print("VIF Tiap Variabel:")
## [1] "VIF Tiap Variabel:"
print(vif_values)
## EducationLevel SleepQuality CholesterolHDL
## 1.003401 1.002325 1.005188
## MMSE FunctionalAssessment MemoryComplaints
## 1.192906 1.293479 1.250640
## BehavioralProblems ADL
## 1.259085 1.302034
# Outlier dan leverage plot
plot(model_vif, which = 4, caption = "Influence plot")

# Hosmer-Lemeshow Test
hoslem <- hoslem.test(as.integer(final_data$Diagnosis == "Alzheimer"), fitted(model_vif), g = 10)
print(hoslem)
##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: as.integer(final_data$Diagnosis == "Alzheimer"), fitted(model_vif)
## X-squared = 62.119, df = 8, p-value = 1.787e-10
# Plot residual deviance
plot(fitted(model_vif), residuals(model_vif, type = "deviance"),
xlab = "Fitted values", ylab = "Deviance Residuals",
main = "Residuals vs Fitted")
abline(h = 0, col = "red")

SPLIT DATA
train_data <- final_data[train_indices, ]
test_data <- final_data[-train_indices, ]
# Simpan ke CSV
write.csv(train_data, "train_data.csv", row.names = FALSE)
write.csv(test_data, "test_data.csv", row.names = FALSE)
set.seed(123)
train_idx <- createDataPartition(final_data$Diagnosis, p = 0.7, list = FALSE)
train_data <- final_data[train_idx, ]
test_data <- final_data[-train_idx, ]
write.csv(train_data, "train_data.csv", row.names = FALSE)
write.csv(test_data, "test_data.csv", row.names = FALSE)
OVERSAMPLING
data_train <- read.csv("train_data.csv")
ggplot(data_train, aes(x = Diagnosis)) +
geom_bar(fill = "lightblue", color = "black") +
ggtitle("Distribusi Diagnosis Alzheimer") +
xlab("Status Diagnosis") +
ylab("Jumlah Pasien")

cat("Jumlah tiap kategori:\n")
## Jumlah tiap kategori:
print(table(data_train$Diagnosis))
##
## Alzheimer Tidak Alzheimer
## 532 973
cat("\nPersentase Tiap Label Diagnosis:\n")
##
## Persentase Tiap Label Diagnosis:
print(round(label_percentages, 2))
##
## Tidak Alzheimer Alzheimer
## 64.63 35.37
set.seed(123)
data_balanced <- ovun.sample(Diagnosis ~ ., data = data_train, method = "over", N = max(table(data_train$Diagnosis)) * length(unique(data_train$Diagnosis)))$data
ggplot(data_balanced, aes(x = Diagnosis)) +
geom_bar(fill = "lightblue", color = "black") +
ggtitle("Distribusi Diagnosis Alzheimer (Setelah Oversampling)") +
xlab("Status Diagnosis") +
ylab("Jumlah Pasien")

cat("Jumlah tiap kategori (setelah oversampling):\n")
## Jumlah tiap kategori (setelah oversampling):
print(table(data_balanced$Diagnosis))
##
## Alzheimer Tidak Alzheimer
## 973 973
data_balanced$Diagnosis <- ifelse(data_balanced$Diagnosis == "Alzheimer", 1, 0)
write.csv(data_balanced, "train_data_oversampled.csv", row.names = FALSE)
MODELING LOGISTIK REGRESI
train_data <- read.csv("train_data_oversampled.csv")
train_data$Diagnosis <- factor(train_data$Diagnosis,
levels = c(0, 1),
labels = c("Tidak Alzheimer", "Alzheimer"))
summary(train_data)
## Diagnosis EducationLevel SleepQuality CholesterolHDL
## Tidak Alzheimer:973 Min. :-1.42245 Min. :-1.72857 Min. :-1.70534
## Alzheimer :973 1st Qu.:-0.31690 1st Qu.:-0.86290 1st Qu.:-0.83524
## Median :-0.31690 Median : 0.04097 Median : 0.06394
## Mean :-0.02148 Mean :-0.01481 Mean : 0.02811
## 3rd Qu.: 0.78865 3rd Qu.: 0.82859 3rd Qu.: 0.85873
## Max. : 1.89420 Max. : 1.67204 Max. : 1.75100
## MMSE FunctionalAssessment MemoryComplaints BehavioralProblems
## Min. :-1.71248 Min. :-1.7520 Min. :-0.51236 Min. :-0.43116
## 1st Qu.:-0.94214 1st Qu.:-1.0373 1st Qu.:-0.51236 1st Qu.:-0.43116
## Median :-0.18240 Median :-0.1852 Median :-0.51236 Median :-0.43116
## Mean :-0.08885 Mean :-0.1454 Mean : 0.09016 Mean : 0.03933
## 3rd Qu.: 0.74578 3rd Qu.: 0.7191 3rd Qu.:-0.51236 3rd Qu.:-0.43116
## Max. : 1.76895 Max. : 1.6996 Max. : 1.95085 Max. : 2.31826
## ADL
## Min. :-1.6888
## 1st Qu.:-1.0520
## Median :-0.2259
## Mean :-0.1426
## 3rd Qu.: 0.7321
## Max. : 1.7007
model <- logistic_reg(mixture = 1, penalty = 0.1) %>%
set_engine("glmnet") %>%
set_mode("classification") %>%
fit(Diagnosis ~ ., data = train_data)
tidy(model)
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loaded glmnet 4.1-8
## # A tibble: 9 × 3
## term estimate penalty
## <chr> <dbl> <dbl>
## 1 (Intercept) -0.136 0.1
## 2 EducationLevel 0 0.1
## 3 SleepQuality 0 0.1
## 4 CholesterolHDL 0 0.1
## 5 MMSE -0.0758 0.1
## 6 FunctionalAssessment -0.457 0.1
## 7 MemoryComplaints 0.172 0.1
## 8 BehavioralProblems 0.0147 0.1
## 9 ADL -0.305 0.1
table(train_data$Diagnosis)
##
## Tidak Alzheimer Alzheimer
## 973 973
head(train_data, 5)
## Diagnosis EducationLevel SleepQuality CholesterolHDL MMSE
## 1 Tidak Alzheimer -1.4224508 0.05682309 0.8455334 0.6801384
## 2 Tidak Alzheimer -1.4224508 -0.82437383 -0.1118981 -0.1436783
## 3 Tidak Alzheimer -0.3169004 0.11957058 0.8477818 1.4817338
## 4 Tidak Alzheimer -1.4224508 -0.73876003 -0.7082206 1.2847330
## 5 Tidak Alzheimer -1.4224508 0.28390795 0.6408057 1.5828443
## FunctionalAssessment MemoryComplaints BehavioralProblems ADL
## 1 0.7047429 -0.5123573 -0.4311563 -0.8104125
## 2 0.3335878 -0.5123573 -0.4311563 -1.6842869
## 3 0.1486786 -0.5123573 -0.4311563 1.3671308
## 4 0.8006261 -0.5123573 2.3182650 -1.4328981
## 5 -1.3589699 -0.5123573 2.3182650 -0.1452872
CEK MODEL DENGAN DATA TEST
test <- read.csv("test_data.csv")
test$Diagnosis <- factor(test$Diagnosis, levels = c("Tidak Alzheimer", "Alzheimer"))
#Prediksi kelas dengan model
pred_class <- predict(model, new_data = test, type = "class")
pred_class <- factor(pred_class$.pred_class, levels = levels(test$Diagnosis))
#Evaluasi model dengan confusionMatrix
conf_matrix <- confusionMatrix(pred_class, test$Diagnosis)
print(conf_matrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Tidak Alzheimer Alzheimer
## Tidak Alzheimer 329 59
## Alzheimer 87 169
##
## Accuracy : 0.7733
## 95% CI : (0.739, 0.8051)
## No Information Rate : 0.646
## P-Value [Acc > NIR] : 1.787e-12
##
## Kappa : 0.5177
##
## Mcnemar's Test P-Value : 0.02545
##
## Sensitivity : 0.7909
## Specificity : 0.7412
## Pos Pred Value : 0.8479
## Neg Pred Value : 0.6602
## Prevalence : 0.6460
## Detection Rate : 0.5109
## Detection Prevalence : 0.6025
## Balanced Accuracy : 0.7660
##
## 'Positive' Class : Tidak Alzheimer
##