# PRE-PROCESSING ----

# Packages required by this script. Only packages that are not installed yet
# are downloaded, so re-running the script does not reinstall everything.
required_pkgs <- c(
  "dplyr", "ggplot2", "ggpubr", "tidyr", "ggcorrplot", "tidyverse", "caret",
  "MASS", "corrplot", "psych", "ROSE", "parsnip", "glmnet", "broom", "car",
  "ResourceSelection"
)
missing_pkgs <- setdiff(required_pkgs, rownames(installed.packages()))
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}

# Attach order is kept as in the original script. MASS is attached after
# dplyr, so MASS::select masks dplyr::select from here on.
library(dplyr)
library(ggplot2)
library(ggpubr)
library(tidyr)
library(ggcorrplot)
library(tidyverse)
library(caret)
library(MASS)
library(corrplot)
library(psych)
library(ROSE)
library(parsnip)
library(glmnet)
library(broom)
library(car)               # for the VIF test
library(ResourceSelection) # for the Hosmer-Lemeshow test
# Load the raw data set. Forward slashes are used because a single backslash
# inside an R string starts an escape sequence, so "C:\Users" does not parse.
project_dir <- "C:/Users/dell/latihan/SEMESTER 4/ANMUL/PROJECT"
df <- read.csv(file.path(project_dir, "alzheimers_disease_data (1).csv"))
str(df)

# Data-quality checks: number of duplicated rows, then missing values per
# column.
cat("Jumlah duplikasi:", sum(duplicated(df)), "\n")
print(colSums(is.na(df)))

# Drop PatientID and DoctorInCharge before modelling. dplyr::select is
# namespace-qualified because the MASS package also exports a select().
df <- df %>%
  dplyr::select(-PatientID, -DoctorInCharge)
str(df)
# Encode the outcome as a labelled factor: 0 -> "Tidak Alzheimer",
# 1 -> "Alzheimer". The first level acts as the reference class.
df$Diagnosis <- factor(
  df$Diagnosis,
  levels = c(0, 1),
  labels = c("Tidak Alzheimer", "Alzheimer")
)

# Standardise (centre and scale) every numeric column except Diagnosis and
# Gender. Masking via %in% cannot lengthen the logical index if one of those
# columns is absent, unlike assigning by name.
# NOTE(review): scaling uses statistics of the full data set, i.e. before the
# train/test split performed later in this script -- confirm this is intended.
num_cols <- vapply(df, is.numeric, logical(1))
num_cols[names(num_cols) %in% c("Diagnosis", "Gender")] <- FALSE
df[num_cols] <- scale(df[num_cols])
# Bar chart of the outcome's class distribution.
ggplot(df, aes(x = Diagnosis)) +
  geom_bar(fill = "lightblue") +
  ggtitle("Distribusi Diagnosis Alzheimer") +
  xlab("Status Diagnosis") +
  ylab("Jumlah Pasien")

# Absolute number of rows per class. The header ends with a newline so the
# printed table starts on its own line.
diagnosis_counts <- table(df$Diagnosis)
cat("Jumlah tiap kategori:\n")
print(diagnosis_counts)

# Share of each class in percent, rounded to two decimals for display.
label_percentages <- prop.table(diagnosis_counts) * 100
cat("Tiap Label Diagnosis:\n")
print(round(label_percentages, 2))
# Candidate predictors: numeric columns with more than one distinct value.
valid_vars <- names(df)[
  vapply(df, function(x) is.numeric(x) && length(unique(x)) > 1, logical(1))
]

# One-way ANOVA of each candidate against Diagnosis; the p-value of the
# Diagnosis term is kept. A variable whose model cannot be fitted (or yields
# no p-value) gets NA instead of aborting the whole screening.
anova_pvals <- vapply(
  valid_vars,
  function(var) {
    # Backticks keep column names that are not syntactic R names valid inside
    # the formula.
    formula <- as.formula(paste0("`", var, "` ~ Diagnosis"))
    tryCatch(
      {
        aov_model <- aov(formula, data = df)
        p_value <- summary(aov_model)[[1]][["Pr(>F)"]][1]
        if (length(p_value) == 1) p_value else NA_real_
      },
      error = function(e) NA_real_
    )
  },
  numeric(1)
)

# which() silently drops NA p-values; plain logical indexing would turn them
# into NA names and break the column selection below.
selected_features <- names(anova_pvals)[which(anova_pvals < 0.05)]
cat("Fitur terpilih (p < 0.05):\n")
print(selected_features)

# drop = FALSE keeps a data frame even when no feature passes the filter.
final_data <- df[, c("Diagnosis", selected_features), drop = FALSE]
# Unpenalised logistic regression on the selected features; used below for
# diagnostic checks.
model_vif <- glm(Diagnosis ~ ., data = final_data, family = binomial)

# Multicollinearity check: variance inflation factor of every predictor.
vif_values <- vif(model_vif)
print("VIF Tiap Variabel:")
print(vif_values)

# Diagnostic plot number 4 of the fitted model (Cook's distance per row).
# NOTE(review): `caption` is normally a list indexed by plot number, so a
# single string may not be shown for which = 4 -- confirm the plot title.
plot(model_vif, which = 4, caption = "Influence plot")

# Hosmer-Lemeshow goodness-of-fit test with 10 groups; the observed outcome is
# recoded to 0/1 with "Alzheimer" as 1.
hoslem <- hoslem.test(
  as.integer(final_data$Diagnosis == "Alzheimer"),
  fitted(model_vif),
  g = 10
)
print(hoslem)

# Deviance residuals against fitted probabilities, with a zero reference line.
plot(
  fitted(model_vif),
  residuals(model_vif, type = "deviance"),
  xlab = "Fitted values",
  ylab = "Deviance Residuals",
  main = "Residuals vs Fitted"
)
abline(h = 0, col = "red")
# 70/30 train/test split on Diagnosis via caret::createDataPartition; the
# seed makes the partition reproducible.
set.seed(123)
train_idx <- createDataPartition(final_data$Diagnosis, p = 0.7, list = FALSE)
train_data <- final_data[train_idx, ]
test_data <- final_data[-train_idx, ]

# Persist both partitions. Forward slashes are used because single backslashes
# in an R string are escape sequences and "C:\Users" does not parse.
project_dir <- "C:/Users/dell/latihan/SEMESTER 4/ANMUL/PROJECT"
write.csv(
  train_data,
  file.path(project_dir, "train_data.csv"),
  row.names = FALSE
)
write.csv(
  test_data,
  file.path(project_dir, "test_data.csv"),
  row.names = FALSE
)
# Class counts of the training partition before balancing.
cat("Distribusi sebelum oversampling:\n")
print(table(train_data$Diagnosis))

# Oversample the training partition only. The target size N is twice the size
# of the larger class.
set.seed(123)
data_balanced <- ovun.sample(
  Diagnosis ~ .,
  data = train_data,
  method = "over",
  N = max(table(train_data$Diagnosis)) * 2
)$data

# Class counts after balancing.
cat("Distribusi setelah oversampling:\n")
print(table(data_balanced$Diagnosis))

# Persist the balanced training set. Forward slashes are used because single
# backslashes in an R string are escape sequences.
project_dir <- "C:/Users/dell/latihan/SEMESTER 4/ANMUL/PROJECT"
write.csv(
  data_balanced,
  file.path(project_dir, "train_data_oversampled.csv"),
  row.names = FALSE
)
# MODEL LOGISTIC REGRESSION ----

# Read the oversampled training set from the same directory it was written to,
# so the script does not depend on the current working directory.
project_dir <- "C:/Users/dell/latihan/SEMESTER 4/ANMUL/PROJECT"
train_data <- read.csv(file.path(project_dir, "train_data_oversampled.csv"))

# read.csv returns Diagnosis as text; restore the factor with the same level
# order as used during pre-processing.
train_data$Diagnosis <- factor(
  train_data$Diagnosis,
  levels = c("Tidak Alzheimer", "Alzheimer")
)

# Penalised logistic regression on the glmnet engine with mixture = 1 and a
# fixed penalty of 0.1, fitted on all remaining columns.
model <- logistic_reg(mixture = 1, penalty = 0.1) %>%
  set_engine("glmnet") %>%
  set_mode("classification") %>%
  fit(Diagnosis ~ ., data = train_data)

# Coefficient table of the fitted model.
tidy(model)

# Quick look at the training data and its class balance.
head(train_data, 5)
table(train_data$Diagnosis)
# Read the held-out test set from the directory it was written to and restore
# the outcome factor (read.csv returns it as text).
project_dir <- "C:/Users/dell/latihan/SEMESTER 4/ANMUL/PROJECT"
test <- read.csv(file.path(project_dir, "test_data.csv"))
test$Diagnosis <- factor(
  test$Diagnosis,
  levels = c("Tidak Alzheimer", "Alzheimer")
)

# Hard class predictions; the levels are aligned with the reference so that
# confusionMatrix() accepts both factors.
pred_class <- predict(model, new_data = test, type = "class")
pred_class <- factor(
  pred_class$.pred_class,
  levels = levels(test$Diagnosis)
)

# "Alzheimer" is declared the positive class explicitly; by default caret
# takes the first factor level ("Tidak Alzheimer"), which would report
# sensitivity/specificity for the non-disease class.
conf_matrix <- confusionMatrix(
  pred_class,
  test$Diagnosis,
  positive = "Alzheimer"
)
print(conf_matrix)
# Install tidymodels only when it is missing, then attach it.
if (!requireNamespace("tidymodels", quietly = TRUE)) {
  install.packages("tidymodels")
}
library(tidymodels)

# Make sure both partitions carry the outcome as a factor with identical
# level order.
train_data$Diagnosis <- factor(
  train_data$Diagnosis,
  levels = c("Tidak Alzheimer", "Alzheimer")
)
test_data$Diagnosis <- factor(
  test_data$Diagnosis,
  levels = c("Tidak Alzheimer", "Alzheimer")
)
# Logistic regression specification whose penalty and mixture are left open
# for tuning.
log_reg <- logistic_reg(penalty = tune(), mixture = tune()) %>%
  set_engine("glmnet") %>%
  set_mode("classification")

# Recipe without extra steps: Diagnosis is the outcome, every other column a
# predictor.
rec <- recipe(Diagnosis ~ ., data = train_data)

wf <- workflow() %>%
  add_model(log_reg) %>%
  add_recipe(rec)

# Regular grid with 4 levels per parameter (16 candidates).
grid_vals <- grid_regular(
  mixture(range = c(0, 1)),
  penalty(range = c(-3, 0)),
  levels = 4
)

# 5-fold cross-validation on the training data.
# NOTE(review): if train_data is the oversampled set, duplicated rows can land
# in both the analysis and assessment part of a fold -- confirm this is
# acceptable for the reported ROC AUC.
set.seed(123)
folds <- vfold_cv(train_data, v = 5)

# Evaluate every grid candidate by ROC AUC; out-of-fold predictions are kept.
log_reg_tuned <- tune_grid(
  wf,
  resamples = folds,
  grid = grid_vals,
  metrics = metric_set(roc_auc),
  control = control_grid(save_pred = TRUE)
)
# Pick the candidate with the best ROC AUC, plug it into the workflow and
# refit on the full training data.
best_model <- select_best(log_reg_tuned, metric = "roc_auc")
final_wf <- finalize_workflow(wf, best_model)
final_model <- fit(final_wf, data = train_data)

# Hard class predictions and class probabilities for the held-out test set.
pred_class <- predict(final_model, test_data, type = "class")
pred_prob <- predict(final_model, test_data, type = "prob")

# Test rows side by side with their predictions.
results <- bind_cols(test_data, pred_class, pred_prob)

# The yardstick functions are namespace-qualified because the result is stored
# under the name `conf_mat`, which shadows the function of the same name.
conf_mat <- yardstick::conf_mat(
  results,
  truth = Diagnosis,
  estimate = .pred_class
)
yardstick::accuracy(results, truth = Diagnosis, estimate = .pred_class)
autoplot(conf_mat)