library(dplyr) library(ggplot2) library(nnet) library(tidyr) library(knitr) library(kableExtra) library(scales)
data_multi <- read.csv(“data_multinomial.csv”, sep = “;”)
var_y_multi <- “Species” var_x_multi <- setdiff(colnames(data_multi), c(“Id”, “Species”))
data_multi <- data_multi %>% mutate( Species = factor( Species, levels = c(“Iris-setosa”, “Iris-versicolor”, “Iris-virginica”) ) )
dplyr::glimpse(data_multi)
data_multi %>% count(Species) %>% mutate(proporsi = n / sum(n)) %>% ggplot(aes(x = Species, y = proporsi, fill = Species)) + geom_col(width = 0.65, alpha = 0.94) + geom_text(aes(label = percent(proporsi, accuracy = 0.1)), vjust = -0.4, fontface = “bold”, color = “#243142”) + scale_y_continuous(labels = percent_format(), limits = c(0, 0.65)) + scale_fill_manual(values = c(“#2f7f73”, “#5568b8”, “#d18b2f”)) + labs( title = “Distribusi Spesies Iris”, subtitle = “Respons multinomial pada dataset Iris”, x = NULL, y = “Proporsi” ) + theme_minimal() + theme(legend.position = “none”)
formula_multi <- as.formula(paste(var_y_multi, “~”, paste(var_x_multi, collapse = ” + “)))
fit_multi <- nnet::multinom( formula_multi, data = data_multi, trace = FALSE )
summary(fit_multi)
multi_sum <- summary(fit_multi)
coef_multi <- as.data.frame(multi_sum\(coefficients) se_multi <- as.data.frame(multi_sum\)standard.errors)
coef_long <- coef_multi %>% tibble::rownames_to_column(“kategori”) %>% tidyr::pivot_longer( cols = -kategori, names_to = “variabel”, values_to = “estimate” )
se_long <- se_multi %>% tibble::rownames_to_column(“kategori”) %>% tidyr::pivot_longer( cols = -kategori, names_to = “variabel”, values_to = “std_error” )
result_multi <- coef_long %>% left_join(se_long, by = c(“kategori”, “variabel”)) %>% mutate( z_value = estimate / std_error, p_value = 2 * (1 - pnorm(abs(z_value))), RRR = exp(estimate), CI_low = exp(estimate - 1.96 * std_error), CI_high = exp(estimate + 1.96 * std_error) )
result_multi %>% mutate( across( c(estimate, std_error, z_value, p_value, RRR, CI_low, CI_high), ~ round(.x, 4) ) ) %>% kable( caption = “Ringkasan hasil regresi logistik multinomial (Base: Iris-setosa)”, col.names = c( “Kategori”, “Variabel”, “Estimate”, “SE”, “z-value”, “p-value”, “RRR”, “CI 2.5%”, “CI 97.5%” ) ) %>% kable_styling( bootstrap_options = c(“striped”, “hover”, “condensed”, “responsive”), full_width = TRUE )
pred_prob_multi <- predict(fit_multi, type = “probs”) head(pred_prob_multi)
data_multi_pred <- data_multi %>% bind_cols(as.data.frame(pred_prob_multi)) %>% mutate( prediksi = predict(fit_multi, type = “class”) )
conf_multi <- table( Aktual = data_multi_pred\(Species, Prediksi = data_multi_pred\)prediksi ) conf_multi
accuracy_multi <- sum(diag(conf_multi)) / sum(conf_multi) print(paste(“Akurasi Model:”, round(accuracy_multi * 100, 2), “%”))
grid_multi <- expand.grid( SepalLengthCm = seq( min(data_multi\(SepalLengthCm), max(data_multi\)SepalLengthCm), length.out = 100 ), SepalWidthCm = mean(data_multi\(SepalWidthCm), PetalLengthCm = mean(data_multi\)PetalLengthCm), PetalWidthCm = mean(data_multi$PetalWidthCm) )
grid_prob_multi <- predict(fit_multi, newdata = grid_multi, type = “probs”)
grid_multi_plot <- grid_multi %>% bind_cols(as.data.frame(grid_prob_multi)) %>% tidyr::pivot_longer( cols = c(“Iris-setosa”, “Iris-versicolor”, “Iris-virginica”), names_to = “Species”, values_to = “probabilitas” )
ggplot( grid_multi_plot, aes(x = SepalLengthCm, y = probabilitas, color = Species) ) + geom_line(linewidth = 1.35) + scale_y_continuous(labels = percent_format()) + scale_color_manual(values = c(“#2f7f73”, “#5568b8”, “#d18b2f”)) + labs( title = “Prediksi Probabilitas Spesies Iris”, subtitle = “Variabel lain ditahan pada nilai rata-rata”, x = “Sepal Length (Cm)”, y = “Probabilitas prediksi”, color = “Spesies” ) + theme_minimal()
#INI ORDINAL
library(dplyr) library(ggplot2) library(MASS) library(tidyr) library(knitr) library(kableExtra) library(scales) # Jika belum punya paket ordinal, hilangkan tanda # di bawah ini: # install.packages(“ordinal”) library(ordinal)
data_ord <- read.csv(“data_ordinal.csv”, sep = “;”)
data_ord <- data_ord %>% mutate( Grade = factor(Grade, levels = c(“Rendah”, “Sedang”, “Tinggi”), ordered = TRUE), Gender = as.factor(Gender) )
dplyr::glimpse(data_ord)
data_ord %>% count(Grade) %>% mutate(proporsi = n / sum(n)) %>% kable( digits = 3, caption = “Distribusi Tingkat Grade Mahasiswa” ) %>% kable_styling( bootstrap_options = c(“striped”, “hover”, “condensed”, “responsive”), full_width = FALSE )
data_ord %>% count(Grade) %>% mutate(proporsi = n / sum(n)) %>% ggplot(aes(x = Grade, y = proporsi, fill = Grade)) + geom_col(width = 0.65, alpha = 0.94) + geom_text(aes(label = percent(proporsi, accuracy = 0.1)), vjust = -0.4, fontface = “bold”, color = “#243142”) + scale_y_continuous(labels = percent_format(), limits = c(0, max(n/sum(n)) + 0.15)) + scale_fill_manual(values = c(“#2f7f73”, “#d18b2f”, “#5568b8”)) + labs( title = “Distribusi Tingkat Grade”, subtitle = “Respons ordinal: kategori memiliki urutan alami.”, x = “Grade”, y = “Proporsi” ) + theme_minimal() + theme(legend.position = “none”)
fit_ord <- MASS::polr( Grade ~ Hours_Studied + Attendance + Previous_Scores + Sleep_Hours + Gender, data = data_ord, method = “logistic”, Hess = TRUE )
summary(fit_ord)
ord_coef <- coef(summary(fit_ord))
result_ord <- as.data.frame(ord_coef) %>%
tibble::rownames_to_column(“parameter”) %>% dplyr::rename( estimate =
Value, std_error = Std. Error, t_value =
t value ) %>% mutate( p_value = 2 * (1 -
pnorm(abs(t_value))), jenis = ifelse(grepl(“\|”, parameter), “Cutpoint”,
“Koefisien”), estimate_slide = ifelse(jenis == “Koefisien”, -estimate,
estimate), OR_polr = ifelse(jenis == “Koefisien”, exp(estimate),
NA_real_), OR_slide = ifelse(jenis == “Koefisien”, exp(estimate_slide),
NA_real_), CI_low_polr = ifelse(jenis == “Koefisien”, exp(estimate -
1.96 * std_error), NA_real_), CI_high_polr = ifelse(jenis ==
“Koefisien”, exp(estimate + 1.96 * std_error), NA_real_) )
result_ord %>% mutate( across( c(estimate, estimate_slide, std_error, t_value, p_value, OR_polr, OR_slide, CI_low_polr, CI_high_polr), ~ round(.x, 4) ) ) %>% kable( caption = “Ringkasan hasil regresi logistik ordinal”, col.names = c( “Parameter”, “Estimate polr”, “SE”, “z/t-value”, “p-value”, “Jenis”, “Estimate slide”, “OR polr”, “OR slide”, “CI polr 2.5%”, “CI polr 97.5%” ) ) %>% kable_styling( bootstrap_options = c(“striped”, “hover”, “condensed”, “responsive”), full_width = TRUE )
pred_prob_ord <- predict(fit_ord, type = “probs”) head(pred_prob_ord)
data_ord_pred <- data_ord %>% bind_cols(as.data.frame(pred_prob_ord)) %>% mutate( prediksi = predict(fit_ord, type = “class”) )
head(data_ord_pred)
conf_ord <- table( Aktual = data_ord_pred\(Grade, Prediksi = data_ord_pred\)prediksi ) conf_ord
accuracy_ord <- sum(diag(conf_ord)) / sum(conf_ord) print(paste(“Akurasi Model:”, round(accuracy_ord * 100, 2), “%”))
grid_ord <- expand.grid( Hours_Studied = seq( min(data_ord\(Hours_Studied, na.rm = TRUE), max(data_ord\)Hours_Studied, na.rm = TRUE), length.out = 120 ), Attendance = mean(data_ord\(Attendance, na.rm = TRUE), Previous_Scores = mean(data_ord\)Previous_Scores, na.rm = TRUE), Sleep_Hours = mean(data_ord$Sleep_Hours, na.rm = TRUE), Gender = “Female” )
grid_prob_ord <- predict(fit_ord, newdata = grid_ord, type = “probs”)
grid_ord_plot <- grid_ord %>% bind_cols(as.data.frame(grid_prob_ord)) %>% tidyr::pivot_longer( cols = c(“Rendah”, “Sedang”, “Tinggi”), names_to = “Grade”, values_to = “probabilitas” ) %>% mutate(Grade = factor(Grade, levels = c(“Rendah”, “Sedang”, “Tinggi”)))
ggplot( grid_ord_plot, aes(x = Hours_Studied, y = probabilitas, color = Grade) ) + geom_line(linewidth = 1.25) + scale_y_continuous(labels = percent_format()) + scale_color_manual(values = c(“#2f7f73”, “#d18b2f”, “#5568b8”)) + labs( title = “Prediksi Probabilitas Tingkat Grade”, subtitle = “Variabel lain ditahan pada nilai rata-rata; Gender = Female.”, x = “Jam Belajar (Hours Studied)”, y = “Probabilitas prediksi”, color = “Grade” ) + theme_minimal()
eps <- 1e-6
grid_ord_cumlogit <- grid_ord %>%
bind_cols(as.data.frame(grid_prob_ord)) %>% mutate(
P(Y <= Rendah) = Rendah, P(Y <= Sedang)
= Rendah + Sedang ) %>% tidyr::pivot_longer( cols =
c(P(Y <= Rendah), P(Y <= Sedang)),
names_to = “batas_kumulatif”, values_to = “prob_kumulatif” ) %>%
mutate( prob_kumulatif = pmin(pmax(prob_kumulatif, eps), 1 - eps),
cumulative_logit = qlogis(prob_kumulatif) )
ggplot( grid_ord_cumlogit, aes(x = Hours_Studied, y = cumulative_logit, color = batas_kumulatif) ) + geom_line(linewidth = 1.25) + scale_color_manual(values = c( “P(Y <= Rendah)” = “#2f7f73”, “P(Y <= Sedang)” = “#d18b2f” )) + labs( title = “Parallel Lines pada Model Ordinal”, subtitle = “Garis yang sejajar adalah cumulative logit, bukan probabilitas kategori tunggal.”, x = “Jam Belajar (Hours Studied)”, y = “Logit peluang kumulatif”, color = “Batas kumulatif” ) + theme_minimal()
fit_clm <- ordinal::clm( Grade ~ Hours_Studied + Attendance + Previous_Scores + Sleep_Hours + Gender, data = data_ord, link = “logit” )
summary(fit_clm)
ordinal::nominal_test(fit_clm)
library(dplyr) library(ggplot2) library(tidyr) library(knitr) library(kableExtra) library(scales)
data_poi <- read.csv(“data_poisson1.csv”, sep = “;”)
var_y_poi <- “injuries”
var_x_poi <- setdiff( colnames(data_poi), c(“id”, “date”, “time”, “latitude”, “longitude”, “injuries”) )
data_poi <- data_poi %>% mutate(across(where(is.character), as.factor))
dplyr::glimpse(data_poi)
data_poi %>% count(.data[[var_y_poi]], name = “n”) %>% mutate(proporsi = n / sum(n)) %>% kable( digits = 3, caption = “Distribusi Jumlah Cedera (Injuries) pada Kecelakaan” ) %>% kable_styling( bootstrap_options = c(“striped”, “hover”, “condensed”, “responsive”), full_width = FALSE )
data_poi %>% count(.data[[var_y_poi]], name = “n”) %>% ggplot(aes(x = .data[[var_y_poi]], y = n)) + geom_col(width = 0.8, fill = “#2f7f73”, alpha = 0.92) + scale_x_continuous(breaks = scales::pretty_breaks()) + labs( title = “Distribusi Jumlah Cedera”, subtitle = “Respons Poisson berupa hitungan bilangan bulat nonnegatif.”, x = “Jumlah Cedera (Injuries)”, y = “Frekuensi” ) + theme_minimal()
formula_poi <- as.formula(paste(var_y_poi, “~”, paste(var_x_poi, collapse = ” + “)))
fit_pois <- glm( formula_poi, data = data_poi, family = poisson(link = “log”) )
summary(fit_pois)
pois_coef <- as.data.frame(coef(summary(fit_pois))) %>%
tibble::rownames_to_column(“parameter”) %>% dplyr::rename( estimate =
Estimate, std_error = Std. Error, z_value =
z value, p_value = Pr(>|z|) ) %>%
mutate( IRR = exp(estimate), CI_low = exp(estimate - 1.96 * std_error),
CI_high = exp(estimate + 1.96 * std_error), perubahan_persen = 100 *
(IRR - 1) )
pois_coef %>% mutate( across( c(estimate, std_error, z_value, p_value, IRR, CI_low, CI_high, perubahan_persen), ~ round(.x, 4) ) ) %>% kable( caption = “Ringkasan Hasil Regresi Poisson”, col.names = c( “Parameter”, “Estimate”, “SE”, “z-value”, “p-value”, “IRR”, “CI 2.5%”, “CI 97.5%”, “Perubahan (%)” ) ) %>% kable_styling( bootstrap_options = c(“striped”, “hover”, “condensed”, “responsive”), full_width = TRUE )
get_typical <- function(x) { if(is.numeric(x)) return(mean(x, na.rm = TRUE)) t <- table(x) return(names(t)[which.max(t)]) }
baseline_data <- lapply(data_poi[var_x_poi], get_typical) baseline_data <- as.data.frame(baseline_data)
grid_pois <- baseline_data[rep(1, 100), ] grid_pois\(vehicles_involved <- seq( min(data_poi\)vehicles_involved, na.rm = TRUE), max(data_poi$vehicles_involved, na.rm = TRUE), length.out = 100 )
pred_pois <- predict(fit_pois, newdata = grid_pois, type = “link”, se.fit = TRUE)
grid_pois_plot <- grid_pois %>% mutate( fit_link = pred_pois\(fit, se_link = pred_pois\)se.fit, rate = exp(fit_link), rate_low = exp(fit_link - 1.96 * se_link), rate_high = exp(fit_link + 1.96 * se_link) )
ggplot(grid_pois_plot, aes(x = vehicles_involved, y = rate)) + geom_ribbon(aes(ymin = rate_low, ymax = rate_high), fill = “#2f7f73”, alpha = 0.16) + geom_line(color = “#2f7f73”, linewidth = 1.35) + labs( title = “Prediksi Jumlah Cedera Berdasarkan Jumlah Kendaraan Terlibat”, subtitle = “Variabel lain ditahan pada nilai rata-rata/modus.”, x = “Jumlah Kendaraan Terlibat (Vehicles Involved)”, y = “Prediksi Jumlah Cedera (Injuries)” ) + theme_minimal()
dispersion_pois <- sum(residuals(fit_pois, type = “pearson”)^2) / df.residual(fit_pois)
tibble::tibble( Dispersion Pearson = dispersion_pois,
Interpretasi ringkas = dplyr::case_when( dispersion_pois
< 1.5 ~ “Tidak ada indikasi overdispersion berat”, dispersion_pois
< 2.5 ~ “Ada indikasi overdispersion sedang”, TRUE ~ “Ada indikasi
overdispersion kuat” ) ) %>% mutate(Dispersion Pearson =
round(Dispersion Pearson, 3)) %>% kable(caption =
“Indikasi overdispersion pada model Poisson”) %>% kable_styling(
bootstrap_options = c(“striped”, “hover”, “condensed”, “responsive”),
full_width = FALSE )
zero_share <- mean(data_poi[[var_y_poi]] == 0)
tibble::tibble( Proporsi Y = 0 = zero_share,
Catatan = ifelse( zero_share > 0, “OLS pada log(Y) akan
kehilangan observasi nol atau perlu transformasi log(Y+c).”, “Tidak ada
nilai nol, tetapi asumsi residual log-scale tetap perlu diperiksa.” ) )
%>% mutate(Proporsi Y = 0 =
percent(Proporsi Y = 0, accuracy = 0.1)) %>%
kable(caption = “Konsekuensi nilai nol untuk model log(Y) dengan OLS”)
%>% kable_styling( bootstrap_options = c(“striped”, “hover”,
“condensed”, “responsive”), full_width = FALSE )
data_poi_positive <- data_poi %>% filter(.data[[var_y_poi]] > 0)
formula_log_ols <- as.formula(paste(“log(”, var_y_poi, “) ~”, paste(var_x_poi, collapse = ” + “)))
fit_log_ols <- lm( formula_log_ols, data = data_poi_positive )
log_ols_coef <- as.data.frame(coef(summary(fit_log_ols))) %>%
tibble::rownames_to_column(“parameter”) %>% dplyr::rename( estimate =
Estimate, std_error = Std. Error, t_value =
t value, p_value = Pr(>|t|) ) %>%
mutate( multiplier = exp(estimate), perubahan_persen = 100 * (multiplier
- 1) )
log_ols_coef %>% mutate(across(c(estimate, std_error, t_value, p_value, multiplier, perubahan_persen), ~ round(.x, 4))) %>% kable( caption = “Ilustrasi OLS pada log(Y) untuk observasi positif saja”, col.names = c( “Parameter”, “Estimate”, “SE”, “t-value”, “p-value”, “exp(Estimate)”, “Perubahan (%)” ) ) %>% kable_styling( bootstrap_options = c(“striped”, “hover”, “condensed”, “responsive”), full_width = TRUE )
poisson_logols_table <- tibble::tibble( Aspek = c( “Jenis
respons”, “Nilai nol”, “Target model”, “Bentuk mean”, “Interpretasi”,
“Prediksi”, “Exposure”, “Kapan lebih tepat?” ),
Regresi Poisson = c( “Hitungan nonnegatif”, “Dapat langsung
menangani Y = 0”, “E(Y | X)”, “log{E(Y | X)} = X beta”, “exp(beta)
sebagai incidence rate ratio”, “Selalu nonnegatif”, “Offset
log(exposure) sangat alami”, “Count data, banyak nol, rate kejadian,
atau target mean hitungan” ), OLS pada log(Y) = c( “Kontinu
positif atau hitungan besar tanpa nol”, “Tidak dapat memakai Y = 0 tanpa
modifikasi”, “E(log Y | X)”, “log(Y) = X beta + error”, “exp(beta)
sebagai multiplier pada geometric mean”, “Perlu retransformation untuk
kembali ke skala Y”, “Biasanya memakai log(rate) atau kontrol exposure
manual”, “Y positif, log-scale residual baik, dan fokus pada perubahan
persentase” ) )
poisson_logols_table %>% kable(caption = “Perbandingan regresi Poisson dan OLS pada log(Y)”) %>% kable_styling( bootstrap_options = c(“striped”, “hover”, “condensed”, “responsive”), full_width = TRUE )
required_packages <- c(“dplyr”, “ggplot2”, “broom”, “knitr”, “scales”, “tidyr”) missing_packages <- required_packages[ !vapply(required_packages, requireNamespace, logical(1), quietly = TRUE)]
if (length(missing_packages) > 0) { stop( “Paket berikut belum tersedia:”, paste(missing_packages, collapse = “,”), “. Silakan install terlebih dahulu.” ) }
invisible(lapply(required_packages, library, character.only = TRUE))
raw_data <- read.csv(“data_biner.csv”, sep = “;”) # Menyamakan nama kolom menjadi huruf kecil semua agar sesuai permintaan colnames(raw_data) <- tolower(colnames(raw_data))
ringkasan_data <- data.frame( Keterangan = c(“Jumlah observasi awal”, “Jumlah variabel”), Nilai = c(nrow(raw_data), ncol(raw_data)) )
knitr::kable( ringkasan_data, caption = “Ukuran dataset awal” )
data_model <- raw_data %>% select(survived, sex, age, pclass) %>% drop_na() %>% # Membuang missing values pada Age agar regresi bisa jalan mutate( status_keselamatan = factor( ifelse(survived == 1, “Selamat”, “Meninggal”), levels = c(“Selamat”, “Meninggal”) ), survived_num = survived, sex = factor(sex, levels = c(“male”, “female”), labels = c(“Laki-laki”, “Perempuan”)), pclass = factor(pclass, levels = c(1, 2, 3), labels = c(“Kelas 1”, “Kelas 2”, “Kelas 3”)) ) %>% droplevels()
class_summary <- data_model %>% count(status_keselamatan, name
= “Jumlah”) %>% mutate(Proporsi = scales::percent(Jumlah /
sum(Jumlah), accuracy = 0.1)) %>%
rename(Status Keselamatan = status_keselamatan)
knitr::kable( class_summary, caption = “Distribusi kelas respon (Survival)” )
ggplot(data_model, aes(x = status_keselamatan, fill = status_keselamatan)) + geom_bar(width = 0.62, color = “white”, linewidth = 0.8) + geom_text( stat = “count”, aes(label = after_stat(count)), vjust = -0.4, fontface = “bold” ) + scale_fill_manual(values = c(“Selamat” = “#2a9d8f”, “Meninggal” = “#e76f51”)) + labs( title = “Distribusi Status Keselamatan”, x = NULL, y = “Jumlah Penumpang” ) + theme_minimal(base_size = 12) + theme(legend.position = “none”)
ggplot(data_model, aes(x = status_keselamatan, y = age, fill = status_keselamatan)) + geom_boxplot(alpha = 0.72) + scale_fill_manual(values = c(“Selamat” = “#2a9d8f”, “Meninggal” = “#e76f51”)) + labs( title = “Distribusi Usia Berdasarkan Status Keselamatan”, x = “Status Keselamatan”, y = “Usia” ) + theme_minimal(base_size = 12) + theme(legend.position = “none”)
stratified_split <- function(y, prop = 0.8) { idx_by_class <- split(seq_along(y), y) train_idx <- lapply( idx_by_class, function(idx) sample(idx, size = floor(length(idx) * prop)) ) unlist(train_idx, use.names = FALSE) }
set.seed(123) # Agara hasil split konsisten train_id <- stratified_split(data_model$survived_num, prop = 0.8)
train_data <- data_model[train_id, ] test_data <- data_model[-train_id, ]
biner_fit <- glm( survived_num ~ sex + age + pclass, data = train_data, family = binomial(link = “logit”) )
ringkasan_model <- data.frame( Keterangan = c( “Jumlah observasi training”, “Null deviance”, “Residual deviance”, “AIC” ), Nilai = c( nobs(biner_fit), round(biner_fit\(null.deviance, 3), round(biner_fit\)deviance, 3), round(AIC(biner_fit), 3) ) )
knitr::kable( ringkasan_model, caption = “Ringkasan kecocokan model regresi logistik” )
coef_table <- broom::tidy(biner_fit) %>% filter(term !=
“(Intercept)”) %>% mutate( odds_ratio = exp(estimate), ci_low =
exp(estimate - 1.96 * std.error), ci_high = exp(estimate + 1.96 *
std.error) ) %>% arrange(p.value) %>% transmute(
Variabel/level = term, Odds ratio =
round(odds_ratio, 3), Interval kepercayaan 95% =
paste0(round(ci_low, 3), ” - “, round(ci_high, 3)), p-value
= signif(p.value, 3) )
knitr::kable( coef_table, caption = “Koefisien Signifikan (berdasarkan p-value)” )
p_train <- predict(biner_fit, newdata = train_data, type = “response”) p_test <- predict(biner_fit, newdata = test_data, type = “response”)
safe_div <- function(num, den) { ifelse(den == 0, NA_real_, num / den) }
classification_metrics <- function(actual, prob, threshold = 0.5) { pred <- as.integer(prob >= threshold)
tp <- sum(pred == 1 & actual == 1) tn <- sum(pred == 0 & actual == 0) fp <- sum(pred == 1 & actual == 0) fn <- sum(pred == 0 & actual == 1)
sensitivity <- safe_div(tp, tp + fn) specificity <- safe_div(tn, tn + fp) precision <- safe_div(tp, tp + fp) negative_predictive_value <- safe_div(tn, tn + fn) accuracy <- safe_div(tp + tn, tp + tn + fp + fn)
data.frame( threshold = threshold, accuracy = accuracy, error_rate = 1 - accuracy, sensitivity = sensitivity, specificity = specificity, precision = precision, negative_predictive_value = negative_predictive_value, f1_score = safe_div(2 * precision * sensitivity, precision + sensitivity), balanced_accuracy = (sensitivity + specificity) / 2, false_positive_rate = 1 - specificity, false_negative_rate = 1 - sensitivity ) }
confusion_matrix <- function(actual, prob, threshold = 0.5) { pred_label <- factor( ifelse(prob >= threshold, “Prediksi Selamat”, “Prediksi Meninggal”), levels = c(“Prediksi Selamat”, “Prediksi Meninggal”) ) actual_label <- factor( ifelse(actual == 1, “Aktual Selamat”, “Aktual Meninggal”), levels = c(“Aktual Selamat”, “Aktual Meninggal”) ) addmargins(table(actual_label, pred_label)) }
confusion_default <- confusion_matrix(test_data$survived_num, p_test, threshold = 0.5) knitr::kable(confusion_default, caption = “Confusion matrix testing pada threshold 0.50”)
roc_points <- function(actual, prob) { thresholds <- c(Inf, sort(unique(prob), decreasing = TRUE), -Inf)
out <- lapply(thresholds, function(th) { pred <- as.integer(prob >= th) tp <- sum(pred == 1 & actual == 1) tn <- sum(pred == 0 & actual == 0) fp <- sum(pred == 1 & actual == 0) fn <- sum(pred == 0 & actual == 1)
sensitivity <- safe_div(tp, tp + fn)
specificity <- safe_div(tn, tn + fp)
data.frame(
threshold = th,
sensitivity = sensitivity,
specificity = specificity,
fpr = 1 - specificity,
youden = sensitivity + specificity - 1
)
}) bind_rows(out) }
auc_value <- function(roc_df) { roc_sorted <- roc_df %>% arrange(fpr, sensitivity) sum(diff(roc_sorted\(fpr) * (head(roc_sorted\)sensitivity, -1) + tail(roc_sorted$sensitivity, -1)) / 2) }
roc_train <- roc_points(train_data\(survived_num, p_train) %>% mutate(data = "Training") roc_test <- roc_points(test_data\)survived_num, p_test) %>% mutate(data = “Testing”)
auc_train <- auc_value(roc_train) auc_test <- auc_value(roc_test)
optimal_train <- roc_train %>% filter(is.finite(threshold)) %>% arrange(desc(youden), desc(sensitivity)) %>% slice(1)
threshold_opt <- optimal_train$threshold[1]
test_at_opt <- roc_points(test_data$survived_num, p_test) %>% filter(is.finite(threshold)) %>% slice_min(abs(threshold - threshold_opt), n = 1, with_ties = FALSE) %>% mutate(data = “Testing pada threshold optimal”)
roc_plot <- bind_rows(roc_train, roc_test)
ggplot(roc_plot, aes(x = fpr, y = sensitivity, color = data)) + geom_path(linewidth = 1.1) + geom_abline(intercept = 0, slope = 1, linetype = “dashed”, color = “#6c757d”) + geom_point( data = optimal_train, aes(x = fpr, y = sensitivity), inherit.aes = FALSE, color = “#ffb703”, fill = “#fb8500”, shape = 21, size = 4, stroke = 1.2 ) + coord_equal() + scale_color_manual(values = c(“Training” = “#0077b6”, “Testing” = “#e76f51”)) + labs( title = “Kurva ROC Model Keselamatan Penumpang”, subtitle = paste0(“AUC train:”, round(auc_train, 3), ” | AUC test: “, round(auc_test, 3)), x =”False Positive Rate”, y = “True Positive Rate (Sensitivity)” ) + theme_minimal(base_size = 12) + theme(legend.position = “bottom”)
test_prob_plot <- test_data %>% mutate(peluang_selamat = p_test)
ggplot(test_prob_plot, aes(x = peluang_selamat, fill = status_keselamatan)) + geom_density(alpha = 0.55, color = “white”, linewidth = 0.7) + geom_vline( xintercept = threshold_opt, color = “#fb8500”, linewidth = 1.2, linetype = “dashed” ) + annotate( “label”, x = threshold_opt, y = Inf, label = paste0(“Optimal Threshold =”, round(threshold_opt, 3)), vjust = 1.4, fill = “#fff3b0”, color = “#5f370e”, label.size = 0 ) + scale_fill_manual(values = c(“Selamat” = “#2a9d8f”, “Meninggal” = “#e76f51”)) + labs( title = “Distribusi Peluang Prediksi Selamat (Testing Data)”, x = “Peluang Prediksi Selamat”, y = “Kepadatan (Density)”, fill = “Status Aktual” ) + theme_minimal(base_size = 12)