==========================================

0. MEMUAT LIBRARY

==========================================

Pastikan kamu sudah install paket-paket ini sebelumnya

library(dplyr) library(ggplot2) library(nnet) library(tidyr) library(knitr) library(kableExtra) library(scales)

==========================================

1. PERSIAPAN DATA

==========================================

Membaca data CSV (karena pemisahnya titik koma, gunakan sep = “;”)

data_multi <- read.csv(“data_multinomial.csv”, sep = “;”)

Mendefinisikan variabel sesuai permintaan

var_y_multi <- “Species” var_x_multi <- setdiff(colnames(data_multi), c(“Id”, “Species”))

Menjadikan target (Species) sebagai tipe factor dengan urutan level yang jelas

data_multi <- data_multi %>% mutate( Species = factor( Species, levels = c(“Iris-setosa”, “Iris-versicolor”, “Iris-virginica”) ) )

dplyr::glimpse(data_multi)

==========================================

2. VISUALISASI DISTRIBUSI TARGET

==========================================

data_multi %>% count(Species) %>% mutate(proporsi = n / sum(n)) %>% ggplot(aes(x = Species, y = proporsi, fill = Species)) + geom_col(width = 0.65, alpha = 0.94) + geom_text(aes(label = percent(proporsi, accuracy = 0.1)), vjust = -0.4, fontface = “bold”, color = “#243142”) + scale_y_continuous(labels = percent_format(), limits = c(0, 0.65)) + scale_fill_manual(values = c(“#2f7f73”, “#5568b8”, “#d18b2f”)) + labs( title = “Distribusi Spesies Iris”, subtitle = “Respons multinomial pada dataset Iris”, x = NULL, y = “Proporsi” ) + theme_minimal() + theme(legend.position = “none”)

==========================================

3. MEMBANGUN MODEL MULTINOMIAL

==========================================

Membuat formula dinamis: Species ~ SepalLengthCm + SepalWidthCm + …

formula_multi <- as.formula(paste(var_y_multi, “~”, paste(var_x_multi, collapse = ” + “)))

fit_multi <- nnet::multinom( formula_multi, data = data_multi, trace = FALSE )

summary(fit_multi)

==========================================

4. EKSTRAKSI HASIL (KOEFISIEN, P-VALUE, RRR, CI)

==========================================

multi_sum <- summary(fit_multi)

coef_multi <- as.data.frame(multi_sum\(coefficients) se_multi <- as.data.frame(multi_sum\)standard.errors)

coef_long <- coef_multi %>% tibble::rownames_to_column(“kategori”) %>% tidyr::pivot_longer( cols = -kategori, names_to = “variabel”, values_to = “estimate” )

se_long <- se_multi %>% tibble::rownames_to_column(“kategori”) %>% tidyr::pivot_longer( cols = -kategori, names_to = “variabel”, values_to = “std_error” )

result_multi <- coef_long %>% left_join(se_long, by = c(“kategori”, “variabel”)) %>% mutate( z_value = estimate / std_error, p_value = 2 * (1 - pnorm(abs(z_value))), RRR = exp(estimate), CI_low = exp(estimate - 1.96 * std_error), CI_high = exp(estimate + 1.96 * std_error) )

Membuat tabel ringkasan yang rapi

result_multi %>% mutate( across( c(estimate, std_error, z_value, p_value, RRR, CI_low, CI_high), ~ round(.x, 4) ) ) %>% kable( caption = “Ringkasan hasil regresi logistik multinomial (Base: Iris-setosa)”, col.names = c( “Kategori”, “Variabel”, “Estimate”, “SE”, “z-value”, “p-value”, “RRR”, “CI 2.5%”, “CI 97.5%” ) ) %>% kable_styling( bootstrap_options = c(“striped”, “hover”, “condensed”, “responsive”), full_width = TRUE )

==========================================

5. PREDIKSI DAN EVALUASI (CONFUSION MATRIX)

==========================================

pred_prob_multi <- predict(fit_multi, type = “probs”) head(pred_prob_multi)

data_multi_pred <- data_multi %>% bind_cols(as.data.frame(pred_prob_multi)) %>% mutate( prediksi = predict(fit_multi, type = “class”) )

Menampilkan Confusion Matrix

conf_multi <- table( Aktual = data_multi_pred\(Species, Prediksi = data_multi_pred\)prediksi ) conf_multi

Menghitung Akurasi

accuracy_multi <- sum(diag(conf_multi)) / sum(conf_multi) print(paste(“Akurasi Model:”, round(accuracy_multi * 100, 2), “%”))

==========================================

6. PLOT PROBABILITAS PREDIKSI

==========================================

Membuat grid: Memvariasikan SepalLengthCm, menahan variabel lain di nilai rata-rata

grid_multi <- expand.grid( SepalLengthCm = seq( min(data_multi\(SepalLengthCm), max(data_multi\)SepalLengthCm), length.out = 100 ), SepalWidthCm = mean(data_multi\(SepalWidthCm), PetalLengthCm = mean(data_multi\)PetalLengthCm), PetalWidthCm = mean(data_multi$PetalWidthCm) )

grid_prob_multi <- predict(fit_multi, newdata = grid_multi, type = “probs”)

grid_multi_plot <- grid_multi %>% bind_cols(as.data.frame(grid_prob_multi)) %>% tidyr::pivot_longer( cols = c(“Iris-setosa”, “Iris-versicolor”, “Iris-virginica”), names_to = “Species”, values_to = “probabilitas” )

ggplot( grid_multi_plot, aes(x = SepalLengthCm, y = probabilitas, color = Species) ) + geom_line(linewidth = 1.35) + scale_y_continuous(labels = percent_format()) + scale_color_manual(values = c(“#2f7f73”, “#5568b8”, “#d18b2f”)) + labs( title = “Prediksi Probabilitas Spesies Iris”, subtitle = “Variabel lain ditahan pada nilai rata-rata”, x = “Sepal Length (Cm)”, y = “Probabilitas prediksi”, color = “Spesies” ) + theme_minimal()

#INI ORDINAL

==========================================

0. MEMUAT LIBRARY

==========================================

library(dplyr) library(ggplot2) library(MASS) library(tidyr) library(knitr) library(kableExtra) library(scales) # Jika belum punya paket ordinal, hilangkan tanda # di bawah ini: # install.packages(“ordinal”) library(ordinal)

==========================================

1. PERSIAPAN DATA

==========================================

Membaca data

data_ord <- read.csv(“data_ordinal.csv”, sep = “;”)

Mengatur variabel Grade sebagai tipe ordinal factor (berurutan)

Level disesuaikan dengan dataset: Rendah -> Sedang -> Tinggi

data_ord <- data_ord %>% mutate( Grade = factor(Grade, levels = c(“Rendah”, “Sedang”, “Tinggi”), ordered = TRUE), Gender = as.factor(Gender) )

dplyr::glimpse(data_ord)

==========================================

2. VISUALISASI DISTRIBUSI TARGET ORDINAL

==========================================

Tabel Proporsi

data_ord %>% count(Grade) %>% mutate(proporsi = n / sum(n)) %>% kable( digits = 3, caption = “Distribusi Tingkat Grade Mahasiswa” ) %>% kable_styling( bootstrap_options = c(“striped”, “hover”, “condensed”, “responsive”), full_width = FALSE )

Bar Plot

data_ord %>% count(Grade) %>% mutate(proporsi = n / sum(n)) %>% ggplot(aes(x = Grade, y = proporsi, fill = Grade)) + geom_col(width = 0.65, alpha = 0.94) + geom_text(aes(label = percent(proporsi, accuracy = 0.1)), vjust = -0.4, fontface = “bold”, color = “#243142”) + scale_y_continuous(labels = percent_format(), limits = c(0, max(n/sum(n)) + 0.15)) + scale_fill_manual(values = c(“#2f7f73”, “#d18b2f”, “#5568b8”)) + labs( title = “Distribusi Tingkat Grade”, subtitle = “Respons ordinal: kategori memiliki urutan alami.”, x = “Grade”, y = “Proporsi” ) + theme_minimal() + theme(legend.position = “none”)

==========================================

3. MEMBANGUN MODEL POLR (REGRESI ORDINAL)

==========================================

Menggunakan 5 variabel sebagai prediktor (tanpa Exam_Score untuk menghindari perfect separation)

fit_ord <- MASS::polr( Grade ~ Hours_Studied + Attendance + Previous_Scores + Sleep_Hours + Gender, data = data_ord, method = “logistic”, Hess = TRUE )

summary(fit_ord)

==========================================

4. EKSTRAKSI HASIL (KOEFISIEN, OR, CI, P-VALUE)

==========================================

ord_coef <- coef(summary(fit_ord))

result_ord <- as.data.frame(ord_coef) %>% tibble::rownames_to_column(“parameter”) %>% dplyr::rename( estimate = Value, std_error = Std. Error, t_value = t value ) %>% mutate( p_value = 2 * (1 - pnorm(abs(t_value))), jenis = ifelse(grepl(“\|”, parameter), “Cutpoint”, “Koefisien”), estimate_slide = ifelse(jenis == “Koefisien”, -estimate, estimate), OR_polr = ifelse(jenis == “Koefisien”, exp(estimate), NA_real_), OR_slide = ifelse(jenis == “Koefisien”, exp(estimate_slide), NA_real_), CI_low_polr = ifelse(jenis == “Koefisien”, exp(estimate - 1.96 * std_error), NA_real_), CI_high_polr = ifelse(jenis == “Koefisien”, exp(estimate + 1.96 * std_error), NA_real_) )

result_ord %>% mutate( across( c(estimate, estimate_slide, std_error, t_value, p_value, OR_polr, OR_slide, CI_low_polr, CI_high_polr), ~ round(.x, 4) ) ) %>% kable( caption = “Ringkasan hasil regresi logistik ordinal”, col.names = c( “Parameter”, “Estimate polr”, “SE”, “z/t-value”, “p-value”, “Jenis”, “Estimate slide”, “OR polr”, “OR slide”, “CI polr 2.5%”, “CI polr 97.5%” ) ) %>% kable_styling( bootstrap_options = c(“striped”, “hover”, “condensed”, “responsive”), full_width = TRUE )

==========================================

5. PREDIKSI DAN CONFUSION MATRIX

==========================================

pred_prob_ord <- predict(fit_ord, type = “probs”) head(pred_prob_ord)

data_ord_pred <- data_ord %>% bind_cols(as.data.frame(pred_prob_ord)) %>% mutate( prediksi = predict(fit_ord, type = “class”) )

head(data_ord_pred)

conf_ord <- table( Aktual = data_ord_pred\(Grade, Prediksi = data_ord_pred\)prediksi ) conf_ord

Menghitung Akurasi

accuracy_ord <- sum(diag(conf_ord)) / sum(conf_ord) print(paste(“Akurasi Model:”, round(accuracy_ord * 100, 2), “%”))

==========================================

6. PLOT PROBABILITAS PREDIKSI

==========================================

Grid: Variasikan Hours_Studied, tahan variabel numerik di rata-rata, Gender = “Female”

grid_ord <- expand.grid( Hours_Studied = seq( min(data_ord\(Hours_Studied, na.rm = TRUE), max(data_ord\)Hours_Studied, na.rm = TRUE), length.out = 120 ), Attendance = mean(data_ord\(Attendance, na.rm = TRUE), Previous_Scores = mean(data_ord\)Previous_Scores, na.rm = TRUE), Sleep_Hours = mean(data_ord$Sleep_Hours, na.rm = TRUE), Gender = “Female” )

grid_prob_ord <- predict(fit_ord, newdata = grid_ord, type = “probs”)

grid_ord_plot <- grid_ord %>% bind_cols(as.data.frame(grid_prob_ord)) %>% tidyr::pivot_longer( cols = c(“Rendah”, “Sedang”, “Tinggi”), names_to = “Grade”, values_to = “probabilitas” ) %>% mutate(Grade = factor(Grade, levels = c(“Rendah”, “Sedang”, “Tinggi”)))

ggplot( grid_ord_plot, aes(x = Hours_Studied, y = probabilitas, color = Grade) ) + geom_line(linewidth = 1.25) + scale_y_continuous(labels = percent_format()) + scale_color_manual(values = c(“#2f7f73”, “#d18b2f”, “#5568b8”)) + labs( title = “Prediksi Probabilitas Tingkat Grade”, subtitle = “Variabel lain ditahan pada nilai rata-rata; Gender = Female.”, x = “Jam Belajar (Hours Studied)”, y = “Probabilitas prediksi”, color = “Grade” ) + theme_minimal()

==========================================

7. PARALLEL LINES PADA MODEL ORDINAL (CUMULATIVE LOGIT)

==========================================

eps <- 1e-6

grid_ord_cumlogit <- grid_ord %>% bind_cols(as.data.frame(grid_prob_ord)) %>% mutate( P(Y <= Rendah) = Rendah, P(Y <= Sedang) = Rendah + Sedang ) %>% tidyr::pivot_longer( cols = c(P(Y <= Rendah), P(Y <= Sedang)), names_to = “batas_kumulatif”, values_to = “prob_kumulatif” ) %>% mutate( prob_kumulatif = pmin(pmax(prob_kumulatif, eps), 1 - eps), cumulative_logit = qlogis(prob_kumulatif) )

ggplot( grid_ord_cumlogit, aes(x = Hours_Studied, y = cumulative_logit, color = batas_kumulatif) ) + geom_line(linewidth = 1.25) + scale_color_manual(values = c( “P(Y <= Rendah)” = “#2f7f73”, “P(Y <= Sedang)” = “#d18b2f” )) + labs( title = “Parallel Lines pada Model Ordinal”, subtitle = “Garis yang sejajar adalah cumulative logit, bukan probabilitas kategori tunggal.”, x = “Jam Belajar (Hours Studied)”, y = “Logit peluang kumulatif”, color = “Batas kumulatif” ) + theme_minimal()

==========================================

8. UJI ASUMSI PROPORTIONAL ODDS (dengan ordinal::clm)

==========================================

fit_clm <- ordinal::clm( Grade ~ Hours_Studied + Attendance + Previous_Scores + Sleep_Hours + Gender, data = data_ord, link = “logit” )

summary(fit_clm)

Uji indikatif apakah efek prediktor perlu dibuat non-proportional

ordinal::nominal_test(fit_clm)

INI POISSON

==========================================

0. MEMUAT LIBRARY

==========================================

library(dplyr) library(ggplot2) library(tidyr) library(knitr) library(kableExtra) library(scales)

==========================================

1. PERSIAPAN DATA

==========================================

Membaca data

data_poi <- read.csv(“data_poisson1.csv”, sep = “;”)

Mendefinisikan target dan prediktor sesuai permintaan

var_y_poi <- “injuries”

var_x_poi <- setdiff( colnames(data_poi), c(“id”, “date”, “time”, “latitude”, “longitude”, “injuries”) )

Memastikan variabel-variabel karakter/kategorikal diubah menjadi factor

data_poi <- data_poi %>% mutate(across(where(is.character), as.factor))

dplyr::glimpse(data_poi)

==========================================

2. VISUALISASI DISTRIBUSI TARGET (INJURIES)

==========================================

Tabel Proporsi

data_poi %>% count(.data[[var_y_poi]], name = “n”) %>% mutate(proporsi = n / sum(n)) %>% kable( digits = 3, caption = “Distribusi Jumlah Cedera (Injuries) pada Kecelakaan” ) %>% kable_styling( bootstrap_options = c(“striped”, “hover”, “condensed”, “responsive”), full_width = FALSE )

Bar Plot

data_poi %>% count(.data[[var_y_poi]], name = “n”) %>% ggplot(aes(x = .data[[var_y_poi]], y = n)) + geom_col(width = 0.8, fill = “#2f7f73”, alpha = 0.92) + scale_x_continuous(breaks = scales::pretty_breaks()) + labs( title = “Distribusi Jumlah Cedera”, subtitle = “Respons Poisson berupa hitungan bilangan bulat nonnegatif.”, x = “Jumlah Cedera (Injuries)”, y = “Frekuensi” ) + theme_minimal()

==========================================

3. MEMBANGUN MODEL POISSON

==========================================

Membuat formula dinamis: injuries ~ severity + weather + … dst

formula_poi <- as.formula(paste(var_y_poi, “~”, paste(var_x_poi, collapse = ” + “)))

Model poisson tanpa offset (karena data per kejadian/event base = 1)

fit_pois <- glm( formula_poi, data = data_poi, family = poisson(link = “log”) )

summary(fit_pois)

==========================================

4. EKSTRAKSI HASIL (KOEFISIEN, IRR, CI)

==========================================

pois_coef <- as.data.frame(coef(summary(fit_pois))) %>% tibble::rownames_to_column(“parameter”) %>% dplyr::rename( estimate = Estimate, std_error = Std. Error, z_value = z value, p_value = Pr(>|z|) ) %>% mutate( IRR = exp(estimate), CI_low = exp(estimate - 1.96 * std_error), CI_high = exp(estimate + 1.96 * std_error), perubahan_persen = 100 * (IRR - 1) )

pois_coef %>% mutate( across( c(estimate, std_error, z_value, p_value, IRR, CI_low, CI_high, perubahan_persen), ~ round(.x, 4) ) ) %>% kable( caption = “Ringkasan Hasil Regresi Poisson”, col.names = c( “Parameter”, “Estimate”, “SE”, “z-value”, “p-value”, “IRR”, “CI 2.5%”, “CI 97.5%”, “Perubahan (%)” ) ) %>% kable_styling( bootstrap_options = c(“striped”, “hover”, “condensed”, “responsive”), full_width = TRUE )

==========================================

5. PLOT PREDIKSI POISSON

==========================================

Helper untuk mendapatkan nilai rata-rata (numerik) atau modus (kategorik)

get_typical <- function(x) { if(is.numeric(x)) return(mean(x, na.rm = TRUE)) t <- table(x) return(names(t)[which.max(t)]) }

Membuat data referensi (semua ditahan pada nilai rata-rata / modus)

baseline_data <- lapply(data_poi[var_x_poi], get_typical) baseline_data <- as.data.frame(baseline_data)

Replikasi menjadi 100 baris untuk memvariasikan salah satu variabel (misalnya: vehicles_involved)

grid_pois <- baseline_data[rep(1, 100), ] grid_pois\(vehicles_involved <- seq( min(data_poi\)vehicles_involved, na.rm = TRUE), max(data_poi$vehicles_involved, na.rm = TRUE), length.out = 100 )

Melakukan prediksi

pred_pois <- predict(fit_pois, newdata = grid_pois, type = “link”, se.fit = TRUE)

grid_pois_plot <- grid_pois %>% mutate( fit_link = pred_pois\(fit, se_link = pred_pois\)se.fit, rate = exp(fit_link), rate_low = exp(fit_link - 1.96 * se_link), rate_high = exp(fit_link + 1.96 * se_link) )

ggplot(grid_pois_plot, aes(x = vehicles_involved, y = rate)) + geom_ribbon(aes(ymin = rate_low, ymax = rate_high), fill = “#2f7f73”, alpha = 0.16) + geom_line(color = “#2f7f73”, linewidth = 1.35) + labs( title = “Prediksi Jumlah Cedera Berdasarkan Jumlah Kendaraan Terlibat”, subtitle = “Variabel lain ditahan pada nilai rata-rata/modus.”, x = “Jumlah Kendaraan Terlibat (Vehicles Involved)”, y = “Prediksi Jumlah Cedera (Injuries)” ) + theme_minimal()

==========================================

6. EVALUASI OVERDISPERSION

==========================================

dispersion_pois <- sum(residuals(fit_pois, type = “pearson”)^2) / df.residual(fit_pois)

tibble::tibble( Dispersion Pearson = dispersion_pois, Interpretasi ringkas = dplyr::case_when( dispersion_pois < 1.5 ~ “Tidak ada indikasi overdispersion berat”, dispersion_pois < 2.5 ~ “Ada indikasi overdispersion sedang”, TRUE ~ “Ada indikasi overdispersion kuat” ) ) %>% mutate(Dispersion Pearson = round(Dispersion Pearson, 3)) %>% kable(caption = “Indikasi overdispersion pada model Poisson”) %>% kable_styling( bootstrap_options = c(“striped”, “hover”, “condensed”, “responsive”), full_width = FALSE )

==========================================

7. EVALUASI NILAI NOL (ZERO-INFLATION CHECK)

==========================================

zero_share <- mean(data_poi[[var_y_poi]] == 0)

tibble::tibble( Proporsi Y = 0 = zero_share, Catatan = ifelse( zero_share > 0, “OLS pada log(Y) akan kehilangan observasi nol atau perlu transformasi log(Y+c).”, “Tidak ada nilai nol, tetapi asumsi residual log-scale tetap perlu diperiksa.” ) ) %>% mutate(Proporsi Y = 0 = percent(Proporsi Y = 0, accuracy = 0.1)) %>% kable(caption = “Konsekuensi nilai nol untuk model log(Y) dengan OLS”) %>% kable_styling( bootstrap_options = c(“striped”, “hover”, “condensed”, “responsive”), full_width = FALSE )

==========================================

8. OLS PADA LOG(Y) UNTUK OBSERVASI POSITIF SAJA

==========================================

data_poi_positive <- data_poi %>% filter(.data[[var_y_poi]] > 0)

Membangun formula: log(injuries) ~ prediktor …

formula_log_ols <- as.formula(paste(“log(”, var_y_poi, “) ~”, paste(var_x_poi, collapse = ” + “)))

fit_log_ols <- lm( formula_log_ols, data = data_poi_positive )

log_ols_coef <- as.data.frame(coef(summary(fit_log_ols))) %>% tibble::rownames_to_column(“parameter”) %>% dplyr::rename( estimate = Estimate, std_error = Std. Error, t_value = t value, p_value = Pr(>|t|) ) %>% mutate( multiplier = exp(estimate), perubahan_persen = 100 * (multiplier - 1) )

log_ols_coef %>% mutate(across(c(estimate, std_error, t_value, p_value, multiplier, perubahan_persen), ~ round(.x, 4))) %>% kable( caption = “Ilustrasi OLS pada log(Y) untuk observasi positif saja”, col.names = c( “Parameter”, “Estimate”, “SE”, “t-value”, “p-value”, “exp(Estimate)”, “Perubahan (%)” ) ) %>% kable_styling( bootstrap_options = c(“striped”, “hover”, “condensed”, “responsive”), full_width = TRUE )

==========================================

9. PERBANDINGAN POISSON vs OLS (TABEL)

==========================================

poisson_logols_table <- tibble::tibble( Aspek = c( “Jenis respons”, “Nilai nol”, “Target model”, “Bentuk mean”, “Interpretasi”, “Prediksi”, “Exposure”, “Kapan lebih tepat?” ), Regresi Poisson = c( “Hitungan nonnegatif”, “Dapat langsung menangani Y = 0”, “E(Y | X)”, “log{E(Y | X)} = X beta”, “exp(beta) sebagai incidence rate ratio”, “Selalu nonnegatif”, “Offset log(exposure) sangat alami”, “Count data, banyak nol, rate kejadian, atau target mean hitungan” ), OLS pada log(Y) = c( “Kontinu positif atau hitungan besar tanpa nol”, “Tidak dapat memakai Y = 0 tanpa modifikasi”, “E(log Y | X)”, “log(Y) = X beta + error”, “exp(beta) sebagai multiplier pada geometric mean”, “Perlu retransformation untuk kembali ke skala Y”, “Biasanya memakai log(rate) atau kontrol exposure manual”, “Y positif, log-scale residual baik, dan fokus pada perubahan persentase” ) )

poisson_logols_table %>% kable(caption = “Perbandingan regresi Poisson dan OLS pada log(Y)”) %>% kable_styling( bootstrap_options = c(“striped”, “hover”, “condensed”, “responsive”), full_width = TRUE )

INI BINER

==========================================

0. PERSIAPAN PAKET

==========================================

required_packages <- c(“dplyr”, “ggplot2”, “broom”, “knitr”, “scales”, “tidyr”) missing_packages <- required_packages[ !vapply(required_packages, requireNamespace, logical(1), quietly = TRUE)]

if (length(missing_packages) > 0) { stop( “Paket berikut belum tersedia:”, paste(missing_packages, collapse = “,”), “. Silakan install terlebih dahulu.” ) }

invisible(lapply(required_packages, library, character.only = TRUE))

==========================================

1. MEMBACA & MENYIAPKAN DATA

==========================================

raw_data <- read.csv(“data_biner.csv”, sep = “;”) # Menyamakan nama kolom menjadi huruf kecil semua agar sesuai permintaan colnames(raw_data) <- tolower(colnames(raw_data))

ringkasan_data <- data.frame( Keterangan = c(“Jumlah observasi awal”, “Jumlah variabel”), Nilai = c(nrow(raw_data), ncol(raw_data)) )

knitr::kable( ringkasan_data, caption = “Ukuran dataset awal” )

Menentukan variabel sesuai permintaan dan menangani missing values

data_model <- raw_data %>% select(survived, sex, age, pclass) %>% drop_na() %>% # Membuang missing values pada Age agar regresi bisa jalan mutate( status_keselamatan = factor( ifelse(survived == 1, “Selamat”, “Meninggal”), levels = c(“Selamat”, “Meninggal”) ), survived_num = survived, sex = factor(sex, levels = c(“male”, “female”), labels = c(“Laki-laki”, “Perempuan”)), pclass = factor(pclass, levels = c(1, 2, 3), labels = c(“Kelas 1”, “Kelas 2”, “Kelas 3”)) ) %>% droplevels()

==========================================

2. EKSPLORASI DATA (EDA)

==========================================

class_summary <- data_model %>% count(status_keselamatan, name = “Jumlah”) %>% mutate(Proporsi = scales::percent(Jumlah / sum(Jumlah), accuracy = 0.1)) %>% rename(Status Keselamatan = status_keselamatan)

knitr::kable( class_summary, caption = “Distribusi kelas respon (Survival)” )

ggplot(data_model, aes(x = status_keselamatan, fill = status_keselamatan)) + geom_bar(width = 0.62, color = “white”, linewidth = 0.8) + geom_text( stat = “count”, aes(label = after_stat(count)), vjust = -0.4, fontface = “bold” ) + scale_fill_manual(values = c(“Selamat” = “#2a9d8f”, “Meninggal” = “#e76f51”)) + labs( title = “Distribusi Status Keselamatan”, x = NULL, y = “Jumlah Penumpang” ) + theme_minimal(base_size = 12) + theme(legend.position = “none”)

Boxplot Usia vs Status Keselamatan

ggplot(data_model, aes(x = status_keselamatan, y = age, fill = status_keselamatan)) + geom_boxplot(alpha = 0.72) + scale_fill_manual(values = c(“Selamat” = “#2a9d8f”, “Meninggal” = “#e76f51”)) + labs( title = “Distribusi Usia Berdasarkan Status Keselamatan”, x = “Status Keselamatan”, y = “Usia” ) + theme_minimal(base_size = 12) + theme(legend.position = “none”)

==========================================

3. SPLIT DATA (TRAINING & TESTING)

==========================================

stratified_split <- function(y, prop = 0.8) { idx_by_class <- split(seq_along(y), y) train_idx <- lapply( idx_by_class, function(idx) sample(idx, size = floor(length(idx) * prop)) ) unlist(train_idx, use.names = FALSE) }

set.seed(123) # Agara hasil split konsisten train_id <- stratified_split(data_model$survived_num, prop = 0.8)

train_data <- data_model[train_id, ] test_data <- data_model[-train_id, ]

==========================================

4. MEMBANGUN MODEL LOGISTIK BINER

==========================================

biner_fit <- glm( survived_num ~ sex + age + pclass, data = train_data, family = binomial(link = “logit”) )

ringkasan_model <- data.frame( Keterangan = c( “Jumlah observasi training”, “Null deviance”, “Residual deviance”, “AIC” ), Nilai = c( nobs(biner_fit), round(biner_fit\(null.deviance, 3), round(biner_fit\)deviance, 3), round(AIC(biner_fit), 3) ) )

knitr::kable( ringkasan_model, caption = “Ringkasan kecocokan model regresi logistik” )

coef_table <- broom::tidy(biner_fit) %>% filter(term != “(Intercept)”) %>% mutate( odds_ratio = exp(estimate), ci_low = exp(estimate - 1.96 * std.error), ci_high = exp(estimate + 1.96 * std.error) ) %>% arrange(p.value) %>% transmute( Variabel/level = term, Odds ratio = round(odds_ratio, 3), Interval kepercayaan 95% = paste0(round(ci_low, 3), ” - “, round(ci_high, 3)), p-value = signif(p.value, 3) )

knitr::kable( coef_table, caption = “Koefisien Signifikan (berdasarkan p-value)” )

==========================================

5. PREDIKSI DAN EVALUASI MATRIX

==========================================

p_train <- predict(biner_fit, newdata = train_data, type = “response”) p_test <- predict(biner_fit, newdata = test_data, type = “response”)

safe_div <- function(num, den) { ifelse(den == 0, NA_real_, num / den) }

classification_metrics <- function(actual, prob, threshold = 0.5) { pred <- as.integer(prob >= threshold)

tp <- sum(pred == 1 & actual == 1) tn <- sum(pred == 0 & actual == 0) fp <- sum(pred == 1 & actual == 0) fn <- sum(pred == 0 & actual == 1)

sensitivity <- safe_div(tp, tp + fn) specificity <- safe_div(tn, tn + fp) precision <- safe_div(tp, tp + fp) negative_predictive_value <- safe_div(tn, tn + fn) accuracy <- safe_div(tp + tn, tp + tn + fp + fn)

data.frame( threshold = threshold, accuracy = accuracy, error_rate = 1 - accuracy, sensitivity = sensitivity, specificity = specificity, precision = precision, negative_predictive_value = negative_predictive_value, f1_score = safe_div(2 * precision * sensitivity, precision + sensitivity), balanced_accuracy = (sensitivity + specificity) / 2, false_positive_rate = 1 - specificity, false_negative_rate = 1 - sensitivity ) }

confusion_matrix <- function(actual, prob, threshold = 0.5) { pred_label <- factor( ifelse(prob >= threshold, “Prediksi Selamat”, “Prediksi Meninggal”), levels = c(“Prediksi Selamat”, “Prediksi Meninggal”) ) actual_label <- factor( ifelse(actual == 1, “Aktual Selamat”, “Aktual Meninggal”), levels = c(“Aktual Selamat”, “Aktual Meninggal”) ) addmargins(table(actual_label, pred_label)) }

confusion_default <- confusion_matrix(test_data$survived_num, p_test, threshold = 0.5) knitr::kable(confusion_default, caption = “Confusion matrix testing pada threshold 0.50”)

==========================================

6. EVALUASI ROC KURVA DAN AUC

==========================================

roc_points <- function(actual, prob) { thresholds <- c(Inf, sort(unique(prob), decreasing = TRUE), -Inf)

out <- lapply(thresholds, function(th) { pred <- as.integer(prob >= th) tp <- sum(pred == 1 & actual == 1) tn <- sum(pred == 0 & actual == 0) fp <- sum(pred == 1 & actual == 0) fn <- sum(pred == 0 & actual == 1)

sensitivity <- safe_div(tp, tp + fn)
specificity <- safe_div(tn, tn + fp)

data.frame(
  threshold = th,
  sensitivity = sensitivity,
  specificity = specificity,
  fpr = 1 - specificity,
  youden = sensitivity + specificity - 1
)

}) bind_rows(out) }

auc_value <- function(roc_df) { roc_sorted <- roc_df %>% arrange(fpr, sensitivity) sum(diff(roc_sorted\(fpr) * (head(roc_sorted\)sensitivity, -1) + tail(roc_sorted$sensitivity, -1)) / 2) }

roc_train <- roc_points(train_data\(survived_num, p_train) %>% mutate(data = "Training") roc_test <- roc_points(test_data\)survived_num, p_test) %>% mutate(data = “Testing”)

auc_train <- auc_value(roc_train) auc_test <- auc_value(roc_test)

optimal_train <- roc_train %>% filter(is.finite(threshold)) %>% arrange(desc(youden), desc(sensitivity)) %>% slice(1)

threshold_opt <- optimal_train$threshold[1]

test_at_opt <- roc_points(test_data$survived_num, p_test) %>% filter(is.finite(threshold)) %>% slice_min(abs(threshold - threshold_opt), n = 1, with_ties = FALSE) %>% mutate(data = “Testing pada threshold optimal”)

roc_plot <- bind_rows(roc_train, roc_test)

ggplot(roc_plot, aes(x = fpr, y = sensitivity, color = data)) + geom_path(linewidth = 1.1) + geom_abline(intercept = 0, slope = 1, linetype = “dashed”, color = “#6c757d”) + geom_point( data = optimal_train, aes(x = fpr, y = sensitivity), inherit.aes = FALSE, color = “#ffb703”, fill = “#fb8500”, shape = 21, size = 4, stroke = 1.2 ) + coord_equal() + scale_color_manual(values = c(“Training” = “#0077b6”, “Testing” = “#e76f51”)) + labs( title = “Kurva ROC Model Keselamatan Penumpang”, subtitle = paste0(“AUC train:”, round(auc_train, 3), ” | AUC test: “, round(auc_test, 3)), x =”False Positive Rate”, y = “True Positive Rate (Sensitivity)” ) + theme_minimal(base_size = 12) + theme(legend.position = “bottom”)

==========================================

7. DENSITY PLOT PREDIKSI

==========================================

test_prob_plot <- test_data %>% mutate(peluang_selamat = p_test)

ggplot(test_prob_plot, aes(x = peluang_selamat, fill = status_keselamatan)) + geom_density(alpha = 0.55, color = “white”, linewidth = 0.7) + geom_vline( xintercept = threshold_opt, color = “#fb8500”, linewidth = 1.2, linetype = “dashed” ) + annotate( “label”, x = threshold_opt, y = Inf, label = paste0(“Optimal Threshold =”, round(threshold_opt, 3)), vjust = 1.4, fill = “#fff3b0”, color = “#5f370e”, label.size = 0 ) + scale_fill_manual(values = c(“Selamat” = “#2a9d8f”, “Meninggal” = “#e76f51”)) + labs( title = “Distribusi Peluang Prediksi Selamat (Testing Data)”, x = “Peluang Prediksi Selamat”, y = “Kepadatan (Density)”, fill = “Status Aktual” ) + theme_minimal(base_size = 12)