Distribusi
Variabel
cat_vars <- list(
list(var = "country", label = "Negara"),
list(var = "syndrome", label = "Sindrom"),
list(var = "sex", label = "Jenis Kelamin"),
list(var = "severity", label = "Tingkat Keparahan"),
list(var = "hospitalized", label = "Rawat Inap"),
list(var = "icu_admission", label = "Masuk ICU"),
list(var = "mechanical_ventilation", label = "Ventilasi Mekanik"),
list(var = "ecmo_used", label = "Penggunaan ECMO"),
list(var = "dialysis", label = "Dialisis"),
list(var = "comorbidity", label = "Komorbiditas"),
list(var = "exposure_type", label = "Jenis Paparan"),
list(var = "blood_type", label = "Golongan Darah"),
list(var = "viral_load_category", label = "Kategori Viral Load"),
list(var = "treatment_protocol", label = "Protokol Pengobatan"),
list(var = "geographic_setting", label = "Lokasi Geografis"),
list(var = "outcome_label", label = "Status Akhir Pasien")
)
# Palet warna konsisten untuk binary/sedikit kategori
binary_colors <- c("#2a9d8f", "#e76f51", "#457b9d", "#e9c46a", "#8ecae6")
multi_colors <- c("#264653", "#2a9d8f", "#e9c46a", "#f4a261", "#e76f51",
"#457b9d", "#8ecae6", "#a8dadc", "#6d6875", "#b5838d")
# Fungsi helper
make_summary_table <- function(data, var, label) {
data %>%
count(.data[[var]], name = "Jumlah") %>%
rename(`Kategori` = 1) %>%
mutate(
Proporsi = scales::percent(Jumlah / sum(Jumlah), accuracy = 0.1)
) %>%
knitr::kable(
caption = glue::glue("Distribusi {label} ({var})")
) %>%
print()
}
make_bar_plot <- function(data, var, label) {
n_levels <- nlevels(data[[var]])
pal <- if (n_levels <= 5) binary_colors[seq_len(n_levels)]
else multi_colors[seq_len(n_levels)]
# Flip horizontal jika kategori > 5 atau label panjang
use_flip <- n_levels > 5 ||
max(nchar(as.character(levels(data[[var]])))) > 12
p <- ggplot(data, aes(x = .data[[var]], fill = .data[[var]])) +
geom_bar(width = 0.62, color = "white", linewidth = 0.8) +
geom_text(
stat = "count",
aes(label = after_stat(count)),
hjust = if (use_flip) -0.2 else NA,
vjust = if (use_flip) NA else -0.4,
fontface = "bold",
size = 3.5
) +
scale_fill_manual(values = setNames(pal, levels(data[[var]]))) +
labs(
title = glue::glue("Distribusi {label}"),
x = NULL,
y = "Jumlah pasien"
) +
theme_minimal(base_size = 12) +
theme(legend.position = "none")
if (use_flip) {
p <- p +
coord_flip() +
scale_y_continuous(expand = expansion(mult = c(0, 0.15)))
} else {
p <- p +
scale_y_continuous(expand = expansion(mult = c(0, 0.12)))
}
print(p)
}
#Loop utama
for (item in cat_vars) {
cat("\n\n### ", item$label, " (", item$var, ")\n\n", sep = "")
# Tabel ringkasan
make_summary_table(disease_model, item$var, item$label)
cat("\n")
# Bar chart
make_bar_plot(disease_model, item$var, item$label)
}
##
##
## ### Negara (country)
##
##
##
## Table: Distribusi Negara (country)
##
## |Kategori | Jumlah|Proporsi |
## |:--------------------|------:|:--------|
## |Argentina | 361|3.6% |
## |Bolivia | 380|3.8% |
## |Brazil | 353|3.5% |
## |Canada | 352|3.5% |
## |Chile | 364|3.6% |
## |China | 789|7.9% |
## |Colombia | 353|3.5% |
## |Croatia | 814|8.1% |
## |Finland | 817|8.2% |
## |Germany | 767|7.7% |
## |Kategori lain/jarang | 13|0.1% |
## |Panama | 368|3.7% |
## |Paraguay | 320|3.2% |
## |Russia | 824|8.2% |
## |Slovenia | 793|7.9% |
## |South Korea | 833|8.3% |
## |Sweden | 823|8.2% |
## |Uruguay | 324|3.2% |
## |USA | 352|3.5% |

##
##
## ### Sindrom (syndrome)
##
##
##
## Table: Distribusi Sindrom (syndrome)
##
## |Kategori | Jumlah|Proporsi |
## |:--------|------:|:--------|
## |HFRS | 6460|64.6% |
## |HPS | 3540|35.4% |

##
##
## ### Jenis Kelamin (sex)
##
##
##
## Table: Distribusi Jenis Kelamin (sex)
##
## |Kategori | Jumlah|Proporsi |
## |:--------|------:|:--------|
## |Female | 3842|38.4% |
## |Male | 6158|61.6% |

##
##
## ### Tingkat Keparahan (severity)
##
##
##
## Table: Distribusi Tingkat Keparahan (severity)
##
## |Kategori | Jumlah|Proporsi |
## |:--------|------:|:--------|
## |Critical | 1385|13.8% |
## |Mild | 2504|25.0% |
## |Moderate | 3289|32.9% |
## |Severe | 2822|28.2% |

##
##
## ### Rawat Inap (hospitalized)
##
##
##
## Table: Distribusi Rawat Inap (hospitalized)
##
## |Kategori | Jumlah|Proporsi |
## |:--------|------:|:--------|
## |0 | 1760|17.6% |
## |1 | 8240|82.4% |

##
##
## ### Masuk ICU (icu_admission)
##
##
##
## Table: Distribusi Masuk ICU (icu_admission)
##
## |Kategori | Jumlah|Proporsi |
## |:--------|------:|:--------|
## |0 | 7075|70.8% |
## |1 | 2925|29.2% |

##
##
## ### Ventilasi Mekanik (mechanical_ventilation)
##
##
##
## Table: Distribusi Ventilasi Mekanik (mechanical_ventilation)
##
## |Kategori | Jumlah|Proporsi |
## |:--------|------:|:--------|
## |0 | 9157|91.6% |
## |1 | 843|8.4% |

##
##
## ### Penggunaan ECMO (ecmo_used)
##
##
##
## Table: Distribusi Penggunaan ECMO (ecmo_used)
##
## |Kategori | Jumlah|Proporsi |
## |:--------|------:|:--------|
## |0 | 9655|96.6% |
## |1 | 345|3.4% |

##
##
## ### Dialisis (dialysis)
##
##
##
## Table: Distribusi Dialisis (dialysis)
##
## |Kategori | Jumlah|Proporsi |
## |:--------|------:|:--------|
## |0 | 9202|92.0% |
## |1 | 798|8.0% |

##
##
## ### Komorbiditas (comorbidity)
##
##
##
## Table: Distribusi Komorbiditas (comorbidity)
##
## |Kategori | Jumlah|Proporsi |
## |:----------------------|------:|:--------|
## |Asthma | 648|6.5% |
## |Cardiovascular Disease | 645|6.4% |
## |Chronic Kidney Disease | 474|4.7% |
## |COPD | 827|8.3% |
## |Diabetes | 988|9.9% |
## |Hypertension | 1020|10.2% |
## |Immunosuppression | 379|3.8% |
## |None | 3940|39.4% |
## |Obesity | 762|7.6% |
## |Smoking | 317|3.2% |

##
##
## ### Jenis Paparan (exposure_type)
##
##
##
## Table: Distribusi Jenis Paparan (exposure_type)
##
## |Kategori | Jumlah|Proporsi |
## |:------------------------------|------:|:--------|
## |Agricultural work | 1698|17.0% |
## |Camping/hiking | 999|10.0% |
## |Cleaning rodent-infested area | 1452|14.5% |
## |Close contact with HPS patient | 598|6.0% |
## |Cruise ship exposure | 331|3.3% |
## |Forestry/logging | 1152|11.5% |
## |Laboratory accident | 116|1.2% |
## |Military exercise | 294|2.9% |
## |Rural dwelling | 2347|23.5% |
## |Unknown | 1013|10.1% |

##
##
## ### Golongan Darah (blood_type)
##
##
##
## Table: Distribusi Golongan Darah (blood_type)
##
## |Kategori | Jumlah|Proporsi |
## |:--------|------:|:--------|
## |A- | 589|5.9% |
## |A+ | 2976|29.8% |
## |AB- | 107|1.1% |
## |AB+ | 360|3.6% |
## |B- | 211|2.1% |
## |B+ | 913|9.1% |
## |O- | 897|9.0% |
## |O+ | 3947|39.5% |

##
##
## ### Kategori Viral Load (viral_load_category)
##
##
##
## Table: Distribusi Kategori Viral Load (viral_load_category)
##
## |Kategori | Jumlah|Proporsi |
## |:--------|------:|:--------|
## |Critical | 2546|25.5% |
## |High | 3163|31.6% |
## |Low | 1658|16.6% |
## |Medium | 2633|26.3% |

##
##
## ### Protokol Pengobatan (treatment_protocol)
##
##
##
## Table: Distribusi Protokol Pengobatan (treatment_protocol)
##
## |Kategori | Jumlah|Proporsi |
## |:-------------------------------------|------:|:--------|
## |Dialysis + supportive | 466|4.7% |
## |ECMO + full ICU support | 405|4.0% |
## |IV fluids + oxygen | 3258|32.6% |
## |Plasma exchange + supportive | 279|2.8% |
## |Ribavirin + supportive | 937|9.4% |
## |Supportive care only | 2887|28.9% |
## |Vasopressors + mechanical ventilation | 1768|17.7% |

##
##
## ### Lokasi Geografis (geographic_setting)
##
##
##
## Table: Distribusi Lokasi Geografis (geographic_setting)
##
## |Kategori | Jumlah|Proporsi |
## |:----------|------:|:--------|
## |Peri-urban | 1532|15.3% |
## |Remote | 2484|24.8% |
## |Rural | 4943|49.4% |
## |Urban | 1041|10.4% |

##
##
## ### Status Akhir Pasien (outcome_label)
##
##
##
## Table: Distribusi Status Akhir Pasien (outcome_label)
##
## |Kategori | Jumlah|Proporsi |
## |:---------------|------:|:--------|
## |Meninggal/Buruk | 1062|10.6% |
## |Sembuh | 8938|89.4% |

Multikolinearitas dan
Linearitas Variabel Kontinu
model_vif <- glm(
outcome_biner ~ age + number_of_symptoms + incubation_days +
length_of_stay_days + days_to_diagnosis +
sex + severity + hospitalized + icu_admission +
mechanical_ventilation + ecmo_used + dialysis +
comorbidity + exposure_type + blood_type +
viral_load_category + treatment_protocol + geographic_setting,
data = disease_model,
family = binomial(link = "logit")
)
vif_values <- vif(model_vif)
# Untuk prediktor kategorikal, car::vif() menghasilkan GVIF
# Kolom yang relevan untuk interpretasi adalah GVIF^(1/(2*Df))
vif_df <- as.data.frame(vif_values)
vif_df$Variabel <- rownames(vif_df)
# Deteksi apakah output berupa GVIF (kolom > 1) atau VIF biasa (kolom = 1)
if (ncol(vif_df) > 2) {
# Output GVIF (prediktor campuran numerik + kategorikal)
vif_df <- vif_df %>%
rename(
GVIF = 1,
Df = 2,
`GVIF^(1/(2*Df))` = 3
) %>%
select(Variabel, GVIF, Df, `GVIF^(1/(2*Df))`) %>%
mutate(
Interpretasi = case_when(
`GVIF^(1/(2*Df))` < sqrt(5) ~ "Aman",
`GVIF^(1/(2*Df))` < sqrt(10) ~ "Moderat",
TRUE ~ "Multikolinearitas tinggi"
)
)
message("Catatan: Prediktor kategorikal menghasilkan GVIF. ",
"Gunakan kolom GVIF^(1/(2*Df)) sebagai padanan sqrt(VIF).")
} else {
# Output VIF biasa (semua prediktor numerik)
vif_df <- vif_df %>%
rename(VIF = 1) %>%
select(Variabel, VIF) %>%
mutate(
Interpretasi = case_when(
VIF < 5 ~ "Aman",
VIF < 10 ~ "Moderat",
TRUE ~ "Multikolinearitas tinggi"
)
)
}
knitr::kable(
vif_df,
caption = "Hasil Uji VIF/GVIF – Deteksi Multikolinearitas",
digits = 3
)
Hasil Uji VIF/GVIF – Deteksi Multikolinearitas
| age |
age |
1.015 |
1 |
1.008 |
Aman |
| number_of_symptoms |
number_of_symptoms |
1.010 |
1 |
1.005 |
Aman |
| incubation_days |
incubation_days |
1.009 |
1 |
1.004 |
Aman |
| length_of_stay_days |
length_of_stay_days |
1.124 |
1 |
1.060 |
Aman |
| days_to_diagnosis |
days_to_diagnosis |
1.008 |
1 |
1.004 |
Aman |
| sex |
sex |
1.007 |
1 |
1.003 |
Aman |
| severity |
severity |
16.896 |
3 |
1.602 |
Aman |
| hospitalized |
hospitalized |
2.121 |
1 |
1.456 |
Aman |
| icu_admission |
icu_admission |
1.344 |
1 |
1.159 |
Aman |
| mechanical_ventilation |
mechanical_ventilation |
1.933 |
1 |
1.390 |
Aman |
| ecmo_used |
ecmo_used |
1.226 |
1 |
1.107 |
Aman |
| dialysis |
dialysis |
1.034 |
1 |
1.017 |
Aman |
| comorbidity |
comorbidity |
1.072 |
9 |
1.004 |
Aman |
| exposure_type |
exposure_type |
1.071 |
9 |
1.004 |
Aman |
| blood_type |
blood_type |
1.062 |
7 |
1.004 |
Aman |
| viral_load_category |
viral_load_category |
1.176 |
3 |
1.027 |
Aman |
| treatment_protocol |
treatment_protocol |
5.863 |
6 |
1.159 |
Aman |
| geographic_setting |
geographic_setting |
1.024 |
3 |
1.004 |
Aman |
numeric_preds <- c(
"age", "number_of_symptoms", "incubation_days",
"length_of_stay_days", "days_to_diagnosis"
)
# Buat salinan data dengan log-transform interaksi Box-Tidwell
bt_data <- disease_model %>%
select(outcome_biner, all_of(numeric_preds)) %>%
na.omit()
# Geser nilai ≤ 0 agar log() valid
for (v in numeric_preds) {
min_val <- min(bt_data[[v]], na.rm = TRUE)
if (min_val <= 0) {
bt_data[[v]] <- bt_data[[v]] - min_val + 1
message("Variabel '", v, "': digeser +", abs(min_val) + 1,
" agar semua nilai positif untuk Box-Tidwell.")
}
}
# Buat term interaksi x * log(x) untuk setiap prediktor numerik
for (v in numeric_preds) {
bt_data[[paste0(v, "_log")]] <- bt_data[[v]] * log(bt_data[[v]])
}
# Formula model Box-Tidwell (prediktor asli + interaksi log)
bt_formula <- as.formula(
paste(
"outcome_biner ~",
paste(numeric_preds, collapse = " + "),
"+",
paste(paste0(numeric_preds, "_log"), collapse = " + ")
)
)
model_bt <- glm(bt_formula, data = bt_data, family = binomial(link = "logit"))
# Ambil koefisien hanya untuk term interaksi _log
bt_summary <- summary(model_bt)$coefficients
bt_log_rows <- bt_summary[grepl("_log$", rownames(bt_summary)), , drop = FALSE]
bt_result <- as.data.frame(bt_log_rows) %>%
rename(
Estimasi = Estimate,
`Std. Error` = `Std. Error`,
`z value` = `z value`,
`p-value` = `Pr(>|z|)`
) %>%
mutate(
Variabel = sub("_log$", "", rownames(.)),
`Linearitas` = ifelse(`p-value` > 0.05, "Terpenuhi (linier)", "Tidak linier")
) %>%
select(Variabel, Estimasi, `Std. Error`, `z value`, `p-value`, Linearitas)
rownames(bt_result) <- NULL
knitr::kable(
bt_result,
caption = "Hasil Uji Box-Tidwell – Linearitas Log-Odds terhadap Prediktor Numerik",
digits = 4
)
Hasil Uji Box-Tidwell – Linearitas Log-Odds terhadap Prediktor
Numerik
| age |
0.0047 |
0.0077 |
0.6151 |
0.5385 |
Terpenuhi (linier) |
| number_of_symptoms |
-0.6584 |
0.2878 |
-2.2874 |
0.0222 |
Tidak linier |
| incubation_days |
0.0196 |
0.0188 |
1.0474 |
0.2949 |
Terpenuhi (linier) |
| length_of_stay_days |
0.0833 |
0.0148 |
5.6198 |
0.0000 |
Tidak linier |
| days_to_diagnosis |
0.0116 |
0.0347 |
0.3336 |
0.7386 |
Terpenuhi (linier) |
Pemilihan Model
Terbaik
all_predictors <- c(
"country", "syndrome", "age", "sex", "incubation_days",
"severity", "hospitalized", "icu_admission", "mechanical_ventilation",
"ecmo_used", "dialysis", "comorbidity", "exposure_type",
"blood_type", "viral_load_category",
"days_to_diagnosis", "treatment_protocol", "geographic_setting"
)
check_var <- function(var, data) {
col <- data[[var]]
if (is.factor(col)) nlevels(droplevels(col)) >= 2
else length(unique(na.omit(col))) >= 2
}
valid_predictors <- Filter(function(v) check_var(v, train_data), all_predictors)
dropped_predictors <- setdiff(all_predictors, valid_predictors)
if (length(dropped_predictors) > 0) {
message(
"Variabel DIKELUARKAN (hanya 1 level di training): ",
paste(dropped_predictors, collapse = ", ")
)
}
train_data <- train_data %>% mutate(across(where(is.factor), droplevels))
test_data <- test_data %>% mutate(across(where(is.factor), droplevels))
formula_full <- as.formula(
paste("outcome_biner ~", paste(valid_predictors, collapse = " + "))
)
full_model <- glm(formula_full, data = train_data, family = binomial(link = "logit"))
null_model <- glm(outcome_biner ~ 1, data = train_data, family = binomial(link = "logit"))
# --- Helper: tabel koefisien ---
coef_tidy <- function(model) {
broom::tidy(model) %>%
filter(term != "(Intercept)") %>%
mutate(
odds_ratio = exp(estimate),
ci_low = exp(estimate - 1.96 * std.error),
ci_high = exp(estimate + 1.96 * std.error)
) %>%
arrange(p.value) %>%
transmute(
`Variabel/level` = term,
`Estimasi (beta)` = round(estimate, 4),
`Std. Error` = round(std.error, 4),
`Odds ratio` = round(odds_ratio, 3),
`CI 95%` = paste0(round(ci_low, 3), " - ", round(ci_high, 3)),
`p-value` = signif(p.value, 4),
`Signifikan` = ifelse(p.value < 0.05, "Ya", "Tidak")
)
}
# 10a. Forward
cat("\n=== FORWARD SELECTION ===\n")
##
## === FORWARD SELECTION ===
forward_model <- step(null_model,
scope = list(lower = null_model, upper = full_model),
direction = "forward", trace = TRUE, k = 2)
## Start: AIC=5415.16
## outcome_biner ~ 1
##
## Df Deviance AIC
## + severity 3 4244.3 4252.3
## + treatment_protocol 6 4486.6 4500.6
## + syndrome 1 4860.7 4864.7
## + country 18 4852.2 4890.2
## + icu_admission 1 4958.7 4962.7
## + mechanical_ventilation 1 5006.8 5010.8
## + hospitalized 1 5188.6 5192.6
## + ecmo_used 1 5235.0 5239.0
## + viral_load_category 3 5233.7 5241.7
## <none> 5413.2 5415.2
## + dialysis 1 5411.9 5415.9
## + sex 1 5412.7 5416.7
## + age 1 5413.0 5417.0
## + days_to_diagnosis 1 5413.0 5417.0
## + geographic_setting 3 5409.1 5417.1
## + incubation_days 1 5413.1 5417.1
## + exposure_type 9 5401.7 5421.7
## + comorbidity 9 5403.4 5423.4
## + blood_type 7 5408.6 5424.6
##
## Step: AIC=4252.3
## outcome_biner ~ severity
##
## Df Deviance AIC
## + syndrome 1 3916.8 3926.8
## + country 18 3902.5 3946.5
## + dialysis 1 4192.6 4202.6
## <none> 4244.3 4252.3
## + hospitalized 1 4242.5 4252.5
## + age 1 4243.3 4253.3
## + days_to_diagnosis 1 4243.4 4253.4
## + geographic_setting 3 4239.5 4253.5
## + ecmo_used 1 4243.8 4253.8
## + mechanical_ventilation 1 4244.1 4254.1
## + incubation_days 1 4244.2 4254.2
## + sex 1 4244.2 4254.2
## + icu_admission 1 4244.2 4254.2
## + viral_load_category 3 4241.8 4255.8
## + blood_type 7 4235.2 4257.2
## + treatment_protocol 6 4240.1 4260.1
## + exposure_type 9 4235.3 4261.3
## + comorbidity 9 4239.1 4265.1
##
## Step: AIC=3926.78
## outcome_biner ~ severity + syndrome
##
## Df Deviance AIC
## <none> 3916.8 3926.8
## + hospitalized 1 3915.1 3927.1
## + geographic_setting 3 3911.6 3927.6
## + dialysis 1 3915.8 3927.8
## + age 1 3916.1 3928.1
## + icu_admission 1 3916.1 3928.1
## + days_to_diagnosis 1 3916.2 3928.2
## + incubation_days 1 3916.3 3928.3
## + mechanical_ventilation 1 3916.5 3928.5
## + sex 1 3916.6 3928.6
## + ecmo_used 1 3916.7 3928.7
## + viral_load_category 3 3915.1 3931.1
## + blood_type 7 3908.5 3932.5
## + treatment_protocol 6 3911.4 3933.4
## + exposure_type 9 3908.5 3936.5
## + comorbidity 9 3910.9 3938.9
## + country 17 3902.5 3946.5
print(summary(forward_model))
##
## Call:
## glm(formula = outcome_biner ~ severity + syndrome, family = binomial(link = "logit"),
## data = train_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.30876 0.08506 15.39 <2e-16 ***
## severityMild 3.36346 0.19090 17.62 <2e-16 ***
## severityModerate 3.21644 0.15004 21.44 <2e-16 ***
## severitySevere 1.31022 0.09018 14.53 <2e-16 ***
## syndromeHPS -1.50320 0.08722 -17.23 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 5413.2 on 7998 degrees of freedom
## Residual deviance: 3916.8 on 7994 degrees of freedom
## AIC: 3926.8
##
## Number of Fisher Scoring iterations: 6
knitr::kable(coef_tidy(forward_model),
caption = "Forward Selection - Signifikansi Variabel")
Forward Selection - Signifikansi Variabel
| severityModerate |
3.2164 |
0.1500 |
24.939 |
18.585 - 33.466 |
0 |
Ya |
| severityMild |
3.3635 |
0.1909 |
28.889 |
19.872 - 41.998 |
0 |
Ya |
| syndromeHPS |
-1.5032 |
0.0872 |
0.222 |
0.187 - 0.264 |
0 |
Ya |
| severitySevere |
1.3102 |
0.0902 |
3.707 |
3.106 - 4.424 |
0 |
Ya |
# 10b. Backward
cat("\n=== BACKWARD ELIMINATION ===\n")
##
## === BACKWARD ELIMINATION ===
backward_model <- step(full_model, direction = "backward", trace = TRUE, k = 2)
## Start: AIC=3998.39
## outcome_biner ~ country + syndrome + age + sex + incubation_days +
## severity + hospitalized + icu_admission + mechanical_ventilation +
## ecmo_used + dialysis + comorbidity + exposure_type + blood_type +
## viral_load_category + days_to_diagnosis + treatment_protocol +
## geographic_setting
##
##
## Step: AIC=3998.39
## outcome_biner ~ country + age + sex + incubation_days + severity +
## hospitalized + icu_admission + mechanical_ventilation + ecmo_used +
## dialysis + comorbidity + exposure_type + blood_type + viral_load_category +
## days_to_diagnosis + treatment_protocol + geographic_setting
##
## Df Deviance AIC
## - comorbidity 9 3867.9 3985.9
## - exposure_type 9 3870.8 3988.8
## - treatment_protocol 6 3866.6 3990.6
## - blood_type 7 3871.2 3993.2
## - viral_load_category 3 3864.0 3994.0
## - ecmo_used 1 3862.4 3996.4
## - mechanical_ventilation 1 3862.5 3996.5
## - sex 1 3862.7 3996.7
## - icu_admission 1 3862.7 3996.7
## - incubation_days 1 3862.7 3996.7
## - days_to_diagnosis 1 3862.8 3996.8
## - age 1 3862.9 3996.9
## - dialysis 1 3863.6 3997.6
## - hospitalized 1 3864.1 3998.1
## <none> 3862.4 3998.4
## - geographic_setting 3 3868.4 3998.4
## - severity 3 4001.7 4131.7
## - country 18 4154.8 4254.8
##
## Step: AIC=3985.95
## outcome_biner ~ country + age + sex + incubation_days + severity +
## hospitalized + icu_admission + mechanical_ventilation + ecmo_used +
## dialysis + exposure_type + blood_type + viral_load_category +
## days_to_diagnosis + treatment_protocol + geographic_setting
##
## Df Deviance AIC
## - exposure_type 9 3876.3 3976.3
## - treatment_protocol 6 3872.4 3978.4
## - blood_type 7 3876.8 3980.8
## - viral_load_category 3 3869.5 3981.5
## - ecmo_used 1 3868.0 3984.0
## - mechanical_ventilation 1 3868.0 3984.0
## - sex 1 3868.3 3984.3
## - icu_admission 1 3868.3 3984.3
## - incubation_days 1 3868.3 3984.3
## - days_to_diagnosis 1 3868.3 3984.3
## - age 1 3868.5 3984.5
## - dialysis 1 3869.2 3985.2
## - hospitalized 1 3869.6 3985.6
## <none> 3867.9 3985.9
## - geographic_setting 3 3874.1 3986.1
## - severity 3 4006.7 4118.7
## - country 18 4161.2 4243.2
##
## Step: AIC=3976.29
## outcome_biner ~ country + age + sex + incubation_days + severity +
## hospitalized + icu_admission + mechanical_ventilation + ecmo_used +
## dialysis + blood_type + viral_load_category + days_to_diagnosis +
## treatment_protocol + geographic_setting
##
## Df Deviance AIC
## - treatment_protocol 6 3880.9 3968.9
## - blood_type 7 3885.4 3971.4
## - viral_load_category 3 3877.8 3971.8
## - ecmo_used 1 3876.4 3974.4
## - mechanical_ventilation 1 3876.4 3974.4
## - sex 1 3876.6 3974.6
## - days_to_diagnosis 1 3876.6 3974.6
## - icu_admission 1 3876.6 3974.6
## - incubation_days 1 3876.7 3974.7
## - age 1 3876.9 3974.9
## - dialysis 1 3877.5 3975.5
## - hospitalized 1 3877.9 3975.9
## - geographic_setting 3 3882.3 3976.3
## <none> 3876.3 3976.3
## - severity 3 4014.9 4108.9
## - country 18 4169.3 4233.3
##
## Step: AIC=3968.85
## outcome_biner ~ country + age + sex + incubation_days + severity +
## hospitalized + icu_admission + mechanical_ventilation + ecmo_used +
## dialysis + blood_type + viral_load_category + days_to_diagnosis +
## geographic_setting
##
## Df Deviance AIC
## - blood_type 7 3889.9 3963.9
## - viral_load_category 3 3882.3 3964.3
## - ecmo_used 1 3880.9 3966.9
## - mechanical_ventilation 1 3881.0 3967.0
## - sex 1 3881.1 3967.1
## - days_to_diagnosis 1 3881.2 3967.2
## - icu_admission 1 3881.3 3967.3
## - incubation_days 1 3881.3 3967.3
## - age 1 3881.5 3967.5
## - dialysis 1 3882.1 3968.1
## - hospitalized 1 3882.5 3968.5
## <none> 3880.9 3968.9
## - geographic_setting 3 3887.0 3969.0
## - country 18 4173.3 4225.3
## - severity 3 4190.4 4272.4
##
## Step: AIC=3963.93
## outcome_biner ~ country + age + sex + incubation_days + severity +
## hospitalized + icu_admission + mechanical_ventilation + ecmo_used +
## dialysis + viral_load_category + days_to_diagnosis + geographic_setting
##
## Df Deviance AIC
## - viral_load_category 3 3891.3 3959.3
## - ecmo_used 1 3890.0 3962.0
## - mechanical_ventilation 1 3890.1 3962.1
## - sex 1 3890.2 3962.2
## - days_to_diagnosis 1 3890.3 3962.3
## - incubation_days 1 3890.4 3962.4
## - icu_admission 1 3890.5 3962.5
## - age 1 3890.5 3962.5
## - dialysis 1 3891.1 3963.1
## - hospitalized 1 3891.5 3963.5
## - geographic_setting 3 3895.9 3963.9
## <none> 3889.9 3963.9
## - country 18 4181.5 4219.5
## - severity 3 4197.4 4265.4
##
## Step: AIC=3959.31
## outcome_biner ~ country + age + sex + incubation_days + severity +
## hospitalized + icu_admission + mechanical_ventilation + ecmo_used +
## dialysis + days_to_diagnosis + geographic_setting
##
## Df Deviance AIC
## - ecmo_used 1 3891.4 3957.4
## - mechanical_ventilation 1 3891.5 3957.5
## - sex 1 3891.6 3957.6
## - days_to_diagnosis 1 3891.7 3957.7
## - incubation_days 1 3891.8 3957.8
## - age 1 3891.8 3957.8
## - icu_admission 1 3891.9 3957.9
## - dialysis 1 3892.5 3958.5
## - hospitalized 1 3892.9 3958.9
## <none> 3891.3 3959.3
## - geographic_setting 3 3897.5 3959.5
## - country 18 4183.7 4215.7
## - severity 3 4218.3 4280.3
##
## Step: AIC=3957.41
## outcome_biner ~ country + age + sex + incubation_days + severity +
## hospitalized + icu_admission + mechanical_ventilation + dialysis +
## days_to_diagnosis + geographic_setting
##
## Df Deviance AIC
## - mechanical_ventilation 1 3891.6 3955.6
## - sex 1 3891.7 3955.7
## - days_to_diagnosis 1 3891.8 3955.8
## - incubation_days 1 3891.9 3955.9
## - age 1 3891.9 3955.9
## - icu_admission 1 3892.0 3956.0
## - dialysis 1 3892.5 3956.5
## - hospitalized 1 3893.0 3957.0
## <none> 3891.4 3957.4
## - geographic_setting 3 3897.5 3957.5
## - country 18 4183.9 4213.9
## - severity 3 4238.2 4298.2
##
## Step: AIC=3955.6
## outcome_biner ~ country + age + sex + incubation_days + severity +
## hospitalized + icu_admission + dialysis + days_to_diagnosis +
## geographic_setting
##
## Df Deviance AIC
## - sex 1 3891.9 3953.9
## - days_to_diagnosis 1 3892.0 3954.0
## - incubation_days 1 3892.1 3954.1
## - age 1 3892.1 3954.1
## - icu_admission 1 3892.2 3954.2
## - dialysis 1 3892.7 3954.7
## - hospitalized 1 3893.2 3955.2
## <none> 3891.6 3955.6
## - geographic_setting 3 3897.7 3955.7
## - country 18 4184.1 4212.1
## - severity 3 4422.0 4480.0
##
## Step: AIC=3953.89
## outcome_biner ~ country + age + incubation_days + severity +
## hospitalized + icu_admission + dialysis + days_to_diagnosis +
## geographic_setting
##
## Df Deviance AIC
## - days_to_diagnosis 1 3892.3 3952.3
## - incubation_days 1 3892.4 3952.4
## - age 1 3892.4 3952.4
## - icu_admission 1 3892.5 3952.5
## - dialysis 1 3893.0 3953.0
## - hospitalized 1 3893.5 3953.5
## <none> 3891.9 3953.9
## - geographic_setting 3 3898.0 3954.0
## - country 18 4184.2 4210.2
## - severity 3 4422.0 4478.0
##
## Step: AIC=3952.33
## outcome_biner ~ country + age + incubation_days + severity +
## hospitalized + icu_admission + dialysis + geographic_setting
##
## Df Deviance AIC
## - incubation_days 1 3892.8 3950.8
## - age 1 3892.8 3950.8
## - icu_admission 1 3892.9 3950.9
## - dialysis 1 3893.4 3951.4
## - hospitalized 1 3894.0 3952.0
## <none> 3892.3 3952.3
## - geographic_setting 3 3898.4 3952.4
## - country 18 4185.2 4209.2
## - severity 3 4422.4 4476.4
##
## Step: AIC=3950.78
## outcome_biner ~ country + age + severity + hospitalized + icu_admission +
## dialysis + geographic_setting
##
## Df Deviance AIC
## - age 1 3893.2 3949.2
## - icu_admission 1 3893.3 3949.3
## - dialysis 1 3893.9 3949.9
## - hospitalized 1 3894.4 3950.4
## <none> 3892.8 3950.8
## - geographic_setting 3 3898.8 3950.8
## - country 18 4185.3 4207.3
## - severity 3 4422.6 4474.6
##
## Step: AIC=3949.25
## outcome_biner ~ country + severity + hospitalized + icu_admission +
## dialysis + geographic_setting
##
## Df Deviance AIC
## - icu_admission 1 3893.8 3947.8
## - dialysis 1 3894.4 3948.4
## - hospitalized 1 3894.9 3948.9
## <none> 3893.2 3949.2
## - geographic_setting 3 3899.3 3949.3
## - country 18 4186.6 4206.6
## - severity 3 4422.8 4472.8
##
## Step: AIC=3947.78
## outcome_biner ~ country + severity + hospitalized + dialysis +
## geographic_setting
##
## Df Deviance AIC
## - dialysis 1 3894.9 3946.9
## - hospitalized 1 3895.4 3947.4
## <none> 3893.8 3947.8
## - geographic_setting 3 3899.9 3947.9
## - country 18 4186.7 4204.7
## - severity 3 4643.6 4691.6
##
## Step: AIC=3946.9
## outcome_biner ~ country + severity + hospitalized + geographic_setting
##
## Df Deviance AIC
## - hospitalized 1 3896.5 3946.5
## - geographic_setting 3 3900.9 3946.9
## <none> 3894.9 3946.9
## - country 18 4237.7 4253.7
## - severity 3 4692.8 4738.8
##
## Step: AIC=3946.51
## outcome_biner ~ country + severity + geographic_setting
##
## Df Deviance AIC
## - geographic_setting 3 3902.5 3946.5
## <none> 3896.5 3946.5
## - country 18 4239.5 4253.5
## - severity 3 4848.2 4892.2
##
## Step: AIC=3946.46
## outcome_biner ~ country + severity
##
## Df Deviance AIC
## <none> 3902.5 3946.5
## - country 18 4244.3 4252.3
## - severity 3 4852.2 4890.2
print(summary(backward_model))
##
## Call:
## glm(formula = outcome_biner ~ country + severity, family = binomial(link = "logit"),
## data = train_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.249442 0.169845 -1.469 0.142
## countryBolivia 0.137332 0.226179 0.607 0.544
## countryBrazil 0.280675 0.232498 1.207 0.227
## countryCanada 0.070500 0.230642 0.306 0.760
## countryChile 0.142522 0.230316 0.619 0.536
## countryChina 1.579085 0.265533 5.947 2.73e-09 ***
## countryColombia 0.035885 0.227888 0.157 0.875
## countryCroatia 1.625391 0.258632 6.285 3.29e-10 ***
## countryFinland 1.587500 0.259163 6.125 9.04e-10 ***
## countryGermany 1.335080 0.254835 5.239 1.61e-07 ***
## countryKategori lain/jarang 1.714030 1.099879 1.558 0.119
## countryPanama -0.251346 0.220976 -1.137 0.255
## countryParaguay 0.008408 0.237127 0.035 0.972
## countryRussia 1.402093 0.241230 5.812 6.16e-09 ***
## countrySlovenia 1.506715 0.259625 5.803 6.50e-09 ***
## countrySouth Korea 1.525430 0.249243 6.120 9.34e-10 ***
## countrySweden 1.839740 0.266855 6.894 5.42e-12 ***
## countryUruguay -0.034531 0.235536 -0.147 0.883
## countryUSA 0.002952 0.230611 0.013 0.990
## severityMild 3.388394 0.191311 17.711 < 2e-16 ***
## severityModerate 3.235463 0.150422 21.509 < 2e-16 ***
## severitySevere 1.327982 0.090894 14.610 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 5413.2 on 7998 degrees of freedom
## Residual deviance: 3902.5 on 7977 degrees of freedom
## AIC: 3946.5
##
## Number of Fisher Scoring iterations: 6
knitr::kable(coef_tidy(backward_model),
caption = "Backward Elimination - Signifikansi Variabel")
Backward Elimination - Signifikansi Variabel
| severityModerate |
3.2355 |
0.1504 |
25.418 |
18.928 - 34.134 |
0.0000000 |
Ya |
| severityMild |
3.3884 |
0.1913 |
29.618 |
20.357 - 43.093 |
0.0000000 |
Ya |
| severitySevere |
1.3280 |
0.0909 |
3.773 |
3.158 - 4.509 |
0.0000000 |
Ya |
| countrySweden |
1.8397 |
0.2669 |
6.295 |
3.731 - 10.62 |
0.0000000 |
Ya |
| countryCroatia |
1.6254 |
0.2586 |
5.080 |
3.06 - 8.434 |
0.0000000 |
Ya |
| countryFinland |
1.5875 |
0.2592 |
4.892 |
2.943 - 8.129 |
0.0000000 |
Ya |
| countrySouth Korea |
1.5254 |
0.2492 |
4.597 |
2.82 - 7.493 |
0.0000000 |
Ya |
| countryChina |
1.5791 |
0.2655 |
4.851 |
2.882 - 8.162 |
0.0000000 |
Ya |
| countryRussia |
1.4021 |
0.2412 |
4.064 |
2.533 - 6.52 |
0.0000000 |
Ya |
| countrySlovenia |
1.5067 |
0.2596 |
4.512 |
2.712 - 7.505 |
0.0000000 |
Ya |
| countryGermany |
1.3351 |
0.2548 |
3.800 |
2.306 - 6.262 |
0.0000002 |
Ya |
| countryKategori lain/jarang |
1.7140 |
1.0999 |
5.551 |
0.643 - 47.932 |
0.1191000 |
Tidak |
| countryBrazil |
0.2807 |
0.2325 |
1.324 |
0.839 - 2.088 |
0.2273000 |
Tidak |
| countryPanama |
-0.2513 |
0.2210 |
0.778 |
0.504 - 1.199 |
0.2554000 |
Tidak |
| countryChile |
0.1425 |
0.2303 |
1.153 |
0.734 - 1.811 |
0.5360000 |
Tidak |
| countryBolivia |
0.1373 |
0.2262 |
1.147 |
0.736 - 1.787 |
0.5437000 |
Tidak |
| countryCanada |
0.0705 |
0.2306 |
1.073 |
0.683 - 1.686 |
0.7599000 |
Tidak |
| countryColombia |
0.0359 |
0.2279 |
1.037 |
0.663 - 1.62 |
0.8749000 |
Tidak |
| countryUruguay |
-0.0345 |
0.2355 |
0.966 |
0.609 - 1.533 |
0.8834000 |
Tidak |
| countryParaguay |
0.0084 |
0.2371 |
1.008 |
0.634 - 1.605 |
0.9717000 |
Tidak |
| countryUSA |
0.0030 |
0.2306 |
1.003 |
0.638 - 1.576 |
0.9898000 |
Tidak |
# 10c. Stepwise
cat("\n=== STEPWISE (BOTH) ===\n")
##
## === STEPWISE (BOTH) ===
stepwise_model <- step(null_model,
scope = list(lower = null_model, upper = full_model),
direction = "both", trace = TRUE, k = 2)
## Start: AIC=5415.16
## outcome_biner ~ 1
##
## Df Deviance AIC
## + severity 3 4244.3 4252.3
## + treatment_protocol 6 4486.6 4500.6
## + syndrome 1 4860.7 4864.7
## + country 18 4852.2 4890.2
## + icu_admission 1 4958.7 4962.7
## + mechanical_ventilation 1 5006.8 5010.8
## + hospitalized 1 5188.6 5192.6
## + ecmo_used 1 5235.0 5239.0
## + viral_load_category 3 5233.7 5241.7
## <none> 5413.2 5415.2
## + dialysis 1 5411.9 5415.9
## + sex 1 5412.7 5416.7
## + age 1 5413.0 5417.0
## + days_to_diagnosis 1 5413.0 5417.0
## + geographic_setting 3 5409.1 5417.1
## + incubation_days 1 5413.1 5417.1
## + exposure_type 9 5401.7 5421.7
## + comorbidity 9 5403.4 5423.4
## + blood_type 7 5408.6 5424.6
##
## Step: AIC=4252.3
## outcome_biner ~ severity
##
## Df Deviance AIC
## + syndrome 1 3916.8 3926.8
## + country 18 3902.5 3946.5
## + dialysis 1 4192.6 4202.6
## <none> 4244.3 4252.3
## + hospitalized 1 4242.5 4252.5
## + age 1 4243.3 4253.3
## + days_to_diagnosis 1 4243.4 4253.4
## + geographic_setting 3 4239.5 4253.5
## + ecmo_used 1 4243.8 4253.8
## + mechanical_ventilation 1 4244.1 4254.1
## + incubation_days 1 4244.2 4254.2
## + sex 1 4244.2 4254.2
## + icu_admission 1 4244.2 4254.2
## + viral_load_category 3 4241.8 4255.8
## + blood_type 7 4235.2 4257.2
## + treatment_protocol 6 4240.1 4260.1
## + exposure_type 9 4235.3 4261.3
## + comorbidity 9 4239.1 4265.1
## - severity 3 5413.2 5415.2
##
## Step: AIC=3926.78
## outcome_biner ~ severity + syndrome
##
## Df Deviance AIC
## <none> 3916.8 3926.8
## + hospitalized 1 3915.1 3927.1
## + geographic_setting 3 3911.6 3927.6
## + dialysis 1 3915.8 3927.8
## + age 1 3916.1 3928.1
## + icu_admission 1 3916.1 3928.1
## + days_to_diagnosis 1 3916.2 3928.2
## + incubation_days 1 3916.3 3928.3
## + mechanical_ventilation 1 3916.5 3928.5
## + sex 1 3916.6 3928.6
## + ecmo_used 1 3916.7 3928.7
## + viral_load_category 3 3915.1 3931.1
## + blood_type 7 3908.5 3932.5
## + treatment_protocol 6 3911.4 3933.4
## + exposure_type 9 3908.5 3936.5
## + comorbidity 9 3910.9 3938.9
## + country 17 3902.5 3946.5
## - syndrome 1 4244.3 4252.3
## - severity 3 4860.7 4864.7
print(summary(stepwise_model))
##
## Call:
## glm(formula = outcome_biner ~ severity + syndrome, family = binomial(link = "logit"),
## data = train_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.30876 0.08506 15.39 <2e-16 ***
## severityMild 3.36346 0.19090 17.62 <2e-16 ***
## severityModerate 3.21644 0.15004 21.44 <2e-16 ***
## severitySevere 1.31022 0.09018 14.53 <2e-16 ***
## syndromeHPS -1.50320 0.08722 -17.23 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 5413.2 on 7998 degrees of freedom
## Residual deviance: 3916.8 on 7994 degrees of freedom
## AIC: 3926.8
##
## Number of Fisher Scoring iterations: 6
knitr::kable(coef_tidy(stepwise_model),
caption = "Stepwise Selection - Signifikansi Variabel")
Stepwise Selection - Signifikansi Variabel
| severityModerate |
3.2164 |
0.1500 |
24.939 |
18.585 - 33.466 |
0 |
Ya |
| severityMild |
3.3635 |
0.1909 |
28.889 |
19.872 - 41.998 |
0 |
Ya |
| syndromeHPS |
-1.5032 |
0.0872 |
0.222 |
0.187 - 0.264 |
0 |
Ya |
| severitySevere |
1.3102 |
0.0902 |
3.707 |
3.106 - 4.424 |
0 |
Ya |
# 10d. Perbandingan
model_comparison <- data.frame(
Metode = c("Full Model", "Forward", "Backward", "Stepwise"),
`Jumlah Variabel` = c(length(coef(full_model)) - 1,
length(coef(forward_model)) - 1,
length(coef(backward_model)) - 1,
length(coef(stepwise_model)) - 1),
AIC = round(c(AIC(full_model), AIC(forward_model),
AIC(backward_model), AIC(stepwise_model)), 2),
`Deviance Residual`= round(c(full_model$deviance, forward_model$deviance,
backward_model$deviance, stepwise_model$deviance), 2),
check.names = FALSE
)
knitr::kable(model_comparison, caption = "Perbandingan Metode Pemilihan Model")
Perbandingan Metode Pemilihan Model
| Full Model |
68 |
3998.39 |
3862.39 |
| Forward |
4 |
3926.78 |
3916.78 |
| Backward |
21 |
3946.46 |
3902.46 |
| Stepwise |
4 |
3926.78 |
3916.78 |
# Model terpilih
disease_fit <- stepwise_model
cat("\n=== MODEL TERPILIH ===\n")
##
## === MODEL TERPILIH ===
print(summary(disease_fit))
##
## Call:
## glm(formula = outcome_biner ~ severity + syndrome, family = binomial(link = "logit"),
## data = train_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.30876 0.08506 15.39 <2e-16 ***
## severityMild 3.36346 0.19090 17.62 <2e-16 ***
## severityModerate 3.21644 0.15004 21.44 <2e-16 ***
## severitySevere 1.31022 0.09018 14.53 <2e-16 ***
## syndromeHPS -1.50320 0.08722 -17.23 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 5413.2 on 7998 degrees of freedom
## Residual deviance: 3916.8 on 7994 degrees of freedom
## AIC: 3926.8
##
## Number of Fisher Scoring iterations: 6
Goodness of Fit
# 11a. Ringkasan model
ringkasan_model <- data.frame(
Keterangan = c("Jumlah observasi training", "Null deviance", "Residual deviance",
"Derajat bebas residual", "AIC", "BIC"),
Nilai = c(nobs(disease_fit),
round(disease_fit$null.deviance, 3),
round(disease_fit$deviance, 3),
disease_fit$df.residual,
round(AIC(disease_fit), 3),
round(BIC(disease_fit), 3))
)
knitr::kable(ringkasan_model, caption = "Ringkasan kecocokan model")
Ringkasan kecocokan model
| Jumlah observasi training |
7999.000 |
| Null deviance |
5413.155 |
| Residual deviance |
3916.778 |
| Derajat bebas residual |
7994.000 |
| AIC |
3926.778 |
| BIC |
3961.714 |
# 11b. Likelihood Ratio Test
lrt_result <- anova(null_model, disease_fit, test = "Chisq")
cat("\nHasil Likelihood Ratio Test:\n")
##
## Hasil Likelihood Ratio Test:
## Analysis of Deviance Table
##
## Model 1: outcome_biner ~ 1
## Model 2: outcome_biner ~ severity + syndrome
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 7998 5413.2
## 2 7994 3916.8 4 1496.4 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Hitung nilai LRT secara manual untuk tabel ringkasan
lrt_chisq <- lrt_result$Deviance[2]
lrt_df <- lrt_result$Df[2]
lrt_pvalue <- lrt_result$`Pr(>Chi)`[2]
lrt_table <- data.frame(
Model = c("Null (intercept saja)", "Model akhir"),
`Log-Likelihood` = round(c(logLik(null_model), logLik(disease_fit)), 3),
`Derajat Bebas` = c(attr(logLik(null_model), "df"),
attr(logLik(disease_fit), "df")),
Deviance = round(c(null_model$deviance, disease_fit$deviance), 3),
`Chi-square` = c(NA, round(lrt_chisq, 4)),
df = c(NA, lrt_df),
`p-value` = c(NA, signif(lrt_pvalue, 4)),
Keputusan = c(NA,
ifelse(lrt_pvalue < 0.05,
"Model signifikan (tolak H0)",
"Model tidak signifikan")),
check.names = FALSE
)
knitr::kable(
lrt_table,
caption = "Likelihood Ratio Test — perbandingan null model vs model akhir"
)
Likelihood Ratio Test — perbandingan null model vs model
akhir
| Null (intercept saja) |
-2706.578 |
1 |
5413.155 |
NA |
NA |
NA |
NA |
| Model akhir |
-1958.389 |
5 |
3916.778 |
1496.377 |
4 |
0 |
Model signifikan (tolak H0) |
cat(
"\nStatistik LRT:\n",
" Chi-square =", round(lrt_chisq, 4), "\n",
" df =", lrt_df, "\n",
" p-value =", signif(lrt_pvalue, 4), "\n",
" Kesimpulan :", ifelse(lrt_pvalue < 0.05,
"Minimal satu prediktor berpengaruh signifikan (p < 0.05).",
"Tidak ada prediktor yang berpengaruh signifikan (p >= 0.05)."),
"\n"
)
##
## Statistik LRT:
## Chi-square = 1496.377
## df = 4
## p-value = 8.893182e-323
## Kesimpulan : Minimal satu prediktor berpengaruh signifikan (p < 0.05).
# 11c. Pseudo R²
pseudo_r2 <- PseudoR2(disease_fit, which = c("CoxSnell", "Nagelkerke", "McFadden"))
pseudo_r2_table <- data.frame(
Metode = c("Cox & Snell", "Nagelkerke", "McFadden"),
`Pseudo R-Squared` = round(pseudo_r2, 3),
Interpretasi = c("Mirip R-Squared regresi linier, max < 1",
"Versi adjusted, max = 1",
"0.2–0.4 sudah dianggap baik"),
check.names = FALSE
)
knitr::kable(pseudo_r2_table, caption = "Pseudo R-Squared")
Pseudo R-Squared
| CoxSnell |
Cox & Snell |
0.171 |
Mirip R-Squared regresi linier, max < 1 |
| Nagelkerke |
Nagelkerke |
0.347 |
Versi adjusted, max = 1 |
| McFadden |
McFadden |
0.276 |
0.2–0.4 sudah dianggap baik |
# 11d. Deviance dan AIC
n_par <- length(coef(disease_fit))
n_obs <- nobs(disease_fit)
aic_val <- AIC(disease_fit)
aicc_val <- aic_val + (2 * n_par * (n_par + 1)) / (n_obs - n_par - 1)
# p-value untuk deviance residual: uji apakah model fit terhadap data
# H0: model fit (deviance kecil), menggunakan distribusi chi-square
pval_resid_dev <- pchisq(
disease_fit$deviance,
df = disease_fit$df.residual,
lower.tail = FALSE
)
# p-value untuk penurunan deviance (sama dengan LRT di atas, ditampilkan ulang)
selisih_dev <- disease_fit$null.deviance - disease_fit$deviance
selisih_df <- disease_fit$df.null - disease_fit$df.residual
pval_dev_drop <- pchisq(selisih_dev, df = selisih_df, lower.tail = FALSE)
deviance_table <- data.frame(
Komponen = c(
"Null deviance",
"Residual deviance",
"Selisih deviance (null − residual)",
"AIC",
"AICc (terkoreksi)"
),
Nilai = c(
round(disease_fit$null.deviance, 3),
round(disease_fit$deviance, 3),
round(selisih_dev, 3),
round(aic_val, 3),
round(aicc_val, 3)
),
df = c(
disease_fit$df.null,
disease_fit$df.residual,
selisih_df,
NA,
NA
),
`p-value` = c(
NA, # null deviance: tidak ada uji langsung
signif(pval_resid_dev, 4), # uji goodness-of-fit residual deviance
signif(pval_dev_drop, 4), # uji signifikansi penurunan deviance (= LRT)
NA,
NA
),
Keterangan = c(
"Deviance model tanpa prediktor",
paste0("p ", ifelse(pval_resid_dev >= 0.05, ">= 0.05: model fit memadai",
"< 0.05: indikasi lack-of-fit")),
paste0("p ", ifelse(pval_dev_drop < 0.05,
"< 0.05: prediktor signifikan meningkatkan fit",
">= 0.05: prediktor tidak signifikan")),
"Akaike Information Criterion",
"AIC terkoreksi untuk sampel kecil"
),
check.names = FALSE
)
knitr::kable(
deviance_table,
caption = "Deviance dan AIC model regresi logistik"
)
Deviance dan AIC model regresi logistik
| Null deviance |
5413.155 |
7998 |
NA |
Deviance model tanpa prediktor |
| Residual deviance |
3916.778 |
7994 |
1 |
p >= 0.05: model fit memadai |
| Selisih deviance (null − residual) |
1496.377 |
4 |
0 |
p < 0.05: prediktor signifikan meningkatkan fit |
| AIC |
3926.778 |
NA |
NA |
Akaike Information Criterion |
| AICc (terkoreksi) |
3926.786 |
NA |
NA |
AIC terkoreksi untuk sampel kecil |
Confusion
Matrix
p_train <- predict(disease_fit, newdata = train_data, type = "response")
p_test <- predict(disease_fit, newdata = test_data, type = "response")
safe_div <- function(num, den) ifelse(den == 0, NA_real_, num / den)
# Catatan: threshold di sini = batas untuk PREDIKSI SEMBUH (outcome = 1)
classification_metrics <- function(actual, prob, threshold = 0.5) {
pred <- as.integer(prob >= threshold) # >= threshold → prediksi Sembuh (1)
tp <- sum(pred == 1 & actual == 1) # prediksi Sembuh, aktual Sembuh
tn <- sum(pred == 0 & actual == 0) # prediksi Meninggal, aktual Meninggal
fp <- sum(pred == 1 & actual == 0) # prediksi Sembuh, aktual Meninggal
fn <- sum(pred == 0 & actual == 1) # prediksi Meninggal, aktual Sembuh
sensitivity <- safe_div(tp, tp + fn) # recall Sembuh
specificity <- safe_div(tn, tn + fp) # recall Meninggal
precision <- safe_div(tp, tp + fp)
npv <- safe_div(tn, tn + fn)
accuracy <- safe_div(tp + tn, tp + tn + fp + fn)
data.frame(
threshold = threshold,
accuracy = accuracy,
error_rate = 1 - accuracy,
sensitivity = sensitivity,
specificity = specificity,
precision = precision,
negative_predictive_value = npv,
f1_score = safe_div(2 * precision * sensitivity,
precision + sensitivity),
balanced_accuracy = (sensitivity + specificity) / 2,
false_positive_rate = 1 - specificity,
false_negative_rate = 1 - sensitivity
)
}
format_metrics_indonesia <- function(x) {
x %>% transmute(
Threshold = threshold,
Akurasi = accuracy,
`Error rate` = error_rate,
Sensitivity = sensitivity,
Specificity = specificity,
Presisi = precision,
NPV = negative_predictive_value,
`F1-score` = f1_score,
`Balanced accuracy` = balanced_accuracy,
FPR = false_positive_rate,
FNR = false_negative_rate
)
}
confusion_matrix <- function(actual, prob, threshold = 0.5) {
pred_label <- factor(
ifelse(prob >= threshold, "Prediksi Sembuh", "Prediksi Meninggal/Buruk"),
levels = c("Prediksi Meninggal/Buruk", "Prediksi Sembuh")
)
actual_label <- factor(
ifelse(actual == 1, "Aktual Sembuh", "Aktual Meninggal/Buruk"),
levels = c("Aktual Meninggal/Buruk", "Aktual Sembuh")
)
addmargins(table(actual_label, pred_label))
}
Evaluasi
cat("\n=== CONFUSION MATRIX (caret) ===\n")
##
## === CONFUSION MATRIX (caret) ===
pred_class_test <- factor(
ifelse(p_test >= 0.5, "Sembuh", "Meninggal/Buruk"),
levels = c("Meninggal/Buruk", "Sembuh")
)
actual_class_test <- factor(
ifelse(test_data$outcome_biner == 1, "Sembuh", "Meninggal/Buruk"),
levels = c("Meninggal/Buruk", "Sembuh")
)
cm_caret <- confusionMatrix(
data = pred_class_test,
reference = actual_class_test,
positive = "Sembuh" # kelas positif = Sembuh (outcome_biner = 1)
)
print(cm_caret)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Meninggal/Buruk Sembuh
## Meninggal/Buruk 91 69
## Sembuh 122 1719
##
## Accuracy : 0.9045
## 95% CI : (0.8908, 0.9171)
## No Information Rate : 0.8936
## P-Value [Acc > NIR] : 0.0578743
##
## Kappa : 0.4365
##
## Mcnemar's Test P-Value : 0.0001682
##
## Sensitivity : 0.9614
## Specificity : 0.4272
## Pos Pred Value : 0.9337
## Neg Pred Value : 0.5687
## Prevalence : 0.8936
## Detection Rate : 0.8591
## Detection Prevalence : 0.9200
## Balanced Accuracy : 0.6943
##
## 'Positive' Class : Sembuh
##
# Tabel confusion matrix
cm_table <- as.data.frame.matrix(cm_caret$table)
cm_table <- cbind(cm_table, Total = rowSums(cm_table))
cm_table <- rbind(cm_table, Total = colSums(cm_table))
rownames(cm_table)[3] <- "Total"
knitr::kable(cm_table,
caption = "Confusion Matrix Testing (Threshold 0.50)")
Confusion Matrix Testing (Threshold 0.50)
| Meninggal/Buruk |
91 |
69 |
160 |
| Sembuh |
122 |
1719 |
1841 |
| Total |
213 |
1788 |
2001 |
# Tabel metrik
caret_metrics <- data.frame(
Metrik = c("Accuracy", "Kappa", "Sensitivity", "Specificity",
"Precision", "F1 Score", "Prevalence", "Detection Rate",
"Balanced Accuracy"),
Nilai = round(c(
cm_caret$overall["Accuracy"],
cm_caret$overall["Kappa"],
cm_caret$byClass["Sensitivity"],
cm_caret$byClass["Specificity"],
cm_caret$byClass["Precision"],
cm_caret$byClass["F1"],
cm_caret$byClass["Prevalence"],
cm_caret$byClass["Detection Rate"],
cm_caret$byClass["Balanced Accuracy"]
), 3)
)
knitr::kable(caret_metrics,
caption = "Metrik Evaluasi (Threshold 0.50, positif = Sembuh)")
Metrik Evaluasi (Threshold 0.50, positif = Sembuh)
| Accuracy |
Accuracy |
0.905 |
| Kappa |
Kappa |
0.436 |
| Sensitivity |
Sensitivity |
0.961 |
| Specificity |
Specificity |
0.427 |
| Precision |
Precision |
0.934 |
| F1 |
F1 Score |
0.947 |
| Prevalence |
Prevalence |
0.894 |
| Detection Rate |
Detection Rate |
0.859 |
| Balanced Accuracy |
Balanced Accuracy |
0.694 |
# Confusion matrix manual & metrik manual
knitr::kable(
confusion_matrix(test_data$outcome_biner, p_test, 0.5),
caption = "Confusion Matrix Manual - Threshold 0.50"
)
Confusion Matrix Manual - Threshold 0.50
| Aktual Meninggal/Buruk |
91 |
122 |
213 |
| Aktual Sembuh |
69 |
1719 |
1788 |
| Sum |
160 |
1841 |
2001 |
knitr::kable(
classification_metrics(test_data$outcome_biner, p_test, 0.5) %>%
format_metrics_indonesia() %>%
mutate(across(where(is.numeric), round, 3)),
caption = "Metrik Testing Manual - Threshold 0.50"
)
Metrik Testing Manual - Threshold 0.50
| 0.5 |
0.905 |
0.095 |
0.961 |
0.427 |
0.934 |
0.569 |
0.947 |
0.694 |
0.573 |
0.039 |
ROC dan AUC
cat("\n=== ROC-AUC ===\n")
##
## === ROC-AUC ===
# p = P(Sembuh=1), jadi roc() langsung dengan response = outcome_biner
roc_train_pROC <- roc(train_data$outcome_biner, p_train)
roc_test_pROC <- roc(test_data$outcome_biner, p_test)
cat("AUC Training:", round(auc(roc_train_pROC), 3), "\n")
## AUC Training: 0.857
cat("AUC Testing: ", round(auc(roc_test_pROC), 3), "\n")
## AUC Testing: 0.865
ci_train <- ci.auc(roc_train_pROC)
ci_test <- ci.auc(roc_test_pROC)
cat("CI 95% AUC Training:", round(ci_train, 3), "\n")
## CI 95% AUC Training: 0.845 0.857 0.87
cat("CI 95% AUC Testing: ", round(ci_test, 3), "\n")
## CI 95% AUC Testing: 0.842 0.865 0.889
# ROC manual
safe_div <- function(num, den) ifelse(den == 0, NA_real_, num / den)
roc_points <- function(actual, prob) {
thresholds <- c(Inf, sort(unique(prob), decreasing = TRUE), -Inf)
bind_rows(lapply(thresholds, function(th) {
pred <- as.integer(prob >= th)
tp <- sum(pred == 1 & actual == 1)
tn <- sum(pred == 0 & actual == 0)
fp <- sum(pred == 1 & actual == 0)
fn <- sum(pred == 0 & actual == 1)
data.frame(
threshold = th,
sensitivity = safe_div(tp, tp + fn),
specificity = safe_div(tn, tn + fp),
fpr = 1 - safe_div(tn, tn + fp),
youden = safe_div(tp, tp + fn) + safe_div(tn, tn + fp) - 1
)
}))
}
auc_value <- function(roc_df) {
d <- roc_df %>% arrange(fpr, sensitivity)
sum(diff(d$fpr) * (head(d$sensitivity, -1) + tail(d$sensitivity, -1)) / 2)
}
roc_train <- roc_points(train_data$outcome_biner, p_train) %>% mutate(data = "Training")
roc_test <- roc_points(test_data$outcome_biner, p_test) %>% mutate(data = "Testing")
auc_train_manual <- auc_value(roc_train)
auc_test_manual <- auc_value(roc_test)
knitr::kable(
data.frame(
Data = c("Training", "Testing"),
`AUC (Manual)`= round(c(auc_train_manual, auc_test_manual), 3),
`AUC (pROC)` = round(c(auc(roc_train_pROC), auc(roc_test_pROC)), 3),
check.names = FALSE
),
caption = "Perbandingan Nilai AUC"
)
Perbandingan Nilai AUC
| Training |
0.857 |
0.857 |
| Testing |
0.865 |
0.865 |
# Threshold optimal (Youden)
optimal_train <- roc_train %>%
filter(is.finite(threshold)) %>%
arrange(desc(youden), desc(sensitivity)) %>%
slice(1)
threshold_opt <- optimal_train$threshold[1]
cat("Threshold optimal (Youden):", round(threshold_opt, 3), "\n")
## Threshold optimal (Youden): 0.932
test_at_opt <- roc_points(test_data$outcome_biner, p_test) %>%
filter(is.finite(threshold)) %>%
slice_min(abs(threshold - threshold_opt), n = 1, with_ties = FALSE) %>%
mutate(data = "Testing (threshold optimal)")
ggplot(bind_rows(roc_train, roc_test), aes(x = fpr, y = sensitivity, color = data)) +
geom_path(linewidth = 1.1) +
geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "#6c757d") +
geom_point(data = optimal_train, aes(x = fpr, y = sensitivity),
inherit.aes = FALSE, color = "#ffb703", fill = "#fb8500",
shape = 21, size = 4, stroke = 1.2) +
geom_point(data = test_at_opt, aes(x = fpr, y = sensitivity),
inherit.aes = FALSE, color = "#8338ec", fill = "#3a86ff",
shape = 24, size = 4, stroke = 1.2) +
coord_equal() +
scale_color_manual(values = c("Training" = "#0077b6", "Testing" = "#e76f51")) +
labs(
title = "Kurva ROC Model Regresi Logistik",
subtitle = paste0("AUC training = ", round(auc_train_manual, 3),
"; AUC testing = ", round(auc_test_manual, 3),
"; threshold optimal = ", round(threshold_opt, 3)),
x = "False positive rate (1 - specificity)",
y = "Sensitivity / true positive rate",
color = "Data"
) +
theme_minimal(base_size = 12) +
theme(legend.position = "bottom")

Precission Recall
Curve
# Kelas positif = Sembuh (outcome_biner = 1)
# scores.class0 = skor untuk kelas positif (1 = Sembuh)
# scores.class1 = skor untuk kelas negatif (0 = Meninggal)
pr_train_prroc <- pr.curve(
scores.class0 = p_train[train_data$outcome_biner == 1],
scores.class1 = p_train[train_data$outcome_biner == 0],
curve = TRUE
)
pr_test_prroc <- pr.curve(
scores.class0 = p_test[test_data$outcome_biner == 1],
scores.class1 = p_test[test_data$outcome_biner == 0],
curve = TRUE
)
cat("AUPRC Training:", round(pr_train_prroc$auc.integral, 3), "\n")
## AUPRC Training: 0.978
cat("AUPRC Testing: ", round(pr_test_prroc$auc.integral, 3), "\n")
## AUPRC Testing: 0.98
# PR manual
pr_curve_manual <- function(actual, prob) {
thresholds <- sort(unique(prob), decreasing = TRUE)
bind_rows(lapply(thresholds, function(th) {
pred <- as.integer(prob >= th)
tp <- sum(pred == 1 & actual == 1)
fp <- sum(pred == 1 & actual == 0)
fn <- sum(pred == 0 & actual == 1)
data.frame(
threshold = th,
precision = safe_div(tp, tp + fp),
recall = safe_div(tp, tp + fn)
)
})) %>%
filter(!is.na(precision) & !is.na(recall))
}
pr_train_manual <- pr_curve_manual(train_data$outcome_biner, p_train) %>%
mutate(data = "Training")
pr_test_manual <- pr_curve_manual(test_data$outcome_biner, p_test) %>%
mutate(data = "Testing")
auprc_manual <- function(pr_df) {
d <- pr_df %>% arrange(desc(recall), precision)
abs(sum(diff(d$recall) *
(head(d$precision, -1) + tail(d$precision, -1)) / 2,
na.rm = TRUE))
}
knitr::kable(
data.frame(
Data = c("Training", "Testing"),
`AUPRC (Manual)`= round(c(auprc_manual(pr_train_manual),
auprc_manual(pr_test_manual)), 3),
`AUPRC (PRROC)` = round(c(pr_train_prroc$auc.integral,
pr_test_prroc$auc.integral), 3),
check.names = FALSE
),
caption = "Area Under Precision-Recall Curve (AUPRC)"
)
Area Under Precision-Recall Curve (AUPRC)
| Training |
0.759 |
0.978 |
| Testing |
0.770 |
0.980 |
prevalence_train <- mean(train_data$outcome_biner == 1)
prevalence_test <- mean(test_data$outcome_biner == 1)
ggplot(bind_rows(pr_train_manual, pr_test_manual),
aes(x = recall, y = precision, color = data)) +
geom_path(linewidth = 1.1) +
geom_hline(yintercept = prevalence_train, linetype = "dashed",
color = "#0077b6", alpha = 0.5) +
geom_hline(yintercept = prevalence_test, linetype = "dashed",
color = "#e76f51", alpha = 0.5) +
scale_color_manual(values = c("Training" = "#0077b6", "Testing" = "#e76f51")) +
labs(
title = "Precision-Recall Curve Model Regresi Logistik",
subtitle = paste0("AUPRC Training = ", round(auprc_manual(pr_train_manual), 3),
"; Testing = ", round(auprc_manual(pr_test_manual), 3),
"\nGaris putus-putus = proporsi kelas Sembuh (baseline)"),
x = "Recall (Sensitivity terhadap kelas Sembuh)",
y = "Precision",
color = "Data"
) +
theme_minimal(base_size = 12) +
theme(legend.position = "bottom") +
coord_cartesian(ylim = c(0, 1), xlim = c(0, 1))
