Regresi logistik multinomial digunakan ketika variabel respons bersifat nominal dengan tiga kategori atau lebih tanpa urutan yang bermakna. Model memodelkan log-odds setiap kategori \(k\) terhadap kategori referensi \(K\):
\[\log\frac{P(Y = k)}{P(Y = K)} = \beta_{k0} + \beta_{k1}X_1 + \cdots + \beta_{kp}X_p, \quad k = 1, \ldots, K-1\]
Eksponensial koefisien menghasilkan Odds Ratio (OR) yang menunjukkan seberapa besar suatu prediktor mengubah peluang berada di kategori \(k\) dibanding kategori referensi.
Data yang digunakan adalah Palmer Archipelago Antarctica
Penguin Dataset. Variabel respons adalah species
(nominal, 3 kategori: Adelie, Chinstrap,
Gentoo). Prediktor: culmen_length_mm,
culmen_depth_mm, flipper_length_mm,
body_mass_g, dan sex.
penguins <- read.csv("/Users/navirasmacbook/Desktop/Semester 4/Analisis Data Kategori/Tugas 3/penguins_size.csv",
stringsAsFactors = TRUE,
na.strings = c("NA", ".", ""))
penguins_clean <- penguins |>
filter(
!is.na(culmen_length_mm), !is.na(culmen_depth_mm),
!is.na(flipper_length_mm), !is.na(body_mass_g),
!is.na(sex), sex %in% c("MALE", "FEMALE")
) |>
mutate(species = factor(species), sex = factor(sex))
cat("Dimensi data:", nrow(penguins_clean), "baris x", ncol(penguins_clean), "kolom\n")## Dimensi data: 333 baris x 7 kolom
head(penguins_clean) |>
kable(caption = "Enam baris pertama data penguin") |>
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
full_width = FALSE)| species | island | culmen_length_mm | culmen_depth_mm | flipper_length_mm | body_mass_g | sex |
|---|---|---|---|---|---|---|
| Adelie | Torgersen | 39.1 | 18.7 | 181 | 3750 | MALE |
| Adelie | Torgersen | 39.5 | 17.4 | 186 | 3800 | FEMALE |
| Adelie | Torgersen | 40.3 | 18.0 | 195 | 3250 | FEMALE |
| Adelie | Torgersen | 36.7 | 19.3 | 193 | 3450 | FEMALE |
| Adelie | Torgersen | 39.3 | 20.6 | 190 | 3650 | MALE |
| Adelie | Torgersen | 38.9 | 17.8 | 181 | 3625 | FEMALE |
penguins_clean |>
count(species) |>
mutate(Proporsi = scales::percent(n / sum(n), accuracy = 0.1)) |>
rename(Spesies = species, Frekuensi = n) |>
kable(caption = "Frekuensi dan proporsi tiap spesies penguin") |>
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
full_width = FALSE)| Spesies | Frekuensi | Proporsi |
|---|---|---|
| Adelie | 146 | 43.8% |
| Chinstrap | 68 | 20.4% |
| Gentoo | 119 | 35.7% |
ggplot(penguins_clean, aes(x = species, fill = species)) +
geom_bar(width = 0.6, alpha = 0.9) +
geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.4, fontface = "bold") +
scale_fill_manual(values = c("#2f7f73", "#5568b8", "#d18b2f")) +
scale_y_continuous(expand = expansion(mult = c(0, 0.12))) +
labs(title = "Sebaran Spesies Penguin", x = "Spesies", y = "Jumlah Observasi") +
theme_minimal(base_size = 12) +
theme(legend.position = "none")# Boxplot panjang paruh per spesies
ggplot(penguins_clean, aes(x = species, y = culmen_length_mm, fill = species)) +
geom_boxplot(alpha = 0.8, outlier.shape = 21) +
scale_fill_manual(values = c("#2f7f73", "#5568b8", "#d18b2f")) +
labs(title = "Panjang Paruh per Spesies",
x = "Spesies", y = "Panjang Paruh (mm)") +
theme_minimal(base_size = 12) +
theme(legend.position = "none")# Scatter panjang paruh vs panjang sirip
ggplot(penguins_clean, aes(x = culmen_length_mm, y = flipper_length_mm,
colour = species, shape = sex)) +
geom_point(alpha = 0.7, size = 2.2) +
scale_colour_manual(values = c("#2f7f73", "#5568b8", "#d18b2f")) +
labs(title = "Panjang Paruh vs Panjang Sirip",
x = "Panjang Paruh (mm)", y = "Panjang Sirip (mm)",
colour = "Spesies", shape = "Jenis Kelamin") +
theme_minimal(base_size = 12)penguins_clean$species <- relevel(penguins_clean$species, ref = "Adelie")
fit_multi <- nnet::multinom(
species ~ culmen_length_mm + culmen_depth_mm + flipper_length_mm + body_mass_g + sex,
data = penguins_clean,
trace = FALSE
)
summary(fit_multi)## Call:
## nnet::multinom(formula = species ~ culmen_length_mm + culmen_depth_mm +
## flipper_length_mm + body_mass_g + sex, data = penguins_clean,
## trace = FALSE)
##
## Coefficients:
## (Intercept) culmen_length_mm culmen_depth_mm flipper_length_mm
## Chinstrap -103.422234 16.51643 -24.41759 -0.31003945
## Gentoo -6.756322 12.21924 -30.09759 -0.08757856
## body_mass_g sexMALE
## Chinstrap -0.030200987 -15.820608
## Gentoo 0.005019241 -8.132104
##
## Std. Errors:
## (Intercept) culmen_length_mm culmen_depth_mm flipper_length_mm
## Chinstrap 0.5715537 27.750159 27.43735 4.097368
## Gentoo 0.5715312 8.972629 33.25505 11.905646
## body_mass_g sexMALE
## Chinstrap 0.1350179 3.077204
## Gentoo 0.5848798 3.498127
##
## Residual Deviance: 0.002996928
## AIC: 24.003
multi_sum <- summary(fit_multi)
z_vals <- multi_sum$coefficients / multi_sum$standard.errors
p_vals <- 2 * (1 - pnorm(abs(z_vals)))
coef_df <- as.data.frame(t(multi_sum$coefficients)) |> tibble::rownames_to_column("Variabel")
se_df <- as.data.frame(t(multi_sum$standard.errors)) |> tibble::rownames_to_column("Variabel")
pv_df <- as.data.frame(t(p_vals)) |> tibble::rownames_to_column("Variabel")
result_multi <- coef_df |>
tidyr::pivot_longer(-Variabel, names_to = "Kategori", values_to = "Koefisien") |>
left_join(se_df |> tidyr::pivot_longer(-Variabel, names_to = "Kategori", values_to = "SE"),
by = c("Variabel", "Kategori")) |>
left_join(pv_df |> tidyr::pivot_longer(-Variabel, names_to = "Kategori", values_to = "p_value"),
by = c("Variabel", "Kategori")) |>
mutate(
OR = round(exp(Koefisien), 3),
p_value = round(p_value, 4),
Koefisien = round(Koefisien, 4),
SE = round(SE, 4),
Sig = dplyr::case_when(p_value < 0.001 ~ "***", p_value < 0.01 ~ "**",
p_value < 0.05 ~ "*", TRUE ~ "")
)
result_multi |>
kable(caption = "Koefisien dan OR regresi logistik multinomial (referensi: Adelie)") |>
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
full_width = FALSE)| Variabel | Kategori | Koefisien | SE | p_value | OR | Sig |
|---|---|---|---|---|---|---|
| (Intercept) | Chinstrap | -103.4222 | 0.5716 | 0.0000 | 0.000 | *** |
| (Intercept) | Gentoo | -6.7563 | 0.5715 | 0.0000 | 0.001 | *** |
| culmen_length_mm | Chinstrap | 16.5164 | 27.7502 | 0.5517 | 14893428.878 | |
| culmen_length_mm | Gentoo | 12.2192 | 8.9726 | 0.1732 | 202651.063 | |
| culmen_depth_mm | Chinstrap | -24.4176 | 27.4374 | 0.3735 | 0.000 | |
| culmen_depth_mm | Gentoo | -30.0976 | 33.2551 | 0.3654 | 0.000 | |
| flipper_length_mm | Chinstrap | -0.3100 | 4.0974 | 0.9397 | 0.733 | |
| flipper_length_mm | Gentoo | -0.0876 | 11.9056 | 0.9941 | 0.916 | |
| body_mass_g | Chinstrap | -0.0302 | 0.1350 | 0.8230 | 0.970 | |
| body_mass_g | Gentoo | 0.0050 | 0.5849 | 0.9932 | 1.005 | |
| sexMALE | Chinstrap | -15.8206 | 3.0772 | 0.0000 | 0.000 | *** |
| sexMALE | Gentoo | -8.1321 | 3.4981 | 0.0201 | 0.000 |
|
Koefisien diinterpretasikan sebagai log-odds suatu spesies dibanding Adelie. OR > 1 berarti prediktor meningkatkan peluang masuk kategori tersebut, OR < 1 berarti menurunkan peluang.
fit_multi_null <- nnet::multinom(species ~ 1, data = penguins_clean, trace = FALSE)
lrt_stat <- 2 * (as.numeric(logLik(fit_multi)) - as.numeric(logLik(fit_multi_null)))
lrt_df <- length(coef(fit_multi)) - length(coef(fit_multi_null))
lrt_p <- 1 - pchisq(lrt_stat, lrt_df)
tibble::tibble(
`Log-Lik Model Penuh` = round(as.numeric(logLik(fit_multi)), 3),
`Log-Lik Model Null` = round(as.numeric(logLik(fit_multi_null)), 3),
`G² (LRT)` = round(lrt_stat, 3),
`df` = lrt_df,
`p-value` = round(lrt_p, 4)
) |>
kable(caption = "Likelihood Ratio Test: model penuh vs model null") |>
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
full_width = FALSE)| Log-Lik Model Penuh | Log-Lik Model Null | G² (LRT) | df | p-value |
|---|---|---|---|---|
| -0.001 | -350.863 | 701.722 | 10 | 0 |
pred_class <- predict(fit_multi, type = "class")
conf_mat <- table(Aktual = penguins_clean$species, Prediksi = pred_class)
conf_mat## Prediksi
## Aktual Adelie Chinstrap Gentoo
## Adelie 146 0 0
## Chinstrap 0 68 0
## Gentoo 0 0 119
akurasi <- sum(diag(conf_mat)) / sum(conf_mat)
cat(sprintf("\nAkurasi klasifikasi: %.2f%%\n", akurasi * 100))##
## Akurasi klasifikasi: 100.00%
grid_multi <- expand.grid(
culmen_length_mm = seq(min(penguins_clean$culmen_length_mm),
max(penguins_clean$culmen_length_mm), length.out = 150),
culmen_depth_mm = mean(penguins_clean$culmen_depth_mm),
flipper_length_mm = mean(penguins_clean$flipper_length_mm),
body_mass_g = mean(penguins_clean$body_mass_g),
sex = factor("FEMALE", levels = levels(penguins_clean$sex))
)
grid_probs <- predict(fit_multi, newdata = grid_multi, type = "probs")
grid_plot <- grid_multi |>
dplyr::bind_cols(as.data.frame(grid_probs)) |>
tidyr::pivot_longer(cols = c("Adelie", "Chinstrap", "Gentoo"),
names_to = "Spesies", values_to = "Probabilitas")
ggplot(grid_plot, aes(x = culmen_length_mm, y = Probabilitas, colour = Spesies)) +
geom_line(linewidth = 1.3) +
scale_colour_manual(values = c("#2f7f73", "#5568b8", "#d18b2f")) +
scale_y_continuous(labels = percent_format()) +
labs(
title = "Kurva Probabilitas Prediksi vs Panjang Paruh",
subtitle = "Prediktor lain dikondisikan pada rata-ratanya (sex = FEMALE)",
x = "Panjang Paruh (mm)", y = "Probabilitas Prediksi"
) +
theme_minimal(base_size = 12)Regresi logistik ordinal (proportional odds model) digunakan ketika kategori respons memiliki urutan yang bermakna namun jarak antar kategori tidak perlu sama. Model mengasumsikan satu set koefisien berlaku untuk semua batas kategori (proportional odds assumption):
\[\log\frac{P(Y \leq k)}{P(Y > k)} = \alpha_k - \boldsymbol{\beta}^{\top}\mathbf{x}, \quad k = 1, \ldots, K-1\]
\(\alpha_k\) disebut intercept atau cutpoint ke-\(k\), sedangkan \(\boldsymbol{\beta}\) adalah vektor koefisien yang sama di semua cutpoint.
Data yang digunakan adalah Cervical Cancer Risk Factors
Dataset (UCI). Variabel respons ordinal dibentuk dari jumlah
tes diagnostik positif (Hinselmann, Schiller,
Citology, Biopsy): Rendah (0
tes positif), Sedang (1), Tinggi (2),
Sangat Tinggi (≥3). Prediktor: Age,
Smokes, Hormonal_Contraceptives,
IUD, STDs.
cervical <- read.csv("/Users/navirasmacbook/Desktop/Semester 4/Analisis Data Kategori/Tugas 3/risk_factors_cervical_cancer.csv",
stringsAsFactors = FALSE,
na.strings = c("?", "NA", ""),
check.names = TRUE)
cat("Dimensi data:", nrow(cervical), "baris x", ncol(cervical), "kolom\n")## Dimensi data: 858 baris x 36 kolom
cervical <- cervical |>
dplyr::rename(Hormonal_Contraceptives = Hormonal.Contraceptives) |>
mutate(across(c(Hinselmann, Schiller, Citology, Biopsy,
Smokes, Hormonal_Contraceptives, IUD, STDs, Age), as.numeric))
cervical <- cervical |>
mutate(
skor_tes = Hinselmann + Schiller + Citology + Biopsy,
risiko_ordinal = factor(
dplyr::case_when(
skor_tes == 0 ~ "Rendah",
skor_tes == 1 ~ "Sedang",
skor_tes == 2 ~ "Tinggi",
skor_tes >= 3 ~ "Sangat Tinggi"
),
levels = c("Rendah", "Sedang", "Tinggi", "Sangat Tinggi"),
ordered = TRUE
)
)
cervical_model <- cervical |>
dplyr::filter(!is.na(Age), !is.na(Smokes), !is.na(Hormonal_Contraceptives),
!is.na(IUD), !is.na(STDs), !is.na(risiko_ordinal)) |>
dplyr::select(risiko_ordinal, Age, Smokes, Hormonal_Contraceptives, IUD, STDs)
cat("Dimensi data model:", nrow(cervical_model), "baris\n")## Dimensi data model: 726 baris
head(cervical_model) |>
kable(caption = "Enam baris pertama data ordinal") |>
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
full_width = FALSE)| risiko_ordinal | Age | Smokes | Hormonal_Contraceptives | IUD | STDs |
|---|---|---|---|---|---|
| Rendah | 18 | 0 | 0 | 0 | 0 |
| Rendah | 15 | 0 | 0 | 0 | 0 |
| Rendah | 34 | 0 | 0 | 0 | 0 |
| Rendah | 52 | 1 | 1 | 0 | 0 |
| Rendah | 46 | 0 | 1 | 0 | 0 |
| Rendah | 42 | 0 | 0 | 0 | 0 |
cervical_model |>
count(risiko_ordinal) |>
mutate(Proporsi = scales::percent(n / sum(n), accuracy = 0.1)) |>
rename(`Tingkat Risiko` = risiko_ordinal, Frekuensi = n) |>
kable(caption = "Frekuensi dan proporsi tingkat risiko kanker serviks") |>
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
full_width = FALSE)| Tingkat Risiko | Frekuensi | Proporsi |
|---|---|---|
| Rendah | 633 | 87.2% |
| Sedang | 36 | 5.0% |
| Tinggi | 20 | 2.8% |
| Sangat Tinggi | 37 | 5.1% |
ggplot(cervical_model, aes(x = risiko_ordinal, fill = risiko_ordinal)) +
geom_bar(width = 0.65, alpha = 0.92) +
geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.4, fontface = "bold") +
scale_fill_manual(values = c("#2f7f73", "#5568b8", "#d18b2f", "#b84f5a")) +
scale_y_continuous(expand = expansion(mult = c(0, 0.12))) +
labs(title = "Sebaran Tingkat Risiko Kanker Serviks",
subtitle = "Urutan: Rendah < Sedang < Tinggi < Sangat Tinggi",
x = "Tingkat Risiko", y = "Jumlah Observasi") +
theme_minimal(base_size = 12) +
theme(legend.position = "none")ggplot(cervical_model, aes(x = risiko_ordinal, y = Age, fill = risiko_ordinal)) +
geom_violin(alpha = 0.6, trim = FALSE) +
geom_boxplot(width = 0.15, fill = "white", outlier.shape = 21) +
scale_fill_manual(values = c("#2f7f73", "#5568b8", "#d18b2f", "#b84f5a")) +
labs(title = "Sebaran Usia per Tingkat Risiko",
x = "Tingkat Risiko", y = "Usia (tahun)") +
theme_minimal(base_size = 12) +
theme(legend.position = "none")cervical_model |>
tidyr::pivot_longer(cols = c(Smokes, Hormonal_Contraceptives, IUD, STDs),
names_to = "Variabel", values_to = "Nilai") |>
group_by(risiko_ordinal, Variabel) |>
summarise(Proporsi = mean(Nilai, na.rm = TRUE), .groups = "drop") |>
ggplot(aes(x = risiko_ordinal, y = Proporsi, fill = risiko_ordinal)) +
geom_col(width = 0.65, alpha = 0.9) +
scale_fill_manual(values = c("#2f7f73", "#5568b8", "#d18b2f", "#b84f5a")) +
scale_y_continuous(labels = percent_format()) +
facet_wrap(~ Variabel, scales = "free_y") +
labs(title = "Proporsi Prediktor Biner per Tingkat Risiko",
x = "Tingkat Risiko", y = "Proporsi") +
theme_minimal(base_size = 11) +
theme(legend.position = "none", axis.text.x = element_text(angle = 30, hjust = 1))fit_ord <- MASS::polr(
risiko_ordinal ~ Age + Smokes + Hormonal_Contraceptives + IUD + STDs,
data = cervical_model,
method = "logistic",
Hess = TRUE
)
summary(fit_ord)## Call:
## MASS::polr(formula = risiko_ordinal ~ Age + Smokes + Hormonal_Contraceptives +
## IUD + STDs, data = cervical_model, Hess = TRUE, method = "logistic")
##
## Coefficients:
## Value Std. Error t value
## Age 0.0005485 0.01301 0.04216
## Smokes 0.4683227 0.28611 1.63687
## Hormonal_Contraceptives -0.0258956 0.23305 -0.11111
## IUD 0.5791122 0.32449 1.78467
## STDs 0.9210369 0.30475 3.02223
##
## Intercepts:
## Value Std. Error t value
## Rendah|Sedang 2.1944 0.3928 5.5864
## Sedang|Tinggi 2.7504 0.4020 6.8415
## Tinggi|Sangat Tinggi 3.2177 0.4142 7.7679
##
## Residual Deviance: 737.8958
## AIC: 753.8958
ord_coef <- coef(summary(fit_ord))
result_ord <- as.data.frame(ord_coef) |>
tibble::rownames_to_column("Parameter") |>
dplyr::rename(Koefisien = Value, SE = `Std. Error`, `t-value` = `t value`) |>
mutate(
`p-value` = round(2 * (1 - pnorm(abs(`t-value`))), 4),
Jenis = ifelse(grepl("\\|", Parameter), "Cutpoint", "Koefisien"),
OR = ifelse(Jenis == "Koefisien", round(exp(-Koefisien), 3), NA_real_),
Koefisien = round(Koefisien, 4),
SE = round(SE, 4),
`t-value` = round(`t-value`, 4),
Sig = dplyr::case_when(`p-value` < 0.001 ~ "***", `p-value` < 0.01 ~ "**",
`p-value` < 0.05 ~ "*", TRUE ~ "")
)
result_ord |>
kable(caption = "Koefisien regresi logistik ordinal — proportional odds model") |>
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
full_width = FALSE)| Parameter | Koefisien | SE | t-value | p-value | Jenis | OR | Sig |
|---|---|---|---|---|---|---|---|
| Age | 0.0005 | 0.0130 | 0.0422 | 0.9664 | Koefisien | 0.999 | |
| Smokes | 0.4683 | 0.2861 | 1.6369 | 0.1017 | Koefisien | 0.626 | |
| Hormonal_Contraceptives | -0.0259 | 0.2331 | -0.1111 | 0.9115 | Koefisien | 1.026 | |
| IUD | 0.5791 | 0.3245 | 1.7847 | 0.0743 | Koefisien | 0.560 | |
| STDs | 0.9210 | 0.3048 | 3.0222 | 0.0025 | Koefisien | 0.398 | ** |
| Rendah|Sedang | 2.1944 | 0.3928 | 5.5864 | 0.0000 | Cutpoint | NA | *** |
| Sedang|Tinggi | 2.7504 | 0.4020 | 6.8415 | 0.0000 | Cutpoint | NA | *** |
| Tinggi|Sangat Tinggi | 3.2177 | 0.4142 | 7.7679 | 0.0000 | Cutpoint | NA | *** |
OR kumulatif dihitung sebagai \(\exp(-\hat\beta)\) sesuai konvensi
polr. OR > 1 berarti prediktor
meningkatkan peluang berada di kategori yang lebih tinggi;
cutpoint menandai batas log-odds antar kategori yang
berdekatan.
fit_ord_null <- MASS::polr(risiko_ordinal ~ 1, data = cervical_model,
method = "logistic", Hess = TRUE)
lrt_ord <- 2 * (as.numeric(logLik(fit_ord)) - as.numeric(logLik(fit_ord_null)))
lrt_ord_df <- length(coef(fit_ord)) - length(coef(fit_ord_null))
lrt_ord_p <- 1 - pchisq(lrt_ord, lrt_ord_df)
tibble::tibble(
`Log-Lik Model Penuh` = round(as.numeric(logLik(fit_ord)), 3),
`Log-Lik Model Null` = round(as.numeric(logLik(fit_ord_null)), 3),
`G² (LRT)` = round(lrt_ord, 3),
`df` = lrt_ord_df,
`p-value` = round(lrt_ord_p, 4)
) |>
kable(caption = "Likelihood Ratio Test: model penuh vs model null") |>
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
full_width = FALSE)| Log-Lik Model Penuh | Log-Lik Model Null | G² (LRT) | df | p-value |
|---|---|---|---|---|
| -368.948 | -376.888 | 15.881 | 5 | 0.0072 |
pred_ord <- predict(fit_ord, type = "class")
conf_ord <- table(Aktual = cervical_model$risiko_ordinal, Prediksi = pred_ord)
conf_ord## Prediksi
## Aktual Rendah Sedang Tinggi Sangat Tinggi
## Rendah 633 0 0 0
## Sedang 36 0 0 0
## Tinggi 20 0 0 0
## Sangat Tinggi 37 0 0 0
akurasi_ord <- sum(diag(conf_ord)) / sum(conf_ord)
cat(sprintf("\nAkurasi klasifikasi: %.2f%%\n", akurasi_ord * 100))##
## Akurasi klasifikasi: 87.19%
grid_ord <- data.frame(
Age = seq(min(cervical_model$Age), max(cervical_model$Age), length.out = 120),
Smokes = 0,
Hormonal_Contraceptives = 0,
IUD = 0,
STDs = 0
)
grid_probs_ord <- predict(fit_ord, newdata = grid_ord, type = "probs")
grid_ord_plot <- grid_ord |>
dplyr::bind_cols(as.data.frame(grid_probs_ord)) |>
tidyr::pivot_longer(cols = c("Rendah", "Sedang", "Tinggi", "Sangat Tinggi"),
names_to = "Kategori", values_to = "Probabilitas") |>
mutate(Kategori = factor(Kategori,
levels = c("Rendah", "Sedang", "Tinggi", "Sangat Tinggi")))
ggplot(grid_ord_plot, aes(x = Age, y = Probabilitas, colour = Kategori)) +
geom_line(linewidth = 1.3) +
scale_colour_manual(values = c("#2f7f73", "#5568b8", "#d18b2f", "#b84f5a")) +
scale_y_continuous(labels = percent_format()) +
labs(
title = "Kurva Probabilitas Prediksi vs Usia",
subtitle = "Prediktor lain dikondisikan pada nilai nol",
x = "Usia (tahun)", y = "Probabilitas Prediksi"
) +
theme_minimal(base_size = 12)Regresi logistik biner memodelkan peluang kejadian (\(Y=1\)) sebagai fungsi dari prediktor menggunakan fungsi logit sebagai link:
\[\log\frac{P(Y=1)}{1 - P(Y=1)} = \beta_0 + \beta_1 X_1 + \cdots + \beta_p X_p\]
Eksponensial koefisien menghasilkan Odds Ratio (OR) yang mengukur perubahan odds kejadian untuk setiap satu satuan kenaikan prediktor.
Data yang digunakan adalah Maternal Health Risk
Dataset (UCI). Variabel RiskLevel dikodekan biner:
high risk = 1, selain itu = 0. Prediktor: Age,
SystolicBP, DiastolicBP, BS,
BodyTemp, HeartRate.
maternal <- read.csv("/Users/navirasmacbook/Desktop/Semester 4/Analisis Data Kategori/Tugas 3/Maternal Health Risk Data Set.csv",
stringsAsFactors = FALSE)
maternal <- maternal |>
mutate(high_risk = as.integer(RiskLevel == "high risk"))
cat("Dimensi data:", nrow(maternal), "baris x", ncol(maternal), "kolom\n")## Dimensi data: 1014 baris x 8 kolom
head(maternal) |>
kable(caption = "Enam baris pertama data maternal health") |>
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
full_width = FALSE)| Age | SystolicBP | DiastolicBP | BS | BodyTemp | HeartRate | RiskLevel | high_risk |
|---|---|---|---|---|---|---|---|
| 25 | 130 | 80 | 15.00 | 98 | 86 | high risk | 1 |
| 35 | 140 | 90 | 13.00 | 98 | 70 | high risk | 1 |
| 29 | 90 | 70 | 8.00 | 100 | 80 | high risk | 1 |
| 30 | 140 | 85 | 7.00 | 98 | 70 | high risk | 1 |
| 35 | 120 | 60 | 6.10 | 98 | 76 | low risk | 0 |
| 23 | 140 | 80 | 7.01 | 98 | 70 | high risk | 1 |
maternal |>
count(RiskLevel) |>
mutate(Proporsi = scales::percent(n / sum(n), accuracy = 0.1)) |>
rename(`Level Risiko` = RiskLevel, Frekuensi = n) |>
kable(caption = "Frekuensi dan proporsi tingkat risiko kesehatan ibu") |>
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
full_width = FALSE)| Level Risiko | Frekuensi | Proporsi |
|---|---|---|
| high risk | 272 | 26.8% |
| low risk | 406 | 40.0% |
| mid risk | 336 | 33.1% |
ggplot(maternal, aes(x = factor(high_risk, labels = c("Bukan High Risk", "High Risk")),
fill = factor(high_risk))) +
geom_bar(width = 0.6, alpha = 0.9) +
geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.4, fontface = "bold") +
scale_fill_manual(values = c("#2a9d8f", "#e76f51")) +
scale_y_continuous(expand = expansion(mult = c(0, 0.12))) +
labs(title = "Sebaran Variabel Respons Biner",
x = "Status Risiko", y = "Jumlah Observasi") +
theme_minimal(base_size = 12) +
theme(legend.position = "none")ggplot(maternal, aes(x = factor(high_risk, labels = c("Bukan High Risk", "High Risk")),
y = SystolicBP, fill = factor(high_risk))) +
geom_violin(alpha = 0.6, trim = FALSE) +
geom_boxplot(width = 0.12, fill = "white", outlier.shape = 21) +
scale_fill_manual(values = c("#2a9d8f", "#e76f51")) +
labs(title = "Tekanan Darah Sistolik per Status Risiko",
x = "Status Risiko", y = "Tekanan Darah Sistolik (mmHg)") +
theme_minimal(base_size = 12) +
theme(legend.position = "none")ggplot(maternal, aes(x = Age, y = BS,
colour = factor(high_risk, labels = c("Bukan High Risk", "High Risk")))) +
geom_point(alpha = 0.55, size = 1.8) +
scale_colour_manual(values = c("#2a9d8f", "#e76f51")) +
labs(title = "Usia vs Kadar Gula Darah",
x = "Usia (tahun)", y = "Kadar Gula Darah (mmol/L)",
colour = "Status Risiko") +
theme_minimal(base_size = 12)set.seed(42)
train_idx <- sample(seq_len(nrow(maternal)), size = floor(0.8 * nrow(maternal)))
train_mat <- maternal[train_idx, ]
test_mat <- maternal[-train_idx, ]
fit_biner <- glm(
high_risk ~ Age + SystolicBP + DiastolicBP + BS + BodyTemp + HeartRate,
data = train_mat,
family = binomial(link = "logit")
)
summary(fit_biner)##
## Call:
## glm(formula = high_risk ~ Age + SystolicBP + DiastolicBP + BS +
## BodyTemp + HeartRate, family = binomial(link = "logit"),
## data = train_mat)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -68.09573 8.36954 -8.136 4.08e-16 ***
## Age -0.01496 0.01087 -1.376 0.16886
## SystolicBP 0.02332 0.01095 2.129 0.03328 *
## DiastolicBP 0.05075 0.01360 3.732 0.00019 ***
## BS 0.46388 0.04891 9.484 < 2e-16 ***
## BodyTemp 0.52932 0.07837 6.754 1.43e-11 ***
## HeartRate 0.05774 0.01414 4.085 4.41e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 944.11 on 810 degrees of freedom
## Residual deviance: 549.47 on 804 degrees of freedom
## AIC: 563.47
##
## Number of Fisher Scoring iterations: 6
biner_result <- broom::tidy(fit_biner) |>
mutate(
OR = round(exp(estimate), 3),
CI_low = round(exp(estimate - 1.96 * std.error), 3),
CI_high = round(exp(estimate + 1.96 * std.error), 3),
p.value = round(p.value, 4),
estimate = round(estimate, 4),
Sig = dplyr::case_when(p.value < 0.001 ~ "***", p.value < 0.01 ~ "**",
p.value < 0.05 ~ "*", TRUE ~ "")
) |>
dplyr::rename(Variabel = term, Koefisien = estimate, SE = std.error,
`z-value` = statistic, `p-value` = p.value,
`CI 2.5%` = CI_low, `CI 97.5%` = CI_high)
biner_result |>
kable(caption = "Koefisien dan Odds Ratio regresi logistik biner") |>
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
full_width = FALSE)| Variabel | Koefisien | SE | z-value | p-value | OR | CI 2.5% | CI 97.5% | Sig |
|---|---|---|---|---|---|---|---|---|
| (Intercept) | -68.0957 | 8.3695354 | -8.136143 | 0.0000 | 0.000 | 0.000 | 0.000 | *** |
| Age | -0.0150 | 0.0108730 | -1.375880 | 0.1689 | 0.985 | 0.964 | 1.006 | |
| SystolicBP | 0.0233 | 0.0109531 | 2.128684 | 0.0333 | 1.024 | 1.002 | 1.046 |
|
| DiastolicBP | 0.0507 | 0.0135996 | 3.731511 | 0.0002 | 1.052 | 1.024 | 1.080 | *** |
| BS | 0.4639 | 0.0489123 | 9.483909 | 0.0000 | 1.590 | 1.445 | 1.750 | *** |
| BodyTemp | 0.5293 | 0.0783667 | 6.754403 | 0.0000 | 1.698 | 1.456 | 1.980 | *** |
| HeartRate | 0.0577 | 0.0141363 | 4.084724 | 0.0000 | 1.059 | 1.030 | 1.089 | *** |
OR > 1 berarti prediktor meningkatkan odds high risk; OR < 1 berarti menurunkan. Interval kepercayaan 95% yang tidak mencakup 1 mengonfirmasi signifikansi statistik.
p_test <- predict(fit_biner, newdata = test_mat, type = "response")
pred_class_biner <- ifelse(p_test >= 0.5, 1, 0)
conf_biner <- table(Aktual = test_mat$high_risk, Prediksi = pred_class_biner)
conf_biner## Prediksi
## Aktual 0 1
## 0 138 11
## 1 20 34
akurasi_biner <- sum(diag(conf_biner)) / sum(conf_biner)
cat(sprintf("\nAkurasi pada data testing: %.2f%%\n", akurasi_biner * 100))##
## Akurasi pada data testing: 84.73%
grid_biner <- data.frame(
Age = mean(train_mat$Age),
SystolicBP = seq(min(maternal$SystolicBP), max(maternal$SystolicBP), length.out = 150),
DiastolicBP = mean(train_mat$DiastolicBP),
BS = mean(train_mat$BS),
BodyTemp = mean(train_mat$BodyTemp),
HeartRate = mean(train_mat$HeartRate)
)
grid_biner$prob <- predict(fit_biner, newdata = grid_biner, type = "response")
ggplot(grid_biner, aes(x = SystolicBP, y = prob)) +
geom_ribbon(aes(ymin = 0, ymax = prob), fill = "#e76f51", alpha = 0.15) +
geom_line(colour = "#e76f51", linewidth = 1.4) +
geom_hline(yintercept = 0.5, linetype = "dashed", colour = "grey40") +
scale_y_continuous(labels = percent_format(), limits = c(0, 1)) +
labs(
title = "Kurva Sigmoid: Probabilitas High Risk vs Tekanan Darah Sistolik",
subtitle = "Prediktor lain dikondisikan pada nilai rata-rata | garis putus-putus = cutoff 50%",
x = "Tekanan Darah Sistolik (mmHg)", y = "P(High Risk)"
) +
theme_minimal(base_size = 12)Regresi Poisson digunakan untuk memodelkan data cacahan (count data) — variabel respons berupa bilangan bulat non-negatif (\(Y \in \{0, 1, 2, \ldots\}\)). Model mengasumsikan \(Y \sim \text{Poisson}(\mu)\) dengan fungsi link logaritma:
\[\log(\mu_i) = \beta_0 + \beta_1 X_{1i} + \cdots + \beta_p X_{pi}\]
Eksponensial koefisien menghasilkan Incidence Rate Ratio (IRR) yang mengukur perubahan relatif rata-rata cacahan.
Data yang digunakan adalah Student Awards UCLA OARC.
Variabel respons adalah num_awards (jumlah penghargaan per
mahasiswa). Prediktor: prog (program studi: General,
Academic, Vocational) dan math (nilai ujian
matematika).
students <- read.csv("/Users/navirasmacbook/Desktop/Semester 4/Analisis Data Kategori/Tugas 3/Student Awards — UCLA OARC.csv", sep = ";",
stringsAsFactors = FALSE)
students <- students |>
mutate(prog = factor(prog, levels = 1:3,
labels = c("General", "Academic", "Vocational")))
cat("Dimensi data:", nrow(students), "baris x", ncol(students), "kolom\n")## Dimensi data: 200 baris x 4 kolom
head(students, 8) |>
kable(caption = "Delapan baris pertama data student awards") |>
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
full_width = FALSE)| id | num_awards | prog | math |
|---|---|---|---|
| 45 | 0 | Vocational | 41 |
| 108 | 0 | General | 41 |
| 15 | 0 | Vocational | 44 |
| 67 | 0 | Vocational | 42 |
| 153 | 0 | Vocational | 40 |
| 51 | 0 | General | 42 |
| 164 | 0 | Vocational | 46 |
| 133 | 0 | Vocational | 40 |
students |>
count(num_awards) |>
mutate(Proporsi = scales::percent(n / sum(n), accuracy = 0.1)) |>
rename(`Jumlah Penghargaan` = num_awards, Frekuensi = n) |>
kable(caption = "Frekuensi dan proporsi jumlah penghargaan mahasiswa") |>
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
full_width = FALSE)| Jumlah Penghargaan | Frekuensi | Proporsi |
|---|---|---|
| 0 | 124 | 62.0% |
| 1 | 49 | 24.5% |
| 2 | 13 | 6.5% |
| 3 | 9 | 4.5% |
| 4 | 2 | 1.0% |
| 5 | 2 | 1.0% |
| 6 | 1 | 0.5% |
ggplot(students, aes(x = num_awards)) +
geom_bar(fill = "#2f7f73", colour = "white", width = 0.7) +
geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.4, fontface = "bold") +
scale_x_continuous(breaks = 0:max(students$num_awards)) +
scale_y_continuous(expand = expansion(mult = c(0, 0.12))) +
labs(title = "Sebaran Jumlah Penghargaan Mahasiswa",
subtitle = "Data cacahan dengan banyak nilai nol — tipikal distribusi Poisson",
x = "Jumlah Penghargaan", y = "Frekuensi") +
theme_minimal(base_size = 12)students |>
group_by(prog) |>
summarise(n = n(), `Rata-rata` = round(mean(num_awards), 3),
Varians = round(var(num_awards), 3), .groups = "drop") |>
rename(Program = prog) |>
kable(caption = "Rata-rata dan varians jumlah penghargaan per program studi") |>
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
full_width = FALSE)| Program | n | Rata-rata | Varians |
|---|---|---|---|
| General | 45 | 0.20 | 0.164 |
| Academic | 105 | 1.00 | 1.635 |
| Vocational | 50 | 0.24 | 0.268 |
ggplot(students, aes(x = math, y = num_awards, colour = prog)) +
geom_jitter(height = 0.15, alpha = 0.65, size = 2) +
geom_smooth(method = "loess", se = FALSE, linewidth = 1.1) +
scale_colour_manual(values = c("#2f7f73", "#5568b8", "#d18b2f")) +
labs(title = "Nilai Matematika vs Jumlah Penghargaan per Program",
x = "Nilai Matematika", y = "Jumlah Penghargaan",
colour = "Program") +
theme_minimal(base_size = 12)fit_pois <- glm(
num_awards ~ prog + math,
data = students,
family = poisson(link = "log")
)
summary(fit_pois)##
## Call:
## glm(formula = num_awards ~ prog + math, family = poisson(link = "log"),
## data = students)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.24712 0.65845 -7.969 1.60e-15 ***
## progAcademic 1.08386 0.35825 3.025 0.00248 **
## progVocational 0.36981 0.44107 0.838 0.40179
## math 0.07015 0.01060 6.619 3.63e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 287.67 on 199 degrees of freedom
## Residual deviance: 189.45 on 196 degrees of freedom
## AIC: 373.5
##
## Number of Fisher Scoring iterations: 6
pois_result <- broom::tidy(fit_pois) |>
mutate(
IRR = round(exp(estimate), 3),
CI_low = round(exp(estimate - 1.96 * std.error), 3),
CI_high = round(exp(estimate + 1.96 * std.error), 3),
p.value = round(p.value, 4),
estimate = round(estimate, 4),
`% Perubahan` = round((exp(estimate) - 1) * 100, 2),
Sig = dplyr::case_when(p.value < 0.001 ~ "***", p.value < 0.01 ~ "**",
p.value < 0.05 ~ "*", TRUE ~ "")
) |>
dplyr::rename(Variabel = term, Koefisien = estimate, SE = std.error,
`z-value` = statistic, `p-value` = p.value,
`CI 2.5%` = CI_low, `CI 97.5%` = CI_high)
pois_result |>
kable(caption = "Koefisien dan IRR regresi Poisson") |>
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
full_width = FALSE)| Variabel | Koefisien | SE | z-value | p-value | IRR | CI 2.5% | CI 97.5% | % Perubahan | Sig |
|---|---|---|---|---|---|---|---|---|---|
| (Intercept) | -5.2471 | 0.6584531 | -7.968866 | 0.0000 | 0.005 | 0.001 | 0.019 | -99.47 | *** |
| progAcademic | 1.0839 | 0.3582530 | 3.025402 | 0.0025 | 2.956 | 1.465 | 5.966 | 195.62 | ** |
| progVocational | 0.3698 | 0.4410703 | 0.838436 | 0.4018 | 1.447 | 0.610 | 3.436 | 44.74 | |
| math | 0.0702 | 0.0105992 | 6.618647 | 0.0000 | 1.073 | 1.051 | 1.095 | 7.27 | *** |
IRR > 1 berarti prediktor meningkatkan rata-rata
jumlah penghargaan; IRR < 1 berarti menurunkan.
Kolom % Perubahan menunjukkan besar perubahan relatif
rata-rata cacahan per satu satuan kenaikan prediktor.
dispersion <- sum(residuals(fit_pois, type = "pearson")^2) / df.residual(fit_pois)
tibble::tibble(
`Dispersion Pearson` = round(dispersion, 4),
Interpretasi = dplyr::case_when(
dispersion < 1.5 ~ "Tidak ada indikasi overdispersion berat",
dispersion < 2.5 ~ "Ada indikasi overdispersion sedang",
TRUE ~ "Ada indikasi overdispersion kuat"
)
) |>
kable(caption = "Rasio dispersi Pearson — indikator overdispersion") |>
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
full_width = FALSE)| Dispersion Pearson | Interpretasi |
|---|---|
| 1.0824 | Tidak ada indikasi overdispersion berat |
Jika nilai dispersi Pearson jauh di atas 1, model Negative Binomial lebih sesuai dibanding Poisson.
grid_pois <- expand.grid(
math = seq(min(students$math), max(students$math), length.out = 100),
prog = factor(c("General", "Academic", "Vocational"), levels = levels(students$prog))
)
pred_link <- predict(fit_pois, newdata = grid_pois, type = "link", se.fit = TRUE)
grid_pois <- grid_pois |>
mutate(
pred_rate = exp(pred_link$fit),
ci_low = exp(pred_link$fit - 1.96 * pred_link$se.fit),
ci_high = exp(pred_link$fit + 1.96 * pred_link$se.fit)
)
ggplot(grid_pois, aes(x = math, y = pred_rate, colour = prog, fill = prog)) +
geom_ribbon(aes(ymin = ci_low, ymax = ci_high), alpha = 0.15, colour = NA) +
geom_line(linewidth = 1.3) +
geom_jitter(data = students, aes(x = math, y = num_awards, colour = prog),
height = 0.1, alpha = 0.3, size = 1.5, inherit.aes = FALSE) +
scale_colour_manual(values = c("#2f7f73", "#5568b8", "#d18b2f")) +
scale_fill_manual(values = c("#2f7f73", "#5568b8", "#d18b2f")) +
labs(
title = "Kurva Prediksi Rate Poisson vs Nilai Matematika",
subtitle = "Pita = interval kepercayaan 95%; titik = data aktual",
x = "Nilai Matematika", y = "Rata-rata Prediksi Jumlah Penghargaan",
colour = "Program", fill = "Program"
) +
theme_minimal(base_size = 12)