1 Regresi Logistik Multinomial

1.1 Konsep dan Formula

Regresi logistik multinomial digunakan ketika variabel respons bersifat nominal dengan tiga kategori atau lebih tanpa urutan yang bermakna. Model memodelkan log-odds setiap kategori \(k\) terhadap kategori referensi \(K\):

\[\log\frac{P(Y = k)}{P(Y = K)} = \beta_{k0} + \beta_{k1}X_1 + \cdots + \beta_{kp}X_p, \quad k = 1, \ldots, K-1\]

Eksponensial koefisien menghasilkan Odds Ratio (OR) yang menunjukkan seberapa besar suatu prediktor mengubah peluang berada di kategori \(k\) dibanding kategori referensi.

1.2 Persiapan Data

Data yang digunakan adalah Palmer Archipelago Antarctica Penguin Dataset. Variabel respons adalah species (nominal, 3 kategori: Adelie, Chinstrap, Gentoo). Prediktor: culmen_length_mm, culmen_depth_mm, flipper_length_mm, body_mass_g, dan sex.

penguins <- read.csv("/Users/navirasmacbook/Desktop/Semester 4/Analisis Data Kategori/Tugas 3/penguins_size.csv",
                     stringsAsFactors = TRUE,
                     na.strings = c("NA", ".", ""))

penguins_clean <- penguins |>
  filter(
    !is.na(culmen_length_mm), !is.na(culmen_depth_mm),
    !is.na(flipper_length_mm), !is.na(body_mass_g),
    !is.na(sex), sex %in% c("MALE", "FEMALE")
  ) |>
  mutate(species = factor(species), sex = factor(sex))

cat("Dimensi data:", nrow(penguins_clean), "baris x", ncol(penguins_clean), "kolom\n")
## Dimensi data: 333 baris x 7 kolom
head(penguins_clean) |>
  kable(caption = "Enam baris pertama data penguin") |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE)
Enam baris pertama data penguin
species island culmen_length_mm culmen_depth_mm flipper_length_mm body_mass_g sex
Adelie Torgersen 39.1 18.7 181 3750 MALE
Adelie Torgersen 39.5 17.4 186 3800 FEMALE
Adelie Torgersen 40.3 18.0 195 3250 FEMALE
Adelie Torgersen 36.7 19.3 193 3450 FEMALE
Adelie Torgersen 39.3 20.6 190 3650 MALE
Adelie Torgersen 38.9 17.8 181 3625 FEMALE

1.3 Sebaran Variabel Respons

penguins_clean |>
  count(species) |>
  mutate(Proporsi = scales::percent(n / sum(n), accuracy = 0.1)) |>
  rename(Spesies = species, Frekuensi = n) |>
  kable(caption = "Frekuensi dan proporsi tiap spesies penguin") |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE)
Frekuensi dan proporsi tiap spesies penguin
Spesies Frekuensi Proporsi
Adelie 146 43.8%
Chinstrap 68 20.4%
Gentoo 119 35.7%
ggplot(penguins_clean, aes(x = species, fill = species)) +
  geom_bar(width = 0.6, alpha = 0.9) +
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.4, fontface = "bold") +
  scale_fill_manual(values = c("#2f7f73", "#5568b8", "#d18b2f")) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.12))) +
  labs(title = "Sebaran Spesies Penguin", x = "Spesies", y = "Jumlah Observasi") +
  theme_minimal(base_size = 12) +
  theme(legend.position = "none")

1.4 Eksplorasi Prediktor

# Boxplot panjang paruh per spesies
ggplot(penguins_clean, aes(x = species, y = culmen_length_mm, fill = species)) +
  geom_boxplot(alpha = 0.8, outlier.shape = 21) +
  scale_fill_manual(values = c("#2f7f73", "#5568b8", "#d18b2f")) +
  labs(title = "Panjang Paruh per Spesies",
       x = "Spesies", y = "Panjang Paruh (mm)") +
  theme_minimal(base_size = 12) +
  theme(legend.position = "none")

# Scatter panjang paruh vs panjang sirip
ggplot(penguins_clean, aes(x = culmen_length_mm, y = flipper_length_mm,
                            colour = species, shape = sex)) +
  geom_point(alpha = 0.7, size = 2.2) +
  scale_colour_manual(values = c("#2f7f73", "#5568b8", "#d18b2f")) +
  labs(title = "Panjang Paruh vs Panjang Sirip",
       x = "Panjang Paruh (mm)", y = "Panjang Sirip (mm)",
       colour = "Spesies", shape = "Jenis Kelamin") +
  theme_minimal(base_size = 12)

1.5 Estimasi Model

penguins_clean$species <- relevel(penguins_clean$species, ref = "Adelie")

fit_multi <- nnet::multinom(
  species ~ culmen_length_mm + culmen_depth_mm + flipper_length_mm + body_mass_g + sex,
  data  = penguins_clean,
  trace = FALSE
)

summary(fit_multi)
## Call:
## nnet::multinom(formula = species ~ culmen_length_mm + culmen_depth_mm + 
##     flipper_length_mm + body_mass_g + sex, data = penguins_clean, 
##     trace = FALSE)
## 
## Coefficients:
##           (Intercept) culmen_length_mm culmen_depth_mm flipper_length_mm
## Chinstrap -103.422234         16.51643       -24.41759       -0.31003945
## Gentoo      -6.756322         12.21924       -30.09759       -0.08757856
##            body_mass_g    sexMALE
## Chinstrap -0.030200987 -15.820608
## Gentoo     0.005019241  -8.132104
## 
## Std. Errors:
##           (Intercept) culmen_length_mm culmen_depth_mm flipper_length_mm
## Chinstrap   0.5715537        27.750159        27.43735          4.097368
## Gentoo      0.5715312         8.972629        33.25505         11.905646
##           body_mass_g  sexMALE
## Chinstrap   0.1350179 3.077204
## Gentoo      0.5848798 3.498127
## 
## Residual Deviance: 0.002996928 
## AIC: 24.003

1.6 Koefisien, OR, dan Signifikansi

multi_sum <- summary(fit_multi)
z_vals    <- multi_sum$coefficients / multi_sum$standard.errors
p_vals    <- 2 * (1 - pnorm(abs(z_vals)))

coef_df <- as.data.frame(t(multi_sum$coefficients)) |> tibble::rownames_to_column("Variabel")
se_df   <- as.data.frame(t(multi_sum$standard.errors)) |> tibble::rownames_to_column("Variabel")
pv_df   <- as.data.frame(t(p_vals)) |> tibble::rownames_to_column("Variabel")

result_multi <- coef_df |>
  tidyr::pivot_longer(-Variabel, names_to = "Kategori", values_to = "Koefisien") |>
  left_join(se_df |> tidyr::pivot_longer(-Variabel, names_to = "Kategori", values_to = "SE"),
            by = c("Variabel", "Kategori")) |>
  left_join(pv_df |> tidyr::pivot_longer(-Variabel, names_to = "Kategori", values_to = "p_value"),
            by = c("Variabel", "Kategori")) |>
  mutate(
    OR        = round(exp(Koefisien), 3),
    p_value   = round(p_value, 4),
    Koefisien = round(Koefisien, 4),
    SE        = round(SE, 4),
    Sig       = dplyr::case_when(p_value < 0.001 ~ "***", p_value < 0.01 ~ "**",
                                 p_value < 0.05 ~ "*", TRUE ~ "")
  )

result_multi |>
  kable(caption = "Koefisien dan OR regresi logistik multinomial (referensi: Adelie)") |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE)
Koefisien dan OR regresi logistik multinomial (referensi: Adelie)
Variabel Kategori Koefisien SE p_value OR Sig
(Intercept) Chinstrap -103.4222 0.5716 0.0000 0.000 ***
(Intercept) Gentoo -6.7563 0.5715 0.0000 0.001 ***
culmen_length_mm Chinstrap 16.5164 27.7502 0.5517 14893428.878
culmen_length_mm Gentoo 12.2192 8.9726 0.1732 202651.063
culmen_depth_mm Chinstrap -24.4176 27.4374 0.3735 0.000
culmen_depth_mm Gentoo -30.0976 33.2551 0.3654 0.000
flipper_length_mm Chinstrap -0.3100 4.0974 0.9397 0.733
flipper_length_mm Gentoo -0.0876 11.9056 0.9941 0.916
body_mass_g Chinstrap -0.0302 0.1350 0.8230 0.970
body_mass_g Gentoo 0.0050 0.5849 0.9932 1.005
sexMALE Chinstrap -15.8206 3.0772 0.0000 0.000 ***
sexMALE Gentoo -8.1321 3.4981 0.0201 0.000

Koefisien diinterpretasikan sebagai log-odds suatu spesies dibanding Adelie. OR > 1 berarti prediktor meningkatkan peluang masuk kategori tersebut, OR < 1 berarti menurunkan peluang.

1.7 Uji Kebaikan Model (LRT)

fit_multi_null <- nnet::multinom(species ~ 1, data = penguins_clean, trace = FALSE)

lrt_stat <- 2 * (as.numeric(logLik(fit_multi)) - as.numeric(logLik(fit_multi_null)))
lrt_df   <- length(coef(fit_multi)) - length(coef(fit_multi_null))
lrt_p    <- 1 - pchisq(lrt_stat, lrt_df)

tibble::tibble(
  `Log-Lik Model Penuh` = round(as.numeric(logLik(fit_multi)), 3),
  `Log-Lik Model Null`  = round(as.numeric(logLik(fit_multi_null)), 3),
  `G² (LRT)`            = round(lrt_stat, 3),
  `df`                  = lrt_df,
  `p-value`             = round(lrt_p, 4)
) |>
  kable(caption = "Likelihood Ratio Test: model penuh vs model null") |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE)
Likelihood Ratio Test: model penuh vs model null
Log-Lik Model Penuh Log-Lik Model Null G² (LRT) df p-value
-0.001 -350.863 701.722 10 0

1.8 Confusion Matrix & Akurasi

pred_class <- predict(fit_multi, type = "class")
conf_mat   <- table(Aktual = penguins_clean$species, Prediksi = pred_class)
conf_mat
##            Prediksi
## Aktual      Adelie Chinstrap Gentoo
##   Adelie       146         0      0
##   Chinstrap      0        68      0
##   Gentoo         0         0    119
akurasi <- sum(diag(conf_mat)) / sum(conf_mat)
cat(sprintf("\nAkurasi klasifikasi: %.2f%%\n", akurasi * 100))
## 
## Akurasi klasifikasi: 100.00%

1.9 Kurva Probabilitas Prediksi

grid_multi <- expand.grid(
  culmen_length_mm  = seq(min(penguins_clean$culmen_length_mm),
                          max(penguins_clean$culmen_length_mm), length.out = 150),
  culmen_depth_mm   = mean(penguins_clean$culmen_depth_mm),
  flipper_length_mm = mean(penguins_clean$flipper_length_mm),
  body_mass_g       = mean(penguins_clean$body_mass_g),
  sex               = factor("FEMALE", levels = levels(penguins_clean$sex))
)

grid_probs <- predict(fit_multi, newdata = grid_multi, type = "probs")

grid_plot <- grid_multi |>
  dplyr::bind_cols(as.data.frame(grid_probs)) |>
  tidyr::pivot_longer(cols = c("Adelie", "Chinstrap", "Gentoo"),
                      names_to = "Spesies", values_to = "Probabilitas")

ggplot(grid_plot, aes(x = culmen_length_mm, y = Probabilitas, colour = Spesies)) +
  geom_line(linewidth = 1.3) +
  scale_colour_manual(values = c("#2f7f73", "#5568b8", "#d18b2f")) +
  scale_y_continuous(labels = percent_format()) +
  labs(
    title    = "Kurva Probabilitas Prediksi vs Panjang Paruh",
    subtitle = "Prediktor lain dikondisikan pada rata-ratanya (sex = FEMALE)",
    x = "Panjang Paruh (mm)", y = "Probabilitas Prediksi"
  ) +
  theme_minimal(base_size = 12)


2 Regresi Logistik Ordinal

2.1 Konsep dan Formula

Regresi logistik ordinal (proportional odds model) digunakan ketika kategori respons memiliki urutan yang bermakna namun jarak antar kategori tidak perlu sama. Model mengasumsikan satu set koefisien berlaku untuk semua batas kategori (proportional odds assumption):

\[\log\frac{P(Y \leq k)}{P(Y > k)} = \alpha_k - \boldsymbol{\beta}^{\top}\mathbf{x}, \quad k = 1, \ldots, K-1\]

\(\alpha_k\) disebut intercept atau cutpoint ke-\(k\), sedangkan \(\boldsymbol{\beta}\) adalah vektor koefisien yang sama di semua cutpoint.

2.2 Persiapan Data

Data yang digunakan adalah Cervical Cancer Risk Factors Dataset (UCI). Variabel respons ordinal dibentuk dari jumlah tes diagnostik positif (Hinselmann, Schiller, Citology, Biopsy): Rendah (0 tes positif), Sedang (1), Tinggi (2), Sangat Tinggi (≥3). Prediktor: Age, Smokes, Hormonal_Contraceptives, IUD, STDs.

cervical <- read.csv("/Users/navirasmacbook/Desktop/Semester 4/Analisis Data Kategori/Tugas 3/risk_factors_cervical_cancer.csv",
                     stringsAsFactors = FALSE,
                     na.strings       = c("?", "NA", ""),
                     check.names      = TRUE)

cat("Dimensi data:", nrow(cervical), "baris x", ncol(cervical), "kolom\n")
## Dimensi data: 858 baris x 36 kolom
cervical <- cervical |>
  dplyr::rename(Hormonal_Contraceptives = Hormonal.Contraceptives) |>
  mutate(across(c(Hinselmann, Schiller, Citology, Biopsy,
                  Smokes, Hormonal_Contraceptives, IUD, STDs, Age), as.numeric))

cervical <- cervical |>
  mutate(
    skor_tes = Hinselmann + Schiller + Citology + Biopsy,
    risiko_ordinal = factor(
      dplyr::case_when(
        skor_tes == 0 ~ "Rendah",
        skor_tes == 1 ~ "Sedang",
        skor_tes == 2 ~ "Tinggi",
        skor_tes >= 3 ~ "Sangat Tinggi"
      ),
      levels = c("Rendah", "Sedang", "Tinggi", "Sangat Tinggi"),
      ordered = TRUE
    )
  )

cervical_model <- cervical |>
  dplyr::filter(!is.na(Age), !is.na(Smokes), !is.na(Hormonal_Contraceptives),
                !is.na(IUD), !is.na(STDs), !is.na(risiko_ordinal)) |>
  dplyr::select(risiko_ordinal, Age, Smokes, Hormonal_Contraceptives, IUD, STDs)

cat("Dimensi data model:", nrow(cervical_model), "baris\n")
## Dimensi data model: 726 baris
head(cervical_model) |>
  kable(caption = "Enam baris pertama data ordinal") |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE)
Enam baris pertama data ordinal
risiko_ordinal Age Smokes Hormonal_Contraceptives IUD STDs
Rendah 18 0 0 0 0
Rendah 15 0 0 0 0
Rendah 34 0 0 0 0
Rendah 52 1 1 0 0
Rendah 46 0 1 0 0
Rendah 42 0 0 0 0

2.3 Sebaran Variabel Respons

cervical_model |>
  count(risiko_ordinal) |>
  mutate(Proporsi = scales::percent(n / sum(n), accuracy = 0.1)) |>
  rename(`Tingkat Risiko` = risiko_ordinal, Frekuensi = n) |>
  kable(caption = "Frekuensi dan proporsi tingkat risiko kanker serviks") |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE)
Frekuensi dan proporsi tingkat risiko kanker serviks
Tingkat Risiko Frekuensi Proporsi
Rendah 633 87.2%
Sedang 36 5.0%
Tinggi 20 2.8%
Sangat Tinggi 37 5.1%
ggplot(cervical_model, aes(x = risiko_ordinal, fill = risiko_ordinal)) +
  geom_bar(width = 0.65, alpha = 0.92) +
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.4, fontface = "bold") +
  scale_fill_manual(values = c("#2f7f73", "#5568b8", "#d18b2f", "#b84f5a")) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.12))) +
  labs(title = "Sebaran Tingkat Risiko Kanker Serviks",
       subtitle = "Urutan: Rendah < Sedang < Tinggi < Sangat Tinggi",
       x = "Tingkat Risiko", y = "Jumlah Observasi") +
  theme_minimal(base_size = 12) +
  theme(legend.position = "none")

2.4 Eksplorasi Prediktor

ggplot(cervical_model, aes(x = risiko_ordinal, y = Age, fill = risiko_ordinal)) +
  geom_violin(alpha = 0.6, trim = FALSE) +
  geom_boxplot(width = 0.15, fill = "white", outlier.shape = 21) +
  scale_fill_manual(values = c("#2f7f73", "#5568b8", "#d18b2f", "#b84f5a")) +
  labs(title = "Sebaran Usia per Tingkat Risiko",
       x = "Tingkat Risiko", y = "Usia (tahun)") +
  theme_minimal(base_size = 12) +
  theme(legend.position = "none")

cervical_model |>
  tidyr::pivot_longer(cols = c(Smokes, Hormonal_Contraceptives, IUD, STDs),
                      names_to = "Variabel", values_to = "Nilai") |>
  group_by(risiko_ordinal, Variabel) |>
  summarise(Proporsi = mean(Nilai, na.rm = TRUE), .groups = "drop") |>
  ggplot(aes(x = risiko_ordinal, y = Proporsi, fill = risiko_ordinal)) +
  geom_col(width = 0.65, alpha = 0.9) +
  scale_fill_manual(values = c("#2f7f73", "#5568b8", "#d18b2f", "#b84f5a")) +
  scale_y_continuous(labels = percent_format()) +
  facet_wrap(~ Variabel, scales = "free_y") +
  labs(title = "Proporsi Prediktor Biner per Tingkat Risiko",
       x = "Tingkat Risiko", y = "Proporsi") +
  theme_minimal(base_size = 11) +
  theme(legend.position = "none", axis.text.x = element_text(angle = 30, hjust = 1))

2.5 Estimasi Model

fit_ord <- MASS::polr(
  risiko_ordinal ~ Age + Smokes + Hormonal_Contraceptives + IUD + STDs,
  data   = cervical_model,
  method = "logistic",
  Hess   = TRUE
)

summary(fit_ord)
## Call:
## MASS::polr(formula = risiko_ordinal ~ Age + Smokes + Hormonal_Contraceptives + 
##     IUD + STDs, data = cervical_model, Hess = TRUE, method = "logistic")
## 
## Coefficients:
##                              Value Std. Error  t value
## Age                      0.0005485    0.01301  0.04216
## Smokes                   0.4683227    0.28611  1.63687
## Hormonal_Contraceptives -0.0258956    0.23305 -0.11111
## IUD                      0.5791122    0.32449  1.78467
## STDs                     0.9210369    0.30475  3.02223
## 
## Intercepts:
##                      Value   Std. Error t value
## Rendah|Sedang         2.1944  0.3928     5.5864
## Sedang|Tinggi         2.7504  0.4020     6.8415
## Tinggi|Sangat Tinggi  3.2177  0.4142     7.7679
## 
## Residual Deviance: 737.8958 
## AIC: 753.8958

2.6 Koefisien, OR, dan Signifikansi

ord_coef <- coef(summary(fit_ord))

result_ord <- as.data.frame(ord_coef) |>
  tibble::rownames_to_column("Parameter") |>
  dplyr::rename(Koefisien = Value, SE = `Std. Error`, `t-value` = `t value`) |>
  mutate(
    `p-value` = round(2 * (1 - pnorm(abs(`t-value`))), 4),
    Jenis     = ifelse(grepl("\\|", Parameter), "Cutpoint", "Koefisien"),
    OR        = ifelse(Jenis == "Koefisien", round(exp(-Koefisien), 3), NA_real_),
    Koefisien = round(Koefisien, 4),
    SE        = round(SE, 4),
    `t-value` = round(`t-value`, 4),
    Sig       = dplyr::case_when(`p-value` < 0.001 ~ "***", `p-value` < 0.01 ~ "**",
                                 `p-value` < 0.05 ~ "*", TRUE ~ "")
  )

result_ord |>
  kable(caption = "Koefisien regresi logistik ordinal — proportional odds model") |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE)
Koefisien regresi logistik ordinal — proportional odds model
Parameter Koefisien SE t-value p-value Jenis OR Sig
Age 0.0005 0.0130 0.0422 0.9664 Koefisien 0.999
Smokes 0.4683 0.2861 1.6369 0.1017 Koefisien 0.626
Hormonal_Contraceptives -0.0259 0.2331 -0.1111 0.9115 Koefisien 1.026
IUD 0.5791 0.3245 1.7847 0.0743 Koefisien 0.560
STDs 0.9210 0.3048 3.0222 0.0025 Koefisien 0.398 **
Rendah&#124;Sedang 2.1944 0.3928 5.5864 0.0000 Cutpoint NA ***
Sedang&#124;Tinggi 2.7504 0.4020 6.8415 0.0000 Cutpoint NA ***
Tinggi&#124;Sangat Tinggi 3.2177 0.4142 7.7679 0.0000 Cutpoint NA ***

OR kumulatif dihitung sebagai \(\exp(-\hat\beta)\) sesuai konvensi polr. OR > 1 berarti prediktor meningkatkan peluang berada di kategori yang lebih tinggi; cutpoint menandai batas log-odds antar kategori yang berdekatan.

2.7 Uji Kebaikan Model (LRT)

fit_ord_null <- MASS::polr(risiko_ordinal ~ 1, data = cervical_model,
                           method = "logistic", Hess = TRUE)

lrt_ord    <- 2 * (as.numeric(logLik(fit_ord)) - as.numeric(logLik(fit_ord_null)))
lrt_ord_df <- length(coef(fit_ord)) - length(coef(fit_ord_null))
lrt_ord_p  <- 1 - pchisq(lrt_ord, lrt_ord_df)

tibble::tibble(
  `Log-Lik Model Penuh` = round(as.numeric(logLik(fit_ord)), 3),
  `Log-Lik Model Null`  = round(as.numeric(logLik(fit_ord_null)), 3),
  `G² (LRT)`            = round(lrt_ord, 3),
  `df`                  = lrt_ord_df,
  `p-value`             = round(lrt_ord_p, 4)
) |>
  kable(caption = "Likelihood Ratio Test: model penuh vs model null") |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE)
Likelihood Ratio Test: model penuh vs model null
Log-Lik Model Penuh Log-Lik Model Null G² (LRT) df p-value
-368.948 -376.888 15.881 5 0.0072

2.8 Confusion Matrix & Akurasi

pred_ord <- predict(fit_ord, type = "class")
conf_ord <- table(Aktual = cervical_model$risiko_ordinal, Prediksi = pred_ord)
conf_ord
##                Prediksi
## Aktual          Rendah Sedang Tinggi Sangat Tinggi
##   Rendah           633      0      0             0
##   Sedang            36      0      0             0
##   Tinggi            20      0      0             0
##   Sangat Tinggi     37      0      0             0
akurasi_ord <- sum(diag(conf_ord)) / sum(conf_ord)
cat(sprintf("\nAkurasi klasifikasi: %.2f%%\n", akurasi_ord * 100))
## 
## Akurasi klasifikasi: 87.19%

2.9 Kurva Probabilitas Prediksi

grid_ord <- data.frame(
  Age                     = seq(min(cervical_model$Age), max(cervical_model$Age), length.out = 120),
  Smokes                  = 0,
  Hormonal_Contraceptives = 0,
  IUD                     = 0,
  STDs                    = 0
)

grid_probs_ord <- predict(fit_ord, newdata = grid_ord, type = "probs")

grid_ord_plot <- grid_ord |>
  dplyr::bind_cols(as.data.frame(grid_probs_ord)) |>
  tidyr::pivot_longer(cols = c("Rendah", "Sedang", "Tinggi", "Sangat Tinggi"),
                      names_to = "Kategori", values_to = "Probabilitas") |>
  mutate(Kategori = factor(Kategori,
                           levels = c("Rendah", "Sedang", "Tinggi", "Sangat Tinggi")))

ggplot(grid_ord_plot, aes(x = Age, y = Probabilitas, colour = Kategori)) +
  geom_line(linewidth = 1.3) +
  scale_colour_manual(values = c("#2f7f73", "#5568b8", "#d18b2f", "#b84f5a")) +
  scale_y_continuous(labels = percent_format()) +
  labs(
    title    = "Kurva Probabilitas Prediksi vs Usia",
    subtitle = "Prediktor lain dikondisikan pada nilai nol",
    x = "Usia (tahun)", y = "Probabilitas Prediksi"
  ) +
  theme_minimal(base_size = 12)


3 Regresi Logistik Biner

3.1 Konsep dan Formula

Regresi logistik biner memodelkan peluang kejadian (\(Y=1\)) sebagai fungsi dari prediktor menggunakan fungsi logit sebagai link:

\[\log\frac{P(Y=1)}{1 - P(Y=1)} = \beta_0 + \beta_1 X_1 + \cdots + \beta_p X_p\]

Eksponensial koefisien menghasilkan Odds Ratio (OR) yang mengukur perubahan odds kejadian untuk setiap satu satuan kenaikan prediktor.

3.2 Persiapan Data

Data yang digunakan adalah Maternal Health Risk Dataset (UCI). Variabel RiskLevel dikodekan biner: high risk = 1, selain itu = 0. Prediktor: Age, SystolicBP, DiastolicBP, BS, BodyTemp, HeartRate.

maternal <- read.csv("/Users/navirasmacbook/Desktop/Semester 4/Analisis Data Kategori/Tugas 3/Maternal Health Risk Data Set.csv",
                     stringsAsFactors = FALSE)

maternal <- maternal |>
  mutate(high_risk = as.integer(RiskLevel == "high risk"))

cat("Dimensi data:", nrow(maternal), "baris x", ncol(maternal), "kolom\n")
## Dimensi data: 1014 baris x 8 kolom
head(maternal) |>
  kable(caption = "Enam baris pertama data maternal health") |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE)
Enam baris pertama data maternal health
Age SystolicBP DiastolicBP BS BodyTemp HeartRate RiskLevel high_risk
25 130 80 15.00 98 86 high risk 1
35 140 90 13.00 98 70 high risk 1
29 90 70 8.00 100 80 high risk 1
30 140 85 7.00 98 70 high risk 1
35 120 60 6.10 98 76 low risk 0
23 140 80 7.01 98 70 high risk 1

3.3 Sebaran Variabel Respons

maternal |>
  count(RiskLevel) |>
  mutate(Proporsi = scales::percent(n / sum(n), accuracy = 0.1)) |>
  rename(`Level Risiko` = RiskLevel, Frekuensi = n) |>
  kable(caption = "Frekuensi dan proporsi tingkat risiko kesehatan ibu") |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE)
Frekuensi dan proporsi tingkat risiko kesehatan ibu
Level Risiko Frekuensi Proporsi
high risk 272 26.8%
low risk 406 40.0%
mid risk 336 33.1%
ggplot(maternal, aes(x = factor(high_risk, labels = c("Bukan High Risk", "High Risk")),
                     fill = factor(high_risk))) +
  geom_bar(width = 0.6, alpha = 0.9) +
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.4, fontface = "bold") +
  scale_fill_manual(values = c("#2a9d8f", "#e76f51")) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.12))) +
  labs(title = "Sebaran Variabel Respons Biner",
       x = "Status Risiko", y = "Jumlah Observasi") +
  theme_minimal(base_size = 12) +
  theme(legend.position = "none")

3.4 Eksplorasi Prediktor

ggplot(maternal, aes(x = factor(high_risk, labels = c("Bukan High Risk", "High Risk")),
                     y = SystolicBP, fill = factor(high_risk))) +
  geom_violin(alpha = 0.6, trim = FALSE) +
  geom_boxplot(width = 0.12, fill = "white", outlier.shape = 21) +
  scale_fill_manual(values = c("#2a9d8f", "#e76f51")) +
  labs(title = "Tekanan Darah Sistolik per Status Risiko",
       x = "Status Risiko", y = "Tekanan Darah Sistolik (mmHg)") +
  theme_minimal(base_size = 12) +
  theme(legend.position = "none")

ggplot(maternal, aes(x = Age, y = BS,
                     colour = factor(high_risk, labels = c("Bukan High Risk", "High Risk")))) +
  geom_point(alpha = 0.55, size = 1.8) +
  scale_colour_manual(values = c("#2a9d8f", "#e76f51")) +
  labs(title = "Usia vs Kadar Gula Darah",
       x = "Usia (tahun)", y = "Kadar Gula Darah (mmol/L)",
       colour = "Status Risiko") +
  theme_minimal(base_size = 12)

3.5 Estimasi Model

set.seed(42)
train_idx <- sample(seq_len(nrow(maternal)), size = floor(0.8 * nrow(maternal)))
train_mat <- maternal[train_idx, ]
test_mat  <- maternal[-train_idx, ]

fit_biner <- glm(
  high_risk ~ Age + SystolicBP + DiastolicBP + BS + BodyTemp + HeartRate,
  data   = train_mat,
  family = binomial(link = "logit")
)

summary(fit_biner)
## 
## Call:
## glm(formula = high_risk ~ Age + SystolicBP + DiastolicBP + BS + 
##     BodyTemp + HeartRate, family = binomial(link = "logit"), 
##     data = train_mat)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -68.09573    8.36954  -8.136 4.08e-16 ***
## Age          -0.01496    0.01087  -1.376  0.16886    
## SystolicBP    0.02332    0.01095   2.129  0.03328 *  
## DiastolicBP   0.05075    0.01360   3.732  0.00019 ***
## BS            0.46388    0.04891   9.484  < 2e-16 ***
## BodyTemp      0.52932    0.07837   6.754 1.43e-11 ***
## HeartRate     0.05774    0.01414   4.085 4.41e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 944.11  on 810  degrees of freedom
## Residual deviance: 549.47  on 804  degrees of freedom
## AIC: 563.47
## 
## Number of Fisher Scoring iterations: 6

3.6 Koefisien, OR, dan Signifikansi

biner_result <- broom::tidy(fit_biner) |>
  mutate(
    OR        = round(exp(estimate), 3),
    CI_low    = round(exp(estimate - 1.96 * std.error), 3),
    CI_high   = round(exp(estimate + 1.96 * std.error), 3),
    p.value   = round(p.value, 4),
    estimate  = round(estimate, 4),
    Sig       = dplyr::case_when(p.value < 0.001 ~ "***", p.value < 0.01 ~ "**",
                                 p.value < 0.05 ~ "*", TRUE ~ "")
  ) |>
  dplyr::rename(Variabel = term, Koefisien = estimate, SE = std.error,
                `z-value` = statistic, `p-value` = p.value,
                `CI 2.5%` = CI_low, `CI 97.5%` = CI_high)

biner_result |>
  kable(caption = "Koefisien dan Odds Ratio regresi logistik biner") |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE)
Koefisien dan Odds Ratio regresi logistik biner
Variabel Koefisien SE z-value p-value OR CI 2.5% CI 97.5% Sig
(Intercept) -68.0957 8.3695354 -8.136143 0.0000 0.000 0.000 0.000 ***
Age -0.0150 0.0108730 -1.375880 0.1689 0.985 0.964 1.006
SystolicBP 0.0233 0.0109531 2.128684 0.0333 1.024 1.002 1.046
DiastolicBP 0.0507 0.0135996 3.731511 0.0002 1.052 1.024 1.080 ***
BS 0.4639 0.0489123 9.483909 0.0000 1.590 1.445 1.750 ***
BodyTemp 0.5293 0.0783667 6.754403 0.0000 1.698 1.456 1.980 ***
HeartRate 0.0577 0.0141363 4.084724 0.0000 1.059 1.030 1.089 ***

OR > 1 berarti prediktor meningkatkan odds high risk; OR < 1 berarti menurunkan. Interval kepercayaan 95% yang tidak mencakup 1 mengonfirmasi signifikansi statistik.

3.7 Confusion Matrix & Akurasi

p_test           <- predict(fit_biner, newdata = test_mat, type = "response")
pred_class_biner <- ifelse(p_test >= 0.5, 1, 0)

conf_biner    <- table(Aktual = test_mat$high_risk, Prediksi = pred_class_biner)
conf_biner
##       Prediksi
## Aktual   0   1
##      0 138  11
##      1  20  34
akurasi_biner <- sum(diag(conf_biner)) / sum(conf_biner)
cat(sprintf("\nAkurasi pada data testing: %.2f%%\n", akurasi_biner * 100))
## 
## Akurasi pada data testing: 84.73%

3.8 Kurva Probabilitas Prediksi

grid_biner <- data.frame(
  Age         = mean(train_mat$Age),
  SystolicBP  = seq(min(maternal$SystolicBP), max(maternal$SystolicBP), length.out = 150),
  DiastolicBP = mean(train_mat$DiastolicBP),
  BS          = mean(train_mat$BS),
  BodyTemp    = mean(train_mat$BodyTemp),
  HeartRate   = mean(train_mat$HeartRate)
)

grid_biner$prob <- predict(fit_biner, newdata = grid_biner, type = "response")

ggplot(grid_biner, aes(x = SystolicBP, y = prob)) +
  geom_ribbon(aes(ymin = 0, ymax = prob), fill = "#e76f51", alpha = 0.15) +
  geom_line(colour = "#e76f51", linewidth = 1.4) +
  geom_hline(yintercept = 0.5, linetype = "dashed", colour = "grey40") +
  scale_y_continuous(labels = percent_format(), limits = c(0, 1)) +
  labs(
    title    = "Kurva Sigmoid: Probabilitas High Risk vs Tekanan Darah Sistolik",
    subtitle = "Prediktor lain dikondisikan pada nilai rata-rata | garis putus-putus = cutoff 50%",
    x = "Tekanan Darah Sistolik (mmHg)", y = "P(High Risk)"
  ) +
  theme_minimal(base_size = 12)


4 Regresi Poisson

4.1 Konsep dan Formula

Regresi Poisson digunakan untuk memodelkan data cacahan (count data) — variabel respons berupa bilangan bulat non-negatif (\(Y \in \{0, 1, 2, \ldots\}\)). Model mengasumsikan \(Y \sim \text{Poisson}(\mu)\) dengan fungsi link logaritma:

\[\log(\mu_i) = \beta_0 + \beta_1 X_{1i} + \cdots + \beta_p X_{pi}\]

Eksponensial koefisien menghasilkan Incidence Rate Ratio (IRR) yang mengukur perubahan relatif rata-rata cacahan.

4.2 Persiapan Data

Data yang digunakan adalah Student Awards UCLA OARC. Variabel respons adalah num_awards (jumlah penghargaan per mahasiswa). Prediktor: prog (program studi: General, Academic, Vocational) dan math (nilai ujian matematika).

students <- read.csv("/Users/navirasmacbook/Desktop/Semester 4/Analisis Data Kategori/Tugas 3/Student Awards — UCLA OARC.csv", sep = ";",
                     stringsAsFactors = FALSE)

students <- students |>
  mutate(prog = factor(prog, levels = 1:3,
                       labels = c("General", "Academic", "Vocational")))

cat("Dimensi data:", nrow(students), "baris x", ncol(students), "kolom\n")
## Dimensi data: 200 baris x 4 kolom
head(students, 8) |>
  kable(caption = "Delapan baris pertama data student awards") |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE)
Delapan baris pertama data student awards
id num_awards prog math
45 0 Vocational 41
108 0 General 41
15 0 Vocational 44
67 0 Vocational 42
153 0 Vocational 40
51 0 General 42
164 0 Vocational 46
133 0 Vocational 40

4.3 Sebaran Variabel Respons

students |>
  count(num_awards) |>
  mutate(Proporsi = scales::percent(n / sum(n), accuracy = 0.1)) |>
  rename(`Jumlah Penghargaan` = num_awards, Frekuensi = n) |>
  kable(caption = "Frekuensi dan proporsi jumlah penghargaan mahasiswa") |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE)
Frekuensi dan proporsi jumlah penghargaan mahasiswa
Jumlah Penghargaan Frekuensi Proporsi
0 124 62.0%
1 49 24.5%
2 13 6.5%
3 9 4.5%
4 2 1.0%
5 2 1.0%
6 1 0.5%
ggplot(students, aes(x = num_awards)) +
  geom_bar(fill = "#2f7f73", colour = "white", width = 0.7) +
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.4, fontface = "bold") +
  scale_x_continuous(breaks = 0:max(students$num_awards)) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.12))) +
  labs(title = "Sebaran Jumlah Penghargaan Mahasiswa",
       subtitle = "Data cacahan dengan banyak nilai nol — tipikal distribusi Poisson",
       x = "Jumlah Penghargaan", y = "Frekuensi") +
  theme_minimal(base_size = 12)

4.4 Eksplorasi Prediktor

students |>
  group_by(prog) |>
  summarise(n = n(), `Rata-rata` = round(mean(num_awards), 3),
            Varians = round(var(num_awards), 3), .groups = "drop") |>
  rename(Program = prog) |>
  kable(caption = "Rata-rata dan varians jumlah penghargaan per program studi") |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE)
Rata-rata dan varians jumlah penghargaan per program studi
Program n Rata-rata Varians
General 45 0.20 0.164
Academic 105 1.00 1.635
Vocational 50 0.24 0.268
ggplot(students, aes(x = math, y = num_awards, colour = prog)) +
  geom_jitter(height = 0.15, alpha = 0.65, size = 2) +
  geom_smooth(method = "loess", se = FALSE, linewidth = 1.1) +
  scale_colour_manual(values = c("#2f7f73", "#5568b8", "#d18b2f")) +
  labs(title = "Nilai Matematika vs Jumlah Penghargaan per Program",
       x = "Nilai Matematika", y = "Jumlah Penghargaan",
       colour = "Program") +
  theme_minimal(base_size = 12)

4.5 Estimasi Model

fit_pois <- glm(
  num_awards ~ prog + math,
  data   = students,
  family = poisson(link = "log")
)

summary(fit_pois)
## 
## Call:
## glm(formula = num_awards ~ prog + math, family = poisson(link = "log"), 
##     data = students)
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -5.24712    0.65845  -7.969 1.60e-15 ***
## progAcademic    1.08386    0.35825   3.025  0.00248 ** 
## progVocational  0.36981    0.44107   0.838  0.40179    
## math            0.07015    0.01060   6.619 3.63e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 287.67  on 199  degrees of freedom
## Residual deviance: 189.45  on 196  degrees of freedom
## AIC: 373.5
## 
## Number of Fisher Scoring iterations: 6

4.6 Koefisien, IRR, dan Signifikansi

pois_result <- broom::tidy(fit_pois) |>
  mutate(
    IRR           = round(exp(estimate), 3),
    CI_low        = round(exp(estimate - 1.96 * std.error), 3),
    CI_high       = round(exp(estimate + 1.96 * std.error), 3),
    p.value       = round(p.value, 4),
    estimate      = round(estimate, 4),
    `% Perubahan` = round((exp(estimate) - 1) * 100, 2),
    Sig           = dplyr::case_when(p.value < 0.001 ~ "***", p.value < 0.01 ~ "**",
                                     p.value < 0.05 ~ "*", TRUE ~ "")
  ) |>
  dplyr::rename(Variabel = term, Koefisien = estimate, SE = std.error,
                `z-value` = statistic, `p-value` = p.value,
                `CI 2.5%` = CI_low, `CI 97.5%` = CI_high)

pois_result |>
  kable(caption = "Koefisien dan IRR regresi Poisson") |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE)
Koefisien dan IRR regresi Poisson
Variabel Koefisien SE z-value p-value IRR CI 2.5% CI 97.5% % Perubahan Sig
(Intercept) -5.2471 0.6584531 -7.968866 0.0000 0.005 0.001 0.019 -99.47 ***
progAcademic 1.0839 0.3582530 3.025402 0.0025 2.956 1.465 5.966 195.62 **
progVocational 0.3698 0.4410703 0.838436 0.4018 1.447 0.610 3.436 44.74
math 0.0702 0.0105992 6.618647 0.0000 1.073 1.051 1.095 7.27 ***

IRR > 1 berarti prediktor meningkatkan rata-rata jumlah penghargaan; IRR < 1 berarti menurunkan. Kolom % Perubahan menunjukkan besar perubahan relatif rata-rata cacahan per satu satuan kenaikan prediktor.

4.7 Cek Overdispersi

dispersion <- sum(residuals(fit_pois, type = "pearson")^2) / df.residual(fit_pois)

tibble::tibble(
  `Dispersion Pearson` = round(dispersion, 4),
  Interpretasi = dplyr::case_when(
    dispersion < 1.5 ~ "Tidak ada indikasi overdispersion berat",
    dispersion < 2.5 ~ "Ada indikasi overdispersion sedang",
    TRUE             ~ "Ada indikasi overdispersion kuat"
  )
) |>
  kable(caption = "Rasio dispersi Pearson — indikator overdispersion") |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE)
Rasio dispersi Pearson — indikator overdispersion
Dispersion Pearson Interpretasi
1.0824 Tidak ada indikasi overdispersion berat

Jika nilai dispersi Pearson jauh di atas 1, model Negative Binomial lebih sesuai dibanding Poisson.

4.8 Visualisasi Prediksi Rate Poisson

grid_pois <- expand.grid(
  math = seq(min(students$math), max(students$math), length.out = 100),
  prog = factor(c("General", "Academic", "Vocational"), levels = levels(students$prog))
)

pred_link <- predict(fit_pois, newdata = grid_pois, type = "link", se.fit = TRUE)
grid_pois <- grid_pois |>
  mutate(
    pred_rate = exp(pred_link$fit),
    ci_low    = exp(pred_link$fit - 1.96 * pred_link$se.fit),
    ci_high   = exp(pred_link$fit + 1.96 * pred_link$se.fit)
  )

ggplot(grid_pois, aes(x = math, y = pred_rate, colour = prog, fill = prog)) +
  geom_ribbon(aes(ymin = ci_low, ymax = ci_high), alpha = 0.15, colour = NA) +
  geom_line(linewidth = 1.3) +
  geom_jitter(data = students, aes(x = math, y = num_awards, colour = prog),
              height = 0.1, alpha = 0.3, size = 1.5, inherit.aes = FALSE) +
  scale_colour_manual(values = c("#2f7f73", "#5568b8", "#d18b2f")) +
  scale_fill_manual(values = c("#2f7f73", "#5568b8", "#d18b2f")) +
  labs(
    title    = "Kurva Prediksi Rate Poisson vs Nilai Matematika",
    subtitle = "Pita = interval kepercayaan 95%; titik = data aktual",
    x = "Nilai Matematika", y = "Rata-rata Prediksi Jumlah Penghargaan",
    colour = "Program", fill = "Program"
  ) +
  theme_minimal(base_size = 12)