1 Regresi Logistik Multinomial

1.1 Konsep dan Formula

Regresi logistik multinomial digunakan ketika variabel respons bersifat nominal dengan tiga kategori atau lebih tanpa urutan yang bermakna. Model memodelkan log-odds setiap kategori \(k\) terhadap kategori referensi \(K\):

\[\log\frac{P(Y = k)}{P(Y = K)} = \beta_{k0} + \beta_{k1}X_1 + \cdots + \beta_{kp}X_p, \quad k = 1, \ldots, K-1\]

Eksponensial koefisien menghasilkan Odds Ratio (OR) yang menunjukkan seberapa besar suatu prediktor mengubah peluang berada di kategori \(k\) dibanding kategori referensi.

1.2 Persiapan Data

Data yang digunakan adalah Palmer Archipelago Antarctica Penguin Dataset. Variabel respons adalah species (nominal, 3 kategori: Adelie, Chinstrap, Gentoo). Prediktor: culmen_length_mm, culmen_depth_mm, flipper_length_mm, body_mass_g, dan sex.

penguins <- read.csv("/Users/navirasmacbook/Desktop/Semester 4/Analisis Data Kategori/Tugas 3/penguins_size.csv",
                     stringsAsFactors = TRUE,
                     na.strings = c("NA", ".", ""))

penguins_clean <- penguins |>
  filter(
    !is.na(culmen_length_mm), !is.na(culmen_depth_mm),
    !is.na(flipper_length_mm), !is.na(body_mass_g),
    !is.na(sex), sex %in% c("MALE", "FEMALE")
  ) |>
  mutate(species = factor(species), sex = factor(sex))

cat("Dimensi data:", nrow(penguins_clean), "baris x", ncol(penguins_clean), "kolom\n")

## Dimensi data: 333 baris x 7 kolom

head(penguins_clean) |>
  kable(caption = "Enam baris pertama data penguin") |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE)

Enam baris pertama data penguin
species	island	culmen_length_mm	culmen_depth_mm	flipper_length_mm	body_mass_g	sex
Adelie	Torgersen	39.1	18.7	181	3750	MALE
Adelie	Torgersen	39.5	17.4	186	3800	FEMALE
Adelie	Torgersen	40.3	18.0	195	3250	FEMALE
Adelie	Torgersen	36.7	19.3	193	3450	FEMALE
Adelie	Torgersen	39.3	20.6	190	3650	MALE
Adelie	Torgersen	38.9	17.8	181	3625	FEMALE

1.3 Sebaran Variabel Respons

penguins_clean |>
  count(species) |>
  mutate(Proporsi = scales::percent(n / sum(n), accuracy = 0.1)) |>
  rename(Spesies = species, Frekuensi = n) |>
  kable(caption = "Frekuensi dan proporsi tiap spesies penguin") |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE)

Frekuensi dan proporsi tiap spesies penguin
Spesies	Frekuensi	Proporsi
Adelie	146	43.8%
Chinstrap	68	20.4%
Gentoo	119	35.7%

ggplot(penguins_clean, aes(x = species, fill = species)) +
  geom_bar(width = 0.6, alpha = 0.9) +
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.4, fontface = "bold") +
  scale_fill_manual(values = c("#2f7f73", "#5568b8", "#d18b2f")) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.12))) +
  labs(title = "Sebaran Spesies Penguin", x = "Spesies", y = "Jumlah Observasi") +
  theme_minimal(base_size = 12) +
  theme(legend.position = "none")

1.4 Eksplorasi Prediktor

# Boxplot panjang paruh per spesies
ggplot(penguins_clean, aes(x = species, y = culmen_length_mm, fill = species)) +
  geom_boxplot(alpha = 0.8, outlier.shape = 21) +
  scale_fill_manual(values = c("#2f7f73", "#5568b8", "#d18b2f")) +
  labs(title = "Panjang Paruh per Spesies",
       x = "Spesies", y = "Panjang Paruh (mm)") +
  theme_minimal(base_size = 12) +
  theme(legend.position = "none")

# Scatter panjang paruh vs panjang sirip
ggplot(penguins_clean, aes(x = culmen_length_mm, y = flipper_length_mm,
                            colour = species, shape = sex)) +
  geom_point(alpha = 0.7, size = 2.2) +
  scale_colour_manual(values = c("#2f7f73", "#5568b8", "#d18b2f")) +
  labs(title = "Panjang Paruh vs Panjang Sirip",
       x = "Panjang Paruh (mm)", y = "Panjang Sirip (mm)",
       colour = "Spesies", shape = "Jenis Kelamin") +
  theme_minimal(base_size = 12)

1.5 Estimasi Model

penguins_clean$species <- relevel(penguins_clean$species, ref = "Adelie")

fit_multi <- nnet::multinom(
  species ~ culmen_length_mm + culmen_depth_mm + flipper_length_mm + body_mass_g + sex,
  data  = penguins_clean,
  trace = FALSE
)

summary(fit_multi)

## Call:
## nnet::multinom(formula = species ~ culmen_length_mm + culmen_depth_mm + 
##     flipper_length_mm + body_mass_g + sex, data = penguins_clean, 
##     trace = FALSE)
## 
## Coefficients:
##           (Intercept) culmen_length_mm culmen_depth_mm flipper_length_mm
## Chinstrap -103.422234         16.51643       -24.41759       -0.31003945
## Gentoo      -6.756322         12.21924       -30.09759       -0.08757856
##            body_mass_g    sexMALE
## Chinstrap -0.030200987 -15.820608
## Gentoo     0.005019241  -8.132104
## 
## Std. Errors:
##           (Intercept) culmen_length_mm culmen_depth_mm flipper_length_mm
## Chinstrap   0.5715537        27.750159        27.43735          4.097368
## Gentoo      0.5715312         8.972629        33.25505         11.905646
##           body_mass_g  sexMALE
## Chinstrap   0.1350179 3.077204
## Gentoo      0.5848798 3.498127
## 
## Residual Deviance: 0.002996928 
## AIC: 24.003

1.6 Koefisien, OR, dan Signifikansi

multi_sum <- summary(fit_multi)
z_vals    <- multi_sum$coefficients / multi_sum$standard.errors
p_vals    <- 2 * (1 - pnorm(abs(z_vals)))

coef_df <- as.data.frame(t(multi_sum$coefficients)) |> tibble::rownames_to_column("Variabel")
se_df   <- as.data.frame(t(multi_sum$standard.errors)) |> tibble::rownames_to_column("Variabel")
pv_df   <- as.data.frame(t(p_vals)) |> tibble::rownames_to_column("Variabel")

result_multi <- coef_df |>
  tidyr::pivot_longer(-Variabel, names_to = "Kategori", values_to = "Koefisien") |>
  left_join(se_df |> tidyr::pivot_longer(-Variabel, names_to = "Kategori", values_to = "SE"),
            by = c("Variabel", "Kategori")) |>
  left_join(pv_df |> tidyr::pivot_longer(-Variabel, names_to = "Kategori", values_to = "p_value"),
            by = c("Variabel", "Kategori")) |>
  mutate(
    OR        = round(exp(Koefisien), 3),
    p_value   = round(p_value, 4),
    Koefisien = round(Koefisien, 4),
    SE        = round(SE, 4),
    Sig       = dplyr::case_when(p_value < 0.001 ~ "***", p_value < 0.01 ~ "**",
                                 p_value < 0.05 ~ "*", TRUE ~ "")
  )

result_multi |>
  kable(caption = "Koefisien dan OR regresi logistik multinomial (referensi: Adelie)") |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE)

Koefisien dan OR regresi logistik multinomial (referensi: Adelie)
Variabel	Kategori	Koefisien	SE	p_value	OR	Sig
(Intercept)	Chinstrap	-103.4222	0.5716	0.0000	0.000	***
(Intercept)	Gentoo	-6.7563	0.5715	0.0000	0.001	***
culmen_length_mm	Chinstrap	16.5164	27.7502	0.5517	14893428.878
culmen_length_mm	Gentoo	12.2192	8.9726	0.1732	202651.063
culmen_depth_mm	Chinstrap	-24.4176	27.4374	0.3735	0.000
culmen_depth_mm	Gentoo	-30.0976	33.2551	0.3654	0.000
flipper_length_mm	Chinstrap	-0.3100	4.0974	0.9397	0.733
flipper_length_mm	Gentoo	-0.0876	11.9056	0.9941	0.916
body_mass_g	Chinstrap	-0.0302	0.1350	0.8230	0.970
body_mass_g	Gentoo	0.0050	0.5849	0.9932	1.005
sexMALE	Chinstrap	-15.8206	3.0772	0.0000	0.000	***
sexMALE	Gentoo	-8.1321	3.4981	0.0201	0.000

Koefisien diinterpretasikan sebagai log-odds suatu spesies dibanding Adelie. OR > 1 berarti prediktor meningkatkan peluang masuk kategori tersebut, OR < 1 berarti menurunkan peluang.

1.7 Uji Kebaikan Model (LRT)

fit_multi_null <- nnet::multinom(species ~ 1, data = penguins_clean, trace = FALSE)

lrt_stat <- 2 * (as.numeric(logLik(fit_multi)) - as.numeric(logLik(fit_multi_null)))
lrt_df   <- length(coef(fit_multi)) - length(coef(fit_multi_null))
lrt_p    <- 1 - pchisq(lrt_stat, lrt_df)

tibble::tibble(
  `Log-Lik Model Penuh` = round(as.numeric(logLik(fit_multi)), 3),
  `Log-Lik Model Null`  = round(as.numeric(logLik(fit_multi_null)), 3),
  `G² (LRT)`            = round(lrt_stat, 3),
  `df`                  = lrt_df,
  `p-value`             = round(lrt_p, 4)
) |>
  kable(caption = "Likelihood Ratio Test: model penuh vs model null") |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE)

Likelihood Ratio Test: model penuh vs model null
Log-Lik Model Penuh	Log-Lik Model Null	G² (LRT)	df	p-value
-0.001	-350.863	701.722	10	0

1.8 Confusion Matrix & Akurasi

pred_class <- predict(fit_multi, type = "class")
conf_mat   <- table(Aktual = penguins_clean$species, Prediksi = pred_class)
conf_mat

##            Prediksi
## Aktual      Adelie Chinstrap Gentoo
##   Adelie       146         0      0
##   Chinstrap      0        68      0
##   Gentoo         0         0    119

akurasi <- sum(diag(conf_mat)) / sum(conf_mat)
cat(sprintf("\nAkurasi klasifikasi: %.2f%%\n", akurasi * 100))

## 
## Akurasi klasifikasi: 100.00%

1.9 Kurva Probabilitas Prediksi

grid_multi <- expand.grid(
  culmen_length_mm  = seq(min(penguins_clean$culmen_length_mm),
                          max(penguins_clean$culmen_length_mm), length.out = 150),
  culmen_depth_mm   = mean(penguins_clean$culmen_depth_mm),
  flipper_length_mm = mean(penguins_clean$flipper_length_mm),
  body_mass_g       = mean(penguins_clean$body_mass_g),
  sex               = factor("FEMALE", levels = levels(penguins_clean$sex))
)

grid_probs <- predict(fit_multi, newdata = grid_multi, type = "probs")

grid_plot <- grid_multi |>
  dplyr::bind_cols(as.data.frame(grid_probs)) |>
  tidyr::pivot_longer(cols = c("Adelie", "Chinstrap", "Gentoo"),
                      names_to = "Spesies", values_to = "Probabilitas")

ggplot(grid_plot, aes(x = culmen_length_mm, y = Probabilitas, colour = Spesies)) +
  geom_line(linewidth = 1.3) +
  scale_colour_manual(values = c("#2f7f73", "#5568b8", "#d18b2f")) +
  scale_y_continuous(labels = percent_format()) +
  labs(
    title    = "Kurva Probabilitas Prediksi vs Panjang Paruh",
    subtitle = "Prediktor lain dikondisikan pada rata-ratanya (sex = FEMALE)",
    x = "Panjang Paruh (mm)", y = "Probabilitas Prediksi"
  ) +
  theme_minimal(base_size = 12)

2 Regresi Logistik Ordinal

2.1 Konsep dan Formula

Regresi logistik ordinal (proportional odds model) digunakan ketika kategori respons memiliki urutan yang bermakna namun jarak antar kategori tidak perlu sama. Model mengasumsikan satu set koefisien berlaku untuk semua batas kategori (proportional odds assumption):

\[\log\frac{P(Y \leq k)}{P(Y > k)} = \alpha_k - \boldsymbol{\beta}^{\top}\mathbf{x}, \quad k = 1, \ldots, K-1\]

\(\alpha_k\) disebut intercept atau cutpoint ke-\(k\), sedangkan \(\boldsymbol{\beta}\) adalah vektor koefisien yang sama di semua cutpoint.

2.2 Persiapan Data

Data yang digunakan adalah Cervical Cancer Risk Factors Dataset (UCI). Variabel respons ordinal dibentuk dari jumlah tes diagnostik positif (Hinselmann, Schiller, Citology, Biopsy): Rendah (0 tes positif), Sedang (1), Tinggi (2), Sangat Tinggi (≥3). Prediktor: Age, Smokes, Hormonal_Contraceptives, IUD, STDs.

cervical <- read.csv("/Users/navirasmacbook/Desktop/Semester 4/Analisis Data Kategori/Tugas 3/risk_factors_cervical_cancer.csv",
                     stringsAsFactors = FALSE,
                     na.strings       = c("?", "NA", ""),
                     check.names      = TRUE)

cat("Dimensi data:", nrow(cervical), "baris x", ncol(cervical), "kolom\n")

## Dimensi data: 858 baris x 36 kolom

cervical <- cervical |>
  dplyr::rename(Hormonal_Contraceptives = Hormonal.Contraceptives) |>
  mutate(across(c(Hinselmann, Schiller, Citology, Biopsy,
                  Smokes, Hormonal_Contraceptives, IUD, STDs, Age), as.numeric))

cervical <- cervical |>
  mutate(
    skor_tes = Hinselmann + Schiller + Citology + Biopsy,
    risiko_ordinal = factor(
      dplyr::case_when(
        skor_tes == 0 ~ "Rendah",
        skor_tes == 1 ~ "Sedang",
        skor_tes == 2 ~ "Tinggi",
        skor_tes >= 3 ~ "Sangat Tinggi"
      ),
      levels = c("Rendah", "Sedang", "Tinggi", "Sangat Tinggi"),
      ordered = TRUE
    )
  )

cervical_model <- cervical |>
  dplyr::filter(!is.na(Age), !is.na(Smokes), !is.na(Hormonal_Contraceptives),
                !is.na(IUD), !is.na(STDs), !is.na(risiko_ordinal)) |>
  dplyr::select(risiko_ordinal, Age, Smokes, Hormonal_Contraceptives, IUD, STDs)

cat("Dimensi data model:", nrow(cervical_model), "baris\n")

## Dimensi data model: 726 baris

head(cervical_model) |>
  kable(caption = "Enam baris pertama data ordinal") |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE)

Enam baris pertama data ordinal
risiko_ordinal	Age	Smokes	Hormonal_Contraceptives
Rendah	18	0	0
Rendah	15	0	0
Rendah	34	0	0
Rendah	52	1	1
Rendah	46	0	1
Rendah	42	0	0

2.3 Sebaran Variabel Respons

cervical_model |>
  count(risiko_ordinal) |>
  mutate(Proporsi = scales::percent(n / sum(n), accuracy = 0.1)) |>
  rename(`Tingkat Risiko` = risiko_ordinal, Frekuensi = n) |>
  kable(caption = "Frekuensi dan proporsi tingkat risiko kanker serviks") |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE)

Frekuensi dan proporsi tingkat risiko kanker serviks
Tingkat Risiko	Frekuensi	Proporsi
Rendah	633	87.2%
Sedang	36	5.0%
Tinggi	20	2.8%
Sangat Tinggi	37	5.1%

ggplot(cervical_model, aes(x = risiko_ordinal, fill = risiko_ordinal)) +
  geom_bar(width = 0.65, alpha = 0.92) +
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.4, fontface = "bold") +
  scale_fill_manual(values = c("#2f7f73", "#5568b8", "#d18b2f", "#b84f5a")) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.12))) +
  labs(title = "Sebaran Tingkat Risiko Kanker Serviks",
       subtitle = "Urutan: Rendah < Sedang < Tinggi < Sangat Tinggi",
       x = "Tingkat Risiko", y = "Jumlah Observasi") +
  theme_minimal(base_size = 12) +
  theme(legend.position = "none")

2.4 Eksplorasi Prediktor

ggplot(cervical_model, aes(x = risiko_ordinal, y = Age, fill = risiko_ordinal)) +
  geom_violin(alpha = 0.6, trim = FALSE) +
  geom_boxplot(width = 0.15, fill = "white", outlier.shape = 21) +
  scale_fill_manual(values = c("#2f7f73", "#5568b8", "#d18b2f", "#b84f5a")) +
  labs(title = "Sebaran Usia per Tingkat Risiko",
       x = "Tingkat Risiko", y = "Usia (tahun)") +
  theme_minimal(base_size = 12) +
  theme(legend.position = "none")

cervical_model |>
  tidyr::pivot_longer(cols = c(Smokes, Hormonal_Contraceptives, IUD, STDs),
                      names_to = "Variabel", values_to = "Nilai") |>
  group_by(risiko_ordinal, Variabel) |>
  summarise(Proporsi = mean(Nilai, na.rm = TRUE), .groups = "drop") |>
  ggplot(aes(x = risiko_ordinal, y = Proporsi, fill = risiko_ordinal)) +
  geom_col(width = 0.65, alpha = 0.9) +
  scale_fill_manual(values = c("#2f7f73", "#5568b8", "#d18b2f", "#b84f5a")) +
  scale_y_continuous(labels = percent_format()) +
  facet_wrap(~ Variabel, scales = "free_y") +
  labs(title = "Proporsi Prediktor Biner per Tingkat Risiko",
       x = "Tingkat Risiko", y = "Proporsi") +
  theme_minimal(base_size = 11) +
  theme(legend.position = "none", axis.text.x = element_text(angle = 30, hjust = 1))

2.5 Estimasi Model

fit_ord <- MASS::polr(
  risiko_ordinal ~ Age + Smokes + Hormonal_Contraceptives + IUD + STDs,
  data   = cervical_model,
  method = "logistic",
  Hess   = TRUE
)

summary(fit_ord)

## Call:
## MASS::polr(formula = risiko_ordinal ~ Age + Smokes + Hormonal_Contraceptives + 
##     IUD + STDs, data = cervical_model, Hess = TRUE, method = "logistic")
## 
## Coefficients:
##                              Value Std. Error  t value
## Age                      0.0005485    0.01301  0.04216
## Smokes                   0.4683227    0.28611  1.63687
## Hormonal_Contraceptives -0.0258956    0.23305 -0.11111
## IUD                      0.5791122    0.32449  1.78467
## STDs                     0.9210369    0.30475  3.02223
## 
## Intercepts:
##                      Value   Std. Error t value
## Rendah|Sedang         2.1944  0.3928     5.5864
## Sedang|Tinggi         2.7504  0.4020     6.8415
## Tinggi|Sangat Tinggi  3.2177  0.4142     7.7679
## 
## Residual Deviance: 737.8958 
## AIC: 753.8958

2.6 Koefisien, OR, dan Signifikansi

ord_coef <- coef(summary(fit_ord))

result_ord <- as.data.frame(ord_coef) |>
  tibble::rownames_to_column("Parameter") |>
  dplyr::rename(Koefisien = Value, SE = `Std. Error`, `t-value` = `t value`) |>
  mutate(
    `p-value` = round(2 * (1 - pnorm(abs(`t-value`))), 4),
    Jenis     = ifelse(grepl("\\|", Parameter), "Cutpoint", "Koefisien"),
    OR        = ifelse(Jenis == "Koefisien", round(exp(-Koefisien), 3), NA_real_),
    Koefisien = round(Koefisien, 4),
    SE        = round(SE, 4),
    `t-value` = round(`t-value`, 4),
    Sig       = dplyr::case_when(`p-value` < 0.001 ~ "***", `p-value` < 0.01 ~ "**",
                                 `p-value` < 0.05 ~ "*", TRUE ~ "")
  )

result_ord |>
  kable(caption = "Koefisien regresi logistik ordinal — proportional odds model") |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE)

Koefisien regresi logistik ordinal — proportional odds model
Parameter	Koefisien	SE	t-value	p-value	Jenis	OR	Sig
Age	0.0005	0.0130	0.0422	0.9664	Koefisien	0.999
Smokes	0.4683	0.2861	1.6369	0.1017	Koefisien	0.626
Hormonal_Contraceptives	-0.0259	0.2331	-0.1111	0.9115	Koefisien	1.026
IUD	0.5791	0.3245	1.7847	0.0743	Koefisien	0.560
STDs	0.9210	0.3048	3.0222	0.0025	Koefisien	0.398	**
Rendah\|Sedang	2.1944	0.3928	5.5864	0.0000	Cutpoint	NA	***
Sedang\|Tinggi	2.7504	0.4020	6.8415	0.0000	Cutpoint	NA	***
Tinggi\|Sangat Tinggi	3.2177	0.4142	7.7679	0.0000	Cutpoint	NA	***

OR kumulatif dihitung sebagai \(\exp(-\hat\beta)\) sesuai konvensi polr. OR > 1 berarti prediktor meningkatkan peluang berada di kategori yang lebih tinggi; cutpoint menandai batas log-odds antar kategori yang berdekatan.

2.7 Uji Kebaikan Model (LRT)

fit_ord_null <- MASS::polr(risiko_ordinal ~ 1, data = cervical_model,
                           method = "logistic", Hess = TRUE)

lrt_ord    <- 2 * (as.numeric(logLik(fit_ord)) - as.numeric(logLik(fit_ord_null)))
lrt_ord_df <- length(coef(fit_ord)) - length(coef(fit_ord_null))
lrt_ord_p  <- 1 - pchisq(lrt_ord, lrt_ord_df)

tibble::tibble(
  `Log-Lik Model Penuh` = round(as.numeric(logLik(fit_ord)), 3),
  `Log-Lik Model Null`  = round(as.numeric(logLik(fit_ord_null)), 3),
  `G² (LRT)`            = round(lrt_ord, 3),
  `df`                  = lrt_ord_df,
  `p-value`             = round(lrt_ord_p, 4)
) |>
  kable(caption = "Likelihood Ratio Test: model penuh vs model null") |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE)

Likelihood Ratio Test: model penuh vs model null
Log-Lik Model Penuh	Log-Lik Model Null	G² (LRT)	df	p-value
-368.948	-376.888	15.881	5	0.0072

2.8 Confusion Matrix & Akurasi

pred_ord <- predict(fit_ord, type = "class")
conf_ord <- table(Aktual = cervical_model$risiko_ordinal, Prediksi = pred_ord)
conf_ord

##                Prediksi
## Aktual          Rendah Sedang Tinggi Sangat Tinggi
##   Rendah           633      0      0             0
##   Sedang            36      0      0             0
##   Tinggi            20      0      0             0
##   Sangat Tinggi     37      0      0             0

akurasi_ord <- sum(diag(conf_ord)) / sum(conf_ord)
cat(sprintf("\nAkurasi klasifikasi: %.2f%%\n", akurasi_ord * 100))

## 
## Akurasi klasifikasi: 87.19%

2.9 Kurva Probabilitas Prediksi

grid_ord <- data.frame(
  Age                     = seq(min(cervical_model$Age), max(cervical_model$Age), length.out = 120),
  Smokes                  = 0,
  Hormonal_Contraceptives = 0,
  IUD                     = 0,
  STDs                    = 0
)

grid_probs_ord <- predict(fit_ord, newdata = grid_ord, type = "probs")

grid_ord_plot <- grid_ord |>
  dplyr::bind_cols(as.data.frame(grid_probs_ord)) |>
  tidyr::pivot_longer(cols = c("Rendah", "Sedang", "Tinggi", "Sangat Tinggi"),
                      names_to = "Kategori", values_to = "Probabilitas") |>
  mutate(Kategori = factor(Kategori,
                           levels = c("Rendah", "Sedang", "Tinggi", "Sangat Tinggi")))

ggplot(grid_ord_plot, aes(x = Age, y = Probabilitas, colour = Kategori)) +
  geom_line(linewidth = 1.3) +
  scale_colour_manual(values = c("#2f7f73", "#5568b8", "#d18b2f", "#b84f5a")) +
  scale_y_continuous(labels = percent_format()) +
  labs(
    title    = "Kurva Probabilitas Prediksi vs Usia",
    subtitle = "Prediktor lain dikondisikan pada nilai nol",
    x = "Usia (tahun)", y = "Probabilitas Prediksi"
  ) +
  theme_minimal(base_size = 12)

3 Regresi Logistik Biner

3.1 Konsep dan Formula

Regresi logistik biner memodelkan peluang kejadian (\(Y=1\)) sebagai fungsi dari prediktor menggunakan fungsi logit sebagai link:

\[\log\frac{P(Y=1)}{1 - P(Y=1)} = \beta_0 + \beta_1 X_1 + \cdots + \beta_p X_p\]

Eksponensial koefisien menghasilkan Odds Ratio (OR) yang mengukur perubahan odds kejadian untuk setiap satu satuan kenaikan prediktor.

3.2 Persiapan Data

Data yang digunakan adalah Maternal Health Risk Dataset (UCI). Variabel RiskLevel dikodekan biner: high risk = 1, selain itu = 0. Prediktor: Age, SystolicBP, DiastolicBP, BS, BodyTemp, HeartRate.

maternal <- read.csv("/Users/navirasmacbook/Desktop/Semester 4/Analisis Data Kategori/Tugas 3/Maternal Health Risk Data Set.csv",
                     stringsAsFactors = FALSE)

maternal <- maternal |>
  mutate(high_risk = as.integer(RiskLevel == "high risk"))

cat("Dimensi data:", nrow(maternal), "baris x", ncol(maternal), "kolom\n")

## Dimensi data: 1014 baris x 8 kolom

head(maternal) |>
  kable(caption = "Enam baris pertama data maternal health") |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE)

Enam baris pertama data maternal health
Age	SystolicBP	DiastolicBP	BS	BodyTemp	HeartRate	RiskLevel	high_risk
25	130	80	15.00	98	86	high risk	1
35	140	90	13.00	98	70	high risk	1
29	90	70	8.00	100	80	high risk	1
30	140	85	7.00	98	70	high risk	1
35	120	60	6.10	98	76	low risk	0
23	140	80	7.01	98	70	high risk	1

3.3 Sebaran Variabel Respons

maternal |>
  count(RiskLevel) |>
  mutate(Proporsi = scales::percent(n / sum(n), accuracy = 0.1)) |>
  rename(`Level Risiko` = RiskLevel, Frekuensi = n) |>
  kable(caption = "Frekuensi dan proporsi tingkat risiko kesehatan ibu") |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE)

Frekuensi dan proporsi tingkat risiko kesehatan ibu
Level Risiko	Frekuensi	Proporsi
high risk	272	26.8%
low risk	406	40.0%
mid risk	336	33.1%

ggplot(maternal, aes(x = factor(high_risk, labels = c("Bukan High Risk", "High Risk")),
                     fill = factor(high_risk))) +
  geom_bar(width = 0.6, alpha = 0.9) +
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.4, fontface = "bold") +
  scale_fill_manual(values = c("#2a9d8f", "#e76f51")) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.12))) +
  labs(title = "Sebaran Variabel Respons Biner",
       x = "Status Risiko", y = "Jumlah Observasi") +
  theme_minimal(base_size = 12) +
  theme(legend.position = "none")

3.4 Eksplorasi Prediktor

ggplot(maternal, aes(x = factor(high_risk, labels = c("Bukan High Risk", "High Risk")),
                     y = SystolicBP, fill = factor(high_risk))) +
  geom_violin(alpha = 0.6, trim = FALSE) +
  geom_boxplot(width = 0.12, fill = "white", outlier.shape = 21) +
  scale_fill_manual(values = c("#2a9d8f", "#e76f51")) +
  labs(title = "Tekanan Darah Sistolik per Status Risiko",
       x = "Status Risiko", y = "Tekanan Darah Sistolik (mmHg)") +
  theme_minimal(base_size = 12) +
  theme(legend.position = "none")

ggplot(maternal, aes(x = Age, y = BS,
                     colour = factor(high_risk, labels = c("Bukan High Risk", "High Risk")))) +
  geom_point(alpha = 0.55, size = 1.8) +
  scale_colour_manual(values = c("#2a9d8f", "#e76f51")) +
  labs(title = "Usia vs Kadar Gula Darah",
       x = "Usia (tahun)", y = "Kadar Gula Darah (mmol/L)",
       colour = "Status Risiko") +
  theme_minimal(base_size = 12)

3.5 Estimasi Model

set.seed(42)
train_idx <- sample(seq_len(nrow(maternal)), size = floor(0.8 * nrow(maternal)))
train_mat <- maternal[train_idx, ]
test_mat  <- maternal[-train_idx, ]

fit_biner <- glm(
  high_risk ~ Age + SystolicBP + DiastolicBP + BS + BodyTemp + HeartRate,
  data   = train_mat,
  family = binomial(link = "logit")
)

summary(fit_biner)

## 
## Call:
## glm(formula = high_risk ~ Age + SystolicBP + DiastolicBP + BS + 
##     BodyTemp + HeartRate, family = binomial(link = "logit"), 
##     data = train_mat)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -68.09573    8.36954  -8.136 4.08e-16 ***
## Age          -0.01496    0.01087  -1.376  0.16886    
## SystolicBP    0.02332    0.01095   2.129  0.03328 *  
## DiastolicBP   0.05075    0.01360   3.732  0.00019 ***
## BS            0.46388    0.04891   9.484  < 2e-16 ***
## BodyTemp      0.52932    0.07837   6.754 1.43e-11 ***
## HeartRate     0.05774    0.01414   4.085 4.41e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 944.11  on 810  degrees of freedom
## Residual deviance: 549.47  on 804  degrees of freedom
## AIC: 563.47
## 
## Number of Fisher Scoring iterations: 6

3.6 Koefisien, OR, dan Signifikansi

biner_result <- broom::tidy(fit_biner) |>
  mutate(
    OR        = round(exp(estimate), 3),
    CI_low    = round(exp(estimate - 1.96 * std.error), 3),
    CI_high   = round(exp(estimate + 1.96 * std.error), 3),
    p.value   = round(p.value, 4),
    estimate  = round(estimate, 4),
    Sig       = dplyr::case_when(p.value < 0.001 ~ "***", p.value < 0.01 ~ "**",
                                 p.value < 0.05 ~ "*", TRUE ~ "")
  ) |>
  dplyr::rename(Variabel = term, Koefisien = estimate, SE = std.error,
                `z-value` = statistic, `p-value` = p.value,
                `CI 2.5%` = CI_low, `CI 97.5%` = CI_high)

biner_result |>
  kable(caption = "Koefisien dan Odds Ratio regresi logistik biner") |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE)

Koefisien dan Odds Ratio regresi logistik biner
Variabel	Koefisien	SE	z-value	p-value	OR	CI 2.5%	CI 97.5%	Sig
(Intercept)	-68.0957	8.3695354	-8.136143	0.0000	0.000	0.000	0.000	***
Age	-0.0150	0.0108730	-1.375880	0.1689	0.985	0.964	1.006
SystolicBP	0.0233	0.0109531	2.128684	0.0333	1.024	1.002	1.046
DiastolicBP	0.0507	0.0135996	3.731511	0.0002	1.052	1.024	1.080	***
BS	0.4639	0.0489123	9.483909	0.0000	1.590	1.445	1.750	***
BodyTemp	0.5293	0.0783667	6.754403	0.0000	1.698	1.456	1.980	***
HeartRate	0.0577	0.0141363	4.084724	0.0000	1.059	1.030	1.089	***

OR > 1 berarti prediktor meningkatkan odds high risk; OR < 1 berarti menurunkan. Interval kepercayaan 95% yang tidak mencakup 1 mengonfirmasi signifikansi statistik.

3.7 Confusion Matrix & Akurasi

p_test           <- predict(fit_biner, newdata = test_mat, type = "response")
pred_class_biner <- ifelse(p_test >= 0.5, 1, 0)

conf_biner    <- table(Aktual = test_mat$high_risk, Prediksi = pred_class_biner)
conf_biner

##       Prediksi
## Aktual   0   1
##      0 138  11
##      1  20  34

akurasi_biner <- sum(diag(conf_biner)) / sum(conf_biner)
cat(sprintf("\nAkurasi pada data testing: %.2f%%\n", akurasi_biner * 100))

## 
## Akurasi pada data testing: 84.73%

3.8 Kurva Probabilitas Prediksi

grid_biner <- data.frame(
  Age         = mean(train_mat$Age),
  SystolicBP  = seq(min(maternal$SystolicBP), max(maternal$SystolicBP), length.out = 150),
  DiastolicBP = mean(train_mat$DiastolicBP),
  BS          = mean(train_mat$BS),
  BodyTemp    = mean(train_mat$BodyTemp),
  HeartRate   = mean(train_mat$HeartRate)
)

grid_biner$prob <- predict(fit_biner, newdata = grid_biner, type = "response")

ggplot(grid_biner, aes(x = SystolicBP, y = prob)) +
  geom_ribbon(aes(ymin = 0, ymax = prob), fill = "#e76f51", alpha = 0.15) +
  geom_line(colour = "#e76f51", linewidth = 1.4) +
  geom_hline(yintercept = 0.5, linetype = "dashed", colour = "grey40") +
  scale_y_continuous(labels = percent_format(), limits = c(0, 1)) +
  labs(
    title    = "Kurva Sigmoid: Probabilitas High Risk vs Tekanan Darah Sistolik",
    subtitle = "Prediktor lain dikondisikan pada nilai rata-rata | garis putus-putus = cutoff 50%",
    x = "Tekanan Darah Sistolik (mmHg)", y = "P(High Risk)"
  ) +
  theme_minimal(base_size = 12)

4 Regresi Poisson

4.1 Konsep dan Formula

Regresi Poisson digunakan untuk memodelkan data cacahan (count data) — variabel respons berupa bilangan bulat non-negatif (\(Y \in \{0, 1, 2, \ldots\}\)). Model mengasumsikan \(Y \sim \text{Poisson}(\mu)\) dengan fungsi link logaritma:

\[\log(\mu_i) = \beta_0 + \beta_1 X_{1i} + \cdots + \beta_p X_{pi}\]

Eksponensial koefisien menghasilkan Incidence Rate Ratio (IRR) yang mengukur perubahan relatif rata-rata cacahan.

4.2 Persiapan Data

Data yang digunakan adalah Student Awards UCLA OARC. Variabel respons adalah num_awards (jumlah penghargaan per mahasiswa). Prediktor: prog (program studi: General, Academic, Vocational) dan math (nilai ujian matematika).

students <- read.csv("/Users/navirasmacbook/Desktop/Semester 4/Analisis Data Kategori/Tugas 3/Student Awards — UCLA OARC.csv", sep = ";",
                     stringsAsFactors = FALSE)

students <- students |>
  mutate(prog = factor(prog, levels = 1:3,
                       labels = c("General", "Academic", "Vocational")))

cat("Dimensi data:", nrow(students), "baris x", ncol(students), "kolom\n")

## Dimensi data: 200 baris x 4 kolom

head(students, 8) |>
  kable(caption = "Delapan baris pertama data student awards") |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE)

Delapan baris pertama data student awards
id	prog	math
45	Vocational	41
108	General	41
15	Vocational	44
67	Vocational	42
153	Vocational	40
51	General	42
164	Vocational	46
133	Vocational	40

4.3 Sebaran Variabel Respons

students |>
  count(num_awards) |>
  mutate(Proporsi = scales::percent(n / sum(n), accuracy = 0.1)) |>
  rename(`Jumlah Penghargaan` = num_awards, Frekuensi = n) |>
  kable(caption = "Frekuensi dan proporsi jumlah penghargaan mahasiswa") |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE)

Frekuensi dan proporsi jumlah penghargaan mahasiswa
Jumlah Penghargaan	Frekuensi	Proporsi
0	124	62.0%
1	49	24.5%
2	13	6.5%
3	9	4.5%
4	2	1.0%
5	2	1.0%
6	1	0.5%

ggplot(students, aes(x = num_awards)) +
  geom_bar(fill = "#2f7f73", colour = "white", width = 0.7) +
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.4, fontface = "bold") +
  scale_x_continuous(breaks = 0:max(students$num_awards)) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.12))) +
  labs(title = "Sebaran Jumlah Penghargaan Mahasiswa",
       subtitle = "Data cacahan dengan banyak nilai nol — tipikal distribusi Poisson",
       x = "Jumlah Penghargaan", y = "Frekuensi") +
  theme_minimal(base_size = 12)

4.4 Eksplorasi Prediktor

students |>
  group_by(prog) |>
  summarise(n = n(), `Rata-rata` = round(mean(num_awards), 3),
            Varians = round(var(num_awards), 3), .groups = "drop") |>
  rename(Program = prog) |>
  kable(caption = "Rata-rata dan varians jumlah penghargaan per program studi") |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE)

Rata-rata dan varians jumlah penghargaan per program studi
Program	n	Rata-rata	Varians
General	45	0.20	0.164
Academic	105	1.00	1.635
Vocational	50	0.24	0.268

ggplot(students, aes(x = math, y = num_awards, colour = prog)) +
  geom_jitter(height = 0.15, alpha = 0.65, size = 2) +
  geom_smooth(method = "loess", se = FALSE, linewidth = 1.1) +
  scale_colour_manual(values = c("#2f7f73", "#5568b8", "#d18b2f")) +
  labs(title = "Nilai Matematika vs Jumlah Penghargaan per Program",
       x = "Nilai Matematika", y = "Jumlah Penghargaan",
       colour = "Program") +
  theme_minimal(base_size = 12)

4.5 Estimasi Model

fit_pois <- glm(
  num_awards ~ prog + math,
  data   = students,
  family = poisson(link = "log")
)

summary(fit_pois)

## 
## Call:
## glm(formula = num_awards ~ prog + math, family = poisson(link = "log"), 
##     data = students)
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -5.24712    0.65845  -7.969 1.60e-15 ***
## progAcademic    1.08386    0.35825   3.025  0.00248 ** 
## progVocational  0.36981    0.44107   0.838  0.40179    
## math            0.07015    0.01060   6.619 3.63e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 287.67  on 199  degrees of freedom
## Residual deviance: 189.45  on 196  degrees of freedom
## AIC: 373.5
## 
## Number of Fisher Scoring iterations: 6

4.6 Koefisien, IRR, dan Signifikansi

pois_result <- broom::tidy(fit_pois) |>
  mutate(
    IRR           = round(exp(estimate), 3),
    CI_low        = round(exp(estimate - 1.96 * std.error), 3),
    CI_high       = round(exp(estimate + 1.96 * std.error), 3),
    p.value       = round(p.value, 4),
    estimate      = round(estimate, 4),
    `% Perubahan` = round((exp(estimate) - 1) * 100, 2),
    Sig           = dplyr::case_when(p.value < 0.001 ~ "***", p.value < 0.01 ~ "**",
                                     p.value < 0.05 ~ "*", TRUE ~ "")
  ) |>
  dplyr::rename(Variabel = term, Koefisien = estimate, SE = std.error,
                `z-value` = statistic, `p-value` = p.value,
                `CI 2.5%` = CI_low, `CI 97.5%` = CI_high)

pois_result |>
  kable(caption = "Koefisien dan IRR regresi Poisson") |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE)

Koefisien dan IRR regresi Poisson
Variabel	Koefisien	SE	z-value	p-value	IRR	CI 2.5%	CI 97.5%	% Perubahan	Sig
(Intercept)	-5.2471	0.6584531	-7.968866	0.0000	0.005	0.001	0.019	-99.47	***
progAcademic	1.0839	0.3582530	3.025402	0.0025	2.956	1.465	5.966	195.62	**
progVocational	0.3698	0.4410703	0.838436	0.4018	1.447	0.610	3.436	44.74
math	0.0702	0.0105992	6.618647	0.0000	1.073	1.051	1.095	7.27	***

IRR > 1 berarti prediktor meningkatkan rata-rata jumlah penghargaan; IRR < 1 berarti menurunkan. Kolom % Perubahan menunjukkan besar perubahan relatif rata-rata cacahan per satu satuan kenaikan prediktor.

4.7 Cek Overdispersi

dispersion <- sum(residuals(fit_pois, type = "pearson")^2) / df.residual(fit_pois)

tibble::tibble(
  `Dispersion Pearson` = round(dispersion, 4),
  Interpretasi = dplyr::case_when(
    dispersion < 1.5 ~ "Tidak ada indikasi overdispersion berat",
    dispersion < 2.5 ~ "Ada indikasi overdispersion sedang",
    TRUE             ~ "Ada indikasi overdispersion kuat"
  )
) |>
  kable(caption = "Rasio dispersi Pearson — indikator overdispersion") |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE)

Rasio dispersi Pearson — indikator overdispersion
Dispersion Pearson	Interpretasi
1.0824	Tidak ada indikasi overdispersion berat

Jika nilai dispersi Pearson jauh di atas 1, model Negative Binomial lebih sesuai dibanding Poisson.

4.8 Visualisasi Prediksi Rate Poisson

grid_pois <- expand.grid(
  math = seq(min(students$math), max(students$math), length.out = 100),
  prog = factor(c("General", "Academic", "Vocational"), levels = levels(students$prog))
)

pred_link <- predict(fit_pois, newdata = grid_pois, type = "link", se.fit = TRUE)
grid_pois <- grid_pois |>
  mutate(
    pred_rate = exp(pred_link$fit),
    ci_low    = exp(pred_link$fit - 1.96 * pred_link$se.fit),
    ci_high   = exp(pred_link$fit + 1.96 * pred_link$se.fit)
  )

ggplot(grid_pois, aes(x = math, y = pred_rate, colour = prog, fill = prog)) +
  geom_ribbon(aes(ymin = ci_low, ymax = ci_high), alpha = 0.15, colour = NA) +
  geom_line(linewidth = 1.3) +
  geom_jitter(data = students, aes(x = math, y = num_awards, colour = prog),
              height = 0.1, alpha = 0.3, size = 1.5, inherit.aes = FALSE) +
  scale_colour_manual(values = c("#2f7f73", "#5568b8", "#d18b2f")) +
  scale_fill_manual(values = c("#2f7f73", "#5568b8", "#d18b2f")) +
  labs(
    title    = "Kurva Prediksi Rate Poisson vs Nilai Matematika",
    subtitle = "Pita = interval kepercayaan 95%; titik = data aktual",
    x = "Nilai Matematika", y = "Rata-rata Prediksi Jumlah Penghargaan",
    colour = "Program", fill = "Program"
  ) +
  theme_minimal(base_size = 12)

Pemodelan Regresi: Multinomial, Ordinal, Biner, dan Poisson

Navira Amalia

2026-06-01