# Instalasi paket (hapus tanda # jika belum terpasang)
# install.packages(c("nnet", "MASS", "kableExtra", "dplyr", "ggplot2", "scales", "tidyr"))

library(dplyr)
library(tidyr)
library(ggplot2)
library(scales)
library(knitr)
library(kableExtra)
library(nnet)    # multinom() untuk regresi multinomial
library(MASS)    # polr()    untuk regresi ordinal

Pendahuluan

Analisis ini menggunakan data Student Performance yang bersumber dari UCI Machine Learning Repository / Kaggle (student-por.csv). Data ini mencatat prestasi siswa sekolah menengah di dua sekolah Portugis pada mata pelajaran Bahasa Portugis, beserta informasi demografis, sosial, dan akademis.

Tujuan analisis: menerapkan empat jenis regresi logistik dari satu dataset yang sama dengan mengganti variabel respons:

Jenis Regresi	Variabel Respons (Y)	Keterangan
Binary	`lulus`	Dibuat dari G3 ≥ 10 (1 = lulus, 0 = tidak lulus)
Multinomial	`Mjob`	Pekerjaan ibu (5 kategori nominal)
Ordinal	`health`	Kondisi kesehatan siswa (1 = sangat buruk s/d 5 = sangat baik)
Poisson	`absences`	Jumlah ketidakhadiran (data cacahan, ≥ 0)

Persiapan Data

# Masukkan file student-por.csv ke folder yang sama dengan file .Rmd ini
df <- read.csv("D:/Semester 4/ADK/Project_ADK/student-por.csv", stringsAsFactors = TRUE)

# Tambah variabel yang dibutuhkan untuk analisis
df$lulus <- factor(
  ifelse(df$G3 >= 10, "Lulus", "Tidak Lulus"),
  levels = c("Tidak Lulus", "Lulus")
)

df$health_ord <- factor(
  df$health,
  levels  = 1:5,
  labels  = c("Sangat Buruk", "Buruk", "Cukup", "Baik", "Sangat Baik"),
  ordered = TRUE
)

df$Mjob <- relevel(df$Mjob, ref = "other")

glimpse(df)

## Rows: 649
## Columns: 35
## $ school     <fct> GP, GP, GP, GP, GP, GP, GP, GP, GP, GP, GP, GP, GP, GP, GP,…
## $ sex        <fct> F, F, F, F, F, M, M, F, M, M, F, F, M, M, M, F, F, F, M, M,…
## $ age        <int> 18, 17, 15, 15, 16, 16, 16, 17, 15, 15, 15, 15, 15, 15, 15,…
## $ address    <fct> U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,…
## $ famsize    <fct> GT3, GT3, LE3, GT3, GT3, LE3, LE3, GT3, LE3, GT3, GT3, GT3,…
## $ Pstatus    <fct> A, T, T, T, T, T, T, A, A, T, T, T, T, T, A, T, T, T, T, T,…
## $ Medu       <int> 4, 1, 1, 4, 3, 4, 2, 4, 3, 3, 4, 2, 4, 4, 2, 4, 4, 3, 3, 4,…
## $ Fedu       <int> 4, 1, 1, 2, 3, 3, 2, 4, 2, 4, 4, 1, 4, 3, 2, 4, 4, 3, 2, 3,…
## $ Mjob       <fct> at_home, at_home, at_home, health, other, services, other, …
## $ Fjob       <fct> teacher, other, other, services, other, other, other, teach…
## $ reason     <fct> course, course, other, home, home, reputation, home, home, …
## $ guardian   <fct> mother, father, mother, mother, father, mother, mother, mot…
## $ traveltime <int> 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 3, 1, 2, 1, 1, 1, 3, 1, 1,…
## $ studytime  <int> 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 3, 1, 2, 3, 1, 3, 2, 1, 1,…
## $ failures   <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0,…
## $ schoolsup  <fct> yes, no, yes, no, no, no, no, yes, no, no, no, no, no, no, …
## $ famsup     <fct> no, yes, no, yes, yes, yes, no, yes, yes, yes, yes, yes, ye…
## $ paid       <fct> no, no, no, no, no, no, no, no, no, no, no, no, no, no, no,…
## $ activities <fct> no, no, no, yes, no, yes, no, no, no, yes, no, yes, yes, no…
## $ nursery    <fct> yes, no, yes, yes, yes, yes, yes, yes, yes, yes, yes, yes, …
## $ higher     <fct> yes, yes, yes, yes, yes, yes, yes, yes, yes, yes, yes, yes,…
## $ internet   <fct> no, yes, yes, yes, no, yes, yes, no, yes, yes, yes, yes, ye…
## $ romantic   <fct> no, no, no, yes, no, no, no, no, no, no, no, no, no, no, ye…
## $ famrel     <int> 4, 5, 4, 3, 4, 5, 4, 4, 4, 5, 3, 5, 4, 5, 4, 4, 3, 5, 5, 3,…
## $ freetime   <int> 3, 3, 3, 2, 3, 4, 4, 1, 2, 5, 3, 2, 3, 4, 5, 4, 2, 3, 5, 1,…
## $ goout      <int> 4, 3, 2, 2, 2, 2, 4, 4, 2, 1, 3, 2, 3, 3, 2, 4, 3, 2, 5, 3,…
## $ Dalc       <int> 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1,…
## $ Walc       <int> 1, 1, 3, 1, 2, 2, 1, 1, 1, 1, 2, 1, 3, 2, 1, 2, 2, 1, 4, 3,…
## $ health     <int> 3, 3, 3, 5, 5, 5, 3, 1, 1, 5, 2, 4, 5, 3, 3, 2, 2, 4, 5, 5,…
## $ absences   <int> 4, 2, 6, 0, 0, 6, 0, 2, 0, 0, 2, 0, 0, 0, 0, 6, 10, 2, 2, 6…
## $ G1         <int> 0, 9, 12, 14, 11, 12, 13, 10, 15, 12, 14, 10, 12, 12, 14, 1…
## $ G2         <int> 11, 11, 13, 14, 13, 12, 12, 13, 16, 12, 14, 12, 13, 12, 14,…
## $ G3         <int> 11, 11, 12, 14, 13, 13, 13, 13, 17, 13, 14, 13, 12, 13, 15,…
## $ lulus      <fct> Lulus, Lulus, Lulus, Lulus, Lulus, Lulus, Lulus, Lulus, Lul…
## $ health_ord <ord> Cukup, Cukup, Cukup, Sangat Baik, Sangat Baik, Sangat Baik,…

Bagian I — Regresi Logistik Binary

Definisi

Regresi logistik binary digunakan ketika variabel respons bersifat dikotomi (dua kategori), misalnya: lulus/tidak lulus, sakit/sehat, ya/tidak.

Model ini memodelkan log-odds dari peluang suatu kejadian terjadi:

\[ \log\left(\frac{P(Y_i = 1 \mid \mathbf{x}_i)}{P(Y_i = 0 \mid \mathbf{x}_i)}\right) = \beta_0 + \beta_1 x_{i1} + \cdots + \beta_p x_{ip} \]

Sehingga peluang \(P(Y_i = 1 \mid \mathbf{x}_i)\) dapat ditulis sebagai:

\[ P(Y_i = 1 \mid \mathbf{x}_i) = \frac{\exp(\beta_0 + \beta_1 x_{i1} + \cdots + \beta_p x_{ip})}{1 + \exp(\beta_0 + \beta_1 x_{i1} + \cdots + \beta_p x_{ip})} \]

Keterangan:

\(P(Y_i = 1 \mid \mathbf{x}_i)\): peluang observasi ke-\(i\) berada pada kategori “sukses” (misalnya lulus)
\(\beta_0\): intercept (log-odds ketika semua prediktor = 0)
\(\beta_j\): koefisien regresi untuk prediktor ke-\(j\); mengukur perubahan log-odds per satu unit kenaikan \(x_j\), dengan prediktor lain konstan
\(\mathbf{x}_i = (x_{i1}, \ldots, x_{ip})^\top\): vektor prediktor untuk observasi ke-\(i\)

Jika dieksponensialkan, koefisien menjadi odds ratio (OR):

\[ OR_j = \exp(\beta_j) \]

Interpretasi:

\(OR > 1\): kenaikan prediktor meningkatkan odds kejadian
\(OR < 1\): kenaikan prediktor menurunkan odds kejadian
\(OR = 1\): prediktor tidak berasosiasi dengan respons

Distribusi Variabel Respons

tabel_lulus <- as.data.frame(table(df$lulus))
tabel_lulus$Proporsi <- round(tabel_lulus$Freq / sum(tabel_lulus$Freq), 3)
names(tabel_lulus) <- c("Kategori", "Frekuensi", "Proporsi")

kable(tabel_lulus,
      caption = "Distribusi variabel lulus/tidak lulus",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE)

Distribusi variabel lulus/tidak lulus
Kategori	Frekuensi	Proporsi
Tidak Lulus	100	0.154
Lulus	549	0.846

Estimasi Model Binary

Prediktor yang digunakan: G1 (nilai semester 1), studytime (waktu belajar), failures (jumlah kegagalan sebelumnya), sex, dan Medu (pendidikan ibu).

fit_binary <- glm(
  lulus ~ G1 + studytime + failures + sex + Medu,
  data   = df,
  family = binomial(link = "logit")
)

summary(fit_binary)

## 
## Call:
## glm(formula = lulus ~ G1 + studytime + failures + sex + Medu, 
##     family = binomial(link = "logit"), data = df)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -9.44059    1.17132  -8.060 7.64e-16 ***
## G1           1.16541    0.12415   9.387  < 2e-16 ***
## studytime    0.04898    0.22361   0.219   0.8266    
## failures    -0.37499    0.19496  -1.923   0.0544 .  
## sexM        -0.37717    0.32153  -1.173   0.2408    
## Medu         0.07771    0.14817   0.524   0.6000    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 557.79  on 648  degrees of freedom
## Residual deviance: 269.46  on 643  degrees of freedom
## AIC: 281.46
## 
## Number of Fisher Scoring iterations: 7

Ringkasan Koefisien, Odds Ratio, dan CI

b_sum <- coef(summary(fit_binary))

binary_coef <- data.frame(
  Parameter = rownames(b_sum),
  Estimate  = b_sum[, "Estimate"],
  SE        = b_sum[, "Std. Error"],
  z_value   = b_sum[, "z value"],
  p_value   = b_sum[, "Pr(>|z|)"],
  row.names = NULL
)
binary_coef$OR      <- exp(binary_coef$Estimate)
binary_coef$CI_low  <- exp(binary_coef$Estimate - 1.96 * binary_coef$SE)
binary_coef$CI_high <- exp(binary_coef$Estimate + 1.96 * binary_coef$SE)

binary_coef[, -1] <- round(binary_coef[, -1], 4)

kable(binary_coef,
      caption   = "Ringkasan hasil regresi logistik binary",
      col.names = c("Parameter", "Estimate", "SE", "z-value", "p-value",
                    "OR", "CI 2.5%", "CI 97.5%"),
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = TRUE)

Ringkasan hasil regresi logistik binary
Parameter	Estimate	SE	z-value	p-value	OR	CI 2.5%	CI 97.5%
(Intercept)	-9.4406	1.1713	-8.0598	0.0000	0.0001	0.0000	0.0008
G1	1.1654	0.1241	9.3874	0.0000	3.2072	2.5145	4.0908
studytime	0.0490	0.2236	0.2190	0.8266	1.0502	0.6775	1.6278
failures	-0.3750	0.1950	-1.9235	0.0544	0.6873	0.4690	1.0071
sexM	-0.3772	0.3215	-1.1730	0.2408	0.6858	0.3652	1.2879
Medu	0.0777	0.1482	0.5245	0.6000	1.0808	0.8084	1.4450

Prediksi Probabilitas

df$prob_lulus <- predict(fit_binary, type = "response")
df$pred_lulus <- ifelse(df$prob_lulus >= 0.5, "Lulus", "Tidak Lulus")

head(df[, c("G1", "studytime", "failures", "sex", "Medu",
            "lulus", "prob_lulus", "pred_lulus")], 8)

##   G1 studytime failures sex Medu lulus  prob_lulus  pred_lulus
## 1  0         2        0   F    4 Lulus 0.000119535 Tidak Lulus
## 2  9         2        0   F    1 Lulus 0.772726206       Lulus
## 3 12         2        0   F    1 Lulus 0.991163586       Lulus
## 4 14         3        0   F    4 Lulus 0.999346775       Lulus
## 5 11         2        0   F    3 Lulus 0.976107616       Lulus
## 6 12         2        0   M    4 Lulus 0.989808583       Lulus
## 7 13         2        0   M    2 Lulus 0.996263830       Lulus
## 8 10         2        0   F    4 Lulus 0.932284034       Lulus

Confusion Matrix

conf_binary   <- table(Aktual = df$lulus, Prediksi = df$pred_lulus)
conf_binary

##              Prediksi
## Aktual        Lulus Tidak Lulus
##   Tidak Lulus    37          63
##   Lulus         530          19

akurasi_binary <- round(sum(diag(conf_binary)) / sum(conf_binary) * 100, 2)
cat("Akurasi model binary:", akurasi_binary, "%\n")

## Akurasi model binary: 8.63 %

Interpretasi Hasil Binary

Beberapa poin interpretasi dari output model:

G1 (nilai semester 1): Koefisien positif dan signifikan. Setiap kenaikan 1 poin nilai semester 1 meningkatkan odds lulus sekitar \(\exp(\hat\beta_{G1})\) kali, dengan prediktor lain konstan. Ini masuk akal karena performa awal semester sangat prediktif terhadap kelulusan akhir.
failures (jumlah kegagalan sebelumnya): Koefisien negatif menunjukkan bahwa siswa yang pernah gagal sebelumnya memiliki odds lulus yang lebih rendah. Setiap satu kegagalan tambahan menurunkan odds lulus sekitar \((1 - OR) \times 100\%\).
studytime (waktu belajar): Koefisien positif menunjukkan bahwa semakin banyak waktu belajar, semakin besar peluang lulus.
sex: Nilai \(OR\) menunjukkan apakah ada perbedaan odds lulus antara laki-laki dan perempuan, dengan prediktor lain konstan.
Medu (pendidikan ibu): Koefisien positif berarti semakin tinggi pendidikan ibu, semakin besar odds siswa lulus.

Bagian II — Regresi Logistik Multinomial

Definisi

Regresi logistik multinomial digunakan ketika variabel respons bersifat nominal dengan lebih dari dua kategori, yaitu kategori yang tidak memiliki urutan alami.

Misalkan respons \(Y_i \in \{1, 2, \ldots, K\}\) dengan \(K\) kategori. Salah satu kategori dipilih sebagai kategori referensi (misalnya kategori \(K\)). Untuk setiap kategori \(k = 1, 2, \ldots, K-1\), model ditulis:

\[ \log\left(\frac{P(Y_i = k \mid \mathbf{x}_i)}{P(Y_i = K \mid \mathbf{x}_i)}\right) = \beta_{0k} + \beta_{1k} x_{i1} + \cdots + \beta_{pk} x_{ip} \]

Keterangan:

Persamaan di atas adalah baseline-category logit: log-odds memilih kategori \(k\) dibanding kategori referensi \(K\)
Setiap kategori non-referensi memiliki set koefisien tersendiri: \((\beta_{0k}, \beta_{1k}, \ldots, \beta_{pk})\)
Untuk \(K = 5\) kategori (seperti pekerjaan ibu), model menghasilkan 4 persamaan logit terhadap kategori referensi

Probabilitas masing-masing kategori diperoleh dari:

\[ P(Y_i = k \mid \mathbf{x}_i) = \frac{\exp(\eta_{ik})}{1 + \sum_{h=1}^{K-1} \exp(\eta_{ih})}, \quad k = 1, \ldots, K-1 \]

\[ P(Y_i = K \mid \mathbf{x}_i) = \frac{1}{1 + \sum_{h=1}^{K-1} \exp(\eta_{ih})} \]

dengan \(\eta_{ik} = \beta_{0k} + \mathbf{x}_i^\top \boldsymbol{\beta}_k\).

Jika dieksponensialkan, koefisien menjadi Relative Risk Ratio (RRR):

\[ RRR_j = \exp(\beta_{jk}) \]

\(RRR > 1\) berarti kenaikan prediktor meningkatkan risiko relatif memilih kategori \(k\) dibanding referensi.

Distribusi Variabel Respons

tabel_mjob <- as.data.frame(table(df$Mjob))
tabel_mjob$Proporsi <- round(tabel_mjob$Freq / sum(tabel_mjob$Freq), 3)
names(tabel_mjob) <- c("Pekerjaan_Ibu", "Frekuensi", "Proporsi")

kable(tabel_mjob,
      caption   = "Distribusi pekerjaan ibu siswa (Mjob) — 5 kategori nominal",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE)

Distribusi pekerjaan ibu siswa (Mjob) — 5 kategori nominal
Pekerjaan_Ibu	Frekuensi	Proporsi
other	258	0.398
at_home	135	0.208
health	48	0.074
services	136	0.210
teacher	72	0.111

Estimasi Model Multinomial

Prediktor: age, Medu (pendidikan ibu), G3 (nilai akhir), sex, address (tempat tinggal).

Kategori referensi: other.

fit_multi <- nnet::multinom(
  Mjob ~ age + Medu + G3 + sex + address,
  data  = df,
  trace = FALSE
)

summary(fit_multi)

## Call:
## nnet::multinom(formula = Mjob ~ age + Medu + G3 + sex + address, 
##     data = df, trace = FALSE)
## 
## Coefficients:
##           (Intercept)         age       Medu          G3       sexM   addressU
## at_home   -0.07330016  0.08364356 -0.5579844 -0.04177217 -0.5563386 -0.4076705
## health    -2.86021457 -0.28559189  1.5726583  0.06330081  0.2739032  0.4542943
## services  -1.46982977 -0.07024184  0.5729147  0.01340461  0.2506010  0.4153027
## teacher  -15.32068806 -0.02444046  3.8819354  0.04804528  0.6271800  0.1322211
## 
## Std. Errors:
##          (Intercept)        age      Medu         G3      sexM  addressU
## at_home     1.647488 0.09072912 0.1259000 0.03544287 0.2419814 0.2277714
## health      2.784999 0.15374698 0.2382127 0.06093466 0.3545955 0.4465768
## services    1.666461 0.09222888 0.1146346 0.03650370 0.2257295 0.2584493
## teacher     3.509735 0.14332761 0.5938882 0.05408115 0.3449681 0.4136946
## 
## Residual Deviance: 1511.184 
## AIC: 1559.184

Ringkasan Koefisien, RRR, dan p-value

m_coef <- coef(fit_multi)
m_se   <- summary(fit_multi)$standard.errors

# Susun ke data frame panjang
result_multi <- data.frame()
for (kat in rownames(m_coef)) {
  tmp <- data.frame(
    Kategori = kat,
    Variabel = colnames(m_coef),
    Estimate = as.numeric(m_coef[kat, ]),
    SE       = as.numeric(m_se[kat, ]),
    row.names = NULL
  )
  result_multi <- rbind(result_multi, tmp)
}

result_multi$z_value <- result_multi$Estimate / result_multi$SE
result_multi$p_value <- 2 * (1 - pnorm(abs(result_multi$z_value)))
result_multi$RRR     <- exp(result_multi$Estimate)
result_multi$CI_low  <- exp(result_multi$Estimate - 1.96 * result_multi$SE)
result_multi$CI_high <- exp(result_multi$Estimate + 1.96 * result_multi$SE)

result_multi[, 3:9] <- round(result_multi[, 3:9], 4)

kable(result_multi,
      caption   = "Ringkasan hasil regresi logistik multinomial (referensi: other)",
      col.names = c("Kategori", "Variabel", "Estimate", "SE",
                    "z-value", "p-value", "RRR", "CI 2.5%", "CI 97.5%"),
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = TRUE)

Ringkasan hasil regresi logistik multinomial (referensi: other)
Kategori	Variabel	Estimate	SE	z-value	p-value	RRR	CI 2.5%	CI 97.5%
at_home	(Intercept)	-0.0733	1.6475	-0.0445	0.9645	0.9293	0.0368	23.4712
at_home	age	0.0836	0.0907	0.9219	0.3566	1.0872	0.9101	1.2988
at_home	Medu	-0.5580	0.1259	-4.4320	0.0000	0.5724	0.4472	0.7326
at_home	G3	-0.0418	0.0354	-1.1786	0.2386	0.9591	0.8947	1.0281
at_home	sexM	-0.5563	0.2420	-2.2991	0.0215	0.5733	0.3568	0.9212
at_home	addressU	-0.4077	0.2278	-1.7898	0.0735	0.6652	0.4257	1.0395
health	(Intercept)	-2.8602	2.7850	-1.0270	0.3044	0.0573	0.0002	13.4420
health	age	-0.2856	0.1537	-1.8575	0.0632	0.7516	0.5560	1.0159
health	Medu	1.5727	0.2382	6.6019	0.0000	4.8194	3.0215	7.6872
health	G3	0.0633	0.0609	1.0388	0.2989	1.0653	0.9454	1.2005
health	sexM	0.2739	0.3546	0.7724	0.4399	1.3151	0.6563	2.6351
health	addressU	0.4543	0.4466	1.0173	0.3090	1.5751	0.6564	3.7795
services	(Intercept)	-1.4698	1.6665	-0.8820	0.3778	0.2300	0.0088	6.0281
services	age	-0.0702	0.0922	-0.7616	0.4463	0.9322	0.7780	1.1169
services	Medu	0.5729	0.1146	4.9977	0.0000	1.7734	1.4166	2.2202
services	G3	0.0134	0.0365	0.3672	0.7135	1.0135	0.9435	1.0887
services	sexM	0.2506	0.2257	1.1102	0.2669	1.2848	0.8254	1.9998
services	addressU	0.4153	0.2584	1.6069	0.1081	1.5148	0.9128	2.5140
teacher	(Intercept)	-15.3207	3.5097	-4.3652	0.0000	0.0000	0.0000	0.0002
teacher	age	-0.0244	0.1433	-0.1705	0.8646	0.9759	0.7369	1.2924
teacher	Medu	3.8819	0.5939	6.5365	0.0000	48.5180	15.1487	155.3928
teacher	G3	0.0480	0.0541	0.8884	0.3743	1.0492	0.9437	1.1665
teacher	sexM	0.6272	0.3450	1.8181	0.0691	1.8723	0.9522	3.6815
teacher	addressU	0.1322	0.4137	0.3196	0.7493	1.1414	0.5073	2.5678

Prediksi Probabilitas

pred_prob_multi <- predict(fit_multi, type = "probs")
head(round(pred_prob_multi, 3))

##   other at_home health services teacher
## 1 0.226   0.043  0.129    0.255   0.347
## 2 0.465   0.431  0.003    0.101   0.000
## 3 0.496   0.373  0.006    0.125   0.000
## 4 0.164   0.021  0.266    0.237   0.312
## 5 0.438   0.112  0.104    0.329   0.016
## 6 0.131   0.011  0.198    0.225   0.435

Confusion Matrix

conf_multi <- table(
  Aktual   = df$Mjob,
  Prediksi = predict(fit_multi, type = "class")
)
conf_multi

##           Prediksi
## Aktual     other at_home health services teacher
##   other      192      29      0        4      33
##   at_home     97      32      0        1       5
##   health      12       0      0        0      36
##   services   101       1      0        3      31
##   teacher      4       0      0        1      67

akurasi_multi <- round(sum(diag(conf_multi)) / sum(conf_multi) * 100, 2)
cat("Akurasi model multinomial:", akurasi_multi, "%\n")

## Akurasi model multinomial: 45.3 %

Visualisasi Probabilitas Prediksi

grid_multi <- expand.grid(
  age     = mean(df$age),
  Medu    = 1:4,
  G3      = mean(df$G3),
  sex     = factor("F", levels = levels(df$sex)),
  address = factor("U", levels = levels(df$address))
)

grid_prob  <- as.data.frame(predict(fit_multi, newdata = grid_multi, type = "probs"))
grid_plot  <- cbind(grid_multi, grid_prob)
grid_long  <- pivot_longer(grid_plot,
                            cols      = levels(df$Mjob),
                            names_to  = "Mjob",
                            values_to = "Probabilitas")

ggplot(grid_long, aes(x = factor(Medu), y = Probabilitas, fill = Mjob)) +
  geom_col(position = "stack", color = "white", linewidth = 0.3) +
  scale_y_continuous(labels = percent_format()) +
  labs(
    title    = "Prediksi Probabilitas Pekerjaan Ibu",
    subtitle = "Berdasarkan tingkat pendidikan ibu (Medu 1–4)",
    x = "Pendidikan Ibu (Medu)", y = "Probabilitas", fill = "Pekerjaan"
  ) +
  theme_minimal(base_size = 12)

Interpretasi Hasil Multinomial

Model membandingkan tiap kategori pekerjaan ibu (at_home, health, services, teacher) terhadap kategori referensi other.
RRR > 1 untuk suatu prediktor berarti kenaikan prediktor meningkatkan risiko relatif ibu berada di kategori tersebut dibanding other.
Medu (pendidikan ibu): RRR yang tinggi untuk kategori teacher atau health mengindikasikan bahwa ibu dengan pendidikan lebih tinggi lebih mungkin bekerja di bidang tersebut dibanding other.
G3 (nilai akhir): Jika koefisien positif untuk teacher, siswa dengan nilai lebih tinggi mungkin memiliki ibu yang bekerja sebagai guru, yang mencerminkan latar belakang akademis keluarga.
Perlu diperhatikan bahwa model multinomial tidak mengasumsikan urutan antar-kategori — setiap kategori berdiri sendiri relatif terhadap referensi.

Bagian III — Regresi Logistik Ordinal

Definisi

Regresi logistik ordinal digunakan ketika variabel respons bersifat kategorik dengan urutan alami, misalnya: \[\text{Sangat Buruk} < \text{Buruk} < \text{Cukup} < \text{Baik} < \text{Sangat Baik}\]

Model yang paling umum adalah cumulative logit model atau proportional odds model:

\[ \log\left(\frac{P(Y_i \leq j \mid \mathbf{x}_i)}{P(Y_i > j \mid \mathbf{x}_i)}\right) = \alpha_j + \mathbf{x}_i^\top \boldsymbol{\beta}, \quad j = 1, 2, \ldots, J-1 \]

Keterangan:

\(P(Y_i \leq j \mid \mathbf{x}_i)\): peluang kumulatif bahwa respons observasi ke-\(i\) berada pada kategori ke-\(j\) atau lebih rendah
\(\alpha_j\): cutpoint (threshold) ke-\(j\); ada sebanyak \(J-1\) cutpoint untuk \(J\) kategori
\(\boldsymbol{\beta}\): satu vektor koefisien yang berlaku untuk semua batas kumulatif — inilah yang disebut asumsi proportional odds
Asumsi ini berarti efek prediktor dianggap seragam di setiap batas kategori

Probabilitas tiap kategori diperoleh dari selisih peluang kumulatif: \[P(Y_i = j) = P(Y_i \leq j) - P(Y_i \leq j-1)\]

Catatan konvensi tanda: Fungsi MASS::polr() menggunakan konvensi \(\text{logit}\{P(Y \leq j)\} = \alpha_j - \mathbf{x}^\top \boldsymbol{\beta}\). Akibatnya, koefisien pada output polr() memiliki tanda berlawanan dengan konvensi rumus di atas. Untuk interpretasi, gunakan: \(\hat\beta_\text{slide} = -\hat\beta_\text{polr}\).

Jika dieksponensialkan: \(OR = \exp(\beta)\)

Dalam konvensi polr(): \(OR > 1\) berarti kenaikan prediktor meningkatkan odds berada di kategori lebih tinggi

Distribusi Variabel Respons

tabel_health <- as.data.frame(table(df$health_ord))
tabel_health$Proporsi <- round(tabel_health$Freq / sum(tabel_health$Freq), 3)
names(tabel_health) <- c("Kesehatan", "Frekuensi", "Proporsi")

kable(tabel_health,
      caption   = "Distribusi kondisi kesehatan siswa (health) — 5 kategori ordinal",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE)

Distribusi kondisi kesehatan siswa (health) — 5 kategori ordinal
Kesehatan	Frekuensi	Proporsi
Sangat Buruk	90	0.139
Buruk	78	0.120
Cukup	124	0.191
Baik	108	0.166
Sangat Baik	249	0.384

Estimasi Model Ordinal

Prediktor: age, sex, freetime (waktu luang), goout (frekuensi keluar), Dalc (konsumsi alkohol hari kerja).

fit_ord <- MASS::polr(
  health_ord ~ age + sex + freetime + goout + Dalc,
  data   = df,
  method = "logistic",
  Hess   = TRUE
)

summary(fit_ord)

## Call:
## MASS::polr(formula = health_ord ~ age + sex + freetime + goout + 
##     Dalc, data = df, Hess = TRUE, method = "logistic")
## 
## Coefficients:
##             Value Std. Error t value
## age      -0.01440    0.05978 -0.2409
## sexM      0.44796    0.15479  2.8939
## freetime  0.15954    0.07466  2.1369
## goout    -0.08571    0.06723 -1.2749
## Dalc      0.04837    0.08342  0.5798
## 
## Intercepts:
##                    Value   Std. Error t value
## Sangat Buruk|Buruk -1.6076  1.0242    -1.5696
## Buruk|Cukup        -0.8229  1.0229    -0.8045
## Cukup|Baik          0.0429  1.0227     0.0419
## Baik|Sangat Baik    0.7323  1.0226     0.7161
## 
## Residual Deviance: 1943.042 
## AIC: 1961.042

Ringkasan Koefisien dan Odds Ratio

o_sum <- coef(summary(fit_ord))

result_ord <- data.frame(
  Parameter = rownames(o_sum),
  Estimate  = o_sum[, "Value"],
  SE        = o_sum[, "Std. Error"],
  t_value   = o_sum[, "t value"],
  row.names = NULL
)
result_ord$p_value        <- 2 * (1 - pnorm(abs(result_ord$t_value)))
result_ord$Jenis          <- ifelse(grepl("\\|", result_ord$Parameter), "Cutpoint", "Koefisien")
result_ord$Estimate_slide <- ifelse(result_ord$Jenis == "Koefisien",
                                    -result_ord$Estimate, result_ord$Estimate)
result_ord$OR_polr        <- ifelse(result_ord$Jenis == "Koefisien",
                                    exp(result_ord$Estimate), NA)
result_ord$OR_slide       <- ifelse(result_ord$Jenis == "Koefisien",
                                    exp(result_ord$Estimate_slide), NA)
result_ord$CI_low         <- ifelse(result_ord$Jenis == "Koefisien",
                                    exp(result_ord$Estimate - 1.96 * result_ord$SE), NA)
result_ord$CI_high        <- ifelse(result_ord$Jenis == "Koefisien",
                                    exp(result_ord$Estimate + 1.96 * result_ord$SE), NA)

num_cols <- c("Estimate", "SE", "t_value", "p_value",
              "Estimate_slide", "OR_polr", "OR_slide", "CI_low", "CI_high")
result_ord[, num_cols] <- round(result_ord[, num_cols], 4)

kable(result_ord,
      caption   = "Ringkasan hasil regresi logistik ordinal",
      col.names = c("Parameter", "Est. polr", "SE", "t-value", "p-value",
                    "Jenis", "Est. slide", "OR polr", "OR slide",
                    "CI 2.5%", "CI 97.5%"),
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = TRUE)

Ringkasan hasil regresi logistik ordinal
Parameter	Est. polr	SE	t-value	p-value	Jenis	Est. slide	OR polr	OR slide	CI 2.5%	CI 97.5%
age	-0.0144	0.0598	-0.2409	0.8096	Koefisien	0.0144	0.9857	1.0145	0.8767	1.1082
sexM	0.4480	0.1548	2.8939	0.0038	Koefisien	-0.4480	1.5651	0.6389	1.1555	2.1199
freetime	0.1595	0.0747	2.1369	0.0326	Koefisien	-0.1595	1.1730	0.8525	1.0133	1.3578
goout	-0.0857	0.0672	-1.2749	0.2023	Koefisien	0.0857	0.9179	1.0895	0.8045	1.0471
Dalc	0.0484	0.0834	0.5798	0.5621	Koefisien	-0.0484	1.0496	0.9528	0.8912	1.2360
Sangat Buruk\|Buruk	-1.6076	1.0242	-1.5696	0.1165	Cutpoint	-1.6076	NA	NA	NA	NA
Buruk\|Cukup	-0.8229	1.0229	-0.8045	0.4211	Cutpoint	-0.8229	NA	NA	NA	NA
Cukup\|Baik	0.0429	1.0227	0.0419	0.9666	Cutpoint	0.0429	NA	NA	NA	NA
Baik\|Sangat Baik	0.7323	1.0226	0.7161	0.4739	Cutpoint	0.7323	NA	NA	NA	NA

Prediksi Probabilitas

pred_prob_ord <- predict(fit_ord, type = "probs")
head(round(pred_prob_ord, 3))

##   Sangat Buruk Buruk Cukup  Baik Sangat Baik
## 1        0.178 0.144 0.208 0.162       0.308
## 2        0.163 0.136 0.205 0.165       0.330
## 3        0.142 0.124 0.197 0.169       0.367
## 4        0.170 0.140 0.206 0.164       0.320
## 5        0.150 0.129 0.200 0.168       0.353
## 6        0.088 0.086 0.160 0.166       0.500

Confusion Matrix

conf_ord <- table(
  Aktual   = df$health_ord,
  Prediksi = predict(fit_ord, type = "class")
)
conf_ord

##               Prediksi
## Aktual         Sangat Buruk Buruk Cukup Baik Sangat Baik
##   Sangat Buruk            0     0     0    0          90
##   Buruk                   0     0     0    0          78
##   Cukup                   0     0     0    0         124
##   Baik                    1     0     0    0         107
##   Sangat Baik             0     0     0    0         249

akurasi_ord <- round(sum(diag(conf_ord)) / sum(conf_ord) * 100, 2)
cat("Akurasi model ordinal:", akurasi_ord, "%\n")

## Akurasi model ordinal: 38.37 %

Visualisasi Probabilitas Prediksi

grid_ord <- expand.grid(
  age      = mean(df$age),
  sex      = factor("F", levels = levels(df$sex)),
  freetime = 1:5,
  goout    = mean(df$goout),
  Dalc     = mean(df$Dalc)
)

grid_prob_ord <- as.data.frame(predict(fit_ord, newdata = grid_ord, type = "probs"))
grid_ord_plot <- cbind(grid_ord, grid_prob_ord)
grid_ord_long <- pivot_longer(grid_ord_plot,
                               cols      = levels(df$health_ord),
                               names_to  = "Kesehatan",
                               values_to = "Probabilitas")
grid_ord_long$Kesehatan <- factor(grid_ord_long$Kesehatan,
                                   levels = levels(df$health_ord))

ggplot(grid_ord_long, aes(x = freetime, y = Probabilitas, color = Kesehatan)) +
  geom_line(linewidth = 1.2) +
  scale_y_continuous(labels = percent_format()) +
  labs(
    title    = "Prediksi Probabilitas Tingkat Kesehatan",
    subtitle = "Berdasarkan waktu luang (freetime 1–5); variabel lain pada rata-rata",
    x = "Waktu Luang (freetime)", y = "Probabilitas", color = "Kesehatan"
  ) +
  theme_minimal(base_size = 12)

Interpretasi Hasil Ordinal

Konvensi penting: Output polr() menggunakan tanda berlawanan. Koefisien positif pada polr() berarti kenaikan prediktor meningkatkan odds berada di kategori lebih tinggi.
goout (frekuensi keluar): Jika koefisien positif, siswa yang lebih sering keluar memiliki odds lebih tinggi berada di kategori kesehatan yang lebih baik (dalam konvensi polr()). Namun perlu dicek signifikansinya.
Dalc (konsumsi alkohol hari kerja): Koefisien negatif (dalam konvensi polr()) mengindikasikan bahwa konsumsi alkohol lebih tinggi dikaitkan dengan odds lebih rendah berada di kategori kesehatan yang lebih baik — sesuai ekspektasi substantif.
Cutpoints (\(\alpha_j\)): Menunjukkan nilai logit kumulatif ketika semua prediktor = 0. Semakin besar jarak antar-cutpoint, semakin “lebar” rentang kategori tersebut.
Asumsi proportional odds menyatakan bahwa slope prediktor berlaku sama untuk semua batas kumulatif. Uji formal dapat dilakukan dengan ordinal::nominal_test().

Bagian IV — Regresi Poisson

Definisi

Regresi Poisson digunakan ketika variabel respons berupa data cacahan (count data), yaitu bilangan bulat nonnegatif: \(Y_i \in \{0, 1, 2, \ldots\}\).

Asumsi dasar: \(Y_i \mid \mathbf{x}_i \sim \text{Poisson}(\mu_i)\), dengan fungsi peluang:

\[ P(Y_i = y_i \mid \mathbf{x}_i) = \frac{\exp(-\mu_i)\, \mu_i^{y_i}}{y_i!}, \quad y_i = 0, 1, 2, \ldots \]

Pada distribusi Poisson berlaku sifat equidispersion: \[E(Y_i \mid \mathbf{x}_i) = \text{Var}(Y_i \mid \mathbf{x}_i) = \mu_i\]

Model menggunakan fungsi penghubung log:

\[ \log(\mu_i) = \beta_0 + \beta_1 x_{i1} + \cdots + \beta_p x_{ip} \]

Sehingga rata-rata hitungan:

\[ \mu_i = \exp(\beta_0 + \beta_1 x_{i1} + \cdots + \beta_p x_{ip}) \]

Keterangan:

\(\mu_i = E(Y_i \mid \mathbf{x}_i)\): nilai harapan hitungan untuk observasi ke-\(i\)
Fungsi log dipakai agar prediksi \(\mu_i\) selalu positif
\(\beta_j\): perubahan log rata-rata hitungan per satu unit kenaikan \(x_j\)

Jika dieksponensialkan, koefisien menjadi Incidence Rate Ratio (IRR):

\[ IRR_j = \exp(\beta_j) \]

\(IRR > 1\): kenaikan prediktor meningkatkan rata-rata hitungan
\(IRR < 1\): kenaikan prediktor menurunkan rata-rata hitungan
Persentase perubahan: \(100 \times (\exp(\beta_j) - 1)\%\)

Distribusi Variabel Respons

tabel_abs <- as.data.frame(table(df$absences))
tabel_abs$Proporsi <- round(tabel_abs$Freq / sum(tabel_abs$Freq), 3)
names(tabel_abs) <- c("Jumlah_Absen", "Frekuensi", "Proporsi")

kable(head(tabel_abs, 15),
      caption   = "Distribusi jumlah ketidakhadiran siswa (absences) — 15 nilai pertama",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE)

Distribusi jumlah ketidakhadiran siswa (absences) — 15 nilai pertama
Jumlah_Absen	Frekuensi	Proporsi
0	244	0.376
1	12	0.018
2	110	0.169
3	7	0.011
4	93	0.143
5	12	0.018
6	49	0.076
7	3	0.005
8	42	0.065
9	7	0.011
10	21	0.032
11	5	0.008
12	12	0.018
13	1	0.002
14	8	0.012

ggplot(df, aes(x = absences)) +
  geom_histogram(binwidth = 1, fill = "#2f7f73", color = "white", alpha = 0.9) +
  labs(
    title    = "Distribusi Jumlah Ketidakhadiran (absences)",
    subtitle = "Data cacahan: bilangan bulat nonnegatif — cocok untuk regresi Poisson",
    x = "Jumlah ketidakhadiran", y = "Frekuensi"
  ) +
  theme_minimal(base_size = 12)

Estimasi Model Poisson

Prediktor: studytime (waktu belajar), failures (kegagalan sebelumnya), goout (frekuensi keluar), sex, famrel (kualitas hubungan keluarga).

fit_pois <- glm(
  absences ~ studytime + failures + goout + sex + famrel,
  data   = df,
  family = poisson(link = "log")
)

summary(fit_pois)

## 
## Call:
## glm(formula = absences ~ studytime + failures + goout + sex + 
##     famrel, family = poisson(link = "log"), data = df)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  1.72541    0.11120  15.516  < 2e-16 ***
## studytime   -0.16300    0.02711  -6.013 1.83e-09 ***
## failures     0.16883    0.02915   5.791 7.01e-09 ***
## goout        0.08756    0.01748   5.009 5.48e-07 ***
## sexM        -0.01515    0.04289  -0.353    0.724    
## famrel      -0.11517    0.02051  -5.614 1.97e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 3464.7  on 648  degrees of freedom
## Residual deviance: 3322.0  on 643  degrees of freedom
## AIC: 4707.2
## 
## Number of Fisher Scoring iterations: 6

Ringkasan Koefisien, IRR, dan CI

p_sum <- coef(summary(fit_pois))

pois_coef <- data.frame(
  Parameter = rownames(p_sum),
  Estimate  = p_sum[, "Estimate"],
  SE        = p_sum[, "Std. Error"],
  z_value   = p_sum[, "z value"],
  p_value   = p_sum[, "Pr(>|z|)"],
  row.names = NULL
)
pois_coef$IRR              <- exp(pois_coef$Estimate)
pois_coef$CI_low           <- exp(pois_coef$Estimate - 1.96 * pois_coef$SE)
pois_coef$CI_high          <- exp(pois_coef$Estimate + 1.96 * pois_coef$SE)
pois_coef$Perubahan_persen <- 100 * (pois_coef$IRR - 1)

pois_coef[, -1] <- round(pois_coef[, -1], 4)

kable(pois_coef,
      caption   = "Ringkasan hasil regresi Poisson",
      col.names = c("Parameter", "Estimate", "SE", "z-value", "p-value",
                    "IRR", "CI 2.5%", "CI 97.5%", "Perubahan (%)"),
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = TRUE)

Ringkasan hasil regresi Poisson
Parameter	Estimate	SE	z-value	p-value	IRR	CI 2.5%	CI 97.5%	Perubahan (%)
(Intercept)	1.7254	0.1112	15.5158	0.000	5.6148	4.5152	6.9822	461.4801
studytime	-0.1630	0.0271	-6.0126	0.000	0.8496	0.8056	0.8960	-15.0409
failures	0.1688	0.0292	5.7907	0.000	1.1839	1.1182	1.2535	18.3914
goout	0.0876	0.0175	5.0086	0.000	1.0915	1.0547	1.1295	9.1503
sexM	-0.0151	0.0429	-0.3532	0.724	0.9850	0.9055	1.0714	-1.5034
famrel	-0.1152	0.0205	-5.6142	0.000	0.8912	0.8561	0.9278	-10.8786

Pemeriksaan Overdispersi

Salah satu asumsi Poisson adalah equidispersion (mean = varians). Pelanggaran menghasilkan overdispersi. Indikatornya:

\[\hat\phi = \frac{\sum r_{P,i}^2}{df}\]

Jika \(\hat\phi \approx 1\), tidak ada masalah. Jika \(\hat\phi \gg 1\), perlu model alternatif (quasi-Poisson atau Negative Binomial).

dispersion <- sum(residuals(fit_pois, type = "pearson")^2) / df.residual(fit_pois)

interpretasi <- ifelse(dispersion < 1.5,
  "Tidak ada indikasi overdispersi berat",
  ifelse(dispersion < 2.5,
    "Ada indikasi overdispersi sedang",
    "Ada indikasi overdispersi kuat — pertimbangkan Negative Binomial"))

tabel_disp <- data.frame(
  `Dispersion Pearson` = round(dispersion, 3),
  Interpretasi         = interpretasi,
  check.names          = FALSE
)

kable(tabel_disp,
      caption   = "Pemeriksaan overdispersi model Poisson",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE)

Pemeriksaan overdispersi model Poisson
Dispersion Pearson	Interpretasi
5.642	Ada indikasi overdispersi kuat — pertimbangkan Negative Binomial

Prediksi Rate

df$prediksi_absen <- round(predict(fit_pois, type = "response"), 3)

head(df[, c("studytime", "failures", "goout", "sex",
            "famrel", "absences", "prediksi_absen")], 8)

##   studytime failures goout sex famrel absences prediksi_absen
## 1         2        0     4   F      4        4          3.629
## 2         2        0     3   F      5        2          2.963
## 3         2        0     2   F      4        6          3.046
## 4         3        0     2   F      3        0          2.904
## 5         2        0     2   F      4        0          3.046
## 6         2        0     2   M      5        6          2.674
## 7         2        0     4   M      4        0          3.574
## 8         2        0     4   F      4        2          3.629

Visualisasi Prediksi Rate

grid_pois <- expand.grid(
  studytime = mean(df$studytime),
  failures  = 0,
  goout     = 1:5,
  sex       = factor("F", levels = levels(df$sex)),
  famrel    = mean(df$famrel)
)

pred_rate <- predict(fit_pois, newdata = grid_pois, type = "link", se.fit = TRUE)

grid_pois$rate      <- exp(pred_rate$fit)
grid_pois$rate_low  <- exp(pred_rate$fit - 1.96 * pred_rate$se.fit)
grid_pois$rate_high <- exp(pred_rate$fit + 1.96 * pred_rate$se.fit)

ggplot(grid_pois, aes(x = goout, y = rate)) +
  geom_ribbon(aes(ymin = rate_low, ymax = rate_high),
              fill = "#2f7f73", alpha = 0.2) +
  geom_line(color = "#2f7f73", linewidth = 1.35) +
  geom_point(color = "#2f7f73", size = 2.5) +
  labs(
    title    = "Prediksi Rata-rata Jumlah Ketidakhadiran",
    subtitle = "Berdasarkan frekuensi keluar (goout 1–5); variabel lain pada rata-rata",
    x = "Frekuensi Keluar (goout)", y = "Prediksi jumlah absen"
  ) +
  theme_minimal(base_size = 12)

Interpretasi Hasil Poisson

failures (kegagalan sebelumnya): Jika IRR > 1, siswa yang pernah gagal lebih banyak cenderung memiliki lebih banyak ketidakhadiran. Ini konsisten secara substantif — kegagalan dan absensi saling berkaitan.
goout (frekuensi keluar): IRR > 1 berarti siswa yang lebih sering keluar memiliki rata-rata absen yang lebih tinggi. Misalnya, jika \(IRR = 1.15\), maka setiap kenaikan 1 poin goout dikaitkan dengan kenaikan rata-rata absen sekitar 15%.
studytime (waktu belajar): Koefisien negatif (IRR < 1) mengindikasikan bahwa siswa yang lebih banyak belajar cenderung lebih jarang absen.
famrel (hubungan keluarga): IRR < 1 berarti hubungan keluarga yang lebih baik dikaitkan dengan lebih sedikit ketidakhadiran.
Dispersion: Nilai dispersion Pearson yang mendekati 1 menunjukkan model Poisson sudah sesuai. Jika jauh di atas 1, pertimbangkan quasi-Poisson atau Negative Binomial.

Ringkasan Perbandingan Keempat Model

tabel_ringkasan <- data.frame(
  Jenis      = c("Binary", "Multinomial", "Ordinal", "Poisson"),
  Respons    = c("lulus (0/1)", "Mjob (5 kategori)", "health (1–5)", "absences (count)"),
  Fungsi_R   = c("glm(..., family = binomial)", "nnet::multinom()",
                 "MASS::polr()", "glm(..., family = poisson)"),
  Koefisien  = c("Log-odds → OR", "Log-odds vs referensi → RRR",
                 "Cumulative log-odds → OR", "Log rata-rata count → IRR"),
  Asumsi_Kunci = c(
    "Respons biner, observasi independen",
    "Nominal (tidak berurutan), IIA",
    "Ordinal (berurutan), proportional odds",
    "Count ≥ 0, equidispersion (mean = varians)"
  ),
  check.names = FALSE
)

kable(tabel_ringkasan,
      caption   = "Perbandingan keempat jenis regresi logistik",
      col.names = c("Jenis", "Variabel Respons", "Fungsi R",
                    "Interpretasi Koefisien", "Asumsi Kunci"),
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = TRUE)

Perbandingan keempat jenis regresi logistik
Jenis	Variabel Respons	Fungsi R	Interpretasi Koefisien	Asumsi Kunci
Binary	lulus (0/1)	glm(…, family = binomial)	Log-odds → OR	Respons biner, observasi independen
Multinomial	Mjob (5 kategori)	nnet::multinom()	Log-odds vs referensi → RRR	Nominal (tidak berurutan), IIA
Ordinal	health (1–5)	MASS::polr()	Cumulative log-odds → OR	Ordinal (berurutan), proportional odds
Poisson	absences (count)	glm(…, family = poisson)	Log rata-rata count → IRR	Count ≥ 0, equidispersion (mean = varians)

Catatan: Keempat analisis di atas menggunakan dataset yang sama (student-por.csv) dengan mengganti variabel respons. Prediktor yang digunakan di tiap model dipilih agar relevan secara substantif dengan variabel respons masing-masing.

Tugas Analisis Regresi Logistik

Nana Puspita Huwaina

2026-05-29

Pendahuluan

Persiapan Data

Bagian I — Regresi Logistik Binary

Definisi

Distribusi Variabel Respons

Estimasi Model Binary

Ringkasan Koefisien, Odds Ratio, dan CI

Prediksi Probabilitas

Confusion Matrix

Interpretasi Hasil Binary

Bagian II — Regresi Logistik Multinomial

Definisi

Distribusi Variabel Respons

Estimasi Model Multinomial

Ringkasan Koefisien, RRR, dan p-value

Prediksi Probabilitas

Confusion Matrix

Visualisasi Probabilitas Prediksi

Interpretasi Hasil Multinomial

Bagian III — Regresi Logistik Ordinal

Definisi

Distribusi Variabel Respons

Estimasi Model Ordinal

Ringkasan Koefisien dan Odds Ratio

Prediksi Probabilitas

Confusion Matrix

Visualisasi Probabilitas Prediksi

Interpretasi Hasil Ordinal

Bagian IV — Regresi Poisson

Definisi

Distribusi Variabel Respons

Estimasi Model Poisson

Ringkasan Koefisien, IRR, dan CI

Pemeriksaan Overdispersi

Prediksi Rate

Visualisasi Prediksi Rate

Interpretasi Hasil Poisson

Ringkasan Perbandingan Keempat Model