# Instalasi paket (hapus tanda # jika belum terpasang)
# install.packages(c("nnet", "MASS", "kableExtra", "dplyr", "ggplot2", "scales", "tidyr"))
library(dplyr)
library(tidyr)
library(ggplot2)
library(scales)
library(knitr)
library(kableExtra)
library(nnet) # multinom() untuk regresi multinomial
library(MASS) # polr() untuk regresi ordinalAnalisis ini menggunakan data Student Performance
yang bersumber dari UCI Machine Learning Repository / Kaggle
(student-por.csv). Data ini mencatat prestasi siswa sekolah
menengah di dua sekolah Portugis pada mata pelajaran Bahasa Portugis,
beserta informasi demografis, sosial, dan akademis.
Tujuan analisis: menerapkan empat jenis regresi logistik dari satu dataset yang sama dengan mengganti variabel respons:
| Jenis Regresi | Variabel Respons (Y) | Keterangan |
|---|---|---|
| Binary | lulus |
Dibuat dari G3 ≥ 10 (1 = lulus, 0 = tidak lulus) |
| Multinomial | Mjob |
Pekerjaan ibu (5 kategori nominal) |
| Ordinal | health |
Kondisi kesehatan siswa (1 = sangat buruk s/d 5 = sangat baik) |
| Poisson | absences |
Jumlah ketidakhadiran (data cacahan, ≥ 0) |
# Masukkan file student-por.csv ke folder yang sama dengan file .Rmd ini
df <- read.csv("D:/Semester 4/ADK/Project_ADK/student-por.csv", stringsAsFactors = TRUE)
# Tambah variabel yang dibutuhkan untuk analisis
df$lulus <- factor(
ifelse(df$G3 >= 10, "Lulus", "Tidak Lulus"),
levels = c("Tidak Lulus", "Lulus")
)
df$health_ord <- factor(
df$health,
levels = 1:5,
labels = c("Sangat Buruk", "Buruk", "Cukup", "Baik", "Sangat Baik"),
ordered = TRUE
)
df$Mjob <- relevel(df$Mjob, ref = "other")
glimpse(df)## Rows: 649
## Columns: 35
## $ school <fct> GP, GP, GP, GP, GP, GP, GP, GP, GP, GP, GP, GP, GP, GP, GP,…
## $ sex <fct> F, F, F, F, F, M, M, F, M, M, F, F, M, M, M, F, F, F, M, M,…
## $ age <int> 18, 17, 15, 15, 16, 16, 16, 17, 15, 15, 15, 15, 15, 15, 15,…
## $ address <fct> U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,…
## $ famsize <fct> GT3, GT3, LE3, GT3, GT3, LE3, LE3, GT3, LE3, GT3, GT3, GT3,…
## $ Pstatus <fct> A, T, T, T, T, T, T, A, A, T, T, T, T, T, A, T, T, T, T, T,…
## $ Medu <int> 4, 1, 1, 4, 3, 4, 2, 4, 3, 3, 4, 2, 4, 4, 2, 4, 4, 3, 3, 4,…
## $ Fedu <int> 4, 1, 1, 2, 3, 3, 2, 4, 2, 4, 4, 1, 4, 3, 2, 4, 4, 3, 2, 3,…
## $ Mjob <fct> at_home, at_home, at_home, health, other, services, other, …
## $ Fjob <fct> teacher, other, other, services, other, other, other, teach…
## $ reason <fct> course, course, other, home, home, reputation, home, home, …
## $ guardian <fct> mother, father, mother, mother, father, mother, mother, mot…
## $ traveltime <int> 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 3, 1, 2, 1, 1, 1, 3, 1, 1,…
## $ studytime <int> 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 3, 1, 2, 3, 1, 3, 2, 1, 1,…
## $ failures <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0,…
## $ schoolsup <fct> yes, no, yes, no, no, no, no, yes, no, no, no, no, no, no, …
## $ famsup <fct> no, yes, no, yes, yes, yes, no, yes, yes, yes, yes, yes, ye…
## $ paid <fct> no, no, no, no, no, no, no, no, no, no, no, no, no, no, no,…
## $ activities <fct> no, no, no, yes, no, yes, no, no, no, yes, no, yes, yes, no…
## $ nursery <fct> yes, no, yes, yes, yes, yes, yes, yes, yes, yes, yes, yes, …
## $ higher <fct> yes, yes, yes, yes, yes, yes, yes, yes, yes, yes, yes, yes,…
## $ internet <fct> no, yes, yes, yes, no, yes, yes, no, yes, yes, yes, yes, ye…
## $ romantic <fct> no, no, no, yes, no, no, no, no, no, no, no, no, no, no, ye…
## $ famrel <int> 4, 5, 4, 3, 4, 5, 4, 4, 4, 5, 3, 5, 4, 5, 4, 4, 3, 5, 5, 3,…
## $ freetime <int> 3, 3, 3, 2, 3, 4, 4, 1, 2, 5, 3, 2, 3, 4, 5, 4, 2, 3, 5, 1,…
## $ goout <int> 4, 3, 2, 2, 2, 2, 4, 4, 2, 1, 3, 2, 3, 3, 2, 4, 3, 2, 5, 3,…
## $ Dalc <int> 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1,…
## $ Walc <int> 1, 1, 3, 1, 2, 2, 1, 1, 1, 1, 2, 1, 3, 2, 1, 2, 2, 1, 4, 3,…
## $ health <int> 3, 3, 3, 5, 5, 5, 3, 1, 1, 5, 2, 4, 5, 3, 3, 2, 2, 4, 5, 5,…
## $ absences <int> 4, 2, 6, 0, 0, 6, 0, 2, 0, 0, 2, 0, 0, 0, 0, 6, 10, 2, 2, 6…
## $ G1 <int> 0, 9, 12, 14, 11, 12, 13, 10, 15, 12, 14, 10, 12, 12, 14, 1…
## $ G2 <int> 11, 11, 13, 14, 13, 12, 12, 13, 16, 12, 14, 12, 13, 12, 14,…
## $ G3 <int> 11, 11, 12, 14, 13, 13, 13, 13, 17, 13, 14, 13, 12, 13, 15,…
## $ lulus <fct> Lulus, Lulus, Lulus, Lulus, Lulus, Lulus, Lulus, Lulus, Lul…
## $ health_ord <ord> Cukup, Cukup, Cukup, Sangat Baik, Sangat Baik, Sangat Baik,…
Regresi logistik binary digunakan ketika variabel respons bersifat dikotomi (dua kategori), misalnya: lulus/tidak lulus, sakit/sehat, ya/tidak.
Model ini memodelkan log-odds dari peluang suatu kejadian terjadi:
\[ \log\left(\frac{P(Y_i = 1 \mid \mathbf{x}_i)}{P(Y_i = 0 \mid \mathbf{x}_i)}\right) = \beta_0 + \beta_1 x_{i1} + \cdots + \beta_p x_{ip} \]
Sehingga peluang \(P(Y_i = 1 \mid \mathbf{x}_i)\) dapat ditulis sebagai:
\[ P(Y_i = 1 \mid \mathbf{x}_i) = \frac{\exp(\beta_0 + \beta_1 x_{i1} + \cdots + \beta_p x_{ip})}{1 + \exp(\beta_0 + \beta_1 x_{i1} + \cdots + \beta_p x_{ip})} \]
Keterangan:
Jika dieksponensialkan, koefisien menjadi odds ratio (OR):
\[ OR_j = \exp(\beta_j) \]
Interpretasi:
tabel_lulus <- as.data.frame(table(df$lulus))
tabel_lulus$Proporsi <- round(tabel_lulus$Freq / sum(tabel_lulus$Freq), 3)
names(tabel_lulus) <- c("Kategori", "Frekuensi", "Proporsi")
kable(tabel_lulus,
caption = "Distribusi variabel lulus/tidak lulus",
row.names = FALSE) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE)| Kategori | Frekuensi | Proporsi |
|---|---|---|
| Tidak Lulus | 100 | 0.154 |
| Lulus | 549 | 0.846 |
Prediktor yang digunakan: G1 (nilai semester 1),
studytime (waktu belajar), failures (jumlah
kegagalan sebelumnya), sex, dan Medu
(pendidikan ibu).
fit_binary <- glm(
lulus ~ G1 + studytime + failures + sex + Medu,
data = df,
family = binomial(link = "logit")
)
summary(fit_binary)##
## Call:
## glm(formula = lulus ~ G1 + studytime + failures + sex + Medu,
## family = binomial(link = "logit"), data = df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -9.44059 1.17132 -8.060 7.64e-16 ***
## G1 1.16541 0.12415 9.387 < 2e-16 ***
## studytime 0.04898 0.22361 0.219 0.8266
## failures -0.37499 0.19496 -1.923 0.0544 .
## sexM -0.37717 0.32153 -1.173 0.2408
## Medu 0.07771 0.14817 0.524 0.6000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 557.79 on 648 degrees of freedom
## Residual deviance: 269.46 on 643 degrees of freedom
## AIC: 281.46
##
## Number of Fisher Scoring iterations: 7
b_sum <- coef(summary(fit_binary))
binary_coef <- data.frame(
Parameter = rownames(b_sum),
Estimate = b_sum[, "Estimate"],
SE = b_sum[, "Std. Error"],
z_value = b_sum[, "z value"],
p_value = b_sum[, "Pr(>|z|)"],
row.names = NULL
)
binary_coef$OR <- exp(binary_coef$Estimate)
binary_coef$CI_low <- exp(binary_coef$Estimate - 1.96 * binary_coef$SE)
binary_coef$CI_high <- exp(binary_coef$Estimate + 1.96 * binary_coef$SE)
binary_coef[, -1] <- round(binary_coef[, -1], 4)
kable(binary_coef,
caption = "Ringkasan hasil regresi logistik binary",
col.names = c("Parameter", "Estimate", "SE", "z-value", "p-value",
"OR", "CI 2.5%", "CI 97.5%"),
row.names = FALSE) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = TRUE)| Parameter | Estimate | SE | z-value | p-value | OR | CI 2.5% | CI 97.5% |
|---|---|---|---|---|---|---|---|
| (Intercept) | -9.4406 | 1.1713 | -8.0598 | 0.0000 | 0.0001 | 0.0000 | 0.0008 |
| G1 | 1.1654 | 0.1241 | 9.3874 | 0.0000 | 3.2072 | 2.5145 | 4.0908 |
| studytime | 0.0490 | 0.2236 | 0.2190 | 0.8266 | 1.0502 | 0.6775 | 1.6278 |
| failures | -0.3750 | 0.1950 | -1.9235 | 0.0544 | 0.6873 | 0.4690 | 1.0071 |
| sexM | -0.3772 | 0.3215 | -1.1730 | 0.2408 | 0.6858 | 0.3652 | 1.2879 |
| Medu | 0.0777 | 0.1482 | 0.5245 | 0.6000 | 1.0808 | 0.8084 | 1.4450 |
df$prob_lulus <- predict(fit_binary, type = "response")
df$pred_lulus <- ifelse(df$prob_lulus >= 0.5, "Lulus", "Tidak Lulus")
head(df[, c("G1", "studytime", "failures", "sex", "Medu",
"lulus", "prob_lulus", "pred_lulus")], 8)## G1 studytime failures sex Medu lulus prob_lulus pred_lulus
## 1 0 2 0 F 4 Lulus 0.000119535 Tidak Lulus
## 2 9 2 0 F 1 Lulus 0.772726206 Lulus
## 3 12 2 0 F 1 Lulus 0.991163586 Lulus
## 4 14 3 0 F 4 Lulus 0.999346775 Lulus
## 5 11 2 0 F 3 Lulus 0.976107616 Lulus
## 6 12 2 0 M 4 Lulus 0.989808583 Lulus
## 7 13 2 0 M 2 Lulus 0.996263830 Lulus
## 8 10 2 0 F 4 Lulus 0.932284034 Lulus
## Prediksi
## Aktual Lulus Tidak Lulus
## Tidak Lulus 37 63
## Lulus 530 19
akurasi_binary <- round(sum(diag(conf_binary)) / sum(conf_binary) * 100, 2)
cat("Akurasi model binary:", akurasi_binary, "%\n")## Akurasi model binary: 8.63 %
Beberapa poin interpretasi dari output model:
G1 (nilai semester 1): Koefisien positif dan signifikan. Setiap kenaikan 1 poin nilai semester 1 meningkatkan odds lulus sekitar \(\exp(\hat\beta_{G1})\) kali, dengan prediktor lain konstan. Ini masuk akal karena performa awal semester sangat prediktif terhadap kelulusan akhir.
failures (jumlah kegagalan sebelumnya): Koefisien negatif menunjukkan bahwa siswa yang pernah gagal sebelumnya memiliki odds lulus yang lebih rendah. Setiap satu kegagalan tambahan menurunkan odds lulus sekitar \((1 - OR) \times 100\%\).
studytime (waktu belajar): Koefisien positif menunjukkan bahwa semakin banyak waktu belajar, semakin besar peluang lulus.
sex: Nilai \(OR\) menunjukkan apakah ada perbedaan odds lulus antara laki-laki dan perempuan, dengan prediktor lain konstan.
Medu (pendidikan ibu): Koefisien positif berarti semakin tinggi pendidikan ibu, semakin besar odds siswa lulus.
Regresi logistik multinomial digunakan ketika variabel respons bersifat nominal dengan lebih dari dua kategori, yaitu kategori yang tidak memiliki urutan alami.
Misalkan respons \(Y_i \in \{1, 2, \ldots, K\}\) dengan \(K\) kategori. Salah satu kategori dipilih sebagai kategori referensi (misalnya kategori \(K\)). Untuk setiap kategori \(k = 1, 2, \ldots, K-1\), model ditulis:
\[ \log\left(\frac{P(Y_i = k \mid \mathbf{x}_i)}{P(Y_i = K \mid \mathbf{x}_i)}\right) = \beta_{0k} + \beta_{1k} x_{i1} + \cdots + \beta_{pk} x_{ip} \]
Keterangan:
Probabilitas masing-masing kategori diperoleh dari:
\[ P(Y_i = k \mid \mathbf{x}_i) = \frac{\exp(\eta_{ik})}{1 + \sum_{h=1}^{K-1} \exp(\eta_{ih})}, \quad k = 1, \ldots, K-1 \]
\[ P(Y_i = K \mid \mathbf{x}_i) = \frac{1}{1 + \sum_{h=1}^{K-1} \exp(\eta_{ih})} \]
dengan \(\eta_{ik} = \beta_{0k} + \mathbf{x}_i^\top \boldsymbol{\beta}_k\).
Jika dieksponensialkan, koefisien menjadi Relative Risk Ratio (RRR):
\[ RRR_j = \exp(\beta_{jk}) \]
\(RRR > 1\) berarti kenaikan prediktor meningkatkan risiko relatif memilih kategori \(k\) dibanding referensi.
tabel_mjob <- as.data.frame(table(df$Mjob))
tabel_mjob$Proporsi <- round(tabel_mjob$Freq / sum(tabel_mjob$Freq), 3)
names(tabel_mjob) <- c("Pekerjaan_Ibu", "Frekuensi", "Proporsi")
kable(tabel_mjob,
caption = "Distribusi pekerjaan ibu siswa (Mjob) — 5 kategori nominal",
row.names = FALSE) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE)| Pekerjaan_Ibu | Frekuensi | Proporsi |
|---|---|---|
| other | 258 | 0.398 |
| at_home | 135 | 0.208 |
| health | 48 | 0.074 |
| services | 136 | 0.210 |
| teacher | 72 | 0.111 |
Prediktor: age, Medu (pendidikan ibu),
G3 (nilai akhir), sex, address
(tempat tinggal).
Kategori referensi: other.
fit_multi <- nnet::multinom(
Mjob ~ age + Medu + G3 + sex + address,
data = df,
trace = FALSE
)
summary(fit_multi)## Call:
## nnet::multinom(formula = Mjob ~ age + Medu + G3 + sex + address,
## data = df, trace = FALSE)
##
## Coefficients:
## (Intercept) age Medu G3 sexM addressU
## at_home -0.07330016 0.08364356 -0.5579844 -0.04177217 -0.5563386 -0.4076705
## health -2.86021457 -0.28559189 1.5726583 0.06330081 0.2739032 0.4542943
## services -1.46982977 -0.07024184 0.5729147 0.01340461 0.2506010 0.4153027
## teacher -15.32068806 -0.02444046 3.8819354 0.04804528 0.6271800 0.1322211
##
## Std. Errors:
## (Intercept) age Medu G3 sexM addressU
## at_home 1.647488 0.09072912 0.1259000 0.03544287 0.2419814 0.2277714
## health 2.784999 0.15374698 0.2382127 0.06093466 0.3545955 0.4465768
## services 1.666461 0.09222888 0.1146346 0.03650370 0.2257295 0.2584493
## teacher 3.509735 0.14332761 0.5938882 0.05408115 0.3449681 0.4136946
##
## Residual Deviance: 1511.184
## AIC: 1559.184
m_coef <- coef(fit_multi)
m_se <- summary(fit_multi)$standard.errors
# Susun ke data frame panjang
result_multi <- data.frame()
for (kat in rownames(m_coef)) {
tmp <- data.frame(
Kategori = kat,
Variabel = colnames(m_coef),
Estimate = as.numeric(m_coef[kat, ]),
SE = as.numeric(m_se[kat, ]),
row.names = NULL
)
result_multi <- rbind(result_multi, tmp)
}
result_multi$z_value <- result_multi$Estimate / result_multi$SE
result_multi$p_value <- 2 * (1 - pnorm(abs(result_multi$z_value)))
result_multi$RRR <- exp(result_multi$Estimate)
result_multi$CI_low <- exp(result_multi$Estimate - 1.96 * result_multi$SE)
result_multi$CI_high <- exp(result_multi$Estimate + 1.96 * result_multi$SE)
result_multi[, 3:9] <- round(result_multi[, 3:9], 4)
kable(result_multi,
caption = "Ringkasan hasil regresi logistik multinomial (referensi: other)",
col.names = c("Kategori", "Variabel", "Estimate", "SE",
"z-value", "p-value", "RRR", "CI 2.5%", "CI 97.5%"),
row.names = FALSE) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = TRUE)| Kategori | Variabel | Estimate | SE | z-value | p-value | RRR | CI 2.5% | CI 97.5% |
|---|---|---|---|---|---|---|---|---|
| at_home | (Intercept) | -0.0733 | 1.6475 | -0.0445 | 0.9645 | 0.9293 | 0.0368 | 23.4712 |
| at_home | age | 0.0836 | 0.0907 | 0.9219 | 0.3566 | 1.0872 | 0.9101 | 1.2988 |
| at_home | Medu | -0.5580 | 0.1259 | -4.4320 | 0.0000 | 0.5724 | 0.4472 | 0.7326 |
| at_home | G3 | -0.0418 | 0.0354 | -1.1786 | 0.2386 | 0.9591 | 0.8947 | 1.0281 |
| at_home | sexM | -0.5563 | 0.2420 | -2.2991 | 0.0215 | 0.5733 | 0.3568 | 0.9212 |
| at_home | addressU | -0.4077 | 0.2278 | -1.7898 | 0.0735 | 0.6652 | 0.4257 | 1.0395 |
| health | (Intercept) | -2.8602 | 2.7850 | -1.0270 | 0.3044 | 0.0573 | 0.0002 | 13.4420 |
| health | age | -0.2856 | 0.1537 | -1.8575 | 0.0632 | 0.7516 | 0.5560 | 1.0159 |
| health | Medu | 1.5727 | 0.2382 | 6.6019 | 0.0000 | 4.8194 | 3.0215 | 7.6872 |
| health | G3 | 0.0633 | 0.0609 | 1.0388 | 0.2989 | 1.0653 | 0.9454 | 1.2005 |
| health | sexM | 0.2739 | 0.3546 | 0.7724 | 0.4399 | 1.3151 | 0.6563 | 2.6351 |
| health | addressU | 0.4543 | 0.4466 | 1.0173 | 0.3090 | 1.5751 | 0.6564 | 3.7795 |
| services | (Intercept) | -1.4698 | 1.6665 | -0.8820 | 0.3778 | 0.2300 | 0.0088 | 6.0281 |
| services | age | -0.0702 | 0.0922 | -0.7616 | 0.4463 | 0.9322 | 0.7780 | 1.1169 |
| services | Medu | 0.5729 | 0.1146 | 4.9977 | 0.0000 | 1.7734 | 1.4166 | 2.2202 |
| services | G3 | 0.0134 | 0.0365 | 0.3672 | 0.7135 | 1.0135 | 0.9435 | 1.0887 |
| services | sexM | 0.2506 | 0.2257 | 1.1102 | 0.2669 | 1.2848 | 0.8254 | 1.9998 |
| services | addressU | 0.4153 | 0.2584 | 1.6069 | 0.1081 | 1.5148 | 0.9128 | 2.5140 |
| teacher | (Intercept) | -15.3207 | 3.5097 | -4.3652 | 0.0000 | 0.0000 | 0.0000 | 0.0002 |
| teacher | age | -0.0244 | 0.1433 | -0.1705 | 0.8646 | 0.9759 | 0.7369 | 1.2924 |
| teacher | Medu | 3.8819 | 0.5939 | 6.5365 | 0.0000 | 48.5180 | 15.1487 | 155.3928 |
| teacher | G3 | 0.0480 | 0.0541 | 0.8884 | 0.3743 | 1.0492 | 0.9437 | 1.1665 |
| teacher | sexM | 0.6272 | 0.3450 | 1.8181 | 0.0691 | 1.8723 | 0.9522 | 3.6815 |
| teacher | addressU | 0.1322 | 0.4137 | 0.3196 | 0.7493 | 1.1414 | 0.5073 | 2.5678 |
## other at_home health services teacher
## 1 0.226 0.043 0.129 0.255 0.347
## 2 0.465 0.431 0.003 0.101 0.000
## 3 0.496 0.373 0.006 0.125 0.000
## 4 0.164 0.021 0.266 0.237 0.312
## 5 0.438 0.112 0.104 0.329 0.016
## 6 0.131 0.011 0.198 0.225 0.435
## Prediksi
## Aktual other at_home health services teacher
## other 192 29 0 4 33
## at_home 97 32 0 1 5
## health 12 0 0 0 36
## services 101 1 0 3 31
## teacher 4 0 0 1 67
akurasi_multi <- round(sum(diag(conf_multi)) / sum(conf_multi) * 100, 2)
cat("Akurasi model multinomial:", akurasi_multi, "%\n")## Akurasi model multinomial: 45.3 %
grid_multi <- expand.grid(
age = mean(df$age),
Medu = 1:4,
G3 = mean(df$G3),
sex = factor("F", levels = levels(df$sex)),
address = factor("U", levels = levels(df$address))
)
grid_prob <- as.data.frame(predict(fit_multi, newdata = grid_multi, type = "probs"))
grid_plot <- cbind(grid_multi, grid_prob)
grid_long <- pivot_longer(grid_plot,
cols = levels(df$Mjob),
names_to = "Mjob",
values_to = "Probabilitas")
ggplot(grid_long, aes(x = factor(Medu), y = Probabilitas, fill = Mjob)) +
geom_col(position = "stack", color = "white", linewidth = 0.3) +
scale_y_continuous(labels = percent_format()) +
labs(
title = "Prediksi Probabilitas Pekerjaan Ibu",
subtitle = "Berdasarkan tingkat pendidikan ibu (Medu 1–4)",
x = "Pendidikan Ibu (Medu)", y = "Probabilitas", fill = "Pekerjaan"
) +
theme_minimal(base_size = 12)at_home, health, services,
teacher) terhadap kategori referensi
other.other.teacher atau health mengindikasikan
bahwa ibu dengan pendidikan lebih tinggi lebih mungkin bekerja di bidang
tersebut dibanding other.teacher, siswa dengan nilai lebih tinggi mungkin memiliki
ibu yang bekerja sebagai guru, yang mencerminkan latar belakang akademis
keluarga.Regresi logistik ordinal digunakan ketika variabel respons bersifat kategorik dengan urutan alami, misalnya: \[\text{Sangat Buruk} < \text{Buruk} < \text{Cukup} < \text{Baik} < \text{Sangat Baik}\]
Model yang paling umum adalah cumulative logit model atau proportional odds model:
\[ \log\left(\frac{P(Y_i \leq j \mid \mathbf{x}_i)}{P(Y_i > j \mid \mathbf{x}_i)}\right) = \alpha_j + \mathbf{x}_i^\top \boldsymbol{\beta}, \quad j = 1, 2, \ldots, J-1 \]
Keterangan:
Probabilitas tiap kategori diperoleh dari selisih peluang kumulatif: \[P(Y_i = j) = P(Y_i \leq j) - P(Y_i \leq j-1)\]
Catatan konvensi tanda: Fungsi
MASS::polr()menggunakan konvensi \(\text{logit}\{P(Y \leq j)\} = \alpha_j - \mathbf{x}^\top \boldsymbol{\beta}\). Akibatnya, koefisien pada outputpolr()memiliki tanda berlawanan dengan konvensi rumus di atas. Untuk interpretasi, gunakan: \(\hat\beta_\text{slide} = -\hat\beta_\text{polr}\).
Jika dieksponensialkan: \(OR = \exp(\beta)\)
polr(): \(OR
> 1\) berarti kenaikan prediktor meningkatkan odds berada di
kategori lebih tinggitabel_health <- as.data.frame(table(df$health_ord))
tabel_health$Proporsi <- round(tabel_health$Freq / sum(tabel_health$Freq), 3)
names(tabel_health) <- c("Kesehatan", "Frekuensi", "Proporsi")
kable(tabel_health,
caption = "Distribusi kondisi kesehatan siswa (health) — 5 kategori ordinal",
row.names = FALSE) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE)| Kesehatan | Frekuensi | Proporsi |
|---|---|---|
| Sangat Buruk | 90 | 0.139 |
| Buruk | 78 | 0.120 |
| Cukup | 124 | 0.191 |
| Baik | 108 | 0.166 |
| Sangat Baik | 249 | 0.384 |
Prediktor: age, sex, freetime
(waktu luang), goout (frekuensi keluar), Dalc
(konsumsi alkohol hari kerja).
fit_ord <- MASS::polr(
health_ord ~ age + sex + freetime + goout + Dalc,
data = df,
method = "logistic",
Hess = TRUE
)
summary(fit_ord)## Call:
## MASS::polr(formula = health_ord ~ age + sex + freetime + goout +
## Dalc, data = df, Hess = TRUE, method = "logistic")
##
## Coefficients:
## Value Std. Error t value
## age -0.01440 0.05978 -0.2409
## sexM 0.44796 0.15479 2.8939
## freetime 0.15954 0.07466 2.1369
## goout -0.08571 0.06723 -1.2749
## Dalc 0.04837 0.08342 0.5798
##
## Intercepts:
## Value Std. Error t value
## Sangat Buruk|Buruk -1.6076 1.0242 -1.5696
## Buruk|Cukup -0.8229 1.0229 -0.8045
## Cukup|Baik 0.0429 1.0227 0.0419
## Baik|Sangat Baik 0.7323 1.0226 0.7161
##
## Residual Deviance: 1943.042
## AIC: 1961.042
o_sum <- coef(summary(fit_ord))
result_ord <- data.frame(
Parameter = rownames(o_sum),
Estimate = o_sum[, "Value"],
SE = o_sum[, "Std. Error"],
t_value = o_sum[, "t value"],
row.names = NULL
)
result_ord$p_value <- 2 * (1 - pnorm(abs(result_ord$t_value)))
result_ord$Jenis <- ifelse(grepl("\\|", result_ord$Parameter), "Cutpoint", "Koefisien")
result_ord$Estimate_slide <- ifelse(result_ord$Jenis == "Koefisien",
-result_ord$Estimate, result_ord$Estimate)
result_ord$OR_polr <- ifelse(result_ord$Jenis == "Koefisien",
exp(result_ord$Estimate), NA)
result_ord$OR_slide <- ifelse(result_ord$Jenis == "Koefisien",
exp(result_ord$Estimate_slide), NA)
result_ord$CI_low <- ifelse(result_ord$Jenis == "Koefisien",
exp(result_ord$Estimate - 1.96 * result_ord$SE), NA)
result_ord$CI_high <- ifelse(result_ord$Jenis == "Koefisien",
exp(result_ord$Estimate + 1.96 * result_ord$SE), NA)
num_cols <- c("Estimate", "SE", "t_value", "p_value",
"Estimate_slide", "OR_polr", "OR_slide", "CI_low", "CI_high")
result_ord[, num_cols] <- round(result_ord[, num_cols], 4)
kable(result_ord,
caption = "Ringkasan hasil regresi logistik ordinal",
col.names = c("Parameter", "Est. polr", "SE", "t-value", "p-value",
"Jenis", "Est. slide", "OR polr", "OR slide",
"CI 2.5%", "CI 97.5%"),
row.names = FALSE) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = TRUE)| Parameter | Est. polr | SE | t-value | p-value | Jenis | Est. slide | OR polr | OR slide | CI 2.5% | CI 97.5% |
|---|---|---|---|---|---|---|---|---|---|---|
| age | -0.0144 | 0.0598 | -0.2409 | 0.8096 | Koefisien | 0.0144 | 0.9857 | 1.0145 | 0.8767 | 1.1082 |
| sexM | 0.4480 | 0.1548 | 2.8939 | 0.0038 | Koefisien | -0.4480 | 1.5651 | 0.6389 | 1.1555 | 2.1199 |
| freetime | 0.1595 | 0.0747 | 2.1369 | 0.0326 | Koefisien | -0.1595 | 1.1730 | 0.8525 | 1.0133 | 1.3578 |
| goout | -0.0857 | 0.0672 | -1.2749 | 0.2023 | Koefisien | 0.0857 | 0.9179 | 1.0895 | 0.8045 | 1.0471 |
| Dalc | 0.0484 | 0.0834 | 0.5798 | 0.5621 | Koefisien | -0.0484 | 1.0496 | 0.9528 | 0.8912 | 1.2360 |
| Sangat Buruk|Buruk | -1.6076 | 1.0242 | -1.5696 | 0.1165 | Cutpoint | -1.6076 | NA | NA | NA | NA |
| Buruk|Cukup | -0.8229 | 1.0229 | -0.8045 | 0.4211 | Cutpoint | -0.8229 | NA | NA | NA | NA |
| Cukup|Baik | 0.0429 | 1.0227 | 0.0419 | 0.9666 | Cutpoint | 0.0429 | NA | NA | NA | NA |
| Baik|Sangat Baik | 0.7323 | 1.0226 | 0.7161 | 0.4739 | Cutpoint | 0.7323 | NA | NA | NA | NA |
## Sangat Buruk Buruk Cukup Baik Sangat Baik
## 1 0.178 0.144 0.208 0.162 0.308
## 2 0.163 0.136 0.205 0.165 0.330
## 3 0.142 0.124 0.197 0.169 0.367
## 4 0.170 0.140 0.206 0.164 0.320
## 5 0.150 0.129 0.200 0.168 0.353
## 6 0.088 0.086 0.160 0.166 0.500
## Prediksi
## Aktual Sangat Buruk Buruk Cukup Baik Sangat Baik
## Sangat Buruk 0 0 0 0 90
## Buruk 0 0 0 0 78
## Cukup 0 0 0 0 124
## Baik 1 0 0 0 107
## Sangat Baik 0 0 0 0 249
akurasi_ord <- round(sum(diag(conf_ord)) / sum(conf_ord) * 100, 2)
cat("Akurasi model ordinal:", akurasi_ord, "%\n")## Akurasi model ordinal: 38.37 %
grid_ord <- expand.grid(
age = mean(df$age),
sex = factor("F", levels = levels(df$sex)),
freetime = 1:5,
goout = mean(df$goout),
Dalc = mean(df$Dalc)
)
grid_prob_ord <- as.data.frame(predict(fit_ord, newdata = grid_ord, type = "probs"))
grid_ord_plot <- cbind(grid_ord, grid_prob_ord)
grid_ord_long <- pivot_longer(grid_ord_plot,
cols = levels(df$health_ord),
names_to = "Kesehatan",
values_to = "Probabilitas")
grid_ord_long$Kesehatan <- factor(grid_ord_long$Kesehatan,
levels = levels(df$health_ord))
ggplot(grid_ord_long, aes(x = freetime, y = Probabilitas, color = Kesehatan)) +
geom_line(linewidth = 1.2) +
scale_y_continuous(labels = percent_format()) +
labs(
title = "Prediksi Probabilitas Tingkat Kesehatan",
subtitle = "Berdasarkan waktu luang (freetime 1–5); variabel lain pada rata-rata",
x = "Waktu Luang (freetime)", y = "Probabilitas", color = "Kesehatan"
) +
theme_minimal(base_size = 12)Konvensi penting: Output polr()
menggunakan tanda berlawanan. Koefisien positif pada polr()
berarti kenaikan prediktor meningkatkan odds berada di kategori
lebih tinggi.
goout (frekuensi keluar): Jika koefisien
positif, siswa yang lebih sering keluar memiliki odds lebih tinggi
berada di kategori kesehatan yang lebih baik (dalam konvensi
polr()). Namun perlu dicek signifikansinya.
Dalc (konsumsi alkohol hari kerja): Koefisien
negatif (dalam konvensi polr()) mengindikasikan bahwa
konsumsi alkohol lebih tinggi dikaitkan dengan odds lebih rendah berada
di kategori kesehatan yang lebih baik — sesuai ekspektasi
substantif.
Cutpoints (\(\alpha_j\)): Menunjukkan nilai logit kumulatif ketika semua prediktor = 0. Semakin besar jarak antar-cutpoint, semakin “lebar” rentang kategori tersebut.
Asumsi proportional odds menyatakan bahwa slope
prediktor berlaku sama untuk semua batas kumulatif. Uji formal dapat
dilakukan dengan ordinal::nominal_test().
Regresi Poisson digunakan ketika variabel respons berupa data cacahan (count data), yaitu bilangan bulat nonnegatif: \(Y_i \in \{0, 1, 2, \ldots\}\).
Asumsi dasar: \(Y_i \mid \mathbf{x}_i \sim \text{Poisson}(\mu_i)\), dengan fungsi peluang:
\[ P(Y_i = y_i \mid \mathbf{x}_i) = \frac{\exp(-\mu_i)\, \mu_i^{y_i}}{y_i!}, \quad y_i = 0, 1, 2, \ldots \]
Pada distribusi Poisson berlaku sifat equidispersion: \[E(Y_i \mid \mathbf{x}_i) = \text{Var}(Y_i \mid \mathbf{x}_i) = \mu_i\]
Model menggunakan fungsi penghubung log:
\[ \log(\mu_i) = \beta_0 + \beta_1 x_{i1} + \cdots + \beta_p x_{ip} \]
Sehingga rata-rata hitungan:
\[ \mu_i = \exp(\beta_0 + \beta_1 x_{i1} + \cdots + \beta_p x_{ip}) \]
Keterangan:
Jika dieksponensialkan, koefisien menjadi Incidence Rate Ratio (IRR):
\[ IRR_j = \exp(\beta_j) \]
tabel_abs <- as.data.frame(table(df$absences))
tabel_abs$Proporsi <- round(tabel_abs$Freq / sum(tabel_abs$Freq), 3)
names(tabel_abs) <- c("Jumlah_Absen", "Frekuensi", "Proporsi")
kable(head(tabel_abs, 15),
caption = "Distribusi jumlah ketidakhadiran siswa (absences) — 15 nilai pertama",
row.names = FALSE) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE)| Jumlah_Absen | Frekuensi | Proporsi |
|---|---|---|
| 0 | 244 | 0.376 |
| 1 | 12 | 0.018 |
| 2 | 110 | 0.169 |
| 3 | 7 | 0.011 |
| 4 | 93 | 0.143 |
| 5 | 12 | 0.018 |
| 6 | 49 | 0.076 |
| 7 | 3 | 0.005 |
| 8 | 42 | 0.065 |
| 9 | 7 | 0.011 |
| 10 | 21 | 0.032 |
| 11 | 5 | 0.008 |
| 12 | 12 | 0.018 |
| 13 | 1 | 0.002 |
| 14 | 8 | 0.012 |
ggplot(df, aes(x = absences)) +
geom_histogram(binwidth = 1, fill = "#2f7f73", color = "white", alpha = 0.9) +
labs(
title = "Distribusi Jumlah Ketidakhadiran (absences)",
subtitle = "Data cacahan: bilangan bulat nonnegatif — cocok untuk regresi Poisson",
x = "Jumlah ketidakhadiran", y = "Frekuensi"
) +
theme_minimal(base_size = 12)Prediktor: studytime (waktu belajar),
failures (kegagalan sebelumnya), goout
(frekuensi keluar), sex, famrel (kualitas
hubungan keluarga).
fit_pois <- glm(
absences ~ studytime + failures + goout + sex + famrel,
data = df,
family = poisson(link = "log")
)
summary(fit_pois)##
## Call:
## glm(formula = absences ~ studytime + failures + goout + sex +
## famrel, family = poisson(link = "log"), data = df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.72541 0.11120 15.516 < 2e-16 ***
## studytime -0.16300 0.02711 -6.013 1.83e-09 ***
## failures 0.16883 0.02915 5.791 7.01e-09 ***
## goout 0.08756 0.01748 5.009 5.48e-07 ***
## sexM -0.01515 0.04289 -0.353 0.724
## famrel -0.11517 0.02051 -5.614 1.97e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 3464.7 on 648 degrees of freedom
## Residual deviance: 3322.0 on 643 degrees of freedom
## AIC: 4707.2
##
## Number of Fisher Scoring iterations: 6
p_sum <- coef(summary(fit_pois))
pois_coef <- data.frame(
Parameter = rownames(p_sum),
Estimate = p_sum[, "Estimate"],
SE = p_sum[, "Std. Error"],
z_value = p_sum[, "z value"],
p_value = p_sum[, "Pr(>|z|)"],
row.names = NULL
)
pois_coef$IRR <- exp(pois_coef$Estimate)
pois_coef$CI_low <- exp(pois_coef$Estimate - 1.96 * pois_coef$SE)
pois_coef$CI_high <- exp(pois_coef$Estimate + 1.96 * pois_coef$SE)
pois_coef$Perubahan_persen <- 100 * (pois_coef$IRR - 1)
pois_coef[, -1] <- round(pois_coef[, -1], 4)
kable(pois_coef,
caption = "Ringkasan hasil regresi Poisson",
col.names = c("Parameter", "Estimate", "SE", "z-value", "p-value",
"IRR", "CI 2.5%", "CI 97.5%", "Perubahan (%)"),
row.names = FALSE) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = TRUE)| Parameter | Estimate | SE | z-value | p-value | IRR | CI 2.5% | CI 97.5% | Perubahan (%) |
|---|---|---|---|---|---|---|---|---|
| (Intercept) | 1.7254 | 0.1112 | 15.5158 | 0.000 | 5.6148 | 4.5152 | 6.9822 | 461.4801 |
| studytime | -0.1630 | 0.0271 | -6.0126 | 0.000 | 0.8496 | 0.8056 | 0.8960 | -15.0409 |
| failures | 0.1688 | 0.0292 | 5.7907 | 0.000 | 1.1839 | 1.1182 | 1.2535 | 18.3914 |
| goout | 0.0876 | 0.0175 | 5.0086 | 0.000 | 1.0915 | 1.0547 | 1.1295 | 9.1503 |
| sexM | -0.0151 | 0.0429 | -0.3532 | 0.724 | 0.9850 | 0.9055 | 1.0714 | -1.5034 |
| famrel | -0.1152 | 0.0205 | -5.6142 | 0.000 | 0.8912 | 0.8561 | 0.9278 | -10.8786 |
Salah satu asumsi Poisson adalah equidispersion (mean = varians). Pelanggaran menghasilkan overdispersi. Indikatornya:
\[\hat\phi = \frac{\sum r_{P,i}^2}{df}\]
Jika \(\hat\phi \approx 1\), tidak ada masalah. Jika \(\hat\phi \gg 1\), perlu model alternatif (quasi-Poisson atau Negative Binomial).
dispersion <- sum(residuals(fit_pois, type = "pearson")^2) / df.residual(fit_pois)
interpretasi <- ifelse(dispersion < 1.5,
"Tidak ada indikasi overdispersi berat",
ifelse(dispersion < 2.5,
"Ada indikasi overdispersi sedang",
"Ada indikasi overdispersi kuat — pertimbangkan Negative Binomial"))
tabel_disp <- data.frame(
`Dispersion Pearson` = round(dispersion, 3),
Interpretasi = interpretasi,
check.names = FALSE
)
kable(tabel_disp,
caption = "Pemeriksaan overdispersi model Poisson",
row.names = FALSE) %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE)| Dispersion Pearson | Interpretasi |
|---|---|
| 5.642 | Ada indikasi overdispersi kuat — pertimbangkan Negative Binomial |
df$prediksi_absen <- round(predict(fit_pois, type = "response"), 3)
head(df[, c("studytime", "failures", "goout", "sex",
"famrel", "absences", "prediksi_absen")], 8)## studytime failures goout sex famrel absences prediksi_absen
## 1 2 0 4 F 4 4 3.629
## 2 2 0 3 F 5 2 2.963
## 3 2 0 2 F 4 6 3.046
## 4 3 0 2 F 3 0 2.904
## 5 2 0 2 F 4 0 3.046
## 6 2 0 2 M 5 6 2.674
## 7 2 0 4 M 4 0 3.574
## 8 2 0 4 F 4 2 3.629
grid_pois <- expand.grid(
studytime = mean(df$studytime),
failures = 0,
goout = 1:5,
sex = factor("F", levels = levels(df$sex)),
famrel = mean(df$famrel)
)
pred_rate <- predict(fit_pois, newdata = grid_pois, type = "link", se.fit = TRUE)
grid_pois$rate <- exp(pred_rate$fit)
grid_pois$rate_low <- exp(pred_rate$fit - 1.96 * pred_rate$se.fit)
grid_pois$rate_high <- exp(pred_rate$fit + 1.96 * pred_rate$se.fit)
ggplot(grid_pois, aes(x = goout, y = rate)) +
geom_ribbon(aes(ymin = rate_low, ymax = rate_high),
fill = "#2f7f73", alpha = 0.2) +
geom_line(color = "#2f7f73", linewidth = 1.35) +
geom_point(color = "#2f7f73", size = 2.5) +
labs(
title = "Prediksi Rata-rata Jumlah Ketidakhadiran",
subtitle = "Berdasarkan frekuensi keluar (goout 1–5); variabel lain pada rata-rata",
x = "Frekuensi Keluar (goout)", y = "Prediksi jumlah absen"
) +
theme_minimal(base_size = 12)failures (kegagalan sebelumnya): Jika IRR > 1, siswa yang pernah gagal lebih banyak cenderung memiliki lebih banyak ketidakhadiran. Ini konsisten secara substantif — kegagalan dan absensi saling berkaitan.
goout (frekuensi keluar): IRR > 1 berarti siswa yang lebih sering keluar memiliki rata-rata absen yang lebih tinggi. Misalnya, jika \(IRR = 1.15\), maka setiap kenaikan 1 poin goout dikaitkan dengan kenaikan rata-rata absen sekitar 15%.
studytime (waktu belajar): Koefisien negatif (IRR < 1) mengindikasikan bahwa siswa yang lebih banyak belajar cenderung lebih jarang absen.
famrel (hubungan keluarga): IRR < 1 berarti hubungan keluarga yang lebih baik dikaitkan dengan lebih sedikit ketidakhadiran.
Dispersion: Nilai dispersion Pearson yang mendekati 1 menunjukkan model Poisson sudah sesuai. Jika jauh di atas 1, pertimbangkan quasi-Poisson atau Negative Binomial.
tabel_ringkasan <- data.frame(
Jenis = c("Binary", "Multinomial", "Ordinal", "Poisson"),
Respons = c("lulus (0/1)", "Mjob (5 kategori)", "health (1–5)", "absences (count)"),
Fungsi_R = c("glm(..., family = binomial)", "nnet::multinom()",
"MASS::polr()", "glm(..., family = poisson)"),
Koefisien = c("Log-odds → OR", "Log-odds vs referensi → RRR",
"Cumulative log-odds → OR", "Log rata-rata count → IRR"),
Asumsi_Kunci = c(
"Respons biner, observasi independen",
"Nominal (tidak berurutan), IIA",
"Ordinal (berurutan), proportional odds",
"Count ≥ 0, equidispersion (mean = varians)"
),
check.names = FALSE
)
kable(tabel_ringkasan,
caption = "Perbandingan keempat jenis regresi logistik",
col.names = c("Jenis", "Variabel Respons", "Fungsi R",
"Interpretasi Koefisien", "Asumsi Kunci"),
row.names = FALSE) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = TRUE)| Jenis | Variabel Respons | Fungsi R | Interpretasi Koefisien | Asumsi Kunci |
|---|---|---|---|---|
| Binary | lulus (0/1) | glm(…, family = binomial) | Log-odds → OR | Respons biner, observasi independen |
| Multinomial | Mjob (5 kategori) | nnet::multinom() | Log-odds vs referensi → RRR | Nominal (tidak berurutan), IIA |
| Ordinal | health (1–5) | MASS::polr() | Cumulative log-odds → OR | Ordinal (berurutan), proportional odds |
| Poisson | absences (count) | glm(…, family = poisson) | Log rata-rata count → IRR | Count ≥ 0, equidispersion (mean = varians) |
Catatan: Keempat analisis di atas menggunakan dataset yang sama (
student-por.csv) dengan mengganti variabel respons. Prediktor yang digunakan di tiap model dipilih agar relevan secara substantif dengan variabel respons masing-masing.