Pendahuluan

Analisis ini bertujuan untuk memodelkan jumlah dokter yang dikunjungi oleh lansia di Amerika Serikat menggunakan Regresi Logistik Multinomial. Dataset yang digunakan adalah National Poll on Healthy Aging (NPHA) yang berisi informasi kesehatan lansia.

Variabel Respon (Y): Number of Doctors Visited

  • 1 = Sedikit
  • 2 = Sedang
  • 3 = Banyak

Variabel Prediktor: kondisi kesehatan fisik, mental, gigi, status pekerjaan, gangguan tidur, ras, dan jenis kelamin.


Langkah 0: Load Package

# Hapus tanda '#' di bawah ini kalau belum install package-nya
# install.packages(c("nnet", "car", "DescTools", "tidyverse", "caret", "knitr", "kableExtra", "rcompanion"))

library(nnet)
library(car)
library(DescTools)
library(tidyverse)
library(caret)
library(knitr)
library(kableExtra)
library(rcompanion)

Langkah 1: Import Data

df <- read.csv("NPHA-doctor-visits.csv")

cat("Jumlah baris:", nrow(df), "\n")
## Jumlah baris: 714
cat("Jumlah kolom:", ncol(df), "\n")
## Jumlah kolom: 15

Langkah 2: Preprocessing (Pembersihan Data)

2a. Hapus Kolom Age

Kolom Age tidak informatif karena seluruh nilainya sama (= 2), sehingga dihapus.

df <- df[, -2]
cat("Kolom setelah Age dihapus:", ncol(df), "kolom\n")
## Kolom setelah Age dihapus: 14 kolom

2b. Rename Kolom

colnames(df) <- c(
  "Y",            # Number of Doctors Visited (VARIABEL RESPON)
  "PhysHealth",   # Physical Health
  "MentHealth",   # Mental Health
  "DentHealth",   # Dental Health
  "Employment",   # Status pekerjaan
  "Stress",       # Stress mengganggu tidur
  "Medication",   # Obat mengganggu tidur
  "Pain",         # Nyeri mengganggu tidur
  "Bathroom",     # Kebutuhan kamar mandi mengganggu tidur
  "Unknown",      # Tidak diketahui mengganggu tidur
  "TroubleSleep", # Masalah tidur
  "SleepMed",     # Obat tidur resep
  "Race",         # Ras
  "Gender"        # Jenis kelamin
)

cat("Rename kolom berhasil.\n")
## Rename kolom berhasil.

2c. Hapus Baris dengan Nilai -1

Nilai -1 menandakan responden menolak menjawab (missing), sehingga perlu dihapus.

cat("Jumlah nilai -1 sebelum cleaning:", sum(df == -1), "\n")
## Jumlah nilai -1 sebelum cleaning: 20
df <- df[!apply(df == -1, 1, any), ]

cat("Jumlah observasi setelah cleaning:", nrow(df), "\n")
## Jumlah observasi setelah cleaning: 696

2d. Ubah ke Factor

df$Y <- factor(df$Y, levels = c(1, 2, 3),
               labels = c("Sedikit", "Sedang", "Banyak"))

df$PhysHealth   <- factor(df$PhysHealth)
df$MentHealth   <- factor(df$MentHealth)
df$DentHealth   <- factor(df$DentHealth)
df$Employment   <- factor(df$Employment)
df$Stress       <- factor(df$Stress)
df$Medication   <- factor(df$Medication)
df$Pain         <- factor(df$Pain)
df$Bathroom     <- factor(df$Bathroom)
df$Unknown      <- factor(df$Unknown)
df$TroubleSleep <- factor(df$TroubleSleep)
df$SleepMed     <- factor(df$SleepMed)
df$Race         <- factor(df$Race)
df$Gender       <- factor(df$Gender)

str(df)
## 'data.frame':    696 obs. of  14 variables:
##  $ Y           : Factor w/ 3 levels "Sedikit","Sedang",..: 3 2 3 1 3 2 3 2 2 1 ...
##  $ PhysHealth  : Factor w/ 5 levels "1","2","3","4",..: 4 4 3 3 3 3 4 3 3 2 ...
##  $ MentHealth  : Factor w/ 5 levels "1","2","3","4",..: 3 2 2 2 3 2 1 2 1 1 ...
##  $ DentHealth  : Factor w/ 6 levels "1","2","3","4",..: 3 3 3 3 3 4 1 6 2 3 ...
##  $ Employment  : Factor w/ 4 levels "1","2","3","4": 3 3 3 3 3 3 3 3 3 1 ...
##  $ Stress      : Factor w/ 2 levels "0","1": 1 2 1 1 2 1 1 2 1 1 ...
##  $ Medication  : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Pain        : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 2 1 2 1 ...
##  $ Bathroom    : Factor w/ 2 levels "0","1": 1 2 1 2 1 2 2 1 2 1 ...
##  $ Unknown     : Factor w/ 2 levels "0","1": 2 1 2 1 1 1 1 1 1 2 ...
##  $ TroubleSleep: Factor w/ 3 levels "1","2","3": 2 3 3 3 2 3 2 3 3 3 ...
##  $ SleepMed    : Factor w/ 3 levels "1","2","3": 3 3 3 3 3 3 1 3 3 3 ...
##  $ Race        : Factor w/ 5 levels "1","2","3","4",..: 1 1 4 4 1 1 1 1 1 1 ...
##  $ Gender      : Factor w/ 2 levels "1","2": 2 1 1 2 2 1 1 1 2 1 ...

Langkah 3: Statistika Deskriptif

tabel_Y  <- table(df$Y)
persen_Y <- round(prop.table(tabel_Y) * 100, 2)

tabel_respon <- data.frame(
  Kategori   = names(tabel_Y),
  Frekuensi  = as.numeric(tabel_Y),
  Persentase = paste0(formatC(as.numeric(persen_Y), format = "f", digits = 2), "%")
)

tabel_respon %>%
  kable(caption = "Tabel 1. Distribusi Variabel Respon (Y)") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Tabel 1. Distribusi Variabel Respon (Y)
Kategori Frekuensi Persentase
Sedikit 126 18.10%
Sedang 363 52.16%
Banyak 207 29.74%
barplot(tabel_Y,
        main      = "Distribusi Jumlah Dokter yang Dikunjungi",
        xlab      = "Kategori",
        ylab      = "Frekuensi",
        col       = c("steelblue", "orange", "tomato"),
        names.arg = c("Sedikit (1)", "Sedang (2)", "Banyak (3)"))


Langkah 4: Uji Independensi (Chi-Square)

Hipotesis:

  • H₀: Tidak ada hubungan antara prediktor dengan Y
  • H₁: Ada hubungan antara prediktor dengan Y
  • Keputusan: Tolak H₀ jika p-value < 0.05
prediktor <- c("PhysHealth", "MentHealth", "DentHealth", "Employment",
               "Stress", "Medication", "Pain", "Bathroom", "Unknown",
               "TroubleSleep", "SleepMed", "Race", "Gender")

hasil_chisq <- data.frame(
  Variabel  = prediktor,
  ChiSquare = NA,
  Pvalue    = NA,
  Keputusan = NA
)

for (i in seq_along(prediktor)) {
  var <- prediktor[i]
  tbl <- table(df[[var]], df$Y)
  uji <- chisq.test(tbl)
  hasil_chisq$ChiSquare[i] <- round(uji$statistic, 4)
  hasil_chisq$Pvalue[i]    <- round(uji$p.value, 4)
  hasil_chisq$Keputusan[i] <- ifelse(uji$p.value < 0.05, "Tolak H0", "Gagal Tolak H0")
}

hasil_chisq %>%
  kable(caption = "Tabel 2. Hasil Uji Independensi Chi-Square") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  row_spec(which(hasil_chisq$Pvalue < 0.05), background = "#d4edda")
Tabel 2. Hasil Uji Independensi Chi-Square
Variabel ChiSquare Pvalue Keputusan
PhysHealth 22.5502 0.0040 Tolak H0
MentHealth 6.5532 0.5855 Gagal Tolak H0
DentHealth 11.2035 0.3419 Gagal Tolak H0
Employment 14.5235 0.0243 Tolak H0
Stress 3.6019 0.1651 Gagal Tolak H0
Medication 10.8690 0.0044 Tolak H0
Pain 4.2467 0.1196 Gagal Tolak H0
Bathroom 2.4595 0.2924 Gagal Tolak H0
Unknown 0.4441 0.8009 Gagal Tolak H0
TroubleSleep 5.5666 0.2339 Gagal Tolak H0
SleepMed 16.9255 0.0020 Tolak H0
Race 19.9840 0.0104 Tolak H0
Gender 1.2798 0.5274 Gagal Tolak H0
var_signifikan <- hasil_chisq$Variabel[hasil_chisq$Pvalue < 0.05]
cat("\nVariabel signifikan:", paste(var_signifikan, collapse = ", "), "\n")
## 
## Variabel signifikan: PhysHealth, Employment, Medication, SleepMed, Race

Langkah 5: Pemodelan Regresi Logistik Multinomial

Model dibentuk menggunakan variabel yang signifikan dari uji Chi-Square dengan kategori referensi Y = Sedikit.

df$Y <- relevel(df$Y, ref = "Sedikit")

formula_model <- as.formula(paste("Y ~", paste(var_signifikan, collapse = " + ")))
cat("Formula model:\n")
## Formula model:
print(formula_model)
## Y ~ PhysHealth + Employment + Medication + SleepMed + Race
model <- multinom(formula_model, data = df, maxit = 200)
## # weights:  48 (30 variable)
## initial  value 764.634153 
## iter  10 value 670.574394
## iter  20 value 663.823762
## iter  30 value 663.526216
## iter  40 value 663.471194
## final  value 663.471116 
## converged
summary(model)
## Call:
## multinom(formula = formula_model, data = df, maxit = 200)
## 
## Coefficients:
##        (Intercept) PhysHealth2 PhysHealth3 PhysHealth4 PhysHealth5 Employment2
## Sedang    2.188072   0.5034784   0.3932622    0.743801   0.5331025  -0.1037948
## Banyak    1.500083   0.3168909   0.7492710    1.290832   0.9367056   0.3444506
##        Employment3 Employment4 Medication1 SleepMed2 SleepMed3      Race2
## Sedang   0.1047031  -1.5693460    1.029053 -1.254507 -1.690097  0.2453823
## Banyak   0.6834303   0.4958809    1.759296 -1.794207 -2.373488 -0.3951672
##             Race3      Race4    Race5
## Sedang -1.2152522 -0.5540754 13.71819
## Banyak -0.3087676 -1.4578191 13.33296
## 
## Std. Errors:
##        (Intercept) PhysHealth2 PhysHealth3 PhysHealth4 PhysHealth5 Employment2
## Sedang    1.178804   0.4393285   0.4394547   0.5117139   0.8087895   0.5027683
## Banyak    1.242803   0.5434102   0.5365307   0.5976350   0.8769164   0.6261171
##        Employment3 Employment4 Medication1 SleepMed2 SleepMed3     Race2
## Sedang   0.3862213   0.8523014   0.7669523  1.191086  1.056960 0.4241660
## Banyak   0.4978889   0.7925190   0.7673019  1.196659  1.048583 0.4993017
##            Race3     Race4     Race5
## Sedang 0.6111763 0.3822433 0.2569032
## Banyak 0.5720616 0.5237545 0.2569031
## 
## Residual Deviance: 1326.942 
## AIC: 1386.942

Langkah 6: Uji Serentak (Likelihood Ratio Test)

Hipotesis:

  • H₀: β₁ = β₂ = β₃ = β₄ = β₅ = 0 (tidak ada prediktor yang berpengaruh)
  • H₁: minimal ada satu βi ≠ 0
  • Keputusan: Tolak H₀ jika p-value < 0.05
model_null <- multinom(Y ~ 1, data = df, maxit = 200)
## # weights:  6 (2 variable)
## initial  value 764.634153 
## final  value 702.650827 
## converged
lrt       <- deviance(model_null) - deviance(model)
df_lrt    <- attr(logLik(model), "df") - attr(logLik(model_null), "df")
p_value_lrt <- pchisq(lrt, df = df_lrt, lower.tail = FALSE)

hasil_serentak <- data.frame(
  Statistik = c("G² (Chi-Square)", "Derajat Bebas", "P-value", "Keputusan"),
  Nilai     = c(round(lrt, 4), df_lrt, round(p_value_lrt, 4),
                ifelse(p_value_lrt < 0.05, "Tolak H0 - Model Signifikan", "Gagal Tolak H0"))
)

hasil_serentak %>%
  kable(caption = "Tabel 3. Hasil Uji Serentak (Likelihood Ratio Test)") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Tabel 3. Hasil Uji Serentak (Likelihood Ratio Test)
Statistik Nilai
G² (Chi-Square) 78.3594
Derajat Bebas 28
P-value 0
Keputusan Tolak H0 - Model Signifikan

Langkah 7: Uji Parsial (Wald Test)

Hipotesis:

  • H₀: βi = 0
  • H₁: βi ≠ 0
  • Keputusan: Tolak H₀ jika p-value < 0.05
z           <- summary(model)$coefficients / summary(model)$standard.errors
p_value_wald <- (1 - pnorm(abs(z))) * 2

cat("=== Nilai Wald (Z) ===\n")
## === Nilai Wald (Z) ===
print(round(z, 4))
##        (Intercept) PhysHealth2 PhysHealth3 PhysHealth4 PhysHealth5 Employment2
## Sedang      1.8562      1.1460      0.8949      1.4535      0.6591     -0.2064
## Banyak      1.2070      0.5832      1.3965      2.1599      1.0682      0.5501
##        Employment3 Employment4 Medication1 SleepMed2 SleepMed3   Race2   Race3
## Sedang      0.2711     -1.8413      1.3417   -1.0532   -1.5990  0.5785 -1.9884
## Banyak      1.3727      0.6257      2.2928   -1.4993   -2.2635 -0.7914 -0.5397
##          Race4   Race5
## Sedang -1.4495 53.3983
## Banyak -2.7834 51.8988
cat("\n=== P-value ===\n")
## 
## === P-value ===
print(round(p_value_wald, 4))
##        (Intercept) PhysHealth2 PhysHealth3 PhysHealth4 PhysHealth5 Employment2
## Sedang      0.0634      0.2518      0.3708      0.1461      0.5098      0.8364
## Banyak      0.2274      0.5598      0.1626      0.0308      0.2854      0.5822
##        Employment3 Employment4 Medication1 SleepMed2 SleepMed3  Race2  Race3
## Sedang      0.7863      0.0656      0.1797    0.2922    0.1098 0.5629 0.0468
## Banyak      0.1699      0.5315      0.0219    0.1338    0.0236 0.4287 0.5894
##         Race4 Race5
## Sedang 0.1472     0
## Banyak 0.0054     0

Langkah 8: Uji Kesesuaian Model (Pseudo R² Nagelkerke)

hasil_nagelkerke <- nagelkerke(model)
print(hasil_nagelkerke)
## $Models
##          
## Model: NA
## Null:  NA
## 
## $Pseudo.R.squared.for.model.vs.null
##                              Pseudo.R.squared
## McFadden                                   NA
## Cox and Snell (ML)                         NA
## Nagelkerke (Cragg and Uhler)               NA
## 
## $Likelihood.ratio.test
##  Df.diff LogLik.diff Chisq p.value
##       NA          NA    NA      NA
## 
## $Number.of.observations
##          
## Model: NA
## Null:  NA
## 
## $Messages
## [1] "This function will work with lm, gls, lme, lmer, glmer, glm, negbin, zeroinfl, glmmTMB, nls, clm, clmm, and vglm"
## 
## $Warnings
## [1] "None"
r2 <- hasil_nagelkerke$Pseudo.R.squared.for.model.vs.null[3]
cat("\nNagelkerke R²:", round(r2, 4), "\n")
## 
## Nagelkerke R²: NA
cat("Model mampu menjelaskan", round(r2 * 100, 2), "% variasi variabel respon\n")
## Model mampu menjelaskan NA % variasi variabel respon

Langkah 9: Odds Ratio

Odds Ratio = exp(β). Interpretasi:

  • OR > 1: kecenderungan masuk kategori j lebih tinggi dibanding referensi (Sedikit)
  • OR < 1: kecenderungan masuk kategori j lebih rendah dibanding referensi (Sedikit)
odds_ratio <- exp(coef(model))

odds_ratio %>%
  round(4) %>%
  kable(caption = "Tabel 4. Odds Ratio (exp(β))") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = TRUE)
Tabel 4. Odds Ratio (exp(β))
(Intercept) PhysHealth2 PhysHealth3 PhysHealth4 PhysHealth5 Employment2 Employment3 Employment4 Medication1 SleepMed2 SleepMed3 Race2 Race3 Race4 Race5
Sedang 8.9180 1.6545 1.4818 2.1039 1.7042 0.9014 1.1104 0.2082 2.7984 0.2852 0.1845 1.2781 0.2966 0.5746 907263.1
Banyak 4.4821 1.3729 2.1155 3.6358 2.5516 1.4112 1.9807 1.6419 5.8083 0.1663 0.0932 0.6736 0.7344 0.2327 617207.8

Langkah 10: Akurasi Model (Confusion Matrix)

pred        <- predict(model, newdata = df)
conf_matrix <- table(Prediksi = pred, Aktual = df$Y)

conf_matrix %>%
  kable(caption = "Tabel 5. Confusion Matrix") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Tabel 5. Confusion Matrix
Sedikit Sedang Banyak
Sedikit 5 4 4
Sedang 113 336 167
Banyak 8 23 36
akurasi <- mean(pred == df$Y)
cat("\nAkurasi Model:", round(akurasi * 100, 2), "%\n")
## 
## Akurasi Model: 54.17 %

Kesimpulan

## Ringkasan Hasil Analisis:
## - Total observasi     : 696
## - Variabel signifikan : PhysHealth, Employment, Medication, SleepMed, Race
## - Kategori referensi  : Sedikit (Y=1)
## - Model logit         : 2 model (Sedang vs Sedikit, Banyak vs Sedikit)
## - Akurasi model       : 54.17 %

Kelas 2024C | Dosen Pengampu: Ulfa Siti Nuraini, S.Stat., M.Stat