Load Library

library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.3
## Warning: package 'tidyr' was built under R version 4.5.3
## Warning: package 'purrr' was built under R version 4.5.3
## Warning: package 'dplyr' was built under R version 4.5.3
## Warning: package 'forcats' was built under R version 4.5.3
## Warning: package 'lubridate' was built under R version 4.5.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.0     ✔ readr     2.2.0
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.2     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(MASS)
## Warning: package 'MASS' was built under R version 4.5.3
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
library(car)
## Warning: package 'car' was built under R version 4.5.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.5.3
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some
library(brant)
## Warning: package 'brant' was built under R version 4.5.3
library(knitr)
## Warning: package 'knitr' was built under R version 4.5.3
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 4.5.3
## 
## Attaching package: 'kableExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows

Import Data

data_raw <- read.csv("C:/Users/Lenovo/Downloads/children anemia.csv/children anemia.csv")
names(data_raw) <- make.names(names(data_raw))
str(data_raw)
## 'data.frame':    33924 obs. of  17 variables:
##  $ Age.in.5.year.groups                                                 : chr  "40-44" "35-39" "25-29" "25-29" ...
##  $ Type.of.place.of.residence                                           : chr  "Urban" "Urban" "Urban" "Urban" ...
##  $ Highest.educational.level                                            : chr  "Higher" "Higher" "Higher" "Secondary" ...
##  $ Wealth.index.combined                                                : chr  "Richest" "Richest" "Richest" "Richest" ...
##  $ Births.in.last.five.years                                            : int  1 1 1 1 1 1 2 2 1 1 ...
##  $ Age.of.respondent.at.1st.birth                                       : int  22 28 26 25 21 30 32 32 32 19 ...
##  $ Hemoglobin.level.adjusted.for.altitude.and.smoking..g.dl...1.decimal.: num  NA NA NA 95 NA 113 121 121 NA 108 ...
##  $ Anemia.level                                                         : chr  "" "" "" "Moderate" ...
##  $ Have.mosquito.bed.net.for.sleeping..from.household.questionnaire.    : chr  "Yes" "Yes" "No" "Yes" ...
##  $ Smokes.cigarettes                                                    : chr  "No" "No" "No" "No" ...
##  $ Current.marital.status                                               : chr  "Living with partner" "Married" "Married" "Married" ...
##  $ Currently.residing.with.husband.partner                              : chr  "Staying elsewhere" "Living with her" "Living with her" "Living with her" ...
##  $ When.child.put.to.breast                                             : chr  "Immediately" "Hours: 1" "Immediately" "105.0" ...
##  $ Had.fever.in.last.two.weeks                                          : chr  "No" "No" "No" "No" ...
##  $ Hemoglobin.level.adjusted.for.altitude..g.dl...1.decimal.            : num  NA NA NA 114 NA 119 102 NA NA 113 ...
##  $ Anemia.level.1                                                       : chr  "" "" "" "Not anemic" ...
##  $ Taking.iron.pills..sprinkles.or.syrup                                : chr  "Yes" "No" "No" "No" ...
head(data)
##                                                                             
## 1 function (..., list = character(), package = NULL, lib.loc = NULL,        
## 2     verbose = getOption("verbose"), envir = .GlobalEnv, overwrite = TRUE) 
## 3 {                                                                         
## 4     fileExt <- function(x) {                                              
## 5         db <- grepl("\\\\.[^.]+\\\\.(gz|bz2|xz)$", x)                     
## 6         ans <- sub(".*\\\\.", "", x)

Data Selection & Cleaning

data_sub <- data_raw[, c(
  "Age.in.5.year.groups",
  "Type.of.place.of.residence",
  "Highest.educational.level",
  "Wealth.index.combined",
  "Births.in.last.five.years",
  "Age.of.respondent.at.1st.birth",
  "Smokes.cigarettes",
  "Taking.iron.pills..sprinkles.or.syrup",
  "Anemia.level.1"
)]

colnames(data_sub) <- c("AgeGroup", "Residence", "Education", "Wealth",
                        "Births", "AgeFirstBirth", "Smoking", "Iron", "Anemia")

data_clean <- na.omit(data_sub)
data_clean <- data_clean[data_clean$Anemia != "", ]

cat("Jumlah observasi:", nrow(data_clean))
## Jumlah observasi: 10182

Setelah dilakukan seleksi variabel dan pembersihan data menggunakan na.omit() serta penghapusan baris dengan nilai kosong pada variabel Anemia, diperoleh 10.182 observasi yang siap dianalisis dari total 33.924 data awal. Variabel yang digunakan mencakup delapan variabel independen dan satu variabel dependen berupa tingkat anemia anak.

Transformasi Variabel

data_clean$Anemia <- factor(data_clean$Anemia,
                            levels  = c("Not anemic", "Mild", "Moderate", "Severe"),
                            ordered = TRUE)

data_clean$AgeGroup  <- factor(data_clean$AgeGroup)
data_clean$Residence <- factor(data_clean$Residence)
data_clean$Education <- factor(data_clean$Education,
                               levels = c("No education", "Primary",
                                          "Secondary", "Higher"))
data_clean$Wealth    <- factor(data_clean$Wealth,
                               levels = c("Poorest", "Poorer", "Middle",
                                          "Richer", "Richest"))
data_clean$Smoking   <- factor(data_clean$Smoking)
data_clean$Iron      <- factor(data_clean$Iron)

summary(data_clean)
##   AgeGroup    Residence           Education        Wealth         Births     
##  15-19: 340   Rural:6219   No education:3896   Poorest:2042   Min.   :1.000  
##  20-24:1752   Urban:3963   Primary     :1733   Poorer :2036   1st Qu.:1.000  
##  25-29:2865                Secondary   :3644   Middle :2249   Median :2.000  
##  30-34:2398                Higher      : 909   Richer :2141   Mean   :1.776  
##  35-39:1810                                    Richest:1714   3rd Qu.:2.000  
##  40-44: 733                                                   Max.   :6.000  
##  45-49: 284                                                                  
##  AgeFirstBirth   Smoking             Iron             Anemia    
##  Min.   :12.00   No :10158   Don't know:  31   Not anemic:3179  
##  1st Qu.:17.00   Yes:   24   No        :8113   Mild      :2754  
##  Median :19.00               Yes       :2038   Moderate  :3927  
##  Mean   :19.96                                 Severe    : 322  
##  3rd Qu.:22.00                                                  
##  Max.   :45.00                                                  
## 

Statistik Deskriptif

Tabel Frekuensi Anemia

freq_anemia <- table(data_clean$Anemia)
pct_anemia  <- round(prop.table(freq_anemia) * 100, 2)

data.frame(
  "Tingkat Anemia" = names(freq_anemia),
  "Frekuensi"      = as.integer(freq_anemia),
  "Persentase (%)" = as.numeric(pct_anemia)
) |>
  kable(caption = "Tabel 1. Distribusi Tingkat Anemia Anak", align = "lcc") |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE)
Tabel 1. Distribusi Tingkat Anemia Anak
Tingkat.Anemia Frekuensi Persentase….
Not anemic 3179 31.22
Mild 2754 27.05
Moderate 3927 38.57
Severe 322 3.16

Berdasarkan Tabel 1, kategori anemia sedang (Moderate) mendominasi dengan 3.927 kasus (38,57%), diikuti tidak anemia (Not anemic) sebanyak 3.179 kasus (31,22%), anemia ringan (Mild) sebanyak 2.754 kasus (27,05%), dan anemia parah (Severe) sebanyak 322 kasus (3,16%). Hasil ini menunjukkan bahwa lebih dari dua pertiga sampel (68,78%) masih berada dalam kondisi anemia dengan berbagai tingkat keparahan, sehingga perlu diidentifikasi faktor-faktor yang berkontribusi terhadap kondisi tersebut.

Visualisasi Distribusi Anemia

ggplot(data_clean, aes(x = Anemia, fill = Anemia)) +
  geom_bar(color = "white", width = 0.6) +
  geom_text(stat = "count", aes(label = after_stat(count)),
            vjust = -0.4, fontface = "bold", size = 4) +
  scale_fill_manual(values = c(
    "Not anemic" = "#27ae60", "Mild"     = "#f1c40f",
    "Moderate"   = "#e67e22", "Severe"   = "#c0392b")) +
  labs(title = "Gambar 1. Distribusi Tingkat Anemia pada Anak",
       x = "Tingkat Anemia", y = "Frekuensi") +
  theme_minimal(base_size = 13) +
  theme(legend.position = "none",
        plot.title = element_text(hjust = 0.5, face = "bold"))

Anemia vs Pendidikan & Indeks Kekayaan

ggplot(data_clean, aes(x = Education, fill = Anemia)) +
  geom_bar(position = "fill", color = "white") +
  scale_fill_manual(values = c(
    "Not anemic" = "#27ae60", "Mild"   = "#f1c40f",
    "Moderate"   = "#e67e22", "Severe" = "#c0392b")) +
  scale_y_continuous(labels = scales::percent) +
  labs(title = "Gambar 2. Proporsi Anemia berdasarkan Tingkat Pendidikan Ibu",
       x = "Tingkat Pendidikan", y = "Proporsi", fill = "Anemia") +
  theme_minimal(base_size = 12) +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"))

ggplot(data_clean, aes(x = Wealth, fill = Anemia)) +
  geom_bar(position = "fill", color = "white") +
  scale_fill_manual(values = c(
    "Not anemic" = "#27ae60", "Mild"   = "#f1c40f",
    "Moderate"   = "#e67e22", "Severe" = "#c0392b")) +
  scale_y_continuous(labels = scales::percent) +
  labs(title = "Gambar 3. Proporsi Anemia berdasarkan Indeks Kekayaan",
       x = "Indeks Kekayaan", y = "Proporsi", fill = "Anemia") +
  theme_minimal(base_size = 12) +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"))

Boxplot Variabel Numerik

ggplot(data_clean, aes(x = Anemia, y = AgeFirstBirth, fill = Anemia)) +
  geom_boxplot(outlier.colour = "red", outlier.size = 1.5) +
  scale_fill_manual(values = c(
    "Not anemic" = "#27ae60", "Mild"   = "#f1c40f",
    "Moderate"   = "#e67e22", "Severe" = "#c0392b")) +
  labs(title = "Gambar 4. Usia Ibu saat Kelahiran Pertama berdasarkan Tingkat Anemia",
       x = "Tingkat Anemia", y = "Usia (tahun)") +
  theme_minimal(base_size = 12) +
  theme(legend.position = "none",
        plot.title = element_text(hjust = 0.5, face = "bold"))

ggplot(data_clean, aes(x = Anemia, y = Births, fill = Anemia)) +
  geom_boxplot(outlier.colour = "red", outlier.size = 1.5) +
  scale_fill_manual(values = c(
    "Not anemic" = "#27ae60", "Mild"   = "#f1c40f",
    "Moderate"   = "#e67e22", "Severe" = "#c0392b")) +
  labs(title = "Gambar 5. Jumlah Kelahiran 5 Tahun Terakhir berdasarkan Tingkat Anemia",
       x = "Tingkat Anemia", y = "Jumlah Kelahiran") +
  theme_minimal(base_size = 12) +
  theme(legend.position = "none",
        plot.title = element_text(hjust = 0.5, face = "bold"))

Uji Asumsi

1. Linearitas

data_clean |>
  mutate(Anemia_num = as.numeric(Anemia)) |>
  ggplot(aes(x = AgeFirstBirth, y = Anemia_num)) +
  geom_jitter(alpha = 0.1, color = "steelblue", height = 0.1) +
  geom_smooth(method = "loess", color = "red", se = FALSE) +
  labs(title = "Gambar 6. Linearitas: Usia Ibu saat Kelahiran Pertama vs Anemia",
       x = "Usia saat Kelahiran Pertama", y = "Anemia (numerik)") +
  theme_minimal(base_size = 12) +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"))
## `geom_smooth()` using formula = 'y ~ x'

data_clean |>
  mutate(Anemia_num = as.numeric(Anemia)) |>
  ggplot(aes(x = Births, y = Anemia_num)) +
  geom_jitter(alpha = 0.1, color = "darkorange", height = 0.1) +
  geom_smooth(method = "loess", color = "red", se = FALSE) +
  labs(title = "Gambar 7. Linearitas: Jumlah Kelahiran vs Anemia",
       x = "Jumlah Kelahiran 5 Tahun Terakhir", y = "Anemia (numerik)") +
  theme_minimal(base_size = 12) +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"))
## `geom_smooth()` using formula = 'y ~ x'
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : pseudoinverse used at 0.975
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : neighborhood radius 1.025
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : reciprocal condition number 1.9648e-28
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : There are other near singularities as well. 1

2. Independensi

cat("Data berasal dari survei cross-sectional.",
    "Setiap baris merepresentasikan satu individu yang berbeda.")
## Data berasal dari survei cross-sectional. Setiap baris merepresentasikan satu individu yang berbeda.

3. No Multikolinearitas (VIF)

model_vif <- lm(as.numeric(Anemia) ~ AgeGroup + Residence + Education +
                  Wealth + Births + AgeFirstBirth + Smoking + Iron,
                data = data_clean)

vif_result <- vif(model_vif)

data.frame(
  Variabel = rownames(vif_result),
  GVIF     = round(vif_result[, 1], 3),
  Df       = vif_result[, 2],
  "GVIF^(1/(2*Df))" = round(vif_result[, 3], 3)
) |>
  kable(caption = "Tabel 2. Hasil Uji VIF", align = "lccc",
        row.names = FALSE) |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE)
Tabel 2. Hasil Uji VIF
Variabel GVIF Df GVIF..1..2.Df..
AgeGroup 1.308 6 1.023
Residence 1.381 1 1.175
Education 2.243 3 1.144
Wealth 2.259 4 1.107
Births 1.058 1 1.028
AgeFirstBirth 1.559 1 1.249
Smoking 1.002 1 1.001
Iron 1.059 2 1.014

Berdasarkan Tabel 2, seluruh variabel independen memiliki nilai GVIF^(1/(2*Df)) di bawah 2, dengan nilai tertinggi pada variabel AgeFirstBirth (1,249) dan Residence (1,175). Nilai ini jauh di bawah ambang batas 5, sehingga dapat disimpulkan bahwa tidak terdapat masalah multikolinearitas yang serius di antara variabel-variabel prediktor dalam model ini.

4. No Outlier

par(mfrow = c(1, 2))
boxplot(data_clean$AgeFirstBirth,
        main = "Usia saat Kelahiran Pertama",
        ylab = "Usia (tahun)", col = "#AED6F1", outcol = "red")
boxplot(data_clean$Births,
        main = "Jumlah Kelahiran",
        ylab = "Jumlah", col = "#A9DFBF", outcol = "red")

par(mfrow = c(1, 1))

cat("Outlier AgeFirstBirth:", length(boxplot.stats(data_clean$AgeFirstBirth)$out), "\n")
## Outlier AgeFirstBirth: 374
cat("Outlier Births       :", length(boxplot.stats(data_clean$Births)$out), "\n")
## Outlier Births       : 94

Hasil deteksi outlier menggunakan metode IQR menunjukkan terdapat 374 outlier pada variabel AgeFirstBirth dan 94 outlier pada variabel Births. Keberadaan outlier pada kedua variabel numerik ini perlu diperhatikan karena regresi logistik cukup sensitif terhadap nilai ekstrem. Namun mengingat jumlah sampel yang besar (10.182 observasi), pengaruh outlier terhadap kestabilan model diharapkan relatif minimal.

Regresi Logistik Ordinal

# model
model_ordinal <- polr(Anemia ~ AgeGroup + Residence + Education +
                        Wealth + Births + AgeFirstBirth + Smoking + Iron,
                      data   = data_clean,
                      Hess   = TRUE,
                      method = "logistic")

summary(model_ordinal)
## Call:
## polr(formula = Anemia ~ AgeGroup + Residence + Education + Wealth + 
##     Births + AgeFirstBirth + Smoking + Iron, data = data_clean, 
##     Hess = TRUE, method = "logistic")
## 
## Coefficients:
##                        Value Std. Error  t value
## AgeGroup20-24      -0.423954   0.114207  -3.7122
## AgeGroup25-29      -0.540237   0.112025  -4.8225
## AgeGroup30-34      -0.557616   0.114018  -4.8906
## AgeGroup35-39      -0.574403   0.116753  -4.9198
## AgeGroup40-44      -0.673217   0.127845  -5.2659
## AgeGroup45-49      -0.797031   0.155830  -5.1147
## ResidenceUrban     -0.072405   0.043982  -1.6462
## EducationPrimary   -0.078599   0.056691  -1.3864
## EducationSecondary -0.186025   0.054858  -3.3911
## EducationHigher    -0.636043   0.088714  -7.1696
## WealthPoorer       -0.155671   0.059759  -2.6050
## WealthMiddle       -0.444837   0.062920  -7.0699
## WealthRicher       -0.492400   0.069538  -7.0810
## WealthRichest      -0.851143   0.080861 -10.5260
## Births              0.076044   0.027799   2.7355
## AgeFirstBirth      -0.006456   0.005183  -1.2456
## SmokingYes         -0.058682   0.372079  -0.1577
## IronNo             -0.231436   0.330821  -0.6996
## IronYes            -0.192720   0.332957  -0.5788
## 
## Intercepts:
##                 Value    Std. Error t value 
## Not anemic|Mild  -2.1200   0.3581    -5.9206
## Mild|Moderate    -0.9380   0.3576    -2.6232
## Moderate|Severe   2.2288   0.3604     6.1839
## 
## Residual Deviance: 23660.20 
## AIC: 23704.20
# estimasi Parameter & P-value
ctable  <- coef(summary(model_ordinal))
p_value <- pnorm(abs(ctable[, "t value"]), lower.tail = FALSE) * 2
result  <- cbind(ctable, "p value" = round(p_value, 4))

result_df <- as.data.frame(round(result, 4))
result_df$Sig <- ifelse(result_df$`p value` < 0.001, "***",
                 ifelse(result_df$`p value` < 0.01,  "**",
                 ifelse(result_df$`p value` < 0.05,  "*", "")))

result_df |>
  kable(caption = "Tabel 3. Estimasi Parameter Model Ordinal Logistik",
        align = "lcccc") |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE) |>
  footnote(general = "*** p<0.001  ** p<0.01  * p<0.05")
Tabel 3. Estimasi Parameter Model Ordinal Logistik
Value Std. Error t value p value Sig
AgeGroup20-24 -0.4240 0.1142 -3.7122 0.0002 ***
AgeGroup25-29 -0.5402 0.1120 -4.8225 0.0000 ***
AgeGroup30-34 -0.5576 0.1140 -4.8906 0.0000 ***
AgeGroup35-39 -0.5744 0.1168 -4.9198 0.0000 ***
AgeGroup40-44 -0.6732 0.1278 -5.2659 0.0000 ***
AgeGroup45-49 -0.7970 0.1558 -5.1147 0.0000 ***
ResidenceUrban -0.0724 0.0440 -1.6462 0.0997
EducationPrimary -0.0786 0.0567 -1.3864 0.1656
EducationSecondary -0.1860 0.0549 -3.3911 0.0007 ***
EducationHigher -0.6360 0.0887 -7.1696 0.0000 ***
WealthPoorer -0.1557 0.0598 -2.6050 0.0092 **
WealthMiddle -0.4448 0.0629 -7.0699 0.0000 ***
WealthRicher -0.4924 0.0695 -7.0810 0.0000 ***
WealthRichest -0.8511 0.0809 -10.5260 0.0000 ***
Births 0.0760 0.0278 2.7355 0.0062 **
AgeFirstBirth -0.0065 0.0052 -1.2456 0.2129
SmokingYes -0.0587 0.3721 -0.1577 0.8747
IronNo -0.2314 0.3308 -0.6996 0.4842
IronYes -0.1927 0.3330 -0.5788 0.5627
Not anemic&#124;Mild -2.1200 0.3581 -5.9206 0.0000 ***
Mild&#124;Moderate -0.9380 0.3576 -2.6232 0.0087 **
Moderate&#124;Severe 2.2288 0.3604 6.1839 0.0000 ***
Note:
*** p<0.001 ** p<0.01 * p<0.05

Odds Ratio & Confidence Interval

OR <- exp(cbind("Odds Ratio" = coef(model_ordinal), confint(model_ordinal)))
## Waiting for profiling to be done...
as.data.frame(round(OR, 4)) |>
  kable(caption = "Tabel 4. Odds Ratio dan Confidence Interval 95%",
        align = "lccc") |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE)
Tabel 4. Odds Ratio dan Confidence Interval 95%
Odds Ratio 2.5 % 97.5 %
AgeGroup20-24 0.6545 0.5227 0.8179
AgeGroup25-29 0.5826 0.4673 0.7250
AgeGroup30-34 0.5726 0.4575 0.7154
AgeGroup35-39 0.5630 0.4474 0.7072
AgeGroup40-44 0.5101 0.3967 0.6548
AgeGroup45-49 0.4507 0.3319 0.6114
ResidenceUrban 0.9302 0.8534 1.0139
EducationPrimary 0.9244 0.8273 1.0331
EducationSecondary 0.8303 0.7456 0.9245
EducationHigher 0.5294 0.4448 0.6298
WealthPoorer 0.8558 0.7612 0.9622
WealthMiddle 0.6409 0.5665 0.7250
WealthRicher 0.6112 0.5332 0.7003
WealthRichest 0.4269 0.3642 0.5001
Births 1.0790 1.0218 1.1394
AgeFirstBirth 0.9936 0.9835 1.0037
SmokingYes 0.9430 0.4538 1.9711
IronNo 0.7934 0.4116 1.5144
IronYes 0.8247 0.4261 1.5809

Uji Proportional Odds (Brant Test)

brant(model_ordinal)
## ---------------------------------------------------- 
## Test for     X2  df  probability 
## ---------------------------------------------------- 
## Omnibus          60.78   38  0.01
## AgeGroup20-24        0.64    2   0.73
## AgeGroup25-29        0.71    2   0.7
## AgeGroup30-34        0.63    2   0.73
## AgeGroup35-39        0.95    2   0.62
## AgeGroup40-44        0.68    2   0.71
## AgeGroup45-49        6.39    2   0.04
## ResidenceUrban       12.59   2   0
## EducationPrimary 0.45    2   0.8
## EducationSecondary   0.67    2   0.71
## EducationHigher  2.58    2   0.27
## WealthPoorer     1.67    2   0.43
## WealthMiddle     3.33    2   0.19
## WealthRicher     0.36    2   0.84
## WealthRichest        3.03    2   0.22
## Births           4.27    2   0.12
## AgeFirstBirth        5.45    2   0.07
## SmokingYes       0.04    2   0.98
## IronNo           0.02    2   0.99
## IronYes          0   2   1
## ---------------------------------------------------- 
## 
## H0: Parallel Regression Assumption holds
## Warning in brant(model_ordinal): 5468 combinations in table(dv,ivs) do not
## occur. Because of that, the test results might be invalid.

Hasil uji Brant menunjukkan nilai p pada uji Omnibus sebesar 0,01 (< 0,05), sehingga H₀ ditolak. Hal ini mengindikasikan bahwa asumsi proportional odds secara keseluruhan tidak terpenuhi. Namun apabila diperiksa per variabel, sebagian besar variabel individual memenuhi asumsi ini (p > 0,05), kecuali ResidenceUrban (p = 0,00) dan AgeGroup45-49 (p = 0,04). Pelanggaran asumsi pada dua variabel tersebut perlu dicatat sebagai keterbatasan model, meskipun model ordinal logistik tetap digunakan mengingat pendekatan ini masih lebih tepat dibandingkan regresi biner untuk variabel dependen ordinal.

Evaluasi Model

1. Uji Serentak

model_null <- polr(Anemia ~ 1, data = data_clean,
                   Hess = TRUE, method = "logistic")

# Statistik G (Chi-Square)
G_stat <- -2 * (as.numeric(logLik(model_null)) - 
                as.numeric(logLik(model_ordinal)))
df_diff <- length(coef(model_ordinal)) - length(coef(model_null))
p_serentak <- pchisq(G_stat, df = df_diff, lower.tail = FALSE)

cat("Statistik G (Chi-Square) :", round(G_stat, 4), "\n")
## Statistik G (Chi-Square) : 650.0718
cat("Derajat Bebas            :", df_diff, "\n")
## Derajat Bebas            : 19
cat("p-value                  :", round(p_serentak, 6), "\n")
## p-value                  : 0

Uji serentak dilakukan untuk menguji apakah secara simultan seluruh variabel prediktor berpengaruh signifikan terhadap model. Berdasarkan hasil pengujian, diperoleh nilai statistik G sebesar 650,07 dengan derajat bebas 19 dan p-value sebesar 0,000. Karena p-value < 0,05, maka H₀ ditolak, yang berarti minimal terdapat satu variabel prediktor yang berpengaruh signifikan terhadap tingkat anemia anak. Hal ini menunjukkan bahwa model regresi logistik ordinal yang dibangun secara keseluruhan lebih baik dibandingkan model tanpa prediktor (model null).

2. Uji Kesesuaian Model (Goodness of Fit)

obs_table <- table(data_clean$Anemia)
pred_probs <- predict(model_ordinal, data_clean, type = "probs")
pred_expected <- colSums(pred_probs)

gof_df <- data.frame(
  Kategori   = names(obs_table),
  Observasi  = as.integer(obs_table),
  Ekspektasi = round(pred_expected, 2),
  Chi_sq_kontribusi = round((as.integer(obs_table) - pred_expected)^2 / 
                              pred_expected, 4)
)

kable(gof_df,
      caption = "Tabel 5. Uji Kesesuaian Model (Goodness of Fit)",
      align = "lccc") |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE)
Tabel 5. Uji Kesesuaian Model (Goodness of Fit)
Kategori Observasi Ekspektasi Chi_sq_kontribusi
Not anemic Not anemic 3179 3168.26 0.0364
Mild Mild 2754 2750.72 0.0039
Moderate Moderate 3927 3940.16 0.0439
Severe Severe 322 322.86 0.0023
chi_sq_gof <- sum(gof_df$Chi_sq_kontribusi)
df_gof     <- nrow(gof_df) - 1
p_gof      <- pchisq(chi_sq_gof, df = df_gof, lower.tail = FALSE)

cat("Chi-Square GoF :", round(chi_sq_gof, 4), "\n")
## Chi-Square GoF : 0.0865
cat("Derajat Bebas  :", df_gof, "\n")
## Derajat Bebas  : 3
cat("p-value        :", round(p_gof, 6), "\n")
## p-value        : 0.993407

Uji kesesuaian model dilakukan untuk mengetahui apakah terdapat perbedaan yang signifikan antara hasil observasi dengan hasil prediksi model. Berdasarkan Tabel 5, nilai frekuensi ekspektasi pada setiap kategori sangat mendekati nilai observasinya — misalnya kategori Not anemic memiliki observasi 3.179 dan ekspektasi 3.168,26. Nilai statistik Chi-Square Goodness of Fit yang diperoleh adalah 0,0865 dengan derajat bebas 3 dan p-value sebesar 0,9934. Karena p-value > 0,05, maka H₀ gagal ditolak, sehingga dapat disimpulkan bahwa model telah sesuai dengan data observasi dan tidak terdapat perbedaan yang signifikan antara nilai pengamatan dengan prediksi model.

3. Confusion Matrix & Akurasi (APER)

pred_ord <- predict(model_ordinal, data_clean)

conf_ord <- table(Prediksi = as.character(pred_ord),
                  Aktual   = as.character(data_clean$Anemia))
print(conf_ord)
##             Aktual
## Prediksi     Mild Moderate Not anemic Severe
##   Moderate   1882     3118       1853    292
##   Not anemic  872      809       1326     30
# Hitung APER
n_misklasifikasi <- sum(conf_ord) - sum(diag(conf_ord))
n_total          <- sum(conf_ord)
APER_ord         <- n_misklasifikasi / n_total
akurasi_ord      <- 1 - APER_ord

cat("Jumlah Misklasifikasi :", n_misklasifikasi, "\n")
## Jumlah Misklasifikasi : 7491
cat("Total Observasi       :", n_total, "\n")
## Total Observasi       : 10182
cat("APER                  :", round(APER_ord * 100, 2), "%\n")
## APER                  : 73.57 %
cat("Akurasi               :", round(akurasi_ord * 100, 2), "%\n")
## Akurasi               : 26.43 %

Berdasarkan confusion matrix, model regresi logistik ordinal hanya mampu memprediksi dua kategori yaitu Moderate dan Not anemic, sementara kategori Mild dan Severe tidak terprediksi sama sekali. Hal ini disebabkan oleh ketidakseimbangan distribusi kelas (class imbalance) di mana kategori Moderate mendominasi data. Dari total 10.182 observasi, terdapat 7.491 observasi yang salah diklasifikasikan, menghasilkan nilai APER sebesar 73,57% dan akurasi sebesar 26,43%. Nilai APER yang tinggi ini menunjukkan bahwa model ordinal logistik kurang optimal dalam melakukan klasifikasi pada data dengan distribusi kelas yang tidak seimbang ini.

Linear Discriminant Analysis (LDA)

#1. Model LDA

data_lda <- data_clean |>
  mutate(
    AgeGroup_num  = as.numeric(AgeGroup),
    Residence_num = as.numeric(Residence),
    Education_num = as.numeric(Education),
    Wealth_num    = as.numeric(Wealth),
    Smoking_num   = as.numeric(Smoking),
    Iron_num      = as.numeric(Iron)
  )

model_lda <- lda(Anemia ~ AgeGroup_num + Residence_num + Education_num +
                   Wealth_num + Births + AgeFirstBirth +
                   Smoking_num + Iron_num,
                 data  = data_lda,
                 prior = rep(1/4, 4))

print(model_lda)
## Call:
## lda(Anemia ~ AgeGroup_num + Residence_num + Education_num + Wealth_num + 
##     Births + AgeFirstBirth + Smoking_num + Iron_num, data = data_lda, 
##     prior = rep(1/4, 4))
## 
## Prior probabilities of groups:
## Not anemic       Mild   Moderate     Severe 
##       0.25       0.25       0.25       0.25 
## 
## Group means:
##            AgeGroup_num Residence_num Education_num Wealth_num   Births
## Not anemic     3.802454      1.469959      2.396036   3.313621 1.750865
## Mild           3.709150      1.396877      2.200073   3.017066 1.762164
## Moderate       3.572702      1.336134      1.962821   2.659791 1.807996
## Severe         3.521739      1.173913      1.695652   2.195652 1.754658
##            AgeFirstBirth Smoking_num Iron_num
## Not anemic      20.69959    1.002202 2.213904
## Mild            20.16013    1.002179 2.210240
## Moderate        19.35549    1.002801 2.179526
## Severe          18.27950    1.000000 2.133540
## 
## Coefficients of linear discriminants:
##                       LD1         LD2         LD3
## AgeGroup_num  -0.09843872  0.33595020 -0.14780167
## Residence_num -0.48524190 -1.49211674 -0.86785303
## Education_num -0.22460322  0.52792060 -0.39131868
## Wealth_num    -0.42721649  0.18157113  0.08109289
## Births        -0.02679520 -0.74570784 -0.32025851
## AgeFirstBirth -0.04300790 -0.07813034  0.12905147
## Smoking_num   -1.19009436 -4.82632842 -0.28087161
## Iron_num      -0.04495379 -0.53211026  1.90139408
## 
## Proportion of trace:
##    LD1    LD2    LD3 
## 0.9641 0.0330 0.0029

2. Proporsi Varians

data.frame(
  "Fungsi Diskriminan"  = paste0("LD", seq_along(model_lda$svd)),
  "Proporsi Varians (%)" = round(model_lda$svd^2 / sum(model_lda$svd^2) * 100, 2)
) |>
  kable(caption = "Tabel 5. Proporsi Varians Fungsi Diskriminan",
        align = "lc") |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE)
Tabel 5. Proporsi Varians Fungsi Diskriminan
Fungsi.Diskriminan Proporsi.Varians….
LD1 96.41
LD2 3.30
LD3 0.29

Berdasarkan Tabel 5, fungsi diskriminan pertama (LD1) mampu menjelaskan 96,41% dari total variansi antarkelas, sementara LD2 hanya menjelaskan 3,30% dan LD3 sebesar 0,29%. Hal ini menunjukkan bahwa LD1 mendominasi pemisahan antar kategori tingkat anemia, sehingga hampir seluruh informasi pembeda antar kelompok sudah tertangkap oleh fungsi diskriminan pertama saja.

3. Confusion Matrix & Akurasi LDA

pred_lda <- predict(model_lda, data_lda)

conf_lda <- table(Prediksi = pred_lda$class,
                  Aktual   = data_lda$Anemia)
print(conf_lda)
##             Aktual
## Prediksi     Not anemic Mild Moderate Severe
##   Not anemic       1687 1206     1283     64
##   Mild              223  208      280     26
##   Moderate          344  294      501     26
##   Severe            925 1046     1863    206
accuracy_lda <- mean(as.character(pred_lda$class) == as.character(data_lda$Anemia))
cat("Akurasi LDA:", round(accuracy_lda * 100, 2), "%\n")
## Akurasi LDA: 25.55 %

Berdasarkan confusion matrix LDA, model banyak mengklasifikasikan observasi ke kategori Severe (total 4.040 prediksi) meskipun aktualnya sebagian besar bukan Severe. Sebaliknya, kategori yang seharusnya Severe (n=322) hanya 206 yang berhasil diklasifikasikan dengan benar. Akurasi model LDA sebesar 25,55% dengan APER 74,45%, yang berarti hampir tiga perempat dari seluruh observasi salah diklasifikasikan. Rendahnya akurasi ini mengindikasikan bahwa fungsi diskriminan linier kesulitan memisahkan keempat kategori anemia secara akurat, kemungkinan karena asumsi distribusi normal multivariat dan kehomogenan matriks kovarians antarkelas tidak sepenuhnya terpenuhi pada data ini.

4. Plot Fungsi Diskriminan

as.data.frame(pred_lda$x) |>
  mutate(Anemia = data_lda$Anemia) |>
  ggplot(aes(x = LD1, y = LD2, color = Anemia)) +
  geom_point(alpha = 0.4, size = 1.5) +
  scale_color_manual(values = c(
    "Not anemic" = "#27ae60", "Mild"   = "#f1c40f",
    "Moderate"   = "#e67e22", "Severe" = "#c0392b")) +
  stat_ellipse(level = 0.75, linewidth = 0.8) +
  labs(title = "Gambar 8. Plot Fungsi Diskriminan LDA (LD1 vs LD2)",
       x = "LD1", y = "LD2", color = "Tingkat Anemia") +
  theme_minimal(base_size = 12) +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"))

Perbandingan Model

# Hitung APER LDA
n_mis_lda  <- sum(conf_lda) - sum(diag(conf_lda))
APER_lda   <- n_mis_lda / sum(conf_lda)
akurasi_lda_fix <- 1 - APER_lda

data.frame(
  Metode        = c("Regresi Logistik Ordinal", 
                    "Linear Discriminant Analysis (LDA)"),
  "Akurasi (%)" = c(round(akurasi_ord * 100, 2),
                    round(akurasi_lda_fix * 100, 2)),
  "APER (%)"    = c(round(APER_ord * 100, 2),
                    round(APER_lda * 100, 2))
) |>
  kable(caption = "Tabel 6. Perbandingan Akurasi dan APER Metode Klasifikasi",
        align = "lcc") |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE)
Tabel 6. Perbandingan Akurasi dan APER Metode Klasifikasi
Metode Akurasi…. APER….
Regresi Logistik Ordinal 26.43 73.57
Linear Discriminant Analysis (LDA) 25.55 74.45

Berdasarkan Tabel 6, kedua metode menghasilkan akurasi yang rendah — Regresi Logistik Ordinal sebesar 26,43% (APER 73,57%) dan LDA sebesar 25,55% (APER 74,45%). Meskipun selisihnya kecil, Regresi Logistik Ordinal sedikit lebih unggul dibandingkan LDA. Rendahnya akurasi pada kedua model kemungkinan besar disebabkan oleh ketidakseimbangan distribusi kelas pada data, di mana kategori Moderate mendominasi sehingga model cenderung memprediksi ke kategori mayoritas. Selain itu, LDA mensyaratkan variabel prediktor bertipe numerik dan berdistribusi normal multivariat, sementara sebagian besar variabel dalam penelitian ini bersifat kategorik yang dikonversi menjadi numerik, sehingga asumsi LDA tidak sepenuhnya terpenuhi. Dengan mempertimbangkan kesesuaian metode terhadap karakteristik data, Regresi Logistik Ordinal tetap menjadi pilihan yang lebih tepat karena secara eksplisit mampu mengakomodasi struktur ordinal pada variabel dependen tingkat anemia anak.