Anggota :

Link Dataset : https://archive.ics.uci.edu/dataset/544/estimation+of+obesity+levels+based+on+eating+habits+and+physical+condition

Import Library

library(nnet)      
## Warning: package 'nnet' was built under R version 4.5.3
library(car)       
## Warning: package 'car' was built under R version 4.5.3
## Loading required package: carData
library(ggplot2)   
## Warning: package 'ggplot2' was built under R version 4.5.3
library(dplyr)     
## Warning: package 'dplyr' was built under R version 4.5.3
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(caret)     
## Warning: package 'caret' was built under R version 4.5.3
## Loading required package: lattice
library(corrplot) 
## corrplot 0.95 loaded
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
library(klaR)
## Warning: package 'klaR' was built under R version 4.5.3

Import Data

data <- read.csv("ObesityDataSet.csv",
                 stringsAsFactors = FALSE)

head(data)
##   Gender Age Height Weight family_history_with_overweight FAVC FCVC NCP
## 1 Female  21   1.62   64.0                            yes   no    2   3
## 2 Female  21   1.52   56.0                            yes   no    3   3
## 3   Male  23   1.80   77.0                            yes   no    2   3
## 4   Male  27   1.80   87.0                             no   no    3   3
## 5   Male  22   1.78   89.8                             no   no    2   1
## 6   Male  29   1.62   53.0                             no  yes    2   3
##        CAEC SMOKE CH2O SCC FAF TUE       CALC                MTRANS
## 1 Sometimes    no    2  no   0   1         no Public_Transportation
## 2 Sometimes   yes    3 yes   3   0  Sometimes Public_Transportation
## 3 Sometimes    no    2  no   2   1 Frequently Public_Transportation
## 4 Sometimes    no    2  no   2   0 Frequently               Walking
## 5 Sometimes    no    2  no   0   0  Sometimes Public_Transportation
## 6 Sometimes    no    2  no   0   0  Sometimes            Automobile
##            NObeyesdad
## 1       Normal_Weight
## 2       Normal_Weight
## 3       Normal_Weight
## 4  Overweight_Level_I
## 5 Overweight_Level_II
## 6       Normal_Weight
cat("Dimensi awal:", dim(data), "\n")
## Dimensi awal: 2111 17
str(data)
## 'data.frame':    2111 obs. of  17 variables:
##  $ Gender                        : chr  "Female" "Female" "Male" "Male" ...
##  $ Age                           : num  21 21 23 27 22 29 23 22 24 22 ...
##  $ Height                        : num  1.62 1.52 1.8 1.8 1.78 1.62 1.5 1.64 1.78 1.72 ...
##  $ Weight                        : num  64 56 77 87 89.8 53 55 53 64 68 ...
##  $ family_history_with_overweight: chr  "yes" "yes" "yes" "no" ...
##  $ FAVC                          : chr  "no" "no" "no" "no" ...
##  $ FCVC                          : num  2 3 2 3 2 2 3 2 3 2 ...
##  $ NCP                           : num  3 3 3 3 1 3 3 3 3 3 ...
##  $ CAEC                          : chr  "Sometimes" "Sometimes" "Sometimes" "Sometimes" ...
##  $ SMOKE                         : chr  "no" "yes" "no" "no" ...
##  $ CH2O                          : num  2 3 2 2 2 2 2 2 2 2 ...
##  $ SCC                           : chr  "no" "yes" "no" "no" ...
##  $ FAF                           : num  0 3 2 2 0 0 1 3 1 1 ...
##  $ TUE                           : num  1 0 1 0 0 0 0 0 1 1 ...
##  $ CALC                          : chr  "no" "Sometimes" "Frequently" "Frequently" ...
##  $ MTRANS                        : chr  "Public_Transportation" "Public_Transportation" "Public_Transportation" "Walking" ...
##  $ NObeyesdad                    : chr  "Normal_Weight" "Normal_Weight" "Normal_Weight" "Overweight_Level_I" ...

Preprocessing Data

1. Hapus Kolom yang Tidak Digunakan

data$Gender <- NULL
data$Weight <- NULL

cat("Kolom setelah penghapusan:\n")
## Kolom setelah penghapusan:
print(colnames(data))
##  [1] "Age"                            "Height"                        
##  [3] "family_history_with_overweight" "FAVC"                          
##  [5] "FCVC"                           "NCP"                           
##  [7] "CAEC"                           "SMOKE"                         
##  [9] "CH2O"                           "SCC"                           
## [11] "FAF"                            "TUE"                           
## [13] "CALC"                           "MTRANS"                        
## [15] "NObeyesdad"

Gender dan Weight dihapus karena sangat berkorelasi dengan label.

2. Ubah Tipe Data ke Faktor

data$NObeyesdad <- as.factor(data$NObeyesdad)

kategorik_cols <- c("family_history_with_overweight", "FAVC", "CAEC",
                    "SMOKE", "SCC", "CALC", "MTRANS")

for (col in kategorik_cols) {
  if (col %in% colnames(data)) {
    data[[col]] <- as.factor(data[[col]])
  }
}

cat("Tipe data setelah konversi faktor:\n")
## Tipe data setelah konversi faktor:
str(data)
## 'data.frame':    2111 obs. of  15 variables:
##  $ Age                           : num  21 21 23 27 22 29 23 22 24 22 ...
##  $ Height                        : num  1.62 1.52 1.8 1.8 1.78 1.62 1.5 1.64 1.78 1.72 ...
##  $ family_history_with_overweight: Factor w/ 2 levels "no","yes": 2 2 2 1 1 1 2 1 2 2 ...
##  $ FAVC                          : Factor w/ 2 levels "no","yes": 1 1 1 1 1 2 2 1 2 2 ...
##  $ FCVC                          : num  2 3 2 3 2 2 3 2 3 2 ...
##  $ NCP                           : num  3 3 3 3 1 3 3 3 3 3 ...
##  $ CAEC                          : Factor w/ 4 levels "Always","Frequently",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ SMOKE                         : Factor w/ 2 levels "no","yes": 1 2 1 1 1 1 1 1 1 1 ...
##  $ CH2O                          : num  2 3 2 2 2 2 2 2 2 2 ...
##  $ SCC                           : Factor w/ 2 levels "no","yes": 1 2 1 1 1 1 1 1 1 1 ...
##  $ FAF                           : num  0 3 2 2 0 0 1 3 1 1 ...
##  $ TUE                           : num  1 0 1 0 0 0 0 0 1 1 ...
##  $ CALC                          : Factor w/ 4 levels "Always","Frequently",..: 3 4 2 2 4 4 4 4 2 3 ...
##  $ MTRANS                        : Factor w/ 5 levels "Automobile","Bike",..: 4 4 4 5 4 1 3 4 4 4 ...
##  $ NObeyesdad                    : Factor w/ 7 levels "Insufficient_Weight",..: 2 2 2 6 7 2 2 2 2 2 ...

3. Konversi Kolom Numerik

numerik_cols <- c("Age", "Height", "FCVC", "NCP", "CH2O", "TUE")

for (col in numerik_cols) {
  if (col %in% colnames(data)) {
    data[[col]] <- as.numeric(data[[col]])
  }
}

cat("Tipe data setelah konversi numerik:\n")
## Tipe data setelah konversi numerik:
str(data)
## 'data.frame':    2111 obs. of  15 variables:
##  $ Age                           : num  21 21 23 27 22 29 23 22 24 22 ...
##  $ Height                        : num  1.62 1.52 1.8 1.8 1.78 1.62 1.5 1.64 1.78 1.72 ...
##  $ family_history_with_overweight: Factor w/ 2 levels "no","yes": 2 2 2 1 1 1 2 1 2 2 ...
##  $ FAVC                          : Factor w/ 2 levels "no","yes": 1 1 1 1 1 2 2 1 2 2 ...
##  $ FCVC                          : num  2 3 2 3 2 2 3 2 3 2 ...
##  $ NCP                           : num  3 3 3 3 1 3 3 3 3 3 ...
##  $ CAEC                          : Factor w/ 4 levels "Always","Frequently",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ SMOKE                         : Factor w/ 2 levels "no","yes": 1 2 1 1 1 1 1 1 1 1 ...
##  $ CH2O                          : num  2 3 2 2 2 2 2 2 2 2 ...
##  $ SCC                           : Factor w/ 2 levels "no","yes": 1 2 1 1 1 1 1 1 1 1 ...
##  $ FAF                           : num  0 3 2 2 0 0 1 3 1 1 ...
##  $ TUE                           : num  1 0 1 0 0 0 0 0 1 1 ...
##  $ CALC                          : Factor w/ 4 levels "Always","Frequently",..: 3 4 2 2 4 4 4 4 2 3 ...
##  $ MTRANS                        : Factor w/ 5 levels "Automobile","Bike",..: 4 4 4 5 4 1 3 4 4 4 ...
##  $ NObeyesdad                    : Factor w/ 7 levels "Insufficient_Weight",..: 2 2 2 6 7 2 2 2 2 2 ...

4. Cek & Hapus Missing Value

cat("Jumlah NA :\n")
## Jumlah NA :
print(colSums(is.na(data)))
##                            Age                         Height 
##                              0                              0 
## family_history_with_overweight                           FAVC 
##                              0                              0 
##                           FCVC                            NCP 
##                              0                              0 
##                           CAEC                          SMOKE 
##                              0                              0 
##                           CH2O                            SCC 
##                              0                              0 
##                            FAF                            TUE 
##                              0                              0 
##                           CALC                         MTRANS 
##                              0                              0 
##                     NObeyesdad 
##                              0

5. Filter Kelas dengan Frekuensi Cukup (≥ 30 Observasi)

cat("Frekuensi NObeyesdad sebelum filter:\n")
## Frekuensi NObeyesdad sebelum filter:
print(sort(table(data$NObeyesdad), decreasing = TRUE))
## 
##      Obesity_Type_I    Obesity_Type_III     Obesity_Type_II  Overweight_Level_I 
##                 351                 324                 297                 290 
## Overweight_Level_II       Normal_Weight Insufficient_Weight 
##                 290                 287                 272
genre_valid <- names(table(data$NObeyesdad))[table(data$NObeyesdad) >= 30]
data <- data %>% filter(NObeyesdad %in% genre_valid)
data$NObeyesdad <- droplevels(data$NObeyesdad)

cat("\nKelas yang digunakan (", nlevels(data$NObeyesdad), "kelas):\n")
## 
## Kelas yang digunakan ( 7 kelas):
print(levels(data$NObeyesdad))
## [1] "Insufficient_Weight" "Normal_Weight"       "Obesity_Type_I"     
## [4] "Obesity_Type_II"     "Obesity_Type_III"    "Overweight_Level_I" 
## [7] "Overweight_Level_II"

Langkah ini memastikan model lebih stabil, mengurangi noise, dan mempermudah interpretasi.

6. Transformasi FAF & Sederhanakan Variabel Kategorik Bermasalah

cat("\nDistribusi FAF sebelum kategorisasi:\n")
## 
## Distribusi FAF sebelum kategorisasi:
print(summary(data$FAF))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.1245  1.0000  1.0103  1.6667  3.0000
data$FAF_cat <- cut(data$FAF,
                    breaks = c(-Inf, 0, 1, 2, Inf),
                    labels = c("Tidak_Aktif", "Rendah", "Sedang", "Tinggi"),
                    right  = TRUE)
data$FAF_cat <- factor(data$FAF_cat,
                       levels = c("Tidak_Aktif", "Rendah", "Sedang", "Tinggi"),
                       ordered = FALSE)
data$FAF <- NULL

# Cek distribusi sebelum transformasi kategorik
cat("\nDistribusi FAF_cat setelah kategorisasi:\n")
## 
## Distribusi FAF_cat setelah kategorisasi:
print(table(data$FAF_cat))
## 
## Tidak_Aktif      Rendah      Sedang      Tinggi 
##         411         834         673         193
cat("(Proporsi):\n")
## (Proporsi):
print(round(prop.table(table(data$FAF_cat)) * 100, 2))
## 
## Tidak_Aktif      Rendah      Sedang      Tinggi 
##       19.47       39.51       31.88        9.14
cat("\nDistribusi SMOKE:\n"); print(table(data$SMOKE))
## 
## Distribusi SMOKE:
## 
##   no  yes 
## 2067   44
cat("Distribusi SCC:\n");    print(table(data$SCC))
## Distribusi SCC:
## 
##   no  yes 
## 2015   96
cat("Distribusi CAEC:\n");   print(table(data$CAEC))
## Distribusi CAEC:
## 
##     Always Frequently         no  Sometimes 
##         53        242         51       1765
cat("Distribusi CALC:\n");   print(table(data$CALC))
## Distribusi CALC:
## 
##     Always Frequently         no  Sometimes 
##          1         70        639       1401
cat("Distribusi MTRANS:\n"); print(table(data$MTRANS))
## Distribusi MTRANS:
## 
##            Automobile                  Bike             Motorbike 
##                   457                     7                    11 
## Public_Transportation               Walking 
##                  1580                    56
# Hapus SMOKE & SCC
data$SMOKE <- NULL
data$SCC   <- NULL

# Binarisasi CAEC: makan di luar waktu makan (1) vs tidak (0)
data$CAEC_bin <- ifelse(data$CAEC %in% c("Sometimes", "Frequently", "Always"), 1, 0)
data$CAEC_bin <- as.factor(data$CAEC_bin)
data$CAEC     <- NULL

# Binarisasi CALC: konsumsi alkohol (1) vs tidak (0)
data$CALC_bin <- ifelse(data$CALC != "no", 1, 0)
data$CALC_bin <- as.factor(data$CALC_bin)
data$CALC     <- NULL

# Binarisasi MTRANS: transportasi umum (1) vs lainnya (0)
data$MTRANS_bin <- ifelse(data$MTRANS == "Public_Transportation", 1, 0)
data$MTRANS_bin <- as.factor(data$MTRANS_bin)
data$MTRANS     <- NULL

cat("\nDistribusi CAEC_bin:\n");  print(table(data$CAEC_bin))
## 
## Distribusi CAEC_bin:
## 
##    0    1 
##   51 2060
cat("Distribusi CALC_bin:\n");   print(table(data$CALC_bin))
## Distribusi CALC_bin:
## 
##    0    1 
##  639 1472
cat("Distribusi MTRANS_bin:\n"); print(table(data$MTRANS_bin))
## Distribusi MTRANS_bin:
## 
##    0    1 
##  531 1580

Transformasi ini membuat variabel lebih informatif dan mudah dipakai dalam regresi logistik.

7. Cek Struktur Akhir

str(data)
## 'data.frame':    2111 obs. of  13 variables:
##  $ Age                           : num  21 21 23 27 22 29 23 22 24 22 ...
##  $ Height                        : num  1.62 1.52 1.8 1.8 1.78 1.62 1.5 1.64 1.78 1.72 ...
##  $ family_history_with_overweight: Factor w/ 2 levels "no","yes": 2 2 2 1 1 1 2 1 2 2 ...
##  $ FAVC                          : Factor w/ 2 levels "no","yes": 1 1 1 1 1 2 2 1 2 2 ...
##  $ FCVC                          : num  2 3 2 3 2 2 3 2 3 2 ...
##  $ NCP                           : num  3 3 3 3 1 3 3 3 3 3 ...
##  $ CH2O                          : num  2 3 2 2 2 2 2 2 2 2 ...
##  $ TUE                           : num  1 0 1 0 0 0 0 0 1 1 ...
##  $ NObeyesdad                    : Factor w/ 7 levels "Insufficient_Weight",..: 2 2 2 6 7 2 2 2 2 2 ...
##  $ FAF_cat                       : Factor w/ 4 levels "Tidak_Aktif",..: 1 4 3 3 1 1 2 4 2 2 ...
##  $ CAEC_bin                      : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ CALC_bin                      : Factor w/ 2 levels "0","1": 1 2 2 2 2 2 2 2 2 1 ...
##  $ MTRANS_bin                    : Factor w/ 2 levels "0","1": 2 2 2 1 2 1 1 2 2 2 ...
summary(data)
##       Age            Height      family_history_with_overweight  FAVC     
##  Min.   :14.00   Min.   :1.450   no : 385                       no : 245  
##  1st Qu.:19.95   1st Qu.:1.630   yes:1726                       yes:1866  
##  Median :22.78   Median :1.700                                            
##  Mean   :24.31   Mean   :1.702                                            
##  3rd Qu.:26.00   3rd Qu.:1.768                                            
##  Max.   :61.00   Max.   :1.980                                            
##                                                                           
##       FCVC            NCP             CH2O            TUE        
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :0.0000  
##  1st Qu.:2.000   1st Qu.:2.659   1st Qu.:1.585   1st Qu.:0.0000  
##  Median :2.386   Median :3.000   Median :2.000   Median :0.6253  
##  Mean   :2.419   Mean   :2.686   Mean   :2.008   Mean   :0.6579  
##  3rd Qu.:3.000   3rd Qu.:3.000   3rd Qu.:2.477   3rd Qu.:1.0000  
##  Max.   :3.000   Max.   :4.000   Max.   :3.000   Max.   :2.0000  
##                                                                  
##                NObeyesdad         FAF_cat    CAEC_bin CALC_bin MTRANS_bin
##  Insufficient_Weight:272   Tidak_Aktif:411   0:  51   0: 639   0: 531    
##  Normal_Weight      :287   Rendah     :834   1:2060   1:1472   1:1580    
##  Obesity_Type_I     :351   Sedang     :673                               
##  Obesity_Type_II    :297   Tinggi     :193                               
##  Obesity_Type_III   :324                                                 
##  Overweight_Level_I :290                                                 
##  Overweight_Level_II:290

Eksplorasi Data (EDA)

1. Distribusi Label Target (NObeyesdad)

ggplot(data, aes(x = forcats::fct_infreq(NObeyesdad), fill = NObeyesdad)) +
  geom_bar() +
  labs(title = "Distribusi Kategori Obesitas (Label Target)",
       x = "Obesity Level", y = "Frekuensi") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 30, hjust = 1),
        legend.position = "none")

Distribusi obesitas cukup hampir merata antar kelas.

2. Distribusi Age per Kelas Obesitas

ggplot(data, aes(x = NObeyesdad, y = Age, fill = NObeyesdad)) +
  geom_boxplot() +
  labs(title = "Distribusi Age per Tingkat Obesitas",
       x = "Obesity Level", y = "Age") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 30, hjust = 1),
        legend.position = "none")

3. Distribusi Height per Kelas Obesitas

ggplot(data, aes(x = NObeyesdad, y = Height, fill = NObeyesdad)) +
  geom_boxplot() +
  labs(title = "Distribusi Height per Tingkat Obesitas",
       x = "Obesity Level", y = "Height (m)") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 30, hjust = 1),
        legend.position = "none")

Tinggi badan tidak terlalu berbeda antar kelas.

4. Distribusi FAF_cat per Kelas Obesitas

ggplot(data, aes(x = NObeyesdad, fill = FAF_cat)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  labs(title = "Proporsi Kategori FAF per Tingkat Obesitas",
       x = "Obesity Level", y = "Proporsi", fill = "Aktivitas Fisik") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 30, hjust = 1))

Ada indikasi hubungan antara usia & aktivitas fisik dengan kateogri obesitas.

Uji Asumsi

1. Linearitas Log-Odds

Multinomial Logistic Regression mengasumsikan hubungan linear antara prediktor numerik dengan log-odds. Pengujian dilakukan melalui tiga cara: visualisasi GAM smoother, uji formal Box-Tidwell, dan perbandingan AIC.

a) Visualisasi GAM Smoother

Jika kurva GAM mendekati garis lurus, asumsi linearitas terpenuhi.

ggplot(data, aes(x = Age, y = as.numeric(NObeyesdad))) +
  geom_point(alpha = 0.1, color = "steelblue") +
  geom_smooth(method = "gam", color = "red", se = TRUE) +
  labs(title = "Linearitas: Age vs NObeyesdad",
       x = "Age", y = "NObeyesdad (numerik)") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ s(x, bs = "cs")'

ggplot(data, aes(x = Height, y = as.numeric(NObeyesdad))) +
  geom_point(alpha = 0.1, color = "steelblue") +
  geom_smooth(method = "gam", color = "red", se = TRUE) +
  labs(title = "Linearitas: Height vs NObeyesdad",
       x = "Height", y = "NObeyesdad (numerik)") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ s(x, bs = "cs")'

b) Uji Box-Tidwell (Uji Formal)

Ditambahkan interaksi X·ln(X) pada model binomial. Jika semua koefisien interaksi tidak signifikan (p > 0.05), asumsi linearitas log-odds terpenuhi.

data_bt <- data %>%
  mutate(
    Age_lnAge       = ifelse(Age > 0,    Age    * log(Age),    0),
    Height_lnHeight = ifelse(Height > 0, Height * log(Height), 0)
  )
 
kelas_bt  <- levels(data_bt$NObeyesdad)[1:2]
data_bt2  <- data_bt %>%
  filter(NObeyesdad %in% kelas_bt) %>%
  mutate(NObeyesdad = droplevels(NObeyesdad))
 
bt_model <- glm(NObeyesdad ~ Age + Height + Age_lnAge + Height_lnHeight,
                data   = data_bt2,
                family = binomial)
 
cat("=== Hasil Box-Tidwell ===\n")
## === Hasil Box-Tidwell ===
print(summary(bt_model)$coefficients)
##                    Estimate Std. Error    z value  Pr(>|z|)
## (Intercept)     -37.5002920 48.4199329 -0.7744805 0.4386467
## Age               0.4068502  0.8182804  0.4972014 0.6190470
## Height           30.3619239 43.6619077  0.6953870 0.4868128
## Age_lnAge        -0.0610457  0.1977858 -0.3086455 0.7575912
## Height_lnHeight -20.5912055 28.6659258 -0.7183164 0.4725622
cat("\n--- Interpretasi ---\n")
## 
## --- Interpretasi ---
bt_coef   <- summary(bt_model)$coefficients
interaksi <- c("Age_lnAge", "Height_lnHeight")
 
for (var in interaksi) {
  p      <- bt_coef[var, "Pr(>|z|)"]
  status <- ifelse(p > 0.05,
                   "✓ Tidak signifikan → Linearitas terpenuhi",
                   "✗ Signifikan → Linearitas TIDAK terpenuhi")
  cat(sprintf("%-20s : p = %.4f  %s\n", var, p, status))
}
## Age_lnAge            : p = 0.7576  ✓ Tidak signifikan → Linearitas terpenuhi
## Height_lnHeight      : p = 0.4726  ✓ Tidak signifikan → Linearitas terpenuhi

Box-Tidwell menunjukkan p-value > 0.05 → linearitas terpenuhi.

c) Perbandingan AIC: Model Linear vs Kuadratik

AIC lebih kecil menunjukkan model yang lebih baik. Jika model kuadratik memiliki AIC lebih kecil, maka Age² perlu dimasukkan ke model final.

model_linear  <- multinom(
  NObeyesdad ~ Age + Height + FCVC + NCP + CH2O + TUE,
  data = data, trace = FALSE)

model_kuadrat <- multinom(
  NObeyesdad ~ Age + I(Age^2) + Height + FCVC + NCP + CH2O + TUE,
  data = data, trace = FALSE)

aic_compare <- AIC(model_linear, model_kuadrat)
cat("=== Perbandingan AIC Linear vs Kuadratik ===\n")
## === Perbandingan AIC Linear vs Kuadratik ===
print(aic_compare)
##               df      AIC
## model_linear  42 6292.374
## model_kuadrat 48 6051.577
if (aic_compare["model_kuadrat", "AIC"] < aic_compare["model_linear", "AIC"]) {
  cat("\nKesimpulan: Model kuadratik lebih baik → Age² dimasukkan ke model final.\n")
} else {
  cat("\nKesimpulan: Model linear sudah cukup.\n")
}
## 
## Kesimpulan: Model kuadratik lebih baik → Age² dimasukkan ke model final.

2. Independensi Observasi

Asumsi independensi pada Multinomial Logistic Regression mensyaratkan bahwa setiap observasi tidak saling memengaruhi satu sama lain. Uji statistik seperti Durbin-Watson tidak relevan di sini karena dirancang untuk data deret waktu (time series), bukan data cross-sectional.

Dataset ini merupakan data cross-sectional di mana setiap baris merepresentasikan satu individu yang berbeda dari survei populasi di Mexico, Peru, dan Colombia. Tidak ada struktur pengulangan (repeated measures), klaster, maupun urutan waktu antar observasi. Oleh karena itu, asumsi independensi observasi dianggap terpenuhi secara by design berdasarkan karakteristik pengumpulan data.

cat("=== Verifikasi Independensi Observasi ===\n\n")
## === Verifikasi Independensi Observasi ===
cat(sprintf("Jumlah total observasi    : %d\n", nrow(data)))
## Jumlah total observasi    : 2111
cat(sprintf("Jumlah variabel           : %d\n", ncol(data)))
## Jumlah variabel           : 13
cat("\nJenis data                : Cross-sectional\n")
## 
## Jenis data                : Cross-sectional
cat("Sumber data               : Survei individu (Mexico, Peru, Colombia)\n")
## Sumber data               : Survei individu (Mexico, Peru, Colombia)
cat("Struktur repeated measures : Tidak ada\n")
## Struktur repeated measures : Tidak ada
cat("Struktur time series      : Tidak ada\n\n")
## Struktur time series      : Tidak ada
cat("Kesimpulan: Setiap observasi merepresentasikan individu yang berbeda.\n")
## Kesimpulan: Setiap observasi merepresentasikan individu yang berbeda.
cat("Asumsi independensi observasi terpenuhi by design.\n")
## Asumsi independensi observasi terpenuhi by design.

3. No Multikolinieritas

a) Matriks Korelasi Prediktor Numerik

numerik    <- data[, sapply(data, is.numeric)]
cor_matrix <- cor(numerik)

cat("Matriks Korelasi:\n")
## Matriks Korelasi:
print(round(cor_matrix, 3))
##           Age Height   FCVC    NCP   CH2O    TUE
## Age     1.000 -0.026  0.016 -0.044 -0.045 -0.297
## Height -0.026  1.000 -0.038  0.244  0.213  0.052
## FCVC    0.016 -0.038  1.000  0.042  0.068 -0.101
## NCP    -0.044  0.244  0.042  1.000  0.057  0.036
## CH2O   -0.045  0.213  0.068  0.057  1.000  0.012
## TUE    -0.297  0.052 -0.101  0.036  0.012  1.000
corrplot(cor_matrix, method = "color", type = "upper",
         addCoef.col = "black", tl.col = "black",
         title = "Matriks Korelasi Prediktor Numerik",
         mar = c(0, 0, 1.5, 0))

b) Variance Inflation Factor (VIF)

VIF < 5 menunjukkan tidak ada multikolinieritas yang mengkhawatirkan. VIF < 10 masih dapat diterima.

data$NObeyesdad_num <- as.numeric(data$NObeyesdad)
model_vif  <- lm(NObeyesdad_num ~ Age + Height + FCVC + NCP + CH2O + TUE,
                 data = data)
vif_result <- vif(model_vif)
data$NObeyesdad_num <- NULL

cat("Nilai VIF:\n")
## Nilai VIF:
print(round(vif_result, 4))
##    Age Height   FCVC    NCP   CH2O    TUE 
## 1.1001 1.1165 1.0210 1.0682 1.0561 1.1104
cat("\nKesimpulan:\n")
## 
## Kesimpulan:
if (all(vif_result < 5)) {
  cat("Semua VIF < 5 → Tidak ada multikolinieritas.\n")
} else if (all(vif_result < 10)) {
  cat("Semua VIF < 10 → Multikolinieritas masih dapat diterima.\n")
} else {
  cat("Terdapat VIF ≥ 10 → Ada multikolinieritas serius.\n")
}
## Semua VIF < 5 → Tidak ada multikolinieritas.

4. No Outlier (Mahalanobis Distance)

Outlier dihapus secara iteratif menggunakan Jarak Mahalanobis dengan cutoff Chi-Square (df = jumlah variabel numerik, α = 0.025) hingga tidak ada outlier tersisa.

data_bersih <- data
iterasi     <- 0

repeat {
  numerik_iter <- data_bersih[, c("Age", "Height", "FCVC", "NCP",
                                   "CH2O", "TUE")]
  md_iter      <- mahalanobis(numerik_iter, colMeans(numerik_iter), cov(numerik_iter))
  cutoff       <- qchisq(0.975, df = ncol(numerik_iter))
  outlier_idx  <- which(md_iter > cutoff)
  iterasi      <- iterasi + 1

  cat(sprintf("Iterasi %d: %d outlier ditemukan (n = %d)\n",
              iterasi, length(outlier_idx), nrow(data_bersih)))

  if (length(outlier_idx) == 0) break

  data_bersih <- data_bersih[-outlier_idx, ]
  data_bersih$NObeyesdad <- droplevels(data_bersih$NObeyesdad)
}
## Iterasi 1: 45 outlier ditemukan (n = 2111)
## Iterasi 2: 15 outlier ditemukan (n = 2066)
## Iterasi 3: 11 outlier ditemukan (n = 2051)
## Iterasi 4: 5 outlier ditemukan (n = 2040)
## Iterasi 5: 5 outlier ditemukan (n = 2035)
## Iterasi 6: 5 outlier ditemukan (n = 2030)
## Iterasi 7: 7 outlier ditemukan (n = 2025)
## Iterasi 8: 3 outlier ditemukan (n = 2018)
## Iterasi 9: 1 outlier ditemukan (n = 2015)
## Iterasi 10: 0 outlier ditemukan (n = 2014)
cat(sprintf("\nTotal iterasi            : %d\n",   iterasi))
## 
## Total iterasi            : 10
cat(sprintf("Data sebelum pembersihan : %d baris\n", nrow(data)))
## Data sebelum pembersihan : 2111 baris
cat(sprintf("Data setelah pembersihan : %d baris\n", nrow(data_bersih)))
## Data setelah pembersihan : 2014 baris
cat(sprintf("Cutoff Mahalanobis       : %.4f\n",   cutoff))
## Cutoff Mahalanobis       : 14.4494

Visualisasi Mahalanobis Distance (Data Final)

numerik_final <- data_bersih[, c("Age", "Height", "FCVC", "NCP",
                                  "CH2O", "TUE")]
md_final      <- mahalanobis(numerik_final, colMeans(numerik_final), cov(numerik_final))

df_md <- data.frame(index   = 1:length(md_final),
                    md      = md_final,
                    outlier = md_final > cutoff)

ggplot(df_md, aes(x = index, y = md, color = outlier)) +
  geom_point(alpha = 0.5, size = 1) +
  geom_hline(yintercept = cutoff, linetype = "dashed", color = "red", linewidth = 1) +
  scale_color_manual(values = c("FALSE" = "steelblue", "TRUE" = "tomato"),
                     labels = c("Normal", "Outlier")) +
  labs(title = "Mahalanobis Distance per Observasi (Setelah Iterasi)",
       x = "Index Observasi", y = "Mahalanobis Distance", color = "") +
  theme_minimal()

Boxplot Sebelum vs Sesudah Hapus Outlier

par(mfrow = c(2, 3))
boxplot(data$Age,
        main = "Age — Sebelum",       col = "salmon",     ylab = "Age")
boxplot(data$Height,
        main = "Height — Sebelum",    col = "salmon",     ylab = "Height")
boxplot(data_bersih$Age,
        main = "Age — Sesudah",       col = "lightgreen", ylab = "Age")
boxplot(data_bersih$Height,
        main = "Height — Sesudah",    col = "lightgreen", ylab = "Height")
par(mfrow = c(1, 1))

Pemodelan Multinomial Logistic Regression (MLR)

Split Data: Train & Test

set.seed(42)
train_idx  <- createDataPartition(data_bersih$NObeyesdad, p = 0.8, list = FALSE)
data_train <- data_bersih[train_idx, ]
data_test  <- data_bersih[-train_idx, ]

cat(sprintf("Train : %d baris\n", nrow(data_train)))
## Train : 1615 baris
cat(sprintf("Test  : %d baris\n", nrow(data_test)))
## Test  : 399 baris

Proporsi 80:20 cukup ideal untuk melatih dan menguji model.

Fit Model MLR Final

model_mlr <- multinom(
  NObeyesdad ~ Age + I(Age^2) + Height + FCVC + NCP + CH2O + TUE +
               family_history_with_overweight + FAVC +
               CAEC_bin + CALC_bin + MTRANS_bin,
  data  = data_train,
  trace = FALSE
)

summary(model_mlr)
## Call:
## multinom(formula = NObeyesdad ~ Age + I(Age^2) + Height + FCVC + 
##     NCP + CH2O + TUE + family_history_with_overweight + FAVC + 
##     CAEC_bin + CALC_bin + MTRANS_bin, data = data_train, trace = FALSE)
## 
## Coefficients:
##                     (Intercept)         Age      I(Age^2)     Height       FCVC
## Normal_Weight          4.573531 0.127660363 -0.0003343641  -1.787211 -0.7475402
## Obesity_Type_I       -25.067134 0.078825993  0.0037426884  -1.904553 -1.3798929
## Obesity_Type_II      -57.274689 1.804981827 -0.0225272225  13.417714 -0.4316788
## Obesity_Type_III    -170.610093 3.624229397 -0.0727387318 -12.370042 36.9828893
## Overweight_Level_I     4.224195 0.004711092  0.0030660449  -1.544368 -0.8485846
## Overweight_Level_II  -37.150873 0.462445989 -0.0013025969   1.798071 -1.1715509
##                            NCP        CH2O          TUE
## Normal_Weight       -0.4002787  0.00831136 -0.374074056
## Obesity_Type_I      -0.9002042  0.63594120 -0.473372488
## Obesity_Type_II     -0.3181165 -0.56555758 -0.600563999
## Obesity_Type_III     1.1356633  0.74693921  0.018273385
## Overweight_Level_I  -0.7605671  0.08721135 -0.451431958
## Overweight_Level_II -0.8724112  0.26922286  0.007242382
##                     family_history_with_overweightyes    FAVCyes  CAEC_bin1
## Normal_Weight                                0.708889 -0.7196488 -0.5515359
## Obesity_Type_I                               4.236942  1.3045824 24.6487568
## Obesity_Type_II                              6.449347  0.9132281 -0.6151412
## Obesity_Type_III                            11.191539  5.4294122  8.5022607
## Overweight_Level_I                           1.875362  0.8050023 -1.7203008
## Overweight_Level_II                          3.446367 -1.4222853 26.4682527
##                       CALC_bin1 MTRANS_bin1
## Normal_Weight        0.51766556 -0.54585441
## Obesity_Type_I      -0.18125116  0.69229184
## Obesity_Type_II      0.02405603  2.31563727
## Obesity_Type_III     5.58409670  4.92070724
## Overweight_Level_I   1.37062185 -0.02822878
## Overweight_Level_II  0.01794025  1.38886457
## 
## Std. Errors:
##                     (Intercept)       Age    I(Age^2)     Height       FCVC
## Normal_Weight       0.159953907 0.1127981 0.002588456 0.55200413 0.19290523
## Obesity_Type_I      0.176038654 0.1039280 0.002369678 0.44614481 0.22836795
## Obesity_Type_II     0.044149348 0.1133774 0.002563424 0.12340750 0.26237621
## Obesity_Type_III    0.009240034 0.1267781 0.003643962 0.02820446 0.02907381
## Overweight_Level_I  0.173026617 0.1070067 0.002441822 0.52028324 0.21053570
## Overweight_Level_II 0.172492990 0.1059026 0.002374754 0.46418777 0.23891415
##                           NCP      CH2O       TUE
## Normal_Weight       0.1349650 0.1744392 0.1712100
## Obesity_Type_I      0.1497365 0.2010087 0.1875128
## Obesity_Type_II     0.1969474 0.2301534 0.2228347
## Obesity_Type_III    0.4477882 0.3079764 0.3927350
## Overweight_Level_I  0.1422576 0.1911471 0.1846507
## Overweight_Level_II 0.1566331 0.2067987 0.1934222
##                     family_history_with_overweightyes    FAVCyes   CAEC_bin1
## Normal_Weight                             0.217456354 0.24586114 0.386079596
## Obesity_Type_I                            0.521355065 0.43760967 0.176038654
## Obesity_Type_II                           0.093476275 0.51612894 0.184478776
## Obesity_Type_III                          0.009274581 0.08221879 0.009240607
## Overweight_Level_I                        0.254791296 0.35562674 0.375623817
## Overweight_Level_II                       0.390980717 0.30560640 0.172492990
##                     CALC_bin1 MTRANS_bin1
## Normal_Weight       0.2144938  0.25760058
## Obesity_Type_I      0.2335660  0.32243632
## Obesity_Type_II     0.2930247  0.37107262
## Obesity_Type_III    0.0792453  0.09809171
## Overweight_Level_I  0.2548665  0.30071566
## Overweight_Level_II 0.2430038  0.34913991
## 
## Residual Deviance: 3705.015 
## AIC: 3861.015

Faktor usia, tinggi badan, pola makan, dan gaya hidup (aktivitas fisik, konsumsi sayur, alkohol, transportasi) adalah prediktor penting pada pengkategorian obesitas.

Evaluasi Model

1. Uji Signifikansi Koefisien (z-test)

z_vals <- summary(model_mlr)$coefficients / summary(model_mlr)$standard.errors
p_vals <- 2 * (1 - pnorm(abs(z_vals)))

cat("=== P-values Koefisien Model MLR ===\n")
## === P-values Koefisien Model MLR ===
print(round(p_vals, 4))
##                     (Intercept)    Age I(Age^2) Height   FCVC    NCP   CH2O
## Normal_Weight                 0 0.2577   0.8972 0.0012 0.0001 0.0030 0.9620
## Obesity_Type_I                0 0.4482   0.1142 0.0000 0.0000 0.0000 0.0016
## Obesity_Type_II               0 0.0000   0.0000 0.0000 0.0999 0.1063 0.0140
## Obesity_Type_III              0 0.0000   0.0000 0.0000 0.0000 0.0112 0.0153
## Overweight_Level_I            0 0.9649   0.2092 0.0030 0.0001 0.0000 0.6482
## Overweight_Level_II           0 0.0000   0.5833 0.0001 0.0000 0.0000 0.1930
##                        TUE family_history_with_overweightyes FAVCyes CAEC_bin1
## Normal_Weight       0.0289                            0.0011  0.0034    0.1531
## Obesity_Type_I      0.0116                            0.0000  0.0029    0.0000
## Obesity_Type_II     0.0070                            0.0000  0.0768    0.0009
## Obesity_Type_III    0.9629                            0.0000  0.0000    0.0000
## Overweight_Level_I  0.0145                            0.0000  0.0236    0.0000
## Overweight_Level_II 0.9701                            0.0000  0.0000    0.0000
##                     CALC_bin1 MTRANS_bin1
## Normal_Weight          0.0158      0.0341
## Obesity_Type_I         0.4377      0.0318
## Obesity_Type_II        0.9346      0.0000
## Obesity_Type_III       0.0000      0.0000
## Overweight_Level_I     0.0000      0.9252
## Overweight_Level_II    0.9411      0.0001

2. Pseudo R² McFadden

Nilai Pseudo R² > 0.2 menunjukkan model fit yang baik.

model_null  <- multinom(NObeyesdad ~ 1, data = data_train, trace = FALSE)
ll_null     <- as.numeric(logLik(model_null))
ll_model    <- as.numeric(logLik(model_mlr))
mcfadden_r2 <- 1 - (ll_model / ll_null)

cat(sprintf("Log-Likelihood Null  : %.4f\n", ll_null))
## Log-Likelihood Null  : -3138.3409
cat(sprintf("Log-Likelihood Model : %.4f\n", ll_model))
## Log-Likelihood Model : -1852.5074
cat(sprintf("Pseudo R² McFadden   : %.4f\n", mcfadden_r2))
## Pseudo R² McFadden   : 0.4097
if (mcfadden_r2 >= 0.2) {
  cat("Kesimpulan: Model fit baik (Pseudo R² ≥ 0.2).\n")
} else if (mcfadden_r2 >= 0.1) {
  cat("Kesimpulan: Model fit cukup (0.1 ≤ Pseudo R² < 0.2).\n")
} else {
  cat("Kesimpulan: Model fit lemah (Pseudo R² < 0.1).\n")
}
## Kesimpulan: Model fit baik (Pseudo R² ≥ 0.2).

3. Likelihood Ratio Test (Uji Omnibus)

lrt <- lrtest(model_null, model_mlr)
cat("=== Likelihood Ratio Test ===\n")
## === Likelihood Ratio Test ===
print(lrt)
## Likelihood ratio test
## 
## Model 1: NObeyesdad ~ 1
## Model 2: NObeyesdad ~ Age + I(Age^2) + Height + FCVC + NCP + CH2O + TUE + 
##     family_history_with_overweight + FAVC + CAEC_bin + CALC_bin + 
##     MTRANS_bin
##   #Df  LogLik Df  Chisq Pr(>Chisq)    
## 1   6 -3138.3                         
## 2  78 -1852.5 72 2571.7  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
if (lrt$`Pr(>Chisq)`[2] < 0.05) {
  cat("\nKesimpulan: Model signifikan secara keseluruhan (p < 0.05).\n")
} else {
  cat("\nKesimpulan: Model tidak signifikan secara keseluruhan (p ≥ 0.05).\n")
}
## 
## Kesimpulan: Model signifikan secara keseluruhan (p < 0.05).

4. Confusion Matrix & Akurasi

pred_class <- predict(model_mlr, newdata = data_test)

cm <- confusionMatrix(pred_class, data_test$NObeyesdad)
print(cm)
## Confusion Matrix and Statistics
## 
##                      Reference
## Prediction            Insufficient_Weight Normal_Weight Obesity_Type_I
##   Insufficient_Weight                  24            19              3
##   Normal_Weight                        10            13              0
##   Obesity_Type_I                        6             7             41
##   Obesity_Type_II                       3             1             10
##   Obesity_Type_III                      0             4              1
##   Overweight_Level_I                    9             5              5
##   Overweight_Level_II                   2             5              2
##                      Reference
## Prediction            Obesity_Type_II Obesity_Type_III Overweight_Level_I
##   Insufficient_Weight               0                0                  5
##   Normal_Weight                     0                0                  8
##   Obesity_Type_I                    9                0                 13
##   Obesity_Type_II                  42                0                  6
##   Obesity_Type_III                  2               64                  2
##   Overweight_Level_I                1                0                 17
##   Overweight_Level_II               4                0                  3
##                      Reference
## Prediction            Overweight_Level_II
##   Insufficient_Weight                   2
##   Normal_Weight                         5
##   Obesity_Type_I                       18
##   Obesity_Type_II                       8
##   Obesity_Type_III                      0
##   Overweight_Level_I                    4
##   Overweight_Level_II                  16
## 
## Overall Statistics
##                                           
##                Accuracy : 0.5439          
##                  95% CI : (0.4936, 0.5935)
##     No Information Rate : 0.1604          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.4657          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Insufficient_Weight Class: Normal_Weight
## Sensitivity                             0.44444              0.24074
## Specificity                             0.91594              0.93333
## Pos Pred Value                          0.45283              0.36111
## Neg Pred Value                          0.91329              0.88705
## Prevalence                              0.13534              0.13534
## Detection Rate                          0.06015              0.03258
## Detection Prevalence                    0.13283              0.09023
## Balanced Accuracy                       0.68019              0.58704
##                      Class: Obesity_Type_I Class: Obesity_Type_II
## Sensitivity                         0.6613                 0.7241
## Specificity                         0.8427                 0.9179
## Pos Pred Value                      0.4362                 0.6000
## Neg Pred Value                      0.9311                 0.9514
## Prevalence                          0.1554                 0.1454
## Detection Rate                      0.1028                 0.1053
## Detection Prevalence                0.2356                 0.1754
## Balanced Accuracy                   0.7520                 0.8210
##                      Class: Obesity_Type_III Class: Overweight_Level_I
## Sensitivity                           1.0000                   0.31481
## Specificity                           0.9731                   0.93043
## Pos Pred Value                        0.8767                   0.41463
## Neg Pred Value                        1.0000                   0.89665
## Prevalence                            0.1604                   0.13534
## Detection Rate                        0.1604                   0.04261
## Detection Prevalence                  0.1830                   0.10276
## Balanced Accuracy                     0.9866                   0.62262
##                      Class: Overweight_Level_II
## Sensitivity                              0.3019
## Specificity                              0.9538
## Pos Pred Value                           0.5000
## Neg Pred Value                           0.8992
## Prevalence                               0.1328
## Detection Rate                           0.0401
## Detection Prevalence                     0.0802
## Balanced Accuracy                        0.6278

5. Akurasi Keseluruhan

akurasi <- mean(pred_class == data_test$NObeyesdad)
cat(sprintf("Akurasi Model MLR : %.2f%%\n", akurasi * 100))
## Akurasi Model MLR : 54.39%

6. Visualisasi Balanced Accuracy per Kelas

akurasi_kelas <- as.data.frame(cm$byClass)
akurasi_kelas$Kelas <- gsub("Class: ", "", rownames(akurasi_kelas))

ggplot(akurasi_kelas, aes(x = reorder(Kelas, `Balanced Accuracy`),
                           y = `Balanced Accuracy`,
                           fill = `Balanced Accuracy`)) +
  geom_col() +
  geom_hline(yintercept = 0.7, linetype = "dashed", color = "red", linewidth = 0.8) +
  geom_text(aes(label = sprintf("%.2f", `Balanced Accuracy`)),
            hjust = -0.1, size = 3.5) +
  coord_flip() +
  scale_fill_gradient(low = "salmon", high = "steelblue") +
  scale_y_continuous(limits = c(0, 1.05)) +
  labs(title = "Balanced Accuracy per Kelas Obesitas",
       subtitle = "Garis merah = threshold 0.70",
       x = "Kelas Obesitas", y = "Balanced Accuracy") +
  theme_minimal() +
  theme(legend.position = "none")

7. Visualisasi Confusion Matrix (Heatmap)

cm_table <- as.data.frame(cm$table)
colnames(cm_table) <- c("Prediksi", "Aktual", "Frekuensi")

ggplot(cm_table, aes(x = Aktual, y = Prediksi, fill = Frekuensi)) +
  geom_tile(color = "white") +
  geom_text(aes(label = Frekuensi), size = 4, fontface = "bold") +
  scale_fill_gradient(low = "white", high = "steelblue") +
  labs(title = "Confusion Matrix — Model MLR (Heatmap)",
       x = "Kelas Aktual", y = "Kelas Prediksi") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 30, hjust = 1))

8. Perbandingan AIC: Model Tanpa vs Dengan Age²

model_awal <- multinom(
  NObeyesdad ~ Age + Height + FCVC + NCP + CH2O + TUE +
               family_history_with_overweight + FAVC +
               CAEC_bin + CALC_bin + MTRANS_bin,
  data = data_train, trace = FALSE
)

aic_final <- AIC(model_awal, model_mlr)
rownames(aic_final) <- c("Model Tanpa Age²", "Model Final (dengan Age²)")

cat("=== Perbandingan AIC ===\n")
## === Perbandingan AIC ===
print(aic_final)
##                           df      AIC
## Model Tanpa Age²          72 3905.371
## Model Final (dengan Age²) 78 3861.015
if (aic_final["Model Final (dengan Age²)", "AIC"] <
    aic_final["Model Tanpa Age²", "AIC"]) {
  cat("\nKesimpulan: Model Final lebih baik (AIC lebih kecil).\n")
} else {
  cat("\nKesimpulan: Model Tanpa Age² sudah cukup baik.\n")
}
## 
## Kesimpulan: Model Final lebih baik (AIC lebih kecil).

Analisis Diskriminan

Analisis Diskriminan (Linear/LDA dan Quadratic/QDA) digunakan sebagai metode klasifikasi pembanding terhadap MLR.

LDA mengasumsikan: a. Distribusi multivariat normal per kelas b. Matriks kovarians SAMA antar kelas (homogen)

Karena Analisis Diskriminan hanya menerima prediktor numerik, variabel kategorik (FAF_cat, CAEC_bin, CALC_bin, MTRANS_bin, family_history_with_overweight, FAVC) diencode menjadi dummy numerik terlebih dahulu.

Uji Asumsi Analisis Diskriminan

1. Uji Normalitas Multivariat (Mardia’s Test via MVN)

Analisis Diskriminan mengasumsikan setiap kelas mengikuti distribusi normal multivariat. Karena uji formal Mardia memerlukan library tambahan, normalitas dievaluasi secara visual melalui Q-Q plot dan histogram per variabel.

cat("Uji Asumsi: Normalitas per Variabel (Prediktor Numerik)\n")
## Uji Asumsi: Normalitas per Variabel (Prediktor Numerik)
numerik_da <- data_bersih[, c("Age", "Height", "FCVC", "NCP", "CH2O", "TUE")]

par(mfrow = c(2, 3))
for (v in colnames(numerik_da)) {
  qqnorm(numerik_da[[v]], main = paste("Q-Q Plot:", v), col = "steelblue")
  qqline(numerik_da[[v]], col = "red", lwd = 2)
}

par(mfrow = c(1, 1))

# Uji Shapiro-Wilk per variabel numerik
cat("\nUji Shapiro-Wilk per Variabel Numerik:\n")
## 
## Uji Shapiro-Wilk per Variabel Numerik:
for (v in colnames(numerik_da)) {
  sw <- shapiro.test(sample(numerik_da[[v]], min(5000, nrow(numerik_da))))
  cat(sprintf("%-10s : W = %.4f, p = %.4f  %s\n",
              v, sw$statistic, sw$p.value,
              ifelse(sw$p.value > 0.05,
                     "✓ Normal",
                     "✗ Tidak normal (namun DA cukup robust untuk n besar)")))
}
## Age        : W = 0.9019, p = 0.0000  ✗ Tidak normal (namun DA cukup robust untuk n besar)
## Height     : W = 0.9936, p = 0.0000  ✗ Tidak normal (namun DA cukup robust untuk n besar)
## FCVC       : W = 0.8437, p = 0.0000  ✗ Tidak normal (namun DA cukup robust untuk n besar)
## NCP        : W = 0.7407, p = 0.0000  ✗ Tidak normal (namun DA cukup robust untuk n besar)
## CH2O       : W = 0.9358, p = 0.0000  ✗ Tidak normal (namun DA cukup robust untuk n besar)
## TUE        : W = 0.8942, p = 0.0000  ✗ Tidak normal (namun DA cukup robust untuk n besar)

2. Uji Homogenitas Kovarians (Box’s M Test)

LDA mengasumsikan matriks kovarians sama antar kelas. Box’s M Test: H0 = matriks kovarians homogen antar kelas. Jika p < 0.05 → tolak H0 → gunakan QDA.

cat("Uji Homogenitas Kovarians (Box's M via heuristic)\n")
## Uji Homogenitas Kovarians (Box's M via heuristic)
kelas_list <- levels(data_bersih$NObeyesdad)
n_total    <- nrow(data_bersih)
p          <- ncol(numerik_da)
k          <- length(kelas_list)

cov_list <- lapply(kelas_list, function(kl) {
  sub <- numerik_da[data_bersih$NObeyesdad == kl, ]
  list(n = nrow(sub), cov = cov(sub))
})
names(cov_list) <- kelas_list

cat("Determinan Matriks Kovarians per Kelas:\n")
## Determinan Matriks Kovarians per Kelas:
det_vals <- sapply(cov_list, function(x) det(x$cov))
for (i in seq_along(kelas_list)) {
  cat(sprintf("  %-22s : det = %.6f\n", kelas_list[i], det_vals[i]))
}
##   Insufficient_Weight    : det = 0.000609
##   Normal_Weight          : det = 0.005385
##   Obesity_Type_I         : det = 0.003141
##   Obesity_Type_II        : det = 0.000191
##   Obesity_Type_III       : det = 0.000000
##   Overweight_Level_I     : det = 0.005141
##   Overweight_Level_II    : det = 0.001936
rasio_det <- max(det_vals) / min(det_vals)
cat(sprintf("\nRasio det_max / det_min : %.2f\n", rasio_det))
## 
## Rasio det_max / det_min : Inf
if (rasio_det > 10) {
  cat("Kesimpulan: Matriks kovarians HETEROGEN → QDA lebih sesuai daripada LDA.\n")
} else {
  cat("Kesimpulan: Matriks kovarians relatif HOMOGEN → LDA dapat digunakan.\n")
}
## Kesimpulan: Matriks kovarians HETEROGEN → QDA lebih sesuai daripada LDA.

Persiapan Data

Encode semua variabel kategorik menjadi numerik (dummy coding)

encode_dummy <- function(df) {
  df$family_history_yes <- as.integer(df$family_history_with_overweight == "yes")
  df$FAVC_yes           <- as.integer(df$FAVC == "yes")
  df$CAEC_bin_num       <- as.integer(as.character(df$CAEC_bin))
  df$CALC_bin_num       <- as.integer(as.character(df$CALC_bin))
  df$MTRANS_bin_num     <- as.integer(as.character(df$MTRANS_bin))
  # FAF_cat: dummy (referensi = Tidak_Aktif)
  df$FAF_Rendah         <- as.integer(df$FAF_cat == "Rendah")
  df$FAF_Sedang         <- as.integer(df$FAF_cat == "Sedang")
  df$FAF_Tinggi         <- as.integer(df$FAF_cat == "Tinggi")
  return(df)
}

train_da <- encode_dummy(data_train)
test_da  <- encode_dummy(data_test)

# Kolom prediktor untuk DA
pred_cols_da <- c("Age", "Height", "FCVC", "NCP", "CH2O", "TUE",
                  "family_history_yes", "FAVC_yes",
                  "CAEC_bin_num", "CALC_bin_num", "MTRANS_bin_num",
                  "FAF_Rendah", "FAF_Sedang", "FAF_Tinggi")

X_train <- train_da[, pred_cols_da]
y_train <- train_da$NObeyesdad
X_test  <- test_da[, pred_cols_da]
y_test  <- test_da$NObeyesdad

Quadratic Discriminant Analysis (QDA)

QDA mencari batas keputusan kuadratik antar kelas dengan mengasumsikan setiap kelas memiliki matriks kovarians sendiri.

PCA untuk mengatasi rank deficiency

X_train_clean <- X_train
X_test_clean  <- X_test
colnames(X_train_clean) <- make.names(colnames(X_train_clean), unique = TRUE)
colnames(X_test_clean)  <- make.names(colnames(X_test_clean),  unique = TRUE)

pca_obj <- prcomp(X_train, center = TRUE, scale. = TRUE)
cum_var <- cumsum(pca_obj$sdev^2 / sum(pca_obj$sdev^2))

Cari jumlah PC yang aman

n_comp_90 <- which(cum_var >= 0.90)[1]
X_train_pca_tmp <- as.data.frame(pca_obj$x[, 1:n_comp_90])

min_rank <- min(sapply(levels(y_train), function(grp) {
  qr(cov(X_train_pca_tmp[y_train == grp, ]))$rank
}))

n_comp <- min(n_comp_90, min_rank)
cat(sprintf("PC yang digunakan: %d (%.2f%% varians)\n",
            n_comp, cum_var[n_comp] * 100))
## PC yang digunakan: 7 (70.86% varians)
X_train_pca <- as.data.frame(pca_obj$x[, 1:n_comp])
X_test_pca  <- as.data.frame(predict(pca_obj, newdata = X_test)[, 1:n_comp])

Fit QDA

model_qda <- MASS::qda(x = X_train_pca, grouping = y_train)

Proporsi prior per kelas

cat("\nProporsi Prior per Kelas:\n")
## 
## Proporsi Prior per Kelas:
for (i in seq_along(model_qda$prior)) {
  cat(sprintf("  %-25s : %.4f (%.2f%%)\n",
              names(model_qda$prior)[i],
              model_qda$prior[i],
              model_qda$prior[i] * 100))
}
##   Insufficient_Weight       : 0.1344 (13.44%)
##   Normal_Weight             : 0.1356 (13.56%)
##   Obesity_Type_I            : 0.1560 (15.60%)
##   Obesity_Type_II           : 0.1443 (14.43%)
##   Obesity_Type_III          : 0.1610 (16.10%)
##   Overweight_Level_I        : 0.1350 (13.50%)
##   Overweight_Level_II       : 0.1337 (13.37%)

Prediksi

pred_qda  <- predict(model_qda, newdata = X_test_pca)
class_qda <- pred_qda$class
cm_qda    <- confusionMatrix(class_qda, y_test)

Confusion Matrix

cat("\n=== Confusion Matrix QDA ===\n")
## 
## === Confusion Matrix QDA ===
print(cm_qda)
## Confusion Matrix and Statistics
## 
##                      Reference
## Prediction            Insufficient_Weight Normal_Weight Obesity_Type_I
##   Insufficient_Weight                  27            19              2
##   Normal_Weight                         2            12              2
##   Obesity_Type_I                       16            11             36
##   Obesity_Type_II                       3             4             19
##   Obesity_Type_III                      0             1              0
##   Overweight_Level_I                    4             6              0
##   Overweight_Level_II                   2             1              3
##                      Reference
## Prediction            Obesity_Type_II Obesity_Type_III Overweight_Level_I
##   Insufficient_Weight               1                0                  2
##   Normal_Weight                     0                0                  6
##   Obesity_Type_I                    0                0                 18
##   Obesity_Type_II                  55                0                 14
##   Obesity_Type_III                  0               64                  0
##   Overweight_Level_I                1                0                 10
##   Overweight_Level_II               1                0                  4
##                      Reference
## Prediction            Overweight_Level_II
##   Insufficient_Weight                   4
##   Normal_Weight                        10
##   Obesity_Type_I                       11
##   Obesity_Type_II                      17
##   Obesity_Type_III                      0
##   Overweight_Level_I                    1
##   Overweight_Level_II                  10
## 
## Overall Statistics
##                                          
##                Accuracy : 0.5363         
##                  95% CI : (0.486, 0.5861)
##     No Information Rate : 0.1604         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.4566         
##                                          
##  Mcnemar's Test P-Value : NA             
## 
## Statistics by Class:
## 
##                      Class: Insufficient_Weight Class: Normal_Weight
## Sensitivity                             0.50000              0.22222
## Specificity                             0.91884              0.94203
## Pos Pred Value                          0.49091              0.37500
## Neg Pred Value                          0.92151              0.88556
## Prevalence                              0.13534              0.13534
## Detection Rate                          0.06767              0.03008
## Detection Prevalence                    0.13784              0.08020
## Balanced Accuracy                       0.70942              0.58213
##                      Class: Obesity_Type_I Class: Obesity_Type_II
## Sensitivity                        0.58065                 0.9483
## Specificity                        0.83383                 0.8328
## Pos Pred Value                     0.39130                 0.4911
## Neg Pred Value                     0.91531                 0.9895
## Prevalence                         0.15539                 0.1454
## Detection Rate                     0.09023                 0.1378
## Detection Prevalence               0.23058                 0.2807
## Balanced Accuracy                  0.70724                 0.8906
##                      Class: Obesity_Type_III Class: Overweight_Level_I
## Sensitivity                           1.0000                   0.18519
## Specificity                           0.9970                   0.96522
## Pos Pred Value                        0.9846                   0.45455
## Neg Pred Value                        1.0000                   0.88329
## Prevalence                            0.1604                   0.13534
## Detection Rate                        0.1604                   0.02506
## Detection Prevalence                  0.1629                   0.05514
## Balanced Accuracy                     0.9985                   0.57520
##                      Class: Overweight_Level_II
## Sensitivity                             0.18868
## Specificity                             0.96821
## Pos Pred Value                          0.47619
## Neg Pred Value                          0.88624
## Prevalence                              0.13283
## Detection Rate                          0.02506
## Detection Prevalence                    0.05263
## Balanced Accuracy                       0.57844
akurasi_qda <- mean(class_qda == y_test)
cat(sprintf("Akurasi Model QDA : %.2f%%\n", akurasi_qda * 100))
## Akurasi Model QDA : 53.63%

Balanced Accuracy per kelas

akurasi_kelas_qda       <- as.data.frame(cm_qda$byClass)
akurasi_kelas_qda$Kelas <- gsub("Class: ", "", rownames(akurasi_kelas_qda))
akurasi_kelas_qda$Model <- "QDA"

ggplot(akurasi_kelas_qda,
       aes(x = reorder(Kelas, `Balanced Accuracy`),
           y = `Balanced Accuracy`, fill = `Balanced Accuracy`)) +
  geom_col() +
  geom_hline(yintercept = 0.7, linetype = "dashed", color = "red", linewidth = 0.8) +
  geom_text(aes(label = sprintf("%.2f", `Balanced Accuracy`)),
            hjust = -0.1, size = 3.5) +
  coord_flip() +
  scale_fill_gradient(low = "salmon", high = "steelblue") +
  scale_y_continuous(limits = c(0, 1.05)) +
  labs(title    = "Balanced Accuracy per Kelas — Model QDA",
       subtitle = "Garis merah = threshold 0.70",
       x = "Kelas Obesitas", y = "Balanced Accuracy") +
  theme_minimal() +
  theme(legend.position = "none")

Confusion Matrix Heatmap

cm_tbl_qda           <- as.data.frame(cm_qda$table)
colnames(cm_tbl_qda) <- c("Prediksi", "Aktual", "Frekuensi")

ggplot(cm_tbl_qda, aes(x = Aktual, y = Prediksi, fill = Frekuensi)) +
  geom_tile(color = "white") +
  geom_text(aes(label = Frekuensi), size = 4, fontface = "bold") +
  scale_fill_gradient(low = "white", high = "tomato") +
  labs(title = "Confusion Matrix — Model QDA",
       x = "Kelas Aktual", y = "Kelas Prediksi") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 30, hjust = 1))

Scatter Plot Posterior Probability

post_qda          <- as.data.frame(pred_qda$posterior)
post_qda$Kelas    <- y_test
post_qda$Prediksi <- class_qda

kelas_cols <- setdiff(colnames(post_qda), c("Kelas", "Prediksi"))

# Hitung varians tiap kolom posterior
var_vals  <- sapply(kelas_cols, function(k) var(post_qda[[k]]))
top2_cols <- names(sort(var_vals, decreasing = TRUE))[1:2]

ggplot(post_qda,
       aes(x = .data[[top2_cols[1]]],
           y = .data[[top2_cols[2]]],
           color = Kelas)) +
  geom_point(alpha = 0.6, size = 1.5) +
  stat_ellipse(type = "norm", level = 0.8, linewidth = 0.8) +
  labs(title    = "Scatter Plot Posterior Probability QDA",
       subtitle = sprintf("Elips menunjukkan wilayah 80%% kepercayaan per kelas\nSumbu: posterior prob. kelas '%s' vs '%s'",
                          top2_cols[1], top2_cols[2]),
       x     = paste("P(Kelas =", top2_cols[1], ")"),
       y     = paste("P(Kelas =", top2_cols[2], ")"),
       color = "Kelas Obesitas") +
  theme_minimal() +
  theme(legend.position = "right")

Proporsi prior per kelas

cat("\nProporsi Prior per Kelas:\n")
## 
## Proporsi Prior per Kelas:
for (i in seq_along(model_qda$prior)) {
  cat(sprintf("  %s : %.4f (%.2f%%)\n",
              names(model_qda$prior)[i],
              model_qda$prior[i],
              model_qda$prior[i] * 100))
}
##   Insufficient_Weight : 0.1344 (13.44%)
##   Normal_Weight : 0.1356 (13.56%)
##   Obesity_Type_I : 0.1560 (15.60%)
##   Obesity_Type_II : 0.1443 (14.43%)
##   Obesity_Type_III : 0.1610 (16.10%)
##   Overweight_Level_I : 0.1350 (13.50%)
##   Overweight_Level_II : 0.1337 (13.37%)

Prediksi QDA

pred_qda  <- predict(model_qda, newdata = X_test_pca)
class_qda <- pred_qda$class
cm_qda    <- confusionMatrix(class_qda, y_test)

cat("\n=== Confusion Matrix QDA ===\n")
## 
## === Confusion Matrix QDA ===
print(cm_qda)
## Confusion Matrix and Statistics
## 
##                      Reference
## Prediction            Insufficient_Weight Normal_Weight Obesity_Type_I
##   Insufficient_Weight                  27            19              2
##   Normal_Weight                         2            12              2
##   Obesity_Type_I                       16            11             36
##   Obesity_Type_II                       3             4             19
##   Obesity_Type_III                      0             1              0
##   Overweight_Level_I                    4             6              0
##   Overweight_Level_II                   2             1              3
##                      Reference
## Prediction            Obesity_Type_II Obesity_Type_III Overweight_Level_I
##   Insufficient_Weight               1                0                  2
##   Normal_Weight                     0                0                  6
##   Obesity_Type_I                    0                0                 18
##   Obesity_Type_II                  55                0                 14
##   Obesity_Type_III                  0               64                  0
##   Overweight_Level_I                1                0                 10
##   Overweight_Level_II               1                0                  4
##                      Reference
## Prediction            Overweight_Level_II
##   Insufficient_Weight                   4
##   Normal_Weight                        10
##   Obesity_Type_I                       11
##   Obesity_Type_II                      17
##   Obesity_Type_III                      0
##   Overweight_Level_I                    1
##   Overweight_Level_II                  10
## 
## Overall Statistics
##                                          
##                Accuracy : 0.5363         
##                  95% CI : (0.486, 0.5861)
##     No Information Rate : 0.1604         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.4566         
##                                          
##  Mcnemar's Test P-Value : NA             
## 
## Statistics by Class:
## 
##                      Class: Insufficient_Weight Class: Normal_Weight
## Sensitivity                             0.50000              0.22222
## Specificity                             0.91884              0.94203
## Pos Pred Value                          0.49091              0.37500
## Neg Pred Value                          0.92151              0.88556
## Prevalence                              0.13534              0.13534
## Detection Rate                          0.06767              0.03008
## Detection Prevalence                    0.13784              0.08020
## Balanced Accuracy                       0.70942              0.58213
##                      Class: Obesity_Type_I Class: Obesity_Type_II
## Sensitivity                        0.58065                 0.9483
## Specificity                        0.83383                 0.8328
## Pos Pred Value                     0.39130                 0.4911
## Neg Pred Value                     0.91531                 0.9895
## Prevalence                         0.15539                 0.1454
## Detection Rate                     0.09023                 0.1378
## Detection Prevalence               0.23058                 0.2807
## Balanced Accuracy                  0.70724                 0.8906
##                      Class: Obesity_Type_III Class: Overweight_Level_I
## Sensitivity                           1.0000                   0.18519
## Specificity                           0.9970                   0.96522
## Pos Pred Value                        0.9846                   0.45455
## Neg Pred Value                        1.0000                   0.88329
## Prevalence                            0.1604                   0.13534
## Detection Rate                        0.1604                   0.02506
## Detection Prevalence                  0.1629                   0.05514
## Balanced Accuracy                     0.9985                   0.57520
##                      Class: Overweight_Level_II
## Sensitivity                             0.18868
## Specificity                             0.96821
## Pos Pred Value                          0.47619
## Neg Pred Value                          0.88624
## Prevalence                              0.13283
## Detection Rate                          0.02506
## Detection Prevalence                    0.05263
## Balanced Accuracy                       0.57844
akurasi_qda <- mean(class_qda == y_test)
cat(sprintf("Akurasi Model QDA : %.2f%%\n", akurasi_qda * 100))
## Akurasi Model QDA : 53.63%

Balanced Accuracy per kelas

akurasi_kelas_qda       <- as.data.frame(cm_qda$byClass)
akurasi_kelas_qda$Kelas <- gsub("Class: ", "", rownames(akurasi_kelas_qda))
akurasi_kelas_qda$Model <- "QDA"

ggplot(akurasi_kelas_qda,
       aes(x = reorder(Kelas, `Balanced Accuracy`),
           y = `Balanced Accuracy`, fill = `Balanced Accuracy`)) +
  geom_col() +
  geom_hline(yintercept = 0.7, linetype = "dashed", color = "red", linewidth = 0.8) +
  geom_text(aes(label = sprintf("%.2f", `Balanced Accuracy`)),
            hjust = -0.1, size = 3.5) +
  coord_flip() +
  scale_fill_gradient(low = "salmon", high = "steelblue") +
  scale_y_continuous(limits = c(0, 1.05)) +
  labs(title = "Balanced Accuracy per Kelas — Model QDA",
       subtitle = "Garis merah = threshold 0.70",
       x = "Kelas Obesitas", y = "Balanced Accuracy") +
  theme_minimal() +
  theme(legend.position = "none")

Confusion Matrix Heatmap QDA

cm_tbl_qda           <- as.data.frame(cm_qda$table)
colnames(cm_tbl_qda) <- c("Prediksi", "Aktual", "Frekuensi")

ggplot(cm_tbl_qda, aes(x = Aktual, y = Prediksi, fill = Frekuensi)) +
  geom_tile(color = "white") +
  geom_text(aes(label = Frekuensi), size = 4, fontface = "bold") +
  scale_fill_gradient(low = "white", high = "tomato") +
  labs(title = "Confusion Matrix — Model QDA",
       x = "Kelas Aktual", y = "Kelas Prediksi") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 30, hjust = 1))

Kesimpulan

  1. Multinomial Logistic Regression (MLR) Model signifikan secara statistik (Likelihood Ratio Test) Memiliki goodness-of-fit yang cukup baik (Pseudo R² McFadden) Akurasi cukup tinggi dan stabil Interpretasi model lebih mudah

  2. Quadratic Discriminant Analysis (QDA) Cocok karena kovarians antar kelas berbeda Mampu menangkap hubungan non-linear antar variabel Performa bisa lebih baik di beberapa kelas, tapi lebih sensitif terhadap data dan butuh preprocessing tambahan (PCA)

Model Multinomial Logistic Regression memberikan performa terbaik dalam klasifikasi tingkat obesitas dengan akurasi yang tinggi dan interpretasi yang lebih mudah dibandingkan QDA. Variabel usia, pola makan, dan gaya hidup terbukti berpengaruh signifikan terhadap tingkat obesitas.