Anggota :
Fina Nihayatul Husna (24031554022)
Juli Yawati Manalu (24031554050)
Alya Nabila Tamam (24031554099)
Link Dataset : https://archive.ics.uci.edu/dataset/544/estimation+of+obesity+levels+based+on+eating+habits+and+physical+condition
library(nnet)
## Warning: package 'nnet' was built under R version 4.5.3
library(car)
## Warning: package 'car' was built under R version 4.5.3
## Loading required package: carData
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.5.3
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(caret)
## Warning: package 'caret' was built under R version 4.5.3
## Loading required package: lattice
library(corrplot)
## corrplot 0.95 loaded
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(klaR)
## Warning: package 'klaR' was built under R version 4.5.3
data <- read.csv("ObesityDataSet.csv",
stringsAsFactors = FALSE)
head(data)
## Gender Age Height Weight family_history_with_overweight FAVC FCVC NCP
## 1 Female 21 1.62 64.0 yes no 2 3
## 2 Female 21 1.52 56.0 yes no 3 3
## 3 Male 23 1.80 77.0 yes no 2 3
## 4 Male 27 1.80 87.0 no no 3 3
## 5 Male 22 1.78 89.8 no no 2 1
## 6 Male 29 1.62 53.0 no yes 2 3
## CAEC SMOKE CH2O SCC FAF TUE CALC MTRANS
## 1 Sometimes no 2 no 0 1 no Public_Transportation
## 2 Sometimes yes 3 yes 3 0 Sometimes Public_Transportation
## 3 Sometimes no 2 no 2 1 Frequently Public_Transportation
## 4 Sometimes no 2 no 2 0 Frequently Walking
## 5 Sometimes no 2 no 0 0 Sometimes Public_Transportation
## 6 Sometimes no 2 no 0 0 Sometimes Automobile
## NObeyesdad
## 1 Normal_Weight
## 2 Normal_Weight
## 3 Normal_Weight
## 4 Overweight_Level_I
## 5 Overweight_Level_II
## 6 Normal_Weight
cat("Dimensi awal:", dim(data), "\n")
## Dimensi awal: 2111 17
str(data)
## 'data.frame': 2111 obs. of 17 variables:
## $ Gender : chr "Female" "Female" "Male" "Male" ...
## $ Age : num 21 21 23 27 22 29 23 22 24 22 ...
## $ Height : num 1.62 1.52 1.8 1.8 1.78 1.62 1.5 1.64 1.78 1.72 ...
## $ Weight : num 64 56 77 87 89.8 53 55 53 64 68 ...
## $ family_history_with_overweight: chr "yes" "yes" "yes" "no" ...
## $ FAVC : chr "no" "no" "no" "no" ...
## $ FCVC : num 2 3 2 3 2 2 3 2 3 2 ...
## $ NCP : num 3 3 3 3 1 3 3 3 3 3 ...
## $ CAEC : chr "Sometimes" "Sometimes" "Sometimes" "Sometimes" ...
## $ SMOKE : chr "no" "yes" "no" "no" ...
## $ CH2O : num 2 3 2 2 2 2 2 2 2 2 ...
## $ SCC : chr "no" "yes" "no" "no" ...
## $ FAF : num 0 3 2 2 0 0 1 3 1 1 ...
## $ TUE : num 1 0 1 0 0 0 0 0 1 1 ...
## $ CALC : chr "no" "Sometimes" "Frequently" "Frequently" ...
## $ MTRANS : chr "Public_Transportation" "Public_Transportation" "Public_Transportation" "Walking" ...
## $ NObeyesdad : chr "Normal_Weight" "Normal_Weight" "Normal_Weight" "Overweight_Level_I" ...
data$Gender <- NULL
data$Weight <- NULL
cat("Kolom setelah penghapusan:\n")
## Kolom setelah penghapusan:
print(colnames(data))
## [1] "Age" "Height"
## [3] "family_history_with_overweight" "FAVC"
## [5] "FCVC" "NCP"
## [7] "CAEC" "SMOKE"
## [9] "CH2O" "SCC"
## [11] "FAF" "TUE"
## [13] "CALC" "MTRANS"
## [15] "NObeyesdad"
Gender dan Weight dihapus karena sangat berkorelasi dengan label.
data$NObeyesdad <- as.factor(data$NObeyesdad)
kategorik_cols <- c("family_history_with_overweight", "FAVC", "CAEC",
"SMOKE", "SCC", "CALC", "MTRANS")
for (col in kategorik_cols) {
if (col %in% colnames(data)) {
data[[col]] <- as.factor(data[[col]])
}
}
cat("Tipe data setelah konversi faktor:\n")
## Tipe data setelah konversi faktor:
str(data)
## 'data.frame': 2111 obs. of 15 variables:
## $ Age : num 21 21 23 27 22 29 23 22 24 22 ...
## $ Height : num 1.62 1.52 1.8 1.8 1.78 1.62 1.5 1.64 1.78 1.72 ...
## $ family_history_with_overweight: Factor w/ 2 levels "no","yes": 2 2 2 1 1 1 2 1 2 2 ...
## $ FAVC : Factor w/ 2 levels "no","yes": 1 1 1 1 1 2 2 1 2 2 ...
## $ FCVC : num 2 3 2 3 2 2 3 2 3 2 ...
## $ NCP : num 3 3 3 3 1 3 3 3 3 3 ...
## $ CAEC : Factor w/ 4 levels "Always","Frequently",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ SMOKE : Factor w/ 2 levels "no","yes": 1 2 1 1 1 1 1 1 1 1 ...
## $ CH2O : num 2 3 2 2 2 2 2 2 2 2 ...
## $ SCC : Factor w/ 2 levels "no","yes": 1 2 1 1 1 1 1 1 1 1 ...
## $ FAF : num 0 3 2 2 0 0 1 3 1 1 ...
## $ TUE : num 1 0 1 0 0 0 0 0 1 1 ...
## $ CALC : Factor w/ 4 levels "Always","Frequently",..: 3 4 2 2 4 4 4 4 2 3 ...
## $ MTRANS : Factor w/ 5 levels "Automobile","Bike",..: 4 4 4 5 4 1 3 4 4 4 ...
## $ NObeyesdad : Factor w/ 7 levels "Insufficient_Weight",..: 2 2 2 6 7 2 2 2 2 2 ...
numerik_cols <- c("Age", "Height", "FCVC", "NCP", "CH2O", "TUE")
for (col in numerik_cols) {
if (col %in% colnames(data)) {
data[[col]] <- as.numeric(data[[col]])
}
}
cat("Tipe data setelah konversi numerik:\n")
## Tipe data setelah konversi numerik:
str(data)
## 'data.frame': 2111 obs. of 15 variables:
## $ Age : num 21 21 23 27 22 29 23 22 24 22 ...
## $ Height : num 1.62 1.52 1.8 1.8 1.78 1.62 1.5 1.64 1.78 1.72 ...
## $ family_history_with_overweight: Factor w/ 2 levels "no","yes": 2 2 2 1 1 1 2 1 2 2 ...
## $ FAVC : Factor w/ 2 levels "no","yes": 1 1 1 1 1 2 2 1 2 2 ...
## $ FCVC : num 2 3 2 3 2 2 3 2 3 2 ...
## $ NCP : num 3 3 3 3 1 3 3 3 3 3 ...
## $ CAEC : Factor w/ 4 levels "Always","Frequently",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ SMOKE : Factor w/ 2 levels "no","yes": 1 2 1 1 1 1 1 1 1 1 ...
## $ CH2O : num 2 3 2 2 2 2 2 2 2 2 ...
## $ SCC : Factor w/ 2 levels "no","yes": 1 2 1 1 1 1 1 1 1 1 ...
## $ FAF : num 0 3 2 2 0 0 1 3 1 1 ...
## $ TUE : num 1 0 1 0 0 0 0 0 1 1 ...
## $ CALC : Factor w/ 4 levels "Always","Frequently",..: 3 4 2 2 4 4 4 4 2 3 ...
## $ MTRANS : Factor w/ 5 levels "Automobile","Bike",..: 4 4 4 5 4 1 3 4 4 4 ...
## $ NObeyesdad : Factor w/ 7 levels "Insufficient_Weight",..: 2 2 2 6 7 2 2 2 2 2 ...
cat("Jumlah NA :\n")
## Jumlah NA :
print(colSums(is.na(data)))
## Age Height
## 0 0
## family_history_with_overweight FAVC
## 0 0
## FCVC NCP
## 0 0
## CAEC SMOKE
## 0 0
## CH2O SCC
## 0 0
## FAF TUE
## 0 0
## CALC MTRANS
## 0 0
## NObeyesdad
## 0
cat("Frekuensi NObeyesdad sebelum filter:\n")
## Frekuensi NObeyesdad sebelum filter:
print(sort(table(data$NObeyesdad), decreasing = TRUE))
##
## Obesity_Type_I Obesity_Type_III Obesity_Type_II Overweight_Level_I
## 351 324 297 290
## Overweight_Level_II Normal_Weight Insufficient_Weight
## 290 287 272
genre_valid <- names(table(data$NObeyesdad))[table(data$NObeyesdad) >= 30]
data <- data %>% filter(NObeyesdad %in% genre_valid)
data$NObeyesdad <- droplevels(data$NObeyesdad)
cat("\nKelas yang digunakan (", nlevels(data$NObeyesdad), "kelas):\n")
##
## Kelas yang digunakan ( 7 kelas):
print(levels(data$NObeyesdad))
## [1] "Insufficient_Weight" "Normal_Weight" "Obesity_Type_I"
## [4] "Obesity_Type_II" "Obesity_Type_III" "Overweight_Level_I"
## [7] "Overweight_Level_II"
Langkah ini memastikan model lebih stabil, mengurangi noise, dan mempermudah interpretasi.
cat("\nDistribusi FAF sebelum kategorisasi:\n")
##
## Distribusi FAF sebelum kategorisasi:
print(summary(data$FAF))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.1245 1.0000 1.0103 1.6667 3.0000
data$FAF_cat <- cut(data$FAF,
breaks = c(-Inf, 0, 1, 2, Inf),
labels = c("Tidak_Aktif", "Rendah", "Sedang", "Tinggi"),
right = TRUE)
data$FAF_cat <- factor(data$FAF_cat,
levels = c("Tidak_Aktif", "Rendah", "Sedang", "Tinggi"),
ordered = FALSE)
data$FAF <- NULL
# Cek distribusi sebelum transformasi kategorik
cat("\nDistribusi FAF_cat setelah kategorisasi:\n")
##
## Distribusi FAF_cat setelah kategorisasi:
print(table(data$FAF_cat))
##
## Tidak_Aktif Rendah Sedang Tinggi
## 411 834 673 193
cat("(Proporsi):\n")
## (Proporsi):
print(round(prop.table(table(data$FAF_cat)) * 100, 2))
##
## Tidak_Aktif Rendah Sedang Tinggi
## 19.47 39.51 31.88 9.14
cat("\nDistribusi SMOKE:\n"); print(table(data$SMOKE))
##
## Distribusi SMOKE:
##
## no yes
## 2067 44
cat("Distribusi SCC:\n"); print(table(data$SCC))
## Distribusi SCC:
##
## no yes
## 2015 96
cat("Distribusi CAEC:\n"); print(table(data$CAEC))
## Distribusi CAEC:
##
## Always Frequently no Sometimes
## 53 242 51 1765
cat("Distribusi CALC:\n"); print(table(data$CALC))
## Distribusi CALC:
##
## Always Frequently no Sometimes
## 1 70 639 1401
cat("Distribusi MTRANS:\n"); print(table(data$MTRANS))
## Distribusi MTRANS:
##
## Automobile Bike Motorbike
## 457 7 11
## Public_Transportation Walking
## 1580 56
# Hapus SMOKE & SCC
data$SMOKE <- NULL
data$SCC <- NULL
# Binarisasi CAEC: makan di luar waktu makan (1) vs tidak (0)
data$CAEC_bin <- ifelse(data$CAEC %in% c("Sometimes", "Frequently", "Always"), 1, 0)
data$CAEC_bin <- as.factor(data$CAEC_bin)
data$CAEC <- NULL
# Binarisasi CALC: konsumsi alkohol (1) vs tidak (0)
data$CALC_bin <- ifelse(data$CALC != "no", 1, 0)
data$CALC_bin <- as.factor(data$CALC_bin)
data$CALC <- NULL
# Binarisasi MTRANS: transportasi umum (1) vs lainnya (0)
data$MTRANS_bin <- ifelse(data$MTRANS == "Public_Transportation", 1, 0)
data$MTRANS_bin <- as.factor(data$MTRANS_bin)
data$MTRANS <- NULL
cat("\nDistribusi CAEC_bin:\n"); print(table(data$CAEC_bin))
##
## Distribusi CAEC_bin:
##
## 0 1
## 51 2060
cat("Distribusi CALC_bin:\n"); print(table(data$CALC_bin))
## Distribusi CALC_bin:
##
## 0 1
## 639 1472
cat("Distribusi MTRANS_bin:\n"); print(table(data$MTRANS_bin))
## Distribusi MTRANS_bin:
##
## 0 1
## 531 1580
Transformasi ini membuat variabel lebih informatif dan mudah dipakai dalam regresi logistik.
str(data)
## 'data.frame': 2111 obs. of 13 variables:
## $ Age : num 21 21 23 27 22 29 23 22 24 22 ...
## $ Height : num 1.62 1.52 1.8 1.8 1.78 1.62 1.5 1.64 1.78 1.72 ...
## $ family_history_with_overweight: Factor w/ 2 levels "no","yes": 2 2 2 1 1 1 2 1 2 2 ...
## $ FAVC : Factor w/ 2 levels "no","yes": 1 1 1 1 1 2 2 1 2 2 ...
## $ FCVC : num 2 3 2 3 2 2 3 2 3 2 ...
## $ NCP : num 3 3 3 3 1 3 3 3 3 3 ...
## $ CH2O : num 2 3 2 2 2 2 2 2 2 2 ...
## $ TUE : num 1 0 1 0 0 0 0 0 1 1 ...
## $ NObeyesdad : Factor w/ 7 levels "Insufficient_Weight",..: 2 2 2 6 7 2 2 2 2 2 ...
## $ FAF_cat : Factor w/ 4 levels "Tidak_Aktif",..: 1 4 3 3 1 1 2 4 2 2 ...
## $ CAEC_bin : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ CALC_bin : Factor w/ 2 levels "0","1": 1 2 2 2 2 2 2 2 2 1 ...
## $ MTRANS_bin : Factor w/ 2 levels "0","1": 2 2 2 1 2 1 1 2 2 2 ...
summary(data)
## Age Height family_history_with_overweight FAVC
## Min. :14.00 Min. :1.450 no : 385 no : 245
## 1st Qu.:19.95 1st Qu.:1.630 yes:1726 yes:1866
## Median :22.78 Median :1.700
## Mean :24.31 Mean :1.702
## 3rd Qu.:26.00 3rd Qu.:1.768
## Max. :61.00 Max. :1.980
##
## FCVC NCP CH2O TUE
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :0.0000
## 1st Qu.:2.000 1st Qu.:2.659 1st Qu.:1.585 1st Qu.:0.0000
## Median :2.386 Median :3.000 Median :2.000 Median :0.6253
## Mean :2.419 Mean :2.686 Mean :2.008 Mean :0.6579
## 3rd Qu.:3.000 3rd Qu.:3.000 3rd Qu.:2.477 3rd Qu.:1.0000
## Max. :3.000 Max. :4.000 Max. :3.000 Max. :2.0000
##
## NObeyesdad FAF_cat CAEC_bin CALC_bin MTRANS_bin
## Insufficient_Weight:272 Tidak_Aktif:411 0: 51 0: 639 0: 531
## Normal_Weight :287 Rendah :834 1:2060 1:1472 1:1580
## Obesity_Type_I :351 Sedang :673
## Obesity_Type_II :297 Tinggi :193
## Obesity_Type_III :324
## Overweight_Level_I :290
## Overweight_Level_II:290
ggplot(data, aes(x = forcats::fct_infreq(NObeyesdad), fill = NObeyesdad)) +
geom_bar() +
labs(title = "Distribusi Kategori Obesitas (Label Target)",
x = "Obesity Level", y = "Frekuensi") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 30, hjust = 1),
legend.position = "none")
Distribusi obesitas cukup hampir merata antar kelas.
ggplot(data, aes(x = NObeyesdad, y = Age, fill = NObeyesdad)) +
geom_boxplot() +
labs(title = "Distribusi Age per Tingkat Obesitas",
x = "Obesity Level", y = "Age") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 30, hjust = 1),
legend.position = "none")
ggplot(data, aes(x = NObeyesdad, y = Height, fill = NObeyesdad)) +
geom_boxplot() +
labs(title = "Distribusi Height per Tingkat Obesitas",
x = "Obesity Level", y = "Height (m)") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 30, hjust = 1),
legend.position = "none")
Tinggi badan tidak terlalu berbeda antar kelas.
ggplot(data, aes(x = NObeyesdad, fill = FAF_cat)) +
geom_bar(position = "fill") +
scale_y_continuous(labels = scales::percent) +
labs(title = "Proporsi Kategori FAF per Tingkat Obesitas",
x = "Obesity Level", y = "Proporsi", fill = "Aktivitas Fisik") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 30, hjust = 1))
Ada indikasi hubungan antara usia & aktivitas fisik dengan kateogri obesitas.
Multinomial Logistic Regression mengasumsikan hubungan linear antara prediktor numerik dengan log-odds. Pengujian dilakukan melalui tiga cara: visualisasi GAM smoother, uji formal Box-Tidwell, dan perbandingan AIC.
Jika kurva GAM mendekati garis lurus, asumsi linearitas terpenuhi.
ggplot(data, aes(x = Age, y = as.numeric(NObeyesdad))) +
geom_point(alpha = 0.1, color = "steelblue") +
geom_smooth(method = "gam", color = "red", se = TRUE) +
labs(title = "Linearitas: Age vs NObeyesdad",
x = "Age", y = "NObeyesdad (numerik)") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ s(x, bs = "cs")'
ggplot(data, aes(x = Height, y = as.numeric(NObeyesdad))) +
geom_point(alpha = 0.1, color = "steelblue") +
geom_smooth(method = "gam", color = "red", se = TRUE) +
labs(title = "Linearitas: Height vs NObeyesdad",
x = "Height", y = "NObeyesdad (numerik)") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ s(x, bs = "cs")'
Ditambahkan interaksi X·ln(X) pada model binomial. Jika semua koefisien interaksi tidak signifikan (p > 0.05), asumsi linearitas log-odds terpenuhi.
data_bt <- data %>%
mutate(
Age_lnAge = ifelse(Age > 0, Age * log(Age), 0),
Height_lnHeight = ifelse(Height > 0, Height * log(Height), 0)
)
kelas_bt <- levels(data_bt$NObeyesdad)[1:2]
data_bt2 <- data_bt %>%
filter(NObeyesdad %in% kelas_bt) %>%
mutate(NObeyesdad = droplevels(NObeyesdad))
bt_model <- glm(NObeyesdad ~ Age + Height + Age_lnAge + Height_lnHeight,
data = data_bt2,
family = binomial)
cat("=== Hasil Box-Tidwell ===\n")
## === Hasil Box-Tidwell ===
print(summary(bt_model)$coefficients)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -37.5002920 48.4199329 -0.7744805 0.4386467
## Age 0.4068502 0.8182804 0.4972014 0.6190470
## Height 30.3619239 43.6619077 0.6953870 0.4868128
## Age_lnAge -0.0610457 0.1977858 -0.3086455 0.7575912
## Height_lnHeight -20.5912055 28.6659258 -0.7183164 0.4725622
cat("\n--- Interpretasi ---\n")
##
## --- Interpretasi ---
bt_coef <- summary(bt_model)$coefficients
interaksi <- c("Age_lnAge", "Height_lnHeight")
for (var in interaksi) {
p <- bt_coef[var, "Pr(>|z|)"]
status <- ifelse(p > 0.05,
"✓ Tidak signifikan → Linearitas terpenuhi",
"✗ Signifikan → Linearitas TIDAK terpenuhi")
cat(sprintf("%-20s : p = %.4f %s\n", var, p, status))
}
## Age_lnAge : p = 0.7576 ✓ Tidak signifikan → Linearitas terpenuhi
## Height_lnHeight : p = 0.4726 ✓ Tidak signifikan → Linearitas terpenuhi
Box-Tidwell menunjukkan p-value > 0.05 → linearitas terpenuhi.
AIC lebih kecil menunjukkan model yang lebih baik. Jika model
kuadratik memiliki AIC lebih kecil, maka Age² perlu
dimasukkan ke model final.
model_linear <- multinom(
NObeyesdad ~ Age + Height + FCVC + NCP + CH2O + TUE,
data = data, trace = FALSE)
model_kuadrat <- multinom(
NObeyesdad ~ Age + I(Age^2) + Height + FCVC + NCP + CH2O + TUE,
data = data, trace = FALSE)
aic_compare <- AIC(model_linear, model_kuadrat)
cat("=== Perbandingan AIC Linear vs Kuadratik ===\n")
## === Perbandingan AIC Linear vs Kuadratik ===
print(aic_compare)
## df AIC
## model_linear 42 6292.374
## model_kuadrat 48 6051.577
if (aic_compare["model_kuadrat", "AIC"] < aic_compare["model_linear", "AIC"]) {
cat("\nKesimpulan: Model kuadratik lebih baik → Age² dimasukkan ke model final.\n")
} else {
cat("\nKesimpulan: Model linear sudah cukup.\n")
}
##
## Kesimpulan: Model kuadratik lebih baik → Age² dimasukkan ke model final.
Asumsi independensi pada Multinomial Logistic Regression mensyaratkan bahwa setiap observasi tidak saling memengaruhi satu sama lain. Uji statistik seperti Durbin-Watson tidak relevan di sini karena dirancang untuk data deret waktu (time series), bukan data cross-sectional.
Dataset ini merupakan data cross-sectional di mana setiap baris merepresentasikan satu individu yang berbeda dari survei populasi di Mexico, Peru, dan Colombia. Tidak ada struktur pengulangan (repeated measures), klaster, maupun urutan waktu antar observasi. Oleh karena itu, asumsi independensi observasi dianggap terpenuhi secara by design berdasarkan karakteristik pengumpulan data.
cat("=== Verifikasi Independensi Observasi ===\n\n")
## === Verifikasi Independensi Observasi ===
cat(sprintf("Jumlah total observasi : %d\n", nrow(data)))
## Jumlah total observasi : 2111
cat(sprintf("Jumlah variabel : %d\n", ncol(data)))
## Jumlah variabel : 13
cat("\nJenis data : Cross-sectional\n")
##
## Jenis data : Cross-sectional
cat("Sumber data : Survei individu (Mexico, Peru, Colombia)\n")
## Sumber data : Survei individu (Mexico, Peru, Colombia)
cat("Struktur repeated measures : Tidak ada\n")
## Struktur repeated measures : Tidak ada
cat("Struktur time series : Tidak ada\n\n")
## Struktur time series : Tidak ada
cat("Kesimpulan: Setiap observasi merepresentasikan individu yang berbeda.\n")
## Kesimpulan: Setiap observasi merepresentasikan individu yang berbeda.
cat("Asumsi independensi observasi terpenuhi by design.\n")
## Asumsi independensi observasi terpenuhi by design.
numerik <- data[, sapply(data, is.numeric)]
cor_matrix <- cor(numerik)
cat("Matriks Korelasi:\n")
## Matriks Korelasi:
print(round(cor_matrix, 3))
## Age Height FCVC NCP CH2O TUE
## Age 1.000 -0.026 0.016 -0.044 -0.045 -0.297
## Height -0.026 1.000 -0.038 0.244 0.213 0.052
## FCVC 0.016 -0.038 1.000 0.042 0.068 -0.101
## NCP -0.044 0.244 0.042 1.000 0.057 0.036
## CH2O -0.045 0.213 0.068 0.057 1.000 0.012
## TUE -0.297 0.052 -0.101 0.036 0.012 1.000
corrplot(cor_matrix, method = "color", type = "upper",
addCoef.col = "black", tl.col = "black",
title = "Matriks Korelasi Prediktor Numerik",
mar = c(0, 0, 1.5, 0))
VIF < 5 menunjukkan tidak ada multikolinieritas yang mengkhawatirkan. VIF < 10 masih dapat diterima.
data$NObeyesdad_num <- as.numeric(data$NObeyesdad)
model_vif <- lm(NObeyesdad_num ~ Age + Height + FCVC + NCP + CH2O + TUE,
data = data)
vif_result <- vif(model_vif)
data$NObeyesdad_num <- NULL
cat("Nilai VIF:\n")
## Nilai VIF:
print(round(vif_result, 4))
## Age Height FCVC NCP CH2O TUE
## 1.1001 1.1165 1.0210 1.0682 1.0561 1.1104
cat("\nKesimpulan:\n")
##
## Kesimpulan:
if (all(vif_result < 5)) {
cat("Semua VIF < 5 → Tidak ada multikolinieritas.\n")
} else if (all(vif_result < 10)) {
cat("Semua VIF < 10 → Multikolinieritas masih dapat diterima.\n")
} else {
cat("Terdapat VIF ≥ 10 → Ada multikolinieritas serius.\n")
}
## Semua VIF < 5 → Tidak ada multikolinieritas.
Outlier dihapus secara iteratif menggunakan Jarak Mahalanobis dengan cutoff Chi-Square (df = jumlah variabel numerik, α = 0.025) hingga tidak ada outlier tersisa.
data_bersih <- data
iterasi <- 0
repeat {
numerik_iter <- data_bersih[, c("Age", "Height", "FCVC", "NCP",
"CH2O", "TUE")]
md_iter <- mahalanobis(numerik_iter, colMeans(numerik_iter), cov(numerik_iter))
cutoff <- qchisq(0.975, df = ncol(numerik_iter))
outlier_idx <- which(md_iter > cutoff)
iterasi <- iterasi + 1
cat(sprintf("Iterasi %d: %d outlier ditemukan (n = %d)\n",
iterasi, length(outlier_idx), nrow(data_bersih)))
if (length(outlier_idx) == 0) break
data_bersih <- data_bersih[-outlier_idx, ]
data_bersih$NObeyesdad <- droplevels(data_bersih$NObeyesdad)
}
## Iterasi 1: 45 outlier ditemukan (n = 2111)
## Iterasi 2: 15 outlier ditemukan (n = 2066)
## Iterasi 3: 11 outlier ditemukan (n = 2051)
## Iterasi 4: 5 outlier ditemukan (n = 2040)
## Iterasi 5: 5 outlier ditemukan (n = 2035)
## Iterasi 6: 5 outlier ditemukan (n = 2030)
## Iterasi 7: 7 outlier ditemukan (n = 2025)
## Iterasi 8: 3 outlier ditemukan (n = 2018)
## Iterasi 9: 1 outlier ditemukan (n = 2015)
## Iterasi 10: 0 outlier ditemukan (n = 2014)
cat(sprintf("\nTotal iterasi : %d\n", iterasi))
##
## Total iterasi : 10
cat(sprintf("Data sebelum pembersihan : %d baris\n", nrow(data)))
## Data sebelum pembersihan : 2111 baris
cat(sprintf("Data setelah pembersihan : %d baris\n", nrow(data_bersih)))
## Data setelah pembersihan : 2014 baris
cat(sprintf("Cutoff Mahalanobis : %.4f\n", cutoff))
## Cutoff Mahalanobis : 14.4494
numerik_final <- data_bersih[, c("Age", "Height", "FCVC", "NCP",
"CH2O", "TUE")]
md_final <- mahalanobis(numerik_final, colMeans(numerik_final), cov(numerik_final))
df_md <- data.frame(index = 1:length(md_final),
md = md_final,
outlier = md_final > cutoff)
ggplot(df_md, aes(x = index, y = md, color = outlier)) +
geom_point(alpha = 0.5, size = 1) +
geom_hline(yintercept = cutoff, linetype = "dashed", color = "red", linewidth = 1) +
scale_color_manual(values = c("FALSE" = "steelblue", "TRUE" = "tomato"),
labels = c("Normal", "Outlier")) +
labs(title = "Mahalanobis Distance per Observasi (Setelah Iterasi)",
x = "Index Observasi", y = "Mahalanobis Distance", color = "") +
theme_minimal()
par(mfrow = c(2, 3))
boxplot(data$Age,
main = "Age — Sebelum", col = "salmon", ylab = "Age")
boxplot(data$Height,
main = "Height — Sebelum", col = "salmon", ylab = "Height")
boxplot(data_bersih$Age,
main = "Age — Sesudah", col = "lightgreen", ylab = "Age")
boxplot(data_bersih$Height,
main = "Height — Sesudah", col = "lightgreen", ylab = "Height")
par(mfrow = c(1, 1))
set.seed(42)
train_idx <- createDataPartition(data_bersih$NObeyesdad, p = 0.8, list = FALSE)
data_train <- data_bersih[train_idx, ]
data_test <- data_bersih[-train_idx, ]
cat(sprintf("Train : %d baris\n", nrow(data_train)))
## Train : 1615 baris
cat(sprintf("Test : %d baris\n", nrow(data_test)))
## Test : 399 baris
Proporsi 80:20 cukup ideal untuk melatih dan menguji model.
model_mlr <- multinom(
NObeyesdad ~ Age + I(Age^2) + Height + FCVC + NCP + CH2O + TUE +
family_history_with_overweight + FAVC +
CAEC_bin + CALC_bin + MTRANS_bin,
data = data_train,
trace = FALSE
)
summary(model_mlr)
## Call:
## multinom(formula = NObeyesdad ~ Age + I(Age^2) + Height + FCVC +
## NCP + CH2O + TUE + family_history_with_overweight + FAVC +
## CAEC_bin + CALC_bin + MTRANS_bin, data = data_train, trace = FALSE)
##
## Coefficients:
## (Intercept) Age I(Age^2) Height FCVC
## Normal_Weight 4.573531 0.127660363 -0.0003343641 -1.787211 -0.7475402
## Obesity_Type_I -25.067134 0.078825993 0.0037426884 -1.904553 -1.3798929
## Obesity_Type_II -57.274689 1.804981827 -0.0225272225 13.417714 -0.4316788
## Obesity_Type_III -170.610093 3.624229397 -0.0727387318 -12.370042 36.9828893
## Overweight_Level_I 4.224195 0.004711092 0.0030660449 -1.544368 -0.8485846
## Overweight_Level_II -37.150873 0.462445989 -0.0013025969 1.798071 -1.1715509
## NCP CH2O TUE
## Normal_Weight -0.4002787 0.00831136 -0.374074056
## Obesity_Type_I -0.9002042 0.63594120 -0.473372488
## Obesity_Type_II -0.3181165 -0.56555758 -0.600563999
## Obesity_Type_III 1.1356633 0.74693921 0.018273385
## Overweight_Level_I -0.7605671 0.08721135 -0.451431958
## Overweight_Level_II -0.8724112 0.26922286 0.007242382
## family_history_with_overweightyes FAVCyes CAEC_bin1
## Normal_Weight 0.708889 -0.7196488 -0.5515359
## Obesity_Type_I 4.236942 1.3045824 24.6487568
## Obesity_Type_II 6.449347 0.9132281 -0.6151412
## Obesity_Type_III 11.191539 5.4294122 8.5022607
## Overweight_Level_I 1.875362 0.8050023 -1.7203008
## Overweight_Level_II 3.446367 -1.4222853 26.4682527
## CALC_bin1 MTRANS_bin1
## Normal_Weight 0.51766556 -0.54585441
## Obesity_Type_I -0.18125116 0.69229184
## Obesity_Type_II 0.02405603 2.31563727
## Obesity_Type_III 5.58409670 4.92070724
## Overweight_Level_I 1.37062185 -0.02822878
## Overweight_Level_II 0.01794025 1.38886457
##
## Std. Errors:
## (Intercept) Age I(Age^2) Height FCVC
## Normal_Weight 0.159953907 0.1127981 0.002588456 0.55200413 0.19290523
## Obesity_Type_I 0.176038654 0.1039280 0.002369678 0.44614481 0.22836795
## Obesity_Type_II 0.044149348 0.1133774 0.002563424 0.12340750 0.26237621
## Obesity_Type_III 0.009240034 0.1267781 0.003643962 0.02820446 0.02907381
## Overweight_Level_I 0.173026617 0.1070067 0.002441822 0.52028324 0.21053570
## Overweight_Level_II 0.172492990 0.1059026 0.002374754 0.46418777 0.23891415
## NCP CH2O TUE
## Normal_Weight 0.1349650 0.1744392 0.1712100
## Obesity_Type_I 0.1497365 0.2010087 0.1875128
## Obesity_Type_II 0.1969474 0.2301534 0.2228347
## Obesity_Type_III 0.4477882 0.3079764 0.3927350
## Overweight_Level_I 0.1422576 0.1911471 0.1846507
## Overweight_Level_II 0.1566331 0.2067987 0.1934222
## family_history_with_overweightyes FAVCyes CAEC_bin1
## Normal_Weight 0.217456354 0.24586114 0.386079596
## Obesity_Type_I 0.521355065 0.43760967 0.176038654
## Obesity_Type_II 0.093476275 0.51612894 0.184478776
## Obesity_Type_III 0.009274581 0.08221879 0.009240607
## Overweight_Level_I 0.254791296 0.35562674 0.375623817
## Overweight_Level_II 0.390980717 0.30560640 0.172492990
## CALC_bin1 MTRANS_bin1
## Normal_Weight 0.2144938 0.25760058
## Obesity_Type_I 0.2335660 0.32243632
## Obesity_Type_II 0.2930247 0.37107262
## Obesity_Type_III 0.0792453 0.09809171
## Overweight_Level_I 0.2548665 0.30071566
## Overweight_Level_II 0.2430038 0.34913991
##
## Residual Deviance: 3705.015
## AIC: 3861.015
Faktor usia, tinggi badan, pola makan, dan gaya hidup (aktivitas fisik, konsumsi sayur, alkohol, transportasi) adalah prediktor penting pada pengkategorian obesitas.
z_vals <- summary(model_mlr)$coefficients / summary(model_mlr)$standard.errors
p_vals <- 2 * (1 - pnorm(abs(z_vals)))
cat("=== P-values Koefisien Model MLR ===\n")
## === P-values Koefisien Model MLR ===
print(round(p_vals, 4))
## (Intercept) Age I(Age^2) Height FCVC NCP CH2O
## Normal_Weight 0 0.2577 0.8972 0.0012 0.0001 0.0030 0.9620
## Obesity_Type_I 0 0.4482 0.1142 0.0000 0.0000 0.0000 0.0016
## Obesity_Type_II 0 0.0000 0.0000 0.0000 0.0999 0.1063 0.0140
## Obesity_Type_III 0 0.0000 0.0000 0.0000 0.0000 0.0112 0.0153
## Overweight_Level_I 0 0.9649 0.2092 0.0030 0.0001 0.0000 0.6482
## Overweight_Level_II 0 0.0000 0.5833 0.0001 0.0000 0.0000 0.1930
## TUE family_history_with_overweightyes FAVCyes CAEC_bin1
## Normal_Weight 0.0289 0.0011 0.0034 0.1531
## Obesity_Type_I 0.0116 0.0000 0.0029 0.0000
## Obesity_Type_II 0.0070 0.0000 0.0768 0.0009
## Obesity_Type_III 0.9629 0.0000 0.0000 0.0000
## Overweight_Level_I 0.0145 0.0000 0.0236 0.0000
## Overweight_Level_II 0.9701 0.0000 0.0000 0.0000
## CALC_bin1 MTRANS_bin1
## Normal_Weight 0.0158 0.0341
## Obesity_Type_I 0.4377 0.0318
## Obesity_Type_II 0.9346 0.0000
## Obesity_Type_III 0.0000 0.0000
## Overweight_Level_I 0.0000 0.9252
## Overweight_Level_II 0.9411 0.0001
Nilai Pseudo R² > 0.2 menunjukkan model fit yang baik.
model_null <- multinom(NObeyesdad ~ 1, data = data_train, trace = FALSE)
ll_null <- as.numeric(logLik(model_null))
ll_model <- as.numeric(logLik(model_mlr))
mcfadden_r2 <- 1 - (ll_model / ll_null)
cat(sprintf("Log-Likelihood Null : %.4f\n", ll_null))
## Log-Likelihood Null : -3138.3409
cat(sprintf("Log-Likelihood Model : %.4f\n", ll_model))
## Log-Likelihood Model : -1852.5074
cat(sprintf("Pseudo R² McFadden : %.4f\n", mcfadden_r2))
## Pseudo R² McFadden : 0.4097
if (mcfadden_r2 >= 0.2) {
cat("Kesimpulan: Model fit baik (Pseudo R² ≥ 0.2).\n")
} else if (mcfadden_r2 >= 0.1) {
cat("Kesimpulan: Model fit cukup (0.1 ≤ Pseudo R² < 0.2).\n")
} else {
cat("Kesimpulan: Model fit lemah (Pseudo R² < 0.1).\n")
}
## Kesimpulan: Model fit baik (Pseudo R² ≥ 0.2).
lrt <- lrtest(model_null, model_mlr)
cat("=== Likelihood Ratio Test ===\n")
## === Likelihood Ratio Test ===
print(lrt)
## Likelihood ratio test
##
## Model 1: NObeyesdad ~ 1
## Model 2: NObeyesdad ~ Age + I(Age^2) + Height + FCVC + NCP + CH2O + TUE +
## family_history_with_overweight + FAVC + CAEC_bin + CALC_bin +
## MTRANS_bin
## #Df LogLik Df Chisq Pr(>Chisq)
## 1 6 -3138.3
## 2 78 -1852.5 72 2571.7 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
if (lrt$`Pr(>Chisq)`[2] < 0.05) {
cat("\nKesimpulan: Model signifikan secara keseluruhan (p < 0.05).\n")
} else {
cat("\nKesimpulan: Model tidak signifikan secara keseluruhan (p ≥ 0.05).\n")
}
##
## Kesimpulan: Model signifikan secara keseluruhan (p < 0.05).
pred_class <- predict(model_mlr, newdata = data_test)
cm <- confusionMatrix(pred_class, data_test$NObeyesdad)
print(cm)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Insufficient_Weight Normal_Weight Obesity_Type_I
## Insufficient_Weight 24 19 3
## Normal_Weight 10 13 0
## Obesity_Type_I 6 7 41
## Obesity_Type_II 3 1 10
## Obesity_Type_III 0 4 1
## Overweight_Level_I 9 5 5
## Overweight_Level_II 2 5 2
## Reference
## Prediction Obesity_Type_II Obesity_Type_III Overweight_Level_I
## Insufficient_Weight 0 0 5
## Normal_Weight 0 0 8
## Obesity_Type_I 9 0 13
## Obesity_Type_II 42 0 6
## Obesity_Type_III 2 64 2
## Overweight_Level_I 1 0 17
## Overweight_Level_II 4 0 3
## Reference
## Prediction Overweight_Level_II
## Insufficient_Weight 2
## Normal_Weight 5
## Obesity_Type_I 18
## Obesity_Type_II 8
## Obesity_Type_III 0
## Overweight_Level_I 4
## Overweight_Level_II 16
##
## Overall Statistics
##
## Accuracy : 0.5439
## 95% CI : (0.4936, 0.5935)
## No Information Rate : 0.1604
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4657
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Insufficient_Weight Class: Normal_Weight
## Sensitivity 0.44444 0.24074
## Specificity 0.91594 0.93333
## Pos Pred Value 0.45283 0.36111
## Neg Pred Value 0.91329 0.88705
## Prevalence 0.13534 0.13534
## Detection Rate 0.06015 0.03258
## Detection Prevalence 0.13283 0.09023
## Balanced Accuracy 0.68019 0.58704
## Class: Obesity_Type_I Class: Obesity_Type_II
## Sensitivity 0.6613 0.7241
## Specificity 0.8427 0.9179
## Pos Pred Value 0.4362 0.6000
## Neg Pred Value 0.9311 0.9514
## Prevalence 0.1554 0.1454
## Detection Rate 0.1028 0.1053
## Detection Prevalence 0.2356 0.1754
## Balanced Accuracy 0.7520 0.8210
## Class: Obesity_Type_III Class: Overweight_Level_I
## Sensitivity 1.0000 0.31481
## Specificity 0.9731 0.93043
## Pos Pred Value 0.8767 0.41463
## Neg Pred Value 1.0000 0.89665
## Prevalence 0.1604 0.13534
## Detection Rate 0.1604 0.04261
## Detection Prevalence 0.1830 0.10276
## Balanced Accuracy 0.9866 0.62262
## Class: Overweight_Level_II
## Sensitivity 0.3019
## Specificity 0.9538
## Pos Pred Value 0.5000
## Neg Pred Value 0.8992
## Prevalence 0.1328
## Detection Rate 0.0401
## Detection Prevalence 0.0802
## Balanced Accuracy 0.6278
akurasi <- mean(pred_class == data_test$NObeyesdad)
cat(sprintf("Akurasi Model MLR : %.2f%%\n", akurasi * 100))
## Akurasi Model MLR : 54.39%
akurasi_kelas <- as.data.frame(cm$byClass)
akurasi_kelas$Kelas <- gsub("Class: ", "", rownames(akurasi_kelas))
ggplot(akurasi_kelas, aes(x = reorder(Kelas, `Balanced Accuracy`),
y = `Balanced Accuracy`,
fill = `Balanced Accuracy`)) +
geom_col() +
geom_hline(yintercept = 0.7, linetype = "dashed", color = "red", linewidth = 0.8) +
geom_text(aes(label = sprintf("%.2f", `Balanced Accuracy`)),
hjust = -0.1, size = 3.5) +
coord_flip() +
scale_fill_gradient(low = "salmon", high = "steelblue") +
scale_y_continuous(limits = c(0, 1.05)) +
labs(title = "Balanced Accuracy per Kelas Obesitas",
subtitle = "Garis merah = threshold 0.70",
x = "Kelas Obesitas", y = "Balanced Accuracy") +
theme_minimal() +
theme(legend.position = "none")
cm_table <- as.data.frame(cm$table)
colnames(cm_table) <- c("Prediksi", "Aktual", "Frekuensi")
ggplot(cm_table, aes(x = Aktual, y = Prediksi, fill = Frekuensi)) +
geom_tile(color = "white") +
geom_text(aes(label = Frekuensi), size = 4, fontface = "bold") +
scale_fill_gradient(low = "white", high = "steelblue") +
labs(title = "Confusion Matrix — Model MLR (Heatmap)",
x = "Kelas Aktual", y = "Kelas Prediksi") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 30, hjust = 1))
model_awal <- multinom(
NObeyesdad ~ Age + Height + FCVC + NCP + CH2O + TUE +
family_history_with_overweight + FAVC +
CAEC_bin + CALC_bin + MTRANS_bin,
data = data_train, trace = FALSE
)
aic_final <- AIC(model_awal, model_mlr)
rownames(aic_final) <- c("Model Tanpa Age²", "Model Final (dengan Age²)")
cat("=== Perbandingan AIC ===\n")
## === Perbandingan AIC ===
print(aic_final)
## df AIC
## Model Tanpa Age² 72 3905.371
## Model Final (dengan Age²) 78 3861.015
if (aic_final["Model Final (dengan Age²)", "AIC"] <
aic_final["Model Tanpa Age²", "AIC"]) {
cat("\nKesimpulan: Model Final lebih baik (AIC lebih kecil).\n")
} else {
cat("\nKesimpulan: Model Tanpa Age² sudah cukup baik.\n")
}
##
## Kesimpulan: Model Final lebih baik (AIC lebih kecil).
Analisis Diskriminan (Linear/LDA dan Quadratic/QDA) digunakan sebagai metode klasifikasi pembanding terhadap MLR.
LDA mengasumsikan: a. Distribusi multivariat normal per kelas b. Matriks kovarians SAMA antar kelas (homogen)
Karena Analisis Diskriminan hanya menerima prediktor numerik, variabel kategorik (FAF_cat, CAEC_bin, CALC_bin, MTRANS_bin, family_history_with_overweight, FAVC) diencode menjadi dummy numerik terlebih dahulu.
Analisis Diskriminan mengasumsikan setiap kelas mengikuti distribusi normal multivariat. Karena uji formal Mardia memerlukan library tambahan, normalitas dievaluasi secara visual melalui Q-Q plot dan histogram per variabel.
cat("Uji Asumsi: Normalitas per Variabel (Prediktor Numerik)\n")
## Uji Asumsi: Normalitas per Variabel (Prediktor Numerik)
numerik_da <- data_bersih[, c("Age", "Height", "FCVC", "NCP", "CH2O", "TUE")]
par(mfrow = c(2, 3))
for (v in colnames(numerik_da)) {
qqnorm(numerik_da[[v]], main = paste("Q-Q Plot:", v), col = "steelblue")
qqline(numerik_da[[v]], col = "red", lwd = 2)
}
par(mfrow = c(1, 1))
# Uji Shapiro-Wilk per variabel numerik
cat("\nUji Shapiro-Wilk per Variabel Numerik:\n")
##
## Uji Shapiro-Wilk per Variabel Numerik:
for (v in colnames(numerik_da)) {
sw <- shapiro.test(sample(numerik_da[[v]], min(5000, nrow(numerik_da))))
cat(sprintf("%-10s : W = %.4f, p = %.4f %s\n",
v, sw$statistic, sw$p.value,
ifelse(sw$p.value > 0.05,
"✓ Normal",
"✗ Tidak normal (namun DA cukup robust untuk n besar)")))
}
## Age : W = 0.9019, p = 0.0000 ✗ Tidak normal (namun DA cukup robust untuk n besar)
## Height : W = 0.9936, p = 0.0000 ✗ Tidak normal (namun DA cukup robust untuk n besar)
## FCVC : W = 0.8437, p = 0.0000 ✗ Tidak normal (namun DA cukup robust untuk n besar)
## NCP : W = 0.7407, p = 0.0000 ✗ Tidak normal (namun DA cukup robust untuk n besar)
## CH2O : W = 0.9358, p = 0.0000 ✗ Tidak normal (namun DA cukup robust untuk n besar)
## TUE : W = 0.8942, p = 0.0000 ✗ Tidak normal (namun DA cukup robust untuk n besar)
LDA mengasumsikan matriks kovarians sama antar kelas. Box’s M Test: H0 = matriks kovarians homogen antar kelas. Jika p < 0.05 → tolak H0 → gunakan QDA.
cat("Uji Homogenitas Kovarians (Box's M via heuristic)\n")
## Uji Homogenitas Kovarians (Box's M via heuristic)
kelas_list <- levels(data_bersih$NObeyesdad)
n_total <- nrow(data_bersih)
p <- ncol(numerik_da)
k <- length(kelas_list)
cov_list <- lapply(kelas_list, function(kl) {
sub <- numerik_da[data_bersih$NObeyesdad == kl, ]
list(n = nrow(sub), cov = cov(sub))
})
names(cov_list) <- kelas_list
cat("Determinan Matriks Kovarians per Kelas:\n")
## Determinan Matriks Kovarians per Kelas:
det_vals <- sapply(cov_list, function(x) det(x$cov))
for (i in seq_along(kelas_list)) {
cat(sprintf(" %-22s : det = %.6f\n", kelas_list[i], det_vals[i]))
}
## Insufficient_Weight : det = 0.000609
## Normal_Weight : det = 0.005385
## Obesity_Type_I : det = 0.003141
## Obesity_Type_II : det = 0.000191
## Obesity_Type_III : det = 0.000000
## Overweight_Level_I : det = 0.005141
## Overweight_Level_II : det = 0.001936
rasio_det <- max(det_vals) / min(det_vals)
cat(sprintf("\nRasio det_max / det_min : %.2f\n", rasio_det))
##
## Rasio det_max / det_min : Inf
if (rasio_det > 10) {
cat("Kesimpulan: Matriks kovarians HETEROGEN → QDA lebih sesuai daripada LDA.\n")
} else {
cat("Kesimpulan: Matriks kovarians relatif HOMOGEN → LDA dapat digunakan.\n")
}
## Kesimpulan: Matriks kovarians HETEROGEN → QDA lebih sesuai daripada LDA.
Encode semua variabel kategorik menjadi numerik (dummy coding)
encode_dummy <- function(df) {
df$family_history_yes <- as.integer(df$family_history_with_overweight == "yes")
df$FAVC_yes <- as.integer(df$FAVC == "yes")
df$CAEC_bin_num <- as.integer(as.character(df$CAEC_bin))
df$CALC_bin_num <- as.integer(as.character(df$CALC_bin))
df$MTRANS_bin_num <- as.integer(as.character(df$MTRANS_bin))
# FAF_cat: dummy (referensi = Tidak_Aktif)
df$FAF_Rendah <- as.integer(df$FAF_cat == "Rendah")
df$FAF_Sedang <- as.integer(df$FAF_cat == "Sedang")
df$FAF_Tinggi <- as.integer(df$FAF_cat == "Tinggi")
return(df)
}
train_da <- encode_dummy(data_train)
test_da <- encode_dummy(data_test)
# Kolom prediktor untuk DA
pred_cols_da <- c("Age", "Height", "FCVC", "NCP", "CH2O", "TUE",
"family_history_yes", "FAVC_yes",
"CAEC_bin_num", "CALC_bin_num", "MTRANS_bin_num",
"FAF_Rendah", "FAF_Sedang", "FAF_Tinggi")
X_train <- train_da[, pred_cols_da]
y_train <- train_da$NObeyesdad
X_test <- test_da[, pred_cols_da]
y_test <- test_da$NObeyesdad
QDA mencari batas keputusan kuadratik antar kelas dengan mengasumsikan setiap kelas memiliki matriks kovarians sendiri.
X_train_clean <- X_train
X_test_clean <- X_test
colnames(X_train_clean) <- make.names(colnames(X_train_clean), unique = TRUE)
colnames(X_test_clean) <- make.names(colnames(X_test_clean), unique = TRUE)
pca_obj <- prcomp(X_train, center = TRUE, scale. = TRUE)
cum_var <- cumsum(pca_obj$sdev^2 / sum(pca_obj$sdev^2))
n_comp_90 <- which(cum_var >= 0.90)[1]
X_train_pca_tmp <- as.data.frame(pca_obj$x[, 1:n_comp_90])
min_rank <- min(sapply(levels(y_train), function(grp) {
qr(cov(X_train_pca_tmp[y_train == grp, ]))$rank
}))
n_comp <- min(n_comp_90, min_rank)
cat(sprintf("PC yang digunakan: %d (%.2f%% varians)\n",
n_comp, cum_var[n_comp] * 100))
## PC yang digunakan: 7 (70.86% varians)
X_train_pca <- as.data.frame(pca_obj$x[, 1:n_comp])
X_test_pca <- as.data.frame(predict(pca_obj, newdata = X_test)[, 1:n_comp])
model_qda <- MASS::qda(x = X_train_pca, grouping = y_train)
cat("\nProporsi Prior per Kelas:\n")
##
## Proporsi Prior per Kelas:
for (i in seq_along(model_qda$prior)) {
cat(sprintf(" %-25s : %.4f (%.2f%%)\n",
names(model_qda$prior)[i],
model_qda$prior[i],
model_qda$prior[i] * 100))
}
## Insufficient_Weight : 0.1344 (13.44%)
## Normal_Weight : 0.1356 (13.56%)
## Obesity_Type_I : 0.1560 (15.60%)
## Obesity_Type_II : 0.1443 (14.43%)
## Obesity_Type_III : 0.1610 (16.10%)
## Overweight_Level_I : 0.1350 (13.50%)
## Overweight_Level_II : 0.1337 (13.37%)
pred_qda <- predict(model_qda, newdata = X_test_pca)
class_qda <- pred_qda$class
cm_qda <- confusionMatrix(class_qda, y_test)
cat("\n=== Confusion Matrix QDA ===\n")
##
## === Confusion Matrix QDA ===
print(cm_qda)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Insufficient_Weight Normal_Weight Obesity_Type_I
## Insufficient_Weight 27 19 2
## Normal_Weight 2 12 2
## Obesity_Type_I 16 11 36
## Obesity_Type_II 3 4 19
## Obesity_Type_III 0 1 0
## Overweight_Level_I 4 6 0
## Overweight_Level_II 2 1 3
## Reference
## Prediction Obesity_Type_II Obesity_Type_III Overweight_Level_I
## Insufficient_Weight 1 0 2
## Normal_Weight 0 0 6
## Obesity_Type_I 0 0 18
## Obesity_Type_II 55 0 14
## Obesity_Type_III 0 64 0
## Overweight_Level_I 1 0 10
## Overweight_Level_II 1 0 4
## Reference
## Prediction Overweight_Level_II
## Insufficient_Weight 4
## Normal_Weight 10
## Obesity_Type_I 11
## Obesity_Type_II 17
## Obesity_Type_III 0
## Overweight_Level_I 1
## Overweight_Level_II 10
##
## Overall Statistics
##
## Accuracy : 0.5363
## 95% CI : (0.486, 0.5861)
## No Information Rate : 0.1604
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4566
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Insufficient_Weight Class: Normal_Weight
## Sensitivity 0.50000 0.22222
## Specificity 0.91884 0.94203
## Pos Pred Value 0.49091 0.37500
## Neg Pred Value 0.92151 0.88556
## Prevalence 0.13534 0.13534
## Detection Rate 0.06767 0.03008
## Detection Prevalence 0.13784 0.08020
## Balanced Accuracy 0.70942 0.58213
## Class: Obesity_Type_I Class: Obesity_Type_II
## Sensitivity 0.58065 0.9483
## Specificity 0.83383 0.8328
## Pos Pred Value 0.39130 0.4911
## Neg Pred Value 0.91531 0.9895
## Prevalence 0.15539 0.1454
## Detection Rate 0.09023 0.1378
## Detection Prevalence 0.23058 0.2807
## Balanced Accuracy 0.70724 0.8906
## Class: Obesity_Type_III Class: Overweight_Level_I
## Sensitivity 1.0000 0.18519
## Specificity 0.9970 0.96522
## Pos Pred Value 0.9846 0.45455
## Neg Pred Value 1.0000 0.88329
## Prevalence 0.1604 0.13534
## Detection Rate 0.1604 0.02506
## Detection Prevalence 0.1629 0.05514
## Balanced Accuracy 0.9985 0.57520
## Class: Overweight_Level_II
## Sensitivity 0.18868
## Specificity 0.96821
## Pos Pred Value 0.47619
## Neg Pred Value 0.88624
## Prevalence 0.13283
## Detection Rate 0.02506
## Detection Prevalence 0.05263
## Balanced Accuracy 0.57844
akurasi_qda <- mean(class_qda == y_test)
cat(sprintf("Akurasi Model QDA : %.2f%%\n", akurasi_qda * 100))
## Akurasi Model QDA : 53.63%
akurasi_kelas_qda <- as.data.frame(cm_qda$byClass)
akurasi_kelas_qda$Kelas <- gsub("Class: ", "", rownames(akurasi_kelas_qda))
akurasi_kelas_qda$Model <- "QDA"
ggplot(akurasi_kelas_qda,
aes(x = reorder(Kelas, `Balanced Accuracy`),
y = `Balanced Accuracy`, fill = `Balanced Accuracy`)) +
geom_col() +
geom_hline(yintercept = 0.7, linetype = "dashed", color = "red", linewidth = 0.8) +
geom_text(aes(label = sprintf("%.2f", `Balanced Accuracy`)),
hjust = -0.1, size = 3.5) +
coord_flip() +
scale_fill_gradient(low = "salmon", high = "steelblue") +
scale_y_continuous(limits = c(0, 1.05)) +
labs(title = "Balanced Accuracy per Kelas — Model QDA",
subtitle = "Garis merah = threshold 0.70",
x = "Kelas Obesitas", y = "Balanced Accuracy") +
theme_minimal() +
theme(legend.position = "none")
cm_tbl_qda <- as.data.frame(cm_qda$table)
colnames(cm_tbl_qda) <- c("Prediksi", "Aktual", "Frekuensi")
ggplot(cm_tbl_qda, aes(x = Aktual, y = Prediksi, fill = Frekuensi)) +
geom_tile(color = "white") +
geom_text(aes(label = Frekuensi), size = 4, fontface = "bold") +
scale_fill_gradient(low = "white", high = "tomato") +
labs(title = "Confusion Matrix — Model QDA",
x = "Kelas Aktual", y = "Kelas Prediksi") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 30, hjust = 1))
post_qda <- as.data.frame(pred_qda$posterior)
post_qda$Kelas <- y_test
post_qda$Prediksi <- class_qda
kelas_cols <- setdiff(colnames(post_qda), c("Kelas", "Prediksi"))
# Hitung varians tiap kolom posterior
var_vals <- sapply(kelas_cols, function(k) var(post_qda[[k]]))
top2_cols <- names(sort(var_vals, decreasing = TRUE))[1:2]
ggplot(post_qda,
aes(x = .data[[top2_cols[1]]],
y = .data[[top2_cols[2]]],
color = Kelas)) +
geom_point(alpha = 0.6, size = 1.5) +
stat_ellipse(type = "norm", level = 0.8, linewidth = 0.8) +
labs(title = "Scatter Plot Posterior Probability QDA",
subtitle = sprintf("Elips menunjukkan wilayah 80%% kepercayaan per kelas\nSumbu: posterior prob. kelas '%s' vs '%s'",
top2_cols[1], top2_cols[2]),
x = paste("P(Kelas =", top2_cols[1], ")"),
y = paste("P(Kelas =", top2_cols[2], ")"),
color = "Kelas Obesitas") +
theme_minimal() +
theme(legend.position = "right")
cat("\nProporsi Prior per Kelas:\n")
##
## Proporsi Prior per Kelas:
for (i in seq_along(model_qda$prior)) {
cat(sprintf(" %s : %.4f (%.2f%%)\n",
names(model_qda$prior)[i],
model_qda$prior[i],
model_qda$prior[i] * 100))
}
## Insufficient_Weight : 0.1344 (13.44%)
## Normal_Weight : 0.1356 (13.56%)
## Obesity_Type_I : 0.1560 (15.60%)
## Obesity_Type_II : 0.1443 (14.43%)
## Obesity_Type_III : 0.1610 (16.10%)
## Overweight_Level_I : 0.1350 (13.50%)
## Overweight_Level_II : 0.1337 (13.37%)
pred_qda <- predict(model_qda, newdata = X_test_pca)
class_qda <- pred_qda$class
cm_qda <- confusionMatrix(class_qda, y_test)
cat("\n=== Confusion Matrix QDA ===\n")
##
## === Confusion Matrix QDA ===
print(cm_qda)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Insufficient_Weight Normal_Weight Obesity_Type_I
## Insufficient_Weight 27 19 2
## Normal_Weight 2 12 2
## Obesity_Type_I 16 11 36
## Obesity_Type_II 3 4 19
## Obesity_Type_III 0 1 0
## Overweight_Level_I 4 6 0
## Overweight_Level_II 2 1 3
## Reference
## Prediction Obesity_Type_II Obesity_Type_III Overweight_Level_I
## Insufficient_Weight 1 0 2
## Normal_Weight 0 0 6
## Obesity_Type_I 0 0 18
## Obesity_Type_II 55 0 14
## Obesity_Type_III 0 64 0
## Overweight_Level_I 1 0 10
## Overweight_Level_II 1 0 4
## Reference
## Prediction Overweight_Level_II
## Insufficient_Weight 4
## Normal_Weight 10
## Obesity_Type_I 11
## Obesity_Type_II 17
## Obesity_Type_III 0
## Overweight_Level_I 1
## Overweight_Level_II 10
##
## Overall Statistics
##
## Accuracy : 0.5363
## 95% CI : (0.486, 0.5861)
## No Information Rate : 0.1604
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4566
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Insufficient_Weight Class: Normal_Weight
## Sensitivity 0.50000 0.22222
## Specificity 0.91884 0.94203
## Pos Pred Value 0.49091 0.37500
## Neg Pred Value 0.92151 0.88556
## Prevalence 0.13534 0.13534
## Detection Rate 0.06767 0.03008
## Detection Prevalence 0.13784 0.08020
## Balanced Accuracy 0.70942 0.58213
## Class: Obesity_Type_I Class: Obesity_Type_II
## Sensitivity 0.58065 0.9483
## Specificity 0.83383 0.8328
## Pos Pred Value 0.39130 0.4911
## Neg Pred Value 0.91531 0.9895
## Prevalence 0.15539 0.1454
## Detection Rate 0.09023 0.1378
## Detection Prevalence 0.23058 0.2807
## Balanced Accuracy 0.70724 0.8906
## Class: Obesity_Type_III Class: Overweight_Level_I
## Sensitivity 1.0000 0.18519
## Specificity 0.9970 0.96522
## Pos Pred Value 0.9846 0.45455
## Neg Pred Value 1.0000 0.88329
## Prevalence 0.1604 0.13534
## Detection Rate 0.1604 0.02506
## Detection Prevalence 0.1629 0.05514
## Balanced Accuracy 0.9985 0.57520
## Class: Overweight_Level_II
## Sensitivity 0.18868
## Specificity 0.96821
## Pos Pred Value 0.47619
## Neg Pred Value 0.88624
## Prevalence 0.13283
## Detection Rate 0.02506
## Detection Prevalence 0.05263
## Balanced Accuracy 0.57844
akurasi_qda <- mean(class_qda == y_test)
cat(sprintf("Akurasi Model QDA : %.2f%%\n", akurasi_qda * 100))
## Akurasi Model QDA : 53.63%
akurasi_kelas_qda <- as.data.frame(cm_qda$byClass)
akurasi_kelas_qda$Kelas <- gsub("Class: ", "", rownames(akurasi_kelas_qda))
akurasi_kelas_qda$Model <- "QDA"
ggplot(akurasi_kelas_qda,
aes(x = reorder(Kelas, `Balanced Accuracy`),
y = `Balanced Accuracy`, fill = `Balanced Accuracy`)) +
geom_col() +
geom_hline(yintercept = 0.7, linetype = "dashed", color = "red", linewidth = 0.8) +
geom_text(aes(label = sprintf("%.2f", `Balanced Accuracy`)),
hjust = -0.1, size = 3.5) +
coord_flip() +
scale_fill_gradient(low = "salmon", high = "steelblue") +
scale_y_continuous(limits = c(0, 1.05)) +
labs(title = "Balanced Accuracy per Kelas — Model QDA",
subtitle = "Garis merah = threshold 0.70",
x = "Kelas Obesitas", y = "Balanced Accuracy") +
theme_minimal() +
theme(legend.position = "none")
cm_tbl_qda <- as.data.frame(cm_qda$table)
colnames(cm_tbl_qda) <- c("Prediksi", "Aktual", "Frekuensi")
ggplot(cm_tbl_qda, aes(x = Aktual, y = Prediksi, fill = Frekuensi)) +
geom_tile(color = "white") +
geom_text(aes(label = Frekuensi), size = 4, fontface = "bold") +
scale_fill_gradient(low = "white", high = "tomato") +
labs(title = "Confusion Matrix — Model QDA",
x = "Kelas Aktual", y = "Kelas Prediksi") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 30, hjust = 1))
Multinomial Logistic Regression (MLR) Model signifikan secara statistik (Likelihood Ratio Test) Memiliki goodness-of-fit yang cukup baik (Pseudo R² McFadden) Akurasi cukup tinggi dan stabil Interpretasi model lebih mudah
Quadratic Discriminant Analysis (QDA) Cocok karena kovarians antar kelas berbeda Mampu menangkap hubungan non-linear antar variabel Performa bisa lebih baik di beberapa kelas, tapi lebih sensitif terhadap data dan butuh preprocessing tambahan (PCA)
Model Multinomial Logistic Regression memberikan performa terbaik dalam klasifikasi tingkat obesitas dengan akurasi yang tinggi dan interpretasi yang lebih mudah dibandingkan QDA. Variabel usia, pola makan, dan gaya hidup terbukti berpengaruh signifikan terhadap tingkat obesitas.