Dataset ini berisi data tingkat obesitas (NObeyesdad)
dengan 7 kategori ordinal yang memiliki urutan dari
ringan ke berat:
Insufficient_Weight < Normal_Weight < Overweight_Level_I < Overweight_Level_II < Obesity_Type_I < Obesity_Type_II < Obesity_Type_III
Karena variabel target bersifat ordinal (ada urutan/tingkatan), model yang digunakan adalah Ordinal Logistic Regression (Proportional Odds Model).
Model ini tetap mensyaratkan 4 asumsi yang sama dengan Multinomial Logistic Regression:
Mengapa variabel kontinu wajib ada? Asumsi Linearity dan No Outliers hanya dapat diuji pada variabel numerik/kontinu. Variabel kategorik tidak memiliki outlier dan tidak perlu diuji linearitasnya. Dataset ini sudah memenuhi semua syarat:
NObeyesdad (7
level)# Install packages jika belum tersedia
pkgs <- c("MASS", "car", "ggplot2", "tidyverse", "corrplot", "gridExtra", "nnet")
for (p in pkgs) if (!require(p, character.only = TRUE)) install.packages(p)
library(MASS)
library(car)
library(ggplot2)
library(tidyverse)
library(corrplot)
library(gridExtra)
library(nnet)
# Fungsi bersihkan nilai numerik rusak (misal "25.196.214")
fix_numeric <- function(x) {
sapply(x, function(v) {
v <- trimws(v)
n_dots <- nchar(v) - nchar(gsub("\\.", "", v))
if (n_dots > 1) {
parts <- strsplit(v, "\\.")[[1]]
v <- paste0(parts[1], ".", paste(parts[-1], collapse = ""))
}
suppressWarnings(as.numeric(v))
})
}
# Sesuaikan path dengan lokasi file di komputer Anda
df_raw <- read.csv("ObesityDataSet.csv",
sep = ";", stringsAsFactors = FALSE)
cont_vars <- c("Age", "Height", "Weight", "FCVC", "NCP", "CH2O", "FAF", "TUE")
cat_vars <- c("Gender", "family_history_with_overweight", "FAVC",
"CAEC", "SMOKE", "SCC", "CALC", "MTRANS")
df <- df_raw
for (v in cont_vars) df[[v]] <- fix_numeric(df_raw[[v]])
for (v in cat_vars) df[[v]] <- as.factor(df[[v]])
# Target sebagai ordered factor (ordinal)
df$NObeyesdad <- factor(df$NObeyesdad,
levels = c("Insufficient_Weight", "Normal_Weight",
"Overweight_Level_I", "Overweight_Level_II",
"Obesity_Type_I", "Obesity_Type_II", "Obesity_Type_III"),
ordered = TRUE)
cat("Dimensi data :", nrow(df), "baris x", ncol(df), "kolom\n")## Dimensi data : 2111 baris x 17 kolom
##
## Distribusi kelas target:
##
## Insufficient_Weight Normal_Weight Overweight_Level_I Overweight_Level_II
## 272 287 290 290
## Obesity_Type_I Obesity_Type_II Obesity_Type_III
## 351 297 324
##
## NA per variabel kontinu:
## Age Height Weight FCVC NCP CH2O FAF TUE
## 0 0 0 0 0 0 0 0
## Rows: 2,111
## Columns: 17
## $ Gender <fct> Female, Female, Male, Male, Male, Male,…
## $ Age <dbl> 21, 21, 23, 27, 22, 29, 23, 22, 24, 22,…
## $ Height <dbl> 1.62, 1.52, 1.80, 1.80, 1.78, 1.62, 1.5…
## $ Weight <dbl> 64.0, 56.0, 77.0, 87.0, 89.8, 53.0, 55.…
## $ family_history_with_overweight <fct> yes, yes, yes, no, no, no, yes, no, yes…
## $ FAVC <fct> no, no, no, no, no, yes, yes, no, yes, …
## $ FCVC <dbl> 2, 3, 2, 3, 2, 2, 3, 2, 3, 2, 3, 2, 3, …
## $ NCP <dbl> 3, 3, 3, 3, 1, 3, 3, 3, 3, 3, 3, 3, 3, …
## $ CAEC <fct> Sometimes, Sometimes, Sometimes, Someti…
## $ SMOKE <fct> no, yes, no, no, no, no, no, no, no, no…
## $ CH2O <dbl> 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 3, …
## $ SCC <fct> no, yes, no, no, no, no, no, no, no, no…
## $ FAF <dbl> 0, 3, 2, 2, 0, 0, 1, 3, 1, 1, 2, 2, 2, …
## $ TUE <dbl> 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 2, 1, 0, …
## $ CALC <fct> no, Sometimes, Frequently, Frequently, …
## $ MTRANS <fct> Public_Transportation, Public_Transport…
## $ NObeyesdad <ord> Normal_Weight, Normal_Weight, Normal_We…
Asumsi linearity mensyaratkan hubungan antara log-odds dan variabel prediktor kontinu bersifat linear. Pengujian menggunakan Box-Tidwell test dengan menambahkan interaksi \(X \cdot \ln(X)\). Jika interaksi tidak signifikan (p > 0.05) maka asumsi linear terpenuhi.
Pada data biologis (Age, Weight, Height), hubungannya dengan log-odds sering tidak linear secara alami. Solusi standar: transformasi logaritmik yang mempersempit rentang nilai ekstrem sehingga hubungan menjadi lebih linear.
Aturan transformasi:
log(x)log(x + 1) agar tidak tak terdefinisivars_log1p <- c("FAF", "TUE") # bisa bernilai 0 -> log(x+1)
vars_log <- setdiff(cont_vars, vars_log1p) # selalu > 0 -> log(x)
df_t <- df
for (v in vars_log) df_t[[v]] <- log(df_t[[v]])
for (v in vars_log1p) df_t[[v]] <- log1p(df_t[[v]])
cat("Transformasi berhasil diterapkan:\n")## Transformasi berhasil diterapkan:
## - log(x) : Age, Height, Weight, FCVC, NCP, CH2O
## - log1p(x) : FAF, TUE
# Pastikan tidak ada nilai tidak finite
check <- sapply(cont_vars, function(v) sum(!is.finite(df_t[[v]])))
cat("Nilai tidak finite setelah transformasi (harus semua 0):\n")## Nilai tidak finite setelah transformasi (harus semua 0):
## Age Height Weight FCVC NCP CH2O FAF TUE
## 0 0 0 0 0 0 0 0
df_bt <- df_t[complete.cases(df_t[, cont_vars]), ]
# Buat interaksi X_transformed * ln(X_transformed)
for (v in cont_vars) {
x <- df_bt[[v]]
if (any(x <= 0, na.rm = TRUE)) x <- x - min(x, na.rm = TRUE) + 0.001
df_bt[[paste0(v, "_ln")]] <- x * log(x)
}
# Model multinomial unordered sebagai basis Box-Tidwell
df_bt$target_unord <- relevel(
factor(as.character(df_bt$NObeyesdad)), ref = "Normal_Weight")
bt_formula <- as.formula(paste(
"target_unord ~",
paste(cont_vars, collapse = " + "), "+",
paste(paste0(cont_vars, "_ln"), collapse = " + ")
))
set.seed(42)
model_bt <- multinom(bt_formula, data = df_bt, trace = FALSE, maxit = 500)
z_bt <- summary(model_bt)$coefficients / summary(model_bt)$standard.errors
p_bt <- 2 * (1 - pnorm(abs(z_bt)))
ln_terms <- paste0(cont_vars, "_ln")
p_ln <- apply(p_bt[, ln_terms, drop = FALSE], 2, min)
bt_result <- data.frame(
Variabel = cont_vars,
Transformasi = c("log(Age)", "log(Height)", "log(Weight)",
"log(FCVC)", "log(NCP)", "log(CH2O)",
"log1p(FAF)", "log1p(TUE)"),
Min_p_value = round(p_ln, 4),
Asumsi = ifelse(p_ln > 0.05, "TERPENUHI", "DILANGGAR")
)
rownames(bt_result) <- NULL
knitr::kable(bt_result,
caption = "Hasil Box-Tidwell Test SETELAH Transformasi Log")| Variabel | Transformasi | Min_p_value | Asumsi |
|---|---|---|---|
| Age | log(Age) | 0.1531 | TERPENUHI |
| Height | log(Height) | 0.0008 | DILANGGAR |
| Weight | log(Weight) | 0.0000 | DILANGGAR |
| FCVC | log(FCVC) | 0.0000 | DILANGGAR |
| NCP | log(NCP) | 0.0000 | DILANGGAR |
| CH2O | log(CH2O) | 0.0389 | DILANGGAR |
| FAF | log1p(FAF) | 0.0000 | DILANGGAR |
| TUE | log1p(TUE) | 0.0000 | DILANGGAR |
Interpretasi: Variabel dengan
p > 0.05 dinyatakan memenuhi asumsi linearity setelah
transformasi log. Jika masih ada yang dilanggar, hal ini wajar pada data
nyata dan model tetap dapat dilanjutkan karena Ordinal Logistic
Regression cukup robust.
## === CEK ASUMSI INDEPENDENCE ===
## Total observasi : 2111
## Duplikat baris : 24
## Dataset bersifat cross-sectional:
## Setiap baris = 1 individu unik, tidak ada pengukuran berulang.
## Asumsi Independence secara desain TERPENUHI.
Multikolinearitas terjadi ketika dua atau lebih prediktor saling berkorelasi tinggi, menyebabkan koefisien tidak stabil. Pengecekan dengan correlation matrix dan VIF (Variance Inflation Factor). Threshold: VIF > 10 = masalah serius.
cor_mat <- cor(df_t[, cont_vars], use = "complete.obs")
corrplot(cor_mat,
method = "color",
type = "upper",
addCoef.col = "black",
number.cex = 0.75,
tl.cex = 0.85,
col = colorRampPalette(c("#2166AC", "white", "#D6604D"))(200),
title = "Correlation Matrix Variabel Kontinu (Setelah Transformasi Log)",
mar = c(0, 0, 2, 0))df_vif <- df_t
df_vif$NObeyesdad_num <- as.numeric(df_t$NObeyesdad)
vif_formula <- as.formula(paste(
"NObeyesdad_num ~",
paste(cont_vars, collapse = " + "),
"+ Gender + family_history_with_overweight + FAVC +",
"CAEC + SMOKE + SCC + CALC + MTRANS"
))
lm_vif <- lm(vif_formula, data = df_vif)
vif_val <- vif(lm_vif)
vif_df <- data.frame(
Variabel = names(vif_val[, 1]),
VIF = round(vif_val[, 1], 3),
Status = ifelse(vif_val[, 1] < 5, "Aman",
ifelse(vif_val[, 1] < 10, "Perlu Perhatian", "Masalah Serius"))
)
rownames(vif_df) <- NULL
knitr::kable(vif_df,
caption = "Nilai VIF per Variabel (Setelah Transformasi Log)")| Variabel | VIF | Status |
|---|---|---|
| Age | 1.061 | Aman |
| Height | 1.020 | Aman |
| Weight | 1.075 | Aman |
| FCVC | 1.020 | Aman |
| NCP | 1.030 | Aman |
| CH2O | 1.026 | Aman |
| FAF | 1.030 | Aman |
| TUE | 1.044 | Aman |
| Gender | 1.098 | Aman |
| family_history_with_overweight | 1.249 | Aman |
| FAVC | 1.163 | Aman |
| CAEC | 1.311 | Aman |
| SMOKE | 1.030 | Aman |
| SCC | 1.090 | Aman |
| CALC | 1.142 | Aman |
| MTRANS | 1.227 | Aman |
Interpretasi: Semua VIF < 5 menunjukkan tidak ada multikolinearitas. Asumsi No Multicollinearity TERPENUHI.
Logistic Regression sensitif terhadap outlier. Strategi penanganan:
detect_outliers <- function(x, var_name) {
Q1 <- quantile(x, 0.25, na.rm = TRUE)
Q3 <- quantile(x, 0.75, na.rm = TRUE)
IQR_val <- Q3 - Q1
lower <- Q1 - 1.5 * IQR_val
upper <- Q3 + 1.5 * IQR_val
n_out <- sum(x < lower | x > upper, na.rm = TRUE)
data.frame(
Variabel = var_name,
Batas_Bawah = round(lower, 3), Batas_Atas = round(upper, 3),
Jumlah_Outlier = n_out,
Persen = round(n_out / length(x) * 100, 2))
}
outlier_tbl <- do.call(rbind,
lapply(cont_vars, function(v) detect_outliers(df_t[[v]], v)))
rownames(outlier_tbl) <- NULL
knitr::kable(outlier_tbl,
caption = "Outlier Setelah Transformasi Log (Sebelum Capping)")| Variabel | Batas_Bawah | Batas_Atas | Jumlah_Outlier | Persen |
|---|---|---|---|---|
| Age | 2.490 | 3.719 | 155 | 7.34 |
| Height | 0.356 | 0.715 | 157 | 7.44 |
| Weight | 3.254 | 5.494 | 138 | 6.54 |
| FCVC | 0.085 | 1.707 | 139 | 6.58 |
| NCP | 0.894 | 1.221 | 670 | 31.74 |
| CH2O | -0.216 | 1.670 | 109 | 5.16 |
| FAF | -1.251 | 2.398 | 76 | 3.60 |
| TUE | -1.040 | 1.733 | 35 | 1.66 |
Transformasi log sudah mengurangi jumlah outlier dibanding data mentah karena nilai ekstrem dipersempit secara logaritmik.
plots_before <- lapply(cont_vars, function(v) {
ggplot(df_t, aes_string(y = v)) +
geom_boxplot(fill = "#4292C6", color = "#084594",
outlier.color = "red", outlier.size = 1.5) +
labs(title = v, y = "", x = "") +
theme_minimal(base_size = 10) +
theme(plot.title = element_text(face = "bold", hjust = 0.5))
})
grid.arrange(grobs = plots_before, ncol = 4,
top = "Boxplot (Log Transform) — SEBELUM Capping")cap_outlier <- function(x) {
Q1 <- quantile(x, 0.25, na.rm = TRUE)
Q3 <- quantile(x, 0.75, na.rm = TRUE)
IQR_val <- Q3 - Q1
x[x < Q1 - 1.5 * IQR_val] <- Q1 - 1.5 * IQR_val
x[x > Q3 + 1.5 * IQR_val] <- Q3 + 1.5 * IQR_val
x
}
df_clean <- df_t
for (v in cont_vars) df_clean[[v]] <- cap_outlier(df_clean[[v]])
cat("=== PERBANDINGAN OUTLIER (Setelah Log, Sebelum vs Sesudah Capping) ===\n\n")## === PERBANDINGAN OUTLIER (Setelah Log, Sebelum vs Sesudah Capping) ===
for (v in cont_vars) {
Q1 <- quantile(df_t[[v]], 0.25); Q3 <- quantile(df_t[[v]], 0.75)
n_before <- sum(df_t[[v]] < Q1 - 1.5*IQR(df_t[[v]]) |
df_t[[v]] > Q3 + 1.5*IQR(df_t[[v]]))
Q1c <- quantile(df_clean[[v]], 0.25); Q3c <- quantile(df_clean[[v]], 0.75)
n_after <- sum(df_clean[[v]] < Q1c - 1.5*IQR(df_clean[[v]]) |
df_clean[[v]] > Q3c + 1.5*IQR(df_clean[[v]]))
cat(sprintf("%-12s: Sebelum capping = %3d outlier | Sesudah = %d\n",
v, n_before, n_after))
}## Age : Sebelum capping = 155 outlier | Sesudah = 0
## Height : Sebelum capping = 157 outlier | Sesudah = 0
## Weight : Sebelum capping = 138 outlier | Sesudah = 0
## FCVC : Sebelum capping = 139 outlier | Sesudah = 0
## NCP : Sebelum capping = 670 outlier | Sesudah = 0
## CH2O : Sebelum capping = 109 outlier | Sesudah = 0
## FAF : Sebelum capping = 76 outlier | Sesudah = 0
## TUE : Sebelum capping = 35 outlier | Sesudah = 0
plots_after <- lapply(cont_vars, function(v) {
ggplot(df_clean, aes_string(y = v)) +
geom_boxplot(fill = "#74C476", color = "#00441B",
outlier.color = "red", outlier.size = 1.5) +
labs(title = v, y = "", x = "") +
theme_minimal(base_size = 10) +
theme(plot.title = element_text(face = "bold", hjust = 0.5))
})
grid.arrange(grobs = plots_after, ncol = 4,
top = "Boxplot (Log Transform + Capping) — SESUDAH Capping")summary_asumsi <- data.frame(
No = 1:4,
Asumsi = c("Linearity", "Independence", "No Multicollinearity", "No Outliers"),
Metode = c("Box-Tidwell Test",
"Desain studi cross-sectional",
"Correlation Matrix + VIF",
"IQR Method + Winsorizing"),
Penanganan = c("Transformasi log(x) dan log1p(x) pada variabel kontinu",
"Tidak diperlukan",
"Tidak diperlukan — semua VIF < 5",
"Log transform + Capping pada batas IQR"),
Status = c("Lihat tabel Box-Tidwell (target: p > 0.05)",
"TERPENUHI — data cross-sectional",
"TERPENUHI — semua VIF < 5",
"DITANGANI — 0 outlier setelah capping")
)
knitr::kable(summary_asumsi,
caption = "Ringkasan Uji Asumsi Ordinal Logistic Regression",
col.names = c("No", "Asumsi", "Metode Uji", "Penanganan", "Status"))| No | Asumsi | Metode Uji | Penanganan | Status |
|---|---|---|---|---|
| 1 | Linearity | Box-Tidwell Test | Transformasi log(x) dan log1p(x) pada variabel kontinu | Lihat tabel Box-Tidwell (target: p > 0.05) |
| 2 | Independence | Desain studi cross-sectional | Tidak diperlukan | TERPENUHI — data cross-sectional |
| 3 | No Multicollinearity | Correlation Matrix + VIF | Tidak diperlukan — semua VIF < 5 | TERPENUHI — semua VIF < 5 |
| 4 | No Outliers | IQR Method + Winsorizing | Log transform + Capping pada batas IQR | DITANGANI — 0 outlier setelah capping |
## Dimensi df_clean: 2111 baris x 17 kolom
## Distribusi kelas target:
##
## Insufficient_Weight Normal_Weight Overweight_Level_I Overweight_Level_II
## 272 287 290 290
## Obesity_Type_I Obesity_Type_II Obesity_Type_III
## 351 297 324
##
## Variabel kontinu sudah melalui:
## 1. Transformasi log / log1p
## 2. Capping outlier (Winsorizing)
## df_clean siap digunakan untuk pemodelan Ordinal Logistic Regression.
Laporan dibuat untuk keperluan praktikum Generalized Linear Models - Data Science UNESA