1. Import Library

library(MASS)
library(brant)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
## 
##     select
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)

2. Load Dataset

2.1 Membaca Dataset

data <- read.csv("ObesityDataSet_raw_and_data_sinthetic.csv")

head(data)
##   Age Gender Height Weight       CALC FAVC FCVC NCP SCC SMOKE CH2O
## 1  21 Female   1.62   64.0         no   no    2   3  no    no    2
## 2  21 Female   1.52   56.0  Sometimes   no    3   3 yes   yes    3
## 3  23   Male   1.80   77.0 Frequently   no    2   3  no    no    2
## 4  27   Male   1.80   87.0 Frequently   no    3   3  no    no    2
## 5  22   Male   1.78   89.8  Sometimes   no    2   1  no    no    2
## 6  29   Male   1.62   53.0  Sometimes  yes    2   3  no    no    2
##   family_history_with_overweight FAF TUE      CAEC                MTRANS
## 1                            yes   0   1 Sometimes Public_Transportation
## 2                            yes   3   0 Sometimes Public_Transportation
## 3                            yes   2   1 Sometimes Public_Transportation
## 4                             no   2   0 Sometimes               Walking
## 5                             no   0   0 Sometimes Public_Transportation
## 6                             no   0   0 Sometimes            Automobile
##            NObeyesdad
## 1       Normal_Weight
## 2       Normal_Weight
## 3       Normal_Weight
## 4  Overweight_Level_I
## 5 Overweight_Level_II
## 6       Normal_Weight

2.2 Struktur Dataset

str(data)
## 'data.frame':    2111 obs. of  17 variables:
##  $ Age                           : num  21 21 23 27 22 29 23 22 24 22 ...
##  $ Gender                        : chr  "Female" "Female" "Male" "Male" ...
##  $ Height                        : num  1.62 1.52 1.8 1.8 1.78 1.62 1.5 1.64 1.78 1.72 ...
##  $ Weight                        : num  64 56 77 87 89.8 53 55 53 64 68 ...
##  $ CALC                          : chr  "no" "Sometimes" "Frequently" "Frequently" ...
##  $ FAVC                          : chr  "no" "no" "no" "no" ...
##  $ FCVC                          : num  2 3 2 3 2 2 3 2 3 2 ...
##  $ NCP                           : num  3 3 3 3 1 3 3 3 3 3 ...
##  $ SCC                           : chr  "no" "yes" "no" "no" ...
##  $ SMOKE                         : chr  "no" "yes" "no" "no" ...
##  $ CH2O                          : num  2 3 2 2 2 2 2 2 2 2 ...
##  $ family_history_with_overweight: chr  "yes" "yes" "yes" "no" ...
##  $ FAF                           : num  0 3 2 2 0 0 1 3 1 1 ...
##  $ TUE                           : num  1 0 1 0 0 0 0 0 1 1 ...
##  $ CAEC                          : chr  "Sometimes" "Sometimes" "Sometimes" "Sometimes" ...
##  $ MTRANS                        : chr  "Public_Transportation" "Public_Transportation" "Public_Transportation" "Walking" ...
##  $ NObeyesdad                    : chr  "Normal_Weight" "Normal_Weight" "Normal_Weight" "Overweight_Level_I" ...

2.3 Statistik Deskriptif

summary(data)
##       Age           Gender              Height          Weight      
##  Min.   :14.00   Length:2111        Min.   :1.450   Min.   : 39.00  
##  1st Qu.:19.95   Class :character   1st Qu.:1.630   1st Qu.: 65.47  
##  Median :22.78   Mode  :character   Median :1.700   Median : 83.00  
##  Mean   :24.31                      Mean   :1.702   Mean   : 86.59  
##  3rd Qu.:26.00                      3rd Qu.:1.768   3rd Qu.:107.43  
##  Max.   :61.00                      Max.   :1.980   Max.   :173.00  
##      CALC               FAVC                FCVC            NCP       
##  Length:2111        Length:2111        Min.   :1.000   Min.   :1.000  
##  Class :character   Class :character   1st Qu.:2.000   1st Qu.:2.659  
##  Mode  :character   Mode  :character   Median :2.386   Median :3.000  
##                                        Mean   :2.419   Mean   :2.686  
##                                        3rd Qu.:3.000   3rd Qu.:3.000  
##                                        Max.   :3.000   Max.   :4.000  
##      SCC               SMOKE                CH2O      
##  Length:2111        Length:2111        Min.   :1.000  
##  Class :character   Class :character   1st Qu.:1.585  
##  Mode  :character   Mode  :character   Median :2.000  
##                                        Mean   :2.008  
##                                        3rd Qu.:2.477  
##                                        Max.   :3.000  
##  family_history_with_overweight      FAF              TUE        
##  Length:2111                    Min.   :0.0000   Min.   :0.0000  
##  Class :character               1st Qu.:0.1245   1st Qu.:0.0000  
##  Mode  :character               Median :1.0000   Median :0.6253  
##                                 Mean   :1.0103   Mean   :0.6579  
##                                 3rd Qu.:1.6667   3rd Qu.:1.0000  
##                                 Max.   :3.0000   Max.   :2.0000  
##      CAEC              MTRANS           NObeyesdad       
##  Length:2111        Length:2111        Length:2111       
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
## 

2.4 Missing Value & Penghapusan Data Duplikat

colSums(is.na(data))
##                            Age                         Gender 
##                              0                              0 
##                         Height                         Weight 
##                              0                              0 
##                           CALC                           FAVC 
##                              0                              0 
##                           FCVC                            NCP 
##                              0                              0 
##                            SCC                          SMOKE 
##                              0                              0 
##                           CH2O family_history_with_overweight 
##                              0                              0 
##                            FAF                            TUE 
##                              0                              0 
##                           CAEC                         MTRANS 
##                              0                              0 
##                     NObeyesdad 
##                              0
nrow(data)
## [1] 2111
data <- unique(data)

nrow(data)
## [1] 2087

2.5 Distribusi Target

table(data$NObeyesdad)
## 
## Insufficient_Weight       Normal_Weight      Obesity_Type_I     Obesity_Type_II 
##                 267                 282                 351                 297 
##    Obesity_Type_III  Overweight_Level_I Overweight_Level_II 
##                 324                 276                 290

Visualsi Target

par(mar = c(10,4,4,2))  # tambah margin bawah

barplot(table(data$NObeyesdad),
        main = "Distribusi Tingkat Obesitas",
        col = "lightblue",
        las = 2,
        cex.names = 0.7)  # kecilin font

3. Preprocessing

3.1 Deteksi Outlier

numeric_cols <- c(
  "FCVC",
  "NCP",
  "CH2O",
  "FAF",
  "TUE"
)

outlier_summary <- c()

for (col in numeric_cols) {
  
  Q1 <- quantile(data[[col]], 0.25)
  Q3 <- quantile(data[[col]], 0.75)
  IQR <- Q3 - Q1
  
  lower <- Q1 - 1.5 * IQR
  upper <- Q3 + 1.5 * IQR
  
  outliers <- data[[col]][
    data[[col]] < lower |
    data[[col]] > upper
  ]
  
  outlier_summary[col] <- length(outliers)
}

outlier_summary
## FCVC  NCP CH2O  FAF  TUE 
##    0  577    0    0    0

3.2 Visualisasi Outlier

for (col in numeric_cols) {
  boxplot(data[[col]],
          main = paste("Boxplot", col),
          col = "lightblue")
}

3.3 Penanganan Outlier

for (col in numeric_cols) {
  Q1 <- quantile(data[[col]], 0.25)
  Q3 <- quantile(data[[col]], 0.75)
  IQR <- Q3 - Q1
  
  lower <- Q1 - 1.5 * IQR
  upper <- Q3 + 1.5 * IQR
  
  data[[col]][data[[col]] < lower] <- lower
  data[[col]][data[[col]] > upper] <- upper
}

3.5 Analisis Korelasi

for (col in numeric_cols) {
  cat(col, ": min =", min(data[[col]]),
      "max =", max(data[[col]]), "\n")
}
## FCVC : min = 1 max = 3 
## NCP : min = 2.243667 max = 3.453799 
## CH2O : min = 1 max = 3 
## FAF : min = 0 max = 3 
## TUE : min = 0 max = 2
# ambil hanya kolom numerik
numeric_data <- data[, sapply(data, is.numeric)]

# cek dulu (biar aman)
str(numeric_data)
## 'data.frame':    2087 obs. of  8 variables:
##  $ Age   : num  21 21 23 27 22 29 23 22 24 22 ...
##  $ Height: num  1.62 1.52 1.8 1.8 1.78 1.62 1.5 1.64 1.78 1.72 ...
##  $ Weight: num  64 56 77 87 89.8 53 55 53 64 68 ...
##  $ FCVC  : num  2 3 2 3 2 2 3 2 3 2 ...
##  $ NCP   : num  3 3 3 3 2.24 ...
##  $ CH2O  : num  2 3 2 2 2 2 2 2 2 2 ...
##  $ FAF   : num  0 3 2 2 0 0 1 3 1 1 ...
##  $ TUE   : num  1 0 1 0 0 0 0 0 1 1 ...
# hitung korelasi (AMAN dari NA)
cor_matrix <- cor(numeric_data, use = "complete.obs")

# tampilkan
cor_matrix
##                Age      Height      Weight        FCVC         NCP        CH2O
## Age     1.00000000 -0.03174825  0.19816049  0.01357180 -0.09210744 -0.04405777
## Height -0.03174825  1.00000000  0.45746802 -0.04036257  0.19069750  0.22048705
## Weight  0.19816049  0.45746802  1.00000000  0.21657440  0.03755726  0.20382302
## FCVC    0.01357180 -0.04036257  0.21657440  1.00000000  0.03465198  0.08133170
## NCP    -0.09210744  0.19069750  0.03755726  0.03465198  1.00000000  0.07353687
## CH2O   -0.04405777  0.22048705  0.20382302  0.08133170  0.07353687  1.00000000
## FAF    -0.14820151  0.29358397 -0.05649007  0.02200298  0.12694700  0.16530995
## TUE    -0.30292652  0.04180763 -0.07935127 -0.10412758  0.02258690  0.02070441
##                FAF         TUE
## Age    -0.14820151 -0.30292652
## Height  0.29358397  0.04180763
## Weight -0.05649007 -0.07935127
## FCVC    0.02200298 -0.10412758
## NCP     0.12694700  0.02258690
## CH2O    0.16530995  0.02070441
## FAF     1.00000000  0.05871604
## TUE     0.05871604  1.00000000

3.6 Visualisasi Korelasi

if (!require(corrplot)) install.packages("corrplot")
## Loading required package: corrplot
## corrplot 0.95 loaded
library(corrplot)

corrplot(cor_matrix,
         method = "color",
         type = "upper",
         tl.col = "black",
         tl.srt = 45,
         addCoef.col = "black")

4. Transformasi & Pembagian Data

4.1 Transformasi Variabel Target

Variabel NObeyesdad diubah menjadi ordered factor dengan urutan tingkat obesitas dari rendah hingga tinggi. Hal ini bertujuan agar model dapat memahami hubungan berurutan antar kategori dalam analisis Ordinal Logistic Regression.

data$NObeyesdad <- factor(data$NObeyesdad,
 levels = c("Insufficient_Weight",
            "Normal_Weight",
            "Overweight_Level_I",
            "Overweight_Level_II",
            "Obesity_Type_I",
            "Obesity_Type_II",
            "Obesity_Type_III"),
 ordered = TRUE)

4.2 Encoding Variabel Kategorik

Variabel kategorikal seperti Gender, FAVC, CAEC, SMOKE, SCC, CALC, MTRANS, dan family_history_with_overweight dikonversi menjadi factor agar dapat diproses dengan benar oleh model statistik

factor_cols <- c(
  "Gender",
  "family_history_with_overweight",
  "FAVC",
  "SMOKE",
  "SCC",
  "MTRANS"
)

data[factor_cols] <- lapply(data[factor_cols], as.factor)
data$CAEC <- factor(
  data$CAEC,
  levels = c("no", "Sometimes", "Frequently", "Always"),
  ordered = TRUE
)

data$CALC <- factor(
  data$CALC,
  levels = c("no", "Sometimes", "Frequently", "Always"),
  ordered = TRUE
)

4.3 Distribusi Variabel Target

Distribusi data pada setiap kategori NObeyesdad relatif seimbang, sehingga tidak terdapat dominasi kelas tertentu dan model dapat belajar dengan baik dari seluruh kategori

summary(data$NObeyesdad)
## Insufficient_Weight       Normal_Weight  Overweight_Level_I Overweight_Level_II 
##                 267                 282                 276                 290 
##      Obesity_Type_I     Obesity_Type_II    Obesity_Type_III 
##                 351                 297                 324

4.4 Validasi Struktur Data

Hasil pengecekan menunjukkan seluruh variabel telah memiliki tipe data yang sesuai, sehingga dataset siap digunakan untuk tahap pemodelan.

str(data)
## 'data.frame':    2087 obs. of  17 variables:
##  $ Age                           : num  21 21 23 27 22 29 23 22 24 22 ...
##  $ Gender                        : Factor w/ 2 levels "Female","Male": 1 1 2 2 2 2 1 2 2 2 ...
##  $ Height                        : num  1.62 1.52 1.8 1.8 1.78 1.62 1.5 1.64 1.78 1.72 ...
##  $ Weight                        : num  64 56 77 87 89.8 53 55 53 64 68 ...
##  $ CALC                          : Ord.factor w/ 4 levels "no"<"Sometimes"<..: 1 2 3 3 2 2 2 2 3 1 ...
##  $ FAVC                          : Factor w/ 2 levels "no","yes": 1 1 1 1 1 2 2 1 2 2 ...
##  $ FCVC                          : num  2 3 2 3 2 2 3 2 3 2 ...
##  $ NCP                           : num  3 3 3 3 2.24 ...
##  $ SCC                           : Factor w/ 2 levels "no","yes": 1 2 1 1 1 1 1 1 1 1 ...
##  $ SMOKE                         : Factor w/ 2 levels "no","yes": 1 2 1 1 1 1 1 1 1 1 ...
##  $ CH2O                          : num  2 3 2 2 2 2 2 2 2 2 ...
##  $ family_history_with_overweight: Factor w/ 2 levels "no","yes": 2 2 2 1 1 1 2 1 2 2 ...
##  $ FAF                           : num  0 3 2 2 0 0 1 3 1 1 ...
##  $ TUE                           : num  1 0 1 0 0 0 0 0 1 1 ...
##  $ CAEC                          : Ord.factor w/ 4 levels "no"<"Sometimes"<..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ MTRANS                        : Factor w/ 5 levels "Automobile","Bike",..: 4 4 4 5 4 1 3 4 4 4 ...
##  $ NObeyesdad                    : Ord.factor w/ 7 levels "Insufficient_Weight"<..: 2 2 2 3 4 2 2 2 2 2 ...

4.4 Data Splitting

Data dibagi menjadi data latih (training) dan data uji (testing) untuk memastikan model dapat dievaluasi pada data yang tidak digunakan saat pelatihan. Pembagian ini dilakukan secara acak dengan proporsi tertentu (70% data latih dan 30% data uji).

set.seed(123)

train_index <- sample(1:nrow(data), 0.7*nrow(data))

train <- data[train_index, ]
test  <- data[-train_index, ]

4.6 Standarisasi Data Numerik

num_vars <- c(
  "Age",
  "Height",
  "Weight",
  "FCVC",
  "NCP",
  "CH2O",
  "FAF",
  "TUE"
)
preproc <- preProcess(
  train[, num_vars],
  method = c("center", "scale")
)

train[, num_vars] <- predict(
  preproc,
  train[, num_vars]
)

test[, num_vars] <- predict(
  preproc,
  test[, num_vars]
)

5. Pembangunan Model

5.1 Pembangunan Model Regresi Logistik Ordinal

Model dibangun menggunakan metode Ordinal Logistic Regression dengan pendekatan cumulative logit. Variabel dependen berupa tingkat obesitas yang bersifat ordinal, sedangkan variabel independen meliputi Weight, FAF, CH2O, dan Age. Estimasi parameter dilakukan menggunakan metode Maximum Likelihood.

sapply(train, class)
## $Age
## [1] "numeric"
## 
## $Gender
## [1] "factor"
## 
## $Height
## [1] "numeric"
## 
## $Weight
## [1] "numeric"
## 
## $CALC
## [1] "ordered" "factor" 
## 
## $FAVC
## [1] "factor"
## 
## $FCVC
## [1] "numeric"
## 
## $NCP
## [1] "numeric"
## 
## $SCC
## [1] "factor"
## 
## $SMOKE
## [1] "factor"
## 
## $CH2O
## [1] "numeric"
## 
## $family_history_with_overweight
## [1] "factor"
## 
## $FAF
## [1] "numeric"
## 
## $TUE
## [1] "numeric"
## 
## $CAEC
## [1] "ordered" "factor" 
## 
## $MTRANS
## [1] "factor"
## 
## $NObeyesdad
## [1] "ordered" "factor"
model <- polr(
  NObeyesdad ~ Gender + Age + Weight +
    family_history_with_overweight +
    FAVC + FCVC + CH2O + FAF +
    CAEC + MTRANS + CALC,
  data = train,
  Hess = TRUE
)

5.2 Hasil Model

summary(model)
## Call:
## polr(formula = NObeyesdad ~ Gender + Age + Weight + family_history_with_overweight + 
##     FAVC + FCVC + CH2O + FAF + CAEC + MTRANS + CALC, data = train, 
##     Hess = TRUE)
## 
## Coefficients:
##                                      Value Std. Error  t value
## GenderMale                        -3.54727    0.17398 -20.3887
## Age                                0.54538    0.08077   6.7519
## Weight                             7.43273    0.25302  29.3761
## family_history_with_overweightyes  0.37135    0.18965   1.9581
## FAVCyes                           -0.52647    0.19079  -2.7595
## FCVC                               0.13141    0.06503   2.0209
## CH2O                              -0.25041    0.06402  -3.9113
## FAF                               -0.47514    0.06631  -7.1660
## CAEC.L                            -1.84429    0.37337  -4.9395
## CAEC.Q                             0.86628    0.29268   2.9598
## CAEC.C                             0.83071    0.18945   4.3849
## MTRANSBike                         0.77989    1.10141   0.7081
## MTRANSMotorbike                    2.44126    0.72273   3.3778
## MTRANSPublic_Transportation        1.18742    0.18780   6.3229
## MTRANSWalking                      0.06679    0.39714   0.1682
## CALC.L                             1.77506    1.49367   1.1884
## CALC.Q                             2.05451    1.12315   1.8292
## CALC.C                             0.77425    0.54101   1.4311
## 
## Intercepts:
##                                        Value    Std. Error t value 
## Insufficient_Weight|Normal_Weight      -11.0359   0.7525   -14.6652
## Normal_Weight|Overweight_Level_I        -7.0398   0.6990   -10.0713
## Overweight_Level_I|Overweight_Level_II  -3.9538   0.6797    -5.8173
## Overweight_Level_II|Obesity_Type_I      -1.1998   0.6733    -1.7819
## Obesity_Type_I|Obesity_Type_II           3.1836   0.6810     4.6750
## Obesity_Type_II|Obesity_Type_III         7.3190   0.7020    10.4252
## 
## Residual Deviance: 2132.359 
## AIC: 2180.359

5.3 Uji Asumsi Proportional Odds

# Menguji apakah asumsi proportional odds terpenuhi.
# Jika p-value > 0.05, asumsi terpenuhi dan model valid.

brant(model)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## -------------------------------------------------------------------- 
## Test for             X2  df  probability 
## -------------------------------------------------------------------- 
## Omnibus                  923.7   90  0
## GenderMale               46.02   5   0
## Age                  45.34   5   0
## Weight                   2.64    5   0.76
## family_history_with_overweightyes    11.74   5   0.04
## FAVCyes                  35.91   5   0
## FCVC                 4.33    5   0.5
## CH2O                 6.18    5   0.29
## FAF                  4.1 5   0.53
## CAEC.L                   10.85   5   0.05
## CAEC.Q                   13.25   5   0.02
## CAEC.C                   1.97    5   0.85
## MTRANSBike               68.6    5   0
## MTRANSMotorbike          -0.56   5   1
## MTRANSPublic_Transportation      17.48   5   0
## MTRANSWalking                10.91   5   0.05
## CALC.L                   0   5   1
## CALC.Q                   0   5   1
## CALC.C                   0   5   1
## -------------------------------------------------------------------- 
## 
## H0: Parallel Regression Assumption holds
## Warning in brant(model): 4236 combinations in table(dv,ivs) do not occur.
## Because of that, the test results might be invalid.

5.4 Uji Signifikansi

Uji signifikansi dilakukan untuk mengetahui apakah variabel independen memiliki pengaruh yang signifikan terhadap variabel dependen. Pengujian dilakukan menggunakan nilai p-value, dengan kriteria bahwa variabel dikatakan signifikan jika p-value < 0.05.

ctable <- coef(summary(model))

p_value <- pnorm(abs(ctable[, "t value"]), lower.tail = FALSE) * 2

hasil <- cbind(ctable, "p value" = p_value)

hasil
##                                               Value Std. Error     t value
## GenderMale                              -3.54727486 0.17398224 -20.3887184
## Age                                      0.54538232 0.08077462   6.7519022
## Weight                                   7.43273219 0.25302011  29.3760537
## family_history_with_overweightyes        0.37135099 0.18965034   1.9580824
## FAVCyes                                 -0.52647178 0.19078723  -2.7594707
## FCVC                                     0.13140760 0.06502528   2.0208694
## CH2O                                    -0.25040827 0.06402202  -3.9112837
## FAF                                     -0.47514187 0.06630511  -7.1659917
## CAEC.L                                  -1.84428701 0.37337363  -4.9395214
## CAEC.Q                                   0.86628106 0.29268148   2.9598082
## CAEC.C                                   0.83070823 0.18944855   4.3848750
## MTRANSBike                               0.77988958 1.10141394   0.7080804
## MTRANSMotorbike                          2.44125950 0.72273123   3.3778248
## MTRANSPublic_Transportation              1.18741560 0.18779504   6.3229336
## MTRANSWalking                            0.06678712 0.39714401   0.1681685
## CALC.L                                   1.77506094 1.49367060   1.1883885
## CALC.Q                                   2.05450673 1.12314992   1.8292364
## CALC.C                                   0.77424656 0.54100951   1.4311145
## Insufficient_Weight|Normal_Weight      -11.03593202 0.75252688 -14.6651665
## Normal_Weight|Overweight_Level_I        -7.03977526 0.69899266 -10.0713149
## Overweight_Level_I|Overweight_Level_II  -3.95377444 0.67966258  -5.8172608
## Overweight_Level_II|Obesity_Type_I      -1.19976857 0.67330348  -1.7819135
## Obesity_Type_I|Obesity_Type_II           3.18361089 0.68099252   4.6749572
## Obesity_Type_II|Obesity_Type_III         7.31896116 0.70204640  10.4251815
##                                              p value
## GenderMale                              2.105907e-92
## Age                                     1.459192e-11
## Weight                                 1.110923e-189
## family_history_with_overweightyes       5.022035e-02
## FAVCyes                                 5.789508e-03
## FCVC                                    4.329329e-02
## CH2O                                    9.180684e-05
## FAF                                     7.722543e-13
## CAEC.L                                  7.831456e-07
## CAEC.Q                                  3.078306e-03
## CAEC.C                                  1.160524e-05
## MTRANSBike                              4.788954e-01
## MTRANSMotorbike                         7.306160e-04
## MTRANSPublic_Transportation             2.566436e-10
## MTRANSWalking                           8.664507e-01
## CALC.L                                  2.346804e-01
## CALC.Q                                  6.736420e-02
## CALC.C                                  1.523974e-01
## Insufficient_Weight|Normal_Weight       1.077525e-48
## Normal_Weight|Overweight_Level_I        7.398227e-24
## Overweight_Level_I|Overweight_Level_II  5.981978e-09
## Overweight_Level_II|Obesity_Type_I      7.476334e-02
## Obesity_Type_I|Obesity_Type_II          2.940149e-06
## Obesity_Type_II|Obesity_Type_III        1.902957e-25

5.5 Perhitungan Odds Ratio

Odds ratio dihitung untuk menginterpretasikan besarnya pengaruh variabel independen terhadap peluang perubahan kategori pada variabel dependen. Nilai ini diperoleh dengan melakukan eksponensial terhadap koefisien model.

odds_ratio <- exp(coef(model))
odds_ratio
##                        GenderMale                               Age 
##                      2.880303e-02                      1.725268e+00 
##                            Weight family_history_with_overweightyes 
##                      1.690420e+03                      1.449692e+00 
##                           FAVCyes                              FCVC 
##                      5.906854e-01                      1.140433e+00 
##                              CH2O                               FAF 
##                      7.784829e-01                      6.217968e-01 
##                            CAEC.L                            CAEC.Q 
##                      1.581380e-01                      2.378051e+00 
##                            CAEC.C                        MTRANSBike 
##                      2.294944e+00                      2.181231e+00 
##                   MTRANSMotorbike       MTRANSPublic_Transportation 
##                      1.148750e+01                      3.278597e+00 
##                     MTRANSWalking                            CALC.L 
##                      1.069068e+00                      5.900641e+00 
##                            CALC.Q                            CALC.C 
##                      7.802988e+00                      2.168957e+00

5.6 Penentuan Variabel Paling Berpengaruh

Variabel yang paling berpengaruh ditentukan berdasarkan besar nilai koefisien atau odds ratio. Variabel dengan nilai terbesar dianggap memiliki pengaruh paling dominan terhadap tingkat obesitas.

sort(abs(coef(model)), decreasing = TRUE)
##                            Weight                        GenderMale 
##                        7.43273219                        3.54727486 
##                   MTRANSMotorbike                            CALC.Q 
##                        2.44125950                        2.05450673 
##                            CAEC.L                            CALC.L 
##                        1.84428701                        1.77506094 
##       MTRANSPublic_Transportation                            CAEC.Q 
##                        1.18741560                        0.86628106 
##                            CAEC.C                        MTRANSBike 
##                        0.83070823                        0.77988958 
##                            CALC.C                               Age 
##                        0.77424656                        0.54538232 
##                           FAVCyes                               FAF 
##                        0.52647178                        0.47514187 
## family_history_with_overweightyes                              CH2O 
##                        0.37135099                        0.25040827 
##                              FCVC                     MTRANSWalking 
##                        0.13140760                        0.06678712

6. Evaluasi Model

6.1 Akurasi Model

# Konversi keduanya ke character dulu sebelum dibandingkan
pred_class <- predict(model, test)

# Akurasi - fix error
mean(as.character(pred_class) == as.character(test$NObeyesdad))
## [1] 0.6443381
actual_num <- as.numeric(test$NObeyesdad)
pred_num   <- as.numeric(factor(as.character(pred_class),
                                levels = levels(test$NObeyesdad)))
mean(abs(actual_num - pred_num))
## [1] 0.3572568
pred_prob <- predict(model, test, type = "probs")
head(pred_prob)
##    Insufficient_Weight Normal_Weight Overweight_Level_I Overweight_Level_II
## 3         6.282075e-03  2.495784e-01        0.626855457         0.108895069
## 7         1.564267e-02  4.479643e-01        0.486189891         0.046848907
## 15        2.370660e-01  7.070686e-01        0.053169483         0.002523809
## 21        6.925176e-03  2.680578e-01        0.617515015         0.099890984
## 22        5.619729e-07  3.000209e-05        0.000638037         0.009729822
## 25        4.034994e-02  6.554079e-01        0.284656489         0.018315281
##    Obesity_Type_I Obesity_Type_II Obesity_Type_III
## 3    0.0082833973    1.039060e-04     1.689392e-06
## 7    0.0033122079    4.133813e-05     6.720674e-07
## 15   0.0001699398    2.114190e-06     3.437074e-08
## 21   0.0075152696    9.419683e-05     1.531517e-06
## 22   0.4466405683    5.243110e-01     1.865003e-02
## 25   0.0012544811    1.562393e-05     2.540043e-07

6.2 Confusion Matrix

confusionMatrix(
  factor(as.character(pred_class), levels = levels(test$NObeyesdad)),
  factor(as.character(test$NObeyesdad), levels = levels(test$NObeyesdad))
)
## Confusion Matrix and Statistics
## 
##                      Reference
## Prediction            Insufficient_Weight Normal_Weight Overweight_Level_I
##   Insufficient_Weight                  59            14                  0
##   Normal_Weight                        12            48                  9
##   Overweight_Level_I                    0            23                 35
##   Overweight_Level_II                   0             0                 35
##   Obesity_Type_I                        0             0                  1
##   Obesity_Type_II                       0             0                  0
##   Obesity_Type_III                      0             0                  0
##                      Reference
## Prediction            Overweight_Level_II Obesity_Type_I Obesity_Type_II
##   Insufficient_Weight                   0              0               0
##   Normal_Weight                         0              0               0
##   Overweight_Level_I                   24              0               0
##   Overweight_Level_II                  37             17               0
##   Obesity_Type_I                       26             76              14
##   Obesity_Type_II                       0             10              75
##   Obesity_Type_III                      0              0              12
##                      Reference
## Prediction            Obesity_Type_III
##   Insufficient_Weight                0
##   Normal_Weight                      0
##   Overweight_Level_I                 0
##   Overweight_Level_II                0
##   Obesity_Type_I                     0
##   Obesity_Type_II                   26
##   Obesity_Type_III                  74
## 
## Overall Statistics
##                                           
##                Accuracy : 0.6443          
##                  95% CI : (0.6055, 0.6819)
##     No Information Rate : 0.1643          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.5838          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Insufficient_Weight Class: Normal_Weight
## Sensitivity                              0.8310              0.56471
## Specificity                              0.9748              0.96125
## Pos Pred Value                           0.8082              0.69565
## Neg Pred Value                           0.9783              0.93369
## Prevalence                               0.1132              0.13557
## Detection Rate                           0.0941              0.07656
## Detection Prevalence                     0.1164              0.11005
## Balanced Accuracy                        0.9029              0.76298
##                      Class: Overweight_Level_I Class: Overweight_Level_II
## Sensitivity                            0.43750                    0.42529
## Specificity                            0.91408                    0.90370
## Pos Pred Value                         0.42683                    0.41573
## Neg Pred Value                         0.91743                    0.90706
## Prevalence                             0.12759                    0.13876
## Detection Rate                         0.05582                    0.05901
## Detection Prevalence                   0.13078                    0.14195
## Balanced Accuracy                      0.67579                    0.66450
##                      Class: Obesity_Type_I Class: Obesity_Type_II
## Sensitivity                         0.7379                 0.7426
## Specificity                         0.9218                 0.9316
## Pos Pred Value                      0.6496                 0.6757
## Neg Pred Value                      0.9471                 0.9496
## Prevalence                          0.1643                 0.1611
## Detection Rate                      0.1212                 0.1196
## Detection Prevalence                0.1866                 0.1770
## Balanced Accuracy                   0.8298                 0.8371
##                      Class: Obesity_Type_III
## Sensitivity                           0.7400
## Specificity                           0.9772
## Pos Pred Value                        0.8605
## Neg Pred Value                        0.9519
## Prevalence                            0.1595
## Detection Rate                        0.1180
## Detection Prevalence                  0.1372
## Balanced Accuracy                     0.8586

7. Visualisasi Hasil

comparison <- data.frame(
  Actual    = as.character(test$NObeyesdad),
  Predicted = as.character(pred_class)
)

7.1 Visualisasi Perbandingan Aktual & Prediksi

# Definisi urutan kategori
kategori_levels <- c("Insufficient_Weight", "Normal_Weight", 
                     "Overweight_Level_I", "Overweight_Level_II",
                     "Obesity_Type_I", "Obesity_Type_II", "Obesity_Type_III")

# Label singkat biar tidak terlalu panjang di axis
kategori_labels <- c("Insuf.\nWeight", "Normal\nWeight", 
                     "Overweight\nI", "Overweight\nII",
                     "Obesity\nI", "Obesity\nII", "Obesity\nIII")

# Warna per kategori
warna_kategori <- c(
  "Insufficient_Weight" = "#4E79A7",
  "Normal_Weight"       = "#59A14F",
  "Overweight_Level_I"  = "#F28E2B",
  "Overweight_Level_II" = "#E15759",
  "Obesity_Type_I"      = "#B07AA1",
  "Obesity_Type_II"     = "#FF9DA7",
  "Obesity_Type_III"    = "#9C755F"
)

# Konversi ke factor dengan urutan yang benar
comparison$Actual    <- factor(comparison$Actual,    levels = kategori_levels)
comparison$Predicted <- factor(comparison$Predicted, levels = kategori_levels)

ggplot(comparison, aes(x = Actual, fill = Predicted)) +
  geom_bar(position = position_dodge(width = 0.85),
           width = 0.75, color = "white", linewidth = 0.3) +
  scale_fill_manual(values = warna_kategori,
                    labels = kategori_labels) +
  scale_x_discrete(labels = kategori_labels) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +
  theme_minimal(base_size = 12) +
  theme(
    axis.text.x     = element_text(size = 9, color = "gray30"),
    axis.text.y     = element_text(size = 9, color = "gray30"),
    axis.title      = element_text(size = 11, color = "gray20"),
    plot.title      = element_text(face = "bold", size = 14, hjust = 0.5),
    plot.subtitle   = element_text(size = 10, hjust = 0.5, color = "gray50"),
    legend.position = "right",
    legend.title    = element_text(size = 10, face = "bold"),
    legend.text     = element_text(size = 8),
    panel.grid.major.x = element_blank(),
    panel.grid.minor   = element_blank(),
    plot.background    = element_rect(fill = "white", color = NA),
    panel.background   = element_rect(fill = "white", color = NA)
  ) +
  labs(
    title    = "Perbandingan Kategori Aktual vs Prediksi",
    subtitle = "Ordinal Logistic Regression — Data Test",
    x        = "Kategori Aktual",
    y        = "Jumlah Observasi",
    fill     = "Prediksi"
  )

7.2 Visualisasi Odds Ratio

or_df <- data.frame(
  Variable = names(exp(coef(model))),
  OR       = exp(coef(model))
)

or_df$Arah <- ifelse(or_df$OR > 1, "Meningkatkan Risiko", "Menurunkan Risiko")

ggplot(or_df, aes(x = reorder(Variable, OR), y = OR, fill = Arah)) +
  geom_col(width = 0.7) +
  geom_hline(yintercept = 1, linetype = "dashed", 
             color = "gray40", linewidth = 0.8) +
  geom_text(aes(label = round(OR, 3)), 
            hjust = -0.2, size = 3.2) +
  scale_fill_manual(values = c(
    "Meningkatkan Risiko" = "#E15759",
    "Menurunkan Risiko"   = "#4E79A7"
  )) +
  coord_flip() +
  theme_minimal(base_size = 12) +
  theme(
    plot.title      = element_text(face = "bold", size = 14, hjust = 0.5),
    legend.position = "bottom",
    panel.grid.minor = element_blank(),
    panel.grid.major.y = element_blank(),
    plot.background = element_rect(fill = "white", color = NA)
  ) +
  labs(
    title = "Odds Ratio Variabel Prediktor",
    x     = "",
    y     = "Odds Ratio",
    fill  = "Arah Pengaruh"
  )

7.3 Visualisasi Heatmap Confusion Matrix

heatmap_data <- comparison %>%
  count(Actual, Predicted) %>%
  mutate(
    Actual    = factor(Actual,    levels = kategori_levels),
    Predicted = factor(Predicted, levels = kategori_levels)
  )

ggplot(heatmap_data, aes(x = Actual, y = Predicted, fill = n)) +
  geom_tile(color = "white", linewidth = 0.8) +
  geom_text(aes(label = n, color = n > 50), 
            size = 3.5, fontface = "bold") +
  scale_fill_gradient(low = "#EBF5FB", high = "#1A5276") +
  scale_color_manual(values = c("TRUE" = "white", "FALSE" = "gray20"),
                     guide = "none") +
  scale_x_discrete(labels = kategori_labels) +
  scale_y_discrete(labels = kategori_labels) +
  theme_minimal(base_size = 11) +
  theme(
    axis.text.x      = element_text(angle = 45, hjust = 1, size = 9),
    axis.text.y      = element_text(size = 9),
    plot.title       = element_text(face = "bold", size = 14, hjust = 0.5),
    plot.subtitle    = element_text(size = 10, hjust = 0.5, color = "gray50"),
    panel.grid       = element_blank(),
    plot.background  = element_rect(fill = "white", color = NA)
  ) +
  labs(
    title    = "Heatmap Confusion Matrix",
    subtitle = "Diagonal = prediksi benar | Warna lebih gelap = jumlah lebih banyak",
    x        = "Aktual",
    y        = "Prediksi",
    fill     = "Jumlah"
  )