DT - Proyek ML

Data Analysis

A. Library

library(dplyr)
library(stringr) 
library(caret)
library(rpart)
library(tidyverse)
library(caret)
library(rpart)
library(rpart.plot)
library(lime)
library(iml)
library(ggplot2)
library(fastshap)
library(rattle)

B. Data

2.1. Import Data

obesity <- read.csv("D:/UNY/MySta/SEM 5/ML/Proyek ML/ObesityDataSet_raw_and_data_sinthetic.csv",
                    header = TRUE, sep = ',',
                    stringsAsFactors = FALSE)
head(obesity)
##   Gender Age Height Weight family_history_with_overweight FAVC FCVC NCP
## 1 Female  21   1.62   64.0                            yes   no    2   3
## 2 Female  21   1.52   56.0                            yes   no    3   3
## 3   Male  23   1.80   77.0                            yes   no    2   3
## 4   Male  27   1.80   87.0                             no   no    3   3
## 5   Male  22   1.78   89.8                             no   no    2   1
## 6   Male  29   1.62   53.0                             no  yes    2   3
##        CAEC SMOKE CH2O SCC FAF TUE       CALC                MTRANS
## 1 Sometimes    no    2  no   0   1         no Public_Transportation
## 2 Sometimes   yes    3 yes   3   0  Sometimes Public_Transportation
## 3 Sometimes    no    2  no   2   1 Frequently Public_Transportation
## 4 Sometimes    no    2  no   2   0 Frequently               Walking
## 5 Sometimes    no    2  no   0   0  Sometimes Public_Transportation
## 6 Sometimes    no    2  no   0   0  Sometimes            Automobile
##            NObeyesdad
## 1       Normal_Weight
## 2       Normal_Weight
## 3       Normal_Weight
## 4  Overweight_Level_I
## 5 Overweight_Level_II
## 6       Normal_Weight

2.2. Data Structure, Character Variables, and Size

# Structure
str(obesity)
## 'data.frame':    2111 obs. of  17 variables:
##  $ Gender                        : chr  "Female" "Female" "Male" "Male" ...
##  $ Age                           : num  21 21 23 27 22 29 23 22 24 22 ...
##  $ Height                        : num  1.62 1.52 1.8 1.8 1.78 1.62 1.5 1.64 1.78 1.72 ...
##  $ Weight                        : num  64 56 77 87 89.8 53 55 53 64 68 ...
##  $ family_history_with_overweight: chr  "yes" "yes" "yes" "no" ...
##  $ FAVC                          : chr  "no" "no" "no" "no" ...
##  $ FCVC                          : num  2 3 2 3 2 2 3 2 3 2 ...
##  $ NCP                           : num  3 3 3 3 1 3 3 3 3 3 ...
##  $ CAEC                          : chr  "Sometimes" "Sometimes" "Sometimes" "Sometimes" ...
##  $ SMOKE                         : chr  "no" "yes" "no" "no" ...
##  $ CH2O                          : num  2 3 2 2 2 2 2 2 2 2 ...
##  $ SCC                           : chr  "no" "yes" "no" "no" ...
##  $ FAF                           : num  0 3 2 2 0 0 1 3 1 1 ...
##  $ TUE                           : num  1 0 1 0 0 0 0 0 1 1 ...
##  $ CALC                          : chr  "no" "Sometimes" "Frequently" "Frequently" ...
##  $ MTRANS                        : chr  "Public_Transportation" "Public_Transportation" "Public_Transportation" "Walking" ...
##  $ NObeyesdad                    : chr  "Normal_Weight" "Normal_Weight" "Normal_Weight" "Overweight_Level_I" ...
# Character variable
sapply(obesity, class)
##                         Gender                            Age 
##                    "character"                      "numeric" 
##                         Height                         Weight 
##                      "numeric"                      "numeric" 
## family_history_with_overweight                           FAVC 
##                    "character"                    "character" 
##                           FCVC                            NCP 
##                      "numeric"                      "numeric" 
##                           CAEC                          SMOKE 
##                    "character"                    "character" 
##                           CH2O                            SCC 
##                      "numeric"                    "character" 
##                            FAF                            TUE 
##                      "numeric"                      "numeric" 
##                           CALC                         MTRANS 
##                    "character"                    "character" 
##                     NObeyesdad 
##                    "character"
# Size
cat("Number of rows:", nrow(obesity), "\n")
## Number of rows: 2111
cat("Number of columns:", ncol(obesity), "\n")
## Number of columns: 17

2.3. Data Summary

summary(obesity)
##     Gender               Age            Height          Weight      
##  Length:2111        Min.   :14.00   Min.   :1.450   Min.   : 39.00  
##  Class :character   1st Qu.:19.95   1st Qu.:1.630   1st Qu.: 65.47  
##  Mode  :character   Median :22.78   Median :1.700   Median : 83.00  
##                     Mean   :24.31   Mean   :1.702   Mean   : 86.59  
##                     3rd Qu.:26.00   3rd Qu.:1.768   3rd Qu.:107.43  
##                     Max.   :61.00   Max.   :1.980   Max.   :173.00  
##  family_history_with_overweight     FAVC                FCVC      
##  Length:2111                    Length:2111        Min.   :1.000  
##  Class :character               Class :character   1st Qu.:2.000  
##  Mode  :character               Mode  :character   Median :2.386  
##                                                    Mean   :2.419  
##                                                    3rd Qu.:3.000  
##                                                    Max.   :3.000  
##       NCP            CAEC              SMOKE                CH2O      
##  Min.   :1.000   Length:2111        Length:2111        Min.   :1.000  
##  1st Qu.:2.659   Class :character   Class :character   1st Qu.:1.585  
##  Median :3.000   Mode  :character   Mode  :character   Median :2.000  
##  Mean   :2.686                                         Mean   :2.008  
##  3rd Qu.:3.000                                         3rd Qu.:2.477  
##  Max.   :4.000                                         Max.   :3.000  
##      SCC                 FAF              TUE             CALC          
##  Length:2111        Min.   :0.0000   Min.   :0.0000   Length:2111       
##  Class :character   1st Qu.:0.1245   1st Qu.:0.0000   Class :character  
##  Mode  :character   Median :1.0000   Median :0.6253   Mode  :character  
##                     Mean   :1.0103   Mean   :0.6579                     
##                     3rd Qu.:1.6667   3rd Qu.:1.0000                     
##                     Max.   :3.0000   Max.   :2.0000                     
##     MTRANS           NObeyesdad       
##  Length:2111        Length:2111       
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
## 

C. Data Preprocessing

3.1. Missing and Duplicate Values

# Check missing values
colSums(is.na(obesity))
##                         Gender                            Age 
##                              0                              0 
##                         Height                         Weight 
##                              0                              0 
## family_history_with_overweight                           FAVC 
##                              0                              0 
##                           FCVC                            NCP 
##                              0                              0 
##                           CAEC                          SMOKE 
##                              0                              0 
##                           CH2O                            SCC 
##                              0                              0 
##                            FAF                            TUE 
##                              0                              0 
##                           CALC                         MTRANS 
##                              0                              0 
##                     NObeyesdad 
##                              0
# Check duplicate values
sum(duplicated(obesity))
## [1] 24
# duplicate rows
dups <- obesity[duplicated(obesity) | duplicated(obesity, fromLast=TRUE), ]
dups
##     Gender Age Height Weight family_history_with_overweight FAVC FCVC NCP
## 98  Female  21   1.52     42                             no   no    3   1
## 99  Female  21   1.52     42                             no   no    3   1
## 106 Female  25   1.57     55                             no  yes    2   1
## 107 Female  25   1.57     55                             no  yes    2   1
## 146   Male  21   1.62     70                             no  yes    2   1
## 175   Male  21   1.62     70                             no  yes    2   1
## 180   Male  21   1.62     70                             no  yes    2   1
## 185   Male  21   1.62     70                             no  yes    2   1
## 209 Female  22   1.69     65                            yes  yes    2   3
## 210 Female  22   1.69     65                            yes  yes    2   3
## 283 Female  18   1.62     55                            yes  yes    2   3
## 296 Female  16   1.66     58                             no   no    2   1
## 310 Female  16   1.66     58                             no   no    2   1
## 444   Male  18   1.72     53                            yes  yes    2   3
## 461 Female  18   1.62     55                            yes  yes    2   3
## 467   Male  22   1.74     75                            yes  yes    3   3
## 468   Male  22   1.74     75                            yes  yes    3   3
## 497   Male  18   1.72     53                            yes  yes    2   3
## 524 Female  21   1.52     42                             no  yes    3   1
## 528 Female  21   1.52     42                             no  yes    3   1
## 660 Female  21   1.52     42                             no  yes    3   1
## 664 Female  21   1.52     42                             no  yes    3   1
## 764   Male  21   1.62     70                             no  yes    2   1
## 765   Male  21   1.62     70                             no  yes    2   1
## 825   Male  21   1.62     70                             no  yes    2   1
## 831   Male  21   1.62     70                             no  yes    2   1
## 832   Male  21   1.62     70                             no  yes    2   1
## 833   Male  21   1.62     70                             no  yes    2   1
## 834   Male  21   1.62     70                             no  yes    2   1
## 835   Male  21   1.62     70                             no  yes    2   1
## 922   Male  21   1.62     70                             no  yes    2   1
## 923   Male  21   1.62     70                             no  yes    2   1
## 924   Male  21   1.62     70                             no  yes    2   1
##           CAEC SMOKE CH2O SCC FAF TUE      CALC                MTRANS
## 98  Frequently    no    1  no   0   0 Sometimes Public_Transportation
## 99  Frequently    no    1  no   0   0 Sometimes Public_Transportation
## 106  Sometimes    no    2  no   2   0 Sometimes Public_Transportation
## 107  Sometimes    no    2  no   2   0 Sometimes Public_Transportation
## 146         no    no    3  no   1   0 Sometimes Public_Transportation
## 175         no    no    3  no   1   0 Sometimes Public_Transportation
## 180         no    no    3  no   1   0 Sometimes Public_Transportation
## 185         no    no    3  no   1   0 Sometimes Public_Transportation
## 209  Sometimes    no    2  no   1   1 Sometimes Public_Transportation
## 210  Sometimes    no    2  no   1   1 Sometimes Public_Transportation
## 283 Frequently    no    1  no   1   1        no Public_Transportation
## 296  Sometimes    no    1  no   0   1        no               Walking
## 310  Sometimes    no    1  no   0   1        no               Walking
## 444  Sometimes    no    2  no   0   2 Sometimes Public_Transportation
## 461 Frequently    no    1  no   1   1        no Public_Transportation
## 467 Frequently    no    1  no   1   0        no            Automobile
## 468 Frequently    no    1  no   1   0        no            Automobile
## 497  Sometimes    no    2  no   0   2 Sometimes Public_Transportation
## 524 Frequently    no    1  no   0   0 Sometimes Public_Transportation
## 528 Frequently    no    1  no   0   0 Sometimes Public_Transportation
## 660 Frequently    no    1  no   0   0 Sometimes Public_Transportation
## 664 Frequently    no    1  no   0   0 Sometimes Public_Transportation
## 764         no    no    3  no   1   0 Sometimes Public_Transportation
## 765         no    no    3  no   1   0 Sometimes Public_Transportation
## 825         no    no    3  no   1   0 Sometimes Public_Transportation
## 831         no    no    3  no   1   0 Sometimes Public_Transportation
## 832         no    no    3  no   1   0 Sometimes Public_Transportation
## 833         no    no    3  no   1   0 Sometimes Public_Transportation
## 834         no    no    3  no   1   0 Sometimes Public_Transportation
## 835         no    no    3  no   1   0 Sometimes Public_Transportation
## 922         no    no    3  no   1   0 Sometimes Public_Transportation
## 923         no    no    3  no   1   0 Sometimes Public_Transportation
## 924         no    no    3  no   1   0 Sometimes Public_Transportation
##              NObeyesdad
## 98  Insufficient_Weight
## 99  Insufficient_Weight
## 106       Normal_Weight
## 107       Normal_Weight
## 146  Overweight_Level_I
## 175  Overweight_Level_I
## 180  Overweight_Level_I
## 185  Overweight_Level_I
## 209       Normal_Weight
## 210       Normal_Weight
## 283       Normal_Weight
## 296       Normal_Weight
## 310       Normal_Weight
## 444 Insufficient_Weight
## 461       Normal_Weight
## 467       Normal_Weight
## 468       Normal_Weight
## 497 Insufficient_Weight
## 524 Insufficient_Weight
## 528 Insufficient_Weight
## 660 Insufficient_Weight
## 664 Insufficient_Weight
## 764  Overweight_Level_I
## 765  Overweight_Level_I
## 825  Overweight_Level_I
## 831  Overweight_Level_I
## 832  Overweight_Level_I
## 833  Overweight_Level_I
## 834  Overweight_Level_I
## 835  Overweight_Level_I
## 922  Overweight_Level_I
## 923  Overweight_Level_I
## 924  Overweight_Level_I
# Handling duplicate (drop duplicate, but save unik row)
obesity <- obesity %>% distinct()

# Check duplicate again
sum(duplicated(obesity))
## [1] 0

3.2. Check Outlier

# Variabel numerik
numeric_cols <- c("Age", "Height", "Weight", "FCVC", 
                  "NCP", "CH2O", "FAF", "TUE")

# Fungsi IQR
outlier_report <- lapply(numeric_cols, function(col){
  x <- obesity[[col]]
  
  Q1 <- quantile(x, 0.25)
  Q3 <- quantile(x, 0.75)
  IQR_val <- Q3 - Q1
  
  lower <- Q1 - 1.5 * IQR_val
  upper <- Q3 + 1.5 * IQR_val
  
  outliers <- sum(x < lower | x > upper)
  total <- length(x)
  
  data.frame(
    Variable = col,
    Q1 = round(Q1, 3),
    Q3 = round(Q3, 3),
    IQR = round(IQR_val, 3),
    Lower = round(lower, 3),
    Upper = round(upper, 3),
    OutlierCount = outliers,
    OutlierPercent = round((outliers / total) * 100, 2)
  )
})

outlier_report <- do.call(rbind, outlier_report)
rownames(outlier_report) <- NULL

outlier_report
##   Variable     Q1      Q3    IQR  Lower   Upper OutlierCount OutlierPercent
## 1      Age 19.916  26.000  6.084 10.790  35.126          167           8.00
## 2   Height  1.630   1.769  0.139  1.421   1.978            1           0.05
## 3   Weight 66.000 108.016 42.016  2.976 171.040            1           0.05
## 4     FCVC  2.000   3.000  1.000  0.500   4.500            0           0.00
## 5      NCP  2.697   3.000  0.303  2.244   3.454          577          27.65
## 6     CH2O  1.591   2.466  0.875  0.278   3.779            0           0.00
## 7      FAF  0.125   1.678  1.554 -2.206   4.008            0           0.00
## 8      TUE  0.000   1.000  1.000 -1.500   2.500            0           0.00

Variabel NCP memiliki persentase outlier cukup tinggi (27.65%). Hal ini terjadi karena rentang nilai NCP sangat sempit (1–4) sehingga metode IQR mudah mengidentifikasi nilai tepi sebagai outlier, meskipun secara substantif nilai tersebut masih wajar. Oleh karena itu, outlier pada variabel ini tidak dihapus, karena secara statistik masih mungkin dan tidak memengaruhi performa Decision Tree.

3.3. Encoding Kategorik ke Numerik

# Standardize categorical text
obesity <- obesity %>%
  mutate(across(where(is.character), ~ tolower(str_trim(.))))

# Convert factor to character
obesity <- obesity %>% 
  mutate(across(where(is.factor), as.character))

# Function for encoding
encode <- function(x, mapping) {
  unname(mapping[x])
}

# Apply encoding
obesity <- obesity %>% mutate(
  Gender = encode(Gender, c("female"=0, "male"=1)),
  
  family_history_with_overweight = encode(
    family_history_with_overweight,
    c("no"=0, "yes"=1)
  ),
  
  FAVC = encode(FAVC, c("no"=0, "yes"=1)),
  SMOKE = encode(SMOKE, c("no"=0, "yes"=1)),
  SCC = encode(SCC, c("no"=0, "yes"=1)),
  
  CAEC = encode(CAEC,
                c("no"=0, "sometimes"=1,
                  "frequently"=2, "always"=3)),
  
  CALC = encode(CALC,
                c("no"=0, "sometimes"=1,
                  "frequently"=2, "always"=3)),
  
  MTRANS = encode(MTRANS,
                  c("automobile"=0, "motorbike"=1, "bike"=2,
                    "public_transportation"=3, "walking"=4)),
  
  NObeyesdad = encode(NObeyesdad,
                      c("insufficient_weight"=0,
                        "normal_weight"=1,
                        "overweight_level_i"=2,
                        "overweight_level_ii"=3,
                        "obesity_type_i"=4,
                        "obesity_type_ii"=5,
                        "obesity_type_iii"=6))
)

obesity <- obesity %>%
  mutate(across(everything(), as.numeric))
str(obesity)
## 'data.frame':    2087 obs. of  17 variables:
##  $ Gender                        : num  0 0 1 1 1 1 0 1 1 1 ...
##  $ Age                           : num  21 21 23 27 22 29 23 22 24 22 ...
##  $ Height                        : num  1.62 1.52 1.8 1.8 1.78 1.62 1.5 1.64 1.78 1.72 ...
##  $ Weight                        : num  64 56 77 87 89.8 53 55 53 64 68 ...
##  $ family_history_with_overweight: num  1 1 1 0 0 0 1 0 1 1 ...
##  $ FAVC                          : num  0 0 0 0 0 1 1 0 1 1 ...
##  $ FCVC                          : num  2 3 2 3 2 2 3 2 3 2 ...
##  $ NCP                           : num  3 3 3 3 1 3 3 3 3 3 ...
##  $ CAEC                          : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ SMOKE                         : num  0 1 0 0 0 0 0 0 0 0 ...
##  $ CH2O                          : num  2 3 2 2 2 2 2 2 2 2 ...
##  $ SCC                           : num  0 1 0 0 0 0 0 0 0 0 ...
##  $ FAF                           : num  0 3 2 2 0 0 1 3 1 1 ...
##  $ TUE                           : num  1 0 1 0 0 0 0 0 1 1 ...
##  $ CALC                          : num  0 1 2 2 1 1 1 1 2 0 ...
##  $ MTRANS                        : num  3 3 3 4 3 0 1 3 3 3 ...
##  $ NObeyesdad                    : num  1 1 1 2 3 1 1 1 1 1 ...

3.4. Check Class Target

class_dist <- data.frame(
  Class = names(table(obesity$NObeyesdad)),
  Count = as.numeric(table(obesity$NObeyesdad))
) %>%
  mutate(Percentage = round((Count / sum(Count)) * 100, 2))
class_dist
##   Class Count Percentage
## 1     0   267      12.79
## 2     1   282      13.51
## 3     2   276      13.22
## 4     3   290      13.90
## 5     4   351      16.82
## 6     5   297      14.23
## 7     6   324      15.52
ggplot(class_dist, aes(x = Class, y = Count, fill = Class)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = Count), vjust = -0.5) +
  labs(title = "Distribusi Kelas Target",
       x = "Kelas Obesitas", y = "Jumlah Data") +
  theme_minimal() +
  theme(legend.position = "none")

3.5. Train-Test Split

set.seed(2025)
idx <- createDataPartition(obesity$NObeyesdad, p = 0.8, list = FALSE)
train <- obesity[idx, ]
test  <- obesity[-idx, ]

dim(train); dim(test)
## [1] 1672   17
## [1] 415  17

D. Modelling - Decision Tree

4.2. Train Final Model

final_dt <- train_single_dt(train, best_params)
fancyRpartPlot(
  final_dt,
  caption = paste("Optimal Tree (maxdepth =", best_params$maxdepth, ")")
)

4.3. Evaluation

a. Confusion Matrix

pred_final <- predict(final_dt, newdata = test, type = "class")

cm <- confusionMatrix(
  factor(pred_final, levels = sort(unique(train$NObeyesdad))),
  factor(test$NObeyesdad,  levels = sort(unique(train$NObeyesdad)))
)
cm
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1  2  3  4  5  6
##          0 48  1  0  0  0  0  0
##          1  3 50  3  0  0  0  0
##          2  0  7 49  5  0  0  0
##          3  0  0  2 51  0  0  0
##          4  0  0  1  2 63  6  0
##          5  0  0  0  0  3 56  0
##          6  0  0  0  0  0  1 64
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9181          
##                  95% CI : (0.8874, 0.9426)
##     No Information Rate : 0.159           
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9043          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 0 Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity            0.9412   0.8621   0.8909   0.8793   0.9545   0.8889
## Specificity            0.9973   0.9832   0.9667   0.9944   0.9742   0.9915
## Pos Pred Value         0.9796   0.8929   0.8033   0.9623   0.8750   0.9492
## Neg Pred Value         0.9918   0.9777   0.9831   0.9807   0.9913   0.9803
## Prevalence             0.1229   0.1398   0.1325   0.1398   0.1590   0.1518
## Detection Rate         0.1157   0.1205   0.1181   0.1229   0.1518   0.1349
## Detection Prevalence   0.1181   0.1349   0.1470   0.1277   0.1735   0.1422
## Balanced Accuracy      0.9692   0.9226   0.9288   0.9369   0.9644   0.9402
##                      Class: 6
## Sensitivity            1.0000
## Specificity            0.9972
## Pos Pred Value         0.9846
## Neg Pred Value         1.0000
## Prevalence             0.1542
## Detection Rate         0.1542
## Detection Prevalence   0.1566
## Balanced Accuracy      0.9986

b. Classification Report

byclass <- cm$byClass

classification_report <- data.frame(
  Class = rownames(byclass),
  Precision = byclass[, "Pos Pred Value"],
  Recall    = byclass[, "Sensitivity"],
  F1_Score  = 2 * byclass[, "Pos Pred Value"] * byclass[, "Sensitivity"] /
                   (byclass[, "Pos Pred Value"] + byclass[, "Sensitivity"])
)
classification_report
##             Class Precision    Recall  F1_Score
## Class: 0 Class: 0 0.9795918 0.9411765 0.9600000
## Class: 1 Class: 1 0.8928571 0.8620690 0.8771930
## Class: 2 Class: 2 0.8032787 0.8909091 0.8448276
## Class: 3 Class: 3 0.9622642 0.8793103 0.9189189
## Class: 4 Class: 4 0.8750000 0.9545455 0.9130435
## Class: 5 Class: 5 0.9491525 0.8888889 0.9180328
## Class: 6 Class: 6 0.9846154 1.0000000 0.9922481

c. Learning Curve

depths <- 1:12
train_acc <- numeric(length(depths))
val_acc   <- numeric(length(depths))

for (i in seq_along(depths)) {
  m <- rpart(
    NObeyesdad ~ .,
    data = train,
    method = "class",
    control = list(maxdepth = depths[i])
  )

  train_acc[i] <- mean(predict(m, newdata = train, type = "class") == train$NObeyesdad)
  val_acc[i]   <- mean(predict(m, newdata = test,  type = "class") == test$NObeyesdad)
}

lc_df <- data.frame(Depth = depths, Train = train_acc, Validation = val_acc)

ggplot(lc_df, aes(x = Depth)) +
  geom_line(aes(y = Train, color = "Train")) +
  geom_point(aes(y = Train, color = "Train")) +
  geom_line(aes(y = Validation, color = "Validation")) +
  geom_point(aes(y = Validation, color = "Validation")) +
  labs(title = "Learning Curve", y = "Accuracy") +
  scale_color_manual(values = c("Train"="red","Validation"="blue")) +
  theme_minimal()

4.4. Feature Importance (SHAP)