DT - Proyek ML
Data Analysis
A. Library
B. Data
2.1. Import Data
obesity <- read.csv("D:/UNY/MySta/SEM 5/ML/Proyek ML/ObesityDataSet_raw_and_data_sinthetic.csv",
header = TRUE, sep = ',',
stringsAsFactors = FALSE)
head(obesity)## Gender Age Height Weight family_history_with_overweight FAVC FCVC NCP
## 1 Female 21 1.62 64.0 yes no 2 3
## 2 Female 21 1.52 56.0 yes no 3 3
## 3 Male 23 1.80 77.0 yes no 2 3
## 4 Male 27 1.80 87.0 no no 3 3
## 5 Male 22 1.78 89.8 no no 2 1
## 6 Male 29 1.62 53.0 no yes 2 3
## CAEC SMOKE CH2O SCC FAF TUE CALC MTRANS
## 1 Sometimes no 2 no 0 1 no Public_Transportation
## 2 Sometimes yes 3 yes 3 0 Sometimes Public_Transportation
## 3 Sometimes no 2 no 2 1 Frequently Public_Transportation
## 4 Sometimes no 2 no 2 0 Frequently Walking
## 5 Sometimes no 2 no 0 0 Sometimes Public_Transportation
## 6 Sometimes no 2 no 0 0 Sometimes Automobile
## NObeyesdad
## 1 Normal_Weight
## 2 Normal_Weight
## 3 Normal_Weight
## 4 Overweight_Level_I
## 5 Overweight_Level_II
## 6 Normal_Weight
2.2. Data Structure, Character Variables, and Size
## 'data.frame': 2111 obs. of 17 variables:
## $ Gender : chr "Female" "Female" "Male" "Male" ...
## $ Age : num 21 21 23 27 22 29 23 22 24 22 ...
## $ Height : num 1.62 1.52 1.8 1.8 1.78 1.62 1.5 1.64 1.78 1.72 ...
## $ Weight : num 64 56 77 87 89.8 53 55 53 64 68 ...
## $ family_history_with_overweight: chr "yes" "yes" "yes" "no" ...
## $ FAVC : chr "no" "no" "no" "no" ...
## $ FCVC : num 2 3 2 3 2 2 3 2 3 2 ...
## $ NCP : num 3 3 3 3 1 3 3 3 3 3 ...
## $ CAEC : chr "Sometimes" "Sometimes" "Sometimes" "Sometimes" ...
## $ SMOKE : chr "no" "yes" "no" "no" ...
## $ CH2O : num 2 3 2 2 2 2 2 2 2 2 ...
## $ SCC : chr "no" "yes" "no" "no" ...
## $ FAF : num 0 3 2 2 0 0 1 3 1 1 ...
## $ TUE : num 1 0 1 0 0 0 0 0 1 1 ...
## $ CALC : chr "no" "Sometimes" "Frequently" "Frequently" ...
## $ MTRANS : chr "Public_Transportation" "Public_Transportation" "Public_Transportation" "Walking" ...
## $ NObeyesdad : chr "Normal_Weight" "Normal_Weight" "Normal_Weight" "Overweight_Level_I" ...
## Gender Age
## "character" "numeric"
## Height Weight
## "numeric" "numeric"
## family_history_with_overweight FAVC
## "character" "character"
## FCVC NCP
## "numeric" "numeric"
## CAEC SMOKE
## "character" "character"
## CH2O SCC
## "numeric" "character"
## FAF TUE
## "numeric" "numeric"
## CALC MTRANS
## "character" "character"
## NObeyesdad
## "character"
## Number of rows: 2111
## Number of columns: 17
2.3. Data Summary
## Gender Age Height Weight
## Length:2111 Min. :14.00 Min. :1.450 Min. : 39.00
## Class :character 1st Qu.:19.95 1st Qu.:1.630 1st Qu.: 65.47
## Mode :character Median :22.78 Median :1.700 Median : 83.00
## Mean :24.31 Mean :1.702 Mean : 86.59
## 3rd Qu.:26.00 3rd Qu.:1.768 3rd Qu.:107.43
## Max. :61.00 Max. :1.980 Max. :173.00
## family_history_with_overweight FAVC FCVC
## Length:2111 Length:2111 Min. :1.000
## Class :character Class :character 1st Qu.:2.000
## Mode :character Mode :character Median :2.386
## Mean :2.419
## 3rd Qu.:3.000
## Max. :3.000
## NCP CAEC SMOKE CH2O
## Min. :1.000 Length:2111 Length:2111 Min. :1.000
## 1st Qu.:2.659 Class :character Class :character 1st Qu.:1.585
## Median :3.000 Mode :character Mode :character Median :2.000
## Mean :2.686 Mean :2.008
## 3rd Qu.:3.000 3rd Qu.:2.477
## Max. :4.000 Max. :3.000
## SCC FAF TUE CALC
## Length:2111 Min. :0.0000 Min. :0.0000 Length:2111
## Class :character 1st Qu.:0.1245 1st Qu.:0.0000 Class :character
## Mode :character Median :1.0000 Median :0.6253 Mode :character
## Mean :1.0103 Mean :0.6579
## 3rd Qu.:1.6667 3rd Qu.:1.0000
## Max. :3.0000 Max. :2.0000
## MTRANS NObeyesdad
## Length:2111 Length:2111
## Class :character Class :character
## Mode :character Mode :character
##
##
##
C. Data Preprocessing
3.1. Missing and Duplicate Values
## Gender Age
## 0 0
## Height Weight
## 0 0
## family_history_with_overweight FAVC
## 0 0
## FCVC NCP
## 0 0
## CAEC SMOKE
## 0 0
## CH2O SCC
## 0 0
## FAF TUE
## 0 0
## CALC MTRANS
## 0 0
## NObeyesdad
## 0
## [1] 24
## Gender Age Height Weight family_history_with_overweight FAVC FCVC NCP
## 98 Female 21 1.52 42 no no 3 1
## 99 Female 21 1.52 42 no no 3 1
## 106 Female 25 1.57 55 no yes 2 1
## 107 Female 25 1.57 55 no yes 2 1
## 146 Male 21 1.62 70 no yes 2 1
## 175 Male 21 1.62 70 no yes 2 1
## 180 Male 21 1.62 70 no yes 2 1
## 185 Male 21 1.62 70 no yes 2 1
## 209 Female 22 1.69 65 yes yes 2 3
## 210 Female 22 1.69 65 yes yes 2 3
## 283 Female 18 1.62 55 yes yes 2 3
## 296 Female 16 1.66 58 no no 2 1
## 310 Female 16 1.66 58 no no 2 1
## 444 Male 18 1.72 53 yes yes 2 3
## 461 Female 18 1.62 55 yes yes 2 3
## 467 Male 22 1.74 75 yes yes 3 3
## 468 Male 22 1.74 75 yes yes 3 3
## 497 Male 18 1.72 53 yes yes 2 3
## 524 Female 21 1.52 42 no yes 3 1
## 528 Female 21 1.52 42 no yes 3 1
## 660 Female 21 1.52 42 no yes 3 1
## 664 Female 21 1.52 42 no yes 3 1
## 764 Male 21 1.62 70 no yes 2 1
## 765 Male 21 1.62 70 no yes 2 1
## 825 Male 21 1.62 70 no yes 2 1
## 831 Male 21 1.62 70 no yes 2 1
## 832 Male 21 1.62 70 no yes 2 1
## 833 Male 21 1.62 70 no yes 2 1
## 834 Male 21 1.62 70 no yes 2 1
## 835 Male 21 1.62 70 no yes 2 1
## 922 Male 21 1.62 70 no yes 2 1
## 923 Male 21 1.62 70 no yes 2 1
## 924 Male 21 1.62 70 no yes 2 1
## CAEC SMOKE CH2O SCC FAF TUE CALC MTRANS
## 98 Frequently no 1 no 0 0 Sometimes Public_Transportation
## 99 Frequently no 1 no 0 0 Sometimes Public_Transportation
## 106 Sometimes no 2 no 2 0 Sometimes Public_Transportation
## 107 Sometimes no 2 no 2 0 Sometimes Public_Transportation
## 146 no no 3 no 1 0 Sometimes Public_Transportation
## 175 no no 3 no 1 0 Sometimes Public_Transportation
## 180 no no 3 no 1 0 Sometimes Public_Transportation
## 185 no no 3 no 1 0 Sometimes Public_Transportation
## 209 Sometimes no 2 no 1 1 Sometimes Public_Transportation
## 210 Sometimes no 2 no 1 1 Sometimes Public_Transportation
## 283 Frequently no 1 no 1 1 no Public_Transportation
## 296 Sometimes no 1 no 0 1 no Walking
## 310 Sometimes no 1 no 0 1 no Walking
## 444 Sometimes no 2 no 0 2 Sometimes Public_Transportation
## 461 Frequently no 1 no 1 1 no Public_Transportation
## 467 Frequently no 1 no 1 0 no Automobile
## 468 Frequently no 1 no 1 0 no Automobile
## 497 Sometimes no 2 no 0 2 Sometimes Public_Transportation
## 524 Frequently no 1 no 0 0 Sometimes Public_Transportation
## 528 Frequently no 1 no 0 0 Sometimes Public_Transportation
## 660 Frequently no 1 no 0 0 Sometimes Public_Transportation
## 664 Frequently no 1 no 0 0 Sometimes Public_Transportation
## 764 no no 3 no 1 0 Sometimes Public_Transportation
## 765 no no 3 no 1 0 Sometimes Public_Transportation
## 825 no no 3 no 1 0 Sometimes Public_Transportation
## 831 no no 3 no 1 0 Sometimes Public_Transportation
## 832 no no 3 no 1 0 Sometimes Public_Transportation
## 833 no no 3 no 1 0 Sometimes Public_Transportation
## 834 no no 3 no 1 0 Sometimes Public_Transportation
## 835 no no 3 no 1 0 Sometimes Public_Transportation
## 922 no no 3 no 1 0 Sometimes Public_Transportation
## 923 no no 3 no 1 0 Sometimes Public_Transportation
## 924 no no 3 no 1 0 Sometimes Public_Transportation
## NObeyesdad
## 98 Insufficient_Weight
## 99 Insufficient_Weight
## 106 Normal_Weight
## 107 Normal_Weight
## 146 Overweight_Level_I
## 175 Overweight_Level_I
## 180 Overweight_Level_I
## 185 Overweight_Level_I
## 209 Normal_Weight
## 210 Normal_Weight
## 283 Normal_Weight
## 296 Normal_Weight
## 310 Normal_Weight
## 444 Insufficient_Weight
## 461 Normal_Weight
## 467 Normal_Weight
## 468 Normal_Weight
## 497 Insufficient_Weight
## 524 Insufficient_Weight
## 528 Insufficient_Weight
## 660 Insufficient_Weight
## 664 Insufficient_Weight
## 764 Overweight_Level_I
## 765 Overweight_Level_I
## 825 Overweight_Level_I
## 831 Overweight_Level_I
## 832 Overweight_Level_I
## 833 Overweight_Level_I
## 834 Overweight_Level_I
## 835 Overweight_Level_I
## 922 Overweight_Level_I
## 923 Overweight_Level_I
## 924 Overweight_Level_I
# Handling duplicate (drop duplicate, but save unik row)
obesity <- obesity %>% distinct()
# Check duplicate again
sum(duplicated(obesity))## [1] 0
3.2. Check Outlier
# Variabel numerik
numeric_cols <- c("Age", "Height", "Weight", "FCVC",
"NCP", "CH2O", "FAF", "TUE")
# Fungsi IQR
outlier_report <- lapply(numeric_cols, function(col){
x <- obesity[[col]]
Q1 <- quantile(x, 0.25)
Q3 <- quantile(x, 0.75)
IQR_val <- Q3 - Q1
lower <- Q1 - 1.5 * IQR_val
upper <- Q3 + 1.5 * IQR_val
outliers <- sum(x < lower | x > upper)
total <- length(x)
data.frame(
Variable = col,
Q1 = round(Q1, 3),
Q3 = round(Q3, 3),
IQR = round(IQR_val, 3),
Lower = round(lower, 3),
Upper = round(upper, 3),
OutlierCount = outliers,
OutlierPercent = round((outliers / total) * 100, 2)
)
})
outlier_report <- do.call(rbind, outlier_report)
rownames(outlier_report) <- NULL
outlier_report## Variable Q1 Q3 IQR Lower Upper OutlierCount OutlierPercent
## 1 Age 19.916 26.000 6.084 10.790 35.126 167 8.00
## 2 Height 1.630 1.769 0.139 1.421 1.978 1 0.05
## 3 Weight 66.000 108.016 42.016 2.976 171.040 1 0.05
## 4 FCVC 2.000 3.000 1.000 0.500 4.500 0 0.00
## 5 NCP 2.697 3.000 0.303 2.244 3.454 577 27.65
## 6 CH2O 1.591 2.466 0.875 0.278 3.779 0 0.00
## 7 FAF 0.125 1.678 1.554 -2.206 4.008 0 0.00
## 8 TUE 0.000 1.000 1.000 -1.500 2.500 0 0.00
Variabel NCP memiliki persentase outlier cukup tinggi (27.65%). Hal ini terjadi karena rentang nilai NCP sangat sempit (1–4) sehingga metode IQR mudah mengidentifikasi nilai tepi sebagai outlier, meskipun secara substantif nilai tersebut masih wajar. Oleh karena itu, outlier pada variabel ini tidak dihapus, karena secara statistik masih mungkin dan tidak memengaruhi performa Decision Tree.
3.3. Encoding Kategorik ke Numerik
# Standardize categorical text
obesity <- obesity %>%
mutate(across(where(is.character), ~ tolower(str_trim(.))))
# Convert factor to character
obesity <- obesity %>%
mutate(across(where(is.factor), as.character))
# Function for encoding
encode <- function(x, mapping) {
unname(mapping[x])
}
# Apply encoding
obesity <- obesity %>% mutate(
Gender = encode(Gender, c("female"=0, "male"=1)),
family_history_with_overweight = encode(
family_history_with_overweight,
c("no"=0, "yes"=1)
),
FAVC = encode(FAVC, c("no"=0, "yes"=1)),
SMOKE = encode(SMOKE, c("no"=0, "yes"=1)),
SCC = encode(SCC, c("no"=0, "yes"=1)),
CAEC = encode(CAEC,
c("no"=0, "sometimes"=1,
"frequently"=2, "always"=3)),
CALC = encode(CALC,
c("no"=0, "sometimes"=1,
"frequently"=2, "always"=3)),
MTRANS = encode(MTRANS,
c("automobile"=0, "motorbike"=1, "bike"=2,
"public_transportation"=3, "walking"=4)),
NObeyesdad = encode(NObeyesdad,
c("insufficient_weight"=0,
"normal_weight"=1,
"overweight_level_i"=2,
"overweight_level_ii"=3,
"obesity_type_i"=4,
"obesity_type_ii"=5,
"obesity_type_iii"=6))
)
obesity <- obesity %>%
mutate(across(everything(), as.numeric))
str(obesity)## 'data.frame': 2087 obs. of 17 variables:
## $ Gender : num 0 0 1 1 1 1 0 1 1 1 ...
## $ Age : num 21 21 23 27 22 29 23 22 24 22 ...
## $ Height : num 1.62 1.52 1.8 1.8 1.78 1.62 1.5 1.64 1.78 1.72 ...
## $ Weight : num 64 56 77 87 89.8 53 55 53 64 68 ...
## $ family_history_with_overweight: num 1 1 1 0 0 0 1 0 1 1 ...
## $ FAVC : num 0 0 0 0 0 1 1 0 1 1 ...
## $ FCVC : num 2 3 2 3 2 2 3 2 3 2 ...
## $ NCP : num 3 3 3 3 1 3 3 3 3 3 ...
## $ CAEC : num 1 1 1 1 1 1 1 1 1 1 ...
## $ SMOKE : num 0 1 0 0 0 0 0 0 0 0 ...
## $ CH2O : num 2 3 2 2 2 2 2 2 2 2 ...
## $ SCC : num 0 1 0 0 0 0 0 0 0 0 ...
## $ FAF : num 0 3 2 2 0 0 1 3 1 1 ...
## $ TUE : num 1 0 1 0 0 0 0 0 1 1 ...
## $ CALC : num 0 1 2 2 1 1 1 1 2 0 ...
## $ MTRANS : num 3 3 3 4 3 0 1 3 3 3 ...
## $ NObeyesdad : num 1 1 1 2 3 1 1 1 1 1 ...
3.4. Check Class Target
class_dist <- data.frame(
Class = names(table(obesity$NObeyesdad)),
Count = as.numeric(table(obesity$NObeyesdad))
) %>%
mutate(Percentage = round((Count / sum(Count)) * 100, 2))
class_dist## Class Count Percentage
## 1 0 267 12.79
## 2 1 282 13.51
## 3 2 276 13.22
## 4 3 290 13.90
## 5 4 351 16.82
## 6 5 297 14.23
## 7 6 324 15.52
ggplot(class_dist, aes(x = Class, y = Count, fill = Class)) +
geom_bar(stat = "identity") +
geom_text(aes(label = Count), vjust = -0.5) +
labs(title = "Distribusi Kelas Target",
x = "Kelas Obesitas", y = "Jumlah Data") +
theme_minimal() +
theme(legend.position = "none")D. Modelling - Decision Tree
4.1. Hyperparameter Tuning (Random Search)
set.seed(2025)
criterion_list <- c("gini", "information", "logloss_sim")
splitter_list <- c("best", "random")
maxdepth_list <- 1:10
search_grid <- data.frame(
criterion = sample(criterion_list, 30, replace = TRUE),
splitter = sample(splitter_list, 30, replace = TRUE),
maxdepth = sample(maxdepth_list, 30, replace = TRUE)
)
# Mapping Functions
criterion_map <- function(c) {
if (c == "gini") return("gini")
if (c == "information") return("information")
if (c == "logloss_sim") return("information")
}
cp_map <- function(c) {
if (c == "logloss_sim") return(0.0001)
return(0.01)
}
splitter_map <- function(s) {
if (s == "best") return(0)
if (s == "random") return(2)
}
# Training function
train_single_dt <- function(train_data, params) {
rpart(
NObeyesdad ~ .,
data = train_data,
method = "class",
parms = list(split = criterion_map(params$criterion)),
control = list(
maxdepth = params$maxdepth,
cp = cp_map(params$criterion),
usesurrogate = splitter_map(params$splitter)
)
)
}
results <- data.frame()
for (i in 1:nrow(search_grid)) {
params <- search_grid[i, ]
model <- train_single_dt(train, params)
pred <- predict(model, newdata = test, type = "class")
acc <- mean(pred == test$NObeyesdad)
results <- rbind(
results,
data.frame(
criterion = params$criterion,
splitter = params$splitter,
maxdepth = params$maxdepth,
accuracy = acc
)
)
}
results <- results[order(-results$accuracy), ]
best_params <- results[1, ]
best_params## criterion splitter maxdepth accuracy
## 4 logloss_sim random 8 0.9180723
4.2. Train Final Model
4.3. Evaluation
a. Confusion Matrix
pred_final <- predict(final_dt, newdata = test, type = "class")
cm <- confusionMatrix(
factor(pred_final, levels = sort(unique(train$NObeyesdad))),
factor(test$NObeyesdad, levels = sort(unique(train$NObeyesdad)))
)
cm## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1 2 3 4 5 6
## 0 48 1 0 0 0 0 0
## 1 3 50 3 0 0 0 0
## 2 0 7 49 5 0 0 0
## 3 0 0 2 51 0 0 0
## 4 0 0 1 2 63 6 0
## 5 0 0 0 0 3 56 0
## 6 0 0 0 0 0 1 64
##
## Overall Statistics
##
## Accuracy : 0.9181
## 95% CI : (0.8874, 0.9426)
## No Information Rate : 0.159
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9043
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 0 Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity 0.9412 0.8621 0.8909 0.8793 0.9545 0.8889
## Specificity 0.9973 0.9832 0.9667 0.9944 0.9742 0.9915
## Pos Pred Value 0.9796 0.8929 0.8033 0.9623 0.8750 0.9492
## Neg Pred Value 0.9918 0.9777 0.9831 0.9807 0.9913 0.9803
## Prevalence 0.1229 0.1398 0.1325 0.1398 0.1590 0.1518
## Detection Rate 0.1157 0.1205 0.1181 0.1229 0.1518 0.1349
## Detection Prevalence 0.1181 0.1349 0.1470 0.1277 0.1735 0.1422
## Balanced Accuracy 0.9692 0.9226 0.9288 0.9369 0.9644 0.9402
## Class: 6
## Sensitivity 1.0000
## Specificity 0.9972
## Pos Pred Value 0.9846
## Neg Pred Value 1.0000
## Prevalence 0.1542
## Detection Rate 0.1542
## Detection Prevalence 0.1566
## Balanced Accuracy 0.9986
b. Classification Report
byclass <- cm$byClass
classification_report <- data.frame(
Class = rownames(byclass),
Precision = byclass[, "Pos Pred Value"],
Recall = byclass[, "Sensitivity"],
F1_Score = 2 * byclass[, "Pos Pred Value"] * byclass[, "Sensitivity"] /
(byclass[, "Pos Pred Value"] + byclass[, "Sensitivity"])
)
classification_report## Class Precision Recall F1_Score
## Class: 0 Class: 0 0.9795918 0.9411765 0.9600000
## Class: 1 Class: 1 0.8928571 0.8620690 0.8771930
## Class: 2 Class: 2 0.8032787 0.8909091 0.8448276
## Class: 3 Class: 3 0.9622642 0.8793103 0.9189189
## Class: 4 Class: 4 0.8750000 0.9545455 0.9130435
## Class: 5 Class: 5 0.9491525 0.8888889 0.9180328
## Class: 6 Class: 6 0.9846154 1.0000000 0.9922481
c. Learning Curve
depths <- 1:12
train_acc <- numeric(length(depths))
val_acc <- numeric(length(depths))
for (i in seq_along(depths)) {
m <- rpart(
NObeyesdad ~ .,
data = train,
method = "class",
control = list(maxdepth = depths[i])
)
train_acc[i] <- mean(predict(m, newdata = train, type = "class") == train$NObeyesdad)
val_acc[i] <- mean(predict(m, newdata = test, type = "class") == test$NObeyesdad)
}
lc_df <- data.frame(Depth = depths, Train = train_acc, Validation = val_acc)
ggplot(lc_df, aes(x = Depth)) +
geom_line(aes(y = Train, color = "Train")) +
geom_point(aes(y = Train, color = "Train")) +
geom_line(aes(y = Validation, color = "Validation")) +
geom_point(aes(y = Validation, color = "Validation")) +
labs(title = "Learning Curve", y = "Accuracy") +
scale_color_manual(values = c("Train"="red","Validation"="blue")) +
theme_minimal()