R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

summary(cars)
##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

title: “Analisis Klasifikasi Status Gizi Balita” author: “Nama Mahasiswa” date: “2026-06-03” output: html_document: toc: true toc_float: true number_sections: true pdf_document: toc: true number_sections: true —

Pendahuluan

Laporan ini menyajikan analisis klasifikasi status gizi balita menggunakan metode Random Forest dan K-Nearest Neighbor (KNN) dengan hyperparameter tuning.

1. Install dan Load Package

# ============================================================
# 1. INSTALL DAN LOAD PACKAGE
# ============================================================
library(readr)
library(dplyr)
library(caret)
library(e1071)
library(randomForest)
library(class)
library(rpart)
library(rpart.plot)
library(ggplot2)
library(corrplot)
library(pROC)

2. Input Data

# ============================================================
# 2. INPUT DATA
# ============================================================

data_balita <- read_csv("~/SEMESTER 6/TUGAS UAS/DATMIN/data_balita.csv")
## Rows: 120999 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Jenis Kelamin, Status Gizi
## dbl (2): Umur (bulan), Tinggi Badan (cm)
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(data_balita)
## # A tibble: 6 × 4
##   `Umur (bulan)` `Jenis Kelamin` `Tinggi Badan (cm)` `Status Gizi`   
##            <dbl> <chr>                         <dbl> <chr>           
## 1              0 laki-laki                      44.6 stunted         
## 2              0 laki-laki                      56.7 tinggi          
## 3              0 laki-laki                      46.9 normal          
## 4              0 laki-laki                      47.5 normal          
## 5              0 laki-laki                      42.7 severely stunted
## 6              0 laki-laki                      44.3 stunted
str(data_balita)
## spc_tbl_ [120,999 × 4] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Umur (bulan)     : num [1:120999] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Jenis Kelamin    : chr [1:120999] "laki-laki" "laki-laki" "laki-laki" "laki-laki" ...
##  $ Tinggi Badan (cm): num [1:120999] 44.6 56.7 46.9 47.5 42.7 ...
##  $ Status Gizi      : chr [1:120999] "stunted" "tinggi" "normal" "normal" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   `Umur (bulan)` = col_double(),
##   ..   `Jenis Kelamin` = col_character(),
##   ..   `Tinggi Badan (cm)` = col_double(),
##   ..   `Status Gizi` = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

3. Preprocessing Data

# ============================================================
# 3. PREPROCESSING
# ============================================================

colnames(data_balita) <- c("Umur_Bulan", "Jenis_Kelamin", "Tinggi_Badan", "Status_Gizi")
data_balita <- na.omit(data_balita)
data_balita$Jenis_Kelamin <- as.factor(data_balita$Jenis_Kelamin)
data_balita$Status_Gizi <- as.factor(data_balita$Status_Gizi)

4. Distribusi Status Gizi Balita

# ============================================================
# DISTRIBUSI STATUS GIZI BALITA (Bar Chart)
# ============================================================
# Menghitung frekuensi status gizi
status_counts <- data_balita %>%
  group_by(Status_Gizi) %>%
  summarise(Jumlah = n()) %>%
  mutate(Persentase = Jumlah / sum(Jumlah) * 100)

# Membuat bar chart
p_status <- ggplot(status_counts, aes(x = Status_Gizi, y = Jumlah, fill = Status_Gizi)) +
  geom_bar(stat = "identity", width = 0.7) +
  geom_text(aes(label = Jumlah), vjust = -0.5, size = 4, fontface = "bold") +
  labs(
    title = "Distribusi Status Gizi Balita",
    x = "Status Gizi",
    y = "Frekuensi"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 10),
    plot.title = element_text(size = 14, face = "bold"),
    legend.position = "none"
  ) +
  scale_fill_manual(values = c("normal" = "#F8766D", 
                               "severely stunted" = "#7CAE00", 
                               "stunted" = "#00BFC4", 
                               "tinggi" = "#C77CFF"))

print(p_status)

5. Visualisasi Boxplot Umur per Status Gizi

# ============================================================
# VISUALISASI BOXPLOT UMUR PER STATUS GIZI
# ============================================================
# Boxplot umur berdasarkan status gizi
p1 <- ggplot(data_balita, aes(x = Status_Gizi, y = Umur_Bulan, fill = Status_Gizi)) +
  geom_boxplot(outlier.size = 0.5, outlier.alpha = 0.3) +
  stat_summary(fun = mean, geom = "point", shape = 18, size = 3, color = "black") +
  labs(
    title = "Distribusi Umur per Status Gizi",
    subtitle = "Titik hitam menunjukkan nilai rata-rata",
    x = "Status Gizi",
    y = "Umur (bulan)"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "none"
  ) +
  scale_fill_manual(values = c("#ff9999", "#ffcc99", "#99ff99", "#99ccff"))

print(p1)

6. Hubungan Umur dan Tinggi Badan Balita

# ============================================================
# HUBUNGAN UMUR VS TINGGI BADAN BALITA
# ============================================================
p_hubungan <- ggplot(data_balita,
                     aes(x = Umur_Bulan,
                         y = Tinggi_Badan,
                         color = Status_Gizi)) +
  geom_point(size = 1.2,
             alpha = 0.5) +
  labs(
    title = "Hubungan Umur vs Tinggi Badan Balita",
    x = "Umur (Bulan)",
    y = "Tinggi Badan (cm)",
    color = "Status Gizi"
  ) +
  scale_color_manual(values = c(
    "normal" = "#F8766D",
    "severely stunted" = "#7CAE00",
    "stunted" = "#00BFC4",
    "tinggi" = "#C77CFF"
  )) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0, size = 10),
    legend.position = "right"
  )

print(p_hubungan)

7. Cek Imbalance Data

# ============================================================
# CEK IMBALANCE DATA
# ============================================================

cat("\nDISTRIBUSI DATA :\n")
## 
## DISTRIBUSI DATA :
table(data_balita$Status_Gizi)
## 
##           normal severely stunted          stunted           tinggi 
##            67755            19869            13815            19560
prop.table(table(data_balita$Status_Gizi)) * 100
## 
##           normal severely stunted          stunted           tinggi 
##         55.99633         16.42080         11.41745         16.16542

8. Split Data

# ============================================================
# SPLIT DATA
# ============================================================

set.seed(123)
index <- createDataPartition(data_balita$Status_Gizi, p = 0.8, list = FALSE)
Data_train <- data_balita[index, ]
Data_test <- data_balita[-index, ]

9. Normalisasi Data untuk KNN

# ============================================================
# NORMALISASI DATA UNTUK KNN
# ============================================================

normalisasi <- function(x) {
  (x - min(x)) / (max(x) - min(x))
}

train_num <- Data_train[, c("Umur_Bulan", "Tinggi_Badan")]
test_num <- Data_test[, c("Umur_Bulan", "Tinggi_Badan")]

train_norm <- as.data.frame(lapply(train_num, normalisasi))
test_norm <- as.data.frame(lapply(test_num, normalisasi))

10. Model Random Forest dengan Hyperparameter Tuning

# ============================================================
# MODEL RANDOM FOREST + HYPERPARAMETER TUNING
# ============================================================
cat("\n", rep("=", 70), "\n", sep="")
## 
## ======================================================================
cat("RANDOM FOREST - HYPERPARAMETER TUNING")
## RANDOM FOREST - HYPERPARAMETER TUNING
cat(rep("=", 70), "\n", sep="")
## ======================================================================
set.seed(123)

rf_results2 <- data.frame(
  ntree = integer(),
  mtry = integer(),
  nodesize = integer(),
  Accuracy = numeric()
)

ntree_values <- c(100, 200, 300, 500)
mtry_values <- c(1, 2, 3)
nodesize_values <- c(1, 5, 10)

for (nt in ntree_values) {
  for (mt in mtry_values) {
    for (ns in nodesize_values) {
      set.seed(123)
      rf_model <- randomForest(
        Status_Gizi ~ Umur_Bulan + Tinggi_Badan + Jenis_Kelamin,
        data = Data_train,
        ntree = nt,
        mtry = mt,
        nodesize = ns
      )
      
      pred_temp <- predict(rf_model, Data_test)
      acc_temp <- confusionMatrix(pred_temp, Data_test$Status_Gizi)$overall["Accuracy"]
      
      rf_results2 <- rbind(rf_results2,
                           data.frame(ntree = nt, mtry = mt, 
                                      nodesize = ns, Accuracy = acc_temp))
      
      cat(sprintf("ntree=%d, mtry=%d, nodesize=%d -> Accuracy: %.4f\n", 
                  nt, mt, ns, acc_temp))
    }
  }
}
## ntree=100, mtry=1, nodesize=1 -> Accuracy: 0.6342
## ntree=100, mtry=1, nodesize=5 -> Accuracy: 0.6068
## ntree=100, mtry=1, nodesize=10 -> Accuracy: 0.5961
## ntree=100, mtry=2, nodesize=1 -> Accuracy: 0.9990
## ntree=100, mtry=2, nodesize=5 -> Accuracy: 0.9987
## ntree=100, mtry=2, nodesize=10 -> Accuracy: 0.9983
## ntree=100, mtry=3, nodesize=1 -> Accuracy: 0.9990
## ntree=100, mtry=3, nodesize=5 -> Accuracy: 0.9986
## ntree=100, mtry=3, nodesize=10 -> Accuracy: 0.9971
## ntree=200, mtry=1, nodesize=1 -> Accuracy: 0.6251
## ntree=200, mtry=1, nodesize=5 -> Accuracy: 0.6053
## ntree=200, mtry=1, nodesize=10 -> Accuracy: 0.5918
## ntree=200, mtry=2, nodesize=1 -> Accuracy: 0.9990
## ntree=200, mtry=2, nodesize=5 -> Accuracy: 0.9986
## ntree=200, mtry=2, nodesize=10 -> Accuracy: 0.9982
## ntree=200, mtry=3, nodesize=1 -> Accuracy: 0.9990
## ntree=200, mtry=3, nodesize=5 -> Accuracy: 0.9987
## ntree=200, mtry=3, nodesize=10 -> Accuracy: 0.9974
## ntree=300, mtry=1, nodesize=1 -> Accuracy: 0.6201
## ntree=300, mtry=1, nodesize=5 -> Accuracy: 0.6038
## ntree=300, mtry=1, nodesize=10 -> Accuracy: 0.5948
## ntree=300, mtry=2, nodesize=1 -> Accuracy: 0.9990
## ntree=300, mtry=2, nodesize=5 -> Accuracy: 0.9987
## ntree=300, mtry=2, nodesize=10 -> Accuracy: 0.9985
## ntree=300, mtry=3, nodesize=1 -> Accuracy: 0.9990
## ntree=300, mtry=3, nodesize=5 -> Accuracy: 0.9986
## ntree=300, mtry=3, nodesize=10 -> Accuracy: 0.9973
## ntree=500, mtry=1, nodesize=1 -> Accuracy: 0.6245
## ntree=500, mtry=1, nodesize=5 -> Accuracy: 0.6059
## ntree=500, mtry=1, nodesize=10 -> Accuracy: 0.5994
## ntree=500, mtry=2, nodesize=1 -> Accuracy: 0.9990
## ntree=500, mtry=2, nodesize=5 -> Accuracy: 0.9987
## ntree=500, mtry=2, nodesize=10 -> Accuracy: 0.9984
## ntree=500, mtry=3, nodesize=1 -> Accuracy: 0.9990
## ntree=500, mtry=3, nodesize=5 -> Accuracy: 0.9986
## ntree=500, mtry=3, nodesize=10 -> Accuracy: 0.9972
best_rf2 <- rf_results2[which.max(rf_results2$Accuracy), ]
cat("\nRandom Forest terbaik:\n")
## 
## Random Forest terbaik:
print(best_rf2)
##            ntree mtry nodesize  Accuracy
## Accuracy12   200    2        1 0.9990495
set.seed(123)
model_rf_tuned2 <- randomForest(
  Status_Gizi ~ Umur_Bulan + Tinggi_Badan + Jenis_Kelamin,
  data = Data_train,
  ntree = best_rf2$ntree,
  mtry = best_rf2$mtry,
  nodesize = best_rf2$nodesize,
  importance = TRUE
)

pred_rf2 <- predict(model_rf_tuned2, Data_test)
cm_rf2 <- confusionMatrix(pred_rf2, Data_test$Status_Gizi)
print(cm_rf2)
## Confusion Matrix and Statistics
## 
##                   Reference
## Prediction         normal severely stunted stunted tinggi
##   normal            13543                0       1      6
##   severely stunted      0             3970       4      0
##   stunted               5                3    2758      0
##   tinggi                3                0       0   3906
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9991          
##                  95% CI : (0.9986, 0.9994)
##     No Information Rate : 0.56            
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9985          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: normal Class: severely stunted Class: stunted
## Sensitivity                 0.9994                  0.9992         0.9982
## Specificity                 0.9993                  0.9998         0.9996
## Pos Pred Value              0.9995                  0.9990         0.9971
## Neg Pred Value              0.9992                  0.9999         0.9998
## Prevalence                  0.5600                  0.1642         0.1142
## Detection Rate              0.5597                  0.1641         0.1140
## Detection Prevalence        0.5599                  0.1642         0.1143
## Balanced Accuracy           0.9994                  0.9995         0.9989
##                      Class: tinggi
## Sensitivity                 0.9985
## Specificity                 0.9999
## Pos Pred Value              0.9992
## Neg Pred Value              0.9997
## Prevalence                  0.1617
## Detection Rate              0.1614
## Detection Prevalence        0.1615
## Balanced Accuracy           0.9992

11. Model KNN dengan Hyperparameter Tuning

# ============================================================
# MODEL KNN + HYPERPARAMETER TUNING
# ============================================================
cat("\n", rep("=", 70), "\n", sep="")
## 
## ======================================================================
cat("KNN - HYPERPARAMETER TUNING \n")
## KNN - HYPERPARAMETER TUNING
cat(rep("=", 70), "\n", sep="")
## ======================================================================
k_values <- seq(1, 31, by = 2)
knn_results2 <- data.frame(k = k_values, Accuracy = NA, F1 = NA)

for (i in 1:length(k_values)) {
  set.seed(123)
  pred_knn2 <- knn(
    train = train_norm,
    test = test_norm,
    cl = Data_train$Status_Gizi,
    k = k_values[i]
  )
  
  cm_temp <- confusionMatrix(pred_knn2, Data_test$Status_Gizi)
  knn_results2$Accuracy[i] <- cm_temp$overall["Accuracy"]
  knn_results2$F1[i] <- mean(cm_temp$byClass[, "F1"], na.rm = TRUE)
  
  cat(sprintf("k = %2d -> Accuracy: %.4f, F1: %.4f\n", 
              k_values[i], knn_results2$Accuracy[i], knn_results2$F1[i]))
}
## k =  1 -> Accuracy: 0.9339, F1: 0.9036
## k =  3 -> Accuracy: 0.9352, F1: 0.9053
## k =  5 -> Accuracy: 0.9344, F1: 0.9037
## k =  7 -> Accuracy: 0.9344, F1: 0.9034
## k =  9 -> Accuracy: 0.9345, F1: 0.9037
## k = 11 -> Accuracy: 0.9346, F1: 0.9035
## k = 13 -> Accuracy: 0.9336, F1: 0.9018
## k = 15 -> Accuracy: 0.9338, F1: 0.9020
## k = 17 -> Accuracy: 0.9344, F1: 0.9030
## k = 19 -> Accuracy: 0.9351, F1: 0.9042
## k = 21 -> Accuracy: 0.9349, F1: 0.9037
## k = 23 -> Accuracy: 0.9349, F1: 0.9039
## k = 25 -> Accuracy: 0.9362, F1: 0.9057
## k = 27 -> Accuracy: 0.9365, F1: 0.9065
## k = 29 -> Accuracy: 0.9359, F1: 0.9055
## k = 31 -> Accuracy: 0.9368, F1: 0.9073
best_k2 <- k_values[which.max(knn_results2$Accuracy)]
cat("\nNilai k terbaik:", best_k2, "\n")
## 
## Nilai k terbaik: 31
set.seed(123)
pred_knn2 <- knn(
  train = train_norm,
  test = test_norm,
  cl = Data_train$Status_Gizi,
  k = best_k2
)
cm_knn2 <- confusionMatrix(pred_knn2, Data_test$Status_Gizi)
print(cm_knn2)
## Confusion Matrix and Statistics
## 
##                   Reference
## Prediction         normal severely stunted stunted tinggi
##   normal            13092                0     290    157
##   severely stunted      0             3660     310      0
##   stunted             270              313    2163      0
##   tinggi              189                0       0   3755
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9368          
##                  95% CI : (0.9337, 0.9398)
##     No Information Rate : 0.56            
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.8982          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: normal Class: severely stunted Class: stunted
## Sensitivity                 0.9661                  0.9212        0.78284
## Specificity                 0.9580                  0.9847        0.97280
## Pos Pred Value              0.9670                  0.9219        0.78769
## Neg Pred Value              0.9569                  0.9845        0.97203
## Prevalence                  0.5600                  0.1642        0.11418
## Detection Rate              0.5410                  0.1512        0.08938
## Detection Prevalence        0.5595                  0.1641        0.11348
## Balanced Accuracy           0.9621                  0.9529        0.87782
##                      Class: tinggi
## Sensitivity                 0.9599
## Specificity                 0.9907
## Pos Pred Value              0.9521
## Neg Pred Value              0.9922
## Prevalence                  0.1617
## Detection Rate              0.1552
## Detection Prevalence        0.1630
## Balanced Accuracy           0.9753

12. ROC dan AUC

# ============================================================
# ROC - AUC
# ============================================================
roc_rf2 <- multiclass.roc(Data_test$Status_Gizi, as.numeric(pred_rf2))
## Setting direction: controls < cases
## Setting direction: controls < cases
## Setting direction: controls < cases
## Setting direction: controls < cases
## Setting direction: controls < cases
## Setting direction: controls < cases
roc_knn2 <- multiclass.roc(Data_test$Status_Gizi, as.numeric(pred_knn2))
## Setting direction: controls < cases
## Setting direction: controls < cases
## Setting direction: controls < cases
## Setting direction: controls < cases
## Setting direction: controls < cases
## Setting direction: controls < cases
auc_rf2 <- roc_rf2$auc
auc_knn2 <- roc_knn2$auc

13. Evaluasi Model

# ============================================================
# EVALUASI MODEL
# ============================================================
# RANDOM FOREST
acc_rf2  <- cm_rf2$overall['Accuracy']
prec_rf2 <- mean(cm_rf2$byClass[, "Precision"], na.rm = TRUE)
rec_rf2  <- mean(cm_rf2$byClass[, "Recall"], na.rm = TRUE)
sens_rf2 <- mean(cm_rf2$byClass[, "Sensitivity"], na.rm = TRUE)
f1_rf2   <- mean(cm_rf2$byClass[, "F1"], na.rm = TRUE)

# KNN
acc_knn2  <- cm_knn2$overall['Accuracy']
prec_knn2 <- mean(cm_knn2$byClass[, "Precision"], na.rm = TRUE)
rec_knn2  <- mean(cm_knn2$byClass[, "Recall"], na.rm = TRUE)
sens_knn2 <- mean(cm_knn2$byClass[, "Sensitivity"], na.rm = TRUE)
f1_knn2   <- mean(cm_knn2$byClass[, "F1"], na.rm = TRUE)

14. Perbandingan Model

# ============================================================
# PERBANDINGAN MODEL
# ============================================================

Perbandingan_Model2 <- data.frame(
  Model = c("Random Forest", "KNN"),
  Accuracy = c(acc_rf2, acc_knn2),
  Precision = c(prec_rf2, prec_knn2),
  Recall = c(rec_rf2, rec_knn2),
  Sensitivity = c(sens_rf2, sens_knn2),
  F1_Score = c(f1_rf2, f1_knn2),
  AUC = c(auc_rf2, auc_knn2)
)

Perbandingan_Model2
##           Model  Accuracy Precision    Recall Sensitivity  F1_Score       AUC
## 1 Random Forest 0.9990909 0.9987043 0.9988278   0.9988278 0.9987659 0.9989024
## 2           KNN 0.9368156 0.9071672 0.9075145   0.9075145 0.9073349 0.9313709

Kesimpulan

Model terbaik dipilih berdasarkan nilai Accuracy, Precision, Recall, F1-Score, dan AUC yang diperoleh dari hasil pengujian data.