This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
summary(cars)
## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00
You can also embed plots, for example:
Note that the `echo = FALSE` parameter was added to the code chunk to prevent printing of the R code that generated the plot.

---
title: "Analisis Klasifikasi Status Gizi Balita"
author: "Nama Mahasiswa"
date: "2026-06-03"
output:
  html_document:
    toc: true
    toc_float: true
    number_sections: true
  pdf_document:
    toc: true
    number_sections: true
---
Laporan ini menyajikan analisis klasifikasi status gizi balita menggunakan metode Random Forest dan K-Nearest Neighbor (KNN) dengan hyperparameter tuning.
# ============================================================
# 1. INSTALL DAN LOAD PACKAGE
# ============================================================
library(readr)
library(dplyr)
library(caret)
library(e1071)
library(randomForest)
library(class)
library(rpart)
library(rpart.plot)
library(ggplot2)
library(corrplot)
library(pROC)
# ============================================================
# 2. INPUT DATA
# ============================================================
# Read the toddler dataset from CSV into a tibble.
# NOTE(review): the path points into the author's home directory, so the
# report will only knit on that machine -- consider a project-relative path.
data_balita <- read_csv("~/SEMESTER 6/TUGAS UAS/DATMIN/data_balita.csv")
## Rows: 120999 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Jenis Kelamin, Status Gizi
## dbl (2): Umur (bulan), Tinggi Badan (cm)
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(data_balita)
## # A tibble: 6 × 4
## `Umur (bulan)` `Jenis Kelamin` `Tinggi Badan (cm)` `Status Gizi`
## <dbl> <chr> <dbl> <chr>
## 1 0 laki-laki 44.6 stunted
## 2 0 laki-laki 56.7 tinggi
## 3 0 laki-laki 46.9 normal
## 4 0 laki-laki 47.5 normal
## 5 0 laki-laki 42.7 severely stunted
## 6 0 laki-laki 44.3 stunted
str(data_balita)
## spc_tbl_ [120,999 × 4] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Umur (bulan) : num [1:120999] 0 0 0 0 0 0 0 0 0 0 ...
## $ Jenis Kelamin : chr [1:120999] "laki-laki" "laki-laki" "laki-laki" "laki-laki" ...
## $ Tinggi Badan (cm): num [1:120999] 44.6 56.7 46.9 47.5 42.7 ...
## $ Status Gizi : chr [1:120999] "stunted" "tinggi" "normal" "normal" ...
## - attr(*, "spec")=
## .. cols(
## .. `Umur (bulan)` = col_double(),
## .. `Jenis Kelamin` = col_character(),
## .. `Tinggi Badan (cm)` = col_double(),
## .. `Status Gizi` = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
# ============================================================
# 3. PREPROCESSING
# ============================================================
# Give the four columns syntactic names (assigned by position), drop rows with
# missing values, then encode the two categorical columns as factors.
names(data_balita) <- c("Umur_Bulan", "Jenis_Kelamin", "Tinggi_Badan", "Status_Gizi")
data_balita <- na.omit(data_balita)
data_balita[c("Jenis_Kelamin", "Status_Gizi")] <-
  lapply(data_balita[c("Jenis_Kelamin", "Status_Gizi")], as.factor)
# ============================================================
# DISTRIBUSI STATUS GIZI BALITA (Bar Chart)
# ============================================================
# Tally the rows in each nutritional-status class and the share of the total
# that each class represents.
status_counts <- data_balita %>%
  count(Status_Gizi, name = "Jumlah") %>%
  mutate(Persentase = Jumlah / sum(Jumlah) * 100)

# Bar chart of the class frequencies, with the count printed above each bar.
p_status <- ggplot(
  status_counts,
  aes(x = Status_Gizi, y = Jumlah, fill = Status_Gizi)
) +
  geom_col(width = 0.7) +
  geom_text(aes(label = Jumlah), vjust = -0.5, size = 4, fontface = "bold") +
  scale_fill_manual(
    values = c(
      "normal" = "#F8766D",
      "severely stunted" = "#7CAE00",
      "stunted" = "#00BFC4",
      "tinggi" = "#C77CFF"
    )
  ) +
  labs(
    title = "Distribusi Status Gizi Balita",
    x = "Status Gizi",
    y = "Frekuensi"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 10),
    plot.title = element_text(size = 14, face = "bold"),
    legend.position = "none"
  )
print(p_status)
# ============================================================
# VISUALISASI BOXPLOT UMUR PER STATUS GIZI
# ============================================================
# Boxplot of age by nutritional status; a black diamond marks each class mean.
# NOTE(review): this palette is unnamed and differs from the named palette used
# by the other plots in this report -- confirm that is intentional.
p1 <- ggplot(
  data_balita,
  aes(x = Status_Gizi, y = Umur_Bulan, fill = Status_Gizi)
) +
  geom_boxplot(outlier.size = 0.5, outlier.alpha = 0.3) +
  stat_summary(
    fun = mean,
    geom = "point",
    shape = 18,
    size = 3,
    color = "black"
  ) +
  scale_fill_manual(values = c("#ff9999", "#ffcc99", "#99ff99", "#99ccff")) +
  labs(
    title = "Distribusi Umur per Status Gizi",
    subtitle = "Titik hitam menunjukkan nilai rata-rata",
    x = "Status Gizi",
    y = "Umur (bulan)"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "none"
  )
print(p1)
# ============================================================
# HUBUNGAN UMUR VS TINGGI BADAN BALITA
# ============================================================
# Scatter plot of height against age, coloured by nutritional status.
p_hubungan <- ggplot(
  data_balita,
  aes(x = Umur_Bulan, y = Tinggi_Badan, color = Status_Gizi)
) +
  geom_point(size = 1.2, alpha = 0.5) +
  scale_color_manual(
    values = c(
      "normal" = "#F8766D",
      "severely stunted" = "#7CAE00",
      "stunted" = "#00BFC4",
      "tinggi" = "#C77CFF"
    )
  ) +
  labs(
    title = "Hubungan Umur vs Tinggi Badan Balita",
    x = "Umur (Bulan)",
    y = "Tinggi Badan (cm)",
    color = "Status Gizi"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0, size = 10),
    legend.position = "right"
  )
print(p_hubungan)
# ============================================================
# CEK IMBALANCE DATA
# ============================================================
cat("\nDISTRIBUSI DATA :\n")
##
## DISTRIBUSI DATA :
table(data_balita$Status_Gizi)
##
## normal severely stunted stunted tinggi
## 67755 19869 13815 19560
prop.table(table(data_balita$Status_Gizi)) * 100
##
## normal severely stunted stunted tinggi
## 55.99633 16.42080 11.41745 16.16542
# ============================================================
# SPLIT DATA
# ============================================================
# Fix the RNG seed so the partition below is reproducible.
set.seed(123)
# Draw the training indices with caret's createDataPartition(): p = 0.8 asks
# for 80% of the rows, sampled with respect to Status_Gizi, and list = FALSE
# returns the indices as a matrix instead of a list.
index <- createDataPartition(data_balita$Status_Gizi, p = 0.8, list = FALSE)
# Selected rows form the training set; all remaining rows are held out for testing.
Data_train <- data_balita[index, ]
Data_test <- data_balita[-index, ]
# ============================================================
# NORMALISASI DATA UNTUK KNN
# ============================================================
# Min-max scale a numeric vector onto the [0, 1] interval.
#
# Arguments:
#   x      Numeric vector to rescale.
#   na.rm  Should NA values be ignored when computing the range? The default
#          (FALSE) keeps the original behaviour: any NA makes the result all NA.
#
# Returns a numeric vector of the same length as `x`. A constant vector has a
# zero range; it is mapped to zeros instead of the NaN that 0 / 0 would produce.
normalisasi <- function(x, na.rm = FALSE) {
  batas <- range(x, na.rm = na.rm)
  rentang <- batas[2] - batas[1]
  if (!is.na(rentang) && rentang == 0) {
    # x - min is 0 for every element and keeps NA positions intact.
    return(x - batas[1])
  }
  (x - batas[1]) / rentang
}
# Numeric predictors used by KNN.
train_num <- Data_train[, c("Umur_Bulan", "Tinggi_Badan")]
test_num <- Data_test[, c("Umur_Bulan", "Tinggi_Badan")]

# Scaling parameters are learned from the training set only.
train_min <- vapply(train_num, min, numeric(1))
train_max <- vapply(train_num, max, numeric(1))

train_norm <- as.data.frame(lapply(train_num, normalisasi))

# The test set is rescaled with the TRAINING min/max. Normalising it with its
# own min/max would put the two sets on different scales (and leak test-set
# statistics), which distorts the distances KNN relies on. Test values outside
# the training range may therefore fall slightly outside [0, 1].
test_norm <- as.data.frame(Map(
  function(x, lo, hi) (x - lo) / (hi - lo),
  test_num, train_min, train_max
))
# ============================================================
# MODEL RANDOM FOREST + HYPERPARAMETER TUNING
# ============================================================
cat("\n", rep("=", 70), "\n", sep="")
##
## ======================================================================
# Section banner; the trailing newline keeps the rule printed next on its own
# line when the script is run outside knitr.
cat("RANDOM FOREST - HYPERPARAMETER TUNING\n")
## RANDOM FOREST - HYPERPARAMETER TUNING
cat(rep("=", 70), "\n", sep="")
## ======================================================================
set.seed(123)
ntree_values <- c(100, 200, 300, 500)
mtry_values <- c(1, 2, 3)
nodesize_values <- c(1, 5, 10)

# Build the full hyperparameter grid up front instead of growing the result
# with rbind() inside nested loops. expand.grid() varies its first argument
# fastest, so listing nodesize first gives the same evaluation order as
# looping over ntree (outer), mtry, then nodesize (inner).
rf_grid <- expand.grid(
  nodesize = nodesize_values,
  mtry = mtry_values,
  ntree = ntree_values
)
rf_results2 <- data.frame(
  ntree = rf_grid$ntree,
  mtry = rf_grid$mtry,
  nodesize = rf_grid$nodesize,
  Accuracy = NA_real_
)

# NOTE(review): every candidate is scored on Data_test, and the same set is
# later used to report final performance, so the reported accuracy is
# optimistically biased. Consider selecting on OOB error or cross-validation.
for (i in seq_len(nrow(rf_results2))) {
  # Same seed for every fit so candidates differ only by hyperparameters.
  set.seed(123)
  rf_model <- randomForest(
    Status_Gizi ~ Umur_Bulan + Tinggi_Badan + Jenis_Kelamin,
    data = Data_train,
    ntree = rf_results2$ntree[i],
    mtry = rf_results2$mtry[i],
    nodesize = rf_results2$nodesize[i]
  )
  pred_temp <- predict(rf_model, Data_test)
  # unname() stops the "Accuracy" element name leaking into the row names.
  acc_temp <- unname(
    confusionMatrix(pred_temp, Data_test$Status_Gizi)$overall["Accuracy"]
  )
  rf_results2$Accuracy[i] <- acc_temp
  cat(sprintf(
    "ntree=%d, mtry=%d, nodesize=%d -> Accuracy: %.4f\n",
    rf_results2$ntree[i], rf_results2$mtry[i], rf_results2$nodesize[i],
    acc_temp
  ))
}
## ntree=100, mtry=1, nodesize=1 -> Accuracy: 0.6342
## ntree=100, mtry=1, nodesize=5 -> Accuracy: 0.6068
## ntree=100, mtry=1, nodesize=10 -> Accuracy: 0.5961
## ntree=100, mtry=2, nodesize=1 -> Accuracy: 0.9990
## ntree=100, mtry=2, nodesize=5 -> Accuracy: 0.9987
## ntree=100, mtry=2, nodesize=10 -> Accuracy: 0.9983
## ntree=100, mtry=3, nodesize=1 -> Accuracy: 0.9990
## ntree=100, mtry=3, nodesize=5 -> Accuracy: 0.9986
## ntree=100, mtry=3, nodesize=10 -> Accuracy: 0.9971
## ntree=200, mtry=1, nodesize=1 -> Accuracy: 0.6251
## ntree=200, mtry=1, nodesize=5 -> Accuracy: 0.6053
## ntree=200, mtry=1, nodesize=10 -> Accuracy: 0.5918
## ntree=200, mtry=2, nodesize=1 -> Accuracy: 0.9990
## ntree=200, mtry=2, nodesize=5 -> Accuracy: 0.9986
## ntree=200, mtry=2, nodesize=10 -> Accuracy: 0.9982
## ntree=200, mtry=3, nodesize=1 -> Accuracy: 0.9990
## ntree=200, mtry=3, nodesize=5 -> Accuracy: 0.9987
## ntree=200, mtry=3, nodesize=10 -> Accuracy: 0.9974
## ntree=300, mtry=1, nodesize=1 -> Accuracy: 0.6201
## ntree=300, mtry=1, nodesize=5 -> Accuracy: 0.6038
## ntree=300, mtry=1, nodesize=10 -> Accuracy: 0.5948
## ntree=300, mtry=2, nodesize=1 -> Accuracy: 0.9990
## ntree=300, mtry=2, nodesize=5 -> Accuracy: 0.9987
## ntree=300, mtry=2, nodesize=10 -> Accuracy: 0.9985
## ntree=300, mtry=3, nodesize=1 -> Accuracy: 0.9990
## ntree=300, mtry=3, nodesize=5 -> Accuracy: 0.9986
## ntree=300, mtry=3, nodesize=10 -> Accuracy: 0.9973
## ntree=500, mtry=1, nodesize=1 -> Accuracy: 0.6245
## ntree=500, mtry=1, nodesize=5 -> Accuracy: 0.6059
## ntree=500, mtry=1, nodesize=10 -> Accuracy: 0.5994
## ntree=500, mtry=2, nodesize=1 -> Accuracy: 0.9990
## ntree=500, mtry=2, nodesize=5 -> Accuracy: 0.9987
## ntree=500, mtry=2, nodesize=10 -> Accuracy: 0.9984
## ntree=500, mtry=3, nodesize=1 -> Accuracy: 0.9990
## ntree=500, mtry=3, nodesize=5 -> Accuracy: 0.9986
## ntree=500, mtry=3, nodesize=10 -> Accuracy: 0.9972
# Keep the grid row with the highest accuracy; which.max() returns the first
# maximum, so ties resolve to the earliest combination evaluated.
best_rf2 <- rf_results2[which.max(rf_results2$Accuracy), ]
cat("\nRandom Forest terbaik:\n")
##
## Random Forest terbaik:
print(best_rf2)
## ntree mtry nodesize Accuracy
## Accuracy12 200 2 1 0.9990495
# Refit the forest on the training set with the selected hyperparameters; the
# seed is reset first so the fit is reproducible.
set.seed(123)
model_rf_tuned2 <- randomForest(
Status_Gizi ~ Umur_Bulan + Tinggi_Badan + Jenis_Kelamin,
data = Data_train,
ntree = best_rf2$ntree,
mtry = best_rf2$mtry,
nodesize = best_rf2$nodesize,
# Also request variable-importance measures for the fitted forest.
importance = TRUE
)
# Predict the held-out rows and tabulate the predictions against the true labels.
pred_rf2 <- predict(model_rf_tuned2, Data_test)
cm_rf2 <- confusionMatrix(pred_rf2, Data_test$Status_Gizi)
print(cm_rf2)
## Confusion Matrix and Statistics
##
## Reference
## Prediction normal severely stunted stunted tinggi
## normal 13543 0 1 6
## severely stunted 0 3970 4 0
## stunted 5 3 2758 0
## tinggi 3 0 0 3906
##
## Overall Statistics
##
## Accuracy : 0.9991
## 95% CI : (0.9986, 0.9994)
## No Information Rate : 0.56
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9985
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: normal Class: severely stunted Class: stunted
## Sensitivity 0.9994 0.9992 0.9982
## Specificity 0.9993 0.9998 0.9996
## Pos Pred Value 0.9995 0.9990 0.9971
## Neg Pred Value 0.9992 0.9999 0.9998
## Prevalence 0.5600 0.1642 0.1142
## Detection Rate 0.5597 0.1641 0.1140
## Detection Prevalence 0.5599 0.1642 0.1143
## Balanced Accuracy 0.9994 0.9995 0.9989
## Class: tinggi
## Sensitivity 0.9985
## Specificity 0.9999
## Pos Pred Value 0.9992
## Neg Pred Value 0.9997
## Prevalence 0.1617
## Detection Rate 0.1614
## Detection Prevalence 0.1615
## Balanced Accuracy 0.9992
# ============================================================
# MODEL KNN + HYPERPARAMETER TUNING
# ============================================================
cat("\n", rep("=", 70), "\n", sep="")
##
## ======================================================================
cat("KNN - HYPERPARAMETER TUNING \n")
## KNN - HYPERPARAMETER TUNING
cat(rep("=", 70), "\n", sep="")
## ======================================================================
# Candidate neighbourhood sizes: odd values from 1 to 31.
k_values <- seq(1, 31, by = 2)
# Pre-allocated result table; NA_real_ keeps the metric columns numeric.
knn_results2 <- data.frame(k = k_values, Accuracy = NA_real_, F1 = NA_real_)

# NOTE(review): candidates are scored on Data_test, which is also used for the
# final report, so the selected k and its accuracy are optimistically biased.
# seq_along() is used instead of 1:length() so an empty grid runs zero times.
for (i in seq_along(k_values)) {
  # Reset the seed so every candidate is evaluated under the same RNG state.
  set.seed(123)
  pred_knn2 <- knn(
    train = train_norm,
    test = test_norm,
    cl = Data_train$Status_Gizi,
    k = k_values[i]
  )
  cm_temp <- confusionMatrix(pred_knn2, Data_test$Status_Gizi)
  knn_results2$Accuracy[i] <- cm_temp$overall["Accuracy"]
  # Macro-averaged F1: unweighted mean of the per-class F1 scores.
  knn_results2$F1[i] <- mean(cm_temp$byClass[, "F1"], na.rm = TRUE)
  cat(sprintf(
    "k = %2d -> Accuracy: %.4f, F1: %.4f\n",
    k_values[i], knn_results2$Accuracy[i], knn_results2$F1[i]
  ))
}
## k = 1 -> Accuracy: 0.9339, F1: 0.9036
## k = 3 -> Accuracy: 0.9352, F1: 0.9053
## k = 5 -> Accuracy: 0.9344, F1: 0.9037
## k = 7 -> Accuracy: 0.9344, F1: 0.9034
## k = 9 -> Accuracy: 0.9345, F1: 0.9037
## k = 11 -> Accuracy: 0.9346, F1: 0.9035
## k = 13 -> Accuracy: 0.9336, F1: 0.9018
## k = 15 -> Accuracy: 0.9338, F1: 0.9020
## k = 17 -> Accuracy: 0.9344, F1: 0.9030
## k = 19 -> Accuracy: 0.9351, F1: 0.9042
## k = 21 -> Accuracy: 0.9349, F1: 0.9037
## k = 23 -> Accuracy: 0.9349, F1: 0.9039
## k = 25 -> Accuracy: 0.9362, F1: 0.9057
## k = 27 -> Accuracy: 0.9365, F1: 0.9065
## k = 29 -> Accuracy: 0.9359, F1: 0.9055
## k = 31 -> Accuracy: 0.9368, F1: 0.9073
# Pick the k with the highest accuracy (first maximum on ties).
# NOTE(review): in the recorded run the winner is 31, the largest k tried, so
# the search range may be too narrow -- consider extending it.
best_k2 <- k_values[which.max(knn_results2$Accuracy)]
cat("\nNilai k terbaik:", best_k2, "\n")
##
## Nilai k terbaik: 31
# Re-run KNN with the selected k on the two normalised numeric predictors
# (Umur_Bulan and Tinggi_Badan); the seed is reset first for reproducibility.
set.seed(123)
pred_knn2 <- knn(
train = train_norm,
test = test_norm,
cl = Data_train$Status_Gizi,
k = best_k2
)
# Tabulate the predictions against the true labels of the held-out rows.
cm_knn2 <- confusionMatrix(pred_knn2, Data_test$Status_Gizi)
print(cm_knn2)
## Confusion Matrix and Statistics
##
## Reference
## Prediction normal severely stunted stunted tinggi
## normal 13092 0 290 157
## severely stunted 0 3660 310 0
## stunted 270 313 2163 0
## tinggi 189 0 0 3755
##
## Overall Statistics
##
## Accuracy : 0.9368
## 95% CI : (0.9337, 0.9398)
## No Information Rate : 0.56
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8982
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: normal Class: severely stunted Class: stunted
## Sensitivity 0.9661 0.9212 0.78284
## Specificity 0.9580 0.9847 0.97280
## Pos Pred Value 0.9670 0.9219 0.78769
## Neg Pred Value 0.9569 0.9845 0.97203
## Prevalence 0.5600 0.1642 0.11418
## Detection Rate 0.5410 0.1512 0.08938
## Detection Prevalence 0.5595 0.1641 0.11348
## Balanced Accuracy 0.9621 0.9529 0.87782
## Class: tinggi
## Sensitivity 0.9599
## Specificity 0.9907
## Pos Pred Value 0.9521
## Neg Pred Value 0.9922
## Prevalence 0.1617
## Detection Rate 0.1552
## Detection Prevalence 0.1630
## Balanced Accuracy 0.9753
# ============================================================
# ROC - AUC
# ============================================================
# Compute the multiclass ROC from the forest's per-class probabilities.
# as.numeric(pred_rf2) would only pass the integer codes of the predicted
# labels (alphabetical level order), which is not a meaningful score for
# ranking observations. unclass() leaves a plain matrix whose columns are
# named after the class levels, as multiclass.roc() expects.
prob_rf2 <- unclass(predict(model_rf_tuned2, Data_test, type = "prob"))
roc_rf2 <- multiclass.roc(Data_test$Status_Gizi, prob_rf2)
## Setting direction: controls < cases
## Setting direction: controls < cases
## Setting direction: controls < cases
## Setting direction: controls < cases
## Setting direction: controls < cases
## Setting direction: controls < cases
# Compute the multiclass ROC from per-class neighbour vote proportions rather
# than from the integer codes of the predicted labels. class::knn() only
# reports the winning class, so caret's knn3Train() is used to obtain the full
# probability matrix (returned in the "prob" attribute) for the same k.
prob_knn2 <- attr(
  knn3Train(
    train_norm,
    test_norm,
    cl = Data_train$Status_Gizi,
    k = best_k2,
    prob = TRUE
  ),
  "prob"
)
roc_knn2 <- multiclass.roc(Data_test$Status_Gizi, prob_knn2)
## Setting direction: controls < cases
## Setting direction: controls < cases
## Setting direction: controls < cases
## Setting direction: controls < cases
## Setting direction: controls < cases
## Setting direction: controls < cases
# Pull the AUC stored on each multiclass ROC object.
auc_rf2 <- roc_rf2$auc
auc_knn2 <- roc_knn2$auc
# ============================================================
# EVALUASI MODEL
# ============================================================
# Macro-average one per-class statistic of a caret confusionMatrix object:
# the unweighted mean over classes, ignoring classes where it is undefined.
rata_makro <- function(cm, statistik) {
  mean(cm$byClass[, statistik], na.rm = TRUE)
}

# Random forest
acc_rf2 <- cm_rf2$overall["Accuracy"]
prec_rf2 <- rata_makro(cm_rf2, "Precision")
rec_rf2 <- rata_makro(cm_rf2, "Recall")
sens_rf2 <- rata_makro(cm_rf2, "Sensitivity")
f1_rf2 <- rata_makro(cm_rf2, "F1")

# KNN
acc_knn2 <- cm_knn2$overall["Accuracy"]
prec_knn2 <- rata_makro(cm_knn2, "Precision")
rec_knn2 <- rata_makro(cm_knn2, "Recall")
sens_knn2 <- rata_makro(cm_knn2, "Sensitivity")
f1_knn2 <- rata_makro(cm_knn2, "F1")
# ============================================================
# PERBANDINGAN MODEL
# ============================================================
# Side-by-side summary of both models: one row per model, one column per metric.
Perbandingan_Model2 <- data.frame(
  Model = c("Random Forest", "KNN"),
  rbind(
    c(
      Accuracy = unname(acc_rf2),
      Precision = prec_rf2,
      Recall = rec_rf2,
      Sensitivity = sens_rf2,
      F1_Score = f1_rf2,
      AUC = as.numeric(auc_rf2)
    ),
    c(
      Accuracy = unname(acc_knn2),
      Precision = prec_knn2,
      Recall = rec_knn2,
      Sensitivity = sens_knn2,
      F1_Score = f1_knn2,
      AUC = as.numeric(auc_knn2)
    )
  )
)
print(Perbandingan_Model2)
## Model Accuracy Precision Recall Sensitivity F1_Score AUC
## 1 Random Forest 0.9990909 0.9987043 0.9988278 0.9988278 0.9987659 0.9989024
## 2 KNN 0.9368156 0.9071672 0.9075145 0.9075145 0.9073349 0.9313709
Model terbaik dipilih berdasarkan nilai Accuracy, Precision, Recall, F1-Score, dan AUC yang diperoleh dari hasil pengujian data.