library(readxl)
## Warning: package 'readxl' was built under R version 4.5.2
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(psych)
library(writexl)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
library(e1071)
library(car)
## Warning: package 'car' was built under R version 4.5.2
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
## The following object is masked from 'package:dplyr':
##
## recode
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:psych':
##
## outlier
## The following object is masked from 'package:dplyr':
##
## combine
library(caret)
## Loading required package: lattice
library(iml)
## Warning: package 'iml' was built under R version 4.5.2
library(ROSE)
## Warning: package 'ROSE' was built under R version 4.5.2
## Loaded ROSE 0.0-4
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
Life_Expectancy_Data <- read_excel("Life Expectancy Data.xlsx")
str(Life_Expectancy_Data)
## tibble [2,938 × 18] (S3: tbl_df/tbl/data.frame)
## $ Status : chr [1:2938] "Developing" "Developing" "Developing" "Developing" ...
## $ Life expectancy : num [1:2938] 65 59.9 59.9 59.5 59.2 58.8 58.6 58.1 57.5 57.3 ...
## $ Adult Mortality : num [1:2938] 263 271 268 272 275 279 281 287 295 295 ...
## $ Alcohol : num [1:2938] 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.03 0.02 0.03 ...
## $ percentage expenditure : num [1:2938] 71.3 73.5 73.2 78.2 7.1 ...
## $ Hepatitis B : num [1:2938] 65 62 64 67 68 66 63 64 63 64 ...
## $ Measles : num [1:2938] 1154 492 430 2787 3013 ...
## $ BMI : num [1:2938] 19.1 18.6 18.1 17.6 17.2 16.7 16.2 15.7 15.2 14.7 ...
## $ under-five deaths : num [1:2938] 83 86 89 93 97 102 106 110 113 116 ...
## $ Polio : num [1:2938] 6 58 62 67 68 66 63 64 63 58 ...
## $ Total expenditure : num [1:2938] 8.16 8.18 8.13 8.52 7.87 9.2 9.42 8.33 6.73 7.43 ...
## $ Diphtheria : num [1:2938] 65 62 64 67 68 66 63 64 63 58 ...
## $ HIV/AIDS : num [1:2938] 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 ...
## $ GDP : num [1:2938] 584.3 612.7 631.7 670 63.5 ...
## $ Population : num [1:2938] 33736494 327582 31731688 3696958 2978599 ...
## $ thinness 5-9 years : num [1:2938] 17.3 17.5 17.7 18 18.2 18.4 18.7 18.9 19.1 19.3 ...
## $ Income composition of resources: num [1:2938] 0.479 0.476 0.47 0.463 0.454 0.448 0.434 0.433 0.415 0.405 ...
## $ Schooling : num [1:2938] 10.1 10 9.9 9.8 9.5 9.2 8.9 8.7 8.4 8.1 ...
describe(Life_Expectancy_Data)
## vars n mean sd median
## Status* 1 2938 1.83 0.38 2.00
## Life expectancy 2 2928 69.22 9.52 72.10
## Adult Mortality 3 2928 164.80 124.29 144.00
## Alcohol 4 2744 4.60 4.05 3.76
## percentage expenditure 5 2938 738.25 1987.91 64.91
## Hepatitis B 6 2385 80.94 25.07 92.00
## Measles 7 2938 2419.59 11467.27 17.00
## BMI 8 2904 38.32 20.04 43.50
## under-five deaths 9 2938 42.04 160.45 4.00
## Polio 10 2919 82.55 23.43 93.00
## Total expenditure 11 2712 5.94 2.50 5.76
## Diphtheria 12 2919 82.32 23.72 93.00
## HIV/AIDS 13 2938 1.74 5.08 0.10
## GDP 14 2490 7483.16 14270.17 1766.95
## Population 15 2286 12753375.12 61012096.51 1386542.00
## thinness 5-9 years 16 2904 4.87 4.51 3.30
## Income composition of resources 17 2771 0.63 0.21 0.68
## Schooling 18 2775 11.99 3.36 12.30
## trimmed mad min max
## Status* 1.91 0.00 1.00 2.000000e+00
## Life expectancy 69.91 8.60 36.30 8.900000e+01
## Adult Mortality 150.51 112.68 1.00 7.230000e+02
## Alcohol 4.23 4.81 0.01 1.787000e+01
## percentage expenditure 230.74 96.24 0.00 1.947991e+04
## Hepatitis B 86.89 8.90 1.00 9.900000e+01
## Measles 286.08 25.20 0.00 2.121830e+05
## BMI 39.05 24.17 1.00 8.730000e+01
## under-five deaths 14.15 5.93 0.00 2.500000e+03
## Polio 88.05 8.90 3.00 9.900000e+01
## Total expenditure 5.85 2.36 0.37 1.760000e+01
## Diphtheria 87.99 8.90 2.00 9.900000e+01
## HIV/AIDS 0.54 0.00 0.10 5.060000e+01
## GDP 3751.73 2360.98 1.68 1.191727e+05
## Population 3953693.58 2012347.06 34.00 1.293859e+09
## thinness 5-9 years 4.15 3.41 0.10 2.860000e+01
## Income composition of resources 0.65 0.19 0.00 9.500000e-01
## Schooling 12.17 3.11 0.00 2.070000e+01
## range skew kurtosis se
## Status* 1.000000e+00 -1.72 0.95 0.01
## Life expectancy 5.270000e+01 -0.64 -0.24 0.18
## Adult Mortality 7.220000e+02 1.17 1.74 2.30
## Alcohol 1.786000e+01 0.59 -0.81 0.08
## percentage expenditure 1.947991e+04 4.65 26.51 36.68
## Hepatitis B 9.800000e+01 -1.93 2.76 0.51
## Measles 2.121830e+05 9.43 114.58 211.56
## BMI 8.630000e+01 -0.22 -1.29 0.37
## under-five deaths 2.500000e+03 9.49 109.49 2.96
## Polio 9.600000e+01 -2.10 3.76 0.43
## Total expenditure 1.723000e+01 0.62 1.15 0.05
## Diphtheria 9.700000e+01 -2.07 3.55 0.44
## HIV/AIDS 5.050000e+01 5.39 34.80 0.09
## GDP 1.191711e+05 3.20 12.29 285.98
## Population 1.293859e+09 15.90 297.09 1276079.80
## thinness 5-9 years 2.850000e+01 1.78 4.34 0.08
## Income composition of resources 9.500000e-01 -1.14 1.38 0.00
## Schooling 2.070000e+01 -0.60 0.88 0.06
# Per-column count of missing values in the raw data, printed as a table.
# The counts are kept as doubles and carry the column names, which become
# the row names of the resulting data frame.
missing_table <- data.frame(
  Variable = colnames(Life_Expectancy_Data),
  Missing = vapply(
    Life_Expectancy_Data,
    function(column) as.numeric(sum(is.na(column))),
    numeric(1)
  )
)
print(missing_table)
## Variable Missing
## Status Status 0
## Life expectancy Life expectancy 10
## Adult Mortality Adult Mortality 10
## Alcohol Alcohol 194
## percentage expenditure percentage expenditure 0
## Hepatitis B Hepatitis B 553
## Measles Measles 0
## BMI BMI 34
## under-five deaths under-five deaths 0
## Polio Polio 19
## Total expenditure Total expenditure 226
## Diphtheria Diphtheria 19
## HIV/AIDS HIV/AIDS 0
## GDP GDP 448
## Population Population 652
## thinness 5-9 years thinness 5-9 years 34
## Income composition of resources Income composition of resources 167
## Schooling Schooling 163
# Split the column names by storage type. vapply() always yields a logical
# vector (sapply() would return an empty list for a zero-column input, which
# cannot be used as a subscript).
num_vars <- names(Life_Expectancy_Data)[
  vapply(Life_Expectancy_Data, is.numeric, logical(1))
]
cat_vars <- names(Life_Expectancy_Data)[
  vapply(Life_Expectancy_Data, is.character, logical(1))
]
# Fill the missing entries of a numeric vector with a central value.
#
# x: numeric vector, possibly containing NA.
#
# Returns `x` with every NA replaced by the median when the distribution is
# strongly skewed (|skewness| > 1) and by the mean otherwise. A vector that is
# entirely NA is returned unchanged because no central value exists.
#
# Relies on a `skewness(x, na.rm = )` function being in scope (this script
# attaches e1071, which provides one).
impute_numeric <- function(x) {
  if (all(is.na(x))) {
    return(x)
  }
  skew <- skewness(x, na.rm = TRUE)
  # For a constant column the skewness is 0/0 = NaN; isTRUE() treats that as
  # "not skewed" instead of letting `if` fail on a missing condition.
  fill_value <- if (isTRUE(abs(skew) > 1)) {
    median(x, na.rm = TRUE)
  } else {
    mean(x, na.rm = TRUE)
  }
  x[is.na(x)] <- fill_value
  x
}
# Fill the missing entries of a categorical vector with its most frequent value.
#
# x: character (or factor) vector, possibly containing NA.
#
# Returns `x` with every NA replaced by the mode. Ties are resolved in favour
# of the first level in table() order. A vector that is entirely NA is returned
# unchanged, mirroring impute_numeric(), because there is no mode to use.
impute_categorical <- function(x) {
  missing <- is.na(x)
  if (all(missing)) {
    return(x)
  }
  counts <- table(x)
  x[missing] <- names(counts)[which.max(counts)]
  x
}
# Build the imputed data set from the raw data, leaving the raw data intact so
# the missing-value counts can be compared before and after.
# Numeric columns go through impute_numeric(), character columns through
# impute_categorical().
Life_Expectancy_Imputed <- Life_Expectancy_Data %>%
  mutate(
    across(all_of(num_vars), impute_numeric),
    across(all_of(cat_vars), impute_categorical)
  )
cat("Missing value sebelum imputasi:\n")
## Missing value sebelum imputasi:
print(colSums(is.na(Life_Expectancy_Data)))
## Status Life expectancy
## 0 10
## Adult Mortality Alcohol
## 10 194
## percentage expenditure Hepatitis B
## 0 553
## Measles BMI
## 0 34
## under-five deaths Polio
## 0 19
## Total expenditure Diphtheria
## 226 19
## HIV/AIDS GDP
## 0 448
## Population thinness 5-9 years
## 652 34
## Income composition of resources Schooling
## 167 163
cat("\nMissing value sesudah imputasi:\n")
##
## Missing value sesudah imputasi:
print(colSums(is.na(Life_Expectancy_Imputed)))
## Status Life expectancy
## 0 0
## Adult Mortality Alcohol
## 0 0
## percentage expenditure Hepatitis B
## 0 0
## Measles BMI
## 0 0
## under-five deaths Polio
## 0 0
## Total expenditure Diphtheria
## 0 0
## HIV/AIDS GDP
## 0 0
## Population thinness 5-9 years
## 0 0
## Income composition of resources Schooling
## 0 0
# Count the outliers of a numeric vector using the IQR fence rule.
#
# x: numeric vector; NA values are ignored.
# k: fence multiplier applied to the IQR (default 1.5, the usual boxplot rule).
#
# Returns the number of values below Q1 - k * IQR or above Q3 + k * IQR.
cek_outlier <- function(x, k = 1.5) {
  quartiles <- quantile(x, probs = c(0.25, 0.75), na.rm = TRUE, names = FALSE)
  fence <- k * (quartiles[2] - quartiles[1])
  lower <- quartiles[1] - fence
  upper <- quartiles[2] + fence
  sum(x < lower | x > upper, na.rm = TRUE)
}
# Recompute the numeric column names on the imputed data and count the IQR
# outliers of each one. vapply() pins the result to one integer per column so
# the data frame is well-formed even when there are no numeric columns.
num_vars <- names(Life_Expectancy_Imputed)[
  vapply(Life_Expectancy_Imputed, is.numeric, logical(1))
]
outlier_before <- data.frame(
  Variable = num_vars,
  Outlier_Before = vapply(
    Life_Expectancy_Imputed[num_vars], cek_outlier, integer(1)
  )
)
cat("=== OUTLIER SEBELUM PEMBERSIHAN ===\n")
## === OUTLIER SEBELUM PEMBERSIHAN ===
print(outlier_before)
## Variable Outlier_Before
## Life expectancy Life expectancy 17
## Adult Mortality Adult Mortality 86
## Alcohol Alcohol 3
## percentage expenditure percentage expenditure 389
## Hepatitis B Hepatitis B 322
## Measles Measles 542
## BMI BMI 0
## under-five deaths under-five deaths 394
## Polio Polio 279
## Total expenditure Total expenditure 51
## Diphtheria Diphtheria 298
## HIV/AIDS HIV/AIDS 542
## GDP GDP 445
## Population Population 452
## thinness 5-9 years thinness 5-9 years 99
## Income composition of resources Income composition of resources 130
## Schooling Schooling 77
# Reshape the numeric columns to long form (one row per variable/value pair)
# so each variable can be drawn in its own boxplot facet.
df_before_long <- tidyr::pivot_longer(
  Life_Expectancy_Imputed[num_vars],
  cols = everything(),
  names_to = "Variable",
  values_to = "Value"
)
# Faceted boxplots with free scales; outliers are highlighted in red and the
# (meaningless) x axis text and ticks are hidden.
ggplot(data = df_before_long, mapping = aes(x = "", y = Value)) +
  geom_boxplot(
    outlier.size = 1,
    outlier.color = "red",
    color = "darkgreen",
    fill = "lightgreen"
  ) +
  labs(title = "Boxplot Outlier Sebelum Pembersihan", x = NULL, y = "Nilai") +
  facet_wrap(~ Variable, ncol = 4, scales = "free") +
  theme_minimal(base_size = 11) +
  theme(
    axis.ticks.x = element_blank(),
    axis.text.x = element_blank(),
    strip.text = element_text(face = "bold")
  )
# Cap (winsorize) the outliers of a numeric vector at the IQR fences.
#
# x: numeric vector; NA values are left as NA.
# k: fence multiplier applied to the IQR (default 1.5, the usual boxplot rule).
#
# Returns `x` with values below Q1 - k * IQR raised to that lower fence and
# values above Q3 + k * IQR lowered to that upper fence.
handle_outlier <- function(x, k = 1.5) {
  quartiles <- quantile(x, probs = c(0.25, 0.75), na.rm = TRUE, names = FALSE)
  fence <- k * (quartiles[2] - quartiles[1])
  lower <- quartiles[1] - fence
  upper <- quartiles[2] + fence
  x[x < lower] <- lower
  x[x > upper] <- upper
  x
}
# Cap the outliers of every numeric column, then recount them to confirm the
# capping worked. vapply() pins the count to one integer per column.
Life_Expectancy_Clean <- Life_Expectancy_Imputed
Life_Expectancy_Clean[num_vars] <-
  lapply(Life_Expectancy_Imputed[num_vars], handle_outlier)
outlier_after <- data.frame(
  Variable = num_vars,
  Outlier_After = vapply(
    Life_Expectancy_Clean[num_vars], cek_outlier, integer(1)
  )
)
print(outlier_after)
## Variable Outlier_After
## Life expectancy Life expectancy 0
## Adult Mortality Adult Mortality 0
## Alcohol Alcohol 0
## percentage expenditure percentage expenditure 0
## Hepatitis B Hepatitis B 0
## Measles Measles 0
## BMI BMI 0
## under-five deaths under-five deaths 0
## Polio Polio 0
## Total expenditure Total expenditure 0
## Diphtheria Diphtheria 0
## HIV/AIDS HIV/AIDS 0
## GDP GDP 0
## Population Population 0
## thinness 5-9 years thinness 5-9 years 0
## Income composition of resources Income composition of resources 0
## Schooling Schooling 0
# Long-form copy of the capped numeric columns, one row per variable/value
# pair, used for the "after" boxplots.
df_after_long <- pivot_longer(
  Life_Expectancy_Clean[num_vars],
  cols = everything(),
  names_to = "Variable",
  values_to = "Value"
)
# Same faceted boxplot layout as the "before" figure, drawn on the capped data.
ggplot(data = df_after_long, mapping = aes(x = "", y = Value)) +
  geom_boxplot(
    outlier.size = 1,
    outlier.color = "red",
    color = "darkgreen",
    fill = "lightgreen"
  ) +
  labs(title = "Boxplot Setelah Penanganan Outlier", x = NULL, y = "Nilai") +
  facet_wrap(~ Variable, ncol = 4, scales = "free") +
  theme_minimal(base_size = 11) +
  theme(
    axis.ticks.x = element_blank(),
    axis.text.x = element_blank(),
    strip.text = element_text(face = "bold")
  )
# Descriptive statistics of the cleaned data.
describe(Life_Expectancy_Clean)
## vars n mean sd median
## Status* 1 2938 1.83 0.38 2.00
## Life expectancy 2 2938 69.23 9.48 72.00
## Adult Mortality 3 2938 161.95 115.49 144.00
## Alcohol 4 2938 4.60 3.91 4.16
## percentage expenditure 5 2938 284.05 389.46 64.91
## Hepatitis B 6 2938 87.01 12.31 92.00
## Measles 7 2938 233.75 353.61 17.00
## BMI 8 2938 38.32 19.93 43.00
## under-five deaths 9 2938 17.97 25.19 4.00
## Polio 10 2938 85.36 15.81 93.00
## Total expenditure 11 2938 5.90 2.29 5.94
## Diphtheria 12 2938 85.30 15.79 93.00
## HIV/AIDS 13 2938 0.54 0.69 0.10
## GDP 14 2938 3443.02 3777.90 1766.95
## Population 15 2938 3173254.80 3837554.78 1386542.00
## thinness 5-9 years 16 2938 4.71 3.98 3.30
## Income composition of resources 17 2938 0.63 0.19 0.68
## Schooling 18 2938 12.04 3.10 12.10
## trimmed mad min max
## Status* 1.91 0.00 1.00 2.00
## Life expectancy 69.91 8.60 44.60 89.00
## Adult Mortality 150.46 111.19 1.00 456.50
## Alcohol 4.24 4.67 0.01 16.84
## percentage expenditure 218.18 96.24 0.00 1096.81
## Hepatitis B 88.78 7.41 61.00 99.00
## Measles 179.79 25.20 0.00 900.62
## BMI 39.04 24.17 1.00 87.30
## under-five deaths 13.72 5.93 0.00 70.00
## Polio 88.11 8.90 49.50 99.00
## Total expenditure 5.85 2.21 0.37 11.77
## Diphtheria 88.06 7.41 49.50 99.00
## HIV/AIDS 0.43 0.00 0.10 1.85
## GDP 2909.41 2115.86 1.68 11077.78
## Population 2612276.22 1804399.81 34.00 10832552.25
## thinness 5-9 years 4.14 3.41 0.10 15.60
## Income composition of resources 0.65 0.17 0.10 0.95
## Schooling 12.16 2.82 4.60 19.80
## range skew kurtosis se
## Status* 1.00 -1.72 0.95 0.01
## Life expectancy 44.40 -0.62 -0.29 0.17
## Adult Mortality 455.50 0.77 -0.01 2.13
## Alcohol 16.83 0.61 -0.66 0.07
## percentage expenditure 1096.81 1.24 -0.03 7.19
## Hepatitis B 38.00 -1.17 -0.04 0.23
## Measles 900.62 1.18 -0.40 6.52
## BMI 86.30 -0.22 -1.27 0.37
## under-five deaths 70.00 1.24 -0.08 0.46
## Polio 49.50 -1.24 0.22 0.29
## Total expenditure 11.40 0.27 -0.04 0.04
## Diphtheria 49.50 -1.26 0.28 0.29
## HIV/AIDS 1.75 1.21 -0.36 0.01
## GDP 11076.10 1.16 -0.16 69.70
## Population 10832518.25 1.19 -0.21 70799.26
## thinness 5-9 years 15.50 1.10 0.56 0.07
## Income composition of resources 0.85 -0.90 0.59 0.00
## Schooling 15.20 -0.30 -0.02 0.06
# Names of the character (categorical) columns that still need encoding.
# vapply() guarantees a logical mask, unlike sapply() on an empty input.
cat_cols <- names(Life_Expectancy_Clean)[
  vapply(Life_Expectancy_Clean, is.character, logical(1))
]
cat("Kolom kategorikal yang ditemukan:\n")
## Kolom kategorikal yang ditemukan:
print(cat_cols)
## [1] "Status"
# Encode each two-valued categorical column as 0/1 (factor level order minus
# one). Any other column is only reported, not transformed.
for (col in cat_cols) {
  unique_vals <- unique(Life_Expectancy_Clean[[col]])
  if (length(unique_vals) != 2) {
    cat("Kolom", col, "memiliki lebih dari 2 kategori, perlu one-hot encoding.\n")
    next
  }
  Life_Expectancy_Clean[[col]] <-
    as.numeric(factor(Life_Expectancy_Clean[[col]])) - 1
}
# Show the structure after encoding.
str(Life_Expectancy_Clean)
## tibble [2,938 × 18] (S3: tbl_df/tbl/data.frame)
## $ Status : num [1:2938] 1 1 1 1 1 1 1 1 1 1 ...
## $ Life expectancy : num [1:2938] 65 59.9 59.9 59.5 59.2 58.8 58.6 58.1 57.5 57.3 ...
## $ Adult Mortality : num [1:2938] 263 271 268 272 275 279 281 287 295 295 ...
## $ Alcohol : num [1:2938] 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.03 0.02 0.03 ...
## $ percentage expenditure : num [1:2938] 71.3 73.5 73.2 78.2 7.1 ...
## $ Hepatitis B : num [1:2938] 65 62 64 67 68 66 63 64 63 64 ...
## $ Measles : num [1:2938] 901 492 430 901 901 ...
## $ BMI : num [1:2938] 19.1 18.6 18.1 17.6 17.2 16.7 16.2 15.7 15.2 14.7 ...
## $ under-five deaths : num [1:2938] 70 70 70 70 70 70 70 70 70 70 ...
## $ Polio : num [1:2938] 49.5 58 62 67 68 66 63 64 63 58 ...
## $ Total expenditure : num [1:2938] 8.16 8.18 8.13 8.52 7.87 9.2 9.42 8.33 6.73 7.43 ...
## $ Diphtheria : num [1:2938] 65 62 64 67 68 66 63 64 63 58 ...
## $ HIV/AIDS : num [1:2938] 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 ...
## $ GDP : num [1:2938] 584.3 612.7 631.7 670 63.5 ...
## $ Population : num [1:2938] 10832552 327582 10832552 3696958 2978599 ...
## $ thinness 5-9 years : num [1:2938] 15.6 15.6 15.6 15.6 15.6 15.6 15.6 15.6 15.6 15.6 ...
## $ Income composition of resources: num [1:2938] 0.479 0.476 0.47 0.463 0.454 0.448 0.434 0.433 0.415 0.405 ...
## $ Schooling : num [1:2938] 10.1 10 9.9 9.8 9.5 9.2 8.9 8.7 8.4 8.1 ...
# Open the data viewer only in an interactive session; View() has no usable
# viewer when the script runs in batch mode.
if (interactive()) {
  View(Life_Expectancy_Clean)
}
# Make the column names syntactically valid so they can be used in formulas.
names(Life_Expectancy_Clean) <- make.names(names(Life_Expectancy_Clean))
# Multicollinearity check: regress life expectancy on the numeric predictors
# and compute the variance inflation factor of each one.
target <- "Life.expectancy"
predictors <- c(
  "Adult.Mortality", "Alcohol", "percentage.expenditure", "Hepatitis.B",
  "Measles", "BMI", "under.five.deaths", "Polio", "Total.expenditure",
  "Diphtheria", "HIV.AIDS", "GDP", "Population", "thinness.5.9.years",
  "Income.composition.of.resources", "Schooling"
)
formula_str <- paste(target, "~", paste(predictors, collapse = " + "))
model_vif <- lm(as.formula(formula_str), data = Life_Expectancy_Clean)
vif_values <- vif(model_vif)
# One row per predictor for vertical printing.
vif_df <- data.frame(
  Variabel = names(vif_values),
  VIF = as.numeric(vif_values)
)
cat("=== HASIL VIF (Vertikal) ===\n")
## === HASIL VIF (Vertikal) ===
print(vif_df)
## Variabel VIF
## 1 Adult.Mortality 1.713391
## 2 Alcohol 1.591185
## 3 percentage.expenditure 3.811133
## 4 Hepatitis.B 1.480498
## 5 Measles 1.598401
## 6 BMI 1.852373
## 7 under.five.deaths 2.389527
## 8 Polio 3.760797
## 9 Total.expenditure 1.195959
## 10 Diphtheria 3.923627
## 11 HIV.AIDS 2.146507
## 12 GDP 4.196137
## 13 Population 1.200941
## 14 thinness.5.9.years 1.789578
## 15 Income.composition.of.resources 3.425626
## 16 Schooling 4.161519
cat("\n=== INTERPRETASI OTOMATIS ===\n")
##
## === INTERPRETASI OTOMATIS ===
# Print a verdict for every predictor: VIF < 5 none, 5 <= VIF < 10 moderate,
# VIF >= 10 high multicollinearity. seq_along() keeps the loop from running
# when vif_values is empty (1:length() would iterate over c(1, 0)).
for (i in seq_along(vif_values)) {
  vif_i <- vif_values[[i]]
  if (vif_i < 5) {
    cat(names(vif_values)[i], ": Tidak ada indikasi multikolinearitas (VIF < 5)\n")
  } else if (vif_i < 10) {
    cat(names(vif_values)[i], ": Ada indikasi multikolinearitas sedang (5 ≤ VIF < 10)\n")
  } else {
    cat(names(vif_values)[i], ": Multikolinearitas tinggi (VIF ≥ 10)\n")
  }
}
## Adult.Mortality : Tidak ada indikasi multikolinearitas (VIF < 5)
## Alcohol : Tidak ada indikasi multikolinearitas (VIF < 5)
## percentage.expenditure : Tidak ada indikasi multikolinearitas (VIF < 5)
## Hepatitis.B : Tidak ada indikasi multikolinearitas (VIF < 5)
## Measles : Tidak ada indikasi multikolinearitas (VIF < 5)
## BMI : Tidak ada indikasi multikolinearitas (VIF < 5)
## under.five.deaths : Tidak ada indikasi multikolinearitas (VIF < 5)
## Polio : Tidak ada indikasi multikolinearitas (VIF < 5)
## Total.expenditure : Tidak ada indikasi multikolinearitas (VIF < 5)
## Diphtheria : Tidak ada indikasi multikolinearitas (VIF < 5)
## HIV.AIDS : Tidak ada indikasi multikolinearitas (VIF < 5)
## GDP : Tidak ada indikasi multikolinearitas (VIF < 5)
## Population : Tidak ada indikasi multikolinearitas (VIF < 5)
## thinness.5.9.years : Tidak ada indikasi multikolinearitas (VIF < 5)
## Income.composition.of.resources : Tidak ada indikasi multikolinearitas (VIF < 5)
## Schooling : Tidak ada indikasi multikolinearitas (VIF < 5)
# Reproducible 80/20 random split into training and testing rows.
set.seed(123)
n_obs <- nrow(Life_Expectancy_Clean)
# seq_len() is safe for an empty data set, and floor() makes the truncation of
# the fractional sample size explicit.
idx <- sample(seq_len(n_obs), size = floor(0.8 * n_obs))
data_train <- Life_Expectancy_Clean[idx, ]
data_test <- Life_Expectancy_Clean[-idx, ]
cat("Jumlah data training:", nrow(data_train), "\n")
## Jumlah data training: 2350
cat("Jumlah data testing :", nrow(data_test), "\n")
## Jumlah data testing : 588
# The classification target is the (0/1 encoded) Status column.
target <- "Status"
data_train[[target]] <- as.factor(data_train[[target]])
# Give the test factor the same level set and order as the training factor.
data_test[[target]] <- factor(
  data_test[[target]],
  levels = levels(data_train[[target]])
)

# Min-max scale a numeric vector.
#
# x:  numeric vector to scale.
# lo: value mapped to 0 (defaults to min(x)).
# hi: value mapped to 1 (defaults to max(x)).
#
# Returns (x - lo) / (hi - lo). When hi equals lo the column carries no
# information, so zeros are returned instead of NaN from a 0/0 division.
normalize <- function(x, lo = min(x), hi = max(x)) {
  span <- hi - lo
  if (isTRUE(span == 0)) {
    return(rep(0, length(x)))
  }
  (x - lo) / span
}

num_cols <- names(data_train)[vapply(data_train, is.numeric, logical(1))]
# Scale both sets with the TRAINING minimum and maximum. Scaling the test set
# with its own range would put the two sets on different scales and let test
# information define the preprocessing.
for (num_col in num_cols) {
  train_lo <- min(data_train[[num_col]])
  train_hi <- max(data_train[[num_col]])
  data_train[[num_col]] <- normalize(data_train[[num_col]], train_lo, train_hi)
  data_test[[num_col]] <- normalize(data_test[[num_col]], train_lo, train_hi)
}
# Class distribution of the training data.
data_train %>%
  count(Status) %>%
  mutate(proporsi = round(n / sum(n) * 100, 2))
## # A tibble: 2 × 3
## Status n proporsi
## <fct> <int> <dbl>
## 1 0 413 17.6
## 2 1 1937 82.4
#data testing
data_test %>%
count(Status) %>%
mutate(proporsi = round(n / sum(n) * 100, 2))
## # A tibble: 2 × 3
## Status n proporsi
## <fct> <int> <dbl>
## 1 0 99 16.8
## 2 1 489 83.2
# Balance the training classes with ROSE (synthetic resampling).
set.seed(123)
data_train_bal <- ROSE(Status ~ ., data = data_train, seed = 123)$data
# ROSE returns Status with its levels reordered ("1" before "0"). Restore the
# level order of the original training data so models trained on the balanced
# set predict factors that line up with data_test$Status and caret keeps the
# same positive class everywhere.
data_train_bal$Status <- factor(
  data_train_bal$Status,
  levels = levels(data_train$Status)
)
# Class distribution after balancing.
data_train_bal %>%
  count(Status) %>%
  mutate(proporsi = round(n / sum(n) * 100, 2))
## Status n proporsi
## 1 1 1186 50.47
## 2 0 1164 49.53
# ============================
# 1. SVM (Unbalanced) Training
# ============================
# Radial-kernel SVM fitted on the original (imbalanced) training data.
# probability = TRUE is requested because class probabilities are extracted
# from predict() later for the ROC curve.
svm_imbal <- svm(
  Status ~ .,
  data = data_train,
  probability = TRUE,
  kernel = "radial",
  gamma = 0.1,
  cost = 1
)
# ============================
# 2. Predictions on the Training Data
# ============================
pred_svm_train <- predict(object = svm_imbal, newdata = data_train)
conf_svm_train <- confusionMatrix(
  data = pred_svm_train,
  reference = data_train$Status
)
print("=== SVM Training (Unbalanced) ===")
## [1] "=== SVM Training (Unbalanced) ==="
print(conf_svm_train)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 374 44
## 1 39 1893
##
## Accuracy : 0.9647
## 95% CI : (0.9564, 0.9718)
## No Information Rate : 0.8243
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8787
##
## Mcnemar's Test P-Value : 0.6606
##
## Sensitivity : 0.9056
## Specificity : 0.9773
## Pos Pred Value : 0.8947
## Neg Pred Value : 0.9798
## Prevalence : 0.1757
## Detection Rate : 0.1591
## Detection Prevalence : 0.1779
## Balanced Accuracy : 0.9414
##
## 'Positive' Class : 0
##
# ============================
# 3. Predictions on the Testing Data
# ============================
# Score the held-out rows with the unbalanced model and tabulate the result.
pred_svm_test <- predict(object = svm_imbal, newdata = data_test)
conf_svm_test <- confusionMatrix(
  data = pred_svm_test,
  reference = data_test$Status
)
print("=== SVM Testing (Unbalanced) ===")
## [1] "=== SVM Testing (Unbalanced) ==="
print(conf_svm_test)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 88 28
## 1 11 461
##
## Accuracy : 0.9337
## 95% CI : (0.9104, 0.9524)
## No Information Rate : 0.8316
## P-Value [Acc > NIR] : 1.538e-13
##
## Kappa : 0.7783
##
## Mcnemar's Test P-Value : 0.01041
##
## Sensitivity : 0.8889
## Specificity : 0.9427
## Pos Pred Value : 0.7586
## Neg Pred Value : 0.9767
## Prevalence : 0.1684
## Detection Rate : 0.1497
## Detection Prevalence : 0.1973
## Balanced Accuracy : 0.9158
##
## 'Positive' Class : 0
##
# ============================
# 4. Precision, Recall, F1 per kelas (Testing)
# ============================
# ===========================================
# === METRIK LENGKAP UNTUK DATA TRAINING ====
# ===========================================
cat("\n===== METRICS SVM (Training) =====\n")
##
## ===== METRICS SVM (Training) =====
# Reuse the training confusion matrix computed earlier.
cm_train <- conf_svm_train
# Accuracy
acc_train <- cm_train$overall["Accuracy"]
# Per-class metrics from caret. The confusion matrix is computed once per
# positive class instead of once per metric.
cm_train_pos0 <- confusionMatrix(pred_svm_train, data_train$Status, positive = "0")
cm_train_pos1 <- confusionMatrix(pred_svm_train, data_train$Status, positive = "1")
prec_train_0 <- cm_train_pos0$byClass["Pos Pred Value"]
rec_train_0 <- cm_train_pos0$byClass["Sensitivity"]
f1_train_0 <- cm_train_pos0$byClass["F1"]
prec_train_1 <- cm_train_pos1$byClass["Pos Pred Value"]
rec_train_1 <- cm_train_pos1$byClass["Sensitivity"]
f1_train_1 <- cm_train_pos1$byClass["F1"]
# One row per class: precision, recall and F1.
hasil_train <- data.frame(
  Class = c("0", "1"),
  Precision = c(prec_train_0, prec_train_1),
  Recall = c(rec_train_0, rec_train_1),
  F1 = c(f1_train_0, f1_train_1)
)
cat("Accuracy (Training):", round(acc_train, 4), "\n")
## Accuracy (Training): 0.9647
print(hasil_train)
## Class Precision Recall F1
## 1 0 0.8947368 0.9055690 0.9001203
## 2 1 0.9798137 0.9772845 0.9785474
# ===========================================
# === METRIK LENGKAP UNTUK DATA TESTING =====
# ===========================================
cat("\n===== METRICS SVM (Testing) =====\n")
##
## ===== METRICS SVM (Testing) =====
# Reuse the testing confusion matrix computed earlier.
cm_test <- conf_svm_test
# Accuracy
acc_test <- cm_test$overall["Accuracy"]
# Per-class metrics. The confusion matrix is computed once per positive class
# instead of once per metric.
cm_test_pos0 <- confusionMatrix(pred_svm_test, data_test$Status, positive = "0")
cm_test_pos1 <- confusionMatrix(pred_svm_test, data_test$Status, positive = "1")
prec_test_0 <- cm_test_pos0$byClass["Pos Pred Value"]
rec_test_0 <- cm_test_pos0$byClass["Sensitivity"]
f1_test_0 <- cm_test_pos0$byClass["F1"]
prec_test_1 <- cm_test_pos1$byClass["Pos Pred Value"]
rec_test_1 <- cm_test_pos1$byClass["Sensitivity"]
f1_test_1 <- cm_test_pos1$byClass["F1"]
# One row per class: precision, recall and F1.
hasil_test <- data.frame(
  Class = c("0", "1"),
  Precision = c(prec_test_0, prec_test_1),
  Recall = c(rec_test_0, rec_test_1),
  F1 = c(f1_test_0, f1_test_1)
)
cat("Accuracy (Testing):", round(acc_test, 4), "\n")
## Accuracy (Testing): 0.9337
print(hasil_test)
## Class Precision Recall F1
## 1 0 0.7586207 0.8888889 0.8186047
## 2 1 0.9766949 0.9427403 0.9594173
# ============================
# 5. Plot Confusion Matrix (Testing)
# ============================
# Heat map of the testing confusion matrix with the cell counts printed on top.
cm_imbal <- conf_svm_test$table
cm_imbal_melt <- melt(cm_imbal)
ggplot(cm_imbal_melt, aes(Reference, Prediction, fill = value)) +
  geom_tile() +
  geom_text(aes(label = value), size = 6, color = "white") +
  scale_fill_gradient(low = "blue", high = "red") +
  ggtitle("Confusion Matrix – SVM Unbalanced") +
  theme_minimal(base_size = 14)
# ============================
# 6. ROC Curve & AUC (Testing)
# ============================
# Predicted probability of class "1" for every test row.
prob_imbal <- attr(
  predict(svm_imbal, newdata = data_test, probability = TRUE),
  "probabilities"
)[, "1"]
# State the control/case levels and the direction explicitly instead of letting
# roc() pick them from the data, so the AUC always refers to "higher
# probability means class 1" and cannot be flipped by auto-detection.
roc_imbal <- roc(
  data_test$Status,
  prob_imbal,
  levels = c("0", "1"),
  direction = "<"
)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc_imbal, main = "ROC Curve – SVM Unbalanced", col = "darkgreen", lwd = 2)
auc_value <- auc(roc_imbal)
cat("\nAUC (SVM Unbalanced, Testing):", round(auc_value, 4), "\n")
##
## AUC (SVM Unbalanced, Testing): 0.9669
# Interpretasi SVM Unbalanced: Model Support Vector Machine (SVM) pada kondisi data unbalanced menunjukkan performa yang sangat baik, baik pada data training maupun data testing. Ketidakseimbangan kelas tampak dari proporsi kelas positif (kelas 0, sekitar 17%) yang jauh lebih sedikit dibanding kelas negatif (kelas 1, sekitar 83%), sehingga terdapat risiko model hanya “mengikuti mayoritas”. Namun, hasil evaluasi menunjukkan bahwa SVM tetap mampu melakukan pemisahan kelas secara akurat. Pada data testing, dengan kelas 0 sebagai kelas positif (sesuai keluaran caret), confusion matrix memperlihatkan bahwa model mampu mengklasifikasikan 88 data sebagai kelas 0 secara tepat (True Positive) dan 461 data sebagai kelas 1 secara tepat (True Negative). Meskipun demikian, masih terdapat 28 kasus False Positive (kelas 1 yang diprediksi sebagai kelas 0) dan 11 kasus False Negative (kelas 0 yang diprediksi sebagai kelas 1), yang berarti prediksi untuk kelas minoritas masih lebih sering keliru. Hal tersebut selaras dengan nilai metrik klasifikasi yang diperoleh, di mana akurasi mencapai 93,37% dan balanced accuracy sebesar 91,58%, menunjukkan bahwa meskipun kelas tidak seimbang, performa model tetap stabil untuk kedua kelas. Sensitivity atau kemampuan mendeteksi kelas 0 sebesar 88,89% sedikit lebih rendah dibanding specificity (94,27%), yang berarti model lebih optimal dalam mengenali kelas 1 dibanding kelas 0. Nilai Kappa 0,7783 mengindikasikan tingkat kesepakatan yang kuat antara prediksi model dan kondisi aktual, sehingga performa model tidak hanya terlihat tinggi akibat dominasi kelas mayoritas. Selain itu, nilai PPV sebesar 75,86% dan NPV sebesar 97,67% menunjukkan bahwa prediksi kelas 1 sangat jarang salah, sedangkan prediksi kelas 0 masih mengandung kesalahan yang relatif lebih tinggi.
# Hasil ini diperkuat oleh visualisasi ROC Curve yang menunjukkan kurva berada jauh di atas garis diagonal pembeda acak, dengan AUC sebesar 0,9669. Nilai AUC yang mendekati 1 mencerminkan kemampuan diskriminasi model yang sangat kuat dalam membedakan kelas 0 dan kelas 1. Dengan demikian, meskipun model dilatih pada data yang tidak seimbang, SVM tetap menunjukkan performa yang superior baik secara statistik maupun visual berdasarkan hasil kurva ROC. Secara keseluruhan, SVM Unbalanced terbukti mampu melakukan klasifikasi dengan baik pada data training maupun testing, namun kesalahannya masih terkonsentrasi pada prediksi kelas minoritas (kelas 0), yang tercermin dari 28 false positive dibanding 11 false negative serta presisi kelas 0 yang hanya 75,86%. Model ini sangat layak digunakan, namun performanya masih dapat ditingkatkan apabila dilakukan penanganan ketidakseimbangan kelas, seperti SMOTE, undersampling, atau penyesuaian cost-sensitive agar presisi dan sensitivitas terhadap kelas minoritas semakin meningkat.
# ============================
# 1. SVM (Balanced) Training
# ============================
# Same radial-kernel SVM settings as the unbalanced model, fitted on the
# ROSE-balanced training data.
svm_bal <- svm(
  Status ~ .,
  data = data_train_bal,
  probability = TRUE,
  kernel = "radial",
  gamma = 0.1,
  cost = 1
)
# ============================
# 2. Predictions on the Training Data
# ============================
pred_svm_train_bal <- predict(object = svm_bal, newdata = data_train_bal)
conf_svm_train_bal <- confusionMatrix(
  data = pred_svm_train_bal,
  reference = data_train_bal$Status
)
print("=== SVM Training (Balanced) ===")
## [1] "=== SVM Training (Balanced) ==="
print(conf_svm_train_bal)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 0
## 1 1135 4
## 0 51 1160
##
## Accuracy : 0.9766
## 95% CI : (0.9696, 0.9823)
## No Information Rate : 0.5047
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9532
##
## Mcnemar's Test P-Value : 5.552e-10
##
## Sensitivity : 0.9570
## Specificity : 0.9966
## Pos Pred Value : 0.9965
## Neg Pred Value : 0.9579
## Prevalence : 0.5047
## Detection Rate : 0.4830
## Detection Prevalence : 0.4847
## Balanced Accuracy : 0.9768
##
## 'Positive' Class : 1
##
# ============================
# Precision, Recall, F1 per class (Training)
# ============================
# One confusion matrix per positive class; precision is caret's
# "Pos Pred Value" and recall is its "Sensitivity".
cm0_train_bal <- confusionMatrix(
  data = pred_svm_train_bal,
  reference = data_train_bal$Status,
  positive = "0"
)
cm1_train_bal <- confusionMatrix(
  data = pred_svm_train_bal,
  reference = data_train_bal$Status,
  positive = "1"
)
precision_0_train <- cm0_train_bal[["byClass"]]["Pos Pred Value"]
precision_1_train <- cm1_train_bal[["byClass"]]["Pos Pred Value"]
recall_0_train <- cm0_train_bal[["byClass"]]["Sensitivity"]
recall_1_train <- cm1_train_bal[["byClass"]]["Sensitivity"]
f1_0_train <- cm0_train_bal[["byClass"]]["F1"]
f1_1_train <- cm1_train_bal[["byClass"]]["F1"]
# One row per class.
hasil_per_kelas_train_bal <- data.frame(
  Class = c("0", "1"),
  Precision = c(precision_0_train, precision_1_train),
  Recall = c(recall_0_train, recall_1_train),
  F1 = c(f1_0_train, f1_1_train)
)
print("=== Precision, Recall, F1 per Kelas (Training) ===")
## [1] "=== Precision, Recall, F1 per Kelas (Training) ==="
print(hasil_per_kelas_train_bal)
## Class Precision Recall F1
## 1 0 0.9578860 0.9965636 0.9768421
## 2 1 0.9964881 0.9569983 0.9763441
# ============================
# 3. Predictions on the Testing Data
# ============================
pred_svm_test_bal <- predict(svm_bal, newdata = data_test)
# The balanced model can return its factor levels in a different order than
# data_test$Status. Align them explicitly so confusionMatrix() does not have to
# refactor the predictions (and warn) on every call.
pred_svm_test_bal <- factor(
  pred_svm_test_bal,
  levels = levels(data_test$Status)
)
conf_svm_test_bal <- confusionMatrix(pred_svm_test_bal, data_test$Status)
## Warning in confusionMatrix.default(pred_svm_test_bal, data_test$Status): Levels
## are not in the same order for reference and data. Refactoring data to match.
print("=== SVM Testing (Balanced) ===")
## [1] "=== SVM Testing (Balanced) ==="
print(conf_svm_test_bal)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 98 90
## 1 1 399
##
## Accuracy : 0.8452
## 95% CI : (0.8134, 0.8735)
## No Information Rate : 0.8316
## P-Value [Acc > NIR] : 0.2053
##
## Kappa : 0.5932
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.9899
## Specificity : 0.8160
## Pos Pred Value : 0.5213
## Neg Pred Value : 0.9975
## Prevalence : 0.1684
## Detection Rate : 0.1667
## Detection Prevalence : 0.3197
## Balanced Accuracy : 0.9029
##
## 'Positive' Class : 0
##
# ============================
# 4. Precision, Recall, F1 per kelas (Testing)
# ============================
cm0_bal <- confusionMatrix(pred_svm_test_bal, data_test$Status, positive = "0")
## Warning in confusionMatrix.default(pred_svm_test_bal, data_test$Status, :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
precision_0_bal <- cm0_bal$byClass["Pos Pred Value"]
recall_0_bal <- cm0_bal$byClass["Sensitivity"]
f1_0_bal <- cm0_bal$byClass["F1"]
cm1_bal <- confusionMatrix(pred_svm_test_bal, data_test$Status, positive = "1")
## Warning in confusionMatrix.default(pred_svm_test_bal, data_test$Status, :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
precision_1_bal <- cm1_bal$byClass["Pos Pred Value"]
recall_1_bal <- cm1_bal$byClass["Sensitivity"]
f1_1_bal <- cm1_bal$byClass["F1"]
hasil_per_kelas_bal <- data.frame(
Class = c("0", "1"),
Precision = c(precision_0_bal, precision_1_bal),
Recall = c(recall_0_bal, recall_1_bal),
F1 = c(f1_0_bal, f1_1_bal)
)
print("=== Precision, Recall, F1 per Kelas (Testing) ===")
## [1] "=== Precision, Recall, F1 per Kelas (Testing) ==="
print(hasil_per_kelas_bal)
## Class Precision Recall F1
## 1 0 0.5212766 0.9898990 0.6829268
## 2 1 0.9975000 0.8159509 0.8976378
# ============================
# 5. Plot Confusion Matrix (Testing)
# ============================
# Heat map of the testing confusion matrix with the cell counts printed on top.
cm_bal <- conf_svm_test_bal$table
cm_bal_melt <- melt(cm_bal)
ggplot(cm_bal_melt, aes(Reference, Prediction, fill = value)) +
  geom_tile() +
  geom_text(aes(label = value), size = 6, color = "white") +
  scale_fill_gradient(low = "blue", high = "red") +
  ggtitle("Confusion Matrix – SVM Balanced") +
  theme_minimal(base_size = 14)
# ============================
# 6. ROC Curve & AUC (Testing)
# ============================
# Predicted probability of class "1" for every test row.
prob_bal <- attr(
  predict(svm_bal, newdata = data_test, probability = TRUE),
  "probabilities"
)[, "1"]
# State the control/case levels and the direction explicitly instead of letting
# roc() pick them from the data, so the AUC always refers to "higher
# probability means class 1" and cannot be flipped by auto-detection.
roc_bal <- roc(
  data_test$Status,
  prob_bal,
  levels = c("0", "1"),
  direction = "<"
)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc_bal, main = "ROC Curve – SVM Balanced", col = "darkblue", lwd = 2)
auc_value_bal <- auc(roc_bal)
cat("\nAUC (SVM Balanced, Testing):", round(auc_value_bal, 4), "\n")
##
## AUC (SVM Balanced, Testing): 0.9658
#interpretasi Model SVM Balanced dibangun dengan tujuan mengatasi ketidakseimbangan kelas yang terdapat pada dataset sehingga model tidak terlalu condong pada kelas mayoritas. Pada proses pelatihan (training), model menunjukkan performa yang sangat baik, ditunjukkan oleh nilai akurasi 97,66% dan balanced accuracy 97,68%. Confusion matrix pada data training memperlihatkan bahwa model mampu mengklasifikasikan 1.135 observasi kelas 1 dan 1.160 observasi kelas 0 secara benar, dengan jumlah kesalahan klasifikasi yang sangat kecil yaitu hanya 4 kasus false positive dan 51 kasus false negative. Nilai sensitivity sebesar 95,70% menandakan kemampuan model dalam menangkap kelas 1 dengan sangat baik, sedangkan specificity sebesar 99,66% menunjukkan kemampuan hampir sempurna dalam mengenali kelas 0. Nilai Kappa sebesar 0.9532 menggambarkan tingkat kesesuaian prediksi dan data aktual yang sangat kuat, sehingga performa tinggi model tidak hanya disebabkan oleh dominasi kelas tertentu. Secara keseluruhan, hasil pelatihan menunjukkan bahwa penyeimbangan kelas berhasil meningkatkan akurasi sekaligus menjaga kestabilan prediksi antar kelas.
Namun ketika model diuji pada data testing, performanya mengalami penurunan yang cukup signifikan jika dibandingkan dengan data training. Akurasi turun menjadi 84,52% dengan balanced accuracy sebesar 90,29%. Confusion matrix pada data testing memperlihatkan 399 observasi kelas 1 dan 98 observasi kelas 0 berhasil diprediksi dengan benar, tetapi terdapat 90 observasi kelas 1 yang salah diprediksi sebagai kelas 0 (false positive) dan 1 observasi kelas 0 yang salah diprediksi sebagai kelas 1 (false negative). Sensitivity mencapai 98,99% yang berarti model hampir selalu mengenali kelas 0 (kelas positif), tetapi specificity turun menjadi 81,60% sehingga kemampuan model mengenali kelas 1 menurun cukup tajam. Hal ini juga tercermin dari nilai Pos Pred Value (Precision kelas 0) yang hanya 52,13%, mengindikasikan bahwa prediksi untuk kelas 0 masih sering salah meskipun Neg Pred Value (Precision kelas 1) sangat tinggi yaitu 99,75%. Nilai Kappa yang turun menjadi 0.5932 menunjukkan bahwa kestabilan model pada data testing hanya berada pada tingkat sedang, sehingga belum sebaik performa pada data training.
Visualisasi ROC Curve pada data testing menunjukkan bahwa model tetap memiliki kemampuan diskriminasi yang sangat baik, ditunjukkan oleh nilai AUC sebesar 0.9658 yang berada jauh di atas garis diagonal acak. Hal ini mengindikasikan bahwa meskipun performa prediksi kelas 0 mengalami penurunan pada data testing, model SVM Balanced tetap mampu membedakan kelas dengan sangat baik secara probabilistik. Secara keseluruhan, model SVM Balanced berhasil meningkatkan sensitivitas terhadap kelas minoritas, sehingga menghasilkan deteksi yang lebih merata antar kelas dibandingkan model SVM Unbalanced. Namun, performa model pada data testing menunjukkan gejala overfitting, karena hasil training sangat tinggi sementara testing tidak sebaik itu, terutama pada prediksi kelas 0. Dengan demikian, meskipun balancing memberikan peningkatan kemampuan dalam mengenali kelas minoritas, penyempurnaan lebih lanjut tetap diperlukan, misalnya melalui tuning parameter, cross-validation yang lebih ketat, atau pendekatan balancing berbasis SMOTE maupun cost-sensitive learning untuk meningkatkan generalisasi model.
# ============================
# 1. Random Forest (Unbalanced Data) Training
# ============================
# Fit a 500-tree forest on the original (unbalanced) training set, trying
# 4 predictors per split and storing importance measures for later plots.
rf_imbal <- randomForest(
  Status ~ .,
  data = data_train,
  ntree = 500,
  mtry = 4,
  importance = TRUE
)

# ============================
# 2. Prediction on the Training Data
# ============================
# Predict the rows the model was fitted on and compare them with the
# training labels.
rf_pred_train <- predict(object = rf_imbal, newdata = data_train)
conf_rf_train <- confusionMatrix(
  data = rf_pred_train,
  reference = data_train$Status
)

print("=== Random Forest Training (Unbalanced) ===")
## [1] "=== Random Forest Training (Unbalanced) ==="
print(conf_rf_train)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 413 0
## 1 0 1937
##
## Accuracy : 1
## 95% CI : (0.9984, 1)
## No Information Rate : 0.8243
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.1757
## Detection Rate : 0.1757
## Detection Prevalence : 0.1757
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : 0
##
# Report the training accuracy rounded to four decimals.
cat(
  "\nAkurasi RF Training (Unbalanced):",
  round(conf_rf_train[["overall"]][["Accuracy"]], 4),
  "\n\n"
)
##
## Akurasi RF Training (Unbalanced): 1
# ============================
# 5. Precision, Recall, F1 per Class (Training)
# ============================
# The confusion matrix is computed twice, once per positive class, so that
# "Pos Pred Value" (precision), "Sensitivity" (recall) and "F1" can be read
# for each class in turn.

# --- Class 0 as the positive class ---
cm0_train <- confusionMatrix(rf_pred_train, data_train$Status, positive = "0")
precision_0_train <- cm0_train[["byClass"]]["Pos Pred Value"]
recall_0_train    <- cm0_train[["byClass"]]["Sensitivity"]
f1_0_train        <- cm0_train[["byClass"]]["F1"]

# --- Class 1 as the positive class ---
cm1_train <- confusionMatrix(rf_pred_train, data_train$Status, positive = "1")
precision_1_train <- cm1_train[["byClass"]]["Pos Pred Value"]
recall_1_train    <- cm1_train[["byClass"]]["Sensitivity"]
f1_1_train        <- cm1_train[["byClass"]]["F1"]

# One row per class.
hasil_train_rf <- data.frame(
  Class     = c("0", "1"),
  Precision = c(precision_0_train, precision_1_train),
  Recall    = c(recall_0_train, recall_1_train),
  F1        = c(f1_0_train, f1_1_train)
)

cat("\n=== Precision, Recall, F1 – Random Forest TRAINING (Unbalanced) ===\n")
##
## === Precision, Recall, F1 – Random Forest TRAINING (Unbalanced) ===
print(hasil_train_rf)
## Class Precision Recall F1
## 1 0 1 1 1
## 2 1 1 1 1
# ============================
# 3. Prediction on the Testing Data
# ============================
# Score the test set with the forest fitted on the unbalanced training data
# and compare the predictions with the test labels.
rf_pred_test <- predict(object = rf_imbal, newdata = data_test)
conf_rf_test <- confusionMatrix(
  data = rf_pred_test,
  reference = data_test$Status
)

print("=== Random Forest Testing (Unbalanced) ===")
## [1] "=== Random Forest Testing (Unbalanced) ==="
print(conf_rf_test)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 94 16
## 1 5 473
##
## Accuracy : 0.9643
## 95% CI : (0.9459, 0.9778)
## No Information Rate : 0.8316
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8779
##
## Mcnemar's Test P-Value : 0.0291
##
## Sensitivity : 0.9495
## Specificity : 0.9673
## Pos Pred Value : 0.8545
## Neg Pred Value : 0.9895
## Prevalence : 0.1684
## Detection Rate : 0.1599
## Detection Prevalence : 0.1871
## Balanced Accuracy : 0.9584
##
## 'Positive' Class : 0
##
# Report the test accuracy rounded to four decimals.
cat(
  "\nAkurasi RF Testing (Unbalanced):",
  round(conf_rf_test[["overall"]][["Accuracy"]], 4),
  "\n"
)
##
## Akurasi RF Testing (Unbalanced): 0.9643
# ============================
# 6. Precision, Recall, F1 per Class (Testing)
# ============================
# Same approach as for the training set: one confusion matrix per positive
# class, reading "Pos Pred Value" (precision), "Sensitivity" (recall), "F1".

# --- Class 0 as the positive class ---
cm0_test <- confusionMatrix(rf_pred_test, data_test$Status, positive = "0")
precision_0_test <- cm0_test[["byClass"]]["Pos Pred Value"]
recall_0_test    <- cm0_test[["byClass"]]["Sensitivity"]
f1_0_test        <- cm0_test[["byClass"]]["F1"]

# --- Class 1 as the positive class ---
cm1_test <- confusionMatrix(rf_pred_test, data_test$Status, positive = "1")
precision_1_test <- cm1_test[["byClass"]]["Pos Pred Value"]
recall_1_test    <- cm1_test[["byClass"]]["Sensitivity"]
f1_1_test        <- cm1_test[["byClass"]]["F1"]

# One row per class.
hasil_test_rf <- data.frame(
  Class     = c("0", "1"),
  Precision = c(precision_0_test, precision_1_test),
  Recall    = c(recall_0_test, recall_1_test),
  F1        = c(f1_0_test, f1_1_test)
)

cat("\n=== Precision, Recall, F1 – Random Forest TESTING (Unbalanced) ===\n")
##
## === Precision, Recall, F1 – Random Forest TESTING (Unbalanced) ===
print(hasil_test_rf)
## Class Precision Recall F1
## 1 0 0.8545455 0.9494949 0.8995215
## 2 1 0.9895397 0.9672802 0.9782834
# ============================
# 4. Plot Variable Importance
# ============================
# Built-in dot chart of the importance measures stored in the unbalanced model.
varImpPlot(
  rf_imbal,
  main = "Random Forest Variable Importance (unbalanced Data)",
  pch = 19
)

# For a classification forest fitted with importance = TRUE, importance()
# returns one column per class followed by MeanDecreaseAccuracy and
# MeanDecreaseGini. Column 1 is therefore the importance for the first class
# only, so the overall measure is selected by name instead of by position.
rf_varimp <- importance(rf_imbal)
rf_varimp_df <- data.frame(
  Variable = rownames(rf_varimp),
  Importance = rf_varimp[, "MeanDecreaseGini"]
)

# Sort from most to least important.
rf_varimp_df <- rf_varimp_df[order(rf_varimp_df$Importance, decreasing = TRUE), ]

# Horizontal bar chart; reorder() keeps the bars sorted by importance.
ggplot(rf_varimp_df, aes(x = reorder(Variable, Importance), y = Importance)) +
  geom_bar(stat = "identity", fill = "salmon") +
  coord_flip() +
  labs(
    title = "Random Forest Variable Importance (Unbalanced Data)",
    x = "Variable",
    y = "Importance"
  ) +
  theme_minimal(base_size = 14)
#interpretasi Berdasarkan hasil analisis menggunakan algoritma Random Forest pada data yang masih dalam kondisi unbalanced, model menunjukkan performa yang sangat tinggi saat proses pelatihan (training). Hal ini terlihat dari confusion matrix dimana model berhasil mengklasifikasikan seluruh data dengan benar, tanpa kesalahan prediksi baik untuk kelas 0 maupun kelas 1. Nilai akurasi pada training mencapai 100%, disertai nilai Kappa sebesar 1, yang menandakan adanya kesesuaian sempurna antara prediksi model dan kondisi aktual. Meskipun hasil ini tampak sangat baik, performa yang terlalu sempurna berpotensi menandakan overfitting, yaitu model terlalu menyesuaikan diri dengan data pelatihan sehingga mungkin tidak mampu bekerja sebaik itu pada data baru.
Ketika diuji menggunakan data testing, performa Random Forest menurun tetapi tetap berada dalam kategori sangat baik. Confusion matrix pada data testing menunjukkan bahwa model memprediksi kelas 0 dengan benar sebanyak 94 kasus dan salah sebanyak 5 kasus, sementara pada kelas 1 model memprediksi benar sebanyak 473 kasus dan salah sebanyak 16 kasus. Dari hasil ini diperoleh nilai akurasi sebesar 0.9643 (96,43%), dengan nilai Kappa 0.8779, yang berarti tingkat kesepakatan antara prediksi dan kondisi aktual masih sangat tinggi. Sensitivitas model terhadap kelas positif (kelas 0) mencapai 94,95%, sementara spesifisitas model terhadap kelas negatif (kelas 1) adalah 96,73%, menunjukkan kemampuan klasifikasi yang seimbang di antara kedua kelas meskipun dataset tidak seimbang dari awalnya.
Selain itu, hasil plot variable importance memberikan gambaran mengenai variabel yang paling berpengaruh dalam proses prediksi Random Forest pada dataset unbalanced. Berdasarkan grafik MeanDecreaseAccuracy dan MeanDecreaseGini, variabel Alcohol ditemukan sebagai faktor paling penting dalam menentukan hasil prediksi status kesehatan negara. Variabel penting berikutnya meliputi Income composition of resources, thinness 5-9 years, Life expectancy, dan Adult mortality. Sementara variabel dengan kontribusi paling rendah antara lain HIV/AIDS dan Measles. Pola ini mengindikasikan bahwa faktor gaya hidup dan kondisi demografi seperti konsumsi alkohol, tingkat ekonomi, dan masalah nutrisi anak memiliki pengaruh besar dalam pembentukan model. Visualisasi barplot memperkuat hasil tersebut dengan menunjukkan jarak kontribusi yang sangat mencolok antara variabel Alcohol dengan variabel lain.
Secara keseluruhan, Random Forest pada data unbalanced berhasil memberikan performa prediksi yang sangat baik, khususnya pada data testing. Namun akurasi yang sempurna pada data training memperkuat indikasi bahwa model mengalami overfitting, sehingga meskipun hasil testing tetap tinggi, peningkatan generalisasi dapat dicapai bila dilakukan penyeimbangan kelas untuk memperoleh ketepatan model yang lebih stabil dan konsisten pada berbagai jenis data.
# ============================
# 1. Random Forest (Balanced Data) Training
# ============================
# Same forest settings as the unbalanced model (500 trees, 4 predictors per
# split, importance stored), fitted on the balanced training set.
rf_bal <- randomForest(
  Status ~ .,
  data = data_train_bal,
  ntree = 500,
  mtry = 4,
  importance = TRUE
)

# ============================
# 2. Prediction on the Training Data
# ============================
# Predict the balanced training rows and compare them with their labels.
rf_pred_train_bal <- predict(object = rf_bal, newdata = data_train_bal)
conf_rf_train_bal <- confusionMatrix(
  data = rf_pred_train_bal,
  reference = data_train_bal$Status
)

print("=== Random Forest Training (Balanced) ===")
## [1] "=== Random Forest Training (Balanced) ==="
print(conf_rf_train_bal)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 0
## 1 1186 0
## 0 0 1164
##
## Accuracy : 1
## 95% CI : (0.9984, 1)
## No Information Rate : 0.5047
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.5047
## Detection Rate : 0.5047
## Detection Prevalence : 0.5047
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : 1
##
# Report the balanced-training accuracy rounded to four decimals.
cat(
  "\nAkurasi RF Training (Balanced):",
  round(conf_rf_train_bal[["overall"]][["Accuracy"]], 4),
  "\n\n"
)
##
## Akurasi RF Training (Balanced): 1
# Per-class precision / recall / F1 on the balanced training set, read from
# one confusion matrix per positive class.

# --- Class 0 as the positive class ---
cm0_train_bal <- confusionMatrix(
  rf_pred_train_bal, data_train_bal$Status,
  positive = "0"
)
precision_0_train_bal <- cm0_train_bal[["byClass"]]["Pos Pred Value"]
recall_0_train_bal    <- cm0_train_bal[["byClass"]]["Sensitivity"]
f1_0_train_bal        <- cm0_train_bal[["byClass"]]["F1"]

# --- Class 1 as the positive class ---
cm1_train_bal <- confusionMatrix(
  rf_pred_train_bal, data_train_bal$Status,
  positive = "1"
)
precision_1_train_bal <- cm1_train_bal[["byClass"]]["Pos Pred Value"]
recall_1_train_bal    <- cm1_train_bal[["byClass"]]["Sensitivity"]
f1_1_train_bal        <- cm1_train_bal[["byClass"]]["F1"]

# One row per class.
hasil_train_rf_bal <- data.frame(
  Class     = c("0", "1"),
  Precision = c(precision_0_train_bal, precision_1_train_bal),
  Recall    = c(recall_0_train_bal, recall_1_train_bal),
  F1        = c(f1_0_train_bal, f1_1_train_bal)
)

cat("\n=== Precision, Recall, F1 – Random Forest TRAINING (Balanced) ===\n")
##
## === Precision, Recall, F1 – Random Forest TRAINING (Balanced) ===
print(hasil_train_rf_bal)
## Class Precision Recall F1
## 1 0 1 1 1
## 2 1 1 1 1
# ============================
# 3. Prediction on the Testing Data
# ============================
# Predictions from the balanced model carry their factor levels in a
# different order than data_test$Status (the balanced training matrix lists
# "1" before "0"). Every confusionMatrix() call on them would therefore warn
# and re-level the data implicitly. Re-level once here, explicitly, so all
# three calls below compare factors with identical level order. The counts
# and metrics are unchanged.
rf_pred_test_bal <- factor(
  predict(rf_bal, newdata = data_test),
  levels = levels(data_test$Status)
)
conf_rf_test_bal <- confusionMatrix(rf_pred_test_bal, data_test$Status)
print("=== Random Forest Testing (Balanced) ===")
## [1] "=== Random Forest Testing (Balanced) ==="
print(conf_rf_test_bal)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 99 133
## 1 0 356
##
## Accuracy : 0.7738
## 95% CI : (0.7378, 0.807)
## No Information Rate : 0.8316
## P-Value [Acc > NIR] : 0.9999
##
## Kappa : 0.4741
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 1.0000
## Specificity : 0.7280
## Pos Pred Value : 0.4267
## Neg Pred Value : 1.0000
## Prevalence : 0.1684
## Detection Rate : 0.1684
## Detection Prevalence : 0.3946
## Balanced Accuracy : 0.8640
##
## 'Positive' Class : 0
##
cat("\nAkurasi RF Testing (Balanced):", round(conf_rf_test_bal$overall["Accuracy"], 4), "\n")
##
## Akurasi RF Testing (Balanced): 0.7738

# Per-class precision ("Pos Pred Value"), recall ("Sensitivity") and F1,
# read from one confusion matrix per positive class.
# --- Class 0 ---
cm0_test_bal <- confusionMatrix(rf_pred_test_bal, data_test$Status, positive = "0")
precision_0_test_bal <- cm0_test_bal$byClass["Pos Pred Value"]
recall_0_test_bal <- cm0_test_bal$byClass["Sensitivity"]
f1_0_test_bal <- cm0_test_bal$byClass["F1"]
# --- Class 1 ---
cm1_test_bal <- confusionMatrix(rf_pred_test_bal, data_test$Status, positive = "1")
precision_1_test_bal <- cm1_test_bal$byClass["Pos Pred Value"]
recall_1_test_bal <- cm1_test_bal$byClass["Sensitivity"]
f1_1_test_bal <- cm1_test_bal$byClass["F1"]

# One row per class.
hasil_test_rf_bal <- data.frame(
  Class = c("0", "1"),
  Precision = c(precision_0_test_bal, precision_1_test_bal),
  Recall = c(recall_0_test_bal, recall_1_test_bal),
  F1 = c(f1_0_test_bal, f1_1_test_bal)
)
cat("\n=== Precision, Recall, F1 – Random Forest TESTING (Balanced) ===\n")
##
## === Precision, Recall, F1 – Random Forest TESTING (Balanced) ===
print(hasil_test_rf_bal)
## Class Precision Recall F1
## 1 0 0.4267241 1.0000000 0.5981873
## 2 1 1.0000000 0.7280164 0.8426036
# ============================
# 4. Plot Variable Importance
# ============================
# Built-in dot chart for the balanced model. This previously plotted
# rf_imbal under a "Balanced Data" title, so it repeated the unbalanced chart.
varImpPlot(
  rf_bal,
  main = "Random Forest Variable Importance (Balanced Data)",
  pch = 19
)

# For a classification forest fitted with importance = TRUE, importance()
# returns one column per class followed by MeanDecreaseAccuracy and
# MeanDecreaseGini. Column 1 is the importance for the first class only, so
# MeanDecreaseGini is selected by name rather than by position.
rf_varimp_bal <- importance(rf_bal)
rf_varimp_df_bal <- data.frame(
  Variable = rownames(rf_varimp_bal),
  Importance = rf_varimp_bal[, "MeanDecreaseGini"]
)

# Sort from most to least important.
rf_varimp_df_bal <- rf_varimp_df_bal[order(rf_varimp_df_bal$Importance, decreasing = TRUE), ]

# Horizontal bar chart; reorder() keeps the bars sorted by importance.
ggplot(rf_varimp_df_bal, aes(x = reorder(Variable, Importance), y = Importance)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +
  labs(
    title = "Random Forest Variable Importance (Balanced Data)",
    x = "Variable",
    y = "Importance"
  ) +
  theme_minimal(base_size = 14)
#interpretasi Model Random Forest yang dibangun menggunakan data yang telah diseimbangkan (balanced) menunjukkan performa sempurna pada data training, dengan akurasi 100% dan tanpa satu pun kesalahan klasifikasi. Hal ini mengindikasikan bahwa model sangat mampu mempelajari pola pada data hasil balancing, namun performa yang terlalu sempurna seperti ini juga menjadi tanda kuat bahwa model mengalami overfitting terhadap data training. Ketika diuji pada data testing yang tidak dibalancing, performanya menurun cukup signifikan dengan akurasi hanya sekitar 77%. Meskipun model masih mampu mendeteksi kelas positif dengan sangat baik (sensitivitas 100%), ia menunjukkan penurunan kemampuan dalam membedakan kelas lainnya, terlihat dari spesifisitas yang hanya sekitar 72%. Nilai presisi yang rendah menunjukkan banyak prediksi positif yang salah. Secara keseluruhan, hasil ini menunjukkan bahwa Random Forest dengan data balanced bekerja sangat baik pada data latih tetapi tidak mampu mempertahankan performanya pada data baru, menandakan bahwa teknik balancing menyebabkan model belajar pola yang tidak cukup representatif untuk data sebenarnya.
# Collect the test-set accuracy of the four fitted models.
# SVM
svm_acc_unbal <- conf_svm_test$overall["Accuracy"]
svm_acc_bal <- conf_svm_test_bal$overall["Accuracy"]
# Random Forest
rf_acc_unbal <- conf_rf_test$overall["Accuracy"]
rf_acc_bal <- conf_rf_test_bal$overall["Accuracy"]

# Summary table: one row per model, names dropped from the accuracy values.
accuracy_df_all <- data.frame(
  model = c(
    "SVM_Unbalanced",
    "SVM_Balanced",
    "RandomForest_Unbalanced",
    "RandomForest_Balanced"
  ),
  accuracy = as.numeric(
    c(svm_acc_unbal, svm_acc_bal, rf_acc_unbal, rf_acc_bal)
  )
)

# Best model first.
accuracy_df_all <- arrange(accuracy_df_all, desc(accuracy))
print(accuracy_df_all)
## model accuracy
## 1 RandomForest_Unbalanced 0.9642857
## 2 SVM_Unbalanced 0.9336735
## 3 SVM_Balanced 0.8452381
## 4 RandomForest_Balanced 0.7738095
#interpretasi Tabel ringkasan akurasi menunjukkan bahwa model Random Forest Unbalanced memiliki performa tertinggi dengan akurasi sekitar 96%, diikuti oleh SVM Unbalanced dengan akurasi sekitar 93%. Ketika kedua algoritma dilatih menggunakan data yang telah diseimbangkan, akurasinya menurun cukup signifikan. Model SVM Balanced hanya mencapai akurasi sekitar 84%, sementara Random Forest Balanced memiliki akurasi terendah, yaitu sekitar 77%. Pola ini menunjukkan bahwa proses balancing tidak selalu meningkatkan performa model, terutama ketika data asli sebenarnya lebih merepresentasikan kondisi nyata. Random Forest secara khusus terlihat sangat terdampak oleh balancing, yang membuatnya overfitting pada data training tetapi gagal mempertahankan performa saat diuji pada data testing. Secara keseluruhan, hasil ini mengindikasikan bahwa untuk dataset ini, model tanpa balancing justru memberikan generalisasi yang lebih baik dibanding model yang dilatih menggunakan data balanced.