library(readxl)
## Warning: package 'readxl' was built under R version 4.5.2
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(psych)
library(writexl)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
library(e1071)
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
## The following object is masked from 'package:dplyr':
##
## recode
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:psych':
##
## outlier
## The following object is masked from 'package:dplyr':
##
## combine
library(caret)
## Loading required package: lattice
library(iml)
## Warning: package 'iml' was built under R version 4.5.2
library(ROSE)
## Warning: package 'ROSE' was built under R version 4.5.2
## Loaded ROSE 0.0-4
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
# Read the life-expectancy workbook (path is relative to the working
# directory) and show the column names and storage types of the result.
Life_Expectancy_Data <- read_excel("Life Expectancy Data.xlsx")
str(Life_Expectancy_Data)
## tibble [2,938 × 18] (S3: tbl_df/tbl/data.frame)
## $ Status : chr [1:2938] "Developing" "Developing" "Developing" "Developing" ...
## $ Life expectancy : num [1:2938] 65 59.9 59.9 59.5 59.2 58.8 58.6 58.1 57.5 57.3 ...
## $ Adult Mortality : num [1:2938] 263 271 268 272 275 279 281 287 295 295 ...
## $ Alcohol : num [1:2938] 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.03 0.02 0.03 ...
## $ percentage expenditure : num [1:2938] 71.3 73.5 73.2 78.2 7.1 ...
## $ Hepatitis B : num [1:2938] 65 62 64 67 68 66 63 64 63 64 ...
## $ Measles : num [1:2938] 1154 492 430 2787 3013 ...
## $ BMI : num [1:2938] 19.1 18.6 18.1 17.6 17.2 16.7 16.2 15.7 15.2 14.7 ...
## $ under-five deaths : num [1:2938] 83 86 89 93 97 102 106 110 113 116 ...
## $ Polio : num [1:2938] 6 58 62 67 68 66 63 64 63 58 ...
## $ Total expenditure : num [1:2938] 8.16 8.18 8.13 8.52 7.87 9.2 9.42 8.33 6.73 7.43 ...
## $ Diphtheria : num [1:2938] 65 62 64 67 68 66 63 64 63 58 ...
## $ HIV/AIDS : num [1:2938] 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 ...
## $ GDP : num [1:2938] 584.3 612.7 631.7 670 63.5 ...
## $ Population : num [1:2938] 33736494 327582 31731688 3696958 2978599 ...
## $ thinness 5-9 years : num [1:2938] 17.3 17.5 17.7 18 18.2 18.4 18.7 18.9 19.1 19.3 ...
## $ Income composition of resources: num [1:2938] 0.479 0.476 0.47 0.463 0.454 0.448 0.434 0.433 0.415 0.405 ...
## $ Schooling : num [1:2938] 10.1 10 9.9 9.8 9.5 9.2 8.9 8.7 8.4 8.1 ...
# Per-column descriptive statistics (n, mean, sd, median, skew, ...);
# the n column below also reveals which variables contain missing values.
describe(Life_Expectancy_Data)
## vars n mean sd median
## Status* 1 2938 1.83 0.38 2.00
## Life expectancy 2 2928 69.22 9.52 72.10
## Adult Mortality 3 2928 164.80 124.29 144.00
## Alcohol 4 2744 4.60 4.05 3.76
## percentage expenditure 5 2938 738.25 1987.91 64.91
## Hepatitis B 6 2385 80.94 25.07 92.00
## Measles 7 2938 2419.59 11467.27 17.00
## BMI 8 2904 38.32 20.04 43.50
## under-five deaths 9 2938 42.04 160.45 4.00
## Polio 10 2919 82.55 23.43 93.00
## Total expenditure 11 2712 5.94 2.50 5.76
## Diphtheria 12 2919 82.32 23.72 93.00
## HIV/AIDS 13 2938 1.74 5.08 0.10
## GDP 14 2490 7483.16 14270.17 1766.95
## Population 15 2286 12753375.12 61012096.51 1386542.00
## thinness 5-9 years 16 2904 4.87 4.51 3.30
## Income composition of resources 17 2771 0.63 0.21 0.68
## Schooling 18 2775 11.99 3.36 12.30
## trimmed mad min max
## Status* 1.91 0.00 1.00 2.000000e+00
## Life expectancy 69.91 8.60 36.30 8.900000e+01
## Adult Mortality 150.51 112.68 1.00 7.230000e+02
## Alcohol 4.23 4.81 0.01 1.787000e+01
## percentage expenditure 230.74 96.24 0.00 1.947991e+04
## Hepatitis B 86.89 8.90 1.00 9.900000e+01
## Measles 286.08 25.20 0.00 2.121830e+05
## BMI 39.05 24.17 1.00 8.730000e+01
## under-five deaths 14.15 5.93 0.00 2.500000e+03
## Polio 88.05 8.90 3.00 9.900000e+01
## Total expenditure 5.85 2.36 0.37 1.760000e+01
## Diphtheria 87.99 8.90 2.00 9.900000e+01
## HIV/AIDS 0.54 0.00 0.10 5.060000e+01
## GDP 3751.73 2360.98 1.68 1.191727e+05
## Population 3953693.58 2012347.06 34.00 1.293859e+09
## thinness 5-9 years 4.15 3.41 0.10 2.860000e+01
## Income composition of resources 0.65 0.19 0.00 9.500000e-01
## Schooling 12.17 3.11 0.00 2.070000e+01
## range skew kurtosis se
## Status* 1.000000e+00 -1.72 0.95 0.01
## Life expectancy 5.270000e+01 -0.64 -0.24 0.18
## Adult Mortality 7.220000e+02 1.17 1.74 2.30
## Alcohol 1.786000e+01 0.59 -0.81 0.08
## percentage expenditure 1.947991e+04 4.65 26.51 36.68
## Hepatitis B 9.800000e+01 -1.93 2.76 0.51
## Measles 2.121830e+05 9.43 114.58 211.56
## BMI 8.630000e+01 -0.22 -1.29 0.37
## under-five deaths 2.500000e+03 9.49 109.49 2.96
## Polio 9.600000e+01 -2.10 3.76 0.43
## Total expenditure 1.723000e+01 0.62 1.15 0.05
## Diphtheria 9.700000e+01 -2.07 3.55 0.44
## HIV/AIDS 5.050000e+01 5.39 34.80 0.09
## GDP 1.191711e+05 3.20 12.29 285.98
## Population 1.293859e+09 15.90 297.09 1276079.80
## thinness 5-9 years 2.850000e+01 1.78 4.34 0.08
## Income composition of resources 9.500000e-01 -1.14 1.38 0.00
## Schooling 2.070000e+01 -0.60 0.88 0.06
# Table of NA counts: one row per column of the raw data.
missing_table <- data.frame(
  Variable = colnames(Life_Expectancy_Data),
  Missing = vapply(
    Life_Expectancy_Data,
    function(column) sum(is.na(column)),
    numeric(1)
  )
)
print(missing_table)
## Variable Missing
## Status Status 0
## Life expectancy Life expectancy 10
## Adult Mortality Adult Mortality 10
## Alcohol Alcohol 194
## percentage expenditure percentage expenditure 0
## Hepatitis B Hepatitis B 553
## Measles Measles 0
## BMI BMI 34
## under-five deaths under-five deaths 0
## Polio Polio 19
## Total expenditure Total expenditure 226
## Diphtheria Diphtheria 19
## HIV/AIDS HIV/AIDS 0
## GDP GDP 448
## Population Population 652
## thinness 5-9 years thinness 5-9 years 34
## Income composition of resources Income composition of resources 167
## Schooling Schooling 163
# Column names split by storage type. vapply() guarantees a logical mask
# (sapply() returns an empty list for a zero-column data frame, which would
# break the subsetting below).
num_vars <- names(Life_Expectancy_Data)[
  vapply(Life_Expectancy_Data, is.numeric, logical(1))
]
cat_vars <- names(Life_Expectancy_Data)[
  vapply(Life_Expectancy_Data, is.character, logical(1))
]
#' Fill missing values of a numeric vector with a central value.
#'
#' The median is used when the non-missing values are strongly skewed
#' (|skewness| > 1); otherwise the mean is used.
#'
#' @param x Numeric vector, possibly containing NA.
#' @return `x` with every NA replaced; returned unchanged when it has no NA
#'   or consists only of NA.
impute_numeric <- function(x) {
  missing_idx <- is.na(x)
  # Nothing to fill, or nothing to estimate a fill value from.
  if (!any(missing_idx) || all(missing_idx)) {
    return(x)
  }
  skew <- skewness(x, na.rm = TRUE)
  # The skewness can be NaN/NA (e.g. all observed values are equal). Treat
  # that as "not skewed" instead of letting `if` stop on a missing condition.
  strongly_skewed <- !is.na(skew) && abs(skew) > 1
  fill_value <- if (strongly_skewed) {
    median(x, na.rm = TRUE)
  } else {
    mean(x, na.rm = TRUE)
  }
  x[missing_idx] <- fill_value
  x
}
#' Fill missing values of a categorical vector with its most frequent value.
#'
#' @param x Character (or factor) vector, possibly containing NA.
#' @return `x` with every NA replaced by the modal value.
impute_categorical <- function(x) {
  level_counts <- table(x)
  # Position of the largest count within the frequency table.
  top_position <- order(level_counts, decreasing = TRUE)[1]
  x[is.na(x)] <- names(level_counts)[top_position]
  x
}
# Impute on a copy so the raw data stays available for the before/after report.
Life_Expectancy_Imputed <- Life_Expectancy_Data
# Numeric imputation
Life_Expectancy_Imputed[num_vars] <- lapply(
  Life_Expectancy_Imputed[num_vars],
  impute_numeric
)
# Categorical imputation
Life_Expectancy_Imputed[cat_vars] <- lapply(
  Life_Expectancy_Imputed[cat_vars],
  impute_categorical
)
cat("Missing value sebelum imputasi:\n")
## Missing value sebelum imputasi:
print(colSums(is.na(Life_Expectancy_Data)))
## Status Life expectancy
## 0 10
## Adult Mortality Alcohol
## 10 194
## percentage expenditure Hepatitis B
## 0 553
## Measles BMI
## 0 34
## under-five deaths Polio
## 0 19
## Total expenditure Diphtheria
## 226 19
## HIV/AIDS GDP
## 0 448
## Population thinness 5-9 years
## 652 34
## Income composition of resources Schooling
## 167 163
# Per-column NA counts after imputation (every count is expected to be zero).
cat("\nMissing value sesudah imputasi:\n")
##
## Missing value sesudah imputasi:
print(colSums(is.na(Life_Expectancy_Imputed)))
## Status Life expectancy
## 0 0
## Adult Mortality Alcohol
## 0 0
## percentage expenditure Hepatitis B
## 0 0
## Measles BMI
## 0 0
## under-five deaths Polio
## 0 0
## Total expenditure Diphtheria
## 0 0
## HIV/AIDS GDP
## 0 0
## Population thinness 5-9 years
## 0 0
## Income composition of resources Schooling
## 0 0
#' Count the values lying outside the 1.5 * IQR fences.
#'
#' @param x Numeric vector; NA values are ignored.
#' @return Number of elements below Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR.
cek_outlier <- function(x) {
  quartiles <- quantile(x, probs = c(0.25, 0.75), na.rm = TRUE, names = FALSE)
  iqr_width <- quartiles[2] - quartiles[1]
  lower_fence <- quartiles[1] - 1.5 * iqr_width
  upper_fence <- quartiles[2] + 1.5 * iqr_width
  is_outlier <- x < lower_fence | x > upper_fence
  sum(is_outlier, na.rm = TRUE)
}
# Outlier counts for every numeric column before any treatment.
num_vars <- names(Filter(is.numeric, Life_Expectancy_Imputed))
outlier_before <- data.frame(
  Variable = num_vars,
  Outlier_Before = unlist(lapply(Life_Expectancy_Imputed[num_vars], cek_outlier))
)
cat("=== OUTLIER SEBELUM PEMBERSIHAN ===\n")
## === OUTLIER SEBELUM PEMBERSIHAN ===
print(outlier_before)
## Variable Outlier_Before
## Life expectancy Life expectancy 17
## Adult Mortality Adult Mortality 86
## Alcohol Alcohol 3
## percentage expenditure percentage expenditure 389
## Hepatitis B Hepatitis B 322
## Measles Measles 542
## BMI BMI 0
## under-five deaths under-five deaths 394
## Polio Polio 279
## Total expenditure Total expenditure 51
## Diphtheria Diphtheria 298
## HIV/AIDS HIV/AIDS 542
## GDP GDP 445
## Population Population 452
## thinness 5-9 years thinness 5-9 years 99
## Income composition of resources Income composition of resources 130
## Schooling Schooling 77
# Long format: one (Variable, Value) row per numeric cell, so each variable
# can be drawn in its own facet.
df_before_long <- tidyr::pivot_longer(
  select(Life_Expectancy_Imputed, all_of(num_vars)),
  cols = everything(),
  names_to = "Variable",
  values_to = "Value"
)
# One boxplot panel per variable with independent scales; outliers in red.
ggplot(data = df_before_long, mapping = aes(x = "", y = Value)) +
  geom_boxplot(
    color = "darkgreen",
    fill = "lightgreen",
    outlier.size = 1,
    outlier.color = "red"
  ) +
  facet_wrap(~ Variable, ncol = 4, scales = "free") +
  labs(title = "Boxplot Outlier Sebelum Pembersihan", x = NULL, y = "Nilai") +
  theme_minimal(base_size = 11) +
  theme(
    axis.ticks.x = element_blank(),
    axis.text.x = element_blank(),
    strip.text = element_text(face = "bold")
  )

#' Cap values at the 1.5 * IQR fences.
#'
#' @param x Numeric vector; NA values are left as they are.
#' @return `x` with values below Q1 - 1.5 * IQR raised to that fence and
#'   values above Q3 + 1.5 * IQR lowered to that fence.
handle_outlier <- function(x) {
  quartiles <- quantile(x, probs = c(0.25, 0.75), na.rm = TRUE, names = FALSE)
  iqr_width <- quartiles[2] - quartiles[1]
  lower_fence <- quartiles[1] - 1.5 * iqr_width
  upper_fence <- quartiles[2] + 1.5 * iqr_width
  x[x < lower_fence] <- lower_fence
  x[x > upper_fence] <- upper_fence
  x
}
# Cap outliers in every numeric column; other columns pass through unchanged.
Life_Expectancy_Clean <- mutate(
  Life_Expectancy_Imputed,
  across(all_of(num_vars), handle_outlier)
)
# Re-count outliers on the capped data to confirm the treatment.
outlier_after <- data.frame(
  Variable = num_vars,
  Outlier_After = unlist(lapply(Life_Expectancy_Clean[num_vars], cek_outlier))
)
print(outlier_after)
## Variable Outlier_After
## Life expectancy Life expectancy 0
## Adult Mortality Adult Mortality 0
## Alcohol Alcohol 0
## percentage expenditure percentage expenditure 0
## Hepatitis B Hepatitis B 0
## Measles Measles 0
## BMI BMI 0
## under-five deaths under-five deaths 0
## Polio Polio 0
## Total expenditure Total expenditure 0
## Diphtheria Diphtheria 0
## HIV/AIDS HIV/AIDS 0
## GDP GDP 0
## Population Population 0
## thinness 5-9 years thinness 5-9 years 0
## Income composition of resources Income composition of resources 0
## Schooling Schooling 0
# Same long-format reshape as for the "before" plot, now on the capped data.
df_after_long <- pivot_longer(
  select(Life_Expectancy_Clean, all_of(num_vars)),
  cols = everything(),
  names_to = "Variable",
  values_to = "Value"
)
# One boxplot panel per variable with independent scales; outliers in red.
ggplot(data = df_after_long, mapping = aes(x = "", y = Value)) +
  geom_boxplot(
    color = "darkgreen",
    fill = "lightgreen",
    outlier.size = 1,
    outlier.color = "red"
  ) +
  facet_wrap(~ Variable, ncol = 4, scales = "free") +
  labs(title = "Boxplot Setelah Penanganan Outlier", x = NULL, y = "Nilai") +
  theme_minimal(base_size = 11) +
  theme(
    axis.ticks.x = element_blank(),
    axis.text.x = element_blank(),
    strip.text = element_text(face = "bold")
  )

# Names of the character (categorical) columns of the cleaned data.
cat_cols <- names(Life_Expectancy_Clean)[sapply(Life_Expectancy_Clean, is.character)]
cat("Kolom kategorikal yang ditemukan:\n")
## Kolom kategorikal yang ditemukan:
print(cat_cols)
## [1] "Status"
# Columns with exactly two distinct values are recoded to 0/1 (factor code
# minus one); any other column is only reported, not transformed.
for (col in cat_cols) {
  n_distinct_values <- length(unique(Life_Expectancy_Clean[[col]]))
  if (n_distinct_values != 2) {
    cat("Kolom", col, "memiliki lebih dari 2 kategori, perlu one-hot encoding.\n")
    next
  }
  Life_Expectancy_Clean[[col]] <- as.numeric(factor(Life_Expectancy_Clean[[col]])) - 1
}
str(Life_Expectancy_Clean)
## tibble [2,938 × 18] (S3: tbl_df/tbl/data.frame)
## $ Status : num [1:2938] 1 1 1 1 1 1 1 1 1 1 ...
## $ Life expectancy : num [1:2938] 65 59.9 59.9 59.5 59.2 58.8 58.6 58.1 57.5 57.3 ...
## $ Adult Mortality : num [1:2938] 263 271 268 272 275 279 281 287 295 295 ...
## $ Alcohol : num [1:2938] 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.03 0.02 0.03 ...
## $ percentage expenditure : num [1:2938] 71.3 73.5 73.2 78.2 7.1 ...
## $ Hepatitis B : num [1:2938] 65 62 64 67 68 66 63 64 63 64 ...
## $ Measles : num [1:2938] 901 492 430 901 901 ...
## $ BMI : num [1:2938] 19.1 18.6 18.1 17.6 17.2 16.7 16.2 15.7 15.2 14.7 ...
## $ under-five deaths : num [1:2938] 70 70 70 70 70 70 70 70 70 70 ...
## $ Polio : num [1:2938] 49.5 58 62 67 68 66 63 64 63 58 ...
## $ Total expenditure : num [1:2938] 8.16 8.18 8.13 8.52 7.87 9.2 9.42 8.33 6.73 7.43 ...
## $ Diphtheria : num [1:2938] 65 62 64 67 68 66 63 64 63 58 ...
## $ HIV/AIDS : num [1:2938] 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 ...
## $ GDP : num [1:2938] 584.3 612.7 631.7 670 63.5 ...
## $ Population : num [1:2938] 10832552 327582 10832552 3696958 2978599 ...
## $ thinness 5-9 years : num [1:2938] 15.6 15.6 15.6 15.6 15.6 15.6 15.6 15.6 15.6 15.6 ...
## $ Income composition of resources: num [1:2938] 0.479 0.476 0.47 0.463 0.454 0.448 0.434 0.433 0.415 0.405 ...
## $ Schooling : num [1:2938] 10.1 10 9.9 9.8 9.5 9.2 8.9 8.7 8.4 8.1 ...
# Make the column names syntactically valid so they can appear in a formula.
names(Life_Expectancy_Clean) <- make.names(names(Life_Expectancy_Clean))
target <- "Life.expectancy"
predictors <- c(
  "Adult.Mortality", "Alcohol", "percentage.expenditure", "Hepatitis.B",
  "Measles", "BMI", "under.five.deaths", "Polio", "Total.expenditure",
  "Diphtheria", "HIV.AIDS", "GDP", "Population", "thinness.5.9.years",
  "Income.composition.of.resources", "Schooling"
)
# Linear model of the target on all predictors, fitted only to obtain the
# variance inflation factors.
formula_str <- sprintf("%s ~ %s", target, paste(predictors, collapse = " + "))
model_vif <- lm(as.formula(formula_str), data = Life_Expectancy_Clean)
vif_values <- vif(model_vif)
vif_df <- data.frame(
  Variabel = names(vif_values),
  VIF = unname(vif_values)
)
cat("=== HASIL VIF (Vertikal) ===\n")
## === HASIL VIF (Vertikal) ===
print(vif_df)
## Variabel VIF
## 1 Adult.Mortality 1.713391
## 2 Alcohol 1.591185
## 3 percentage.expenditure 3.811133
## 4 Hepatitis.B 1.480498
## 5 Measles 1.598401
## 6 BMI 1.852373
## 7 under.five.deaths 2.389527
## 8 Polio 3.760797
## 9 Total.expenditure 1.195959
## 10 Diphtheria 3.923627
## 11 HIV.AIDS 2.146507
## 12 GDP 4.196137
## 13 Population 1.200941
## 14 thinness.5.9.years 1.789578
## 15 Income.composition.of.resources 3.425626
## 16 Schooling 4.161519
cat("\n=== INTERPRETASI OTOMATIS ===\n")
##
## === INTERPRETASI OTOMATIS ===
# Label each predictor's VIF with the 5 / 10 cut-offs.
# seq_along() keeps the loop empty when there are no VIF values
# (1:length(x) would iterate over c(1, 0) and fail on a missing condition).
for (i in seq_along(vif_values)) {
  vif_i <- vif_values[[i]]
  if (vif_i < 5) {
    cat(names(vif_values)[i], ": Tidak ada indikasi multikolinearitas (VIF < 5)\n")
  } else if (vif_i < 10) {
    # Reaching this branch already implies vif_i >= 5.
    cat(names(vif_values)[i], ": Ada indikasi multikolinearitas sedang (5 ≤ VIF < 10)\n")
  } else {
    cat(names(vif_values)[i], ": Multikolinearitas tinggi (VIF ≥ 10)\n")
  }
}
## Adult.Mortality : Tidak ada indikasi multikolinearitas (VIF < 5)
## Alcohol : Tidak ada indikasi multikolinearitas (VIF < 5)
## percentage.expenditure : Tidak ada indikasi multikolinearitas (VIF < 5)
## Hepatitis.B : Tidak ada indikasi multikolinearitas (VIF < 5)
## Measles : Tidak ada indikasi multikolinearitas (VIF < 5)
## BMI : Tidak ada indikasi multikolinearitas (VIF < 5)
## under.five.deaths : Tidak ada indikasi multikolinearitas (VIF < 5)
## Polio : Tidak ada indikasi multikolinearitas (VIF < 5)
## Total.expenditure : Tidak ada indikasi multikolinearitas (VIF < 5)
## Diphtheria : Tidak ada indikasi multikolinearitas (VIF < 5)
## HIV.AIDS : Tidak ada indikasi multikolinearitas (VIF < 5)
## GDP : Tidak ada indikasi multikolinearitas (VIF < 5)
## Population : Tidak ada indikasi multikolinearitas (VIF < 5)
## thinness.5.9.years : Tidak ada indikasi multikolinearitas (VIF < 5)
## Income.composition.of.resources : Tidak ada indikasi multikolinearitas (VIF < 5)
## Schooling : Tidak ada indikasi multikolinearitas (VIF < 5)
# Random 80/20 train/test split with a fixed seed.
set.seed(123)
n_rows <- nrow(Life_Expectancy_Clean)
# seq_len() is safe for an empty data frame, and floor() makes the training
# size an explicit integer instead of relying on sample() truncating it.
idx <- sample(seq_len(n_rows), size = floor(0.8 * n_rows))
data_train <- Life_Expectancy_Clean[idx, ]
data_test <- Life_Expectancy_Clean[-idx, ]
cat("Jumlah data training:", nrow(data_train), "\n")
## Jumlah data training: 2350
cat("Jumlah data testing :", nrow(data_test), "\n")
## Jumlah data testing : 588
# From here on the classification target is Status (previously recoded to 0/1).
target <- "Status"
data_train[[target]] <- as.factor(data_train[[target]])
# Reuse the training levels so both factors share the same level set and
# order even if a class happens to be absent from the test rows.
data_test[[target]] <- factor(
  data_test[[target]],
  levels = levels(data_train[[target]])
)
#' Min-max scale a numeric vector.
#'
#' @param x Numeric vector.
#' @param lo,hi Values mapped to 0 and 1. They default to the minimum and
#'   maximum of `x`; pass bounds taken from training data to put new data on
#'   the training scale (results may then fall outside [0, 1]).
#' @return Numeric vector of the same length as `x`.
normalize <- function(x, lo = min(x), hi = max(x)) {
  span <- hi - lo
  # A zero range would give 0/0; map such a vector to 0 instead (NA stays NA).
  if (!is.na(span) && span == 0) {
    return(0 * (x - lo))
  }
  (x - lo) / span
}
# Min-max scale the numeric columns. The bounds come from the training rows
# only and are reused for the test rows, so both sets share one scale and no
# test-set statistics enter preprocessing (test values may fall outside [0, 1]).
num_cols <- names(data_train)[vapply(data_train, is.numeric, logical(1))]
train_min <- vapply(data_train[num_cols], min, numeric(1))
train_max <- vapply(data_train[num_cols], max, numeric(1))
data_test[num_cols] <- Map(
  function(x, lo, hi) {
    span <- hi - lo
    # A column that is constant in training has no range to scale by.
    if (isTRUE(span == 0)) rep(0, length(x)) else (x - lo) / span
  },
  data_test[num_cols], train_min, train_max
)
data_train[num_cols] <- lapply(data_train[num_cols], normalize)
# Class distribution of Status in the training set: counts and percentages.
mutate(
  count(data_train, Status),
  proporsi = round(n / sum(n) * 100, 2)
)
## # A tibble: 2 × 3
## Status n proporsi
## <fct> <int> <dbl>
## 1 0 413 17.6
## 2 1 1937 82.4
# Balance the training classes with ROSE (seeded for reproducibility).
set.seed(123)
data_train_bal <- ROSE(Status ~ ., data = data_train, seed = 123)$data
# The ROSE result carries Status with its levels in a different order than
# data_train. Restore the original order so the positive class and the
# prediction levels of models trained on this data match the test data.
data_train_bal$Status <- factor(
  data_train_bal$Status,
  levels = levels(data_train$Status)
)
data_train_bal %>%
  count(Status) %>%
  mutate(proporsi = round(n/sum(n)*100,2))
## Status n proporsi
## 1 1 1186 50.47
## 2 0 1164 49.53
# ============================
# 1. SVM (Unbalanced) Training
# ============================
# Radial-kernel SVM fitted on the original (imbalanced) training set;
# probability = TRUE is needed later for class-probability predictions.
svm_imbal <- svm(
  Status ~ .,
  data = data_train,
  probability = TRUE,
  kernel = "radial",
  gamma = 0.1,
  cost = 1
)
# ============================
# 2. Predictions on the training data
# ============================
pred_svm_train <- predict(svm_imbal, newdata = data_train)
conf_svm_train <- confusionMatrix(
  data = pred_svm_train,
  reference = data_train$Status
)
print("=== SVM Training (Unbalanced) ===")
## [1] "=== SVM Training (Unbalanced) ==="
print(conf_svm_train)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 374 44
## 1 39 1893
##
## Accuracy : 0.9647
## 95% CI : (0.9564, 0.9718)
## No Information Rate : 0.8243
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8787
##
## Mcnemar's Test P-Value : 0.6606
##
## Sensitivity : 0.9056
## Specificity : 0.9773
## Pos Pred Value : 0.8947
## Neg Pred Value : 0.9798
## Prevalence : 0.1757
## Detection Rate : 0.1591
## Detection Prevalence : 0.1779
## Balanced Accuracy : 0.9414
##
## 'Positive' Class : 0
##
# ============================
# 3. Predictions on the test data
# ============================
pred_svm_test <- predict(svm_imbal, newdata = data_test)
conf_svm_test <- confusionMatrix(
  data = pred_svm_test,
  reference = data_test$Status
)
print("=== SVM Testing (Unbalanced) ===")
## [1] "=== SVM Testing (Unbalanced) ==="
print(conf_svm_test)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 88 28
## 1 11 461
##
## Accuracy : 0.9337
## 95% CI : (0.9104, 0.9524)
## No Information Rate : 0.8316
## P-Value [Acc > NIR] : 1.538e-13
##
## Kappa : 0.7783
##
## Mcnemar's Test P-Value : 0.01041
##
## Sensitivity : 0.8889
## Specificity : 0.9427
## Pos Pred Value : 0.7586
## Neg Pred Value : 0.9767
## Prevalence : 0.1684
## Detection Rate : 0.1497
## Detection Prevalence : 0.1973
## Balanced Accuracy : 0.9158
##
## 'Positive' Class : 0
##
# ============================
# 4. Precision, recall and F1 per class (test data)
# ============================
# The confusion matrix is evaluated twice, once with each class as the
# "positive" one, so that Pos Pred Value / Sensitivity / F1 describe that class.
cm0 <- confusionMatrix(
  data = pred_svm_test,
  reference = data_test$Status,
  positive = "0"
)
precision_0 <- cm0$byClass["Pos Pred Value"]
recall_0 <- cm0$byClass["Sensitivity"]
f1_0 <- cm0$byClass["F1"]
cm1 <- confusionMatrix(
  data = pred_svm_test,
  reference = data_test$Status,
  positive = "1"
)
precision_1 <- cm1$byClass["Pos Pred Value"]
recall_1 <- cm1$byClass["Sensitivity"]
f1_1 <- cm1$byClass["F1"]
hasil_per_kelas <- data.frame(
  Class = c("0", "1"),
  Precision = c(precision_0, precision_1),
  Recall = c(recall_0, recall_1),
  F1 = c(f1_0, f1_1)
)
print("=== Precision, Recall, F1 per Kelas (Testing) ===")
## [1] "=== Precision, Recall, F1 per Kelas (Testing) ==="
print(hasil_per_kelas)
## Class Precision Recall F1
## 1 0 0.7586207 0.8888889 0.8186047
## 2 1 0.9766949 0.9427403 0.9594173
# ============================
# 5. Plot Confusion Matrix (Testing)
# ============================
cm_imbal <- conf_svm_test$table
cm_imbal_melt <- melt(cm_imbal)
# melt() converts the "0"/"1" dimnames to numbers, which makes ggplot draw
# continuous axes; turn them back into factors so the axes show the classes.
cm_imbal_melt$Reference <- factor(cm_imbal_melt$Reference)
cm_imbal_melt$Prediction <- factor(cm_imbal_melt$Prediction)
ggplot(cm_imbal_melt, aes(Reference, Prediction, fill = value)) +
  geom_tile() +
  geom_text(aes(label = value), size = 6, color = "white") +
  scale_fill_gradient(low = "blue", high = "red") +
  ggtitle("Confusion Matrix – SVM Unbalanced") +
  theme_minimal(base_size = 14)

# ============================
# 6. ROC Curve & AUC (Testing)
# ============================
# Predicted probability of class "1" for every test row.
prob_imbal <- attr(
  predict(svm_imbal, newdata = data_test, probability = TRUE),
  "probabilities"
)[, "1"]
# State control/case levels and the comparison direction explicitly instead
# of letting roc() pick them from the data; these are the values it
# auto-selected before (control = 0, case = 1, controls < cases).
roc_imbal <- roc(
  data_test$Status, prob_imbal,
  levels = c("0", "1"),
  direction = "<"
)
plot(roc_imbal, main = "ROC Curve – SVM Unbalanced", col = "darkgreen", lwd = 2)

auc_value <- auc(roc_imbal)
cat("\nAUC (SVM Unbalanced, Testing):", round(auc_value, 4), "\n")
##
## AUC (SVM Unbalanced, Testing): 0.9669
library(e1071)
library(caret)
library(ggplot2)
library(reshape2)
library(pROC)
# ============================
# 1. SVM (Balanced) Training
# ============================
# Same SVM configuration as the unbalanced model, fitted on the
# ROSE-balanced training set.
svm_bal <- svm(
  Status ~ .,
  data = data_train_bal, # training data already balanced
  probability = TRUE,
  kernel = "radial",
  gamma = 0.1,
  cost = 1
)
# ============================
# 2. Predictions on the training data
# ============================
pred_svm_train_bal <- predict(svm_bal, newdata = data_train_bal)
conf_svm_train_bal <- confusionMatrix(
  data = pred_svm_train_bal,
  reference = data_train_bal$Status
)
print("=== SVM Training (Balanced) ===")
## [1] "=== SVM Training (Balanced) ==="
print(conf_svm_train_bal)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 0
## 1 1135 4
## 0 51 1160
##
## Accuracy : 0.9766
## 95% CI : (0.9696, 0.9823)
## No Information Rate : 0.5047
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9532
##
## Mcnemar's Test P-Value : 5.552e-10
##
## Sensitivity : 0.9570
## Specificity : 0.9966
## Pos Pred Value : 0.9965
## Neg Pred Value : 0.9579
## Prevalence : 0.5047
## Detection Rate : 0.4830
## Detection Prevalence : 0.4847
## Balanced Accuracy : 0.9768
##
## 'Positive' Class : 1
##
# ============================
# 3. Predictions on the test data
# ============================
pred_svm_test_bal <- predict(svm_bal, newdata = data_test)
# The predictions carry the level order of the balanced training data, which
# differs from data_test$Status. Align them explicitly rather than relying on
# confusionMatrix() to refactor the data with a warning.
pred_svm_test_bal <- factor(
  pred_svm_test_bal,
  levels = levels(data_test$Status)
)
conf_svm_test_bal <- confusionMatrix(pred_svm_test_bal, data_test$Status)
print("=== SVM Testing (Balanced) ===")
## [1] "=== SVM Testing (Balanced) ==="
print(conf_svm_test_bal)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 98 90
## 1 1 399
##
## Accuracy : 0.8452
## 95% CI : (0.8134, 0.8735)
## No Information Rate : 0.8316
## P-Value [Acc > NIR] : 0.2053
##
## Kappa : 0.5932
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.9899
## Specificity : 0.8160
## Pos Pred Value : 0.5213
## Neg Pred Value : 0.9975
## Prevalence : 0.1684
## Detection Rate : 0.1667
## Detection Prevalence : 0.3197
## Balanced Accuracy : 0.9029
##
## 'Positive' Class : 0
##
# ============================
# 4. Precision, recall and F1 per class (test data)
# ============================
# The confusion matrix is evaluated twice, once with each class as the
# "positive" one, so that Pos Pred Value / Sensitivity / F1 describe that class.
cm0_bal <- confusionMatrix(
  data = pred_svm_test_bal,
  reference = data_test$Status,
  positive = "0"
)
## Warning in confusionMatrix.default(pred_svm_test_bal, data_test$Status, :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
precision_0_bal <- cm0_bal$byClass["Pos Pred Value"]
recall_0_bal <- cm0_bal$byClass["Sensitivity"]
f1_0_bal <- cm0_bal$byClass["F1"]
cm1_bal <- confusionMatrix(
  data = pred_svm_test_bal,
  reference = data_test$Status,
  positive = "1"
)
## Warning in confusionMatrix.default(pred_svm_test_bal, data_test$Status, :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
precision_1_bal <- cm1_bal$byClass["Pos Pred Value"]
recall_1_bal <- cm1_bal$byClass["Sensitivity"]
f1_1_bal <- cm1_bal$byClass["F1"]
hasil_per_kelas_bal <- data.frame(
  Class = c("0", "1"),
  Precision = c(precision_0_bal, precision_1_bal),
  Recall = c(recall_0_bal, recall_1_bal),
  F1 = c(f1_0_bal, f1_1_bal)
)
print("=== Precision, Recall, F1 per Kelas (Testing) ===")
## [1] "=== Precision, Recall, F1 per Kelas (Testing) ==="
print(hasil_per_kelas_bal)
## Class Precision Recall F1
## 1 0 0.5212766 0.9898990 0.6829268
## 2 1 0.9975000 0.8159509 0.8976378
# ============================
# 5. Plot Confusion Matrix (Testing)
# ============================
cm_bal <- conf_svm_test_bal$table
cm_bal_melt <- melt(cm_bal)
# melt() converts the "0"/"1" dimnames to numbers, which makes ggplot draw
# continuous axes; turn them back into factors so the axes show the classes.
cm_bal_melt$Reference <- factor(cm_bal_melt$Reference)
cm_bal_melt$Prediction <- factor(cm_bal_melt$Prediction)
ggplot(cm_bal_melt, aes(Reference, Prediction, fill = value)) +
  geom_tile() +
  geom_text(aes(label = value), size = 6, color = "white") +
  scale_fill_gradient(low = "blue", high = "red") +
  ggtitle("Confusion Matrix – SVM Balanced") +
  theme_minimal(base_size = 14)

# ============================
# 6. ROC Curve & AUC (Testing)
# ============================
# Predicted probability of class "1" for every test row.
prob_bal <- attr(
  predict(svm_bal, newdata = data_test, probability = TRUE),
  "probabilities"
)[, "1"]
# State control/case levels and the comparison direction explicitly instead
# of letting roc() pick them from the data; these are the values it
# auto-selected before (control = 0, case = 1, controls < cases).
roc_bal <- roc(
  data_test$Status, prob_bal,
  levels = c("0", "1"),
  direction = "<"
)
plot(roc_bal, main = "ROC Curve – SVM Balanced", col = "darkblue", lwd = 2)

auc_value_bal <- auc(roc_bal)
cat("\nAUC (SVM Balanced, Testing):", round(auc_value_bal, 4), "\n")
##
## AUC (SVM Balanced, Testing): 0.9658
# ============================
# 1. Random Forest (Unbalanced Data) Training
# ============================
# importance = TRUE stores the variable-importance measures used further down.
rf_imbal <- randomForest(
  Status ~ .,
  data = data_train,
  importance = TRUE,
  mtry = 4,
  ntree = 500
)
# ============================
# 2. Predictions on the training data
# ============================
# NOTE: these are predictions for the very rows the forest was grown on
# (newdata = data_train), not an out-of-bag estimate.
rf_pred_train <- predict(rf_imbal, newdata = data_train)
conf_rf_train <- confusionMatrix(
  data = rf_pred_train,
  reference = data_train$Status
)
print("=== Random Forest Training (Unbalanced) ===")
## [1] "=== Random Forest Training (Unbalanced) ==="
print(conf_rf_train)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 413 0
## 1 0 1937
##
## Accuracy : 1
## 95% CI : (0.9984, 1)
## No Information Rate : 0.8243
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.1757
## Detection Rate : 0.1757
## Detection Prevalence : 0.1757
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : 0
##
cat("\nAkurasi RF Training (Unbalanced):", round(conf_rf_train$overall[["Accuracy"]], 4), "\n\n")
##
## Akurasi RF Training (Unbalanced): 1
# ============================
# 3. Predictions on the test data
# ============================
rf_pred_test <- predict(rf_imbal, newdata = data_test)
conf_rf_test <- confusionMatrix(
  data = rf_pred_test,
  reference = data_test$Status
)
print("=== Random Forest Testing (Unbalanced) ===")
## [1] "=== Random Forest Testing (Unbalanced) ==="
print(conf_rf_test)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 94 16
## 1 5 473
##
## Accuracy : 0.9643
## 95% CI : (0.9459, 0.9778)
## No Information Rate : 0.8316
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8779
##
## Mcnemar's Test P-Value : 0.0291
##
## Sensitivity : 0.9495
## Specificity : 0.9673
## Pos Pred Value : 0.8545
## Neg Pred Value : 0.9895
## Prevalence : 0.1684
## Detection Rate : 0.1599
## Detection Prevalence : 0.1871
## Balanced Accuracy : 0.9584
##
## 'Positive' Class : 0
##
cat("\nAkurasi RF Testing (Unbalanced):", round(conf_rf_test$overall["Accuracy"], 4), "\n")
##
## Akurasi RF Testing (Unbalanced): 0.9643
# ============================
# 4. Plot Variable Importance
# ============================
# Built-in importance plot of the unbalanced-data forest.
varImpPlot(
  rf_imbal,
  main = "Random Forest Variable Importance (unbalanced Data)",
  pch = 19
)

rf_varimp <- importance(rf_imbal)
# With importance = TRUE the first columns of the importance matrix are
# class-specific measures, so the overall measure is selected by name
# rather than by position.
rf_varimp_df <- data.frame(
  Variable = rownames(rf_varimp),
  Importance = rf_varimp[, "MeanDecreaseGini"]
)
# Sort from most to least important
rf_varimp_df <- rf_varimp_df[order(rf_varimp_df$Importance, decreasing = TRUE), ]
# Horizontal bar chart
ggplot(rf_varimp_df, aes(x = reorder(Variable, Importance), y = Importance)) +
  geom_bar(stat = "identity", fill = "salmon") +
  coord_flip() +
  labs(title = "Random Forest Variable Importance (Unbalanced Data)",
       x = "Variable",
       y = "Importance") +
  theme_minimal(base_size = 14)

# ============================
# 1. Random Forest (Balanced Data) Training
# ============================
# Same forest configuration as the unbalanced model, grown on the
# ROSE-balanced training set.
rf_bal <- randomForest(
  Status ~ .,
  data = data_train_bal,
  importance = TRUE,
  mtry = 4,
  ntree = 500
)
# ============================
# 2. Predictions on the training data
# ============================
# NOTE: these are predictions for the very rows the forest was grown on
# (newdata = data_train_bal), not an out-of-bag estimate.
rf_pred_train_bal <- predict(rf_bal, newdata = data_train_bal)
conf_rf_train_bal <- confusionMatrix(
  data = rf_pred_train_bal,
  reference = data_train_bal$Status
)
print("=== Random Forest Training (Balanced) ===")
## [1] "=== Random Forest Training (Balanced) ==="
print(conf_rf_train_bal)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 0
## 1 1186 0
## 0 0 1164
##
## Accuracy : 1
## 95% CI : (0.9984, 1)
## No Information Rate : 0.5047
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.5047
## Detection Rate : 0.5047
## Detection Prevalence : 0.5047
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : 1
##
cat("\nAkurasi RF Training (Balanced):", round(conf_rf_train_bal$overall["Accuracy"], 4), "\n\n")
##
## Akurasi RF Training (Balanced): 1
# ============================
# 3. Predictions on the test data
# ============================
rf_pred_test_bal <- predict(rf_bal, newdata = data_test)
# The predictions carry the level order of the balanced training data, which
# differs from data_test$Status. Align them explicitly rather than relying on
# confusionMatrix() to refactor the data with a warning.
rf_pred_test_bal <- factor(
  rf_pred_test_bal,
  levels = levels(data_test$Status)
)
conf_rf_test_bal <- confusionMatrix(rf_pred_test_bal, data_test$Status)
print("=== Random Forest Testing (Balanced) ===")
## [1] "=== Random Forest Testing (Balanced) ==="
print(conf_rf_test_bal)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 99 133
## 1 0 356
##
## Accuracy : 0.7738
## 95% CI : (0.7378, 0.807)
## No Information Rate : 0.8316
## P-Value [Acc > NIR] : 0.9999
##
## Kappa : 0.4741
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 1.0000
## Specificity : 0.7280
## Pos Pred Value : 0.4267
## Neg Pred Value : 1.0000
## Prevalence : 0.1684
## Detection Rate : 0.1684
## Detection Prevalence : 0.3946
## Balanced Accuracy : 0.8640
##
## 'Positive' Class : 0
##
cat("\nAkurasi RF Testing (Balanced):", round(conf_rf_test_bal$overall["Accuracy"], 4), "\n")
##
## Akurasi RF Testing (Balanced): 0.7738
# ============================
# 4. Plot Variable Importance
# ============================
# Built-in importance plot of the balanced-data forest (rf_bal; the
# unbalanced model rf_imbal was plotted here before under this title).
varImpPlot(
  rf_bal,
  main = "Random Forest Variable Importance (Balanced Data)",
  pch = 19
)

# Extract the variable importance matrix
rf_varimp_bal <- importance(rf_bal)
# With importance = TRUE the first columns of the importance matrix are
# class-specific measures, so MeanDecreaseGini is selected by name rather
# than by position.
rf_varimp_df_bal <- data.frame(
  Variable = rownames(rf_varimp_bal),
  Importance = rf_varimp_bal[, "MeanDecreaseGini"]
)
# Sort from most to least important
rf_varimp_df_bal <- rf_varimp_df_bal[order(rf_varimp_df_bal$Importance, decreasing = TRUE), ]
# Horizontal bar chart
ggplot(rf_varimp_df_bal, aes(x = reorder(Variable, Importance), y = Importance)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +
  labs(title = "Random Forest Variable Importance (Balanced Data)",
       x = "Variable",
       y = "Importance") +
  theme_minimal(base_size = 14)

# Test-set accuracy of the four fitted models, collected for comparison.
# SVM
svm_acc_unbal <- conf_svm_test$overall["Accuracy"]
svm_acc_bal <- conf_svm_test_bal$overall["Accuracy"]
# Random Forest
rf_acc_unbal <- conf_rf_test$overall["Accuracy"]
rf_acc_bal <- conf_rf_test_bal$overall["Accuracy"]
accuracy_df_all <- data.frame(
  model = c(
    "SVM_Unbalanced", "SVM_Balanced",
    "RandomForest_Unbalanced", "RandomForest_Balanced"
  ),
  accuracy = unname(c(svm_acc_unbal, svm_acc_bal, rf_acc_unbal, rf_acc_bal))
)
# Best model first.
accuracy_df_all <- arrange(accuracy_df_all, desc(accuracy))
print(accuracy_df_all)
## model accuracy
## 1 RandomForest_Unbalanced 0.9642857
## 2 SVM_Unbalanced 0.9336735
## 3 SVM_Balanced 0.8452381
## 4 RandomForest_Balanced 0.7738095