library(readxl)
## Warning: package 'readxl' was built under R version 4.5.2
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(psych)
library(writexl)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
library(e1071)
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
## The following object is masked from 'package:dplyr':
##
## recode
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:psych':
##
## outlier
## The following object is masked from 'package:dplyr':
##
## combine
library(caret)
## Loading required package: lattice
Life_Expectancy_Data <- read_excel("Life Expectancy Data.xlsx")
str(Life_Expectancy_Data)
## tibble [2,938 × 18] (S3: tbl_df/tbl/data.frame)
## $ Status : chr [1:2938] "Developing" "Developing" "Developing" "Developing" ...
## $ Life expectancy : num [1:2938] 65 59.9 59.9 59.5 59.2 58.8 58.6 58.1 57.5 57.3 ...
## $ Adult Mortality : num [1:2938] 263 271 268 272 275 279 281 287 295 295 ...
## $ Alcohol : num [1:2938] 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.03 0.02 0.03 ...
## $ percentage expenditure : num [1:2938] 71.3 73.5 73.2 78.2 7.1 ...
## $ Hepatitis B : num [1:2938] 65 62 64 67 68 66 63 64 63 64 ...
## $ Measles : num [1:2938] 1154 492 430 2787 3013 ...
## $ BMI : num [1:2938] 19.1 18.6 18.1 17.6 17.2 16.7 16.2 15.7 15.2 14.7 ...
## $ under-five deaths : num [1:2938] 83 86 89 93 97 102 106 110 113 116 ...
## $ Polio : num [1:2938] 6 58 62 67 68 66 63 64 63 58 ...
## $ Total expenditure : num [1:2938] 8.16 8.18 8.13 8.52 7.87 9.2 9.42 8.33 6.73 7.43 ...
## $ Diphtheria : num [1:2938] 65 62 64 67 68 66 63 64 63 58 ...
## $ HIV/AIDS : num [1:2938] 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 ...
## $ GDP : num [1:2938] 584.3 612.7 631.7 670 63.5 ...
## $ Population : num [1:2938] 33736494 327582 31731688 3696958 2978599 ...
## $ thinness 5-9 years : num [1:2938] 17.3 17.5 17.7 18 18.2 18.4 18.7 18.9 19.1 19.3 ...
## $ Income composition of resources: num [1:2938] 0.479 0.476 0.47 0.463 0.454 0.448 0.434 0.433 0.415 0.405 ...
## $ Schooling : num [1:2938] 10.1 10 9.9 9.8 9.5 9.2 8.9 8.7 8.4 8.1 ...
describe(Life_Expectancy_Data)
## vars n mean sd median
## Status* 1 2938 1.83 0.38 2.00
## Life expectancy 2 2928 69.22 9.52 72.10
## Adult Mortality 3 2928 164.80 124.29 144.00
## Alcohol 4 2744 4.60 4.05 3.76
## percentage expenditure 5 2938 738.25 1987.91 64.91
## Hepatitis B 6 2385 80.94 25.07 92.00
## Measles 7 2938 2419.59 11467.27 17.00
## BMI 8 2904 38.32 20.04 43.50
## under-five deaths 9 2938 42.04 160.45 4.00
## Polio 10 2919 82.55 23.43 93.00
## Total expenditure 11 2712 5.94 2.50 5.76
## Diphtheria 12 2919 82.32 23.72 93.00
## HIV/AIDS 13 2938 1.74 5.08 0.10
## GDP 14 2490 7483.16 14270.17 1766.95
## Population 15 2286 12753375.12 61012096.51 1386542.00
## thinness 5-9 years 16 2904 4.87 4.51 3.30
## Income composition of resources 17 2771 0.63 0.21 0.68
## Schooling 18 2775 11.99 3.36 12.30
## trimmed mad min max
## Status* 1.91 0.00 1.00 2.000000e+00
## Life expectancy 69.91 8.60 36.30 8.900000e+01
## Adult Mortality 150.51 112.68 1.00 7.230000e+02
## Alcohol 4.23 4.81 0.01 1.787000e+01
## percentage expenditure 230.74 96.24 0.00 1.947991e+04
## Hepatitis B 86.89 8.90 1.00 9.900000e+01
## Measles 286.08 25.20 0.00 2.121830e+05
## BMI 39.05 24.17 1.00 8.730000e+01
## under-five deaths 14.15 5.93 0.00 2.500000e+03
## Polio 88.05 8.90 3.00 9.900000e+01
## Total expenditure 5.85 2.36 0.37 1.760000e+01
## Diphtheria 87.99 8.90 2.00 9.900000e+01
## HIV/AIDS 0.54 0.00 0.10 5.060000e+01
## GDP 3751.73 2360.98 1.68 1.191727e+05
## Population 3953693.58 2012347.06 34.00 1.293859e+09
## thinness 5-9 years 4.15 3.41 0.10 2.860000e+01
## Income composition of resources 0.65 0.19 0.00 9.500000e-01
## Schooling 12.17 3.11 0.00 2.070000e+01
## range skew kurtosis se
## Status* 1.000000e+00 -1.72 0.95 0.01
## Life expectancy 5.270000e+01 -0.64 -0.24 0.18
## Adult Mortality 7.220000e+02 1.17 1.74 2.30
## Alcohol 1.786000e+01 0.59 -0.81 0.08
## percentage expenditure 1.947991e+04 4.65 26.51 36.68
## Hepatitis B 9.800000e+01 -1.93 2.76 0.51
## Measles 2.121830e+05 9.43 114.58 211.56
## BMI 8.630000e+01 -0.22 -1.29 0.37
## under-five deaths 2.500000e+03 9.49 109.49 2.96
## Polio 9.600000e+01 -2.10 3.76 0.43
## Total expenditure 1.723000e+01 0.62 1.15 0.05
## Diphtheria 9.700000e+01 -2.07 3.55 0.44
## HIV/AIDS 5.050000e+01 5.39 34.80 0.09
## GDP 1.191711e+05 3.20 12.29 285.98
## Population 1.293859e+09 15.90 297.09 1276079.80
## thinness 5-9 years 2.850000e+01 1.78 4.34 0.08
## Income composition of resources 9.500000e-01 -1.14 1.38 0.00
## Schooling 2.070000e+01 -0.60 0.88 0.06
missing_table <- data.frame(
Variable = names(Life_Expectancy_Data),
Missing = colSums(is.na(Life_Expectancy_Data))
)
print(missing_table)
## Variable Missing
## Status Status 0
## Life expectancy Life expectancy 10
## Adult Mortality Adult Mortality 10
## Alcohol Alcohol 194
## percentage expenditure percentage expenditure 0
## Hepatitis B Hepatitis B 553
## Measles Measles 0
## BMI BMI 34
## under-five deaths under-five deaths 0
## Polio Polio 19
## Total expenditure Total expenditure 226
## Diphtheria Diphtheria 19
## HIV/AIDS HIV/AIDS 0
## GDP GDP 448
## Population Population 652
## thinness 5-9 years thinness 5-9 years 34
## Income composition of resources Income composition of resources 167
## Schooling Schooling 163
num_vars <- names(Life_Expectancy_Data)[sapply(Life_Expectancy_Data, is.numeric)]
cat_vars <- names(Life_Expectancy_Data)[sapply(Life_Expectancy_Data, is.character)]
impute_numeric <- function(x) {
if (all(is.na(x))) return(x)
if (abs(skewness(x, na.rm = TRUE)) > 1) {
x[is.na(x)] <- median(x, na.rm = TRUE)
} else {
x[is.na(x)] <- mean(x, na.rm = TRUE)
}
return(x)
}
impute_categorical <- function(x) {
mode_value <- names(sort(table(x), decreasing = TRUE))[1]
x[is.na(x)] <- mode_value
return(x)
}
Life_Expectancy_Imputed <- Life_Expectancy_Data
# Imputasi numerik
for (col in num_vars) {
Life_Expectancy_Imputed[[col]] <- impute_numeric(Life_Expectancy_Imputed[[col]])
}
# Imputasi kategorikal
for (col in cat_vars) {
Life_Expectancy_Imputed[[col]] <- impute_categorical(Life_Expectancy_Imputed[[col]])
}
cat("Missing value sebelum imputasi:\n")
## Missing value sebelum imputasi:
print(colSums(is.na(Life_Expectancy_Data)))
## Status Life expectancy
## 0 10
## Adult Mortality Alcohol
## 10 194
## percentage expenditure Hepatitis B
## 0 553
## Measles BMI
## 0 34
## under-five deaths Polio
## 0 19
## Total expenditure Diphtheria
## 226 19
## HIV/AIDS GDP
## 0 448
## Population thinness 5-9 years
## 652 34
## Income composition of resources Schooling
## 167 163
cat("\nMissing value sesudah imputasi:\n")
##
## Missing value sesudah imputasi:
print(colSums(is.na(Life_Expectancy_Imputed)))
## Status Life expectancy
## 0 0
## Adult Mortality Alcohol
## 0 0
## percentage expenditure Hepatitis B
## 0 0
## Measles BMI
## 0 0
## under-five deaths Polio
## 0 0
## Total expenditure Diphtheria
## 0 0
## HIV/AIDS GDP
## 0 0
## Population thinness 5-9 years
## 0 0
## Income composition of resources Schooling
## 0 0
cek_outlier <- function(x) {
Q1 <- quantile(x, 0.25, na.rm = TRUE)
Q3 <- quantile(x, 0.75, na.rm = TRUE)
IQR_val <- Q3 - Q1
lower <- Q1 - 1.5 * IQR_val
upper <- Q3 + 1.5 * IQR_val
sum(x < lower | x > upper, na.rm = TRUE)
}
num_vars <- names(Life_Expectancy_Imputed)[sapply(Life_Expectancy_Imputed, is.numeric)]
outlier_before <- data.frame(
Variable = num_vars,
Outlier_Before = sapply(Life_Expectancy_Imputed[num_vars], cek_outlier)
)
cat("=== OUTLIER SEBELUM PEMBERSIHAN ===\n")
## === OUTLIER SEBELUM PEMBERSIHAN ===
print(outlier_before)
## Variable Outlier_Before
## Life expectancy Life expectancy 17
## Adult Mortality Adult Mortality 86
## Alcohol Alcohol 3
## percentage expenditure percentage expenditure 389
## Hepatitis B Hepatitis B 322
## Measles Measles 542
## BMI BMI 0
## under-five deaths under-five deaths 394
## Polio Polio 279
## Total expenditure Total expenditure 51
## Diphtheria Diphtheria 298
## HIV/AIDS HIV/AIDS 542
## GDP GDP 445
## Population Population 452
## thinness 5-9 years thinness 5-9 years 99
## Income composition of resources Income composition of resources 130
## Schooling Schooling 77
df_before_long <- Life_Expectancy_Imputed %>%
select(all_of(num_vars)) %>%
tidyr::pivot_longer(cols = everything(), names_to = "Variable", values_to = "Value")
ggplot(df_before_long, aes(x = "", y = Value)) +
geom_boxplot(
fill = "lightgreen",
color = "darkgreen",
outlier.color = "red",
outlier.size = 1
) +
facet_wrap(~ Variable, scales = "free", ncol = 4) +
theme_minimal(base_size = 11) +
theme(
strip.text = element_text(face = "bold"),
axis.text.x = element_blank(),
axis.ticks.x = element_blank()
) +
labs(title = "Boxplot Outlier Sebelum Pembersihan", x = NULL, y = "Nilai")
handle_outlier <- function(x) {
Q1 <- quantile(x, 0.25, na.rm = TRUE)
Q3 <- quantile(x, 0.75, na.rm = TRUE)
IQR_val <- Q3 - Q1
lower <- Q1 - 1.5 * IQR_val
upper <- Q3 + 1.5 * IQR_val
x[x < lower] <- lower
x[x > upper] <- upper
return(x)
}
Life_Expectancy_Clean <- Life_Expectancy_Imputed
Life_Expectancy_Clean[num_vars] <-
lapply(Life_Expectancy_Imputed[num_vars], handle_outlier)
outlier_after <- data.frame(
Variable = num_vars,
Outlier_After = sapply(Life_Expectancy_Clean[num_vars], cek_outlier)
)
print(outlier_after)
## Variable Outlier_After
## Life expectancy Life expectancy 0
## Adult Mortality Adult Mortality 0
## Alcohol Alcohol 0
## percentage expenditure percentage expenditure 0
## Hepatitis B Hepatitis B 0
## Measles Measles 0
## BMI BMI 0
## under-five deaths under-five deaths 0
## Polio Polio 0
## Total expenditure Total expenditure 0
## Diphtheria Diphtheria 0
## HIV/AIDS HIV/AIDS 0
## GDP GDP 0
## Population Population 0
## thinness 5-9 years thinness 5-9 years 0
## Income composition of resources Income composition of resources 0
## Schooling Schooling 0
df_after_long <- Life_Expectancy_Clean %>%
select(all_of(num_vars)) %>%
pivot_longer(cols = everything(), names_to = "Variable", values_to = "Value")
ggplot(df_after_long, aes(x = "", y = Value)) +
geom_boxplot(
fill = "lightgreen",
color = "darkgreen",
outlier.color = "red",
outlier.size = 1
) +
facet_wrap(~ Variable, scales = "free", ncol = 4) +
theme_minimal(base_size = 11) +
theme(
strip.text = element_text(face = "bold"),
axis.text.x = element_blank(),
axis.ticks.x = element_blank()
) +
labs(title = "Boxplot Setelah Penanganan Outlier", x = NULL, y = "Nilai")
cat_cols <- sapply(Life_Expectancy_Clean, is.character)
cat_cols <- names(Life_Expectancy_Clean)[cat_cols]
cat("Kolom kategorikal yang ditemukan:\n")
## Kolom kategorikal yang ditemukan:
print(cat_cols)
## [1] "Status"
for (col in cat_cols) {
unique_vals <- unique(Life_Expectancy_Clean[[col]])
if (length(unique_vals) == 2) {
Life_Expectancy_Clean[[col]] <- as.numeric(as.factor(Life_Expectancy_Clean[[col]])) - 1
} else {
cat(paste("Kolom", col, "memiliki lebih dari 2 kategori, perlu one-hot encoding.\n"))
}
}
str(Life_Expectancy_Clean)
## tibble [2,938 × 18] (S3: tbl_df/tbl/data.frame)
## $ Status : num [1:2938] 1 1 1 1 1 1 1 1 1 1 ...
## $ Life expectancy : num [1:2938] 65 59.9 59.9 59.5 59.2 58.8 58.6 58.1 57.5 57.3 ...
## $ Adult Mortality : num [1:2938] 263 271 268 272 275 279 281 287 295 295 ...
## $ Alcohol : num [1:2938] 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.03 0.02 0.03 ...
## $ percentage expenditure : num [1:2938] 71.3 73.5 73.2 78.2 7.1 ...
## $ Hepatitis B : num [1:2938] 65 62 64 67 68 66 63 64 63 64 ...
## $ Measles : num [1:2938] 901 492 430 901 901 ...
## $ BMI : num [1:2938] 19.1 18.6 18.1 17.6 17.2 16.7 16.2 15.7 15.2 14.7 ...
## $ under-five deaths : num [1:2938] 70 70 70 70 70 70 70 70 70 70 ...
## $ Polio : num [1:2938] 49.5 58 62 67 68 66 63 64 63 58 ...
## $ Total expenditure : num [1:2938] 8.16 8.18 8.13 8.52 7.87 9.2 9.42 8.33 6.73 7.43 ...
## $ Diphtheria : num [1:2938] 65 62 64 67 68 66 63 64 63 58 ...
## $ HIV/AIDS : num [1:2938] 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 ...
## $ GDP : num [1:2938] 584.3 612.7 631.7 670 63.5 ...
## $ Population : num [1:2938] 10832552 327582 10832552 3696958 2978599 ...
## $ thinness 5-9 years : num [1:2938] 15.6 15.6 15.6 15.6 15.6 15.6 15.6 15.6 15.6 15.6 ...
## $ Income composition of resources: num [1:2938] 0.479 0.476 0.47 0.463 0.454 0.448 0.434 0.433 0.415 0.405 ...
## $ Schooling : num [1:2938] 10.1 10 9.9 9.8 9.5 9.2 8.9 8.7 8.4 8.1 ...
set.seed(123)
index_train <- sample(1:nrow(Life_Expectancy_Clean),
size = 0.8 * nrow(Life_Expectancy_Clean))
data_train <- Life_Expectancy_Clean[index_train, ]
data_test <- Life_Expectancy_Clean[-index_train, ]
cat("Jumlah data training:", nrow(data_train), "\n")
## Jumlah data training: 2350
cat("Jumlah data testing :", nrow(data_test), "\n")
## Jumlah data testing : 588
str(data_train)
## tibble [2,350 × 18] (S3: tbl_df/tbl/data.frame)
## $ Status : num [1:2350] 1 0 1 1 1 0 1 0 1 1 ...
## $ Life expectancy : num [1:2350] 62.5 81.5 64.7 45.6 71 78.7 74.1 81 82.2 79.2 ...
## $ Adult Mortality : num [1:2350] 243 58 22 58 135 8 151 67 6 81 ...
## $ Alcohol : num [1:2350] 1.77 7.2 5.46 1.47 0.01 ...
## $ percentage expenditure : num [1:2350] 172 778.2 47.1 31.6 52.8 ...
## $ Hepatitis B : num [1:2350] 75 92 75 92 96 92 97 95 97 92 ...
## $ Measles : num [1:2350] 680 6 0 901 237 ...
## $ BMI : num [1:2350] 38.3 57.3 24 17.2 17 ...
## $ under-five deaths : num [1:2350] 70 0 0 25 70 1 4 0 1 1 ...
## $ Polio : num [1:2350] 49.5 98 97 49.5 96 98 97 96 95 95 ...
## $ Total expenditure : num [1:2350] 7.97 9.47 7.84 4.16 2.88 8.46 9.78 8.1 7.81 8.3 ...
## $ Diphtheria : num [1:2350] 49.5 98 97 49.5 96 98 97 96 95 95 ...
## $ HIV/AIDS : num [1:2350] 0.3 0.1 1.2 1.85 0.1 0.1 0.4 0.1 0.1 0.1 ...
## $ GDP : num [1:2350] 1476 5276 844 254 952 ...
## $ Population : num [1:2350] 10832552 9378126 159328 397612 10832552 ...
## $ thinness 5-9 years : num [1:2350] 3.3 1.3 6.8 1.2 15.6 1 2.1 0.2 1.1 0.8 ...
## $ Income composition of resources: num [1:2350] 0.461 0.895 0.521 0.315 0.565 0.881 0.614 0.902 0.895 0.83 ...
## $ Schooling : num [1:2350] 7 15.8 10.2 5.4 10 16.5 11.6 18.6 16 15.3 ...
model_vif <- lm(
`Life expectancy` ~
`Adult Mortality` +
Alcohol +
`percentage expenditure` +
`Hepatitis B` +
Measles +
BMI +
`under-five deaths` +
Polio +
`Total expenditure` +
Diphtheria +
`HIV/AIDS` +
GDP +
Population +
`thinness 5-9 years` +
`Income composition of resources` +
Schooling,
data = Life_Expectancy_Clean
)
vif_values <- vif(model_vif)
vif_df <- data.frame(
Variabel = names(vif_values),
VIF = as.numeric(vif_values)
)
cat("=== HASIL VIF (Vertikal) ===\n")
## === HASIL VIF (Vertikal) ===
print(vif_df)
## Variabel VIF
## 1 `Adult Mortality` 1.713391
## 2 Alcohol 1.591185
## 3 `percentage expenditure` 3.811133
## 4 `Hepatitis B` 1.480498
## 5 Measles 1.598401
## 6 BMI 1.852373
## 7 `under-five deaths` 2.389527
## 8 Polio 3.760797
## 9 `Total expenditure` 1.195959
## 10 Diphtheria 3.923627
## 11 `HIV/AIDS` 2.146507
## 12 GDP 4.196137
## 13 Population 1.200941
## 14 `thinness 5-9 years` 1.789578
## 15 `Income composition of resources` 3.425626
## 16 Schooling 4.161519
cat("\n=== INTERPRETASI OTOMATIS ===\n")
##
## === INTERPRETASI OTOMATIS ===
for (i in 1:length(vif_values)) {
if (vif_values[i] < 5) {
cat(names(vif_values)[i], ": Tidak ada indikasi multikolinearitas (VIF < 5)\n")
} else if (vif_values[i] >= 5 & vif_values[i] < 10) {
cat(names(vif_values)[i], ": Ada indikasi multikolinearitas sedang (5 ≤ VIF < 10)\n")
} else {
cat(names(vif_values)[i], ": Multikolinearitas tinggi (VIF ≥ 10)\n")
}
}
## `Adult Mortality` : Tidak ada indikasi multikolinearitas (VIF < 5)
## Alcohol : Tidak ada indikasi multikolinearitas (VIF < 5)
## `percentage expenditure` : Tidak ada indikasi multikolinearitas (VIF < 5)
## `Hepatitis B` : Tidak ada indikasi multikolinearitas (VIF < 5)
## Measles : Tidak ada indikasi multikolinearitas (VIF < 5)
## BMI : Tidak ada indikasi multikolinearitas (VIF < 5)
## `under-five deaths` : Tidak ada indikasi multikolinearitas (VIF < 5)
## Polio : Tidak ada indikasi multikolinearitas (VIF < 5)
## `Total expenditure` : Tidak ada indikasi multikolinearitas (VIF < 5)
## Diphtheria : Tidak ada indikasi multikolinearitas (VIF < 5)
## `HIV/AIDS` : Tidak ada indikasi multikolinearitas (VIF < 5)
## GDP : Tidak ada indikasi multikolinearitas (VIF < 5)
## Population : Tidak ada indikasi multikolinearitas (VIF < 5)
## `thinness 5-9 years` : Tidak ada indikasi multikolinearitas (VIF < 5)
## `Income composition of resources` : Tidak ada indikasi multikolinearitas (VIF < 5)
## Schooling : Tidak ada indikasi multikolinearitas (VIF < 5)
target <- "Status"
data_train[[target]] <- as.factor(data_train[[target]])
data_test[[target]] <- as.factor(data_test[[target]])
numeric_cols <- sapply(data_train, is.numeric)
means <- sapply(data_train[, numeric_cols], mean)
sds <- sapply(data_train[, numeric_cols], sd)
data_train[, numeric_cols] <- scale(data_train[, numeric_cols],
center = means, scale = sds)
data_test[, numeric_cols] <- scale(data_test[, numeric_cols],
center = means, scale = sds)
svm_model <- svm(
as.formula(paste(target, "~ .")),
data = data_train,
kernel = "radial",
cost = 1,
gamma = 0.1
)
print(svm_model)
##
## Call:
## svm(formula = as.formula(paste(target, "~ .")), data = data_train,
## kernel = "radial", cost = 1, gamma = 0.1)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
##
## Number of Support Vectors: 520
prediksi <- predict(svm_model, data_test)
#evaluasi model
conf_matrix <- confusionMatrix(prediksi, data_test[[target]])
print(conf_matrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 82 23
## 1 17 466
##
## Accuracy : 0.932
## 95% CI : (0.9085, 0.951)
## No Information Rate : 0.8316
## P-Value [Acc > NIR] : 4.335e-13
##
## Kappa : 0.7628
##
## Mcnemar's Test P-Value : 0.4292
##
## Sensitivity : 0.8283
## Specificity : 0.9530
## Pos Pred Value : 0.7810
## Neg Pred Value : 0.9648
## Prevalence : 0.1684
## Detection Rate : 0.1395
## Detection Prevalence : 0.1786
## Balanced Accuracy : 0.8906
##
## 'Positive' Class : 0
##
tuned <- tune.svm(
as.formula(paste(target, "~ .")),
data = data_train,
gamma = 2^(-1:2),
cost = 2^(2:5)
)
cat("Model terbaik:\n")
## Model terbaik:
print(tuned$best.model)
##
## Call:
## best.svm(x = as.formula(paste(target, "~ .")), data = data_train,
## gamma = 2^(-1:2), cost = 2^(2:5))
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 8
##
## Number of Support Vectors: 1526
Penjelasan: Model SVM dengan kernel radial yang digunakan dijalankan dengan parameter cost = 1 dan gamma = 0.1, dan menghasilkan akurasi sebesar 93,2% dengan nilai Kappa 0.7628 yang menunjukkan tingkat kesesuaian prediksi yang kuat. Namun, performa model meningkat setelah dilakukan tuning hyperparameterdengan rentang gamma = 2^(-1:2) dan cost = 2^(2:5). Model terbaik diperoleh pada cost = 8, di mana akurasi naik menjadi 96,26% dan Kappa meningkat menjadi 0.8631, menandakan bahwa model memiliki kemampuan klasifikasi yang sangat baik. Selain itu, sensitivitas dan spesifisitas juga meningkat, menunjukkan bahwa model semakin akurat dalam membedakan negara berkembang maupun negara maju. Jumlah support vector yang lebih besar pada model terbaik (1526) juga menunjukkan bahwa model menjadi lebih kompleks dan mampu menangkap pola pemisahan kelas dengan lebih detail. Secara keseluruhan, tuning parameter memberikan peningkatan kinerja signifikan sehingga model SVM yang dihasilkan lebih optimal dalam mengklasifikasikan status negara.
names(data_train) <- make.names(names(data_train))
names(data_test) <- make.names(names(data_test))
target <- "Status"
# Pastikan target adalah faktor
data_train[[target]] <- as.factor(data_train[[target]])
data_test[[target]] <- as.factor(data_test[[target]])
set.seed(123)
rf_model <- randomForest(
as.formula(paste(target, "~ .")),
data = data_train,
ntree = 500,
mtry = floor(sqrt(ncol(data_train) - 1)),
importance = TRUE
)
print(rf_model)
##
## Call:
## randomForest(formula = as.formula(paste(target, "~ .")), data = data_train, ntree = 500, mtry = floor(sqrt(ncol(data_train) - 1)), importance = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 4
##
## OOB estimate of error rate: 1.74%
## Confusion matrix:
## 0 1 class.error
## 0 391 22 0.053268765
## 1 19 1918 0.009808983
importance(rf_model)
## 0 1 MeanDecreaseAccuracy
## Life.expectancy 28.32657 21.093096 33.34020
## Adult.Mortality 26.35880 13.907902 29.17470
## Alcohol 53.37670 42.260972 58.93694
## percentage.expenditure 11.02620 8.847301 14.03214
## Hepatitis.B 21.55278 14.519321 23.56800
## Measles 13.57395 14.307277 19.05165
## BMI 13.97521 19.602924 22.36022
## under.five.deaths 20.23153 11.869186 22.85821
## Polio 14.82508 11.130910 17.49953
## Total.expenditure 20.75744 13.952672 23.19883
## Diphtheria 17.62341 11.056998 18.23942
## HIV.AIDS 10.28081 -3.327009 10.38862
## GDP 11.65998 7.945555 13.32032
## Population 14.66850 15.788226 20.14473
## thinness.5.9.years 27.51786 20.592138 35.70768
## Income.composition.of.resources 28.65088 23.411885 34.60436
## Schooling 15.85807 19.353633 23.57449
## MeanDecreaseGini
## Life.expectancy 107.836448
## Adult.Mortality 31.422741
## Alcohol 141.733755
## percentage.expenditure 15.992392
## Hepatitis.B 16.687900
## Measles 10.160079
## BMI 29.046701
## under.five.deaths 18.652589
## Polio 10.203394
## Total.expenditure 20.075779
## Diphtheria 11.577852
## HIV.AIDS 6.103743
## GDP 10.950652
## Population 12.902485
## thinness.5.9.years 63.206172
## Income.composition.of.resources 123.206473
## Schooling 49.912496
varImpPlot(rf_model)
rf_pred <- predict(rf_model, data_test)
#evaluasi model
rf_conf <- confusionMatrix(rf_pred, data_test[[target]])
print(rf_conf)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 93 4
## 1 6 485
##
## Accuracy : 0.983
## 95% CI : (0.9689, 0.9918)
## No Information Rate : 0.8316
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9388
##
## Mcnemar's Test P-Value : 0.7518
##
## Sensitivity : 0.9394
## Specificity : 0.9918
## Pos Pred Value : 0.9588
## Neg Pred Value : 0.9878
## Prevalence : 0.1684
## Detection Rate : 0.1582
## Detection Prevalence : 0.1650
## Balanced Accuracy : 0.9656
##
## 'Positive' Class : 0
##
#tunning random forest
tuned_rf <- tuneRF(
x = data_train[, colnames(data_train) != target],
y = data_train[[target]],
stepFactor = 1.5,
improve = 0.01,
ntreeTry = 300
)
## mtry = 4 OOB error = 1.79%
## Searching left ...
## mtry = 3 OOB error = 1.91%
## -0.07142857 0.01
## Searching right ...
## mtry = 6 OOB error = 2%
## -0.1190476 0.01
print(tuned_rf)
## mtry OOBError
## 3.OOB 3 0.01914894
## 4.OOB 4 0.01787234
## 6.OOB 6 0.02000000
Penjelasan: Berdasarkan analisis ini terlihat nilai Balanced Accuracy sebesar 96,56%, yang menunjukkan bahwa model mampu mengenali kedua kelas secara seimbang dan akurat. Nilai Positive Predictive Value (0.9588) dan Negative Predictive Value (0.9878) menandakan bahwa prediksi model untuk kedua kelas memiliki tingkat ketepatan yang sangat tinggi. Dari proses tuning menggunakan tuneRF, nilai mtry = 4 menghasilkan OOB error paling rendah (1,79%), yang berarti model paling optimal menggunakan empat variabel acak pada setiap pembentukan pohon. Perbandingan dengan mtry = 3 dan mtry = 6 menunjukkan OOB error yang lebih tinggi, sehingga mtry = 4 adalah kombinasi terbaik. Secara keseluruhan, hasil ini menunjukkan bahwa Random Forest mampu mempelajari pola data dengan sangat kuat, stabil, dan efektif, sehingga memberikan kinerja yang lebih unggul dibandingkan svm.
imp <- importance(rf_model)
# Ubah ke data frame
imp_df <- data.frame(
Variable = rownames(imp),
Importance = imp[, "MeanDecreaseGini"]
)
# Urutkan berdasarkan importance
imp_df <- imp_df %>%
arrange(desc(Importance))
# Plot rapi
ggplot(imp_df, aes(x = reorder(Variable, Importance), y = Importance)) +
geom_col(fill = "steelblue") +
coord_flip() +
theme_minimal(base_size = 14) +
labs(
title = "Variable Importance (MeanDecreaseGini)",
x = "Variable",
y = "Importance"
)
Penjelasan: Berdasarkan hasil variable importance dari model Random
Forest, variabel yang paling berpengaruh dalam membedakan negara maju
dan berkembang adalah Alcohol, Income composition of resources, dan Life
expectancy. Ketiga variabel ini memiliki nilai Mean Decrease Gini
tertinggi, menunjukkan bahwa informasi dari variabel tersebut paling
sering digunakan oleh pohon keputusan dalam memecah data secara akurat.
Variabel seperti thinness 5–9 years, Schooling, dan Adult Mortality juga
memiliki kontribusi penting, menandakan bahwa kondisi kesehatan, gizi,
dan pendidikan merupakan indikator kuat tingkat perkembangan suatu
negara. Sementara itu, variabel seperti GDP, Polio, Measles, dan
HIV/AIDS berada di posisi yang lebih rendah, yang berarti kontribusinya
dalam klasifikasi relatif kecil dibandingkan variabel utama lainnya.
Secara keseluruhan, model mengindikasikan bahwa faktor kesehatan umum,
kualitas sumber daya manusia, dan harapan hidup merupakan determinan
terbesar dalam membedakan tingkat kemajuan suatu negara.
#svm
svm_model <- svm(as.formula(paste(target, "~ .")),
data = data_train,
kernel = "radial",
cost = 4,
gamma = 0.5)
svm_pred_test <- predict(svm_model, newdata = data_test)
svm_cm <- confusionMatrix(svm_pred_test, data_test[[target]])
print(svm_cm)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 85 8
## 1 14 481
##
## Accuracy : 0.9626
## 95% CI : (0.9439, 0.9764)
## No Information Rate : 0.8316
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8631
##
## Mcnemar's Test P-Value : 0.2864
##
## Sensitivity : 0.8586
## Specificity : 0.9836
## Pos Pred Value : 0.9140
## Neg Pred Value : 0.9717
## Prevalence : 0.1684
## Detection Rate : 0.1446
## Detection Prevalence : 0.1582
## Balanced Accuracy : 0.9211
##
## 'Positive' Class : 0
##
#random forest
rf_model <- randomForest(as.formula(paste(target, "~ .")),
data = data_train,
ntree = 500,
mtry = 4,
importance = TRUE)
rf_pred_test <- predict(rf_model, newdata = data_test)
rf_cm <- confusionMatrix(rf_pred_test, data_test[[target]])
print(rf_cm)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 94 4
## 1 5 485
##
## Accuracy : 0.9847
## 95% CI : (0.9711, 0.993)
## No Information Rate : 0.8316
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9451
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9495
## Specificity : 0.9918
## Pos Pred Value : 0.9592
## Neg Pred Value : 0.9898
## Prevalence : 0.1684
## Detection Rate : 0.1599
## Detection Prevalence : 0.1667
## Balanced Accuracy : 0.9707
##
## 'Positive' Class : 0
##
#bandingkan akurasi model
cat("\nAkurasi RF Testing :", rf_cm$overall["Accuracy"], "\n")
##
## Akurasi RF Testing : 0.9846939
cat("Akurasi SVM Testing:", svm_cm$overall["Accuracy"], "\n")
## Akurasi SVM Testing: 0.962585
Penjelasan: Hasil evaluasi pada data testing menunjukkan bahwa Random Forest (RF) memiliki performa yang lebih unggul dibandingkan Support Vector Machine (SVM). RF mencapai akurasi 98,47%, lebih tinggi dibandingkan SVM yang memperoleh 96,26%, dengan nilai Kappa RF sebesar 0,9451, mengindikasikan tingkat kesepakatan yang sangat kuat antara prediksi dan data aktual. RF juga menunjukkan sensitivitas (0,9495) dan specificity (0,9918) yang lebih tinggi dibandingkan SVM, artinya model RF lebih baik dalam mendeteksi kelas positif (negara berkembang) maupun negatif (negara maju). Nilai Positive Predictive Value dan Negative Predictive Value RF juga lebih tinggi (0,9592 dan 0.9898), menunjukkan prediksi kelasnya lebih akurat. Secara keseluruhan, Random Forest memberikan kinerja klasifikasi yang lebih stabil, lebih presisi, dan lebih dapat diandalkan dibandingkan SVM dalam membedakan negara maju dan berkembang pada data testing.