Tugas Besar Datmin B

library(readxl)

## Warning: package 'readxl' was built under R version 4.5.2

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyr)     
library(psych) 
library(writexl)
library(ggplot2)

## 
## Attaching package: 'ggplot2'

## The following objects are masked from 'package:psych':
## 
##     %+%, alpha

library(e1071)
library(car)

## Loading required package: carData

## 
## Attaching package: 'car'

## The following object is masked from 'package:psych':
## 
##     logit

## The following object is masked from 'package:dplyr':
## 
##     recode

library(randomForest)

## randomForest 4.7-1.2

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:ggplot2':
## 
##     margin

## The following object is masked from 'package:psych':
## 
##     outlier

## The following object is masked from 'package:dplyr':
## 
##     combine

library(caret)

## Loading required package: lattice

library(iml)

## Warning: package 'iml' was built under R version 4.5.2

library(ROSE)

## Warning: package 'ROSE' was built under R version 4.5.2

## Loaded ROSE 0.0-4

library(reshape2)

## 
## Attaching package: 'reshape2'

## The following object is masked from 'package:tidyr':
## 
##     smiths

library(pROC)

## Type 'citation("pROC")' for a citation.

## 
## Attaching package: 'pROC'

## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

Life_Expectancy_Data <- read_excel("Life Expectancy Data.xlsx")
str(Life_Expectancy_Data)

## tibble [2,938 × 18] (S3: tbl_df/tbl/data.frame)
##  $ Status                         : chr [1:2938] "Developing" "Developing" "Developing" "Developing" ...
##  $ Life expectancy                : num [1:2938] 65 59.9 59.9 59.5 59.2 58.8 58.6 58.1 57.5 57.3 ...
##  $ Adult Mortality                : num [1:2938] 263 271 268 272 275 279 281 287 295 295 ...
##  $ Alcohol                        : num [1:2938] 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.03 0.02 0.03 ...
##  $ percentage expenditure         : num [1:2938] 71.3 73.5 73.2 78.2 7.1 ...
##  $ Hepatitis B                    : num [1:2938] 65 62 64 67 68 66 63 64 63 64 ...
##  $ Measles                        : num [1:2938] 1154 492 430 2787 3013 ...
##  $ BMI                            : num [1:2938] 19.1 18.6 18.1 17.6 17.2 16.7 16.2 15.7 15.2 14.7 ...
##  $ under-five deaths              : num [1:2938] 83 86 89 93 97 102 106 110 113 116 ...
##  $ Polio                          : num [1:2938] 6 58 62 67 68 66 63 64 63 58 ...
##  $ Total expenditure              : num [1:2938] 8.16 8.18 8.13 8.52 7.87 9.2 9.42 8.33 6.73 7.43 ...
##  $ Diphtheria                     : num [1:2938] 65 62 64 67 68 66 63 64 63 58 ...
##  $ HIV/AIDS                       : num [1:2938] 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 ...
##  $ GDP                            : num [1:2938] 584.3 612.7 631.7 670 63.5 ...
##  $ Population                     : num [1:2938] 33736494 327582 31731688 3696958 2978599 ...
##  $ thinness 5-9 years             : num [1:2938] 17.3 17.5 17.7 18 18.2 18.4 18.7 18.9 19.1 19.3 ...
##  $ Income composition of resources: num [1:2938] 0.479 0.476 0.47 0.463 0.454 0.448 0.434 0.433 0.415 0.405 ...
##  $ Schooling                      : num [1:2938] 10.1 10 9.9 9.8 9.5 9.2 8.9 8.7 8.4 8.1 ...

describe(Life_Expectancy_Data)

##                                 vars    n        mean          sd     median
## Status*                            1 2938        1.83        0.38       2.00
## Life expectancy                    2 2928       69.22        9.52      72.10
## Adult Mortality                    3 2928      164.80      124.29     144.00
## Alcohol                            4 2744        4.60        4.05       3.76
## percentage expenditure             5 2938      738.25     1987.91      64.91
## Hepatitis B                        6 2385       80.94       25.07      92.00
## Measles                            7 2938     2419.59    11467.27      17.00
## BMI                                8 2904       38.32       20.04      43.50
## under-five deaths                  9 2938       42.04      160.45       4.00
## Polio                             10 2919       82.55       23.43      93.00
## Total expenditure                 11 2712        5.94        2.50       5.76
## Diphtheria                        12 2919       82.32       23.72      93.00
## HIV/AIDS                          13 2938        1.74        5.08       0.10
## GDP                               14 2490     7483.16    14270.17    1766.95
## Population                        15 2286 12753375.12 61012096.51 1386542.00
## thinness 5-9 years                16 2904        4.87        4.51       3.30
## Income composition of resources   17 2771        0.63        0.21       0.68
## Schooling                         18 2775       11.99        3.36      12.30
##                                    trimmed        mad   min          max
## Status*                               1.91       0.00  1.00 2.000000e+00
## Life expectancy                      69.91       8.60 36.30 8.900000e+01
## Adult Mortality                     150.51     112.68  1.00 7.230000e+02
## Alcohol                               4.23       4.81  0.01 1.787000e+01
## percentage expenditure              230.74      96.24  0.00 1.947991e+04
## Hepatitis B                          86.89       8.90  1.00 9.900000e+01
## Measles                             286.08      25.20  0.00 2.121830e+05
## BMI                                  39.05      24.17  1.00 8.730000e+01
## under-five deaths                    14.15       5.93  0.00 2.500000e+03
## Polio                                88.05       8.90  3.00 9.900000e+01
## Total expenditure                     5.85       2.36  0.37 1.760000e+01
## Diphtheria                           87.99       8.90  2.00 9.900000e+01
## HIV/AIDS                              0.54       0.00  0.10 5.060000e+01
## GDP                                3751.73    2360.98  1.68 1.191727e+05
## Population                      3953693.58 2012347.06 34.00 1.293859e+09
## thinness 5-9 years                    4.15       3.41  0.10 2.860000e+01
## Income composition of resources       0.65       0.19  0.00 9.500000e-01
## Schooling                            12.17       3.11  0.00 2.070000e+01
##                                        range  skew kurtosis         se
## Status*                         1.000000e+00 -1.72     0.95       0.01
## Life expectancy                 5.270000e+01 -0.64    -0.24       0.18
## Adult Mortality                 7.220000e+02  1.17     1.74       2.30
## Alcohol                         1.786000e+01  0.59    -0.81       0.08
## percentage expenditure          1.947991e+04  4.65    26.51      36.68
## Hepatitis B                     9.800000e+01 -1.93     2.76       0.51
## Measles                         2.121830e+05  9.43   114.58     211.56
## BMI                             8.630000e+01 -0.22    -1.29       0.37
## under-five deaths               2.500000e+03  9.49   109.49       2.96
## Polio                           9.600000e+01 -2.10     3.76       0.43
## Total expenditure               1.723000e+01  0.62     1.15       0.05
## Diphtheria                      9.700000e+01 -2.07     3.55       0.44
## HIV/AIDS                        5.050000e+01  5.39    34.80       0.09
## GDP                             1.191711e+05  3.20    12.29     285.98
## Population                      1.293859e+09 15.90   297.09 1276079.80
## thinness 5-9 years              2.850000e+01  1.78     4.34       0.08
## Income composition of resources 9.500000e-01 -1.14     1.38       0.00
## Schooling                       2.070000e+01 -0.60     0.88       0.06

missing_table <- data.frame(
  Variable = names(Life_Expectancy_Data),
  Missing = colSums(is.na(Life_Expectancy_Data))
)
print(missing_table)

##                                                        Variable Missing
## Status                                                   Status       0
## Life expectancy                                 Life expectancy      10
## Adult Mortality                                 Adult Mortality      10
## Alcohol                                                 Alcohol     194
## percentage expenditure                   percentage expenditure       0
## Hepatitis B                                         Hepatitis B     553
## Measles                                                 Measles       0
## BMI                                                         BMI      34
## under-five deaths                             under-five deaths       0
## Polio                                                     Polio      19
## Total expenditure                             Total expenditure     226
## Diphtheria                                           Diphtheria      19
## HIV/AIDS                                               HIV/AIDS       0
## GDP                                                         GDP     448
## Population                                           Population     652
## thinness 5-9 years                           thinness 5-9 years      34
## Income composition of resources Income composition of resources     167
## Schooling                                             Schooling     163

num_vars <- names(Life_Expectancy_Data)[sapply(Life_Expectancy_Data, is.numeric)]
cat_vars <- names(Life_Expectancy_Data)[sapply(Life_Expectancy_Data, is.character)]

impute_numeric <- function(x) {
  if (all(is.na(x))) return(x)
  
  if (abs(skewness(x, na.rm = TRUE)) > 1) {
    x[is.na(x)] <- median(x, na.rm = TRUE)
  } else {
    x[is.na(x)] <- mean(x, na.rm = TRUE)
  }
  return(x)
}

impute_categorical <- function(x) {
  mode_value <- names(sort(table(x), decreasing = TRUE))[1]
  x[is.na(x)] <- mode_value
  return(x)
}

Life_Expectancy_Imputed <- Life_Expectancy_Data

# Imputasi numerik
for (col in num_vars) {
  Life_Expectancy_Imputed[[col]] <- impute_numeric(Life_Expectancy_Imputed[[col]])
}

# Imputasi kategorikal
for (col in cat_vars) {
  Life_Expectancy_Imputed[[col]] <- impute_categorical(Life_Expectancy_Imputed[[col]])
}

cat("Missing value sebelum imputasi:\n")

## Missing value sebelum imputasi:

print(colSums(is.na(Life_Expectancy_Data)))

##                          Status                 Life expectancy 
##                               0                              10 
##                 Adult Mortality                         Alcohol 
##                              10                             194 
##          percentage expenditure                     Hepatitis B 
##                               0                             553 
##                         Measles                             BMI 
##                               0                              34 
##               under-five deaths                           Polio 
##                               0                              19 
##               Total expenditure                      Diphtheria 
##                             226                              19 
##                        HIV/AIDS                             GDP 
##                               0                             448 
##                      Population              thinness 5-9 years 
##                             652                              34 
## Income composition of resources                       Schooling 
##                             167                             163

cat("\nMissing value sesudah imputasi:\n")

## 
## Missing value sesudah imputasi:

print(colSums(is.na(Life_Expectancy_Imputed)))

##                          Status                 Life expectancy 
##                               0                               0 
##                 Adult Mortality                         Alcohol 
##                               0                               0 
##          percentage expenditure                     Hepatitis B 
##                               0                               0 
##                         Measles                             BMI 
##                               0                               0 
##               under-five deaths                           Polio 
##                               0                               0 
##               Total expenditure                      Diphtheria 
##                               0                               0 
##                        HIV/AIDS                             GDP 
##                               0                               0 
##                      Population              thinness 5-9 years 
##                               0                               0 
## Income composition of resources                       Schooling 
##                               0                               0

cek_outlier <- function(x) {
  Q1 <- quantile(x, 0.25, na.rm = TRUE)
  Q3 <- quantile(x, 0.75, na.rm = TRUE)
  IQR_val <- Q3 - Q1
  lower <- Q1 - 1.5 * IQR_val
  upper <- Q3 + 1.5 * IQR_val
  sum(x < lower | x > upper, na.rm = TRUE)
}

num_vars <- names(Life_Expectancy_Imputed)[sapply(Life_Expectancy_Imputed, is.numeric)]

outlier_before <- data.frame(
  Variable = num_vars,
  Outlier_Before = sapply(Life_Expectancy_Imputed[num_vars], cek_outlier)
)

cat("=== OUTLIER SEBELUM PEMBERSIHAN ===\n")

## === OUTLIER SEBELUM PEMBERSIHAN ===

print(outlier_before)

##                                                        Variable Outlier_Before
## Life expectancy                                 Life expectancy             17
## Adult Mortality                                 Adult Mortality             86
## Alcohol                                                 Alcohol              3
## percentage expenditure                   percentage expenditure            389
## Hepatitis B                                         Hepatitis B            322
## Measles                                                 Measles            542
## BMI                                                         BMI              0
## under-five deaths                             under-five deaths            394
## Polio                                                     Polio            279
## Total expenditure                             Total expenditure             51
## Diphtheria                                           Diphtheria            298
## HIV/AIDS                                               HIV/AIDS            542
## GDP                                                         GDP            445
## Population                                           Population            452
## thinness 5-9 years                           thinness 5-9 years             99
## Income composition of resources Income composition of resources            130
## Schooling                                             Schooling             77

df_before_long <- Life_Expectancy_Imputed %>%
  select(all_of(num_vars)) %>%
  tidyr::pivot_longer(cols = everything(), names_to = "Variable", values_to = "Value")

ggplot(df_before_long, aes(x = "", y = Value)) +
  geom_boxplot(
    fill = "lightgreen",
    color = "darkgreen",
    outlier.color = "red",
    outlier.size = 1
  ) +
  facet_wrap(~ Variable, scales = "free", ncol = 4) +
  theme_minimal(base_size = 11) +
  theme(
    strip.text = element_text(face = "bold"),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  ) +
  labs(title = "Boxplot Outlier Sebelum Pembersihan", x = NULL, y = "Nilai")

handle_outlier <- function(x) {
  Q1 <- quantile(x, 0.25, na.rm = TRUE)
  Q3 <- quantile(x, 0.75, na.rm = TRUE)
  IQR_val <- Q3 - Q1
  lower <- Q1 - 1.5 * IQR_val
  upper <- Q3 + 1.5 * IQR_val
  
  x[x < lower] <- lower
  x[x > upper] <- upper
  return(x)
}

Life_Expectancy_Clean <- Life_Expectancy_Imputed
Life_Expectancy_Clean[num_vars] <- 
  lapply(Life_Expectancy_Imputed[num_vars], handle_outlier)

outlier_after <- data.frame(
  Variable = num_vars,
  Outlier_After = sapply(Life_Expectancy_Clean[num_vars], cek_outlier)
)

print(outlier_after)

##                                                        Variable Outlier_After
## Life expectancy                                 Life expectancy             0
## Adult Mortality                                 Adult Mortality             0
## Alcohol                                                 Alcohol             0
## percentage expenditure                   percentage expenditure             0
## Hepatitis B                                         Hepatitis B             0
## Measles                                                 Measles             0
## BMI                                                         BMI             0
## under-five deaths                             under-five deaths             0
## Polio                                                     Polio             0
## Total expenditure                             Total expenditure             0
## Diphtheria                                           Diphtheria             0
## HIV/AIDS                                               HIV/AIDS             0
## GDP                                                         GDP             0
## Population                                           Population             0
## thinness 5-9 years                           thinness 5-9 years             0
## Income composition of resources Income composition of resources             0
## Schooling                                             Schooling             0

df_after_long <- Life_Expectancy_Clean %>%
  select(all_of(num_vars)) %>%
  pivot_longer(cols = everything(), names_to = "Variable", values_to = "Value")

ggplot(df_after_long, aes(x = "", y = Value)) +
  geom_boxplot(
    fill = "lightgreen",
    color = "darkgreen",
    outlier.color = "red",
    outlier.size = 1
  ) +
  facet_wrap(~ Variable, scales = "free", ncol = 4) +
  theme_minimal(base_size = 11) +
  theme(
    strip.text = element_text(face = "bold"),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  ) +
  labs(title = "Boxplot Setelah Penanganan Outlier", x = NULL, y = "Nilai")

cat_cols <- sapply(Life_Expectancy_Clean, is.character)
cat_cols <- names(Life_Expectancy_Clean)[cat_cols]

cat("Kolom kategorikal yang ditemukan:\n")

## Kolom kategorikal yang ditemukan:

print(cat_cols)

## [1] "Status"

for (col in cat_cols) {
  unique_vals <- unique(Life_Expectancy_Clean[[col]])
  
  if (length(unique_vals) == 2) {
    Life_Expectancy_Clean[[col]] <- as.numeric(as.factor(Life_Expectancy_Clean[[col]])) - 1
    
  } else {
    cat(paste("Kolom", col, "memiliki lebih dari 2 kategori, perlu one-hot encoding.\n"))
  }
}

str(Life_Expectancy_Clean)

## tibble [2,938 × 18] (S3: tbl_df/tbl/data.frame)
##  $ Status                         : num [1:2938] 1 1 1 1 1 1 1 1 1 1 ...
##  $ Life expectancy                : num [1:2938] 65 59.9 59.9 59.5 59.2 58.8 58.6 58.1 57.5 57.3 ...
##  $ Adult Mortality                : num [1:2938] 263 271 268 272 275 279 281 287 295 295 ...
##  $ Alcohol                        : num [1:2938] 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.03 0.02 0.03 ...
##  $ percentage expenditure         : num [1:2938] 71.3 73.5 73.2 78.2 7.1 ...
##  $ Hepatitis B                    : num [1:2938] 65 62 64 67 68 66 63 64 63 64 ...
##  $ Measles                        : num [1:2938] 901 492 430 901 901 ...
##  $ BMI                            : num [1:2938] 19.1 18.6 18.1 17.6 17.2 16.7 16.2 15.7 15.2 14.7 ...
##  $ under-five deaths              : num [1:2938] 70 70 70 70 70 70 70 70 70 70 ...
##  $ Polio                          : num [1:2938] 49.5 58 62 67 68 66 63 64 63 58 ...
##  $ Total expenditure              : num [1:2938] 8.16 8.18 8.13 8.52 7.87 9.2 9.42 8.33 6.73 7.43 ...
##  $ Diphtheria                     : num [1:2938] 65 62 64 67 68 66 63 64 63 58 ...
##  $ HIV/AIDS                       : num [1:2938] 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 ...
##  $ GDP                            : num [1:2938] 584.3 612.7 631.7 670 63.5 ...
##  $ Population                     : num [1:2938] 10832552 327582 10832552 3696958 2978599 ...
##  $ thinness 5-9 years             : num [1:2938] 15.6 15.6 15.6 15.6 15.6 15.6 15.6 15.6 15.6 15.6 ...
##  $ Income composition of resources: num [1:2938] 0.479 0.476 0.47 0.463 0.454 0.448 0.434 0.433 0.415 0.405 ...
##  $ Schooling                      : num [1:2938] 10.1 10 9.9 9.8 9.5 9.2 8.9 8.7 8.4 8.1 ...

names(Life_Expectancy_Clean) <- make.names(names(Life_Expectancy_Clean))

target <- "Life.expectancy"

predictors <- c(
  "Adult.Mortality", "Alcohol", "percentage.expenditure", "Hepatitis.B",
  "Measles", "BMI", "under.five.deaths", "Polio", "Total.expenditure",
  "Diphtheria", "HIV.AIDS", "GDP", "Population", "thinness.5.9.years",
  "Income.composition.of.resources", "Schooling"
)

formula_str <- paste(target, "~", paste(predictors, collapse = " + "))
model_vif <- lm(as.formula(formula_str), data = Life_Expectancy_Clean)

vif_values <- vif(model_vif)

vif_df <- data.frame(
  Variabel = names(vif_values),
  VIF = as.numeric(vif_values)
)

cat("=== HASIL VIF (Vertikal) ===\n")

## === HASIL VIF (Vertikal) ===

print(vif_df)

##                           Variabel      VIF
## 1                  Adult.Mortality 1.713391
## 2                          Alcohol 1.591185
## 3           percentage.expenditure 3.811133
## 4                      Hepatitis.B 1.480498
## 5                          Measles 1.598401
## 6                              BMI 1.852373
## 7                under.five.deaths 2.389527
## 8                            Polio 3.760797
## 9                Total.expenditure 1.195959
## 10                      Diphtheria 3.923627
## 11                        HIV.AIDS 2.146507
## 12                             GDP 4.196137
## 13                      Population 1.200941
## 14              thinness.5.9.years 1.789578
## 15 Income.composition.of.resources 3.425626
## 16                       Schooling 4.161519

cat("\n=== INTERPRETASI OTOMATIS ===\n")

## 
## === INTERPRETASI OTOMATIS ===

for (i in 1:length(vif_values)) {
  if (vif_values[i] < 5) {
    cat(names(vif_values)[i], ": Tidak ada indikasi multikolinearitas (VIF < 5)\n")
  } else if (vif_values[i] >= 5 & vif_values[i] < 10) {
    cat(names(vif_values)[i], ": Ada indikasi multikolinearitas sedang (5 ≤ VIF < 10)\n")
  } else {
    cat(names(vif_values)[i], ": Multikolinearitas tinggi (VIF ≥ 10)\n")
  }
}

## Adult.Mortality : Tidak ada indikasi multikolinearitas (VIF < 5)
## Alcohol : Tidak ada indikasi multikolinearitas (VIF < 5)
## percentage.expenditure : Tidak ada indikasi multikolinearitas (VIF < 5)
## Hepatitis.B : Tidak ada indikasi multikolinearitas (VIF < 5)
## Measles : Tidak ada indikasi multikolinearitas (VIF < 5)
## BMI : Tidak ada indikasi multikolinearitas (VIF < 5)
## under.five.deaths : Tidak ada indikasi multikolinearitas (VIF < 5)
## Polio : Tidak ada indikasi multikolinearitas (VIF < 5)
## Total.expenditure : Tidak ada indikasi multikolinearitas (VIF < 5)
## Diphtheria : Tidak ada indikasi multikolinearitas (VIF < 5)
## HIV.AIDS : Tidak ada indikasi multikolinearitas (VIF < 5)
## GDP : Tidak ada indikasi multikolinearitas (VIF < 5)
## Population : Tidak ada indikasi multikolinearitas (VIF < 5)
## thinness.5.9.years : Tidak ada indikasi multikolinearitas (VIF < 5)
## Income.composition.of.resources : Tidak ada indikasi multikolinearitas (VIF < 5)
## Schooling : Tidak ada indikasi multikolinearitas (VIF < 5)

set.seed(123)
idx <- sample(1:nrow(Life_Expectancy_Clean), 0.8 * nrow(Life_Expectancy_Clean))

data_train <- Life_Expectancy_Clean[idx, ]
data_test  <- Life_Expectancy_Clean[-idx, ]

cat("Jumlah data training:", nrow(data_train), "\n")

## Jumlah data training: 2350

cat("Jumlah data testing :", nrow(data_test), "\n")

## Jumlah data testing : 588

target <- "Status"

data_train[[target]] <- as.factor(data_train[[target]])
data_test[[target]]  <- as.factor(data_test[[target]])

normalize <- function(x){ (x - min(x)) / (max(x) - min(x)) }

num_cols <- names(data_train)[sapply(data_train, is.numeric)]

data_train[num_cols] <- lapply(data_train[num_cols], normalize)
data_test[num_cols]  <- lapply(data_test[num_cols], normalize)

data_train %>%
  count(Status) %>%
  mutate(proporsi = round(n/sum(n)*100,2))

## # A tibble: 2 × 3
##   Status     n proporsi
##   <fct>  <int>    <dbl>
## 1 0        413     17.6
## 2 1       1937     82.4

set.seed(123)
data_train_bal <- ROSE(Status ~ ., data = data_train, seed = 123)$data

data_train_bal %>%
  count(Status) %>%
  mutate(proporsi = round(n/sum(n)*100,2))

##   Status    n proporsi
## 1      1 1186    50.47
## 2      0 1164    49.53

# ============================
# 1. SVM (Unbalanced) Training
# ============================

svm_imbal <- svm(
  Status ~ ., 
  data = data_train,
  kernel = "radial",
  cost = 1,
  gamma = 0.1,
  probability = TRUE
)

# ============================
# 2. Prediksi pada Data Training
# ============================

pred_svm_train <- predict(svm_imbal, newdata = data_train)
conf_svm_train <- confusionMatrix(pred_svm_train, data_train$Status)

print("=== SVM Training (Unbalanced) ===")

## [1] "=== SVM Training (Unbalanced) ==="

print(conf_svm_train)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0  374   44
##          1   39 1893
##                                           
##                Accuracy : 0.9647          
##                  95% CI : (0.9564, 0.9718)
##     No Information Rate : 0.8243          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.8787          
##                                           
##  Mcnemar's Test P-Value : 0.6606          
##                                           
##             Sensitivity : 0.9056          
##             Specificity : 0.9773          
##          Pos Pred Value : 0.8947          
##          Neg Pred Value : 0.9798          
##              Prevalence : 0.1757          
##          Detection Rate : 0.1591          
##    Detection Prevalence : 0.1779          
##       Balanced Accuracy : 0.9414          
##                                           
##        'Positive' Class : 0               
##

# ============================
# 3. Prediksi pada Data Testing
# ============================

pred_svm_test <- predict(svm_imbal, newdata = data_test)
conf_svm_test <- confusionMatrix(pred_svm_test, data_test$Status)

print("=== SVM Testing (Unbalanced) ===")

## [1] "=== SVM Testing (Unbalanced) ==="

print(conf_svm_test)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0  88  28
##          1  11 461
##                                           
##                Accuracy : 0.9337          
##                  95% CI : (0.9104, 0.9524)
##     No Information Rate : 0.8316          
##     P-Value [Acc > NIR] : 1.538e-13       
##                                           
##                   Kappa : 0.7783          
##                                           
##  Mcnemar's Test P-Value : 0.01041         
##                                           
##             Sensitivity : 0.8889          
##             Specificity : 0.9427          
##          Pos Pred Value : 0.7586          
##          Neg Pred Value : 0.9767          
##              Prevalence : 0.1684          
##          Detection Rate : 0.1497          
##    Detection Prevalence : 0.1973          
##       Balanced Accuracy : 0.9158          
##                                           
##        'Positive' Class : 0               
##

# ============================
# 4. Precision, Recall, F1 per kelas (Testing)
# ============================

cm0 <- confusionMatrix(pred_svm_test, data_test$Status, positive = "0")
precision_0 <- cm0$byClass["Pos Pred Value"]
recall_0    <- cm0$byClass["Sensitivity"]
f1_0        <- cm0$byClass["F1"]

cm1 <- confusionMatrix(pred_svm_test, data_test$Status, positive = "1")
precision_1 <- cm1$byClass["Pos Pred Value"]
recall_1    <- cm1$byClass["Sensitivity"]
f1_1        <- cm1$byClass["F1"]

hasil_per_kelas <- data.frame(
  Class     = c("0", "1"),
  Precision = c(precision_0, precision_1),
  Recall    = c(recall_0, recall_1),
  F1        = c(f1_0, f1_1)
)

print("=== Precision, Recall, F1 per Kelas (Testing) ===")

## [1] "=== Precision, Recall, F1 per Kelas (Testing) ==="

print(hasil_per_kelas)

##   Class Precision    Recall        F1
## 1     0 0.7586207 0.8888889 0.8186047
## 2     1 0.9766949 0.9427403 0.9594173

# ============================
# 5. Plot Confusion Matrix (Testing)
# ============================

cm_imbal <- conf_svm_test$table
cm_imbal_melt <- melt(cm_imbal)

ggplot(cm_imbal_melt, aes(Reference, Prediction, fill = value)) +
  geom_tile() +
  geom_text(aes(label = value), size = 6, color = "white") +
  scale_fill_gradient(low = "blue", high = "red") +
  ggtitle("Confusion Matrix – SVM Unbalanced") +
  theme_minimal(base_size = 14)

# ============================
# 6. ROC Curve & AUC (Testing)
# ============================

prob_imbal <- attr(predict(svm_imbal, newdata = data_test, probability = TRUE),
                   "probabilities")[, "1"]

roc_imbal <- roc(data_test$Status, prob_imbal)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

plot(roc_imbal, main = "ROC Curve – SVM Unbalanced", col = "darkgreen", lwd = 2)

auc_value <- auc(roc_imbal)
cat("\nAUC (SVM Unbalanced, Testing):", round(auc_value, 4), "\n")

## 
## AUC (SVM Unbalanced, Testing): 0.9669

library(e1071)
library(caret)
library(ggplot2)
library(reshape2)
library(pROC)

# ============================
# 1. SVM (Balanced) Training
# ============================

svm_bal <- svm(
  Status ~ ., 
  data = data_train_bal,   # data train sudah balance
  kernel = "radial",
  cost = 1,
  gamma = 0.1,
  probability = TRUE
)

# ============================
# 2. Prediksi pada Data Training
# ============================

pred_svm_train_bal <- predict(svm_bal, newdata = data_train_bal)
conf_svm_train_bal <- confusionMatrix(pred_svm_train_bal, data_train_bal$Status)

print("=== SVM Training (Balanced) ===")

## [1] "=== SVM Training (Balanced) ==="

print(conf_svm_train_bal)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    1    0
##          1 1135    4
##          0   51 1160
##                                           
##                Accuracy : 0.9766          
##                  95% CI : (0.9696, 0.9823)
##     No Information Rate : 0.5047          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9532          
##                                           
##  Mcnemar's Test P-Value : 5.552e-10       
##                                           
##             Sensitivity : 0.9570          
##             Specificity : 0.9966          
##          Pos Pred Value : 0.9965          
##          Neg Pred Value : 0.9579          
##              Prevalence : 0.5047          
##          Detection Rate : 0.4830          
##    Detection Prevalence : 0.4847          
##       Balanced Accuracy : 0.9768          
##                                           
##        'Positive' Class : 1               
##

# ============================
# 3. Prediksi pada Data Testing
# ============================

pred_svm_test_bal <- predict(svm_bal, newdata = data_test)
conf_svm_test_bal <- confusionMatrix(pred_svm_test_bal, data_test$Status)

## Warning in confusionMatrix.default(pred_svm_test_bal, data_test$Status): Levels
## are not in the same order for reference and data. Refactoring data to match.

print("=== SVM Testing (Balanced) ===")

## [1] "=== SVM Testing (Balanced) ==="

print(conf_svm_test_bal)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0  98  90
##          1   1 399
##                                           
##                Accuracy : 0.8452          
##                  95% CI : (0.8134, 0.8735)
##     No Information Rate : 0.8316          
##     P-Value [Acc > NIR] : 0.2053          
##                                           
##                   Kappa : 0.5932          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.9899          
##             Specificity : 0.8160          
##          Pos Pred Value : 0.5213          
##          Neg Pred Value : 0.9975          
##              Prevalence : 0.1684          
##          Detection Rate : 0.1667          
##    Detection Prevalence : 0.3197          
##       Balanced Accuracy : 0.9029          
##                                           
##        'Positive' Class : 0               
##

# ============================
# 4. Precision, Recall, F1 per kelas (Testing)
# ============================

cm0_bal <- confusionMatrix(pred_svm_test_bal, data_test$Status, positive = "0")

## Warning in confusionMatrix.default(pred_svm_test_bal, data_test$Status, :
## Levels are not in the same order for reference and data. Refactoring data to
## match.

precision_0_bal <- cm0_bal$byClass["Pos Pred Value"]
recall_0_bal    <- cm0_bal$byClass["Sensitivity"]
f1_0_bal        <- cm0_bal$byClass["F1"]

cm1_bal <- confusionMatrix(pred_svm_test_bal, data_test$Status, positive = "1")

## Warning in confusionMatrix.default(pred_svm_test_bal, data_test$Status, :
## Levels are not in the same order for reference and data. Refactoring data to
## match.

precision_1_bal <- cm1_bal$byClass["Pos Pred Value"]
recall_1_bal    <- cm1_bal$byClass["Sensitivity"]
f1_1_bal        <- cm1_bal$byClass["F1"]

hasil_per_kelas_bal <- data.frame(
  Class     = c("0", "1"),
  Precision = c(precision_0_bal, precision_1_bal),
  Recall    = c(recall_0_bal, recall_1_bal),
  F1        = c(f1_0_bal, f1_1_bal)
)

print("=== Precision, Recall, F1 per Kelas (Testing) ===")

## [1] "=== Precision, Recall, F1 per Kelas (Testing) ==="

print(hasil_per_kelas_bal)

##   Class Precision    Recall        F1
## 1     0 0.5212766 0.9898990 0.6829268
## 2     1 0.9975000 0.8159509 0.8976378

# ============================
# 5. Plot Confusion Matrix (Testing)
# ============================

cm_bal <- conf_svm_test_bal$table
cm_bal_melt <- melt(cm_bal)

ggplot(cm_bal_melt, aes(Reference, Prediction, fill = value)) +
  geom_tile() +
  geom_text(aes(label = value), size = 6, color = "white") +
  scale_fill_gradient(low = "blue", high = "red") +
  ggtitle("Confusion Matrix – SVM Balanced") +
  theme_minimal(base_size = 14)

# ============================
# 6. ROC Curve & AUC (Testing)
# ============================

prob_bal <- attr(predict(svm_bal, newdata = data_test, probability = TRUE),
                 "probabilities")[, "1"]

roc_bal <- roc(data_test$Status, prob_bal)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

plot(roc_bal, main = "ROC Curve – SVM Balanced", col = "darkblue", lwd = 2)

auc_value_bal <- auc(roc_bal)
cat("\nAUC (SVM Balanced, Testing):", round(auc_value_bal, 4), "\n")

## 
## AUC (SVM Balanced, Testing): 0.9658

# ============================
# 1. Random Forest (Unbalanced Data) Training
# ============================

rf_imbal <- randomForest(
  Status ~ ., 
  data = data_train,      
  ntree = 500,
  mtry = 4,
  importance = TRUE
)

# ============================
# 2. Prediksi pada Data Training
# ============================

rf_pred_train <- predict(rf_imbal, newdata = data_train)
conf_rf_train <- confusionMatrix(rf_pred_train, data_train$Status)

print("=== Random Forest Training (Unbalanced) ===")

## [1] "=== Random Forest Training (Unbalanced) ==="

print(conf_rf_train)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0  413    0
##          1    0 1937
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9984, 1)
##     No Information Rate : 0.8243     
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
##                                      
##             Sensitivity : 1.0000     
##             Specificity : 1.0000     
##          Pos Pred Value : 1.0000     
##          Neg Pred Value : 1.0000     
##              Prevalence : 0.1757     
##          Detection Rate : 0.1757     
##    Detection Prevalence : 0.1757     
##       Balanced Accuracy : 1.0000     
##                                      
##        'Positive' Class : 0          
##

cat("\nAkurasi RF Training (Unbalanced):", round(conf_rf_train$overall["Accuracy"], 4), "\n\n")

## 
## Akurasi RF Training (Unbalanced): 1

# ============================
# 3. Prediksi pada Data Testing
# ============================

rf_pred_test <- predict(rf_imbal, newdata = data_test)
conf_rf_test <- confusionMatrix(rf_pred_test, data_test$Status)

print("=== Random Forest Testing (Unbalanced) ===")

## [1] "=== Random Forest Testing (Unbalanced) ==="

print(conf_rf_test)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0  94  16
##          1   5 473
##                                           
##                Accuracy : 0.9643          
##                  95% CI : (0.9459, 0.9778)
##     No Information Rate : 0.8316          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.8779          
##                                           
##  Mcnemar's Test P-Value : 0.0291          
##                                           
##             Sensitivity : 0.9495          
##             Specificity : 0.9673          
##          Pos Pred Value : 0.8545          
##          Neg Pred Value : 0.9895          
##              Prevalence : 0.1684          
##          Detection Rate : 0.1599          
##    Detection Prevalence : 0.1871          
##       Balanced Accuracy : 0.9584          
##                                           
##        'Positive' Class : 0               
##

cat("\nAkurasi RF Testing (Unbalanced):", round(conf_rf_test$overall["Accuracy"], 4), "\n")

## 
## Akurasi RF Testing (Unbalanced): 0.9643

# ============================
# 4. plot Variable Importance
# ============================
# Plot Variable Importance
varImpPlot(
  rf_imbal,
  main = "Random Forest Variable Importance (unbalanced Data)",
  pch = 19
)

rf_varimp <- importance(rf_imbal)
rf_varimp_df <- data.frame(
  Variable = rownames(rf_varimp),
  Importance = rf_varimp[, 1]  
)

# Urutkan dari paling penting
rf_varimp_df <- rf_varimp_df[order(rf_varimp_df$Importance, decreasing = TRUE), ]

# Plot barchart
ggplot(rf_varimp_df, aes(x = reorder(Variable, Importance), y = Importance)) +
  geom_bar(stat = "identity", fill = "salmon") +
  coord_flip() +  # horizontal
  labs(title = "Random Forest Variable Importance (Unbalanced Data)",
       x = "Variable",
       y = "Importance") +
  theme_minimal(base_size = 14)

# ============================
# 1. Random Forest (Balanced Data) Training
# ============================

rf_bal <- randomForest(
  Status ~ ., 
  data = data_train_bal,  
  ntree = 500,
  mtry = 4,
  importance = TRUE
)

# ============================
# 2. Prediksi pada Data Training
# ============================

rf_pred_train_bal <- predict(rf_bal, newdata = data_train_bal)
conf_rf_train_bal <- confusionMatrix(rf_pred_train_bal, data_train_bal$Status)

print("=== Random Forest Training (Balanced) ===")

## [1] "=== Random Forest Training (Balanced) ==="

print(conf_rf_train_bal)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    1    0
##          1 1186    0
##          0    0 1164
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9984, 1)
##     No Information Rate : 0.5047     
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
##                                      
##             Sensitivity : 1.0000     
##             Specificity : 1.0000     
##          Pos Pred Value : 1.0000     
##          Neg Pred Value : 1.0000     
##              Prevalence : 0.5047     
##          Detection Rate : 0.5047     
##    Detection Prevalence : 0.5047     
##       Balanced Accuracy : 1.0000     
##                                      
##        'Positive' Class : 1          
##

cat("\nAkurasi RF Training (Balanced):", round(conf_rf_train_bal$overall["Accuracy"], 4), "\n\n")

## 
## Akurasi RF Training (Balanced): 1

# ============================
# 3. Prediksi pada Data Testing
# ============================

rf_pred_test_bal <- predict(rf_bal, newdata = data_test)
conf_rf_test_bal <- confusionMatrix(rf_pred_test_bal, data_test$Status)

## Warning in confusionMatrix.default(rf_pred_test_bal, data_test$Status): Levels
## are not in the same order for reference and data. Refactoring data to match.

print("=== Random Forest Testing (Balanced) ===")

## [1] "=== Random Forest Testing (Balanced) ==="

print(conf_rf_test_bal)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0  99 133
##          1   0 356
##                                          
##                Accuracy : 0.7738         
##                  95% CI : (0.7378, 0.807)
##     No Information Rate : 0.8316         
##     P-Value [Acc > NIR] : 0.9999         
##                                          
##                   Kappa : 0.4741         
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 1.0000         
##             Specificity : 0.7280         
##          Pos Pred Value : 0.4267         
##          Neg Pred Value : 1.0000         
##              Prevalence : 0.1684         
##          Detection Rate : 0.1684         
##    Detection Prevalence : 0.3946         
##       Balanced Accuracy : 0.8640         
##                                          
##        'Positive' Class : 0              
##

cat("\nAkurasi RF Testing (Balanced):", round(conf_rf_test_bal$overall["Accuracy"], 4), "\n")

## 
## Akurasi RF Testing (Balanced): 0.7738

# ============================
# 4. Plot Variable Importance
# ============================

# Plot Variable Importance
varImpPlot(
  rf_imbal,
  main = "Random Forest Variable Importance (Balanced Data)",
  pch = 19
)

# Ambil variable importance
rf_varimp_bal <- importance(rf_bal)
rf_varimp_df_bal <- data.frame(
  Variable = rownames(rf_varimp_bal),
  Importance = rf_varimp_bal[, 1]  # MeanDecreaseGini
)

# Urutkan dari paling penting
rf_varimp_df_bal <- rf_varimp_df_bal[order(rf_varimp_df_bal$Importance, decreasing = TRUE), ]

# Plot barchart
ggplot(rf_varimp_df_bal, aes(x = reorder(Variable, Importance), y = Importance)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +
  labs(title = "Random Forest Variable Importance (Balanced Data)",
       x = "Variable",
       y = "Importance") +
  theme_minimal(base_size = 14)

# SVM
svm_acc_unbal <- conf_svm_test$overall["Accuracy"]
svm_acc_bal   <- conf_svm_test_bal$overall["Accuracy"]

# Random Forest
rf_acc_unbal  <- conf_rf_test$overall["Accuracy"]
rf_acc_bal    <- conf_rf_test_bal$overall["Accuracy"]

accuracy_df_all <- data.frame(
  model = c("SVM_Unbalanced", "SVM_Balanced", 
            "RandomForest_Unbalanced", "RandomForest_Balanced"),
  accuracy = c(as.numeric(svm_acc_unbal), as.numeric(svm_acc_bal),
               as.numeric(rf_acc_unbal), as.numeric(rf_acc_bal))
)

accuracy_df_all <- accuracy_df_all %>% arrange(desc(accuracy))

print(accuracy_df_all)

##                     model  accuracy
## 1 RandomForest_Unbalanced 0.9642857
## 2          SVM_Unbalanced 0.9336735
## 3            SVM_Balanced 0.8452381
## 4   RandomForest_Balanced 0.7738095

Tugas Besar Datmin B

Kelompok 8

2025-11-19