### Library
library(ggplot2) # Visualisasi 
library(e1071) # Skewness, kurtosis, SVM 
library(dplyr) # korelasi
library(reshape2) # heatmap 
library(smotefamily) # oversampling
library(randomForest) # random forest

Data Understanding

Struktur Data

df <- read.csv("D:\\Kuliah D\\Semester 6\\PKL\\Laporan\\Kredit_Kolektibilitas.csv")
str(df)
## 'data.frame':    2388 obs. of  5 variables:
##  $ PLAFON: chr  "160.000.000" "96.000.000" "120.000.000" "230.000.000" ...
##  $ RATE  : chr  "11" "5,5" "19" "15" ...
##  $ SEGMEN: chr  "RITEL" "RITEL" "RITEL" "RITEL" ...
##  $ GENDER: chr  "L" "P" "L" "L" ...
##  $ RISIKO: num  0 0 0 0 0 0 0 0 0 0 ...
### Menyesuaikan Jenis Data 
df$PLAFON <- as.numeric(gsub("\\.", "", df$PLAFON))
df$RATE <- as.numeric(gsub(",", ".", gsub("\\.", "", df$RATE)))
head(df)
##    PLAFON  RATE SEGMEN GENDER RISIKO
## 1 1.6e+08 11.00  RITEL      L      0
## 2 9.6e+07  5.50  RITEL      P      0
## 3 1.2e+08 19.00  RITEL      L      0
## 4 2.3e+08 15.00  RITEL      L      0
## 5 8.0e+07 15.25  RITEL      L      0
## 6 1.5e+08 15.25  RITEL      P      0

Statistik Deskriptif

### Numerik
summary(df)
##      PLAFON               RATE          SEGMEN             GENDER         
##  Min.   :1.400e+06   Min.   : 2.63   Length:2388        Length:2388       
##  1st Qu.:1.000e+08   1st Qu.: 8.50   Class :character   Class :character  
##  Median :2.000e+08   Median :10.00   Mode  :character   Mode  :character  
##  Mean   :3.305e+08   Mean   :10.07                                        
##  3rd Qu.:3.305e+08   3rd Qu.:11.50                                        
##  Max.   :7.499e+09   Max.   :19.00                                        
##      RISIKO       
##  Min.   :0.00000  
##  1st Qu.:0.00000  
##  Median :0.00000  
##  Mean   :0.08962  
##  3rd Qu.:0.00000  
##  Max.   :4.00000
### Disttribusi dan nilai unik kategorik

unik_gender <- table(df$GENDER)
unik_segmen <- table(df$SEGMEN)
unik_risiko <- table(df$RISIKO)
# Distribusi variabel GENDER
unik_gender
## 
##    L    P 
## 1899  489
# Distribusi variable SEGMEN
unik_segmen
## 
## RITEL 
##  2388
# Distribusi variabel RISIKO
unik_risiko
## 
##    0    1    2    3    4 
## 2280   65    9    5   29

Visualisasi

### PLAFON
ggplot(df, aes(x = PLAFON)) +
  geom_histogram(binwidth = (max(df$PLAFON) - min(df$PLAFON))/30, fill = "steelblue", color = "black") +
  labs(title = "Distribusi Plafon", x = "Plafon", y = "Frekuensi") +
  theme_minimal()

### RATE
ggplot(df, aes(x = RATE)) +
  geom_histogram(binwidth = (max(df$RATE) - min(df$RATE))/30, fill = "darkgreen", color = "black") +
  labs(title = "Distribusi Rate", x = "Rate", y = "Frekuensi") +
  theme_minimal()

### SEGMEN
ggplot(df, aes(x = SEGMEN)) +
  geom_bar(fill = "yellow", color = "black") +
  labs(title = "Distribusi Segmen", x = "Segmenr", y = "Jumlah") +
  theme_minimal()

### GENDER
ggplot(df, aes(x = GENDER)) +
  geom_bar(fill = "purple", color = "black") +
  labs(title = "Distribusi Gender", x = "Gender", y = "Jumlah") +
  theme_minimal()

### RISIKO

ggplot(df, aes(x = factor(RISIKO))) +
  geom_bar(fill = "darkred", color = "black") +
  labs(title = "Distribusi Risiko", x = "Risiko (0 = Lancar, 1 = Tidak Lancar)", y = "Jumlah") +
  theme_minimal()

## Skewness dan kurtosis

# Menghitung skewness dan kurtosis PLAFON
skewness_plafon <- skewness(df$PLAFON)
kurtosis_plafon <- kurtosis(df$PLAFON)

# Menghitung skewness dan kurtosis RATE
skewness_rate <- skewness(df$RATE)
kurtosis_rate <- kurtosis(df$RATE)

# Skewnwss PLAFON
print(skewness_plafon)
## [1] 5.220934
# Kurtosis PLAFON
print(kurtosis_plafon)
## [1] 34.82443
# Skewness RATE
print(skewness_rate)
## [1] -0.03587709
# Kurtosis RATE
print(kurtosis_rate)
## [1] 0.6753836

Korelasi Antar Variabel

# Konversi GENDER dan RISIKO ke numerik
df_corr <- df %>%
  mutate(GENDER_numeric = ifelse(GENDER == "L", 0, 1)) %>%
  mutate(SEGMEN_numeric = ifelse(SEGMEN == "RITEL", 0, 1)) %>%
  select(PLAFON, RATE, SEGMEN_numeric, GENDER_numeric, RISIKO)

# Menghitung correlation matrix
correlation_matrix <- cor(df_corr)

# Menyiapkan matrix untuk ggplot2
melted_corr <- melt(correlation_matrix)

# Heatmap
ggplot(melted_corr, aes(Var1, Var2, fill = value)) +
  geom_tile(color = "white") +
  scale_fill_gradient2(low = "blue", high = "red", mid = "white",
                       midpoint = 0, limit = c(-1,1), space = "Lab",
                       name="Correlation") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1,
                                   size = 10, hjust = 1)) +
  coord_fixed() +
  labs(title = "Correlation Matrix Heatmap",
       x = "", y = "") +
  geom_text(aes(Var1, Var2, label = round(value, 2)), color = "black", size = 3)

## Mengecek Missing Values

missing_values <- colSums(is.na(df))
missing_df <- data.frame(
  Variable = names(missing_values),
  Missing_Count = as.numeric(missing_values)
)
missing_df
##   Variable Missing_Count
## 1   PLAFON             0
## 2     RATE             0
## 3   SEGMEN             0
## 4   GENDER             0
## 5   RISIKO             0

Mengecek Imbalance Data

# Distribusi variabel RISIKO
print(table(df$RISIKO))
## 
##    0    1    2    3    4 
## 2280   65    9    5   29
# Proporsi kelas RISIKO
print(prop.table(table(df$RISIKO)))
## 
##           0           1           2           3           4 
## 0.954773869 0.027219430 0.003768844 0.002093802 0.012144054

Preprocessing

Feature Selection

# Menghapus SEGMEN
df <- subset(df, select = -c(SEGMEN))
summary(df)
##      PLAFON               RATE          GENDER              RISIKO       
##  Min.   :1.400e+06   Min.   : 2.63   Length:2388        Min.   :0.00000  
##  1st Qu.:1.000e+08   1st Qu.: 8.50   Class :character   1st Qu.:0.00000  
##  Median :2.000e+08   Median :10.00   Mode  :character   Median :0.00000  
##  Mean   :3.305e+08   Mean   :10.07                      Mean   :0.08962  
##  3rd Qu.:3.305e+08   3rd Qu.:11.50                      3rd Qu.:0.00000  
##  Max.   :7.499e+09   Max.   :19.00                      Max.   :4.00000

Feature Enginering

# Menggabungkan skala RISIKO selain 0 (tidak risiko/lancar) menjadi nilai 1 (ada risiko/tidak lancar)
df$RISIKO <- ifelse(df$RISIKO > 0, 1, 0)
# Frekuensi variabel RISIKo setelah digabungkan
table(df$RISIKO)
## 
##    0    1 
## 2280  108
# Mengecek Imbalance data kembali
## Distribusi variabel RISIKO
print(table(df$RISIKO))
## 
##    0    1 
## 2280  108
## Proporsi variabel RISIKO
print(prop.table(table(df$RISIKO)))
## 
##          0          1 
## 0.95477387 0.04522613

Encoding

# Binerisasi feature GENDER, dengan 0 untuk L dan 1 untuk P

# Membersihkan kategorik dari tanda baca lain
df$GENDER <- trimws(df$GENDER)
df$GENDER <- toupper(df$GENDER)

# Binarisasi
df$GENDER <- ifelse(df$GENDER == "L", 0,
                   ifelse(df$GENDER == "P", 1, NA))

# Cek hasil
## Frekuensi variabel GENDER setelah binarisasi
table(df$GENDER)
## 
##    0    1 
## 1899  489
## Statistik DIskriptif setelah binarisasi GENDER
summary(df)
##      PLAFON               RATE           GENDER           RISIKO       
##  Min.   :1.400e+06   Min.   : 2.63   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:1.000e+08   1st Qu.: 8.50   1st Qu.:0.0000   1st Qu.:0.00000  
##  Median :2.000e+08   Median :10.00   Median :0.0000   Median :0.00000  
##  Mean   :3.305e+08   Mean   :10.07   Mean   :0.2048   Mean   :0.04523  
##  3rd Qu.:3.305e+08   3rd Qu.:11.50   3rd Qu.:0.0000   3rd Qu.:0.00000  
##  Max.   :7.499e+09   Max.   :19.00   Max.   :1.0000   Max.   :1.00000

Oversampling

# Duplikat dfs sebagai back up
df_copy <- df

# Pastikan target factor
df_copy$RISIKO <- as.factor(df_copy$RISIKO)

# Pisahkan GENDER
gender_asli <- df_copy$GENDER

# Ambil hanya fitur TANPA GENDER
X <- df_copy[, !(names(df_copy) %in% c("RISIKO", "GENDER"))]

# Pastikan semua numerik
X <- data.frame(lapply(X, function(x) as.numeric(as.factor(x))))

target <- df_copy$RISIKO

# Hitung rasio 50:50

tab <- table(target)
minor <- min(tab)
major <- max(tab)

target_ratio <- 0.5
desired_minor <- (target_ratio * major) / (1 - target_ratio)
dup_size <- ceiling((desired_minor - minor) / minor)

# SMOTE TANPA GENDER
set.seed(123)

smote_result <- SMOTE(X, target, K = 5, dup_size = dup_size)

df_clean <- smote_result$data
colnames(df_clean)[ncol(df_clean)] <- "RISIKO"

# Tambahkan kembali GENDER
n_asli <- nrow(df_copy)
n_baru <- nrow(df_clean)

# Ambil GENDER asli
gender_clean <- as.character(gender_asli)

# Tambahkan GENDER untuk data sintetis (sampling berdasarkan distribusi asli)
set.seed(123)
gender_sintetik <- sample(gender_clean, n_baru - n_asli, replace = TRUE)

# Gabungkan
df_clean$GENDER <- c(gender_clean, gender_sintetik)

# Jadikan factor kembali
df_clean$GENDER <- as.factor(df_clean$GENDER)

# Cek hasil oversampling
## Sebelum oversampling
### Distribusi RISIKO
print(table(df$RISIKO))
## 
##    0    1 
## 2280  108
### Distribusi GENDER
print(table(df$GENDER))
## 
##    0    1 
## 1899  489
cat("Sesudah:\n")
## Sesudah:
## Setelah oversampling
### Distribusi RISIKO
print(table(df_clean$RISIKO))
## 
##    0    1 
## 2280 2376
### Distribusi GENDER
print(table(df_clean$GENDER))
## 
##    0    1 
## 3697  959
# Statistik Deskriptif setelah oversampling
summary(df_clean)
##      PLAFON           RATE          RISIKO          GENDER  
##  Min.   :  1.0   Min.   : 1.00   Length:4656        0:3697  
##  1st Qu.:116.0   1st Qu.:24.43   Class :character   1: 959  
##  Median :180.0   Median :48.91   Mode  :character           
##  Mean   :189.7   Mean   :42.76                              
##  3rd Qu.:263.0   3rd Qu.:58.00                              
##  Max.   :399.0   Max.   :77.00
# Struktur data setelah oversampling
str(df_clean)
## 'data.frame':    4656 obs. of  4 variables:
##  $ PLAFON: num  85 332 180 165 180 156 116 308 382 116 ...
##  $ RATE  : num  52 35 22 65 25 52 21 20 25 35 ...
##  $ RISIKO: chr  "1" "1" "1" "1" ...
##  $ GENDER: Factor w/ 2 levels "0","1": 1 2 1 1 1 2 2 2 1 1 ...

Cek ulang oversampling

# Distribusi variabel RISIKO
print(table(df_clean$RISIKO))
## 
##    0    1 
## 2280 2376
# Proporsi variabel RISIKO
print(prop.table(table(df_clean$RISIKO)))
## 
##         0         1 
## 0.4896907 0.5103093

Data Splitting

# Randomisasi sample
set.seed(123)

# Jumlah data
n <- nrow(df_clean) # untuk data balance
n_imb <- nrow(df) # untuk data imbalance

train_index <- sample(1:n, size = 0.8 * n) # untuk data balance
train_index_imb <- sample(1:n_imb, size = 0.8 * n_imb) # untuk data imbalance

# Split data untuk modeling
train_data <- df_clean[train_index, ] # untuk data balance
test_data  <- df_clean[-train_index, ] # untuk data balance
# Split data untuk perbandingan penggunaan SMOTE
train_data_imb <- df[train_index_imb, ] # untuk data imbalance
test_data_imb  <- df[-train_index_imb, ] # untuk data imbalance

# Cek jumlah
cat("Jumlah data training:", nrow(train_data), "\n")
## Jumlah data training: 3724
cat("Jumlah data testing:", nrow(test_data), "\n")
## Jumlah data testing: 932
cat("Jumlah data training untuk data imbalance:", nrow(train_data_imb), "\n")
## Jumlah data training untuk data imbalance: 1910
cat("Jumlah data testing untuk data imbalance:", nrow(test_data_imb), "\n")
## Jumlah data testing untuk data imbalance: 478

Model Training

Regresi Logistik

Tanpa SMOTE

# Pastikan target berupa factor
train_data_imb$RISIKO <- as.factor(train_data_imb$RISIKO)
test_data_imb$RISIKO <- as.factor(test_data_imb$RISIKO)

# Training model regresi logistik
model_logistik_imb <- glm(RISIKO ~ .,
                      data = train_data_imb,
                      family = binomial)

## Ringkasan model
print(summary(model_logistik_imb))
## 
## Call:
## glm(formula = RISIKO ~ ., family = binomial, data = train_data_imb)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -5.370e+00  5.479e-01  -9.801  < 2e-16 ***
## PLAFON       4.945e-10  1.102e-10   4.487 7.23e-06 ***
## RATE         1.879e-01  4.844e-02   3.880 0.000105 ***
## GENDER       3.142e-01  2.581e-01   1.217 0.223502    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 676.72  on 1909  degrees of freedom
## Residual deviance: 644.54  on 1906  degrees of freedom
## AIC: 652.54
## 
## Number of Fisher Scoring iterations: 6
## Probabilitas ketika RISIKO 1
model_logistik_prob_imb <- predict(model_logistik_imb, type = "response")
## 10 prrobabilitaas untuk kelas 1
print(model_logistik_prob_imb[1:10])
##        559       2363       1801       2101       2047        609       1032 
## 0.02886821 0.05362175 0.02617056 0.03622874 0.02198560 0.03589598 0.05289925 
##       1493       2083       2076 
## 0.04803894 0.02363840 0.03254422
## dummy dari R
print(contrasts(train_data_imb$RISIKO))
##   1
## 0 0
## 1 1

Dengan SMOTE

# Pastikan target berupa factor
train_data$RISIKO <- as.factor(train_data$RISIKO)
test_data$RISIKO <- as.factor(test_data$RISIKO)

# Training model regresi logistik
model_logistik <- glm(RISIKO ~ .,
                      data = train_data,
                      family = binomial)

## Ringkasan model
summary(model_logistik)
## 
## Call:
## glm(formula = RISIKO ~ ., family = binomial, data = train_data)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -2.3692018  0.1515064 -15.638   <2e-16 ***
## PLAFON       0.0044790  0.0003841  11.662   <2e-16 ***
## RATE         0.0365029  0.0022087  16.527   <2e-16 ***
## GENDER1     -0.0250951  0.0844829  -0.297    0.766    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 5161.3  on 3723  degrees of freedom
## Residual deviance: 4842.3  on 3720  degrees of freedom
## AIC: 4850.3
## 
## Number of Fisher Scoring iterations: 4
## Probabilitas ketika nasabah memiliki risiko
model_logistik_prob <- predict(model_logistik, type = "response")
## 10 oribabilitas ketika RISIKO 1
model_logistik_prob[1:10]
##      2463      2511      2227       526      4291      2986      1842      1142 
## 0.6930532 0.6019380 0.5881571 0.5942299 0.3794544 0.6126196 0.5313928 0.5447943 
##      3371      3446 
## 0.4833571 0.3758741
## Dummy dari E
contrasts(train_data$RISIKO)
##   1
## 0 0
## 1 1

SVM

Tanpa SMOTE

# Training SVM dengan Tuning parameter
tune_svm_imb <- tune(
  svm,
  RISIKO ~ .,
  data = train_data_imb,
  kernel = "radial",
  ranges = list(
    cost = c(0.1, 1, 10, 100, 1000),
    gamma = c(0.5, 1, 2, 3, 4)
  )
)

# Hasil asil tuning
print(summary(tune_svm_imb))
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost gamma
##   0.1   0.5
## 
## - best performance: 0.04293194 
## 
## - Detailed performance results:
##     cost gamma      error dispersion
## 1  1e-01   0.5 0.04293194 0.01651959
## 2  1e+00   0.5 0.04293194 0.01651959
## 3  1e+01   0.5 0.04450262 0.01506342
## 4  1e+02   0.5 0.04607330 0.01391791
## 5  1e+03   0.5 0.04816754 0.01557048
## 6  1e-01   1.0 0.04293194 0.01651959
## 7  1e+00   1.0 0.04450262 0.01585157
## 8  1e+01   1.0 0.04607330 0.01391791
## 9  1e+02   1.0 0.04816754 0.01413505
## 10 1e+03   1.0 0.05078534 0.01561930
## 11 1e-01   2.0 0.04293194 0.01651959
## 12 1e+00   2.0 0.04397906 0.01564852
## 13 1e+01   2.0 0.04712042 0.01351827
## 14 1e+02   2.0 0.04764398 0.01490078
## 15 1e+03   2.0 0.04973822 0.01696529
## 16 1e-01   3.0 0.04293194 0.01651959
## 17 1e+00   3.0 0.04345550 0.01656562
## 18 1e+01   3.0 0.04712042 0.01351827
## 19 1e+02   3.0 0.04816754 0.01595690
## 20 1e+03   3.0 0.05026178 0.01850242
## 21 1e-01   4.0 0.04293194 0.01651959
## 22 1e+00   4.0 0.04345550 0.01656562
## 23 1e+01   4.0 0.04659686 0.01448622
## 24 1e+02   4.0 0.04921466 0.01603307
## 25 1e+03   4.0 0.05183246 0.01918927
# Model terbaik
model_svm_tuned_imb <- tune_svm_imb$best.model
model_svm_tuned_imb
## 
## Call:
## best.tune(METHOD = svm, train.x = RISIKO ~ ., data = train_data_imb, 
##     ranges = list(cost = c(0.1, 1, 10, 100, 1000), gamma = c(0.5, 
##         1, 2, 3, 4)), kernel = "radial")
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  0.1 
## 
## Number of Support Vectors:  221
# Parameter terbaik
tune_svm_imb$best.parameters
##   cost gamma
## 1  0.1   0.5

Dengan SMOTE

# Training SVM dengan Tuning parameter
tune_svm <- tune(
  svm,
  RISIKO ~ .,
  data = train_data,
  kernel = "radial",
  ranges = list(
    cost = c(0.1, 1, 10, 100, 1000),
    gamma = c(0.5, 1, 2, 3, 4)
  )
)

# Hasil tuning
summary(tune_svm)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost gamma
##  1000     4
## 
## - best performance: 0.2175077 
## 
## - Detailed performance results:
##     cost gamma     error dispersion
## 1  1e-01   0.5 0.3321716 0.02093097
## 2  1e+00   0.5 0.3112305 0.01793032
## 3  1e+01   0.5 0.2854536 0.01678488
## 4  1e+02   0.5 0.2723053 0.02301237
## 5  1e+03   0.5 0.2696172 0.02196933
## 6  1e-01   1.0 0.3174039 0.02152166
## 7  1e+00   1.0 0.2849152 0.01747435
## 8  1e+01   1.0 0.2671985 0.02280229
## 9  1e+02   1.0 0.2604832 0.02606386
## 10 1e+03   1.0 0.2615555 0.02128474
## 11 1e-01   2.0 0.3018320 0.01336495
## 12 1e+00   2.0 0.2671993 0.02644715
## 13 1e+01   2.0 0.2586007 0.02331320
## 14 1e+02   2.0 0.2545706 0.02803797
## 15 1e+03   2.0 0.2379205 0.02667281
## 16 1e-01   3.0 0.2881389 0.01798074
## 17 1e+00   3.0 0.2602151 0.02665716
## 18 1e+01   3.0 0.2545742 0.02764483
## 19 1e+02   3.0 0.2379205 0.02641124
## 20 1e+03   3.0 0.2290589 0.02855188
## 21 1e-01   4.0 0.2825045 0.01929022
## 22 1e+00   4.0 0.2604846 0.02558033
## 23 1e+01   4.0 0.2449054 0.02671872
## 24 1e+02   4.0 0.2293292 0.02672377
## 25 1e+03   4.0 0.2175077 0.02888289
# Model terbaik
model_svm_tuned <- tune_svm$best.model
model_svm_tuned
## 
## Call:
## best.tune(METHOD = svm, train.x = RISIKO ~ ., data = train_data, 
##     ranges = list(cost = c(0.1, 1, 10, 100, 1000), gamma = c(0.5, 
##         1, 2, 3, 4)), kernel = "radial")
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  1000 
## 
## Number of Support Vectors:  1752
# Parameter terbaik
tune_svm$best.parameters
##    cost gamma
## 25 1000     4

Random Forest

Tanpa SMOTE

# Set seed
set.seed(123)

# Training Random Forest
model_rf_imb <- randomForest(
  RISIKO ~ .,
  data = train_data_imb,
  ntree = 100,
  mtry = floor(sqrt(ncol(train_data_imb) - 1)),
  importance = TRUE
)

# Ringkasan model
print(model_rf_imb)
## 
## Call:
##  randomForest(formula = RISIKO ~ ., data = train_data_imb, ntree = 100,      mtry = floor(sqrt(ncol(train_data_imb) - 1)), importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 100
## No. of variables tried at each split: 1
## 
##         OOB estimate of  error rate: 4.29%
## Confusion matrix:
##      0 1 class.error
## 0 1828 0           0
## 1   82 0           1
# Feature Importance
importance(model_rf_imb)
##                0         1 MeanDecreaseAccuracy MeanDecreaseGini
## PLAFON  3.200588  7.160980             4.604895        16.793679
## RATE    2.593600  4.230859             3.756946        10.111842
## GENDER -2.321892 -1.829782            -2.742265         1.101632

Dengan SMOTE

# Set seed
set.seed(123)

# Training Random Forest
model_rf <- randomForest(
  RISIKO ~ .,
  data = train_data,
  ntree = 100,
  mtry = floor(sqrt(ncol(train_data) - 1)),
  importance = TRUE
)

# Ringkasan model
model_rf
## 
## Call:
##  randomForest(formula = RISIKO ~ ., data = train_data, ntree = 100,      mtry = floor(sqrt(ncol(train_data) - 1)), importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 100
## No. of variables tried at each split: 1
## 
##         OOB estimate of  error rate: 26.91%
## Confusion matrix:
##      0    1 class.error
## 0 1017  811   0.4436543
## 1  191 1705   0.1007384
# Feature Importance
importance(model_rf)
##                 0          1 MeanDecreaseAccuracy MeanDecreaseGini
## PLAFON 14.0106451 14.1746590           15.5410920       227.612783
## RATE   12.8995680 15.6783026           16.8229826       249.923167
## GENDER -0.7413821  0.1400874           -0.3762267         2.634378

Model Testing

Regresi Logistik

prob_pred_imb <- predict(
  model_logistik_imb,
  newdata = test_data_imb,
  type = "response"
)
contrasts(test_data_imb$RISIKO)
##   1
## 0 0
## 1 1
# prbabilita prediksi untuk nasabah memiliki risiko
head(prob_pred_imb)
##          4         11         12         14         19         22 
## 0.08035400 0.07718657 0.08165054 0.05793871 0.07908469 0.05478284
# prediksi RISIKO nasabah
prediksi_kelas_imb <- ifelse(prob_pred_imb > 0.5, 1, 0)
head(prediksi_kelas_imb)
##  4 11 12 14 19 22 
##  0  0  0  0  0  0
# confusion matrix balance data
table(Prediksi = prediksi_kelas_imb, Aktual = test_data_imb$RISIKO)
##         Aktual
## Prediksi   0   1
##        0 452  26
# Akurasi
akurasi_log_imb <- mean(prediksi_kelas_imb == test_data_imb$RISIKO)
akurasi_log_imb
## [1] 0.9456067

Dengan SMOTE

prob_pred <- predict(
  model_logistik,
  newdata = test_data,
  type = "response"
)
contrasts(test_data$RISIKO)
##   1
## 0 0
## 1 1
# prbabilita prediksi untuk nasabah memiliki risiko
head(prob_pred)
##         6         8        16        23        24        28 
## 0.5504735 0.4293028 0.6040368 0.3353674 0.5549559 0.6459937
# Prediksi kelas balance data
prediksi_kelas <- ifelse(prob_pred > 0.5, 1, 0)

head(prediksi_kelas)
##  6  8 16 23 24 28 
##  1  0  1  0  1  1
# confusion matrix balance data
table(Prediksi = prediksi_kelas, Aktual = test_data$RISIKO)
##         Aktual
## Prediksi   0   1
##        0 246 110
##        1 206 370
# Akurasi
akurasi_log <- mean(prediksi_kelas == test_data$RISIKO)
akurasi_log
## [1] 0.6609442

SVM

Tanpa SMOTE

# PREDIKSI DATA TEST
prediksi_svm_imb <- predict(
  model_svm_tuned_imb,
  newdata = test_data_imb
)

# Data testing vs Prediksi
head(test_data_imb$RISIKO)
## [1] 0 0 0 0 0 0
## Levels: 0 1
head(prediksi_svm_imb)
##  4 11 12 14 19 22 
##  0  0  0  0  0  0 
## Levels: 0 1
# Matrix Confusion
table(
  true = test_data_imb$RISIKO,
  pred = prediksi_svm_imb
)
##     pred
## true   0   1
##    0 452   0
##    1  26   0
akurasi_svm_imb <- mean(prediksi_svm_imb == test_data_imb$RISIKO)
# Akurasi
akurasi_svm_imb
## [1] 0.9456067

Dengan SMOTE

# PREDIKSI DATA TEST

prediksi_svm <- predict(
  model_svm_tuned,
  newdata = test_data
)

# Data testing vs Prediksi
head(test_data$RISIKO)
## [1] 1 1 1 1 1 1
## Levels: 0 1
head(prediksi_svm)
##  6  8 16 23 24 28 
##  1  0  1  1  1  1 
## Levels: 0 1
# Matrix Confusion
table(
  true = test_data$RISIKO,
  pred = prediksi_svm
)
##     pred
## true   0   1
##    0 325 127
##    1  84 396
# Akurasi
akurasi_svm <- mean(prediksi_svm == test_data$RISIKO)
akurasi_svm
## [1] 0.7736052

Random Forest

Tanpa SMOTE

prediksi_rf_imb <- predict(
  model_rf_imb,
  newdata = test_data_imb
)

# Hasil prediksi
head(prediksi_rf_imb)
##  4 11 12 14 19 22 
##  0  0  0  0  0  0 
## Levels: 0 1
# Confusion matrix
table(Prediksi = prediksi_rf_imb, Aktual = test_data_imb$RISIKO)
##         Aktual
## Prediksi   0   1
##        0 452  26
##        1   0   0
# Akurasi
akurasi_rf_imb <- mean(prediksi_rf_imb == test_data_imb$RISIKO)
akurasi_rf_imb
## [1] 0.9456067

Dengan SMOTE

prediksi_rf <- predict(
  model_rf,
  newdata = test_data
)

# Lihat hasil prediksi
head(prediksi_rf)
##  6  8 16 23 24 28 
##  1  0  1  1  1  1 
## Levels: 0 1
# Confusion matrix
table(Prediksi = prediksi_rf, Aktual = test_data$RISIKO)
##         Aktual
## Prediksi   0   1
##        0 236  49
##        1 216 431
# Akurasi
akurasi_rf <- mean(prediksi_rf == test_data$RISIKO)
akurasi_rf
## [1] 0.7156652

Evaluasi Model

Regresi Logistik

Tanpa SMOTE

# Confusion matrix
conf_matrix_log_imb <- table(
  factor(test_data_imb$RISIKO, levels = c(0,1)),
  factor(prediksi_kelas_imb, levels = c(0,1))
)
print(conf_matrix_log_imb)
##    
##       0   1
##   0 452   0
##   1  26   0
# Akurasi
akurasi_log_imb <- mean(prediksi_kelas_imb == test_data_imb$RISIKO)
cat("Akurasi:", akurasi_log_imb, "\n")
## Akurasi: 0.9456067
# Menghitung precision, recall, f1 untuk setiap kelas
# Extract values
# Assuming binary classification
if (nrow(conf_matrix_log_imb) == 2 && ncol(conf_matrix_log_imb) == 2) {
  # Get class names
  classes <- rownames(conf_matrix_log_imb)
  # For each class as positive
  for (i in 1:2) {
    TP <- conf_matrix_log_imb[i, i]
    FP <- sum(conf_matrix_log_imb[, i]) - TP
    FN <- sum(conf_matrix_log_imb[i, ]) - TP
    TN <- sum(conf_matrix_log_imb) - TP - FP - FN

    precision <- ifelse((TP+FP)==0, 0, TP/(TP+FP))
    recall <- ifelse((TP+FN)==0, 0, TP/(TP+FN))
    f1 <- ifelse((precision+recall)==0, 0, 2*precision*recall/(precision+recall))

    cat("Kelas:", classes[i], "\n")
    cat("  Precision:", round(precision, 4), "\n")
    cat("  Recall:", round(recall, 4), "\n")
    cat("  F1-score:", round(f1, 4), "\n\n")
  }
} else {
  cat("Confusion matrix bukan 2x2, perlu penanganan khusus.\n")
}
## Kelas: 0 
##   Precision: 0.9456 
##   Recall: 1 
##   F1-score: 0.972 
## 
## Kelas: 1 
##   Precision: 0 
##   Recall: 0 
##   F1-score: 0

Dengan SMOTE

# Confusion matrix
conf_matrix <- table(Aktual = test_data$RISIKO, Prediksi = prediksi_kelas)
print(conf_matrix)
##       Prediksi
## Aktual   0   1
##      0 246 206
##      1 110 370
# Akurasi
akurasi_log <- mean(prediksi_kelas == test_data$RISIKO)
cat("Akurasi:", akurasi_log, "\n")
## Akurasi: 0.6609442
# Menghitung precision, recall, f1 untuk setiap kelas
# Extract values
# Assuming binary classification
if (nrow(conf_matrix) == 2 && ncol(conf_matrix) == 2) {
  # Get class names
  classes <- rownames(conf_matrix)
  # For each class as positive
  for (i in 1:2) {
    TP <- conf_matrix[i, i]
    FP <- sum(conf_matrix[, i]) - TP
    FN <- sum(conf_matrix[i, ]) - TP
    TN <- sum(conf_matrix) - TP - FP - FN

    precision <- ifelse((TP+FP)==0, 0, TP/(TP+FP))
    recall <- ifelse((TP+FN)==0, 0, TP/(TP+FN))
    f1 <- ifelse((precision+recall)==0, 0, 2*precision*recall/(precision+recall))

    cat("Kelas:", classes[i], "\n")
    cat("  Precision:", round(precision, 4), "\n")
    cat("  Recall:", round(recall, 4), "\n")
    cat("  F1-score:", round(f1, 4), "\n\n")
  }
} else {
  cat("Confusion matrix bukan 2x2, perlu penanganan khusus.\n")
}
## Kelas: 0 
##   Precision: 0.691 
##   Recall: 0.5442 
##   F1-score: 0.6089 
## 
## Kelas: 1 
##   Precision: 0.6424 
##   Recall: 0.7708 
##   F1-score: 0.7008

SVM

Tanpa SMOTE

# Confusion Matrix
conf_matrix_svm_imb <- table(
  true = test_data_imb$RISIKO,
  pred = prediksi_svm_imb
)
print(conf_matrix_svm_imb)
##     pred
## true   0   1
##    0 452   0
##    1  26   0
# Akurasi
akurasi_svm_imb <- mean(prediksi_svm_imb == test_data_imb$RISIKO)
cat("Akurasi SVM (tanpa SMOTE):", akurasi_svm_imb, "\n\n")
## Akurasi SVM (tanpa SMOTE): 0.9456067
# Menghitung precision, recall, F1-score untuk setiap kelas
if (nrow(conf_matrix_svm_imb) == 2 && ncol(conf_matrix_svm_imb) == 2) {
  classes <- rownames(conf_matrix_svm_imb)

  for (i in 1:2) {
    TP <- conf_matrix_svm_imb[i, i]               # True Positive
    FP <- sum(conf_matrix_svm_imb[, i]) - TP      # False Positive
    FN <- sum(conf_matrix_svm_imb[i, ]) - TP      # False Negative
    TN <- sum(conf_matrix_svm_imb) - TP - FP - FN # True Negative (tidak langsung dipakai)

    precision <- ifelse((TP + FP) == 0, 0, TP / (TP + FP))
    recall    <- ifelse((TP + FN) == 0, 0, TP / (TP + FN))
    f1        <- ifelse((precision + recall) == 0, 0, 2 * precision * recall / (precision + recall))

    cat("Kelas:", classes[i], "\n")
    cat("  Precision:", round(precision, 4), "\n")
    cat("  Recall:   ", round(recall, 4), "\n")
    cat("  F1-score: ", round(f1, 4), "\n\n")
  }
} else {
  cat("Confusion matrix bukan 2x2. Perlu penanganan khusus untuk multi-kelas.\n")
}
## Kelas: 0 
##   Precision: 0.9456 
##   Recall:    1 
##   F1-score:  0.972 
## 
## Kelas: 1 
##   Precision: 0 
##   Recall:    0 
##   F1-score:  0

Dengan SMOTE

# Confusion Matrix
conf_matrix_svm <- table(
  true = test_data$RISIKO,
  pred = prediksi_svm
)
print(conf_matrix_svm)
##     pred
## true   0   1
##    0 325 127
##    1  84 396
# Akurasi
akurasi_svm <- mean(prediksi_svm == test_data$RISIKO)
cat("Akurasi SVM:", akurasi_svm, "\n\n")
## Akurasi SVM: 0.7736052
# Menghitung precision, recall, F1-score untuk setiap kelas
if (nrow(conf_matrix_svm) == 2 && ncol(conf_matrix_svm) == 2) {
  classes <- rownames(conf_matrix_svm)

  for (i in 1:2) {
    TP <- conf_matrix_svm[i, i]               # True Positive
    FP <- sum(conf_matrix_svm[, i]) - TP      # False Positive
    FN <- sum(conf_matrix_svm[i, ]) - TP      # False Negative
    TN <- sum(conf_matrix_svm) - TP - FP - FN # True Negative (tidak langsung dipakai)

    precision <- ifelse((TP + FP) == 0, 0, TP / (TP + FP))
    recall    <- ifelse((TP + FN) == 0, 0, TP / (TP + FN))
    f1        <- ifelse((precision + recall) == 0, 0, 2 * precision * recall / (precision + recall))

    cat("Kelas:", classes[i], "\n")
    cat("  Precision:", round(precision, 4), "\n")
    cat("  Recall:   ", round(recall, 4), "\n")
    cat("  F1-score: ", round(f1, 4), "\n\n")
  }
} else {
  cat("Confusion matrix bukan 2x2. Perlu penanganan khusus untuk multi-kelas.\n")
}
## Kelas: 0 
##   Precision: 0.7946 
##   Recall:    0.719 
##   F1-score:  0.7549 
## 
## Kelas: 1 
##   Precision: 0.7572 
##   Recall:    0.825 
##   F1-score:  0.7896

Random Forest

Tanpa SMOTE

# Confusion Matrix
conf_matrix_rf_imb <- table(
  true = test_data_imb$RISIKO,
  pred = prediksi_rf_imb
)
print(conf_matrix_rf_imb)
##     pred
## true   0   1
##    0 452   0
##    1  26   0
# Akurasi
akurasi_rf_imb <- mean(prediksi_rf_imb == test_data_imb$RISIKO)
cat("Akurasi Random Forest (tanpa SMOTE):", akurasi_rf_imb, "\n\n")
## Akurasi Random Forest (tanpa SMOTE): 0.9456067
# Menghitung precision, recall, F1-score untuk setiap kelas
if (nrow(conf_matrix_rf_imb) == 2 && ncol(conf_matrix_rf_imb) == 2) {
  classes <- rownames(conf_matrix_rf_imb)

  for (i in 1:2) {
    TP <- conf_matrix_rf_imb[i, i]               # True Positive
    FP <- sum(conf_matrix_rf_imb[, i]) - TP      # False Positive
    FN <- sum(conf_matrix_rf_imb[i, ]) - TP      # False Negative
    TN <- sum(conf_matrix_rf_imb) - TP - FP - FN # True Negative (opsional)

    precision <- ifelse((TP + FP) == 0, 0, TP / (TP + FP))
    recall    <- ifelse((TP + FN) == 0, 0, TP / (TP + FN))
    f1        <- ifelse((precision + recall) == 0, 0, 2 * precision * recall / (precision + recall))

    cat("Kelas:", classes[i], "\n")
    cat("  Precision:", round(precision, 4), "\n")
    cat("  Recall:   ", round(recall, 4), "\n")
    cat("  F1-score: ", round(f1, 4), "\n\n")
  }
} else {
  cat("Confusion matrix bukan 2x2. Perlu penanganan khusus untuk multi-kelas.\n")
}
## Kelas: 0 
##   Precision: 0.9456 
##   Recall:    1 
##   F1-score:  0.972 
## 
## Kelas: 1 
##   Precision: 0 
##   Recall:    0 
##   F1-score:  0

Dengan SMOTE

# Confusion Matrix
conf_matrix_rf <- table(
  true = test_data$RISIKO,
  pred = prediksi_rf
)
print(conf_matrix_rf)
##     pred
## true   0   1
##    0 236 216
##    1  49 431
# Akurasi
akurasi_rf <- mean(prediksi_rf == test_data$RISIKO)
cat("Akurasi Random Forest:", akurasi_rf, "\n\n")
## Akurasi Random Forest: 0.7156652
# Menghitung precision, recall, F1-score untuk setiap kelas
if (nrow(conf_matrix_rf) == 2 && ncol(conf_matrix_rf) == 2) {
  classes <- rownames(conf_matrix_rf)

  for (i in 1:2) {
    TP <- conf_matrix_rf[i, i]               # True Positive
    FP <- sum(conf_matrix_rf[, i]) - TP      # False Positive
    FN <- sum(conf_matrix_rf[i, ]) - TP      # False Negative
    TN <- sum(conf_matrix_rf) - TP - FP - FN # True Negative (opsional)

    precision <- ifelse((TP + FP) == 0, 0, TP / (TP + FP))
    recall    <- ifelse((TP + FN) == 0, 0, TP / (TP + FN))
    f1        <- ifelse((precision + recall) == 0, 0, 2 * precision * recall / (precision + recall))

    cat("Kelas:", classes[i], "\n")
    cat("  Precision:", round(precision, 4), "\n")
    cat("  Recall:   ", round(recall, 4), "\n")
    cat("  F1-score: ", round(f1, 4), "\n\n")
  }
} else {
  cat("Confusion matrix bukan 2x2. Perlu penanganan khusus untuk multi-kelas.\n")
}
## Kelas: 0 
##   Precision: 0.8281 
##   Recall:    0.5221 
##   F1-score:  0.6404 
## 
## Kelas: 1 
##   Precision: 0.6662 
##   Recall:    0.8979 
##   F1-score:  0.7649