### Library
library(ggplot2) # Visualisasi 
library(e1071) # Skewness, kurtosis, SVM 
library(dplyr) # korelasi
library(reshape2) # heatmap 
library(smotefamily) # oversampling
library(randomForest) # random forest

Data Understanding

Struktur Data

df <- read.csv("D:\\Kuliah D\\Semester 6\\PKL\\Laporan\\Kredit_Kolektibilitas.csv")
str(df)

## 'data.frame':    2388 obs. of  5 variables:
##  $ PLAFON: chr  "160.000.000" "96.000.000" "120.000.000" "230.000.000" ...
##  $ RATE  : chr  "11" "5,5" "19" "15" ...
##  $ SEGMEN: chr  "RITEL" "RITEL" "RITEL" "RITEL" ...
##  $ GENDER: chr  "L" "P" "L" "L" ...
##  $ RISIKO: num  0 0 0 0 0 0 0 0 0 0 ...

### Menyesuaikan Jenis Data 
df$PLAFON <- as.numeric(gsub("\\.", "", df$PLAFON))
df$RATE <- as.numeric(gsub(",", ".", gsub("\\.", "", df$RATE)))
head(df)

##    PLAFON  RATE SEGMEN GENDER RISIKO
## 1 1.6e+08 11.00  RITEL      L      0
## 2 9.6e+07  5.50  RITEL      P      0
## 3 1.2e+08 19.00  RITEL      L      0
## 4 2.3e+08 15.00  RITEL      L      0
## 5 8.0e+07 15.25  RITEL      L      0
## 6 1.5e+08 15.25  RITEL      P      0

Statistik Deskriptif

### Numerik
summary(df)

##      PLAFON               RATE          SEGMEN             GENDER         
##  Min.   :1.400e+06   Min.   : 2.63   Length:2388        Length:2388       
##  1st Qu.:1.000e+08   1st Qu.: 8.50   Class :character   Class :character  
##  Median :2.000e+08   Median :10.00   Mode  :character   Mode  :character  
##  Mean   :3.305e+08   Mean   :10.07                                        
##  3rd Qu.:3.305e+08   3rd Qu.:11.50                                        
##  Max.   :7.499e+09   Max.   :19.00                                        
##      RISIKO       
##  Min.   :0.00000  
##  1st Qu.:0.00000  
##  Median :0.00000  
##  Mean   :0.08962  
##  3rd Qu.:0.00000  
##  Max.   :4.00000

### Disttribusi dan nilai unik kategorik

unik_gender <- table(df$GENDER)
unik_segmen <- table(df$SEGMEN)
unik_risiko <- table(df$RISIKO)
# Distribusi variabel GENDER
unik_gender

## 
##    L    P 
## 1899  489

# Distribusi variable SEGMEN
unik_segmen

## 
## RITEL 
##  2388

# Distribusi variabel RISIKO
unik_risiko

## 
##    0    1    2    3    4 
## 2280   65    9    5   29

Visualisasi

### PLAFON
ggplot(df, aes(x = PLAFON)) +
  geom_histogram(binwidth = (max(df$PLAFON) - min(df$PLAFON))/30, fill = "steelblue", color = "black") +
  labs(title = "Distribusi Plafon", x = "Plafon", y = "Frekuensi") +
  theme_minimal()

### RATE
ggplot(df, aes(x = RATE)) +
  geom_histogram(binwidth = (max(df$RATE) - min(df$RATE))/30, fill = "darkgreen", color = "black") +
  labs(title = "Distribusi Rate", x = "Rate", y = "Frekuensi") +
  theme_minimal()

### SEGMEN
ggplot(df, aes(x = SEGMEN)) +
  geom_bar(fill = "yellow", color = "black") +
  labs(title = "Distribusi Segmen", x = "Segmenr", y = "Jumlah") +
  theme_minimal()

### GENDER
ggplot(df, aes(x = GENDER)) +
  geom_bar(fill = "purple", color = "black") +
  labs(title = "Distribusi Gender", x = "Gender", y = "Jumlah") +
  theme_minimal()

### RISIKO

ggplot(df, aes(x = factor(RISIKO))) +
  geom_bar(fill = "darkred", color = "black") +
  labs(title = "Distribusi Risiko", x = "Risiko (0 = Lancar, 1 = Tidak Lancar)", y = "Jumlah") +
  theme_minimal()

## Skewness dan kurtosis

# Menghitung skewness dan kurtosis PLAFON
skewness_plafon <- skewness(df$PLAFON)
kurtosis_plafon <- kurtosis(df$PLAFON)

# Menghitung skewness dan kurtosis RATE
skewness_rate <- skewness(df$RATE)
kurtosis_rate <- kurtosis(df$RATE)

# Skewnwss PLAFON
print(skewness_plafon)

## [1] 5.220934

# Kurtosis PLAFON
print(kurtosis_plafon)

## [1] 34.82443

# Skewness RATE
print(skewness_rate)

## [1] -0.03587709

# Kurtosis RATE
print(kurtosis_rate)

## [1] 0.6753836

Korelasi Antar Variabel

# Konversi GENDER dan RISIKO ke numerik
df_corr <- df %>%
  mutate(GENDER_numeric = ifelse(GENDER == "L", 0, 1)) %>%
  mutate(SEGMEN_numeric = ifelse(SEGMEN == "RITEL", 0, 1)) %>%
  select(PLAFON, RATE, SEGMEN_numeric, GENDER_numeric, RISIKO)

# Menghitung correlation matrix
correlation_matrix <- cor(df_corr)

# Menyiapkan matrix untuk ggplot2
melted_corr <- melt(correlation_matrix)

# Heatmap
ggplot(melted_corr, aes(Var1, Var2, fill = value)) +
  geom_tile(color = "white") +
  scale_fill_gradient2(low = "blue", high = "red", mid = "white",
                       midpoint = 0, limit = c(-1,1), space = "Lab",
                       name="Correlation") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1,
                                   size = 10, hjust = 1)) +
  coord_fixed() +
  labs(title = "Correlation Matrix Heatmap",
       x = "", y = "") +
  geom_text(aes(Var1, Var2, label = round(value, 2)), color = "black", size = 3)

## Mengecek Missing Values

missing_values <- colSums(is.na(df))
missing_df <- data.frame(
  Variable = names(missing_values),
  Missing_Count = as.numeric(missing_values)
)
missing_df

##   Variable Missing_Count
## 1   PLAFON             0
## 2     RATE             0
## 3   SEGMEN             0
## 4   GENDER             0
## 5   RISIKO             0

Mengecek Imbalance Data

# Distribusi variabel RISIKO
print(table(df$RISIKO))

## 
##    0    1    2    3    4 
## 2280   65    9    5   29

# Proporsi kelas RISIKO
print(prop.table(table(df$RISIKO)))

## 
##           0           1           2           3           4 
## 0.954773869 0.027219430 0.003768844 0.002093802 0.012144054

Preprocessing

Feature Selection

# Menghapus SEGMEN
df <- subset(df, select = -c(SEGMEN))
summary(df)

##      PLAFON               RATE          GENDER              RISIKO       
##  Min.   :1.400e+06   Min.   : 2.63   Length:2388        Min.   :0.00000  
##  1st Qu.:1.000e+08   1st Qu.: 8.50   Class :character   1st Qu.:0.00000  
##  Median :2.000e+08   Median :10.00   Mode  :character   Median :0.00000  
##  Mean   :3.305e+08   Mean   :10.07                      Mean   :0.08962  
##  3rd Qu.:3.305e+08   3rd Qu.:11.50                      3rd Qu.:0.00000  
##  Max.   :7.499e+09   Max.   :19.00                      Max.   :4.00000

Feature Enginering

# Menggabungkan skala RISIKO selain 0 (tidak risiko/lancar) menjadi nilai 1 (ada risiko/tidak lancar)
df$RISIKO <- ifelse(df$RISIKO > 0, 1, 0)
# Frekuensi variabel RISIKo setelah digabungkan
table(df$RISIKO)

## 
##    0    1 
## 2280  108

# Mengecek Imbalance data kembali
## Distribusi variabel RISIKO
print(table(df$RISIKO))

## 
##    0    1 
## 2280  108

## Proporsi variabel RISIKO
print(prop.table(table(df$RISIKO)))

## 
##          0          1 
## 0.95477387 0.04522613

Encoding

# Binerisasi feature GENDER, dengan 0 untuk L dan 1 untuk P

# Membersihkan kategorik dari tanda baca lain
df$GENDER <- trimws(df$GENDER)
df$GENDER <- toupper(df$GENDER)

# Binarisasi
df$GENDER <- ifelse(df$GENDER == "L", 0,
                   ifelse(df$GENDER == "P", 1, NA))

# Cek hasil
## Frekuensi variabel GENDER setelah binarisasi
table(df$GENDER)

## 
##    0    1 
## 1899  489

## Statistik DIskriptif setelah binarisasi GENDER
summary(df)

##      PLAFON               RATE           GENDER           RISIKO       
##  Min.   :1.400e+06   Min.   : 2.63   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:1.000e+08   1st Qu.: 8.50   1st Qu.:0.0000   1st Qu.:0.00000  
##  Median :2.000e+08   Median :10.00   Median :0.0000   Median :0.00000  
##  Mean   :3.305e+08   Mean   :10.07   Mean   :0.2048   Mean   :0.04523  
##  3rd Qu.:3.305e+08   3rd Qu.:11.50   3rd Qu.:0.0000   3rd Qu.:0.00000  
##  Max.   :7.499e+09   Max.   :19.00   Max.   :1.0000   Max.   :1.00000

Oversampling

# Duplikat dfs sebagai back up
df_copy <- df

# Pastikan target factor
df_copy$RISIKO <- as.factor(df_copy$RISIKO)

# Pisahkan GENDER
gender_asli <- df_copy$GENDER

# Ambil hanya fitur TANPA GENDER
X <- df_copy[, !(names(df_copy) %in% c("RISIKO", "GENDER"))]

# Pastikan semua numerik
X <- data.frame(lapply(X, function(x) as.numeric(as.factor(x))))

target <- df_copy$RISIKO

# Hitung rasio 50:50

tab <- table(target)
minor <- min(tab)
major <- max(tab)

target_ratio <- 0.5
desired_minor <- (target_ratio * major) / (1 - target_ratio)
dup_size <- ceiling((desired_minor - minor) / minor)

# SMOTE TANPA GENDER
set.seed(123)

smote_result <- SMOTE(X, target, K = 5, dup_size = dup_size)

df_clean <- smote_result$data
colnames(df_clean)[ncol(df_clean)] <- "RISIKO"

# Tambahkan kembali GENDER
n_asli <- nrow(df_copy)
n_baru <- nrow(df_clean)

# Ambil GENDER asli
gender_clean <- as.character(gender_asli)

# Tambahkan GENDER untuk data sintetis (sampling berdasarkan distribusi asli)
set.seed(123)
gender_sintetik <- sample(gender_clean, n_baru - n_asli, replace = TRUE)

# Gabungkan
df_clean$GENDER <- c(gender_clean, gender_sintetik)

# Jadikan factor kembali
df_clean$GENDER <- as.factor(df_clean$GENDER)

# Cek hasil oversampling
## Sebelum oversampling
### Distribusi RISIKO
print(table(df$RISIKO))

## 
##    0    1 
## 2280  108

### Distribusi GENDER
print(table(df$GENDER))

## 
##    0    1 
## 1899  489

cat("Sesudah:\n")

## Sesudah:

## Setelah oversampling
### Distribusi RISIKO
print(table(df_clean$RISIKO))

## 
##    0    1 
## 2280 2376

### Distribusi GENDER
print(table(df_clean$GENDER))

## 
##    0    1 
## 3697  959

# Statistik Deskriptif setelah oversampling
summary(df_clean)

##      PLAFON           RATE          RISIKO          GENDER  
##  Min.   :  1.0   Min.   : 1.00   Length:4656        0:3697  
##  1st Qu.:116.0   1st Qu.:24.43   Class :character   1: 959  
##  Median :180.0   Median :48.91   Mode  :character           
##  Mean   :189.7   Mean   :42.76                              
##  3rd Qu.:263.0   3rd Qu.:58.00                              
##  Max.   :399.0   Max.   :77.00

# Struktur data setelah oversampling
str(df_clean)

## 'data.frame':    4656 obs. of  4 variables:
##  $ PLAFON: num  85 332 180 165 180 156 116 308 382 116 ...
##  $ RATE  : num  52 35 22 65 25 52 21 20 25 35 ...
##  $ RISIKO: chr  "1" "1" "1" "1" ...
##  $ GENDER: Factor w/ 2 levels "0","1": 1 2 1 1 1 2 2 2 1 1 ...

Cek ulang oversampling

# Distribusi variabel RISIKO
print(table(df_clean$RISIKO))

## 
##    0    1 
## 2280 2376

# Proporsi variabel RISIKO
print(prop.table(table(df_clean$RISIKO)))

## 
##         0         1 
## 0.4896907 0.5103093

Data Splitting

# Randomisasi sample
set.seed(123)

# Jumlah data
n <- nrow(df_clean) # untuk data balance
n_imb <- nrow(df) # untuk data imbalance

train_index <- sample(1:n, size = 0.8 * n) # untuk data balance
train_index_imb <- sample(1:n_imb, size = 0.8 * n_imb) # untuk data imbalance

# Split data untuk modeling
train_data <- df_clean[train_index, ] # untuk data balance
test_data  <- df_clean[-train_index, ] # untuk data balance
# Split data untuk perbandingan penggunaan SMOTE
train_data_imb <- df[train_index_imb, ] # untuk data imbalance
test_data_imb  <- df[-train_index_imb, ] # untuk data imbalance

# Cek jumlah
cat("Jumlah data training:", nrow(train_data), "\n")

## Jumlah data training: 3724

cat("Jumlah data testing:", nrow(test_data), "\n")

## Jumlah data testing: 932

cat("Jumlah data training untuk data imbalance:", nrow(train_data_imb), "\n")

## Jumlah data training untuk data imbalance: 1910

cat("Jumlah data testing untuk data imbalance:", nrow(test_data_imb), "\n")

## Jumlah data testing untuk data imbalance: 478

Model Training

Regresi Logistik

Tanpa SMOTE

# Pastikan target berupa factor
train_data_imb$RISIKO <- as.factor(train_data_imb$RISIKO)
test_data_imb$RISIKO <- as.factor(test_data_imb$RISIKO)

# Training model regresi logistik
model_logistik_imb <- glm(RISIKO ~ .,
                      data = train_data_imb,
                      family = binomial)

## Ringkasan model
print(summary(model_logistik_imb))

## 
## Call:
## glm(formula = RISIKO ~ ., family = binomial, data = train_data_imb)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -5.370e+00  5.479e-01  -9.801  < 2e-16 ***
## PLAFON       4.945e-10  1.102e-10   4.487 7.23e-06 ***
## RATE         1.879e-01  4.844e-02   3.880 0.000105 ***
## GENDER       3.142e-01  2.581e-01   1.217 0.223502    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 676.72  on 1909  degrees of freedom
## Residual deviance: 644.54  on 1906  degrees of freedom
## AIC: 652.54
## 
## Number of Fisher Scoring iterations: 6

## Probabilitas ketika RISIKO 1
model_logistik_prob_imb <- predict(model_logistik_imb, type = "response")
## 10 prrobabilitaas untuk kelas 1
print(model_logistik_prob_imb[1:10])

##        559       2363       1801       2101       2047        609       1032 
## 0.02886821 0.05362175 0.02617056 0.03622874 0.02198560 0.03589598 0.05289925 
##       1493       2083       2076 
## 0.04803894 0.02363840 0.03254422

## dummy dari R
print(contrasts(train_data_imb$RISIKO))

##   1
## 0 0
## 1 1

Dengan SMOTE

# Pastikan target berupa factor
train_data$RISIKO <- as.factor(train_data$RISIKO)
test_data$RISIKO <- as.factor(test_data$RISIKO)

# Training model regresi logistik
model_logistik <- glm(RISIKO ~ .,
                      data = train_data,
                      family = binomial)

## Ringkasan model
summary(model_logistik)

## 
## Call:
## glm(formula = RISIKO ~ ., family = binomial, data = train_data)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -2.3692018  0.1515064 -15.638   <2e-16 ***
## PLAFON       0.0044790  0.0003841  11.662   <2e-16 ***
## RATE         0.0365029  0.0022087  16.527   <2e-16 ***
## GENDER1     -0.0250951  0.0844829  -0.297    0.766    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 5161.3  on 3723  degrees of freedom
## Residual deviance: 4842.3  on 3720  degrees of freedom
## AIC: 4850.3
## 
## Number of Fisher Scoring iterations: 4

## Probabilitas ketika nasabah memiliki risiko
model_logistik_prob <- predict(model_logistik, type = "response")
## 10 oribabilitas ketika RISIKO 1
model_logistik_prob[1:10]

##      2463      2511      2227       526      4291      2986      1842      1142 
## 0.6930532 0.6019380 0.5881571 0.5942299 0.3794544 0.6126196 0.5313928 0.5447943 
##      3371      3446 
## 0.4833571 0.3758741

## Dummy dari E
contrasts(train_data$RISIKO)

##   1
## 0 0
## 1 1

SVM

Tanpa SMOTE

# Training SVM dengan Tuning parameter
tune_svm_imb <- tune(
  svm,
  RISIKO ~ .,
  data = train_data_imb,
  kernel = "radial",
  ranges = list(
    cost = c(0.1, 1, 10, 100, 1000),
    gamma = c(0.5, 1, 2, 3, 4)
  )
)

# Hasil asil tuning
print(summary(tune_svm_imb))

## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost gamma
##   0.1   0.5
## 
## - best performance: 0.04293194 
## 
## - Detailed performance results:
##     cost gamma      error dispersion
## 1  1e-01   0.5 0.04293194 0.01651959
## 2  1e+00   0.5 0.04293194 0.01651959
## 3  1e+01   0.5 0.04450262 0.01506342
## 4  1e+02   0.5 0.04607330 0.01391791
## 5  1e+03   0.5 0.04816754 0.01557048
## 6  1e-01   1.0 0.04293194 0.01651959
## 7  1e+00   1.0 0.04450262 0.01585157
## 8  1e+01   1.0 0.04607330 0.01391791
## 9  1e+02   1.0 0.04816754 0.01413505
## 10 1e+03   1.0 0.05078534 0.01561930
## 11 1e-01   2.0 0.04293194 0.01651959
## 12 1e+00   2.0 0.04397906 0.01564852
## 13 1e+01   2.0 0.04712042 0.01351827
## 14 1e+02   2.0 0.04764398 0.01490078
## 15 1e+03   2.0 0.04973822 0.01696529
## 16 1e-01   3.0 0.04293194 0.01651959
## 17 1e+00   3.0 0.04345550 0.01656562
## 18 1e+01   3.0 0.04712042 0.01351827
## 19 1e+02   3.0 0.04816754 0.01595690
## 20 1e+03   3.0 0.05026178 0.01850242
## 21 1e-01   4.0 0.04293194 0.01651959
## 22 1e+00   4.0 0.04345550 0.01656562
## 23 1e+01   4.0 0.04659686 0.01448622
## 24 1e+02   4.0 0.04921466 0.01603307
## 25 1e+03   4.0 0.05183246 0.01918927

# Model terbaik
model_svm_tuned_imb <- tune_svm_imb$best.model
model_svm_tuned_imb

## 
## Call:
## best.tune(METHOD = svm, train.x = RISIKO ~ ., data = train_data_imb, 
##     ranges = list(cost = c(0.1, 1, 10, 100, 1000), gamma = c(0.5, 
##         1, 2, 3, 4)), kernel = "radial")
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  0.1 
## 
## Number of Support Vectors:  221

# Parameter terbaik
tune_svm_imb$best.parameters

##   cost gamma
## 1  0.1   0.5

Dengan SMOTE

# Training SVM dengan Tuning parameter
tune_svm <- tune(
  svm,
  RISIKO ~ .,
  data = train_data,
  kernel = "radial",
  ranges = list(
    cost = c(0.1, 1, 10, 100, 1000),
    gamma = c(0.5, 1, 2, 3, 4)
  )
)

# Hasil tuning
summary(tune_svm)

## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost gamma
##  1000     4
## 
## - best performance: 0.2175077 
## 
## - Detailed performance results:
##     cost gamma     error dispersion
## 1  1e-01   0.5 0.3321716 0.02093097
## 2  1e+00   0.5 0.3112305 0.01793032
## 3  1e+01   0.5 0.2854536 0.01678488
## 4  1e+02   0.5 0.2723053 0.02301237
## 5  1e+03   0.5 0.2696172 0.02196933
## 6  1e-01   1.0 0.3174039 0.02152166
## 7  1e+00   1.0 0.2849152 0.01747435
## 8  1e+01   1.0 0.2671985 0.02280229
## 9  1e+02   1.0 0.2604832 0.02606386
## 10 1e+03   1.0 0.2615555 0.02128474
## 11 1e-01   2.0 0.3018320 0.01336495
## 12 1e+00   2.0 0.2671993 0.02644715
## 13 1e+01   2.0 0.2586007 0.02331320
## 14 1e+02   2.0 0.2545706 0.02803797
## 15 1e+03   2.0 0.2379205 0.02667281
## 16 1e-01   3.0 0.2881389 0.01798074
## 17 1e+00   3.0 0.2602151 0.02665716
## 18 1e+01   3.0 0.2545742 0.02764483
## 19 1e+02   3.0 0.2379205 0.02641124
## 20 1e+03   3.0 0.2290589 0.02855188
## 21 1e-01   4.0 0.2825045 0.01929022
## 22 1e+00   4.0 0.2604846 0.02558033
## 23 1e+01   4.0 0.2449054 0.02671872
## 24 1e+02   4.0 0.2293292 0.02672377
## 25 1e+03   4.0 0.2175077 0.02888289

# Model terbaik
model_svm_tuned <- tune_svm$best.model
model_svm_tuned

## 
## Call:
## best.tune(METHOD = svm, train.x = RISIKO ~ ., data = train_data, 
##     ranges = list(cost = c(0.1, 1, 10, 100, 1000), gamma = c(0.5, 
##         1, 2, 3, 4)), kernel = "radial")
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  1000 
## 
## Number of Support Vectors:  1752

# Parameter terbaik
tune_svm$best.parameters

##    cost gamma
## 25 1000     4

Random Forest

Tanpa SMOTE

# Set seed
set.seed(123)

# Training Random Forest
model_rf_imb <- randomForest(
  RISIKO ~ .,
  data = train_data_imb,
  ntree = 100,
  mtry = floor(sqrt(ncol(train_data_imb) - 1)),
  importance = TRUE
)

# Ringkasan model
print(model_rf_imb)

## 
## Call:
##  randomForest(formula = RISIKO ~ ., data = train_data_imb, ntree = 100,      mtry = floor(sqrt(ncol(train_data_imb) - 1)), importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 100
## No. of variables tried at each split: 1
## 
##         OOB estimate of  error rate: 4.29%
## Confusion matrix:
##      0 1 class.error
## 0 1828 0           0
## 1   82 0           1

# Feature Importance
importance(model_rf_imb)

##                0         1 MeanDecreaseAccuracy MeanDecreaseGini
## PLAFON  3.200588  7.160980             4.604895        16.793679
## RATE    2.593600  4.230859             3.756946        10.111842
## GENDER -2.321892 -1.829782            -2.742265         1.101632

Dengan SMOTE

# Set seed
set.seed(123)

# Training Random Forest
model_rf <- randomForest(
  RISIKO ~ .,
  data = train_data,
  ntree = 100,
  mtry = floor(sqrt(ncol(train_data) - 1)),
  importance = TRUE
)

# Ringkasan model
model_rf

## 
## Call:
##  randomForest(formula = RISIKO ~ ., data = train_data, ntree = 100,      mtry = floor(sqrt(ncol(train_data) - 1)), importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 100
## No. of variables tried at each split: 1
## 
##         OOB estimate of  error rate: 26.91%
## Confusion matrix:
##      0    1 class.error
## 0 1017  811   0.4436543
## 1  191 1705   0.1007384

# Feature Importance
importance(model_rf)

##                 0          1 MeanDecreaseAccuracy MeanDecreaseGini
## PLAFON 14.0106451 14.1746590           15.5410920       227.612783
## RATE   12.8995680 15.6783026           16.8229826       249.923167
## GENDER -0.7413821  0.1400874           -0.3762267         2.634378

Model Testing

Regresi Logistik

prob_pred_imb <- predict(
  model_logistik_imb,
  newdata = test_data_imb,
  type = "response"
)
contrasts(test_data_imb$RISIKO)

##   1
## 0 0
## 1 1

# prbabilita prediksi untuk nasabah memiliki risiko
head(prob_pred_imb)

##          4         11         12         14         19         22 
## 0.08035400 0.07718657 0.08165054 0.05793871 0.07908469 0.05478284

# prediksi RISIKO nasabah
prediksi_kelas_imb <- ifelse(prob_pred_imb > 0.5, 1, 0)
head(prediksi_kelas_imb)

##  4 11 12 14 19 22 
##  0  0  0  0  0  0

# confusion matrix balance data
table(Prediksi = prediksi_kelas_imb, Aktual = test_data_imb$RISIKO)

##         Aktual
## Prediksi   0   1
##        0 452  26

# Akurasi
akurasi_log_imb <- mean(prediksi_kelas_imb == test_data_imb$RISIKO)
akurasi_log_imb

## [1] 0.9456067

Dengan SMOTE

prob_pred <- predict(
  model_logistik,
  newdata = test_data,
  type = "response"
)
contrasts(test_data$RISIKO)

##   1
## 0 0
## 1 1

# prbabilita prediksi untuk nasabah memiliki risiko
head(prob_pred)

##         6         8        16        23        24        28 
## 0.5504735 0.4293028 0.6040368 0.3353674 0.5549559 0.6459937

# Prediksi kelas balance data
prediksi_kelas <- ifelse(prob_pred > 0.5, 1, 0)

head(prediksi_kelas)

##  6  8 16 23 24 28 
##  1  0  1  0  1  1

# confusion matrix balance data
table(Prediksi = prediksi_kelas, Aktual = test_data$RISIKO)

##         Aktual
## Prediksi   0   1
##        0 246 110
##        1 206 370

# Akurasi
akurasi_log <- mean(prediksi_kelas == test_data$RISIKO)
akurasi_log

## [1] 0.6609442

SVM

Tanpa SMOTE

# PREDIKSI DATA TEST
prediksi_svm_imb <- predict(
  model_svm_tuned_imb,
  newdata = test_data_imb
)

# Data testing vs Prediksi
head(test_data_imb$RISIKO)

## [1] 0 0 0 0 0 0
## Levels: 0 1

head(prediksi_svm_imb)

##  4 11 12 14 19 22 
##  0  0  0  0  0  0 
## Levels: 0 1

# Matrix Confusion
table(
  true = test_data_imb$RISIKO,
  pred = prediksi_svm_imb
)

##     pred
## true   0   1
##    0 452   0
##    1  26   0

akurasi_svm_imb <- mean(prediksi_svm_imb == test_data_imb$RISIKO)
# Akurasi
akurasi_svm_imb

## [1] 0.9456067

Dengan SMOTE

# PREDIKSI DATA TEST

prediksi_svm <- predict(
  model_svm_tuned,
  newdata = test_data
)

# Data testing vs Prediksi
head(test_data$RISIKO)

## [1] 1 1 1 1 1 1
## Levels: 0 1

head(prediksi_svm)

##  6  8 16 23 24 28 
##  1  0  1  1  1  1 
## Levels: 0 1

# Matrix Confusion
table(
  true = test_data$RISIKO,
  pred = prediksi_svm
)

##     pred
## true   0   1
##    0 325 127
##    1  84 396

# Akurasi
akurasi_svm <- mean(prediksi_svm == test_data$RISIKO)
akurasi_svm

## [1] 0.7736052

Random Forest

Tanpa SMOTE

prediksi_rf_imb <- predict(
  model_rf_imb,
  newdata = test_data_imb
)

# Hasil prediksi
head(prediksi_rf_imb)

##  4 11 12 14 19 22 
##  0  0  0  0  0  0 
## Levels: 0 1

# Confusion matrix
table(Prediksi = prediksi_rf_imb, Aktual = test_data_imb$RISIKO)

##         Aktual
## Prediksi   0   1
##        0 452  26
##        1   0   0

# Akurasi
akurasi_rf_imb <- mean(prediksi_rf_imb == test_data_imb$RISIKO)
akurasi_rf_imb

## [1] 0.9456067

Dengan SMOTE

prediksi_rf <- predict(
  model_rf,
  newdata = test_data
)

# Lihat hasil prediksi
head(prediksi_rf)

##  6  8 16 23 24 28 
##  1  0  1  1  1  1 
## Levels: 0 1

# Confusion matrix
table(Prediksi = prediksi_rf, Aktual = test_data$RISIKO)

##         Aktual
## Prediksi   0   1
##        0 236  49
##        1 216 431

# Akurasi
akurasi_rf <- mean(prediksi_rf == test_data$RISIKO)
akurasi_rf

## [1] 0.7156652

Evaluasi Model

Regresi Logistik

Tanpa SMOTE

# Confusion matrix
conf_matrix_log_imb <- table(
  factor(test_data_imb$RISIKO, levels = c(0,1)),
  factor(prediksi_kelas_imb, levels = c(0,1))
)
print(conf_matrix_log_imb)

##    
##       0   1
##   0 452   0
##   1  26   0

# Akurasi
akurasi_log_imb <- mean(prediksi_kelas_imb == test_data_imb$RISIKO)
cat("Akurasi:", akurasi_log_imb, "\n")

## Akurasi: 0.9456067

# Menghitung precision, recall, f1 untuk setiap kelas
# Extract values
# Assuming binary classification
if (nrow(conf_matrix_log_imb) == 2 && ncol(conf_matrix_log_imb) == 2) {
  # Get class names
  classes <- rownames(conf_matrix_log_imb)
  # For each class as positive
  for (i in 1:2) {
    TP <- conf_matrix_log_imb[i, i]
    FP <- sum(conf_matrix_log_imb[, i]) - TP
    FN <- sum(conf_matrix_log_imb[i, ]) - TP
    TN <- sum(conf_matrix_log_imb) - TP - FP - FN

    precision <- ifelse((TP+FP)==0, 0, TP/(TP+FP))
    recall <- ifelse((TP+FN)==0, 0, TP/(TP+FN))
    f1 <- ifelse((precision+recall)==0, 0, 2*precision*recall/(precision+recall))

    cat("Kelas:", classes[i], "\n")
    cat("  Precision:", round(precision, 4), "\n")
    cat("  Recall:", round(recall, 4), "\n")
    cat("  F1-score:", round(f1, 4), "\n\n")
  }
} else {
  cat("Confusion matrix bukan 2x2, perlu penanganan khusus.\n")
}

## Kelas: 0 
##   Precision: 0.9456 
##   Recall: 1 
##   F1-score: 0.972 
## 
## Kelas: 1 
##   Precision: 0 
##   Recall: 0 
##   F1-score: 0

Dengan SMOTE

# Confusion matrix
conf_matrix <- table(Aktual = test_data$RISIKO, Prediksi = prediksi_kelas)
print(conf_matrix)

##       Prediksi
## Aktual   0   1
##      0 246 206
##      1 110 370

# Akurasi
akurasi_log <- mean(prediksi_kelas == test_data$RISIKO)
cat("Akurasi:", akurasi_log, "\n")

## Akurasi: 0.6609442

# Menghitung precision, recall, f1 untuk setiap kelas
# Extract values
# Assuming binary classification
if (nrow(conf_matrix) == 2 && ncol(conf_matrix) == 2) {
  # Get class names
  classes <- rownames(conf_matrix)
  # For each class as positive
  for (i in 1:2) {
    TP <- conf_matrix[i, i]
    FP <- sum(conf_matrix[, i]) - TP
    FN <- sum(conf_matrix[i, ]) - TP
    TN <- sum(conf_matrix) - TP - FP - FN

    precision <- ifelse((TP+FP)==0, 0, TP/(TP+FP))
    recall <- ifelse((TP+FN)==0, 0, TP/(TP+FN))
    f1 <- ifelse((precision+recall)==0, 0, 2*precision*recall/(precision+recall))

    cat("Kelas:", classes[i], "\n")
    cat("  Precision:", round(precision, 4), "\n")
    cat("  Recall:", round(recall, 4), "\n")
    cat("  F1-score:", round(f1, 4), "\n\n")
  }
} else {
  cat("Confusion matrix bukan 2x2, perlu penanganan khusus.\n")
}

## Kelas: 0 
##   Precision: 0.691 
##   Recall: 0.5442 
##   F1-score: 0.6089 
## 
## Kelas: 1 
##   Precision: 0.6424 
##   Recall: 0.7708 
##   F1-score: 0.7008

SVM

Tanpa SMOTE

# Confusion Matrix
conf_matrix_svm_imb <- table(
  true = test_data_imb$RISIKO,
  pred = prediksi_svm_imb
)
print(conf_matrix_svm_imb)

##     pred
## true   0   1
##    0 452   0
##    1  26   0

# Akurasi
akurasi_svm_imb <- mean(prediksi_svm_imb == test_data_imb$RISIKO)
cat("Akurasi SVM (tanpa SMOTE):", akurasi_svm_imb, "\n\n")

## Akurasi SVM (tanpa SMOTE): 0.9456067

# Menghitung precision, recall, F1-score untuk setiap kelas
if (nrow(conf_matrix_svm_imb) == 2 && ncol(conf_matrix_svm_imb) == 2) {
  classes <- rownames(conf_matrix_svm_imb)

  for (i in 1:2) {
    TP <- conf_matrix_svm_imb[i, i]               # True Positive
    FP <- sum(conf_matrix_svm_imb[, i]) - TP      # False Positive
    FN <- sum(conf_matrix_svm_imb[i, ]) - TP      # False Negative
    TN <- sum(conf_matrix_svm_imb) - TP - FP - FN # True Negative (tidak langsung dipakai)

    precision <- ifelse((TP + FP) == 0, 0, TP / (TP + FP))
    recall    <- ifelse((TP + FN) == 0, 0, TP / (TP + FN))
    f1        <- ifelse((precision + recall) == 0, 0, 2 * precision * recall / (precision + recall))

    cat("Kelas:", classes[i], "\n")
    cat("  Precision:", round(precision, 4), "\n")
    cat("  Recall:   ", round(recall, 4), "\n")
    cat("  F1-score: ", round(f1, 4), "\n\n")
  }
} else {
  cat("Confusion matrix bukan 2x2. Perlu penanganan khusus untuk multi-kelas.\n")
}

## Kelas: 0 
##   Precision: 0.9456 
##   Recall:    1 
##   F1-score:  0.972 
## 
## Kelas: 1 
##   Precision: 0 
##   Recall:    0 
##   F1-score:  0

Dengan SMOTE

# Confusion Matrix
conf_matrix_svm <- table(
  true = test_data$RISIKO,
  pred = prediksi_svm
)
print(conf_matrix_svm)

##     pred
## true   0   1
##    0 325 127
##    1  84 396

# Akurasi
akurasi_svm <- mean(prediksi_svm == test_data$RISIKO)
cat("Akurasi SVM:", akurasi_svm, "\n\n")

## Akurasi SVM: 0.7736052

# Menghitung precision, recall, F1-score untuk setiap kelas
if (nrow(conf_matrix_svm) == 2 && ncol(conf_matrix_svm) == 2) {
  classes <- rownames(conf_matrix_svm)

  for (i in 1:2) {
    TP <- conf_matrix_svm[i, i]               # True Positive
    FP <- sum(conf_matrix_svm[, i]) - TP      # False Positive
    FN <- sum(conf_matrix_svm[i, ]) - TP      # False Negative
    TN <- sum(conf_matrix_svm) - TP - FP - FN # True Negative (tidak langsung dipakai)

    precision <- ifelse((TP + FP) == 0, 0, TP / (TP + FP))
    recall    <- ifelse((TP + FN) == 0, 0, TP / (TP + FN))
    f1        <- ifelse((precision + recall) == 0, 0, 2 * precision * recall / (precision + recall))

    cat("Kelas:", classes[i], "\n")
    cat("  Precision:", round(precision, 4), "\n")
    cat("  Recall:   ", round(recall, 4), "\n")
    cat("  F1-score: ", round(f1, 4), "\n\n")
  }
} else {
  cat("Confusion matrix bukan 2x2. Perlu penanganan khusus untuk multi-kelas.\n")
}

## Kelas: 0 
##   Precision: 0.7946 
##   Recall:    0.719 
##   F1-score:  0.7549 
## 
## Kelas: 1 
##   Precision: 0.7572 
##   Recall:    0.825 
##   F1-score:  0.7896

Random Forest

Tanpa SMOTE

# Confusion Matrix
conf_matrix_rf_imb <- table(
  true = test_data_imb$RISIKO,
  pred = prediksi_rf_imb
)
print(conf_matrix_rf_imb)

##     pred
## true   0   1
##    0 452   0
##    1  26   0

# Akurasi
akurasi_rf_imb <- mean(prediksi_rf_imb == test_data_imb$RISIKO)
cat("Akurasi Random Forest (tanpa SMOTE):", akurasi_rf_imb, "\n\n")

## Akurasi Random Forest (tanpa SMOTE): 0.9456067

# Menghitung precision, recall, F1-score untuk setiap kelas
if (nrow(conf_matrix_rf_imb) == 2 && ncol(conf_matrix_rf_imb) == 2) {
  classes <- rownames(conf_matrix_rf_imb)

  for (i in 1:2) {
    TP <- conf_matrix_rf_imb[i, i]               # True Positive
    FP <- sum(conf_matrix_rf_imb[, i]) - TP      # False Positive
    FN <- sum(conf_matrix_rf_imb[i, ]) - TP      # False Negative
    TN <- sum(conf_matrix_rf_imb) - TP - FP - FN # True Negative (opsional)

    precision <- ifelse((TP + FP) == 0, 0, TP / (TP + FP))
    recall    <- ifelse((TP + FN) == 0, 0, TP / (TP + FN))
    f1        <- ifelse((precision + recall) == 0, 0, 2 * precision * recall / (precision + recall))

    cat("Kelas:", classes[i], "\n")
    cat("  Precision:", round(precision, 4), "\n")
    cat("  Recall:   ", round(recall, 4), "\n")
    cat("  F1-score: ", round(f1, 4), "\n\n")
  }
} else {
  cat("Confusion matrix bukan 2x2. Perlu penanganan khusus untuk multi-kelas.\n")
}

## Kelas: 0 
##   Precision: 0.9456 
##   Recall:    1 
##   F1-score:  0.972 
## 
## Kelas: 1 
##   Precision: 0 
##   Recall:    0 
##   F1-score:  0

Dengan SMOTE

# Confusion Matrix
conf_matrix_rf <- table(
  true = test_data$RISIKO,
  pred = prediksi_rf
)
print(conf_matrix_rf)

##     pred
## true   0   1
##    0 236 216
##    1  49 431

# Akurasi
akurasi_rf <- mean(prediksi_rf == test_data$RISIKO)
cat("Akurasi Random Forest:", akurasi_rf, "\n\n")

## Akurasi Random Forest: 0.7156652

# Menghitung precision, recall, F1-score untuk setiap kelas
if (nrow(conf_matrix_rf) == 2 && ncol(conf_matrix_rf) == 2) {
  classes <- rownames(conf_matrix_rf)

  for (i in 1:2) {
    TP <- conf_matrix_rf[i, i]               # True Positive
    FP <- sum(conf_matrix_rf[, i]) - TP      # False Positive
    FN <- sum(conf_matrix_rf[i, ]) - TP      # False Negative
    TN <- sum(conf_matrix_rf) - TP - FP - FN # True Negative (opsional)

    precision <- ifelse((TP + FP) == 0, 0, TP / (TP + FP))
    recall    <- ifelse((TP + FN) == 0, 0, TP / (TP + FN))
    f1        <- ifelse((precision + recall) == 0, 0, 2 * precision * recall / (precision + recall))

    cat("Kelas:", classes[i], "\n")
    cat("  Precision:", round(precision, 4), "\n")
    cat("  Recall:   ", round(recall, 4), "\n")
    cat("  F1-score: ", round(f1, 4), "\n\n")
  }
} else {
  cat("Confusion matrix bukan 2x2. Perlu penanganan khusus untuk multi-kelas.\n")
}

## Kelas: 0 
##   Precision: 0.8281 
##   Recall:    0.5221 
##   F1-score:  0.6404 
## 
## Kelas: 1 
##   Precision: 0.6662 
##   Recall:    0.8979 
##   F1-score:  0.7649

Machine Learning Algorithm On Credit Scoring

Aiskha Rizkya Putri

2026-04-25

Data Understanding

Struktur Data

Statistik Deskriptif

Visualisasi

Korelasi Antar Variabel

Mengecek Imbalance Data

Preprocessing

Feature Selection

Feature Enginering

Encoding

Oversampling

Cek ulang oversampling

Data Splitting

Model Training

Regresi Logistik

Tanpa SMOTE

Dengan SMOTE

SVM

Tanpa SMOTE

Dengan SMOTE

Random Forest

Tanpa SMOTE

Dengan SMOTE

Model Testing

Regresi Logistik

Dengan SMOTE

SVM

Tanpa SMOTE

Dengan SMOTE

Random Forest

Tanpa SMOTE

Dengan SMOTE

Evaluasi Model

Regresi Logistik

Tanpa SMOTE

Dengan SMOTE

SVM

Tanpa SMOTE

Dengan SMOTE

Random Forest

Tanpa SMOTE

Dengan SMOTE