### Library
library(ggplot2) # Visualisasi
library(e1071) # Skewness, kurtosis, SVM
library(dplyr) # korelasi
library(reshape2) # heatmap
library(smotefamily) # oversampling
library(randomForest) # random forest
df <- read.csv("D:\\Kuliah D\\Semester 6\\PKL\\Laporan\\Kredit_Kolektibilitas.csv")
str(df)
## 'data.frame': 2388 obs. of 5 variables:
## $ PLAFON: chr "160.000.000" "96.000.000" "120.000.000" "230.000.000" ...
## $ RATE : chr "11" "5,5" "19" "15" ...
## $ SEGMEN: chr "RITEL" "RITEL" "RITEL" "RITEL" ...
## $ GENDER: chr "L" "P" "L" "L" ...
## $ RISIKO: num 0 0 0 0 0 0 0 0 0 0 ...
### Menyesuaikan Jenis Data
df$PLAFON <- as.numeric(gsub("\\.", "", df$PLAFON))
df$RATE <- as.numeric(gsub(",", ".", gsub("\\.", "", df$RATE)))
head(df)
## PLAFON RATE SEGMEN GENDER RISIKO
## 1 1.6e+08 11.00 RITEL L 0
## 2 9.6e+07 5.50 RITEL P 0
## 3 1.2e+08 19.00 RITEL L 0
## 4 2.3e+08 15.00 RITEL L 0
## 5 8.0e+07 15.25 RITEL L 0
## 6 1.5e+08 15.25 RITEL P 0
### Numerik
summary(df)
## PLAFON RATE SEGMEN GENDER
## Min. :1.400e+06 Min. : 2.63 Length:2388 Length:2388
## 1st Qu.:1.000e+08 1st Qu.: 8.50 Class :character Class :character
## Median :2.000e+08 Median :10.00 Mode :character Mode :character
## Mean :3.305e+08 Mean :10.07
## 3rd Qu.:3.305e+08 3rd Qu.:11.50
## Max. :7.499e+09 Max. :19.00
## RISIKO
## Min. :0.00000
## 1st Qu.:0.00000
## Median :0.00000
## Mean :0.08962
## 3rd Qu.:0.00000
## Max. :4.00000
### Disttribusi dan nilai unik kategorik
unik_gender <- table(df$GENDER)
unik_segmen <- table(df$SEGMEN)
unik_risiko <- table(df$RISIKO)
# Distribusi variabel GENDER
unik_gender
##
## L P
## 1899 489
# Distribusi variable SEGMEN
unik_segmen
##
## RITEL
## 2388
# Distribusi variabel RISIKO
unik_risiko
##
## 0 1 2 3 4
## 2280 65 9 5 29
### PLAFON
ggplot(df, aes(x = PLAFON)) +
geom_histogram(binwidth = (max(df$PLAFON) - min(df$PLAFON))/30, fill = "steelblue", color = "black") +
labs(title = "Distribusi Plafon", x = "Plafon", y = "Frekuensi") +
theme_minimal()
### RATE
ggplot(df, aes(x = RATE)) +
geom_histogram(binwidth = (max(df$RATE) - min(df$RATE))/30, fill = "darkgreen", color = "black") +
labs(title = "Distribusi Rate", x = "Rate", y = "Frekuensi") +
theme_minimal()
### SEGMEN
ggplot(df, aes(x = SEGMEN)) +
geom_bar(fill = "yellow", color = "black") +
labs(title = "Distribusi Segmen", x = "Segmenr", y = "Jumlah") +
theme_minimal()
### GENDER
ggplot(df, aes(x = GENDER)) +
geom_bar(fill = "purple", color = "black") +
labs(title = "Distribusi Gender", x = "Gender", y = "Jumlah") +
theme_minimal()
### RISIKO
ggplot(df, aes(x = factor(RISIKO))) +
geom_bar(fill = "darkred", color = "black") +
labs(title = "Distribusi Risiko", x = "Risiko (0 = Lancar, 1 = Tidak Lancar)", y = "Jumlah") +
theme_minimal()
## Skewness dan kurtosis
# Menghitung skewness dan kurtosis PLAFON
skewness_plafon <- skewness(df$PLAFON)
kurtosis_plafon <- kurtosis(df$PLAFON)
# Menghitung skewness dan kurtosis RATE
skewness_rate <- skewness(df$RATE)
kurtosis_rate <- kurtosis(df$RATE)
# Skewnwss PLAFON
print(skewness_plafon)
## [1] 5.220934
# Kurtosis PLAFON
print(kurtosis_plafon)
## [1] 34.82443
# Skewness RATE
print(skewness_rate)
## [1] -0.03587709
# Kurtosis RATE
print(kurtosis_rate)
## [1] 0.6753836
# Konversi GENDER dan RISIKO ke numerik
df_corr <- df %>%
mutate(GENDER_numeric = ifelse(GENDER == "L", 0, 1)) %>%
mutate(SEGMEN_numeric = ifelse(SEGMEN == "RITEL", 0, 1)) %>%
select(PLAFON, RATE, SEGMEN_numeric, GENDER_numeric, RISIKO)
# Menghitung correlation matrix
correlation_matrix <- cor(df_corr)
# Menyiapkan matrix untuk ggplot2
melted_corr <- melt(correlation_matrix)
# Heatmap
ggplot(melted_corr, aes(Var1, Var2, fill = value)) +
geom_tile(color = "white") +
scale_fill_gradient2(low = "blue", high = "red", mid = "white",
midpoint = 0, limit = c(-1,1), space = "Lab",
name="Correlation") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, vjust = 1,
size = 10, hjust = 1)) +
coord_fixed() +
labs(title = "Correlation Matrix Heatmap",
x = "", y = "") +
geom_text(aes(Var1, Var2, label = round(value, 2)), color = "black", size = 3)
## Mengecek Missing Values
missing_values <- colSums(is.na(df))
missing_df <- data.frame(
Variable = names(missing_values),
Missing_Count = as.numeric(missing_values)
)
missing_df
## Variable Missing_Count
## 1 PLAFON 0
## 2 RATE 0
## 3 SEGMEN 0
## 4 GENDER 0
## 5 RISIKO 0
# Distribusi variabel RISIKO
print(table(df$RISIKO))
##
## 0 1 2 3 4
## 2280 65 9 5 29
# Proporsi kelas RISIKO
print(prop.table(table(df$RISIKO)))
##
## 0 1 2 3 4
## 0.954773869 0.027219430 0.003768844 0.002093802 0.012144054
# Menghapus SEGMEN
df <- subset(df, select = -c(SEGMEN))
summary(df)
## PLAFON RATE GENDER RISIKO
## Min. :1.400e+06 Min. : 2.63 Length:2388 Min. :0.00000
## 1st Qu.:1.000e+08 1st Qu.: 8.50 Class :character 1st Qu.:0.00000
## Median :2.000e+08 Median :10.00 Mode :character Median :0.00000
## Mean :3.305e+08 Mean :10.07 Mean :0.08962
## 3rd Qu.:3.305e+08 3rd Qu.:11.50 3rd Qu.:0.00000
## Max. :7.499e+09 Max. :19.00 Max. :4.00000
# Menggabungkan skala RISIKO selain 0 (tidak risiko/lancar) menjadi nilai 1 (ada risiko/tidak lancar)
df$RISIKO <- ifelse(df$RISIKO > 0, 1, 0)
# Frekuensi variabel RISIKo setelah digabungkan
table(df$RISIKO)
##
## 0 1
## 2280 108
# Mengecek Imbalance data kembali
## Distribusi variabel RISIKO
print(table(df$RISIKO))
##
## 0 1
## 2280 108
## Proporsi variabel RISIKO
print(prop.table(table(df$RISIKO)))
##
## 0 1
## 0.95477387 0.04522613
# Binerisasi feature GENDER, dengan 0 untuk L dan 1 untuk P
# Membersihkan kategorik dari tanda baca lain
df$GENDER <- trimws(df$GENDER)
df$GENDER <- toupper(df$GENDER)
# Binarisasi
df$GENDER <- ifelse(df$GENDER == "L", 0,
ifelse(df$GENDER == "P", 1, NA))
# Cek hasil
## Frekuensi variabel GENDER setelah binarisasi
table(df$GENDER)
##
## 0 1
## 1899 489
## Statistik DIskriptif setelah binarisasi GENDER
summary(df)
## PLAFON RATE GENDER RISIKO
## Min. :1.400e+06 Min. : 2.63 Min. :0.0000 Min. :0.00000
## 1st Qu.:1.000e+08 1st Qu.: 8.50 1st Qu.:0.0000 1st Qu.:0.00000
## Median :2.000e+08 Median :10.00 Median :0.0000 Median :0.00000
## Mean :3.305e+08 Mean :10.07 Mean :0.2048 Mean :0.04523
## 3rd Qu.:3.305e+08 3rd Qu.:11.50 3rd Qu.:0.0000 3rd Qu.:0.00000
## Max. :7.499e+09 Max. :19.00 Max. :1.0000 Max. :1.00000
# Duplikat dfs sebagai back up
df_copy <- df
# Pastikan target factor
df_copy$RISIKO <- as.factor(df_copy$RISIKO)
# Pisahkan GENDER
gender_asli <- df_copy$GENDER
# Ambil hanya fitur TANPA GENDER
X <- df_copy[, !(names(df_copy) %in% c("RISIKO", "GENDER"))]
# Pastikan semua numerik
X <- data.frame(lapply(X, function(x) as.numeric(as.factor(x))))
target <- df_copy$RISIKO
# Hitung rasio 50:50
tab <- table(target)
minor <- min(tab)
major <- max(tab)
target_ratio <- 0.5
desired_minor <- (target_ratio * major) / (1 - target_ratio)
dup_size <- ceiling((desired_minor - minor) / minor)
# SMOTE TANPA GENDER
set.seed(123)
smote_result <- SMOTE(X, target, K = 5, dup_size = dup_size)
df_clean <- smote_result$data
colnames(df_clean)[ncol(df_clean)] <- "RISIKO"
# Tambahkan kembali GENDER
n_asli <- nrow(df_copy)
n_baru <- nrow(df_clean)
# Ambil GENDER asli
gender_clean <- as.character(gender_asli)
# Tambahkan GENDER untuk data sintetis (sampling berdasarkan distribusi asli)
set.seed(123)
gender_sintetik <- sample(gender_clean, n_baru - n_asli, replace = TRUE)
# Gabungkan
df_clean$GENDER <- c(gender_clean, gender_sintetik)
# Jadikan factor kembali
df_clean$GENDER <- as.factor(df_clean$GENDER)
# Cek hasil oversampling
## Sebelum oversampling
### Distribusi RISIKO
print(table(df$RISIKO))
##
## 0 1
## 2280 108
### Distribusi GENDER
print(table(df$GENDER))
##
## 0 1
## 1899 489
cat("Sesudah:\n")
## Sesudah:
## Setelah oversampling
### Distribusi RISIKO
print(table(df_clean$RISIKO))
##
## 0 1
## 2280 2376
### Distribusi GENDER
print(table(df_clean$GENDER))
##
## 0 1
## 3697 959
# Statistik Deskriptif setelah oversampling
summary(df_clean)
## PLAFON RATE RISIKO GENDER
## Min. : 1.0 Min. : 1.00 Length:4656 0:3697
## 1st Qu.:116.0 1st Qu.:24.43 Class :character 1: 959
## Median :180.0 Median :48.91 Mode :character
## Mean :189.7 Mean :42.76
## 3rd Qu.:263.0 3rd Qu.:58.00
## Max. :399.0 Max. :77.00
# Struktur data setelah oversampling
str(df_clean)
## 'data.frame': 4656 obs. of 4 variables:
## $ PLAFON: num 85 332 180 165 180 156 116 308 382 116 ...
## $ RATE : num 52 35 22 65 25 52 21 20 25 35 ...
## $ RISIKO: chr "1" "1" "1" "1" ...
## $ GENDER: Factor w/ 2 levels "0","1": 1 2 1 1 1 2 2 2 1 1 ...
# Distribusi variabel RISIKO
print(table(df_clean$RISIKO))
##
## 0 1
## 2280 2376
# Proporsi variabel RISIKO
print(prop.table(table(df_clean$RISIKO)))
##
## 0 1
## 0.4896907 0.5103093
# Randomisasi sample
set.seed(123)
# Jumlah data
n <- nrow(df_clean) # untuk data balance
n_imb <- nrow(df) # untuk data imbalance
train_index <- sample(1:n, size = 0.8 * n) # untuk data balance
train_index_imb <- sample(1:n_imb, size = 0.8 * n_imb) # untuk data imbalance
# Split data untuk modeling
train_data <- df_clean[train_index, ] # untuk data balance
test_data <- df_clean[-train_index, ] # untuk data balance
# Split data untuk perbandingan penggunaan SMOTE
train_data_imb <- df[train_index_imb, ] # untuk data imbalance
test_data_imb <- df[-train_index_imb, ] # untuk data imbalance
# Cek jumlah
cat("Jumlah data training:", nrow(train_data), "\n")
## Jumlah data training: 3724
cat("Jumlah data testing:", nrow(test_data), "\n")
## Jumlah data testing: 932
cat("Jumlah data training untuk data imbalance:", nrow(train_data_imb), "\n")
## Jumlah data training untuk data imbalance: 1910
cat("Jumlah data testing untuk data imbalance:", nrow(test_data_imb), "\n")
## Jumlah data testing untuk data imbalance: 478
# Pastikan target berupa factor
train_data_imb$RISIKO <- as.factor(train_data_imb$RISIKO)
test_data_imb$RISIKO <- as.factor(test_data_imb$RISIKO)
# Training model regresi logistik
model_logistik_imb <- glm(RISIKO ~ .,
data = train_data_imb,
family = binomial)
## Ringkasan model
print(summary(model_logistik_imb))
##
## Call:
## glm(formula = RISIKO ~ ., family = binomial, data = train_data_imb)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.370e+00 5.479e-01 -9.801 < 2e-16 ***
## PLAFON 4.945e-10 1.102e-10 4.487 7.23e-06 ***
## RATE 1.879e-01 4.844e-02 3.880 0.000105 ***
## GENDER 3.142e-01 2.581e-01 1.217 0.223502
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 676.72 on 1909 degrees of freedom
## Residual deviance: 644.54 on 1906 degrees of freedom
## AIC: 652.54
##
## Number of Fisher Scoring iterations: 6
## Probabilitas ketika RISIKO 1
model_logistik_prob_imb <- predict(model_logistik_imb, type = "response")
## 10 prrobabilitaas untuk kelas 1
print(model_logistik_prob_imb[1:10])
## 559 2363 1801 2101 2047 609 1032
## 0.02886821 0.05362175 0.02617056 0.03622874 0.02198560 0.03589598 0.05289925
## 1493 2083 2076
## 0.04803894 0.02363840 0.03254422
## dummy dari R
print(contrasts(train_data_imb$RISIKO))
## 1
## 0 0
## 1 1
# Pastikan target berupa factor
train_data$RISIKO <- as.factor(train_data$RISIKO)
test_data$RISIKO <- as.factor(test_data$RISIKO)
# Training model regresi logistik
model_logistik <- glm(RISIKO ~ .,
data = train_data,
family = binomial)
## Ringkasan model
summary(model_logistik)
##
## Call:
## glm(formula = RISIKO ~ ., family = binomial, data = train_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.3692018 0.1515064 -15.638 <2e-16 ***
## PLAFON 0.0044790 0.0003841 11.662 <2e-16 ***
## RATE 0.0365029 0.0022087 16.527 <2e-16 ***
## GENDER1 -0.0250951 0.0844829 -0.297 0.766
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 5161.3 on 3723 degrees of freedom
## Residual deviance: 4842.3 on 3720 degrees of freedom
## AIC: 4850.3
##
## Number of Fisher Scoring iterations: 4
## Probabilitas ketika nasabah memiliki risiko
model_logistik_prob <- predict(model_logistik, type = "response")
## 10 oribabilitas ketika RISIKO 1
model_logistik_prob[1:10]
## 2463 2511 2227 526 4291 2986 1842 1142
## 0.6930532 0.6019380 0.5881571 0.5942299 0.3794544 0.6126196 0.5313928 0.5447943
## 3371 3446
## 0.4833571 0.3758741
## Dummy dari E
contrasts(train_data$RISIKO)
## 1
## 0 0
## 1 1
# Training SVM dengan Tuning parameter
tune_svm_imb <- tune(
svm,
RISIKO ~ .,
data = train_data_imb,
kernel = "radial",
ranges = list(
cost = c(0.1, 1, 10, 100, 1000),
gamma = c(0.5, 1, 2, 3, 4)
)
)
# Hasil asil tuning
print(summary(tune_svm_imb))
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost gamma
## 0.1 0.5
##
## - best performance: 0.04293194
##
## - Detailed performance results:
## cost gamma error dispersion
## 1 1e-01 0.5 0.04293194 0.01651959
## 2 1e+00 0.5 0.04293194 0.01651959
## 3 1e+01 0.5 0.04450262 0.01506342
## 4 1e+02 0.5 0.04607330 0.01391791
## 5 1e+03 0.5 0.04816754 0.01557048
## 6 1e-01 1.0 0.04293194 0.01651959
## 7 1e+00 1.0 0.04450262 0.01585157
## 8 1e+01 1.0 0.04607330 0.01391791
## 9 1e+02 1.0 0.04816754 0.01413505
## 10 1e+03 1.0 0.05078534 0.01561930
## 11 1e-01 2.0 0.04293194 0.01651959
## 12 1e+00 2.0 0.04397906 0.01564852
## 13 1e+01 2.0 0.04712042 0.01351827
## 14 1e+02 2.0 0.04764398 0.01490078
## 15 1e+03 2.0 0.04973822 0.01696529
## 16 1e-01 3.0 0.04293194 0.01651959
## 17 1e+00 3.0 0.04345550 0.01656562
## 18 1e+01 3.0 0.04712042 0.01351827
## 19 1e+02 3.0 0.04816754 0.01595690
## 20 1e+03 3.0 0.05026178 0.01850242
## 21 1e-01 4.0 0.04293194 0.01651959
## 22 1e+00 4.0 0.04345550 0.01656562
## 23 1e+01 4.0 0.04659686 0.01448622
## 24 1e+02 4.0 0.04921466 0.01603307
## 25 1e+03 4.0 0.05183246 0.01918927
# Model terbaik
model_svm_tuned_imb <- tune_svm_imb$best.model
model_svm_tuned_imb
##
## Call:
## best.tune(METHOD = svm, train.x = RISIKO ~ ., data = train_data_imb,
## ranges = list(cost = c(0.1, 1, 10, 100, 1000), gamma = c(0.5,
## 1, 2, 3, 4)), kernel = "radial")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 0.1
##
## Number of Support Vectors: 221
# Parameter terbaik
tune_svm_imb$best.parameters
## cost gamma
## 1 0.1 0.5
# Training SVM dengan Tuning parameter
tune_svm <- tune(
svm,
RISIKO ~ .,
data = train_data,
kernel = "radial",
ranges = list(
cost = c(0.1, 1, 10, 100, 1000),
gamma = c(0.5, 1, 2, 3, 4)
)
)
# Hasil tuning
summary(tune_svm)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost gamma
## 1000 4
##
## - best performance: 0.2175077
##
## - Detailed performance results:
## cost gamma error dispersion
## 1 1e-01 0.5 0.3321716 0.02093097
## 2 1e+00 0.5 0.3112305 0.01793032
## 3 1e+01 0.5 0.2854536 0.01678488
## 4 1e+02 0.5 0.2723053 0.02301237
## 5 1e+03 0.5 0.2696172 0.02196933
## 6 1e-01 1.0 0.3174039 0.02152166
## 7 1e+00 1.0 0.2849152 0.01747435
## 8 1e+01 1.0 0.2671985 0.02280229
## 9 1e+02 1.0 0.2604832 0.02606386
## 10 1e+03 1.0 0.2615555 0.02128474
## 11 1e-01 2.0 0.3018320 0.01336495
## 12 1e+00 2.0 0.2671993 0.02644715
## 13 1e+01 2.0 0.2586007 0.02331320
## 14 1e+02 2.0 0.2545706 0.02803797
## 15 1e+03 2.0 0.2379205 0.02667281
## 16 1e-01 3.0 0.2881389 0.01798074
## 17 1e+00 3.0 0.2602151 0.02665716
## 18 1e+01 3.0 0.2545742 0.02764483
## 19 1e+02 3.0 0.2379205 0.02641124
## 20 1e+03 3.0 0.2290589 0.02855188
## 21 1e-01 4.0 0.2825045 0.01929022
## 22 1e+00 4.0 0.2604846 0.02558033
## 23 1e+01 4.0 0.2449054 0.02671872
## 24 1e+02 4.0 0.2293292 0.02672377
## 25 1e+03 4.0 0.2175077 0.02888289
# Model terbaik
model_svm_tuned <- tune_svm$best.model
model_svm_tuned
##
## Call:
## best.tune(METHOD = svm, train.x = RISIKO ~ ., data = train_data,
## ranges = list(cost = c(0.1, 1, 10, 100, 1000), gamma = c(0.5,
## 1, 2, 3, 4)), kernel = "radial")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1000
##
## Number of Support Vectors: 1752
# Parameter terbaik
tune_svm$best.parameters
## cost gamma
## 25 1000 4
# Set seed
set.seed(123)
# Training Random Forest
model_rf_imb <- randomForest(
RISIKO ~ .,
data = train_data_imb,
ntree = 100,
mtry = floor(sqrt(ncol(train_data_imb) - 1)),
importance = TRUE
)
# Ringkasan model
print(model_rf_imb)
##
## Call:
## randomForest(formula = RISIKO ~ ., data = train_data_imb, ntree = 100, mtry = floor(sqrt(ncol(train_data_imb) - 1)), importance = TRUE)
## Type of random forest: classification
## Number of trees: 100
## No. of variables tried at each split: 1
##
## OOB estimate of error rate: 4.29%
## Confusion matrix:
## 0 1 class.error
## 0 1828 0 0
## 1 82 0 1
# Feature Importance
importance(model_rf_imb)
## 0 1 MeanDecreaseAccuracy MeanDecreaseGini
## PLAFON 3.200588 7.160980 4.604895 16.793679
## RATE 2.593600 4.230859 3.756946 10.111842
## GENDER -2.321892 -1.829782 -2.742265 1.101632
# Set seed
set.seed(123)
# Training Random Forest
model_rf <- randomForest(
RISIKO ~ .,
data = train_data,
ntree = 100,
mtry = floor(sqrt(ncol(train_data) - 1)),
importance = TRUE
)
# Ringkasan model
model_rf
##
## Call:
## randomForest(formula = RISIKO ~ ., data = train_data, ntree = 100, mtry = floor(sqrt(ncol(train_data) - 1)), importance = TRUE)
## Type of random forest: classification
## Number of trees: 100
## No. of variables tried at each split: 1
##
## OOB estimate of error rate: 26.91%
## Confusion matrix:
## 0 1 class.error
## 0 1017 811 0.4436543
## 1 191 1705 0.1007384
# Feature Importance
importance(model_rf)
## 0 1 MeanDecreaseAccuracy MeanDecreaseGini
## PLAFON 14.0106451 14.1746590 15.5410920 227.612783
## RATE 12.8995680 15.6783026 16.8229826 249.923167
## GENDER -0.7413821 0.1400874 -0.3762267 2.634378
prob_pred_imb <- predict(
model_logistik_imb,
newdata = test_data_imb,
type = "response"
)
contrasts(test_data_imb$RISIKO)
## 1
## 0 0
## 1 1
# prbabilita prediksi untuk nasabah memiliki risiko
head(prob_pred_imb)
## 4 11 12 14 19 22
## 0.08035400 0.07718657 0.08165054 0.05793871 0.07908469 0.05478284
# prediksi RISIKO nasabah
prediksi_kelas_imb <- ifelse(prob_pred_imb > 0.5, 1, 0)
head(prediksi_kelas_imb)
## 4 11 12 14 19 22
## 0 0 0 0 0 0
# confusion matrix balance data
table(Prediksi = prediksi_kelas_imb, Aktual = test_data_imb$RISIKO)
## Aktual
## Prediksi 0 1
## 0 452 26
# Akurasi
akurasi_log_imb <- mean(prediksi_kelas_imb == test_data_imb$RISIKO)
akurasi_log_imb
## [1] 0.9456067
prob_pred <- predict(
model_logistik,
newdata = test_data,
type = "response"
)
contrasts(test_data$RISIKO)
## 1
## 0 0
## 1 1
# prbabilita prediksi untuk nasabah memiliki risiko
head(prob_pred)
## 6 8 16 23 24 28
## 0.5504735 0.4293028 0.6040368 0.3353674 0.5549559 0.6459937
# Prediksi kelas balance data
prediksi_kelas <- ifelse(prob_pred > 0.5, 1, 0)
head(prediksi_kelas)
## 6 8 16 23 24 28
## 1 0 1 0 1 1
# confusion matrix balance data
table(Prediksi = prediksi_kelas, Aktual = test_data$RISIKO)
## Aktual
## Prediksi 0 1
## 0 246 110
## 1 206 370
# Akurasi
akurasi_log <- mean(prediksi_kelas == test_data$RISIKO)
akurasi_log
## [1] 0.6609442
# PREDIKSI DATA TEST
prediksi_svm_imb <- predict(
model_svm_tuned_imb,
newdata = test_data_imb
)
# Data testing vs Prediksi
head(test_data_imb$RISIKO)
## [1] 0 0 0 0 0 0
## Levels: 0 1
head(prediksi_svm_imb)
## 4 11 12 14 19 22
## 0 0 0 0 0 0
## Levels: 0 1
# Matrix Confusion
table(
true = test_data_imb$RISIKO,
pred = prediksi_svm_imb
)
## pred
## true 0 1
## 0 452 0
## 1 26 0
akurasi_svm_imb <- mean(prediksi_svm_imb == test_data_imb$RISIKO)
# Akurasi
akurasi_svm_imb
## [1] 0.9456067
# PREDIKSI DATA TEST
prediksi_svm <- predict(
model_svm_tuned,
newdata = test_data
)
# Data testing vs Prediksi
head(test_data$RISIKO)
## [1] 1 1 1 1 1 1
## Levels: 0 1
head(prediksi_svm)
## 6 8 16 23 24 28
## 1 0 1 1 1 1
## Levels: 0 1
# Matrix Confusion
table(
true = test_data$RISIKO,
pred = prediksi_svm
)
## pred
## true 0 1
## 0 325 127
## 1 84 396
# Akurasi
akurasi_svm <- mean(prediksi_svm == test_data$RISIKO)
akurasi_svm
## [1] 0.7736052
prediksi_rf_imb <- predict(
model_rf_imb,
newdata = test_data_imb
)
# Hasil prediksi
head(prediksi_rf_imb)
## 4 11 12 14 19 22
## 0 0 0 0 0 0
## Levels: 0 1
# Confusion matrix
table(Prediksi = prediksi_rf_imb, Aktual = test_data_imb$RISIKO)
## Aktual
## Prediksi 0 1
## 0 452 26
## 1 0 0
# Akurasi
akurasi_rf_imb <- mean(prediksi_rf_imb == test_data_imb$RISIKO)
akurasi_rf_imb
## [1] 0.9456067
prediksi_rf <- predict(
model_rf,
newdata = test_data
)
# Lihat hasil prediksi
head(prediksi_rf)
## 6 8 16 23 24 28
## 1 0 1 1 1 1
## Levels: 0 1
# Confusion matrix
table(Prediksi = prediksi_rf, Aktual = test_data$RISIKO)
## Aktual
## Prediksi 0 1
## 0 236 49
## 1 216 431
# Akurasi
akurasi_rf <- mean(prediksi_rf == test_data$RISIKO)
akurasi_rf
## [1] 0.7156652
# Confusion matrix
conf_matrix_log_imb <- table(
factor(test_data_imb$RISIKO, levels = c(0,1)),
factor(prediksi_kelas_imb, levels = c(0,1))
)
print(conf_matrix_log_imb)
##
## 0 1
## 0 452 0
## 1 26 0
# Akurasi
akurasi_log_imb <- mean(prediksi_kelas_imb == test_data_imb$RISIKO)
cat("Akurasi:", akurasi_log_imb, "\n")
## Akurasi: 0.9456067
# Menghitung precision, recall, f1 untuk setiap kelas
# Extract values
# Assuming binary classification
if (nrow(conf_matrix_log_imb) == 2 && ncol(conf_matrix_log_imb) == 2) {
# Get class names
classes <- rownames(conf_matrix_log_imb)
# For each class as positive
for (i in 1:2) {
TP <- conf_matrix_log_imb[i, i]
FP <- sum(conf_matrix_log_imb[, i]) - TP
FN <- sum(conf_matrix_log_imb[i, ]) - TP
TN <- sum(conf_matrix_log_imb) - TP - FP - FN
precision <- ifelse((TP+FP)==0, 0, TP/(TP+FP))
recall <- ifelse((TP+FN)==0, 0, TP/(TP+FN))
f1 <- ifelse((precision+recall)==0, 0, 2*precision*recall/(precision+recall))
cat("Kelas:", classes[i], "\n")
cat(" Precision:", round(precision, 4), "\n")
cat(" Recall:", round(recall, 4), "\n")
cat(" F1-score:", round(f1, 4), "\n\n")
}
} else {
cat("Confusion matrix bukan 2x2, perlu penanganan khusus.\n")
}
## Kelas: 0
## Precision: 0.9456
## Recall: 1
## F1-score: 0.972
##
## Kelas: 1
## Precision: 0
## Recall: 0
## F1-score: 0
# Confusion matrix
conf_matrix <- table(Aktual = test_data$RISIKO, Prediksi = prediksi_kelas)
print(conf_matrix)
## Prediksi
## Aktual 0 1
## 0 246 206
## 1 110 370
# Akurasi
akurasi_log <- mean(prediksi_kelas == test_data$RISIKO)
cat("Akurasi:", akurasi_log, "\n")
## Akurasi: 0.6609442
# Menghitung precision, recall, f1 untuk setiap kelas
# Extract values
# Assuming binary classification
if (nrow(conf_matrix) == 2 && ncol(conf_matrix) == 2) {
# Get class names
classes <- rownames(conf_matrix)
# For each class as positive
for (i in 1:2) {
TP <- conf_matrix[i, i]
FP <- sum(conf_matrix[, i]) - TP
FN <- sum(conf_matrix[i, ]) - TP
TN <- sum(conf_matrix) - TP - FP - FN
precision <- ifelse((TP+FP)==0, 0, TP/(TP+FP))
recall <- ifelse((TP+FN)==0, 0, TP/(TP+FN))
f1 <- ifelse((precision+recall)==0, 0, 2*precision*recall/(precision+recall))
cat("Kelas:", classes[i], "\n")
cat(" Precision:", round(precision, 4), "\n")
cat(" Recall:", round(recall, 4), "\n")
cat(" F1-score:", round(f1, 4), "\n\n")
}
} else {
cat("Confusion matrix bukan 2x2, perlu penanganan khusus.\n")
}
## Kelas: 0
## Precision: 0.691
## Recall: 0.5442
## F1-score: 0.6089
##
## Kelas: 1
## Precision: 0.6424
## Recall: 0.7708
## F1-score: 0.7008
# Confusion Matrix
conf_matrix_svm_imb <- table(
true = test_data_imb$RISIKO,
pred = prediksi_svm_imb
)
print(conf_matrix_svm_imb)
## pred
## true 0 1
## 0 452 0
## 1 26 0
# Akurasi
akurasi_svm_imb <- mean(prediksi_svm_imb == test_data_imb$RISIKO)
cat("Akurasi SVM (tanpa SMOTE):", akurasi_svm_imb, "\n\n")
## Akurasi SVM (tanpa SMOTE): 0.9456067
# Menghitung precision, recall, F1-score untuk setiap kelas
if (nrow(conf_matrix_svm_imb) == 2 && ncol(conf_matrix_svm_imb) == 2) {
classes <- rownames(conf_matrix_svm_imb)
for (i in 1:2) {
TP <- conf_matrix_svm_imb[i, i] # True Positive
FP <- sum(conf_matrix_svm_imb[, i]) - TP # False Positive
FN <- sum(conf_matrix_svm_imb[i, ]) - TP # False Negative
TN <- sum(conf_matrix_svm_imb) - TP - FP - FN # True Negative (tidak langsung dipakai)
precision <- ifelse((TP + FP) == 0, 0, TP / (TP + FP))
recall <- ifelse((TP + FN) == 0, 0, TP / (TP + FN))
f1 <- ifelse((precision + recall) == 0, 0, 2 * precision * recall / (precision + recall))
cat("Kelas:", classes[i], "\n")
cat(" Precision:", round(precision, 4), "\n")
cat(" Recall: ", round(recall, 4), "\n")
cat(" F1-score: ", round(f1, 4), "\n\n")
}
} else {
cat("Confusion matrix bukan 2x2. Perlu penanganan khusus untuk multi-kelas.\n")
}
## Kelas: 0
## Precision: 0.9456
## Recall: 1
## F1-score: 0.972
##
## Kelas: 1
## Precision: 0
## Recall: 0
## F1-score: 0
# Confusion Matrix
conf_matrix_svm <- table(
true = test_data$RISIKO,
pred = prediksi_svm
)
print(conf_matrix_svm)
## pred
## true 0 1
## 0 325 127
## 1 84 396
# Akurasi
akurasi_svm <- mean(prediksi_svm == test_data$RISIKO)
cat("Akurasi SVM:", akurasi_svm, "\n\n")
## Akurasi SVM: 0.7736052
# Menghitung precision, recall, F1-score untuk setiap kelas
if (nrow(conf_matrix_svm) == 2 && ncol(conf_matrix_svm) == 2) {
classes <- rownames(conf_matrix_svm)
for (i in 1:2) {
TP <- conf_matrix_svm[i, i] # True Positive
FP <- sum(conf_matrix_svm[, i]) - TP # False Positive
FN <- sum(conf_matrix_svm[i, ]) - TP # False Negative
TN <- sum(conf_matrix_svm) - TP - FP - FN # True Negative (tidak langsung dipakai)
precision <- ifelse((TP + FP) == 0, 0, TP / (TP + FP))
recall <- ifelse((TP + FN) == 0, 0, TP / (TP + FN))
f1 <- ifelse((precision + recall) == 0, 0, 2 * precision * recall / (precision + recall))
cat("Kelas:", classes[i], "\n")
cat(" Precision:", round(precision, 4), "\n")
cat(" Recall: ", round(recall, 4), "\n")
cat(" F1-score: ", round(f1, 4), "\n\n")
}
} else {
cat("Confusion matrix bukan 2x2. Perlu penanganan khusus untuk multi-kelas.\n")
}
## Kelas: 0
## Precision: 0.7946
## Recall: 0.719
## F1-score: 0.7549
##
## Kelas: 1
## Precision: 0.7572
## Recall: 0.825
## F1-score: 0.7896
# Confusion Matrix
conf_matrix_rf_imb <- table(
true = test_data_imb$RISIKO,
pred = prediksi_rf_imb
)
print(conf_matrix_rf_imb)
## pred
## true 0 1
## 0 452 0
## 1 26 0
# Akurasi
akurasi_rf_imb <- mean(prediksi_rf_imb == test_data_imb$RISIKO)
cat("Akurasi Random Forest (tanpa SMOTE):", akurasi_rf_imb, "\n\n")
## Akurasi Random Forest (tanpa SMOTE): 0.9456067
# Menghitung precision, recall, F1-score untuk setiap kelas
if (nrow(conf_matrix_rf_imb) == 2 && ncol(conf_matrix_rf_imb) == 2) {
classes <- rownames(conf_matrix_rf_imb)
for (i in 1:2) {
TP <- conf_matrix_rf_imb[i, i] # True Positive
FP <- sum(conf_matrix_rf_imb[, i]) - TP # False Positive
FN <- sum(conf_matrix_rf_imb[i, ]) - TP # False Negative
TN <- sum(conf_matrix_rf_imb) - TP - FP - FN # True Negative (opsional)
precision <- ifelse((TP + FP) == 0, 0, TP / (TP + FP))
recall <- ifelse((TP + FN) == 0, 0, TP / (TP + FN))
f1 <- ifelse((precision + recall) == 0, 0, 2 * precision * recall / (precision + recall))
cat("Kelas:", classes[i], "\n")
cat(" Precision:", round(precision, 4), "\n")
cat(" Recall: ", round(recall, 4), "\n")
cat(" F1-score: ", round(f1, 4), "\n\n")
}
} else {
cat("Confusion matrix bukan 2x2. Perlu penanganan khusus untuk multi-kelas.\n")
}
## Kelas: 0
## Precision: 0.9456
## Recall: 1
## F1-score: 0.972
##
## Kelas: 1
## Precision: 0
## Recall: 0
## F1-score: 0
# Confusion Matrix
conf_matrix_rf <- table(
true = test_data$RISIKO,
pred = prediksi_rf
)
print(conf_matrix_rf)
## pred
## true 0 1
## 0 236 216
## 1 49 431
# Akurasi
akurasi_rf <- mean(prediksi_rf == test_data$RISIKO)
cat("Akurasi Random Forest:", akurasi_rf, "\n\n")
## Akurasi Random Forest: 0.7156652
# Menghitung precision, recall, F1-score untuk setiap kelas
if (nrow(conf_matrix_rf) == 2 && ncol(conf_matrix_rf) == 2) {
classes <- rownames(conf_matrix_rf)
for (i in 1:2) {
TP <- conf_matrix_rf[i, i] # True Positive
FP <- sum(conf_matrix_rf[, i]) - TP # False Positive
FN <- sum(conf_matrix_rf[i, ]) - TP # False Negative
TN <- sum(conf_matrix_rf) - TP - FP - FN # True Negative (opsional)
precision <- ifelse((TP + FP) == 0, 0, TP / (TP + FP))
recall <- ifelse((TP + FN) == 0, 0, TP / (TP + FN))
f1 <- ifelse((precision + recall) == 0, 0, 2 * precision * recall / (precision + recall))
cat("Kelas:", classes[i], "\n")
cat(" Precision:", round(precision, 4), "\n")
cat(" Recall: ", round(recall, 4), "\n")
cat(" F1-score: ", round(f1, 4), "\n\n")
}
} else {
cat("Confusion matrix bukan 2x2. Perlu penanganan khusus untuk multi-kelas.\n")
}
## Kelas: 0
## Precision: 0.8281
## Recall: 0.5221
## F1-score: 0.6404
##
## Kelas: 1
## Precision: 0.6662
## Recall: 0.8979
## F1-score: 0.7649