# STEP 1: DATA CLEANING (WITH "Y")
library(readxl)
## Warning: package 'readxl' was built under R version 4.5.2
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.5.2
# 1. Load data dengan skip=1 karena baris pertama adalah header
cat(" LOADING DATA...\n")
## LOADING DATA...
credit_data <- read_excel("default_credit_card.xls", skip = 1)
# Cek struktur data
cat("\n DATA STRUCTURE:\n")
##
## DATA STRUCTURE:
print(dim(credit_data))
## [1] 30000 25
cat("Nama kolom:\n")
## Nama kolom:
print(names(credit_data))
## [1] "ID" "LIMIT_BAL"
## [3] "SEX" "EDUCATION"
## [5] "MARRIAGE" "AGE"
## [7] "PAY_0" "PAY_2"
## [9] "PAY_3" "PAY_4"
## [11] "PAY_5" "PAY_6"
## [13] "BILL_AMT1" "BILL_AMT2"
## [15] "BILL_AMT3" "BILL_AMT4"
## [17] "BILL_AMT5" "BILL_AMT6"
## [19] "PAY_AMT1" "PAY_AMT2"
## [21] "PAY_AMT3" "PAY_AMT4"
## [23] "PAY_AMT5" "PAY_AMT6"
## [25] "default payment next month"
# 2. Rapikan nama kolom - cari kolom ID dan target Y
cat("\n🔧 FIXING COLUMN NAMES...\n")
##
## 🔧 FIXING COLUMN NAMES...
# Cari kolom mana yang mungkin ID
id_col <- which(sapply(credit_data, function(x) all(is.numeric(x))) &
sapply(credit_data, function(x) min(x, na.rm = TRUE) >= 1))
if(length(id_col) > 0) {
names(credit_data)[id_col[1]] <- "ID"
cat(" Kolom", id_col[1], "dijadikan sebagai 'ID'\n")
}
## Kolom 1 dijadikan sebagai 'ID'
# Cari kolom target Y
y_col <- which(toupper(names(credit_data)) == "Y" |
names(credit_data) == "default payment next month" |
grepl("default", names(credit_data), ignore.case = TRUE))
if(length(y_col) > 0) {
names(credit_data)[y_col[1]] <- "Y"
cat(" Kolom", y_col[1], "dijadikan sebagai 'Y' (target)\n")
} else {
# Jika tidak ditemukan, coba kolom terakhir
names(credit_data)[ncol(credit_data)] <- "Y"
cat(" Kolom Y tidak ditemukan, kolom terakhir dijadikan sebagai 'Y'\n")
}
## Kolom 25 dijadikan sebagai 'Y' (target)
# 3. Cek missing values
cat(" CHECKING MISSING VALUES:\n")
## CHECKING MISSING VALUES:
missing_summary <- colSums(is.na(credit_data))
print(missing_summary[missing_summary > 0])
## named numeric(0)
if(sum(missing_summary) == 0) {
cat(" Tidak ada missing values!\n")
} else {
cat(" Ada missing values, perlu ditangani...\n")
# Handle missing values
credit_data <- credit_data %>%
mutate(across(where(is.numeric), ~ifelse(is.na(.), median(., na.rm = TRUE), .)))
}
## Tidak ada missing values!
# 4. Cek duplikat
cat("\n CHECKING DUPLICATES:\n")
##
## CHECKING DUPLICATES:
duplicate_count <- sum(duplicated(credit_data))
cat("Jumlah baris duplikat:", duplicate_count, "\n")
## Jumlah baris duplikat: 0
if(duplicate_count > 0) {
credit_data <- distinct(credit_data)
cat(" Duplikat dihapus!\n")
}
# 5. Cek tipe data
cat("\n CHECKING DATA TYPES:\n")
##
## CHECKING DATA TYPES:
str(credit_data)
## tibble [30,000 × 25] (S3: tbl_df/tbl/data.frame)
## $ ID : num [1:30000] 1 2 3 4 5 6 7 8 9 10 ...
## $ LIMIT_BAL: num [1:30000] 20000 120000 90000 50000 50000 50000 500000 100000 140000 20000 ...
## $ SEX : num [1:30000] 2 2 2 2 1 1 1 2 2 1 ...
## $ EDUCATION: num [1:30000] 2 2 2 2 2 1 1 2 3 3 ...
## $ MARRIAGE : num [1:30000] 1 2 2 1 1 2 2 2 1 2 ...
## $ AGE : num [1:30000] 24 26 34 37 57 37 29 23 28 35 ...
## $ PAY_0 : num [1:30000] 2 -1 0 0 -1 0 0 0 0 -2 ...
## $ PAY_2 : num [1:30000] 2 2 0 0 0 0 0 -1 0 -2 ...
## $ PAY_3 : num [1:30000] -1 0 0 0 -1 0 0 -1 2 -2 ...
## $ PAY_4 : num [1:30000] -1 0 0 0 0 0 0 0 0 -2 ...
## $ PAY_5 : num [1:30000] -2 0 0 0 0 0 0 0 0 -1 ...
## $ PAY_6 : num [1:30000] -2 2 0 0 0 0 0 -1 0 -1 ...
## $ BILL_AMT1: num [1:30000] 3913 2682 29239 46990 8617 ...
## $ BILL_AMT2: num [1:30000] 3102 1725 14027 48233 5670 ...
## $ BILL_AMT3: num [1:30000] 689 2682 13559 49291 35835 ...
## $ BILL_AMT4: num [1:30000] 0 3272 14331 28314 20940 ...
## $ BILL_AMT5: num [1:30000] 0 3455 14948 28959 19146 ...
## $ BILL_AMT6: num [1:30000] 0 3261 15549 29547 19131 ...
## $ PAY_AMT1 : num [1:30000] 0 0 1518 2000 2000 ...
## $ PAY_AMT2 : num [1:30000] 689 1000 1500 2019 36681 ...
## $ PAY_AMT3 : num [1:30000] 0 1000 1000 1200 10000 657 38000 0 432 0 ...
## $ PAY_AMT4 : num [1:30000] 0 1000 1000 1100 9000 ...
## $ PAY_AMT5 : num [1:30000] 0 0 1000 1069 689 ...
## $ PAY_AMT6 : num [1:30000] 0 2000 5000 1000 679 ...
## $ Y : num [1:30000] 1 1 0 0 0 0 0 0 0 0 ...
# Konversi semua kolom numerik ke numeric
for(col in names(credit_data)) {
if(is.character(credit_data[[col]])) {
# Coba konversi ke numeric jika memungkinkan
num_vals <- as.numeric(credit_data[[col]])
if(sum(is.na(num_vals)) / length(num_vals) < 0.1) { # Jika <10% NA
credit_data[[col]] <- num_vals
cat(" Kolom", col, "dikonversi ke numeric\n")
}
}
}
# Pastikan target variable "Y" numeric
if("Y" %in% names(credit_data)) {
credit_data$Y <- as.numeric(as.character(credit_data$Y))
cat(" Target variable 'Y' dijadikan numeric\n")
cat(" Nilai unik Y:", unique(credit_data$Y), "\n")
}
## Target variable 'Y' dijadikan numeric
## Nilai unik Y: 1 0
# 6. Cek outlier (numerical columns)
cat("\n CHECKING OUTLIERS (numerical columns):\n")
##
## CHECKING OUTLIERS (numerical columns):
numeric_cols <- names(credit_data)[sapply(credit_data, is.numeric)]
for(col in numeric_cols) {
if(col != "Y") { # Skip target variable "Y"
q1 <- quantile(credit_data[[col]], 0.25, na.rm = TRUE)
q3 <- quantile(credit_data[[col]], 0.75, na.rm = TRUE)
iqr <- q3 - q1
if(iqr > 0) { # Hanya jika IQR > 0
lower_bound <- q1 - 1.5 * iqr
upper_bound <- q3 + 1.5 * iqr
outliers <- sum(credit_data[[col]] < lower_bound | credit_data[[col]] > upper_bound, na.rm = TRUE)
if(outliers > 0) {
cat(col, ":", outliers, "outlier(s) detected\n")
}
}
}
}
## LIMIT_BAL : 167 outlier(s) detected
## EDUCATION : 454 outlier(s) detected
## AGE : 272 outlier(s) detected
## PAY_0 : 3130 outlier(s) detected
## PAY_2 : 4410 outlier(s) detected
## PAY_3 : 4209 outlier(s) detected
## PAY_4 : 3508 outlier(s) detected
## PAY_5 : 2968 outlier(s) detected
## PAY_6 : 3079 outlier(s) detected
## BILL_AMT1 : 2400 outlier(s) detected
## BILL_AMT2 : 2395 outlier(s) detected
## BILL_AMT3 : 2469 outlier(s) detected
## BILL_AMT4 : 2622 outlier(s) detected
## BILL_AMT5 : 2725 outlier(s) detected
## BILL_AMT6 : 2693 outlier(s) detected
## PAY_AMT1 : 2745 outlier(s) detected
## PAY_AMT2 : 2714 outlier(s) detected
## PAY_AMT3 : 2598 outlier(s) detected
## PAY_AMT4 : 2994 outlier(s) detected
## PAY_AMT5 : 2945 outlier(s) detected
## PAY_AMT6 : 2958 outlier(s) detected
cat("\n DATA CLEANING COMPLETED!\n")
##
## DATA CLEANING COMPLETED!
cat(" FINAL DATA SUMMARY:\n")
## FINAL DATA SUMMARY:
cat("Total samples:", nrow(credit_data), "\n")
## Total samples: 30000
cat("Total features:", ncol(credit_data), "\n")
## Total features: 25
cat("Features (first 5):", names(credit_data)[1:min(5, ncol(credit_data))], "\n")
## Features (first 5): ID LIMIT_BAL SEX EDUCATION MARRIAGE
if("Y" %in% names(credit_data)) {
cat("Target variable 'Y' distribution:\n")
print(table(credit_data$Y))
}
## Target variable 'Y' distribution:
##
## 0 1
## 23364 6636
# STEP 2: EXPLORATORY DATA ANALYSIS (WITH "Y")
library(ggplot2)
# 1. Distribusi target "Y"
cat(" TARGET DISTRIBUTION (Y):\n")
## TARGET DISTRIBUTION (Y):
target_dist <- table(credit_data$Y)
print(target_dist)
##
## 0 1
## 23364 6636
default_rate <- target_dist[2]/sum(target_dist)*100
cat("Persentase Default (Y=1):", round(default_rate, 2), "%\n")
## Persentase Default (Y=1): 22.12 %
cat("Ratio Default vs Non-Default: 1:", round(target_dist[1]/target_dist[2], 1), "\n")
## Ratio Default vs Non-Default: 1: 3.5
# Plot distribusi target
ggplot(credit_data, aes(x = factor(Y, labels = c("No Default", "Default")),
fill = factor(Y))) +
geom_bar() +
geom_text(stat = 'count', aes(label = ..count..), vjust = -0.5) +
labs(title = "Distribution of Credit Card Default",
subtitle = paste("Default rate:", round(default_rate, 1), "%"),
x = "Default Status",
y = "Count") +
scale_fill_manual(values = c("0" = "purple", "1" = "pink"),
name = "Status") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5, face = "bold"),
plot.subtitle = element_text(hjust = 0.5))
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# 2. Statistik deskriptif
cat("\n DESCRIPTIVE STATISTICS FOR TARGET VARIABLE:\n")
##
## DESCRIPTIVE STATISTICS FOR TARGET VARIABLE:
print(summary(credit_data$Y))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.2212 0.0000 1.0000
cat("\n DESCRIPTIVE STATISTICS FOR NUMERICAL FEATURES:\n")
##
## DESCRIPTIVE STATISTICS FOR NUMERICAL FEATURES:
# Hanya tampilkan beberapa fitur penting
important_features <- c("LIMIT_BAL", "AGE", "BILL_AMT1", "PAY_AMT1")
print(summary(credit_data[, important_features]))
## LIMIT_BAL AGE BILL_AMT1 PAY_AMT1
## Min. : 10000 Min. :21.00 Min. :-165580 Min. : 0
## 1st Qu.: 50000 1st Qu.:28.00 1st Qu.: 3559 1st Qu.: 1000
## Median : 140000 Median :34.00 Median : 22382 Median : 2100
## Mean : 167484 Mean :35.49 Mean : 51223 Mean : 5664
## 3rd Qu.: 240000 3rd Qu.:41.00 3rd Qu.: 67091 3rd Qu.: 5006
## Max. :1000000 Max. :79.00 Max. : 964511 Max. :873552
# 3. Korelasi dengan target "Y"
cat("\n🔗 CORRELATION ANALYSIS WITH TARGET (Y):\n")
##
## 🔗 CORRELATION ANALYSIS WITH TARGET (Y):
numeric_data <- credit_data %>% select(where(is.numeric))
# Hanya hitung korelasi jika ada data numerik
if(ncol(numeric_data) > 1) {
# Cek korelasi dengan target "Y"
if("Y" %in% colnames(numeric_data)) {
# Hitung korelasi Pearson
correlations <- sapply(numeric_data, function(x) cor(x, numeric_data$Y, use = "complete.obs"))
correlations <- correlations[names(correlations) != "Y"] # Exclude Y itself
if(length(correlations) > 0) {
cat("Top 10 features with highest absolute correlation with Y:\n")
cor_df <- data.frame(
Feature = names(correlations),
Correlation = correlations,
Abs_Correlation = abs(correlations)
) %>% arrange(desc(Abs_Correlation))
print(head(cor_df, 10))
# Plot korelasi teratas
top_features <- head(cor_df$Feature, 6)
cat("\n📊 Top features with highest correlation to default:\n")
for(feature in top_features) {
cat(sprintf("%-15s: Correlation = %6.3f\n", feature, cor_df[cor_df$Feature == feature, "Correlation"]))
}
}
}
}
## Top 10 features with highest absolute correlation with Y:
## Feature Correlation Abs_Correlation
## PAY_0 PAY_0 0.32479373 0.32479373
## PAY_2 PAY_2 0.26355120 0.26355120
## PAY_3 PAY_3 0.23525251 0.23525251
## PAY_4 PAY_4 0.21661364 0.21661364
## PAY_5 PAY_5 0.20414891 0.20414891
## PAY_6 PAY_6 0.18686636 0.18686636
## LIMIT_BAL LIMIT_BAL -0.15351988 0.15351988
## PAY_AMT1 PAY_AMT1 -0.07292949 0.07292949
## PAY_AMT2 PAY_AMT2 -0.05857871 0.05857871
## PAY_AMT4 PAY_AMT4 -0.05682740 0.05682740
##
## 📊 Top features with highest correlation to default:
## PAY_0 : Correlation = 0.325
## PAY_2 : Correlation = 0.264
## PAY_3 : Correlation = 0.235
## PAY_4 : Correlation = 0.217
## PAY_5 : Correlation = 0.204
## PAY_6 : Correlation = 0.187
# 4. Visualisasi distribusi beberapa fitur vs Y
cat("\n📊 VISUALIZING IMPORTANT FEATURES VS TARGET (Y):\n")
##
## 📊 VISUALIZING IMPORTANT FEATURES VS TARGET (Y):
# Pilih 6 fitur yang paling menarik
selected_features <- c("LIMIT_BAL", "AGE", "PAY_0", "BILL_AMT1", "PAY_AMT1", "EDUCATION")
for(col in selected_features) {
if(col %in% names(credit_data)) {
cat("\nPlotting:", col, "\n")
# Boxplot untuk fitur numerik
if(is.numeric(credit_data[[col]])) {
p <- ggplot(credit_data, aes(x = factor(Y, labels = c("No Default", "Default")),
y = .data[[col]], fill = factor(Y))) +
geom_boxplot(alpha = 0.7) +
labs(title = paste(col, "by Default Status"),
x = "Default Status", y = col) +
scale_fill_manual(values = c("0" = "green", "1" = "lightblue"), guide = "none") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))
print(p)
# Hitung statistik ringkasan per kelompok
stats <- credit_data %>%
group_by(Y) %>%
summarise(
Mean = mean(.data[[col]], na.rm = TRUE),
Median = median(.data[[col]], na.rm = TRUE),
SD = sd(.data[[col]], na.rm = TRUE),
Min = min(.data[[col]], na.rm = TRUE),
Max = max(.data[[col]], na.rm = TRUE)
)
cat("Statistics for", col, "by group:\n")
print(stats)
}
}
}
##
## Plotting: LIMIT_BAL

## Statistics for LIMIT_BAL by group:
## # A tibble: 2 × 6
## Y Mean Median SD Min Max
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 178100. 150000 131628. 10000 1000000
## 2 1 130110. 90000 115379. 10000 740000
##
## Plotting: AGE

## Statistics for AGE by group:
## # A tibble: 2 × 6
## Y Mean Median SD Min Max
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 35.4 34 9.08 21 79
## 2 1 35.7 34 9.69 21 75
##
## Plotting: PAY_0

## Statistics for PAY_0 by group:
## # A tibble: 2 × 6
## Y Mean Median SD Min Max
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 -0.211 0 0.952 -2 8
## 2 1 0.668 1 1.38 -2 8
##
## Plotting: BILL_AMT1

## Statistics for BILL_AMT1 by group:
## # A tibble: 2 × 6
## Y Mean Median SD Min Max
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 51994. 23120. 73578. -165580 964511
## 2 1 48509. 20185 73782. -6676 613860
##
## Plotting: PAY_AMT1

## Statistics for PAY_AMT1 by group:
## # A tibble: 2 × 6
## Y Mean Median SD Min Max
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 6307. 2460. 18015. 0 873552
## 2 1 3397. 1636 9544. 0 300000
##
## Plotting: EDUCATION

## Statistics for EDUCATION by group:
## # A tibble: 2 × 6
## Y Mean Median SD Min Max
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 1.84 2 0.807 0 6
## 2 1 1.89 2 0.728 1 6
# 5. Distribusi kategori untuk variabel kategorikal
cat("\n CATEGORICAL VARIABLES DISTRIBUTION:\n")
##
## CATEGORICAL VARIABLES DISTRIBUTION:
categorical_vars <- c("SEX", "EDUCATION", "MARRIAGE")
for(col in categorical_vars) {
if(col %in% names(credit_data)) {
cat("\n", col, "distribution:\n")
# Cross tabulation dengan Y
cross_tab <- table(credit_data[[col]], credit_data$Y)
colnames(cross_tab) <- c("No Default", "Default")
print(cross_tab)
# Hitung default rate per kategori
default_rates <- prop.table(cross_tab, margin = 1)[, 2] * 100
cat("Default rates by category (%):\n")
print(round(default_rates, 1))
# Plot
p <- credit_data %>%
group_by(.data[[col]], Y) %>%
summarise(Count = n(), .groups = 'drop') %>%
mutate(Percentage = Count / sum(Count) * 100) %>%
ggplot(aes(x = factor(.data[[col]]), y = Count, fill = factor(Y, labels = c("No Default", "Default")))) +
geom_bar(stat = "identity", position = "fill") +
geom_text(aes(label = paste0(round(Percentage, 1), "%")),
position = position_fill(vjust = 0.5), size = 3) +
labs(title = paste("Default Distribution by", col),
x = col, y = "Proportion") +
scale_fill_manual(values = c("No Default" = "orange", "Default" = "yellow"),
name = "Status") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))
print(p)
}
}
##
## SEX distribution:
##
## No Default Default
## 1 9015 2873
## 2 14349 3763
## Default rates by category (%):
## 1 2
## 24.2 20.8

##
## EDUCATION distribution:
##
## No Default Default
## 0 14 0
## 1 8549 2036
## 2 10700 3330
## 3 3680 1237
## 4 116 7
## 5 262 18
## 6 43 8
## Default rates by category (%):
## 0 1 2 3 4 5 6
## 0.0 19.2 23.7 25.2 5.7 6.4 15.7

##
## MARRIAGE distribution:
##
## No Default Default
## 0 49 5
## 1 10453 3206
## 2 12623 3341
## 3 239 84
## Default rates by category (%):
## 0 1 2 3
## 9.3 23.5 20.9 26.0

# STEP 3: DATA PREPARATION FOR MODELING (WITH "Y")
# Load library caret untuk data splitting
library(caret)
## Warning: package 'caret' was built under R version 4.5.2
## Loading required package: lattice
# 1. Pisahkan features dan target
cat("\n SEPARATING FEATURES AND TARGET...\n")
##
## SEPARATING FEATURES AND TARGET...
X <- credit_data %>% select(-Y)
Y <- credit_data$Y
cat("Features (X) shape:", nrow(X), "samples x", ncol(X), "features\n")
## Features (X) shape: 30000 samples x 24 features
cat("Target (Y) length :", length(Y), "\n")
## Target (Y) length : 30000
cat("Target summary:\n")
## Target summary:
print(summary(Y))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.2212 0.0000 1.0000
# 2. Encoding variabel kategorikal
cat("\n ENCODING CATEGORICAL VARIABLES...\n")
##
## ENCODING CATEGORICAL VARIABLES...
# Identifikasi variabel kategorikal
categorical_candidates <- c("SEX", "EDUCATION", "MARRIAGE")
categorical_found <- categorical_candidates[categorical_candidates %in% names(X)]
if(length(categorical_found) > 0) {
cat("Categorical variables found:", categorical_found, "\n")
# Tampilkan nilai unik sebelum encoding
for(col in categorical_found) {
cat("\n", col, "unique values:\n")
print(sort(unique(X[[col]])))
cat("Counts:\n")
print(table(X[[col]]))
}
# Convert to factor
for(col in categorical_found) {
X[[col]] <- as.factor(X[[col]])
cat("Done", col, "converted to factor\n")
}
} else {
cat(" No categorical variables found\n")
}
## Categorical variables found: SEX EDUCATION MARRIAGE
##
## SEX unique values:
## [1] 1 2
## Counts:
##
## 1 2
## 11888 18112
##
## EDUCATION unique values:
## [1] 0 1 2 3 4 5 6
## Counts:
##
## 0 1 2 3 4 5 6
## 14 10585 14030 4917 123 280 51
##
## MARRIAGE unique values:
## [1] 0 1 2 3
## Counts:
##
## 0 1 2 3
## 54 13659 15964 323
## Done SEX converted to factor
## Done EDUCATION converted to factor
## Done MARRIAGE converted to factor
# 3. Split data train-test (80-20)
cat("\n SPLITTING DATA INTO TRAIN-TEST SETS (80-20)...\n")
##
## SPLITTING DATA INTO TRAIN-TEST SETS (80-20)...
set.seed(123)
# Pastikan Y adalah faktor untuk stratified sampling
Y_factor <- as.factor(Y)
train_index <- createDataPartition(Y_factor, p = 0.8, list = FALSE)
X_train <- X[train_index, ]
X_test <- X[-train_index, ]
Y_train <- Y[train_index]
Y_test <- Y[-train_index]
cat(" Data split completed:\n")
## Data split completed:
cat(" Train set:", nrow(X_train), "samples (", round(nrow(X_train)/nrow(X)*100, 1), "%)\n")
## Train set: 24001 samples ( 80 %)
cat(" Test set :", nrow(X_test), "samples (", round(nrow(X_test)/nrow(X)*100, 1), "%)\n")
## Test set : 5999 samples ( 20 %)
# 4. Cek distribusi kelas di train dan test
cat("\n CHECKING CLASS DISTRIBUTION...\n")
##
## CHECKING CLASS DISTRIBUTION...
train_dist <- table(Y_train)
test_dist <- table(Y_test)
cat("Training set distribution:\n")
## Training set distribution:
cat(" Non-Default (0):", train_dist["0"], "(", round(train_dist["0"]/length(Y_train)*100, 1), "%)\n")
## Non-Default (0): 18692 ( 77.9 %)
cat(" Default (1) :", train_dist["1"], "(", round(train_dist["1"]/length(Y_train)*100, 1), "%)\n")
## Default (1) : 5309 ( 22.1 %)
cat("\nTest set distribution:\n")
##
## Test set distribution:
cat(" Non-Default (0):", test_dist["0"], "(", round(test_dist["0"]/length(Y_test)*100, 1), "%)\n")
## Non-Default (0): 4672 ( 77.9 %)
cat(" Default (1) :", test_dist["1"], "(", round(test_dist["1"]/length(Y_test)*100, 1), "%)\n")
## Default (1) : 1327 ( 22.1 %)
# 5. Handle class imbalance (jika ada)
cat("\n CLASS IMBALANCE ANALYSIS...\n")
##
## CLASS IMBALANCE ANALYSIS...
default_rate_train <- mean(Y_train == 1)
default_rate_test <- mean(Y_test == 1)
cat("Default rate in training set:", round(default_rate_train*100, 2), "%\n")
## Default rate in training set: 22.12 %
cat("Default rate in test set :", round(default_rate_test*100, 2), "%\n")
## Default rate in test set : 22.12 %
# Cek apakah ada imbalance signifikan
if(default_rate_train < 0.2 || default_rate_train > 0.8) {
cat("\n SIGNIFICANT CLASS IMBALANCE DETECTED!\n")
cat(" Training default rate:", round(default_rate_train*100, 1), "%\n")
cat(" Consider using:\n")
cat(" - Class weights in model training\n")
cat(" - Oversampling (SMOTE) for minority class\n")
cat(" - Undersampling for majority class\n")
cat(" - Ensemble methods\n")
} else {
cat("\n Class distribution is acceptable for modeling\n")
}
##
## Class distribution is acceptable for modeling
# 6. Data summary setelah preparation
cat("\n FINAL DATA PREPARATION SUMMARY:\n")
##
## FINAL DATA PREPARATION SUMMARY:
cat("Original data size :", nrow(credit_data), "x", ncol(credit_data), "\n")
## Original data size : 30000 x 25
cat("Features (X) size :", nrow(X), "x", ncol(X), "\n")
## Features (X) size : 30000 x 24
cat("Training set size :", nrow(X_train), "x", ncol(X_train), "\n")
## Training set size : 24001 x 24
cat("Test set size :", nrow(X_test), "x", ncol(X_test), "\n")
## Test set size : 5999 x 24
cat("Categorical features:", length(categorical_found), "\n")
## Categorical features: 3
cat("Numerical features :", ncol(X) - length(categorical_found), "\n")
## Numerical features : 21
# 7. Cek tipe data final
cat("\n FINAL DATA TYPES CHECK:\n")
##
## FINAL DATA TYPES CHECK:
cat("Training features structure:\n")
## Training features structure:
str(X_train, list.len = 5)
## tibble [24,001 × 24] (S3: tbl_df/tbl/data.frame)
## $ ID : num [1:24001] 1 2 3 4 5 6 7 8 9 11 ...
## $ LIMIT_BAL: num [1:24001] 20000 120000 90000 50000 50000 50000 500000 100000 140000 200000 ...
## $ SEX : Factor w/ 2 levels "1","2": 2 2 2 2 1 1 1 2 2 2 ...
## $ EDUCATION: Factor w/ 7 levels "0","1","2","3",..: 3 3 3 3 3 2 2 3 4 4 ...
## $ MARRIAGE : Factor w/ 4 levels "0","1","2","3": 2 3 3 2 2 3 3 3 2 3 ...
## [list output truncated]
cat("\n DATA PREPARATION COMPLETED!\n")
##
## DATA PREPARATION COMPLETED!
cat(" Ready for model training in STEP 4\n")
## Ready for model training in STEP 4
# STEP 4: MODEL BUILDING (WITH "Y")
# Load library untuk modeling
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.5.2
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
cat(" TRAINING MACHINE LEARNING MODELS...\n")
## TRAINING MACHINE LEARNING MODELS...
cat("Dataset memiliki", ncol(X_train), "fitur dan", nrow(X_train), "samples training\n")
## Dataset memiliki 24 fitur dan 24001 samples training
# 1. LOGISTIC REGRESSION
cat("\n")
cat(paste(rep("-", 40), collapse = ""), "\n")
## ----------------------------------------
cat(" TRAINING LOGISTIC REGRESSION...\n")
## TRAINING LOGISTIC REGRESSION...
log_model <- glm(Y_train ~ .,
data = data.frame(X_train, Y_train = Y_train),
family = binomial(link = "logit"))
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
cat(" Logistic Regression trained successfully!\n")
## Logistic Regression trained successfully!
cat(" Number of coefficients:", length(coef(log_model)), "\n")
## Number of coefficients: 32
cat(" Model convergence:", log_model$converged, "\n")
## Model convergence: TRUE
# Summary model logistic regression
cat("\n LOGISTIC REGRESSION SUMMARY (Top 10 coefficients):\n")
##
## LOGISTIC REGRESSION SUMMARY (Top 10 coefficients):
log_summary <- summary(log_model)
log_coef <- coef(log_summary)
# Tampilkan 10 coefficient terpenting (berdasarkan p-value)
if(nrow(log_coef) > 10) {
significant_coef <- log_coef[order(abs(log_coef[, "z value"]), decreasing = TRUE), ]
print(head(significant_coef, 10))
} else {
print(log_coef)
}
## Estimate Std. Error z value Pr(>|z|)
## PAY_0 5.682950e-01 1.988808e-02 28.574659 1.387668e-179
## PAY_AMT1 -1.369627e-05 2.574031e-06 -5.320941 1.032320e-07
## BILL_AMT1 -7.244998e-06 1.365865e-06 -5.304329 1.130883e-07
## PAY_AMT2 -1.125372e-05 2.496951e-06 -4.506983 6.575592e-06
## PAY_2 9.680623e-02 2.260219e-02 4.283046 1.843523e-05
## LIMIT_BAL -6.888196e-07 1.765485e-07 -3.901590 9.556302e-05
## AGE 5.859174e-03 2.086780e-03 2.807758 4.988772e-03
## SEX2 -9.397502e-02 3.444322e-02 -2.728404 6.364157e-03
## PAY_3 5.827961e-02 2.544666e-02 2.290266 2.200590e-02
## PAY_AMT4 -4.761408e-06 2.079422e-06 -2.289775 2.203437e-02
# 2. RANDOM FOREST
cat("\n")
cat(paste(rep("-", 40), collapse = ""), "\n")
## ----------------------------------------
cat(" TRAINING RANDOM FOREST...\n")
## TRAINING RANDOM FOREST...
# Pastikan Y_train sebagai factor untuk classification
Y_train_factor <- as.factor(Y_train)
# Train Random Forest dengan parameter optimal
set.seed(123)
cat("Training Random Forest with 100 trees...\n")
## Training Random Forest with 100 trees...
rf_model <- randomForest(x = X_train,
y = Y_train_factor,
ntree = 100,
mtry = floor(sqrt(ncol(X_train))), # Default untuk classification
importance = TRUE,
do.trace = 10, # Tampilkan progress setiap 10 trees
na.action = na.omit)
## ntree OOB 1 2
## 10: 22.80% 11.10% 63.91%
## 20: 20.30% 7.97% 63.70%
## 30: 19.33% 6.66% 63.93%
## 40: 18.92% 6.32% 63.29%
## 50: 18.80% 6.13% 63.42%
## 60: 18.72% 6.02% 63.44%
## 70: 18.67% 5.94% 63.46%
## 80: 18.64% 5.99% 63.19%
## 90: 18.50% 5.91% 62.84%
## 100: 18.43% 5.78% 62.95%
cat("\n Random Forest trained successfully!\n")
##
## Random Forest trained successfully!
cat(" Number of trees:", rf_model$ntree, "\n")
## Number of trees: 100
if(!is.null(rf_model$err.rate)) {
cat(" OOB error rate:", round(mean(rf_model$err.rate[, "OOB"]), 4), "\n")
}
## OOB error rate: 0.1971
if(!is.null(rf_model$confusion)) {
cat(" Confusion matrix (OOB):\n")
print(rf_model$confusion)
}
## Confusion matrix (OOB):
## 0 1 class.error
## 0 17611 1081 0.05783223
## 1 3342 1967 0.62949708
# STEP 5: MODEL EVALUATION (WITH "Y")
# Load library untuk evaluasi
library(pROC)
## Warning: package 'pROC' was built under R version 4.5.2
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(caret)
cat(" EVALUATING MODEL PERFORMANCE ON TEST SET...\n")
## EVALUATING MODEL PERFORMANCE ON TEST SET...
cat("Test set size:", nrow(X_test), "samples\n")
## Test set size: 5999 samples
# 1. PREDICTIONS
cat("\n MAKING PREDICTIONS...\n")
##
## MAKING PREDICTIONS...
# Logistic Regression predictions
log_prob <- predict(log_model, newdata = X_test, type = "response")
log_class <- ifelse(log_prob > 0.5, 1, 0)
# Random Forest predictions
rf_class <- predict(rf_model, newdata = X_test)
rf_prob <- predict(rf_model, newdata = X_test, type = "prob")[, "1"]
cat(" Predictions completed\n")
## Predictions completed
cat(" Logistic Regression: probability and class predictions\n")
## Logistic Regression: probability and class predictions
cat(" Random Forest: class and probability predictions\n")
## Random Forest: class and probability predictions
# 2. CONFUSION MATRICES
cat("\n CONFUSION MATRICES:\n")
##
## CONFUSION MATRICES:
# Fungsi untuk menampilkan confusion matrix yang informatif
print_confusion_matrix <- function(true, pred, model_name) {
cat("\n", model_name, "Confusion Matrix:\n")
cm <- confusionMatrix(factor(pred), factor(true), positive = "1")
# Tampilkan matrix
print(cm$table)
# Tampilkan metrics
cat("\nPerformance Metrics:\n")
cat("Accuracy :", round(cm$overall["Accuracy"], 4), "\n")
cat("Sensitivity (Recall) :", round(cm$byClass["Sensitivity"], 4), "\n")
cat("Specificity :", round(cm$byClass["Specificity"], 4), "\n")
cat("Precision :", round(cm$byClass["Precision"], 4), "\n")
cat("F1-Score :", round(cm$byClass["F1"], 4), "\n")
return(cm)
}
# Logistic Regression CM
log_cm <- print_confusion_matrix(Y_test, log_class, "LOGISTIC REGRESSION")
##
## LOGISTIC REGRESSION Confusion Matrix:
## Reference
## Prediction 0 1
## 0 4541 996
## 1 131 331
##
## Performance Metrics:
## Accuracy : 0.8121
## Sensitivity (Recall) : 0.2494
## Specificity : 0.972
## Precision : 0.7165
## F1-Score : 0.37
# Random Forest CM
rf_cm <- print_confusion_matrix(Y_test, rf_class, "RANDOM FOREST")
##
## RANDOM FOREST Confusion Matrix:
## Reference
## Prediction 0 1
## 0 4420 817
## 1 252 510
##
## Performance Metrics:
## Accuracy : 0.8218
## Sensitivity (Recall) : 0.3843
## Specificity : 0.9461
## Precision : 0.6693
## F1-Score : 0.4883
# 3. ROC-AUC ANALYSIS
cat("\n ROC-AUC ANALYSIS:\n")
##
## ROC-AUC ANALYSIS:
# ROC curves
log_roc <- roc(Y_test, log_prob)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
rf_roc <- roc(Y_test, rf_prob)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
cat("Area Under Curve (AUC):\n")
## Area Under Curve (AUC):
cat(" Logistic Regression:", round(auc(log_roc), 4), "\n")
## Logistic Regression: 0.7222
cat(" Random Forest :", round(auc(rf_roc), 4), "\n")
## Random Forest : 0.7609
# Plot ROC curves
cat("\n📈 PLOTTING ROC CURVES...\n")
##
## 📈 PLOTTING ROC CURVES...
plot(log_roc, col = "blue", main = "ROC Curves Comparison", lwd = 2)
lines(rf_roc, col = "red", lwd = 2)
legend("bottomright",
legend = c(paste("Logistic Regression (AUC =", round(auc(log_roc), 3), ")"),
paste("Random Forest (AUC =", round(auc(rf_roc), 3), ")")),
col = c("blue", "red"), lwd = 2)

# 4. FEATURE IMPORTANCE ANALYSIS
cat("\n FEATURE IMPORTANCE ANALYSIS:\n")
##
## FEATURE IMPORTANCE ANALYSIS:
# Random Forest Feature Importance
cat("\n RANDOM FOREST FEATURE IMPORTANCE (Top 15):\n")
##
## RANDOM FOREST FEATURE IMPORTANCE (Top 15):
if(!is.null(rf_model$importance)) {
imp <- importance(rf_model)
imp_df <- data.frame(
Feature = rownames(imp),
MeanDecreaseGini = imp[, "MeanDecreaseGini"],
MeanDecreaseAccuracy = imp[, "MeanDecreaseAccuracy"]
)
imp_df <- imp_df[order(imp_df$MeanDecreaseGini, decreasing = TRUE), ]
print(head(imp_df, 15))
# Plot feature importance
par(mar = c(5, 8, 4, 2))
barplot(rev(head(imp_df$MeanDecreaseGini, 10)),
names.arg = rev(head(imp_df$Feature, 10)),
horiz = TRUE,
las = 1,
col = "steelblue",
main = "Top 10 Feature Importance (Random Forest)",
xlab = "Mean Decrease Gini")
par(mar = c(5, 4, 4, 2)) # Reset margin
}
## Feature MeanDecreaseGini MeanDecreaseAccuracy
## PAY_0 PAY_0 810.0435 59.010524
## ID ID 540.8089 2.214992
## BILL_AMT1 BILL_AMT1 449.7547 16.036735
## AGE AGE 427.6229 7.375773
## BILL_AMT2 BILL_AMT2 416.0259 18.957307
## PAY_AMT1 PAY_AMT1 407.8766 15.667540
## LIMIT_BAL LIMIT_BAL 394.0394 14.275521
## BILL_AMT3 BILL_AMT3 390.5940 21.018379
## BILL_AMT4 BILL_AMT4 381.7948 23.867476
## BILL_AMT6 BILL_AMT6 381.2817 20.688393
## BILL_AMT5 BILL_AMT5 377.4902 18.904437
## PAY_AMT2 PAY_AMT2 367.0724 15.675822
## PAY_AMT6 PAY_AMT6 344.8493 14.660677
## PAY_AMT3 PAY_AMT3 341.1450 18.304254
## PAY_AMT5 PAY_AMT5 336.5149 14.451095

# 5. MODEL COMPARISON SUMMARY
cat("\n MODEL COMPARISON SUMMARY:\n")
##
## MODEL COMPARISON SUMMARY:
comparison_df <- data.frame(
Metric = c("Accuracy", "Sensitivity (Recall)", "Specificity",
"Precision", "F1-Score", "AUC"),
Logistic_Regression = c(
round(log_cm$overall["Accuracy"], 4),
round(log_cm$byClass["Sensitivity"], 4),
round(log_cm$byClass["Specificity"], 4),
round(log_cm$byClass["Precision"], 4),
round(log_cm$byClass["F1"], 4),
round(auc(log_roc), 4)
),
Random_Forest = c(
round(rf_cm$overall["Accuracy"], 4),
round(rf_cm$byClass["Sensitivity"], 4),
round(rf_cm$byClass["Specificity"], 4),
round(rf_cm$byClass["Precision"], 4),
round(rf_cm$byClass["F1"], 4),
round(auc(rf_roc), 4)
)
)
print(comparison_df)
## Metric Logistic_Regression Random_Forest
## Accuracy Accuracy 0.8121 0.8218
## Sensitivity Sensitivity (Recall) 0.2494 0.3843
## Specificity Specificity 0.9720 0.9461
## Precision Precision 0.7165 0.6693
## F1 F1-Score 0.3700 0.4883
## AUC 0.7222 0.7609
# Determine best model
cat("\n BEST MODEL DETERMINATION:\n")
##
## BEST MODEL DETERMINATION:
if(auc(rf_roc) > auc(log_roc)) {
cat(" Random Forest performs better based on AUC\n")
best_model <- "RANDOM FOREST"
} else {
cat(" Logistic Regression performs better based on AUC\n")
best_model <- "LOGISTIC REGRESSION"
}
## Random Forest performs better based on AUC
# STEP 6: FINAL REPORT AND RECOMMENDATIONS
# 1. EXECUTIVE SUMMARY
cat("\n EXECUTIVE SUMMARY\n")
##
## EXECUTIVE SUMMARY
cat(paste(rep("-", 50), collapse = ""), "\n")
## --------------------------------------------------
cat(" Dataset Overview:\n")
## Dataset Overview:
cat(" • Total samples:", nrow(credit_data), "\n")
## • Total samples: 30000
cat(" • Default rate:", round(mean(credit_data$Y) * 100, 1), "%\n")
## • Default rate: 22.1 %
cat(" • Features analyzed:", ncol(X), "\n")
## • Features analyzed: 24
cat(" • Training samples:", nrow(X_train), "\n")
## • Training samples: 24001
cat(" • Testing samples:", nrow(X_test), "\n")
## • Testing samples: 5999
cat("\n Key Findings:\n")
##
## Key Findings:
cat(" • PAY_0 (payment status) is the STRONGEST predictor of default\n")
## • PAY_0 (payment status) is the STRONGEST predictor of default
cat(" • Significant class imbalance (78% non-default vs 22% default)\n")
## • Significant class imbalance (78% non-default vs 22% default)
cat(" • Random Forest outperforms Logistic Regression overall\n")
## • Random Forest outperforms Logistic Regression overall
cat(" • Model can detect 38% of actual defaults (Random Forest)\n")
## • Model can detect 38% of actual defaults (Random Forest)
# 2. MODEL PERFORMANCE DEEP DIVE
cat("\n MODEL PERFORMANCE DEEP DIVE\n")
##
## MODEL PERFORMANCE DEEP DIVE
cat(paste(rep("-", 50), collapse = ""), "\n")
## --------------------------------------------------
cat(" Logistic Regression:\n")
## Logistic Regression:
cat(" • Strengths: High specificity (97.2%), High precision (71.7%)\n")
## • Strengths: High specificity (97.2%), High precision (71.7%)
cat(" • Weaknesses: Low sensitivity (24.9%), Misses 75% of defaults\n")
## • Weaknesses: Low sensitivity (24.9%), Misses 75% of defaults
cat(" • Best for: Conservative risk assessment, minimizing false alarms\n\n")
## • Best for: Conservative risk assessment, minimizing false alarms
cat(" Random Forest:\n")
## Random Forest:
cat(" • Strengths: Better overall accuracy (82.2%), Higher sensitivity (38.4%)\n")
## • Strengths: Better overall accuracy (82.2%), Higher sensitivity (38.4%)
cat(" • Weaknesses: Lower precision (66.9%), More false alarms\n")
## • Weaknesses: Lower precision (66.9%), More false alarms
cat(" • Best for: Maximizing default detection, Overall portfolio safety\n")
## • Best for: Maximizing default detection, Overall portfolio safety
# 3. BUSINESS IMPACT ANALYSIS
cat("\n BUSINESS IMPACT ANALYSIS\n")
##
## BUSINESS IMPACT ANALYSIS
cat(paste(rep("-", 50), collapse = ""), "\n")
## --------------------------------------------------
# Hitung potential financial impact
avg_limit <- mean(credit_data$LIMIT_BAL)
default_amount <- avg_limit * 0.5 # Asumsi 50% dari limit akan hilang jika default
cat(" Financial Impact Estimation:\n")
## Financial Impact Estimation:
cat(" • Average credit limit: Rp", format(round(avg_limit), big.mark = ","), "\n")
## • Average credit limit: Rp 167,484
cat(" • Estimated loss per default: Rp", format(round(default_amount), big.mark = ","), "\n\n")
## • Estimated loss per default: Rp 83,742
cat(" Model Comparison Impact:\n")
## Model Comparison Impact:
# Logistic Regression impact
lr_detected <- 331
lr_missed <- 996
lr_fp <- 131
# Random Forest impact
rf_detected <- 510
rf_missed <- 817
rf_fp <- 252
cat(" Logistic Regression:\n")
## Logistic Regression:
cat(" • Defaults detected:", lr_detected, "/ 1327 (", round(lr_detected/1327*100, 1), "%)\n")
## • Defaults detected: 331 / 1327 ( 24.9 %)
cat(" • Potential loss prevented: Rp", format(round(lr_detected * default_amount), big.mark = ","), "\n")
## • Potential loss prevented: Rp 27,718,655
cat(" • False alarms:", lr_fp, "customers\n\n")
## • False alarms: 131 customers
cat(" Random Forest:\n")
## Random Forest:
cat(" • Defaults detected:", rf_detected, "/ 1327 (", round(rf_detected/1327*100, 1), "%)\n")
## • Defaults detected: 510 / 1327 ( 38.4 %)
cat(" • Potential loss prevented: Rp", format(round(rf_detected * default_amount), big.mark = ","), "\n")
## • Potential loss prevented: Rp 42,708,502
cat(" • Additional loss prevention vs LR: Rp",
format(round((rf_detected - lr_detected) * default_amount), big.mark = ","), "\n")
## • Additional loss prevention vs LR: Rp 14,989,847
cat(" • False alarms:", rf_fp, "customers (+", rf_fp - lr_fp, " vs LR)\n")
## • False alarms: 252 customers (+ 121 vs LR)
# 4. TOP 5 PREDICTORS OF DEFAULT
cat("\n TOP 5 PREDICTORS OF DEFAULT\n")
##
## TOP 5 PREDICTORS OF DEFAULT
cat(paste(rep("-", 50), collapse = ""), "\n")
## --------------------------------------------------
cat("Based on Random Forest feature importance:\n")
## Based on Random Forest feature importance:
top_features <- head(imp_df, 5)
for(i in 1:nrow(top_features)) {
cat(i, ". ", top_features$Feature[i], "\n", sep = "")
}
## 1. PAY_0
## 2. ID
## 3. BILL_AMT1
## 4. AGE
## 5. BILL_AMT2
cat("\nActionable Insights:\n")
##
## Actionable Insights:
cat(" • Monitor PAY_0 closely - immediate payment status is critical\n")
## • Monitor PAY_0 closely - immediate payment status is critical
cat(" • Higher bill amounts (BILL_AMT1, BILL_AMT2) increase default risk\n")
## • Higher bill amounts (BILL_AMT1, BILL_AMT2) increase default risk
cat(" • Age is a significant factor in default prediction\n")
## • Age is a significant factor in default prediction
cat(" • Credit limit (LIMIT_BAL) inversely related to default risk\n")
## • Credit limit (LIMIT_BAL) inversely related to default risk
# 5. RECOMMENDATIONS
cat("\n RECOMMENDATIONS\n")
##
## RECOMMENDATIONS
cat(paste(rep("-", 50), collapse = ""), "\n")
## --------------------------------------------------
cat(" Primary Recommendation:\n")
## Primary Recommendation:
cat(" DEPLOY RANDOM FOREST MODEL\n\n")
## DEPLOY RANDOM FOREST MODEL
cat(" Reasons:\n")
## Reasons:
cat(" 1. Detects 54% more defaults than Logistic Regression\n")
## 1. Detects 54% more defaults than Logistic Regression
cat(" 2. Higher overall accuracy (82.2% vs 81.2%)\n")
## 2. Higher overall accuracy (82.2% vs 81.2%)
cat(" 3. Better AUC (0.761 vs 0.722)\n")
## 3. Better AUC (0.761 vs 0.722)
cat(" 4. Superior F1-Score (0.488 vs 0.370)\n")
## 4. Superior F1-Score (0.488 vs 0.370)
cat(" 5. Prevents significantly more financial loss\n")
## 5. Prevents significantly more financial loss
cat("\n🛡️ Risk Mitigation Strategies:\n")
##
## 🛡️ Risk Mitigation Strategies:
cat(" 1. Implement tiered intervention system:\n")
## 1. Implement tiered intervention system:
cat(" • High risk (prob > 0.7): Reduce credit limit immediately\n")
## • High risk (prob > 0.7): Reduce credit limit immediately
cat(" • Medium risk (0.4-0.7): Send payment reminders\n")
## • Medium risk (0.4-0.7): Send payment reminders
cat(" • Low risk (<0.4): Monitor regularly\n")
## • Low risk (<0.4): Monitor regularly
cat(" 2. Combine with rule-based checks for PAY_0 > 2\n")
## 2. Combine with rule-based checks for PAY_0 > 2
cat(" 3. Regular model retraining (monthly)\n")
## 3. Regular model retraining (monthly)
cat("\n Monitoring Metrics:\n")
##
## Monitoring Metrics:
cat(" 1. Weekly tracking of detection rate and false positive rate\n")
## 1. Weekly tracking of detection rate and false positive rate
cat(" 2. Monthly financial impact assessment\n")
## 2. Monthly financial impact assessment
cat(" 3. Quarterly model performance review\n")
## 3. Quarterly model performance review
# 6. IMPLEMENTATION ROADMAP
cat("\n IMPLEMENTATION ROADMAP\n")
##
## IMPLEMENTATION ROADMAP
cat(paste(rep("-", 50), collapse = ""), "\n")
## --------------------------------------------------
cat(" Phase 1 (Month 1-2): Pilot Deployment\n")
## Phase 1 (Month 1-2): Pilot Deployment
cat(" • Deploy for 10% of customer base\n")
## • Deploy for 10% of customer base
cat(" • A/B testing against current system\n")
## • A/B testing against current system
cat(" • Collect feedback and fine-tune thresholds\n\n")
## • Collect feedback and fine-tune thresholds
cat(" Phase 2 (Month 3-4): Full Deployment\n")
## Phase 2 (Month 3-4): Full Deployment
cat(" • Roll out to entire customer base\n")
## • Roll out to entire customer base
cat(" • Integrate with CRM system\n")
## • Integrate with CRM system
cat(" • Train risk management team\n\n")
## • Train risk management team
cat(" Phase 3 (Month 5-6): Optimization\n")
## Phase 3 (Month 5-6): Optimization
cat(" • Implement automated retraining pipeline\n")
## • Implement automated retraining pipeline
cat(" • Add new features (spending patterns, etc.)\n")
## • Add new features (spending patterns, etc.)
cat(" • Develop dashboard for real-time monitoring\n")
## • Develop dashboard for real-time monitoring
# 7. CONCLUSION
cat("\n CONCLUSION\n")
##
## CONCLUSION
cat(paste(rep("-", 50), collapse = ""), "\n")
## --------------------------------------------------
cat(" The Random Forest model provides the best balance between\n")
## The Random Forest model provides the best balance between
cat(" default detection and overall accuracy for credit risk assessment.\n\n")
## default detection and overall accuracy for credit risk assessment.
cat(" Key success factors:\n")
## Key success factors:
cat(" • Strong predictive power (AUC: 0.761)\n")
## • Strong predictive power (AUC: 0.761)
cat(" • 38% default detection rate\n")
## • 38% default detection rate
cat(" • Actionable insights from feature importance\n\n")
## • Actionable insights from feature importance
cat(" Expected business impact:\n")
## Expected business impact:
cat(" • Significant reduction in credit losses\n")
## • Significant reduction in credit losses
cat(" • Improved risk management capabilities\n")
## • Improved risk management capabilities
cat(" • Data-driven decision making\n")
## • Data-driven decision making
cat("\n")
cat(paste(rep("=", 70), collapse = ""), "\n")
## ======================================================================
cat(" ANALYSIS COMPLETED - READY FOR DEPLOYMENT\n")
## ANALYSIS COMPLETED - READY FOR DEPLOYMENT
cat(paste(rep("=", 70), collapse = ""), "\n")
## ======================================================================
# Save final results
final_results <- list(
dataset_info = list(
n_samples = nrow(credit_data),
default_rate = mean(credit_data$Y),
n_features = ncol(X)
),
model_performance = comparison_df,
best_model = best_model,
feature_importance = head(imp_df, 10),
recommendations = "Deploy Random Forest with tiered intervention system"
)
saveRDS(final_results, "final_analysis_results.rds")