# STEP 1: DATA CLEANING (WITH "Y")

library(readxl)
## Warning: package 'readxl' was built under R version 4.5.2
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.5.2
# 1. Load data dengan skip=1 karena baris pertama adalah header
cat(" LOADING DATA...\n")
##  LOADING DATA...
credit_data <- read_excel("default_credit_card.xls", skip = 1)
# Cek struktur data
cat("\n DATA STRUCTURE:\n")
## 
##  DATA STRUCTURE:
print(dim(credit_data))
## [1] 30000    25
cat("Nama kolom:\n")
## Nama kolom:
print(names(credit_data))
##  [1] "ID"                         "LIMIT_BAL"                 
##  [3] "SEX"                        "EDUCATION"                 
##  [5] "MARRIAGE"                   "AGE"                       
##  [7] "PAY_0"                      "PAY_2"                     
##  [9] "PAY_3"                      "PAY_4"                     
## [11] "PAY_5"                      "PAY_6"                     
## [13] "BILL_AMT1"                  "BILL_AMT2"                 
## [15] "BILL_AMT3"                  "BILL_AMT4"                 
## [17] "BILL_AMT5"                  "BILL_AMT6"                 
## [19] "PAY_AMT1"                   "PAY_AMT2"                  
## [21] "PAY_AMT3"                   "PAY_AMT4"                  
## [23] "PAY_AMT5"                   "PAY_AMT6"                  
## [25] "default payment next month"
# 2. Rapikan nama kolom - cari kolom ID dan target Y
cat("\n🔧 FIXING COLUMN NAMES...\n")
## 
## 🔧 FIXING COLUMN NAMES...
# Cari kolom mana yang mungkin ID
id_col <- which(sapply(credit_data, function(x) all(is.numeric(x))) & 
                sapply(credit_data, function(x) min(x, na.rm = TRUE) >= 1))
if(length(id_col) > 0) {
  names(credit_data)[id_col[1]] <- "ID"
  cat(" Kolom", id_col[1], "dijadikan sebagai 'ID'\n")
}
##  Kolom 1 dijadikan sebagai 'ID'
# Cari kolom target Y
y_col <- which(toupper(names(credit_data)) == "Y" | 
               names(credit_data) == "default payment next month" |
               grepl("default", names(credit_data), ignore.case = TRUE))

if(length(y_col) > 0) {
  names(credit_data)[y_col[1]] <- "Y"
  cat(" Kolom", y_col[1], "dijadikan sebagai 'Y' (target)\n")
} else {
  # Jika tidak ditemukan, coba kolom terakhir
  names(credit_data)[ncol(credit_data)] <- "Y"
  cat(" Kolom Y tidak ditemukan, kolom terakhir dijadikan sebagai 'Y'\n")
}
##  Kolom 25 dijadikan sebagai 'Y' (target)
# 3. Cek missing values
cat(" CHECKING MISSING VALUES:\n")
##  CHECKING MISSING VALUES:
missing_summary <- colSums(is.na(credit_data))
print(missing_summary[missing_summary > 0])
## named numeric(0)
if(sum(missing_summary) == 0) {
  cat(" Tidak ada missing values!\n")
} else {
  cat(" Ada missing values, perlu ditangani...\n")
  
    # Handle missing values
  credit_data <- credit_data %>%
    mutate(across(where(is.numeric), ~ifelse(is.na(.), median(., na.rm = TRUE), .)))
}
##  Tidak ada missing values!
# 4. Cek duplikat
cat("\n CHECKING DUPLICATES:\n")
## 
##  CHECKING DUPLICATES:
duplicate_count <- sum(duplicated(credit_data))
cat("Jumlah baris duplikat:", duplicate_count, "\n")
## Jumlah baris duplikat: 0
if(duplicate_count > 0) {
  credit_data <- distinct(credit_data)
  cat(" Duplikat dihapus!\n")
}
# 5. Cek tipe data
cat("\n CHECKING DATA TYPES:\n")
## 
##  CHECKING DATA TYPES:
str(credit_data)
## tibble [30,000 × 25] (S3: tbl_df/tbl/data.frame)
##  $ ID       : num [1:30000] 1 2 3 4 5 6 7 8 9 10 ...
##  $ LIMIT_BAL: num [1:30000] 20000 120000 90000 50000 50000 50000 500000 100000 140000 20000 ...
##  $ SEX      : num [1:30000] 2 2 2 2 1 1 1 2 2 1 ...
##  $ EDUCATION: num [1:30000] 2 2 2 2 2 1 1 2 3 3 ...
##  $ MARRIAGE : num [1:30000] 1 2 2 1 1 2 2 2 1 2 ...
##  $ AGE      : num [1:30000] 24 26 34 37 57 37 29 23 28 35 ...
##  $ PAY_0    : num [1:30000] 2 -1 0 0 -1 0 0 0 0 -2 ...
##  $ PAY_2    : num [1:30000] 2 2 0 0 0 0 0 -1 0 -2 ...
##  $ PAY_3    : num [1:30000] -1 0 0 0 -1 0 0 -1 2 -2 ...
##  $ PAY_4    : num [1:30000] -1 0 0 0 0 0 0 0 0 -2 ...
##  $ PAY_5    : num [1:30000] -2 0 0 0 0 0 0 0 0 -1 ...
##  $ PAY_6    : num [1:30000] -2 2 0 0 0 0 0 -1 0 -1 ...
##  $ BILL_AMT1: num [1:30000] 3913 2682 29239 46990 8617 ...
##  $ BILL_AMT2: num [1:30000] 3102 1725 14027 48233 5670 ...
##  $ BILL_AMT3: num [1:30000] 689 2682 13559 49291 35835 ...
##  $ BILL_AMT4: num [1:30000] 0 3272 14331 28314 20940 ...
##  $ BILL_AMT5: num [1:30000] 0 3455 14948 28959 19146 ...
##  $ BILL_AMT6: num [1:30000] 0 3261 15549 29547 19131 ...
##  $ PAY_AMT1 : num [1:30000] 0 0 1518 2000 2000 ...
##  $ PAY_AMT2 : num [1:30000] 689 1000 1500 2019 36681 ...
##  $ PAY_AMT3 : num [1:30000] 0 1000 1000 1200 10000 657 38000 0 432 0 ...
##  $ PAY_AMT4 : num [1:30000] 0 1000 1000 1100 9000 ...
##  $ PAY_AMT5 : num [1:30000] 0 0 1000 1069 689 ...
##  $ PAY_AMT6 : num [1:30000] 0 2000 5000 1000 679 ...
##  $ Y        : num [1:30000] 1 1 0 0 0 0 0 0 0 0 ...
# Konversi semua kolom numerik ke numeric
for(col in names(credit_data)) {
  if(is.character(credit_data[[col]])) {
    # Coba konversi ke numeric jika memungkinkan
    num_vals <- as.numeric(credit_data[[col]])
    if(sum(is.na(num_vals)) / length(num_vals) < 0.1) {  # Jika <10% NA
      credit_data[[col]] <- num_vals
      cat(" Kolom", col, "dikonversi ke numeric\n")
    }
  }
}

# Pastikan target variable "Y" numeric
if("Y" %in% names(credit_data)) {
  credit_data$Y <- as.numeric(as.character(credit_data$Y))
  cat(" Target variable 'Y' dijadikan numeric\n")
  cat("   Nilai unik Y:", unique(credit_data$Y), "\n")
}
##  Target variable 'Y' dijadikan numeric
##    Nilai unik Y: 1 0
# 6. Cek outlier (numerical columns)
cat("\n CHECKING OUTLIERS (numerical columns):\n")
## 
##  CHECKING OUTLIERS (numerical columns):
numeric_cols <- names(credit_data)[sapply(credit_data, is.numeric)]

for(col in numeric_cols) {
  if(col != "Y") {  # Skip target variable "Y"
    q1 <- quantile(credit_data[[col]], 0.25, na.rm = TRUE)
    q3 <- quantile(credit_data[[col]], 0.75, na.rm = TRUE)
    iqr <- q3 - q1
    if(iqr > 0) {  # Hanya jika IQR > 0
      lower_bound <- q1 - 1.5 * iqr
      upper_bound <- q3 + 1.5 * iqr
      
      outliers <- sum(credit_data[[col]] < lower_bound | credit_data[[col]] > upper_bound, na.rm = TRUE)
      if(outliers > 0) {
        cat(col, ":", outliers, "outlier(s) detected\n")
      }
    }
  }
}
## LIMIT_BAL : 167 outlier(s) detected
## EDUCATION : 454 outlier(s) detected
## AGE : 272 outlier(s) detected
## PAY_0 : 3130 outlier(s) detected
## PAY_2 : 4410 outlier(s) detected
## PAY_3 : 4209 outlier(s) detected
## PAY_4 : 3508 outlier(s) detected
## PAY_5 : 2968 outlier(s) detected
## PAY_6 : 3079 outlier(s) detected
## BILL_AMT1 : 2400 outlier(s) detected
## BILL_AMT2 : 2395 outlier(s) detected
## BILL_AMT3 : 2469 outlier(s) detected
## BILL_AMT4 : 2622 outlier(s) detected
## BILL_AMT5 : 2725 outlier(s) detected
## BILL_AMT6 : 2693 outlier(s) detected
## PAY_AMT1 : 2745 outlier(s) detected
## PAY_AMT2 : 2714 outlier(s) detected
## PAY_AMT3 : 2598 outlier(s) detected
## PAY_AMT4 : 2994 outlier(s) detected
## PAY_AMT5 : 2945 outlier(s) detected
## PAY_AMT6 : 2958 outlier(s) detected
cat("\n DATA CLEANING COMPLETED!\n")
## 
##  DATA CLEANING COMPLETED!
cat(" FINAL DATA SUMMARY:\n")
##  FINAL DATA SUMMARY:
cat("Total samples:", nrow(credit_data), "\n")
## Total samples: 30000
cat("Total features:", ncol(credit_data), "\n")
## Total features: 25
cat("Features (first 5):", names(credit_data)[1:min(5, ncol(credit_data))], "\n")
## Features (first 5): ID LIMIT_BAL SEX EDUCATION MARRIAGE
if("Y" %in% names(credit_data)) {
  cat("Target variable 'Y' distribution:\n")
  print(table(credit_data$Y))
}
## Target variable 'Y' distribution:
## 
##     0     1 
## 23364  6636
# STEP 2: EXPLORATORY DATA ANALYSIS (WITH "Y")

library(ggplot2)
# 1. Distribusi target "Y"
cat(" TARGET DISTRIBUTION (Y):\n")
##  TARGET DISTRIBUTION (Y):
target_dist <- table(credit_data$Y)
print(target_dist)
## 
##     0     1 
## 23364  6636
default_rate <- target_dist[2]/sum(target_dist)*100
cat("Persentase Default (Y=1):", round(default_rate, 2), "%\n")
## Persentase Default (Y=1): 22.12 %
cat("Ratio Default vs Non-Default: 1:", round(target_dist[1]/target_dist[2], 1), "\n")
## Ratio Default vs Non-Default: 1: 3.5
# Plot distribusi target
ggplot(credit_data, aes(x = factor(Y, labels = c("No Default", "Default")), 
                        fill = factor(Y))) +
  geom_bar() +
  geom_text(stat = 'count', aes(label = ..count..), vjust = -0.5) +
  labs(title = "Distribution of Credit Card Default",
       subtitle = paste("Default rate:", round(default_rate, 1), "%"),
       x = "Default Status",
       y = "Count") +
  scale_fill_manual(values = c("0" = "purple", "1" = "pink"),
                    name = "Status") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"),
        plot.subtitle = element_text(hjust = 0.5))
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# 2. Statistik deskriptif
cat("\n DESCRIPTIVE STATISTICS FOR TARGET VARIABLE:\n")
## 
##  DESCRIPTIVE STATISTICS FOR TARGET VARIABLE:
print(summary(credit_data$Y))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.2212  0.0000  1.0000
cat("\n DESCRIPTIVE STATISTICS FOR NUMERICAL FEATURES:\n")
## 
##  DESCRIPTIVE STATISTICS FOR NUMERICAL FEATURES:
# Hanya tampilkan beberapa fitur penting
important_features <- c("LIMIT_BAL", "AGE", "BILL_AMT1", "PAY_AMT1")
print(summary(credit_data[, important_features]))
##    LIMIT_BAL            AGE          BILL_AMT1          PAY_AMT1     
##  Min.   :  10000   Min.   :21.00   Min.   :-165580   Min.   :     0  
##  1st Qu.:  50000   1st Qu.:28.00   1st Qu.:   3559   1st Qu.:  1000  
##  Median : 140000   Median :34.00   Median :  22382   Median :  2100  
##  Mean   : 167484   Mean   :35.49   Mean   :  51223   Mean   :  5664  
##  3rd Qu.: 240000   3rd Qu.:41.00   3rd Qu.:  67091   3rd Qu.:  5006  
##  Max.   :1000000   Max.   :79.00   Max.   : 964511   Max.   :873552
# 3. Korelasi dengan target "Y"
cat("\n🔗 CORRELATION ANALYSIS WITH TARGET (Y):\n")
## 
## 🔗 CORRELATION ANALYSIS WITH TARGET (Y):
numeric_data <- credit_data %>% select(where(is.numeric))

# Hanya hitung korelasi jika ada data numerik
if(ncol(numeric_data) > 1) {
  # Cek korelasi dengan target "Y"
  if("Y" %in% colnames(numeric_data)) {
    # Hitung korelasi Pearson
    correlations <- sapply(numeric_data, function(x) cor(x, numeric_data$Y, use = "complete.obs"))
    correlations <- correlations[names(correlations) != "Y"]  # Exclude Y itself
    
    if(length(correlations) > 0) {
      cat("Top 10 features with highest absolute correlation with Y:\n")
      cor_df <- data.frame(
        Feature = names(correlations),
        Correlation = correlations,
        Abs_Correlation = abs(correlations)
      ) %>% arrange(desc(Abs_Correlation))
      
      print(head(cor_df, 10))
      
      # Plot korelasi teratas
      top_features <- head(cor_df$Feature, 6)
      cat("\n📊 Top features with highest correlation to default:\n")
      for(feature in top_features) {
        cat(sprintf("%-15s: Correlation = %6.3f\n", feature, cor_df[cor_df$Feature == feature, "Correlation"]))
      }
    }
  }
}
## Top 10 features with highest absolute correlation with Y:
##             Feature Correlation Abs_Correlation
## PAY_0         PAY_0  0.32479373      0.32479373
## PAY_2         PAY_2  0.26355120      0.26355120
## PAY_3         PAY_3  0.23525251      0.23525251
## PAY_4         PAY_4  0.21661364      0.21661364
## PAY_5         PAY_5  0.20414891      0.20414891
## PAY_6         PAY_6  0.18686636      0.18686636
## LIMIT_BAL LIMIT_BAL -0.15351988      0.15351988
## PAY_AMT1   PAY_AMT1 -0.07292949      0.07292949
## PAY_AMT2   PAY_AMT2 -0.05857871      0.05857871
## PAY_AMT4   PAY_AMT4 -0.05682740      0.05682740
## 
## 📊 Top features with highest correlation to default:
## PAY_0          : Correlation =  0.325
## PAY_2          : Correlation =  0.264
## PAY_3          : Correlation =  0.235
## PAY_4          : Correlation =  0.217
## PAY_5          : Correlation =  0.204
## PAY_6          : Correlation =  0.187
# 4. Visualisasi distribusi beberapa fitur vs Y
cat("\n📊 VISUALIZING IMPORTANT FEATURES VS TARGET (Y):\n")
## 
## 📊 VISUALIZING IMPORTANT FEATURES VS TARGET (Y):
# Pilih 6 fitur yang paling menarik
selected_features <- c("LIMIT_BAL", "AGE", "PAY_0", "BILL_AMT1", "PAY_AMT1", "EDUCATION")

for(col in selected_features) {
  if(col %in% names(credit_data)) {
    cat("\nPlotting:", col, "\n")
    
    # Boxplot untuk fitur numerik
    if(is.numeric(credit_data[[col]])) {
      p <- ggplot(credit_data, aes(x = factor(Y, labels = c("No Default", "Default")), 
                                   y = .data[[col]], fill = factor(Y))) +
        geom_boxplot(alpha = 0.7) +
        labs(title = paste(col, "by Default Status"),
              x = "Default Status", y = col) +
        scale_fill_manual(values = c("0" = "green", "1" = "lightblue"), guide = "none") +
        theme_minimal() +
        theme(plot.title = element_text(hjust = 0.5))
      
      print(p)
      
      # Hitung statistik ringkasan per kelompok
      stats <- credit_data %>%
        group_by(Y) %>%
        summarise(
          Mean = mean(.data[[col]], na.rm = TRUE),
          Median = median(.data[[col]], na.rm = TRUE),
          SD = sd(.data[[col]], na.rm = TRUE),
          Min = min(.data[[col]], na.rm = TRUE),
          Max = max(.data[[col]], na.rm = TRUE)
        )
      
      cat("Statistics for", col, "by group:\n")
      print(stats)
    }
  }
}
## 
## Plotting: LIMIT_BAL

## Statistics for LIMIT_BAL by group:
## # A tibble: 2 × 6
##       Y    Mean Median      SD   Min     Max
##   <dbl>   <dbl>  <dbl>   <dbl> <dbl>   <dbl>
## 1     0 178100. 150000 131628. 10000 1000000
## 2     1 130110.  90000 115379. 10000  740000
## 
## Plotting: AGE

## Statistics for AGE by group:
## # A tibble: 2 × 6
##       Y  Mean Median    SD   Min   Max
##   <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl>
## 1     0  35.4     34  9.08    21    79
## 2     1  35.7     34  9.69    21    75
## 
## Plotting: PAY_0

## Statistics for PAY_0 by group:
## # A tibble: 2 × 6
##       Y   Mean Median    SD   Min   Max
##   <dbl>  <dbl>  <dbl> <dbl> <dbl> <dbl>
## 1     0 -0.211      0 0.952    -2     8
## 2     1  0.668      1 1.38     -2     8
## 
## Plotting: BILL_AMT1

## Statistics for BILL_AMT1 by group:
## # A tibble: 2 × 6
##       Y   Mean Median     SD     Min    Max
##   <dbl>  <dbl>  <dbl>  <dbl>   <dbl>  <dbl>
## 1     0 51994. 23120. 73578. -165580 964511
## 2     1 48509. 20185  73782.   -6676 613860
## 
## Plotting: PAY_AMT1

## Statistics for PAY_AMT1 by group:
## # A tibble: 2 × 6
##       Y  Mean Median     SD   Min    Max
##   <dbl> <dbl>  <dbl>  <dbl> <dbl>  <dbl>
## 1     0 6307.  2460. 18015.     0 873552
## 2     1 3397.  1636   9544.     0 300000
## 
## Plotting: EDUCATION

## Statistics for EDUCATION by group:
## # A tibble: 2 × 6
##       Y  Mean Median    SD   Min   Max
##   <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl>
## 1     0  1.84      2 0.807     0     6
## 2     1  1.89      2 0.728     1     6
# 5. Distribusi kategori untuk variabel kategorikal
cat("\n CATEGORICAL VARIABLES DISTRIBUTION:\n")
## 
##  CATEGORICAL VARIABLES DISTRIBUTION:
categorical_vars <- c("SEX", "EDUCATION", "MARRIAGE")

for(col in categorical_vars) {
  if(col %in% names(credit_data)) {
    cat("\n", col, "distribution:\n")
    
    # Cross tabulation dengan Y
    cross_tab <- table(credit_data[[col]], credit_data$Y)
    colnames(cross_tab) <- c("No Default", "Default")
    
    print(cross_tab)
    
    # Hitung default rate per kategori
    default_rates <- prop.table(cross_tab, margin = 1)[, 2] * 100
    cat("Default rates by category (%):\n")
    print(round(default_rates, 1))
    
      # Plot
    p <- credit_data %>%
      group_by(.data[[col]], Y) %>%
      summarise(Count = n(), .groups = 'drop') %>%
      mutate(Percentage = Count / sum(Count) * 100) %>%
      ggplot(aes(x = factor(.data[[col]]), y = Count, fill = factor(Y, labels = c("No Default", "Default")))) +
      geom_bar(stat = "identity", position = "fill") +
      geom_text(aes(label = paste0(round(Percentage, 1), "%")), 
                position = position_fill(vjust = 0.5), size = 3) +
      labs(title = paste("Default Distribution by", col),
           x = col, y = "Proportion") +
      scale_fill_manual(values = c("No Default" = "orange", "Default" = "yellow"),
                        name = "Status") +
      theme_minimal() +
      theme(plot.title = element_text(hjust = 0.5))
    
    print(p)
  }
}
## 
##  SEX distribution:
##    
##     No Default Default
##   1       9015    2873
##   2      14349    3763
## Default rates by category (%):
##    1    2 
## 24.2 20.8

## 
##  EDUCATION distribution:
##    
##     No Default Default
##   0         14       0
##   1       8549    2036
##   2      10700    3330
##   3       3680    1237
##   4        116       7
##   5        262      18
##   6         43       8
## Default rates by category (%):
##    0    1    2    3    4    5    6 
##  0.0 19.2 23.7 25.2  5.7  6.4 15.7

## 
##  MARRIAGE distribution:
##    
##     No Default Default
##   0         49       5
##   1      10453    3206
##   2      12623    3341
##   3        239      84
## Default rates by category (%):
##    0    1    2    3 
##  9.3 23.5 20.9 26.0

# STEP 3: DATA PREPARATION FOR MODELING (WITH "Y")

# Load library caret untuk data splitting
library(caret)
## Warning: package 'caret' was built under R version 4.5.2
## Loading required package: lattice
# 1. Pisahkan features dan target
cat("\n SEPARATING FEATURES AND TARGET...\n")
## 
##  SEPARATING FEATURES AND TARGET...
X <- credit_data %>% select(-Y)
Y <- credit_data$Y

cat("Features (X) shape:", nrow(X), "samples x", ncol(X), "features\n")
## Features (X) shape: 30000 samples x 24 features
cat("Target (Y) length :", length(Y), "\n")
## Target (Y) length : 30000
cat("Target summary:\n")
## Target summary:
print(summary(Y))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.2212  0.0000  1.0000
# 2. Encoding variabel kategorikal
cat("\n ENCODING CATEGORICAL VARIABLES...\n")
## 
##  ENCODING CATEGORICAL VARIABLES...
# Identifikasi variabel kategorikal
categorical_candidates <- c("SEX", "EDUCATION", "MARRIAGE")
categorical_found <- categorical_candidates[categorical_candidates %in% names(X)]

if(length(categorical_found) > 0) {
  cat("Categorical variables found:", categorical_found, "\n")
  
  # Tampilkan nilai unik sebelum encoding
  for(col in categorical_found) {
    cat("\n", col, "unique values:\n")
    print(sort(unique(X[[col]])))
    cat("Counts:\n")
    print(table(X[[col]]))
  }

   # Convert to factor
  for(col in categorical_found) {
    X[[col]] <- as.factor(X[[col]])
    cat("Done", col, "converted to factor\n")
  }
} else {
  cat(" No categorical variables found\n")
}
## Categorical variables found: SEX EDUCATION MARRIAGE 
## 
##  SEX unique values:
## [1] 1 2
## Counts:
## 
##     1     2 
## 11888 18112 
## 
##  EDUCATION unique values:
## [1] 0 1 2 3 4 5 6
## Counts:
## 
##     0     1     2     3     4     5     6 
##    14 10585 14030  4917   123   280    51 
## 
##  MARRIAGE unique values:
## [1] 0 1 2 3
## Counts:
## 
##     0     1     2     3 
##    54 13659 15964   323 
## Done SEX converted to factor
## Done EDUCATION converted to factor
## Done MARRIAGE converted to factor
# 3. Split data train-test (80-20)
cat("\n SPLITTING DATA INTO TRAIN-TEST SETS (80-20)...\n")
## 
##  SPLITTING DATA INTO TRAIN-TEST SETS (80-20)...
set.seed(123)

# Pastikan Y adalah faktor untuk stratified sampling
Y_factor <- as.factor(Y)

train_index <- createDataPartition(Y_factor, p = 0.8, list = FALSE)
X_train <- X[train_index, ]
X_test <- X[-train_index, ]
Y_train <- Y[train_index]
Y_test <- Y[-train_index]

cat(" Data split completed:\n")
##  Data split completed:
cat("   Train set:", nrow(X_train), "samples (", round(nrow(X_train)/nrow(X)*100, 1), "%)\n")
##    Train set: 24001 samples ( 80 %)
cat("   Test set :", nrow(X_test), "samples (", round(nrow(X_test)/nrow(X)*100, 1), "%)\n")
##    Test set : 5999 samples ( 20 %)
# 4. Cek distribusi kelas di train dan test
cat("\n CHECKING CLASS DISTRIBUTION...\n")
## 
##  CHECKING CLASS DISTRIBUTION...
train_dist <- table(Y_train)
test_dist <- table(Y_test)

cat("Training set distribution:\n")
## Training set distribution:
cat("   Non-Default (0):", train_dist["0"], "(", round(train_dist["0"]/length(Y_train)*100, 1), "%)\n")
##    Non-Default (0): 18692 ( 77.9 %)
cat("   Default (1)    :", train_dist["1"], "(", round(train_dist["1"]/length(Y_train)*100, 1), "%)\n")
##    Default (1)    : 5309 ( 22.1 %)
cat("\nTest set distribution:\n")
## 
## Test set distribution:
cat("   Non-Default (0):", test_dist["0"], "(", round(test_dist["0"]/length(Y_test)*100, 1), "%)\n")
##    Non-Default (0): 4672 ( 77.9 %)
cat("   Default (1)    :", test_dist["1"], "(", round(test_dist["1"]/length(Y_test)*100, 1), "%)\n")
##    Default (1)    : 1327 ( 22.1 %)
# 5. Handle class imbalance (jika ada)
cat("\n CLASS IMBALANCE ANALYSIS...\n")
## 
##  CLASS IMBALANCE ANALYSIS...
default_rate_train <- mean(Y_train == 1)
default_rate_test <- mean(Y_test == 1)

cat("Default rate in training set:", round(default_rate_train*100, 2), "%\n")
## Default rate in training set: 22.12 %
cat("Default rate in test set    :", round(default_rate_test*100, 2), "%\n")
## Default rate in test set    : 22.12 %
# Cek apakah ada imbalance signifikan
if(default_rate_train < 0.2 || default_rate_train > 0.8) {
  cat("\n SIGNIFICANT CLASS IMBALANCE DETECTED!\n")
  cat("   Training default rate:", round(default_rate_train*100, 1), "%\n")
  cat("   Consider using:\n")
  cat("   - Class weights in model training\n")
  cat("   - Oversampling (SMOTE) for minority class\n")
  cat("   - Undersampling for majority class\n")
  cat("   - Ensemble methods\n")
} else {
  cat("\n Class distribution is acceptable for modeling\n")
}
## 
##  Class distribution is acceptable for modeling
# 6. Data summary setelah preparation
cat("\n FINAL DATA PREPARATION SUMMARY:\n")
## 
##  FINAL DATA PREPARATION SUMMARY:
cat("Original data size   :", nrow(credit_data), "x", ncol(credit_data), "\n")
## Original data size   : 30000 x 25
cat("Features (X) size    :", nrow(X), "x", ncol(X), "\n")
## Features (X) size    : 30000 x 24
cat("Training set size    :", nrow(X_train), "x", ncol(X_train), "\n")
## Training set size    : 24001 x 24
cat("Test set size       :", nrow(X_test), "x", ncol(X_test), "\n")
## Test set size       : 5999 x 24
cat("Categorical features:", length(categorical_found), "\n")
## Categorical features: 3
cat("Numerical features  :", ncol(X) - length(categorical_found), "\n")
## Numerical features  : 21
# 7. Cek tipe data final
cat("\n FINAL DATA TYPES CHECK:\n")
## 
##  FINAL DATA TYPES CHECK:
cat("Training features structure:\n")
## Training features structure:
str(X_train, list.len = 5)
## tibble [24,001 × 24] (S3: tbl_df/tbl/data.frame)
##  $ ID       : num [1:24001] 1 2 3 4 5 6 7 8 9 11 ...
##  $ LIMIT_BAL: num [1:24001] 20000 120000 90000 50000 50000 50000 500000 100000 140000 200000 ...
##  $ SEX      : Factor w/ 2 levels "1","2": 2 2 2 2 1 1 1 2 2 2 ...
##  $ EDUCATION: Factor w/ 7 levels "0","1","2","3",..: 3 3 3 3 3 2 2 3 4 4 ...
##  $ MARRIAGE : Factor w/ 4 levels "0","1","2","3": 2 3 3 2 2 3 3 3 2 3 ...
##   [list output truncated]
cat("\n DATA PREPARATION COMPLETED!\n")
## 
##  DATA PREPARATION COMPLETED!
cat("   Ready for model training in STEP 4\n")
##    Ready for model training in STEP 4
# STEP 4: MODEL BUILDING (WITH "Y")

# Load library untuk modeling
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.5.2
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
cat(" TRAINING MACHINE LEARNING MODELS...\n")
##  TRAINING MACHINE LEARNING MODELS...
cat("Dataset memiliki", ncol(X_train), "fitur dan", nrow(X_train), "samples training\n")
## Dataset memiliki 24 fitur dan 24001 samples training
# 1. LOGISTIC REGRESSION
cat("\n")
cat(paste(rep("-", 40), collapse = ""), "\n")
## ----------------------------------------
cat(" TRAINING LOGISTIC REGRESSION...\n")
##  TRAINING LOGISTIC REGRESSION...
log_model <- glm(Y_train ~ ., 
                 data = data.frame(X_train, Y_train = Y_train),
                 family = binomial(link = "logit"))
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
cat(" Logistic Regression trained successfully!\n")
##  Logistic Regression trained successfully!
cat("   Number of coefficients:", length(coef(log_model)), "\n")
##    Number of coefficients: 32
cat("   Model convergence:", log_model$converged, "\n")
##    Model convergence: TRUE
# Summary model logistic regression
cat("\n LOGISTIC REGRESSION SUMMARY (Top 10 coefficients):\n")
## 
##  LOGISTIC REGRESSION SUMMARY (Top 10 coefficients):
log_summary <- summary(log_model)
log_coef <- coef(log_summary)

# Tampilkan 10 coefficient terpenting (berdasarkan p-value)
if(nrow(log_coef) > 10) {
  significant_coef <- log_coef[order(abs(log_coef[, "z value"]), decreasing = TRUE), ]
  print(head(significant_coef, 10))
} else {
  print(log_coef)
}
##                Estimate   Std. Error   z value      Pr(>|z|)
## PAY_0      5.682950e-01 1.988808e-02 28.574659 1.387668e-179
## PAY_AMT1  -1.369627e-05 2.574031e-06 -5.320941  1.032320e-07
## BILL_AMT1 -7.244998e-06 1.365865e-06 -5.304329  1.130883e-07
## PAY_AMT2  -1.125372e-05 2.496951e-06 -4.506983  6.575592e-06
## PAY_2      9.680623e-02 2.260219e-02  4.283046  1.843523e-05
## LIMIT_BAL -6.888196e-07 1.765485e-07 -3.901590  9.556302e-05
## AGE        5.859174e-03 2.086780e-03  2.807758  4.988772e-03
## SEX2      -9.397502e-02 3.444322e-02 -2.728404  6.364157e-03
## PAY_3      5.827961e-02 2.544666e-02  2.290266  2.200590e-02
## PAY_AMT4  -4.761408e-06 2.079422e-06 -2.289775  2.203437e-02
# 2. RANDOM FOREST
cat("\n")
cat(paste(rep("-", 40), collapse = ""), "\n")
## ----------------------------------------
cat(" TRAINING RANDOM FOREST...\n")
##  TRAINING RANDOM FOREST...
# Pastikan Y_train sebagai factor untuk classification
Y_train_factor <- as.factor(Y_train)

# Train Random Forest dengan parameter optimal
set.seed(123)
cat("Training Random Forest with 100 trees...\n")
## Training Random Forest with 100 trees...
rf_model <- randomForest(x = X_train, 
                         y = Y_train_factor,
                         ntree = 100,
                         mtry = floor(sqrt(ncol(X_train))),  # Default untuk classification
                          importance = TRUE,
                         do.trace = 10,  # Tampilkan progress setiap 10 trees
                         na.action = na.omit)
## ntree      OOB      1      2
##    10:  22.80% 11.10% 63.91%
##    20:  20.30%  7.97% 63.70%
##    30:  19.33%  6.66% 63.93%
##    40:  18.92%  6.32% 63.29%
##    50:  18.80%  6.13% 63.42%
##    60:  18.72%  6.02% 63.44%
##    70:  18.67%  5.94% 63.46%
##    80:  18.64%  5.99% 63.19%
##    90:  18.50%  5.91% 62.84%
##   100:  18.43%  5.78% 62.95%
cat("\n Random Forest trained successfully!\n")
## 
##  Random Forest trained successfully!
cat("   Number of trees:", rf_model$ntree, "\n")
##    Number of trees: 100
if(!is.null(rf_model$err.rate)) {
  cat("   OOB error rate:", round(mean(rf_model$err.rate[, "OOB"]), 4), "\n")
}
##    OOB error rate: 0.1971
if(!is.null(rf_model$confusion)) {
  cat("   Confusion matrix (OOB):\n")
  print(rf_model$confusion)
}
##    Confusion matrix (OOB):
##       0    1 class.error
## 0 17611 1081  0.05783223
## 1  3342 1967  0.62949708
# STEP 5: MODEL EVALUATION (WITH "Y")

# Load library untuk evaluasi
library(pROC)
## Warning: package 'pROC' was built under R version 4.5.2
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(caret)

cat(" EVALUATING MODEL PERFORMANCE ON TEST SET...\n")
##  EVALUATING MODEL PERFORMANCE ON TEST SET...
cat("Test set size:", nrow(X_test), "samples\n")
## Test set size: 5999 samples
# 1. PREDICTIONS
cat("\n MAKING PREDICTIONS...\n")
## 
##  MAKING PREDICTIONS...
# Logistic Regression predictions
log_prob <- predict(log_model, newdata = X_test, type = "response")
log_class <- ifelse(log_prob > 0.5, 1, 0)

# Random Forest predictions
rf_class <- predict(rf_model, newdata = X_test)
rf_prob <- predict(rf_model, newdata = X_test, type = "prob")[, "1"]

cat(" Predictions completed\n")
##  Predictions completed
cat("   Logistic Regression: probability and class predictions\n")
##    Logistic Regression: probability and class predictions
cat("   Random Forest: class and probability predictions\n")
##    Random Forest: class and probability predictions
# 2. CONFUSION MATRICES
cat("\n CONFUSION MATRICES:\n")
## 
##  CONFUSION MATRICES:
# Fungsi untuk menampilkan confusion matrix yang informatif
print_confusion_matrix <- function(true, pred, model_name) {
  cat("\n", model_name, "Confusion Matrix:\n")
  cm <- confusionMatrix(factor(pred), factor(true), positive = "1")
  
  # Tampilkan matrix
  print(cm$table)
  
  # Tampilkan metrics
  cat("\nPerformance Metrics:\n")
  cat("Accuracy :", round(cm$overall["Accuracy"], 4), "\n")
  cat("Sensitivity (Recall) :", round(cm$byClass["Sensitivity"], 4), "\n")
  cat("Specificity         :", round(cm$byClass["Specificity"], 4), "\n")
  cat("Precision           :", round(cm$byClass["Precision"], 4), "\n")
  cat("F1-Score           :", round(cm$byClass["F1"], 4), "\n")
  
   return(cm)
}

# Logistic Regression CM
log_cm <- print_confusion_matrix(Y_test, log_class, "LOGISTIC REGRESSION")
## 
##  LOGISTIC REGRESSION Confusion Matrix:
##           Reference
## Prediction    0    1
##          0 4541  996
##          1  131  331
## 
## Performance Metrics:
## Accuracy : 0.8121 
## Sensitivity (Recall) : 0.2494 
## Specificity         : 0.972 
## Precision           : 0.7165 
## F1-Score           : 0.37
# Random Forest CM
rf_cm <- print_confusion_matrix(Y_test, rf_class, "RANDOM FOREST")
## 
##  RANDOM FOREST Confusion Matrix:
##           Reference
## Prediction    0    1
##          0 4420  817
##          1  252  510
## 
## Performance Metrics:
## Accuracy : 0.8218 
## Sensitivity (Recall) : 0.3843 
## Specificity         : 0.9461 
## Precision           : 0.6693 
## F1-Score           : 0.4883
# 3. ROC-AUC ANALYSIS
cat("\n ROC-AUC ANALYSIS:\n")
## 
##  ROC-AUC ANALYSIS:
# ROC curves
log_roc <- roc(Y_test, log_prob)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
rf_roc <- roc(Y_test, rf_prob)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
cat("Area Under Curve (AUC):\n")
## Area Under Curve (AUC):
cat("   Logistic Regression:", round(auc(log_roc), 4), "\n")
##    Logistic Regression: 0.7222
cat("   Random Forest      :", round(auc(rf_roc), 4), "\n")
##    Random Forest      : 0.7609
# Plot ROC curves
cat("\n📈 PLOTTING ROC CURVES...\n")
## 
## 📈 PLOTTING ROC CURVES...
plot(log_roc, col = "blue", main = "ROC Curves Comparison", lwd = 2)
lines(rf_roc, col = "red", lwd = 2)
legend("bottomright", 
       legend = c(paste("Logistic Regression (AUC =", round(auc(log_roc), 3), ")"),
                  paste("Random Forest (AUC =", round(auc(rf_roc), 3), ")")),
       col = c("blue", "red"), lwd = 2)

# 4. FEATURE IMPORTANCE ANALYSIS
cat("\n FEATURE IMPORTANCE ANALYSIS:\n")
## 
##  FEATURE IMPORTANCE ANALYSIS:
# Random Forest Feature Importance
cat("\n RANDOM FOREST FEATURE IMPORTANCE (Top 15):\n")
## 
##  RANDOM FOREST FEATURE IMPORTANCE (Top 15):
if(!is.null(rf_model$importance)) {
  imp <- importance(rf_model)
  imp_df <- data.frame(
    Feature = rownames(imp),
    MeanDecreaseGini = imp[, "MeanDecreaseGini"],
    MeanDecreaseAccuracy = imp[, "MeanDecreaseAccuracy"]
  ) 
  imp_df <- imp_df[order(imp_df$MeanDecreaseGini, decreasing = TRUE), ]
  
  print(head(imp_df, 15))
  
  # Plot feature importance
  par(mar = c(5, 8, 4, 2))
  barplot(rev(head(imp_df$MeanDecreaseGini, 10)),
          names.arg = rev(head(imp_df$Feature, 10)),
          horiz = TRUE,
          las = 1,
          col = "steelblue",
          main = "Top 10 Feature Importance (Random Forest)",
          xlab = "Mean Decrease Gini")
  par(mar = c(5, 4, 4, 2))  # Reset margin
}
##             Feature MeanDecreaseGini MeanDecreaseAccuracy
## PAY_0         PAY_0         810.0435            59.010524
## ID               ID         540.8089             2.214992
## BILL_AMT1 BILL_AMT1         449.7547            16.036735
## AGE             AGE         427.6229             7.375773
## BILL_AMT2 BILL_AMT2         416.0259            18.957307
## PAY_AMT1   PAY_AMT1         407.8766            15.667540
## LIMIT_BAL LIMIT_BAL         394.0394            14.275521
## BILL_AMT3 BILL_AMT3         390.5940            21.018379
## BILL_AMT4 BILL_AMT4         381.7948            23.867476
## BILL_AMT6 BILL_AMT6         381.2817            20.688393
## BILL_AMT5 BILL_AMT5         377.4902            18.904437
## PAY_AMT2   PAY_AMT2         367.0724            15.675822
## PAY_AMT6   PAY_AMT6         344.8493            14.660677
## PAY_AMT3   PAY_AMT3         341.1450            18.304254
## PAY_AMT5   PAY_AMT5         336.5149            14.451095

# 5. MODEL COMPARISON SUMMARY
cat("\n MODEL COMPARISON SUMMARY:\n")
## 
##  MODEL COMPARISON SUMMARY:
comparison_df <- data.frame(
  Metric = c("Accuracy", "Sensitivity (Recall)", "Specificity", 
             "Precision", "F1-Score", "AUC"),
  Logistic_Regression = c(
    round(log_cm$overall["Accuracy"], 4),
    round(log_cm$byClass["Sensitivity"], 4),
    round(log_cm$byClass["Specificity"], 4),
    round(log_cm$byClass["Precision"], 4),
    round(log_cm$byClass["F1"], 4),
    round(auc(log_roc), 4)
  ),
  Random_Forest = c(
    round(rf_cm$overall["Accuracy"], 4),
    round(rf_cm$byClass["Sensitivity"], 4),
    round(rf_cm$byClass["Specificity"], 4),
    round(rf_cm$byClass["Precision"], 4),
    round(rf_cm$byClass["F1"], 4),
    round(auc(rf_roc), 4)
  )
)

print(comparison_df)
##                           Metric Logistic_Regression Random_Forest
## Accuracy                Accuracy              0.8121        0.8218
## Sensitivity Sensitivity (Recall)              0.2494        0.3843
## Specificity          Specificity              0.9720        0.9461
## Precision              Precision              0.7165        0.6693
## F1                      F1-Score              0.3700        0.4883
##                              AUC              0.7222        0.7609
# Determine best model
cat("\n BEST MODEL DETERMINATION:\n")
## 
##  BEST MODEL DETERMINATION:
if(auc(rf_roc) > auc(log_roc)) {
  cat(" Random Forest performs better based on AUC\n")
  best_model <- "RANDOM FOREST"
} else {
  cat(" Logistic Regression performs better based on AUC\n")
  best_model <- "LOGISTIC REGRESSION"
}
##  Random Forest performs better based on AUC
# STEP 6: FINAL REPORT AND RECOMMENDATIONS

# 1. EXECUTIVE SUMMARY
cat("\n EXECUTIVE SUMMARY\n")
## 
##  EXECUTIVE SUMMARY
cat(paste(rep("-", 50), collapse = ""), "\n")
## --------------------------------------------------
cat(" Dataset Overview:\n")
##  Dataset Overview:
cat("   • Total samples:", nrow(credit_data), "\n")
##    • Total samples: 30000
cat("   • Default rate:", round(mean(credit_data$Y) * 100, 1), "%\n")
##    • Default rate: 22.1 %
cat("   • Features analyzed:", ncol(X), "\n")
##    • Features analyzed: 24
cat("   • Training samples:", nrow(X_train), "\n")
##    • Training samples: 24001
cat("   • Testing samples:", nrow(X_test), "\n")
##    • Testing samples: 5999
cat("\n Key Findings:\n")
## 
##  Key Findings:
cat("   • PAY_0 (payment status) is the STRONGEST predictor of default\n")
##    • PAY_0 (payment status) is the STRONGEST predictor of default
cat("   • Significant class imbalance (78% non-default vs 22% default)\n")
##    • Significant class imbalance (78% non-default vs 22% default)
cat("   • Random Forest outperforms Logistic Regression overall\n")
##    • Random Forest outperforms Logistic Regression overall
cat("   • Model can detect 38% of actual defaults (Random Forest)\n")
##    • Model can detect 38% of actual defaults (Random Forest)
# 2. MODEL PERFORMANCE DEEP DIVE
cat("\n MODEL PERFORMANCE DEEP DIVE\n")
## 
##  MODEL PERFORMANCE DEEP DIVE
cat(paste(rep("-", 50), collapse = ""), "\n")
## --------------------------------------------------
cat(" Logistic Regression:\n")
##  Logistic Regression:
cat("   • Strengths: High specificity (97.2%), High precision (71.7%)\n")
##    • Strengths: High specificity (97.2%), High precision (71.7%)
cat("   • Weaknesses: Low sensitivity (24.9%), Misses 75% of defaults\n")
##    • Weaknesses: Low sensitivity (24.9%), Misses 75% of defaults
cat("   • Best for: Conservative risk assessment, minimizing false alarms\n\n")
##    • Best for: Conservative risk assessment, minimizing false alarms
cat(" Random Forest:\n")
##  Random Forest:
cat("   • Strengths: Better overall accuracy (82.2%), Higher sensitivity (38.4%)\n")
##    • Strengths: Better overall accuracy (82.2%), Higher sensitivity (38.4%)
cat("   • Weaknesses: Lower precision (66.9%), More false alarms\n")
##    • Weaknesses: Lower precision (66.9%), More false alarms
cat("   • Best for: Maximizing default detection, Overall portfolio safety\n")
##    • Best for: Maximizing default detection, Overall portfolio safety
# 3. BUSINESS IMPACT ANALYSIS
cat("\n BUSINESS IMPACT ANALYSIS\n")
## 
##  BUSINESS IMPACT ANALYSIS
cat(paste(rep("-", 50), collapse = ""), "\n")
## --------------------------------------------------
# Hitung potential financial impact
avg_limit <- mean(credit_data$LIMIT_BAL)
default_amount <- avg_limit * 0.5  # Asumsi 50% dari limit akan hilang jika default

cat(" Financial Impact Estimation:\n")
##  Financial Impact Estimation:
cat("   • Average credit limit: Rp", format(round(avg_limit), big.mark = ","), "\n")
##    • Average credit limit: Rp 167,484
cat("   • Estimated loss per default: Rp", format(round(default_amount), big.mark = ","), "\n\n")
##    • Estimated loss per default: Rp 83,742
cat(" Model Comparison Impact:\n")
##  Model Comparison Impact:
# Logistic Regression impact
lr_detected <- 331
lr_missed <- 996
lr_fp <- 131

# Random Forest impact
rf_detected <- 510
rf_missed <- 817
rf_fp <- 252

cat("   Logistic Regression:\n")
##    Logistic Regression:
cat("     • Defaults detected:", lr_detected, "/ 1327 (", round(lr_detected/1327*100, 1), "%)\n")
##      • Defaults detected: 331 / 1327 ( 24.9 %)
cat("     • Potential loss prevented: Rp", format(round(lr_detected * default_amount), big.mark = ","), "\n")
##      • Potential loss prevented: Rp 27,718,655
cat("     • False alarms:", lr_fp, "customers\n\n")
##      • False alarms: 131 customers
cat("   Random Forest:\n")
##    Random Forest:
cat("     • Defaults detected:", rf_detected, "/ 1327 (", round(rf_detected/1327*100, 1), "%)\n")
##      • Defaults detected: 510 / 1327 ( 38.4 %)
cat("     • Potential loss prevented: Rp", format(round(rf_detected * default_amount), big.mark = ","), "\n")
##      • Potential loss prevented: Rp 42,708,502
cat("     • Additional loss prevention vs LR: Rp", 
    format(round((rf_detected - lr_detected) * default_amount), big.mark = ","), "\n")
##      • Additional loss prevention vs LR: Rp 14,989,847
cat("     • False alarms:", rf_fp, "customers (+", rf_fp - lr_fp, " vs LR)\n")
##      • False alarms: 252 customers (+ 121  vs LR)
# 4. TOP 5 PREDICTORS OF DEFAULT
cat("\n TOP 5 PREDICTORS OF DEFAULT\n")
## 
##  TOP 5 PREDICTORS OF DEFAULT
cat(paste(rep("-", 50), collapse = ""), "\n")
## --------------------------------------------------
cat("Based on Random Forest feature importance:\n")
## Based on Random Forest feature importance:
top_features <- head(imp_df, 5)
for(i in 1:nrow(top_features)) {
  cat(i, ". ", top_features$Feature[i], "\n", sep = "")
}
## 1. PAY_0
## 2. ID
## 3. BILL_AMT1
## 4. AGE
## 5. BILL_AMT2
cat("\nActionable Insights:\n")
## 
## Actionable Insights:
cat("   • Monitor PAY_0 closely - immediate payment status is critical\n")
##    • Monitor PAY_0 closely - immediate payment status is critical
cat("   • Higher bill amounts (BILL_AMT1, BILL_AMT2) increase default risk\n")
##    • Higher bill amounts (BILL_AMT1, BILL_AMT2) increase default risk
cat("   • Age is a significant factor in default prediction\n")
##    • Age is a significant factor in default prediction
cat("   • Credit limit (LIMIT_BAL) inversely related to default risk\n")
##    • Credit limit (LIMIT_BAL) inversely related to default risk
# 5. RECOMMENDATIONS
cat("\n RECOMMENDATIONS\n")
## 
##  RECOMMENDATIONS
cat(paste(rep("-", 50), collapse = ""), "\n")
## --------------------------------------------------
cat(" Primary Recommendation:\n")
##  Primary Recommendation:
cat("   DEPLOY RANDOM FOREST MODEL\n\n")
##    DEPLOY RANDOM FOREST MODEL
cat("   Reasons:\n")
##    Reasons:
cat("   1. Detects 54% more defaults than Logistic Regression\n")
##    1. Detects 54% more defaults than Logistic Regression
cat("   2. Higher overall accuracy (82.2% vs 81.2%)\n")
##    2. Higher overall accuracy (82.2% vs 81.2%)
cat("   3. Better AUC (0.761 vs 0.722)\n")
##    3. Better AUC (0.761 vs 0.722)
cat("   4. Superior F1-Score (0.488 vs 0.370)\n")
##    4. Superior F1-Score (0.488 vs 0.370)
cat("   5. Prevents significantly more financial loss\n")
##    5. Prevents significantly more financial loss
cat("\n🛡️ Risk Mitigation Strategies:\n")
## 
## 🛡️ Risk Mitigation Strategies:
cat("   1. Implement tiered intervention system:\n")
##    1. Implement tiered intervention system:
cat("      • High risk (prob > 0.7): Reduce credit limit immediately\n")
##       • High risk (prob > 0.7): Reduce credit limit immediately
cat("      • Medium risk (0.4-0.7): Send payment reminders\n")
##       • Medium risk (0.4-0.7): Send payment reminders
cat("      • Low risk (<0.4): Monitor regularly\n")
##       • Low risk (<0.4): Monitor regularly
cat("   2. Combine with rule-based checks for PAY_0 > 2\n")
##    2. Combine with rule-based checks for PAY_0 > 2
cat("   3. Regular model retraining (monthly)\n")
##    3. Regular model retraining (monthly)
cat("\n Monitoring Metrics:\n")
## 
##  Monitoring Metrics:
cat("   1. Weekly tracking of detection rate and false positive rate\n")
##    1. Weekly tracking of detection rate and false positive rate
cat("   2. Monthly financial impact assessment\n")
##    2. Monthly financial impact assessment
cat("   3. Quarterly model performance review\n")
##    3. Quarterly model performance review
# 6. IMPLEMENTATION ROADMAP
cat("\n IMPLEMENTATION ROADMAP\n")
## 
##  IMPLEMENTATION ROADMAP
cat(paste(rep("-", 50), collapse = ""), "\n")
## --------------------------------------------------
cat(" Phase 1 (Month 1-2): Pilot Deployment\n")
##  Phase 1 (Month 1-2): Pilot Deployment
cat("   • Deploy for 10% of customer base\n")
##    • Deploy for 10% of customer base
cat("   • A/B testing against current system\n")
##    • A/B testing against current system
cat("   • Collect feedback and fine-tune thresholds\n\n")
##    • Collect feedback and fine-tune thresholds
cat(" Phase 2 (Month 3-4): Full Deployment\n")
##  Phase 2 (Month 3-4): Full Deployment
cat("   • Roll out to entire customer base\n")
##    • Roll out to entire customer base
cat("   • Integrate with CRM system\n")
##    • Integrate with CRM system
cat("   • Train risk management team\n\n")
##    • Train risk management team
cat(" Phase 3 (Month 5-6): Optimization\n")
##  Phase 3 (Month 5-6): Optimization
cat("   • Implement automated retraining pipeline\n")
##    • Implement automated retraining pipeline
cat("   • Add new features (spending patterns, etc.)\n")
##    • Add new features (spending patterns, etc.)
cat("   • Develop dashboard for real-time monitoring\n")
##    • Develop dashboard for real-time monitoring
# 7. CONCLUSION
cat("\n CONCLUSION\n")
## 
##  CONCLUSION
cat(paste(rep("-", 50), collapse = ""), "\n")
## --------------------------------------------------
cat(" The Random Forest model provides the best balance between\n")
##  The Random Forest model provides the best balance between
cat("   default detection and overall accuracy for credit risk assessment.\n\n")
##    default detection and overall accuracy for credit risk assessment.
cat(" Key success factors:\n")
##  Key success factors:
cat("   • Strong predictive power (AUC: 0.761)\n")
##    • Strong predictive power (AUC: 0.761)
cat("   • 38% default detection rate\n")
##    • 38% default detection rate
cat("   • Actionable insights from feature importance\n\n")
##    • Actionable insights from feature importance
cat(" Expected business impact:\n")
##  Expected business impact:
cat("   • Significant reduction in credit losses\n")
##    • Significant reduction in credit losses
cat("   • Improved risk management capabilities\n")
##    • Improved risk management capabilities
cat("   • Data-driven decision making\n")
##    • Data-driven decision making
cat("\n")
cat(paste(rep("=", 70), collapse = ""), "\n")
## ======================================================================
cat(" ANALYSIS COMPLETED - READY FOR DEPLOYMENT\n")
##  ANALYSIS COMPLETED - READY FOR DEPLOYMENT
cat(paste(rep("=", 70), collapse = ""), "\n")
## ======================================================================
# Save final results
final_results <- list(
  dataset_info = list(
    n_samples = nrow(credit_data),
    default_rate = mean(credit_data$Y),
    n_features = ncol(X)
  ),
  model_performance = comparison_df,
  best_model = best_model,
  feature_importance = head(imp_df, 10),
  recommendations = "Deploy Random Forest with tiered intervention system"
)

saveRDS(final_results, "final_analysis_results.rds")