1. Load Required Packages

# Data manipulation and analysis
library(dplyr)
library(tidyr)
library(data.table)
library(lubridate)
library(readr)

# Visualization
library(ggplot2)
library(scales)
library(kableExtra)
library(gridExtra)

# Modeling and metrics
library(caret)
library(pROC)
library(scorecard)
library(arm)
library(gbm)
library(Metrics)

# Custom functions
source("apply_scorecard.R")

2. Load and Prepare Data

# Load the dataset
CRDB_ModelFeatures_V1 <- read_csv(
  "C://Users//m.njuguna//OneDrive - Creditinfo Group//Mary CreditInfo Documents//Analytics 2025//Kenya//CIP Score//Retro Analysis//Performance Analysis//OldCIPCustomers_04112025.csv"
)

cat("Dataset loaded successfully with", nrow(CRDB_ModelFeatures_V1), "rows and", ncol(CRDB_ModelFeatures_V1), "columns.\n")
## Dataset loaded successfully with 315775 rows and 15 columns.
# Data cleaning and preparation
CRDB_ModelFeatures_V1 <- CRDB_ModelFeatures_V1 %>%
  mutate(
    IsBadCont90DPDL12M = ifelse(IsBadCont90DPDL12M >= 1, 1, 0),
    Score = as.numeric(Score),
    ScoringDate = as.Date(ScoringDate)
  ) %>%
  # Exclude invalid or missing scores
  filter(
    !is.na(Score),
    !is.na(IsBadCont90DPDL12M),
    !Score %in% c(999, 250)
  )

cat("Data cleaning completed. Final dataset has", nrow(CRDB_ModelFeatures_V1), "rows.\n")
## Data cleaning completed. Final dataset has 216701 rows.

3. Model Performance Analysis

3.1 Performance Calculation Function

calculate_auc_gini <- function(dataset, target_col, predictor_col, train_split = 0.8, seed = 123) {
  set.seed(seed)
  
  # Create proper train-test split
  train_index <- createDataPartition(dataset[[target_col]], p = train_split, list = FALSE)
  train_data <- dataset[train_index, ]
  test_data <- dataset[-train_index, ]
  
  formula <- as.formula(paste(target_col, "~", predictor_col))
  logistic_model <- glm(formula, data = train_data, family = binomial)
  
  predicted_probs <- predict(logistic_model, test_data, type = "response")
  
  # AUC - use pROC::auc() explicitly
  roc_curve <- pROC::roc(test_data[[target_col]], predicted_probs)
  auc_value <- pROC::auc(roc_curve)  # Fixed: use pROC::auc()
  
  # GINI
  gini <- 2 * auc_value - 1
  
  # KS stat
  bad_probs <- predicted_probs[test_data[[target_col]] == 1]
  good_probs <- predicted_probs[test_data[[target_col]] == 0]
  
  if(length(bad_probs) > 0 && length(good_probs) > 0) {
    ks_stat <- max(ecdf(bad_probs)(sort(predicted_probs)) - ecdf(good_probs)(sort(predicted_probs)))
  } else {
    ks_stat <- NA
    warning("Insufficient data for KS calculation")
  }
  
  return(list(
    perf = tibble(
      auc = auc_value,
      gini = gini,
      ks_stat = ks_stat
    ),
    test_data = test_data %>%
      mutate(predicted_prob = predicted_probs)
  ))
}

3.2 Run Model Analysis

# Run the performance analysis
results <- calculate_auc_gini(
  dataset = CRDB_ModelFeatures_V1,
  target_col = "IsBadCont90DPDL12M", 
  predictor_col = "Score"
)

cat("Model performance analysis completed.\n")
## Model performance analysis completed.

3.3 Display Performance Metrics

# Extract performance metrics
metrics <- results$perf
auc_value <- metrics$auc
gini_value <- metrics$gini
ks_value <- metrics$ks_stat

# Display metrics in a formatted table
performance_table <- tibble(
  Metric = c("AUC (Area Under Curve)", "Gini Coefficient", "KS Statistic"),
  Value = c(round(auc_value, 4), round(gini_value, 4), round(ks_value, 4)),
  Interpretation = c(
    ifelse(auc_value >= 0.8, "Good", ifelse(auc_value >= 0.7, "Fair", "Poor")),
    ifelse(gini_value >= 0.6, "Strong", ifelse(gini_value >= 0.4, "Moderate", "Weak")),
    ifelse(ks_value >= 0.4, "Strong", ifelse(ks_value >= 0.3, "Moderate", "Weak"))
  )
)

performance_table %>%
  kable(align = "c", caption = "Model Performance Metrics") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Model Performance Metrics
Metric Value Interpretation
AUC (Area Under Curve) 0.5502 Poor
Gini Coefficient 0.1003 Weak
KS Statistic 0.0016 Weak

3.4 ROC Curve

# Generate ROC curve
roc_curve <- roc(results$test_data$IsBadCont90DPDL12M, results$test_data$predicted_prob)

plot(roc_curve, main = "ROC Curve", col = "blue", lwd = 2)
abline(a = 0, b = 1, lty = 2, col = "red")
text(0.6, 0.2, paste("AUC =", round(auc_value, 4)), col = "blue", cex = 1.2)

3.5 Data Summary

# Basic data summary
cat("## Data Summary\n")
## ## Data Summary
cat("Total accounts in test data:", nrow(results$test_data), "\n")
## Total accounts in test data: 43340
cat("Bad rate in test data:", round(mean(results$test_data$IsBadCont90DPDL12M) * 100, 2), "%\n")
## Bad rate in test data: 7.13 %
cat("Score range:", min(results$test_data$Score), "-", max(results$test_data$Score), "\n")
## Score range: 451 - 774
# Score distribution
cat("\n## Score Distribution Summary\n")
## 
## ## Score Distribution Summary
summary(results$test_data$Score)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   451.0   609.0   642.0   634.4   665.0   774.0

4. Score Band Analysis

4.1 Create Custom Score Bands

# Identify score column
score_columns <- names(results$test_data)[grepl("score|Score", names(results$test_data))]
score_col <- score_columns[1]

cat("Using column '", score_col, "' for score band analysis.\n")
## Using column ' Score ' for score band analysis.
# Define custom score bands
results$test_data <- results$test_data %>%
  mutate(
    ScoreBands = case_when(
      !!sym(score_col) >= 743 & !!sym(score_col) <= 900 ~ "743-900",
      !!sym(score_col) >= 731 & !!sym(score_col) <= 742 ~ "731-742", 
      !!sym(score_col) >= 713 & !!sym(score_col) <= 730 ~ "713-730",
      !!sym(score_col) >= 701 & !!sym(score_col) <= 712 ~ "701-712",
      !!sym(score_col) >= 689 & !!sym(score_col) <= 700 ~ "689-700",
      !!sym(score_col) >= 677 & !!sym(score_col) <= 688 ~ "677-688",
      !!sym(score_col) >= 665 & !!sym(score_col) <= 676 ~ "665-676",
      !!sym(score_col) >= 653 & !!sym(score_col) <= 664 ~ "653-664",
      !!sym(score_col) >= 641 & !!sym(score_col) <= 652 ~ "641-652",
      !!sym(score_col) >= 620 & !!sym(score_col) <= 640 ~ "620-640",
      !!sym(score_col) >= 604 & !!sym(score_col) <= 619 ~ "604-619",
      !!sym(score_col) >= 574 & !!sym(score_col) <= 603 ~ "574-603",
      !!sym(score_col) >= 530 & !!sym(score_col) <= 573 ~ "530-573",
      !!sym(score_col) >= 251 & !!sym(score_col) <= 529 ~ "251-529",
      !!sym(score_col) == 250 ~ "250-250",
      !!sym(score_col) == 999 ~ "999-999",
      TRUE ~ "Other"
    ),
    ScoreBands_Ordered = factor(ScoreBands, levels = c(
      "999-999", "250-250", "251-529", "530-573", "574-603", "604-619", 
      "620-640", "641-652", "653-664", "665-676", "677-688", "689-700", 
      "701-712", "713-730", "731-742", "743-900"
    ))
  )

cat("Custom score bands created successfully.\n")
## Custom score bands created successfully.

4.2 Calculate Band Performance Metrics

# Calculate performance metrics by score band
performance_by_band <- results$test_data %>%
  group_by(ScoreBands, ScoreBands_Ordered) %>%
  summarise(
    Count = n(),
    Bad_Count = sum(IsBadCont90DPDL12M, na.rm = TRUE),
    Good_Count = Count - Bad_Count,
    Bad_Rate = round(Bad_Count / Count, 4),
    Bad_Rate_Pct = round(Bad_Rate * 100, 2),
    Avg_Score = round(mean(!!sym(score_col), na.rm = TRUE), 1),
    Min_Score = min(!!sym(score_col), na.rm = TRUE),
    Max_Score = max(!!sym(score_col), na.rm = TRUE),
    .groups = 'drop'
  ) %>%
  arrange(ScoreBands_Ordered) %>%
  mutate(
    Cumulative_Count = cumsum(Count),
    Cumulative_Pct = round(Cumulative_Count / sum(Count) * 100, 1),
    Cumulative_Bads = cumsum(Bad_Count),
    Cumulative_Bad_Rate = round(Cumulative_Bads / Cumulative_Count * 100, 2)
  )

cat("Performance metrics calculated for", nrow(performance_by_band), "score bands.\n")
## Performance metrics calculated for 14 score bands.

4.3 Display Band Performance Table

# Display performance table
performance_by_band %>%
  dplyr::select(
    `Score Band` = ScoreBands,
    Accounts = Count,
    `Bad Accounts` = Bad_Count,
    `Bad Rate %` = Bad_Rate_Pct,
    `Avg Score` = Avg_Score,
    `Cumulative %` = Cumulative_Pct,
    `Cum Bad Rate %` = Cumulative_Bad_Rate
  ) %>%
  kable(align = "c", digits = 2, caption = "Performance by Score Band") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  row_spec(which(performance_by_band$Bad_Rate_Pct > 10), bold = TRUE, color = "red")
Performance by Score Band
Score Band Accounts Bad Accounts Bad Rate % Avg Score Cumulative % Cum Bad Rate %
251-529 845 80 9.47 517.3 1.9 9.47
530-573 4176 374 8.96 554.3 11.6 9.04
574-603 4469 365 8.17 590.0 21.9 8.63
604-619 4005 342 8.54 611.9 31.1 8.60
620-640 7352 543 7.39 630.9 48.1 8.17
641-652 5978 400 6.69 646.5 61.9 7.84
653-664 5476 343 6.26 658.4 74.5 7.58
665-676 4401 277 6.29 670.0 84.7 7.42
677-688 3491 210 6.02 681.8 92.7 7.30
689-700 1608 97 6.03 693.9 96.4 7.25
701-712 806 35 4.34 705.4 98.3 7.20
713-730 535 20 3.74 720.1 99.5 7.15
731-742 136 2 1.47 736.4 99.9 7.14
743-900 62 2 3.23 753.0 100.0 7.13

5. Visualizations

5.1 Bad Rate by Score Band

# Filter out special bands for normal analysis
normal_bands <- performance_by_band %>% 
  filter(!ScoreBands %in% c("999-999", "250-250"))

ggplot(normal_bands, aes(x = reorder(ScoreBands, Avg_Score), y = Bad_Rate_Pct)) +
  geom_bar(stat = "identity", fill = "steelblue", alpha = 0.7) +
  geom_text(aes(label = paste0(Bad_Rate_Pct, "%")), vjust = -0.5, size = 3) +
  labs(
    title = "Bad Rate by Score Band",
    subtitle = "Lower scores should show higher bad rates (monotonicity)",
    x = "Score Band",
    y = "Bad Rate (%)"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

5.2 Population Distribution

ggplot(normal_bands, aes(x = reorder(ScoreBands, Avg_Score), y = Count)) +
  geom_bar(stat = "identity", fill = "darkorange", alpha = 0.7) +
  geom_text(aes(label = scales::comma(Count)), vjust = -0.5, size = 3) +
  labs(
    title = "Population Distribution by Score Band",
    x = "Score Band",
    y = "Number of Accounts"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

5.3 Combined Population and Bad Rate

# Create combined plot
p_combined <- ggplot(normal_bands, aes(x = reorder(ScoreBands, Avg_Score))) +
  geom_bar(aes(y = Count, fill = "Population"), stat = "identity", alpha = 0.6) +
  geom_line(aes(y = Bad_Rate_Pct * max(Count) / max(Bad_Rate_Pct), group = 1, color = "Bad Rate"), size = 1.5) +
  geom_point(aes(y = Bad_Rate_Pct * max(Count) / max(Bad_Rate_Pct), color = "Bad Rate"), size = 2) +
  scale_y_continuous(
    name = "Number of Accounts",
    sec.axis = sec_axis(~ . * max(normal_bands$Bad_Rate_Pct) / max(normal_bands$Count), 
                       name = "Bad Rate (%)")
  ) +
  scale_fill_manual(values = c("Population" = "steelblue")) +
  scale_color_manual(values = c("Bad Rate" = "red")) +
  labs(
    title = "Population Distribution and Bad Rate by Score Band",
    x = "Score Band",
    fill = "",
    color = ""
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "top"
  )

print(p_combined)

6. Monotonicity Analysis

# Check monotonicity
monotonicity_check <- normal_bands %>%
  arrange(desc(Avg_Score)) %>%
  mutate(
    Bad_Rate_Change = ifelse(is.na(lag(Bad_Rate_Pct)), NA, Bad_Rate_Pct - lag(Bad_Rate_Pct)),
    Monotonicity = case_when(
      is.na(Bad_Rate_Change) ~ "Baseline",
      Bad_Rate_Change > 1 ~ "Significant Violation",
      Bad_Rate_Change > 0 ~ "Minor Violation", 
      Bad_Rate_Change <= 0 ~ "OK"
    )
  )

# Display monotonicity results
monotonicity_check %>%
  dplyr::select(
    `Score Band` = ScoreBands,
    `Bad Rate %` = Bad_Rate_Pct,
    `Change from Previous` = Bad_Rate_Change,
    Monotonicity
  ) %>%
  kable(align = "c", digits = 2, caption = "Monotonicity Check") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  row_spec(which(monotonicity_check$Monotonicity == "Significant Violation"), 
           bold = TRUE, color = "red")
Monotonicity Check
Score Band Bad Rate % Change from Previous Monotonicity
743-900 3.23 NA Baseline
731-742 1.47 -1.76 OK
713-730 3.74 2.27 Significant Violation
701-712 4.34 0.60 Minor Violation
689-700 6.03 1.69 Significant Violation
677-688 6.02 -0.01 OK
665-676 6.29 0.27 Minor Violation
653-664 6.26 -0.03 OK
641-652 6.69 0.43 Minor Violation
620-640 7.39 0.70 Minor Violation
604-619 8.54 1.15 Significant Violation
574-603 8.17 -0.37 OK
530-573 8.96 0.79 Minor Violation
251-529 9.47 0.51 Minor Violation

7. Risk Segmentation

# Create risk segments
risk_segments <- normal_bands %>%
  mutate(
    Risk_Segment = case_when(
      Bad_Rate_Pct >= 20 ~ "Very High Risk",
      Bad_Rate_Pct >= 10 ~ "High Risk",
      Bad_Rate_Pct >= 5 ~ "Medium-High Risk",
      Bad_Rate_Pct >= 2 ~ "Medium Risk", 
      Bad_Rate_Pct >= 1 ~ "Low Risk",
      Bad_Rate_Pct < 1 ~ "Very Low Risk"
    )
  )

# Display risk segments
risk_segments %>%
  dplyr::select(
    `Score Band` = ScoreBands,
    `Bad Rate %` = Bad_Rate_Pct,
    `Risk Segment` = Risk_Segment
  ) %>%
  kable(align = "c", caption = "Risk Segmentation") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Risk Segmentation
Score Band Bad Rate % Risk Segment
251-529 9.47 Medium-High Risk
530-573 8.96 Medium-High Risk
574-603 8.17 Medium-High Risk
604-619 8.54 Medium-High Risk
620-640 7.39 Medium-High Risk
641-652 6.69 Medium-High Risk
653-664 6.26 Medium-High Risk
665-676 6.29 Medium-High Risk
677-688 6.02 Medium-High Risk
689-700 6.03 Medium-High Risk
701-712 4.34 Medium Risk
713-730 3.74 Medium Risk
731-742 1.47 Low Risk
743-900 3.23 Medium Risk

7.1 Portfolio Composition by Risk Segment

portfolio_composition <- risk_segments %>%
  group_by(Risk_Segment) %>%
  summarise(
    Bands = n(),
    Total_Accounts = sum(Count),
    Total_Bads = sum(Bad_Count),
    Avg_Bad_Rate = round(Total_Bads / Total_Accounts * 100, 2),
    Portfolio_Share = round(Total_Accounts / sum(Total_Accounts) * 100, 1)
  ) %>%
  arrange(desc(Avg_Bad_Rate))

portfolio_composition %>%
  kable(align = "c", digits = 2, caption = "Portfolio Composition by Risk Segment") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Portfolio Composition by Risk Segment
Risk_Segment Bands Total_Accounts Total_Bads Avg_Bad_Rate Portfolio_Share
Medium-High Risk 10 41801 3031 7.25 100
Medium Risk 3 1403 57 4.06 100
Low Risk 1 136 2 1.47 100

8. Summary and Recommendations

cat("## ANALYSIS SUMMARY\n")
## ## ANALYSIS SUMMARY
cat("══════════════════════════════════════\n\n")
## ══════════════════════════════════════
# Overall statistics
total_accounts <- sum(performance_by_band$Count)
total_bads <- sum(performance_by_band$Bad_Count)
overall_bad_rate <- total_bads / total_accounts * 100

cat("### Portfolio Statistics:\n")
## ### Portfolio Statistics:
cat("- Total Accounts:", format(total_accounts, big.mark = ","), "\n")
## - Total Accounts: 43,340
cat("- Total Bad Accounts:", format(total_bads, big.mark = ","), "\n")
## - Total Bad Accounts: 3,090
cat("- Overall Bad Rate:", round(overall_bad_rate, 2), "%\n")
## - Overall Bad Rate: 7.13 %
cat("- Number of Score Bands:", nrow(performance_by_band), "\n")
## - Number of Score Bands: 14
cat("- Model AUC:", round(auc_value, 4), "(", ifelse(auc_value >= 0.8, "Good", "Fair"), ")\n\n")
## - Model AUC: 0.5502 ( Fair )
# Identify key bands
high_risk_bands <- normal_bands %>% filter(Bad_Rate_Pct > 10)
low_risk_bands <- normal_bands %>% filter(Bad_Rate_Pct < 1)
significant_violations <- sum(monotonicity_check$Monotonicity == "Significant Violation", na.rm = TRUE)

cat("### Key Findings:\n")
## ### Key Findings:
if(nrow(high_risk_bands) > 0) {
  cat("- High Risk Bands (Bad Rate > 10%):", paste(high_risk_bands$ScoreBands, collapse = ", "), "\n")
}
if(nrow(low_risk_bands) > 0) {
  cat("- Low Risk Bands (Bad Rate < 1%):", paste(low_risk_bands$ScoreBands, collapse = ", "), "\n")
}
if(significant_violations > 0) {
  cat("- Monotonicity Violations:", significant_violations, "significant violations detected\n")
}
## - Monotonicity Violations: 3 significant violations detected
cat("\n### Recommendations:\n")
## 
## ### Recommendations:
cat("1. Implement risk-based pricing strategies\n")
## 1. Implement risk-based pricing strategies
cat("2. Monitor high-risk bands closely for early warning signs\n")
## 2. Monitor high-risk bands closely for early warning signs
cat("3. Consider expanding credit access for low-risk segments\n")
## 3. Consider expanding credit access for low-risk segments
cat("4. Regularly validate and recalibrate score bands\n")
## 4. Regularly validate and recalibrate score bands
cat("5. Use portfolio composition for strategic planning\n")
## 5. Use portfolio composition for strategic planning

9. Debugging Information

## ## DEBUGGING INFORMATION
## ══════════════════════════
## Data dimensions:
## - Original data: 216701 rows
## - Test data: 43340 rows
## Column names in test data:
##  [1] "sample_order_id"    "ExSubjectId"        "ExContractId"      
##  [4] "SubscriberId"       "TypeOfContractId"   "IsBadCont30DPDL12M"
##  [7] "IsBadCont60DPDL12M" "IsBadCont90DPDL12M" "IsBadCont30DPDL6M" 
## [10] "IsBadCont60DPDL6M"  "IsBadCont90DPDL6M"  "ScoringDate"       
## [13] "Score"              "RiskGrade"          "ScoreBands"        
## [16] "predicted_prob"     "ScoreBands_Ordered"
## 
## Score bands created:
##  [1] "701-712" "641-652" "665-676" "620-640" "689-700" "653-664" "713-730"
##  [8] "574-603" "604-619" "530-573" "251-529" "677-688" "731-742" "743-900"
## 
## Performance metrics calculated successfully.