1. Load Required Packages

# Data manipulation and analysis
library(dplyr)
library(tidyr)
library(data.table)
library(lubridate)
library(readr)

# Visualization
library(ggplot2)
library(scales)
library(kableExtra)
library(gridExtra)

# Modeling and metrics
library(caret)
library(pROC)
library(scorecard)
library(arm)
library(gbm)
library(Metrics)

# Custom functions
source("apply_scorecard.R")

2. Load and Prepare Data

# Load the dataset
CRDB_ModelFeatures_V1 <- read_csv(
  "C://Users//m.njuguna//OneDrive - Creditinfo Group//Mary CreditInfo Documents//Analytics 2025//Kenya//CIP Score//Retro Analysis//Performance Analysis//OldCIPCustomers_04112025.csv"
)

cat("Dataset loaded successfully with", nrow(CRDB_ModelFeatures_V1), "rows and", ncol(CRDB_ModelFeatures_V1), "columns.\n")

## Dataset loaded successfully with 315775 rows and 15 columns.

# Data cleaning and preparation
CRDB_ModelFeatures_V1 <- CRDB_ModelFeatures_V1 %>%
  mutate(
    IsBadCont90DPDL12M = ifelse(IsBadCont90DPDL12M >= 1, 1, 0),
    Score = as.numeric(Score),
    ScoringDate = as.Date(ScoringDate)
  ) %>%
  # Exclude invalid or missing scores
  filter(
    !is.na(Score),
    !is.na(IsBadCont90DPDL12M),
    !Score %in% c(999, 250)
  )

cat("Data cleaning completed. Final dataset has", nrow(CRDB_ModelFeatures_V1), "rows.\n")

## Data cleaning completed. Final dataset has 216701 rows.

3. Model Performance Analysis

3.1 Performance Calculation Function

calculate_auc_gini <- function(dataset, target_col, predictor_col, train_split = 0.8, seed = 123) {
  set.seed(seed)
  
  # Create proper train-test split
  train_index <- createDataPartition(dataset[[target_col]], p = train_split, list = FALSE)
  train_data <- dataset[train_index, ]
  test_data <- dataset[-train_index, ]
  
  formula <- as.formula(paste(target_col, "~", predictor_col))
  logistic_model <- glm(formula, data = train_data, family = binomial)
  
  predicted_probs <- predict(logistic_model, test_data, type = "response")
  
  # AUC - use pROC::auc() explicitly
  roc_curve <- pROC::roc(test_data[[target_col]], predicted_probs)
  auc_value <- pROC::auc(roc_curve)  # Fixed: use pROC::auc()
  
  # GINI
  gini <- 2 * auc_value - 1
  
  # KS stat
  bad_probs <- predicted_probs[test_data[[target_col]] == 1]
  good_probs <- predicted_probs[test_data[[target_col]] == 0]
  
  if(length(bad_probs) > 0 && length(good_probs) > 0) {
    ks_stat <- max(ecdf(bad_probs)(sort(predicted_probs)) - ecdf(good_probs)(sort(predicted_probs)))
  } else {
    ks_stat <- NA
    warning("Insufficient data for KS calculation")
  }
  
  return(list(
    perf = tibble(
      auc = auc_value,
      gini = gini,
      ks_stat = ks_stat
    ),
    test_data = test_data %>%
      mutate(predicted_prob = predicted_probs)
  ))
}

3.2 Run Model Analysis

# Run the performance analysis
results <- calculate_auc_gini(
  dataset = CRDB_ModelFeatures_V1,
  target_col = "IsBadCont90DPDL12M", 
  predictor_col = "Score"
)

cat("Model performance analysis completed.\n")

## Model performance analysis completed.

3.3 Display Performance Metrics

# Extract performance metrics
metrics <- results$perf
auc_value <- metrics$auc
gini_value <- metrics$gini
ks_value <- metrics$ks_stat

# Display metrics in a formatted table
performance_table <- tibble(
  Metric = c("AUC (Area Under Curve)", "Gini Coefficient", "KS Statistic"),
  Value = c(round(auc_value, 4), round(gini_value, 4), round(ks_value, 4)),
  Interpretation = c(
    ifelse(auc_value >= 0.8, "Good", ifelse(auc_value >= 0.7, "Fair", "Poor")),
    ifelse(gini_value >= 0.6, "Strong", ifelse(gini_value >= 0.4, "Moderate", "Weak")),
    ifelse(ks_value >= 0.4, "Strong", ifelse(ks_value >= 0.3, "Moderate", "Weak"))
  )
)

performance_table %>%
  kable(align = "c", caption = "Model Performance Metrics") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Model Performance Metrics
Metric	Value	Interpretation
AUC (Area Under Curve)	0.5502	Poor
Gini Coefficient	0.1003	Weak
KS Statistic	0.0016	Weak

3.4 ROC Curve

# Generate ROC curve
roc_curve <- roc(results$test_data$IsBadCont90DPDL12M, results$test_data$predicted_prob)

plot(roc_curve, main = "ROC Curve", col = "blue", lwd = 2)
abline(a = 0, b = 1, lty = 2, col = "red")
text(0.6, 0.2, paste("AUC =", round(auc_value, 4)), col = "blue", cex = 1.2)

3.5 Data Summary

# Basic data summary
cat("## Data Summary\n")

## ## Data Summary

cat("Total accounts in test data:", nrow(results$test_data), "\n")

## Total accounts in test data: 43340

cat("Bad rate in test data:", round(mean(results$test_data$IsBadCont90DPDL12M) * 100, 2), "%\n")

## Bad rate in test data: 7.13 %

cat("Score range:", min(results$test_data$Score), "-", max(results$test_data$Score), "\n")

## Score range: 451 - 774

# Score distribution
cat("\n## Score Distribution Summary\n")

## 
## ## Score Distribution Summary

summary(results$test_data$Score)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   451.0   609.0   642.0   634.4   665.0   774.0

4. Score Band Analysis

4.1 Create Custom Score Bands

# Identify score column
score_columns <- names(results$test_data)[grepl("score|Score", names(results$test_data))]
score_col <- score_columns[1]

cat("Using column '", score_col, "' for score band analysis.\n")

## Using column ' Score ' for score band analysis.

# Define custom score bands
results$test_data <- results$test_data %>%
  mutate(
    ScoreBands = case_when(
      !!sym(score_col) >= 743 & !!sym(score_col) <= 900 ~ "743-900",
      !!sym(score_col) >= 731 & !!sym(score_col) <= 742 ~ "731-742", 
      !!sym(score_col) >= 713 & !!sym(score_col) <= 730 ~ "713-730",
      !!sym(score_col) >= 701 & !!sym(score_col) <= 712 ~ "701-712",
      !!sym(score_col) >= 689 & !!sym(score_col) <= 700 ~ "689-700",
      !!sym(score_col) >= 677 & !!sym(score_col) <= 688 ~ "677-688",
      !!sym(score_col) >= 665 & !!sym(score_col) <= 676 ~ "665-676",
      !!sym(score_col) >= 653 & !!sym(score_col) <= 664 ~ "653-664",
      !!sym(score_col) >= 641 & !!sym(score_col) <= 652 ~ "641-652",
      !!sym(score_col) >= 620 & !!sym(score_col) <= 640 ~ "620-640",
      !!sym(score_col) >= 604 & !!sym(score_col) <= 619 ~ "604-619",
      !!sym(score_col) >= 574 & !!sym(score_col) <= 603 ~ "574-603",
      !!sym(score_col) >= 530 & !!sym(score_col) <= 573 ~ "530-573",
      !!sym(score_col) >= 251 & !!sym(score_col) <= 529 ~ "251-529",
      !!sym(score_col) == 250 ~ "250-250",
      !!sym(score_col) == 999 ~ "999-999",
      TRUE ~ "Other"
    ),
    ScoreBands_Ordered = factor(ScoreBands, levels = c(
      "999-999", "250-250", "251-529", "530-573", "574-603", "604-619", 
      "620-640", "641-652", "653-664", "665-676", "677-688", "689-700", 
      "701-712", "713-730", "731-742", "743-900"
    ))
  )

cat("Custom score bands created successfully.\n")

## Custom score bands created successfully.

4.2 Calculate Band Performance Metrics

# Calculate performance metrics by score band
performance_by_band <- results$test_data %>%
  group_by(ScoreBands, ScoreBands_Ordered) %>%
  summarise(
    Count = n(),
    Bad_Count = sum(IsBadCont90DPDL12M, na.rm = TRUE),
    Good_Count = Count - Bad_Count,
    Bad_Rate = round(Bad_Count / Count, 4),
    Bad_Rate_Pct = round(Bad_Rate * 100, 2),
    Avg_Score = round(mean(!!sym(score_col), na.rm = TRUE), 1),
    Min_Score = min(!!sym(score_col), na.rm = TRUE),
    Max_Score = max(!!sym(score_col), na.rm = TRUE),
    .groups = 'drop'
  ) %>%
  arrange(ScoreBands_Ordered) %>%
  mutate(
    Cumulative_Count = cumsum(Count),
    Cumulative_Pct = round(Cumulative_Count / sum(Count) * 100, 1),
    Cumulative_Bads = cumsum(Bad_Count),
    Cumulative_Bad_Rate = round(Cumulative_Bads / Cumulative_Count * 100, 2)
  )

cat("Performance metrics calculated for", nrow(performance_by_band), "score bands.\n")

## Performance metrics calculated for 14 score bands.

4.3 Display Band Performance Table

# Display performance table
performance_by_band %>%
  dplyr::select(
    `Score Band` = ScoreBands,
    Accounts = Count,
    `Bad Accounts` = Bad_Count,
    `Bad Rate %` = Bad_Rate_Pct,
    `Avg Score` = Avg_Score,
    `Cumulative %` = Cumulative_Pct,
    `Cum Bad Rate %` = Cumulative_Bad_Rate
  ) %>%
  kable(align = "c", digits = 2, caption = "Performance by Score Band") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  row_spec(which(performance_by_band$Bad_Rate_Pct > 10), bold = TRUE, color = "red")

Performance by Score Band
Score Band	Accounts	Bad Accounts	Bad Rate %	Avg Score	Cumulative %	Cum Bad Rate %
251-529	845	80	9.47	517.3	1.9	9.47
530-573	4176	374	8.96	554.3	11.6	9.04
574-603	4469	365	8.17	590.0	21.9	8.63
604-619	4005	342	8.54	611.9	31.1	8.60
620-640	7352	543	7.39	630.9	48.1	8.17
641-652	5978	400	6.69	646.5	61.9	7.84
653-664	5476	343	6.26	658.4	74.5	7.58
665-676	4401	277	6.29	670.0	84.7	7.42
677-688	3491	210	6.02	681.8	92.7	7.30
689-700	1608	97	6.03	693.9	96.4	7.25
701-712	806	35	4.34	705.4	98.3	7.20
713-730	535	20	3.74	720.1	99.5	7.15
731-742	136	2	1.47	736.4	99.9	7.14
743-900	62	2	3.23	753.0	100.0	7.13

5. Visualizations

5.1 Bad Rate by Score Band

# Filter out special bands for normal analysis
normal_bands <- performance_by_band %>% 
  filter(!ScoreBands %in% c("999-999", "250-250"))

ggplot(normal_bands, aes(x = reorder(ScoreBands, Avg_Score), y = Bad_Rate_Pct)) +
  geom_bar(stat = "identity", fill = "steelblue", alpha = 0.7) +
  geom_text(aes(label = paste0(Bad_Rate_Pct, "%")), vjust = -0.5, size = 3) +
  labs(
    title = "Bad Rate by Score Band",
    subtitle = "Lower scores should show higher bad rates (monotonicity)",
    x = "Score Band",
    y = "Bad Rate (%)"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

5.2 Population Distribution

ggplot(normal_bands, aes(x = reorder(ScoreBands, Avg_Score), y = Count)) +
  geom_bar(stat = "identity", fill = "darkorange", alpha = 0.7) +
  geom_text(aes(label = scales::comma(Count)), vjust = -0.5, size = 3) +
  labs(
    title = "Population Distribution by Score Band",
    x = "Score Band",
    y = "Number of Accounts"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

5.3 Combined Population and Bad Rate

# Create combined plot
p_combined <- ggplot(normal_bands, aes(x = reorder(ScoreBands, Avg_Score))) +
  geom_bar(aes(y = Count, fill = "Population"), stat = "identity", alpha = 0.6) +
  geom_line(aes(y = Bad_Rate_Pct * max(Count) / max(Bad_Rate_Pct), group = 1, color = "Bad Rate"), size = 1.5) +
  geom_point(aes(y = Bad_Rate_Pct * max(Count) / max(Bad_Rate_Pct), color = "Bad Rate"), size = 2) +
  scale_y_continuous(
    name = "Number of Accounts",
    sec.axis = sec_axis(~ . * max(normal_bands$Bad_Rate_Pct) / max(normal_bands$Count), 
                       name = "Bad Rate (%)")
  ) +
  scale_fill_manual(values = c("Population" = "steelblue")) +
  scale_color_manual(values = c("Bad Rate" = "red")) +
  labs(
    title = "Population Distribution and Bad Rate by Score Band",
    x = "Score Band",
    fill = "",
    color = ""
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "top"
  )

print(p_combined)

6. Monotonicity Analysis

# Check monotonicity
monotonicity_check <- normal_bands %>%
  arrange(desc(Avg_Score)) %>%
  mutate(
    Bad_Rate_Change = ifelse(is.na(lag(Bad_Rate_Pct)), NA, Bad_Rate_Pct - lag(Bad_Rate_Pct)),
    Monotonicity = case_when(
      is.na(Bad_Rate_Change) ~ "Baseline",
      Bad_Rate_Change > 1 ~ "Significant Violation",
      Bad_Rate_Change > 0 ~ "Minor Violation", 
      Bad_Rate_Change <= 0 ~ "OK"
    )
  )

# Display monotonicity results
monotonicity_check %>%
  dplyr::select(
    `Score Band` = ScoreBands,
    `Bad Rate %` = Bad_Rate_Pct,
    `Change from Previous` = Bad_Rate_Change,
    Monotonicity
  ) %>%
  kable(align = "c", digits = 2, caption = "Monotonicity Check") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  row_spec(which(monotonicity_check$Monotonicity == "Significant Violation"), 
           bold = TRUE, color = "red")

Monotonicity Check
Score Band	Bad Rate %	Change from Previous	Monotonicity
743-900	3.23	NA	Baseline
731-742	1.47	-1.76	OK
713-730	3.74	2.27	Significant Violation
701-712	4.34	0.60	Minor Violation
689-700	6.03	1.69	Significant Violation
677-688	6.02	-0.01	OK
665-676	6.29	0.27	Minor Violation
653-664	6.26	-0.03	OK
641-652	6.69	0.43	Minor Violation
620-640	7.39	0.70	Minor Violation
604-619	8.54	1.15	Significant Violation
574-603	8.17	-0.37	OK
530-573	8.96	0.79	Minor Violation
251-529	9.47	0.51	Minor Violation

7. Risk Segmentation

# Create risk segments
risk_segments <- normal_bands %>%
  mutate(
    Risk_Segment = case_when(
      Bad_Rate_Pct >= 20 ~ "Very High Risk",
      Bad_Rate_Pct >= 10 ~ "High Risk",
      Bad_Rate_Pct >= 5 ~ "Medium-High Risk",
      Bad_Rate_Pct >= 2 ~ "Medium Risk", 
      Bad_Rate_Pct >= 1 ~ "Low Risk",
      Bad_Rate_Pct < 1 ~ "Very Low Risk"
    )
  )

# Display risk segments
risk_segments %>%
  dplyr::select(
    `Score Band` = ScoreBands,
    `Bad Rate %` = Bad_Rate_Pct,
    `Risk Segment` = Risk_Segment
  ) %>%
  kable(align = "c", caption = "Risk Segmentation") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Risk Segmentation
Score Band	Bad Rate %	Risk Segment
251-529	9.47	Medium-High Risk
530-573	8.96	Medium-High Risk
574-603	8.17	Medium-High Risk
604-619	8.54	Medium-High Risk
620-640	7.39	Medium-High Risk
641-652	6.69	Medium-High Risk
653-664	6.26	Medium-High Risk
665-676	6.29	Medium-High Risk
677-688	6.02	Medium-High Risk
689-700	6.03	Medium-High Risk
701-712	4.34	Medium Risk
713-730	3.74	Medium Risk
731-742	1.47	Low Risk
743-900	3.23	Medium Risk

7.1 Portfolio Composition by Risk Segment

portfolio_composition <- risk_segments %>%
  group_by(Risk_Segment) %>%
  summarise(
    Bands = n(),
    Total_Accounts = sum(Count),
    Total_Bads = sum(Bad_Count),
    Avg_Bad_Rate = round(Total_Bads / Total_Accounts * 100, 2),
    Portfolio_Share = round(Total_Accounts / sum(Total_Accounts) * 100, 1)
  ) %>%
  arrange(desc(Avg_Bad_Rate))

portfolio_composition %>%
  kable(align = "c", digits = 2, caption = "Portfolio Composition by Risk Segment") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Portfolio Composition by Risk Segment
Risk_Segment	Bands	Total_Accounts	Total_Bads	Avg_Bad_Rate	Portfolio_Share
Medium-High Risk	10	41801	3031	7.25	100
Medium Risk	3	1403	57	4.06	100
Low Risk	1	136	2	1.47	100

8. Summary and Recommendations

cat("## ANALYSIS SUMMARY\n")

## ## ANALYSIS SUMMARY

cat("══════════════════════════════════════\n\n")

## ══════════════════════════════════════

# Overall statistics
total_accounts <- sum(performance_by_band$Count)
total_bads <- sum(performance_by_band$Bad_Count)
overall_bad_rate <- total_bads / total_accounts * 100

cat("### Portfolio Statistics:\n")

## ### Portfolio Statistics:

cat("- Total Accounts:", format(total_accounts, big.mark = ","), "\n")

## - Total Accounts: 43,340

cat("- Total Bad Accounts:", format(total_bads, big.mark = ","), "\n")

## - Total Bad Accounts: 3,090

cat("- Overall Bad Rate:", round(overall_bad_rate, 2), "%\n")

## - Overall Bad Rate: 7.13 %

cat("- Number of Score Bands:", nrow(performance_by_band), "\n")

## - Number of Score Bands: 14

cat("- Model AUC:", round(auc_value, 4), "(", ifelse(auc_value >= 0.8, "Good", "Fair"), ")\n\n")

## - Model AUC: 0.5502 ( Fair )

# Identify key bands
high_risk_bands <- normal_bands %>% filter(Bad_Rate_Pct > 10)
low_risk_bands <- normal_bands %>% filter(Bad_Rate_Pct < 1)
significant_violations <- sum(monotonicity_check$Monotonicity == "Significant Violation", na.rm = TRUE)

cat("### Key Findings:\n")

## ### Key Findings:

if(nrow(high_risk_bands) > 0) {
  cat("- High Risk Bands (Bad Rate > 10%):", paste(high_risk_bands$ScoreBands, collapse = ", "), "\n")
}
if(nrow(low_risk_bands) > 0) {
  cat("- Low Risk Bands (Bad Rate < 1%):", paste(low_risk_bands$ScoreBands, collapse = ", "), "\n")
}
if(significant_violations > 0) {
  cat("- Monotonicity Violations:", significant_violations, "significant violations detected\n")
}

## - Monotonicity Violations: 3 significant violations detected

cat("\n### Recommendations:\n")

## 
## ### Recommendations:

cat("1. Implement risk-based pricing strategies\n")

## 1. Implement risk-based pricing strategies

cat("2. Monitor high-risk bands closely for early warning signs\n")

## 2. Monitor high-risk bands closely for early warning signs

cat("3. Consider expanding credit access for low-risk segments\n")

## 3. Consider expanding credit access for low-risk segments

cat("4. Regularly validate and recalibrate score bands\n")

## 4. Regularly validate and recalibrate score bands

cat("5. Use portfolio composition for strategic planning\n")

## 5. Use portfolio composition for strategic planning

9. Debugging Information

## ## DEBUGGING INFORMATION

## ══════════════════════════

## Data dimensions:

## - Original data: 216701 rows

## - Test data: 43340 rows

## Column names in test data:

##  [1] "sample_order_id"    "ExSubjectId"        "ExContractId"      
##  [4] "SubscriberId"       "TypeOfContractId"   "IsBadCont30DPDL12M"
##  [7] "IsBadCont60DPDL12M" "IsBadCont90DPDL12M" "IsBadCont30DPDL6M" 
## [10] "IsBadCont60DPDL6M"  "IsBadCont90DPDL6M"  "ScoringDate"       
## [13] "Score"              "RiskGrade"          "ScoreBands"        
## [16] "predicted_prob"     "ScoreBands_Ordered"

## 
## Score bands created:

##  [1] "701-712" "641-652" "665-676" "620-640" "689-700" "653-664" "713-730"
##  [8] "574-603" "604-619" "530-573" "251-529" "677-688" "731-742" "743-900"

## 
## Performance metrics calculated successfully.

Credit Score Model Performance Analysis

Analytics Team

2025-11-05

1. Load Required Packages

2. Load and Prepare Data

3. Model Performance Analysis

3.1 Performance Calculation Function

3.2 Run Model Analysis

3.3 Display Performance Metrics

3.4 ROC Curve

3.5 Data Summary

4. Score Band Analysis

4.1 Create Custom Score Bands

4.2 Calculate Band Performance Metrics

4.3 Display Band Performance Table

5. Visualizations

5.1 Bad Rate by Score Band

5.2 Population Distribution

5.3 Combined Population and Bad Rate

6. Monotonicity Analysis

7. Risk Segmentation

7.1 Portfolio Composition by Risk Segment

8. Summary and Recommendations

9. Debugging Information