# Data manipulation and analysis
library(dplyr)
library(tidyr)
library(data.table)
library(lubridate)
library(readr)
# Visualization
library(ggplot2)
library(scales)
library(kableExtra)
library(gridExtra)
# Modeling and metrics
library(caret)
library(pROC)
library(scorecard)
library(arm)
library(gbm)
library(Metrics)
# Custom functions
source("apply_scorecard.R")# Load the dataset
CRDB_ModelFeatures_V1 <- read_csv(
"C://Users//m.njuguna//OneDrive - Creditinfo Group//Mary CreditInfo Documents//Analytics 2025//Kenya//CIP Score//Retro Analysis//Performance Analysis//OldCIPCustomers_04112025.csv"
)
cat("Dataset loaded successfully with", nrow(CRDB_ModelFeatures_V1), "rows and", ncol(CRDB_ModelFeatures_V1), "columns.\n")## Dataset loaded successfully with 315775 rows and 15 columns.
# Data cleaning and preparation
CRDB_ModelFeatures_V1 <- CRDB_ModelFeatures_V1 %>%
mutate(
IsBadCont90DPDL12M = ifelse(IsBadCont90DPDL12M >= 1, 1, 0),
Score = as.numeric(Score),
ScoringDate = as.Date(ScoringDate)
) %>%
# Exclude invalid or missing scores
filter(
!is.na(Score),
!is.na(IsBadCont90DPDL12M),
!Score %in% c(999, 250)
)
cat("Data cleaning completed. Final dataset has", nrow(CRDB_ModelFeatures_V1), "rows.\n")## Data cleaning completed. Final dataset has 216701 rows.
calculate_auc_gini <- function(dataset, target_col, predictor_col, train_split = 0.8, seed = 123) {
set.seed(seed)
# Create proper train-test split
train_index <- createDataPartition(dataset[[target_col]], p = train_split, list = FALSE)
train_data <- dataset[train_index, ]
test_data <- dataset[-train_index, ]
formula <- as.formula(paste(target_col, "~", predictor_col))
logistic_model <- glm(formula, data = train_data, family = binomial)
predicted_probs <- predict(logistic_model, test_data, type = "response")
# AUC - use pROC::auc() explicitly
roc_curve <- pROC::roc(test_data[[target_col]], predicted_probs)
auc_value <- pROC::auc(roc_curve) # Fixed: use pROC::auc()
# GINI
gini <- 2 * auc_value - 1
# KS stat
bad_probs <- predicted_probs[test_data[[target_col]] == 1]
good_probs <- predicted_probs[test_data[[target_col]] == 0]
if(length(bad_probs) > 0 && length(good_probs) > 0) {
ks_stat <- max(ecdf(bad_probs)(sort(predicted_probs)) - ecdf(good_probs)(sort(predicted_probs)))
} else {
ks_stat <- NA
warning("Insufficient data for KS calculation")
}
return(list(
perf = tibble(
auc = auc_value,
gini = gini,
ks_stat = ks_stat
),
test_data = test_data %>%
mutate(predicted_prob = predicted_probs)
))
}# Run the performance analysis
results <- calculate_auc_gini(
dataset = CRDB_ModelFeatures_V1,
target_col = "IsBadCont90DPDL12M",
predictor_col = "Score"
)
cat("Model performance analysis completed.\n")## Model performance analysis completed.
# Extract performance metrics
metrics <- results$perf
auc_value <- metrics$auc
gini_value <- metrics$gini
ks_value <- metrics$ks_stat
# Display metrics in a formatted table
performance_table <- tibble(
Metric = c("AUC (Area Under Curve)", "Gini Coefficient", "KS Statistic"),
Value = c(round(auc_value, 4), round(gini_value, 4), round(ks_value, 4)),
Interpretation = c(
ifelse(auc_value >= 0.8, "Good", ifelse(auc_value >= 0.7, "Fair", "Poor")),
ifelse(gini_value >= 0.6, "Strong", ifelse(gini_value >= 0.4, "Moderate", "Weak")),
ifelse(ks_value >= 0.4, "Strong", ifelse(ks_value >= 0.3, "Moderate", "Weak"))
)
)
performance_table %>%
kable(align = "c", caption = "Model Performance Metrics") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| Metric | Value | Interpretation |
|---|---|---|
| AUC (Area Under Curve) | 0.5502 | Poor |
| Gini Coefficient | 0.1003 | Weak |
| KS Statistic | 0.0016 | Weak |
# Generate ROC curve
roc_curve <- roc(results$test_data$IsBadCont90DPDL12M, results$test_data$predicted_prob)
plot(roc_curve, main = "ROC Curve", col = "blue", lwd = 2)
abline(a = 0, b = 1, lty = 2, col = "red")
text(0.6, 0.2, paste("AUC =", round(auc_value, 4)), col = "blue", cex = 1.2)# Basic data summary
cat("## Data Summary\n")## ## Data Summary
cat("Total accounts in test data:", nrow(results$test_data), "\n")## Total accounts in test data: 43340
cat("Bad rate in test data:", round(mean(results$test_data$IsBadCont90DPDL12M) * 100, 2), "%\n")## Bad rate in test data: 7.13 %
cat("Score range:", min(results$test_data$Score), "-", max(results$test_data$Score), "\n")## Score range: 451 - 774
# Score distribution
cat("\n## Score Distribution Summary\n")##
## ## Score Distribution Summary
summary(results$test_data$Score)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 451.0 609.0 642.0 634.4 665.0 774.0
# Identify score column
score_columns <- names(results$test_data)[grepl("score|Score", names(results$test_data))]
score_col <- score_columns[1]
cat("Using column '", score_col, "' for score band analysis.\n")## Using column ' Score ' for score band analysis.
# Define custom score bands
results$test_data <- results$test_data %>%
mutate(
ScoreBands = case_when(
!!sym(score_col) >= 743 & !!sym(score_col) <= 900 ~ "743-900",
!!sym(score_col) >= 731 & !!sym(score_col) <= 742 ~ "731-742",
!!sym(score_col) >= 713 & !!sym(score_col) <= 730 ~ "713-730",
!!sym(score_col) >= 701 & !!sym(score_col) <= 712 ~ "701-712",
!!sym(score_col) >= 689 & !!sym(score_col) <= 700 ~ "689-700",
!!sym(score_col) >= 677 & !!sym(score_col) <= 688 ~ "677-688",
!!sym(score_col) >= 665 & !!sym(score_col) <= 676 ~ "665-676",
!!sym(score_col) >= 653 & !!sym(score_col) <= 664 ~ "653-664",
!!sym(score_col) >= 641 & !!sym(score_col) <= 652 ~ "641-652",
!!sym(score_col) >= 620 & !!sym(score_col) <= 640 ~ "620-640",
!!sym(score_col) >= 604 & !!sym(score_col) <= 619 ~ "604-619",
!!sym(score_col) >= 574 & !!sym(score_col) <= 603 ~ "574-603",
!!sym(score_col) >= 530 & !!sym(score_col) <= 573 ~ "530-573",
!!sym(score_col) >= 251 & !!sym(score_col) <= 529 ~ "251-529",
!!sym(score_col) == 250 ~ "250-250",
!!sym(score_col) == 999 ~ "999-999",
TRUE ~ "Other"
),
ScoreBands_Ordered = factor(ScoreBands, levels = c(
"999-999", "250-250", "251-529", "530-573", "574-603", "604-619",
"620-640", "641-652", "653-664", "665-676", "677-688", "689-700",
"701-712", "713-730", "731-742", "743-900"
))
)
cat("Custom score bands created successfully.\n")## Custom score bands created successfully.
# Calculate performance metrics by score band
performance_by_band <- results$test_data %>%
group_by(ScoreBands, ScoreBands_Ordered) %>%
summarise(
Count = n(),
Bad_Count = sum(IsBadCont90DPDL12M, na.rm = TRUE),
Good_Count = Count - Bad_Count,
Bad_Rate = round(Bad_Count / Count, 4),
Bad_Rate_Pct = round(Bad_Rate * 100, 2),
Avg_Score = round(mean(!!sym(score_col), na.rm = TRUE), 1),
Min_Score = min(!!sym(score_col), na.rm = TRUE),
Max_Score = max(!!sym(score_col), na.rm = TRUE),
.groups = 'drop'
) %>%
arrange(ScoreBands_Ordered) %>%
mutate(
Cumulative_Count = cumsum(Count),
Cumulative_Pct = round(Cumulative_Count / sum(Count) * 100, 1),
Cumulative_Bads = cumsum(Bad_Count),
Cumulative_Bad_Rate = round(Cumulative_Bads / Cumulative_Count * 100, 2)
)
cat("Performance metrics calculated for", nrow(performance_by_band), "score bands.\n")## Performance metrics calculated for 14 score bands.
# Display performance table
performance_by_band %>%
dplyr::select(
`Score Band` = ScoreBands,
Accounts = Count,
`Bad Accounts` = Bad_Count,
`Bad Rate %` = Bad_Rate_Pct,
`Avg Score` = Avg_Score,
`Cumulative %` = Cumulative_Pct,
`Cum Bad Rate %` = Cumulative_Bad_Rate
) %>%
kable(align = "c", digits = 2, caption = "Performance by Score Band") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
row_spec(which(performance_by_band$Bad_Rate_Pct > 10), bold = TRUE, color = "red")| Score Band | Accounts | Bad Accounts | Bad Rate % | Avg Score | Cumulative % | Cum Bad Rate % |
|---|---|---|---|---|---|---|
| 251-529 | 845 | 80 | 9.47 | 517.3 | 1.9 | 9.47 |
| 530-573 | 4176 | 374 | 8.96 | 554.3 | 11.6 | 9.04 |
| 574-603 | 4469 | 365 | 8.17 | 590.0 | 21.9 | 8.63 |
| 604-619 | 4005 | 342 | 8.54 | 611.9 | 31.1 | 8.60 |
| 620-640 | 7352 | 543 | 7.39 | 630.9 | 48.1 | 8.17 |
| 641-652 | 5978 | 400 | 6.69 | 646.5 | 61.9 | 7.84 |
| 653-664 | 5476 | 343 | 6.26 | 658.4 | 74.5 | 7.58 |
| 665-676 | 4401 | 277 | 6.29 | 670.0 | 84.7 | 7.42 |
| 677-688 | 3491 | 210 | 6.02 | 681.8 | 92.7 | 7.30 |
| 689-700 | 1608 | 97 | 6.03 | 693.9 | 96.4 | 7.25 |
| 701-712 | 806 | 35 | 4.34 | 705.4 | 98.3 | 7.20 |
| 713-730 | 535 | 20 | 3.74 | 720.1 | 99.5 | 7.15 |
| 731-742 | 136 | 2 | 1.47 | 736.4 | 99.9 | 7.14 |
| 743-900 | 62 | 2 | 3.23 | 753.0 | 100.0 | 7.13 |
# Filter out special bands for normal analysis
normal_bands <- performance_by_band %>%
filter(!ScoreBands %in% c("999-999", "250-250"))
ggplot(normal_bands, aes(x = reorder(ScoreBands, Avg_Score), y = Bad_Rate_Pct)) +
geom_bar(stat = "identity", fill = "steelblue", alpha = 0.7) +
geom_text(aes(label = paste0(Bad_Rate_Pct, "%")), vjust = -0.5, size = 3) +
labs(
title = "Bad Rate by Score Band",
subtitle = "Lower scores should show higher bad rates (monotonicity)",
x = "Score Band",
y = "Bad Rate (%)"
) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))ggplot(normal_bands, aes(x = reorder(ScoreBands, Avg_Score), y = Count)) +
geom_bar(stat = "identity", fill = "darkorange", alpha = 0.7) +
geom_text(aes(label = scales::comma(Count)), vjust = -0.5, size = 3) +
labs(
title = "Population Distribution by Score Band",
x = "Score Band",
y = "Number of Accounts"
) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))# Create combined plot
p_combined <- ggplot(normal_bands, aes(x = reorder(ScoreBands, Avg_Score))) +
geom_bar(aes(y = Count, fill = "Population"), stat = "identity", alpha = 0.6) +
geom_line(aes(y = Bad_Rate_Pct * max(Count) / max(Bad_Rate_Pct), group = 1, color = "Bad Rate"), size = 1.5) +
geom_point(aes(y = Bad_Rate_Pct * max(Count) / max(Bad_Rate_Pct), color = "Bad Rate"), size = 2) +
scale_y_continuous(
name = "Number of Accounts",
sec.axis = sec_axis(~ . * max(normal_bands$Bad_Rate_Pct) / max(normal_bands$Count),
name = "Bad Rate (%)")
) +
scale_fill_manual(values = c("Population" = "steelblue")) +
scale_color_manual(values = c("Bad Rate" = "red")) +
labs(
title = "Population Distribution and Bad Rate by Score Band",
x = "Score Band",
fill = "",
color = ""
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "top"
)
print(p_combined)# Check monotonicity
monotonicity_check <- normal_bands %>%
arrange(desc(Avg_Score)) %>%
mutate(
Bad_Rate_Change = ifelse(is.na(lag(Bad_Rate_Pct)), NA, Bad_Rate_Pct - lag(Bad_Rate_Pct)),
Monotonicity = case_when(
is.na(Bad_Rate_Change) ~ "Baseline",
Bad_Rate_Change > 1 ~ "Significant Violation",
Bad_Rate_Change > 0 ~ "Minor Violation",
Bad_Rate_Change <= 0 ~ "OK"
)
)
# Display monotonicity results
monotonicity_check %>%
dplyr::select(
`Score Band` = ScoreBands,
`Bad Rate %` = Bad_Rate_Pct,
`Change from Previous` = Bad_Rate_Change,
Monotonicity
) %>%
kable(align = "c", digits = 2, caption = "Monotonicity Check") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
row_spec(which(monotonicity_check$Monotonicity == "Significant Violation"),
bold = TRUE, color = "red")| Score Band | Bad Rate % | Change from Previous | Monotonicity |
|---|---|---|---|
| 743-900 | 3.23 | NA | Baseline |
| 731-742 | 1.47 | -1.76 | OK |
| 713-730 | 3.74 | 2.27 | Significant Violation |
| 701-712 | 4.34 | 0.60 | Minor Violation |
| 689-700 | 6.03 | 1.69 | Significant Violation |
| 677-688 | 6.02 | -0.01 | OK |
| 665-676 | 6.29 | 0.27 | Minor Violation |
| 653-664 | 6.26 | -0.03 | OK |
| 641-652 | 6.69 | 0.43 | Minor Violation |
| 620-640 | 7.39 | 0.70 | Minor Violation |
| 604-619 | 8.54 | 1.15 | Significant Violation |
| 574-603 | 8.17 | -0.37 | OK |
| 530-573 | 8.96 | 0.79 | Minor Violation |
| 251-529 | 9.47 | 0.51 | Minor Violation |
# Create risk segments
risk_segments <- normal_bands %>%
mutate(
Risk_Segment = case_when(
Bad_Rate_Pct >= 20 ~ "Very High Risk",
Bad_Rate_Pct >= 10 ~ "High Risk",
Bad_Rate_Pct >= 5 ~ "Medium-High Risk",
Bad_Rate_Pct >= 2 ~ "Medium Risk",
Bad_Rate_Pct >= 1 ~ "Low Risk",
Bad_Rate_Pct < 1 ~ "Very Low Risk"
)
)
# Display risk segments
risk_segments %>%
dplyr::select(
`Score Band` = ScoreBands,
`Bad Rate %` = Bad_Rate_Pct,
`Risk Segment` = Risk_Segment
) %>%
kable(align = "c", caption = "Risk Segmentation") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| Score Band | Bad Rate % | Risk Segment |
|---|---|---|
| 251-529 | 9.47 | Medium-High Risk |
| 530-573 | 8.96 | Medium-High Risk |
| 574-603 | 8.17 | Medium-High Risk |
| 604-619 | 8.54 | Medium-High Risk |
| 620-640 | 7.39 | Medium-High Risk |
| 641-652 | 6.69 | Medium-High Risk |
| 653-664 | 6.26 | Medium-High Risk |
| 665-676 | 6.29 | Medium-High Risk |
| 677-688 | 6.02 | Medium-High Risk |
| 689-700 | 6.03 | Medium-High Risk |
| 701-712 | 4.34 | Medium Risk |
| 713-730 | 3.74 | Medium Risk |
| 731-742 | 1.47 | Low Risk |
| 743-900 | 3.23 | Medium Risk |
portfolio_composition <- risk_segments %>%
group_by(Risk_Segment) %>%
summarise(
Bands = n(),
Total_Accounts = sum(Count),
Total_Bads = sum(Bad_Count),
Avg_Bad_Rate = round(Total_Bads / Total_Accounts * 100, 2),
Portfolio_Share = round(Total_Accounts / sum(Total_Accounts) * 100, 1)
) %>%
arrange(desc(Avg_Bad_Rate))
portfolio_composition %>%
kable(align = "c", digits = 2, caption = "Portfolio Composition by Risk Segment") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| Risk_Segment | Bands | Total_Accounts | Total_Bads | Avg_Bad_Rate | Portfolio_Share |
|---|---|---|---|---|---|
| Medium-High Risk | 10 | 41801 | 3031 | 7.25 | 100 |
| Medium Risk | 3 | 1403 | 57 | 4.06 | 100 |
| Low Risk | 1 | 136 | 2 | 1.47 | 100 |
cat("## ANALYSIS SUMMARY\n")## ## ANALYSIS SUMMARY
cat("══════════════════════════════════════\n\n")## ══════════════════════════════════════
# Overall statistics
total_accounts <- sum(performance_by_band$Count)
total_bads <- sum(performance_by_band$Bad_Count)
overall_bad_rate <- total_bads / total_accounts * 100
cat("### Portfolio Statistics:\n")## ### Portfolio Statistics:
cat("- Total Accounts:", format(total_accounts, big.mark = ","), "\n")## - Total Accounts: 43,340
cat("- Total Bad Accounts:", format(total_bads, big.mark = ","), "\n")## - Total Bad Accounts: 3,090
cat("- Overall Bad Rate:", round(overall_bad_rate, 2), "%\n")## - Overall Bad Rate: 7.13 %
cat("- Number of Score Bands:", nrow(performance_by_band), "\n")## - Number of Score Bands: 14
cat("- Model AUC:", round(auc_value, 4), "(", ifelse(auc_value >= 0.8, "Good", "Fair"), ")\n\n")## - Model AUC: 0.5502 ( Fair )
# Identify key bands
high_risk_bands <- normal_bands %>% filter(Bad_Rate_Pct > 10)
low_risk_bands <- normal_bands %>% filter(Bad_Rate_Pct < 1)
significant_violations <- sum(monotonicity_check$Monotonicity == "Significant Violation", na.rm = TRUE)
cat("### Key Findings:\n")## ### Key Findings:
if(nrow(high_risk_bands) > 0) {
cat("- High Risk Bands (Bad Rate > 10%):", paste(high_risk_bands$ScoreBands, collapse = ", "), "\n")
}
if(nrow(low_risk_bands) > 0) {
cat("- Low Risk Bands (Bad Rate < 1%):", paste(low_risk_bands$ScoreBands, collapse = ", "), "\n")
}
if(significant_violations > 0) {
cat("- Monotonicity Violations:", significant_violations, "significant violations detected\n")
}## - Monotonicity Violations: 3 significant violations detected
cat("\n### Recommendations:\n")##
## ### Recommendations:
cat("1. Implement risk-based pricing strategies\n")## 1. Implement risk-based pricing strategies
cat("2. Monitor high-risk bands closely for early warning signs\n")## 2. Monitor high-risk bands closely for early warning signs
cat("3. Consider expanding credit access for low-risk segments\n")## 3. Consider expanding credit access for low-risk segments
cat("4. Regularly validate and recalibrate score bands\n")## 4. Regularly validate and recalibrate score bands
cat("5. Use portfolio composition for strategic planning\n")## 5. Use portfolio composition for strategic planning
## ## DEBUGGING INFORMATION
## ══════════════════════════
## Data dimensions:
## - Original data: 216701 rows
## - Test data: 43340 rows
## Column names in test data:
## [1] "sample_order_id" "ExSubjectId" "ExContractId"
## [4] "SubscriberId" "TypeOfContractId" "IsBadCont30DPDL12M"
## [7] "IsBadCont60DPDL12M" "IsBadCont90DPDL12M" "IsBadCont30DPDL6M"
## [10] "IsBadCont60DPDL6M" "IsBadCont90DPDL6M" "ScoringDate"
## [13] "Score" "RiskGrade" "ScoreBands"
## [16] "predicted_prob" "ScoreBands_Ordered"
##
## Score bands created:
## [1] "701-712" "641-652" "665-676" "620-640" "689-700" "653-664" "713-730"
## [8] "574-603" "604-619" "530-573" "251-529" "677-688" "731-742" "743-900"
##
## Performance metrics calculated successfully.