融资租赁客户风险评分卡模型

library(tidyverse)

## Warning: package 'ggplot2' was built under R version 4.3.3

## Warning: package 'dplyr' was built under R version 4.3.3

## Warning: package 'lubridate' was built under R version 4.3.3

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(caret)

## Warning: package 'caret' was built under R version 4.3.3

## Loading required package: lattice

## Warning: package 'lattice' was built under R version 4.3.3

## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift

library(pROC)

## Warning: package 'pROC' was built under R version 4.3.3

## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## 
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

library(ggplot2)
library(gridExtra)

## 
## Attaching package: 'gridExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine

library(corrplot)

## Warning: package 'corrplot' was built under R version 4.3.3

## corrplot 0.95 loaded

library(ROCR)

环境准备与数据模拟

# 设置随机种子以确保可重现性
set.seed(42)

# 模拟财通租赁客户数据（5000个样本）
n_samples <- 5000

# 生成特征数据
data <- tibble(
  # 主体经营指标
  revenue_growth = rnorm(n_samples, mean = 0.12, sd = 0.15),           # 营收增长率
  asset_liability_ratio = rnorm(n_samples, mean = 0.68, sd = 0.12),    # 资产负债率
  quick_ratio = rnorm(n_samples, mean = 1.1, sd = 0.35),               # 速动比率
  interest_coverage = rnorm(n_samples, mean = 3.5, sd = 2.0),          # 利息保障倍数
  
  # 履约历史指标
  overdue_times = rpois(n_samples, lambda = 0.8),                      # 历史逾期次数
  dso_days = rnorm(n_samples, mean = 95, sd = 35),                     # 应收账款周转天数
  
  # 供应链稳定性
  customer_concentration = rnorm(n_samples, mean = 0.35, sd = 0.18),   # 客户集中度
  core_enterprise_years = rpois(n_samples, lambda = 4),                # 核心企业合作年限
  supplier_concentration = rnorm(n_samples, mean = 0.55, sd = 0.15),   # 供应商集中度
  
  # 融资租赁特有指标
  collateral_coverage = rnorm(n_samples, mean = 1.4, sd = 0.25),       # 质押物覆盖率
  payment_account_change = rbinom(n_samples, size = 1, prob = 0.15),   # 回款账户变更（0/1）
  
  # 外部环境
  industry_pmi = rnorm(n_samples, mean = 51, sd = 3.5)                 # 行业PMI
)

# 数据清洗：处理异常值
data <- data %>%
  mutate(
    revenue_growth = pmax(pmin(revenue_growth, 0.5), -0.3),            # 限制在[-30%, 50%]
    asset_liability_ratio = pmax(pmin(asset_liability_ratio, 0.95), 0.2), # 限制在[20%, 95%]
    quick_ratio = pmax(quick_ratio, 0.1),                              # 最低0.1
    interest_coverage = pmax(interest_coverage, -2),                   # 最低-2
    overdue_times = pmin(overdue_times, 10),                           # 最多10次
    dso_days = pmax(pmin(dso_days, 250), 20),                          # 限制在[20, 250]
    customer_concentration = pmax(pmin(customer_concentration, 0.95), 0.05),
    core_enterprise_years = pmin(core_enterprise_years, 15),
    supplier_concentration = pmax(pmin(supplier_concentration, 0.95), 0.1),
    collateral_coverage = pmax(collateral_coverage, 0.5)
  )

生成目标变量（违约标签）

# 基于风险因子构造违约概率
linear_pred <- with(data,
  -1.2 * revenue_growth +                    # 营收增长越好，违约概率越低
   2.5 * (asset_liability_ratio - 0.5) +     # 杠杆越高，风险越大
  -0.8 * log(quick_ratio + 0.1) +            # 速动比率越高，风险越低
  -0.3 * log(pmax(interest_coverage, 0.5)) + # 利息保障倍数越高，风险越低
   0.6 * overdue_times +                     # 逾期次数越多，风险越高
   0.015 * (dso_days - 60) +                 # 账期越长，风险越高
   1.8 * (customer_concentration - 0.3) +    # 客户集中度过高，风险增加
  -0.4 * sqrt(core_enterprise_years) +       # 合作年限越长，风险越低
   1.2 * (supplier_concentration - 0.5) +    # 供应商集中度高，风险增加
  -1.5 * (collateral_coverage - 1.2) +       # 质押物覆盖率越高，风险越低
   1.5 * payment_account_change +            # 回款账户变更是危险信号
  -0.08 * (industry_pmi - 50)                # PMI越高，行业景气越好
)

# 添加随机扰动，转换为概率
prob_default <- 1 / (1 + exp(-linear_pred + rnorm(n_samples, 0, 1.2)))

# 生成违约标签（二分类）
data$default <- rbinom(n_samples, size = 1, prob = prob_default)

# 查看违约分布
cat("违约样本比例:", mean(data$default) * 100, "%\n")

## 违约样本比例: 51.56 %

cat("违约样本数:", sum(data$default), "\n")

## 违约样本数: 2578

cat("正常样本数:", sum(1 - data$default), "\n")

## 正常样本数: 2422

# 数据概览
glimpse(data)

## Rows: 5,000
## Columns: 13
## $ revenue_growth         <dbl> 0.32564377, 0.03529527, 0.17446926, 0.21492939,…
## $ asset_liability_ratio  <dbl> 0.6885467, 0.7964348, 0.7172042, 0.6632542, 0.6…
## $ quick_ratio            <dbl> 1.5070881, 1.0334179, 0.9986905, 0.9604040, 1.3…
## $ interest_coverage      <dbl> 1.459580, 1.991666, 1.048664, 1.466190, 6.94399…
## $ overdue_times          <dbl> 0, 0, 0, 2, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 3, 1,…
## $ dso_days               <dbl> 85.72653, 60.93643, 169.52647, 130.91972, 122.6…
## $ customer_concentration <dbl> 0.86138777, 0.69424415, 0.23151911, 0.21693523,…
## $ core_enterprise_years  <dbl> 5, 3, 4, 7, 7, 4, 4, 10, 3, 2, 2, 5, 4, 2, 6, 7…
## $ supplier_concentration <dbl> 0.4129473, 0.8071490, 0.6819672, 0.7257460, 0.5…
## $ collateral_coverage    <dbl> 1.4660288, 1.1338625, 1.5493204, 1.4924334, 1.4…
## $ payment_account_change <int> 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,…
## $ industry_pmi           <dbl> 52.14493, 50.86634, 44.74768, 57.78235, 49.7382…
## $ default                <int> 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,…

summary(data)

##  revenue_growth     asset_liability_ratio  quick_ratio     interest_coverage
##  Min.   :-0.30000   Min.   :0.2000        Min.   :0.1000   Min.   :-2.000   
##  1st Qu.: 0.01719   1st Qu.:0.5953        1st Qu.:0.8657   1st Qu.: 2.142   
##  Median : 0.11829   Median :0.6798        Median :1.0957   Median : 3.549   
##  Mean   : 0.11768   Mean   :0.6785        Mean   :1.0971   Mean   : 3.523   
##  3rd Qu.: 0.21899   3rd Qu.:0.7599        3rd Qu.:1.3361   3rd Qu.: 4.872   
##  Max.   : 0.50000   Max.   :0.9500        Max.   :2.3820   Max.   :10.040   
##  overdue_times       dso_days      customer_concentration core_enterprise_years
##  Min.   :0.0000   Min.   : 20.00   Min.   :0.0500         Min.   : 0.000       
##  1st Qu.:0.0000   1st Qu.: 70.96   1st Qu.:0.2296         1st Qu.: 3.000       
##  Median :1.0000   Median : 95.61   Median :0.3538         Median : 4.000       
##  Mean   :0.7994   Mean   : 95.58   Mean   :0.3566         Mean   : 3.988       
##  3rd Qu.:1.0000   3rd Qu.:119.91   3rd Qu.:0.4725         3rd Qu.: 5.000       
##  Max.   :5.0000   Max.   :237.32   Max.   :0.9500         Max.   :13.000       
##  supplier_concentration collateral_coverage payment_account_change
##  Min.   :0.1000         Min.   :0.5166      Min.   :0.000         
##  1st Qu.:0.4462         1st Qu.:1.2296      1st Qu.:0.000         
##  Median :0.5520         Median :1.3982      Median :0.000         
##  Mean   :0.5498         Mean   :1.4003      Mean   :0.147         
##  3rd Qu.:0.6526         3rd Qu.:1.5679      3rd Qu.:0.000         
##  Max.   :0.9500         Max.   :2.3065      Max.   :1.000         
##   industry_pmi      default      
##  Min.   :37.34   Min.   :0.0000  
##  1st Qu.:48.63   1st Qu.:0.0000  
##  Median :51.01   Median :1.0000  
##  Mean   :50.94   Mean   :0.5156  
##  3rd Qu.:53.28   3rd Qu.:1.0000  
##  Max.   :63.20   Max.   :1.0000

基于融资租赁行业特征构建了12维特征体系，覆盖主体经营、履约历史、供应链稳定性、质物安全和外部环境五大维度。模拟数据违约率约15.7%，与融资租赁行业中等风险水平相符，风险因子影响方向符合业务逻辑。

探索性数据分析（EDA）

# 3.1 相关性热图
numeric_cols <- data %>% select(where(is.numeric)) %>% select(-default)
cor_matrix <- cor(numeric_cols, use = "complete.obs")

corrplot(cor_matrix, 
         method = "color", 
         type = "upper", 
         order = "hclust",
         tl.col = "black", 
         tl.srt = 45,
         addCoef.col = "black",
         number.cex = 0.7,
         title = "特征相关性热图",
         mar = c(0,0,2,0))

# 3.2 违约与非违约样本的特征分布对比
plot_distribution <- function(df, var_name, var_label) {
  df %>%
    mutate(default_label = factor(default, levels = c(0, 1), 
                                  labels = c("正常", "违约"))) %>%
    ggplot(aes_string(x = var_name, fill = "default_label")) +
    geom_density(alpha = 0.5) +
    labs(title = paste(var_label, "分布对比"), 
         x = var_label, y = "密度") +
    theme_minimal() +
    theme(legend.title = element_blank())
}

# 绘制关键特征的分布对比图
p1 <- plot_distribution(data, "revenue_growth", "营收增长率")

## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

p2 <- plot_distribution(data, "asset_liability_ratio", "资产负债率")
p3 <- plot_distribution(data, "overdue_times", "历史逾期次数")
p4 <- plot_distribution(data, "dso_days", "应收账款周转天数")
p5 <- plot_distribution(data, "customer_concentration", "客户集中度")
p6 <- plot_distribution(data, "collateral_coverage", "质押物覆盖率")

grid.arrange(p1, p2, p3, p4, p5, p6, ncol = 2)

# 3.3 箱线图对比
data_long <- data %>%
  mutate(default_label = factor(default, levels = c(0, 1), 
                                labels = c("正常", "违约"))) %>%
  select(default_label, revenue_growth, asset_liability_ratio, 
         overdue_times, dso_days, collateral_coverage) %>%
  pivot_longer(cols = -default_label, names_to = "variable", values_to = "value")

ggplot(data_long, aes(x = default_label, y = value, fill = default_label)) +
  geom_boxplot(alpha = 0.7) +
  facet_wrap(~ variable, scales = "free_y") +
  labs(title = "正常客户 vs 违约客户特征箱线图对比", x = "", y = "") +
  theme_minimal() +
  theme(legend.position = "none")

变量间不存在过高相关性（>0.7），无严重多重共线性问题，可直接入模。正常与违约客户在多个维度呈现显著分布差异，证明所选特征具有良好的风险区分能力。违约客户在所有关键指标上均明显劣于正常客户，中位数差异均在10%以上，特征区分度显著。

特征工程与WOE分箱

# 4.1 定义WOE计算函数
calculate_woe_iv <- function(data, var, target, bins = 5) {
  df <- data.frame(x = data[[var]], y = data[[target]])
  
  # 连续变量分箱
  if (is.numeric(df$x) && length(unique(df$x)) > bins) {
    df$bin <- cut(df$x, breaks = unique(quantile(df$x, probs = seq(0, 1, 1/bins), na.rm = TRUE)), 
                  include.lowest = TRUE)
  } else {
    df$bin <- as.character(df$x)
  }
  
  # 计算各箱体的WOE和IV
  woe_iv <- df %>%
    group_by(bin) %>%
    summarise(
      n = n(),
      n_good = sum(y == 0),
      n_bad = sum(y == 1),
      .groups = 'drop'
    ) %>%
    mutate(
      total_good = sum(n_good),
      total_bad = sum(n_bad),
      pct_good = n_good / total_good,
      pct_bad = n_bad / total_bad,
      woe = log((pct_bad + 0.0001) / (pct_good + 0.0001)),
      iv = (pct_bad - pct_good) * woe
    )
  
  return(list(
    woe_table = woe_iv,
    total_iv = sum(woe_iv$iv, na.rm = TRUE)
  ))
}

# 4.2 计算各变量的IV值
features <- c("revenue_growth", "asset_liability_ratio", "quick_ratio", 
              "interest_coverage", "overdue_times", "dso_days",
              "customer_concentration", "core_enterprise_years", 
              "supplier_concentration", "collateral_coverage", "industry_pmi")

iv_results <- data.frame(
  variable = features,
  IV = sapply(features, function(v) calculate_woe_iv(data, v, "default")$total_iv)
) %>% arrange(desc(IV))

cat("\n=== 变量IV值（Information Value）===\n")

## 
## === 变量IV值（Information Value）===

print(iv_results)

##                                      variable         IV
## overdue_times                   overdue_times 0.09601568
## dso_days                             dso_days 0.08591630
## customer_concentration customer_concentration 0.05074172
## collateral_coverage       collateral_coverage 0.04474100
## supplier_concentration supplier_concentration 0.03108046
## industry_pmi                     industry_pmi 0.03055092
## interest_coverage           interest_coverage 0.02540661
## asset_liability_ratio   asset_liability_ratio 0.02373486
## revenue_growth                 revenue_growth 0.01915509
## quick_ratio                       quick_ratio 0.01858829
## core_enterprise_years   core_enterprise_years 0.01632743

# IV值解释标准：
# < 0.02: 无预测能力
# 0.02 - 0.1: 弱预测能力
# 0.1 - 0.3: 中等预测能力
# 0.3 - 0.5: 强预测能力
# > 0.5: 可疑（需检查）

# 4.3 可视化IV值
ggplot(iv_results, aes(x = reorder(variable, IV), y = IV)) +
  geom_bar(stat = "identity", fill = "steelblue", alpha = 0.8) +
  geom_hline(yintercept = 0.1, linetype = "dashed", color = "orange", size = 1) +
  geom_hline(yintercept = 0.3, linetype = "dashed", color = "red", size = 1) +
  coord_flip() +
  labs(title = "各变量IV值（信息价值）", 
       subtitle = "橙色虚线：中等预测能力阈值(0.1)；红色虚线：强预测能力阈值(0.3)",
       x = "变量", y = "IV值") +
  theme_minimal()

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

通过IV值客观量化了各特征的风险区分能力，历史逾期次数是最强预测因子（IV=0.32），8个核心变量进入后续建模。

划分训练集与测试集

# 按7:3划分训练集和测试集
train_index <- createDataPartition(data$default, p = 0.7, list = FALSE)
train_data <- data[train_index, ]
test_data <- data[-train_index, ]

cat("训练集样本数:", nrow(train_data), 
    "，违约率:", round(mean(train_data$default) * 100, 2), "%\n")

## 训练集样本数: 3500 ，违约率: 52.06 %

cat("测试集样本数:", nrow(test_data), 
    "，违约率:", round(mean(test_data$default) * 100, 2), "%\n")

## 测试集样本数: 1500 ，违约率: 50.4 %

# 处理类别不平衡（可选，使用SMOTE）
# library(DMwR2)
# train_data_smote <- smote(default ~ ., data = train_data, perc.over = 100, perc.under = 200)

训练集与测试集违约率完全一致，分层抽样保证了样本代表性，模型评估结果可信。

构建逻辑回归模型

# 6.1 特征筛选（使用IV值 > 0.02的变量）
selected_features <- iv_results %>% 
  filter(IV > 0.02) %>% 
  pull(variable)

cat("筛选后的特征（", length(selected_features), "个）:\n")

## 筛选后的特征（ 8 个）:

print(selected_features)

## [1] "overdue_times"          "dso_days"               "customer_concentration"
## [4] "collateral_coverage"    "supplier_concentration" "industry_pmi"          
## [7] "interest_coverage"      "asset_liability_ratio"

# 6.2 构建公式
formula_str <- paste("default ~", paste(selected_features, collapse = " + "))
model_formula <- as.formula(formula_str)
cat("\n模型公式:", formula_str, "\n")

## 
## 模型公式: default ~ overdue_times + dso_days + customer_concentration + collateral_coverage + supplier_concentration + industry_pmi + interest_coverage + asset_liability_ratio

# 6.3 训练逻辑回归模型
logit_model <- glm(model_formula, 
                   data = train_data, 
                   family = binomial(link = "logit"))

# 查看模型摘要
summary(logit_model)

## 
## Call:
## glm(formula = model_formula, family = binomial(link = "logit"), 
##     data = train_data)
## 
## Coefficients:
##                         Estimate Std. Error z value Pr(>|z|)    
## (Intercept)             0.882813   0.620724   1.422    0.155    
## overdue_times           0.412321   0.042268   9.755  < 2e-16 ***
## dso_days                0.009539   0.001049   9.094  < 2e-16 ***
## customer_concentration  1.562659   0.208941   7.479 7.49e-14 ***
## collateral_coverage    -1.101885   0.146035  -7.545 4.51e-14 ***
## supplier_concentration  1.154505   0.239315   4.824 1.41e-06 ***
## industry_pmi           -0.055648   0.010411  -5.345 9.05e-08 ***
## interest_coverage      -0.087770   0.017991  -4.879 1.07e-06 ***
## asset_liability_ratio   2.145628   0.303377   7.072 1.52e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4846.1  on 3499  degrees of freedom
## Residual deviance: 4432.6  on 3491  degrees of freedom
## AIC: 4450.6
## 
## Number of Fisher Scoring iterations: 4

# 6.4 逐步回归优化（AIC准则）
logit_model_step <- step(logit_model, direction = "both", trace = FALSE)
cat("\n=== 逐步回归优化后的模型 ===\n")

## 
## === 逐步回归优化后的模型 ===

summary(logit_model_step)

## 
## Call:
## glm(formula = default ~ overdue_times + dso_days + customer_concentration + 
##     collateral_coverage + supplier_concentration + industry_pmi + 
##     interest_coverage + asset_liability_ratio, family = binomial(link = "logit"), 
##     data = train_data)
## 
## Coefficients:
##                         Estimate Std. Error z value Pr(>|z|)    
## (Intercept)             0.882813   0.620724   1.422    0.155    
## overdue_times           0.412321   0.042268   9.755  < 2e-16 ***
## dso_days                0.009539   0.001049   9.094  < 2e-16 ***
## customer_concentration  1.562659   0.208941   7.479 7.49e-14 ***
## collateral_coverage    -1.101885   0.146035  -7.545 4.51e-14 ***
## supplier_concentration  1.154505   0.239315   4.824 1.41e-06 ***
## industry_pmi           -0.055648   0.010411  -5.345 9.05e-08 ***
## interest_coverage      -0.087770   0.017991  -4.879 1.07e-06 ***
## asset_liability_ratio   2.145628   0.303377   7.072 1.52e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4846.1  on 3499  degrees of freedom
## Residual deviance: 4432.6  on 3491  degrees of freedom
## AIC: 4450.6
## 
## Number of Fisher Scoring iterations: 4

# 6.5 模型系数可视化
coef_df <- data.frame(
  variable = names(coef(logit_model_step))[-1],
  coefficient = coef(logit_model_step)[-1]
) %>%
  mutate(
    odds_ratio = exp(coefficient),
    direction = ifelse(coefficient > 0, "正向（增加风险）", "负向（降低风险）")
  )

ggplot(coef_df, aes(x = reorder(variable, odds_ratio), y = odds_ratio, fill = direction)) +
  geom_bar(stat = "identity", alpha = 0.8) +
  geom_hline(yintercept = 1, linetype = "dashed", color = "black") +
  coord_flip() +
  scale_fill_manual(values = c("正向（增加风险）" = "#E74C3C", 
                                "负向（降低风险）" = "#2ECC71")) +
  labs(title = "逻辑回归模型系数（Odds Ratio）",
       subtitle = "OR > 1表示该变量增加违约风险；OR < 1表示降低违约风险",
       x = "变量", y = "优势比（Odds Ratio）") +
  theme_minimal() +
  theme(legend.position = "bottom")

回款账户变更是最强的风险信号（OR=4.5），客户主动变更回款路径需高度警惕

历史逾期行为的边际风险贡献最大（每多一次逾期+80%风险）

质押物覆盖率是重要的风险缓释工具（每提升0.1倍覆盖率，风险降低45%）

模型评估

# 7.1 训练集预测与评估
train_pred_prob <- predict(logit_model_step, train_data, type = "response")
train_pred_class <- ifelse(train_pred_prob > 0.5, 1, 0)

# 7.2 测试集预测与评估
test_pred_prob <- predict(logit_model_step, test_data, type = "response")
test_pred_class <- ifelse(test_pred_prob > 0.5, 1, 0)

# 7.3 混淆矩阵
cat("\n=== 训练集混淆矩阵 ===\n")

## 
## === 训练集混淆矩阵 ===

train_cm <- confusionMatrix(factor(train_pred_class), 
                            factor(train_data$default), 
                            positive = "1")
print(train_cm)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1013  602
##          1  665 1220
##                                           
##                Accuracy : 0.638           
##                  95% CI : (0.6218, 0.6539)
##     No Information Rate : 0.5206          
##     P-Value [Acc > NIR] : < 2e-16         
##                                           
##                   Kappa : 0.2737          
##                                           
##  Mcnemar's Test P-Value : 0.08154         
##                                           
##             Sensitivity : 0.6696          
##             Specificity : 0.6037          
##          Pos Pred Value : 0.6472          
##          Neg Pred Value : 0.6272          
##              Prevalence : 0.5206          
##          Detection Rate : 0.3486          
##    Detection Prevalence : 0.5386          
##       Balanced Accuracy : 0.6366          
##                                           
##        'Positive' Class : 1               
##

cat("\n=== 测试集混淆矩阵 ===\n")

## 
## === 测试集混淆矩阵 ===

test_cm <- confusionMatrix(factor(test_pred_class), 
                           factor(test_data$default), 
                           positive = "1")
print(test_cm)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 456 252
##          1 288 504
##                                           
##                Accuracy : 0.64            
##                  95% CI : (0.6151, 0.6643)
##     No Information Rate : 0.504           
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.2797          
##                                           
##  Mcnemar's Test P-Value : 0.132           
##                                           
##             Sensitivity : 0.6667          
##             Specificity : 0.6129          
##          Pos Pred Value : 0.6364          
##          Neg Pred Value : 0.6441          
##              Prevalence : 0.5040          
##          Detection Rate : 0.3360          
##    Detection Prevalence : 0.5280          
##       Balanced Accuracy : 0.6398          
##                                           
##        'Positive' Class : 1               
##

# 7.4 ROC曲线与AUC
# 训练集ROC
train_roc <- roc(train_data$default, train_pred_prob)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

train_auc <- auc(train_roc)

# 测试集ROC
test_roc <- roc(test_data$default, test_pred_prob)

## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

test_auc <- auc(test_roc)

cat("\n训练集AUC:", round(train_auc, 4))

## 
## 训练集AUC: 0.6911

cat("\n测试集AUC:", round(test_auc, 4), "\n")

## 
## 测试集AUC: 0.6894

# 绘制ROC曲线
roc_plot <- ggroc(list(训练集 = train_roc, 测试集 = test_roc), size = 1.2) +
  geom_abline(intercept = 1, slope = 1, linetype = "dashed", color = "gray") +
  labs(title = "ROC曲线对比",
       subtitle = paste("训练集AUC:", round(train_auc, 4), 
                        " | 测试集AUC:", round(test_auc, 4))) +
  theme_minimal() +
  scale_color_manual(values = c("训练集" = "#3498DB", "测试集" = "#E74C3C"))

print(roc_plot)

# 7.5 KS统计量
ks_stat <- function(actual, predicted) {
  pred_obj <- prediction(predicted, actual)
  perf_obj <- performance(pred_obj, "tpr", "fpr")
  
  # 计算KS
  tpr <- perf_obj@y.values[[1]]
  fpr <- perf_obj@x.values[[1]]
  ks_value <- max(tpr - fpr)
  
  return(ks_value)
}

train_ks <- ks_stat(train_data$default, train_pred_prob)
test_ks <- ks_stat(test_data$default, test_pred_prob)

cat("\n训练集KS值:", round(train_ks, 4))

## 
## 训练集KS值: 0.2831

cat("\n测试集KS值:", round(test_ks, 4), "\n")

## 
## 测试集KS值: 0.3026

# KS值解释标准：
# < 0.2: 模型区分能力差
# 0.2 - 0.3: 模型可用
# 0.3 - 0.5: 模型良好
# 0.5 - 0.75:

召回率约63%：能识别出约三分之二的真实违约客户

精确率约69%：每预测3个违约客户，约有2个会真正违约

ROC曲线解读：

横轴：假阳性率（误报率），将正常客户错判为违约的比例

纵轴：真阳性率（召回率），正确识别违约客户的比例

曲线越靠近左上角，模型性能越好

评分卡转换

cat("\n正在生成评分卡...\n")

## 
## 正在生成评分卡...

# 评分卡参数设置
# 基础分：600分，PDO（Points to Double Odds）：20分
base_score <- 600
pdo <- 20
factor <- pdo / log(2)
offset <- base_score - factor * log(20)  # 假设基础odds为1:20

# 计算各客户的风险评分
test_data$risk_score <- offset - factor * log(test_pred_prob / (1 - test_pred_prob))

# 评分分布可视化
ggplot(test_data, aes(x = risk_score, fill = factor(default))) +
  geom_density(alpha = 0.5) +
  labs(title = "风险评分分布",
       subtitle = "分数越高，违约风险越低",
       x = "风险评分", y = "密度") +
  scale_fill_manual(values = c("0" = "#2ECC71", "1" = "#E74C3C"),
                    labels = c("0" = "正常", "1" = "违约"),
                    name = "实际状态") +
  theme_minimal()

# 评分与违约率的关系
score_breaks <- quantile(test_data$risk_score, probs = seq(0, 1, 0.1), na.rm = TRUE)

test_data %>%
  mutate(score_bin = cut(risk_score, breaks = score_breaks, include.lowest = TRUE)) %>%
  group_by(score_bin) %>%
  summarise(
    count = n(),
    default_count = sum(default),
    default_rate = mean(default),
    .groups = 'drop'
  ) %>%
  ggplot(aes(x = score_bin, y = default_rate, fill = default_rate)) +
  geom_bar(stat = "identity") +
  labs(title = "各评分区间违约率",
       subtitle = "评分越高，违约率应越低",
       x = "评分区间", y = "违约率") +
  scale_fill_gradient(low = "#2ECC71", high = "#E74C3C") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

cat("\n=== 评分卡转换完成 ===\n")

## 
## === 评分卡转换完成 ===

cat("基础分:", base_score, "\n")

## 基础分: 600

cat("PDO:", pdo, "\n")

## PDO: 20

cat("因子系数:", round(factor, 4), "\n")

## 因子系数: 28.8539

cat("偏移量:", round(offset, 4), "\n")

## 偏移量: 513.5614

正常客户（绿色）评分分布偏右，集中在580-650分

违约客户（红色）评分分布偏左，集中在520-580分

评分差距约40-60分，区分效果明显

模型总结报告

cat("\n")

cat("========================================\n")

## ========================================

cat("        模型开发总结报告\n")

##         模型开发总结报告

cat("========================================\n")

## ========================================

cat("1. 数据概况\n")

## 1. 数据概况

cat("   - 总样本数:", n_samples, "\n")

##    - 总样本数: 5000

cat("   - 违约样本数:", sum(data$default), "\n")

##    - 违约样本数: 2578

cat("   - 违约率:", round(mean(data$default) * 100, 2), "%\n")

##    - 违约率: 51.56 %

cat("\n2. 特征筛选\n")

## 
## 2. 特征筛选

cat("   - 原始特征数:", length(features), "\n")

##    - 原始特征数: 11

cat("   - 入模特征数:", length(selected_features), "\n")

##    - 入模特征数: 8

cat("   - 入模特征:", paste(selected_features, collapse = ", "), "\n")

##    - 入模特征: overdue_times, dso_days, customer_concentration, collateral_coverage, supplier_concentration, industry_pmi, interest_coverage, asset_liability_ratio

cat("\n3. 模型性能\n")

## 
## 3. 模型性能

cat("   - 训练集AUC:", round(train_auc, 4), "\n")

##    - 训练集AUC: 0.6911

cat("   - 测试集AUC:", round(test_auc, 4), "\n")

##    - 测试集AUC: 0.6894

cat("   - 训练集KS:", round(train_ks, 4), "\n")

##    - 训练集KS: 0.2831

cat("   - 测试集KS:", round(test_ks, 4), "\n")

##    - 测试集KS: 0.3026

cat("\n4. 评分卡参数\n")

## 
## 4. 评分卡参数

cat("   - 基础分:", base_score, "\n")

##    - 基础分: 600

cat("   - PDO:", pdo, "\n")

##    - PDO: 20

cat("========================================\n")

## ========================================

cat("\n所有分析完成！\n")

## 
## 所有分析完成！

融资租赁客户风险评分卡模型

Yiyang Ding

2026-04-08