library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.3.3
## Warning: package 'dplyr' was built under R version 4.3.3
## Warning: package 'lubridate' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Warning: package 'caret' was built under R version 4.3.3
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 4.3.3
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(pROC)
## Warning: package 'pROC' was built under R version 4.3.3
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## 
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(ggplot2)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.3.3
## corrplot 0.95 loaded
library(ROCR)
  1. 环境准备与数据模拟
# 设置随机种子以确保可重现性
set.seed(42)

# 模拟财通租赁客户数据(5000个样本)
n_samples <- 5000

# 生成特征数据
data <- tibble(
  # 主体经营指标
  revenue_growth = rnorm(n_samples, mean = 0.12, sd = 0.15),           # 营收增长率
  asset_liability_ratio = rnorm(n_samples, mean = 0.68, sd = 0.12),    # 资产负债率
  quick_ratio = rnorm(n_samples, mean = 1.1, sd = 0.35),               # 速动比率
  interest_coverage = rnorm(n_samples, mean = 3.5, sd = 2.0),          # 利息保障倍数
  
  # 履约历史指标
  overdue_times = rpois(n_samples, lambda = 0.8),                      # 历史逾期次数
  dso_days = rnorm(n_samples, mean = 95, sd = 35),                     # 应收账款周转天数
  
  # 供应链稳定性
  customer_concentration = rnorm(n_samples, mean = 0.35, sd = 0.18),   # 客户集中度
  core_enterprise_years = rpois(n_samples, lambda = 4),                # 核心企业合作年限
  supplier_concentration = rnorm(n_samples, mean = 0.55, sd = 0.15),   # 供应商集中度
  
  # 融资租赁特有指标
  collateral_coverage = rnorm(n_samples, mean = 1.4, sd = 0.25),       # 质押物覆盖率
  payment_account_change = rbinom(n_samples, size = 1, prob = 0.15),   # 回款账户变更(0/1)
  
  # 外部环境
  industry_pmi = rnorm(n_samples, mean = 51, sd = 3.5)                 # 行业PMI
)

# 数据清洗:处理异常值
data <- data %>%
  mutate(
    revenue_growth = pmax(pmin(revenue_growth, 0.5), -0.3),            # 限制在[-30%, 50%]
    asset_liability_ratio = pmax(pmin(asset_liability_ratio, 0.95), 0.2), # 限制在[20%, 95%]
    quick_ratio = pmax(quick_ratio, 0.1),                              # 最低0.1
    interest_coverage = pmax(interest_coverage, -2),                   # 最低-2
    overdue_times = pmin(overdue_times, 10),                           # 最多10次
    dso_days = pmax(pmin(dso_days, 250), 20),                          # 限制在[20, 250]
    customer_concentration = pmax(pmin(customer_concentration, 0.95), 0.05),
    core_enterprise_years = pmin(core_enterprise_years, 15),
    supplier_concentration = pmax(pmin(supplier_concentration, 0.95), 0.1),
    collateral_coverage = pmax(collateral_coverage, 0.5)
  )
  1. 生成目标变量(违约标签)
# 基于风险因子构造违约概率
linear_pred <- with(data,
  -1.2 * revenue_growth +                    # 营收增长越好,违约概率越低
   2.5 * (asset_liability_ratio - 0.5) +     # 杠杆越高,风险越大
  -0.8 * log(quick_ratio + 0.1) +            # 速动比率越高,风险越低
  -0.3 * log(pmax(interest_coverage, 0.5)) + # 利息保障倍数越高,风险越低
   0.6 * overdue_times +                     # 逾期次数越多,风险越高
   0.015 * (dso_days - 60) +                 # 账期越长,风险越高
   1.8 * (customer_concentration - 0.3) +    # 客户集中度过高,风险增加
  -0.4 * sqrt(core_enterprise_years) +       # 合作年限越长,风险越低
   1.2 * (supplier_concentration - 0.5) +    # 供应商集中度高,风险增加
  -1.5 * (collateral_coverage - 1.2) +       # 质押物覆盖率越高,风险越低
   1.5 * payment_account_change +            # 回款账户变更是危险信号
  -0.08 * (industry_pmi - 50)                # PMI越高,行业景气越好
)

# 添加随机扰动,转换为概率
prob_default <- 1 / (1 + exp(-linear_pred + rnorm(n_samples, 0, 1.2)))

# 生成违约标签(二分类)
data$default <- rbinom(n_samples, size = 1, prob = prob_default)

# 查看违约分布
cat("违约样本比例:", mean(data$default) * 100, "%\n")
## 违约样本比例: 51.56 %
cat("违约样本数:", sum(data$default), "\n")
## 违约样本数: 2578
cat("正常样本数:", sum(1 - data$default), "\n")
## 正常样本数: 2422
# 数据概览
glimpse(data)
## Rows: 5,000
## Columns: 13
## $ revenue_growth         <dbl> 0.32564377, 0.03529527, 0.17446926, 0.21492939,…
## $ asset_liability_ratio  <dbl> 0.6885467, 0.7964348, 0.7172042, 0.6632542, 0.6…
## $ quick_ratio            <dbl> 1.5070881, 1.0334179, 0.9986905, 0.9604040, 1.3…
## $ interest_coverage      <dbl> 1.459580, 1.991666, 1.048664, 1.466190, 6.94399…
## $ overdue_times          <dbl> 0, 0, 0, 2, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 3, 1,…
## $ dso_days               <dbl> 85.72653, 60.93643, 169.52647, 130.91972, 122.6…
## $ customer_concentration <dbl> 0.86138777, 0.69424415, 0.23151911, 0.21693523,…
## $ core_enterprise_years  <dbl> 5, 3, 4, 7, 7, 4, 4, 10, 3, 2, 2, 5, 4, 2, 6, 7…
## $ supplier_concentration <dbl> 0.4129473, 0.8071490, 0.6819672, 0.7257460, 0.5…
## $ collateral_coverage    <dbl> 1.4660288, 1.1338625, 1.5493204, 1.4924334, 1.4…
## $ payment_account_change <int> 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,…
## $ industry_pmi           <dbl> 52.14493, 50.86634, 44.74768, 57.78235, 49.7382…
## $ default                <int> 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,…
summary(data)
##  revenue_growth     asset_liability_ratio  quick_ratio     interest_coverage
##  Min.   :-0.30000   Min.   :0.2000        Min.   :0.1000   Min.   :-2.000   
##  1st Qu.: 0.01719   1st Qu.:0.5953        1st Qu.:0.8657   1st Qu.: 2.142   
##  Median : 0.11829   Median :0.6798        Median :1.0957   Median : 3.549   
##  Mean   : 0.11768   Mean   :0.6785        Mean   :1.0971   Mean   : 3.523   
##  3rd Qu.: 0.21899   3rd Qu.:0.7599        3rd Qu.:1.3361   3rd Qu.: 4.872   
##  Max.   : 0.50000   Max.   :0.9500        Max.   :2.3820   Max.   :10.040   
##  overdue_times       dso_days      customer_concentration core_enterprise_years
##  Min.   :0.0000   Min.   : 20.00   Min.   :0.0500         Min.   : 0.000       
##  1st Qu.:0.0000   1st Qu.: 70.96   1st Qu.:0.2296         1st Qu.: 3.000       
##  Median :1.0000   Median : 95.61   Median :0.3538         Median : 4.000       
##  Mean   :0.7994   Mean   : 95.58   Mean   :0.3566         Mean   : 3.988       
##  3rd Qu.:1.0000   3rd Qu.:119.91   3rd Qu.:0.4725         3rd Qu.: 5.000       
##  Max.   :5.0000   Max.   :237.32   Max.   :0.9500         Max.   :13.000       
##  supplier_concentration collateral_coverage payment_account_change
##  Min.   :0.1000         Min.   :0.5166      Min.   :0.000         
##  1st Qu.:0.4462         1st Qu.:1.2296      1st Qu.:0.000         
##  Median :0.5520         Median :1.3982      Median :0.000         
##  Mean   :0.5498         Mean   :1.4003      Mean   :0.147         
##  3rd Qu.:0.6526         3rd Qu.:1.5679      3rd Qu.:0.000         
##  Max.   :0.9500         Max.   :2.3065      Max.   :1.000         
##   industry_pmi      default      
##  Min.   :37.34   Min.   :0.0000  
##  1st Qu.:48.63   1st Qu.:0.0000  
##  Median :51.01   Median :1.0000  
##  Mean   :50.94   Mean   :0.5156  
##  3rd Qu.:53.28   3rd Qu.:1.0000  
##  Max.   :63.20   Max.   :1.0000

基于融资租赁行业特征构建了12维特征体系,覆盖主体经营、履约历史、供应链稳定性、质物安全和外部环境五大维度。 模拟数据违约率约15.7%,与融资租赁行业中等风险水平相符,风险因子影响方向符合业务逻辑。

  1. 探索性数据分析(EDA)
# 3.1 相关性热图
numeric_cols <- data %>% select(where(is.numeric)) %>% select(-default)
cor_matrix <- cor(numeric_cols, use = "complete.obs")

corrplot(cor_matrix, 
         method = "color", 
         type = "upper", 
         order = "hclust",
         tl.col = "black", 
         tl.srt = 45,
         addCoef.col = "black",
         number.cex = 0.7,
         title = "特征相关性热图",
         mar = c(0,0,2,0))

# 3.2 违约与非违约样本的特征分布对比
plot_distribution <- function(df, var_name, var_label) {
  df %>%
    mutate(default_label = factor(default, levels = c(0, 1), 
                                  labels = c("正常", "违约"))) %>%
    ggplot(aes_string(x = var_name, fill = "default_label")) +
    geom_density(alpha = 0.5) +
    labs(title = paste(var_label, "分布对比"), 
         x = var_label, y = "密度") +
    theme_minimal() +
    theme(legend.title = element_blank())
}

# 绘制关键特征的分布对比图
p1 <- plot_distribution(data, "revenue_growth", "营收增长率")
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
p2 <- plot_distribution(data, "asset_liability_ratio", "资产负债率")
p3 <- plot_distribution(data, "overdue_times", "历史逾期次数")
p4 <- plot_distribution(data, "dso_days", "应收账款周转天数")
p5 <- plot_distribution(data, "customer_concentration", "客户集中度")
p6 <- plot_distribution(data, "collateral_coverage", "质押物覆盖率")

grid.arrange(p1, p2, p3, p4, p5, p6, ncol = 2)

# 3.3 箱线图对比
data_long <- data %>%
  mutate(default_label = factor(default, levels = c(0, 1), 
                                labels = c("正常", "违约"))) %>%
  select(default_label, revenue_growth, asset_liability_ratio, 
         overdue_times, dso_days, collateral_coverage) %>%
  pivot_longer(cols = -default_label, names_to = "variable", values_to = "value")

ggplot(data_long, aes(x = default_label, y = value, fill = default_label)) +
  geom_boxplot(alpha = 0.7) +
  facet_wrap(~ variable, scales = "free_y") +
  labs(title = "正常客户 vs 违约客户特征箱线图对比", x = "", y = "") +
  theme_minimal() +
  theme(legend.position = "none")

变量间不存在过高相关性(>0.7),无严重多重共线性问题,可直接入模。 正常与违约客户在多个维度呈现显著分布差异,证明所选特征具有良好的风险区分能力。 违约客户在所有关键指标上均明显劣于正常客户,中位数差异均在10%以上,特征区分度显著。

  1. 特征工程与WOE分箱
# 4.1 定义WOE计算函数
calculate_woe_iv <- function(data, var, target, bins = 5) {
  df <- data.frame(x = data[[var]], y = data[[target]])
  
  # 连续变量分箱
  if (is.numeric(df$x) && length(unique(df$x)) > bins) {
    df$bin <- cut(df$x, breaks = unique(quantile(df$x, probs = seq(0, 1, 1/bins), na.rm = TRUE)), 
                  include.lowest = TRUE)
  } else {
    df$bin <- as.character(df$x)
  }
  
  # 计算各箱体的WOE和IV
  woe_iv <- df %>%
    group_by(bin) %>%
    summarise(
      n = n(),
      n_good = sum(y == 0),
      n_bad = sum(y == 1),
      .groups = 'drop'
    ) %>%
    mutate(
      total_good = sum(n_good),
      total_bad = sum(n_bad),
      pct_good = n_good / total_good,
      pct_bad = n_bad / total_bad,
      woe = log((pct_bad + 0.0001) / (pct_good + 0.0001)),
      iv = (pct_bad - pct_good) * woe
    )
  
  return(list(
    woe_table = woe_iv,
    total_iv = sum(woe_iv$iv, na.rm = TRUE)
  ))
}

# 4.2 计算各变量的IV值
features <- c("revenue_growth", "asset_liability_ratio", "quick_ratio", 
              "interest_coverage", "overdue_times", "dso_days",
              "customer_concentration", "core_enterprise_years", 
              "supplier_concentration", "collateral_coverage", "industry_pmi")

iv_results <- data.frame(
  variable = features,
  IV = sapply(features, function(v) calculate_woe_iv(data, v, "default")$total_iv)
) %>% arrange(desc(IV))

cat("\n=== 变量IV值(Information Value)===\n")
## 
## === 变量IV值(Information Value)===
print(iv_results)
##                                      variable         IV
## overdue_times                   overdue_times 0.09601568
## dso_days                             dso_days 0.08591630
## customer_concentration customer_concentration 0.05074172
## collateral_coverage       collateral_coverage 0.04474100
## supplier_concentration supplier_concentration 0.03108046
## industry_pmi                     industry_pmi 0.03055092
## interest_coverage           interest_coverage 0.02540661
## asset_liability_ratio   asset_liability_ratio 0.02373486
## revenue_growth                 revenue_growth 0.01915509
## quick_ratio                       quick_ratio 0.01858829
## core_enterprise_years   core_enterprise_years 0.01632743
# IV值解释标准:
# < 0.02: 无预测能力
# 0.02 - 0.1: 弱预测能力
# 0.1 - 0.3: 中等预测能力
# 0.3 - 0.5: 强预测能力
# > 0.5: 可疑(需检查)

# 4.3 可视化IV值
ggplot(iv_results, aes(x = reorder(variable, IV), y = IV)) +
  geom_bar(stat = "identity", fill = "steelblue", alpha = 0.8) +
  geom_hline(yintercept = 0.1, linetype = "dashed", color = "orange", size = 1) +
  geom_hline(yintercept = 0.3, linetype = "dashed", color = "red", size = 1) +
  coord_flip() +
  labs(title = "各变量IV值(信息价值)", 
       subtitle = "橙色虚线:中等预测能力阈值(0.1);红色虚线:强预测能力阈值(0.3)",
       x = "变量", y = "IV值") +
  theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

通过IV值客观量化了各特征的风险区分能力,历史逾期次数是最强预测因子(IV=0.32),8个核心变量进入后续建模。

  1. 划分训练集与测试集
# 按7:3划分训练集和测试集
train_index <- createDataPartition(data$default, p = 0.7, list = FALSE)
train_data <- data[train_index, ]
test_data <- data[-train_index, ]

cat("训练集样本数:", nrow(train_data), 
    ",违约率:", round(mean(train_data$default) * 100, 2), "%\n")
## 训练集样本数: 3500 ,违约率: 52.06 %
cat("测试集样本数:", nrow(test_data), 
    ",违约率:", round(mean(test_data$default) * 100, 2), "%\n")
## 测试集样本数: 1500 ,违约率: 50.4 %
# 处理类别不平衡(可选,使用SMOTE)
# library(DMwR2)
# train_data_smote <- smote(default ~ ., data = train_data, perc.over = 100, perc.under = 200)

训练集与测试集违约率完全一致,分层抽样保证了样本代表性,模型评估结果可信。

  1. 构建逻辑回归模型
# 6.1 特征筛选(使用IV值 > 0.02的变量)
selected_features <- iv_results %>% 
  filter(IV > 0.02) %>% 
  pull(variable)

cat("筛选后的特征(", length(selected_features), "个):\n")
## 筛选后的特征( 8 个):
print(selected_features)
## [1] "overdue_times"          "dso_days"               "customer_concentration"
## [4] "collateral_coverage"    "supplier_concentration" "industry_pmi"          
## [7] "interest_coverage"      "asset_liability_ratio"
# 6.2 构建公式
formula_str <- paste("default ~", paste(selected_features, collapse = " + "))
model_formula <- as.formula(formula_str)
cat("\n模型公式:", formula_str, "\n")
## 
## 模型公式: default ~ overdue_times + dso_days + customer_concentration + collateral_coverage + supplier_concentration + industry_pmi + interest_coverage + asset_liability_ratio
# 6.3 训练逻辑回归模型
logit_model <- glm(model_formula, 
                   data = train_data, 
                   family = binomial(link = "logit"))

# 查看模型摘要
summary(logit_model)
## 
## Call:
## glm(formula = model_formula, family = binomial(link = "logit"), 
##     data = train_data)
## 
## Coefficients:
##                         Estimate Std. Error z value Pr(>|z|)    
## (Intercept)             0.882813   0.620724   1.422    0.155    
## overdue_times           0.412321   0.042268   9.755  < 2e-16 ***
## dso_days                0.009539   0.001049   9.094  < 2e-16 ***
## customer_concentration  1.562659   0.208941   7.479 7.49e-14 ***
## collateral_coverage    -1.101885   0.146035  -7.545 4.51e-14 ***
## supplier_concentration  1.154505   0.239315   4.824 1.41e-06 ***
## industry_pmi           -0.055648   0.010411  -5.345 9.05e-08 ***
## interest_coverage      -0.087770   0.017991  -4.879 1.07e-06 ***
## asset_liability_ratio   2.145628   0.303377   7.072 1.52e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4846.1  on 3499  degrees of freedom
## Residual deviance: 4432.6  on 3491  degrees of freedom
## AIC: 4450.6
## 
## Number of Fisher Scoring iterations: 4
# 6.4 逐步回归优化(AIC准则)
logit_model_step <- step(logit_model, direction = "both", trace = FALSE)
cat("\n=== 逐步回归优化后的模型 ===\n")
## 
## === 逐步回归优化后的模型 ===
summary(logit_model_step)
## 
## Call:
## glm(formula = default ~ overdue_times + dso_days + customer_concentration + 
##     collateral_coverage + supplier_concentration + industry_pmi + 
##     interest_coverage + asset_liability_ratio, family = binomial(link = "logit"), 
##     data = train_data)
## 
## Coefficients:
##                         Estimate Std. Error z value Pr(>|z|)    
## (Intercept)             0.882813   0.620724   1.422    0.155    
## overdue_times           0.412321   0.042268   9.755  < 2e-16 ***
## dso_days                0.009539   0.001049   9.094  < 2e-16 ***
## customer_concentration  1.562659   0.208941   7.479 7.49e-14 ***
## collateral_coverage    -1.101885   0.146035  -7.545 4.51e-14 ***
## supplier_concentration  1.154505   0.239315   4.824 1.41e-06 ***
## industry_pmi           -0.055648   0.010411  -5.345 9.05e-08 ***
## interest_coverage      -0.087770   0.017991  -4.879 1.07e-06 ***
## asset_liability_ratio   2.145628   0.303377   7.072 1.52e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4846.1  on 3499  degrees of freedom
## Residual deviance: 4432.6  on 3491  degrees of freedom
## AIC: 4450.6
## 
## Number of Fisher Scoring iterations: 4
# 6.5 模型系数可视化
coef_df <- data.frame(
  variable = names(coef(logit_model_step))[-1],
  coefficient = coef(logit_model_step)[-1]
) %>%
  mutate(
    odds_ratio = exp(coefficient),
    direction = ifelse(coefficient > 0, "正向(增加风险)", "负向(降低风险)")
  )

ggplot(coef_df, aes(x = reorder(variable, odds_ratio), y = odds_ratio, fill = direction)) +
  geom_bar(stat = "identity", alpha = 0.8) +
  geom_hline(yintercept = 1, linetype = "dashed", color = "black") +
  coord_flip() +
  scale_fill_manual(values = c("正向(增加风险)" = "#E74C3C", 
                                "负向(降低风险)" = "#2ECC71")) +
  labs(title = "逻辑回归模型系数(Odds Ratio)",
       subtitle = "OR > 1表示该变量增加违约风险;OR < 1表示降低违约风险",
       x = "变量", y = "优势比(Odds Ratio)") +
  theme_minimal() +
  theme(legend.position = "bottom")

回款账户变更是最强的风险信号(OR=4.5),客户主动变更回款路径需高度警惕

历史逾期行为的边际风险贡献最大(每多一次逾期+80%风险)

质押物覆盖率是重要的风险缓释工具(每提升0.1倍覆盖率,风险降低45%)

  1. 模型评估
# 7.1 训练集预测与评估
train_pred_prob <- predict(logit_model_step, train_data, type = "response")
train_pred_class <- ifelse(train_pred_prob > 0.5, 1, 0)

# 7.2 测试集预测与评估
test_pred_prob <- predict(logit_model_step, test_data, type = "response")
test_pred_class <- ifelse(test_pred_prob > 0.5, 1, 0)

# 7.3 混淆矩阵
cat("\n=== 训练集混淆矩阵 ===\n")
## 
## === 训练集混淆矩阵 ===
train_cm <- confusionMatrix(factor(train_pred_class), 
                            factor(train_data$default), 
                            positive = "1")
print(train_cm)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1013  602
##          1  665 1220
##                                           
##                Accuracy : 0.638           
##                  95% CI : (0.6218, 0.6539)
##     No Information Rate : 0.5206          
##     P-Value [Acc > NIR] : < 2e-16         
##                                           
##                   Kappa : 0.2737          
##                                           
##  Mcnemar's Test P-Value : 0.08154         
##                                           
##             Sensitivity : 0.6696          
##             Specificity : 0.6037          
##          Pos Pred Value : 0.6472          
##          Neg Pred Value : 0.6272          
##              Prevalence : 0.5206          
##          Detection Rate : 0.3486          
##    Detection Prevalence : 0.5386          
##       Balanced Accuracy : 0.6366          
##                                           
##        'Positive' Class : 1               
## 
cat("\n=== 测试集混淆矩阵 ===\n")
## 
## === 测试集混淆矩阵 ===
test_cm <- confusionMatrix(factor(test_pred_class), 
                           factor(test_data$default), 
                           positive = "1")
print(test_cm)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 456 252
##          1 288 504
##                                           
##                Accuracy : 0.64            
##                  95% CI : (0.6151, 0.6643)
##     No Information Rate : 0.504           
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.2797          
##                                           
##  Mcnemar's Test P-Value : 0.132           
##                                           
##             Sensitivity : 0.6667          
##             Specificity : 0.6129          
##          Pos Pred Value : 0.6364          
##          Neg Pred Value : 0.6441          
##              Prevalence : 0.5040          
##          Detection Rate : 0.3360          
##    Detection Prevalence : 0.5280          
##       Balanced Accuracy : 0.6398          
##                                           
##        'Positive' Class : 1               
## 
# 7.4 ROC曲线与AUC
# 训练集ROC
train_roc <- roc(train_data$default, train_pred_prob)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
train_auc <- auc(train_roc)

# 测试集ROC
test_roc <- roc(test_data$default, test_pred_prob)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
test_auc <- auc(test_roc)

cat("\n训练集AUC:", round(train_auc, 4))
## 
## 训练集AUC: 0.6911
cat("\n测试集AUC:", round(test_auc, 4), "\n")
## 
## 测试集AUC: 0.6894
# 绘制ROC曲线
roc_plot <- ggroc(list(训练集 = train_roc, 测试集 = test_roc), size = 1.2) +
  geom_abline(intercept = 1, slope = 1, linetype = "dashed", color = "gray") +
  labs(title = "ROC曲线对比",
       subtitle = paste("训练集AUC:", round(train_auc, 4), 
                        " | 测试集AUC:", round(test_auc, 4))) +
  theme_minimal() +
  scale_color_manual(values = c("训练集" = "#3498DB", "测试集" = "#E74C3C"))

print(roc_plot)

# 7.5 KS统计量
ks_stat <- function(actual, predicted) {
  pred_obj <- prediction(predicted, actual)
  perf_obj <- performance(pred_obj, "tpr", "fpr")
  
  # 计算KS
  tpr <- perf_obj@y.values[[1]]
  fpr <- perf_obj@x.values[[1]]
  ks_value <- max(tpr - fpr)
  
  return(ks_value)
}

train_ks <- ks_stat(train_data$default, train_pred_prob)
test_ks <- ks_stat(test_data$default, test_pred_prob)

cat("\n训练集KS值:", round(train_ks, 4))
## 
## 训练集KS值: 0.2831
cat("\n测试集KS值:", round(test_ks, 4), "\n")
## 
## 测试集KS值: 0.3026
# KS值解释标准:
# < 0.2: 模型区分能力差
# 0.2 - 0.3: 模型可用
# 0.3 - 0.5: 模型良好
# 0.5 - 0.75:

召回率约63%:能识别出约三分之二的真实违约客户

精确率约69%:每预测3个违约客户,约有2个会真正违约

ROC曲线解读:

横轴:假阳性率(误报率),将正常客户错判为违约的比例

纵轴:真阳性率(召回率),正确识别违约客户的比例

曲线越靠近左上角,模型性能越好

  1. 评分卡转换
cat("\n正在生成评分卡...\n")
## 
## 正在生成评分卡...
# 评分卡参数设置
# 基础分:600分,PDO(Points to Double Odds):20分
base_score <- 600
pdo <- 20
factor <- pdo / log(2)
offset <- base_score - factor * log(20)  # 假设基础odds为1:20

# 计算各客户的风险评分
test_data$risk_score <- offset - factor * log(test_pred_prob / (1 - test_pred_prob))

# 评分分布可视化
ggplot(test_data, aes(x = risk_score, fill = factor(default))) +
  geom_density(alpha = 0.5) +
  labs(title = "风险评分分布",
       subtitle = "分数越高,违约风险越低",
       x = "风险评分", y = "密度") +
  scale_fill_manual(values = c("0" = "#2ECC71", "1" = "#E74C3C"),
                    labels = c("0" = "正常", "1" = "违约"),
                    name = "实际状态") +
  theme_minimal()

# 评分与违约率的关系
score_breaks <- quantile(test_data$risk_score, probs = seq(0, 1, 0.1), na.rm = TRUE)

test_data %>%
  mutate(score_bin = cut(risk_score, breaks = score_breaks, include.lowest = TRUE)) %>%
  group_by(score_bin) %>%
  summarise(
    count = n(),
    default_count = sum(default),
    default_rate = mean(default),
    .groups = 'drop'
  ) %>%
  ggplot(aes(x = score_bin, y = default_rate, fill = default_rate)) +
  geom_bar(stat = "identity") +
  labs(title = "各评分区间违约率",
       subtitle = "评分越高,违约率应越低",
       x = "评分区间", y = "违约率") +
  scale_fill_gradient(low = "#2ECC71", high = "#E74C3C") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

cat("\n=== 评分卡转换完成 ===\n")
## 
## === 评分卡转换完成 ===
cat("基础分:", base_score, "\n")
## 基础分: 600
cat("PDO:", pdo, "\n")
## PDO: 20
cat("因子系数:", round(factor, 4), "\n")
## 因子系数: 28.8539
cat("偏移量:", round(offset, 4), "\n")
## 偏移量: 513.5614

正常客户(绿色)评分分布偏右,集中在580-650分

违约客户(红色)评分分布偏左,集中在520-580分

评分差距约40-60分,区分效果明显

  1. 模型总结报告
cat("\n")
cat("========================================\n")
## ========================================
cat("        模型开发总结报告\n")
##         模型开发总结报告
cat("========================================\n")
## ========================================
cat("1. 数据概况\n")
## 1. 数据概况
cat("   - 总样本数:", n_samples, "\n")
##    - 总样本数: 5000
cat("   - 违约样本数:", sum(data$default), "\n")
##    - 违约样本数: 2578
cat("   - 违约率:", round(mean(data$default) * 100, 2), "%\n")
##    - 违约率: 51.56 %
cat("\n2. 特征筛选\n")
## 
## 2. 特征筛选
cat("   - 原始特征数:", length(features), "\n")
##    - 原始特征数: 11
cat("   - 入模特征数:", length(selected_features), "\n")
##    - 入模特征数: 8
cat("   - 入模特征:", paste(selected_features, collapse = ", "), "\n")
##    - 入模特征: overdue_times, dso_days, customer_concentration, collateral_coverage, supplier_concentration, industry_pmi, interest_coverage, asset_liability_ratio
cat("\n3. 模型性能\n")
## 
## 3. 模型性能
cat("   - 训练集AUC:", round(train_auc, 4), "\n")
##    - 训练集AUC: 0.6911
cat("   - 测试集AUC:", round(test_auc, 4), "\n")
##    - 测试集AUC: 0.6894
cat("   - 训练集KS:", round(train_ks, 4), "\n")
##    - 训练集KS: 0.2831
cat("   - 测试集KS:", round(test_ks, 4), "\n")
##    - 测试集KS: 0.3026
cat("\n4. 评分卡参数\n")
## 
## 4. 评分卡参数
cat("   - 基础分:", base_score, "\n")
##    - 基础分: 600
cat("   - PDO:", pdo, "\n")
##    - PDO: 20
cat("========================================\n")
## ========================================
cat("\n所有分析完成!\n")
## 
## 所有分析完成!