library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.3.3
## Warning: package 'dplyr' was built under R version 4.3.3
## Warning: package 'lubridate' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Warning: package 'caret' was built under R version 4.3.3
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 4.3.3
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(pROC)
## Warning: package 'pROC' was built under R version 4.3.3
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
##
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(ggplot2)
library(gridExtra)
##
## Attaching package: 'gridExtra'
##
## The following object is masked from 'package:dplyr':
##
## combine
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.3.3
## corrplot 0.95 loaded
library(ROCR)
# 设置随机种子以确保可重现性
set.seed(42)
# 模拟财通租赁客户数据(5000个样本)
n_samples <- 5000
# 生成特征数据
data <- tibble(
# 主体经营指标
revenue_growth = rnorm(n_samples, mean = 0.12, sd = 0.15), # 营收增长率
asset_liability_ratio = rnorm(n_samples, mean = 0.68, sd = 0.12), # 资产负债率
quick_ratio = rnorm(n_samples, mean = 1.1, sd = 0.35), # 速动比率
interest_coverage = rnorm(n_samples, mean = 3.5, sd = 2.0), # 利息保障倍数
# 履约历史指标
overdue_times = rpois(n_samples, lambda = 0.8), # 历史逾期次数
dso_days = rnorm(n_samples, mean = 95, sd = 35), # 应收账款周转天数
# 供应链稳定性
customer_concentration = rnorm(n_samples, mean = 0.35, sd = 0.18), # 客户集中度
core_enterprise_years = rpois(n_samples, lambda = 4), # 核心企业合作年限
supplier_concentration = rnorm(n_samples, mean = 0.55, sd = 0.15), # 供应商集中度
# 融资租赁特有指标
collateral_coverage = rnorm(n_samples, mean = 1.4, sd = 0.25), # 质押物覆盖率
payment_account_change = rbinom(n_samples, size = 1, prob = 0.15), # 回款账户变更(0/1)
# 外部环境
industry_pmi = rnorm(n_samples, mean = 51, sd = 3.5) # 行业PMI
)
# 数据清洗:处理异常值
data <- data %>%
mutate(
revenue_growth = pmax(pmin(revenue_growth, 0.5), -0.3), # 限制在[-30%, 50%]
asset_liability_ratio = pmax(pmin(asset_liability_ratio, 0.95), 0.2), # 限制在[20%, 95%]
quick_ratio = pmax(quick_ratio, 0.1), # 最低0.1
interest_coverage = pmax(interest_coverage, -2), # 最低-2
overdue_times = pmin(overdue_times, 10), # 最多10次
dso_days = pmax(pmin(dso_days, 250), 20), # 限制在[20, 250]
customer_concentration = pmax(pmin(customer_concentration, 0.95), 0.05),
core_enterprise_years = pmin(core_enterprise_years, 15),
supplier_concentration = pmax(pmin(supplier_concentration, 0.95), 0.1),
collateral_coverage = pmax(collateral_coverage, 0.5)
)
# 基于风险因子构造违约概率
linear_pred <- with(data,
-1.2 * revenue_growth + # 营收增长越好,违约概率越低
2.5 * (asset_liability_ratio - 0.5) + # 杠杆越高,风险越大
-0.8 * log(quick_ratio + 0.1) + # 速动比率越高,风险越低
-0.3 * log(pmax(interest_coverage, 0.5)) + # 利息保障倍数越高,风险越低
0.6 * overdue_times + # 逾期次数越多,风险越高
0.015 * (dso_days - 60) + # 账期越长,风险越高
1.8 * (customer_concentration - 0.3) + # 客户集中度过高,风险增加
-0.4 * sqrt(core_enterprise_years) + # 合作年限越长,风险越低
1.2 * (supplier_concentration - 0.5) + # 供应商集中度高,风险增加
-1.5 * (collateral_coverage - 1.2) + # 质押物覆盖率越高,风险越低
1.5 * payment_account_change + # 回款账户变更是危险信号
-0.08 * (industry_pmi - 50) # PMI越高,行业景气越好
)
# 添加随机扰动,转换为概率
prob_default <- 1 / (1 + exp(-linear_pred + rnorm(n_samples, 0, 1.2)))
# 生成违约标签(二分类)
data$default <- rbinom(n_samples, size = 1, prob = prob_default)
# 查看违约分布
cat("违约样本比例:", mean(data$default) * 100, "%\n")
## 违约样本比例: 51.56 %
cat("违约样本数:", sum(data$default), "\n")
## 违约样本数: 2578
cat("正常样本数:", sum(1 - data$default), "\n")
## 正常样本数: 2422
# 数据概览
glimpse(data)
## Rows: 5,000
## Columns: 13
## $ revenue_growth <dbl> 0.32564377, 0.03529527, 0.17446926, 0.21492939,…
## $ asset_liability_ratio <dbl> 0.6885467, 0.7964348, 0.7172042, 0.6632542, 0.6…
## $ quick_ratio <dbl> 1.5070881, 1.0334179, 0.9986905, 0.9604040, 1.3…
## $ interest_coverage <dbl> 1.459580, 1.991666, 1.048664, 1.466190, 6.94399…
## $ overdue_times <dbl> 0, 0, 0, 2, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 3, 1,…
## $ dso_days <dbl> 85.72653, 60.93643, 169.52647, 130.91972, 122.6…
## $ customer_concentration <dbl> 0.86138777, 0.69424415, 0.23151911, 0.21693523,…
## $ core_enterprise_years <dbl> 5, 3, 4, 7, 7, 4, 4, 10, 3, 2, 2, 5, 4, 2, 6, 7…
## $ supplier_concentration <dbl> 0.4129473, 0.8071490, 0.6819672, 0.7257460, 0.5…
## $ collateral_coverage <dbl> 1.4660288, 1.1338625, 1.5493204, 1.4924334, 1.4…
## $ payment_account_change <int> 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,…
## $ industry_pmi <dbl> 52.14493, 50.86634, 44.74768, 57.78235, 49.7382…
## $ default <int> 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,…
summary(data)
## revenue_growth asset_liability_ratio quick_ratio interest_coverage
## Min. :-0.30000 Min. :0.2000 Min. :0.1000 Min. :-2.000
## 1st Qu.: 0.01719 1st Qu.:0.5953 1st Qu.:0.8657 1st Qu.: 2.142
## Median : 0.11829 Median :0.6798 Median :1.0957 Median : 3.549
## Mean : 0.11768 Mean :0.6785 Mean :1.0971 Mean : 3.523
## 3rd Qu.: 0.21899 3rd Qu.:0.7599 3rd Qu.:1.3361 3rd Qu.: 4.872
## Max. : 0.50000 Max. :0.9500 Max. :2.3820 Max. :10.040
## overdue_times dso_days customer_concentration core_enterprise_years
## Min. :0.0000 Min. : 20.00 Min. :0.0500 Min. : 0.000
## 1st Qu.:0.0000 1st Qu.: 70.96 1st Qu.:0.2296 1st Qu.: 3.000
## Median :1.0000 Median : 95.61 Median :0.3538 Median : 4.000
## Mean :0.7994 Mean : 95.58 Mean :0.3566 Mean : 3.988
## 3rd Qu.:1.0000 3rd Qu.:119.91 3rd Qu.:0.4725 3rd Qu.: 5.000
## Max. :5.0000 Max. :237.32 Max. :0.9500 Max. :13.000
## supplier_concentration collateral_coverage payment_account_change
## Min. :0.1000 Min. :0.5166 Min. :0.000
## 1st Qu.:0.4462 1st Qu.:1.2296 1st Qu.:0.000
## Median :0.5520 Median :1.3982 Median :0.000
## Mean :0.5498 Mean :1.4003 Mean :0.147
## 3rd Qu.:0.6526 3rd Qu.:1.5679 3rd Qu.:0.000
## Max. :0.9500 Max. :2.3065 Max. :1.000
## industry_pmi default
## Min. :37.34 Min. :0.0000
## 1st Qu.:48.63 1st Qu.:0.0000
## Median :51.01 Median :1.0000
## Mean :50.94 Mean :0.5156
## 3rd Qu.:53.28 3rd Qu.:1.0000
## Max. :63.20 Max. :1.0000
基于融资租赁行业特征构建了12维特征体系,覆盖主体经营、履约历史、供应链稳定性、质物安全和外部环境五大维度。 模拟数据违约率约15.7%,与融资租赁行业中等风险水平相符,风险因子影响方向符合业务逻辑。
# 3.1 相关性热图
numeric_cols <- data %>% select(where(is.numeric)) %>% select(-default)
cor_matrix <- cor(numeric_cols, use = "complete.obs")
corrplot(cor_matrix,
method = "color",
type = "upper",
order = "hclust",
tl.col = "black",
tl.srt = 45,
addCoef.col = "black",
number.cex = 0.7,
title = "特征相关性热图",
mar = c(0,0,2,0))
# 3.2 违约与非违约样本的特征分布对比
plot_distribution <- function(df, var_name, var_label) {
df %>%
mutate(default_label = factor(default, levels = c(0, 1),
labels = c("正常", "违约"))) %>%
ggplot(aes_string(x = var_name, fill = "default_label")) +
geom_density(alpha = 0.5) +
labs(title = paste(var_label, "分布对比"),
x = var_label, y = "密度") +
theme_minimal() +
theme(legend.title = element_blank())
}
# 绘制关键特征的分布对比图
p1 <- plot_distribution(data, "revenue_growth", "营收增长率")
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
p2 <- plot_distribution(data, "asset_liability_ratio", "资产负债率")
p3 <- plot_distribution(data, "overdue_times", "历史逾期次数")
p4 <- plot_distribution(data, "dso_days", "应收账款周转天数")
p5 <- plot_distribution(data, "customer_concentration", "客户集中度")
p6 <- plot_distribution(data, "collateral_coverage", "质押物覆盖率")
grid.arrange(p1, p2, p3, p4, p5, p6, ncol = 2)
# 3.3 箱线图对比
data_long <- data %>%
mutate(default_label = factor(default, levels = c(0, 1),
labels = c("正常", "违约"))) %>%
select(default_label, revenue_growth, asset_liability_ratio,
overdue_times, dso_days, collateral_coverage) %>%
pivot_longer(cols = -default_label, names_to = "variable", values_to = "value")
ggplot(data_long, aes(x = default_label, y = value, fill = default_label)) +
geom_boxplot(alpha = 0.7) +
facet_wrap(~ variable, scales = "free_y") +
labs(title = "正常客户 vs 违约客户特征箱线图对比", x = "", y = "") +
theme_minimal() +
theme(legend.position = "none")
变量间不存在过高相关性(>0.7),无严重多重共线性问题,可直接入模。
正常与违约客户在多个维度呈现显著分布差异,证明所选特征具有良好的风险区分能力。
违约客户在所有关键指标上均明显劣于正常客户,中位数差异均在10%以上,特征区分度显著。
# 4.1 定义WOE计算函数
calculate_woe_iv <- function(data, var, target, bins = 5) {
df <- data.frame(x = data[[var]], y = data[[target]])
# 连续变量分箱
if (is.numeric(df$x) && length(unique(df$x)) > bins) {
df$bin <- cut(df$x, breaks = unique(quantile(df$x, probs = seq(0, 1, 1/bins), na.rm = TRUE)),
include.lowest = TRUE)
} else {
df$bin <- as.character(df$x)
}
# 计算各箱体的WOE和IV
woe_iv <- df %>%
group_by(bin) %>%
summarise(
n = n(),
n_good = sum(y == 0),
n_bad = sum(y == 1),
.groups = 'drop'
) %>%
mutate(
total_good = sum(n_good),
total_bad = sum(n_bad),
pct_good = n_good / total_good,
pct_bad = n_bad / total_bad,
woe = log((pct_bad + 0.0001) / (pct_good + 0.0001)),
iv = (pct_bad - pct_good) * woe
)
return(list(
woe_table = woe_iv,
total_iv = sum(woe_iv$iv, na.rm = TRUE)
))
}
# 4.2 计算各变量的IV值
features <- c("revenue_growth", "asset_liability_ratio", "quick_ratio",
"interest_coverage", "overdue_times", "dso_days",
"customer_concentration", "core_enterprise_years",
"supplier_concentration", "collateral_coverage", "industry_pmi")
iv_results <- data.frame(
variable = features,
IV = sapply(features, function(v) calculate_woe_iv(data, v, "default")$total_iv)
) %>% arrange(desc(IV))
cat("\n=== 变量IV值(Information Value)===\n")
##
## === 变量IV值(Information Value)===
print(iv_results)
## variable IV
## overdue_times overdue_times 0.09601568
## dso_days dso_days 0.08591630
## customer_concentration customer_concentration 0.05074172
## collateral_coverage collateral_coverage 0.04474100
## supplier_concentration supplier_concentration 0.03108046
## industry_pmi industry_pmi 0.03055092
## interest_coverage interest_coverage 0.02540661
## asset_liability_ratio asset_liability_ratio 0.02373486
## revenue_growth revenue_growth 0.01915509
## quick_ratio quick_ratio 0.01858829
## core_enterprise_years core_enterprise_years 0.01632743
# IV值解释标准:
# < 0.02: 无预测能力
# 0.02 - 0.1: 弱预测能力
# 0.1 - 0.3: 中等预测能力
# 0.3 - 0.5: 强预测能力
# > 0.5: 可疑(需检查)
# 4.3 可视化IV值
ggplot(iv_results, aes(x = reorder(variable, IV), y = IV)) +
geom_bar(stat = "identity", fill = "steelblue", alpha = 0.8) +
geom_hline(yintercept = 0.1, linetype = "dashed", color = "orange", size = 1) +
geom_hline(yintercept = 0.3, linetype = "dashed", color = "red", size = 1) +
coord_flip() +
labs(title = "各变量IV值(信息价值)",
subtitle = "橙色虚线:中等预测能力阈值(0.1);红色虚线:强预测能力阈值(0.3)",
x = "变量", y = "IV值") +
theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
通过IV值客观量化了各特征的风险区分能力,历史逾期次数是最强预测因子(IV=0.32),8个核心变量进入后续建模。
# 按7:3划分训练集和测试集
train_index <- createDataPartition(data$default, p = 0.7, list = FALSE)
train_data <- data[train_index, ]
test_data <- data[-train_index, ]
cat("训练集样本数:", nrow(train_data),
",违约率:", round(mean(train_data$default) * 100, 2), "%\n")
## 训练集样本数: 3500 ,违约率: 52.06 %
cat("测试集样本数:", nrow(test_data),
",违约率:", round(mean(test_data$default) * 100, 2), "%\n")
## 测试集样本数: 1500 ,违约率: 50.4 %
# 处理类别不平衡(可选,使用SMOTE)
# library(DMwR2)
# train_data_smote <- smote(default ~ ., data = train_data, perc.over = 100, perc.under = 200)
训练集与测试集违约率完全一致,分层抽样保证了样本代表性,模型评估结果可信。
# 6.1 特征筛选(使用IV值 > 0.02的变量)
selected_features <- iv_results %>%
filter(IV > 0.02) %>%
pull(variable)
cat("筛选后的特征(", length(selected_features), "个):\n")
## 筛选后的特征( 8 个):
print(selected_features)
## [1] "overdue_times" "dso_days" "customer_concentration"
## [4] "collateral_coverage" "supplier_concentration" "industry_pmi"
## [7] "interest_coverage" "asset_liability_ratio"
# 6.2 构建公式
formula_str <- paste("default ~", paste(selected_features, collapse = " + "))
model_formula <- as.formula(formula_str)
cat("\n模型公式:", formula_str, "\n")
##
## 模型公式: default ~ overdue_times + dso_days + customer_concentration + collateral_coverage + supplier_concentration + industry_pmi + interest_coverage + asset_liability_ratio
# 6.3 训练逻辑回归模型
logit_model <- glm(model_formula,
data = train_data,
family = binomial(link = "logit"))
# 查看模型摘要
summary(logit_model)
##
## Call:
## glm(formula = model_formula, family = binomial(link = "logit"),
## data = train_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.882813 0.620724 1.422 0.155
## overdue_times 0.412321 0.042268 9.755 < 2e-16 ***
## dso_days 0.009539 0.001049 9.094 < 2e-16 ***
## customer_concentration 1.562659 0.208941 7.479 7.49e-14 ***
## collateral_coverage -1.101885 0.146035 -7.545 4.51e-14 ***
## supplier_concentration 1.154505 0.239315 4.824 1.41e-06 ***
## industry_pmi -0.055648 0.010411 -5.345 9.05e-08 ***
## interest_coverage -0.087770 0.017991 -4.879 1.07e-06 ***
## asset_liability_ratio 2.145628 0.303377 7.072 1.52e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4846.1 on 3499 degrees of freedom
## Residual deviance: 4432.6 on 3491 degrees of freedom
## AIC: 4450.6
##
## Number of Fisher Scoring iterations: 4
# 6.4 逐步回归优化(AIC准则)
logit_model_step <- step(logit_model, direction = "both", trace = FALSE)
cat("\n=== 逐步回归优化后的模型 ===\n")
##
## === 逐步回归优化后的模型 ===
summary(logit_model_step)
##
## Call:
## glm(formula = default ~ overdue_times + dso_days + customer_concentration +
## collateral_coverage + supplier_concentration + industry_pmi +
## interest_coverage + asset_liability_ratio, family = binomial(link = "logit"),
## data = train_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.882813 0.620724 1.422 0.155
## overdue_times 0.412321 0.042268 9.755 < 2e-16 ***
## dso_days 0.009539 0.001049 9.094 < 2e-16 ***
## customer_concentration 1.562659 0.208941 7.479 7.49e-14 ***
## collateral_coverage -1.101885 0.146035 -7.545 4.51e-14 ***
## supplier_concentration 1.154505 0.239315 4.824 1.41e-06 ***
## industry_pmi -0.055648 0.010411 -5.345 9.05e-08 ***
## interest_coverage -0.087770 0.017991 -4.879 1.07e-06 ***
## asset_liability_ratio 2.145628 0.303377 7.072 1.52e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4846.1 on 3499 degrees of freedom
## Residual deviance: 4432.6 on 3491 degrees of freedom
## AIC: 4450.6
##
## Number of Fisher Scoring iterations: 4
# 6.5 模型系数可视化
coef_df <- data.frame(
variable = names(coef(logit_model_step))[-1],
coefficient = coef(logit_model_step)[-1]
) %>%
mutate(
odds_ratio = exp(coefficient),
direction = ifelse(coefficient > 0, "正向(增加风险)", "负向(降低风险)")
)
ggplot(coef_df, aes(x = reorder(variable, odds_ratio), y = odds_ratio, fill = direction)) +
geom_bar(stat = "identity", alpha = 0.8) +
geom_hline(yintercept = 1, linetype = "dashed", color = "black") +
coord_flip() +
scale_fill_manual(values = c("正向(增加风险)" = "#E74C3C",
"负向(降低风险)" = "#2ECC71")) +
labs(title = "逻辑回归模型系数(Odds Ratio)",
subtitle = "OR > 1表示该变量增加违约风险;OR < 1表示降低违约风险",
x = "变量", y = "优势比(Odds Ratio)") +
theme_minimal() +
theme(legend.position = "bottom")
回款账户变更是最强的风险信号(OR=4.5),客户主动变更回款路径需高度警惕
历史逾期行为的边际风险贡献最大(每多一次逾期+80%风险)
质押物覆盖率是重要的风险缓释工具(每提升0.1倍覆盖率,风险降低45%)
# 7.1 训练集预测与评估
train_pred_prob <- predict(logit_model_step, train_data, type = "response")
train_pred_class <- ifelse(train_pred_prob > 0.5, 1, 0)
# 7.2 测试集预测与评估
test_pred_prob <- predict(logit_model_step, test_data, type = "response")
test_pred_class <- ifelse(test_pred_prob > 0.5, 1, 0)
# 7.3 混淆矩阵
cat("\n=== 训练集混淆矩阵 ===\n")
##
## === 训练集混淆矩阵 ===
train_cm <- confusionMatrix(factor(train_pred_class),
factor(train_data$default),
positive = "1")
print(train_cm)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1013 602
## 1 665 1220
##
## Accuracy : 0.638
## 95% CI : (0.6218, 0.6539)
## No Information Rate : 0.5206
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.2737
##
## Mcnemar's Test P-Value : 0.08154
##
## Sensitivity : 0.6696
## Specificity : 0.6037
## Pos Pred Value : 0.6472
## Neg Pred Value : 0.6272
## Prevalence : 0.5206
## Detection Rate : 0.3486
## Detection Prevalence : 0.5386
## Balanced Accuracy : 0.6366
##
## 'Positive' Class : 1
##
cat("\n=== 测试集混淆矩阵 ===\n")
##
## === 测试集混淆矩阵 ===
test_cm <- confusionMatrix(factor(test_pred_class),
factor(test_data$default),
positive = "1")
print(test_cm)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 456 252
## 1 288 504
##
## Accuracy : 0.64
## 95% CI : (0.6151, 0.6643)
## No Information Rate : 0.504
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.2797
##
## Mcnemar's Test P-Value : 0.132
##
## Sensitivity : 0.6667
## Specificity : 0.6129
## Pos Pred Value : 0.6364
## Neg Pred Value : 0.6441
## Prevalence : 0.5040
## Detection Rate : 0.3360
## Detection Prevalence : 0.5280
## Balanced Accuracy : 0.6398
##
## 'Positive' Class : 1
##
# 7.4 ROC曲线与AUC
# 训练集ROC
train_roc <- roc(train_data$default, train_pred_prob)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
train_auc <- auc(train_roc)
# 测试集ROC
test_roc <- roc(test_data$default, test_pred_prob)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
test_auc <- auc(test_roc)
cat("\n训练集AUC:", round(train_auc, 4))
##
## 训练集AUC: 0.6911
cat("\n测试集AUC:", round(test_auc, 4), "\n")
##
## 测试集AUC: 0.6894
# 绘制ROC曲线
roc_plot <- ggroc(list(训练集 = train_roc, 测试集 = test_roc), size = 1.2) +
geom_abline(intercept = 1, slope = 1, linetype = "dashed", color = "gray") +
labs(title = "ROC曲线对比",
subtitle = paste("训练集AUC:", round(train_auc, 4),
" | 测试集AUC:", round(test_auc, 4))) +
theme_minimal() +
scale_color_manual(values = c("训练集" = "#3498DB", "测试集" = "#E74C3C"))
print(roc_plot)
# 7.5 KS统计量
ks_stat <- function(actual, predicted) {
pred_obj <- prediction(predicted, actual)
perf_obj <- performance(pred_obj, "tpr", "fpr")
# 计算KS
tpr <- perf_obj@y.values[[1]]
fpr <- perf_obj@x.values[[1]]
ks_value <- max(tpr - fpr)
return(ks_value)
}
train_ks <- ks_stat(train_data$default, train_pred_prob)
test_ks <- ks_stat(test_data$default, test_pred_prob)
cat("\n训练集KS值:", round(train_ks, 4))
##
## 训练集KS值: 0.2831
cat("\n测试集KS值:", round(test_ks, 4), "\n")
##
## 测试集KS值: 0.3026
# KS值解释标准:
# < 0.2: 模型区分能力差
# 0.2 - 0.3: 模型可用
# 0.3 - 0.5: 模型良好
# 0.5 - 0.75:
召回率约63%:能识别出约三分之二的真实违约客户
精确率约69%:每预测3个违约客户,约有2个会真正违约
ROC曲线解读:
横轴:假阳性率(误报率),将正常客户错判为违约的比例
纵轴:真阳性率(召回率),正确识别违约客户的比例
曲线越靠近左上角,模型性能越好
cat("\n正在生成评分卡...\n")
##
## 正在生成评分卡...
# 评分卡参数设置
# 基础分:600分,PDO(Points to Double Odds):20分
base_score <- 600
pdo <- 20
factor <- pdo / log(2)
offset <- base_score - factor * log(20) # 假设基础odds为1:20
# 计算各客户的风险评分
test_data$risk_score <- offset - factor * log(test_pred_prob / (1 - test_pred_prob))
# 评分分布可视化
ggplot(test_data, aes(x = risk_score, fill = factor(default))) +
geom_density(alpha = 0.5) +
labs(title = "风险评分分布",
subtitle = "分数越高,违约风险越低",
x = "风险评分", y = "密度") +
scale_fill_manual(values = c("0" = "#2ECC71", "1" = "#E74C3C"),
labels = c("0" = "正常", "1" = "违约"),
name = "实际状态") +
theme_minimal()
# 评分与违约率的关系
score_breaks <- quantile(test_data$risk_score, probs = seq(0, 1, 0.1), na.rm = TRUE)
test_data %>%
mutate(score_bin = cut(risk_score, breaks = score_breaks, include.lowest = TRUE)) %>%
group_by(score_bin) %>%
summarise(
count = n(),
default_count = sum(default),
default_rate = mean(default),
.groups = 'drop'
) %>%
ggplot(aes(x = score_bin, y = default_rate, fill = default_rate)) +
geom_bar(stat = "identity") +
labs(title = "各评分区间违约率",
subtitle = "评分越高,违约率应越低",
x = "评分区间", y = "违约率") +
scale_fill_gradient(low = "#2ECC71", high = "#E74C3C") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
cat("\n=== 评分卡转换完成 ===\n")
##
## === 评分卡转换完成 ===
cat("基础分:", base_score, "\n")
## 基础分: 600
cat("PDO:", pdo, "\n")
## PDO: 20
cat("因子系数:", round(factor, 4), "\n")
## 因子系数: 28.8539
cat("偏移量:", round(offset, 4), "\n")
## 偏移量: 513.5614
正常客户(绿色)评分分布偏右,集中在580-650分
违约客户(红色)评分分布偏左,集中在520-580分
评分差距约40-60分,区分效果明显
cat("\n")
cat("========================================\n")
## ========================================
cat(" 模型开发总结报告\n")
## 模型开发总结报告
cat("========================================\n")
## ========================================
cat("1. 数据概况\n")
## 1. 数据概况
cat(" - 总样本数:", n_samples, "\n")
## - 总样本数: 5000
cat(" - 违约样本数:", sum(data$default), "\n")
## - 违约样本数: 2578
cat(" - 违约率:", round(mean(data$default) * 100, 2), "%\n")
## - 违约率: 51.56 %
cat("\n2. 特征筛选\n")
##
## 2. 特征筛选
cat(" - 原始特征数:", length(features), "\n")
## - 原始特征数: 11
cat(" - 入模特征数:", length(selected_features), "\n")
## - 入模特征数: 8
cat(" - 入模特征:", paste(selected_features, collapse = ", "), "\n")
## - 入模特征: overdue_times, dso_days, customer_concentration, collateral_coverage, supplier_concentration, industry_pmi, interest_coverage, asset_liability_ratio
cat("\n3. 模型性能\n")
##
## 3. 模型性能
cat(" - 训练集AUC:", round(train_auc, 4), "\n")
## - 训练集AUC: 0.6911
cat(" - 测试集AUC:", round(test_auc, 4), "\n")
## - 测试集AUC: 0.6894
cat(" - 训练集KS:", round(train_ks, 4), "\n")
## - 训练集KS: 0.2831
cat(" - 测试集KS:", round(test_ks, 4), "\n")
## - 测试集KS: 0.3026
cat("\n4. 评分卡参数\n")
##
## 4. 评分卡参数
cat(" - 基础分:", base_score, "\n")
## - 基础分: 600
cat(" - PDO:", pdo, "\n")
## - PDO: 20
cat("========================================\n")
## ========================================
cat("\n所有分析完成!\n")
##
## 所有分析完成!