本文档描述申请评分卡开发中模型建立阶段的流程,本阶段前应已完成变量筛选,缺失值处理等工作,并确定好项目参数(好坏定义,表现窗口长度等等)。
# 载入所需包
library(data.table)
library(pipeR)
library(ggplot2)
library(grid)
library(caret)
library(pROC)
library(scales)
library(reshape2)
# 读入数据
data <- fread("申请评分卡数据.csv")
数据概览:
head(data)
age income children residence.time career residence.type nationality
1: 28-46 <1000 >=1 >=18 18-96 缺失 土、希、德
2: 28-46 <1000 >=1 >=18 >=96 租住 土、希、德
3: 28-46 <1000 >=1 <18 18-96 租住 其它国家
4: 28-46 <1000 >=1 >=18 >=96 租住 土、希、德
5: 28-46 <1000 >=1 >=18 >=96 租住 土、希、德
6: 28-46 <1000 <1 >=18 >=96 租住 土、希、德
creditcard.type flag
1: 欧洲Master、它行Visa或支票账户 1
2: 欧洲Master、它行Visa或支票账户 1
3: 欧洲Master、它行Visa或支票账户 1
4: 欧洲Master、它行Visa或支票账户 1
5: 欧洲Master、它行Visa或支票账户 1
6: 欧洲Master、它行Visa或支票账户 1
table(data$flag)
0 1
1500 45000
1好0坏,坏账率3.23%
UnivariateWOE <- function(varname, flag = "flag", datainput = data){
# 计算单变量WOE(已做过离散化), 返回信息矩阵, 评分卡专用.
# 参数:
# varname: 待转换变量名, 需加""
# flag: 目标变量名,需加"", 1好0坏
# datainput: 包含自变量和目标变量的数据集
x <- datainput[[varname]]
y <- datainput[[flag]]
z <- addmargins(table(x, y)) # 自变量与目标变量的列联表
dimnames1 <- dimnames(z)$x # 结果矩阵行数
dimnames2 <- c("N","%","#good","%good","#bad","%bad","WOE","IV","bad.rate")
m <- matrix(nrow = dim(z)[1], ncol = length(dimnames2), dimnames = list(dimnames1, dimnames2))
m[, "N"] <- z[, 3]
m[, "%"] <- m[, "N"] / m[nrow(m), "N"]
m[, "#good"] <- z[, "1"]
m[, "%good"] <- m[, "#good"] / m[nrow(m), "#good"]
m[, "#bad"] <- z[, "0"]
m[, "%bad"] <- m[, "#bad"] / m[nrow(m), "#bad"]
m[, "WOE"] <- log((m[, "#good"]/m[nrow(m), "#good"])/(m[, "#bad"]/m[nrow(m), "#bad"]))
m[, "IV"] <- (m[, "#good"]/m[nrow(m), "#good"] - m[, "#bad"]/m[nrow(m), "#bad"]) * m[, "WOE"]
m[nrow(m), "IV"] <- sum(m[1:(nrow(m)-1), "IV"])
m[, "bad.rate"] <- m[, "#bad"] / (m[, "#good"] + m[, "#bad"])
return(m)
}
WOE <- function(flag = "flag", datainput = data){
# 计算所有自变量WOE(已做过离散化), 计算WOE得分替换原有变量, 返回结果列表,评分卡专用.
# 参数:
# flag: 目标变量名,需加"", 1好0坏
# datainput: 包含自变量和目标变量的数据集
varnames <- names(datainput)[names(datainput) != flag] # 所有自变量名
woe <- lapply(varnames, UnivariateWOE) # 各个自变量的woe信息列表
names(woe) <- varnames
iv <- sapply(varnames, function(var) woe[[var]][nrow(woe[[var]]), "IV"]) # 所有变量的iv值
dataoutput <- as.data.frame(datainput)
for (var in varnames){
components <- dimnames(woe[[var]])[[1]][1:nrow(woe[[var]])-1] # 每个变量的分量名
for (comp in components){
var.woe <- paste0(var, ".woe")
dataoutput[datainput[[var]] == comp, var.woe] <- woe[[var]][comp, "WOE"] # 以WOE得分替换原变量
}
}
result <- list(WOE = woe,
IV = iv,
New = as.data.table(dataoutput[, setdiff(names(dataoutput), varnames)]))
}
modelingdata <- WOE("flag", data)
print(modelingdata$WOE) # 所有变量的WOE信息
$age
N % #good %good #bad %bad WOE
<23 2774 0.05965591 2520 0.0560000 254 0.1693333 -1.1065175
>=46 10179 0.21890323 10050 0.2233333 129 0.0860000 0.9543181
23-28 8007 0.17219355 7560 0.1680000 447 0.2980000 -0.5731295
28-46 25540 0.54924731 24870 0.5526667 670 0.4466667 0.2129424
Sum 46500 1.00000000 45000 1.0000000 1500 1.0000000 0.0000000
IV bad.rate
<23 0.12540531 0.09156453
>=46 0.13105969 0.01267315
23-28 0.07450684 0.05582615
28-46 0.02257190 0.02623336
Sum 0.35354374 0.03225806
$income
N % #good %good #bad %bad WOE
<1000 14221 0.3058280 13980 0.3106667 241 0.1606667 0.6593887
>=2400 19258 0.4141505 18690 0.4153333 568 0.3786667 0.0924251
1000-2400 13021 0.2800215 12330 0.2740000 691 0.4606667 -0.5195466
Sum 46500 1.0000000 45000 1.0000000 1500 1.0000000 0.0000000
IV bad.rate
<1000 0.09890831 0.01694677
>=2400 0.00338892 0.02949424
1000-2400 0.09698203 0.05306812
Sum 0.19927926 0.03225806
$children
N % #good %good #bad %bad WOE IV
<1 22117 0.4756344 21240 0.472 877 0.5846667 -0.2140629 0.02411775
>=1 24383 0.5243656 23760 0.528 623 0.4153333 0.2400149 0.02704168
Sum 46500 1.0000000 45000 1.000 1500 1.0000000 0.0000000 0.05115943
bad.rate
<1 0.03965276
>=1 0.02555059
Sum 0.03225806
$residence.time
N % #good %good #bad %bad WOE IV
<18 8409 0.1808387 8040 0.1786667 369 0.246 -0.31980966 0.021533851
>=18 38091 0.8191613 36960 0.8213333 1131 0.754 0.08553667 0.005759469
Sum 46500 1.0000000 45000 1.0000000 1500 1.000 0.00000000 0.027293320
bad.rate
<18 0.04388156
>=18 0.02969205
Sum 0.03225806
$career
N % #good %good #bad %bad WOE
<18 8035 0.1727957 7590 0.1686667 445 0.2966667 -0.56468479
>=96 15321 0.3294839 15060 0.3346667 261 0.1740000 0.65407971
18-96 23144 0.4977204 22350 0.4966667 794 0.5293333 -0.06369924
Sum 46500 1.0000000 45000 1.0000000 1500 1.0000000 0.00000000
IV bad.rate
<18 0.072279654 0.05538270
>=96 0.105088807 0.01703544
18-96 0.002080842 0.03430695
Sum 0.179449303 0.03225806
$residence.type
N % #good %good #bad %bad WOE
缺失 8220 0.17677419 7950 0.17666667 270 0.18000000 -0.01869213
自有 2329 0.05008602 2280 0.05066667 49 0.03266667 0.43891304
租住 35951 0.77313978 34770 0.77266667 1181 0.78733333 -0.01880397
Sum 46500 1.00000000 45000 1.00000000 1500 1.00000000 0.00000000
IV bad.rate
缺失 0.00006230711 0.03284672
自有 0.00790043476 0.02103907
租住 0.00027579160 0.03285027
Sum 0.00823853347 0.03225806
$nationality
N % #good %good #bad %bad WOE
其它国家 1088 0.02339785 1020 0.02266667 68 0.04533333 -0.69314718
土、希、德 44377 0.95434409 42960 0.95466667 1417 0.94466667 0.01053011
意、南、西 1035 0.02225806 1020 0.02266667 15 0.01000000 0.81831032
Sum 46500 1.00000000 45000 1.00000000 1500 1.00000000 0.00000000
IV bad.rate
其它国家 0.0157113361 0.06250000
土、希、德 0.0001053011 0.03193096
意、南、西 0.0103652641 0.01449275
Sum 0.0261819013 0.03225806
$creditcard.type
N % #good %good #bad
欧洲Master、它行Visa或支票账户 17115 0.368064516 16830 0.3740000000 285
我行Visa卡 32 0.000688172 30 0.0006666667 2
无信用卡 29167 0.627247312 27960 0.6213333333 1207
运通或其它信用卡 186 0.004000000 180 0.0040000000 6
Sum 46500 1.000000000 45000 1.0000000000 1500
%bad WOE IV
欧洲Master、它行Visa或支票账户 0.190000000 0.6772317 0.1246106374
我行Visa卡 0.001333333 -0.6931472 0.0004620981
无信用卡 0.804666667 -0.2585604 0.0474027412
运通或其它信用卡 0.004000000 0.0000000 0.0000000000
Sum 1.000000000 0.0000000 0.1724754767
bad.rate
欧洲Master、它行Visa或支票账户 0.01665206
我行Visa卡 0.06250000
无信用卡 0.04138238
运通或其它信用卡 0.03225806
Sum 0.03225806
print(modelingdata$IV) # 所有变量的IV信息
age income children residence.time
0.353543737 0.199279259 0.051159429 0.027293320
career residence.type nationality creditcard.type
0.179449303 0.008238533 0.026181901 0.172475477
print(modelingdata$New) # 计算WOE得分替换原有变量
flag age.woe income.woe children.woe residence.time.woe
1: 1 0.2129424 0.6593887 0.2400149 0.08553667
2: 1 0.2129424 0.6593887 0.2400149 0.08553667
3: 1 0.2129424 0.6593887 0.2400149 -0.31980966
4: 1 0.2129424 0.6593887 0.2400149 0.08553667
5: 1 0.2129424 0.6593887 0.2400149 0.08553667
---
46496: 1 0.2129424 -0.5195466 -0.2140629 0.08553667
46497: 1 0.2129424 -0.5195466 -0.2140629 0.08553667
46498: 1 0.2129424 -0.5195466 -0.2140629 0.08553667
46499: 1 0.2129424 -0.5195466 -0.2140629 0.08553667
46500: 1 0.2129424 -0.5195466 -0.2140629 0.08553667
career.woe residence.type.woe nationality.woe creditcard.type.woe
1: -0.06369924 -0.01869213 0.01053011 0.6772317
2: 0.65407971 -0.01880397 0.01053011 0.6772317
3: -0.06369924 -0.01880397 -0.69314718 0.6772317
4: 0.65407971 -0.01880397 0.01053011 0.6772317
5: 0.65407971 -0.01880397 0.01053011 0.6772317
---
46496: -0.06369924 -0.01880397 0.01053011 -0.6931472
46497: -0.06369924 -0.01880397 0.01053011 -0.6931472
46498: -0.06369924 -0.01880397 0.01053011 -0.6931472
46499: -0.06369924 -0.01880397 0.01053011 -0.6931472
46500: -0.06369924 -0.01880397 0.01053011 -0.6931472
trainindex <- createDataPartition(modelingdata$New$flag, p = 0.75, list = FALSE) # 生成区分训练集和测试集的索引
train <- modelingdata$New[as.vector(trainindex)] # 训练集
test <- modelingdata$New[-as.vector(trainindex)] # 测试集
model <- glm(flag~., family = binomial(), data = train) # 建立logistic回归模型
summary(model)
Call:
glm(formula = flag ~ ., family = binomial(), data = train)
Deviance Residuals:
Min 1Q Median 3Q Max
-3.4255 0.1661 0.2083 0.2795 0.6234
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 3.38754 0.03231 104.834 < 2e-16 ***
age.woe 0.75426 0.05499 13.717 < 2e-16 ***
income.woe 0.25204 0.09983 2.525 0.01158 *
children.woe 0.38992 0.14863 2.623 0.00871 **
residence.time.woe 0.10708 0.18099 0.592 0.55409
career.woe 0.56420 0.08287 6.808 9.87e-12 ***
residence.type.woe 0.19875 0.36466 0.545 0.58573
nationality.woe 0.94943 0.18091 5.248 1.54e-07 ***
creditcard.type.woe 0.72143 0.11036 6.537 6.26e-11 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 9967.0 on 34874 degrees of freedom
Residual deviance: 9335.4 on 34866 degrees of freedom
AIC: 9353.4
Number of Fisher Scoring iterations: 7
CoefficientsValue <- matrix(coefficients(model), dimnames = list(names(coefficients(model)), "value")) # 系数
print(CoefficientsValue)
value
(Intercept) 3.3875388
age.woe 0.7542584
income.woe 0.2520444
children.woe 0.3899165
residence.time.woe 0.1070800
career.woe 0.5642004
residence.type.woe 0.1987536
nationality.woe 0.9494262
creditcard.type.woe 0.7214278
formula <- paste0("ln(odds) = ", paste(coefficients(model), names(coefficients(model)), sep = "*", collapse = " + ")) # 回归方程
print(formula)
[1] "ln(odds) = 3.3875388099496*(Intercept) + 0.754258366145558*age.woe + 0.252044434418482*income.woe + 0.389916473279434*children.woe + 0.10707997896305*residence.time.woe + 0.564200416409627*career.woe + 0.198753570808216*residence.type.woe + 0.949426229253606*nationality.woe + 0.721427784358597*creditcard.type.woe"
注:本阶段省去了模型诊断步骤,如自变量的多重共线性,回归系数显著性等等。
train$prob <- predict(model, train, type ="response")
test$prob <- predict(model, test, type ="response")
roc.test <- roc(response = test$flag, predictor = test$prob) # 测试集ROC曲线
plot(roc.test, print.auc = TRUE, auc.polygon = TRUE, grid = c(0.1,0.2),
grid.col = c("green", "red"), max.auc.polygon =TRUE,
auc.polygon.col = "skyblue", print.thres = TRUE)
Call:
roc.default(response = test$flag, predictor = test$prob)
Data: test$prob in 371 controls (test$flag 0) < 11254 cases (test$flag 1).
Area under the curve: 0.7239
score = offset + factor * ln(odds)
score + pdo = offset + factor * ln(2 * odds)
解方程组得:
factor = pdo / ln2
offset = score - factor * ln(odds)
score <- 500
odds <- 30
pdo <- 50
factor <- pdo / log(2)
offset <- 500 - factor * log(30)
print(factor) # 斜率
[1] 72.13475
print(offset) # 截距
[1] 254.6555
# 计算评分
train[, score := round(offset + factor * log(prob / (1 - prob)))]
test[, score := round(offset + factor * log(prob / (1 - prob)))]
ggplot(test, aes(x = score)) + geom_histogram() # 测试集评分分布
# 测试集好/坏客户分布
ggplot(test, aes(x = score, color = factor(flag, labels = c("bad","good")))) +
geom_density() +
labs(color = "flag")
# K-S曲线
test[order(score),
list(badcount = sum(flag == 0),
goodcount = sum(flag == 1)),
by = score][, list(score,
badrate = cumsum(badcount) / sum(badcount),
goodrate = cumsum(goodcount) / sum(goodcount))] %>>%
(~ KS <- .[badrate - goodrate == max(badrate - goodrate)]) %>>% # 储存KS值
melt(., id.vars = "score", measure.vars = c("badrate","goodrate"),
variable.name = "variable", value.name = "value") %>>%
with(ggplot(., aes(x = score, y = value, color = variable)) +
geom_line() +
annotate("segment", x = KS[,score], xend = KS[, score], y = KS[, goodrate], yend = KS[, badrate], linetype = "longdash",
arrow = arrow(angle = 90, ends = "both", length = unit(0.1, "cm"), type = "closed")) + # 使用线段标识取KS值的位置
annotate("text", x = KS[,score] - 50, y = KS[, badrate] + 0.05,
color = "blue",
label = paste0("KS = ", round(KS[, badrate - goodrate], 2))) + # 在图中显示KS值
labs(title = "K-S图", x = "score", y = "accumulative % of applicants", color = NULL)
)