本文档描述申请评分卡开发中模型建立阶段的流程,本阶段前应已完成变量筛选缺失值处理等工作,并确定好项目参数(好坏定义表现窗口长度等等)。


1. 数据导入

# 载入所需包 
library(data.table)
library(pipeR)
library(ggplot2)
library(grid)
library(caret)
library(pROC)
library(scales)
library(reshape2)
# 读入数据
data <- fread("申请评分卡数据.csv")

数据概览:

head(data) 
     age income children residence.time career residence.type nationality
1: 28-46  <1000      >=1           >=18  18-96           缺失  土、希、德
2: 28-46  <1000      >=1           >=18   >=96           租住  土、希、德
3: 28-46  <1000      >=1            <18  18-96           租住    其它国家
4: 28-46  <1000      >=1           >=18   >=96           租住  土、希、德
5: 28-46  <1000      >=1           >=18   >=96           租住  土、希、德
6: 28-46  <1000       <1           >=18   >=96           租住  土、希、德
                  creditcard.type flag
1: 欧洲Master、它行Visa或支票账户    1
2: 欧洲Master、它行Visa或支票账户    1
3: 欧洲Master、它行Visa或支票账户    1
4: 欧洲Master、它行Visa或支票账户    1
5: 欧洲Master、它行Visa或支票账户    1
6: 欧洲Master、它行Visa或支票账户    1
table(data$flag)

    0     1 
 1500 45000 

1好0坏,坏账率3.23%


2. 自定义函数

UnivariateWOE <- function(varname, flag = "flag", datainput = data){
# 计算单变量WOE(已做过离散化), 返回信息矩阵, 评分卡专用.
# 参数:
  # varname:   待转换变量名, 需加""
  # flag:      目标变量名,需加"", 1好0坏
  # datainput: 包含自变量和目标变量的数据集
  x <- datainput[[varname]]
  y <- datainput[[flag]]
  z <- addmargins(table(x, y))  # 自变量与目标变量的列联表
  dimnames1 <- dimnames(z)$x  # 结果矩阵行数
  dimnames2 <- c("N","%","#good","%good","#bad","%bad","WOE","IV","bad.rate") 
  m <- matrix(nrow = dim(z)[1], ncol = length(dimnames2), dimnames = list(dimnames1, dimnames2))
  m[, "N"]     <- z[, 3]
  m[, "%"]     <- m[, "N"] / m[nrow(m), "N"]
  m[, "#good"] <- z[, "1"]
  m[, "%good"] <- m[, "#good"] / m[nrow(m), "#good"]
  m[, "#bad"]  <- z[, "0"]
  m[, "%bad"]  <- m[, "#bad"] / m[nrow(m), "#bad"]
  m[, "WOE"]   <- log((m[, "#good"]/m[nrow(m), "#good"])/(m[, "#bad"]/m[nrow(m), "#bad"]))
  m[, "IV"]    <- (m[, "#good"]/m[nrow(m), "#good"] - m[, "#bad"]/m[nrow(m), "#bad"]) * m[, "WOE"]
  m[nrow(m), "IV"] <- sum(m[1:(nrow(m)-1), "IV"])
  m[, "bad.rate"]  <-  m[, "#bad"] / (m[, "#good"] + m[, "#bad"])
  return(m)
}

WOE <- function(flag = "flag", datainput = data){
# 计算所有自变量WOE(已做过离散化), 计算WOE得分替换原有变量, 返回结果列表,评分卡专用.
# 参数:
  # flag:      目标变量名,需加"", 1好0坏
  # datainput: 包含自变量和目标变量的数据集
  varnames <- names(datainput)[names(datainput) != flag]  # 所有自变量名
  woe <- lapply(varnames, UnivariateWOE)  # 各个自变量的woe信息列表
  names(woe) <- varnames
  iv <- sapply(varnames, function(var) woe[[var]][nrow(woe[[var]]), "IV"])  # 所有变量的iv值
  dataoutput <- as.data.frame(datainput)
  for (var in varnames){
    components <- dimnames(woe[[var]])[[1]][1:nrow(woe[[var]])-1]  # 每个变量的分量名
    for (comp in components){
      var.woe <- paste0(var, ".woe")
      dataoutput[datainput[[var]] == comp, var.woe] <- woe[[var]][comp, "WOE"]  # 以WOE得分替换原变量
    }  
  }
  result <- list(WOE = woe,
                 IV  = iv,
                 New = as.data.table(dataoutput[, setdiff(names(dataoutput), varnames)]))
}

3. 建立模型

modelingdata <- WOE("flag", data)
print(modelingdata$WOE)  # 所有变量的WOE信息
$age
          N          % #good     %good #bad      %bad        WOE
<23    2774 0.05965591  2520 0.0560000  254 0.1693333 -1.1065175
>=46  10179 0.21890323 10050 0.2233333  129 0.0860000  0.9543181
23-28  8007 0.17219355  7560 0.1680000  447 0.2980000 -0.5731295
28-46 25540 0.54924731 24870 0.5526667  670 0.4466667  0.2129424
Sum   46500 1.00000000 45000 1.0000000 1500 1.0000000  0.0000000
              IV   bad.rate
<23   0.12540531 0.09156453
>=46  0.13105969 0.01267315
23-28 0.07450684 0.05582615
28-46 0.02257190 0.02623336
Sum   0.35354374 0.03225806

$income
              N         % #good     %good #bad      %bad        WOE
<1000     14221 0.3058280 13980 0.3106667  241 0.1606667  0.6593887
>=2400    19258 0.4141505 18690 0.4153333  568 0.3786667  0.0924251
1000-2400 13021 0.2800215 12330 0.2740000  691 0.4606667 -0.5195466
Sum       46500 1.0000000 45000 1.0000000 1500 1.0000000  0.0000000
                  IV   bad.rate
<1000     0.09890831 0.01694677
>=2400    0.00338892 0.02949424
1000-2400 0.09698203 0.05306812
Sum       0.19927926 0.03225806

$children
        N         % #good %good #bad      %bad        WOE         IV
<1  22117 0.4756344 21240 0.472  877 0.5846667 -0.2140629 0.02411775
>=1 24383 0.5243656 23760 0.528  623 0.4153333  0.2400149 0.02704168
Sum 46500 1.0000000 45000 1.000 1500 1.0000000  0.0000000 0.05115943
      bad.rate
<1  0.03965276
>=1 0.02555059
Sum 0.03225806

$residence.time
         N         % #good     %good #bad  %bad         WOE          IV
<18   8409 0.1808387  8040 0.1786667  369 0.246 -0.31980966 0.021533851
>=18 38091 0.8191613 36960 0.8213333 1131 0.754  0.08553667 0.005759469
Sum  46500 1.0000000 45000 1.0000000 1500 1.000  0.00000000 0.027293320
       bad.rate
<18  0.04388156
>=18 0.02969205
Sum  0.03225806

$career
          N         % #good     %good #bad      %bad         WOE
<18    8035 0.1727957  7590 0.1686667  445 0.2966667 -0.56468479
>=96  15321 0.3294839 15060 0.3346667  261 0.1740000  0.65407971
18-96 23144 0.4977204 22350 0.4966667  794 0.5293333 -0.06369924
Sum   46500 1.0000000 45000 1.0000000 1500 1.0000000  0.00000000
               IV   bad.rate
<18   0.072279654 0.05538270
>=96  0.105088807 0.01703544
18-96 0.002080842 0.03430695
Sum   0.179449303 0.03225806

$residence.type
         N          % #good      %good #bad       %bad         WOE
缺失  8220 0.17677419  7950 0.17666667  270 0.18000000 -0.01869213
自有  2329 0.05008602  2280 0.05066667   49 0.03266667  0.43891304
租住 35951 0.77313978 34770 0.77266667 1181 0.78733333 -0.01880397
Sum  46500 1.00000000 45000 1.00000000 1500 1.00000000  0.00000000
                IV   bad.rate
缺失 0.00006230711 0.03284672
自有 0.00790043476 0.02103907
租住 0.00027579160 0.03285027
Sum  0.00823853347 0.03225806

$nationality
               N          % #good      %good #bad       %bad         WOE
其它国家    1088 0.02339785  1020 0.02266667   68 0.04533333 -0.69314718
土、希、德 44377 0.95434409 42960 0.95466667 1417 0.94466667  0.01053011
意、南、西  1035 0.02225806  1020 0.02266667   15 0.01000000  0.81831032
Sum        46500 1.00000000 45000 1.00000000 1500 1.00000000  0.00000000
                     IV   bad.rate
其它国家   0.0157113361 0.06250000
土、希、德 0.0001053011 0.03193096
意、南、西 0.0103652641 0.01449275
Sum        0.0261819013 0.03225806

$creditcard.type
                                   N           % #good        %good #bad
欧洲Master、它行Visa或支票账户 17115 0.368064516 16830 0.3740000000  285
我行Visa卡                        32 0.000688172    30 0.0006666667    2
无信用卡                       29167 0.627247312 27960 0.6213333333 1207
运通或其它信用卡                 186 0.004000000   180 0.0040000000    6
Sum                            46500 1.000000000 45000 1.0000000000 1500
                                      %bad        WOE           IV
欧洲Master、它行Visa或支票账户 0.190000000  0.6772317 0.1246106374
我行Visa卡                     0.001333333 -0.6931472 0.0004620981
无信用卡                       0.804666667 -0.2585604 0.0474027412
运通或其它信用卡               0.004000000  0.0000000 0.0000000000
Sum                            1.000000000  0.0000000 0.1724754767
                                 bad.rate
欧洲Master、它行Visa或支票账户 0.01665206
我行Visa卡                     0.06250000
无信用卡                       0.04138238
运通或其它信用卡               0.03225806
Sum                            0.03225806
print(modelingdata$IV)   # 所有变量的IV信息
            age          income        children  residence.time 
    0.353543737     0.199279259     0.051159429     0.027293320 
         career  residence.type     nationality creditcard.type 
    0.179449303     0.008238533     0.026181901     0.172475477 
print(modelingdata$New)  # 计算WOE得分替换原有变量
       flag   age.woe income.woe children.woe residence.time.woe
    1:    1 0.2129424  0.6593887    0.2400149         0.08553667
    2:    1 0.2129424  0.6593887    0.2400149         0.08553667
    3:    1 0.2129424  0.6593887    0.2400149        -0.31980966
    4:    1 0.2129424  0.6593887    0.2400149         0.08553667
    5:    1 0.2129424  0.6593887    0.2400149         0.08553667
   ---                                                          
46496:    1 0.2129424 -0.5195466   -0.2140629         0.08553667
46497:    1 0.2129424 -0.5195466   -0.2140629         0.08553667
46498:    1 0.2129424 -0.5195466   -0.2140629         0.08553667
46499:    1 0.2129424 -0.5195466   -0.2140629         0.08553667
46500:    1 0.2129424 -0.5195466   -0.2140629         0.08553667
        career.woe residence.type.woe nationality.woe creditcard.type.woe
    1: -0.06369924        -0.01869213      0.01053011           0.6772317
    2:  0.65407971        -0.01880397      0.01053011           0.6772317
    3: -0.06369924        -0.01880397     -0.69314718           0.6772317
    4:  0.65407971        -0.01880397      0.01053011           0.6772317
    5:  0.65407971        -0.01880397      0.01053011           0.6772317
   ---                                                                   
46496: -0.06369924        -0.01880397      0.01053011          -0.6931472
46497: -0.06369924        -0.01880397      0.01053011          -0.6931472
46498: -0.06369924        -0.01880397      0.01053011          -0.6931472
46499: -0.06369924        -0.01880397      0.01053011          -0.6931472
46500: -0.06369924        -0.01880397      0.01053011          -0.6931472
trainindex <- createDataPartition(modelingdata$New$flag, p = 0.75, list = FALSE)  # 生成区分训练集和测试集的索引
train <- modelingdata$New[as.vector(trainindex)]   # 训练集
test  <- modelingdata$New[-as.vector(trainindex)]  # 测试集
model <- glm(flag~., family = binomial(), data = train)  # 建立logistic回归模型
summary(model)

Call:
glm(formula = flag ~ ., family = binomial(), data = train)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-3.4255   0.1661   0.2083   0.2795   0.6234  

Coefficients:
                    Estimate Std. Error z value Pr(>|z|)    
(Intercept)          3.38754    0.03231 104.834  < 2e-16 ***
age.woe              0.75426    0.05499  13.717  < 2e-16 ***
income.woe           0.25204    0.09983   2.525  0.01158 *  
children.woe         0.38992    0.14863   2.623  0.00871 ** 
residence.time.woe   0.10708    0.18099   0.592  0.55409    
career.woe           0.56420    0.08287   6.808 9.87e-12 ***
residence.type.woe   0.19875    0.36466   0.545  0.58573    
nationality.woe      0.94943    0.18091   5.248 1.54e-07 ***
creditcard.type.woe  0.72143    0.11036   6.537 6.26e-11 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 9967.0  on 34874  degrees of freedom
Residual deviance: 9335.4  on 34866  degrees of freedom
AIC: 9353.4

Number of Fisher Scoring iterations: 7
CoefficientsValue <- matrix(coefficients(model), dimnames = list(names(coefficients(model)), "value"))  # 系数
print(CoefficientsValue)
                        value
(Intercept)         3.3875388
age.woe             0.7542584
income.woe          0.2520444
children.woe        0.3899165
residence.time.woe  0.1070800
career.woe          0.5642004
residence.type.woe  0.1987536
nationality.woe     0.9494262
creditcard.type.woe 0.7214278
formula <- paste0("ln(odds) = ", paste(coefficients(model), names(coefficients(model)), sep = "*", collapse = " + "))  # 回归方程
print(formula)
[1] "ln(odds) = 3.3875388099496*(Intercept) + 0.754258366145558*age.woe + 0.252044434418482*income.woe + 0.389916473279434*children.woe + 0.10707997896305*residence.time.woe + 0.564200416409627*career.woe + 0.198753570808216*residence.type.woe + 0.949426229253606*nationality.woe + 0.721427784358597*creditcard.type.woe"

注:本阶段省去了模型诊断步骤,如自变量的多重共线性回归系数显著性等等。


4. 模型评估

train$prob <- predict(model, train, type ="response")
test$prob  <- predict(model, test,  type ="response")

roc.test <- roc(response = test$flag, predictor = test$prob)  # 测试集ROC曲线
plot(roc.test, print.auc = TRUE, auc.polygon = TRUE, grid = c(0.1,0.2),
     grid.col = c("green", "red"), max.auc.polygon =TRUE,
     auc.polygon.col = "skyblue", print.thres = TRUE)


Call:
roc.default(response = test$flag, predictor = test$prob)

Data: test$prob in 371 controls (test$flag 0) < 11254 cases (test$flag 1).
Area under the curve: 0.7239

5. 评分校准

score = offset + factor * ln(odds)  
score + pdo = offset + factor * ln(2 * odds)
解方程组得:
factor = pdo / ln2
offset = score - factor * ln(odds)
score <- 500
odds  <- 30
pdo   <- 50
factor <- pdo / log(2) 
offset <- 500 - factor * log(30)
print(factor)  # 斜率
[1] 72.13475
print(offset)  # 截距
[1] 254.6555

6. 评分

# 计算评分
train[, score := round(offset + factor * log(prob / (1 - prob)))] 
test[,  score := round(offset + factor * log(prob / (1 - prob)))]
ggplot(test, aes(x = score)) + geom_histogram()  # 测试集评分分布

# 测试集好/坏客户分布
ggplot(test, aes(x = score, color = factor(flag, labels = c("bad","good")))) + 
geom_density() + 
labs(color = "flag")

# K-S曲线
test[order(score), 
     list(badcount  = sum(flag == 0),
          goodcount = sum(flag == 1)),
     by = score][, list(score,
                        badrate  = cumsum(badcount) / sum(badcount),
                        goodrate = cumsum(goodcount) / sum(goodcount))] %>>% 
(~ KS <- .[badrate - goodrate == max(badrate - goodrate)])  %>>%  # 储存KS值 
melt(., id.vars = "score", measure.vars = c("badrate","goodrate"), 
     variable.name = "variable", value.name = "value") %>>% 
with(ggplot(., aes(x = score, y = value, color = variable)) + 
     geom_line() +
     annotate("segment", x = KS[,score], xend = KS[, score], y = KS[, goodrate], yend = KS[, badrate], linetype = "longdash",
              arrow = arrow(angle = 90, ends = "both", length = unit(0.1, "cm"), type = "closed")) +  # 使用线段标识取KS值的位置
     annotate("text", x = KS[,score] - 50, y = KS[, badrate] + 0.05,
              color = "blue", 
              label = paste0("KS = ", round(KS[, badrate - goodrate], 2))) +  # 在图中显示KS值
     labs(title = "K-S图", x = "score", y = "accumulative % of applicants", color = NULL)
    )