##参数 #1)echo = FALSE：隐藏代码，但运行代码并产生所有输出，曲线图，警告和消息。 #2)eval = FALSE：显示代码，但不对其进行执行。 #3)fig.show = "hide"：隐藏图。 #4)include = FALSE：运行代码，但不显示代码及任何输出。这对于设置代码很有帮助。打开新的R Markdown文档时，您可以在第一个代码块中看到一个示例！ #5)message = FALSE：防止软件包在加载时打印消息。这也抑制了函数生成的消息。 #6)results = "hide"：隐藏打印输出。 #7)warning = FALSE：防止软件包和功能显示警告。

##通过DALEX进行模型解释、变量特征重要性、单特征对模型的影响

Including R code

#加载包
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.0     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.1     ✔ tibble    3.1.8
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(caret)

## 载入需要的程辑包：lattice
## 
## 载入程辑包：'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift

library(DALEX)

## Welcome to DALEX (version: 2.4.3).
## Find examples and detailed introduction at: http://ema.drwhy.ai/
## 
## 
## 载入程辑包：'DALEX'
## 
## The following object is masked from 'package:dplyr':
## 
##     explain

1）加载数据并划分训练集和测试集

# Load the GiveMeSomeCredit training data. The path below is machine-specific;
# point it at a local copy of cs-training.csv before running.
# (An `encoding = "UTF-8"` argument was left commented out in the original.)
pb <- data.table::fread(
  "D:/360MoveData/Users/bigdata/Desktop/书：风控之路：评分卡建模实践/样例/公开数据/GiveMeSomeCredit/cs-training.csv",
  header = TRUE
)

# Keep only complete rows: any row containing an NA is dropped.
pb_date <- pb %>% drop_na()

# Fix the RNG seed so the random split (and everything downstream that draws
# random numbers) is reproducible from run to run.
set.seed(2023)

# Randomly label every row 1 (probability 0.1) or 2 (probability 0.9).
# Rows labelled 1 form the training set, rows labelled 2 the test set.
id <- sample(2, nrow(pb_date), replace = TRUE, prob = c(0.1, 0.9))
pb_date_train <- pb_date[id == 1, ]
pb_date_test <- pb_date[id == 2, ]

2）使用caret包建立模型

# Fit three caret models on the training set. In each one SeriousDlqin2yrs is
# the response and all remaining columns are predictors.

# Random forest (method "rf") with ntree = 5.
ozone_rf <- train(
  SeriousDlqin2yrs ~ .,
  data = pb_date_train,
  method = "rf",
  ntree = 5
)

# Generalized linear model (method "glm").
ozone_glm <- train(
  SeriousDlqin2yrs ~ .,
  data = pb_date_train,
  method = "glm"
)

# Gradient boosting machine (method "gbm").
ozone_gbm <- train(
  SeriousDlqin2yrs ~ .,
  data = pb_date_train,
  method = "gbm"
)

3）对模型进行解释

#这里使用DALEX包的explain函数对三个模型进行解释性分析。 #explain函数包含4个信息：1.模型信息；2.标签信息（如果没有，会自动从模型抽取）；3.验证数据集；4.验证数据集中的响应变量。 #模型解释

# Build one DALEX explainer per fitted model.
#
# DALEX recommends that `data` contain only the predictor columns, with the
# response passed separately through `y`. If the response stays in `data`, it
# is treated as just another feature in variable-importance and profile
# output. The response column is therefore removed here. The table is first
# converted to a plain data.frame so that base column subsetting behaves the
# same whether the input is a data.table, tibble or data.frame.
feature_cols <- setdiff(names(pb_date_train), "SeriousDlqin2yrs")
explain_features <- as.data.frame(pb_date_train)[, feature_cols, drop = FALSE]
explain_target <- pb_date_train$SeriousDlqin2yrs

# `explain` is namespaced because both dplyr and DALEX export a function of
# that name; which one wins otherwise depends on package load order.
explainer_rf <- DALEX::explain(
  ozone_rf,
  label = "rf",
  data = explain_features,
  y = explain_target
)

explainer_glm <- DALEX::explain(
  ozone_glm,
  label = "glm",
  data = explain_features,
  y = explain_target
)

explainer_gbm <- DALEX::explain(
  ozone_gbm,
  label = "gbm",
  data = explain_features,
  y = explain_target
)

4）模型表现

# Compute a performance summary for each explainer with model_performance().
per_rf <- model_performance(explainer_rf)
per_glm <- model_performance(explainer_glm)
per_gbm <- model_performance(explainer_gbm)

# Visualise how the three models' performance is distributed, first with the
# default plot and then as boxplots.

# Default plot (the cumulative residual distribution).
plot(per_rf, per_glm, per_gbm)

# Boxplot version of the same comparison.
plot(per_rf, per_glm, per_gbm, geom = "boxplot")

5）变量重要性分析

#分析在不同的模型中，不同变量对于模型预测的相对重要性程度。 #此处损失函数为均方根误差，解释为缺了该变量会对响应变量的预测值带来多大程度的影响。

# Variable importance for the random forest explainer, scored with the
# root-mean-square loss function.
importance_rf <- variable_importance(
  explainer_rf,
  loss_function = loss_root_mean_square
)

# Draw the importance result.
plot(importance_rf)

6）单个连续型解释变量与响应变量关系

## Method 1
# variable_effect() with type = "partial_dependency": describes the
# relationship between a single continuous explanatory variable and the
# response.
# The argument is spelled `variables` in full rather than relying on R's
# partial argument matching of `variable`.
pdp_rf <- variable_effect(
  explainer_rf,
  variables = "NumberOfTimes90DaysLate",
  type = "partial_dependency"
)
plot(pdp_rf)

# Method 2
# variable_effect() with type = "accumulated_dependency": intended to
# overcome correlation between explanatory variables.
# The argument is spelled `variables` in full rather than relying on R's
# partial argument matching of `variable`.
ale_rf <- variable_effect(
  explainer_rf,
  variables = "NumberOfTimes90DaysLate",
  type = "accumulated_dependency"
)

plot(ale_rf)

7）模型预测

# Explain a single prediction of the random forest: the first row of the
# training data.

# Break-down decomposition of that prediction.
bd <- predict_parts(
  explainer = explainer_rf,
  new_observation = pb_date_train[1, ],
  type = "break_down"
)
plot(bd)

# Prediction profile for the same observation.
pd <- predict_profile(
  explainer = explainer_rf,
  new_observation = pb_date_train[1, ]
)

plot(pd)

案例演示：DALEX模型解释

2023-03-09