1. Model-based variable importance
  2. Permutation-based variable importance
  3. SHAP-based variable importance
  4. PDP/ICE-based variable importance

Common functions

https://koalaverse.github.io/vip/reference/index.html

  1. Model metrics: the metric_ functions
  2. Extract the model formula: get_formula()
  3. List available metrics: list_metrics()
  4. Variable importance: vi() (a one-stop interface; see the sketch after this list)
  5. ICE-based variable importance: vi_ice()
  6. Model-specific variable importance: vi_model()
  7. PDP-based variable importance: vi_pdp()
  8. Permutation-based variable importance: vi_permute()
  9. SHAP-based variable importance: vi_shap()
  10. Interaction effects: vint()
  11. Variable importance plots: vip()
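
vi() dispatches to the vi_*() functions via its method argument. A minimal sketch of the interface (the exact set of method values depends on the vip version; older releases expose "pdp" and "ice", while newer ones fold both into "firm"; see ?vi):

# vi(model, method = "model")    # model-specific scores, via vi_model()
# vi(model, method = "permute")  # permutation scores, via vi_permute()
# vi(model, method = "shap")     # SHAP scores, via vi_shap()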

Example

# Simulate training data
set.seed(101)  # for reproducibility
trn <- as.data.frame(mlbench::mlbench.friedman1(500))  # ?mlbench.friedman1

# Inspect data
tibble::as_tibble(trn)
## # A tibble: 500 × 11
##       x.1   x.2   x.3   x.4    x.5     x.6   x.7   x.8   x.9  x.10     y
##     <dbl> <dbl> <dbl> <dbl>  <dbl>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 0.372  0.406 0.102 0.322 0.693  0.758   0.518 0.530 0.878 0.763 14.9 
##  2 0.0438 0.602 0.602 0.999 0.776  0.533   0.509 0.487 0.118 0.176 15.3 
##  3 0.710  0.362 0.254 0.548 0.0180 0.765   0.715 0.844 0.334 0.118 15.1 
##  4 0.658  0.291 0.542 0.327 0.230  0.301   0.177 0.346 0.474 0.283 10.7 
##  5 0.250  0.794 0.383 0.947 0.462  0.00487 0.270 0.114 0.489 0.311 17.6 
##  6 0.300  0.701 0.992 0.386 0.666  0.198   0.924 0.775 0.736 0.974 18.3 
##  7 0.585  0.365 0.283 0.488 0.845  0.466   0.715 0.202 0.905 0.640 14.6 
##  8 0.333  0.552 0.858 0.509 0.697  0.388   0.260 0.355 0.517 0.165 17.0 
##  9 0.622  0.118 0.490 0.390 0.468  0.360   0.572 0.891 0.682 0.717  8.54
## 10 0.546  0.150 0.476 0.706 0.829  0.373   0.192 0.873 0.456 0.694 15.0 
## # … with 490 more rows
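
For reference, the Friedman 1 benchmark generates y = 10 * sin(pi * x.1 * x.2) + 20 * (x.3 - 0.5)^2 + 10 * x.4 + 5 * x.5 + noise, so only x.1 through x.5 are truly relevant and x.6 through x.10 are pure noise features. A good VI method should therefore rank the first five features above the rest.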

Model-specific VI scores

Decision trees probably offer the most natural model-specific approach to quantifying the importance of each feature.

# Load required packages
library(xgboost)  # for fitting GBMs
library(ranger)   # for fitting random forests
library(rpart)    # for fitting CART-like decision trees

# Fit a single regression tree
tree <- rpart(y ~ ., data = trn)

# Fit a random forest
set.seed(101)
rfo <- ranger(y ~ ., data = trn, importance = "impurity")

# Fit a GBM
set.seed(102)
bst <- xgboost(
  data = data.matrix(subset(trn, select = -y)),
  label = trn$y, 
  objective = "reg:linear",
  nrounds = 100, 
  max_depth = 5, 
  eta = 0.3,
  verbose = 0  # suppress printing
)

Each of the packages above can compute VI scores for all the features in a model; however, the implementations are package-specific, as the code chunks below illustrate.

(vi_tree <- tree$variable.importance)
##       x.4       x.2       x.1       x.5       x.3       x.6       x.7       x.9 
## 4233.8443 2512.6610 2461.1746 1229.9290  687.9090  533.1549  356.8836  330.6542 
##       x.8      x.10 
##  275.6148  274.5077
barplot(vi_tree, horiz = TRUE, las = 1)

(vi_rfo <- rfo$variable.importance)
##       x.1       x.2       x.3       x.4       x.5       x.6       x.7       x.8 
## 2069.4912 2425.6201  950.0576 3782.4357 1495.2632  366.2181  363.7110  366.9129 
##       x.9      x.10 
##  337.3055  385.2958
barplot(vi_rfo, horiz = TRUE, las = 1)

(vi_bst <- xgb.importance(model = bst))
##     Feature        Gain      Cover  Frequency
##  1:     x.4 0.403044885 0.12713681 0.10149673
##  2:     x.2 0.224976505 0.10504115 0.13610851
##  3:     x.1 0.188541029 0.10597358 0.17633302
##  4:     x.5 0.089410552 0.07012969 0.07904584
##  5:     x.3 0.068165727 0.10009244 0.10243218
##  6:     x.9 0.008023709 0.07802100 0.07062675
##  7:     x.6 0.007456255 0.13405129 0.10243218
##  8:     x.7 0.003997671 0.08678822 0.07764266
##  9:    x.10 0.003766490 0.11868040 0.08325538
## 10:     x.8 0.002617178 0.07408544 0.07062675
xgb.ggplot.importance(vi_bst)
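
In the xgb.importance() output, Gain is a feature's fractional contribution to the total gain of the splits in which it appears, Cover is the relative number of observations affected by those splits, and Frequency is the relative number of splits that use the feature. Gain is what vi() reports as the importance for xgboost models (compare the values further below).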

Variable importance with the vip package

library(vip)
## 
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
## 
##     vi
vi(tree)  # CART-like decision tree
## # A tibble: 10 × 2
##    Variable Importance
##    <chr>         <dbl>
##  1 x.4           4234.
##  2 x.2           2513.
##  3 x.1           2461.
##  4 x.5           1230.
##  5 x.3            688.
##  6 x.6            533.
##  7 x.7            357.
##  8 x.9            331.
##  9 x.8            276.
## 10 x.10           275.
vi(rfo)   # RF
## # A tibble: 10 × 2
##    Variable Importance
##    <chr>         <dbl>
##  1 x.4           3782.
##  2 x.2           2426.
##  3 x.1           2069.
##  4 x.5           1495.
##  5 x.3            950.
##  6 x.10           385.
##  7 x.8            367.
##  8 x.6            366.
##  9 x.7            364.
## 10 x.9            337.
vi(bst)   # GBM
## # A tibble: 10 × 2
##    Variable Importance
##    <chr>         <dbl>
##  1 x.4         0.403  
##  2 x.2         0.225  
##  3 x.1         0.189  
##  4 x.5         0.0894 
##  5 x.3         0.0682 
##  6 x.9         0.00802
##  7 x.6         0.00746
##  8 x.7         0.00400
##  9 x.10        0.00377
## 10 x.8         0.00262

Note how the vi() function always returns a tibble with two columns, Variable and Importance, with the VI scores sorted from highest to lowest. (Coefficient-based models, like the LM example below, additionally include a Sign column giving the sign of each coefficient.)

Visualization with the vip() function

# Load required packages
library(vip)
library(gridExtra)  # for `grid.arrange()`

# Construct ggplot2-based VIPs
p1 <- vip(tree)  # CART-like decision tree
p2 <- vip(rfo, aesthetics = list(fill = "green3"))   # RF
p3 <- vip(bst, aesthetics = list(col = "purple2"))   # GBM

# Display all three plots side by side
grid.arrange(p1, p2, p3, ncol = 3)

Note how the vip() function always returns a "ggplot" object.
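
Since the return value is a ggplot object, it can be customized with the usual ggplot2 layers. A small sketch using the tree fit from above:

# Customize a VIP like any other ggplot2 plot
library(ggplot2)

vip(tree, num_features = 5) +   # show only the top 5 features
  ggtitle("CART variable importance") +
  theme_light()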

Linear model importance

In multiple linear regression, i.e., linear models (LMs), the absolute value of the t-statistic is commonly used as a measure of VI. The same idea extends to generalized linear models (GLMs).

# Load required packages
library(ggplot2)  # for the `aes()` function

# Fit a LM
linmod <- lm(y ~ .^2, data = trn)
backward <- step(linmod, direction = "backward", trace = 0)

# Extract VI scores
vi(backward)
## # A tibble: 21 × 3
##    Variable Importance Sign 
##    <chr>         <dbl> <chr>
##  1 x.4           14.2  POS  
##  2 x.2            7.31 POS  
##  3 x.1            5.63 POS  
##  4 x.5            5.21 POS  
##  5 x.3:x.5        2.46 POS  
##  6 x.1:x.10       2.41 NEG  
##  7 x.2:x.6        2.41 NEG  
##  8 x.1:x.5        2.37 NEG  
##  9 x.10           2.21 POS  
## 10 x.3:x.4        2.01 NEG  
## # … with 11 more rows
p1 <- vip(backward, num_features = length(coef(backward)), 
          geom = "point", horizontal = FALSE)
p2 <- vip(backward, num_features = length(coef(backward)), 
          geom = "point", horizontal = FALSE, 
          mapping = aes(color = Sign))
grid.arrange(p1, p2, nrow = 1)
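
Since the LM importance scores here are just absolute t-statistics, the same ranking can be recovered straight from the coefficient table. A quick sanity check using the backward fit from above:

# Recompute |t|-based importance directly from the coefficient table
tstats <- coef(summary(backward))[-1, "t value"]  # drop the intercept row
head(sort(abs(tstats), decreasing = TRUE))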

Multivariate adaptive regression splines (MARS), introduced by Friedman (1991), is an automatic regression technique that can be seen as a generalization of multiple linear regression and generalized linear models. In the MARS algorithm, the contribution (or VI score) of each predictor is determined using a generalized cross-validation (GCV) statistic.

# Load required packages
library(earth)
## Loading required package: Formula
## Loading required package: plotmo
## Loading required package: plotrix
## Loading required package: TeachingDemos

# Fit a MARS model
mars <- earth(y ~ ., data = trn, degree = 2, pmethod = "exhaustive")

# Extract VI scores
vi(mars)
## # A tibble: 10 × 2
##    Variable Importance
##    <chr>         <dbl>
##  1 x.4              15
##  2 x.1              14
##  3 x.2              14
##  4 x.5              12
##  5 x.3              10
##  6 x.6               0
##  7 x.7               0
##  8 x.8               0
##  9 x.9               0
## 10 x.10              0
vip(mars)
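
The underlying importance measures can also be inspected directly with earth's own evimp() function; a sketch (see ?evimp for the column definitions):

# earth's built-in importance summary: rows are ordered by the number of
# pruned-model subsets that include each variable, with GCV- and RSS-based
# measures alongside
evimp(mars)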

Neural networks

For neural networks (NNs), two popular approaches to constructing VI scores are the Garson algorithm (Garson 1991), later modified by Goh (1995), and the Olden algorithm (Olden, Joy, and Death 2004). For both algorithms, the basis of the importance scores is the network's connection weights.

library(nnet)

# Fit a neural network
set.seed(0803)
nn <- nnet(y ~ ., data = trn, size = 7, decay = 0.1, linout = TRUE)
## # weights:  85
## initial  value 126298.176281 
## iter  10 value 5148.185025
## iter  20 value 3563.828062
## iter  30 value 3095.876922
## iter  40 value 2638.672528
## iter  50 value 1983.009001
## iter  60 value 1759.428911
## iter  70 value 1483.284575
## iter  80 value 1112.219052
## iter  90 value 835.941067
## iter 100 value 748.120719
## final  value 748.120719 
## stopped after 100 iterations
# Plot VI scores
grid.arrange(
  vip(nn),                   # Olden algorithm (the default for "nnet" objects)
  vip(nn, type = "garson"),  # Garson algorithm
  nrow = 1
)
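
Under the hood, vip() on an "nnet" object calls vi_model(), whose type argument selects between the two algorithms (my reading of the vip documentation; see ?vi_model):

# The two connection-weight algorithms side by side (a sketch)
vi_model(nn, type = "olden")   # signed connection-weight products (the default)
vi_model(nn, type = "garson")  # absolute, weight-based relative importance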

Model-agnostic VI scores

Model-agnostic interpretability separates interpretation from the model itself. Compared with model-specific approaches, model-agnostic VI methods are more flexible, since they can be applied to any supervised learning algorithm.

Three different approaches to quantifying global feature importance:

  1. PDPs
  2. ICE curves
  3. Permutation

PDP

This approach is based on quantifying the "flatness" of each feature's PDP. PDPs help visualize the effect of low-cardinality subsets of the feature space on the estimated prediction surface (e.g., main effects and two-/three-way interaction effects). They provide model-agnostic interpretations and can be constructed in the same way for any supervised learning algorithm.

# Load required packages
library(pdp)

# Fit a PPR model (nterms was chosen using the caret package with 5 repeats of 
# 5-fold cross-validation)
pp <- ppr(y ~ ., data = trn, nterms = 11)  

# PDPs for all 10 features
features <- paste0("x.", 1:10)
pdps <- lapply(features, FUN = function(feature) {
  pd <- partial(pp, pred.var = feature)
  autoplot(pd) + 
    ylim(range(trn$y)) + 
    theme_light()
})
grid.arrange(grobs = pdps, ncol = 5)

Next, we compute PDP-based VI scores for the PPR and NN models. The PDP method constructs VI scores that quantify the "flatness" of each PDP (by default, this is measured by the standard deviation of the y-axis values of each PDP).
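
The same "flatness" measure can be hand-rolled from pdp::partial(); a sketch mirroring the default behavior described above:

# Standard deviation of each PDP's y-axis values, computed directly
sapply(features, FUN = function(feature) {
  pd <- partial(pp, pred.var = feature)
  sd(pd$yhat)
})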

# Plot VI scores
p1 <- vip(pp, method = "firm") + ggtitle("PPR")
p2 <- vip(nn, method = "firm") + ggtitle("NN")

# Display plots side by side
grid.arrange(p1, p2, ncol = 2)

Permutation method

The idea is that if we randomly permute the values of an important feature in the training data, the training performance will degrade, since permuting a feature's values effectively destroys any relationship between that feature and the target variable.
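
A bare-bones version of the procedure for a single feature makes the idea concrete; a sketch using the pp fit and trn data from above:

# Hand-rolled permutation importance for x.4: baseline R-squared minus
# the R-squared obtained after shuffling x.4
r2 <- function(pred, obs) cor(pred, obs)^2
baseline <- r2(predict(pp, newdata = trn), trn$y)
permuted <- trn
permuted$x.4 <- sample(permuted$x.4)  # destroys the x.4/y relationship
baseline - r2(predict(pp, newdata = permuted), trn$y)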

# Plot VI scores
set.seed(2021)  # for reproducibility
p1 <- vip(pp, method = "permute", target = "y", metric = "rsquared",
          pred_wrapper = predict) + ggtitle("PPR")
p2 <- vip(nn, method = "permute", target = "y", metric = "rsquared",
          pred_wrapper = predict) + ggtitle("NN")
grid.arrange(p1, p2, ncol = 2)

If computationally feasible, you will want to run permutation-based importance several times and average the results; this reduces the error introduced by the randomness of the permutation procedure.

# Plot VI scores
set.seed(2021)  # for reproducibility
vip(pp, method = "permute", target = "y", metric = "rsquared", nsim = 20,
    pred_wrapper = predict, geom = "boxplot", all_permutations = TRUE,
    mapping = aes(fill = Variable), 
    aesthetics = list(color = "grey35")) + 
  ggtitle("PPR")


Let's look at one more example:

library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:ranger':
## 
##     importance
ra <- randomForest(y ~ ., data = trn)

vi_permute(
  ra,
  train = trn,
  target = trn$y,  # a vector, so every column of `train` (including `y`) is
                   # treated as a feature; that is why `y` appears below with
                   # importance 0. Passing the column name (target = "y")
                   # would exclude it.
  metric = "r2",
  pred_wrapper = function(object, newdata) predict(object, newdata)
)
## # A tibble: 11 × 2
##    Variable Importance
##    <chr>         <dbl>
##  1 x.1         0.288  
##  2 x.2         0.346  
##  3 x.3         0.0765 
##  4 x.4         0.574  
##  5 x.5         0.161  
##  6 x.6         0.00734
##  7 x.7         0.00765
##  8 x.8         0.00905
##  9 x.9         0.00668
## 10 x.10        0.00761
## 11 y           0