https://koalaverse.github.io/vip/reference/index.html
# Simulate training data
set.seed(101) # for reproducibility
trn <- as.data.frame(mlbench::mlbench.friedman1(500)) # ?mlbench.friedman1
# Inspect data
tibble::as_tibble(trn)
## # A tibble: 500 × 11
## x.1 x.2 x.3 x.4 x.5 x.6 x.7 x.8 x.9 x.10 y
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.372 0.406 0.102 0.322 0.693 0.758 0.518 0.530 0.878 0.763 14.9
## 2 0.0438 0.602 0.602 0.999 0.776 0.533 0.509 0.487 0.118 0.176 15.3
## 3 0.710 0.362 0.254 0.548 0.0180 0.765 0.715 0.844 0.334 0.118 15.1
## 4 0.658 0.291 0.542 0.327 0.230 0.301 0.177 0.346 0.474 0.283 10.7
## 5 0.250 0.794 0.383 0.947 0.462 0.00487 0.270 0.114 0.489 0.311 17.6
## 6 0.300 0.701 0.992 0.386 0.666 0.198 0.924 0.775 0.736 0.974 18.3
## 7 0.585 0.365 0.283 0.488 0.845 0.466 0.715 0.202 0.905 0.640 14.6
## 8 0.333 0.552 0.858 0.509 0.697 0.388 0.260 0.355 0.517 0.165 17.0
## 9 0.622 0.118 0.490 0.390 0.468 0.360 0.572 0.891 0.682 0.717 8.54
## 10 0.546 0.150 0.476 0.706 0.829 0.373 0.192 0.873 0.456 0.694 15.0
## # … with 490 more rows
Decision trees probably offer the most natural model-specific approach to quantifying the importance of each feature.
# Load required packages
library(xgboost) # for fitting GBMs
library(ranger) # for fitting random forests
library(rpart) # for fitting CART-like decision trees
# Fit a single regression tree
tree <- rpart(y ~ ., data = trn)
# Fit a random forest
set.seed(101)
rfo <- ranger(y ~ ., data = trn, importance = "impurity")
# Fit a GBM
set.seed(102)
bst <- xgboost(
  data = data.matrix(subset(trn, select = -y)),
  label = trn$y,
  objective = "reg:squarederror",
  nrounds = 100,
  max_depth = 5,
  eta = 0.3,
  verbose = 0  # suppress printing
)
Each of the packages above includes the ability to compute VI scores for all of the features in a fitted model; however, the implementations are package-specific, as shown in the code chunks below.
(vi_tree <- tree$variable.importance)
## x.4 x.2 x.1 x.5 x.3 x.6 x.7 x.9
## 4233.8443 2512.6610 2461.1746 1229.9290 687.9090 533.1549 356.8836 330.6542
## x.8 x.10
## 275.6148 274.5077
barplot(vi_tree, horiz = TRUE, las = 1)
(vi_rfo <- rfo$variable.importance)
## x.1 x.2 x.3 x.4 x.5 x.6 x.7 x.8
## 2069.4912 2425.6201 950.0576 3782.4357 1495.2632 366.2181 363.7110 366.9129
## x.9 x.10
## 337.3055 385.2958
barplot(vi_rfo, horiz = TRUE, las = 1)
(vi_bst <- xgb.importance(model = bst))
## Feature Gain Cover Frequency
## 1: x.4 0.403044885 0.12713681 0.10149673
## 2: x.2 0.224976505 0.10504115 0.13610851
## 3: x.1 0.188541029 0.10597358 0.17633302
## 4: x.5 0.089410552 0.07012969 0.07904584
## 5: x.3 0.068165727 0.10009244 0.10243218
## 6: x.9 0.008023709 0.07802100 0.07062675
## 7: x.6 0.007456255 0.13405129 0.10243218
## 8: x.7 0.003997671 0.08678822 0.07764266
## 9: x.10 0.003766490 0.11868040 0.08325538
## 10: x.8 0.002617178 0.07408544 0.07062675
xgb.ggplot.importance(vi_bst)
library(vip)
##
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
##
## vi
vi(tree) # CART-like decision tree
## # A tibble: 10 × 2
## Variable Importance
## <chr> <dbl>
## 1 x.4 4234.
## 2 x.2 2513.
## 3 x.1 2461.
## 4 x.5 1230.
## 5 x.3 688.
## 6 x.6 533.
## 7 x.7 357.
## 8 x.9 331.
## 9 x.8 276.
## 10 x.10 275.
vi(rfo) # RF
## # A tibble: 10 × 2
## Variable Importance
## <chr> <dbl>
## 1 x.4 3782.
## 2 x.2 2426.
## 3 x.1 2069.
## 4 x.5 1495.
## 5 x.3 950.
## 6 x.10 385.
## 7 x.8 367.
## 8 x.6 366.
## 9 x.7 364.
## 10 x.9 337.
vi(bst) # GBM
## # A tibble: 10 × 2
## Variable Importance
## <chr> <dbl>
## 1 x.4 0.403
## 2 x.2 0.225
## 3 x.1 0.189
## 4 x.5 0.0894
## 5 x.3 0.0682
## 6 x.9 0.00802
## 7 x.6 0.00746
## 8 x.7 0.00400
## 9 x.10 0.00377
## 10 x.8 0.00262
Notice how the vi() function always returns a tibble with two columns, Variable and Importance, with the VI scores sorted from highest to lowest.
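The sorting behaviour can also be adjusted; a small sketch (the sort and rank arguments are assumed from vip's documented interface, not used in the original):
# Keep features in their original column order, or report integer ranks
vi(rfo, sort = FALSE)
vi(rfo, rank = TRUE)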
Visualizing VI scores with the vip() function
# Load required packages
library(vip)
library(gridExtra)  # for grid.arrange()
# Construct ggplot2-based VIPs
p1 <- vip(tree) # CART-like decision tree
p2 <- vip(rfo, aesthetics = list(fill = "green3", width = 0.5)) # RF
p3 <- vip(bst, aesthetics = list(col = "purple2")) # GBM
# Display all three plots side by side
grid.arrange(p1, p2, p3, ncol = 3)
Notice how the vip() function always returns a "ggplot" object.
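Because the result is a regular "ggplot" object, it can be customized with the usual ggplot2 layers; a small sketch (the title and theme below are illustrative choices, not part of the original):
library(ggplot2)
vip(bst, num_features = 5) +        # show only the top five features
  ggtitle("GBM variable importance") +
  theme_light()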
In multiple linear regression, or linear models (LMs), the absolute value of the t-statistic is commonly used as a measure of VI. The same idea also extends to generalized linear models (GLMs).
# Load required packages
library(ggplot2) # for the `aes()` function
# Fit a LM
linmod <- lm(y ~ .^2, data = trn)
backward <- step(linmod, direction = "backward", trace = 0)
# Extract VI scores
vi(backward)
## # A tibble: 21 × 3
## Variable Importance Sign
## <chr> <dbl> <chr>
## 1 x.4 14.2 POS
## 2 x.2 7.31 POS
## 3 x.1 5.63 POS
## 4 x.5 5.21 POS
## 5 x.3:x.5 2.46 POS
## 6 x.1:x.10 2.41 NEG
## 7 x.2:x.6 2.41 NEG
## 8 x.1:x.5 2.37 NEG
## 9 x.10 2.21 POS
## 10 x.3:x.4 2.01 NEG
## # … with 11 more rows
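Since the VI scores for an LM are based on the absolute t-statistics, they can be reproduced directly from summary(); a quick sanity-check sketch (not part of the original):
# Absolute t-statistics, intercept excluded; the largest values should match
# the top of vi(backward) above
tstats <- abs(summary(backward)$coefficients[-1, "t value"])
head(sort(tstats, decreasing = TRUE))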
p1 <- vip(backward, num_features = length(coef(backward)),
          geom = "point", horizontal = FALSE)
p2 <- vip(backward, num_features = length(coef(backward)),
          geom = "point", horizontal = FALSE,
          mapping = aes(color = Sign))
grid.arrange(p1, p2, nrow = 1)
Multivariate adaptive regression splines (MARS), introduced by Friedman (1991), is an automatic regression technique that can be viewed as a generalization of multiple linear regression and generalized linear models. In the MARS algorithm, the contribution (or VI score) of each predictor is determined using a generalized cross-validation (GCV) statistic.
# Load required packages
library(earth)
## Loading required package: Formula
## Loading required package: plotmo
## Loading required package: plotrix
## Loading required package: TeachingDemos
# Fit a MARS model
mars <- earth(y ~ ., data = trn, degree = 2, pmethod = "exhaustive")
# Extract VI scores
vi(mars)
## # A tibble: 10 × 2
## Variable Importance
## <chr> <dbl>
## 1 x.4 15
## 2 x.1 14
## 3 x.2 14
## 4 x.5 12
## 5 x.3 10
## 6 x.6 0
## 7 x.7 0
## 8 x.8 0
## 9 x.9 0
## 10 x.10 0
vip(mars)
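For "earth" models, vi() is built on the backward-pass variable importances that the earth package itself reports; a sketch of the direct call (assuming earth's evimp() function, which lists the nsubsets, gcv, and rss criteria):
# earth's own importance summary; vi(mars) above is derived from one of
# these backward-pass criteria
evimp(mars)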
For neural networks (NNs), two popular methods for constructing VI scores are the Garson algorithm (Garson 1991), later modified by Goh (1995), and the Olden algorithm (Olden, Joy, and Death 2004). For both algorithms, the basis of the importance scores is the network's connection weights.
library(nnet)
# Fit a neural network
set.seed(0803)
nn <- nnet(y ~ ., data = trn, size = 7, decay = 0.1, linout = TRUE)
## # weights: 85
## initial value 126298.176281
## iter 10 value 5148.185025
## iter 20 value 3563.828062
## iter 30 value 3095.876922
## iter 40 value 2638.672528
## iter 50 value 1983.009001
## iter 60 value 1759.428911
## iter 70 value 1483.284575
## iter 80 value 1112.219052
## iter 90 value 835.941067
## iter 100 value 748.120719
## final value 748.120719
## stopped after 100 iterations
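Both algorithms summarise the network's fitted connection weights, which for an "nnet" object are stored in the wts component; a quick look (a sketch, only to show where the raw weights live):
# Fitted connection weights used by Garson's and Olden's algorithms
length(nn$wts)  # 85 weights, matching the "# weights: 85" message above
head(nn$wts)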
# Plot VI scores
grid.arrange(
  vip(nn),
  vip(nn, type = "garson"),
  nrow = 1
)
Model-agnostic interpretability separates interpretation from the model itself. Compared with model-specific methods, model-agnostic VI methods are more flexible, since they can be applied to any supervised learning algorithm.
Three different approaches can be used to quantify global feature importance. The first is based on quantifying the "flatness" of each feature's partial dependence plot (PDP). PDPs help visualize the effect of low-cardinality subsets of the feature space on the estimated prediction surface (e.g., main effects and two-/three-way interaction effects). PDPs provide model-agnostic interpretations and can be constructed in the same way for any supervised learning algorithm.
# Load required packages
library(pdp)
# Fit a PPR model (nterms was chosen using the caret package with 5 repeats of
# 5-fold cross-validation)
pp <- ppr(y ~ ., data = trn, nterms = 11)
# PDPs for all 10 features
features <- paste0("x.", 1:10)
pdps <- lapply(features, FUN = function(feature) {
  pd <- partial(pp, pred.var = feature)
  autoplot(pd) +
    ylim(range(trn$y)) +
    theme_light()
})
grid.arrange(grobs = pdps, ncol = 5)
Next, we compute PDP-based VI scores for the PPR and NN models. The PDP method constructs VI scores by quantifying the "flatness" of each PDP (by default, this is measured by the standard deviation of the y-axis values of each PDP).
# Fit a PPR model (nterms was chosen using the caret package with 5 repeats of
# 5-fold cross-validation)
pp <- ppr(y ~ ., data = trn, nterms = 11)
# Plot VI scores
p1 <- vip(pp, method = "firm") + ggtitle("PPR")
p2 <- vip(nn, method = "firm") + ggtitle("NN")
# Display plots side by side
grid.arrange(p1, p2, ncol = 2)
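To make the "flatness" idea concrete, the score for a single feature can be approximated by hand; a small sketch, assuming the default measure is the standard deviation of each PDP's y-axis values:
# Standard deviation of the PDP's predicted values: large for an influential
# feature (x.4), small for an unimportant one (x.10)
sd(partial(pp, pred.var = "x.4")$yhat)
sd(partial(pp, pred.var = "x.10")$yhat)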
The idea is that if we randomly permute the values of an important feature in the training data, the training performance will degrade (since permuting the values of a feature effectively destroys any relationship between that feature and the target variable).
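Before calling vip(), the effect is easy to see by hand: permute one column and compare the training R-squared before and after. A minimal sketch (the r2() helper is just an illustrative squared correlation, not part of the original):
# Illustration only: permuting an important feature degrades performance
r2 <- function(pred, obs) cor(pred, obs)^2
baseline <- r2(predict(pp, newdata = trn), trn$y)
trn_perm <- trn
trn_perm$x.4 <- sample(trn_perm$x.4)  # break the x.4-y relationship
c(baseline = baseline, permuted = r2(predict(pp, newdata = trn_perm), trn$y))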
# Plot VI scores
set.seed(2021) # for reproducibility
p1 <- vip(pp, method = "permute", target = "y", metric = "rsquared",
          pred_wrapper = predict) + ggtitle("PPR")
p2 <- vip(nn, method = "permute", target = "y", metric = "rsquared",
          pred_wrapper = predict) + ggtitle("NN")
grid.arrange(p1, p2, ncol = 2)
Whenever it is computationally feasible, you will want to run permutation-based importance several times and average the results; this reduces the error introduced by the randomness of the permutation procedure.
# Plot VI scores
set.seed(2021) # for reproducibility
vip(pp, method = "permute", target = "y", metric = "rsquared", nsim = 20,
pred_wrapper = predict, geom = "boxplot", all_permutations = TRUE,
mapping = aes_string(fill = "Variable"),
aesthetics = list(color = "grey35")) +
ggtitle("PPR")
grid.arrange(p1, p2, ncol = 2)
Let's look at one more example:
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:ranger':
##
## importance
ra <- randomForest(y ~ ., data = trn)
vi_permute(ra, train = trn, target = trn$y, metric = "r2",
           pred_wrapper = function(object, newdata) predict(object, newdata))
## # A tibble: 11 × 2
## Variable Importance
## <chr> <dbl>
## 1 x.1 0.288
## 2 x.2 0.346
## 3 x.3 0.0765
## 4 x.4 0.574
## 5 x.5 0.161
## 6 x.6 0.00734
## 7 x.7 0.00765
## 8 x.8 0.00905
## 9 x.9 0.00668
## 10 x.10 0.00761
## 11 y 0
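Note that the response y appears in the results (with an importance of essentially zero) only because train = trn still contains the response column, so it gets permuted along with the predictors. A sketch of a call that avoids this by dropping y from train:
# Same call, but with the response column removed from `train`
vi_permute(ra, train = subset(trn, select = -y), target = trn$y, metric = "r2",
           pred_wrapper = function(object, newdata) predict(object, newdata))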