# 创建数据划分
set.seed(123)
covid_split <- initial_split(df_covid_tidy, prop = 0.7, strata = "mortality_30days")
covid_train <- training(covid_split)
covid_test <- testing(covid_split)
###特征选择 采用随机森林算法
library(caret)
#利用递归特征消除法RFE
# rf随机森林特征选择
subsets<-c(5,10,15,20,25,30,35,37)
ctrl_rf <- rfeControl(functions = rfFuncs, method = "cv", number = 5)# 定义控制参数
results_rf <- rfe(covid_train[, !colnames(covid_train) %in% "mortality_30days"],
covid_train$mortality_30days, sizes=subsets, rfeControl=ctrl_rf)# 特征选择
# 提取特征重要性
importance_rf <- as.data.frame(results_rf$fit$importance)
fig_feature_rf <- ggplot(importance_rf, aes(x = reorder(row.names(importance_rf), MeanDecreaseGini), y = MeanDecreaseGini)) +
geom_bar(stat = "identity", fill = "#76EEC6") +#reorder根据Gini降序排序
xlab("Feature") +
ylab("Importance(Gini)") +
ggtitle("Feature Importance (Gini)") +
theme_classic()+
coord_flip()+#x轴y轴转换位置
scale_y_continuous(expand = c(0,0),limits = c(0,15))
print(fig_feature_rf)
library("themis")
covid_rec<-
recipe(mortality_30days ~.,data=covid_train) %>%
themis::step_downsample(mortality_30days) %>%#结果类不平衡
step_normalize(all_predictors(),-all_nominal())%>%
#step_dummy(all_nominal(),-all_outcomes(),keep_original_cols = T)%>% 都是二分类变量不需要哑变量
#step_zv(all_predictors()) %>%
#step_nzv(all_predictors()) %>%
prep()
covid_juiced<-covid_rec %>%
juice()#提取出来检查
names(covid_juiced)
## [1] "male" "age" "Hb1" "RBC"
## [5] "HCT" "WBC" "LC" "MC11"
## [9] "MC12" "EC" "BC" "Plt"
## [13] "RDWSD16" "RDWSD17" "CRP" "IL6"
## [17] "PCT" "DDimer" "PT" "APTT"
## [21] "FIB" "ALT" "AST" "albium"
## [25] "tbil" "rGT" "Cr" "BUN"
## [29] "eGFR" "UA" "ICU" "CRRT"
## [33] "NURSING" "Fever" "antibiotic" "HIGHFLOW"
## [37] "NEGATIV_time" "mortality_30days"
##模型规范并预留tune设置
xgb_spec <- boost_tree(
mode = "classification",
trees = tune(),
mtry = tune(),
min_n = tune()
) %>%
set_engine("xgboost") %>%
set_mode("classification")
xgb_spec
## Boosted Tree Model Specification (classification)
##
## Main Arguments:
## mtry = tune()
## trees = tune()
## min_n = tune()
##
## Computational engine: xgboost
# tune自定义网格搜索
param_grid <- expand.grid(
trees = c(100, 200, 300), # 树的数量
mtry = c(2, 3, 4), # mtry 参数
min_n = c(1, 2, 3) # min_n 参数
)
param_grid
## trees mtry min_n
## 1 100 2 1
## 2 200 2 1
## 3 300 2 1
## 4 100 3 1
## 5 200 3 1
## 6 300 3 1
## 7 100 4 1
## 8 200 4 1
## 9 300 4 1
## 10 100 2 2
## 11 200 2 2
## 12 300 2 2
## 13 100 3 2
## 14 200 3 2
## 15 300 3 2
## 16 100 4 2
## 17 200 4 2
## 18 300 4 2
## 19 100 2 3
## 20 200 2 3
## 21 300 2 3
## 22 100 3 3
## 23 200 3 3
## 24 300 3 3
## 25 100 4 3
## 26 200 4 3
## 27 300 4 3
#随机网格搜索
xgb_grid <- grid_random(tree_depth(),
trees(),
learn_rate(),
min_n(),
loss_reduction())#与前面模型规范保持一致
xgb_grid
## # A tibble: 5 × 5
## tree_depth trees learn_rate min_n loss_reduction
## <int> <int> <dbl> <int> <dbl>
## 1 7 1229 2.56e- 5 36 0.000000167
## 2 12 140 2.00e- 7 20 0.0000427
## 3 4 786 1.24e- 8 35 0.00000118
## 4 6 163 2.58e- 5 24 0.00000231
## 5 11 1709 6.31e-10 40 0.0121
对参数的调整通过交叉验证进行,使用vfold_cv将训练集划分为10折,重复3次对于划分为10折的训练集covid_vfold,分别使用analysis()和assessment()提取用于分析和用于评估的部分。
covid_vfold<-vfold_cv(covid_train,v=10,repeats=3)
covid_vfold %>%
mutate(df_ana=map(splits,analysis),
df_ass=map(splits,assessment))
## # 10-fold cross-validation repeated 3 times
## # A tibble: 30 × 5
## splits id id2 df_ana df_ass
## <list> <chr> <chr> <list> <list>
## 1 <split [1477/165]> Repeat1 Fold01 <df [1,477 × 38]> <df [165 × 38]>
## 2 <split [1477/165]> Repeat1 Fold02 <df [1,477 × 38]> <df [165 × 38]>
## 3 <split [1478/164]> Repeat1 Fold03 <df [1,478 × 38]> <df [164 × 38]>
## 4 <split [1478/164]> Repeat1 Fold04 <df [1,478 × 38]> <df [164 × 38]>
## 5 <split [1478/164]> Repeat1 Fold05 <df [1,478 × 38]> <df [164 × 38]>
## 6 <split [1478/164]> Repeat1 Fold06 <df [1,478 × 38]> <df [164 × 38]>
## 7 <split [1478/164]> Repeat1 Fold07 <df [1,478 × 38]> <df [164 × 38]>
## 8 <split [1478/164]> Repeat1 Fold08 <df [1,478 × 38]> <df [164 × 38]>
## 9 <split [1478/164]> Repeat1 Fold09 <df [1,478 × 38]> <df [164 × 38]>
## 10 <split [1478/164]> Repeat1 Fold10 <df [1,478 × 38]> <df [164 × 38]>
## # ℹ 20 more rows
covid_vfold
## # 10-fold cross-validation repeated 3 times
## # A tibble: 30 × 3
## splits id id2
## <list> <chr> <chr>
## 1 <split [1477/165]> Repeat1 Fold01
## 2 <split [1477/165]> Repeat1 Fold02
## 3 <split [1478/164]> Repeat1 Fold03
## 4 <split [1478/164]> Repeat1 Fold04
## 5 <split [1478/164]> Repeat1 Fold05
## 6 <split [1478/164]> Repeat1 Fold06
## 7 <split [1478/164]> Repeat1 Fold07
## 8 <split [1478/164]> Repeat1 Fold08
## 9 <split [1478/164]> Repeat1 Fold09
## 10 <split [1478/164]> Repeat1 Fold10
## # ℹ 20 more rows
##使用工作流将预处理和模型结合起来
xgb_wflow <-
workflow() %>%
add_recipe(covid_rec)%>%
add_model(xgb_spec)
tune_results <- tune_grid(
object = xgb_wflow,
resamples = covid_vfold, # 加上前面设置参数!
grid = param_grid,#加上前面设置参数!
control = control_grid(save_pred = TRUE)
)
tune_results
## # Tuning results
## # 10-fold cross-validation repeated 3 times
## # A tibble: 30 × 6
## splits id id2 .metrics .notes .predictions
## <list> <chr> <chr> <list> <list> <list>
## 1 <split [1477/165]> Repeat1 Fold01 <tibble [54 × 7]> <tibble> <tibble>
## 2 <split [1477/165]> Repeat1 Fold02 <tibble [54 × 7]> <tibble> <tibble>
## 3 <split [1478/164]> Repeat1 Fold03 <tibble [54 × 7]> <tibble> <tibble>
## 4 <split [1478/164]> Repeat1 Fold04 <tibble [54 × 7]> <tibble> <tibble>
## 5 <split [1478/164]> Repeat1 Fold05 <tibble [54 × 7]> <tibble> <tibble>
## 6 <split [1478/164]> Repeat1 Fold06 <tibble [54 × 7]> <tibble> <tibble>
## 7 <split [1478/164]> Repeat1 Fold07 <tibble [54 × 7]> <tibble> <tibble>
## 8 <split [1478/164]> Repeat1 Fold08 <tibble [54 × 7]> <tibble> <tibble>
## 9 <split [1478/164]> Repeat1 Fold09 <tibble [54 × 7]> <tibble> <tibble>
## 10 <split [1478/164]> Repeat1 Fold10 <tibble [54 × 7]> <tibble> <tibble>
## # ℹ 20 more rows
##
## There were issues with some computations:
##
## - Warning(s) x11: Column(s) have zero variance so scaling cannot be used: `CRRT`. C...
##
## Run `show_notes(.Last.tune.result)` for more information.
autoplot(tune_results)
tune_results %>% collect_metrics()
## # A tibble: 54 × 9
## mtry trees min_n .metric .estimator mean n std_err .config
## <dbl> <dbl> <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 2 100 1 accuracy binary 0.760 30 0.00600 Preprocessor1_Mode…
## 2 2 100 1 roc_auc binary 0.805 30 0.0128 Preprocessor1_Mode…
## 3 2 200 1 accuracy binary 0.763 30 0.00609 Preprocessor1_Mode…
## 4 2 200 1 roc_auc binary 0.810 30 0.0117 Preprocessor1_Mode…
## 5 2 300 1 accuracy binary 0.763 30 0.00619 Preprocessor1_Mode…
## 6 2 300 1 roc_auc binary 0.813 30 0.0114 Preprocessor1_Mode…
## 7 2 100 2 accuracy binary 0.764 30 0.00715 Preprocessor1_Mode…
## 8 2 100 2 roc_auc binary 0.819 30 0.0133 Preprocessor1_Mode…
## 9 2 200 2 accuracy binary 0.762 30 0.00737 Preprocessor1_Mode…
## 10 2 200 2 roc_auc binary 0.817 30 0.0133 Preprocessor1_Mode…
## # ℹ 44 more rows
best_parameters <- tune_results %>% select_best("roc_auc")
print(best_parameters)
## # A tibble: 1 × 4
## mtry trees min_n .config
## <dbl> <dbl> <dbl> <chr>
## 1 2 100 2 Preprocessor1_Model04
show_best(tune_results,"accuracy") #按照准确率由高到低进行排序
## # A tibble: 5 × 9
## mtry trees min_n .metric .estimator mean n std_err .config
## <dbl> <dbl> <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 3 200 2 accuracy binary 0.766 30 0.00674 Preprocessor1_Model…
## 2 3 100 2 accuracy binary 0.765 30 0.00634 Preprocessor1_Model…
## 3 3 300 1 accuracy binary 0.764 30 0.00823 Preprocessor1_Model…
## 4 3 300 2 accuracy binary 0.764 30 0.00668 Preprocessor1_Model…
## 5 2 100 2 accuracy binary 0.764 30 0.00715 Preprocessor1_Model…
final_wflow <- xgb_wflow %>%
finalize_workflow(best_parameters)
# 提取特征重要性并可视化
library("vip")
# 在训练集再次训练查看特征重要性
final_wflow_train <- fit(final_wflow, data = covid_train)
importance <- vip(final_wflow_train)
plot(importance)
# 绘制特征重要性条形图
fig_feature <- ggplot(importance$data, aes(y = Importance, x = reorder(Variable,Importance))) +
geom_bar(stat = "identity", fill = "skyblue") +
xlab("Feature") +
ylab("Importance") +
ggtitle("Feature Importance") +
theme_classic()+
coord_flip()+#x轴y轴转换位置
scale_y_continuous(expand = c(0,0),limits = c(0,0.2))
print(fig_feature)
#收集评价指标
final_wflow_both <- final_wflow %>% last_fit(covid_split)#在内部测试集验证
final_wflow_both %>%
collect_metrics()
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 accuracy binary 0.733 Preprocessor1_Model1
## 2 roc_auc binary 0.836 Preprocessor1_Model1
#如果有外部测试集,具体调试
#test_preds <- select(external_test_set, mortality_30days) %>% bind_cols(predict(best_xgb_fit, new_data = external_test_set, type = "prob")) %>%
#bind_cols(predict(best_xgb_fit, new_data = external_test_set, type = "class"))#在外部测试集验证
# 预测值
test_pred <- final_wflow_both %>% collect_predictions()
# 召回率
recall <- recall(test_pred$mortality_30days, test_pred$.pred_class)
print(recall)
## [1] 0.9920477
# 精确度
precision <- precision(test_pred$mortality_30days,test_pred$.pred_class)
print(precision)
## [1] 0.7306003
# 准确率
accuracy <- accuracy(test_pred,mortality_30days, .pred_class)
print(accuracy)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.733
#F1分数
f1_score <- test_pred %>% f_meas(mortality_30days, .pred_class)
print(f1_score)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 f_meas binary 0.841
#绘制ROC曲线
test_pred %>%
roc_curve(mortality_30days,.pred_1) %>%
autoplot()
# 美化ROC 曲线
test_pred %>%
roc_curve(mortality_30days,.pred_1) %>%
ggplot(aes(x = 1 - specificity, y = sensitivity)) +
geom_path(color = "skyblue", size = 1) +
geom_abline(lty = 3) +
coord_equal() +
theme_bw()
# 计算混淆矩阵
conf_matrix <- test_pred %>% conf_mat(mortality_30days, .pred_class)
# 绘制混淆矩阵图
autoplot(conf_matrix)
# 美化混淆矩阵图
conf_mat <- as.data.frame(conf_matrix$table)
ggplot(conf_mat, aes(x = Truth, y = Prediction, fill = Freq)) +
geom_tile() +
geom_text(aes(label = Freq), vjust = 1) +
scale_fill_gradient(low = "skyblue", high = "skyblue4") + # 自定义颜色
labs(
x = "Truth",
y = "Prediction",
title = "Confusion Matrix"
) +
theme_minimal()
## 模型解释
鉴于机器学习模型的“黑盒”性质,为了更好地理解模型,需要对模型进行合理的解释。Tidymodels框架本身不包含用于模型解释的软件
library("shapviz")#适用于XGBoost及LightGBM模型
#随机选取1000个样本
train_shapdata <- covid_train[sample(nrow(covid_train), 1000), ]
#从final_wflow_train(workflow类型)提取出model
final_shap <- pull_workflow_fit(final_wflow_train)
# 计算Shap值(model_fit_shap$fit,class要是xgb.Booster)
shp <- shapviz(final_shap$fit, X_pred = data.matrix(train_shapdata[,-37]),
X = train_shapdata)
1.样本解释
a.横轴为SHAP值,纵轴是该样本各个特征(就是统计学中的自变量)的名称与取值(这是单个样本)。
b.图中的数字就是变量的取值,+代表其对结局指标有正向影响(黄色,箭头朝右),-代表有负面影响(红色,箭头朝左),类似于常用的回归系数的正负号。
c.最下方E[f(x)]示意SHAP基准值,表示该模型预测的均值,E是期望符号;
d.总体解读:该样本的SHAP值是3.59(显示于右上角),caret对其影响最大,是负向影响;color的影响次之,是正向影响。
# 以瀑布图形式显示,单个样本的解释
sv_waterfall(shp,row_id = 1) #第一行的数据,即第一个样本
# 以力图形式显示,单个样本的解释
sv_force(shp, row_id = 1)
# 多个样本的解释,此时计算的是shap的平均值
sv_waterfall(shp, shp$X$age > 35 & shp$X$age < 65) #选取年龄在35~65的所有样本
2.基于SHAP值的变量重要性排序
# 基于绝对均数SHAP的条形图
sv_importance(shp,fill = 'skyblue3') #颜色自定义
# 蜂窝图,本质是散点图
sv_importance(shp, kind = "beeswarm")
# 条形图 and 散点图
sv_importance(shp, kind = "both", show_numbers = T, bee_width = 0.2)
3.SHAP依赖图
# 使用特征:
sv_dependence(shp, v = "CRP")