covid-19住院患者死亡率预测模型:xgboost模型

加载包和数据集

数据预处理和特征工程

数据集划分

# 创建数据划分
set.seed(123)
covid_split <- initial_split(df_covid_tidy, prop = 0.7, strata = "mortality_30days")
covid_train <- training(covid_split)
covid_test <- testing(covid_split)

###特征选择 采用随机森林算法

library(caret)
#利用递归特征消除法RFE
# rf随机森林特征选择
subsets<-c(5,10,15,20,25,30,35,37)
ctrl_rf <- rfeControl(functions = rfFuncs, method = "cv", number = 5)# 定义控制参数
results_rf <- rfe(covid_train[, !colnames(covid_train) %in% "mortality_30days"], 
               covid_train$mortality_30days, sizes=subsets, rfeControl=ctrl_rf)# 特征选择
# 提取特征重要性
importance_rf <- as.data.frame(results_rf$fit$importance)

fig_feature_rf <- ggplot(importance_rf, aes(x = reorder(row.names(importance_rf), MeanDecreaseGini), y = MeanDecreaseGini)) +
  geom_bar(stat = "identity", fill = "#76EEC6") +#reorder根据Gini降序排序
  xlab("Feature") +
  ylab("Importance(Gini)") +
  ggtitle("Feature Importance (Gini)") +
  theme_classic()+
  coord_flip()+#x轴y轴转换位置
  scale_y_continuous(expand = c(0,0),limits = c(0,15))
print(fig_feature_rf)

recipe创建

  1. recipe()取公式和数据集,创建一个用于数据预处理的”配方”(recipe),这个”配方”描述了应该对数据集采取哪些步骤,以便为数据分析做好准备。可以将两种类型的操作添加到recipe中。
  2. step(“步骤”)操作可以包括常见的操作,比如对数变换、创建哑变量或交互变量等。还可以指定更复杂的计算动作,比如降维或插补。
  3. check*()(“检查”)操作可以实施特定的测试,返回没有问题或修正的数据,否则,会给出错误
  4. 预处理需要通过prep()函数来进行,并用”榨汁”函数juice()将处理好的整洁数据框提取出来,对新数据集进行同样的预处理,可以使用”烘培”函数bake()
library("themis")
covid_rec<-
  recipe(mortality_30days ~.,data=covid_train) %>%
  themis::step_downsample(mortality_30days) %>%#结果类不平衡
  step_normalize(all_predictors(),-all_nominal())%>%
  #step_dummy(all_nominal(),-all_outcomes(),keep_original_cols = T)%>% 都是二分类变量不需要哑变量
  #step_zv(all_predictors()) %>%
  #step_nzv(all_predictors()) %>%
  prep()

covid_juiced<-covid_rec %>% 
  juice()#提取出来检查

names(covid_juiced)
##  [1] "male"             "age"              "Hb1"              "RBC"             
##  [5] "HCT"              "WBC"              "LC"               "MC11"            
##  [9] "MC12"             "EC"               "BC"               "Plt"             
## [13] "RDWSD16"          "RDWSD17"          "CRP"              "IL6"             
## [17] "PCT"              "DDimer"           "PT"               "APTT"            
## [21] "FIB"              "ALT"              "AST"              "albium"          
## [25] "tbil"             "rGT"              "Cr"               "BUN"             
## [29] "eGFR"             "UA"               "ICU"              "CRRT"            
## [33] "NURSING"          "Fever"            "antibiotic"       "HIGHFLOW"        
## [37] "NEGATIV_time"     "mortality_30days"

超参数调优

设置模型超参数搜索空间

##模型规范并预留tune设置
xgb_spec <- boost_tree(
  mode = "classification",
  trees = tune(),
  mtry = tune(),
  min_n = tune()
) %>%
  set_engine("xgboost") %>%
  set_mode("classification")

xgb_spec
## Boosted Tree Model Specification (classification)
## 
## Main Arguments:
##   mtry = tune()
##   trees = tune()
##   min_n = tune()
## 
## Computational engine: xgboost

设置网格搜索

# tune自定义网格搜索
param_grid <- expand.grid(
  trees = c(100, 200, 300),  # 树的数量
  mtry = c(2, 3, 4),         # mtry 参数
  min_n = c(1, 2, 3)         # min_n 参数
)

param_grid
##    trees mtry min_n
## 1    100    2     1
## 2    200    2     1
## 3    300    2     1
## 4    100    3     1
## 5    200    3     1
## 6    300    3     1
## 7    100    4     1
## 8    200    4     1
## 9    300    4     1
## 10   100    2     2
## 11   200    2     2
## 12   300    2     2
## 13   100    3     2
## 14   200    3     2
## 15   300    3     2
## 16   100    4     2
## 17   200    4     2
## 18   300    4     2
## 19   100    2     3
## 20   200    2     3
## 21   300    2     3
## 22   100    3     3
## 23   200    3     3
## 24   300    3     3
## 25   100    4     3
## 26   200    4     3
## 27   300    4     3
#随机网格搜索
xgb_grid <- grid_random(tree_depth(),
                        trees(), 
                        learn_rate(), 
                        min_n(),
                        loss_reduction())#与前面模型规范保持一致
xgb_grid
## # A tibble: 5 × 5
##   tree_depth trees learn_rate min_n loss_reduction
##        <int> <int>      <dbl> <int>          <dbl>
## 1          7  1229   2.56e- 5    36    0.000000167
## 2         12   140   2.00e- 7    20    0.0000427  
## 3          4   786   1.24e- 8    35    0.00000118 
## 4          6   163   2.58e- 5    24    0.00000231 
## 5         11  1709   6.31e-10    40    0.0121

设置交叉验证

对参数的调整通过交叉验证进行,使用vfold_cv将训练集划分为10折,重复3次对于划分为10折的训练集covid_vfold,分别使用analysis()和assessment()提取用于分析和用于评估的部分。

covid_vfold<-vfold_cv(covid_train,v=10,repeats=3)
covid_vfold %>%
   mutate(df_ana=map(splits,analysis),
          df_ass=map(splits,assessment))
## #  10-fold cross-validation repeated 3 times 
## # A tibble: 30 × 5
##    splits             id      id2    df_ana            df_ass         
##    <list>             <chr>   <chr>  <list>            <list>         
##  1 <split [1477/165]> Repeat1 Fold01 <df [1,477 × 38]> <df [165 × 38]>
##  2 <split [1477/165]> Repeat1 Fold02 <df [1,477 × 38]> <df [165 × 38]>
##  3 <split [1478/164]> Repeat1 Fold03 <df [1,478 × 38]> <df [164 × 38]>
##  4 <split [1478/164]> Repeat1 Fold04 <df [1,478 × 38]> <df [164 × 38]>
##  5 <split [1478/164]> Repeat1 Fold05 <df [1,478 × 38]> <df [164 × 38]>
##  6 <split [1478/164]> Repeat1 Fold06 <df [1,478 × 38]> <df [164 × 38]>
##  7 <split [1478/164]> Repeat1 Fold07 <df [1,478 × 38]> <df [164 × 38]>
##  8 <split [1478/164]> Repeat1 Fold08 <df [1,478 × 38]> <df [164 × 38]>
##  9 <split [1478/164]> Repeat1 Fold09 <df [1,478 × 38]> <df [164 × 38]>
## 10 <split [1478/164]> Repeat1 Fold10 <df [1,478 × 38]> <df [164 × 38]>
## # ℹ 20 more rows
covid_vfold
## #  10-fold cross-validation repeated 3 times 
## # A tibble: 30 × 3
##    splits             id      id2   
##    <list>             <chr>   <chr> 
##  1 <split [1477/165]> Repeat1 Fold01
##  2 <split [1477/165]> Repeat1 Fold02
##  3 <split [1478/164]> Repeat1 Fold03
##  4 <split [1478/164]> Repeat1 Fold04
##  5 <split [1478/164]> Repeat1 Fold05
##  6 <split [1478/164]> Repeat1 Fold06
##  7 <split [1478/164]> Repeat1 Fold07
##  8 <split [1478/164]> Repeat1 Fold08
##  9 <split [1478/164]> Repeat1 Fold09
## 10 <split [1478/164]> Repeat1 Fold10
## # ℹ 20 more rows

建立工作流(workflows)

##使用工作流将预处理和模型结合起来
xgb_wflow <-
    workflow() %>%
    add_recipe(covid_rec)%>%
    add_model(xgb_spec)

进行调优

tune_results <- tune_grid(
  object = xgb_wflow,
  resamples = covid_vfold,  # 加上前面设置参数!
  grid = param_grid,#加上前面设置参数!
  control = control_grid(save_pred = TRUE)
)

调优结果

tune_results
## # Tuning results
## # 10-fold cross-validation repeated 3 times 
## # A tibble: 30 × 6
##    splits             id      id2    .metrics          .notes   .predictions
##    <list>             <chr>   <chr>  <list>            <list>   <list>      
##  1 <split [1477/165]> Repeat1 Fold01 <tibble [54 × 7]> <tibble> <tibble>    
##  2 <split [1477/165]> Repeat1 Fold02 <tibble [54 × 7]> <tibble> <tibble>    
##  3 <split [1478/164]> Repeat1 Fold03 <tibble [54 × 7]> <tibble> <tibble>    
##  4 <split [1478/164]> Repeat1 Fold04 <tibble [54 × 7]> <tibble> <tibble>    
##  5 <split [1478/164]> Repeat1 Fold05 <tibble [54 × 7]> <tibble> <tibble>    
##  6 <split [1478/164]> Repeat1 Fold06 <tibble [54 × 7]> <tibble> <tibble>    
##  7 <split [1478/164]> Repeat1 Fold07 <tibble [54 × 7]> <tibble> <tibble>    
##  8 <split [1478/164]> Repeat1 Fold08 <tibble [54 × 7]> <tibble> <tibble>    
##  9 <split [1478/164]> Repeat1 Fold09 <tibble [54 × 7]> <tibble> <tibble>    
## 10 <split [1478/164]> Repeat1 Fold10 <tibble [54 × 7]> <tibble> <tibble>    
## # ℹ 20 more rows
## 
## There were issues with some computations:
## 
##   - Warning(s) x11: Column(s) have zero variance so scaling cannot be used: `CRRT`. C...
## 
## Run `show_notes(.Last.tune.result)` for more information.

结果可视化

autoplot(tune_results)

查看结果

tune_results %>% collect_metrics()
## # A tibble: 54 × 9
##     mtry trees min_n .metric  .estimator  mean     n std_err .config            
##    <dbl> <dbl> <dbl> <chr>    <chr>      <dbl> <int>   <dbl> <chr>              
##  1     2   100     1 accuracy binary     0.760    30 0.00600 Preprocessor1_Mode…
##  2     2   100     1 roc_auc  binary     0.805    30 0.0128  Preprocessor1_Mode…
##  3     2   200     1 accuracy binary     0.763    30 0.00609 Preprocessor1_Mode…
##  4     2   200     1 roc_auc  binary     0.810    30 0.0117  Preprocessor1_Mode…
##  5     2   300     1 accuracy binary     0.763    30 0.00619 Preprocessor1_Mode…
##  6     2   300     1 roc_auc  binary     0.813    30 0.0114  Preprocessor1_Mode…
##  7     2   100     2 accuracy binary     0.764    30 0.00715 Preprocessor1_Mode…
##  8     2   100     2 roc_auc  binary     0.819    30 0.0133  Preprocessor1_Mode…
##  9     2   200     2 accuracy binary     0.762    30 0.00737 Preprocessor1_Mode…
## 10     2   200     2 roc_auc  binary     0.817    30 0.0133  Preprocessor1_Mode…
## # ℹ 44 more rows

选择最佳参数

best_parameters <- tune_results %>% select_best("roc_auc")
print(best_parameters)
## # A tibble: 1 × 4
##    mtry trees min_n .config              
##   <dbl> <dbl> <dbl> <chr>                
## 1     2   100     2 Preprocessor1_Model04

查看准确率最高模型

show_best(tune_results,"accuracy") #按照准确率由高到低进行排序
## # A tibble: 5 × 9
##    mtry trees min_n .metric  .estimator  mean     n std_err .config             
##   <dbl> <dbl> <dbl> <chr>    <chr>      <dbl> <int>   <dbl> <chr>               
## 1     3   200     2 accuracy binary     0.766    30 0.00674 Preprocessor1_Model…
## 2     3   100     2 accuracy binary     0.765    30 0.00634 Preprocessor1_Model…
## 3     3   300     1 accuracy binary     0.764    30 0.00823 Preprocessor1_Model…
## 4     3   300     2 accuracy binary     0.764    30 0.00668 Preprocessor1_Model…
## 5     2   100     2 accuracy binary     0.764    30 0.00715 Preprocessor1_Model…

最终模型

final_wflow <- xgb_wflow %>% 
  finalize_workflow(best_parameters)

最终模型特征重要性提取及可视化

# 提取特征重要性并可视化
library("vip")
# 在训练集再次训练查看特征重要性
final_wflow_train <- fit(final_wflow, data = covid_train)
importance <- vip(final_wflow_train)
plot(importance)

美化

# 绘制特征重要性条形图
fig_feature <- ggplot(importance$data, aes(y =  Importance, x = reorder(Variable,Importance))) +
  geom_bar(stat = "identity", fill = "skyblue") +
  xlab("Feature") +
  ylab("Importance") +
  ggtitle("Feature Importance") +
  theme_classic()+
  coord_flip()+#x轴y轴转换位置
  scale_y_continuous(expand = c(0,0),limits = c(0,0.2)) 

print(fig_feature)

最终模型在测试集的表现

#收集评价指标
final_wflow_both <- final_wflow %>% last_fit(covid_split)#在内部测试集验证

final_wflow_both %>%
  collect_metrics()
## # A tibble: 2 × 4
##   .metric  .estimator .estimate .config             
##   <chr>    <chr>          <dbl> <chr>               
## 1 accuracy binary         0.733 Preprocessor1_Model1
## 2 roc_auc  binary         0.836 Preprocessor1_Model1
#如果有外部测试集,具体调试
#test_preds <- select(external_test_set, mortality_30days) %>% bind_cols(predict(best_xgb_fit, new_data = external_test_set, type = "prob")) %>%
#bind_cols(predict(best_xgb_fit, new_data = external_test_set, type = "class"))#在外部测试集验证
# 预测值
test_pred <- final_wflow_both %>% collect_predictions()
# 召回率
recall <- recall(test_pred$mortality_30days, test_pred$.pred_class)
print(recall)
## [1] 0.9920477
# 精确度
precision <- precision(test_pred$mortality_30days,test_pred$.pred_class)
print(precision)
## [1] 0.7306003
# 准确率
accuracy <- accuracy(test_pred,mortality_30days, .pred_class)
print(accuracy)
## # A tibble: 1 × 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary         0.733
#F1分数
f1_score <- test_pred %>% f_meas(mortality_30days, .pred_class)
print(f1_score)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 f_meas  binary         0.841
#绘制ROC曲线
 test_pred %>% 
  roc_curve(mortality_30days,.pred_1) %>% 
  autoplot()

# 美化ROC 曲线
 test_pred %>% 
  roc_curve(mortality_30days,.pred_1) %>% 
  ggplot(aes(x = 1 - specificity, y = sensitivity)) +
  geom_path(color = "skyblue", size = 1) +
  geom_abline(lty = 3) +
  coord_equal() +
  theme_bw()

# 计算混淆矩阵
conf_matrix <- test_pred %>% conf_mat(mortality_30days, .pred_class)
# 绘制混淆矩阵图
autoplot(conf_matrix)

# 美化混淆矩阵图
conf_mat <- as.data.frame(conf_matrix$table)
ggplot(conf_mat, aes(x = Truth, y = Prediction, fill = Freq)) +
  geom_tile() +
  geom_text(aes(label = Freq), vjust = 1) +
  scale_fill_gradient(low = "skyblue", high = "skyblue4") +  # 自定义颜色
  labs(
    x = "Truth",
    y = "Prediction",
    title = "Confusion Matrix"
  ) +
  theme_minimal()

## 模型解释

鉴于机器学习模型的“黑盒”性质,为了更好地理解模型,需要对模型进行合理的解释。Tidymodels框架本身不包含用于模型解释的软件

library("shapviz")#适用于XGBoost及LightGBM模型
#随机选取1000个样本
train_shapdata <- covid_train[sample(nrow(covid_train), 1000), ]
#从final_wflow_train(workflow类型)提取出model
final_shap <-  pull_workflow_fit(final_wflow_train)
# 计算Shap值(model_fit_shap$fit,class要是xgb.Booster)
shp <- shapviz(final_shap$fit, X_pred = data.matrix(train_shapdata[,-37]), 
   X = train_shapdata)

1.样本解释

a.横轴为SHAP值,纵轴是该样本各个特征(就是统计学中的自变量)的名称与取值(这是单个样本)。

b.图中的数字就是变量的取值,+代表其对结局指标有正向影响(黄色,箭头朝右),-代表有负面影响(红色,箭头朝左),类似于常用的回归系数的正负号。

c.最下方E[f(x)]示意SHAP基准值,表示该模型预测的均值,E是期望符号;

d.总体解读:该样本的SHAP值是3.59(显示于右上角),caret对其影响最大,是负向影响;color的影响次之,是正向影响。

# 以瀑布图形式显示,单个样本的解释
sv_waterfall(shp,row_id = 1) #第一行的数据,即第一个样本

# 以力图形式显示,单个样本的解释
sv_force(shp, row_id = 1)

# 多个样本的解释,此时计算的是shap的平均值
sv_waterfall(shp, shp$X$age > 35 & shp$X$age < 65)  #选取年龄在35~65的所有样本

2.基于SHAP值的变量重要性排序

# 基于绝对均数SHAP的条形图
sv_importance(shp,fill = 'skyblue3') #颜色自定义

# 蜂窝图,本质是散点图
sv_importance(shp, kind = "beeswarm")

# 条形图 and 散点图
sv_importance(shp, kind = "both", show_numbers = T, bee_width = 0.2)

3.SHAP依赖图

# 使用特征:
sv_dependence(shp, v = "CRP")