Introduction

This document contains a draft workflow to building random forests and PD/ICE plots. The workflow includes examples to color the ICE plots by variables.

Stack Overflow Examples

The Ames Housing dataset is from De Cock (2011). It has 82 fields were recorded for 2,930 properties in Ames IA. This version is copies from the AmesHousing package but does not include a few quality columns that appear to be outcomes rather than predictors.

De Cock, D. (2011). “Ames, Iowa: Alternative to the Boston Housing Data as an End of Semester Regression Project,” Journal of Statistics Education, Volume 19, Number 3.

library(tidymodels)
## Warning: package 'tidymodels' was built under R version 4.2.3
## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom        1.0.6     ✔ recipes      1.1.0
## ✔ dials        1.3.0     ✔ rsample      1.2.1
## ✔ dplyr        1.1.4     ✔ tibble       3.2.1
## ✔ ggplot2      3.5.1     ✔ tidyr        1.3.1
## ✔ infer        1.0.7     ✔ tune         1.2.1
## ✔ modeldata    1.4.0     ✔ workflows    1.1.4
## ✔ parsnip      1.2.1     ✔ workflowsets 1.1.0
## ✔ purrr        1.0.2     ✔ yardstick    1.3.1
## Warning: package 'broom' was built under R version 4.2.3
## Warning: package 'scales' was built under R version 4.2.3
## Warning: package 'dplyr' was built under R version 4.2.3
## Warning: package 'ggplot2' was built under R version 4.2.3
## Warning: package 'infer' was built under R version 4.2.3
## Warning: package 'parsnip' was built under R version 4.2.3
## Warning: package 'rsample' was built under R version 4.2.3
## Warning: package 'tidyr' was built under R version 4.2.3
## Warning: package 'tune' was built under R version 4.2.3
## Warning: package 'workflows' was built under R version 4.2.3
## Warning: package 'workflowsets' was built under R version 4.2.3
## Warning: package 'yardstick' was built under R version 4.2.3
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter()  masks stats::filter()
## ✖ dplyr::lag()     masks stats::lag()
## ✖ recipes::step()  masks stats::step()
## • Use tidymodels_prefer() to resolve common conflicts.
library(DALEXtra) # DALEX is an R package with a set of tools that help to provide Descriptive mAchine Learning EXplanations ranging from global to local interpretability methods.
## Loading required package: DALEX
## Welcome to DALEX (version: 2.4.3).
## Find examples and detailed introduction at: http://ema.drwhy.ai/
## 
## Attaching package: 'DALEX'
## The following object is masked from 'package:dplyr':
## 
##     explain
# https://uc-r.github.io/dalex


data(ames)

#transmute is an old dplyr function. Mutate is now used.
#create a data frame with sale price, gr liv area, year built, and bldg type
ames_train <- ames %>%
    transmute(Sale_Price = log10(Sale_Price),
              Gr_Liv_Area = as.numeric(Gr_Liv_Area), 
              Year_Built, Bldg_Type)
#rf model setup using parsnip package

rf_model <- 
    rand_forest(trees = 1000) %>% 
    set_engine("ranger") %>% 
    set_mode("regression")
#workflow framework to run the random forest

rf_wflow <- 
    workflow() %>% 
    add_formula(
        Sale_Price ~ Gr_Liv_Area + Year_Built + Bldg_Type) %>% 
    add_model(rf_model) 
## fit the data from our data frame of interest (in this case the ames_train data) to our workflow and model
rf_fit <- rf_wflow %>% fit(data = ames_train)


# explainer - unified representation of the model which can be used with various "explainers"
explainer_rf <- explain_tidymodels(
    rf_fit, 
    data = dplyr::select(ames_train, -Sale_Price), 
    y = ames_train$Sale_Price,
    label = "random forest"
)
## Preparation of a new explainer is initiated
##   -> model label       :  random forest 
##   -> data              :  2930  rows  3  cols 
##   -> data              :  tibble converted into a data.frame 
##   -> target variable   :  2930  values 
##   -> predict function  :  yhat.workflow  will be used (  default  )
##   -> predicted values  :  No value for predict function target column. (  default  )
##   -> model_info        :  package tidymodels , ver. 1.2.0 , task regression (  default  ) 
##   -> predicted values  :  numerical, min =  4.905564 , mean =  5.22053 , max =  5.512371  
##   -> residual function :  difference between y and yhat (  default  )
##   -> residuals         :  numerical, min =  -0.8327836 , mean =  0.0001104245 , max =  0.3664111  
##   A new explainer has been created!
pdp_rf <- model_profile(explainer_rf, N = NULL, 
                        variables = "Gr_Liv_Area", groups = "Bldg_Type")
as_tibble(pdp_rf$agr_profiles) %>%
    mutate(`_label_` = stringr::str_remove(`_label_`, "random forest_")) %>%
    ggplot(aes(`_x_`, `_yhat_`, color = `_label_`)) +
    geom_line(size = 1.2, alpha = 0.8) +
    labs(x = "Gross living area", 
         y = "Sale Price (log)", 
         color = NULL,
         title = "Partial dependence profile for Ames housing sales",
         subtitle = "Predictions from a random forest model")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Conflict Data Experiment

Attempt one to recreate the pdp plots in our HMA random forest by country.

library(tidymodels)
library(DALEXtra) # DALEX is an R package with a set of tools that help to provide Descriptive mAchine Learning EXplanations ranging from global to local interpretability methods.

# https://uc-r.github.io/dalex

## read data in
library(tidyverse)

# load cleaned data
adm.imp = read.csv("../cleaned_data/hma_adm1_imp_df_clean_20240418.csv")


# remove row number variable (X) from df
adm.imp = adm.imp %>% 
  select(-X)

# factor country and adm 1
adm.imp$country = as.factor(adm.imp$country)
adm.imp$adm1 = as.factor(adm.imp$adm1)

# check
str(adm.imp)
## 'data.frame':    2535 obs. of  14 variables:
##  $ country: Factor w/ 12 levels "AFG","BGD","BTN",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ adm1   : Factor w/ 194 levels "Ahal Region",..: 11 11 11 11 11 11 11 11 11 11 ...
##  $ year   : int  2022 2021 2020 2019 2018 2017 2016 2015 2014 2013 ...
##  $ pop    : num  1031700 888742 765594 715202 600919 ...
##  $ temp   : num  2.74 2.3 1.24 1.73 2.25 ...
##  $ precip : num  52 35 61.8 61 44.4 ...
##  $ nlights: num  0.60497 0.14603 0.00162 0.00153 0.0014 ...
##  $ ge     : num  -1.44 -1.63 -1.59 -1.5 -1.48 -1.36 -1.25 -1.35 -1.36 -1.4 ...
##  $ cpt    : num  -1.18 -1.15 -1.49 -1.42 -1.5 -1.53 -1.54 -1.35 -1.36 -1.45 ...
##  $ rol    : num  -1.66 -1.88 -1.83 -1.74 -1.69 -1.58 -1.52 -1.52 -1.44 -1.61 ...
##  $ rq     : num  -1.27 -1.31 -1.39 -1.11 -1.14 -1.37 -1.34 -1.02 -1.12 -1.19 ...
##  $ polstab: num  -2.55 -2.52 -2.7 -2.65 -2.75 -2.79 -2.66 -2.56 -2.41 -2.52 ...
##  $ voice  : num  -1.75 -1.57 -1.08 -1.01 -1.01 -0.99 -1.04 -1.12 -1.14 -1.24 ...
##  $ con    : num  129 250 250 200 177 ...
#rf model setup using parsnip package

rf_model <- 
    rand_forest(trees = 1000) %>% 
    set_engine("ranger") %>% 
    set_mode("regression")
#workflow framework to run the random forest

rf_wflow <- 
    workflow() %>% 
    add_formula(
        con ~ .) %>% 
    add_model(rf_model) 
## fit the data from our data frame of interest (in this case the ames_train data) to our workflow and model
rf_fit <- rf_wflow %>% fit(data = adm.imp)
# explainer - unified representation of the model which can be used with various "explainers"
explainer_rf <- explain_tidymodels(
    rf_fit, 
    data = dplyr::select(adm.imp, -con), # data frame without the target variable
    y = adm.imp$con, # target variable
    label = "random forest"
)
## Preparation of a new explainer is initiated
##   -> model label       :  random forest 
##   -> data              :  2535  rows  13  cols 
##   -> target variable   :  2535  values 
##   -> predict function  :  yhat.workflow  will be used (  default  )
##   -> predicted values  :  No value for predict function target column. (  default  )
##   -> model_info        :  package tidymodels , ver. 1.2.0 , task regression (  default  ) 
##   -> predicted values  :  numerical, min =  0.005744941 , mean =  226.016 , max =  3904.413  
##   -> residual function :  difference between y and yhat (  default  )
##   -> residuals         :  numerical, min =  -543.5591 , mean =  -0.1728147 , max =  2495.757  
##   A new explainer has been created!
pdp_rf <- model_profile(explainer_rf, N = NULL, 
                        variables = "pop", groups = "country")

# explainer - I think you can only select one variable to look at pdp wise, then group it by a categorical variable, in this case group by country.
pop.pdp = as_tibble(pdp_rf$agr_profiles) %>%
    mutate(`_label_` = stringr::str_remove(`_label_`, "random forest_"))

pop.pdp$label_1 <- NA

pop.pdp$label_1 <- as.character(pop.pdp$label_1)

for(i in 1:length(unique(pop.pdp$`_label_`))) {
  
  pop.pdp[which(pop.pdp$`_label_` == 
                    unique(pop.pdp$`_label_`)[i]), "label_1"][100, ] <- unique(pop.pdp$`_label_`)[i]
}
library(ggrepel)

pop.plot = ggplot(pop.pdp, aes(`_x_`, `_yhat_`, color = `_label_`)) +
  geom_line(size = 1.2, alpha = 0.8, show.legend = FALSE) +
  labs(x = "Population", 
       y = "Conflict Count", 
       color = NULL,
       title = "Population PDP") +
  geom_label_repel(aes(label = label_1), show.legend = FALSE)

pop.plot
## Warning: Removed 1200 rows containing missing values or values outside the scale range
## (`geom_label_repel()`).

# code for labeling!

#pop.pdp_1 = pop.pdp

#pop.pdp_1$label_1 <- NA

#pop.pdp_1$label_1 <- as.character(pop.pdp_1$label_1)

#for(i in 1:length(unique(pop.pdp_1$`_label_`))) {
  
  #pop.pdp_1[which(pop.pdp_1$`_label_` == 
                    #unique(pop.pdp_1$`_label_`)[i]), "label_1"][100, ] <- unique(pop.pdp_1$`_label_`)[i]
#}

#library(ggrepel)

#ggplot(pop.pdp_1, aes(`_x_`, `_yhat_`, color = `_label_`)) +
  #geom_line(size = 1.2, alpha = 0.8, show.legend = FALSE) +
  #labs(x = "Population", 
       #y = "Conflict Count", 
       #color = NULL,
       #title = "Population PDP") +
  #geom_label_repel(aes(label = label_1), show.legend = FALSE)
pdp_rf2 <- model_profile(explainer_rf, N = NULL, 
                        variables = "polstab", groups = "country")

# explainer - I think you can only select one variable to look at pdp wise, then group it by a categorical variable, in this case group by country.
## **ORIGINAL CODE**

#polstab.pdp = as_tibble(pdp_rf2$agr_profiles) %>%
    #mutate(`_label_` = stringr::str_remove(`_label_`, "random forest_")) %>%
    #ggplot(aes(`_x_`, `_yhat_`, color = `_label_`)) +
    #geom_line(size = 1.2, alpha = 0.8) +
    #labs(x = "Political Stability", 
         #y = "Conflict Count", 
         #color = NULL,
         #title = "Political Stability PDP")
         #subtitle = "Predictions from a random forest model")

#polstab.pdp 
polstab.pdp = as_tibble(pdp_rf2$agr_profiles) %>%
    mutate(`_label_` = stringr::str_remove(`_label_`, "random forest_"))

polstab.pdp$label_1 <- NA

polstab.pdp$label_1 <- as.character(polstab.pdp$label_1)

for(i in 1:length(unique(polstab.pdp$`_label_`))) {
  
  polstab.pdp[which(polstab.pdp$`_label_` == 
                    unique(polstab.pdp$`_label_`)[i]), "label_1"][78, ] <- unique(polstab.pdp$`_label_`)[i]
}
library(ggrepel)

polstab.plot = ggplot(polstab.pdp, aes(`_x_`, `_yhat_`, color = `_label_`)) +
  geom_line(size = 1.2, alpha = 0.8, show.legend = FALSE) +
  labs(x = "Political Stability", 
       y = "Conflict Count", 
       color = NULL,
       title = "Political Stability PDP") +
  geom_label_repel(aes(label = label_1), show.legend = FALSE)

polstab.plot
## Warning: Removed 924 rows containing missing values or values outside the scale range
## (`geom_label_repel()`).

pdp_rf3 <- model_profile(explainer_rf, N = NULL, 
                        variables = "temp", groups = "country")

# explainer - I think you can only select one variable to look at pdp wise, then group it by a categorical variable, in this case group by country.
# **ORIGINAL CODE**

#temp.pdp = as_tibble(pdp_rf3$agr_profiles) %>%
    #mutate(`_label_` = stringr::str_remove(`_label_`, "random forest_")) %>%
    #ggplot(aes(`_x_`, `_yhat_`, color = `_label_`)) +
    #geom_line(size = 1.2, alpha = 0.8) +
    #labs(x = "Temperature", 
         #y = "Conflict Count", 
         #color = NULL,
         #title = "Temperature PDP")
         #subtitle = "Predictions from a random forest model")

#temp.pdp 
temp.pdp = as_tibble(pdp_rf3$agr_profiles) %>%
    mutate(`_label_` = stringr::str_remove(`_label_`, "random forest_"))

temp.pdp$label_1 <- NA

temp.pdp$label_1 <- as.character(temp.pdp$label_1)

for(i in 1:length(unique(temp.pdp$`_label_`))) {
  
  temp.pdp[which(temp.pdp$`_label_` == 
                    unique(temp.pdp$`_label_`)[i]), "label_1"][100, ] <- unique(temp.pdp$`_label_`)[i]
}
library(ggrepel)

temp.plot = ggplot(temp.pdp, aes(`_x_`, `_yhat_`, color = `_label_`)) +
  geom_line(size = 1.2, alpha = 0.8, show.legend = FALSE) +
  labs(x = "Temperature", 
       y = "Conflict Count", 
       color = NULL,
       title = "Temperature PDP") +
  geom_label_repel(aes(label = label_1), show.legend = FALSE)

temp.plot
## Warning: Removed 1200 rows containing missing values or values outside the scale range
## (`geom_label_repel()`).
## Warning: ggrepel: 4 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

pdp_rf4 <- model_profile(explainer_rf, N = NULL, 
                        variables = "precip", groups = "country")

# explainer - I think you can only select one variable to look at pdp wise, then group it by a categorical variable, in this case group by country.
# **ORIGINAL CODE**

#precip.pdp = as_tibble(pdp_rf4$agr_profiles) %>%
    #mutate(`_label_` = stringr::str_remove(`_label_`, "random forest_")) %>%
    #ggplot(aes(`_x_`, `_yhat_`, color = `_label_`)) +
    #geom_line(size = 1.2, alpha = 0.8) +
    #labs(x = "Precipitation", 
         #y = "Conflict Count", 
         #color = NULL,
         #title = "Precipitation PDP")
         #subtitle = "Predictions from a random forest model")

#precip.pdp 
precip.pdp = as_tibble(pdp_rf4$agr_profiles) %>%
    mutate(`_label_` = stringr::str_remove(`_label_`, "random forest_"))

precip.pdp$label_1 <- NA

precip.pdp$label_1 <- as.character(precip.pdp$label_1)

for(i in 1:length(unique(precip.pdp$`_label_`))) {
  
  precip.pdp[which(precip.pdp$`_label_` == 
                    unique(precip.pdp$`_label_`)[i]), "label_1"][100, ] <- unique(precip.pdp$`_label_`)[i]
}
library(ggrepel)

precip.plot = ggplot(precip.pdp, aes(`_x_`, `_yhat_`, color = `_label_`)) +
  geom_line(size = 1.2, alpha = 0.8, show.legend = FALSE) +
  labs(x = "Precipitation", 
       y = "Conflict Count", 
       color = NULL,
       title = "Precipitation PDP") +
  geom_label_repel(aes(label = label_1), show.legend = FALSE)

precip.plot
## Warning: Removed 1200 rows containing missing values or values outside the scale range
## (`geom_label_repel()`).

library(ggpubr)

ggarrange(pop.plot, temp.plot, polstab.plot, precip.plot, ncol = 2, nrow = 2)#, common.legend = TRUE, legend = "right")
## Warning: Removed 1200 rows containing missing values or values outside the scale range
## (`geom_label_repel()`).
## Removed 1200 rows containing missing values or values outside the scale range
## (`geom_label_repel()`).
## Warning: Removed 924 rows containing missing values or values outside the scale range
## (`geom_label_repel()`).
## Warning: Removed 1200 rows containing missing values or values outside the scale range
## (`geom_label_repel()`).
## Warning: ggrepel: 3 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 9 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 8 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 6 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

#gridExtra::grid.arrange(pop.pdp, polstab.pdp, nrow = 2)