8.1, 8.2, 8.3, and 8.7
## Warning: package 'tidyverse' was built under R version 4.5.2
## Warning: package 'ggplot2' was built under R version 4.5.2
## Warning: package 'tibble' was built under R version 4.5.2
## Warning: package 'readr' was built under R version 4.5.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.6
## ✔ forcats 1.0.1 ✔ stringr 1.5.2
## ✔ ggplot2 4.0.1 ✔ tibble 3.3.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Warning: package 'AppliedPredictiveModeling' was built under R version 4.5.3
Exercises
8.1. Recreate the simulated data from Exercise 7.2:
library(mlbench)
## Warning: package 'mlbench' was built under R version 4.5.2
set.seed(200)
simulated <- mlbench.friedman1(200, sd = 1)
simulated <- cbind(simulated$x, simulated$y)
simulated <- as.data.frame(simulated)
colnames(simulated)[ncol(simulated)] <- "y"
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
model1 <- randomForest(y ~ ., data = simulated,
importance = TRUE,
ntree = 1000)
rfImp1 <- varImp(model1, scale = FALSE)
rfImp1
## Overall
## V1 8.732235404
## V2 6.415369387
## V3 0.763591825
## V4 7.615118809
## V5 2.023524577
## V6 0.165111172
## V7 -0.005961659
## V8 -0.166362581
## V9 -0.095292651
## V10 -0.074944788
Did the random forest model significantly use the uninformative predictors (V6– V10)?
The random forest model ranks V6-V10 last, with the smallest coefficients/importance levels.It does assign them all a nonzero value, albeit one that is much smaller than the predictors V1-V5.
Fit another random forest model to these data.
set.seed(200)
#add the new value to the data set
sim_data <- mlbench.friedman1(200, sd = 1)
simulated <- as.data.frame(cbind(sim_data$x, y = sim_data$y))
# Add
simulated$duplicate1 <- simulated$V1 + rnorm(200) * 0.1
# Verify correlation
cor(simulated$duplicate1, simulated$V1)
## [1] 0.9409437
model2 <- randomForest(y ~ ., data = simulated,
importance = TRUE,
ntree = 1000)
rfImp2 <- varImp(model2, scale = FALSE)
Did the importance score for V1 change? What happens when you add another predictor that is also highly correlated with V1?
Yes, it went down significantly. When we add another duplicate predictor, it goes down a little more.
rfImp1
## Overall
## V1 8.732235404
## V2 6.415369387
## V3 0.763591825
## V4 7.615118809
## V5 2.023524577
## V6 0.165111172
## V7 -0.005961659
## V8 -0.166362581
## V9 -0.095292651
## V10 -0.074944788
rfImp2
## Overall
## V1 6.056389320
## V2 6.212229612
## V3 0.618727268
## V4 6.946472869
## V5 2.123011165
## V6 0.194903846
## V7 0.047119933
## V8 -0.117467449
## V9 -0.003865473
## V10 0.076718487
## duplicate1 3.647579333
#adding one more
set.seed(200)
#add the new value to the data set
sim_data <- mlbench.friedman1(200, sd = 1)
simulated <- as.data.frame(cbind(sim_data$x, y = sim_data$y))
# Add both
simulated$duplicate1 <- simulated$V1 + rnorm(200) * 0.1
simulated$duplicate2 <- simulated$V1 + rnorm(200) * 0.1
# Verify correlation
cor(simulated$duplicate1, simulated$V1)
## [1] 0.9409437
model3 <- randomForest(y ~ ., data = simulated,
importance = TRUE,
ntree = 1000)
rfImp3 <- varImp(model3, scale = FALSE)
rfImp3
## Overall
## V1 5.20727155
## V2 7.02512107
## V3 0.54649146
## V4 7.01078999
## V5 1.90534306
## V6 0.19989219
## V7 -0.03098502
## V8 -0.09105178
## V9 -0.10283914
## V10 0.01618478
## duplicate1 2.47014856
## duplicate2 2.08144998
library(party)
## Warning: package 'party' was built under R version 4.5.3
## Loading required package: grid
## Loading required package: mvtnorm
## Warning: package 'mvtnorm' was built under R version 4.5.3
## Loading required package: modeltools
## Warning: package 'modeltools' was built under R version 4.5.2
## Loading required package: stats4
## Loading required package: strucchange
## Warning: package 'strucchange' was built under R version 4.5.3
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.5.3
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
## Warning: package 'sandwich' was built under R version 4.5.3
##
## Attaching package: 'strucchange'
## The following object is masked from 'package:stringr':
##
## boundary
##
## Attaching package: 'party'
## The following object is masked from 'package:dplyr':
##
## where
set.seed(200)
simulated <- mlbench.friedman1(200, sd = 1)
simulated <- cbind(simulated$x, simulated$y)
simulated <- as.data.frame(simulated)
colnames(simulated)[ncol(simulated)] <- "y"
cforest_model <- cforest(y ~ ., data = simulated,
control = cforest_unbiased(ntree = 1000))
cfimp <- varimp(cforest_model, conditional = FALSE)
cfimp_conditional <- varimp(cforest_model, conditional = TRUE)
cfimp
## V1 V2 V3 V4 V5 V6
## 8.778146218 6.455417930 0.041537743 8.332906135 1.941524296 -0.012856182
## V7 V8 V9 V10
## 0.009380766 -0.023694594 0.009971240 -0.017851719
cfimp_conditional
## V1 V2 V3 V4 V5 V6
## 5.487846175 5.156480755 0.040492373 6.612453978 1.248621858 -0.005071012
## V7 V8 V9 V10
## 0.009023736 -0.005743251 0.010803333 -0.003733082
Yes - they show the same order. The non-conditional model shows similar values. The conditional model shows the same order with much lower importance values.
Boosted trees
library(gbm)
## Warning: package 'gbm' was built under R version 4.5.3
## Loaded gbm 2.2.3
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
library(caret)
library(gbm)
set.seed(100)
sim_data <- mlbench.friedman1(200, sd = 1)
simulatedx <- as.data.frame(sim_data$x)
simulatedy <- sim_data$y
colnames(simulatedx) <- paste0("V", 1:ncol(simulatedx))
gbmModel <- gbm.fit(simulatedx, simulatedy, distribution = "gaussian")
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 22.8608 nan 0.0010 0.0117
## 2 22.8471 nan 0.0010 0.0085
## 3 22.8350 nan 0.0010 0.0057
## 4 22.8208 nan 0.0010 0.0124
## 5 22.8051 nan 0.0010 0.0114
## 6 22.7921 nan 0.0010 0.0134
## 7 22.7785 nan 0.0010 0.0119
## 8 22.7661 nan 0.0010 0.0126
## 9 22.7531 nan 0.0010 0.0120
## 10 22.7376 nan 0.0010 0.0117
## 20 22.6163 nan 0.0010 0.0120
## 40 22.3569 nan 0.0010 0.0107
## 60 22.1090 nan 0.0010 0.0111
## 80 21.8730 nan 0.0010 0.0100
## 100 21.6469 nan 0.0010 0.0100
gbmGrid <- expand.grid(.interaction.depth = seq(1, 7, by = 2),
.n.trees = seq(100, 1000, by = 50),
.shrinkage = c(0.01, 0.1),
.n.minobsinnode = 10)
set.seed(100)
gbmTune <- train(simulatedx, simulatedy,
method = "gbm",
tuneGrid = gbmGrid,
verbose = FALSE)
Checking important variables:
gbmImp <- varImp(gbmTune, scale = FALSE)
gbmImp
## gbm variable importance
##
## Overall
## V4 5808.4
## V1 3737.6
## V2 3116.3
## V3 2154.1
## V5 1639.2
## V10 325.2
## V6 255.1
## V8 212.9
## V7 205.0
## V9 154.9
This buckets 1-5 into the higher importance category and 6-10 into the lower importance category, though not necessarily in the same order. V4 has the highest importance.
Adding similar variables:
sim_data <- mlbench.friedman1(200, sd = 1)
simulated <- as.data.frame(cbind(sim_data$x, y = sim_data$y))
# Add new duplicate of v4
simulated$duplicate1 <- simulated$V4 + rnorm(200) * 0.1
simulatedx <- simulated[, names(simulated) != "y"]
simulatedy <- simulated$y
colnames(simulatedx) <- paste0("V", 1:ncol(simulatedx))
gbmModel <- gbm.fit(simulatedx, simulatedy, distribution = "gaussian")
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 18.9214 nan 0.0010 0.0071
## 2 18.9117 nan 0.0010 0.0077
## 3 18.9042 nan 0.0010 0.0050
## 4 18.8967 nan 0.0010 0.0082
## 5 18.8882 nan 0.0010 0.0080
## 6 18.8812 nan 0.0010 0.0052
## 7 18.8750 nan 0.0010 0.0048
## 8 18.8678 nan 0.0010 0.0065
## 9 18.8593 nan 0.0010 0.0077
## 10 18.8509 nan 0.0010 0.0061
## 20 18.7677 nan 0.0010 0.0077
## 40 18.6160 nan 0.0010 0.0055
## 60 18.4787 nan 0.0010 0.0058
## 80 18.3272 nan 0.0010 0.0076
## 100 18.1817 nan 0.0010 0.0051
set.seed(100)
gbmTune2 <- train(simulatedx, simulatedy,
method = "gbm",
tuneGrid = gbmGrid,
verbose = FALSE)
gbmImp2 <- varImp(gbmTune2, scale = FALSE)
gbmImp2
## gbm variable importance
##
## Overall
## V1 3522.6
## V4 3379.4
## V2 2751.8
## V5 1980.8
## V3 1419.2
## V11 801.7
## V9 455.8
## V10 427.0
## V7 384.5
## V6 353.8
## V8 292.3
Adding a colinear variable decreases the importance of V4. It’s now in the second position.
sim_data <- mlbench.friedman1(200, sd = 1)
simulated <- as.data.frame(cbind(sim_data$x, y = sim_data$y))
# Add 2 new vars
simulated$duplicate1 <- simulated$V4 + rnorm(200) * 0.1
simulated$duplicate2 <- simulated$V4 + rnorm(200) * 0.1
simulatedx <- simulated[, names(simulated) != "y"]
simulatedy <- simulated$y
colnames(simulatedx) <- paste0("V", 1:ncol(simulatedx))
gbmModel <- gbm.fit(simulatedx, simulatedy, distribution = "gaussian")
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 20.6248 nan 0.0010 0.0117
## 2 20.6144 nan 0.0010 0.0075
## 3 20.6046 nan 0.0010 0.0081
## 4 20.5935 nan 0.0010 0.0093
## 5 20.5830 nan 0.0010 0.0093
## 6 20.5722 nan 0.0010 0.0100
## 7 20.5622 nan 0.0010 0.0094
## 8 20.5512 nan 0.0010 0.0116
## 9 20.5403 nan 0.0010 0.0082
## 10 20.5283 nan 0.0010 0.0090
## 20 20.4267 nan 0.0010 0.0099
## 40 20.2403 nan 0.0010 0.0060
## 60 20.0404 nan 0.0010 0.0092
## 80 19.8546 nan 0.0010 0.0065
## 100 19.6751 nan 0.0010 0.0060
set.seed(100)
gbmTune3 <- train(simulatedx, simulatedy,
method = "gbm",
tuneGrid = gbmGrid,
verbose = FALSE)
gbmImp3 <- varImp(gbmTune3, scale = FALSE)
gbmImp3
## gbm variable importance
##
## Overall
## V1 3977.1
## V4 2924.4
## V2 2572.3
## V3 1421.5
## V11 1378.5
## V5 1060.1
## V12 707.5
## V6 411.9
## V10 227.4
## V7 216.6
## V9 198.4
## V8 146.4
With two similar variables, v4 is now fourth in importance.
Cubist
library(Cubist)
## Warning: package 'Cubist' was built under R version 4.5.3
sim_data <- mlbench.friedman1(200, sd = 1)
simulated <- as.data.frame(cbind(sim_data$x, y = sim_data$y))
simulatedx <- as.data.frame(sim_data$x)
simulatedy <- sim_data$y
set.seed(100)
cubistMod <- cubist(simulatedx, simulatedy)
cubistImp <- varImp(cubistMod, scale = FALSE)
summary(cubistMod)
##
## Call:
## cubist.default(x = simulatedx, y = simulatedy)
##
##
## Cubist [Release 2.07 GPL Edition]
## ---------------------------------
##
## Target attribute `outcome'
##
## Read 200 cases (11 attributes) from undefined.data
##
## Model:
##
## Rule 1: [38 cases, mean 9.498390, range 1.674311 to 16.61368, est err 1.704758]
##
## if
## V4 <= 0.2244832
## then
## outcome = 3.708843 + 6.6 V1 + 5.3 V5 + 5 V2 - 2.7 V6
##
## Rule 2: [45 cases, mean 12.571536, range 6.224888 to 19.35134, est err 1.448351]
##
## if
## V2 <= 0.2902141
## V4 > 0.2244832
## then
## outcome = 0.408065 + 13.2 V2 + 10.9 V4 + 5.6 V5 - 1.8 V9 + 1.6 V1
##
## Rule 3: [71 cases, mean 16.099398, range 6.815564 to 25.73058, est err 1.640288]
##
## if
## V1 <= 0.5940672
## V2 > 0.2902141
## V4 > 0.2244832
## then
## outcome = -2.810967 + 16.4 V1 + 10.6 V4 + 6.4 V5 + 5.5 V2
##
## Rule 4: [28 cases, mean 17.589870, range 9.275812 to 24.5218, est err 1.830909]
##
## if
## V1 > 0.5940672
## V2 > 0.2902141
## V3 > 0.5471849
## then
## outcome = 6.142311 + 10.6 V4 + 8.6 V3 + 0.2 V5
##
## Rule 5: [22 cases, mean 18.805161, range 12.18447 to 27.31387, est err 1.662565]
##
## if
## V1 > 0.5940672
## V2 > 0.2902141
## V3 <= 0.5471849
## V4 > 0.2244832
## then
## outcome = 14.197943 - 9.1 V3 + 7.5 V4 + 5 V5
##
##
## Evaluation on training data (200 cases):
##
## Average |error| 1.749757
## Relative |error| 0.43
## Correlation coefficient 0.91
##
##
## Attribute usage:
## Conds Model
##
## 86% 81% V4
## 81% 75% V2
## 59% 75% V1
## 25% 25% V3
## 100% V5
## 22% V9
## 19% V6
cubistImp
## Overall
## V4 83.5
## V2 78.0
## V1 67.0
## V3 25.0
## V5 50.0
## V9 11.0
## V6 0.0
## V7 0.0
## V8 0.0
## V10 0.0
There’s a slightly more detailed breakdown with the summary function. This Cubist model chooses variables 1-5 as the most important, and gives 9 some significance.
Adding 1 and 2 more variables similar to v4:
sim_data <- mlbench.friedman1(200, sd = 1)
simulated <- as.data.frame(cbind(sim_data$x, y = sim_data$y))
# 1 new variable
simulated$duplicate1 <- simulated$V4 + rnorm(200) * 0.1
simulatedx <- simulated[, names(simulated) != "y"]
simulatedy <- simulated$y
set.seed(100)
cubistMod2 <- cubist(simulatedx, simulatedy)
cubistImp2 <- varImp(cubistMod2, scale = FALSE)
summary(cubistMod2)
##
## Call:
## cubist.default(x = simulatedx, y = simulatedy)
##
##
## Cubist [Release 2.07 GPL Edition]
## ---------------------------------
##
## Target attribute `outcome'
##
## Read 200 cases (12 attributes) from undefined.data
##
## Model:
##
## Rule 1: [200 cases, mean 14.850068, range 3.94182 to 28.14138, est err 1.875457]
##
## outcome = -0.10961 + 9.5 V4 + 8 V1 + 7.5 V2 + 4.6 V5
##
##
## Evaluation on training data (200 cases):
##
## Average |error| 2.088893
## Relative |error| 0.55
## Correlation coefficient 0.82
##
##
## Attribute usage:
## Conds Model
##
## 100% V1
## 100% V2
## 100% V4
## 100% V5
cubistImp2
## Overall
## V1 50
## V2 50
## V4 50
## V3 0
## V5 0
## V6 0
## V7 0
## V8 0
## V9 0
## V10 0
## duplicate1 0
Varimp now shows 1, 2, and 4 as the most important variables.
sim_data <- mlbench.friedman1(200, sd = 1)
simulated <- as.data.frame(cbind(sim_data$x, y = sim_data$y))
# 2 new variables
simulated$duplicate1 <- simulated$V4 + rnorm(200) * 0.1
simulated$duplicate2 <- simulated$V4 + rnorm(200) * 0.1
simulatedx <- simulated[, names(simulated) != "y"]
simulatedy <- simulated$y
set.seed(100)
cubistMod3 <- cubist(simulatedx, simulatedy)
cubistImp3 <- varImp(cubistMod3, scale = FALSE)
summary(cubistMod3)
##
## Call:
## cubist.default(x = simulatedx, y = simulatedy)
##
##
## Cubist [Release 2.07 GPL Edition]
## ---------------------------------
##
## Target attribute `outcome'
##
## Read 200 cases (13 attributes) from undefined.data
##
## Model:
##
## Rule 1: [200 cases, mean 14.850068, range 3.94182 to 28.14138, est err 1.865592]
##
## outcome = -0.097486 + 13.8 V4 + 8 V1 + 7.5 V2 + 4.6 V5 - 4.2 duplicate2
##
##
## Evaluation on training data (200 cases):
##
## Average |error| 2.152765
## Relative |error| 0.56
## Correlation coefficient 0.80
##
##
## Attribute usage:
## Conds Model
##
## 100% V1
## 100% V2
## 100% V4
## 100% V5
## 100% duplicate2
cubistImp3
## Overall
## V1 50
## V2 50
## V4 50
## V5 50
## V3 0
## V6 0
## V7 0
## V8 0
## V9 0
## V10 0
## duplicate1 0
## duplicate2 0
Varimp shows 1, 2, 4, and 5. as equally important. Summary shows 1, 2, 4, 5 and duplicate 2 were used.
8.2. Use a simulation to show tree bias with different granularities.
Using a single tree model here, along with one random variable with many values (highly granular), and one random variable with only a few values (less granular):
library(rpart)
set.seed(100)
n <- 1000
# High granularity (a lot of variation, but just noise)
high <- sample(1:3000, n, replace = TRUE)
# Low granularity (not much variation, but still not significant to the model)
low <- sample(1:5, n, replace = TRUE)
#generate a bunch of random numbers in the normal distribution
y <- rnorm(n)
sim_data <- data.frame(y, high, low)
# Train a standard tree
tree_granularity <- rpart(y ~ ., data = sim_data, control = rpart.control(cp = 0))
# varimp
varimpTree <- varImp(tree_granularity, scale = FALSE)
varimpTree
## Overall
## high 3.792959
## low 3.039053
summary(varimpTree)
## Overall
## Min. :3.039
## 1st Qu.:3.228
## Median :3.416
## Mean :3.416
## 3rd Qu.:3.604
## Max. :3.793
The model shows the variable with high granularity as having more importance than the variable with low granularity. This may be because the variable with high granularity has a higher chance to show patterns or more options for split points than the variable with low granularity.
8.3. In stochastic gradient boosting, the bagging fraction and learning rate will govern the construction of the trees as they are guided by the gradient. Although the optimal values of these parameters should be obtained through the tuning process, it is helpful to understand how the magnitudes of these parameters affect magnitudes of variable importance.
Figure 8.24 provides the variable importance plots for boosting using two extreme values for the bagging fraction (0.1 and 0.9) and the learning rate (0.1 and 0.9) for the solubility data. The left-hand plot has both parameters set to 0.1, and the right-hand plot has both set to 0.9:
(a) Why does the model on the right focus its importance on just the first few of predictors, whereas the model on the left spreads importance across more predictors?
The model on the right’s high settings let it detect the strongest relationships quickly. The high learning rate captures most of the patterns in the first few trees, so the importance drops quickly. The result is a smaller tree with the dominant predictors at the top, and many other factors receiving an importance level of 0.
For the model on the left, each branch only contributes a small amount to the model. There’s a lot of unexplained error for later branches to account for, leading to a wider variety of predictors.
(b) Which model do you think would be more predictive of other samples?
The model on the right, with only a few important (boosted) variables, could be overfit. The model on the left, with a large number of nuanced important variables, is more likely to be more predictive of new samples.
(c) How would increasing interaction depth affect the slope of predictor importance for either model in Fig.8.24
Interaction depth is the maximum number of splits. I think a higher interaction depth for either model would decrease the slopes of predictor importance.
8.7. Refer to Exercises 6.3 and 7.5 which describe a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before and train several tree-based models:
data(ChemicalManufacturingProcess)
library(VIM)
## Warning: package 'VIM' was built under R version 4.5.3
## Loading required package: colorspace
## Warning: package 'colorspace' was built under R version 4.5.3
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
set.seed(1122)
chem_imputed <- kNN(ChemicalManufacturingProcess, k = 5)
#check that there are no missing values
colSums(is.na(chem_imputed))
## Yield BiologicalMaterial01
## 0 0
## BiologicalMaterial02 BiologicalMaterial03
## 0 0
## BiologicalMaterial04 BiologicalMaterial05
## 0 0
## BiologicalMaterial06 BiologicalMaterial07
## 0 0
## BiologicalMaterial08 BiologicalMaterial09
## 0 0
## BiologicalMaterial10 BiologicalMaterial11
## 0 0
## BiologicalMaterial12 ManufacturingProcess01
## 0 0
## ManufacturingProcess02 ManufacturingProcess03
## 0 0
## ManufacturingProcess04 ManufacturingProcess05
## 0 0
## ManufacturingProcess06 ManufacturingProcess07
## 0 0
## ManufacturingProcess08 ManufacturingProcess09
## 0 0
## ManufacturingProcess10 ManufacturingProcess11
## 0 0
## ManufacturingProcess12 ManufacturingProcess13
## 0 0
## ManufacturingProcess14 ManufacturingProcess15
## 0 0
## ManufacturingProcess16 ManufacturingProcess17
## 0 0
## ManufacturingProcess18 ManufacturingProcess19
## 0 0
## ManufacturingProcess20 ManufacturingProcess21
## 0 0
## ManufacturingProcess22 ManufacturingProcess23
## 0 0
## ManufacturingProcess24 ManufacturingProcess25
## 0 0
## ManufacturingProcess26 ManufacturingProcess27
## 0 0
## ManufacturingProcess28 ManufacturingProcess29
## 0 0
## ManufacturingProcess30 ManufacturingProcess31
## 0 0
## ManufacturingProcess32 ManufacturingProcess33
## 0 0
## ManufacturingProcess34 ManufacturingProcess35
## 0 0
## ManufacturingProcess36 ManufacturingProcess37
## 0 0
## ManufacturingProcess38 ManufacturingProcess39
## 0 0
## ManufacturingProcess40 ManufacturingProcess41
## 0 0
## ManufacturingProcess42 ManufacturingProcess43
## 0 0
## ManufacturingProcess44 ManufacturingProcess45
## 0 0
## Yield_imp BiologicalMaterial01_imp
## 0 0
## BiologicalMaterial02_imp BiologicalMaterial03_imp
## 0 0
## BiologicalMaterial04_imp BiologicalMaterial05_imp
## 0 0
## BiologicalMaterial06_imp BiologicalMaterial07_imp
## 0 0
## BiologicalMaterial08_imp BiologicalMaterial09_imp
## 0 0
## BiologicalMaterial10_imp BiologicalMaterial11_imp
## 0 0
## BiologicalMaterial12_imp ManufacturingProcess01_imp
## 0 0
## ManufacturingProcess02_imp ManufacturingProcess03_imp
## 0 0
## ManufacturingProcess04_imp ManufacturingProcess05_imp
## 0 0
## ManufacturingProcess06_imp ManufacturingProcess07_imp
## 0 0
## ManufacturingProcess08_imp ManufacturingProcess09_imp
## 0 0
## ManufacturingProcess10_imp ManufacturingProcess11_imp
## 0 0
## ManufacturingProcess12_imp ManufacturingProcess13_imp
## 0 0
## ManufacturingProcess14_imp ManufacturingProcess15_imp
## 0 0
## ManufacturingProcess16_imp ManufacturingProcess17_imp
## 0 0
## ManufacturingProcess18_imp ManufacturingProcess19_imp
## 0 0
## ManufacturingProcess20_imp ManufacturingProcess21_imp
## 0 0
## ManufacturingProcess22_imp ManufacturingProcess23_imp
## 0 0
## ManufacturingProcess24_imp ManufacturingProcess25_imp
## 0 0
## ManufacturingProcess26_imp ManufacturingProcess27_imp
## 0 0
## ManufacturingProcess28_imp ManufacturingProcess29_imp
## 0 0
## ManufacturingProcess30_imp ManufacturingProcess31_imp
## 0 0
## ManufacturingProcess32_imp ManufacturingProcess33_imp
## 0 0
## ManufacturingProcess34_imp ManufacturingProcess35_imp
## 0 0
## ManufacturingProcess36_imp ManufacturingProcess37_imp
## 0 0
## ManufacturingProcess38_imp ManufacturingProcess39_imp
## 0 0
## ManufacturingProcess40_imp ManufacturingProcess41_imp
## 0 0
## ManufacturingProcess42_imp ManufacturingProcess43_imp
## 0 0
## ManufacturingProcess44_imp ManufacturingProcess45_imp
## 0 0
chem_final <- chem_imputed |>
dplyr::select(-ends_with("_imp"))
#leaving out the part where I checked for NZV becuase I didn't end up removing any
#variables
set.seed(1122)
train_index <- sample(nrow(chem_final), 0.8 * nrow(chem_final))
#train and test sets (yield + predictors are all in the same df)
train_y <- chem_final$Yield[train_index]
train_x <- chem_final[train_index, ] |> dplyr::select(-Yield)
test_y <- chem_final$Yield[-train_index]
test_x <- chem_final[-train_index, ] |> dplyr::select(-Yield)
library(rpart)
set.seed(100)
rpartTree <- rpart(train_y ~ ., data = train_x)
predictions3 <- predict(rpartTree, test_x)
performance3 <- postResample(pred = predictions3, obs = test_y)
print(performance3)
## RMSE Rsquared MAE
## 1.2861859 0.4824644 0.9939959
singletree <- varImp(rpartTree, scale = FALSE)
singletreeSorted <- singletree[order(-singletree$Overall), , drop = FALSE]
summary(singletree)
## Overall
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.2047
## 3rd Qu.:0.3309
## Max. :0.9378
singletreeSorted
## Overall
## BiologicalMaterial05 0.9378373
## ManufacturingProcess32 0.8978708
## ManufacturingProcess17 0.8848699
## ManufacturingProcess13 0.8240241
## ManufacturingProcess31 0.7699755
## ManufacturingProcess11 0.6849640
## BiologicalMaterial09 0.6151343
## BiologicalMaterial12 0.4977202
## BiologicalMaterial11 0.4924216
## ManufacturingProcess19 0.4862417
## BiologicalMaterial03 0.4771567
## ManufacturingProcess06 0.4664649
## BiologicalMaterial06 0.3912471
## ManufacturingProcess09 0.3329268
## ManufacturingProcess18 0.3308739
## ManufacturingProcess30 0.2931291
## ManufacturingProcess39 0.2803768
## BiologicalMaterial02 0.2635236
## BiologicalMaterial04 0.2633127
## ManufacturingProcess36 0.2617109
## ManufacturingProcess34 0.2602247
## BiologicalMaterial10 0.2154033
## ManufacturingProcess27 0.2144916
## ManufacturingProcess24 0.1983796
## ManufacturingProcess33 0.1842469
## ManufacturingProcess42 0.1459598
## BiologicalMaterial01 0.0000000
## BiologicalMaterial07 0.0000000
## BiologicalMaterial08 0.0000000
## ManufacturingProcess01 0.0000000
## ManufacturingProcess02 0.0000000
## ManufacturingProcess03 0.0000000
## ManufacturingProcess04 0.0000000
## ManufacturingProcess05 0.0000000
## ManufacturingProcess07 0.0000000
## ManufacturingProcess08 0.0000000
## ManufacturingProcess10 0.0000000
## ManufacturingProcess12 0.0000000
## ManufacturingProcess14 0.0000000
## ManufacturingProcess15 0.0000000
## ManufacturingProcess16 0.0000000
## ManufacturingProcess20 0.0000000
## ManufacturingProcess21 0.0000000
## ManufacturingProcess22 0.0000000
## ManufacturingProcess23 0.0000000
## ManufacturingProcess25 0.0000000
## ManufacturingProcess26 0.0000000
## ManufacturingProcess28 0.0000000
## ManufacturingProcess29 0.0000000
## ManufacturingProcess35 0.0000000
## ManufacturingProcess37 0.0000000
## ManufacturingProcess38 0.0000000
## ManufacturingProcess40 0.0000000
## ManufacturingProcess41 0.0000000
## ManufacturingProcess43 0.0000000
## ManufacturingProcess44 0.0000000
## ManufacturingProcess45 0.0000000
R-squared is in the middle of these three modesl at .48. The model has both biological and manufacturing variables in the top 10, with biological at the top.
set.seed(100)
cubistMod_chem <- cubist(train_x, train_y)
predictions <- predict(cubistMod_chem, test_x)
performance <- postResample(pred = predictions, obs = test_y)
print(performance)
## RMSE Rsquared MAE
## 1.9860495 0.3112863 1.2134773
cubistImp_chem <- varImp(cubistMod_chem, scale = FALSE)
summary(cubistMod_chem)
##
## Call:
## cubist.default(x = train_x, y = train_y)
##
##
## Cubist [Release 2.07 GPL Edition]
## ---------------------------------
##
## Target attribute `outcome'
##
## Read 140 cases (58 attributes) from undefined.data
##
## Model:
##
## Rule 1: [72 cases, mean 39.053, range 36.12 to 42.31, est err 0.847]
##
## if
## ManufacturingProcess32 <= 158
## then
## outcome = -42.149 + 0.276 ManufacturingProcess32
## + 0.187 BiologicalMaterial03 - 0.143 BiologicalMaterial02
## - 0.224 ManufacturingProcess33 + 0.76 ManufacturingProcess30
## - 0.44 ManufacturingProcess13 + 0.0072 ManufacturingProcess26
## + 0.37 ManufacturingProcess11 + 0.0036 ManufacturingProcess14
## - 0.26 BiologicalMaterial01 - 0.014 ManufacturingProcess35
## + 0.05 ManufacturingProcess23
##
## Rule 2: [68 cases, mean 41.349, range 35.25 to 46.34, est err 0.922]
##
## if
## ManufacturingProcess32 > 158
## then
## outcome = -46.962 + 0.295 ManufacturingProcess32
## + 0.233 BiologicalMaterial03 - 0.178 BiologicalMaterial02
## - 0.279 ManufacturingProcess33 - 0.64 ManufacturingProcess13
## + 0.95 ManufacturingProcess30 + 0.009 ManufacturingProcess26
## + 0.0045 ManufacturingProcess14 - 0.33 BiologicalMaterial01
## - 0.017 ManufacturingProcess35
##
##
## Evaluation on training data (140 cases):
##
## Average |error| 0.768
## Relative |error| 0.49
## Correlation coefficient 0.86
##
##
## Attribute usage:
## Conds Model
##
## 100% 100% ManufacturingProcess32
## 100% BiologicalMaterial01
## 100% BiologicalMaterial02
## 100% BiologicalMaterial03
## 100% ManufacturingProcess13
## 100% ManufacturingProcess14
## 100% ManufacturingProcess26
## 100% ManufacturingProcess30
## 100% ManufacturingProcess33
## 100% ManufacturingProcess35
## 51% ManufacturingProcess11
## 51% ManufacturingProcess23
print(cubistImp_chem)
## Overall
## ManufacturingProcess32 100.0
## BiologicalMaterial01 50.0
## BiologicalMaterial02 50.0
## BiologicalMaterial03 50.0
## ManufacturingProcess13 50.0
## ManufacturingProcess14 50.0
## ManufacturingProcess26 50.0
## ManufacturingProcess30 50.0
## ManufacturingProcess33 50.0
## ManufacturingProcess35 50.0
## ManufacturingProcess11 25.5
## BiologicalMaterial04 0.0
## BiologicalMaterial05 0.0
## BiologicalMaterial06 0.0
## BiologicalMaterial07 0.0
## BiologicalMaterial08 0.0
## BiologicalMaterial09 0.0
## BiologicalMaterial10 0.0
## BiologicalMaterial11 0.0
## BiologicalMaterial12 0.0
## ManufacturingProcess01 0.0
## ManufacturingProcess02 0.0
## ManufacturingProcess03 0.0
## ManufacturingProcess04 0.0
## ManufacturingProcess05 0.0
## ManufacturingProcess06 0.0
## ManufacturingProcess07 0.0
## ManufacturingProcess08 0.0
## ManufacturingProcess09 0.0
## ManufacturingProcess10 0.0
## ManufacturingProcess12 0.0
## ManufacturingProcess15 0.0
## ManufacturingProcess16 0.0
## ManufacturingProcess17 0.0
## ManufacturingProcess18 0.0
## ManufacturingProcess19 0.0
## ManufacturingProcess20 0.0
## ManufacturingProcess21 0.0
## ManufacturingProcess22 0.0
## ManufacturingProcess23 0.0
## ManufacturingProcess24 0.0
## ManufacturingProcess25 0.0
## ManufacturingProcess27 0.0
## ManufacturingProcess28 0.0
## ManufacturingProcess29 0.0
## ManufacturingProcess31 0.0
## ManufacturingProcess34 0.0
## ManufacturingProcess36 0.0
## ManufacturingProcess37 0.0
## ManufacturingProcess38 0.0
## ManufacturingProcess39 0.0
## ManufacturingProcess40 0.0
## ManufacturingProcess41 0.0
## ManufacturingProcess42 0.0
## ManufacturingProcess43 0.0
## ManufacturingProcess44 0.0
## ManufacturingProcess45 0.0
#R-squared is only .311 here, so it’s not a great model. Similar to some linear and nonlinear models, it places manufacturing process 32 at the top.
Bagged trees
library(ipred)
## Warning: package 'ipred' was built under R version 4.5.3
set.seed(100)
#setting nbagg to 100 because this defaults to 25
baggedTree <- ipredbagg(train_y, train_x, nbagg = 100)
predictions <- predict(baggedTree, test_x)
performance2 <- postResample(pred = predictions, obs = test_y)
print(performance2)
## RMSE Rsquared MAE
## 0.8562781 0.7036328 0.6816041
baggedImp <- varImp(baggedTree)
baggedImpSorted <- baggedImp[order(-baggedImp$Overall), , drop = FALSE]
print(baggedImpSorted)
## Overall
## BiologicalMaterial03 0.896170336
## ManufacturingProcess32 0.849468151
## BiologicalMaterial12 0.795561242
## ManufacturingProcess17 0.755865082
## ManufacturingProcess13 0.682349852
## ManufacturingProcess31 0.576228190
## ManufacturingProcess09 0.564226137
## ManufacturingProcess06 0.561627145
## ManufacturingProcess11 0.549535313
## BiologicalMaterial06 0.528423469
## BiologicalMaterial11 0.489454220
## ManufacturingProcess18 0.438703105
## BiologicalMaterial05 0.390373328
## ManufacturingProcess19 0.389607052
## ManufacturingProcess30 0.363068054
## ManufacturingProcess20 0.335280032
## BiologicalMaterial08 0.334361085
## BiologicalMaterial09 0.310521370
## ManufacturingProcess25 0.307304491
## BiologicalMaterial02 0.295105689
## BiologicalMaterial01 0.294377704
## ManufacturingProcess05 0.290848647
## ManufacturingProcess24 0.283905057
## ManufacturingProcess27 0.278126972
## ManufacturingProcess36 0.252763516
## ManufacturingProcess21 0.250898712
## BiologicalMaterial10 0.237576030
## ManufacturingProcess01 0.236715662
## BiologicalMaterial04 0.234486609
## ManufacturingProcess26 0.191968904
## ManufacturingProcess39 0.177595932
## ManufacturingProcess15 0.177466941
## ManufacturingProcess33 0.176372608
## ManufacturingProcess16 0.172783772
## ManufacturingProcess43 0.163925986
## ManufacturingProcess28 0.157732530
## ManufacturingProcess02 0.155717355
## ManufacturingProcess14 0.149948807
## ManufacturingProcess23 0.146888918
## ManufacturingProcess29 0.146254275
## ManufacturingProcess10 0.139441402
## ManufacturingProcess04 0.137903998
## ManufacturingProcess34 0.129734183
## ManufacturingProcess35 0.117927049
## ManufacturingProcess44 0.102941453
## ManufacturingProcess37 0.100570506
## ManufacturingProcess42 0.087829361
## ManufacturingProcess22 0.086342530
## ManufacturingProcess45 0.075259526
## ManufacturingProcess03 0.055322627
## ManufacturingProcess12 0.024211330
## ManufacturingProcess08 0.017457954
## ManufacturingProcess38 0.016262640
## ManufacturingProcess07 0.014645992
## ManufacturingProcess40 0.002462527
## ManufacturingProcess41 0.002462527
## BiologicalMaterial07 0.000000000
R-squared is much higher at ~.704, making this a better model than the previous two.
Bagged trees did significantly better than the other two models with an R-squared of .704.
Predictors 1 and 3 are biological. Overall, four biological predictors make it into the top 10. This is different from what we’ve seen with linear and nonlinear models.
With nonlinear models, manufactruring processes dominated. Several of the important nonlinear variables turned up in the tree-based models as well, including manufacturing process 32 (the top overall for non-linear, and the top manufacturing process here). 13 and 9 were in the top 3 non-linear, and they appear in the top 10 here.
For the top linear model, BiologicalMaterial06, process 09, 13, 32, and 17 all overalp in the top 10 here.
Overall, there is a lot of overlap between the models.
print(baggedImpSorted)
## Overall
## BiologicalMaterial03 0.896170336
## ManufacturingProcess32 0.849468151
## BiologicalMaterial12 0.795561242
## ManufacturingProcess17 0.755865082
## ManufacturingProcess13 0.682349852
## ManufacturingProcess31 0.576228190
## ManufacturingProcess09 0.564226137
## ManufacturingProcess06 0.561627145
## ManufacturingProcess11 0.549535313
## BiologicalMaterial06 0.528423469
## BiologicalMaterial11 0.489454220
## ManufacturingProcess18 0.438703105
## BiologicalMaterial05 0.390373328
## ManufacturingProcess19 0.389607052
## ManufacturingProcess30 0.363068054
## ManufacturingProcess20 0.335280032
## BiologicalMaterial08 0.334361085
## BiologicalMaterial09 0.310521370
## ManufacturingProcess25 0.307304491
## BiologicalMaterial02 0.295105689
## BiologicalMaterial01 0.294377704
## ManufacturingProcess05 0.290848647
## ManufacturingProcess24 0.283905057
## ManufacturingProcess27 0.278126972
## ManufacturingProcess36 0.252763516
## ManufacturingProcess21 0.250898712
## BiologicalMaterial10 0.237576030
## ManufacturingProcess01 0.236715662
## BiologicalMaterial04 0.234486609
## ManufacturingProcess26 0.191968904
## ManufacturingProcess39 0.177595932
## ManufacturingProcess15 0.177466941
## ManufacturingProcess33 0.176372608
## ManufacturingProcess16 0.172783772
## ManufacturingProcess43 0.163925986
## ManufacturingProcess28 0.157732530
## ManufacturingProcess02 0.155717355
## ManufacturingProcess14 0.149948807
## ManufacturingProcess23 0.146888918
## ManufacturingProcess29 0.146254275
## ManufacturingProcess10 0.139441402
## ManufacturingProcess04 0.137903998
## ManufacturingProcess34 0.129734183
## ManufacturingProcess35 0.117927049
## ManufacturingProcess44 0.102941453
## ManufacturingProcess37 0.100570506
## ManufacturingProcess42 0.087829361
## ManufacturingProcess22 0.086342530
## ManufacturingProcess45 0.075259526
## ManufacturingProcess03 0.055322627
## ManufacturingProcess12 0.024211330
## ManufacturingProcess08 0.017457954
## ManufacturingProcess38 0.016262640
## ManufacturingProcess07 0.014645992
## ManufacturingProcess40 0.002462527
## ManufacturingProcess41 0.002462527
## BiologicalMaterial07 0.000000000
#the plot for single tree
ctreeMod <- ctree(train_y ~ ., data = data.frame(train_x, train_y))
plot(ctreeMod)
This plot gives a clear flow of how the different manufacturing processes could relate to each other, and the changes in yield associated with different variables. This shows that, even though predictors are important, they might not be important in every case. For example, according to this model, if process 17 is over a certain threshold, the value of process 37 doesn’t matter.