Libraries
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(mlbench)
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
library(vip)
##
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
##
## vi
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:randomForest':
##
## combine
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(partykit)
## Loading required package: grid
## Loading required package: libcoin
## Loading required package: mvtnorm
library(gbm)
## Loaded gbm 2.2.2
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
library(rpart)
library(AppliedPredictiveModeling)
8.1
set.seed(200)
simulated <- mlbench.friedman1(200, sd = 1)
simulated <- cbind(simulated$x, simulated$y)
simulated <- as.data.frame(simulated)
colnames(simulated)[ncol(simulated)] <- "y"
a.
model1 <- randomForest(y ~ ., data = simulated, importance = TRUE, ntree = 1000)
rfImp1 <- model1$importance #varImp(model1, scale = FALSE)
vip(model1, color = 'purple', fill='green') + ggtitle('Model1 Var Imp')

model1 <- randomForest(y ~ .,
                       data = simulated,
                       importance = TRUE,
                       ntree = 1000)
rfImp1 <- varImp(model1, scale = FALSE)
rfImp1 |>
  arrange(desc(Overall)) |>
  knitr::kable()
| Variable |    Overall |
|:---------|-----------:|
| V1       |  8.6895807 |
| V4       |  7.6869946 |
| V2       |  6.4296506 |
| V5       |  2.3690477 |
| V3       |  0.7471133 |
| V6       |  0.1099727 |
| V10      |  0.0489594 |
| V7       |  0.0308325 |
| V8       | -0.1077942 |
| V9       | -0.1281411 |
The random forest model did not make significant use of the uninformative predictors (V6-V10); their importance scores are all near zero.
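The table above is the permutation-based measure. As a quick cross-check (a minimal sketch, assuming model1 is still in memory), both importance measures stored by randomForest can be inspected directly, and only V1-V5 stand clearly above zero:
# model1 stores two importance measures when importance = TRUE:
# %IncMSE (permutation-based) and IncNodePurity (total decrease in node impurity)
round(randomForest::importance(model1, scale = FALSE), 3)
# Predictors whose permutation importance is clearly above zero
# (the cutoff of 1 is arbitrary and used only for illustration)
imp <- randomForest::importance(model1, scale = FALSE)[, "%IncMSE"]
names(imp)[imp > 1]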
b.
simulated$duplicate1 <- simulated$V1 + rnorm(200) * .1
cor(simulated$duplicate1, simulated$V1)
## [1] 0.9418838
model2 <- randomForest(y ~ ., data = simulated, importance = TRUE, ntree = 1000)
rfImp2 <- varImp(model2, scale = FALSE)
vip(model2, color = 'blue', fill='red') + ggtitle('Model2 Var Imp')

model2 <- randomForest(y ~ .,
                       data = simulated,
                       importance = TRUE,
                       ntree = 1000)
rfImp2 <- varImp(model2, scale = FALSE)
rfImp2 |>
  arrange(desc(Overall)) |>
  knitr::kable()
| Variable   |    Overall |
|:-----------|-----------:|
| V4         |  6.9156852 |
| V2         |  6.1651556 |
| V1         |  5.8027373 |
| duplicate1 |  3.9673109 |
| V5         |  2.2239289 |
| V3         |  0.6478974 |
| V6         |  0.2058123 |
| V7         |  0.0453553 |
| V10        |  0.0182078 |
| V9         | -0.0290585 |
| V8         | -0.0448675 |
The importance score for V1 decreased substantially because the forest now splits its credit between V1 and the highly correlated duplicate1. The scores for the other informative predictors also decreased slightly.
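If yet another predictor highly correlated with V1 is added, the dilution should continue; a minimal sketch (duplicate2 is a name introduced here only for illustration):
# Add a second near-copy of V1 and refit; V1's importance is expected
# to be shared across the three correlated columns
simulated$duplicate2 <- simulated$V1 + rnorm(200) * .1
model3 <- randomForest(y ~ .,
                       data = simulated,
                       importance = TRUE,
                       ntree = 1000)
varImp(model3, scale = FALSE) |>
  arrange(desc(Overall)) |>
  knitr::kable()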
c.
cforestModel <- cforest(y ~ .,
                        data = simulated[, c(1:11)])
data.frame(Importance = varimp(cforestModel)) |>
  arrange(desc(Importance)) |>
  knitr::kable()
| Variable | Importance |
|:---------|-----------:|
| V1       |  8.2808523 |
| V4       |  7.0858387 |
| V2       |  6.0391727 |
| V5       |  2.0483184 |
| V3       |  0.3150098 |
| V7       |  0.1048612 |
| V6       | -0.1314990 |
| V10      | -0.1365436 |
| V9       | -0.2769216 |
| V8       | -0.2912119 |
data.frame(Importance = varimp(cforestModel, conditional = TRUE)) |>
  arrange(desc(Importance)) |>
  knitr::kable()
| Variable | Importance |
|:---------|-----------:|
| V1       |  6.1154264 |
| V4       |  5.9824481 |
| V2       |  5.0077664 |
| V5       |  1.3581843 |
| V3       |  0.1073499 |
| V7       | -0.0794067 |
| V6       | -0.1869483 |
| V9       | -0.2486453 |
| V8       | -0.2500343 |
| V10      | -0.2707839 |
The ordering of the informative predictors is the same (V1, V4, V2, V5, V3), although V3 receives much less importance here than in the original random forest model, and the conditional scores are uniformly smaller than the unconditional ones. V6-V10 remain unimportant.
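As a further check (a sketch, not run here), the conditional inference forest can also be fit on the data that still contains the correlated copies of V1; conditional permutation should give duplicate1 far less credit than the unconditional measure does:
# Refit cforest on the data including the correlated copies of V1
cforestDup <- cforest(y ~ ., data = simulated)
# Unconditional importance: credit is shared between V1 and duplicate1
sort(varimp(cforestDup), decreasing = TRUE)
# Conditional importance permutes each predictor within groups of
# correlated predictors, so duplicate1's score should shrink toward zero
sort(varimp(cforestDup, conditional = TRUE), decreasing = TRUE)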
d.
# boosted model
boostedModel <- gbm(y ~ .,
                    data = simulated[, c(1:11)],
                    distribution = "gaussian",
                    n.trees = 1000)
summary(boostedModel, plotit = FALSE) |>
  dplyr::select(-var) |>
  knitr::kable()
| Variable |   rel.inf |
|:---------|----------:|
| V4       | 24.036116 |
| V1       | 23.283463 |
| V2       | 20.668396 |
| V5       | 11.380157 |
| V3       |  9.707534 |
| V7       |  2.413996 |
| V6       |  2.353621 |
| V10      |  2.138098 |
| V9       |  2.096317 |
| V8       |  1.922300 |
The boosted model identifies the same informative (V1-V5) and uninformative (V6-V10) predictors. However, the pattern of importance differs slightly, with V4 having a higher relative influence than V1.
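To see whether boosting shows the same dilution observed in part (b), the boosted model can be refit on the data that still contains the correlated copy of V1; a sketch:
# Refit the boosted model including duplicate1; if boosting behaves like
# the random forest, duplicate1 will absorb part of V1's relative influence
boostedDup <- gbm(y ~ .,
                  data = simulated,
                  distribution = "gaussian",
                  n.trees = 1000)
summary(boostedDup, plotit = FALSE) |>
  knitr::kable()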
# cubist model
cubistModel <- train(y ~ .,
                     data = simulated[, c(1:11)],
                     method = "cubist")
varImp(cubistModel$finalModel, scale = FALSE) |>
  arrange(desc(Overall)) |>
  knitr::kable()
| Variable | Overall |
|:---------|--------:|
| V1       |    72.0 |
| V2       |    54.5 |
| V4       |    49.0 |
| V3       |    42.0 |
| V5       |    40.0 |
| V6       |    11.0 |
| V7       |     0.0 |
| V8       |     0.0 |
| V9       |     0.0 |
| V10      |     0.0 |
In the Cubist model, V1 is again the most important variable, but here V2 is rated above V4 and V3 above V5. V7-V10 receive zero importance, while V6 has slightly more importance than in the other models, though it still pales in comparison to the informative predictors.
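The usage-based scores that varImp reports for Cubist can be traced back to the fitted rules; a quick way to see where each predictor enters the rule conditions and the terminal linear models (output omitted; the usage table is a component of recent Cubist model objects):
# Print the fitted rules themselves
summary(cubistModel$finalModel)
# Percentage of cases in which each predictor appears in a rule
# condition versus in a terminal linear model
cubistModel$finalModel$usage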
8.2
set.seed(415)
a <- sample(1:10 / 10, 200, replace = TRUE)
b <- sample(1:100 / 100, 200, replace = TRUE)
c <- sample(1:1000 / 1000, 200, replace = TRUE)
d <- sample(1:10000 / 10000, 200, replace = TRUE)
y <- a + b + c + rnorm(200, mean=0, sd=5)
simData <- data.frame(a, b, c, d, y)
rpartTree <- rpart(y ~ ., data = simData)
plot(as.party(rpartTree))

varImp(rpartTree)
## Overall
## a 0.3805726
## b 0.9263322
## c 0.7376280
## d 0.8044435
In this simulation, the regression tree gave higher importance to the predictors with more distinct values: d, which does not enter the formula for y at all, is still ranked above a, which has only ten distinct values. In the plotted tree, the first splits likewise use the finer-grained variables rather than a, reflecting the selection bias of exhaustive-search trees toward predictors with finer granularity.
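Conditional inference trees select split variables with statistical tests and are designed to avoid this granularity bias. As a quick contrast (a sketch using partykit, already loaded above), the same data can be fit with ctree():
# ctree() uses permutation tests for split selection, so a predictor is not
# favoured simply because it has many distinct values
ctreeModel <- ctree(y ~ ., data = simData)
ctreeModel   # print the tree to see which variables, if any, are chosen
plot(ctreeModel)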
8.3
a. The model on the right has a higher bagging fraction and a higher learning rate (0.9 for both). With most of the data used in every iteration and large steps taken toward the residuals, it quickly concentrates on the few strongest predictors, so its importance is dominated by the first few variables. The model on the left, with a bagging fraction and learning rate of 0.1, takes smaller steps on different subsamples of the data, so it incorporates information from, and spreads importance across, more predictors.
b. The left-hand model, with the lower bagging fraction and learning rate, is probably more predictive of new samples because it is less likely to overfit the training data. The right-hand model is likely overfit and less able to generalize to new data.
c. Increasing the interaction depth allows each tree to use more predictors, so it is likely to spread the variable importance over more of them, raising the importance of some of the less dominant variables.
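These effects can be illustrated on the simulated Friedman data from 8.1 by fitting two boosted models that mimic the two settings in the figure and comparing their relative influence; a sketch (n.minobsinnode is lowered because the simulated set is small, and the figure in the exercise uses the solubility data, so the numbers will differ):
# Left-hand settings: bagging fraction and learning rate of 0.1
set.seed(100)
gbmLeft <- gbm(y ~ ., data = simulated[, 1:11], distribution = "gaussian",
               n.trees = 1000, shrinkage = 0.1, bag.fraction = 0.1,
               interaction.depth = 2, n.minobsinnode = 5)
# Right-hand settings: bagging fraction and learning rate of 0.9
set.seed(100)
gbmRight <- gbm(y ~ ., data = simulated[, 1:11], distribution = "gaussian",
                n.trees = 1000, shrinkage = 0.9, bag.fraction = 0.9,
                interaction.depth = 2, n.minobsinnode = 5)
# The 0.9/0.9 fit should concentrate its relative influence on fewer
# predictors than the 0.1/0.1 fit
summary(gbmLeft, plotit = FALSE)
summary(gbmRight, plotit = FALSE)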
8.7
library(AppliedPredictiveModeling)
data(ChemicalManufacturingProcess)
cmp_predictors = as.matrix(ChemicalManufacturingProcess[,2:58])
cmp_yield = ChemicalManufacturingProcess[,1]
set.seed(100)
train_select <- createDataPartition(cmp_yield, p=0.75, list=F) #create train set
train_x <- ChemicalManufacturingProcess[train_select,-1]
train_y <- ChemicalManufacturingProcess[train_select,1]
test_x <- ChemicalManufacturingProcess[-train_select,-1]
test_y <- ChemicalManufacturingProcess[-train_select,1]
pre_process <- c("nzv", "corr", "center","scale", "medianImpute")
set.seed(200)
ctrl <- trainControl(method = "boot", number = 25)
a.
set.seed(123)
rpartGrid <- expand.grid(maxdepth= seq(1,10,by=1))
rp_model <- train(x = train_x, y = train_y,
                  method = "rpart2", metric = "Rsquared",
                  tuneGrid = rpartGrid,
                  trControl = ctrl, preProcess = pre_process)
set.seed(415)
rfGrid <- expand.grid(mtry=seq(2,38,by=3))
rf_model <- train(x = train_x, y = train_y,
                  method = "rf", tuneGrid = rfGrid,
                  metric = "Rsquared", importance = TRUE,
                  trControl = ctrl, preProcess = pre_process)
set.seed(123)
gbmGrid <- expand.grid(interaction.depth = seq(1, 6, by = 1),
                       n.trees = c(25, 50, 100, 200),
                       shrinkage = c(0.01, 0.05, 0.1, 0.2),
                       n.minobsinnode = 5)
gb_model <- train(x = train_x, y = train_y,
                  method = "gbm", metric = "Rsquared", verbose = FALSE,
                  tuneGrid = gbmGrid, trControl = ctrl, preProcess = pre_process)
set.seed(300)
cubistGrid <- expand.grid(committees = c(1, 5, 10, 20, 50, 100),
                          neighbors = c(0, 1, 3, 5, 7))
cubist_model <- train(x = train_x, y = train_y,
                      method = "cubist", verbose = FALSE,
                      metric = "Rsquared", tuneGrid = cubistGrid,
                      trControl = ctrl, preProcess = pre_process)
rp_model
## CART
##
## 132 samples
## 57 predictor
##
## Pre-processing: centered (47), scaled (47), median imputation (47), remove (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 132, 132, 132, 132, 132, 132, ...
## Resampling results across tuning parameters:
##
## maxdepth RMSE Rsquared MAE
## 1 1.690196 0.2453857 1.329880
## 2 1.639331 0.2983005 1.291676
## 3 1.629179 0.3219788 1.297111
## 4 1.616411 0.3402401 1.274203
## 5 1.611857 0.3522453 1.259721
## 6 1.604643 0.3692173 1.247175
## 7 1.608699 0.3672581 1.250057
## 8 1.612859 0.3683148 1.250426
## 9 1.609667 0.3745211 1.234120
## 10 1.612526 0.3710538 1.234737
##
## Rsquared was used to select the optimal model using the largest value.
## The final value used for the model was maxdepth = 9.
rf_model
## Random Forest
##
## 132 samples
## 57 predictor
##
## Pre-processing: centered (47), scaled (47), median imputation (47), remove (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 132, 132, 132, 132, 132, 132, ...
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared MAE
## 2 1.303028 0.6100732 1.0497760
## 5 1.225974 0.6392512 0.9847810
## 8 1.204323 0.6438640 0.9632693
## 11 1.188722 0.6490856 0.9506980
## 14 1.183323 0.6471495 0.9447251
## 17 1.182808 0.6443045 0.9414697
## 20 1.179384 0.6423053 0.9383393
## 23 1.180337 0.6388259 0.9405971
## 26 1.177250 0.6389037 0.9354435
## 29 1.180567 0.6340534 0.9387843
## 32 1.181884 0.6326042 0.9366325
## 35 1.182666 0.6284457 0.9380844
## 38 1.186828 0.6253370 0.9408241
##
## Rsquared was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 11.
gb_model
## Stochastic Gradient Boosting
##
## 132 samples
## 57 predictor
##
## Pre-processing: centered (47), scaled (47), median imputation (47), remove (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 132, 132, 132, 132, 132, 132, ...
## Resampling results across tuning parameters:
##
## shrinkage interaction.depth n.trees RMSE Rsquared MAE
## 0.01 1 25 1.778820 0.4250029 1.4382599
## 0.01 1 50 1.689642 0.4503775 1.3641241
## 0.01 1 100 1.560112 0.4870382 1.2613075
## 0.01 1 200 1.415944 0.5227382 1.1484883
## 0.01 2 25 1.743751 0.4721129 1.4123781
## 0.01 2 50 1.632455 0.4927882 1.3236075
## 0.01 2 100 1.479777 0.5252021 1.2011062
## 0.01 2 200 1.346657 0.5470629 1.0865569
## 0.01 3 25 1.728590 0.4890706 1.4005543
## 0.01 3 50 1.605972 0.5116278 1.3042885
## 0.01 3 100 1.447419 0.5399252 1.1757286
## 0.01 3 200 1.318261 0.5606073 1.0605653
## 0.01 4 25 1.715755 0.5168298 1.3915697
## 0.01 4 50 1.587248 0.5351980 1.2902573
## 0.01 4 100 1.428679 0.5529776 1.1610599
## 0.01 4 200 1.298732 0.5728234 1.0415164
## 0.01 5 25 1.715498 0.5254055 1.3918021
## 0.01 5 50 1.583384 0.5374533 1.2866235
## 0.01 5 100 1.418352 0.5566165 1.1502691
## 0.01 5 200 1.291630 0.5763051 1.0334618
## 0.01 6 25 1.710393 0.5195480 1.3875657
## 0.01 6 50 1.577584 0.5388127 1.2822880
## 0.01 6 100 1.417650 0.5548769 1.1515401
## 0.01 6 200 1.289347 0.5775145 1.0309699
## 0.05 1 25 1.512849 0.4866767 1.2219441
## 0.05 1 50 1.379024 0.5220269 1.1103620
## 0.05 1 100 1.307806 0.5410636 1.0389930
## 0.05 1 200 1.286970 0.5502744 1.0107437
## 0.05 2 25 1.435614 0.5206687 1.1660179
## 0.05 2 50 1.316195 0.5488065 1.0578949
## 0.05 2 100 1.280778 0.5561018 1.0105517
## 0.05 2 200 1.254143 0.5718693 0.9823317
## 0.05 3 25 1.398439 0.5348659 1.1345059
## 0.05 3 50 1.288210 0.5640321 1.0272896
## 0.05 3 100 1.249442 0.5784970 0.9806560
## 0.05 3 200 1.225145 0.5905359 0.9544796
## 0.05 4 25 1.387086 0.5422217 1.1257788
## 0.05 4 50 1.280060 0.5674988 1.0216455
## 0.05 4 100 1.237861 0.5852009 0.9729893
## 0.05 4 200 1.218023 0.5963110 0.9560371
## 0.05 5 25 1.368383 0.5485958 1.1072082
## 0.05 5 50 1.266979 0.5755483 1.0048438
## 0.05 5 100 1.228518 0.5902988 0.9619375
## 0.05 5 200 1.215467 0.5966888 0.9491315
## 0.05 6 25 1.362614 0.5539193 1.0964223
## 0.05 6 50 1.264381 0.5761423 1.0019225
## 0.05 6 100 1.225261 0.5929322 0.9624200
## 0.05 6 200 1.211567 0.6001751 0.9493778
## 0.10 1 25 1.366093 0.5307393 1.1041488
## 0.10 1 50 1.297141 0.5489580 1.0324421
## 0.10 1 100 1.287503 0.5520652 1.0138624
## 0.10 1 200 1.284574 0.5536828 1.0102073
## 0.10 2 25 1.331845 0.5363735 1.0677348
## 0.10 2 50 1.293520 0.5482392 1.0205492
## 0.10 2 100 1.272767 0.5597542 0.9948234
## 0.10 2 200 1.259311 0.5681329 0.9873656
## 0.10 3 25 1.319656 0.5362327 1.0393035
## 0.10 3 50 1.284653 0.5514494 0.9997362
## 0.10 3 100 1.255961 0.5699942 0.9737487
## 0.10 3 200 1.245709 0.5758247 0.9642348
## 0.10 4 25 1.294238 0.5529217 1.0343880
## 0.10 4 50 1.259429 0.5674862 0.9905934
## 0.10 4 100 1.243242 0.5759588 0.9716459
## 0.10 4 200 1.231833 0.5825864 0.9621759
## 0.10 5 25 1.302823 0.5474943 1.0307887
## 0.10 5 50 1.261601 0.5665345 0.9882995
## 0.10 5 100 1.246789 0.5748364 0.9730788
## 0.10 5 200 1.236241 0.5811695 0.9619466
## 0.10 6 25 1.297187 0.5486155 1.0266988
## 0.10 6 50 1.251588 0.5728344 0.9820789
## 0.10 6 100 1.229233 0.5861358 0.9612253
## 0.10 6 200 1.220481 0.5915938 0.9553760
## 0.20 1 25 1.346969 0.5116236 1.0733407
## 0.20 1 50 1.331752 0.5205984 1.0472223
## 0.20 1 100 1.327409 0.5248659 1.0441472
## 0.20 1 200 1.326649 0.5279554 1.0375076
## 0.20 2 25 1.310089 0.5367669 1.0408923
## 0.20 2 50 1.283335 0.5546175 1.0127880
## 0.20 2 100 1.289921 0.5541846 1.0090610
## 0.20 2 200 1.287574 0.5555275 1.0038774
## 0.20 3 25 1.332570 0.5106666 1.0537657
## 0.20 3 50 1.308465 0.5291638 1.0296329
## 0.20 3 100 1.299932 0.5360254 1.0206149
## 0.20 3 200 1.297973 0.5375716 1.0194229
## 0.20 4 25 1.311812 0.5307552 1.0306374
## 0.20 4 50 1.293451 0.5462173 1.0171186
## 0.20 4 100 1.290003 0.5504031 1.0140327
## 0.20 4 200 1.288437 0.5514447 1.0142673
## 0.20 5 25 1.273227 0.5571838 1.0091540
## 0.20 5 50 1.265032 0.5649241 0.9950279
## 0.20 5 100 1.260980 0.5675297 0.9903326
## 0.20 5 200 1.258175 0.5695635 0.9874677
## 0.20 6 25 1.290332 0.5458426 1.0036063
## 0.20 6 50 1.273412 0.5574859 0.9838748
## 0.20 6 100 1.268224 0.5614962 0.9783521
## 0.20 6 200 1.267447 0.5624443 0.9779844
##
## Tuning parameter 'n.minobsinnode' was held constant at a value of 5
## Rsquared was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 200, interaction.depth =
## 6, shrinkage = 0.05 and n.minobsinnode = 5.
cubist_model
## Cubist
##
## 132 samples
## 57 predictor
##
## Pre-processing: centered (47), scaled (47), median imputation (47), remove (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 132, 132, 132, 132, 132, 132, ...
## Resampling results across tuning parameters:
##
## committees neighbors RMSE Rsquared MAE
## 1 0 1.980957 0.3104154 1.4101285
## 1 1 1.972143 0.3307948 1.3835795
## 1 3 1.957554 0.3271925 1.3832304
## 1 5 1.964989 0.3220523 1.3917770
## 1 7 1.974917 0.3178011 1.4003046
## 5 0 1.340676 0.5212775 1.0285352
## 5 1 1.301089 0.5527658 0.9729775
## 5 3 1.310632 0.5430352 0.9918837
## 5 5 1.318788 0.5364905 1.0012430
## 5 7 1.326573 0.5312051 1.0093421
## 10 0 1.235722 0.5720334 0.9579426
## 10 1 1.184477 0.6089357 0.8953398
## 10 3 1.204713 0.5928486 0.9193069
## 10 5 1.212072 0.5873299 0.9310488
## 10 7 1.220169 0.5823344 0.9408028
## 20 0 1.185611 0.5989890 0.9304017
## 20 1 1.129469 0.6357279 0.8608391
## 20 3 1.148486 0.6218788 0.8825683
## 20 5 1.157873 0.6154258 0.8972738
## 20 7 1.167402 0.6098351 0.9091194
## 50 0 1.154450 0.6198562 0.8990872
## 50 1 1.095138 0.6565156 0.8244996
## 50 3 1.116165 0.6423499 0.8525089
## 50 5 1.127548 0.6352506 0.8671864
## 50 7 1.137879 0.6293057 0.8780492
## 100 0 1.143115 0.6281723 0.8903013
## 100 1 1.079676 0.6666712 0.8100236
## 100 3 1.103333 0.6515510 0.8425327
## 100 5 1.115153 0.6441812 0.8573995
## 100 7 1.124801 0.6387006 0.8676343
##
## Rsquared was used to select the optimal model using the largest value.
## The final values used for the model were committees = 100 and neighbors = 1.
Based on the resampling results above, the Cubist model (committees = 100, neighbors = 1) gives the best performance (RMSE of about 1.08, R-squared of about 0.667), with the tuned random forest (mtry = 11, R-squared of about 0.649) close behind; a side-by-side resampling and test-set comparison is sketched below.
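Since the four models were tuned with the same trainControl object, their resampling distributions can be compared directly, and the held-out test set gives the test-set check; a sketch (the different set.seed() calls above mean the bootstrap samples are not identical across models, so the resampling comparison is approximate):
# Compare bootstrap resampling performance across the tuned models
resamps <- resamples(list(CART = rp_model,
                          RF = rf_model,
                          GBM = gb_model,
                          Cubist = cubist_model))
summary(resamps)
# Test-set performance; na.action = na.pass lets the median-imputation
# preprocessing handle the missing predictor values in test_x
rbind(CART   = postResample(predict(rp_model, test_x, na.action = na.pass), test_y),
      RF     = postResample(predict(rf_model, test_x, na.action = na.pass), test_y),
      GBM    = postResample(predict(gb_model, test_x, na.action = na.pass), test_y),
      Cubist = postResample(predict(cubist_model, test_x, na.action = na.pass), test_y))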
b.
## Cubist model:
cubist_imp <- varImp(cubist_model, scale = FALSE)
plot(cubist_imp, top=15, scales = list(y = list(cex = 0.8)))

c.
plot(as.party(rp_model$finalModel),gp=gpar(fontsize=10))
