1 Exercise 8.1

# Simulate 200 observations from mlbench.friedman1() with noise sd = 1, then
# bind the predictor matrix and the response into a single data frame whose
# last column is named "y".
set.seed(200)
simulated <- mlbench.friedman1(n = 200, sd = 1)
simulated <- as.data.frame(cbind(simulated[["x"]], simulated[["y"]]))
names(simulated)[length(simulated)] <- "y"

1.1 8.1(a)

# Baseline random forest (1000 trees) on the simulated predictors, with
# variable importance computed as part of the fit.
set.seed(200)
model1 <- randomForest(y ~ ., data = simulated, ntree = 1000, importance = TRUE)

# Unscaled importance scores, shown for inspection.
rfImp1 <- varImp(object = model1, scale = FALSE)
rfImp1
##          Overall
## V1   8.811388491
## V2   6.685318725
## V3   0.629866472
## V4   7.703182937
## V5   2.149608737
## V6   0.093706774
## V7   0.003535298
## V8  -0.124011241
## V9  -0.007978776
## V10 -0.026657627
# Dot chart of the unscaled importances, largest at the top. rfImp1 prints as
# a plain one-column data frame, so plot() would dispatch to plot.data.frame()
# and draw an unlabeled strip chart (and `top` is not an argument it knows);
# dotchart() keeps the predictor labels.
local({
  imp <- rfImp1[order(rfImp1[, "Overall"]), , drop = FALSE]
  imp <- tail(imp, 10) # at most the 10 largest, as `top = 10` intended
  dotchart(imp[, "Overall"], labels = rownames(imp), xlab = "Importance")
})

The Friedman simulation uses V1 through V5 as informative predictors. Predictors V6 through V10 are noise variables.

# Importance scores for the predictors named V6 through V10 only.
rfImp1[is.element(rownames(rfImp1), sprintf("V%d", 6:10)), , drop = FALSE]
##          Overall
## V6   0.093706774
## V7   0.003535298
## V8  -0.124011241
## V9  -0.007978776
## V10 -0.026657627

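The random forest may assign some nonzero importance to V6–V10, but their importance should generally be much smaller than the informative variables. In this run the importances of V6–V10 all lie within about ±0.13 of zero, compared with roughly 2.1–8.8 for V1, V2, V4, and V5, so the model did not make meaningful use of the noise predictors.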

1.2 8.1(b)

# Add a noisy copy of V1 (standard normal noise scaled by 0.1) and report its
# correlation with the original predictor.
set.seed(200)
simulated[["duplicate1"]] <- simulated[["V1"]] + 0.1 * rnorm(200)
with(simulated, cor(duplicate1, V1))
## [1] 0.9497025
# Refit the random forest now that `simulated` also contains duplicate1, using
# the same seed and settings as the baseline fit.
set.seed(200)
model2 <- randomForest(y ~ ., data = simulated, ntree = 1000, importance = TRUE)

# Unscaled importance scores for the refit.
rfImp2 <- varImp(object = model2, scale = FALSE)
rfImp2
##                Overall
## V1          5.91964260
## V2          6.16818815
## V3          0.73116589
## V4          6.99550947
## V5          2.28421876
## V6          0.19857217
## V7         -0.01400175
## V8         -0.02834702
## V9          0.08851319
## V10         0.01288275
## duplicate1  4.12004820
# Dot chart of the unscaled importances, largest at the top. rfImp2 prints as
# a plain one-column data frame, so plot() would dispatch to plot.data.frame()
# and draw an unlabeled strip chart (and `top` is not an argument it knows);
# dotchart() keeps the predictor labels.
local({
  imp <- rfImp2[order(rfImp2[, "Overall"]), , drop = FALSE]
  imp <- tail(imp, 15) # at most the 15 largest, as `top = 15` intended
  dotchart(imp[, "Overall"], labels = rownames(imp), xlab = "Importance")
})

Compare V1 before and after adding the correlated duplicate.

# V1's importance in the baseline fit ...
rfImp1[rownames(rfImp1) == "V1", , drop = FALSE]
##     Overall
## V1 8.811388
# ... versus V1 and its noisy copy after duplicate1 was added.
rfImp2[match(c("V1", "duplicate1"), rownames(rfImp2)), , drop = FALSE]
##             Overall
## V1         5.919643
## duplicate1 4.120048

Now add another correlated predictor.

# Add a second noisy copy of V1. The seed must differ from the one used for
# duplicate1: re-seeding with 200 replays exactly the same rnorm() draws, which
# makes duplicate2 an exact copy of duplicate1 (correlation 1) instead of an
# independent near-duplicate of V1.
# NOTE: the printed results that follow in this document were generated with
# the identical copy and must be regenerated after this change.
set.seed(201)
simulated$duplicate2 <- simulated$V1 + rnorm(200) * 0.1
# Pairwise correlations among V1 and its two noisy copies.
cor(simulated[, c("V1", "duplicate1", "duplicate2")])
##                   V1 duplicate1 duplicate2
## V1         1.0000000  0.9497025  0.9497025
## duplicate1 0.9497025  1.0000000  1.0000000
## duplicate2 0.9497025  1.0000000  1.0000000
# Refit the random forest with both noisy copies of V1 present, again using
# the same seed and settings.
set.seed(200)
model3 <- randomForest(y ~ ., data = simulated, ntree = 1000, importance = TRUE)

# Unscaled importances; only V1 and its two copies are displayed.
rfImp3 <- varImp(object = model3, scale = FALSE)
rfImp3[match(c("V1", "duplicate1", "duplicate2"), rownames(rfImp3)), , drop = FALSE]
##             Overall
## V1         4.684417
## duplicate1 2.916935
## duplicate2 3.261472
# Dot chart of the unscaled importances, largest at the top. rfImp3 comes from
# varImp() on a randomForest fit and is indexed like a plain data frame, so
# plot() would dispatch to plot.data.frame() and draw an unlabeled strip chart
# (and `top` is not an argument it knows); dotchart() keeps the labels.
local({
  imp <- rfImp3[order(rfImp3[, "Overall"]), , drop = FALSE]
  imp <- tail(imp, 15) # at most the 15 largest, as `top = 15` intended
  dotchart(imp[, "Overall"], labels = rownames(imp), xlab = "Importance")
})

When correlated copies of V1 are added, the importance for V1 is usually diluted across the correlated predictors.

1.3 8.1(c)

# Fit a cforest() model (1000 trees, mtry = 3) on the data that now includes
# both noisy copies of V1.
set.seed(200)
cf_model <- cforest(
  y ~ .,
  data = simulated,
  controls = cforest_unbiased(mtry = 3, ntree = 1000)
)

# Importance from the same forest, computed without and then with the
# `conditional` option. The call order is kept as in the original.
cf_imp_uncond <- varimp(cf_model, conditional = FALSE)
cf_imp_cond <- varimp(cf_model, conditional = TRUE)

# Unconditional importances, largest first.
sort(cf_imp_uncond, decreasing = TRUE)
##            V4            V2            V1    duplicate1    duplicate2 
##  5.4644815853  4.8052254155  4.8018033781  2.6695382536  2.0436920369 
##            V5            V3            V9            V6            V7 
##  1.6010809886  0.0969794928 -0.0004679513 -0.0173316635 -0.0196927623 
##           V10            V8 
## -0.0450997894 -0.0557297157
# Conditional importances, largest first.
sort(cf_imp_cond, decreasing = TRUE)
##            V4            V2            V1    duplicate1            V5 
##  4.4226711685  3.5666822131  1.6777418311  1.1140625358  1.0434479741 
##    duplicate2            V3            V9            V6            V8 
##  0.7704197225  0.0658172633  0.0072719954  0.0001442224 -0.0052747060 
##           V10            V7 
## -0.0217284300 -0.0387706219
# One row per predictor, holding both importance measures side by side. Both
# vectors come from the same forest, so they are paired by position.
cf_imp_df <- data.frame(
  Predictor = names(cf_imp_uncond),
  Unconditional = as.numeric(cf_imp_uncond),
  Conditional = as.numeric(cf_imp_cond)
)

# Stack the two measure columns into long format: `Type` records which measure
# a row belongs to and `Importance` holds its value.
cf_imp_long <- reshape(
  cf_imp_df,
  direction = "long",
  varying = c("Unconditional", "Conditional"),
  times = c("Unconditional", "Conditional"),
  timevar = "Type",
  v.names = "Importance"
)

# Horizontal bar chart with one panel per importance type.
ggplot(cf_imp_long, aes(x = reorder(Predictor, Importance), y = Importance)) +
  geom_col() +
  coord_flip() +
  facet_wrap(~ Type, scales = "free_x") +
  xlab("Predictor") +
  ylab("Importance")

The conditional importance measure should reduce the tendency to spread importance among correlated predictors.

1.4 8.1(d)

# Tune a boosted tree model (method "gbm") through train().
# NOTE(review): `ctrl` is not defined in the visible part of this file. The
# printed summary reports 10-fold cross-validation repeated 3 times, so it is
# presumably a matching trainControl() object created earlier -- confirm.
set.seed(200)
gbm_fit <- train(
  y ~ .,
  data = simulated,
  method = "gbm",
  verbose = FALSE,
  trControl = ctrl
)

# Show the resampling results and the selected tuning values.
gbm_fit
## Stochastic Gradient Boosting 
## 
## 200 samples
##  12 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ... 
## Resampling results across tuning parameters:
## 
##   interaction.depth  n.trees  RMSE      Rsquared   MAE     
##   1                   50      2.521559  0.7967193  2.062196
##   1                  100      1.919345  0.8611503  1.536884
##   1                  150      1.779234  0.8729550  1.419779
##   2                   50      2.048086  0.8517974  1.657750
##   2                  100      1.746686  0.8772150  1.398428
##   2                  150      1.713835  0.8803533  1.368514
##   3                   50      1.946691  0.8565902  1.574418
##   3                  100      1.766196  0.8741471  1.416428
##   3                  150      1.741639  0.8770742  1.393186
## 
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
## 
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were n.trees = 150, interaction.depth =
##  2, shrinkage = 0.1 and n.minobsinnode = 10.
# Unscaled variable importance for the tuned gbm model.
gbm_imp <- varImp(object = gbm_fit, scale = FALSE)
gbm_imp
## gbm variable importance
## 
##            Overall
## V4         4425.03
## V2         3582.19
## V1         3394.46
## V5         1895.16
## V3         1327.24
## duplicate1  975.80
## V6          153.73
## V7          152.88
## V8          122.11
## V10         102.40
## V9           46.75
## duplicate2    0.00
# Plot the gbm importances. The model has 12 predictors, so `top` is capped at
# the number of rows in the importance table instead of requesting 15.
plot(gbm_imp, top = min(15, nrow(gbm_imp$importance)))

# Tune a Cubist model through train(), with the same seed as the gbm fit.
# NOTE(review): `ctrl` is not defined in the visible part of this file. The
# printed summary reports 10-fold cross-validation repeated 3 times, so it is
# presumably a matching trainControl() object created earlier -- confirm.
set.seed(200)
cubist_fit <- train(y ~ ., data = simulated, method = "cubist", trControl = ctrl)

# Show the resampling results and the selected tuning values.
cubist_fit
## Cubist 
## 
## 200 samples
##  12 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ... 
## Resampling results across tuning parameters:
## 
##   committees  neighbors  RMSE      Rsquared   MAE     
##    1          0          2.361002  0.7755099  1.925464
##    1          5          2.172225  0.8068897  1.763131
##    1          9          2.125867  0.8146543  1.744352
##   10          0          1.856919  0.8531682  1.461076
##   10          5          1.828996  0.8563161  1.446206
##   10          9          1.786864  0.8636357  1.407301
##   20          0          1.811890  0.8576760  1.413145
##   20          5          1.791767  0.8612386  1.420132
##   20          9          1.751824  0.8678259  1.382664
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were committees = 20 and neighbors = 9.
# Unscaled variable importance for the tuned Cubist model.
cubist_imp <- varImp(object = cubist_fit, scale = FALSE)
cubist_imp
## cubist variable importance
## 
##            Overall
## V1            72.0
## V2            57.0
## V4            49.0
## V3            42.0
## V5            37.5
## duplicate1     0.0
## V9             0.0
## V10            0.0
## V6             0.0
## duplicate2     0.0
## V7             0.0
## V8             0.0
# Plot the Cubist importances. The model has 12 predictors, so `top` is capped
# at the number of rows in the importance table instead of requesting 15.
plot(cubist_imp, top = min(15, nrow(cubist_imp$importance)))

Tree and rule-based models often show reduced or shared importance for highly correlated predictors. The exact pattern depends on the model and its variable selection mechanism.

2 Exercise 8.2

Use a simulation to show tree bias with different granularities.

# Simulated data in which the response is independent of every predictor.
# The predictors differ only in how many distinct values they can take:
# continuous, 2 levels, 5 levels, and 20 levels.
# Columns are generated in this fixed order so the random stream is consumed
# the same way on every run.
n <- 500

set.seed(200)
granularity_data <- data.frame(
  y = rnorm(n = n),
  x_continuous = runif(n = n),
  x_binary = factor(sample(LETTERS[1:2], size = n, replace = TRUE)),
  x_5_levels = factor(sample(letters[seq_len(5)], size = n, replace = TRUE)),
  x_20_levels = factor(sample(letters[seq_len(20)], size = n, replace = TRUE))
)

# Random forest (1000 trees) fitted to the pure-noise response, with variable
# importance computed as part of the fit.
set.seed(200)
tree_bias_rf <- randomForest(
  y ~ .,
  data = granularity_data,
  ntree = 1000,
  importance = TRUE
)

# Unscaled importance per predictor, shown for inspection.
tree_bias_imp <- varImp(object = tree_bias_rf, scale = FALSE)
tree_bias_imp
##                    Overall
## x_continuous  0.0024660292
## x_binary     -0.0007940447
## x_5_levels   -0.0059307815
## x_20_levels   0.0149293181
# Dot chart of the unscaled importances, largest at the top. tree_bias_imp
# prints as a plain one-column data frame, so plot() would dispatch to
# plot.data.frame() and draw an unlabeled strip chart; dotchart() keeps the
# predictor labels.
local({
  imp <- tree_bias_imp[order(tree_bias_imp[, "Overall"]), , drop = FALSE]
  dotchart(imp[, "Overall"], labels = rownames(imp), xlab = "Importance")
})

Because the response is pure noise, no predictor should truly be important. If predictors with more possible split points receive higher importance, this demonstrates selection bias in tree-based models.

3 Exercise 8.3

# Load the solubility data set and give its pieces local names.
data(solubility)

sol_train_y <- solTrainY
sol_test_y <- solTestY
sol_train <- solTrainXtrans
sol_test <- solTestXtrans

# Modelling frames: the predictors plus the outcome in a `Solubility` column.
sol_test_dat <- data.frame(sol_test, Solubility = sol_test_y)
sol_train_dat <- data.frame(sol_train, Solubility = sol_train_y)

3.1 8.3(a)

# Two boosted models that differ only in shrinkage and bag.fraction: both set
# to 0.1 for gbm_low and both set to 0.9 for gbm_high. Seed, tree count, and
# interaction depth are the same in both fits.
set.seed(200)
gbm_low <- gbm(
  Solubility ~ .,
  data = sol_train_dat,
  distribution = "gaussian",
  shrinkage = 0.1,
  bag.fraction = 0.1,
  interaction.depth = 1,
  n.trees = 1000,
  verbose = FALSE
)

set.seed(200)
gbm_high <- gbm(
  Solubility ~ .,
  data = sol_train_dat,
  distribution = "gaussian",
  shrinkage = 0.9,
  bag.fraction = 0.9,
  interaction.depth = 1,
  n.trees = 1000,
  verbose = FALSE
)

# Relative influence of every predictor in the low-parameter model.
summary(gbm_low, n.trees = 1000)

##                                 var     rel.inf
## NumCarbon                 NumCarbon 13.93712745
## MolWeight                 MolWeight 13.11877326
## SurfaceArea1           SurfaceArea1  7.76088947
## HydrophilicFactor HydrophilicFactor  6.21647314
## NumMultBonds           NumMultBonds  3.95822674
## SurfaceArea2           SurfaceArea2  3.75925457
## NumHalogen               NumHalogen  2.65202683
## NumHydrogen             NumHydrogen  2.27676649
## NumAtoms                   NumAtoms  2.20575121
## NumNonHBonds           NumNonHBonds  1.98488391
## NumAromaticBonds   NumAromaticBonds  1.66745519
## NumBonds                   NumBonds  1.57351335
## FP072                         FP072  1.45716309
## NumRotBonds             NumRotBonds  1.19725153
## NumNonHAtoms           NumNonHAtoms  1.15490488
## NumOxygen                 NumOxygen  1.08009337
## NumRings                   NumRings  0.89036907
## FP172                         FP172  0.85656466
## NumChlorine             NumChlorine  0.83353495
## FP147                         FP147  0.74561247
## FP111                         FP111  0.73663576
## FP117                         FP117  0.69396724
## FP135                         FP135  0.68336326
## FP206                         FP206  0.63290522
## NumDblBonds             NumDblBonds  0.59773651
## FP173                         FP173  0.57480897
## FP075                         FP075  0.55331699
## FP116                         FP116  0.53712904
## FP127                         FP127  0.52459733
## FP084                         FP084  0.49504064
## FP176                         FP176  0.48556568
## FP140                         FP140  0.48299012
## FP131                         FP131  0.48152348
## FP137                         FP137  0.45122465
## FP074                         FP074  0.44881206
## FP012                         FP012  0.42033889
## FP015                         FP015  0.40593666
## FP057                         FP057  0.40096593
## FP128                         FP128  0.39781485
## FP085                         FP085  0.38949878
## FP104                         FP104  0.36743393
## FP010                         FP010  0.36440198
## FP202                         FP202  0.35848484
## FP148                         FP148  0.35461691
## FP114                         FP114  0.34955791
## FP122                         FP122  0.34580218
## FP124                         FP124  0.33899609
## FP101                         FP101  0.32568672
## FP121                         FP121  0.31657031
## FP099                         FP099  0.31048165
## FP065                         FP065  0.30316936
## FP126                         FP126  0.29472954
## FP125                         FP125  0.27707571
## FP171                         FP171  0.27299398
## FP169                         FP169  0.27097675
## FP064                         FP064  0.26192829
## FP088                         FP088  0.25952478
## FP134                         FP134  0.25720092
## FP080                         FP080  0.25552512
## FP142                         FP142  0.25488063
## FP141                         FP141  0.24167781
## FP106                         FP106  0.23963092
## FP003                         FP003  0.23791942
## FP059                         FP059  0.23611611
## FP009                         FP009  0.23380979
## FP076                         FP076  0.22945730
## FP096                         FP096  0.22287157
## FP181                         FP181  0.22030606
## FP097                         FP097  0.21692882
## FP047                         FP047  0.21660297
## FP024                         FP024  0.20759485
## FP025                         FP025  0.20620851
## FP027                         FP027  0.20616460
## FP115                         FP115  0.20600075
## FP011                         FP011  0.20595459
## FP058                         FP058  0.20508356
## FP016                         FP016  0.20047291
## FP149                         FP149  0.19771292
## FP032                         FP032  0.19717914
## FP020                         FP020  0.19206050
## FP136                         FP136  0.18954926
## FP019                         FP019  0.18927442
## FP050                         FP050  0.18867145
## FP078                         FP078  0.18644216
## FP095                         FP095  0.18221457
## NumNitrogen             NumNitrogen  0.18066322
## FP120                         FP120  0.18008097
## FP028                         FP028  0.17969370
## FP081                         FP081  0.17938197
## FP017                         FP017  0.17836789
## FP069                         FP069  0.17699420
## FP133                         FP133  0.17581059
## FP067                         FP067  0.17194134
## FP043                         FP043  0.17171185
## FP130                         FP130  0.16779522
## FP053                         FP053  0.16477687
## FP163                         FP163  0.16070131
## FP107                         FP107  0.16030775
## FP060                         FP060  0.15374506
## FP205                         FP205  0.15340808
## FP174                         FP174  0.14886719
## FP094                         FP094  0.14691093
## FP008                         FP008  0.14121659
## FP177                         FP177  0.14004336
## FP071                         FP071  0.13916372
## FP002                         FP002  0.13890671
## FP161                         FP161  0.13569269
## FP145                         FP145  0.13497210
## FP123                         FP123  0.13014147
## FP092                         FP092  0.12919509
## FP152                         FP152  0.12641089
## FP013                         FP013  0.12335704
## FP162                         FP162  0.12037325
## FP021                         FP021  0.11285084
## FP110                         FP110  0.11264701
## FP165                         FP165  0.11126241
## FP073                         FP073  0.10959383
## FP138                         FP138  0.10811672
## FP030                         FP030  0.10787433
## FP129                         FP129  0.10601895
## FP112                         FP112  0.10600049
## FP090                         FP090  0.10305890
## FP018                         FP018  0.10126798
## FP109                         FP109  0.09876991
## FP100                         FP100  0.09821082
## FP034                         FP034  0.09798865
## FP007                         FP007  0.09776562
## FP103                         FP103  0.09679838
## FP070                         FP070  0.09678373
## FP175                         FP175  0.09493858
## FP113                         FP113  0.09186634
## FP167                         FP167  0.09059331
## FP153                         FP153  0.08968578
## FP004                         FP004  0.08789331
## FP039                         FP039  0.08779429
## FP023                         FP023  0.08681067
## FP048                         FP048  0.08377994
## FP184                         FP184  0.08372391
## FP179                         FP179  0.08199564
## FP006                         FP006  0.08186916
## FP188                         FP188  0.08129637
## FP164                         FP164  0.07991522
## FP178                         FP178  0.07983419
## FP180                         FP180  0.07653530
## FP170                         FP170  0.07561430
## FP049                         FP049  0.07439950
## FP063                         FP063  0.07417870
## FP204                         FP204  0.07324119
## FP056                         FP056  0.06429984
## FP026                         FP026  0.06426299
## FP066                         FP066  0.06334936
## FP052                         FP052  0.06305490
## FP166                         FP166  0.06021700
## FP150                         FP150  0.05931271
## FP051                         FP051  0.05920356
## FP203                         FP203  0.05882201
## FP093                         FP093  0.05789723
## FP195                         FP195  0.05614594
## FP087                         FP087  0.05416911
## FP118                         FP118  0.05374231
## FP182                         FP182  0.05182214
## FP083                         FP083  0.04706158
## FP189                         FP189  0.04619662
## FP041                         FP041  0.04521463
## FP098                         FP098  0.04318721
## FP185                         FP185  0.04203308
## FP157                         FP157  0.04090116
## FP091                         FP091  0.03879182
## FP046                         FP046  0.03872224
## FP079                         FP079  0.03854426
## FP035                         FP035  0.03602132
## FP105                         FP105  0.03233533
## FP108                         FP108  0.03160314
## FP014                         FP014  0.03129058
## FP029                         FP029  0.03024988
## FP068                         FP068  0.02319246
## FP001                         FP001  0.00000000
## FP005                         FP005  0.00000000
## FP022                         FP022  0.00000000
## FP031                         FP031  0.00000000
## FP033                         FP033  0.00000000
## FP036                         FP036  0.00000000
## FP037                         FP037  0.00000000
## FP038                         FP038  0.00000000
## FP040                         FP040  0.00000000
## FP042                         FP042  0.00000000
## FP044                         FP044  0.00000000
## FP045                         FP045  0.00000000
## FP054                         FP054  0.00000000
## FP055                         FP055  0.00000000
## FP061                         FP061  0.00000000
## FP062                         FP062  0.00000000
## FP077                         FP077  0.00000000
## FP082                         FP082  0.00000000
## FP086                         FP086  0.00000000
## FP089                         FP089  0.00000000
## FP102                         FP102  0.00000000
## FP119                         FP119  0.00000000
## FP132                         FP132  0.00000000
## FP139                         FP139  0.00000000
## FP143                         FP143  0.00000000
## FP144                         FP144  0.00000000
## FP146                         FP146  0.00000000
## FP151                         FP151  0.00000000
## FP154                         FP154  0.00000000
## FP155                         FP155  0.00000000
## FP156                         FP156  0.00000000
## FP158                         FP158  0.00000000
## FP159                         FP159  0.00000000
## FP160                         FP160  0.00000000
## FP168                         FP168  0.00000000
## FP183                         FP183  0.00000000
## FP186                         FP186  0.00000000
## FP187                         FP187  0.00000000
## FP190                         FP190  0.00000000
## FP191                         FP191  0.00000000
## FP192                         FP192  0.00000000
## FP193                         FP193  0.00000000
## FP194                         FP194  0.00000000
## FP196                         FP196  0.00000000
## FP197                         FP197  0.00000000
## FP198                         FP198  0.00000000
## FP199                         FP199  0.00000000
## FP200                         FP200  0.00000000
## FP201                         FP201  0.00000000
## FP207                         FP207  0.00000000
## FP208                         FP208  0.00000000
## NumSulfer                 NumSulfer  0.00000000
# Relative influence of every predictor in the high-parameter model, using all
# 1000 trees.
summary(gbm_high, n.trees = 1000)

##                                 var      rel.inf
## NumCarbon                 NumCarbon 36.517034820
## SurfaceArea2           SurfaceArea2 17.555705181
## NumNonHAtoms           NumNonHAtoms 10.111406124
## MolWeight                 MolWeight  4.319654975
## FP172                         FP172  3.856395896
## SurfaceArea1           SurfaceArea1  3.352005625
## NumAromaticBonds   NumAromaticBonds  2.137127270
## HydrophilicFactor HydrophilicFactor  2.072022899
## NumChlorine             NumChlorine  1.977582593
## FP134                         FP134  1.663972163
## FP204                         FP204  1.602802113
## FP135                         FP135  1.231177069
## NumHydrogen             NumHydrogen  0.833969166
## NumNonHBonds           NumNonHBonds  0.739393513
## NumMultBonds           NumMultBonds  0.676437444
## FP059                         FP059  0.560404310
## NumRotBonds             NumRotBonds  0.467267768
## NumAtoms                   NumAtoms  0.454830851
## NumBonds                   NumBonds  0.442591808
## NumOxygen                 NumOxygen  0.396316765
## FP142                         FP142  0.390322257
## FP125                         FP125  0.365753946
## FP190                         FP190  0.325472010
## FP154                         FP154  0.312586895
## FP039                         FP039  0.240944953
## FP184                         FP184  0.228150459
## FP133                         FP133  0.223438096
## FP011                         FP011  0.218027852
## NumHalogen               NumHalogen  0.206131587
## FP099                         FP099  0.183582613
## FP203                         FP203  0.181377664
## FP163                         FP163  0.180651265
## FP111                         FP111  0.173591718
## FP174                         FP174  0.167972976
## FP178                         FP178  0.153369194
## NumRings                   NumRings  0.151274463
## FP176                         FP176  0.148673636
## FP040                         FP040  0.136605607
## FP116                         FP116  0.135385087
## FP143                         FP143  0.121850657
## FP127                         FP127  0.117943580
## FP083                         FP083  0.116164951
## NumSulfer                 NumSulfer  0.114426257
## FP129                         FP129  0.110793007
## FP202                         FP202  0.107821205
## FP094                         FP094  0.107295727
## FP147                         FP147  0.105341949
## FP085                         FP085  0.097070120
## FP171                         FP171  0.095007488
## NumDblBonds             NumDblBonds  0.091540999
## FP075                         FP075  0.086093735
## FP131                         FP131  0.084665123
## FP060                         FP060  0.084302035
## FP065                         FP065  0.083355947
## FP193                         FP193  0.071497163
## FP198                         FP198  0.071479166
## FP074                         FP074  0.070314410
## FP026                         FP026  0.070215291
## FP035                         FP035  0.065356235
## FP124                         FP124  0.064904086
## FP110                         FP110  0.062330885
## FP055                         FP055  0.062247119
## FP126                         FP126  0.061935838
## FP043                         FP043  0.059282976
## FP029                         FP029  0.058795377
## FP057                         FP057  0.057599902
## FP044                         FP044  0.057513945
## FP042                         FP042  0.057499436
## FP054                         FP054  0.057345829
## FP049                         FP049  0.055939976
## FP064                         FP064  0.055670202
## FP004                         FP004  0.054902848
## FP027                         FP027  0.054013394
## FP102                         FP102  0.053274435
## FP159                         FP159  0.052727062
## FP188                         FP188  0.050481180
## FP103                         FP103  0.048056372
## FP136                         FP136  0.047361852
## FP071                         FP071  0.046679957
## FP158                         FP158  0.044550412
## FP023                         FP023  0.044356718
## FP036                         FP036  0.043751908
## FP006                         FP006  0.043703930
## FP201                         FP201  0.042673169
## FP186                         FP186  0.041151149
## FP173                         FP173  0.040963698
## FP081                         FP081  0.039360801
## FP160                         FP160  0.038032300
## FP033                         FP033  0.037869943
## FP128                         FP128  0.037167669
## FP016                         FP016  0.036646632
## FP164                         FP164  0.035513471
## FP015                         FP015  0.032769350
## FP078                         FP078  0.032647792
## FP148                         FP148  0.032313973
## FP207                         FP207  0.032086123
## FP024                         FP024  0.031211017
## FP018                         FP018  0.030928932
## FP122                         FP122  0.030672680
## FP161                         FP161  0.027967764
## FP063                         FP063  0.027644942
## FP119                         FP119  0.026770916
## FP022                         FP022  0.026239217
## FP001                         FP001  0.025796348
## FP208                         FP208  0.025639193
## FP090                         FP090  0.025445162
## FP070                         FP070  0.024900892
## FP180                         FP180  0.024414820
## FP108                         FP108  0.024141367
## FP167                         FP167  0.023354616
## FP189                         FP189  0.022795457
## FP101                         FP101  0.021986642
## FP007                         FP007  0.021834879
## FP144                         FP144  0.021582094
## FP146                         FP146  0.021274457
## FP105                         FP105  0.021222634
## FP076                         FP076  0.021178666
## FP139                         FP139  0.020762648
## FP109                         FP109  0.020652098
## FP019                         FP019  0.020154093
## FP025                         FP025  0.019982575
## FP177                         FP177  0.019833735
## FP030                         FP030  0.019418702
## FP034                         FP034  0.019175599
## FP132                         FP132  0.018530167
## FP170                         FP170  0.017701916
## FP091                         FP091  0.017684155
## FP017                         FP017  0.017377315
## FP038                         FP038  0.017247661
## FP112                         FP112  0.016860486
## FP066                         FP066  0.016649316
## FP031                         FP031  0.016404502
## FP165                         FP165  0.016373159
## FP114                         FP114  0.015939417
## FP151                         FP151  0.015897587
## FP121                         FP121  0.015491734
## FP157                         FP157  0.015290433
## FP093                         FP093  0.015245672
## FP150                         FP150  0.014481891
## FP155                         FP155  0.013919812
## FP137                         FP137  0.013703086
## FP097                         FP097  0.013562110
## FP045                         FP045  0.013338042
## FP051                         FP051  0.013185059
## FP073                         FP073  0.013152162
## FP028                         FP028  0.012596907
## FP199                         FP199  0.012049789
## FP020                         FP020  0.011783358
## FP056                         FP056  0.011326546
## FP095                         FP095  0.011282886
## FP048                         FP048  0.010253725
## FP080                         FP080  0.010039002
## FP182                         FP182  0.009966687
## FP183                         FP183  0.009942464
## FP010                         FP010  0.009907849
## FP041                         FP041  0.009372163
## FP009                         FP009  0.009304094
## FP046                         FP046  0.009282731
## NumNitrogen             NumNitrogen  0.008993053
## FP050                         FP050  0.008387931
## FP194                         FP194  0.008197575
## FP077                         FP077  0.008156274
## FP106                         FP106  0.007932124
## FP169                         FP169  0.007733040
## FP058                         FP058  0.007710316
## FP087                         FP087  0.007517236
## FP113                         FP113  0.007406374
## FP149                         FP149  0.007110756
## FP192                         FP192  0.006444042
## FP138                         FP138  0.006259708
## FP140                         FP140  0.006041124
## FP047                         FP047  0.005984927
## FP141                         FP141  0.005648136
## FP079                         FP079  0.005536544
## FP088                         FP088  0.005196089
## FP123                         FP123  0.004944213
## FP082                         FP082  0.004607126
## FP002                         FP002  0.000000000
## FP003                         FP003  0.000000000
## FP005                         FP005  0.000000000
## FP008                         FP008  0.000000000
## FP012                         FP012  0.000000000
## FP013                         FP013  0.000000000
## FP014                         FP014  0.000000000
## FP021                         FP021  0.000000000
## FP032                         FP032  0.000000000
## FP037                         FP037  0.000000000
## FP052                         FP052  0.000000000
## FP053                         FP053  0.000000000
## FP061                         FP061  0.000000000
## FP062                         FP062  0.000000000
## FP067                         FP067  0.000000000
## FP068                         FP068  0.000000000
## FP069                         FP069  0.000000000
## FP072                         FP072  0.000000000
## FP084                         FP084  0.000000000
## FP086                         FP086  0.000000000
## FP089                         FP089  0.000000000
## FP092                         FP092  0.000000000
## FP096                         FP096  0.000000000
## FP098                         FP098  0.000000000
## FP100                         FP100  0.000000000
## FP104                         FP104  0.000000000
## FP107                         FP107  0.000000000
## FP115                         FP115  0.000000000
## FP117                         FP117  0.000000000
## FP118                         FP118  0.000000000
## FP120                         FP120  0.000000000
## FP130                         FP130  0.000000000
## FP145                         FP145  0.000000000
## FP152                         FP152  0.000000000
## FP153                         FP153  0.000000000
## FP156                         FP156  0.000000000
## FP162                         FP162  0.000000000
## FP166                         FP166  0.000000000
## FP168                         FP168  0.000000000
## FP175                         FP175  0.000000000
## FP179                         FP179  0.000000000
## FP181                         FP181  0.000000000
## FP185                         FP185  0.000000000
## FP187                         FP187  0.000000000
## FP191                         FP191  0.000000000
## FP195                         FP195  0.000000000
## FP196                         FP196  0.000000000
## FP197                         FP197  0.000000000
## FP200                         FP200  0.000000000
## FP205                         FP205  0.000000000
## FP206                         FP206  0.000000000

The model with high shrinkage and high bagging fraction makes large updates using a large portion of the data at each iteration. It tends to concentrate importance on a few predictors selected early. The lower shrinkage and lower bagging fraction model learns more gradually and spreads importance across more predictors.

3.2 8.3(b)

# Predict the test samples with each boosted model using all 1000 trees,
# then summarise accuracy with caret's postResample().
pred_low <- predict(gbm_low, newdata = sol_test_dat, n.trees = 1000)
pred_high <- predict(gbm_high, newdata = sol_test_dat, n.trees = 1000)

# Test-set metrics for gbm_low.
postResample(pred = pred_low, obs = sol_test_y)
##      RMSE  Rsquared       MAE 
## 0.6942028 0.8883140 0.5118158
# Test-set metrics for gbm_high.
postResample(pred = pred_high, obs = sol_test_y)
##      RMSE  Rsquared       MAE 
## 0.7082669 0.8850553 0.5174387

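The model with lower shrinkage and lower bagging fraction is generally expected to be more predictive on new samples because it learns more gradually and is less likely to overfit. The test-set results above agree: it has a slightly lower RMSE (0.694 vs. 0.708) and a slightly higher \(R^2\) (0.888 vs. 0.885).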

3.3 8.3(c)

# Boosted tree with shrinkage = 0.1 and bag.fraction = 0.1, using
# interaction.depth = 3. The seed is reset first because bag.fraction < 1
# subsamples the training rows at random in every iteration.
set.seed(200)
gbm_low_depth3 <- gbm(
  Solubility ~ .,
  distribution = "gaussian",
  data = sol_train_dat,
  n.trees = 1000,
  shrinkage = 0.1,
  bag.fraction = 0.1,
  interaction.depth = 3,
  verbose = FALSE
)

# Relative influence of each predictor, computed from all 1000 trees.
summary(gbm_low_depth3, n.trees = 1000)

##                                 var      rel.inf
## MolWeight                 MolWeight 13.233982162
## NumCarbon                 NumCarbon 10.619910686
## HydrophilicFactor HydrophilicFactor  6.647749027
## SurfaceArea1           SurfaceArea1  6.352973247
## SurfaceArea2           SurfaceArea2  4.120100725
## NumHydrogen             NumHydrogen  3.205089501
## NumMultBonds           NumMultBonds  3.186035059
## NumBonds                   NumBonds  2.654066288
## NumAtoms                   NumAtoms  2.050341213
## NumRotBonds             NumRotBonds  1.964574413
## NumNonHBonds           NumNonHBonds  1.929190677
## FP072                         FP072  1.778623523
## NumNonHAtoms           NumNonHAtoms  1.764700784
## NumAromaticBonds   NumAromaticBonds  1.528715434
## NumHalogen               NumHalogen  1.102828947
## FP084                         FP084  1.068563726
## FP075                         FP075  0.861320896
## NumOxygen                 NumOxygen  0.777089750
## FP071                         FP071  0.644620406
## FP122                         FP122  0.597273160
## NumRings                   NumRings  0.563766634
## FP173                         FP173  0.538044769
## FP085                         FP085  0.519969273
## FP116                         FP116  0.513178810
## FP107                         FP107  0.491236603
## FP074                         FP074  0.487863593
## FP063                         FP063  0.484914934
## FP111                         FP111  0.473027285
## FP104                         FP104  0.472079375
## FP003                         FP003  0.469729059
## FP070                         FP070  0.439638165
## NumDblBonds             NumDblBonds  0.433769794
## FP099                         FP099  0.433122497
## FP080                         FP080  0.427483120
## FP128                         FP128  0.418614895
## FP124                         FP124  0.414053047
## FP134                         FP134  0.404004135
## FP078                         FP078  0.398040794
## FP106                         FP106  0.398019385
## FP011                         FP011  0.396958813
## FP008                         FP008  0.389238373
## FP098                         FP098  0.376716457
## FP170                         FP170  0.376657683
## FP168                         FP168  0.372950979
## FP135                         FP135  0.366723845
## FP081                         FP081  0.362787298
## FP092                         FP092  0.359012731
## FP142                         FP142  0.357318777
## FP010                         FP010  0.353714315
## FP069                         FP069  0.349775631
## FP120                         FP120  0.345803475
## FP140                         FP140  0.345007882
## FP101                         FP101  0.343704532
## FP206                         FP206  0.339453829
## FP172                         FP172  0.336705150
## FP115                         FP115  0.335521578
## FP046                         FP046  0.335218761
## NumNitrogen             NumNitrogen  0.331248030
## FP012                         FP012  0.324932644
## FP015                         FP015  0.320540513
## FP169                         FP169  0.319549117
## FP009                         FP009  0.311925389
## FP097                         FP097  0.304094417
## FP004                         FP004  0.303456967
## FP102                         FP102  0.302554697
## FP109                         FP109  0.300225707
## FP076                         FP076  0.298087956
## FP138                         FP138  0.296936662
## FP064                         FP064  0.293119117
## FP117                         FP117  0.290266050
## FP164                         FP164  0.286883659
## FP113                         FP113  0.280087009
## FP125                         FP125  0.276068841
## FP073                         FP073  0.272078660
## FP088                         FP088  0.265711592
## FP016                         FP016  0.262207780
## FP066                         FP066  0.255319406
## FP121                         FP121  0.253193934
## FP096                         FP096  0.247250652
## FP094                         FP094  0.243882612
## FP058                         FP058  0.238679999
## FP176                         FP176  0.230893975
## FP105                         FP105  0.227273037
## FP114                         FP114  0.226926366
## FP027                         FP027  0.226457863
## FP127                         FP127  0.222094699
## FP131                         FP131  0.219172127
## FP118                         FP118  0.216758036
## FP132                         FP132  0.216077455
## FP145                         FP145  0.214888933
## FP018                         FP018  0.214569119
## FP060                         FP060  0.210835254
## FP119                         FP119  0.210080254
## FP103                         FP103  0.207219959
## FP093                         FP093  0.200768942
## FP057                         FP057  0.195896569
## FP126                         FP126  0.194142070
## FP017                         FP017  0.189164364
## FP082                         FP082  0.186560787
## FP047                         FP047  0.186472452
## FP123                         FP123  0.185580235
## FP068                         FP068  0.185369056
## FP144                         FP144  0.183148749
## FP062                         FP062  0.180764689
## FP171                         FP171  0.179914912
## FP083                         FP083  0.173030469
## FP130                         FP130  0.172362534
## FP202                         FP202  0.171628387
## FP147                         FP147  0.170881161
## FP166                         FP166  0.169599626
## FP067                         FP067  0.167674681
## FP095                         FP095  0.167278962
## FP065                         FP065  0.161890542
## NumChlorine             NumChlorine  0.161835692
## FP013                         FP013  0.159885841
## FP163                         FP163  0.150015707
## FP007                         FP007  0.146656232
## FP006                         FP006  0.143765711
## FP050                         FP050  0.142281002
## FP061                         FP061  0.141628447
## FP162                         FP162  0.139375337
## FP174                         FP174  0.135390252
## FP108                         FP108  0.132794037
## FP165                         FP165  0.129783073
## FP023                         FP023  0.128913609
## FP022                         FP022  0.123664614
## FP029                         FP029  0.123227594
## FP002                         FP002  0.112732648
## FP149                         FP149  0.110514848
## FP087                         FP087  0.108929383
## FP086                         FP086  0.106064814
## FP030                         FP030  0.103116900
## FP146                         FP146  0.098878293
## FP177                         FP177  0.092027233
## FP025                         FP025  0.091270502
## FP153                         FP153  0.090046700
## FP136                         FP136  0.089152658
## FP053                         FP053  0.088205709
## FP112                         FP112  0.087615950
## FP137                         FP137  0.087091097
## FP167                         FP167  0.085769505
## FP129                         FP129  0.085577579
## FP205                         FP205  0.080029380
## FP028                         FP028  0.079540004
## FP184                         FP184  0.079071889
## FP089                         FP089  0.078181431
## FP178                         FP178  0.074598741
## FP133                         FP133  0.072037569
## FP110                         FP110  0.070723664
## FP091                         FP091  0.070608464
## FP100                         FP100  0.068097335
## FP141                         FP141  0.065388185
## FP034                         FP034  0.065363018
## FP161                         FP161  0.064797482
## FP079                         FP079  0.060023825
## FP024                         FP024  0.057288319
## FP157                         FP157  0.057058817
## FP188                         FP188  0.054269552
## FP152                         FP152  0.053763085
## FP035                         FP035  0.051436684
## FP090                         FP090  0.050913061
## FP148                         FP148  0.047747726
## FP182                         FP182  0.047690701
## FP001                         FP001  0.045074428
## FP051                         FP051  0.044173675
## FP180                         FP180  0.043437659
## FP150                         FP150  0.043314230
## FP049                         FP049  0.041779855
## FP037                         FP037  0.034872295
## FP039                         FP039  0.033946261
## FP021                         FP021  0.032945813
## FP185                         FP185  0.032813805
## FP033                         FP033  0.031116459
## FP031                         FP031  0.029842508
## FP026                         FP026  0.029203793
## FP189                         FP189  0.028297908
## FP187                         FP187  0.027986939
## FP179                         FP179  0.027966977
## FP181                         FP181  0.027525054
## FP203                         FP203  0.026390764
## FP056                         FP056  0.026385850
## FP077                         FP077  0.023820007
## FP032                         FP032  0.022157817
## FP040                         FP040  0.021614697
## FP156                         FP156  0.018529979
## FP186                         FP186  0.017984106
## FP038                         FP038  0.016077750
## FP183                         FP183  0.013258309
## FP055                         FP055  0.012751745
## FP045                         FP045  0.012643736
## FP014                         FP014  0.011909641
## FP175                         FP175  0.009542459
## FP020                         FP020  0.006925307
## FP190                         FP190  0.005616695
## FP005                         FP005  0.000000000
## FP019                         FP019  0.000000000
## FP036                         FP036  0.000000000
## FP041                         FP041  0.000000000
## FP042                         FP042  0.000000000
## FP043                         FP043  0.000000000
## FP044                         FP044  0.000000000
## FP048                         FP048  0.000000000
## FP052                         FP052  0.000000000
## FP054                         FP054  0.000000000
## FP059                         FP059  0.000000000
## FP139                         FP139  0.000000000
## FP143                         FP143  0.000000000
## FP151                         FP151  0.000000000
## FP154                         FP154  0.000000000
## FP155                         FP155  0.000000000
## FP158                         FP158  0.000000000
## FP159                         FP159  0.000000000
## FP160                         FP160  0.000000000
## FP191                         FP191  0.000000000
## FP192                         FP192  0.000000000
## FP193                         FP193  0.000000000
## FP194                         FP194  0.000000000
## FP195                         FP195  0.000000000
## FP196                         FP196  0.000000000
## FP197                         FP197  0.000000000
## FP198                         FP198  0.000000000
## FP199                         FP199  0.000000000
## FP200                         FP200  0.000000000
## FP201                         FP201  0.000000000
## FP204                         FP204  0.000000000
## FP207                         FP207  0.000000000
## FP208                         FP208  0.000000000
## NumSulfer                 NumSulfer  0.000000000

Increasing interaction depth allows trees to model interactions and more complex structure. This may spread importance across more predictors if several variables are useful only through interactions, which flattens the slope of the importance profile, but it can also increase overfitting.

4 Exercise 8.4

Use a single solubility predictor.

# Two-column training and test frames: the single predictor (MolWeight)
# alongside the observed solubility.
one_pred_train <- data.frame(
  MolWeight = solTrainXtrans[["MolWeight"]],
  Solubility = solTrainY
)

one_pred_test <- data.frame(
  MolWeight = solTestXtrans[["MolWeight"]],
  Solubility = solTestY
)

4.1 8.4(a) Regression tree

# CART tree on the single predictor; tuneLength = 10 has caret evaluate ten
# candidate values of the complexity parameter cp under the shared
# resampling scheme in `ctrl`.
set.seed(200)
tree_one <- train(
  Solubility ~ MolWeight,
  data = one_pred_train,
  method = "rpart",
  tuneLength = 10,
  trControl = ctrl
)

# Show the resampling profile and the selected cp.
print(tree_one)
## CART 
## 
## 951 samples
##   1 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 856, 857, 855, 856, 856, 856, ... 
## Resampling results across tuning parameters:
## 
##   cp           RMSE      Rsquared   MAE     
##   0.004859809  1.513942  0.4553317  1.146843
##   0.005682050  1.516304  0.4538220  1.151006
##   0.007133559  1.529476  0.4421052  1.167605
##   0.007232122  1.531079  0.4407651  1.169155
##   0.007475879  1.536400  0.4370756  1.173360
##   0.009950241  1.567202  0.4149204  1.201659
##   0.012109468  1.580066  0.4065854  1.216871
##   0.041878909  1.637650  0.3628154  1.266517
##   0.044542785  1.648801  0.3539423  1.276161
##   0.351234818  1.874180  0.2970638  1.464068
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was cp = 0.004859809.

4.2 8.4(b) Random forest

# Random forest on the single predictor. With one predictor the only valid
# value of mtry is 1. caret's default grid proposes mtry = 2 in this case,
# which randomForest resets to the valid range with a warning on every fit.
# Supplying mtry = 1 explicitly fits the same forest without the warnings
# and reports the value that is actually used.
set.seed(200)
rf_one <- train(
  Solubility ~ MolWeight,
  data = one_pred_train,
  method = "rf",
  trControl = ctrl,
  tuneGrid = data.frame(mtry = 1)
)

rf_one
## Random Forest 
## 
## 951 samples
##   1 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 856, 857, 855, 856, 856, 856, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   1.428304  0.5377322  1.010235
## 
## Tuning parameter 'mtry' was held constant at a value of 1

4.3 8.4(c) Cubist models

# Cubist on the single predictor. caret tunes over the number of committees
# and the number of neighbors used for the instance-based adjustment.
set.seed(200)
cubist_one <- train(
  Solubility ~ MolWeight,
  data = one_pred_train,
  method = "cubist",
  tuneLength = 10,
  trControl = ctrl
)

# Show the resampling profile and the selected committees/neighbors.
print(cubist_one)
## Cubist 
## 
## 951 samples
##   1 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 856, 857, 855, 856, 856, 856, ... 
## Resampling results across tuning parameters:
## 
##   committees  neighbors  RMSE      Rsquared   MAE     
##    1          0          1.543500  0.4444768  1.135313
##    1          5          1.601461  0.3983413  1.212392
##    1          9          1.600714  0.3983907  1.212859
##   10          0          1.531000  0.4527305  1.134907
##   10          5          1.592542  0.4007236  1.210559
##   10          9          1.591739  0.4008521  1.210842
##   20          0          1.531551  0.4523299  1.135052
##   20          5          1.592433  0.4008041  1.210413
##   20          9          1.591720  0.4008763  1.210689
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were committees = 10 and neighbors = 0.

4.4 Plot predictions

# Add one prediction column per model next to the observed test data.
plot_dat <- one_pred_test
plot_dat$Tree <- predict(tree_one, newdata = one_pred_test)
plot_dat$RandomForest <- predict(rf_one, newdata = one_pred_test)
plot_dat$Cubist <- predict(cubist_one, newdata = one_pred_test)

# Stack the three prediction columns into long format: one row per
# (test sample, model) pair, with the model name in `Model` and its
# prediction in `Prediction`.
plot_long <- reshape(
  plot_dat,
  direction = "long",
  varying = c("Tree", "RandomForest", "Cubist"),
  times = c("Tree", "RandomForest", "Cubist"),
  timevar = "Model",
  v.names = "Prediction"
)

# Observed values as points, each model's predictions overlaid as a line,
# one panel per model.
ggplot(data = plot_long, mapping = aes(x = MolWeight, y = Solubility)) +
  geom_point(alpha = 0.5) +
  geom_line(mapping = aes(y = Prediction), linewidth = 1) +
  facet_wrap(vars(Model)) +
  xlab("Molecular Weight") +
  ylab("Solubility")

The single tree gives a step-function fit. Random forests smooth the step behavior by averaging many trees. Cubist produces rule-based linear models and may show piecewise linear behavior.

5 Exercise 8.5

Tecator data.

# Tecator data from caret: `absorp` holds the absorbance spectra and
# `endpoints` holds percent moisture, fat and protein in columns 1-3.
data("tecator", package = "caret")

tec_x <- absorp
# The response modelled below is fat, which is column 2 of `endpoints`
# (column 1 is moisture).
# NOTE(review): results printed further down were generated with column 1
# and must be regenerated after this fix.
tec_y <- endpoints[, 2]

# 80/20 split on the response; the seed keeps the partition reproducible.
set.seed(200)
tec_split <- createDataPartition(tec_y, p = 0.8, list = FALSE)

tec_train_x <- tec_x[tec_split, ]
tec_test_x <- tec_x[-tec_split, ]
tec_train_y <- tec_y[tec_split]
tec_test_y <- tec_y[-tec_split]

# Combine predictors and response so the formula interface can be used.
tec_train_dat <- data.frame(tec_train_x, Fat = tec_train_y)
tec_test_dat <- data.frame(tec_test_x, Fat = tec_test_y)
# Fit three tree-based models to the Tecator training data. The seed is
# reset before every call so each model is tuned from the same RNG state.

# Random forest, tuned over mtry.
set.seed(200)
tec_rf <- train(
  Fat ~ .,
  data = tec_train_dat,
  method = "rf",
  tuneLength = 5,
  trControl = ctrl
)

# Stochastic gradient boosting; verbose = FALSE is passed through to gbm.
set.seed(200)
tec_gbm <- train(
  Fat ~ .,
  data = tec_train_dat,
  method = "gbm",
  tuneLength = 5,
  trControl = ctrl,
  verbose = FALSE
)

# Cubist, tuned over committees and neighbors.
set.seed(200)
tec_cubist <- train(
  Fat ~ .,
  data = tec_train_dat,
  method = "cubist",
  tuneLength = 5,
  trControl = ctrl
)

# Resampling profile for the random forest.
print(tec_rf)
## Random Forest 
## 
## 175 samples
## 100 predictors
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 159, 158, 157, 158, 158, 157, ... 
## Resampling results across tuning parameters:
## 
##   mtry  RMSE      Rsquared   MAE     
##     2   6.264721  0.5934020  5.014753
##    26   5.744139  0.6581444  4.573580
##    51   5.714336  0.6616948  4.549824
##    75   5.696486  0.6658019  4.550120
##   100   5.696182  0.6642335  4.541491
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 100.
# Resampling profile for the boosted-tree fit on the Tecator data.
tec_gbm
## Stochastic Gradient Boosting 
## 
## 175 samples
## 100 predictors
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 159, 158, 157, 158, 158, 157, ... 
## Resampling results across tuning parameters:
## 
##   interaction.depth  n.trees  RMSE      Rsquared   MAE     
##   1                   50      7.769841  0.3932559  6.525743
##   1                  100      7.356951  0.4655490  6.189189
##   1                  150      6.976578  0.5187359  5.839259
##   1                  200      6.653419  0.5615663  5.553646
##   1                  250      6.407074  0.5939155  5.326668
##   2                   50      6.727954  0.5513346  5.633951
##   2                  100      5.861585  0.6554409  4.836730
##   2                  150      5.519354  0.6894326  4.473654
##   2                  200      5.297229  0.7072049  4.225001
##   2                  250      5.205034  0.7170156  4.108108
##   3                   50      6.087864  0.6323005  5.059691
##   3                  100      5.418064  0.6971827  4.368829
##   3                  150      5.133973  0.7246940  4.084392
##   3                  200      5.023936  0.7332142  3.961366
##   3                  250      4.995391  0.7338307  3.911470
##   4                   50      5.781810  0.6649331  4.752156
##   4                  100      5.207728  0.7160135  4.160910
##   4                  150      5.005814  0.7339828  3.915792
##   4                  200      4.951430  0.7375537  3.850069
##   4                  250      4.924128  0.7397336  3.822926
##   5                   50      5.648116  0.6749776  4.644111
##   5                  100      5.112722  0.7258205  4.077695
##   5                  150      4.985183  0.7354320  3.913944
##   5                  200      4.941802  0.7387479  3.845527
##   5                  250      4.944870  0.7370864  3.829234
## 
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
## 
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were n.trees = 250, interaction.depth =
##  4, shrinkage = 0.1 and n.minobsinnode = 10.
# Resampling profile for the Cubist fit on the Tecator data.
tec_cubist
## Cubist 
## 
## 175 samples
## 100 predictors
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 159, 158, 157, 158, 158, 157, ... 
## Resampling results across tuning parameters:
## 
##   committees  neighbors  RMSE      Rsquared   MAE     
##    1          0          1.947492  0.9623705  1.402281
##    1          5          1.903049  0.9633434  1.333982
##    1          9          1.903597  0.9637864  1.353741
##   10          0          1.653946  0.9726509  1.163971
##   10          5          1.660673  0.9716167  1.133900
##   10          9          1.647019  0.9723705  1.140817
##   20          0          1.616927  0.9748740  1.143220
##   20          5          1.621020  0.9739106  1.105665
##   20          9          1.610114  0.9746221  1.118134
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were committees = 20 and neighbors = 9.
# Test-set RMSE and R-squared for the three Tecator models. Each model is
# predicted and scored once and both metrics are read from that single
# postResample() result, rather than re-predicting for every table cell.
# local() keeps the intermediate objects out of the workspace.
tec_results <- local({
  fits <- list(tec_rf, tec_gbm, tec_cubist)
  metrics <- lapply(fits, function(fit) {
    postResample(predict(fit, tec_test_dat), tec_test_y)
  })
  data.frame(
    Model = c("Random Forest", "Boosted Tree", "Cubist"),
    RMSE = vapply(metrics, function(m) m[["RMSE"]], numeric(1)),
    Rsquared = vapply(metrics, function(m) m[["Rsquared"]], numeric(1))
  )
})

tec_results
##           Model     RMSE  Rsquared
## 1 Random Forest 5.169434 0.7822691
## 2  Boosted Tree 4.967872 0.7907837
## 3        Cubist 1.425098 0.9821022

The Tecator predictors are highly correlated spectral measurements. Transformations such as PCA, PLS scores, derivatives, or feature filtering can reduce collinearity before modeling.

6 Exercise 8.6

Permeability data.

# Load the permeability data. Despite the "train" in their names,
# perm_train_x and perm_train_y hold the full data set; the actual
# train/test split is made just below.
data("permeability")

perm_train_x <- fingerprints
perm_train_y <- permeability

# 80/20 split on the response.
set.seed(200)
perm_split <- createDataPartition(y = perm_train_y, p = 0.8, list = FALSE)

perm_y_train <- perm_train_y[perm_split]
perm_y_test <- perm_train_y[-perm_split]
perm_x_train <- perm_train_x[perm_split, ]
perm_x_test <- perm_train_x[-perm_split, ]

# Combine predictors and response so the formula interface can be used.
perm_test_dat <- data.frame(perm_x_test, permeability = perm_y_test)
perm_train_dat <- data.frame(perm_x_train, permeability = perm_y_train)
# Fit three tree-based models to the permeability training data. The seed
# is reset before every call so each model is tuned from the same RNG state.

# Random forest, tuned over mtry.
set.seed(200)
perm_rf <- train(
  permeability ~ .,
  data = perm_train_dat,
  method = "rf",
  tuneLength = 5,
  trControl = ctrl
)

# Stochastic gradient boosting; verbose = FALSE is passed through to gbm.
set.seed(200)
perm_gbm <- train(
  permeability ~ .,
  data = perm_train_dat,
  method = "gbm",
  tuneLength = 5,
  trControl = ctrl,
  verbose = FALSE
)

# Cubist, tuned over committees and neighbors.
set.seed(200)
perm_cubist <- train(
  permeability ~ .,
  data = perm_train_dat,
  method = "cubist",
  tuneLength = 5,
  trControl = ctrl
)

# Resampling profile for the random forest.
print(perm_rf)
## Random Forest 
## 
##  133 samples
## 1107 predictors
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 118, 120, 120, 120, 121, 120, ... 
## Resampling results across tuning parameters:
## 
##   mtry  RMSE      Rsquared   MAE      
##      2  12.94482  0.4976882  10.188432
##      9  10.41951  0.5804964   7.817710
##     47  10.09303  0.5922681   7.323269
##    228  10.32044  0.5730140   7.138908
##   1106  10.67343  0.5456930   7.209991
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 47.
# Resampling profile for the boosted-tree fit on the permeability data.
perm_gbm
## Stochastic Gradient Boosting 
## 
##  133 samples
## 1107 predictors
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 118, 120, 120, 120, 121, 120, ... 
## Resampling results across tuning parameters:
## 
##   interaction.depth  n.trees  RMSE      Rsquared   MAE     
##   1                   50      11.16289  0.5100886  7.994561
##   1                  100      11.06110  0.5134304  8.167640
##   1                  150      11.09390  0.5056690  8.285280
##   1                  200      11.03714  0.5100896  8.241052
##   1                  250      11.04629  0.5082170  8.207634
##   2                   50      10.82431  0.5351475  7.867504
##   2                  100      10.64380  0.5406417  7.847871
##   2                  150      10.59024  0.5440277  7.827900
##   2                  200      10.49414  0.5511776  7.781074
##   2                  250      10.47955  0.5564086  7.754257
##   3                   50      10.73176  0.5493965  7.817974
##   3                  100      10.55940  0.5601892  7.730837
##   3                  150      10.50008  0.5649741  7.709304
##   3                  200      10.53677  0.5590194  7.706463
##   3                  250      10.45245  0.5663934  7.677084
##   4                   50      10.80009  0.5342352  7.883804
##   4                  100      10.66247  0.5502385  7.791864
##   4                  150      10.64013  0.5535321  7.819622
##   4                  200      10.66811  0.5494964  7.831996
##   4                  250      10.67480  0.5502911  7.861906
##   5                   50      10.82691  0.5312048  7.888228
##   5                  100      10.47861  0.5551901  7.657204
##   5                  150      10.40627  0.5600670  7.627996
##   5                  200      10.43716  0.5623544  7.637752
##   5                  250      10.44075  0.5605119  7.675183
## 
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
## 
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were n.trees = 150, interaction.depth =
##  5, shrinkage = 0.1 and n.minobsinnode = 10.
# Resampling profile for the Cubist fit on the permeability data.
perm_cubist
## Cubist 
## 
##  133 samples
## 1107 predictors
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 118, 120, 120, 120, 121, 120, ... 
## Resampling results across tuning parameters:
## 
##   committees  neighbors  RMSE       Rsquared   MAE     
##    1          0          11.056438  0.5005614  7.142603
##    1          5          10.480627  0.5565916  6.982333
##    1          9          10.515584  0.5472524  7.065254
##   10          0          10.377633  0.5499849  6.878957
##   10          5           9.857114  0.5990505  6.699756
##   10          9           9.945818  0.5867134  6.762198
##   20          0          10.351670  0.5529765  6.756075
##   20          5           9.758792  0.6050690  6.592828
##   20          9           9.741740  0.6054557  6.632982
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were committees = 20 and neighbors = 9.
# Test-set RMSE and R-squared for the three permeability models. Each model
# is predicted and scored once and both metrics are read from that single
# postResample() result, rather than re-predicting for every table cell.
# local() keeps the intermediate objects out of the workspace.
perm_results <- local({
  fits <- list(perm_rf, perm_gbm, perm_cubist)
  metrics <- lapply(fits, function(fit) {
    postResample(predict(fit, perm_test_dat), perm_y_test)
  })
  data.frame(
    Model = c("Random Forest", "Boosted Tree", "Cubist"),
    RMSE = vapply(metrics, function(m) m[["RMSE"]], numeric(1)),
    Rsquared = vapply(metrics, function(m) m[["Rsquared"]], numeric(1))
  )
})

perm_results
##           Model     RMSE  Rsquared
## 1 Random Forest 14.22900 0.3061684
## 2  Boosted Tree 12.09356 0.5166912
## 3        Cubist 13.43411 0.3902581

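Use test-set RMSE and resampling RMSE to compare models. Lower RMSE and higher \(R^2\) indicate better predictive performance. Here Cubist has the lowest cross-validated RMSE (about 9.74), while the boosted tree has the lowest test-set RMSE (about 12.09), so the ranking of the models is not consistent between the two estimates.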

7 Exercise 8.7

Chemical manufacturing process.

# Chemical manufacturing process data: split 80/20 on Yield, then estimate
# the pre-processing from the training predictors only and apply it to both
# sets, so nothing from the test set enters the imputation or scaling.
data("ChemicalManufacturingProcess")

chem <- ChemicalManufacturingProcess

set.seed(200)
chem_split <- createDataPartition(y = chem$Yield, p = 0.8, list = FALSE)

chem_train <- chem[chem_split, ]
chem_test <- chem[-chem_split, ]

# Median imputation, centering and scaling of every column except Yield.
chem_pp <- preProcess(
  x = chem_train[, setdiff(names(chem_train), "Yield")],
  method = c("medianImpute", "center", "scale")
)

chem_train_x <- predict(
  chem_pp,
  newdata = chem_train[, setdiff(names(chem_train), "Yield")]
)
chem_test_x <- predict(
  chem_pp,
  newdata = chem_test[, setdiff(names(chem_test), "Yield")]
)

# Re-attach the untransformed response for the formula interface.
chem_train_dat <- data.frame(chem_train_x, Yield = chem_train$Yield)
chem_test_dat <- data.frame(chem_test_x, Yield = chem_test$Yield)
# Fit four tree-based models to the pre-processed chemical manufacturing
# training data. The seed is reset before every call so each model is tuned
# from the same RNG state.

# Single CART tree, tuned over the complexity parameter cp.
set.seed(200)
chem_tree <- train(
  Yield ~ .,
  data = chem_train_dat,
  method = "rpart",
  tuneLength = 10,
  trControl = ctrl
)

# Random forest, tuned over mtry.
set.seed(200)
chem_rf <- train(
  Yield ~ .,
  data = chem_train_dat,
  method = "rf",
  tuneLength = 5,
  trControl = ctrl
)

# Stochastic gradient boosting; verbose = FALSE is passed through to gbm.
set.seed(200)
chem_gbm <- train(
  Yield ~ .,
  data = chem_train_dat,
  method = "gbm",
  tuneLength = 5,
  trControl = ctrl,
  verbose = FALSE
)

# Cubist, tuned over committees and neighbors.
set.seed(200)
chem_cubist <- train(
  Yield ~ .,
  data = chem_train_dat,
  method = "cubist",
  tuneLength = 5,
  trControl = ctrl
)

# Resampling profile for the single tree.
print(chem_tree)
## CART 
## 
## 144 samples
##  57 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 130, 129, 129, 130, 131, 131, ... 
## Resampling results across tuning parameters:
## 
##   cp           RMSE      Rsquared   MAE     
##   0.000000000  1.436628  0.4374346  1.100095
##   0.006659213  1.434111  0.4383759  1.098775
##   0.011413814  1.438247  0.4357809  1.106436
##   0.027789221  1.427930  0.4299077  1.104787
##   0.029140890  1.414532  0.4350620  1.099322
##   0.036646987  1.389498  0.4507181  1.085295
##   0.049774462  1.359623  0.4627724  1.095111
##   0.070317075  1.337811  0.4784063  1.094097
##   0.094521937  1.429116  0.4037703  1.155646
##   0.433943988  1.657032  0.3046903  1.344962
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was cp = 0.07031708.
# Resampling profile for the random forest fit on the chemical data.
chem_rf
## Random Forest 
## 
## 144 samples
##  57 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 130, 129, 129, 130, 131, 131, ... 
## Resampling results across tuning parameters:
## 
##   mtry  RMSE      Rsquared   MAE      
##    2    1.217403  0.6304545  0.9785288
##   15    1.098043  0.6728132  0.8691547
##   29    1.081931  0.6702642  0.8527328
##   43    1.091399  0.6560575  0.8576204
##   57    1.106936  0.6388681  0.8699486
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 29.
# Resampled performance of the boosted trees for each interaction.depth /
# n.trees combination.
chem_gbm
## Stochastic Gradient Boosting 
## 
## 144 samples
##  57 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 130, 129, 129, 130, 131, 131, ... 
## Resampling results across tuning parameters:
## 
##   interaction.depth  n.trees  RMSE      Rsquared   MAE      
##   1                   50      1.173368  0.5954535  0.9429745
##   1                  100      1.151532  0.6064990  0.9197116
##   1                  150      1.136651  0.6151670  0.9069751
##   1                  200      1.135648  0.6144275  0.9061698
##   1                  250      1.130584  0.6169501  0.9023392
##   2                   50      1.152099  0.6054636  0.9157494
##   2                  100      1.139591  0.6084498  0.9079239
##   2                  150      1.128964  0.6158468  0.9036101
##   2                  200      1.124361  0.6199280  0.9000019
##   2                  250      1.122987  0.6207126  0.8954791
##   3                   50      1.112730  0.6326774  0.8904349
##   3                  100      1.081817  0.6494984  0.8587777
##   3                  150      1.070415  0.6556113  0.8513794
##   3                  200      1.064805  0.6587655  0.8438467
##   3                  250      1.061606  0.6607930  0.8418120
##   4                   50      1.131811  0.6132994  0.9045481
##   4                  100      1.100159  0.6311966  0.8792823
##   4                  150      1.084896  0.6420718  0.8702050
##   4                  200      1.077422  0.6466441  0.8637798
##   4                  250      1.075736  0.6476019  0.8618322
##   5                   50      1.161443  0.5970435  0.9217068
##   5                  100      1.134153  0.6152330  0.8951969
##   5                  150      1.125598  0.6227834  0.8904524
##   5                  200      1.120225  0.6267123  0.8847838
##   5                  250      1.116475  0.6287398  0.8808000
## 
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
## 
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were n.trees = 250, interaction.depth =
##  3, shrinkage = 0.1 and n.minobsinnode = 10.
# Resampled performance of Cubist for each committees / neighbors combination.
chem_cubist
## Cubist 
## 
## 144 samples
##  57 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 130, 129, 129, 130, 131, 131, ... 
## Resampling results across tuning parameters:
## 
##   committees  neighbors  RMSE       Rsquared   MAE      
##    1          0          1.2015059  0.5904135  0.9438495
##    1          5          1.0153376  0.7034092  0.7833201
##    1          9          1.0827658  0.6676125  0.8372854
##   10          0          1.0635135  0.6538444  0.8518207
##   10          5          0.9219180  0.7467352  0.7301192
##   10          9          0.9904765  0.7023948  0.7871645
##   20          0          1.0466489  0.6689854  0.8405129
##   20          5          0.9100427  0.7552589  0.7214848
##   20          9          0.9756902  0.7154996  0.7762108
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were committees = 20 and neighbors = 5.

7.1 8.7(a)

# Test-set performance of the four tuned models.
# Each model is predicted and scored exactly once; the RMSE and Rsquared
# columns are then read from the same postResample() result instead of
# repeating predict()/postResample() for every metric.
chem_models <- list(
  "Single Tree" = chem_tree,
  "Random Forest" = chem_rf,
  "Boosted Tree" = chem_gbm,
  "Cubist" = chem_cubist
)

chem_test_perf <- lapply(
  chem_models,
  function(fit) postResample(predict(fit, chem_test_dat), chem_test$Yield)
)

# USE.NAMES = FALSE keeps the metric columns unnamed so the data frame gets
# the default integer row names, as before.
chem_results <- data.frame(
  Model = names(chem_models),
  RMSE = vapply(
    chem_test_perf,
    function(perf) perf[["RMSE"]],
    numeric(1),
    USE.NAMES = FALSE
  ),
  Rsquared = vapply(
    chem_test_perf,
    function(perf) perf[["Rsquared"]],
    numeric(1),
    USE.NAMES = FALSE
  )
)

# Rank the models from lowest to highest test-set RMSE.
chem_results[order(chem_results$RMSE), ]
##           Model     RMSE  Rsquared
## 4        Cubist 1.043358 0.7683434
## 3  Boosted Tree 1.078303 0.7333401
## 2 Random Forest 1.333712 0.6027757
## 1   Single Tree 1.760042 0.2550851

7.2 8.7(b)

# Name of the model with the lowest test-set RMSE in chem_results.
best_model_name <- with(chem_results, Model[which.min(RMSE)])
best_model_name
## [1] "Cubist"
# Variable importance for the random forest, boosted tree and Cubist fits.
# scale = FALSE is passed so the values are reported without rescaling (the
# printed random forest maximum of 137.454 is consistent with unscaled values).
chem_rf_imp <- varImp(chem_rf, scale = FALSE)
chem_gbm_imp <- varImp(chem_gbm, scale = FALSE)
chem_cubist_imp <- varImp(chem_cubist, scale = FALSE)

# Random forest importance table (the print shows the 20 highest-ranked
# predictors out of 57).
chem_rf_imp
## rf variable importance
## 
##   only 20 most important variables shown (out of 57)
## 
##                        Overall
## ManufacturingProcess32 137.454
## BiologicalMaterial12    27.780
## BiologicalMaterial03    22.062
## ManufacturingProcess31  21.121
## ManufacturingProcess17  19.192
## ManufacturingProcess13  17.144
## BiologicalMaterial06    13.581
## ManufacturingProcess28  13.375
## BiologicalMaterial11    13.160
## ManufacturingProcess06  11.990
## ManufacturingProcess09  11.851
## ManufacturingProcess36   6.903
## ManufacturingProcess39   6.900
## BiologicalMaterial04     6.747
## BiologicalMaterial09     5.956
## BiologicalMaterial02     5.788
## ManufacturingProcess30   5.366
## BiologicalMaterial08     5.277
## ManufacturingProcess21   5.053
## ManufacturingProcess11   5.027
# Top 10 predictors by random forest importance, matching the printed table.
plot(chem_rf_imp, top = 10)

# Cubist had the lowest test-set RMSE, so also show the top 10 predictors of
# that best-performing model; chem_cubist_imp is computed in this section but
# was not displayed.
plot(chem_cubist_imp, top = 10)

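Compare whether biological or process variables dominate by inspecting the top-ranked predictors and matching them to the variable descriptions from the original exercise. Note that the table above is the random forest ranking, whereas Cubist had the lowest test-set RMSE in 8.7(a). In the random forest ranking, ManufacturingProcess32 is by far the most important predictor (137.5 versus 27.8 for the next one), and six of the top ten predictors are manufacturing process variables against four biological material variables, so the process variables dominate.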

7.3 8.7(c)

# Draw the final rpart tree kept by train() (cp = 0.07031708 according to the
# tuning output).
rpart.plot(chem_tree$finalModel)

The terminal nodes show how the single tree partitions the data. A single tree can be easier to interpret than the ensemble models, although it is less accurate here: its test-set RMSE in 8.7(a) was 1.76, compared with 1.04 for Cubist.

# Predicted yield for every training sample from the final rpart tree. Each
# terminal node of a regression tree predicts a single value, so the distinct
# predictions are used to group the samples by terminal node.
chem_tree_pred_nodes <- predict(
  chem_tree$finalModel,
  chem_train_dat,
  type = "vector"
)

# Keep the plotting data in its own data frame instead of adding a column to
# chem_train_dat: that data frame is the input to the `Yield ~ .` model fits,
# so a prediction column left in it would be used as a predictor by any re-run
# or later refit.
chem_tree_plot_dat <- data.frame(
  TreePrediction = chem_tree_pred_nodes,
  Yield = chem_train_dat$Yield
)

# Distribution of observed yield within each terminal node.
ggplot(chem_tree_plot_dat, aes(x = factor(TreePrediction), y = Yield)) +
  geom_boxplot() +
  labs(x = "Terminal node prediction", y = "Yield")