library(mlbench) # mlbench.friedman1()
library(randomForest)
library(caret) # train(), trainControl(), varImp(), postResample()
library(party) # cforest(), varimp()
library(gbm)
library(Cubist)
library(rpart.plot)
library(ggplot2)
library(AppliedPredictiveModeling) # solubility, permeability, ChemicalManufacturingProcess
set.seed(200)
simulated <- mlbench.friedman1(200, sd = 1)
simulated <- cbind(simulated$x, simulated$y)
simulated <- as.data.frame(simulated)
colnames(simulated)[ncol(simulated)] <- "y"
set.seed(200)
model1 <- randomForest(
y ~ .,
data = simulated,
importance = TRUE,
ntree = 1000
)
rfImp1 <- varImp(model1, scale = FALSE)
rfImp1
## Overall
## V1 8.811388491
## V2 6.685318725
## V3 0.629866472
## V4 7.703182937
## V5 2.149608737
## V6 0.093706774
## V7 0.003535298
## V8 -0.124011241
## V9 -0.007978776
## V10 -0.026657627
plot(rfImp1, top = 10)
The Friedman (1991) benchmark generates \(y = 10\sin(\pi x_1 x_2) + 20(x_3 - 0.5)^2 + 10 x_4 + 5 x_5 + \epsilon\), so V1 through V5 are the informative predictors and V6 through V10 are pure noise.
rfImp1[rownames(rfImp1) %in% paste0("V", 6:10), , drop = FALSE]
## Overall
## V6 0.093706774
## V7 0.003535298
## V8 -0.124011241
## V9 -0.007978776
## V10 -0.026657627
The random forest assigns these noise variables importance scores near zero (permutation importance can dip slightly below zero by chance), far below the scores of the informative predictors V1, V2, V4, and V5. Note that V3, although informative, earns relatively little importance here.
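As a quick numeric check, we can average the importance scores of the two groups (a small sketch using the rfImp1 table above):
informative <- paste0("V", 1:5)
noise <- paste0("V", 6:10)
mean(rfImp1[informative, "Overall"])
mean(rfImp1[noise, "Overall"])
From the table above, the informative mean is about 5.2 while the noise mean is essentially zero (about -0.01).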
set.seed(200)
simulated$duplicate1 <- simulated$V1 + rnorm(200) * 0.1
cor(simulated$duplicate1, simulated$V1)
## [1] 0.9497025
set.seed(200)
model2 <- randomForest(
y ~ .,
data = simulated,
importance = TRUE,
ntree = 1000
)
rfImp2 <- varImp(model2, scale = FALSE)
rfImp2
## Overall
## V1 5.91964260
## V2 6.16818815
## V3 0.73116589
## V4 6.99550947
## V5 2.28421876
## V6 0.19857217
## V7 -0.01400175
## V8 -0.02834702
## V9 0.08851319
## V10 0.01288275
## duplicate1 4.12004820
plot(rfImp2, top = 15)
Compare V1 before and after adding the correlated
duplicate.
rfImp1["V1", , drop = FALSE]
## Overall
## V1 8.811388
rfImp2[c("V1", "duplicate1"), , drop = FALSE]
## Overall
## V1 5.919643
## duplicate1 4.120048
Now add another correlated predictor.
set.seed(200)
simulated$duplicate2 <- simulated$V1 + rnorm(200) * 0.1
cor(simulated[, c("V1", "duplicate1", "duplicate2")])
## V1 duplicate1 duplicate2
## V1 1.0000000 0.9497025 0.9497025
## duplicate1 0.9497025 1.0000000 1.0000000
## duplicate2 0.9497025 1.0000000 1.0000000
set.seed(200)
model3 <- randomForest(
y ~ .,
data = simulated,
importance = TRUE,
ntree = 1000
)
rfImp3 <- varImp(model3, scale = FALSE)
rfImp3[c("V1", "duplicate1", "duplicate2"), , drop = FALSE]
## Overall
## V1 4.684417
## duplicate1 2.916935
## duplicate2 3.261472
plot(rfImp3, top = 15)
When correlated copies of V1 are added, its importance is diluted across the group: V1 drops from about 8.8 in the original fit to roughly 4.7 here, with the remainder shared by the duplicates. Note that reusing set.seed(200) before creating duplicate2 reproduces the same noise draw, so duplicate2 is an exact copy of duplicate1 (their correlation is 1).
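One way to see that the signal is shared rather than lost is to sum the importance over V1 and its copies and compare against V1's original score (a small sketch using objects computed above):
v1_group <- c("V1", "duplicate1", "duplicate2")
sum(rfImp3[v1_group, "Overall"])
rfImp1["V1", "Overall"]
From the tables above, the group total (about 10.9) is in the same range as V1's solo importance (about 8.8).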
set.seed(200)
cf_model <- cforest(
y ~ .,
data = simulated,
controls = cforest_unbiased(ntree = 1000, mtry = 3)
)
cf_imp_uncond <- varimp(cf_model, conditional = FALSE)
cf_imp_cond <- varimp(cf_model, conditional = TRUE)
sort(cf_imp_uncond, decreasing = TRUE)
## V4 V2 V1 duplicate1 duplicate2
## 5.4644815853 4.8052254155 4.8018033781 2.6695382536 2.0436920369
## V5 V3 V9 V6 V7
## 1.6010809886 0.0969794928 -0.0004679513 -0.0173316635 -0.0196927623
## V10 V8
## -0.0450997894 -0.0557297157
sort(cf_imp_cond, decreasing = TRUE)
## V4 V2 V1 duplicate1 V5
## 4.4226711685 3.5666822131 1.6777418311 1.1140625358 1.0434479741
## duplicate2 V3 V9 V6 V8
## 0.7704197225 0.0658172633 0.0072719954 0.0001442224 -0.0052747060
## V10 V7
## -0.0217284300 -0.0387706219
cf_imp_df <- data.frame(
Predictor = names(cf_imp_uncond),
Unconditional = as.numeric(cf_imp_uncond),
Conditional = as.numeric(cf_imp_cond)
)
cf_imp_long <- reshape(
cf_imp_df,
varying = c("Unconditional", "Conditional"),
v.names = "Importance",
timevar = "Type",
times = c("Unconditional", "Conditional"),
direction = "long"
)
ggplot(cf_imp_long, aes(x = reorder(Predictor, Importance), y = Importance)) +
geom_col() +
coord_flip() +
facet_wrap(~ Type, scales = "free_x") +
labs(x = "Predictor", y = "Importance")
Conditional permutation importance permutes a predictor within strata defined by the predictors correlated with it, so it should reduce the inflated, shared scores that correlated copies receive; here the scores of V1 and both duplicates shrink while the separation between informative and noise predictors is preserved.
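To see the adjustment directly, compare the two measures on V1 and its copies (using the vectors computed above):
v1_group <- c("V1", "duplicate1", "duplicate2")
cf_imp_uncond[v1_group]
cf_imp_cond[v1_group]
The conditional scores for the whole group are markedly smaller (roughly 4.8/2.7/2.0 unconditional versus 1.7/1.1/0.8 conditional).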
ctrl <- trainControl(method = "repeatedcv", number = 10, repeats = 3) # resampling scheme for all train() calls below
set.seed(200)
gbm_fit <- train(
y ~ .,
data = simulated,
method = "gbm",
trControl = ctrl,
verbose = FALSE
)
gbm_fit
## Stochastic Gradient Boosting
##
## 200 samples
## 12 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ...
## Resampling results across tuning parameters:
##
## interaction.depth n.trees RMSE Rsquared MAE
## 1 50 2.521559 0.7967193 2.062196
## 1 100 1.919345 0.8611503 1.536884
## 1 150 1.779234 0.8729550 1.419779
## 2 50 2.048086 0.8517974 1.657750
## 2 100 1.746686 0.8772150 1.398428
## 2 150 1.713835 0.8803533 1.368514
## 3 50 1.946691 0.8565902 1.574418
## 3 100 1.766196 0.8741471 1.416428
## 3 150 1.741639 0.8770742 1.393186
##
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
##
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were n.trees = 150, interaction.depth =
## 2, shrinkage = 0.1 and n.minobsinnode = 10.
gbm_imp <- varImp(gbm_fit, scale = FALSE)
gbm_imp
## gbm variable importance
##
## Overall
## V4 4425.03
## V2 3582.19
## V1 3394.46
## V5 1895.16
## V3 1327.24
## duplicate1 975.80
## V6 153.73
## V7 152.88
## V8 122.11
## V10 102.40
## V9 46.75
## duplicate2 0.00
plot(gbm_imp, top = 15)
set.seed(200)
cubist_fit <- train(
y ~ .,
data = simulated,
method = "cubist",
trControl = ctrl
)
cubist_fit
## Cubist
##
## 200 samples
## 12 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ...
## Resampling results across tuning parameters:
##
## committees neighbors RMSE Rsquared MAE
## 1 0 2.361002 0.7755099 1.925464
## 1 5 2.172225 0.8068897 1.763131
## 1 9 2.125867 0.8146543 1.744352
## 10 0 1.856919 0.8531682 1.461076
## 10 5 1.828996 0.8563161 1.446206
## 10 9 1.786864 0.8636357 1.407301
## 20 0 1.811890 0.8576760 1.413145
## 20 5 1.791767 0.8612386 1.420132
## 20 9 1.751824 0.8678259 1.382664
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were committees = 20 and neighbors = 9.
cubist_imp <- varImp(cubist_fit, scale = FALSE)
cubist_imp
## cubist variable importance
##
## Overall
## V1 72.0
## V2 57.0
## V4 49.0
## V3 42.0
## V5 37.5
## duplicate1 0.0
## V9 0.0
## V10 0.0
## V6 0.0
## duplicate2 0.0
## V7 0.0
## V8 0.0
plot(cubist_imp, top = 15)
Tree and rule-based models handle the correlated copies differently: the random forest spreads importance across V1 and both duplicates, boosting splits credit between V1 and duplicate1 but leaves duplicate2 at exactly zero, and Cubist's rule-based selection zeroes out both duplicates in favor of V1. The exact pattern depends on each model's variable selection mechanism.
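To line up how each model distributed credit among V1 and its copies, we can collect the three importance tables into one frame (a sketch; the scales are model-specific, so only within-model comparisons are meaningful):
v1_group <- c("V1", "duplicate1", "duplicate2")
data.frame(
RF = rfImp3[v1_group, "Overall"],
GBM = gbm_imp$importance[v1_group, "Overall"],
Cubist = cubist_imp$importance[v1_group, "Overall"],
row.names = v1_group
)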
Use a simulation to show tree bias with different granularities.
set.seed(200)
n <- 500
granularity_data <- data.frame(
y = rnorm(n),
x_continuous = runif(n),
x_binary = factor(sample(c("A", "B"), n, replace = TRUE)),
x_5_levels = factor(sample(letters[1:5], n, replace = TRUE)),
x_20_levels = factor(sample(letters[1:20], n, replace = TRUE))
)
set.seed(200)
tree_bias_rf <- randomForest(
y ~ .,
data = granularity_data,
importance = TRUE,
ntree = 1000
)
tree_bias_imp <- varImp(tree_bias_rf, scale = FALSE)
tree_bias_imp
## Overall
## x_continuous 0.0024660292
## x_binary -0.0007940447
## x_5_levels -0.0059307815
## x_20_levels 0.0149293181
plot(tree_bias_imp)
Because the response is pure noise, no predictor is truly important. Predictors with more possible split points nevertheless tend to receive higher importance; in this run x_20_levels, the factor with the most levels, scores highest, illustrating the selection bias of tree-based models.
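A single CART tree makes the same point more directly; a minimal sketch with rpart, grown deep by setting cp = 0:
library(rpart)
set.seed(200)
bias_tree <- rpart(
y ~ .,
data = granularity_data,
control = rpart.control(cp = 0)
)
bias_tree$variable.importance
With a pure-noise response, the high-granularity predictors tend to dominate variable.importance simply because they offer more candidate split points.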
data(solubility)
sol_train <- solTrainXtrans
sol_test <- solTestXtrans
sol_train_y <- solTrainY
sol_test_y <- solTestY
sol_train_dat <- data.frame(sol_train, Solubility = sol_train_y)
sol_test_dat <- data.frame(sol_test, Solubility = sol_test_y)
set.seed(200)
gbm_low <- gbm(
Solubility ~ .,
data = sol_train_dat,
distribution = "gaussian",
n.trees = 1000,
interaction.depth = 1,
shrinkage = 0.1,
bag.fraction = 0.1,
verbose = FALSE
)
set.seed(200)
gbm_high <- gbm(
Solubility ~ .,
data = sol_train_dat,
distribution = "gaussian",
n.trees = 1000,
interaction.depth = 1,
shrinkage = 0.9,
bag.fraction = 0.9,
verbose = FALSE
)
summary(gbm_low, n.trees = 1000)
## var rel.inf
## NumCarbon NumCarbon 13.93712745
## MolWeight MolWeight 13.11877326
## SurfaceArea1 SurfaceArea1 7.76088947
## HydrophilicFactor HydrophilicFactor 6.21647314
## NumMultBonds NumMultBonds 3.95822674
## SurfaceArea2 SurfaceArea2 3.75925457
## NumHalogen NumHalogen 2.65202683
## NumHydrogen NumHydrogen 2.27676649
## NumAtoms NumAtoms 2.20575121
## NumNonHBonds NumNonHBonds 1.98488391
## NumAromaticBonds NumAromaticBonds 1.66745519
## NumBonds NumBonds 1.57351335
## FP072 FP072 1.45716309
## NumRotBonds NumRotBonds 1.19725153
## NumNonHAtoms NumNonHAtoms 1.15490488
## NumOxygen NumOxygen 1.08009337
## NumRings NumRings 0.89036907
## FP172 FP172 0.85656466
## NumChlorine NumChlorine 0.83353495
## FP147 FP147 0.74561247
## FP111 FP111 0.73663576
## FP117 FP117 0.69396724
## FP135 FP135 0.68336326
## FP206 FP206 0.63290522
## NumDblBonds NumDblBonds 0.59773651
## FP173 FP173 0.57480897
## FP075 FP075 0.55331699
## FP116 FP116 0.53712904
## FP127 FP127 0.52459733
## FP084 FP084 0.49504064
## FP176 FP176 0.48556568
## FP140 FP140 0.48299012
## FP131 FP131 0.48152348
## FP137 FP137 0.45122465
## FP074 FP074 0.44881206
## FP012 FP012 0.42033889
## FP015 FP015 0.40593666
## FP057 FP057 0.40096593
## FP128 FP128 0.39781485
## FP085 FP085 0.38949878
## FP104 FP104 0.36743393
## FP010 FP010 0.36440198
## FP202 FP202 0.35848484
## FP148 FP148 0.35461691
## FP114 FP114 0.34955791
## FP122 FP122 0.34580218
## FP124 FP124 0.33899609
## FP101 FP101 0.32568672
## FP121 FP121 0.31657031
## FP099 FP099 0.31048165
## FP065 FP065 0.30316936
## FP126 FP126 0.29472954
## FP125 FP125 0.27707571
## FP171 FP171 0.27299398
## FP169 FP169 0.27097675
## FP064 FP064 0.26192829
## FP088 FP088 0.25952478
## FP134 FP134 0.25720092
## FP080 FP080 0.25552512
## FP142 FP142 0.25488063
## FP141 FP141 0.24167781
## FP106 FP106 0.23963092
## FP003 FP003 0.23791942
## FP059 FP059 0.23611611
## FP009 FP009 0.23380979
## FP076 FP076 0.22945730
## FP096 FP096 0.22287157
## FP181 FP181 0.22030606
## FP097 FP097 0.21692882
## FP047 FP047 0.21660297
## FP024 FP024 0.20759485
## FP025 FP025 0.20620851
## FP027 FP027 0.20616460
## FP115 FP115 0.20600075
## FP011 FP011 0.20595459
## FP058 FP058 0.20508356
## FP016 FP016 0.20047291
## FP149 FP149 0.19771292
## FP032 FP032 0.19717914
## FP020 FP020 0.19206050
## FP136 FP136 0.18954926
## FP019 FP019 0.18927442
## FP050 FP050 0.18867145
## FP078 FP078 0.18644216
## FP095 FP095 0.18221457
## NumNitrogen NumNitrogen 0.18066322
## FP120 FP120 0.18008097
## FP028 FP028 0.17969370
## FP081 FP081 0.17938197
## FP017 FP017 0.17836789
## FP069 FP069 0.17699420
## FP133 FP133 0.17581059
## FP067 FP067 0.17194134
## FP043 FP043 0.17171185
## FP130 FP130 0.16779522
## FP053 FP053 0.16477687
## FP163 FP163 0.16070131
## FP107 FP107 0.16030775
## FP060 FP060 0.15374506
## FP205 FP205 0.15340808
## FP174 FP174 0.14886719
## FP094 FP094 0.14691093
## FP008 FP008 0.14121659
## FP177 FP177 0.14004336
## FP071 FP071 0.13916372
## FP002 FP002 0.13890671
## FP161 FP161 0.13569269
## FP145 FP145 0.13497210
## FP123 FP123 0.13014147
## FP092 FP092 0.12919509
## FP152 FP152 0.12641089
## FP013 FP013 0.12335704
## FP162 FP162 0.12037325
## FP021 FP021 0.11285084
## FP110 FP110 0.11264701
## FP165 FP165 0.11126241
## FP073 FP073 0.10959383
## FP138 FP138 0.10811672
## FP030 FP030 0.10787433
## FP129 FP129 0.10601895
## FP112 FP112 0.10600049
## FP090 FP090 0.10305890
## FP018 FP018 0.10126798
## FP109 FP109 0.09876991
## FP100 FP100 0.09821082
## FP034 FP034 0.09798865
## FP007 FP007 0.09776562
## FP103 FP103 0.09679838
## FP070 FP070 0.09678373
## FP175 FP175 0.09493858
## FP113 FP113 0.09186634
## FP167 FP167 0.09059331
## FP153 FP153 0.08968578
## FP004 FP004 0.08789331
## FP039 FP039 0.08779429
## FP023 FP023 0.08681067
## FP048 FP048 0.08377994
## FP184 FP184 0.08372391
## FP179 FP179 0.08199564
## FP006 FP006 0.08186916
## FP188 FP188 0.08129637
## FP164 FP164 0.07991522
## FP178 FP178 0.07983419
## FP180 FP180 0.07653530
## FP170 FP170 0.07561430
## FP049 FP049 0.07439950
## FP063 FP063 0.07417870
## FP204 FP204 0.07324119
## FP056 FP056 0.06429984
## FP026 FP026 0.06426299
## FP066 FP066 0.06334936
## FP052 FP052 0.06305490
## FP166 FP166 0.06021700
## FP150 FP150 0.05931271
## FP051 FP051 0.05920356
## FP203 FP203 0.05882201
## FP093 FP093 0.05789723
## FP195 FP195 0.05614594
## FP087 FP087 0.05416911
## FP118 FP118 0.05374231
## FP182 FP182 0.05182214
## FP083 FP083 0.04706158
## FP189 FP189 0.04619662
## FP041 FP041 0.04521463
## FP098 FP098 0.04318721
## FP185 FP185 0.04203308
## FP157 FP157 0.04090116
## FP091 FP091 0.03879182
## FP046 FP046 0.03872224
## FP079 FP079 0.03854426
## FP035 FP035 0.03602132
## FP105 FP105 0.03233533
## FP108 FP108 0.03160314
## FP014 FP014 0.03129058
## FP029 FP029 0.03024988
## FP068 FP068 0.02319246
## FP001 FP001 0.00000000
## FP005 FP005 0.00000000
## FP022 FP022 0.00000000
## FP031 FP031 0.00000000
## FP033 FP033 0.00000000
## FP036 FP036 0.00000000
## FP037 FP037 0.00000000
## FP038 FP038 0.00000000
## FP040 FP040 0.00000000
## FP042 FP042 0.00000000
## FP044 FP044 0.00000000
## FP045 FP045 0.00000000
## FP054 FP054 0.00000000
## FP055 FP055 0.00000000
## FP061 FP061 0.00000000
## FP062 FP062 0.00000000
## FP077 FP077 0.00000000
## FP082 FP082 0.00000000
## FP086 FP086 0.00000000
## FP089 FP089 0.00000000
## FP102 FP102 0.00000000
## FP119 FP119 0.00000000
## FP132 FP132 0.00000000
## FP139 FP139 0.00000000
## FP143 FP143 0.00000000
## FP144 FP144 0.00000000
## FP146 FP146 0.00000000
## FP151 FP151 0.00000000
## FP154 FP154 0.00000000
## FP155 FP155 0.00000000
## FP156 FP156 0.00000000
## FP158 FP158 0.00000000
## FP159 FP159 0.00000000
## FP160 FP160 0.00000000
## FP168 FP168 0.00000000
## FP183 FP183 0.00000000
## FP186 FP186 0.00000000
## FP187 FP187 0.00000000
## FP190 FP190 0.00000000
## FP191 FP191 0.00000000
## FP192 FP192 0.00000000
## FP193 FP193 0.00000000
## FP194 FP194 0.00000000
## FP196 FP196 0.00000000
## FP197 FP197 0.00000000
## FP198 FP198 0.00000000
## FP199 FP199 0.00000000
## FP200 FP200 0.00000000
## FP201 FP201 0.00000000
## FP207 FP207 0.00000000
## FP208 FP208 0.00000000
## NumSulfer NumSulfer 0.00000000
summary(gbm_high, n.trees = 1000)
## var rel.inf
## NumCarbon NumCarbon 36.517034820
## SurfaceArea2 SurfaceArea2 17.555705181
## NumNonHAtoms NumNonHAtoms 10.111406124
## MolWeight MolWeight 4.319654975
## FP172 FP172 3.856395896
## SurfaceArea1 SurfaceArea1 3.352005625
## NumAromaticBonds NumAromaticBonds 2.137127270
## HydrophilicFactor HydrophilicFactor 2.072022899
## NumChlorine NumChlorine 1.977582593
## FP134 FP134 1.663972163
## FP204 FP204 1.602802113
## FP135 FP135 1.231177069
## NumHydrogen NumHydrogen 0.833969166
## NumNonHBonds NumNonHBonds 0.739393513
## NumMultBonds NumMultBonds 0.676437444
## FP059 FP059 0.560404310
## NumRotBonds NumRotBonds 0.467267768
## NumAtoms NumAtoms 0.454830851
## NumBonds NumBonds 0.442591808
## NumOxygen NumOxygen 0.396316765
## FP142 FP142 0.390322257
## FP125 FP125 0.365753946
## FP190 FP190 0.325472010
## FP154 FP154 0.312586895
## FP039 FP039 0.240944953
## FP184 FP184 0.228150459
## FP133 FP133 0.223438096
## FP011 FP011 0.218027852
## NumHalogen NumHalogen 0.206131587
## FP099 FP099 0.183582613
## FP203 FP203 0.181377664
## FP163 FP163 0.180651265
## FP111 FP111 0.173591718
## FP174 FP174 0.167972976
## FP178 FP178 0.153369194
## NumRings NumRings 0.151274463
## FP176 FP176 0.148673636
## FP040 FP040 0.136605607
## FP116 FP116 0.135385087
## FP143 FP143 0.121850657
## FP127 FP127 0.117943580
## FP083 FP083 0.116164951
## NumSulfer NumSulfer 0.114426257
## FP129 FP129 0.110793007
## FP202 FP202 0.107821205
## FP094 FP094 0.107295727
## FP147 FP147 0.105341949
## FP085 FP085 0.097070120
## FP171 FP171 0.095007488
## NumDblBonds NumDblBonds 0.091540999
## FP075 FP075 0.086093735
## FP131 FP131 0.084665123
## FP060 FP060 0.084302035
## FP065 FP065 0.083355947
## FP193 FP193 0.071497163
## FP198 FP198 0.071479166
## FP074 FP074 0.070314410
## FP026 FP026 0.070215291
## FP035 FP035 0.065356235
## FP124 FP124 0.064904086
## FP110 FP110 0.062330885
## FP055 FP055 0.062247119
## FP126 FP126 0.061935838
## FP043 FP043 0.059282976
## FP029 FP029 0.058795377
## FP057 FP057 0.057599902
## FP044 FP044 0.057513945
## FP042 FP042 0.057499436
## FP054 FP054 0.057345829
## FP049 FP049 0.055939976
## FP064 FP064 0.055670202
## FP004 FP004 0.054902848
## FP027 FP027 0.054013394
## FP102 FP102 0.053274435
## FP159 FP159 0.052727062
## FP188 FP188 0.050481180
## FP103 FP103 0.048056372
## FP136 FP136 0.047361852
## FP071 FP071 0.046679957
## FP158 FP158 0.044550412
## FP023 FP023 0.044356718
## FP036 FP036 0.043751908
## FP006 FP006 0.043703930
## FP201 FP201 0.042673169
## FP186 FP186 0.041151149
## FP173 FP173 0.040963698
## FP081 FP081 0.039360801
## FP160 FP160 0.038032300
## FP033 FP033 0.037869943
## FP128 FP128 0.037167669
## FP016 FP016 0.036646632
## FP164 FP164 0.035513471
## FP015 FP015 0.032769350
## FP078 FP078 0.032647792
## FP148 FP148 0.032313973
## FP207 FP207 0.032086123
## FP024 FP024 0.031211017
## FP018 FP018 0.030928932
## FP122 FP122 0.030672680
## FP161 FP161 0.027967764
## FP063 FP063 0.027644942
## FP119 FP119 0.026770916
## FP022 FP022 0.026239217
## FP001 FP001 0.025796348
## FP208 FP208 0.025639193
## FP090 FP090 0.025445162
## FP070 FP070 0.024900892
## FP180 FP180 0.024414820
## FP108 FP108 0.024141367
## FP167 FP167 0.023354616
## FP189 FP189 0.022795457
## FP101 FP101 0.021986642
## FP007 FP007 0.021834879
## FP144 FP144 0.021582094
## FP146 FP146 0.021274457
## FP105 FP105 0.021222634
## FP076 FP076 0.021178666
## FP139 FP139 0.020762648
## FP109 FP109 0.020652098
## FP019 FP019 0.020154093
## FP025 FP025 0.019982575
## FP177 FP177 0.019833735
## FP030 FP030 0.019418702
## FP034 FP034 0.019175599
## FP132 FP132 0.018530167
## FP170 FP170 0.017701916
## FP091 FP091 0.017684155
## FP017 FP017 0.017377315
## FP038 FP038 0.017247661
## FP112 FP112 0.016860486
## FP066 FP066 0.016649316
## FP031 FP031 0.016404502
## FP165 FP165 0.016373159
## FP114 FP114 0.015939417
## FP151 FP151 0.015897587
## FP121 FP121 0.015491734
## FP157 FP157 0.015290433
## FP093 FP093 0.015245672
## FP150 FP150 0.014481891
## FP155 FP155 0.013919812
## FP137 FP137 0.013703086
## FP097 FP097 0.013562110
## FP045 FP045 0.013338042
## FP051 FP051 0.013185059
## FP073 FP073 0.013152162
## FP028 FP028 0.012596907
## FP199 FP199 0.012049789
## FP020 FP020 0.011783358
## FP056 FP056 0.011326546
## FP095 FP095 0.011282886
## FP048 FP048 0.010253725
## FP080 FP080 0.010039002
## FP182 FP182 0.009966687
## FP183 FP183 0.009942464
## FP010 FP010 0.009907849
## FP041 FP041 0.009372163
## FP009 FP009 0.009304094
## FP046 FP046 0.009282731
## NumNitrogen NumNitrogen 0.008993053
## FP050 FP050 0.008387931
## FP194 FP194 0.008197575
## FP077 FP077 0.008156274
## FP106 FP106 0.007932124
## FP169 FP169 0.007733040
## FP058 FP058 0.007710316
## FP087 FP087 0.007517236
## FP113 FP113 0.007406374
## FP149 FP149 0.007110756
## FP192 FP192 0.006444042
## FP138 FP138 0.006259708
## FP140 FP140 0.006041124
## FP047 FP047 0.005984927
## FP141 FP141 0.005648136
## FP079 FP079 0.005536544
## FP088 FP088 0.005196089
## FP123 FP123 0.004944213
## FP082 FP082 0.004607126
## FP002 FP002 0.000000000
## FP003 FP003 0.000000000
## FP005 FP005 0.000000000
## FP008 FP008 0.000000000
## FP012 FP012 0.000000000
## FP013 FP013 0.000000000
## FP014 FP014 0.000000000
## FP021 FP021 0.000000000
## FP032 FP032 0.000000000
## FP037 FP037 0.000000000
## FP052 FP052 0.000000000
## FP053 FP053 0.000000000
## FP061 FP061 0.000000000
## FP062 FP062 0.000000000
## FP067 FP067 0.000000000
## FP068 FP068 0.000000000
## FP069 FP069 0.000000000
## FP072 FP072 0.000000000
## FP084 FP084 0.000000000
## FP086 FP086 0.000000000
## FP089 FP089 0.000000000
## FP092 FP092 0.000000000
## FP096 FP096 0.000000000
## FP098 FP098 0.000000000
## FP100 FP100 0.000000000
## FP104 FP104 0.000000000
## FP107 FP107 0.000000000
## FP115 FP115 0.000000000
## FP117 FP117 0.000000000
## FP118 FP118 0.000000000
## FP120 FP120 0.000000000
## FP130 FP130 0.000000000
## FP145 FP145 0.000000000
## FP152 FP152 0.000000000
## FP153 FP153 0.000000000
## FP156 FP156 0.000000000
## FP162 FP162 0.000000000
## FP166 FP166 0.000000000
## FP168 FP168 0.000000000
## FP175 FP175 0.000000000
## FP179 FP179 0.000000000
## FP181 FP181 0.000000000
## FP185 FP185 0.000000000
## FP187 FP187 0.000000000
## FP191 FP191 0.000000000
## FP195 FP195 0.000000000
## FP196 FP196 0.000000000
## FP197 FP197 0.000000000
## FP200 FP200 0.000000000
## FP205 FP205 0.000000000
## FP206 FP206 0.000000000
The model with high shrinkage (0.9) and a high bagging fraction (0.9) takes large steps on most of the data at each iteration, so it quickly commits to the predictors selected early and concentrates importance on them. The model with low shrinkage (0.1) and a low bagging fraction (0.1) learns gradually on small subsamples and spreads importance across many more predictors.
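The concentration can be quantified by extracting the influence tables without plotting (summary.gbm accepts plotit = FALSE) and asking how much of the total the top predictors absorb:
imp_low <- summary(gbm_low, n.trees = 1000, plotit = FALSE)
imp_high <- summary(gbm_high, n.trees = 1000, plotit = FALSE)
sum(head(imp_low$rel.inf, 5)) # top-5 share, low shrinkage/bagging
sum(head(imp_high$rel.inf, 5)) # top-5 share, high shrinkage/bagging
From the tables above, the top five predictors carry roughly 45% of the influence in the low-parameter model versus about 72% in the high-parameter model.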
pred_low <- predict(gbm_low, sol_test_dat, n.trees = 1000)
pred_high <- predict(gbm_high, sol_test_dat, n.trees = 1000)
postResample(pred_low, sol_test_y)
## RMSE Rsquared MAE
## 0.6942028 0.8883140 0.5118158
postResample(pred_high, sol_test_y)
## RMSE Rsquared MAE
## 0.7082669 0.8850553 0.5174387
The model with lower shrinkage is generally expected to be more predictive on new samples because it learns gradually and is less prone to overfitting; here the test-set difference is small but in that direction (RMSE 0.694 versus 0.708).
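Because both fits used bag.fraction < 1, gbm can also estimate the optimal number of iterations from the out-of-bag improvement (a rough diagnostic; gbm's documentation notes the OOB method tends to underestimate):
gbm.perf(gbm_low, method = "OOB", plot.it = FALSE)
gbm.perf(gbm_high, method = "OOB", plot.it = FALSE)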
set.seed(200)
gbm_low_depth3 <- gbm(
Solubility ~ .,
data = sol_train_dat,
distribution = "gaussian",
n.trees = 1000,
interaction.depth = 3,
shrinkage = 0.1,
bag.fraction = 0.1,
verbose = FALSE
)
summary(gbm_low_depth3, n.trees = 1000)
## var rel.inf
## MolWeight MolWeight 13.233982162
## NumCarbon NumCarbon 10.619910686
## HydrophilicFactor HydrophilicFactor 6.647749027
## SurfaceArea1 SurfaceArea1 6.352973247
## SurfaceArea2 SurfaceArea2 4.120100725
## NumHydrogen NumHydrogen 3.205089501
## NumMultBonds NumMultBonds 3.186035059
## NumBonds NumBonds 2.654066288
## NumAtoms NumAtoms 2.050341213
## NumRotBonds NumRotBonds 1.964574413
## NumNonHBonds NumNonHBonds 1.929190677
## FP072 FP072 1.778623523
## NumNonHAtoms NumNonHAtoms 1.764700784
## NumAromaticBonds NumAromaticBonds 1.528715434
## NumHalogen NumHalogen 1.102828947
## FP084 FP084 1.068563726
## FP075 FP075 0.861320896
## NumOxygen NumOxygen 0.777089750
## FP071 FP071 0.644620406
## FP122 FP122 0.597273160
## NumRings NumRings 0.563766634
## FP173 FP173 0.538044769
## FP085 FP085 0.519969273
## FP116 FP116 0.513178810
## FP107 FP107 0.491236603
## FP074 FP074 0.487863593
## FP063 FP063 0.484914934
## FP111 FP111 0.473027285
## FP104 FP104 0.472079375
## FP003 FP003 0.469729059
## FP070 FP070 0.439638165
## NumDblBonds NumDblBonds 0.433769794
## FP099 FP099 0.433122497
## FP080 FP080 0.427483120
## FP128 FP128 0.418614895
## FP124 FP124 0.414053047
## FP134 FP134 0.404004135
## FP078 FP078 0.398040794
## FP106 FP106 0.398019385
## FP011 FP011 0.396958813
## FP008 FP008 0.389238373
## FP098 FP098 0.376716457
## FP170 FP170 0.376657683
## FP168 FP168 0.372950979
## FP135 FP135 0.366723845
## FP081 FP081 0.362787298
## FP092 FP092 0.359012731
## FP142 FP142 0.357318777
## FP010 FP010 0.353714315
## FP069 FP069 0.349775631
## FP120 FP120 0.345803475
## FP140 FP140 0.345007882
## FP101 FP101 0.343704532
## FP206 FP206 0.339453829
## FP172 FP172 0.336705150
## FP115 FP115 0.335521578
## FP046 FP046 0.335218761
## NumNitrogen NumNitrogen 0.331248030
## FP012 FP012 0.324932644
## FP015 FP015 0.320540513
## FP169 FP169 0.319549117
## FP009 FP009 0.311925389
## FP097 FP097 0.304094417
## FP004 FP004 0.303456967
## FP102 FP102 0.302554697
## FP109 FP109 0.300225707
## FP076 FP076 0.298087956
## FP138 FP138 0.296936662
## FP064 FP064 0.293119117
## FP117 FP117 0.290266050
## FP164 FP164 0.286883659
## FP113 FP113 0.280087009
## FP125 FP125 0.276068841
## FP073 FP073 0.272078660
## FP088 FP088 0.265711592
## FP016 FP016 0.262207780
## FP066 FP066 0.255319406
## FP121 FP121 0.253193934
## FP096 FP096 0.247250652
## FP094 FP094 0.243882612
## FP058 FP058 0.238679999
## FP176 FP176 0.230893975
## FP105 FP105 0.227273037
## FP114 FP114 0.226926366
## FP027 FP027 0.226457863
## FP127 FP127 0.222094699
## FP131 FP131 0.219172127
## FP118 FP118 0.216758036
## FP132 FP132 0.216077455
## FP145 FP145 0.214888933
## FP018 FP018 0.214569119
## FP060 FP060 0.210835254
## FP119 FP119 0.210080254
## FP103 FP103 0.207219959
## FP093 FP093 0.200768942
## FP057 FP057 0.195896569
## FP126 FP126 0.194142070
## FP017 FP017 0.189164364
## FP082 FP082 0.186560787
## FP047 FP047 0.186472452
## FP123 FP123 0.185580235
## FP068 FP068 0.185369056
## FP144 FP144 0.183148749
## FP062 FP062 0.180764689
## FP171 FP171 0.179914912
## FP083 FP083 0.173030469
## FP130 FP130 0.172362534
## FP202 FP202 0.171628387
## FP147 FP147 0.170881161
## FP166 FP166 0.169599626
## FP067 FP067 0.167674681
## FP095 FP095 0.167278962
## FP065 FP065 0.161890542
## NumChlorine NumChlorine 0.161835692
## FP013 FP013 0.159885841
## FP163 FP163 0.150015707
## FP007 FP007 0.146656232
## FP006 FP006 0.143765711
## FP050 FP050 0.142281002
## FP061 FP061 0.141628447
## FP162 FP162 0.139375337
## FP174 FP174 0.135390252
## FP108 FP108 0.132794037
## FP165 FP165 0.129783073
## FP023 FP023 0.128913609
## FP022 FP022 0.123664614
## FP029 FP029 0.123227594
## FP002 FP002 0.112732648
## FP149 FP149 0.110514848
## FP087 FP087 0.108929383
## FP086 FP086 0.106064814
## FP030 FP030 0.103116900
## FP146 FP146 0.098878293
## FP177 FP177 0.092027233
## FP025 FP025 0.091270502
## FP153 FP153 0.090046700
## FP136 FP136 0.089152658
## FP053 FP053 0.088205709
## FP112 FP112 0.087615950
## FP137 FP137 0.087091097
## FP167 FP167 0.085769505
## FP129 FP129 0.085577579
## FP205 FP205 0.080029380
## FP028 FP028 0.079540004
## FP184 FP184 0.079071889
## FP089 FP089 0.078181431
## FP178 FP178 0.074598741
## FP133 FP133 0.072037569
## FP110 FP110 0.070723664
## FP091 FP091 0.070608464
## FP100 FP100 0.068097335
## FP141 FP141 0.065388185
## FP034 FP034 0.065363018
## FP161 FP161 0.064797482
## FP079 FP079 0.060023825
## FP024 FP024 0.057288319
## FP157 FP157 0.057058817
## FP188 FP188 0.054269552
## FP152 FP152 0.053763085
## FP035 FP035 0.051436684
## FP090 FP090 0.050913061
## FP148 FP148 0.047747726
## FP182 FP182 0.047690701
## FP001 FP001 0.045074428
## FP051 FP051 0.044173675
## FP180 FP180 0.043437659
## FP150 FP150 0.043314230
## FP049 FP049 0.041779855
## FP037 FP037 0.034872295
## FP039 FP039 0.033946261
## FP021 FP021 0.032945813
## FP185 FP185 0.032813805
## FP033 FP033 0.031116459
## FP031 FP031 0.029842508
## FP026 FP026 0.029203793
## FP189 FP189 0.028297908
## FP187 FP187 0.027986939
## FP179 FP179 0.027966977
## FP181 FP181 0.027525054
## FP203 FP203 0.026390764
## FP056 FP056 0.026385850
## FP077 FP077 0.023820007
## FP032 FP032 0.022157817
## FP040 FP040 0.021614697
## FP156 FP156 0.018529979
## FP186 FP186 0.017984106
## FP038 FP038 0.016077750
## FP183 FP183 0.013258309
## FP055 FP055 0.012751745
## FP045 FP045 0.012643736
## FP014 FP014 0.011909641
## FP175 FP175 0.009542459
## FP020 FP020 0.006925307
## FP190 FP190 0.005616695
## FP005 FP005 0.000000000
## FP019 FP019 0.000000000
## FP036 FP036 0.000000000
## FP041 FP041 0.000000000
## FP042 FP042 0.000000000
## FP043 FP043 0.000000000
## FP044 FP044 0.000000000
## FP048 FP048 0.000000000
## FP052 FP052 0.000000000
## FP054 FP054 0.000000000
## FP059 FP059 0.000000000
## FP139 FP139 0.000000000
## FP143 FP143 0.000000000
## FP151 FP151 0.000000000
## FP154 FP154 0.000000000
## FP155 FP155 0.000000000
## FP158 FP158 0.000000000
## FP159 FP159 0.000000000
## FP160 FP160 0.000000000
## FP191 FP191 0.000000000
## FP192 FP192 0.000000000
## FP193 FP193 0.000000000
## FP194 FP194 0.000000000
## FP195 FP195 0.000000000
## FP196 FP196 0.000000000
## FP197 FP197 0.000000000
## FP198 FP198 0.000000000
## FP199 FP199 0.000000000
## FP200 FP200 0.000000000
## FP201 FP201 0.000000000
## FP204 FP204 0.000000000
## FP207 FP207 0.000000000
## FP208 FP208 0.000000000
## NumSulfer NumSulfer 0.000000000
Increasing the interaction depth lets each tree model interactions and more complex structure. This flattens the slope of the importance profile: credit spreads across more predictors, especially variables that are useful only through interactions, though deeper trees can also increase overfitting.
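The same concentration check as before shows the spread (a sketch comparing the depth-1 and depth-3 low-shrinkage fits):
imp_depth1 <- summary(gbm_low, n.trees = 1000, plotit = FALSE)
imp_depth3 <- summary(gbm_low_depth3, n.trees = 1000, plotit = FALSE)
sum(imp_depth1$rel.inf > 0) # predictors used at depth 1
sum(imp_depth3$rel.inf > 0) # predictors used at depth 3
In the tables above, the deeper model leaves fewer predictors at exactly zero influence, consistent with importance spreading.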
Fit tree-based models with a single solubility predictor, MolWeight, to compare the shapes of their fitted functions.
one_pred_train <- data.frame(
MolWeight = solTrainXtrans$MolWeight,
Solubility = solTrainY
)
one_pred_test <- data.frame(
MolWeight = solTestXtrans$MolWeight,
Solubility = solTestY
)
set.seed(200)
tree_one <- train(
Solubility ~ MolWeight,
data = one_pred_train,
method = "rpart",
trControl = ctrl,
tuneLength = 10
)
tree_one
## CART
##
## 951 samples
## 1 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 856, 857, 855, 856, 856, 856, ...
## Resampling results across tuning parameters:
##
## cp RMSE Rsquared MAE
## 0.004859809 1.513942 0.4553317 1.146843
## 0.005682050 1.516304 0.4538220 1.151006
## 0.007133559 1.529476 0.4421052 1.167605
## 0.007232122 1.531079 0.4407651 1.169155
## 0.007475879 1.536400 0.4370756 1.173360
## 0.009950241 1.567202 0.4149204 1.201659
## 0.012109468 1.580066 0.4065854 1.216871
## 0.041878909 1.637650 0.3628154 1.266517
## 0.044542785 1.648801 0.3539423 1.276161
## 0.351234818 1.874180 0.2970638 1.464068
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was cp = 0.004859809.
set.seed(200)
rf_one <- train(
Solubility ~ MolWeight,
data = one_pred_train,
method = "rf",
trControl = ctrl,
tuneLength = 5
)
rf_one
## Random Forest
##
## 951 samples
## 1 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 856, 857, 855, 856, 856, 856, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 1.428304 0.5377322 1.010235
##
## Tuning parameter 'mtry' was held constant at a value of 2
set.seed(200)
cubist_one <- train(
Solubility ~ MolWeight,
data = one_pred_train,
method = "cubist",
trControl = ctrl,
tuneLength = 10
)
cubist_one
## Cubist
##
## 951 samples
## 1 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 856, 857, 855, 856, 856, 856, ...
## Resampling results across tuning parameters:
##
## committees neighbors RMSE Rsquared MAE
## 1 0 1.543500 0.4444768 1.135313
## 1 5 1.601461 0.3983413 1.212392
## 1 9 1.600714 0.3983907 1.212859
## 10 0 1.531000 0.4527305 1.134907
## 10 5 1.592542 0.4007236 1.210559
## 10 9 1.591739 0.4008521 1.210842
## 20 0 1.531551 0.4523299 1.135052
## 20 5 1.592433 0.4008041 1.210413
## 20 9 1.591720 0.4008763 1.210689
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were committees = 10 and neighbors = 0.
plot_dat <- one_pred_test
plot_dat$Tree <- predict(tree_one, one_pred_test)
plot_dat$RandomForest <- predict(rf_one, one_pred_test)
plot_dat$Cubist <- predict(cubist_one, one_pred_test)
plot_long <- reshape(
plot_dat,
varying = c("Tree", "RandomForest", "Cubist"),
v.names = "Prediction",
timevar = "Model",
times = c("Tree", "RandomForest", "Cubist"),
direction = "long"
)
ggplot(plot_long, aes(x = MolWeight, y = Solubility)) +
geom_point(alpha = 0.5) +
geom_line(aes(y = Prediction), linewidth = 1) +
facet_wrap(~ Model) +
labs(x = "Molecular Weight", y = "Solubility")
The single tree gives a step-function fit. Random forests smooth the step behavior by averaging many trees. Cubist produces rule-based linear models and may show piecewise linear behavior.
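Test-set accuracy for the three single-predictor fits can be tabulated with postResample (a small sketch using the predictions computed above):
rbind(
Tree = postResample(plot_dat$Tree, one_pred_test$Solubility),
RandomForest = postResample(plot_dat$RandomForest, one_pred_test$Solubility),
Cubist = postResample(plot_dat$Cubist, one_pred_test$Solubility)
)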
Tecator data: predict fat content from 100-channel infrared absorbance spectra.
data("tecator", package = "caret")
tec_x <- absorp
tec_y <- endpoints[, 2] # endpoints columns are water, fat, protein; fat is column 2
set.seed(200)
tec_split <- createDataPartition(tec_y, p = 0.8, list = FALSE)
tec_train_x <- tec_x[tec_split, ]
tec_test_x <- tec_x[-tec_split, ]
tec_train_y <- tec_y[tec_split]
tec_test_y <- tec_y[-tec_split]
tec_train_dat <- data.frame(tec_train_x, Fat = tec_train_y)
tec_test_dat <- data.frame(tec_test_x, Fat = tec_test_y)
set.seed(200)
tec_rf <- train(
Fat ~ .,
data = tec_train_dat,
method = "rf",
trControl = ctrl,
tuneLength = 5
)
set.seed(200)
tec_gbm <- train(
Fat ~ .,
data = tec_train_dat,
method = "gbm",
trControl = ctrl,
tuneLength = 5,
verbose = FALSE
)
set.seed(200)
tec_cubist <- train(
Fat ~ .,
data = tec_train_dat,
method = "cubist",
trControl = ctrl,
tuneLength = 5
)
tec_rf
## Random Forest
##
## 175 samples
## 100 predictors
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 159, 158, 157, 158, 158, 157, ...
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared MAE
## 2 6.264721 0.5934020 5.014753
## 26 5.744139 0.6581444 4.573580
## 51 5.714336 0.6616948 4.549824
## 75 5.696486 0.6658019 4.550120
## 100 5.696182 0.6642335 4.541491
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 100.
tec_gbm
## Stochastic Gradient Boosting
##
## 175 samples
## 100 predictors
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 159, 158, 157, 158, 158, 157, ...
## Resampling results across tuning parameters:
##
## interaction.depth n.trees RMSE Rsquared MAE
## 1 50 7.769841 0.3932559 6.525743
## 1 100 7.356951 0.4655490 6.189189
## 1 150 6.976578 0.5187359 5.839259
## 1 200 6.653419 0.5615663 5.553646
## 1 250 6.407074 0.5939155 5.326668
## 2 50 6.727954 0.5513346 5.633951
## 2 100 5.861585 0.6554409 4.836730
## 2 150 5.519354 0.6894326 4.473654
## 2 200 5.297229 0.7072049 4.225001
## 2 250 5.205034 0.7170156 4.108108
## 3 50 6.087864 0.6323005 5.059691
## 3 100 5.418064 0.6971827 4.368829
## 3 150 5.133973 0.7246940 4.084392
## 3 200 5.023936 0.7332142 3.961366
## 3 250 4.995391 0.7338307 3.911470
## 4 50 5.781810 0.6649331 4.752156
## 4 100 5.207728 0.7160135 4.160910
## 4 150 5.005814 0.7339828 3.915792
## 4 200 4.951430 0.7375537 3.850069
## 4 250 4.924128 0.7397336 3.822926
## 5 50 5.648116 0.6749776 4.644111
## 5 100 5.112722 0.7258205 4.077695
## 5 150 4.985183 0.7354320 3.913944
## 5 200 4.941802 0.7387479 3.845527
## 5 250 4.944870 0.7370864 3.829234
##
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
##
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were n.trees = 250, interaction.depth =
## 4, shrinkage = 0.1 and n.minobsinnode = 10.
tec_cubist
## Cubist
##
## 175 samples
## 100 predictors
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 159, 158, 157, 158, 158, 157, ...
## Resampling results across tuning parameters:
##
## committees neighbors RMSE Rsquared MAE
## 1 0 1.947492 0.9623705 1.402281
## 1 5 1.903049 0.9633434 1.333982
## 1 9 1.903597 0.9637864 1.353741
## 10 0 1.653946 0.9726509 1.163971
## 10 5 1.660673 0.9716167 1.133900
## 10 9 1.647019 0.9723705 1.140817
## 20 0 1.616927 0.9748740 1.143220
## 20 5 1.621020 0.9739106 1.105665
## 20 9 1.610114 0.9746221 1.118134
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were committees = 20 and neighbors = 9.
tec_results <- data.frame(
Model = c("Random Forest", "Boosted Tree", "Cubist"),
RMSE = c(
postResample(predict(tec_rf, tec_test_dat), tec_test_y)["RMSE"],
postResample(predict(tec_gbm, tec_test_dat), tec_test_y)["RMSE"],
postResample(predict(tec_cubist, tec_test_dat), tec_test_y)["RMSE"]
),
Rsquared = c(
postResample(predict(tec_rf, tec_test_dat), tec_test_y)["Rsquared"],
postResample(predict(tec_gbm, tec_test_dat), tec_test_y)["Rsquared"],
postResample(predict(tec_cubist, tec_test_dat), tec_test_y)["Rsquared"]
)
)
tec_results
## Model RMSE Rsquared
## 1 Random Forest 5.169434 0.7822691
## 2 Boosted Tree 4.967872 0.7907837
## 3 Cubist 1.425098 0.9821022
The Tecator predictors are highly correlated spectral measurements. Transformations such as PCA, PLS scores, derivatives, or feature filtering can reduce collinearity before modeling.
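As an illustration of how compressible the spectra are, a PCA pre-processing step retaining 95% of the variance usually needs only a handful of components (a sketch with caret::preProcess):
tec_pca <- preProcess(as.data.frame(tec_train_x), method = "pca", thresh = 0.95)
tec_pca$numComp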
Permeability data: predict permeability from 1,107 binary molecular fingerprints.
data(permeability)
perm_train_x <- fingerprints
perm_train_y <- permeability
set.seed(200)
perm_split <- createDataPartition(perm_train_y, p = 0.8, list = FALSE)
perm_x_train <- perm_train_x[perm_split, ]
perm_x_test <- perm_train_x[-perm_split, ]
perm_y_train <- perm_train_y[perm_split]
perm_y_test <- perm_train_y[-perm_split]
perm_train_dat <- data.frame(perm_x_train, permeability = perm_y_train)
perm_test_dat <- data.frame(perm_x_test, permeability = perm_y_test)
set.seed(200)
perm_rf <- train(
permeability ~ .,
data = perm_train_dat,
method = "rf",
trControl = ctrl,
tuneLength = 5
)
set.seed(200)
perm_gbm <- train(
permeability ~ .,
data = perm_train_dat,
method = "gbm",
trControl = ctrl,
tuneLength = 5,
verbose = FALSE
)
set.seed(200)
perm_cubist <- train(
permeability ~ .,
data = perm_train_dat,
method = "cubist",
trControl = ctrl,
tuneLength = 5
)
perm_rf
## Random Forest
##
## 133 samples
## 1107 predictors
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 118, 120, 120, 120, 121, 120, ...
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared MAE
## 2 12.94482 0.4976882 10.188432
## 9 10.41951 0.5804964 7.817710
## 47 10.09303 0.5922681 7.323269
## 228 10.32044 0.5730140 7.138908
## 1106 10.67343 0.5456930 7.209991
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 47.
perm_gbm
## Stochastic Gradient Boosting
##
## 133 samples
## 1107 predictors
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 118, 120, 120, 120, 121, 120, ...
## Resampling results across tuning parameters:
##
## interaction.depth n.trees RMSE Rsquared MAE
## 1 50 11.16289 0.5100886 7.994561
## 1 100 11.06110 0.5134304 8.167640
## 1 150 11.09390 0.5056690 8.285280
## 1 200 11.03714 0.5100896 8.241052
## 1 250 11.04629 0.5082170 8.207634
## 2 50 10.82431 0.5351475 7.867504
## 2 100 10.64380 0.5406417 7.847871
## 2 150 10.59024 0.5440277 7.827900
## 2 200 10.49414 0.5511776 7.781074
## 2 250 10.47955 0.5564086 7.754257
## 3 50 10.73176 0.5493965 7.817974
## 3 100 10.55940 0.5601892 7.730837
## 3 150 10.50008 0.5649741 7.709304
## 3 200 10.53677 0.5590194 7.706463
## 3 250 10.45245 0.5663934 7.677084
## 4 50 10.80009 0.5342352 7.883804
## 4 100 10.66247 0.5502385 7.791864
## 4 150 10.64013 0.5535321 7.819622
## 4 200 10.66811 0.5494964 7.831996
## 4 250 10.67480 0.5502911 7.861906
## 5 50 10.82691 0.5312048 7.888228
## 5 100 10.47861 0.5551901 7.657204
## 5 150 10.40627 0.5600670 7.627996
## 5 200 10.43716 0.5623544 7.637752
## 5 250 10.44075 0.5605119 7.675183
##
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
##
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were n.trees = 150, interaction.depth =
## 5, shrinkage = 0.1 and n.minobsinnode = 10.
perm_cubist
## Cubist
##
## 133 samples
## 1107 predictors
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 118, 120, 120, 120, 121, 120, ...
## Resampling results across tuning parameters:
##
## committees neighbors RMSE Rsquared MAE
## 1 0 11.056438 0.5005614 7.142603
## 1 5 10.480627 0.5565916 6.982333
## 1 9 10.515584 0.5472524 7.065254
## 10 0 10.377633 0.5499849 6.878957
## 10 5 9.857114 0.5990505 6.699756
## 10 9 9.945818 0.5867134 6.762198
## 20 0 10.351670 0.5529765 6.756075
## 20 5 9.758792 0.6050690 6.592828
## 20 9 9.741740 0.6054557 6.632982
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were committees = 20 and neighbors = 9.
perm_results <- data.frame(
Model = c("Random Forest", "Boosted Tree", "Cubist"),
RMSE = c(
postResample(predict(perm_rf, perm_test_dat), perm_y_test)["RMSE"],
postResample(predict(perm_gbm, perm_test_dat), perm_y_test)["RMSE"],
postResample(predict(perm_cubist, perm_test_dat), perm_y_test)["RMSE"]
),
Rsquared = c(
postResample(predict(perm_rf, perm_test_dat), perm_y_test)["Rsquared"],
postResample(predict(perm_gbm, perm_test_dat), perm_y_test)["Rsquared"],
postResample(predict(perm_cubist, perm_test_dat), perm_y_test)["Rsquared"]
)
)
perm_results
## Model RMSE Rsquared
## 1 Random Forest 14.22900 0.3061684
## 2 Boosted Tree 12.09356 0.5166912
## 3 Cubist 13.43411 0.3902581
Use test-set RMSE and resampling RMSE to compare models. Lower RMSE and higher \(R^2\) indicate better predictive performance.
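Because the three fits share the same seed and trainControl object, their cross-validation folds match, and caret::resamples can compare them on resampling RMSE directly:
perm_resamples <- resamples(list(
RF = perm_rf,
GBM = perm_gbm,
Cubist = perm_cubist
))
summary(perm_resamples)$statistics$RMSE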
Chemical manufacturing process data: predict product Yield from biological material and manufacturing process predictors.
data(ChemicalManufacturingProcess)
chem <- ChemicalManufacturingProcess
set.seed(200)
chem_split <- createDataPartition(chem$Yield, p = 0.8, list = FALSE)
chem_train <- chem[chem_split, ]
chem_test <- chem[-chem_split, ]
chem_pp <- preProcess(
chem_train[, names(chem_train) != "Yield"],
method = c("medianImpute", "center", "scale")
)
chem_train_x <- predict(chem_pp, chem_train[, names(chem_train) != "Yield"])
chem_test_x <- predict(chem_pp, chem_test[, names(chem_test) != "Yield"])
chem_train_dat <- data.frame(chem_train_x, Yield = chem_train$Yield)
chem_test_dat <- data.frame(chem_test_x, Yield = chem_test$Yield)
set.seed(200)
chem_tree <- train(
Yield ~ .,
data = chem_train_dat,
method = "rpart",
trControl = ctrl,
tuneLength = 10
)
set.seed(200)
chem_rf <- train(
Yield ~ .,
data = chem_train_dat,
method = "rf",
trControl = ctrl,
tuneLength = 5
)
set.seed(200)
chem_gbm <- train(
Yield ~ .,
data = chem_train_dat,
method = "gbm",
trControl = ctrl,
tuneLength = 5,
verbose = FALSE
)
set.seed(200)
chem_cubist <- train(
Yield ~ .,
data = chem_train_dat,
method = "cubist",
trControl = ctrl,
tuneLength = 5
)
chem_tree
## CART
##
## 144 samples
## 57 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 130, 129, 129, 130, 131, 131, ...
## Resampling results across tuning parameters:
##
## cp RMSE Rsquared MAE
## 0.000000000 1.436628 0.4374346 1.100095
## 0.006659213 1.434111 0.4383759 1.098775
## 0.011413814 1.438247 0.4357809 1.106436
## 0.027789221 1.427930 0.4299077 1.104787
## 0.029140890 1.414532 0.4350620 1.099322
## 0.036646987 1.389498 0.4507181 1.085295
## 0.049774462 1.359623 0.4627724 1.095111
## 0.070317075 1.337811 0.4784063 1.094097
## 0.094521937 1.429116 0.4037703 1.155646
## 0.433943988 1.657032 0.3046903 1.344962
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was cp = 0.07031708.
chem_rf
## Random Forest
##
## 144 samples
## 57 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 130, 129, 129, 130, 131, 131, ...
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared MAE
## 2 1.217403 0.6304545 0.9785288
## 15 1.098043 0.6728132 0.8691547
## 29 1.081931 0.6702642 0.8527328
## 43 1.091399 0.6560575 0.8576204
## 57 1.106936 0.6388681 0.8699486
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 29.
chem_gbm
## Stochastic Gradient Boosting
##
## 144 samples
## 57 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 130, 129, 129, 130, 131, 131, ...
## Resampling results across tuning parameters:
##
## interaction.depth n.trees RMSE Rsquared MAE
## 1 50 1.173368 0.5954535 0.9429745
## 1 100 1.151532 0.6064990 0.9197116
## 1 150 1.136651 0.6151670 0.9069751
## 1 200 1.135648 0.6144275 0.9061698
## 1 250 1.130584 0.6169501 0.9023392
## 2 50 1.152099 0.6054636 0.9157494
## 2 100 1.139591 0.6084498 0.9079239
## 2 150 1.128964 0.6158468 0.9036101
## 2 200 1.124361 0.6199280 0.9000019
## 2 250 1.122987 0.6207126 0.8954791
## 3 50 1.112730 0.6326774 0.8904349
## 3 100 1.081817 0.6494984 0.8587777
## 3 150 1.070415 0.6556113 0.8513794
## 3 200 1.064805 0.6587655 0.8438467
## 3 250 1.061606 0.6607930 0.8418120
## 4 50 1.131811 0.6132994 0.9045481
## 4 100 1.100159 0.6311966 0.8792823
## 4 150 1.084896 0.6420718 0.8702050
## 4 200 1.077422 0.6466441 0.8637798
## 4 250 1.075736 0.6476019 0.8618322
## 5 50 1.161443 0.5970435 0.9217068
## 5 100 1.134153 0.6152330 0.8951969
## 5 150 1.125598 0.6227834 0.8904524
## 5 200 1.120225 0.6267123 0.8847838
## 5 250 1.116475 0.6287398 0.8808000
##
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
##
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were n.trees = 250, interaction.depth =
## 3, shrinkage = 0.1 and n.minobsinnode = 10.
chem_cubist
## Cubist
##
## 144 samples
## 57 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 130, 129, 129, 130, 131, 131, ...
## Resampling results across tuning parameters:
##
## committees neighbors RMSE Rsquared MAE
## 1 0 1.2015059 0.5904135 0.9438495
## 1 5 1.0153376 0.7034092 0.7833201
## 1 9 1.0827658 0.6676125 0.8372854
## 10 0 1.0635135 0.6538444 0.8518207
## 10 5 0.9219180 0.7467352 0.7301192
## 10 9 0.9904765 0.7023948 0.7871645
## 20 0 1.0466489 0.6689854 0.8405129
## 20 5 0.9100427 0.7552589 0.7214848
## 20 9 0.9756902 0.7154996 0.7762108
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were committees = 20 and neighbors = 5.
chem_results <- data.frame(
Model = c("Single Tree", "Random Forest", "Boosted Tree", "Cubist"),
RMSE = c(
postResample(predict(chem_tree, chem_test_dat), chem_test$Yield)["RMSE"],
postResample(predict(chem_rf, chem_test_dat), chem_test$Yield)["RMSE"],
postResample(predict(chem_gbm, chem_test_dat), chem_test$Yield)["RMSE"],
postResample(predict(chem_cubist, chem_test_dat), chem_test$Yield)["RMSE"]
),
Rsquared = c(
postResample(predict(chem_tree, chem_test_dat), chem_test$Yield)["Rsquared"],
postResample(predict(chem_rf, chem_test_dat), chem_test$Yield)["Rsquared"],
postResample(predict(chem_gbm, chem_test_dat), chem_test$Yield)["Rsquared"],
postResample(predict(chem_cubist, chem_test_dat), chem_test$Yield)["Rsquared"]
)
)
chem_results[order(chem_results$RMSE), ]
## Model RMSE Rsquared
## 4 Cubist 1.043358 0.7683434
## 3 Boosted Tree 1.078303 0.7333401
## 2 Random Forest 1.333712 0.6027757
## 1 Single Tree 1.760042 0.2550851
best_model_name <- chem_results$Model[which.min(chem_results$RMSE)]
best_model_name
## [1] "Cubist"
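The same resampling comparison works for the four models here, since they share folds:
chem_resamples <- resamples(list(
Tree = chem_tree,
RF = chem_rf,
GBM = chem_gbm,
Cubist = chem_cubist
))
summary(chem_resamples)$statistics$RMSE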
chem_rf_imp <- varImp(chem_rf, scale = FALSE)
chem_gbm_imp <- varImp(chem_gbm, scale = FALSE)
chem_cubist_imp <- varImp(chem_cubist, scale = FALSE)
chem_rf_imp
## rf variable importance
##
## only 20 most important variables shown (out of 57)
##
## Overall
## ManufacturingProcess32 137.454
## BiologicalMaterial12 27.780
## BiologicalMaterial03 22.062
## ManufacturingProcess31 21.121
## ManufacturingProcess17 19.192
## ManufacturingProcess13 17.144
## BiologicalMaterial06 13.581
## ManufacturingProcess28 13.375
## BiologicalMaterial11 13.160
## ManufacturingProcess06 11.990
## ManufacturingProcess09 11.851
## ManufacturingProcess36 6.903
## ManufacturingProcess39 6.900
## BiologicalMaterial04 6.747
## BiologicalMaterial09 5.956
## BiologicalMaterial02 5.788
## ManufacturingProcess30 5.366
## BiologicalMaterial08 5.277
## ManufacturingProcess21 5.053
## ManufacturingProcess11 5.027
plot(chem_rf_imp, top = 10)
Process variables dominate the top of the random forest ranking: ManufacturingProcess32 is far ahead of everything else, with several biological materials (BiologicalMaterial12, 03, 06, and 11) interleaved among the next tier. Match the top-ranked predictors to the variable descriptions from the original exercise to interpret them.
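A quick tally of the top ten random forest predictors by type makes the comparison concrete; since Cubist gave the best test-set results, its ranking is worth plotting as well:
top10 <- rownames(chem_rf_imp$importance)[order(chem_rf_imp$importance$Overall, decreasing = TRUE)][1:10]
table(ifelse(grepl("^Manufacturing", top10), "Process", "Biological"))
plot(chem_cubist_imp, top = 10)
In the ranking above, six of the top ten are process variables and four are biological materials.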
rpart.plot(chem_tree$finalModel)
The terminal nodes show how the single tree partitions the data. This can be easier to interpret than ensemble models, although it may be less accurate.
chem_tree_pred_nodes <- predict(chem_tree$finalModel, chem_train_dat, type = "vector") # fitted values; each distinct value corresponds to one terminal node
chem_train_dat$TreePrediction <- chem_tree_pred_nodes
ggplot(chem_train_dat, aes(x = factor(TreePrediction), y = Yield)) +
geom_boxplot() +
labs(x = "Terminal node prediction", y = "Yield")