exercise5

8.1(a)

library(mlbench)

# Draw the Friedman 1 benchmark data (first argument 200, noise sd = 1) with a
# fixed seed, then collect the predictor matrix and the response into one data
# frame. The raw mlbench list is kept inside local() so only `simulated`
# reaches the workspace.
set.seed(200)
simulated <- local({
  raw <- mlbench.friedman1(200, sd = 1)
  as.data.frame(cbind(raw$x, raw$y))
})

# The response was bound on last, so the final column is the one named "y".
names(simulated)[ncol(simulated)] <- "y"

head(simulated)

##          V1        V2         V3         V4         V5         V6        V7
## 1 0.5337724 0.6478064 0.85078526 0.18159957 0.92903976 0.36179060 0.8266609
## 2 0.5837650 0.4381528 0.67272659 0.66924914 0.16379784 0.45305931 0.6489601
## 3 0.5895783 0.5879065 0.40967108 0.33812728 0.89409334 0.02681911 0.1785614
## 4 0.6910399 0.2259548 0.03335447 0.06691274 0.63744519 0.52500637 0.5133614
## 5 0.6673315 0.8188985 0.71676079 0.80324287 0.08306864 0.22344157 0.6644906
## 6 0.8392937 0.3862983 0.64618857 0.86105431 0.63038947 0.43703891 0.3360117
##          V8         V9       V10        y
## 1 0.4214081 0.59111440 0.5886216 18.46398
## 2 0.8446239 0.92819306 0.7584008 16.09836
## 3 0.3495908 0.01759542 0.4441185 17.76165
## 4 0.7970260 0.68986918 0.4450716 13.78730
## 5 0.9038919 0.39696995 0.5500808 18.42984
## 6 0.6489177 0.53116033 0.9066182 20.85817

library(randomForest)

## randomForest 4.7-1.2

## Type rfNews() to see new features/changes/bug fixes.

library(caret)

## Loading required package: ggplot2

## 
## Attaching package: 'ggplot2'

## The following object is masked from 'package:randomForest':
## 
##     margin

## Loading required package: lattice

# Baseline random forest: regress y on every other column of `simulated`
# using 1000 trees. importance = TRUE is set so that variable-importance
# measures are available to varImp() afterwards.
model1 <- randomForest(
  y ~ .,
  data = simulated,
  importance = TRUE,
  ntree = 1000
)

# Print the fit summary (the call is echoed verbatim in the output).
model1

## 
## Call:
##  randomForest(formula = y ~ ., data = simulated, importance = TRUE,      ntree = 1000) 
##                Type of random forest: regression
##                      Number of trees: 1000
## No. of variables tried at each split: 3
## 
##           Mean of squared residuals: 6.514982
##                     % Var explained: 73.28

# Unscaled variable importance for the baseline forest; the surrounding
# parentheses make the assignment print its value.
(rfImp1 <- varImp(model1, scale = FALSE))

##         Overall
## V1   8.86329776
## V2   6.72851763
## V3   0.84145353
## V4   7.60284159
## V5   2.26864193
## V6   0.11268425
## V7   0.07374772
## V8  -0.07210708
## V9  -0.06913906
## V10 -0.10577619

No, the random forest model did not significantly use the uninformative predictors (V6–V10). Their importance scores were very close to zero, and some were negative, which suggests they contributed little to the model compared with the informative predictors.

8.1(b)

# Add a predictor that is V1 plus standard-normal noise scaled by 0.1, then
# report how strongly it correlates with V1.
simulated$duplicate1 <- simulated$V1 + 0.1 * rnorm(200)
with(simulated, cor(duplicate1, V1))

## [1] 0.9356508

# Refit the forest with the same settings now that `simulated` also contains
# duplicate1, and print the unscaled importances for comparison with model1.
model2 <- randomForest(y ~ ., data = simulated, importance = TRUE, ntree = 1000)

(rfImp2 <- varImp(model2, scale = FALSE))

##                Overall
## V1          5.78865821
## V2          6.48386619
## V3          0.55469974
## V4          6.78484850
## V5          1.96248183
## V6          0.10126938
## V7          0.14210730
## V8         -0.09726812
## V9         -0.08440763
## V10         0.04878300
## duplicate1  4.64551303

# Add a second noisy copy of V1 (fresh standard-normal noise scaled by 0.1)
# and report its correlation with V1.
simulated$duplicate2 <- simulated$V1 + 0.1 * rnorm(200)
with(simulated, cor(duplicate2, V1))

## [1] 0.9432429

# Refit once more with both duplicate1 and duplicate2 present, and print the
# unscaled importances.
model3 <- randomForest(y ~ ., data = simulated, importance = TRUE, ntree = 1000)

(rfImp3 <- varImp(model3, scale = FALSE))

##                 Overall
## V1          4.642486671
## V2          6.999126963
## V3          0.410061203
## V4          7.088994719
## V5          1.989570698
## V6          0.152581329
## V7         -0.020144331
## V8         -0.077797272
## V9         -0.019773933
## V10         0.004167046
## duplicate1  3.625925699
## duplicate2  2.729716790

Yes. The importance score for V1 changed. It decreased from 8.8633 to 5.7887 after adding duplicate1, which itself received an importance of 4.6455. When a second predictor highly correlated with V1 (duplicate2) was added, the importance score for V1 decreased further to 4.6425, with duplicate1 at 3.6259 and duplicate2 at 2.7297. This suggests that the model split the importance among the correlated predictors.

8.1(c)

library(party)

## Loading required package: grid

## Loading required package: mvtnorm

## Loading required package: modeltools

## Loading required package: stats4

## Loading required package: strucchange

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

## Loading required package: sandwich

# Conditional inference forest on the current `simulated` data (which at this
# point includes duplicate1 and duplicate2), using cforest's default controls.
cf1 <- cforest(y ~ ., data = simulated)

# Unconditional (conditional = FALSE) importance, largest first.
cf_imp1 <- varimp(cf1, conditional = FALSE)
sort(cf_imp1, decreasing = TRUE)

##           V4           V2           V1   duplicate2   duplicate1           V5 
##  7.224490398  6.305633822  5.483627440  2.804676212  1.831954644  1.652266288 
##           V9           V3           V7           V8          V10           V6 
##  0.026731626  0.024151673 -0.001313370 -0.005437581 -0.013254646 -0.015582595

# Conditional importance for the same forest (conditional = TRUE), printed
# largest first so it can be compared with the unconditional ranking above.
cf_imp2 <- varimp(cf1, conditional = TRUE)
sort(cf_imp2, decreasing = TRUE)

##           V4           V2           V1           V5   duplicate2   duplicate1 
##  5.773446339  4.830593732  2.083838071  1.140880775  0.937012669  0.865926318 
##           V6           V7           V9           V3           V8          V10 
##  0.003183752 -0.005794233 -0.008223196 -0.009487717 -0.013262723 -0.015798371

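Yes, the importances show a broadly similar pattern to the traditional random forest model: V4, V2, and V1 remain the most important predictors, and the uninformative predictors (V6–V10) remain close to zero. One difference is that V3, although informative, also has near-zero importance in the conditional inference forest. With conditional importance, the scores for V1 and its highly correlated duplicates all drop (V1 from 5.48 to 2.08, duplicate2 from 2.80 to 0.94, and duplicate1 from 1.83 to 0.87), so the bias caused by correlated predictors is reduced.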

8.1(d)

library(gbm)

## Loaded gbm 2.2.3

## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3

# Reset the seed immediately before train() so the resampling is repeatable.
set.seed(200)

# Boosted trees via caret's "gbm" method with train()'s default resampling and
# tuning grid; the printed summary below reports the resampling scheme and the
# selected parameters. verbose = FALSE is forwarded to the gbm fit.
boosted1 <- train(
  y ~ .,
  data = simulated,
  method = "gbm",
  verbose = FALSE
)

# Print the resampling results and the chosen tuning parameters.
boosted1

## Stochastic Gradient Boosting 
## 
## 200 samples
##  12 predictor
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ... 
## Resampling results across tuning parameters:
## 
##   interaction.depth  n.trees  RMSE      Rsquared   MAE     
##   1                   50      2.753398  0.7631490  2.259903
##   1                  100      2.190555  0.8326585  1.762706
##   1                  150      1.998494  0.8516450  1.586641
##   2                   50      2.353646  0.8132762  1.906217
##   2                  100      2.028984  0.8478486  1.611846
##   2                  150      1.955259  0.8561150  1.540235
##   3                   50      2.246567  0.8217561  1.810376
##   3                  100      2.029633  0.8454516  1.611703
##   3                  150      1.990602  0.8492301  1.574892
## 
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
## 
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were n.trees = 150, interaction.depth =
##  2, shrinkage = 0.1 and n.minobsinnode = 10.

# Unscaled variable importance for the boosted-tree model; the parentheses
# make the assignment print its value.
(boostImp1 <- varImp(boosted1, scale = FALSE))

## gbm variable importance
## 
##            Overall
## V4         4739.79
## V2         3472.27
## V1         1763.87
## V5         1750.04
## V3         1289.50
## duplicate2 1267.97
## duplicate1 1263.26
## V7          174.28
## V6          148.92
## V10         115.85
## V8           97.82
## V9           61.96

# Show the importance table at full precision, largest score first;
# drop = FALSE keeps the one-column result as a data frame with row names.
boostImp1$importance[
  order(-boostImp1$importance$Overall), ,
  drop = FALSE
]

##               Overall
## V4         4739.79394
## V2         3472.27477
## V1         1763.86812
## V5         1750.03654
## V3         1289.50405
## duplicate2 1267.97346
## duplicate1 1263.25952
## V7          174.28176
## V6          148.92196
## V10         115.85375
## V8           97.81571
## V9           61.96149

library(Cubist)

# Reset the seed immediately before train() so the resampling is repeatable.
set.seed(200)

# Cubist model via caret with train()'s default resampling and tuning grid;
# the printed summary below lists the committees/neighbors combinations tried.
cubist1 <- train(
  y ~ .,
  data = simulated,
  method = "cubist"
)

# Print the resampling results and the chosen tuning parameters.
cubist1

## Cubist 
## 
## 200 samples
##  12 predictor
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ... 
## Resampling results across tuning parameters:
## 
##   committees  neighbors  RMSE      Rsquared   MAE     
##    1          0          3.078639  0.6536919  2.307586
##    1          5          3.049600  0.6623299  2.291685
##    1          9          3.025975  0.6652084  2.270909
##   10          0          2.166077  0.8178947  1.710553
##   10          5          2.152045  0.8179869  1.707946
##   10          9          2.131020  0.8221017  1.688051
##   20          0          2.088086  0.8322453  1.655212
##   20          5          2.075335  0.8311372  1.652699
##   20          9          2.055971  0.8351463  1.634019
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were committees = 20 and neighbors = 9.

# Unscaled variable importance for the Cubist model; the parentheses make the
# assignment print its value.
(cubistImp1 <- varImp(cubist1, scale = FALSE))

## cubist variable importance
## 
##            Overall
## V1            69.0
## V2            59.0
## V4            49.0
## V3            40.5
## V5            34.0
## V6             8.0
## duplicate2     5.5
## V7             0.0
## V9             0.0
## duplicate1     0.0
## V10            0.0
## V8             0.0

Yes, the same general pattern occurs. In both boosted trees and Cubist, the informative predictors (V1–V5) are more important than the uninformative predictors (V6–V10). However, the exact pattern is not identical across models. In the boosted tree model, the correlated duplicate predictors still receive substantial importance (about 1,268 and 1,263, close to V3), and V1 ranks below V4 and V2. In the Cubist model, duplicate1 has zero importance and duplicate2 only 5.5, so V1 remains the most important predictor.

exercise5

Wei You

2026-04-05