8.1

Recreating data.

library(mlbench) # provides mlbench.friedman1

set.seed(200)
simulated <- mlbench.friedman1(200, sd = 1)
simulated <- cbind(simulated$x, simulated$y)
simulated <- as.data.frame(simulated)
colnames(simulated)[ncol(simulated)] <- "y"

8.1.1

set.seed(200)
library(randomForest)
library(caret)
library(dplyr) # for arrange() and desc()
model1 <- randomForest(y ~ ., data = simulated,
                       importance = TRUE,
                       ntree = 1000)

rfImp1 <- varImp(model1, scale = FALSE)

While the random forest model does make some use of the uninformative predictors V6-V10, they contribute very little importance to the model.

arrange(rfImp1, desc(Overall))
##        Overall
## V1   8.5406729
## V4   7.6128264
## V2   6.4511374
## V5   2.2684088
## V3   0.7470911
## V6   0.1190456
## V7   0.1021863
## V10 -0.0564990
## V9  -0.1296344
## V8  -0.1632359

8.1.2

The importance score of V1 decreases when a highly correlated variable is introduced, and the correlated variable receives a similar, though still noticeably lower, importance score.

set.seed(200)
simulated$duplicate1 <- simulated$V1 + rnorm(200) * .1
cor(simulated$duplicate1, simulated$V1)
## [1] 0.9497025
model_cor1 <- randomForest(y ~ ., data = simulated,
                       importance = TRUE,
                       ntree = 1000)
rfImp_cor1 <- varImp(model_cor1, scale = FALSE)
arrange(rfImp_cor1, desc(Overall))
##                 Overall
## V4          6.929862226
## V2          6.090608279
## V1          6.062482986
## duplicate1  4.084363474
## V5          2.141978654
## V3          0.643432061
## V6          0.207868621
## V9          0.003902854
## V7         -0.026533024
## V10        -0.079523621
## V8         -0.094702352

When we add another highly correlated variable (for a total of three highly correlated variables), the importance of V1 decreases further; V1 and the new duplicate2 end up with similar importance, while duplicate1 falls well below both.

set.seed(76)
simulated$duplicate2 <- simulated$V1 + rnorm(200) * .01
cor(simulated$duplicate2, simulated$V1)
## [1] 0.9994365
model_cor2 <- randomForest(y ~ ., data = simulated,
                       importance = TRUE,
                       ntree = 1000)
rfImp_cor2 <- varImp(model_cor2, scale = FALSE)
arrange(rfImp_cor2, desc(Overall))
##                Overall
## V4          7.26467844
## V2          6.53089038
## duplicate2  4.41554613
## V1          4.16735913
## duplicate1  2.89908044
## V5          2.22210117
## V3          0.51745837
## V6          0.23532105
## V7          0.05532943
## V10         0.05349107
## V8         -0.04386452
## V9         -0.07426262

8.1.3

set.seed(76)
library(party)

con_forest <- cforest(y ~ ., data = simulated,
                      control = cforest_control(ntree = 1000))
con_imp <- varimp(con_forest)
con_imp_conditional <- varimp(con_forest, conditional = TRUE)

When importance is evaluated without the conditional argument, the cforest model assigns similar importance to V1 and duplicate2 but much lower importance to duplicate1. When the conditional argument is used, all three correlated variables receive relatively low importance.

sort(con_imp, decreasing = TRUE)
##           V4           V2           V1   duplicate2           V5   duplicate1 
##  7.849845199  6.204135909  4.921544601  3.638809809  1.922262125  1.550033379 
##           V6           V7           V3           V9           V8          V10 
##  0.090475259  0.033684918  0.032046603 -0.007837774 -0.033394068 -0.033768162
sort(con_imp_conditional, decreasing = TRUE)
##            V4            V2            V1            V5    duplicate2 
##  4.6437607836  3.6334259743  0.8788956317  0.7189653447  0.5697962258 
##    duplicate1            V3            V6            V7            V9 
##  0.3730331007  0.0279763603  0.0262110660  0.0074302430  0.0006090235 
##           V10            V8 
## -0.0015977920 -0.0023899748

8.1.4

The boosted tree model has the same issue as the previous models, splitting importance across the correlated variables; duplicate2 actually ranks above V1.

library(gbm)
set.seed(76)

boosted_tree <- gbm(y ~ ., data = simulated, distribution = "gaussian", n.trees = 1000)
boost_imp <- relative.influence(boosted_tree)
sort(boost_imp, decreasing = TRUE)
##         V4         V2 duplicate2         V5         V3 duplicate1         V1 
##  5073.8225  3814.5544  2440.9837  2159.9803  1922.1461  1661.3532  1336.2021 
##         V7         V6        V10         V8         V9 
##   685.3063   507.1981   347.3127   342.7780   340.3526

The Cubist model behaves somewhat better, discarding one of the highly correlated variables (duplicate1) while retaining the other two.

library(Cubist)
set.seed(76)

cube_tree <- cubist(x = subset(simulated, select = -y), y = simulated$y)
summary(cube_tree)
## 
## Call:
## cubist.default(x = subset(simulated, select = -y), y = simulated$y)
## 
## 
## Cubist [Release 2.07 GPL Edition]  Sat Nov 16 15:45:35 2024
## ---------------------------------
## 
##     Target attribute `outcome'
## 
## Read 200 cases (13 attributes) from undefined.data
## 
## Model:
## 
##   Rule 1: [200 cases, mean 14.416183, range 3.55596 to 28.38167, est err 1.950383]
## 
##  outcome = 0.088236 + 30 V1 - 22 duplicate2 + 8.9 V4 + 7.1 V2 + 5.3 V5
## 
## 
## Evaluation on training data (200 cases):
## 
##     Average  |error|           2.073322
##     Relative |error|               0.51
##     Correlation coefficient        0.86
## 
## 
##  Attribute usage:
##    Conds  Model
## 
##           100%    V1
##           100%    V2
##           100%    V4
##           100%    V5
##           100%    duplicate2
## 
## 
## Time: 0.0 secs
varImp(cube_tree)
##            Overall
## V1              50
## V2              50
## V4              50
## V5              50
## duplicate2      50
## V3               0
## V6               0
## V7               0
## V8               0
## V9               0
## V10              0
## duplicate1       0

8.2

We can see a heavy bias toward the variable with many unique values, even though y depends on both dist1 and dist3. While dist3 has higher importance than the remaining variables, the granularity of dist1 makes it far more useful to the model's splits.

set.seed(200)
dist1 <- sample(1:1000, 1000, replace = TRUE) # 1000 possible values: highest granularity
dist2 <- sample(1:500, 1000, replace = TRUE)  # 500 possible values
dist3 <- sample(1:100, 1000, replace = TRUE)  # 100 possible values
dist4 <- sample(1:10, 1000, replace = TRUE)   # 10 possible values: lowest granularity
y <- dist1 + dist3 + rnorm(1000, mean = 0, sd = 10)

df <- data.frame(dist1, dist2, dist3, dist4, y)
rf <- randomForest(y ~ ., data = df, importance = TRUE, ntree = 1000)
rf_imp <- varImp(rf)
arrange(rf_imp, desc(Overall))
##          Overall
## dist1 196.593485
## dist3   8.994477
## dist4   0.736620
## dist2  -1.152775
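
As an additional check (a sketch, output not shown), a conditional inference forest from the already loaded party package uses unbiased split selection and should be less prone to this granularity bias; ntree is kept small here only to limit run time.

set.seed(200)
cf_granularity <- cforest(y ~ ., data = df,
                          control = cforest_unbiased(ntree = 100))
sort(varimp(cf_granularity), decreasing = TRUE)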

8.3

8.3.1

The right-hand model focuses on the first few predictors as a result of the large bagging fraction and learning rate. With a large bagging fraction, each tree sees a sample that closely resembles the full training set, and a high learning rate does little to temper the greedy nature of the boosting algorithm. As a result, the same strong predictors are selected over and over, and their importance consistently dominates. The left-hand model has a much smaller bagging fraction, so there is more opportunity for each tree to see a less representative sample, and a much smaller learning rate, which regularizes the model more strongly; together these allow for more variety in the predictors used by the ensemble's component trees.
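
A minimal sketch of the contrast, reusing the simulated data from 8.1 as a stand-in for the text's solubility data: fit gbm once with bag.fraction and shrinkage at 0.1 (the left-hand settings) and once with both at 0.9 (the right-hand settings), then compare the relative influence. The small n.minobsinnode is only there so the tiny 0.1 bags can still split.

set.seed(76)
gbm_left <- gbm(y ~ ., data = simulated, distribution = "gaussian",
                n.trees = 100, shrinkage = 0.1, bag.fraction = 0.1,
                n.minobsinnode = 3)
gbm_right <- gbm(y ~ ., data = simulated, distribution = "gaussian",
                 n.trees = 100, shrinkage = 0.9, bag.fraction = 0.9,
                 n.minobsinnode = 3)
# the right-hand settings should concentrate influence in the top predictors
summary(gbm_left, plotit = FALSE)
summary(gbm_right, plotit = FALSE)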

8.3.2

I would expect the left-hand model to perform better than the right-hand model, as the large bagging fraction and, in particular, the large learning rate will likely lead the right-hand model to overfit. The left-hand model should generalize better to new samples.

8.3.3

Increasing the interaction depth, which corresponds to tree depth, would allow more predictors to be included in each tree, leading to a more even distribution of predictor importance. This would make the slope of the predictor-importance plot much flatter.
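
A quick sketch of this effect, again on the simulated data rather than the text's solubility data: the deeper trees should spread relative influence across more predictors, flattening the importance profile.

set.seed(76)
gbm_depth1 <- gbm(y ~ ., data = simulated, distribution = "gaussian",
                  n.trees = 500, interaction.depth = 1)
gbm_depth10 <- gbm(y ~ ., data = simulated, distribution = "gaussian",
                   n.trees = 500, interaction.depth = 10)
summary(gbm_depth1, plotit = FALSE)   # influence concentrated in a few predictors
summary(gbm_depth10, plotit = FALSE)  # influence spread more evenly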

8.7

set.seed(8675309)
library(AppliedPredictiveModeling) # provides ChemicalManufacturingProcess
data(ChemicalManufacturingProcess)

# Apply Box-Cox, centering, and scaling along with kNN imputation so that
# the imputation distances are computed on comparable scales
impute_model <- preProcess(ChemicalManufacturingProcess, 
                           method = c("knnImpute", "BoxCox", "center", "scale"))
imputed_chemicals <- predict(impute_model, ChemicalManufacturingProcess)

chem_train_ind <- createDataPartition(imputed_chemicals$Yield, p = 0.8, list = FALSE)
chem_train <- imputed_chemicals[chem_train_ind, ]
chem_test <- imputed_chemicals[-chem_train_ind, ]


ctrl <- trainControl(method = "repeatedcv",
                     number = 10,
                     repeats = 3,
                     allowParallel = TRUE)

8.7.1

The Cubist model performs best by far, with \(RMSE=0.5101924\) and \(R^2=0.7576669\) on the test set.

Random Forest

The random forest model saw an \(RMSE=0.6118233\) and \(R^2=0.6542884\) in cross-validation and an \(RMSE=0.6173558\) and \(R^2=0.6469261\) on the test set.

set.seed(8675309)

rf_model <- train(Yield ~ .,
                  data = chem_train,
                  method = "rf",
                  trControl = ctrl)
rf_model
## Random Forest 
## 
## 144 samples
##  57 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 130, 131, 131, 129, 128, 129, ... 
## Resampling results across tuning parameters:
## 
##   mtry  RMSE       Rsquared   MAE      
##    2    0.6636054  0.6581693  0.5420762
##   29    0.6118233  0.6542884  0.4813332
##   57    0.6231824  0.6252118  0.4858375
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 29.
rf_model_pred <- predict(rf_model, newdata = chem_test)
postResample(pred = rf_model_pred, obs = chem_test$Yield)
##      RMSE  Rsquared       MAE 
## 0.6173558 0.6469261 0.4762914

Gradient Boosted

The gradient boosted model saw an \(RMSE=0.5711522\) and \(R^2=0.6807894\) in cross-validation but appears to be overfit, with \(RMSE=0.6634816\) and \(R^2=0.5630893\) on the test set.

set.seed(8675309)

tune_grid <- expand.grid(interaction.depth = c(1:3),
                         n.trees = c(100, 250, 500, 1000),
                         shrinkage = c(0.1, 0.05, 0.01, 0.005, 0.001),
                         n.minobsinnode = 5)

boosted_model <- train(Yield ~ .,
                  data = chem_train,
                  method = "gbm",
                  trControl = ctrl,
                  tuneGrid = tune_grid,
                  verbose = FALSE) # suppress the per-iteration training log
boosted_model
## Stochastic Gradient Boosting 
## 
## 144 samples
##  57 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 130, 131, 131, 129, 128, 129, ... 
## Resampling results across tuning parameters:
## 
##   shrinkage  interaction.depth  n.trees  RMSE       Rsquared   MAE      
##   0.001      1                   100     0.9557462  0.4690751  0.7847082
##   0.001      1                   250     0.9171398  0.4871042  0.7524466
##   0.001      1                   500     0.8656382  0.5142556  0.7095881
##   0.001      1                  1000     0.7916629  0.5533067  0.6471789
##   0.001      2                   100     0.9478277  0.5353481  0.7787913
##   0.001      2                   250     0.8987021  0.5425847  0.7381482
##   0.001      2                   500     0.8340638  0.5598741  0.6851060
##   0.001      2                  1000     0.7484560  0.5875161  0.6096420
##   0.001      3                   100     0.9444016  0.5597752  0.7761708
##   0.001      3                   250     0.8900447  0.5698777  0.7315348
##   0.001      3                   500     0.8203145  0.5819427  0.6743258
##   0.001      3                  1000     0.7300142  0.6034830  0.5940621
##   0.005      1                   100     0.8646808  0.5171832  0.7083959
##   0.005      1                   250     0.7640896  0.5656151  0.6224124
##   0.005      1                   500     0.6847649  0.6031171  0.5500461
##   0.005      1                  1000     0.6375289  0.6269008  0.5035383
##   0.005      2                   100     0.8354224  0.5561986  0.6861385
##   0.005      2                   250     0.7212818  0.5931743  0.5859592
##   0.005      2                   500     0.6517624  0.6210505  0.5178719
##   0.005      2                  1000     0.6145241  0.6448605  0.4844147
##   0.005      3                   100     0.8203578  0.5777118  0.6748174
##   0.005      3                   250     0.7029548  0.6098738  0.5682774
##   0.005      3                   500     0.6378158  0.6354447  0.5033865
##   0.005      3                  1000     0.6025926  0.6574691  0.4735176
##   0.010      1                   100     0.7932048  0.5436458  0.6490127
##   0.010      1                   250     0.6873831  0.5962944  0.5521106
##   0.010      1                   500     0.6364120  0.6269252  0.5025861
##   0.010      1                  1000     0.6159027  0.6412539  0.4856003
##   0.010      2                   100     0.7487954  0.5828481  0.6093674
##   0.010      2                   250     0.6519835  0.6206981  0.5179031
##   0.010      2                   500     0.6161255  0.6412492  0.4846621
##   0.010      2                  1000     0.5979571  0.6555783  0.4746286
##   0.010      3                   100     0.7330629  0.5964311  0.5956681
##   0.010      3                   250     0.6398368  0.6309125  0.5053534
##   0.010      3                   500     0.6056703  0.6516723  0.4752425
##   0.010      3                  1000     0.5902260  0.6639837  0.4635558
##   0.050      1                   100     0.6371260  0.6260976  0.5053697
##   0.050      1                   250     0.6158538  0.6416851  0.4880198
##   0.050      1                   500     0.6199163  0.6335982  0.4952785
##   0.050      1                  1000     0.6169214  0.6363934  0.4974479
##   0.050      2                   100     0.6123478  0.6447559  0.4807513
##   0.050      2                   250     0.5921935  0.6602375  0.4702550
##   0.050      2                   500     0.5833296  0.6675897  0.4644057
##   0.050      2                  1000     0.5768791  0.6729141  0.4601591
##   0.050      3                   100     0.6048168  0.6460851  0.4770036
##   0.050      3                   250     0.5854999  0.6662630  0.4601282
##   0.050      3                   500     0.5743202  0.6782137  0.4522903
##   0.050      3                  1000     0.5711522  0.6807894  0.4500711
##   0.100      1                   100     0.6217754  0.6309721  0.4959317
##   0.100      1                   250     0.6109476  0.6417554  0.4926435
##   0.100      1                   500     0.6066902  0.6489223  0.4869771
##   0.100      1                  1000     0.6023890  0.6544537  0.4875112
##   0.100      2                   100     0.6103898  0.6400486  0.4836854
##   0.100      2                   250     0.6083333  0.6426656  0.4817478
##   0.100      2                   500     0.6062262  0.6442794  0.4795821
##   0.100      2                  1000     0.6050880  0.6452668  0.4781108
##   0.100      3                   100     0.5973428  0.6516468  0.4698252
##   0.100      3                   250     0.5867870  0.6637489  0.4611056
##   0.100      3                   500     0.5851345  0.6646028  0.4602032
##   0.100      3                  1000     0.5846814  0.6651989  0.4598344
## 
## Tuning parameter 'n.minobsinnode' was held constant at a value of 5
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were n.trees = 1000, interaction.depth =
##  3, shrinkage = 0.05 and n.minobsinnode = 5.
boosted_model_pred <- predict(boosted_model, newdata = chem_test)
postResample(pred = boosted_model_pred, obs = chem_test$Yield)
##      RMSE  Rsquared       MAE 
## 0.6634816 0.5630893 0.5348362

Cubist

The Cubist model saw an \(RMSE=0.5550510\) and \(R^2=0.6933197\) in cross-validation and an \(RMSE=0.5101924\) and \(R^2=0.7576669\) on the test set.

set.seed(8675309)

cubist_model <- train(Yield ~ .,
                  data = chem_train,
                  method = "cubist",
                  trControl = ctrl)
cubist_model
## Cubist 
## 
## 144 samples
##  57 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 130, 131, 131, 129, 128, 129, ... 
## Resampling results across tuning parameters:
## 
##   committees  neighbors  RMSE       Rsquared   MAE      
##    1          0          0.7556838  0.5584823  0.5666996
##    1          5          0.6828877  0.6291379  0.4965219
##    1          9          0.7165360  0.5965385  0.5310083
##   10          0          0.6167780  0.6244449  0.4893956
##   10          5          0.5657702  0.6813465  0.4429541
##   10          9          0.5934120  0.6499609  0.4649550
##   20          0          0.6099230  0.6334817  0.4884902
##   20          5          0.5548670  0.6932691  0.4376287
##   20          9          0.5853244  0.6603067  0.4632644
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were committees = 20 and neighbors = 5.
cubist_model_pred <- predict(cubist_model, newdata = chem_test)
postResample(pred = cubist_model_pred, obs = chem_test$Yield)
##      RMSE  Rsquared       MAE 
## 0.5101924 0.7576669 0.3839135
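
Because all three models were tuned with the same trainControl object and seed, caret's resamples() can line up their cross-validation metrics side by side (a sketch; output not shown).

model_resamples <- resamples(list(rf = rf_model,
                                  gbm = boosted_model,
                                  cubist = cubist_model))
summary(model_resamples)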

8.7.2

Six of the ten most important predictors are manufacturing processes, indicating a more even split between the biological and manufacturing predictors. While the most important predictor has remained the same, many of the other predictors have shifted in importance relative to the previous models, and ManufacturingProcess31 has gained substantially, becoming the third most important predictor. Additionally, predictor importance decays much more quickly for the Cubist model than for the other models.

set.seed(8675309)

varImp(cubist_model)
## cubist variable importance
## 
##   only 20 most important variables shown (out of 57)
## 
##                        Overall
## ManufacturingProcess32  100.00
## BiologicalMaterial03     49.54
## ManufacturingProcess31   48.62
## ManufacturingProcess17   41.28
## BiologicalMaterial02     40.37
## BiologicalMaterial06     37.61
## ManufacturingProcess09   37.61
## BiologicalMaterial12     32.11
## ManufacturingProcess04   26.61
## ManufacturingProcess33   25.69
## ManufacturingProcess26   22.94
## ManufacturingProcess13   22.02
## ManufacturingProcess20   21.10
## ManufacturingProcess11   19.27
## ManufacturingProcess29   19.27
## ManufacturingProcess28   17.43
## BiologicalMaterial08     15.60
## ManufacturingProcess30   14.68
## ManufacturingProcess14   13.76
## ManufacturingProcess15   12.84
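
The sharp drop-off after ManufacturingProcess32 is easier to see in a plot of the same importance scores (a quick sketch).

plot(varImp(cubist_model), top = 10)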

8.7.3

Cubist does not produce a single “optimal” tree, since it builds rule-based models rather than trees. In the spirit of the question, we train a quick CART model and visualize it. The visualization shows the split threshold at each node and, by following the different subtrees, whether the yield increases or decreases.

library(rpart)
library(rpart.plot)
set.seed(8675309)

cart_model <- rpart(Yield ~ ., data = chem_train)
rpart.plot(cart_model)
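
As a small supplement to the plot (a sketch), the terminal nodes can also be read straight from the fitted rpart object: the node table is stored in cart_model$frame, where rows with var == "<leaf>" are terminal nodes and yval is the mean yield in that node.

cart_leaves <- cart_model$frame[cart_model$frame$var == "<leaf>", c("n", "yval")]
cart_leaves[order(cart_leaves$yval, decreasing = TRUE), ]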