8.1

library(mlbench)

set.seed(200)
simulated = mlbench.friedman1(200, sd=1)
simulated = cbind(simulated$x, simulated$y)
simulated = as.data.frame(simulated)
colnames(simulated)[ncol(simulated)] = "y"
  1. Fit a random forest model to all of the predictors, then estimate the variable importance scores:
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
model1 = randomForest(y ~ ., data=simulated,
                      importance=TRUE,
                      ntree=1000)
rfImp1 = varImp(model1, scale=FALSE)
rfImp1
##          Overall
## V1   8.732235404
## V2   6.415369387
## V3   0.763591825
## V4   7.615118809
## V5   2.023524577
## V6   0.165111172
## V7  -0.005961659
## V8  -0.166362581
## V9  -0.095292651
## V10 -0.074944788

Did the random forest model significantly use the uninformative predictors (V6-V10)? The random forest model significantly used V6-V10 predictors.

  1. Now add an additional predictor that is highly correlated with one of the informative predictors. For example: Fit another random forest model to these data. Did the importance score for V1 change? What happens when you add another predictor that is also highly correlated with V1?

The importance score decreased for V1 when fitting another random forest model to the data. When adding another predictor that is highly correlated with V1, the other variables will increase in importance while V1 decreases.

simulated$duplicate1 = simulated$V1 + rnorm(200) * .1
cor(simulated$duplicate1, simulated$V1)
## [1] 0.9460206
model2 = randomForest(y~ ., data=simulated,
                      importance=TRUE,
                      ntree=1000)
rfImp2 = varImp(model2, scale=FALSE)
rfImp2
##                Overall
## V1          5.69119973
## V2          6.06896061
## V3          0.62970218
## V4          7.04752238
## V5          1.87238438
## V6          0.13569065
## V7         -0.01345645
## V8         -0.04370565
## V9          0.00840438
## V10         0.02894814
## duplicate1  4.28331581
simulated$duplicate2 = simulated$V1 + rnorm(200) * .1
cor(simulated$duplicate2, simulated$V1)
## [1] 0.9408631
  1. Use the cforest function in the party package to fit a random forest moddel using conditional inference trees. The party package function varimp can calculate predictor importance. The conditional argument of that function toggles between the traditional importance measure and the modified version described in Strobl et al. Do these importances show the same pattern as the traditional random forest model?

The importances show similar patterns as the traditional. V1 was the most important in the randomForest function. In the cforest function, V4 has the greatest significance.

library(party)
## Warning: package 'party' was built under R version 4.3.2
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Warning: package 'strucchange' was built under R version 4.3.2
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: sandwich
## Warning: package 'sandwich' was built under R version 4.3.2
## 
## Attaching package: 'strucchange'
## The following object is masked from 'package:stringr':
## 
##     boundary
## 
## Attaching package: 'party'
## The following object is masked from 'package:dplyr':
## 
##     where
model3 = cforest(y~., data=simulated[, c(1:11)])
rfImp3 = varimp(model3, conditional=TRUE)
rfImp3
##           V1           V2           V3           V4           V5           V6 
##  5.662787411  5.269436397  0.049373561  6.359038734  1.135878712 -0.007394301 
##           V7           V8           V9          V10 
##  0.035611549  0.007715732 -0.006887697 -0.030898866
  1. Repeat this process with different tree models, such as boosted trees and Cubist. Does the same pattern occur?

For the boosted model, the same occurs where V4 has the highest importance score. For the cubist model, V1 has the highest importance score.

library(gbm)
## Warning: package 'gbm' was built under R version 4.3.2
## Loaded gbm 2.1.8.1
library(Cubist)
## Warning: package 'Cubist' was built under R version 4.3.2
set.seed(200)
gbmGrid = expand.grid(interaction.depth = seq(1,7,by=2),
                      n.trees=seq(100, 1000, by=50),
                      shrinkage=c(0.01,0.1,by=0.01),
                      n.minobsinnode=10)
gbmTune = train(y~., data=simulated[, c(1:11)],
                method="gbm",
                trControl=trainControl(method="cv",n=10),
                tuneGrid=gbmGrid,
                verbose=FALSE)
rfImp4 = varImp(gbmTune$finalModel, scale=FALSE)
rfImp4
##       Overall
## V1  4541.6933
## V2  3706.0111
## V3  1670.8471
## V4  5282.7089
## V5  2213.7441
## V6   256.9574
## V7   231.5540
## V8   255.0921
## V9   147.5098
## V10  212.1611
cubistTuned = train(y~., data=simulated[, c(1:11)],
                    method="cubist")
rfImp5 = varImp(cubistTuned$finalModel, scale=FALSE)
rfImp5
##     Overall
## V1     72.0
## V3     42.0
## V2     54.5
## V4     49.0
## V5     40.0
## V6     11.0
## V7      0.0
## V8      0.0
## V9      0.0
## V10     0.0

8.2

Use a simulation to show tree bias with different granularities.

library(rpart)
library(partykit)
## Warning: package 'partykit' was built under R version 4.3.2
## Loading required package: libcoin
## Warning: package 'libcoin' was built under R version 4.3.2
## 
## Attaching package: 'partykit'
## The following objects are masked from 'package:party':
## 
##     cforest, ctree, ctree_control, edge_simple, mob, mob_control,
##     node_barplot, node_bivplot, node_boxplot, node_inner, node_surv,
##     node_terminal, varimp
set.seed(200)

A = sample(1:10/10, 500, replace=TRUE)
B = sample(1:100/100, 500, replace=TRUE)
C = sample(1:1000/1000, 500, replace=TRUE)
D = sample(1:10000/10000, 500, replace=TRUE)
E = sample(1:100000/100000, 500, replace=TRUE)
F = sample(1:1000000/1000000, 500, replace=TRUE)

y = A+B+C+D+E+F
samsim = data.frame(A,B,C,D,E,F,y)
rmodel = rpart(y~., data=samsim)
varImp(rmodel)
##     Overall
## A 0.8058655
## B 3.6915180
## C 4.6177438
## D 4.2017067
## E 2.5931329
## F 2.3592789

8.3

  1. Why does the model on the right focus its importance on just the first few of predictors, whereas the model on the left spreads importance across more predictors?

The right model focused its importance on the first few predictors because of the learning rate. Ideally, you want a low learning rate and bagging fraction for the model to learn slower but perform better, as shown in the left plot. Having the importance distributed across multiple variables shows the models performance on the data provided. Alternatively, the right plot has a high learning rate and bagging fraction, resulting in faster computation but sacrifices performance.

  1. Which model do you think would be more predictive of other samples?

The left model would be more predictive on other samples due to more iterations, slower learning rates, and better performance.

  1. How would increasing interaction depth affect the slope of predictor importance for either model?

Increasing interaction depth would increase the importance of predictors, which calls for more contribution for other predictors. Therefore, the slope would increase for the model.

8.7

library(AppliedPredictiveModeling)
set.seed(100)

data(ChemicalManufacturingProcess)
miss = preProcess(ChemicalManufacturingProcess, method="bagImpute")
Chemical = predict(miss, ChemicalManufacturingProcess)
Chemical = Chemical[, -nearZeroVar(Chemical)]

index = createDataPartition(Chemical$Yield, p=.8, list=FALSE)
trainx=Chemical[index,-1]
trainy=Chemical[index,1]
testx=Chemical[-index,-1]
testy=Chemical[-index,1]
  1. Which tree-based regression model gives the optimal resampling and test set performance?

According to the RMSE of each model, the boosted model gave the optimal resampling and best test set performance.

library(ipred)
set.seed(100)

cartTune = train(trainx, trainy,
                 method="rpart",
                 tuneLength = 10,
                 trControl=trainControl(method="cv"))
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
cartPred = predict(cartTune,testx)
cart = postResample(cartPred,testy)

bagTree = ipredbagg(trainy,trainx)
bagPred = predict(bagTree, testx)
bag = postResample(bagPred, testy)

rf = randomForest(trainx, trainy,
                  importance=TRUE,
                  ntree=1000)
rfPred = predict(rf, testx)
ranf = postResample(rfPred, testy)

gbmGrid = expand.grid(interaction.depth = seq(1,7,by=2),
                      n.trees=seq(100, 1000, by=50),
                      shrinkage=c(0.01,0.1,by=0.01),
                      n.minobsinnode=10)
gbmTune = train(trainx, trainy,
                method="gbm",
                trControl=trainControl(method="cv",n=10),
                tuneGrid=gbmGrid,
                verbose=FALSE)
gbmPred = predict(gbmTune, testx)
gbm = postResample(gbmPred, testy)

cubistTuned = train(trainx, trainy,
                    method="cubist")
cubistPred = predict(cubistTuned, testx)
cube = postResample(cubistPred, testy)

rbind(cart, bag, ranf, gbm, cube)
##          RMSE  Rsquared       MAE
## cart 1.203068 0.6521373 0.8952776
## bag  1.188396 0.7094529 0.8912451
## ranf 1.183908 0.7219475 0.8858208
## gbm  1.013115 0.7569328 0.7700629
## cube 1.128399 0.6869464 0.8257386
  1. Which predictors most important in the optimal tree-based regression model? Do either the biological or process variables dominate the list? How do the top 10 important predictors compare to the top 10 predictors from the optimal linear and nonlinear models?

For the given boosted model, 6 process variables and 4 chemical variables have the highest importance. When compared to the top 10 predictors from the optimal linear and nonlinear models, the results seem to resemble similar results.

varImp(gbmTune, scale=TRUE)
## gbm variable importance
## 
##   only 20 most important variables shown (out of 56)
## 
##                        Overall
## ManufacturingProcess32 100.000
## ManufacturingProcess06  32.631
## BiologicalMaterial12    28.099
## ManufacturingProcess17  27.361
## ManufacturingProcess13  26.190
## ManufacturingProcess31  21.326
## ManufacturingProcess09  20.751
## BiologicalMaterial11    19.244
## BiologicalMaterial02    14.513
## BiologicalMaterial05    12.741
## BiologicalMaterial09    10.887
## ManufacturingProcess37   8.780
## ManufacturingProcess27   8.704
## BiologicalMaterial06     8.637
## BiologicalMaterial08     8.526
## BiologicalMaterial03     7.608
## ManufacturingProcess04   7.438
## ManufacturingProcess14   7.374
## BiologicalMaterial04     7.004
## ManufacturingProcess01   6.805
  1. Plot the optimal single tree with the distribution of yield in the terminal nodes. Does this view of the data provide additional knowledge about the biological or process predictors and their relationship with yield?
library(rpart.plot)
train = Chemical[index,]
test = Chemical[-index,]

rf_model = rpart(Yield~., data=train)
rpart.plot(rf_model)