suppressPackageStartupMessages(library(caret))
suppressPackageStartupMessages(library(randomForest))
suppressPackageStartupMessages(library(AppliedPredictiveModeling))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(partykit))
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(party))
suppressPackageStartupMessages(library(gbm))
suppressPackageStartupMessages(library(Cubist))
suppressPackageStartupMessages(library(rpart))

Question 8.1

Recreate the simulated data from Exercise 7.2:

library(mlbench)
set.seed(200)
simulated <- mlbench.friedman1(200, sd = 1)
simulated <- cbind(simulated$x, simulated$y)
simulated <- as.data.frame(simulated)
colnames(simulated)[ncol(simulated)] <- "y"

(A) Fit a random forest model to all of the predictors, then estimate the variable importance scores:

model1 <- randomForest(y ~ ., data = simulated,
                       importance = TRUE,
                       ntree = 1000)

rfImp1 <- varImp(model1, scale = FALSE)

Did the random forest model significantly use the uninformative predictors (V6 – V10)?

rfImp1
##         Overall
## V1   8.83890885
## V2   6.49023056
## V3   0.67583163
## V4   7.58822553
## V5   2.27426009
## V6   0.17436781
## V7   0.15136583
## V8  -0.03078937
## V9  -0.02989832
## V10 -0.08529218

No. The importance scores for the uninformative predictors (V6 – V10) are all essentially zero (below 0.18 in absolute value, several slightly negative), so the random forest did not significantly use them.
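As an optional visual check (a sketch using randomForest's built-in plot of the permutation importances already computed above):

varImpPlot(model1, type = 1)  # type = 1 plots the permutation importance (%IncMSE)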

(B) Now add an additional predictor that is highly correlated with one of the informative predictors. For example:

simulated$duplicate1 <- simulated$V1 + rnorm(200) * .1
cor(simulated$duplicate1, simulated$V1)
## [1] 0.9396216

Fit another random forest model to these data. Did the importance score for V1 change? What happens when you add another predictor that is also highly correlated with V1?

model2 <- randomForest(y ~ ., data = simulated, importance = TRUE, ntree = 1000)
rfImp2 <- varImp(model2, scale = FALSE)
rfImp2
##                Overall
## V1          6.29780744
## V2          6.08038134
## V3          0.58410718
## V4          6.93924427
## V5          2.03104094
## V6          0.07947642
## V7         -0.02566414
## V8         -0.11007435
## V9         -0.08839463
## V10        -0.00715093
## duplicate1  3.56411581

Yes: introducing a highly correlated predictor substantially decreased the importance score of V1 (from about 8.84 to 6.30), because the trees split the credit between V1 and duplicate1 (which itself scored 3.56). Adding yet another predictor correlated with V1 would be expected to dilute V1's importance even further.
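A quick sketch of that follow-up (exact scores will vary with the random seed; duplicate2 is removed afterwards so parts (C) and (D) run on the same data):

simulated$duplicate2 <- simulated$V1 + rnorm(200) * .1
model3 <- randomForest(y ~ ., data = simulated, importance = TRUE, ntree = 1000)
varImp(model3, scale = FALSE)  # expect V1 to drop again, credit shared across the three copies
simulated$duplicate2 <- NULL   # restore the data set used below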

(C) Use the cforest function in the party package to fit a random forest model using conditional inference trees. The party package function varimp can calculate predictor importance. The conditional argument of that function toggles between the traditional importance measure and the modified version described in Strobl et al. (2007). Do these importances show the same pattern as the traditional random forest model?

ctrl <- cforest_control(mtry = ncol(simulated) - 1)  # consider all 11 predictors at each split
cf_fit <- party::cforest(y ~ ., data = simulated, controls = ctrl)
cfImp1 <- varImp(cf_fit)                      # traditional permutation importance
cfImp2 <- varImp(cf_fit, conditional = TRUE)  # conditional importance (Strobl et al. 2007)
cfImp1 <- cfImp1 %>% setNames(c("cf_ti"))
cfImp2 <- cfImp2 %>% setNames(c("cf_ci"))
names <- rownames(cfImp2)
names[11] <- "V11"  # relabel duplicate1 as "V11" for a compact table
Imp_cf <- cbind(cfImp1, cfImp2)
rownames(Imp_cf) <- NULL
Imp_cf <- cbind(names, Imp_cf)
Imp_cf
##    names       cf_ti        cf_ci
## 1     V1  8.41556912  1.761521323
## 2     V2  7.76850810  4.673402231
## 3     V3  0.01775823  0.028797708
## 4     V4 10.14481309  5.659304965
## 5     V5  2.30344986  0.843799759
## 6     V6 -0.02006879 -0.006068060
## 7     V7  0.07769047  0.026783185
## 8     V8 -0.01937243  0.001640106
## 9     V9 -0.02654575 -0.008257553
## 10   V10  0.01218696  0.003307968
## 11   V11  1.32708646  0.098630026

The overall pattern is the same: the uninformative predictors (V6 – V10) are still not significantly used. Note, however, that the conditional measure sharply shrinks the scores of the correlated pair (V1 drops from 8.42 to 1.76 and duplicate1/V11 from 1.33 to 0.10), which is exactly the adjustment for correlated predictors described in Strobl et al. (2007).
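For reference, Strobl et al. recommend unbiased settings with subsampling instead of bootstrap sampling; a variant worth comparing (a sketch, with mtry = 3 chosen arbitrarily):

ctrl_u <- cforest_unbiased(mtry = 3)
cf_u <- party::cforest(y ~ ., data = simulated, controls = ctrl_u)
party::varimp(cf_u, conditional = TRUE)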

(D) Repeat this process with different tree models, such as boosted trees and Cubist. Does the same pattern occur?

simulated_orig <- simulated[, -12]  # drop duplicate1 to recover the original predictor set
# note: gbm.fit's default shrinkage (0.001) with only 100 trees underfits,
# but the relative importance ranking is still informative
gbm_fit <- gbm.fit(simulated_orig[, -11], simulated_orig[, 11], n.trees = 100, distribution = "gaussian")
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1       24.3778             nan     0.0010    0.0008
##      2       24.3656             nan     0.0010    0.0114
##      3       24.3520             nan     0.0010    0.0117
##      4       24.3387             nan     0.0010    0.0098
##      5       24.3277             nan     0.0010    0.0084
##      6       24.3128             nan     0.0010    0.0121
##      7       24.2983             nan     0.0010    0.0106
##      8       24.2888             nan     0.0010    0.0064
##      9       24.2771             nan     0.0010    0.0108
##     10       24.2647             nan     0.0010    0.0107
##     20       24.1401             nan     0.0010    0.0114
##     40       23.8962             nan     0.0010    0.0115
##     60       23.6494             nan     0.0010    0.0098
##     80       23.4065             nan     0.0010    0.0099
##    100       23.1742             nan     0.0010    0.0100
gbmImp1 <- varImp(gbm_fit, numTrees = 100)
gbmImp1 <- gbmImp1 %>% add_row(Overall = NA) %>% setNames(c("gbm1"))  # pad with an NA row so the two tables align
gbm_fit2 <- gbm.fit(simulated[, -11], simulated[, 11], n.trees = 100, distribution = "gaussian")
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1       24.3736             nan     0.0010    0.0081
##      2       24.3615             nan     0.0010    0.0116
##      3       24.3463             nan     0.0010    0.0110
##      4       24.3332             nan     0.0010    0.0093
##      5       24.3179             nan     0.0010    0.0101
##      6       24.3059             nan     0.0010    0.0102
##      7       24.2932             nan     0.0010    0.0100
##      8       24.2836             nan     0.0010    0.0068
##      9       24.2695             nan     0.0010    0.0113
##     10       24.2564             nan     0.0010    0.0116
##     20       24.1267             nan     0.0010    0.0061
##     40       23.8757             nan     0.0010    0.0095
##     60       23.6213             nan     0.0010    0.0110
##     80       23.3848             nan     0.0010    0.0085
##    100       23.1479             nan     0.0010    0.0099
gbmImp2 <- varImp(gbm_fit2, numTrees = 100)
gbmImp2 <- gbmImp2 %>% setNames(c("gbm2"))
imp <- data.frame(gbmImp1, gbmImp2)
imp
##             gbm1      gbm2
## V1    41764.4452 37333.666
## V2    16991.1125 19873.995
## V3        0.0000     0.000
## V4    12039.7283 11147.187
## V5      621.5645     0.000
## V6        0.0000     0.000
## V7        0.0000     0.000
## V8        0.0000     0.000
## V9        0.0000     0.000
## V10       0.0000     0.000
## ...11         NA  5938.015
cubist_fit1 <- cubist(simulated_orig[, -11], simulated_orig[, 11])
cubist_fit2 <- cubist(simulated[, -11], simulated[, 11])
cbImp1 <- varImp(cubist_fit1) %>% add_row(Overall = NA) %>% setNames(c("Cubist1"))
cbImp2 <- varImp(cubist_fit2) %>% setNames(c("Cubist2"))
cImp <- data.frame(cbImp1, cbImp2)
cImp
##       Cubist1 Cubist2
## V1         50      50
## V2         50      50
## V4         50      50
## V5         50      50
## V3          0       0
## V6          0       0
## V7          0       0
## V8          0       0
## V9          0       0
## V10         0       0
## ...11      NA       0

The same pattern occurs with the other models: the uninformative predictors (V6 – V10) receive essentially no importance. The correlated duplicate behaves differently by model, though: the boosted tree gives duplicate1 some importance at V1's expense, while Cubist ignores it entirely.

Question 8.2

Use a simulation to show tree bias with different granularities.

x1 <- rep(1:2, each = 100)          # informative predictor with low granularity
x2 <- rnorm(200, mean = 0, sd = 4)  # pure noise with high granularity
y <- x1 + rnorm(200, mean = 0, sd = 1)  # y depends on x1 only
sim <- data.frame(y, x1, x2)
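The granularity difference is easy to quantify (x2 has 200 distinct values almost surely, since rnorm draws are continuous):

sapply(sim[, c("x1", "x2")], function(v) length(unique(v)))  # x1: 2, x2: 200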

sim_tree2 <- rpart(y ~ ., data = sim)
plot(as.party(sim_tree2))

varImp(sim_tree2)
##      Overall
## x1 0.2047092
## x2 0.4212352

Although y depends only on x1, the tree assigns the higher importance score to the noise variable x2. Because CART evaluates every candidate split point, a high-granularity predictor offers many more chances to produce a spuriously good split, so single trees are biased toward predictors with more distinct values.

Question 8.3

In stochastic gradient boosting the bagging fraction and learning rate will govern the construction of the trees as they are guided by the gradient. Although the optimal values of these parameters should be obtained through the tuning process, it is helpful to understand how the magnitudes of these parameters affect magnitudes of variable importance. Figure 8.24 provides the variable importance plots for boosting using two extreme values for the bagging fraction (0.1 and 0.9) and the learning rate (0.1 and 0.9) for the solubility data. The left-hand plot has both parameters set to 0.1, and the right-hand plot has both set to 0.9:

(A) Why does the model on the right focus its importance on just the first few predictors, whereas the model on the left spreads importance across more predictors?

The model on the right has both parameters set to 0.9. With a high learning rate, each tree's contribution is added almost in full, so the first strong predictors quickly dominate the ensemble; with a high bagging fraction, nearly the same data are seen by every tree, so the stochastic element that lets other predictors enter is small. The model on the left (0.1 and 0.1) takes many small steps on different random subsamples, which spreads importance across more predictors.
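A minimal sketch of the effect, run on the Friedman-1 data from 8.1 rather than the solubility data (the settings, tree counts, and n.minobsinnode are illustrative, chosen so the small subsamples can still split):

gbm_left  <- gbm(y ~ ., data = simulated_orig, distribution = "gaussian", n.trees = 500,
                 shrinkage = 0.1, bag.fraction = 0.1, n.minobsinnode = 5)
gbm_right <- gbm(y ~ ., data = simulated_orig, distribution = "gaussian", n.trees = 500,
                 shrinkage = 0.9, bag.fraction = 0.9, n.minobsinnode = 5)
summary(gbm_left,  plotit = FALSE)   # importance spread across more predictors
summary(gbm_right, plotit = FALSE)   # importance concentrated on the top few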

(B) Which model do you think would be more predictive of other samples?

The model on the left should be more predictive of new samples: its small learning rate and bagging fraction act as regularization, building the ensemble gradually from many weak contributions. The model on the right risks overfitting by committing to a few dominant predictors early.

(C) How would increasing interaction depth affect the slope of predictor importance for either model in Fig. 8.24?

Increasing the interaction depth lets each tree use more predictors, so importance would be spread across more variables and the slope of the predictor-importance plot would flatten for both models.

Question 8.7

Refer to Exercises 6.3 and 7.5 which describe a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before and train several tree-based models:

data(ChemicalManufacturingProcess)
processPredictors = as.matrix(ChemicalManufacturingProcess[,2:58])
yield = ChemicalManufacturingProcess[,1]  
train_r <- createDataPartition(yield, p=0.75, list=F)
pp_train <- ChemicalManufacturingProcess[train_r,-1]
y_train <-  ChemicalManufacturingProcess[train_r,1]
pp_test <- ChemicalManufacturingProcess[-train_r,-1]
y_test <-  ChemicalManufacturingProcess[-train_r,1]
p_pro <- c("nzv", "center", "scale", "medianImpute")
#PLS
t_ctrl <- trainControl(method = "repeatedcv", repeats = 5)
# NB: the correct argument name is trControl; "trainControl=" is silently absorbed by "...",
# so this fit actually ran with caret's default bootstrap resampling (see the output below)
pls_fit <- train(pp_train, y_train, method = "pls", tuneLength = 10, preProcess = p_pro, trainControl = t_ctrl)
pls_fit
## Partial Least Squares 
## 
## 132 samples
##  57 predictor
## 
## Pre-processing: centered (56), scaled (56), median imputation (56), remove (1) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 132, 132, 132, 132, 132, 132, ... 
## Resampling results across tuning parameters:
## 
##   ncomp  RMSE      Rsquared   MAE     
##    1     1.561215  0.3998819  1.185143
##    2     1.764208  0.3938604  1.153681
##    3     1.615733  0.4568704  1.132305
##    4     1.692610  0.4541577  1.152932
##    5     1.858179  0.4177678  1.234912
##    6     2.002659  0.3961615  1.288455
##    7     2.073512  0.3819987  1.323774
##    8     2.186294  0.3658923  1.371203
##    9     2.291930  0.3512078  1.413300
##   10     2.363162  0.3409140  1.441327
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 1.
pls_pred <- predict(pls_fit, pp_test)
pls_met <- postResample(pred = pls_pred, obs = y_test)
#SVM
# same misnamed argument as above, so this fit also used the default bootstrap resampling
svm_fit <- train(pp_train, y_train, method = "svmRadial", preProcess = p_pro, tuneLength = 10, trainControl = t_ctrl)
svm_fit
## Support Vector Machines with Radial Basis Function Kernel 
## 
## 132 samples
##  57 predictor
## 
## Pre-processing: centered (56), scaled (56), median imputation (56), remove (1) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 132, 132, 132, 132, 132, 132, ... 
## Resampling results across tuning parameters:
## 
##   C       RMSE      Rsquared   MAE      
##     0.25  1.445752  0.4567120  1.1784961
##     0.50  1.335505  0.5129416  1.0792990
##     1.00  1.258567  0.5584455  1.0094827
##     2.00  1.230776  0.5703510  0.9824361
##     4.00  1.218184  0.5763996  0.9721202
##     8.00  1.217078  0.5770642  0.9717604
##    16.00  1.217181  0.5769676  0.9716910
##    32.00  1.217181  0.5769676  0.9716910
##    64.00  1.217181  0.5769676  0.9716910
##   128.00  1.217181  0.5769676  0.9716910
## 
## Tuning parameter 'sigma' was held constant at a value of 0.01472647
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.01472647 and C = 8.
svm_pred <- predict(svm_fit, newdata=pp_test)
svm_met<- postResample(pred=svm_pred,y_test)
# Classification and Regression Tree (this call passes trControl correctly, hence the repeated-CV output)
rpart_fit <- train(pp_train, y_train, method = "rpart2", tuneLength = 10, preProcess = p_pro, trControl = t_ctrl)
## note: only 8 possible values of the max tree depth from the initial fit.
##  Truncating the grid to 8 .
rpart_fit
## CART 
## 
## 132 samples
##  57 predictor
## 
## Pre-processing: centered (56), scaled (56), median imputation (56), remove (1) 
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 120, 118, 118, 120, 119, 119, ... 
## Resampling results across tuning parameters:
## 
##   maxdepth  RMSE      Rsquared   MAE     
##   1         1.657043  0.2668565  1.318424
##   2         1.664055  0.2923191  1.310737
##   3         1.622978  0.3267926  1.299452
##   4         1.605143  0.3450805  1.269968
##   5         1.575368  0.3725999  1.250045
##   6         1.547986  0.3883784  1.216469
##   8         1.532809  0.4067889  1.182646
##   9         1.531029  0.4077865  1.179478
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was maxdepth = 9.
rpart_pred <- predict(rpart_fit,newdata=pp_test)
rpart_met <- postResample(pred=rpart_pred,y_test)
# Cubist
t_ctrl1 <- trainControl(method = "boot", number = 25)
# trainControl= is again ignored, though the intended scheme here is the default bootstrap anyway
cubist_fit <- train(pp_train, y_train, method = "cubist", preProcess = p_pro, tuneLength = 10, trainControl = t_ctrl1)
cubist_fit
## Cubist 
## 
## 132 samples
##  57 predictor
## 
## Pre-processing: centered (56), scaled (56), median imputation (56), remove (1) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 132, 132, 132, 132, 132, 132, ... 
## Resampling results across tuning parameters:
## 
##   committees  neighbors  RMSE      Rsquared   MAE      
##    1          0          1.739171  0.3714657  1.3056829
##    1          5          1.713417  0.3847065  1.2754138
##    1          9          1.722277  0.3801796  1.2832894
##   10          0          1.202202  0.5896811  0.9216567
##   10          5          1.184232  0.6014384  0.8984029
##   10          9          1.193261  0.5950013  0.9081524
##   20          0          1.172837  0.6073090  0.9051957
##   20          5          1.154413  0.6191364  0.8799411
##   20          9          1.163940  0.6126305  0.8908981
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were committees = 20 and neighbors = 5.
cubist_pred <- predict(cubist_fit,newdata=pp_test)
cub_met <- postResample(pred=cubist_pred,y_test)
# Gradient Boosting: 96-point grid (6 depths x 4 tree counts x 4 shrinkages)
t_ctrl1 <- trainControl(method = "boot", number = 25)
gbmGrid <- expand.grid(interaction.depth = seq(1, 6, by = 1), n.trees = c(25, 50, 100, 200),
                       shrinkage = c(0.01, 0.05, 0.1, 0.2), n.minobsinnode = 10)
# note: this model is selected on Rsquared, whereas the others were selected on RMSE
gbm_fit <- suppressWarnings(train(pp_train, y_train, method = "gbm", metric = "Rsquared",
                                  tuneGrid = gbmGrid, trControl = t_ctrl1, verbose = FALSE))


gbm_pred <- predict(gbm_fit,newdata=pp_test)
gbm_met<- postResample(pred=gbm_pred,y_test)

(A) Which tree-based regression model gives the optimal resampling and test set performance?

comb_met <- data.frame(pls_met, svm_met, rpart_met, cub_met, gbm_met)
comb_met <- data.frame(t(comb_met))  # transpose: one row per model
model <- c("PLS", "SVM", "RPART", "CUBIST", "GBM")
rownames(comb_met) <- NULL
comb_met <- data.frame(cbind(model, comb_met))
comb_met
##    model     RMSE  Rsquared       MAE
## 1    PLS 1.442490 0.3294248 1.1384654
## 2    SVM 1.176656 0.5411589 0.9476897
## 3  RPART 1.431202 0.3780401 1.0958127
## 4 CUBIST 1.247379 0.5149756 0.9185457
## 5    GBM 1.192515 0.5372166 0.9174782

Across all five models, SVM gave the best test set performance, with the lowest RMSE and highest R squared. Among the tree-based models the question asks about, GBM was best (RMSE 1.193, R squared 0.537), narrowly ahead of Cubist.
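Had all five models been fit with a single shared trainControl object, their resampling distributions could also be compared directly (a sketch; as run above the models used different resampling schemes, so this call would require refitting them first):

rs <- resamples(list(PLS = pls_fit, SVM = svm_fit, RPART = rpart_fit, CUBIST = cubist_fit, GBM = gbm_fit))
summary(rs)
bwplot(rs, metric = "RMSE")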

(B) Which predictors are most important in the optimal tree-based regression model? Do either the biological or process variables dominate the list? How do the top 10 important predictors compare to the top 10 predictors from the optimal linear and nonlinear models?

varImp(svm_fit)  # SVM has no built-in importance, so caret falls back to a model-free loess R-squared measure
## loess r-squared variable importance
## 
##   only 20 most important variables shown (out of 57)
## 
##                        Overall
## ManufacturingProcess13  100.00
## ManufacturingProcess17   88.97
## ManufacturingProcess09   86.27
## ManufacturingProcess32   84.20
## BiologicalMaterial06     80.90
## BiologicalMaterial12     79.59
## ManufacturingProcess11   71.68
## BiologicalMaterial03     69.52
## ManufacturingProcess36   68.48
## ManufacturingProcess31   67.85
## BiologicalMaterial02     65.20
## ManufacturingProcess30   60.45
## ManufacturingProcess06   59.94
## BiologicalMaterial11     53.31
## BiologicalMaterial08     49.54
## BiologicalMaterial04     46.72
## ManufacturingProcess33   43.20
## BiologicalMaterial09     39.94
## ManufacturingProcess29   38.05
## BiologicalMaterial01     36.95

The manufacturing process variables dominate: they hold the top four positions and eleven of the 20 shown, so process variables outweigh biological ones near the top of the list.
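Strictly, the question asks about the optimal tree-based model; the analogous ranking for the boosted tree (output not shown here) would come from:

varImp(gbm_fit)
plot(varImp(gbm_fit), top = 10)  # top-10 predictors for the GBM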

(C) Plot the optimal single tree with the distribution of yield in the terminal nodes. Does this view of the data provide additional knowledge about the biological or process predictors and their relationship with yield?

rp_fit <- rpart(y_train ~ ., data = pp_train, maxdepth = 3)  # shallow tree for a readable plot
plot(as.party(rp_fit), gp = gpar(fontsize = 8))

The single tree splits first on manufacturing process variables, and the terminal-node yield distributions separate along those process predictors rather than the biological ones. This reinforces the importance rankings above and suggests that yield is most directly influenced by the controllable process steps.