Libraries
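The library calls were not echoed in this section; the list below is an assumed minimal set of packages needed by the code that follows (adjust to your environment).

library(mlbench)                    # mlbench.friedman1() simulated data
library(randomForest)               # randomForest()
library(caret)                      # varImp(), train(), postResample(), dotPlot()
library(party)                      # cforest(), cforest_control()
library(partykit)                   # as.party() for plotting rpart trees
library(rpart)                      # rpart() single regression trees
library(gbm)                        # gbm.fit() boosted trees
library(Cubist)                     # cubist() rule-based models
library(dplyr)                      # %>%, arrange(), top_n()
library(tibble)                     # add_row(), rownames_to_column()
library(AppliedPredictiveModeling)  # ChemicalManufacturingProcess data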

8.1 Recreate the simulated data from Exercise 7.2:

set.seed(200)
simulated <- mlbench.friedman1(200, sd = 1)
simulated <- cbind(simulated$x, simulated$y)
simulated <- as.data.frame(simulated)
colnames(simulated)[ncol(simulated)] <- "y"

(a) Fit a random forest model to all of the predictors, then estimate the variable importance scores:

model1 <- randomForest(y ~ ., data = simulated, importance = TRUE, ntree = 1000) 
rfImp1 <- varImp(model1, scale = FALSE)
rfImp1
##          Overall
## V1   8.732235404
## V2   6.415369387
## V3   0.763591825
## V4   7.615118809
## V5   2.023524577
## V6   0.165111172
## V7  -0.005961659
## V8  -0.166362581
## V9  -0.095292651
## V10 -0.074944788

Did the random forest model significantly use the uninformative predictors (V6-V10)?

The variable importance scores from the random forest show that V1-V5 are the dominant predictors; the uninformative predictors V6-V10 were not used significantly, with scores near zero or negative.
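A quick visual confirmation (a sketch; varImpPlot() is randomForest's built-in importance plot):

# Plot permutation (%IncMSE) and node-purity importance; V1-V5 should stand
# clearly above the near-zero scores of V6-V10
varImpPlot(model1, main = "Random forest importance, Friedman1 data")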

(b) Now add an additional predictor that is highly correlated with one of the informative predictors. For example:

simulated$duplicate1 <- simulated$V1 + rnorm(200) * .1
cor(simulated$duplicate1, simulated$V1)
## [1] 0.9460206

Fit another random forest model to these data. Did the importance score for V1 change? What happens when you add another predictor that is also highly correlated with V1?

model2 <- randomForest( y ~ ., data=simulated, importance=TRUE, ntree=1000 )
rfImp2 <- varImp(model2, scale = FALSE)
rfImp2
##                Overall
## V1          5.69119973
## V2          6.06896061
## V3          0.62970218
## V4          7.04752238
## V5          1.87238438
## V6          0.13569065
## V7         -0.01345645
## V8         -0.04370565
## V9          0.00840438
## V10         0.02894814
## duplicate1  4.28331581
rfImp1 <- rfImp1 %>% add_row(Overall=NA) %>% setNames(c("model1")) 
rfImp2 <- rfImp2 %>% setNames(c("model2"))
names<- rownames(rfImp2) 
names[11] <- "V11"
Imp_all <- cbind(rfImp1,rfImp2)
rownames(Imp_all) <- NULL
Imp_all <- cbind(names,Imp_all) 
Imp_all
##    names       model1      model2
## 1     V1  8.732235404  5.69119973
## 2     V2  6.415369387  6.06896061
## 3     V3  0.763591825  0.62970218
## 4     V4  7.615118809  7.04752238
## 5     V5  2.023524577  1.87238438
## 6     V6  0.165111172  0.13569065
## 7     V7 -0.005961659 -0.01345645
## 8     V8 -0.166362581 -0.04370565
## 9     V9 -0.095292651  0.00840438
## 10   V10 -0.074944788  0.02894814
## 11   V11           NA  4.28331581

Introducing a highly correlated variable decreased the importance score of V1 (from about 8.7 to 5.7); V4 is now the dominant predictor and part of V1's importance has been absorbed by duplicate1. Adding another predictor that is also highly correlated with V1 should dilute its score even further; a sketch of that follow-up is below.
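A sketch (not run in the original analysis) that adds a second correlated copy, duplicate2, and refits:

# Hypothetical follow-up: a second predictor highly correlated with V1
set.seed(200)
simulated$duplicate2 <- simulated$V1 + rnorm(200) * .1
model3 <- randomForest(y ~ ., data = simulated, importance = TRUE, ntree = 1000)
varImp(model3, scale = FALSE)   # V1's score should drop again, shared with duplicate1/duplicate2
simulated$duplicate2 <- NULL    # drop the extra column so later chunks are unaffected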

(c) Use the cforest function in the party package to fit a random forest model using conditional inference trees. The party package function varimp can calculate predictor importance. The conditional argument of that function toggles between the traditional importance measure and the modified version described in Strobl et al. (2007). Do these importances show the same pattern as the traditional random forest model?

set.seed(200)
ctrl <- cforest_control(mtry = ncol(simulated) - 1)
cf_fit <- party::cforest(y ~ ., data = simulated, controls = ctrl)
cfImp1 <- varImp(cf_fit) 
cfImp2 <- varImp(cf_fit, conditional=T) 
cfImp1 <- cfImp1 %>% setNames(c("cf_ti")) 
cfImp2 <- cfImp2 %>% setNames(c("cf_ci"))
names<- rownames(cfImp2) 
names[11] <- "V11"
Imp_cf <- cbind(cfImp1,cfImp2)
rownames(Imp_cf) <- NULL
Imp_cf <- cbind(names,Imp_cf) 
Imp_cf
##    names       cf_ti        cf_ci
## 1     V1  3.75726531  0.600678542
## 2     V2  7.35210373  4.588727102
## 3     V3  0.02552462  0.005858870
## 4     V4  9.95219052  6.140435409
## 5     V5  2.07525107  0.726520861
## 6     V6 -0.03627375  0.003208094
## 7     V7  0.03866849  0.021965866
## 8     V8 -0.04209865 -0.004550812
## 9     V9 -0.02705505  0.003543102
## 10   V10  0.02546114  0.014950855
## 11   V11  5.51730707  0.774554525

The overall pattern is similar: V1, V2, V4, V5 and the duplicate carry the importance, while V6-V10 are again not used significantly. The conditional measure, however, shrinks the scores of the correlated pair V1 and duplicate1 (labelled V11 in the table) much more than those of V2 and V4, because it permutes a predictor within strata defined by its correlated peers.

(d) Repeat this process with different tree models, such as boosted trees and Cubist. Does the same pattern occur?

simulated_orig <- simulated[,-12]
gbm_fit <- gbm.fit(simulated_orig[,-11], simulated_orig[,11], n.trees = 100,distribution="gaussian")
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1       24.3716             nan     0.0010    0.0101
##      2       24.3587             nan     0.0010    0.0101
##      3       24.3460             nan     0.0010    0.0104
##      4       24.3345             nan     0.0010    0.0084
##      5       24.3227             nan     0.0010    0.0084
##      6       24.3126             nan     0.0010    0.0058
##      7       24.3006             nan     0.0010    0.0109
##      8       24.2875             nan     0.0010    0.0108
##      9       24.2755             nan     0.0010    0.0100
##     10       24.2643             nan     0.0010    0.0110
##     20       24.1394             nan     0.0010    0.0086
##     40       23.8966             nan     0.0010    0.0042
##     60       23.6548             nan     0.0010    0.0112
##     80       23.4069             nan     0.0010    0.0109
##    100       23.1839             nan     0.0010    0.0083
gbmImp1 <- varImp(gbm_fit, numTrees = 100)
gbmImp1 <- gbmImp1 %>% add_row(Overall=NA) %>% setNames(c("gbm1")) 
gbm_fit2 <- gbm.fit(simulated[,-11], simulated[,11], n.trees = 100,distribution="gaussian")
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1       24.3737             nan     0.0010    0.0095
##      2       24.3610             nan     0.0010    0.0087
##      3       24.3512             nan     0.0010    0.0047
##      4       24.3388             nan     0.0010    0.0103
##      5       24.3264             nan     0.0010    0.0098
##      6       24.3141             nan     0.0010    0.0080
##      7       24.3019             nan     0.0010    0.0069
##      8       24.2916             nan     0.0010    0.0091
##      9       24.2796             nan     0.0010    0.0104
##     10       24.2669             nan     0.0010    0.0104
##     20       24.1347             nan     0.0010    0.0114
##     40       23.8809             nan     0.0010    0.0095
##     60       23.6437             nan     0.0010    0.0091
##     80       23.4089             nan     0.0010    0.0063
##    100       23.1668             nan     0.0010    0.0090
gbmImp2 <- varImp(gbm_fit2, numTrees = 100)
gbmImp2 <- gbmImp2 %>%setNames(c("gbm2"))
imp <- data.frame(gbmImp1,gbmImp2)
imp
##                gbm1     gbm2
## V1         37303.00 26729.87
## V2         21991.62 19097.14
## V3             0.00     0.00
## V4         11202.37 13459.04
## V5             0.00     0.00
## V6             0.00     0.00
## V7             0.00     0.00
## V8             0.00     0.00
## V9             0.00     0.00
## V10            0.00     0.00
## duplicate1       NA 12834.88

With the gradient boosting model the broad pattern holds: V1, V2 and V4 dominate and the uninformative V6-V10 receive zero importance. V3 and V5, which had small but nonzero scores under the random forest, drop to zero here, and the importance values are on a very different scale because gbm reports relative influence (total reduction in squared error) rather than a permutation-based score. Adding duplicate1 again pulls importance away from V1.
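One way to view the gbm importances normalized to sum to 100 is gbm's own summary method (a sketch):

# Relative influence normalized to 100; plotit = FALSE returns a data frame
summary(gbm_fit2, n.trees = 100, plotit = FALSE)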

cubist_fit1 <- cubist(simulated_orig[,-11], simulated_orig[,11])
cubist_fit2 <- cubist(simulated[,-11],simulated[,11])
cbImp1 <- varImp(cubist_fit1)%>% add_row(Overall=NA) %>% setNames(c("Cubist1"))
cbImp2<- varImp(cubist_fit2) %>% setNames(c("Cubist2"))
cImp <- data.frame(cbImp1,cbImp2)
cImp
##            Cubist1 Cubist2
## V1              50      50
## V2              50      50
## V4              50      50
## V5              50      50
## duplicate1       0      50
## V3               0       0
## V6               0       0
## V7               0       0
## V8               0       0
## V9               0       0
## V10             NA       0

The Cubist model reports importance on yet another scale, but the overall pattern is the same: the informative predictors V1, V2, V4 and V5 are used while V6-V10 are not. Adding the correlated predictor does not change which variables are flagged, although duplicate1 now receives the same importance as V1. As with the gbm model, V3 is no longer considered important.
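Cubist scores a predictor by how often it appears in rule conditions and in the per-rule linear models, which is why the used predictors all land at the same value here; the fitted rules can be inspected directly (a sketch):

# Print the committees/rules and their linear models; the usage table at the
# end of the output is what varImp() summarizes
summary(cubist_fit2)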

8.2 Use a simulation to show tree bias with different granularities.

Tree-based models suffer from selection bias: predictors with many distinct values (fine granularity) are favoured over coarse, low-granularity predictors. Even a pure noise variable can be preferred over an informative low-granularity predictor simply because it offers many more candidate split points, and this effect grows as the noise in the data increases. Below we simulate a response driven by a coarse, two-level predictor and add an uninformative continuous predictor to check for this bias.

https://stats.stackexchange.com/questions/262794/why-does-a-decision-tree-have-low-bias-high-variance

x1 <- rep(1:2, each=100) 
x2 <- rnorm(200, mean=0, sd=4)
y <- x1 + rnorm(200, mean= 0 , sd=1)
sim <- data.frame(y,x1,x2)

sim_tree2 <- rpart(y~., data=sim)
plot(as.party(sim_tree2))

varImp(sim_tree2)
##      Overall
## x1 0.3302428
## x2 0.4565380

Variable x2 is uncorrelated with the response, while x1 (only two distinct values) actually drives y and should be preferred. Nevertheless, because x2 offers many more candidate split points, it picks up splits on noise and ends up with the higher importance score in the fitted tree.
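To see the bias beyond a single draw, a sketch that repeats the simulation under the same data-generating setup and averages the rpart importances of the coarse informative predictor x1 and the fine-grained noise predictor x2:

# Average rpart importance over repeated simulations of the same design
imp_one <- function() {
  x1 <- rep(1:2, each = 100)           # informative, only two distinct values
  x2 <- rnorm(200, mean = 0, sd = 4)   # pure noise, ~200 distinct values
  y  <- x1 + rnorm(200, mean = 0, sd = 1)
  fit <- rpart(y ~ x1 + x2, data = data.frame(y, x1, x2))
  imp <- varImp(fit)
  c(x1 = if ("x1" %in% rownames(imp)) imp["x1", "Overall"] else 0,
    x2 = if ("x2" %in% rownames(imp)) imp["x2", "Overall"] else 0)
}
set.seed(100)
rowMeans(replicate(100, imp_one()))    # how much importance the noise variable accumulates on average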

8.3 In stochastic gradient boosting the bagging fraction and learning rate will govern the construction of the trees as they are guided by the gradient. Although the optimal values of these parameters should be obtained through the tuning process, it is helpful to understand how the magnitudes of these parameters affect magnitudes of variable importance. Figure 8.24 provides the variable importance plots for boosting using two extreme values for the bagging fraction (0.1 and 0.9) and the learning rate (0.1 and 0.9) for the solubility data. The left-hand plot has both parameters set to 0.1, and the right-hand plot has both set to 0.9:

(a) Why does the model on the right focus its importance on just the first few of predictors, whereas the model on the left spreads importance across more predictors?

The learning rate (shrinkage) constrains the greediness of boosting: small values force the model to build up the fit gradually across many trees and many predictors, whereas a large learning rate lets the first few trees, and hence the first few strong predictors, dominate. Likewise, a large bagging fraction means each tree sees nearly the same data, so the same predictors are selected over and over. The right-hand model, with both parameters at 0.9, therefore concentrates its importance on just a few predictors, while the left-hand model spreads importance across many more.
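A sketch of how the two panels of Fig. 8.24 could be reproduced on the solubility data; the exact settings used in the figure (number of trees, interaction depth) are not stated here, so those values are assumptions:

data(solubility)   # solTrainXtrans / solTrainY from AppliedPredictiveModeling

# Left-hand panel: bagging fraction and learning rate both 0.1
gbm_left  <- gbm.fit(solTrainXtrans, solTrainY, distribution = "gaussian",
                     n.trees = 100, shrinkage = 0.1, bag.fraction = 0.1, verbose = FALSE)

# Right-hand panel: both parameters set to 0.9
gbm_right <- gbm.fit(solTrainXtrans, solTrainY, distribution = "gaussian",
                     n.trees = 100, shrinkage = 0.9, bag.fraction = 0.9, verbose = FALSE)

# The right-hand importances should concentrate on a handful of predictors,
# while the left-hand ones are spread over many more
head(summary(gbm_left,  plotit = FALSE), 10)
head(summary(gbm_right, plotit = FALSE), 10)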

(b) Which model do you think would be more predictive of other samples?

As the textbook notes, greedy models risk missing the optimal solution and overfitting the training data. Lower learning rates are generally preferred because they make the model less dependent on the idiosyncrasies of any single tree and allow it to generalize better. The less greedy model, the one on the left, would therefore be more predictive of other samples.

(c) How would increasing interaction depth affect the slope of predictor importance for either model in Fig. 8.24?

Increasing the interaction depth lets each tree use more predictors, so importance is spread over a larger set of variables and the slope of the predictor-importance plot flattens for both models; the change would be most visible for the model on the right, which currently concentrates its importance on only a few predictors.

8.7 Refer to Exercises 6.3 and 7.5 which describe a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before and train several tree-based models:

library(AppliedPredictiveModeling)
data(ChemicalManufacturingProcess)
processPredictors = as.matrix(ChemicalManufacturingProcess[,2:58])
yield = ChemicalManufacturingProcess[,1]  
train_r <- createDataPartition(yield, p=0.75, list=F)
pp_train <- ChemicalManufacturingProcess[train_r,-1]
y_train <-  ChemicalManufacturingProcess[train_r,1]
pp_test <- ChemicalManufacturingProcess[-train_r,-1]
y_test <-  ChemicalManufacturingProcess[-train_r,1]
p_pro <- c("nzv", "center","scale", "medianImpute")

We will rerun the previous PLS model and our best nonlinear model (SVM) from the earlier exercises and compare them with the tree-based models.

PLS

t_ctrl <- trainControl(method = "repeatedcv", repeats = 5)
pls_fit <- train(pp_train, y_train, method = "pls", tuneLength = 10, preProcess = p_pro, trControl = t_ctrl)
pls_fit
## Partial Least Squares 
## 
## 132 samples
##  57 predictor
## 
## Pre-processing: centered (56), scaled (56), median imputation (56),
##  remove (1) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 132, 132, 132, 132, 132, 132, ... 
## Resampling results across tuning parameters:
## 
##   ncomp  RMSE      Rsquared   MAE     
##    1     1.561055  0.3774014  1.163414
##    2     1.934334  0.3446223  1.212661
##    3     1.787266  0.3816435  1.180361
##    4     2.050947  0.3508562  1.227950
##    5     2.389644  0.3250761  1.303656
##    6     2.517391  0.3155029  1.344945
##    7     2.634067  0.3178341  1.381436
##    8     2.673197  0.3228776  1.413350
##    9     2.823067  0.3215343  1.465793
##   10     2.991490  0.3087708  1.507498
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 1.
pls_pred <- predict(pls_fit, pp_test)
pls_met <- postResample(pred = pls_pred, obs = y_test)

SVM

svm_fit <- train(pp_train, y_train, method = "svmRadial", preProcess = p_pro, tuneLength = 10, trControl = t_ctrl)
svm_fit
## Support Vector Machines with Radial Basis Function Kernel 
## 
## 132 samples
##  57 predictor
## 
## Pre-processing: centered (56), scaled (56), median imputation (56),
##  remove (1) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 132, 132, 132, 132, 132, 132, ... 
## Resampling results across tuning parameters:
## 
##   C       RMSE      Rsquared   MAE      
##     0.25  1.420427  0.4334232  1.1464269
##     0.50  1.338306  0.4739383  1.0751440
##     1.00  1.272648  0.5133583  1.0191364
##     2.00  1.231911  0.5391300  0.9824667
##     4.00  1.206938  0.5537465  0.9581289
##     8.00  1.199927  0.5580732  0.9474439
##    16.00  1.199796  0.5578186  0.9465771
##    32.00  1.199796  0.5578186  0.9465771
##    64.00  1.199796  0.5578186  0.9465771
##   128.00  1.199796  0.5578186  0.9465771
## 
## Tuning parameter 'sigma' was held constant at a value of 0.0134643
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.0134643 and C = 16.
plot(svm_fit)

svm_pred <- predict(svm_fit, newdata=pp_test)
svm_met<- postResample(pred=svm_pred,y_test)

Classification and Regression Tree

set.seed(100)
rpart_fit <- train(pp_train, y_train,method = "rpart2",tuneLength = 10, preProcess= p_pro,trControl = t_ctrl)
## note: only 9 possible values of the max tree depth from the initial fit.
##  Truncating the grid to 9 .
rpart_fit
## CART 
## 
## 132 samples
##  57 predictor
## 
## Pre-processing: centered (56), scaled (56), median imputation (56),
##  remove (1) 
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 118, 118, 117, 119, 119, 119, ... 
## Resampling results across tuning parameters:
## 
##   maxdepth  RMSE      Rsquared   MAE     
##    1        1.371111  0.4725263  1.108974
##    2        1.474377  0.4026948  1.193796
##    3        1.501114  0.3911224  1.215081
##    4        1.507949  0.3890736  1.214658
##    5        1.506176  0.3993266  1.201189
##    6        1.509971  0.3989018  1.200145
##    8        1.520948  0.4039172  1.199111
##    9        1.529030  0.4031171  1.207630
##   10        1.531485  0.4010683  1.214313
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was maxdepth = 1.
plot(rpart_fit)

rpart_pred <- predict(rpart_fit,newdata=pp_test)
rpart_met <- postResample(pred=rpart_pred,y_test)

Cubist

t_ctrl1 <- trainControl(method = "boot", number = 25)
cubist_fit <- train(pp_train, y_train, method = "cubist", preProcess = p_pro, tuneLength = 10, trControl = t_ctrl1)
cubist_fit
## Cubist 
## 
## 132 samples
##  57 predictor
## 
## Pre-processing: centered (56), scaled (56), median imputation (56),
##  remove (1) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 132, 132, 132, 132, 132, 132, ... 
## Resampling results across tuning parameters:
## 
##   committees  neighbors  RMSE      Rsquared   MAE      
##    1          0          1.940278  0.2830995  1.4049493
##    1          5          1.929041  0.2893836  1.3878733
##    1          9          1.924183  0.2885351  1.3890951
##   10          0          1.362353  0.4781958  1.0397573
##   10          5          1.345388  0.4917660  1.0200376
##   10          9          1.351590  0.4856743  1.0260618
##   20          0          1.309823  0.5081134  1.0021506
##   20          5          1.288967  0.5239613  0.9770453
##   20          9          1.296986  0.5168396  0.9844058
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were committees = 20 and neighbors = 5.
plot(cubist_fit)

cubist_pred <- predict(cubist_fit,newdata=pp_test)
cub_met <- postResample(pred=cubist_pred,y_test)

Gradient Boosting

t_ctrl1 <- trainControl(method = "boot", number = 25)
gbmGrid <- expand.grid(interaction.depth=seq(1,6,by=1), n.trees=c(25,50,100,200), shrinkage=c(0.01,0.05,0.1,0.2), n.minobsinnode=10)
gbm_fit <- suppressWarnings(train(pp_train, y_train, method = "gbm", metric = "Rsquared",tuneGrid=gbmGrid,  trControl = t_ctrl1, verbose=F))
gbm_fit
## Stochastic Gradient Boosting 
## 
## 132 samples
##  57 predictor
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 132, 132, 132, 132, 132, 132, ... 
## Resampling results across tuning parameters:
## 
##   shrinkage  interaction.depth  n.trees  RMSE      Rsquared   MAE     
##   0.01       1                   25      1.712756  0.4454776  1.385802
##   0.01       1                   50      1.621988  0.4661492  1.312226
##   0.01       1                  100      1.503242  0.4837555  1.214042
##   0.01       1                  200      1.388204  0.4971314  1.115721
##   0.01       2                   25      1.690658  0.4748192  1.368556
##   0.01       2                   50      1.583752  0.4837508  1.280807
##   0.01       2                  100      1.451896  0.4984267  1.171379
##   0.01       2                  200      1.352892  0.5034784  1.079600
##   0.01       3                   25      1.679457  0.4736221  1.359423
##   0.01       3                   50      1.570638  0.4791374  1.272411
##   0.01       3                  100      1.439364  0.4926281  1.160686
##   0.01       3                  200      1.344050  0.5037625  1.069747
##   0.01       4                   25      1.678033  0.4744426  1.358522
##   0.01       4                   50      1.568640  0.4788039  1.271183
##   0.01       4                  100      1.436651  0.4911090  1.158420
##   0.01       4                  200      1.342080  0.5032271  1.067274
##   0.01       5                   25      1.679078  0.4722191  1.359078
##   0.01       5                   50      1.568681  0.4796975  1.269627
##   0.01       5                  100      1.438563  0.4914669  1.159340
##   0.01       5                  200      1.342580  0.5030176  1.068448
##   0.01       6                   25      1.679085  0.4774034  1.358302
##   0.01       6                   50      1.566659  0.4869812  1.267870
##   0.01       6                  100      1.436209  0.4952178  1.155661
##   0.01       6                  200      1.340005  0.5065398  1.064695
##   0.05       1                   25      1.461375  0.4834037  1.177216
##   0.05       1                   50      1.365967  0.4968327  1.092312
##   0.05       1                  100      1.322783  0.5038166  1.046304
##   0.05       1                  200      1.314258  0.5039139  1.031652
##   0.05       2                   25      1.421236  0.4802892  1.141874
##   0.05       2                   50      1.340784  0.4981844  1.068276
##   0.05       2                  100      1.314378  0.5053502  1.036043
##   0.05       2                  200      1.304759  0.5110213  1.021403
##   0.05       3                   25      1.408264  0.4835587  1.131230
##   0.05       3                   50      1.339575  0.4924542  1.062561
##   0.05       3                  100      1.324908  0.4965663  1.046805
##   0.05       3                  200      1.314981  0.5047032  1.035036
##   0.05       4                   25      1.405206  0.4777145  1.124655
##   0.05       4                   50      1.336463  0.4936479  1.055397
##   0.05       4                  100      1.312522  0.5051812  1.032138
##   0.05       4                  200      1.304388  0.5110657  1.021616
##   0.05       5                   25      1.408188  0.4783661  1.130393
##   0.05       5                   50      1.341027  0.4919164  1.064794
##   0.05       5                  100      1.320026  0.4998635  1.039727
##   0.05       5                  200      1.315285  0.5031942  1.034693
##   0.05       6                   25      1.406991  0.4776274  1.122459
##   0.05       6                   50      1.343760  0.4848734  1.055168
##   0.05       6                  100      1.322768  0.4945521  1.031141
##   0.05       6                  200      1.311970  0.5024329  1.022440
##   0.10       1                   25      1.356622  0.4905137  1.085220
##   0.10       1                   50      1.325515  0.4975304  1.050352
##   0.10       1                  100      1.318844  0.4989554  1.032740
##   0.10       1                  200      1.312747  0.5052850  1.027093
##   0.10       2                   25      1.369932  0.4706975  1.093761
##   0.10       2                   50      1.339275  0.4892267  1.057327
##   0.10       2                  100      1.339444  0.4887403  1.050442
##   0.10       2                  200      1.330924  0.4964887  1.044655
##   0.10       3                   25      1.370322  0.4629538  1.082265
##   0.10       3                   50      1.346881  0.4776285  1.058223
##   0.10       3                  100      1.332679  0.4891845  1.048575
##   0.10       3                  200      1.325857  0.4958345  1.041328
##   0.10       4                   25      1.335380  0.4968940  1.053380
##   0.10       4                   50      1.319177  0.5023743  1.038059
##   0.10       4                  100      1.315065  0.5061081  1.031123
##   0.10       4                  200      1.324114  0.5001117  1.036582
##   0.10       5                   25      1.339207  0.4907860  1.059958
##   0.10       5                   50      1.329592  0.4902802  1.043180
##   0.10       5                  100      1.321380  0.4965297  1.035965
##   0.10       5                  200      1.324372  0.4968625  1.041277
##   0.10       6                   25      1.354030  0.4779726  1.069869
##   0.10       6                   50      1.334962  0.4883882  1.052556
##   0.10       6                  100      1.331603  0.4899311  1.046333
##   0.10       6                  200      1.331162  0.4926485  1.044852
##   0.20       1                   25      1.348991  0.4747716  1.072010
##   0.20       1                   50      1.336140  0.4855201  1.051852
##   0.20       1                  100      1.334149  0.4913595  1.050797
##   0.20       1                  200      1.356129  0.4795985  1.067157
##   0.20       2                   25      1.350177  0.4782256  1.062311
##   0.20       2                   50      1.340871  0.4851169  1.050132
##   0.20       2                  100      1.349305  0.4858283  1.058905
##   0.20       2                  200      1.353554  0.4846227  1.063774
##   0.20       3                   25      1.363556  0.4633107  1.073163
##   0.20       3                   50      1.352829  0.4723090  1.070220
##   0.20       3                  100      1.362558  0.4695725  1.074481
##   0.20       3                  200      1.365353  0.4695418  1.075685
##   0.20       4                   25      1.370253  0.4613634  1.083815
##   0.20       4                   50      1.363473  0.4677953  1.071989
##   0.20       4                  100      1.369527  0.4661172  1.076336
##   0.20       4                  200      1.369108  0.4674363  1.073481
##   0.20       5                   25      1.368572  0.4607400  1.067419
##   0.20       5                   50      1.364387  0.4663933  1.062282
##   0.20       5                  100      1.364317  0.4687609  1.064470
##   0.20       5                  200      1.363963  0.4700279  1.064037
##   0.20       6                   25      1.375013  0.4554775  1.075382
##   0.20       6                   50      1.376221  0.4573589  1.078668
##   0.20       6                  100      1.384439  0.4535269  1.083926
##   0.20       6                  200      1.386157  0.4540562  1.084007
## 
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## Rsquared was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 200,
##  interaction.depth = 4, shrinkage = 0.05 and n.minobsinnode = 10.
plot(gbm_fit)

gbm_pred <- predict(gbm_fit,newdata=pp_test)
gbm_met<- postResample(pred=gbm_pred,y_test)

(a) Which tree-based regression model gives the optimal resampling and test set performance?

comb_met <- data.frame(pls_met, svm_met, rpart_met, cub_met,gbm_met) 
comb_met <- data.frame(t(comb_met))
model<- c("PLS", "SVM", "RPART", "CUBIST", "GBM")
rownames(comb_met) <- NULL
comb_met <- data.frame(cbind(model,comb_met))
comb_met
##    model     RMSE  Rsquared       MAE
## 1    PLS 1.430764 0.4467640 1.2118667
## 2    SVM 1.150287 0.6799685 0.8682791
## 3  RPART 1.578932 0.3276660 1.2274628
## 4 CUBIST 1.014967 0.7530604 0.7547706
## 5    GBM 1.023675 0.7482967 0.7649099

The Cubist model outperforms the earlier PLS model, the nonlinear SVM, and the other tree-based models, with the lowest RMSE and highest R-squared on the test set (GBM is a close second).
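Resampling performance can be compared directly as well; a sketch using caret's resamples(), which assumes all of the models were fit with the same trainControl settings (the same number of resamples is required):

# Collect and compare the resampling distributions of the fitted models
res <- resamples(list(PLS = pls_fit, SVM = svm_fit, RPART = rpart_fit,
                      CUBIST = cubist_fit, GBM = gbm_fit))
summary(res)
bwplot(res, metric = "RMSE")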

(b) Which predictors are most important in the optimal tree-based regression model? Do either the biological or process variables dominate the list? How do the top 10 important predictors compare to the top 10 predictors from the optimal linear and nonlinear models?

The variable importance from the Cubist model is broadly aligned with the results from the previous linear and nonlinear models. ManufacturingProcess32 appears near the top in all three models: it ranks first for PLS and SVM and second for Cubist, behind BiologicalMaterial03. In all three models the ManufacturingProcess variables outnumber the BiologicalMaterial variables in the top 10, and the ratio of process to biological predictors is roughly the same, although the exact ordering and the importance values differ between models.

cub_dot <-dotPlot(varImp(cubist_fit), top=10)
pls_dot <-dotPlot(varImp(pls_fit), top=10)
svm_dot <- dotPlot(varImp(svm_fit), top=10)
#grid.arrange(cub_dot,pls_dot, ncol=2)
cub_dot

pls_dot

svm_dot

cImp<- varImp(cubist_fit)$importance %>%setNames("CUBIST") %>% rownames_to_column("var")%>% arrange(desc(CUBIST)) %>%top_n(10)
## Selecting by CUBIST
pImp<- varImp(pls_fit)$importance %>%setNames("PLS") %>% rownames_to_column("var")%>% arrange(desc(PLS)) %>%top_n(10)
## Selecting by PLS
sImp<- varImp(svm_fit)$importance %>%setNames("SVM")%>% rownames_to_column("var")%>% arrange(desc(SVM)) %>%top_n(10)
## Selecting by SVM
imp_list <- data.frame(cImp,pImp,sImp)
imp_list
##                       var    CUBIST                  var.1       PLS
## 1    BiologicalMaterial03 100.00000 ManufacturingProcess32 100.00000
## 2  ManufacturingProcess32  79.26829 ManufacturingProcess13  86.07998
## 3  ManufacturingProcess17  56.09756 ManufacturingProcess36  84.94073
## 4  ManufacturingProcess09  48.78049 ManufacturingProcess09  80.34108
## 5  ManufacturingProcess33  47.56098   BiologicalMaterial06  75.75377
## 6    BiologicalMaterial02  36.58537   BiologicalMaterial03  75.21953
## 7  ManufacturingProcess04  35.36585   BiologicalMaterial02  75.18435
## 8    BiologicalMaterial12  34.14634 ManufacturingProcess33  68.07548
## 9    BiologicalMaterial06  32.92683 ManufacturingProcess12  67.27790
## 10 ManufacturingProcess21  29.26829 ManufacturingProcess17  67.20556
##                     var.2       SVM
## 1  ManufacturingProcess32 100.00000
## 2  ManufacturingProcess13  93.47736
## 3    BiologicalMaterial06  82.54199
## 4    BiologicalMaterial12  78.21556
## 5  ManufacturingProcess36  74.99325
## 6  ManufacturingProcess17  74.50045
## 7    BiologicalMaterial03  74.46390
## 8  ManufacturingProcess09  68.45708
## 9  ManufacturingProcess31  65.16971
## 10 ManufacturingProcess11  62.00754

(c) Plot the optimal single tree with the distribution of yield in the terminal nodes. Does this view of the data provide additional knowledge about the biological or process predictors and their relationship with yield?

ManufacturingProcess32 is again the top split, in agreement with the other models. The biological predictors do not show much direct relationship with yield in this tree, so the single-tree view largely confirms the earlier results rather than providing additional knowledge about the biological predictors.

rp_fit <- rpart(y_train~., pp_train, maxdepth=3)
plot(as.party(rp_fit),gp=gpar(fontsize = 8))
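
The yield distribution within each terminal node can also be summarized numerically (a sketch; rp_fit$where gives the terminal node for each training row, provided no rows were dropped by rpart's na.action):

# Summarize yield by terminal node; mirrors the boxplots drawn by plot(as.party(rp_fit))
tapply(y_train, rp_fit$where, summary)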

#https://www.statmethods.net/advstats/cart.html