suppressPackageStartupMessages(library(AppliedPredictiveModeling))
suppressPackageStartupMessages(library(party))
suppressPackageStartupMessages(library(caret))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(partykit))
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(randomForest))
suppressPackageStartupMessages(library(gbm))
suppressPackageStartupMessages(library(Cubist))
suppressPackageStartupMessages(library(rpart))
suppressPackageStartupMessages(library(varImp))

Home Work 9

Question 8.1

Recreate the simulated data from Exercise 7.2:

library(mlbench)
set.seed(200)
simulated <- mlbench.friedman1(200, sd = 1)
simulated <- cbind(simulated$x, simulated$y)
simulated <- as.data.frame(simulated)
colnames(simulated)[ncol(simulated)] <- "y"

(A)

Fit a random forest model to all of the predictors, then estimate the variable importance scores:

model1 <- randomForest(y ~ ., data = simulated,
                       importance = TRUE,
                       ntree = 1000)

rfImp1 <- varImp(model1, scale = FALSE)
## Error in varImp(model1, scale = FALSE): trying to get slot "responses" from an object (class "randomForest.formula") that is not an S4 object

Did the random forest model significantly use the uninformative predictors (V6 – V10)?

rfImp1
## Error in eval(expr, envir, enclos): object 'rfImp1' not found

No, the uninformative predictors (V6 – V10) were NOT significantly used.

(B)

Now add an additional predictor that is highly correlated with one of the informative predictors. For example:

simulated$duplicate1 <- simulated$V1 + rnorm(200) * .1
cor(simulated$duplicate1, simulated$V1)
## [1] 0.9396216

Fit another random forest model to these data. Did the importance score for V1 change? What happens when you add another predictor that is also highly correlated with V1?

model2 <- randomForest( y ~ ., data=simulated, importance=TRUE, ntree=1000 )
rfImp2 <- varImp(model2, scale = FALSE)
## Error in varImp(model2, scale = FALSE): trying to get slot "responses" from an object (class "randomForest.formula") that is not an S4 object
rfImp2
## Error in eval(expr, envir, enclos): object 'rfImp2' not found

Introducing a highly correlated variable significantly decreased the importance score of V1

(C)

Use the cforest function in the party package to fit a random forest model using conditional inference trees. The party package function varimp can calculate predictor importance. The conditional argument of that function toggles between the traditional importance measure and the modified version described in Strobl et al. (2007). Do these importances show the same pattern as the traditional random forest model?

ctrl <- cforest_control(mtry = ncol(simulated) - 1)
cf_fit <- party::cforest(y ~ ., data = simulated, controls = ctrl)
cfImp1 <- varImp(cf_fit) 
## Error in varImp(cf_fit): Measure is not suitable for regression
cfImp2 <- varImp(cf_fit, conditional=T) 
## Error in varImp(cf_fit, conditional = T): Measure is not suitable for regression
cfImp1 <- cfImp1 %>% setNames(c("cf_ti")) 
## Error in setNames(., c("cf_ti")): object 'cfImp1' not found
cfImp2 <- cfImp2 %>% setNames(c("cf_ci"))
## Error in setNames(., c("cf_ci")): object 'cfImp2' not found
names<- rownames(cfImp2) 
## Error in rownames(cfImp2): object 'cfImp2' not found
names[11] <- "V11"
## Error in names[11] <- "V11": object of type 'builtin' is not subsettable
Imp_cf <- cbind(cfImp1,cfImp2)
## Error in cbind(cfImp1, cfImp2): object 'cfImp1' not found
rownames(Imp_cf) <- NULL
## Error in rownames(Imp_cf) <- NULL: object 'Imp_cf' not found
Imp_cf <- cbind(names,Imp_cf) 
## Error in cbind(names, Imp_cf): object 'Imp_cf' not found
Imp_cf
## Error in eval(expr, envir, enclos): object 'Imp_cf' not found

The pattern of importance for the predictor variables is same. The uninformative predictors (V6 – V10) still NOT significantly used.

(D)

Repeat this process with different tree models, such as boosted trees and Cubist. Does the same pattern occur?

simulated_orig <- simulated[,-12]
gbm_fit <- gbm.fit(simulated_orig[,-11], simulated_orig[,11], n.trees = 100,distribution="gaussian")
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1       24.3718             nan     0.0010    0.0114
##      2       24.3594             nan     0.0010    0.0135
##      3       24.3430             nan     0.0010    0.0110
##      4       24.3307             nan     0.0010    0.0082
##      5       24.3172             nan     0.0010    0.0118
##      6       24.3071             nan     0.0010    0.0042
##      7       24.2980             nan     0.0010    0.0063
##      8       24.2843             nan     0.0010    0.0126
##      9       24.2707             nan     0.0010    0.0118
##     10       24.2590             nan     0.0010    0.0118
##     20       24.1390             nan     0.0010    0.0108
##     40       23.8754             nan     0.0010    0.0099
##     60       23.6395             nan     0.0010    0.0099
##     80       23.3995             nan     0.0010    0.0107
##    100       23.1708             nan     0.0010    0.0067
gbmImp1 <- varImp(gbm_fit, numTrees = 100)
## Error in varImp(gbm_fit, numTrees = 100): trying to get slot "responses" from an object (class "gbm") that is not an S4 object
gbmImp1 <- gbmImp1 %>% add_row(Overall=NA) %>% setNames(c("gbm1")) 
## Error in add_row(., Overall = NA): object 'gbmImp1' not found
gbm_fit2 <- gbm.fit(simulated[,-11], simulated[,11], n.trees = 100,distribution="gaussian")
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1       24.3714             nan     0.0010    0.0118
##      2       24.3597             nan     0.0010    0.0114
##      3       24.3465             nan     0.0010    0.0116
##      4       24.3350             nan     0.0010    0.0080
##      5       24.3203             nan     0.0010    0.0126
##      6       24.3049             nan     0.0010    0.0104
##      7       24.2922             nan     0.0010    0.0101
##      8       24.2794             nan     0.0010    0.0103
##      9       24.2681             nan     0.0010    0.0101
##     10       24.2553             nan     0.0010    0.0127
##     20       24.1255             nan     0.0010    0.0121
##     40       23.8756             nan     0.0010    0.0083
##     60       23.6306             nan     0.0010    0.0068
##     80       23.3951             nan     0.0010    0.0102
##    100       23.1574             nan     0.0010    0.0089
gbmImp2 <- varImp(gbm_fit2, numTrees = 100)
## Error in varImp(gbm_fit2, numTrees = 100): trying to get slot "responses" from an object (class "gbm") that is not an S4 object
gbmImp2 <- gbmImp2 %>%setNames(c("gbm2"))
## Error in setNames(., c("gbm2")): object 'gbmImp2' not found
imp <- data.frame(gbmImp1,gbmImp2)
## Error in data.frame(gbmImp1, gbmImp2): object 'gbmImp1' not found
imp
## Error in eval(expr, envir, enclos): object 'imp' not found
cubist_fit1 <- cubist(simulated_orig[,-11], simulated_orig[,11])
cubist_fit2 <- cubist(simulated[,-11],simulated[,11])
cbImp1 <- varImp(cubist_fit1)%>% add_row(Overall=NA) %>% setNames(c("Cubist1"))
## Error in varImp(cubist_fit1): trying to get slot "responses" from an object (class "cubist") that is not an S4 object
cbImp2<- varImp(cubist_fit2) %>% setNames(c("Cubist2"))
## Error in varImp(cubist_fit2): trying to get slot "responses" from an object (class "cubist") that is not an S4 object
cImp <- data.frame(cbImp1,cbImp2)
## Error in data.frame(cbImp1, cbImp2): object 'cbImp1' not found
cImp
## Error in eval(expr, envir, enclos): object 'cImp' not found

The same pattern occurs with the other models

Question 8.2

Use a simulation to show tree bias with different granularities.

x1 <- rep(1:2, each=100) 
x2 <- rnorm(200, mean=0, sd=4)
y <- x1 + rnorm(200, mean= 0 , sd=1)
sim <- data.frame(y,x1,x2)

sim_tree2 <- rpart(y~., data=sim)
plot(as.party(sim_tree2))

varImp(sim_tree2)
## Error in varImp(sim_tree2): trying to get slot "responses" from an object (class "rpart") that is not an S4 object

Question 8.3

In stochastic gradient boosting the bagging fraction and learning rate will govern the construction of the trees as they are guided by the gradient. Although the optimal values of these parameters should be obtained through the tuning process, it is helpful to understand how the magnitudes of these parameters affect magnitudes of variable importance. Figure 8.24 provides the variable importance plots for boosting using two extreme values for the bagging fraction (0.1 and 0.9) and the learning rate (0.1 and 0.9) for the solubility data. The left-hand plot has both parameters set to 0.1, and the right-hand plot has both set to 0.9:

(A)

Why does the model on the right focus its importance on just the first few of predictors, whereas the model on the left spreads importance across more predictors?

The model on the right has bagging fraction set high (0.9) therefore fewer predictors will be regarded as important - the stochastic element is smaller.

(B)

Which model do you think would be more predictive of other samples?

The model on the left would be more predictive, the model on the right would miss predictors.

(C)

How would increasing interaction depth affect the slope of predictor importance for either model in Fig. 8.24?

Increasing interaction depth would cause more predictors to become dominant.

Question 8.7

Refer to Exercises 6.3 and 7.5 which describe a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before and train several tree-based models:

data(ChemicalManufacturingProcess)
processPredictors = as.matrix(ChemicalManufacturingProcess[,2:58])
yield = ChemicalManufacturingProcess[,1]  
train_r <- createDataPartition(yield, p=0.75, list=F)
pp_train <- ChemicalManufacturingProcess[train_r,-1]
y_train <-  ChemicalManufacturingProcess[train_r,1]
pp_test <- ChemicalManufacturingProcess[-train_r,-1]
y_test <-  ChemicalManufacturingProcess[-train_r,1]
p_pro <- c("nzv", "center","scale", "medianImpute")
#PLS
t_ctrl <- trainControl(method = "repeatedcv", repeats = 5)
pls_fit<-train(pp_train, y_train, method="pls", tuneLength = 10,preProcess=p_pro, trainControl=t_ctrl)
pls_fit
## Partial Least Squares 
## 
## 132 samples
##  57 predictor
## 
## Pre-processing: centered (56), scaled (56), median imputation (56), remove (1) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 132, 132, 132, 132, 132, 132, ... 
## Resampling results across tuning parameters:
## 
##   ncomp  RMSE      Rsquared   MAE     
##    1     1.619672  0.3899854  1.226014
##    2     1.806724  0.4151920  1.192454
##    3     1.515115  0.5099394  1.077925
##    4     1.518033  0.5117636  1.077668
##    5     1.745627  0.4634265  1.131591
##    6     1.874700  0.4292937  1.190204
##    7     2.070938  0.4001250  1.255639
##    8     2.309731  0.3723671  1.331463
##    9     2.574058  0.3500995  1.398325
##   10     2.728626  0.3305441  1.446876
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 3.
pls_pred <- predict(pls_fit, pp_test)
pls_met <- postResample(pred = pls_pred, obs = y_test)
#SVM
svm_fit <- train(pp_train, y_train, method="svmRadial", preProcess=p_pro, tuneLength=10, trainControl=t_ctrl)
svm_fit
## Support Vector Machines with Radial Basis Function Kernel 
## 
## 132 samples
##  57 predictor
## 
## Pre-processing: centered (56), scaled (56), median imputation (56), remove (1) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 132, 132, 132, 132, 132, 132, ... 
## Resampling results across tuning parameters:
## 
##   C       RMSE      Rsquared   MAE      
##     0.25  1.497549  0.4979407  1.1897643
##     0.50  1.384163  0.5380575  1.0905659
##     1.00  1.310488  0.5727228  1.0261071
##     2.00  1.262695  0.5949758  0.9835300
##     4.00  1.250438  0.5969767  0.9696230
##     8.00  1.254478  0.5938080  0.9710719
##    16.00  1.254132  0.5939378  0.9706330
##    32.00  1.254132  0.5939378  0.9706330
##    64.00  1.254132  0.5939378  0.9706330
##   128.00  1.254132  0.5939378  0.9706330
## 
## Tuning parameter 'sigma' was held constant at a value of 0.01378345
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.01378345 and C = 4.
svm_pred <- predict(svm_fit, newdata=pp_test)
svm_met<- postResample(pred=svm_pred,y_test)
# Classification and Regression Tree
rpart_fit <- train(pp_train, y_train,method = "rpart2",tuneLength = 10, preProcess= p_pro,trControl = t_ctrl)
rpart_fit
## CART 
## 
## 132 samples
##  57 predictor
## 
## Pre-processing: centered (56), scaled (56), median imputation (56), remove (1) 
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 120, 119, 116, 119, 119, 119, ... 
## Resampling results across tuning parameters:
## 
##   maxdepth  RMSE      Rsquared   MAE      
##    1        1.373299  0.5120430  1.1074630
##    2        1.356566  0.5232234  1.1041125
##    3        1.318448  0.5513454  1.0556657
##    4        1.197006  0.6275368  0.9461867
##    5        1.180373  0.6432835  0.9356485
##    6        1.161420  0.6536918  0.9143704
##    7        1.170932  0.6485803  0.9262424
##    8        1.171596  0.6472252  0.9217154
##    9        1.165374  0.6509283  0.9121702
##   10        1.165212  0.6510336  0.9116365
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was maxdepth = 6.
rpart_pred <- predict(rpart_fit,newdata=pp_test)
rpart_met <- postResample(pred=rpart_pred,y_test)
# Cubist
t_ctrl1 <- trainControl(method = "boot", number = 25)
cubist_fit = train(pp_train, y_train, method="cubist", preProcess=p_pro, tuneLength=10, trainControl=t_ctrl1)
cubist_fit
## Cubist 
## 
## 132 samples
##  57 predictor
## 
## Pre-processing: centered (56), scaled (56), median imputation (56), remove (1) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 132, 132, 132, 132, 132, 132, ... 
## Resampling results across tuning parameters:
## 
##   committees  neighbors  RMSE      Rsquared   MAE      
##    1          0          1.740484  0.3924867  1.2475545
##    1          5          1.700805  0.4134543  1.2094519
##    1          9          1.710865  0.4060152  1.2203114
##   10          0          1.216010  0.6028660  0.9256945
##   10          5          1.181225  0.6225655  0.8903929
##   10          9          1.197269  0.6133188  0.9052482
##   20          0          1.161968  0.6397063  0.8820953
##   20          5          1.127313  0.6579207  0.8449674
##   20          9          1.142622  0.6494874  0.8604530
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were committees = 20 and neighbors = 5.
cubist_pred <- predict(cubist_fit,newdata=pp_test)
cub_met <- postResample(pred=cubist_pred,y_test)
# Gradient Boosting
t_ctrl1 <- trainControl(method = "boot", number = 25)
gbmGrid <- expand.grid(interaction.depth=seq(1,6,by=1), n.trees=c(25,50,100,200), shrinkage=c(0.01,0.05,0.1,0.2), n.minobsinnode=10)
gbm_fit <- suppressWarnings(train(pp_train, y_train, method = "gbm", metric = "Rsquared",tuneGrid=gbmGrid,  trControl = t_ctrl1, verbose=F))


gbm_pred <- predict(gbm_fit,newdata=pp_test)
gbm_met<- postResample(pred=gbm_pred,y_test)

(A)

Which tree-based regression model gives the optimal resampling and test set performance?

comb_met <- data.frame(pls_met, svm_met, rpart_met, cub_met,gbm_met) 
comb_met <- data.frame(t(comb_met))
model<- c("PLS", "SVM", "RPART", "CUBIST", "GBM")
rownames(comb_met) <- NULL
comb_met <- data.frame(cbind(model,comb_met))
comb_met
##    model     RMSE  Rsquared       MAE
## 1    PLS 1.382631 0.4188962 1.1093949
## 2    SVM 1.119372 0.5429422 0.9450940
## 3  RPART 1.609950 0.2345724 1.1844448
## 4 CUBIST 1.097507 0.5919357 0.8329999
## 5    GBM 1.248234 0.4635216 0.9993444

SVM gave the best resampling and test set performance with lowest RMSE and hioghest R squared values.

(B)

Which predictors are most important in the optimal tree-based regression model? Do either the biological or process variables dominate the list? How do the top 10 important predictors compare to the top 10 predictors from the optimal linear and nonlinear models?

varImp(svm_fit)
## Error in varImp(svm_fit): trying to get slot "responses" from an object (class "train") that is not an S4 object

The manufacturing processes dominate

(C)

Plot the optimal single tree with the distribution of yield in the terminal nodes. Does this view of the data provide additional knowledge about the biological or process predictors and their relationship with yield?

rp_fit <- rpart(y_train~., pp_train, maxdepth=3)
plot(as.party(rp_fit),gp=gpar(fontsize = 8))

The distribution of yield does not favor the biological predictors, Manufacturing Processes are top predictors.