suppressPackageStartupMessages(library(caret))
suppressPackageStartupMessages(library(randomForest))
suppressPackageStartupMessages(library(AppliedPredictiveModeling))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(partykit))
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(party))
suppressPackageStartupMessages(library(gbm))
suppressPackageStartupMessages(library(Cubist))
suppressPackageStartupMessages(library(rpart))
Recreate the simulated data from Exercise 7.2:
library(mlbench)
set.seed(200)
simulated <- mlbench.friedman1(200, sd = 1)
simulated <- cbind(simulated$x, simulated$y)
simulated <- as.data.frame(simulated)
colnames(simulated)[ncol(simulated)] <- "y"
(A) Fit a random forest model to all of the predictors, then estimate the variable importance scores:
model1 <- randomForest(y ~ ., data = simulated,
importance = TRUE,
ntree = 1000)
rfImp1 <- varImp(model1, scale = FALSE)
Did the random forest model significantly use the uninformative predictors (V6 – V10)?
rfImp1
## Overall
## V1 8.83890885
## V2 6.49023056
## V3 0.67583163
## V4 7.58822553
## V5 2.27426009
## V6 0.17436781
## V7 0.15136583
## V8 -0.03078937
## V9 -0.02989832
## V10 -0.08529218
No. The importance scores for the uninformative predictors (V6 - V10) are essentially zero (several are slightly negative), so the random forest did not make significant use of them.
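For reference, caret's varImp for a randomForest object is a thin wrapper around randomForest::importance, so the same unscaled permutation scores should be recoverable directly (a sketch):
# type = 1 selects the permutation (%IncMSE) measure; scale = FALSE leaves it
# unscaled, matching varImp(model1, scale = FALSE) above
importance(model1, type = 1, scale = FALSE)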
(B) Now add an additional predictor that is highly correlated with one of the informative predictors. For example:
simulated$duplicate1 <- simulated$V1 + rnorm(200) * .1
cor(simulated$duplicate1, simulated$V1)
## [1] 0.9396216
Fit another random forest model to these data. Did the importance score for V1 change? What happens when you add another predictor that is also highly correlated with V1?
model2 <- randomForest(y ~ ., data = simulated, importance = TRUE, ntree = 1000)
rfImp2 <- varImp(model2, scale = FALSE)
rfImp2
## Overall
## V1 6.29780744
## V2 6.08038134
## V3 0.58410718
## V4 6.93924427
## V5 2.03104094
## V6 0.07947642
## V7 -0.02566414
## V8 -0.11007435
## V9 -0.08839463
## V10 -0.00715093
## duplicate1 3.56411581
Yes, the importance of V1 changed: introducing the highly correlated duplicate dropped V1's score from about 8.84 to 6.30, with duplicate1 picking up about 3.56. The importance is effectively split between the correlated predictors.
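The exercise also asks what happens with a second predictor correlated with V1, which the code above never adds. A minimal sketch (duplicate2, model3, and the seed are illustrative choices, not from the original); the expected effect is that V1's importance is diluted further across the three correlated columns:
set.seed(2)  # arbitrary seed, only to make the sketch reproducible
simulated$duplicate2 <- simulated$V1 + rnorm(200) * .1
model3 <- randomForest(y ~ ., data = simulated, importance = TRUE, ntree = 1000)
varImp(model3, scale = FALSE)
simulated$duplicate2 <- NULL  # drop it again so the sections below run unchanged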
(C) Use the cforest function in the party package to fit a random forest model using conditional inference trees. The party package function varimp can calculate predictor importance. The conditional argument of that function toggles between the traditional importance measure and the modified version described in Strobl et al. (2007). Do these importances show the same pattern as the traditional random forest model?
# mtry set so every predictor is a split candidate (11 predictors here)
ctrl <- cforest_control(mtry = ncol(simulated) - 1)
cf_fit <- party::cforest(y ~ ., data = simulated, controls = ctrl)
cfImp1 <- varImp(cf_fit)                       # traditional permutation importance
cfImp2 <- varImp(cf_fit, conditional = TRUE)   # conditional importance (Strobl et al. 2007)
cfImp1 <- cfImp1 %>% setNames(c("cf_ti"))
cfImp2 <- cfImp2 %>% setNames(c("cf_ci"))
vars <- rownames(cfImp2)              # predictor names, including duplicate1
Imp_cf <- cbind(cfImp1, cfImp2)
rownames(Imp_cf) <- NULL
Imp_cf <- cbind(names = vars, Imp_cf)
Imp_cf
## names cf_ti cf_ci
## 1 V1 8.41556912 1.761521323
## 2 V2 7.76850810 4.673402231
## 3 V3 0.01775823 0.028797708
## 4 V4 10.14481309 5.659304965
## 5 V5 2.30344986 0.843799759
## 6 V6 -0.02006879 -0.006068060
## 7 V7 0.07769047 0.026783185
## 8 V8 -0.01937243 0.001640106
## 9 V9 -0.02654575 -0.008257553
## 10 V10 0.01218696 0.003307968
## 11 duplicate1 1.32708646 0.098630026
The overall pattern matches the traditional random forest: the uninformative predictors (V6 - V10) are still not used significantly. The conditional measure (cf_ci) does, however, shrink V1's score far more than the traditional measure (cf_ti), because it adjusts for V1's correlation with duplicate1.
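A quick base-graphics sketch makes the contrast between the two measures visible (no packages beyond those already loaded are assumed):
# grouped bars: traditional vs. conditional importance per predictor
barplot(t(as.matrix(Imp_cf[, c("cf_ti", "cf_ci")])), beside = TRUE,
        names.arg = Imp_cf$names, las = 2,
        legend.text = c("traditional", "conditional"))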
(D) Repeat this process with different tree models, such as boosted trees and Cubist. Does the same pattern occur?
simulated_orig <- simulated[, -12]   # drop duplicate1 to recover the original predictors
gbm_fit <- gbm.fit(simulated_orig[, -11], simulated_orig[, 11],
                   n.trees = 100, distribution = "gaussian", verbose = FALSE)
gbmImp1 <- varImp(gbm_fit, numTrees = 100)
# pad with an NA row so this table aligns with the model that includes duplicate1
gbmImp1 <- gbmImp1 %>% add_row(Overall = NA) %>% setNames(c("gbm1"))
gbm_fit2 <- gbm.fit(simulated[, -11], simulated[, 11],
                    n.trees = 100, distribution = "gaussian", verbose = FALSE)
gbmImp2 <- varImp(gbm_fit2, numTrees = 100)
gbmImp2 <- gbmImp2 %>% setNames(c("gbm2"))
imp <- data.frame(gbmImp1, gbmImp2)
rownames(imp)[11] <- "duplicate1"   # add_row() had dropped the row name
imp
## gbm1 gbm2
## V1 41764.4452 37333.666
## V2 16991.1125 19873.995
## V3 0.0000 0.000
## V4 12039.7283 11147.187
## V5 621.5645 0.000
## V6 0.0000 0.000
## V7 0.0000 0.000
## V8 0.0000 0.000
## V9 0.0000 0.000
## V10 0.0000 0.000
## duplicate1 NA 5938.015
cubist_fit1 <- cubist(simulated_orig[, -11], simulated_orig[, 11])
cubist_fit2 <- cubist(simulated[, -11], simulated[, 11])
# pad the first table with an NA row so the two align (model 1 lacks duplicate1)
cbImp1 <- varImp(cubist_fit1) %>% add_row(Overall = NA) %>% setNames(c("Cubist1"))
cbImp2 <- varImp(cubist_fit2) %>% setNames(c("Cubist2"))
cImp <- data.frame(cbImp1, cbImp2)
rownames(cImp)[11] <- "duplicate1"  # add_row() had dropped the row name
cImp
## Cubist1 Cubist2
## V1 50 50
## V2 50 50
## V4 50 50
## V5 50 50
## V3 0 0
## V6 0 0
## V7 0 0
## V8 0 0
## V9 0 0
## V10 0 0
## duplicate1 NA 0
Broadly the same pattern occurs: V3 and the uninformative predictors (V6 - V10) receive zero importance from both boosted trees and Cubist. The correlated duplicate behaves differently by model, though: GBM shifts part of V1's importance to duplicate1, while Cubist ignores duplicate1 entirely and leaves V1's score unchanged.
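For the boosted model, gbm's own summary reports relative influence normalized to sum to 100; the ranking should match the raw varImp totals above (a sketch; plotit = FALSE just suppresses the bar chart):
summary(gbm_fit2, n.trees = 100, plotit = FALSE)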
Use a simulation to show tree bias with different granularities.
x1 <- rep(1:2, each = 100)           # informative, but only 2 distinct values
x2 <- rnorm(200, mean = 0, sd = 4)   # uninformative noise with ~200 distinct values
y  <- x1 + rnorm(200, mean = 0, sd = 1)
sim <- data.frame(y, x1, x2)
sim_tree2 <- rpart(y ~ ., data = sim)
plot(as.party(sim_tree2))
varImp(sim_tree2)
## Overall
## x1 0.2047092
## x2 0.4212352
Although y depends only on x1, the tree assigns about twice the importance to the pure-noise predictor x2. Because x2 takes roughly 200 distinct values while x1 takes only 2, x2 offers far more candidate split points, and CART's exhaustive split search is biased toward such high-granularity predictors.
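Averaging over repeated refits makes the bias clearer than a single run; a sketch (the seed and the 100 replications are arbitrary choices, not from the original):
set.seed(624)
imp_runs <- replicate(100, {
  x1  <- rep(1:2, each = 100)            # informative, 2 distinct values
  x2  <- rnorm(200, mean = 0, sd = 4)    # noise, ~200 distinct values
  y   <- x1 + rnorm(200)
  imp <- varImp(rpart(y ~ ., data = data.frame(y, x1, x2)))
  c(x1 = imp["x1", "Overall"], x2 = imp["x2", "Overall"])
})
rowMeans(imp_runs, na.rm = TRUE)  # x2's mean importance typically exceeds x1's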
In stochastic gradient boosting the bagging fraction and learning rate will govern the construction of the trees as they are guided by the gradient. Although the optimal values of these parameters should be obtained through the tuning process, it is helpful to understand how the magnitudes of these parameters affect magnitudes of variable importance. Figure 8.24 provides the variable importance plots for boosting using two extreme values for the bagging fraction (0.1 and 0.9) and the learning rate (0.1 and 0.9) for the solubility data. The left-hand plot has both parameters set to 0.1, and the right-hand plot has both set to 0.9:
(A) Why does the model on the right focus its importance on just the first few predictors, whereas the model on the left spreads importance across more predictors?
The model on the right combines a high learning rate (0.9), so each tree's contribution is large and the ensemble commits quickly to the strongest predictors, with a high bagging fraction (0.9), so every tree sees nearly the same data and there is little stochastic variation to bring other predictors in. Both choices concentrate the importance on the first few predictors.
(B) Which model do you think would be more predictive of other samples?
The model on the left would likely be more predictive of other samples: its small learning rate and bagging fraction make it learn slowly and draw on a broader set of predictors, reducing the risk of overfitting to a few dominant ones.
(C) How would increasing interaction depth affect the slope of predictor importance for either model in Fig. 8.24?
Increasing the interaction depth lets each tree split on more predictors, which spreads importance across more variables and therefore flattens the slope of the predictor-importance curve for both models.
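Fig. 8.24 itself is not reproduced here, but the comparison can be sketched on the solubility data that ships with AppliedPredictiveModeling (n.trees = 100 and the default interaction depth are assumptions for speed, not the book's settings):
data(solubility)
gbm_low  <- gbm.fit(solTrainXtrans, solTrainY, distribution = "gaussian",
                    n.trees = 100, shrinkage = 0.1, bag.fraction = 0.1,
                    verbose = FALSE)
gbm_high <- gbm.fit(solTrainXtrans, solTrainY, distribution = "gaussian",
                    n.trees = 100, shrinkage = 0.9, bag.fraction = 0.9,
                    verbose = FALSE)
head(summary(gbm_low,  plotit = FALSE), 10)   # importance spread over many predictors
head(summary(gbm_high, plotit = FALSE), 10)   # importance concentrated in a few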
Refer to Exercises 6.3 and 7.5 which describe a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before and train several tree-based models:
data(ChemicalManufacturingProcess)
yield <- ChemicalManufacturingProcess[, 1]
train_r <- createDataPartition(yield, p = 0.75, list = FALSE)
pp_train <- ChemicalManufacturingProcess[train_r, -1]
y_train  <- ChemicalManufacturingProcess[train_r, 1]
pp_test  <- ChemicalManufacturingProcess[-train_r, -1]
y_test   <- ChemicalManufacturingProcess[-train_r, 1]
# preprocessing recipe: drop near-zero-variance predictors, center, scale,
# and median-impute missing values (caret applies these in its own fixed order)
p_pro <- c("nzv", "center", "scale", "medianImpute")
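A quick check (not part of the original output) confirms why medianImpute is included; the process predictors contain missing values:
na_counts <- colSums(is.na(pp_train))
na_counts[na_counts > 0]  # predictors with at least one NA; counts depend on the partition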
t_ctrl  <- trainControl(method = "repeatedcv", repeats = 5)  # repeated 10-fold CV (used for rpart2)
t_ctrl1 <- trainControl(method = "boot", number = 25)        # 25 bootstrap resamples (used for the rest)
# PLS
pls_fit <- train(pp_train, y_train, method = "pls", tuneLength = 10,
                 preProcess = p_pro, trControl = t_ctrl1)
pls_fit
## Partial Least Squares
##
## 132 samples
## 57 predictor
##
## Pre-processing: centered (56), scaled (56), median imputation (56), remove (1)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 132, 132, 132, 132, 132, 132, ...
## Resampling results across tuning parameters:
##
## ncomp RMSE Rsquared MAE
## 1 1.561215 0.3998819 1.185143
## 2 1.764208 0.3938604 1.153681
## 3 1.615733 0.4568704 1.132305
## 4 1.692610 0.4541577 1.152932
## 5 1.858179 0.4177678 1.234912
## 6 2.002659 0.3961615 1.288455
## 7 2.073512 0.3819987 1.323774
## 8 2.186294 0.3658923 1.371203
## 9 2.291930 0.3512078 1.413300
## 10 2.363162 0.3409140 1.441327
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 1.
pls_pred <- predict(pls_fit, pp_test)
pls_met <- postResample(pred = pls_pred, obs = y_test)
# SVM
svm_fit <- train(pp_train, y_train, method = "svmRadial", preProcess = p_pro,
                 tuneLength = 10, trControl = t_ctrl1)
svm_fit
## Support Vector Machines with Radial Basis Function Kernel
##
## 132 samples
## 57 predictor
##
## Pre-processing: centered (56), scaled (56), median imputation (56), remove (1)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 132, 132, 132, 132, 132, 132, ...
## Resampling results across tuning parameters:
##
## C RMSE Rsquared MAE
## 0.25 1.445752 0.4567120 1.1784961
## 0.50 1.335505 0.5129416 1.0792990
## 1.00 1.258567 0.5584455 1.0094827
## 2.00 1.230776 0.5703510 0.9824361
## 4.00 1.218184 0.5763996 0.9721202
## 8.00 1.217078 0.5770642 0.9717604
## 16.00 1.217181 0.5769676 0.9716910
## 32.00 1.217181 0.5769676 0.9716910
## 64.00 1.217181 0.5769676 0.9716910
## 128.00 1.217181 0.5769676 0.9716910
##
## Tuning parameter 'sigma' was held constant at a value of 0.01472647
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.01472647 and C = 8.
svm_pred <- predict(svm_fit, newdata=pp_test)
svm_met<- postResample(pred=svm_pred,y_test)
# Classification and Regression Tree
rpart_fit <- train(pp_train, y_train, method = "rpart2", tuneLength = 10,
                   preProcess = p_pro, trControl = t_ctrl)
## note: only 8 possible values of the max tree depth from the initial fit.
## Truncating the grid to 8 .
rpart_fit
## CART
##
## 132 samples
## 57 predictor
##
## Pre-processing: centered (56), scaled (56), median imputation (56), remove (1)
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 120, 118, 118, 120, 119, 119, ...
## Resampling results across tuning parameters:
##
## maxdepth RMSE Rsquared MAE
## 1 1.657043 0.2668565 1.318424
## 2 1.664055 0.2923191 1.310737
## 3 1.622978 0.3267926 1.299452
## 4 1.605143 0.3450805 1.269968
## 5 1.575368 0.3725999 1.250045
## 6 1.547986 0.3883784 1.216469
## 8 1.532809 0.4067889 1.182646
## 9 1.531029 0.4077865 1.179478
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was maxdepth = 9.
rpart_pred <- predict(rpart_fit,newdata=pp_test)
rpart_met <- postResample(pred=rpart_pred,y_test)
# Cubist
cubist_fit <- train(pp_train, y_train, method = "cubist", preProcess = p_pro,
                    tuneLength = 10, trControl = t_ctrl1)
cubist_fit
## Cubist
##
## 132 samples
## 57 predictor
##
## Pre-processing: centered (56), scaled (56), median imputation (56), remove (1)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 132, 132, 132, 132, 132, 132, ...
## Resampling results across tuning parameters:
##
## committees neighbors RMSE Rsquared MAE
## 1 0 1.739171 0.3714657 1.3056829
## 1 5 1.713417 0.3847065 1.2754138
## 1 9 1.722277 0.3801796 1.2832894
## 10 0 1.202202 0.5896811 0.9216567
## 10 5 1.184232 0.6014384 0.8984029
## 10 9 1.193261 0.5950013 0.9081524
## 20 0 1.172837 0.6073090 0.9051957
## 20 5 1.154413 0.6191364 0.8799411
## 20 9 1.163940 0.6126305 0.8908981
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were committees = 20 and neighbors = 5.
cubist_pred <- predict(cubist_fit,newdata=pp_test)
cub_met <- postResample(pred=cubist_pred,y_test)
# Gradient Boosting (note: no preProcess here; gbm can handle missing values natively)
gbmGrid <- expand.grid(interaction.depth = 1:6,
                       n.trees = c(25, 50, 100, 200),
                       shrinkage = c(0.01, 0.05, 0.1, 0.2),
                       n.minobsinnode = 10)
gbm_fit <- suppressWarnings(train(pp_train, y_train, method = "gbm",
                                  metric = "Rsquared", tuneGrid = gbmGrid,
                                  trControl = t_ctrl1, verbose = FALSE))
gbm_pred <- predict(gbm_fit,newdata=pp_test)
gbm_met<- postResample(pred=gbm_pred,y_test)
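Since the training log was suppressed for the boosted model, the selected tuning parameters can be inspected directly (values depend on the random partition):
gbm_fit$bestTune  # interaction.depth, n.trees, shrinkage, n.minobsinnode chosen by resampling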
(A) Which tree-based regression model gives the optimal resampling and test set performance?
comb_met <- data.frame(pls_met, svm_met, rpart_met, cub_met, gbm_met)
comb_met <- data.frame(t(comb_met))   # one row per model
model <- c("PLS", "SVM", "RPART", "CUBIST", "GBM")
rownames(comb_met) <- NULL
comb_met <- data.frame(cbind(model, comb_met))
comb_met
## model RMSE Rsquared MAE
## 1 PLS 1.442490 0.3294248 1.1384654
## 2 SVM 1.176656 0.5411589 0.9476897
## 3 RPART 1.431202 0.3780401 1.0958127
## 4 CUBIST 1.247379 0.5149756 0.9185457
## 5 GBM 1.192515 0.5372166 0.9174782
On the test set, the SVM performed best, with the lowest RMSE (1.177) and the highest R-squared (0.541). Note, though, that the SVM is not a tree-based model; among the tree-based models, GBM gave the best test set performance (RMSE 1.193, R-squared 0.537), while Cubist had the best resampling RMSE (1.154).
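The resampling side of the comparison can be tabulated directly from the fitted objects (a rough comparison only, since all models used bootstrap resampling except rpart, which used repeated CV):
# lowest resampling RMSE achieved by each model across its tuning grid
sapply(list(PLS = pls_fit, SVM = svm_fit, RPART = rpart_fit,
            CUBIST = cubist_fit, GBM = gbm_fit),
       function(m) min(m$results$RMSE, na.rm = TRUE))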
(B) Which predictors are most important in the optimal tree-based regression model? Do either the biological or process variables dominate the list? How do the top 10 important predictors compare to the top 10 predictors from the optimal linear and nonlinear models?
varImp(svm_fit)
## loess r-squared variable importance
##
## only 20 most important variables shown (out of 57)
##
## Overall
## ManufacturingProcess13 100.00
## ManufacturingProcess17 88.97
## ManufacturingProcess09 86.27
## ManufacturingProcess32 84.20
## BiologicalMaterial06 80.90
## BiologicalMaterial12 79.59
## ManufacturingProcess11 71.68
## BiologicalMaterial03 69.52
## ManufacturingProcess36 68.48
## ManufacturingProcess31 67.85
## BiologicalMaterial02 65.20
## ManufacturingProcess30 60.45
## ManufacturingProcess06 59.94
## BiologicalMaterial11 53.31
## BiologicalMaterial08 49.54
## BiologicalMaterial04 46.72
## ManufacturingProcess33 43.20
## BiologicalMaterial09 39.94
## ManufacturingProcess29 38.05
## BiologicalMaterial01 36.95
The manufacturing process variables dominate the list: 7 of the top 10 (and 11 of the top 20) predictors are process variables.
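The question also asks for a comparison with the top 10 predictors from the optimal linear and nonlinear models of Exercises 6.3 and 7.5; those earlier fits are not reproduced here, but the PLS model fit above can stand in for the linear model (a sketch):
plot(varImp(pls_fit), top = 10)  # top 10 PLS predictors, for comparison with the SVM list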
(C) Plot the optimal single tree with the distribution of yield in the terminal nodes. Does this view of the data provide additional knowledge about the biological or process predictors and their relationship with yield?
# refit a shallow tree (maxdepth = 3, rather than the CV-optimal depth) for
# readability; as.party() plots the yield distribution in each terminal node
rp_fit <- rpart(y_train ~ ., data = pp_train, maxdepth = 3)
plot(as.party(rp_fit), gp = gpar(fontsize = 8))
The tree view reinforces the importance ranking: the top splits use manufacturing process variables, and the yield distributions in the terminal nodes separate along those process splits rather than along the biological predictors.