Do problems 8.1, 8.2, 8.3, and 8.7 in Kuhn and Johnson. Please submit the Rpubs link along with the .rmd file.
8.1. Recreate the simulated data from Exercise 7.2:
#install.packages('tidyr')
library(tidyr)
library(earth)
library(AppliedPredictiveModeling)
library(caret)
library(missMDA)
library(DMwR)        # knnImputation
library(ggplot2)     # plotting
library(vip)         # variable importance
library(pdp)         # variable relationships
library(partykit)
library(rpart)
library(mda)
library(party)
library(gbm)
library(Cubist)
library(mlbench)
set.seed(200)
simulated <- mlbench.friedman1(200, sd = 1)
simulated <- cbind(simulated$x, simulated$y)
simulated <- as.data.frame(simulated)
colnames(simulated)[ncol(simulated)] <- "y"
library(randomForest)
library(dplyr)
model1 <- randomForest(y ~ ., data = simulated, importance = TRUE, ntree = 1000)
rfImp1 <- varImp(model1, scale = FALSE)
rfImp1
## Overall
## V1 8.732235404
## V2 6.415369387
## V3 0.763591825
## V4 7.615118809
## V5 2.023524577
## V6 0.165111172
## V7 -0.005961659
## V8 -0.166362581
## V9 -0.095292651
## V10 -0.074944788
Did the random forest model significantly use the uninformative predictors (V6 - V10)? No. The importance scores for V6 - V10 are all essentially zero (below 0.5, and several are slightly negative), so the random forest did not make meaningful use of them.
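As a quick hedged check (not part of the original exercise), a forest fit without V6 - V10 should have essentially the same out-of-bag error as model1 above:
set.seed(200)
rfReduced <- randomForest(y ~ V1 + V2 + V3 + V4 + V5, data = simulated, ntree = 1000)
# Compare the final out-of-bag MSE of the full and reduced forests
c(full = tail(model1$mse, 1), reduced = tail(rfReduced$mse, 1))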
simulated$duplicate1 <- simulated$V1 + rnorm(200) * .1
cor(simulated$duplicate1, simulated$V1)
## [1] 0.9460206
Fit another random forest model to these data. Did the importance score for V1 change? What happens when you add another predictor that is also highly correlated with V1?
model2 <- randomForest(y ~ ., data = simulated, importance = TRUE, ntree = 1000)
rfImp2 <- varImp(model2, scale = FALSE)
rfImp2
## Overall
## V1 5.69119973
## V2 6.06896061
## V3 0.62970218
## V4 7.04752238
## V5 1.87238438
## V6 0.13569065
## V7 -0.01345645
## V8 -0.04370565
## V9 0.00840438
## V10 0.02894814
## duplicate1 4.28331581
The overall importance of V1 dropped from about 8.73 to 5.69. The additional correlated predictor dilutes V1's importance: the trees can split on either copy of the same information, so the credit is shared between them.
Adding another predictor that is also highly correlated with V1:
simulated$duplicate2 <- simulated$V1 + rnorm(200) * .1
cor(simulated$duplicate2, simulated$V1)
## [1] 0.9408631
model3 <- randomForest(y ~ ., data = simulated, importance = TRUE, ntree = 1000)
rfImp3 <- varImp(model3, scale = FALSE)
rfImp3
## Overall
## V1 4.91687329
## V2 6.52816504
## V3 0.58711552
## V4 7.04870917
## V5 2.03115561
## V6 0.14213148
## V7 0.10991985
## V8 -0.08405687
## V9 -0.01075028
## V10 0.09230576
## duplicate1 3.80068234
## duplicate2 1.87721959
This additional predictor further reduced the importance of V1.
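As a hedged check that the importance is being shared among the correlated copies rather than disappearing, compare V1's original score with the combined score of V1 and its duplicates:
rfImp1["V1", "Overall"]
sum(rfImp3[c("V1", "duplicate1", "duplicate2"), "Overall"])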
Use the cforest function in the party package to fit a random forest using conditional inference trees. Do these importances show the same pattern as the traditional random forest?
#install.packages('partykit')
bagCtrl <- cforest_control(mtry = ncol(simulated) - 1)
baggedTree <- party::cforest(y ~ ., data = simulated, controls = bagCtrl)
# Permutation-based variable importance for the cforest model
party::varimp(baggedTree)
Yes, the cforest importances show a very similar pattern to the traditional random forest model: the informative predictors dominate, V1's importance is again shared with its correlated duplicates, and V6 - V10 stay near zero.
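The varimp function also has a conditional argument (the Strobl et al. measure) that adjusts the permutation importance for correlated predictors; a brief sketch (this can be slow):
party::varimp(baggedTree, conditional = TRUE)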
Repeat the process with other tree models, boosted trees and Cubist. Does the same pattern occur?
Boosted trees
gbmGrid <- expand.grid(interaction.depth = seq(1, 7, by = 2), n.trees = seq(100, 1000, by = 50), shrinkage = c(0.01, 0.1), n.minobsinnode = 10)
set.seed(100)
gbmTune <- train(simulated[,!names(simulated) %in% "y"],simulated[,c('y')] , method = "gbm", tuneGrid = gbmGrid,verbose = FALSE)
varImp(gbmTune)
## gbm variable importance
##
## Overall
## V4 100.0000
## V2 89.2197
## V1 71.2702
## V5 39.0132
## V3 32.4961
## duplicate1 19.4348
## duplicate2 9.5860
## V7 3.5989
## V6 2.0797
## V10 1.0239
## V9 0.4966
## V8 0.0000
The results are somewhat different: here V4 is ranked highest, followed by V2 and V1. The uninformative predictors again receive very little importance, and the correlated duplicates still pull some importance away from V1.
Cubist
cubistMod <- cubist(simulated[,!names(simulated) %in% "y"],simulated[,c('y')])
varImp(cubistMod)
## Overall
## V1 50
## V2 50
## V4 50
## V5 50
## duplicate1 50
## V3 0
## V6 0
## V7 0
## V8 0
## V9 0
## V10 0
## duplicate2 0
This is a bit different as well, but broadly in line with the boosted tree model: Cubist assigns equal usage (50) to V1, V2, V4, V5, and duplicate1 and ignores the uninformative predictors.
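To see where these usage scores come from, the fitted Cubist rules and linear models can be printed (a quick sketch, output not shown here):
summary(cubistMod)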
8.2. Use a simulation to show tree bias with different granularities.
sim2 <- data.frame(Y      = runif(100, 1, 3000),        # response: pure noise, unrelated to the predictors
                   high   = floor(runif(100, 1, 100)),   # ~99 distinct values (high granularity)
                   middle = floor(runif(100, 25, 75)),   # ~50 distinct values
                   low    = floor(runif(100, 50, 60)))   # ~10 distinct values (low granularity)
rpartTree <- rpart(Y ~ ., data = sim2)
varImp(rpartTree)
## Overall
## high 0.3310383
## low 0.1875499
## middle 0.2157140
plot(as.party(rpartTree))
Even though none of the predictors are related to Y, the tree assigns the largest importance to the most granular predictor (high) and the smallest to the least granular one (low), illustrating the selection bias of CART-style trees toward predictors with more potential split points.
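A hedged sketch repeating the simulation many times makes the bias clearer: the root split tends to land on the most granular predictor far more often than on the others.
set.seed(300)
firstSplit <- replicate(100, {
  d <- data.frame(Y      = runif(100, 1, 3000),
                  high   = floor(runif(100, 1, 100)),
                  middle = floor(runif(100, 25, 75)),
                  low    = floor(runif(100, 50, 60)))
  # cp = 0 forces the best first split; record which variable it uses
  as.character(rpart(Y ~ ., data = d, maxdepth = 1, cp = 0)$frame$var[1])
})
table(firstSplit)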
8.3. In stochastic gradient boosting the bagging fraction and learning rate will govern the construction of the trees as they are guided by the gradient. Although the optimal values of these parameters should be obtained through the tuning process, it is helpful to understand how the magnitudes of these parameters affect magnitudes of variable importance. Figure 8.24 provides the variable importance plots for boosting using two extreme values for the bagging fraction (0.1 and 0.9) and the learning rate (0.1 and 0.9) for the solubility data. The left-hand plot has both parameters set to 0.1, and the right-hand plot has both set to 0.9:
(a) Why does the model on the right focus its importance on just the first few predictors, whereas the model on the left spreads importance across more predictors?
Learning rate effect: new trees are added to correct the residual errors of the existing sequence, so the model can quickly fit, and then overfit, the training data. Shrinking the corrections (a learning rate below 1.0) makes each added tree contribute less, which slows the learning and requires more trees to be added to the model.
With the learning rate and bagging fraction both set to 0.9, the model fits aggressively, effectively overfits, and concentrates its importance on the first few predictors; with both set to 0.1, the small corrections are spread over many trees and the importance is spread across more predictors.
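A minimal hedged sketch of this effect, using the simulated data from 8.1 rather than the solubility data of Figure 8.24, with hand-picked settings for the two extremes:
set.seed(100)
gbmLow <- gbm(y ~ ., data = simulated, distribution = "gaussian",
              n.trees = 500, interaction.depth = 3,
              shrinkage = 0.1, bag.fraction = 0.1, n.minobsinnode = 5)
set.seed(100)
gbmHigh <- gbm(y ~ ., data = simulated, distribution = "gaussian",
               n.trees = 500, interaction.depth = 3,
               shrinkage = 0.9, bag.fraction = 0.9, n.minobsinnode = 5)
# Relative influence: the 0.9/0.9 model should concentrate its importance on
# fewer predictors than the 0.1/0.1 model
summary(gbmLow, plotit = FALSE)
summary(gbmHigh, plotit = FALSE)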
(b) Which model do you think would be more predictive of other samples?
The 0.1 model should be more predictive of new samples, for the reasons above: it learns slowly and is less prone to overfitting the training data.
(c) How would increasing interaction depth affect the slope of predictor importance for either model in Fig. 8.24?
Increasing the interaction depth would most likely increase the contribution of the less important predictors, spreading importance across more variables and making the slope of the importance plot less steep.
8.7. Refer to Exercises 6.3 and 7.5 which describe a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before and train several tree-based models:
data("ChemicalManufacturingProcess")
ChemicalManuf_df<-ChemicalManufacturingProcess
ChemicalManuf_df <- knnImputation(ChemicalManuf_df[, !names(ChemicalManuf_df) %in% "Yield"])
#anyNA(ChemicalManuf_df)
ChemicalManuf_df$Yield <-ChemicalManufacturingProcess$Yield
## 75% of the sample size
smp_size <- floor(0.75 * nrow(ChemicalManuf_df))
## set the seed to make your partition reproducible
set.seed(123)
train_ind <- sample(seq_len(nrow(ChemicalManuf_df)), size = smp_size)
train <- ChemicalManuf_df[train_ind, ]
test <-ChemicalManuf_df[-train_ind, ]
solTrainXtrans <- train[, !names(train) %in% "Yield"]
solTrainY <- train[, "Yield"]
solTestXtrans <- test[, !names(train) %in% "Yield"]
solTestY <- test[, "Yield"]
ctrl <- trainControl(method = "cv", number = 10)
Cubist
cubistTuned <- train(solTrainXtrans, solTrainY, method = "cubist", trControl = ctrl,
preProc = c("center", "scale"))
cubistTunedPred <- predict( cubistTuned, newdata = solTestXtrans)
postResample(pred =cubistTunedPred, obs = solTestY)
## RMSE Rsquared MAE
## 1.0131820 0.6994734 0.8129190
plot(varImp( cubistTuned))
GBM model
gbmGrid <- expand.grid(interaction.depth = seq(1, 7, by = 2), n.trees = seq(100, 1000, by = 50), shrinkage = c(0.01, 0.1), n.minobsinnode=c(10))
set.seed(100)
gbmTune <- train(solTrainXtrans, solTrainY, method = "gbm", tuneGrid = gbmGrid,verbose = FALSE)
gbmTunedPred <- predict( gbmTune, newdata = solTestXtrans)
postResample(pred =gbmTunedPred, obs = solTestY)
## RMSE Rsquared MAE
## 1.1304688 0.6293820 0.8329294
plot(varImp(gbmTune))
Single regression tree (rpart2)
rt_grid <- expand.grid(maxdepth= seq(1,10,by=1))
rt_Tune <- train(solTrainXtrans, solTrainY, method = "rpart2", metric = "Rsquared", tuneGrid = rt_grid, trControl =ctrl)
rt_TunePred <- predict( rt_Tune, newdata = solTestXtrans)
postResample(pred =rt_TunePred, obs = solTestY)
## RMSE Rsquared MAE
## 1.8961069 0.1847277 1.4388405
plot(varImp(rt_Tune))
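To complement the test-set numbers, the resampled performance of each tuned model can be pulled out with getTrainPerf (note that gbmTune was trained with caret's default bootstrap resampling rather than the 10-fold CV control object, so its estimate is not strictly comparable to the other two):
getTrainPerf(cubistTuned)
getTrainPerf(gbmTune)
getTrainPerf(rt_Tune)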
(a) Which tree-based regression model gives the optimal resampling and test set performance?
From the above, the Cubist model gives the best performance: it has the lowest test-set RMSE and MAE and the highest R-squared.
(b) Which predictors are most important in the optimal tree-based regression model? Do the biological or process variables dominate the list, and how do the top 10 predictors compare to those from the optimal linear and nonlinear models?
ManufacturingProcess32 has the highest importance, and most of the top 10 predictors are process variables. The Cubist model leans heavily on its top two predictors, much as the MARS model did, whereas the PLS model spread importance more evenly across its top predictors.
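A small hedged check of the process-vs-biological claim, classifying the Cubist model's top 10 predictors by their name prefix:
cubImp <- varImp(cubistTuned)$importance
top10 <- rownames(cubImp)[order(cubImp$Overall, decreasing = TRUE)][1:10]
top10
table(ifelse(grepl("^Manufacturing", top10), "Process", "Biological"))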
(c) Plot the optimal single tree with the distribution of yield in the terminal nodes. Does this view of the data provide additional knowledge about the biological or process predictors and their relationship with yield?
plot(as.party(rt_Tune$finalModel), gp = gpar(fontsize = 11))