8.1. Recreate the simulated data from Exercise 7.2:
> library(mlbench)
> set.seed(200)
> simulated <- mlbench.friedman1(200, sd = 1)
> simulated <- cbind(simulated$x, simulated$y)
> simulated <- as.data.frame(simulated)
> colnames(simulated)[ncol(simulated)] <- "y"
> library(randomForest)
> library(caret)
> model1 <- randomForest(y ~ ., data = simulated, importance = TRUE, ntree = 1000)
> rfImp1 <- varImp(model1, scale = FALSE)
Did the random forest model significantly use the uninformative predictors (V6–V10)?
> simulated$duplicate1 <- simulated$V1 + rnorm(200) * .1
> cor(simulated$duplicate1, simulated$V1)
Fit another random forest model to these data. Did the importance score for V1 change? What happens when you add another predictor that is also highly correlated with V1?
Use the cforest function in the party package to fit a random forest model using conditional inference trees. The party package function varimp can calculate predictor importance. The conditional argument of that function toggles between the traditional importance measure and the modified version described in Strobl et al. (2007). Do these importances show the same pattern as the traditional random forest model?
Repeat this process with different tree models, such as boosted trees and Cubist. Does the same pattern occur?
library(mlbench)
library(randomForest)
library(caret)
set.seed(200)
simulated <- mlbench.friedman1(200, sd = 1)
simulated <- cbind(simulated$x, simulated$y)
simulated <- as.data.frame(simulated)
colnames(simulated)[ncol(simulated)] <- "y"
model1 <- randomForest(y ~., data = simulated, importance = TRUE, ntree = 1000)
(rfImp <- varImp(model1, scale = FALSE))
## Overall
## V1 8.83890885
## V2 6.49023056
## V3 0.67583163
## V4 7.58822553
## V5 2.27426009
## V6 0.17436781
## V7 0.15136583
## V8 -0.03078937
## V9 -0.02989832
## V10 -0.08529218
library(data.table)
simulated$duplicate1 <- simulated$V1 + rnorm(200) * .1
cor(simulated$duplicate1, simulated$V1)
## [1] 0.9396216
dt <- setDT(varImp(model1, scale = FALSE), keep.rownames = TRUE)[]
options(scipen=99)
dt[order(-Overall)]
## rn Overall
## 1: V1 8.83890885
## 2: V4 7.58822553
## 3: V2 6.49023056
## 4: V5 2.27426009
## 5: V3 0.67583163
## 6: V6 0.17436781
## 7: V7 0.15136583
## 8: V9 -0.02989832
## 9: V8 -0.03078937
## 10: V10 -0.08529218
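The exercise first asks for a refit after only duplicate1 has been added (duplicate2 comes next). That intermediate fit is not shown above; a minimal sketch of it (the object name model1b is ours), with V1's importance expected to drop as the two correlated columns share the credit:
# Refit with duplicate1 added but before duplicate2 exists
set.seed(200)
model1b <- randomForest(y ~ ., data = simulated, importance = TRUE, ntree = 1000)
varImp(model1b, scale = FALSE)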
simulated$duplicate2 <- simulated$V1 + rnorm(200) * .15
cor(simulated$duplicate2, simulated$V1)
## [1] 0.8945605
model2 <- randomForest(y ~., data = simulated, importance = TRUE, ntree = 1000)
rfImp <- varImp(model2, scale = FALSE)
dt <- setDT(varImp(model2, scale = FALSE), keep.rownames = TRUE)[]
options(scipen=99)
dt[order(-Overall)]
## rn Overall
## 1: V4 7.21776316
## 2: V2 6.45956335
## 3: V1 5.65399070
## 4: duplicate1 2.65416045
## 5: V5 1.97242335
## 6: duplicate2 1.58972104
## 7: V3 0.60371103
## 8: V6 0.12652966
## 9: V10 -0.01073958
## 10: V7 -0.01201493
## 11: V9 -0.05637654
## 12: V8 -0.09943217
library(party)
library(dplyr)
fit.cforest <- cforest(y ~., data = simulated)
dt <- setDT(varimp(fit.cforest, conditional = TRUE) %>% data.frame(), keep.rownames = TRUE)[]
options(scipen=99)
dt[order(-.)]
## rn .
## 1: V4 5.46562157
## 2: V2 4.66059632
## 3: V1 3.20446849
## 4: V5 1.39716392
## 5: duplicate1 1.24840582
## 6: duplicate2 1.09574887
## 7: V6 0.02341780
## 8: V3 -0.09675796
## 9: V9 -0.15673783
## 10: V8 -0.19956241
## 11: V10 -0.20368319
## 12: V7 -0.24295729
dt <- setDT(varimp(fit.cforest, conditional = FALSE) %>% data.frame(), keep.rownames = TRUE)[]
options(scipen=99)
dt[order(-.)]
## rn .
## 1: V4 6.34961451
## 2: V1 6.26481997
## 3: V2 5.37851273
## 4: duplicate1 3.52874469
## 5: duplicate2 3.20044071
## 6: V5 2.06735420
## 7: V6 0.13700519
## 8: V3 0.10642765
## 9: V7 0.06689043
## 10: V9 0.04811456
## 11: V8 -0.02418668
## 12: V10 -0.22198342
library(Cubist)
fit.cubist <- cubist(simulated %>% select(-y), simulated$y, committees = 4)
fit.cubist
##
## Call:
## cubist.default(x = simulated %>% select(-y), y = simulated$y, committees
## = 4)
##
## Number of samples: 200
## Number of predictors: 12
##
## Number of committees: 4
## Number of rules per committee: 1, 4, 1, 4
dt <- setDT(varImp(fit.cubist) %>% data.frame(), keep.rownames = TRUE)[]
options(scipen=99)
dt[order(-Overall)]
## rn Overall
## 1: V1 67.0
## 2: V2 54.5
## 3: V4 50.0
## 4: V5 46.5
## 5: V3 43.0
## 6: duplicate2 14.5
## 7: V6 2.5
## 8: duplicate1 2.5
## 9: V7 0.0
## 10: V8 0.0
## 11: V9 0.0
## 12: V10 0.0
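The exercise also asks about boosted trees. A minimal sketch using the gbm package (the settings below are illustrative assumptions, not tuned choices); the uninformative predictors V6–V10 are again expected to receive little or no relative influence, and the correlated copies of V1 should again dilute its score:
library(gbm)
set.seed(200)
fit.gbm <- gbm(y ~ ., data = simulated, distribution = "gaussian",
               n.trees = 1000, interaction.depth = 4, shrinkage = 0.1,
               verbose = FALSE)
# Relative influence of each predictor
summary(fit.gbm, plotit = FALSE)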
8.2. Use a simulation to show tree bias with different granularities.
library(rpart)
library(rpart.plot)
data.set <- twoClassSim(300, noiseVars = 8, corrVar = 6, corrValue = 0.8) %>%
mutate(TwoFactor1 = as.factor(round(TwoFactor1, 0)),
TwoFactor2 = as.factor(round(TwoFactor2, 0)))
r.fit <- rpart(Linear01 ~ ., data=data.set)
rpart.plot(r.fit)
dt <- setDT(varImp(r.fit), keep.rownames = TRUE)[]
options(scipen=99)
dt[order(-Overall)]
## rn Overall
## 1: Noise5 1.18482825
## 2: Corr4 1.03593205
## 3: Linear06 1.02184898
## 4: Nonlinear2 0.85142287
## 5: Noise8 0.78952667
## 6: Linear08 0.76693139
## 7: TwoFactor1 0.69097583
## 8: Linear04 0.60181778
## 9: Corr3 0.57738032
## 10: Linear07 0.51707612
## 11: Noise4 0.48274395
## 12: Linear03 0.48268985
## 13: Noise2 0.47560469
## 14: Corr6 0.47212661
## 15: Corr2 0.44345254
## 16: Noise7 0.44232603
## 17: Linear09 0.35801898
## 18: Corr1 0.35266603
## 19: Linear05 0.31854720
## 20: Noise6 0.28661098
## 21: Noise3 0.27578621
## 22: Nonlinear1 0.24194755
## 23: Linear10 0.22428898
## 24: TwoFactor2 0.22242313
## 25: Nonlinear3 0.10469571
## 26: Noise1 0.08518753
## 27: Corr5 0.05646623
## 28: Linear02 0.00000000
## 29: Class 0.00000000
## rn Overall
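A more direct way to expose the granularity bias is to give rpart two predictors that are both unrelated to the response and differ only in their number of distinct values, then count which one is chosen for the first split. This is a sketch under our own setup (variable names and simulation sizes are arbitrary); an unbiased procedure would pick each predictor about half the time:
set.seed(100)
first_split <- replicate(500, {
  d <- data.frame(x_coarse = sample(0:1, 100, replace = TRUE),  # 2 distinct values
                  x_fine   = rnorm(100),                        # ~100 distinct values
                  y        = rnorm(100))                        # unrelated to both predictors
  fit <- rpart(y ~ x_coarse + x_fine, data = d,
               control = rpart.control(maxdepth = 1, cp = 0, minsplit = 2))
  if (is.null(fit$splits)) NA_character_ else rownames(fit$splits)[1]
})
table(first_split)  # the fine-grained predictor tends to win far more often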
8.3. In stochastic gradient boosting the bagging fraction and learning rate will govern the construction of the trees as they are guided by the gradient. Although the optimal values of these parameters should be obtained through the tuning process, it is helpful to understand how the magnitudes of these parameters affect the magnitudes of variable importance. Figure 8.24 provides the variable importance plots for boosting using two extreme values for the bagging fraction (0.1 and 0.9) and the learning rate (0.1 and 0.9) for the solubility data. The left-hand plot has both parameters set to 0.1, and the right-hand plot has both set to 0.9.
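No code for this figure is given in the text. A sketch of how the two extreme fits could be produced on the solubility data with the gbm package (the number of trees, tree depth, and seed below are our assumptions; the book's exact settings are not shown):
library(AppliedPredictiveModeling)
library(gbm)
data(solubility)
solTrain <- cbind(solTrainXtrans, Solubility = solTrainY)
set.seed(100)
gbm_small <- gbm(Solubility ~ ., data = solTrain, distribution = "gaussian",
                 n.trees = 100, interaction.depth = 3,
                 shrinkage = 0.1, bag.fraction = 0.1)
set.seed(100)
gbm_large <- gbm(Solubility ~ ., data = solTrain, distribution = "gaussian",
                 n.trees = 100, interaction.depth = 3,
                 shrinkage = 0.9, bag.fraction = 0.9)
# With both parameters at 0.9 the importance is expected to concentrate on a
# few predictors; at 0.1 it should spread across many more of them
summary(gbm_small, plotit = FALSE)
summary(gbm_large, plotit = FALSE)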
8.7. Refer to Exercises 6.3 and 7.5 which describe a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before and train several tree-based models:
We reuse the imputation and data-splitting code from Exercise 6.3 below and fit several tree-based models to compare the results:
library(AppliedPredictiveModeling)
library(missForest)
data(ChemicalManufacturingProcess)
df <- ChemicalManufacturingProcess
df_imp1 <- missForest(df)
## missForest iteration 1 in progress...done!
## missForest iteration 2 in progress...done!
## missForest iteration 3 in progress...done!
df_imp <- df_imp1$ximp
data <- df_imp[,2:58]
target <- df_imp[,1]
training <- createDataPartition( target, p=0.75 )
predictor_training <- data[training$Resample1,]
target_training <- target[training$Resample1]
predictor_testing <- data[-training$Resample1,]
target_testing <- target[-training$Resample1]
ctrl <- trainControl(method = "cv", number = 10)
Use Single Tree model and evaluate the results:
Tune:
rt_grid <- expand.grid(maxdepth= seq(1,10,by=1))
rt_Tune <- train(x = predictor_training, y = target_training, method = "rpart2", metric = "Rsquared", tuneGrid = rt_grid, trControl = ctrl)
Predict:
rt_pred = predict(rt_Tune, predictor_testing)
postResample(pred = rt_pred, obs = target_testing)
## RMSE Rsquared MAE
## 1.5289341 0.4102548 1.2064965
Use a random forest model and evaluate the results:
Tune:
rf_grid <- expand.grid(mtry=seq(2,38,by=3))
rf_Tune <- train(x = predictor_training, y = target_training, method = "rf", tuneGrid = rf_grid, metric = "Rsquared", importance = TRUE, trControl = ctrl)
Predict:
rf_pred = predict(rf_Tune, predictor_testing)
postResample(pred = rf_pred, obs = target_testing)
Use a Cubist model:
Tune:
cube_grid <- expand.grid(committees = c(1, 5, 10, 20, 50), neighbors = c(0, 1, 3, 5))
cube_Tune <- train(x = predictor_training, y = target_training, method = "cubist", metric = "Rsquared", tuneGrid = cube_grid, trControl = ctrl)
Predict:
cube_pred = predict(cube_Tune, predictor_testing)
postResample(pred = cube_pred, obs = target_testing)
## RMSE Rsquared MAE
## 0.8414842 0.8466099 0.6354866
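The exercise asks for several tree-based models, so a boosted tree could round out the comparison. A minimal sketch using caret's "gbm" method (the tuning grid below is an illustrative assumption, not a recommended setting):
gbm_grid <- expand.grid(n.trees = c(100, 500, 1000),
                        interaction.depth = c(1, 3, 5),
                        shrinkage = c(0.01, 0.1),
                        n.minobsinnode = 10)
set.seed(100)
gbm_Tune <- train(x = predictor_training, y = target_training, method = "gbm",
                  metric = "Rsquared", tuneGrid = gbm_grid, trControl = ctrl,
                  verbose = FALSE)
gbm_pred <- predict(gbm_Tune, predictor_testing)
postResample(pred = gbm_pred, obs = target_testing)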
Review the predictor importance of the Cubist model:
plot(varImp(cube_Tune), top=10, scales = list(y = list(cex = 0.8)))
Plot the optimal single tree:
library(partykit)
plot(as.party(rt_Tune$finalModel), gp = gpar(fontsize = 11))
ManufacturingProcess32 sits at the top of the tree, and only a few biological measurements affect the target in any meaningful way. The Cubist model is the latest and strongest confirmation of that pattern, so any further study should focus on finding optimal settings for the manufacturing processes.