library(mlbench)
library(randomForest)
library(caret)
library(tidyverse)
library(party)
library(gbm)
library(Cubist)
library(rpart)
library(partykit)
set.seed(200)
simulated <- mlbench.friedman1(200, sd = 1)
simulated <- cbind(simulated$x, simulated$y)
simulated <- as.data.frame(simulated)
colnames(simulated)[ncol(simulated)] <- "y"
set.seed(123)
model1 <- randomForest(y ~ ., data = simulated, importance = TRUE, ntree = 1000)
rfImp1 <- varImp(model1, scale = FALSE)
rfImp1
## Overall
## V1 8.50271328
## V2 6.60654902
## V3 0.77254765
## V4 7.61477567
## V5 2.12199871
## V6 0.12468125
## V7 0.07498385
## V8 -0.14796065
## V9 -0.11660208
## V10 -0.15644346
Did the random forest model significantly use the uninformative predictors (V6 – V10)?
The random forest model did not significantly use the uninformative predictors: the importance scores for V6-V10 are all near zero (several are negative), while the informative predictors V1-V5 score far higher.
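This is easier to see with the scores sorted (a small sketch using the rfImp1 object from above):
# sort the permutation importances; the informative predictors
# V1-V5 separate cleanly from the near-zero V6-V10
rfImp1 %>%
  rownames_to_column("predictor") %>%
  arrange(desc(Overall))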
simulated_copy <- simulated
set.seed(126)
model3 <- cforest(y ~ ., data = simulated[, c(1:11)])
rfImp3 <- varimp(model3, conditional = TRUE)
rfImp3
## V1 V2 V3 V4 V5 V6 V7
## 6.2081222 5.0840947 0.1276384 6.0939553 1.5671070 -0.1205725 -0.2819707
## V8 V9 V10
## -0.3542182 -0.3173039 -0.1080203
The conditional importance values from the cforest function show the same pattern as the traditional random forest model: V1, V2, and V4 rank highest, V3 and V5 trail, and the uninformative V6-V10 sit at or below zero.
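For a direct comparison, the unconditional importances can be pulled from the same cforest fit (a quick sketch using the model3 object above):
# traditional (unconditional) permutation importance from the same
# cforest fit, for comparison with the conditional values
set.seed(126)
varimp(model3, conditional = FALSE)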
sim <- simulated_copy
boostedGrid <- expand.grid(interaction.depth = seq(1, 7, by = 2),
                           n.trees = seq(100, 1000, by = 50),
                           shrinkage = c(0.01, 0.1),
                           n.minobsinnode = 10)
set.seed(127)
boosted <- train(y ~ ., data = sim, method = "gbm", tuneGrid = boostedGrid, verbose = FALSE)
bImp <- varImp(boosted$finalModel, scale = FALSE)
bImp
## Overall
## V1 4747.4568
## V2 3898.2010
## V3 1528.8084
## V4 4965.1268
## V5 1892.8625
## V6 193.1214
## V7 234.9903
## V8 130.1226
## V9 129.6012
## V10 121.7100
set.seed(100)
cubist <- train(y ~ ., data = sim, method = "cubist")
cImp <- varImp(cubist$finalModel, scale = FALSE)
cImp
## Overall
## V1 72.0
## V3 42.0
## V2 54.5
## V4 49.0
## V5 40.0
## V6 11.0
## V7 0.0
## V8 0.0
## V9 0.0
## V10 0.0
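Both the boosted tree and Cubist importances repeat the pattern: V1-V5 dominate while V6-V10 receive little or no importance. A sketch to line the three rankings up on a common 0-100 scale, assuming the rfImp1, bImp, and cImp objects from above:
# rescale each model's importances to [0, 100] so the
# rankings are directly comparable across models
scale100 <- function(v) 100 * (v - min(v)) / (max(v) - min(v))
data.frame(rf     = scale100(rfImp1$Overall),
           gbm    = scale100(bImp[rownames(rfImp1), "Overall"]),
           cubist = scale100(cImp[rownames(rfImp1), "Overall"]),
           row.names = rownames(rfImp1))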
Use a simulation to show tree bias with different granularities.
set.seed(624)
x <- 10
s <- 500
# each predictor is drawn from the same (0, 1] range, but the granularity
# grows by a factor of 10 at each step: a can take 10 distinct values,
# b 100, ..., g 10^7 (":" binds tighter than "/" in R)
a <- sample(1:x^1 / x^1, s, replace = TRUE)
b <- sample(1:x^2 / x^2, s, replace = TRUE)
c <- sample(1:x^3 / x^3, s, replace = TRUE)
d <- sample(1:x^4 / x^4, s, replace = TRUE)
e <- sample(1:x^5 / x^5, s, replace = TRUE)
f <- sample(1:x^6 / x^6, s, replace = TRUE)
g <- sample(1:x^7 / x^7, s, replace = TRUE)
# every predictor contributes equally to the response
y <- a + b + c + d + e + f + g
simData <- data.frame(a, b, c, d, e, f, g, y)
rpartTree <- rpart(y ~ ., data = simData)
plot(as.party(rpartTree), gp = gpar(fontsize = 7))
varImp(rpartTree)
## Overall
## a 2.057017
## b 2.580122
## c 3.615722
## d 1.447580
## e 2.770340
## f 2.947516
## g 4.065677
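Although every predictor contributes equally to y, the finest-grained predictor g receives the highest importance (the trend is noisy for the mid-granularity predictors), illustrating the selection bias of CART-style trees toward predictors with many distinct values. A quick check of the granularities actually observed in the sample:
# distinct values observed per predictor (capped by s = 500): the coarse
# predictors repeat values while the finer ones are nearly all unique
sapply(simData[, letters[1:7]], function(col) length(unique(col)))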
library(AppliedPredictiveModeling)
data("ChemicalManufacturingProcess")
preProc <- preProcess(ChemicalManufacturingProcess,
                      method = c("BoxCox", "knnImpute", "center", "scale"))
predPreProcess <- predict(preProc, ChemicalManufacturingProcess)
# keep the response on its original scale
predPreProcess$Yield <- ChemicalManufacturingProcess$Yield
# fix the 85/15 split for reproducibility
set.seed(100)
ind <- sample(seq_len(nrow(predPreProcess)),
              size = floor(0.85 * nrow(predPreProcess)))
train <- predPreProcess[ind, ]
test <- predPreProcess[-ind, ]
set.seed(100)
cart <- train(Yield~., data=train,
method = "rpart",
tuneLength = 10,
trControl = trainControl(method = "cv"))
cartPred <- predict(cart, test)
set.seed(100)
rfModel <- randomForest(Yield~., data=train,
importance = TRUE,
ntree = 1000)
rfPred <- predict(rfModel, test)
set.seed(100)
boost <- train(Yield~., data=train,
method = "gbm",
tuneGrid = boostedGrid,
verbose = FALSE)
boostPred <- predict(boost, test)
set.seed(100)
cubisT <- train(Yield~., data=train,
method = "cubist")
cPred <- predict(cubisT, test)
as.data.frame(rbind(
"CART" = postResample(pred = cartPred, obs = test$Yield),
"Random Forest" = postResample(pred = rfPred, obs = test$Yield),
"Boosted" = postResample(pred = boostPred, obs = test$Yield),
"Cubist" = postResample(pred = cPred, obs = test$Yield)
)) %>% arrange(RMSE)
## RMSE Rsquared MAE
## Cubist 0.7284471 0.8358305 0.5600602
## Random Forest 0.8529082 0.8111093 0.7095435
## Boosted 0.8789016 0.7601582 0.7189186
## CART 1.3127849 0.4893145 1.1557587
Ranked by RMSE (with R-squared and MAE agreeing), the Cubist model gives the optimal test set performance among the four tree-based models.
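The resampling side can be checked as well for the models fit through caret with the same resampling scheme (a sketch; rfModel was fit with randomForest directly and cart used 10-fold CV rather than the default bootstrap, so both are omitted here):
# compare bootstrap resampling distributions for the two models
# that share the default 25-rep bootstrap scheme
summary(resamples(list(Boosted = boost, Cubist = cubisT)))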
The most important variables for the Cubist model are shown below; manufacturing process variables outnumber biological material variables in the top ten. For comparison, the optimal linear and nonlinear models from earlier exercises were PLS and SVM, respectively: the PLS importance list was likewise dominated by manufacturing process variables, while the SVM list split evenly between manufacturing process and biological material variables.
varImp(cubisT, scale = TRUE)
## cubist variable importance
##
## only 20 most important variables shown (out of 57)
##
## Overall
## ManufacturingProcess32 100.000
## ManufacturingProcess17 75.439
## ManufacturingProcess29 48.246
## BiologicalMaterial02 34.211
## ManufacturingProcess27 34.211
## ManufacturingProcess13 31.579
## ManufacturingProcess04 30.702
## ManufacturingProcess09 28.947
## BiologicalMaterial06 28.070
## ManufacturingProcess21 26.316
## ManufacturingProcess33 23.684
## ManufacturingProcess39 22.807
## BiologicalMaterial03 21.930
## ManufacturingProcess25 17.544
## ManufacturingProcess31 13.158
## ManufacturingProcess26 13.158
## BiologicalMaterial04 13.158
## BiologicalMaterial12 12.281
## ManufacturingProcess45 12.281
## ManufacturingProcess06 8.772
plot(varImp(cubisT), top = 10)
plot(as.party(rpart(Yield ~ ., data = predPreProcess)),
     ip_args = list(abbreviate = 4), gp = gpar(fontsize = 8))
The tree plot does not necessarily show more insight than the importance numbers, but it does help with visualization.
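Since ManufacturingProcess32 tops the importance list, its marginal relationship with yield is worth a direct look (a sketch on the preprocessed data frame from above):
# scatter of the top-ranked predictor against yield
plot(predPreProcess$ManufacturingProcess32, predPreProcess$Yield,
     xlab = "ManufacturingProcess32 (preprocessed)", ylab = "Yield")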