Kuhn and Johnson
8.1) Recreate the simulated data from Exercise 7.2
library(mlbench)
library(caret)
set.seed(200)
simulated <- mlbench.friedman1(200, sd=1)
simulated <- cbind(simulated$x,simulated$y)
simulated <- as.data.frame(simulated)
colnames(simulated)[ncol(simulated)] <- "y"
head(simulated)
## V1 V2 V3 V4 V5 V6
## 1 0.5337724 0.6478064 0.85078526 0.18159957 0.92903976 0.36179060
## 2 0.5837650 0.4381528 0.67272659 0.66924914 0.16379784 0.45305931
## 3 0.5895783 0.5879065 0.40967108 0.33812728 0.89409334 0.02681911
## 4 0.6910399 0.2259548 0.03335447 0.06691274 0.63744519 0.52500637
## 5 0.6673315 0.8188985 0.71676079 0.80324287 0.08306864 0.22344157
## 6 0.8392937 0.3862983 0.64618857 0.86105431 0.63038947 0.43703891
## V7 V8 V9 V10 y
## 1 0.8266609 0.4214081 0.59111440 0.5886216 18.46398
## 2 0.6489601 0.8446239 0.92819306 0.7584008 16.09836
## 3 0.1785614 0.3495908 0.01759542 0.4441185 17.76165
## 4 0.5133614 0.7970260 0.68986918 0.4450716 13.78730
## 5 0.6644906 0.9038919 0.39696995 0.5500808 18.42984
## 6 0.3360117 0.6489177 0.53116033 0.9066182 20.85817
library(randomForest)
set.seed(200)
model1 <- randomForest(y ~ ., data=simulated,
importance=TRUE,
ntree=1000)
rfImp1 <- varImp(model1, scale=FALSE)
rfImp1
## Overall
## V1 8.605365900
## V2 6.831259165
## V3 0.741534943
## V4 7.883384091
## V5 2.244750293
## V6 0.136054182
## V7 0.055950944
## V8 -0.068195812
## V9 0.003196175
## V10 -0.054705900
The random forest did not make significant use of the uninformative predictors, V6-V10. The order of importance is: V1, V4, V2, V5, V3.
set.seed(200)
simulated$duplicate1 <- simulated$V1 + rnorm(200)*.1
cor(simulated$duplicate1, simulated$V1)
## [1] 0.9497025
model2 <- randomForest(y ~ ., data=simulated,
importance=TRUE,
ntree=1000)
rfImp2 <- varImp(model2, scale=FALSE)
rfImp2
## Overall
## V1 6.00709780
## V2 6.05937899
## V3 0.58465293
## V4 6.86363294
## V5 2.19939891
## V6 0.10898039
## V7 0.06104207
## V8 -0.04059204
## V9 0.06123662
## V10 0.09999339
## duplicate1 4.43323167
V1 and duplicate1 are highly correlated (about 0.95). The importance score of V1 decreased from 8.6 to 6.0 when this highly correlated predictor was added, with duplicate1 absorbing part of the importance. The order of importance is now V4, V2, V1, V5, V3.
What happens when you add another predictor that is also highly correlated with V1?
set.seed(200)
simulated$duplicate2 <- simulated$V1 + rnorm(200)*.1
cor(simulated$duplicate2, simulated$V1)
## [1] 0.9497025
model3 <- randomForest(y ~ ., data=simulated,
importance=TRUE,
ntree=1000)
rfImp3 <- varImp(model3, scale=FALSE)
rfImp3
## Overall
## V1 4.84857115
## V2 6.50242654
## V3 0.51808831
## V4 7.21616819
## V5 2.11631632
## V6 0.22066363
## V7 -0.02066815
## V8 -0.06964769
## V9 -0.02908959
## V10 -0.03105942
## duplicate1 2.84168163
## duplicate2 2.94318033
The importance score of V1 decreased further, to 4.8, when a second predictor highly correlated with V1 was added; the importance is now diluted across V1, duplicate1, and duplicate2. The order of importance remains V4, V2, V1, V5, V3.
library(party)
set.seed(200)
simulated_removedups <- subset(simulated, select=-c(duplicate1,duplicate2))
model4 <- cforest(y ~., data=simulated_removedups)
rfImp4 <- varimp(model4)
sort(rfImp4)
## V9 V6 V8 V10 V7
## -0.040833731 -0.026382693 -0.020945307 -0.017820271 0.003470791
## V3 V5 V2 V4 V1
## 0.048199904 1.911482592 6.475678898 8.374714465 8.889688266
The cforest importance scores show the same pattern as the traditional random forest model: V1, V4, V2, V5, V3.
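The varimp function in party also provides a conditional importance measure (conditional = TRUE) that adjusts for correlations between predictors. A minimal sketch, reusing model4 from above (this can be slow to compute); with the duplicates removed the ranking should look similar, but conditional importance is the recommended check when correlated predictors such as duplicate1 and duplicate2 are kept:
set.seed(200)
# conditional permutation importance, which adjusts for correlated predictors
rfImp4_cond <- varimp(model4, conditional = TRUE)
sort(rfImp4_cond)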
set.seed(200)
library(gbm)
#gbmModel <- gbm(y ~ ., data=simulated_removedups, distribution = "gaussian")
gbmGrid <- expand.grid(interaction.depth =seq(1,6, by=2),
n.trees=seq(100,1000,by=50),
shrinkage=c(0.01,0.1),
n.minobsinnode = c(5, 10, 20, 30))
gbmTune <- train(y ~ ., data=simulated_removedups,
method="gbm",
tuneGrid = gbmGrid,
verbose=FALSE)
rfImp5 <- varImp(gbmTune)
rfImp5
## gbm variable importance
##
## Overall
## V1 100.00000
## V4 94.53967
## V2 76.45820
## V5 39.29335
## V3 29.29430
## V7 3.01017
## V6 2.01212
## V8 0.40918
## V9 0.06288
## V10 0.00000
The boosted trees model gives the same pattern of importance for the predictors: V1, V4, V2, V5, V3.
set.seed(200)
library(Cubist)
cubistTuned <- train(y ~ ., data=simulated_removedups,
method="cubist")
rfImp6 <- varImp(cubistTuned)
rfImp6
## cubist variable importance
##
## Overall
## V1 100.00
## V2 75.69
## V4 68.06
## V3 58.33
## V5 55.56
## V6 15.28
## V8 0.00
## V10 0.00
## V7 0.00
## V9 0.00
The Cubist model gave a different order of importance to the predictor variables: V1, V2, V4, V3, V5.
8.2) Use a simulation to show tree bias with different granularities.
Max node depth:
library(rpart)
library(partykit)
set.seed(200)
#maximum depth set method=rpart2
rpartTune <- train(y ~ ., data=simulated_removedups,
method = "rpart2",
tuneLength = 14,
trControl = trainControl(method = "cv"))
#plot(rpartTune)
plot(as.party(rpartTune$finalModel))
Decision Tree with 7 nodes
set.seed(200)
# complexity parameter (cp) tuned with method=rpart
rpartTune <- train(y ~ ., data=simulated_removedups,
method = "rpart",
tuneLength = 7,
trControl = trainControl(method = "cv"))
plot(as.party(rpartTune$finalModel))
set.seed(200)
# maximum depth fixed at 2 via rpart's maxdepth argument
rpartTune <- train(y ~ ., data=simulated_removedups,
method = "rpart",
maxdepth=2,
trControl = trainControl(method = "cv"))
plot(as.party(rpartTune$finalModel))
The decision tree tuned on maximum depth has 14 nodes, and its terminal-node box plots have the smallest ranges; two of the nodes contain 3 outliers. The decision tree with 7 nodes has the same initial splits as the 14-node model and only one node with 2 outliers, but its interquartile ranges are wider. The decision tree limited to a depth of 2 has 3 nodes, one of which contains 1 outlier; its first and third quartiles are farther from the median than in the other models, although its splits match the corresponding splits in the other models.
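Since the exercise asks specifically about granularity, below is a minimal sketch of a simulation that shows the bias directly: both predictors are unrelated to the response, but x1 is continuous (fine-grained, many candidate split points) while x2 is binary (coarse, one candidate split point). The variable names and settings are illustrative only.
library(rpart)
set.seed(200)
first_split <- replicate(200, {
  dat <- data.frame(y  = rnorm(100),
                    x1 = rnorm(100),              # fine granularity: many candidate splits
                    x2 = sample(0:1, 100, TRUE))  # coarse granularity: one candidate split
  fit <- rpart(y ~ ., data = dat,
               control = rpart.control(maxdepth = 1, cp = 0, minsplit = 2))
  as.character(fit$frame$var[1])                  # variable chosen for the first split
})
table(first_split)  # the fine-grained x1 is expected to be chosen far more often
Even though neither predictor is informative, the root split favors x1, illustrating the selection bias toward predictors with more distinct values.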
8.3) In stochastic gradient boosting the bagging fraction and learning rate will govern the construction of the trees as they are guided by the gradient. Although the optimal values of these parameters should be obtained through the tuning process, it is helpful to understand how the magnitudes of these parameters affect magnitudes of variable importance. Figure 8.24 provides the variable importance plots for boosting using two extreme values for the bagging fraction (0.1 and 0.9) and the learning rate (0.1 and 0.9) for the solubility data. The left-hand plot has both parameters set to 0.1, and the right-hand plot has both set to 0.9:
When the bagging fraction and learning rate are set to 0.9, there are just a few predictors of importance.
In order to minimize the likelihood of overfitting in a gradient boosting model, a learning rate (or shrinkage) is implemented. The lower the learning rate, the smaller the fraction of each tree's predicted value that is added to the previous value. The bagging fraction is the fraction of the training data used for each tree; a higher bagging fraction means a larger proportion of the training data is used. Having a high learning rate and a high bagging fraction makes the predictions less stable: since a larger fraction of the predicted value is added to the previous value and a large fraction of the training data is used for every tree, the model concentrates its importance on fewer variables.
I think the model on the left that is built using a bagging fraction of 0.1 and a learning rate of 0.1 will be more predictive of other samples.
Interaction depth is the same as tree depth. "Variable importance for boosting is a function of the reduction in squared error." (page 207) Increasing the interaction depth lets each tree split on more predictors, so the reduction in squared error is credited to more predictors and the slope of the predictor-importance plot becomes flatter.
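As a rough sketch of how the two Figure 8.24 settings (and the effect of interaction depth) could be reproduced, gbm can be fit directly on the solubility data from AppliedPredictiveModeling; the n.trees and interaction.depth values below are illustrative, not tuned:
library(gbm)
library(AppliedPredictiveModeling)
data(solubility)  # provides solTrainXtrans and solTrainY
solTrain <- cbind(solTrainXtrans, Solubility = solTrainY)
set.seed(200)
gbmLow  <- gbm(Solubility ~ ., data = solTrain, distribution = "gaussian",
               n.trees = 100, shrinkage = 0.1, bag.fraction = 0.1, interaction.depth = 1)
gbmHigh <- gbm(Solubility ~ ., data = solTrain, distribution = "gaussian",
               n.trees = 100, shrinkage = 0.9, bag.fraction = 0.9, interaction.depth = 1)
gbmDeep <- gbm(Solubility ~ ., data = solTrain, distribution = "gaussian",
               n.trees = 100, shrinkage = 0.1, bag.fraction = 0.1, interaction.depth = 7)
head(summary(gbmLow,  plotit = FALSE), 10)  # importance spread across many predictors
head(summary(gbmHigh, plotit = FALSE), 10)  # importance concentrated in a few predictors
head(summary(gbmDeep, plotit = FALSE), 10)  # deeper trees spread importance across more predictors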
8.7. Refer to Exercises 6.3 and 7.5 which describe a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before and train several tree-based models:
The data set ChemicalManufacturingProcess is loaded.
I imputed each column's median for missing values. The data are split into a training set and a testing set, and pre-processing removes 19 highly correlated predictors.
library(AppliedPredictiveModeling)
data("ChemicalManufacturingProcess")
impute.median <- function(x) replace(x, is.na(x), median(x, na.rm = TRUE))
for (i in 1:ncol(ChemicalManufacturingProcess)){
ChemicalManufacturingProcess[,i] <- impute.median(ChemicalManufacturingProcess[,i])
}
train_chem <- ChemicalManufacturingProcess[1:130,]
test_chem <- ChemicalManufacturingProcess[131:176,]
highCorr_chem <- caret::findCorrelation(cor(train_chem[,2:58]), cutoff=.7, exact=TRUE)
length(highCorr_chem)
## [1] 19
# shift indices by 1 because Yield is column 1 and findCorrelation saw only columns 2:58
filtered_train_chem <- train_chem[,-(highCorr_chem+1)]
ncol(filtered_train_chem)
## [1] 39
correlation_chem <- cor(filtered_train_chem, method = "pearson")
#summary(ChemicalManufacturingProcess)
Random Forest
library(randomForest)
set.seed(200)
rf_model_chem <- randomForest(x=train_chem[,-1],
y=train_chem$Yield,
importance=TRUE,
ntree=1000)
rfImp_chem <- varImp(rf_model_chem, scale=FALSE)
rf_model_chem
##
## Call:
## randomForest(x = train_chem[, -1], y = train_chem$Yield, ntree = 1000, importance = TRUE)
## Type of random forest: regression
## Number of trees: 1000
## No. of variables tried at each split: 19
##
## Mean of squared residuals: 1.139097
## % Var explained: 66.09
rf_Pred_chem <- predict(rf_model_chem, newdata = test_chem[,-1])
##The function 'postResample' can be used to get the test set performance values
postResample(pred=rf_Pred_chem,obs=test_chem$Yield)
## RMSE Rsquared MAE
## 1.6907708 0.3292906 1.4560297
Boosted Trees
set.seed(200)
library(gbm)
gbmGrid <- expand.grid(interaction.depth =seq(1,6, by=2),
n.trees=seq(100,1000,by=50),
shrinkage=c(0.01,0.1),
n.minobsinnode = c(5, 10, 20, 30))
gbmTune_chem <- train(y=train_chem$Yield, x=train_chem[,-1],
method="gbm",
tuneGrid = gbmGrid,
verbose=FALSE)
gbm_Imp_chem <- varImp(gbmTune_chem)
gbm_Pred_chem <- predict(gbmTune_chem, newdata = test_chem[,-1])
##The function 'postResample' can be used to get the test set performance values
postResample(pred=gbm_Pred_chem,obs=test_chem$Yield)
## RMSE Rsquared MAE
## 1.9922084 0.2328677 1.7130731
Cubist
set.seed(200)
library(Cubist)
cubistTuned_chem <- train(y=train_chem$Yield, x=train_chem[,-1],
method="cubist")
cubist_Imp_chem <- varImp(cubistTuned_chem)
cubist_Pred_chem <- predict(cubistTuned_chem, newdata = test_chem[,-1])
##The function 'postResample' can be used to get the test set performance values
postResample(pred=cubist_Pred_chem,obs=test_chem$Yield)
## RMSE Rsquared MAE
## 2.0387329 0.2717414 1.8389371
The random forest model has the lowest test-set root mean squared error (RMSE 1.69), compared with 1.99 for the boosted trees and 2.04 for the Cubist model.
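For a side-by-side view, the three sets of test-set results computed above can be stacked:
# roll-up of the test-set metrics for the three tree-based models
rbind(RandomForest = postResample(pred = rf_Pred_chem, obs = test_chem$Yield),
      BoostedTrees = postResample(pred = gbm_Pred_chem, obs = test_chem$Yield),
      Cubist = postResample(pred = cubist_Pred_chem, obs = test_chem$Yield))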
#rfImp_chem <- varImp(rf_model_chem, scale=FALSE)
#plot(rf_model_chem$localImp)
#plot(rfImp_chem)
varImpPlot(rf_model_chem, sort=TRUE,type=2, n.var=10)
Manufacturing process variables dominate the list. The ten most important predictors, in order, are manufacturing processes 32, 17, 09, and 13; biological materials 03 and 06; manufacturing processes 06 and 31; biological material 12; and manufacturing process 11.
The top non-linear model was the SVM model, and its most important variables were manufacturing processes 17, 13, 09, and 32; biological material 06; manufacturing processes 06, 11, 36, and 30; and biological material 03.
The top linear model was the partial least squares regression, and its most important variables were biological materials 06, 11, 12, 04, and 08, and manufacturing processes 02, 33, 36, 15, and 04.
There is considerable overlap between the most important predictors in the random forest and SVM models.
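For context within this document, the importance rankings of the other two tree-based models fit above can be inspected the same way (this simply reuses gbmTune_chem and cubistTuned_chem from earlier):
plot(varImp(gbmTune_chem), top = 10)
plot(varImp(cubistTuned_chem), top = 10)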
#library(tree)
#plot(randomForest::getTree(rf_model_chem,pool=TRUE), type="unrooted")
#plot.reprtree(rf_model_chem)
#tree::treep1(rf_model_chem)
#library(reprtree)
library(dplyr)
library(ggraph)
library(igraph)
min(rf_model_chem$mse)
## [1] 1.116339
tree <- randomForest::getTree(rf_model_chem,
k = 183, # tree index chosen to match the minimum OOB MSE reported above
labelVar = TRUE) %>%
tibble::rownames_to_column() %>%
# make leaf split points to NA, so the 0s won't get plotted
mutate(`split point` = ifelse(is.na(prediction), `split point`, NA))
# prepare data frame for graph
graph_frame <- data.frame(from = rep(tree$rowname, 2),
to = c(tree$`left daughter`, tree$`right daughter`))
# convert to graph and delete the last node that we don't want to plot
graph <- graph_from_data_frame(graph_frame) %>%
delete_vertices("0")
# set node labels
V(graph)$node_label <- gsub("_", " ", as.character(tree$`split var`))
V(graph)$leaf_label <- as.character(tree$prediction)
V(graph)$split <- as.character(round(tree$`split point`, digits = 2))
# plot
plot <- ggraph(graph, 'dendrogram') +
theme_bw() +
geom_edge_link() +
geom_node_point() +
geom_node_text(aes(label = node_label), na.rm = TRUE, repel = TRUE) +
# geom_node_label(aes(label = split), vjust = 2.5, na.rm = TRUE, fill = "white") +
# geom_node_label(aes(label = leaf_label, fill = leaf_label), na.rm = TRUE,
# repel = TRUE, colour = "white", fontface = "bold", show.legend = FALSE) +
theme(panel.grid.minor = element_blank(),
panel.grid.major = element_blank(),
panel.background = element_blank(),
plot.background = element_rect(fill = "white"),
panel.border = element_blank(),
axis.line = element_blank(),
axis.text.x = element_blank(),
axis.text.y = element_blank(),
axis.ticks = element_blank(),
axis.title.x = element_blank(),
axis.title.y = element_blank(),
plot.title = element_text(size = 18))
print(plot)
#code from https://shiring.github.io/machine_learning/2017/03/16/rf_plot_ggraph
I graphed the tree corresponding to the lowest mean squared error, so the pattern of how the predictive model is built can be seen. To make the graph easier to read I removed the values at each split; including those values would show how much the yield is affected by each process or material.
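As a hedged follow-up along those lines, a partial dependence plot from the randomForest package shows directly how the predicted yield changes over the range of a single top predictor; ManufacturingProcess32 is used here because it tops the importance plot above:
# partial dependence of Yield on the top predictor from the importance plot
randomForest::partialPlot(rf_model_chem, pred.data = train_chem[, -1],
                          x.var = "ManufacturingProcess32")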