Do problems 8.1, 8.2, 8.3, and 8.7 in Kuhn and Johnson. Please submit the Rpubs link along with the .rmd file.
library(mlbench)
set.seed(200)
simulated <- mlbench.friedman1(200, sd = 1)
simulated <- cbind(simulated$x, simulated$y)
simulated <- as.data.frame(simulated)
colnames(simulated)[ncol(simulated)] <- "y"
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
library(caret)
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
## Loading required package: lattice
model1 <- randomForest(y ~ ., data = simulated,
importance = TRUE,
ntree = 1000)
rfImp1 <- varImp(model1, scale = FALSE)
rfImp1
## Overall
## V1 8.62743275
## V2 6.27437240
## V3 0.72305459
## V4 7.50258584
## V5 2.13575650
## V6 0.12395003
## V7 0.02927888
## V8 -0.11724317
## V9 -0.10344797
## V10 0.04312556
Did the random forest model significantly use the uninformative predictors (V6– V10)?
No. Comparing V6-V10 with V1-V5, the uninformative predictors were not used significantly by the model: their importance scores are close to zero, and some are even negative.
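As a quick illustrative check (my own sketch, not part of the exercise), the two groups of importance scores can be summarized side by side:
informative   <- rfImp1[paste0("V", 1:5),  "Overall"]   # V1-V5
uninformative <- rfImp1[paste0("V", 6:10), "Overall"]   # V6-V10
summary(informative)
summary(uninformative)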
simulated$duplicate1 <- simulated$V1 + rnorm(200) * .1
cor(simulated$duplicate1, simulated$V1)
## [1] 0.9485201
Fit another random forest model to these data. Did the importance score for V1 change? What happens when you add another predictor that is also highly correlated with V1?
model_2 = randomForest(y ~ ., data = simulated,
importance = TRUE,
ntree = 1000)
rfImp2 <- varImp(model_2, scale = FALSE)
rfImp2
## Overall
## V1 6.774034589
## V2 6.426340527
## V3 0.613805379
## V4 7.135941576
## V5 2.135242904
## V6 0.171933358
## V7 0.142238552
## V8 -0.073192083
## V9 -0.098719872
## V10 -0.009701234
## duplicate1 3.084990840
The importance score for V1 decreased in this model compared with the first one; the highly correlated duplicate1 absorbed part of V1's importance.
simulated$duplicate2 = simulated$V1 + rnorm(200) * .1
cor(simulated$duplicate2, simulated$V1)
## [1] 0.9337221
Adding another highly correlated predictor dilutes V1's importance even further, so the top importance shifts toward V4.
model_3 = randomForest(y ~ .,
data = simulated,
importance = TRUE,
ntree = 1000)
rfImp3 = varImp(model_3, scale = FALSE)
rfImp3
## Overall
## V1 5.908641677
## V2 6.586726939
## V3 0.559845667
## V4 7.373782389
## V5 1.987341138
## V6 0.162417814
## V7 0.038423138
## V8 0.007497423
## V9 -0.001806331
## V10 0.004023755
## duplicate1 2.351543736
## duplicate2 2.305339113
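As a rough illustrative check (my own sketch; permutation importances are not strictly additive), V1's original score can be compared with the combined scores of V1 and its duplicates, which suggests the importance was split among the correlated copies rather than lost:
rfImp1["V1", "Overall"]                                     # V1 alone, before the duplicates were added
sum(rfImp3[c("V1", "duplicate1", "duplicate2"), "Overall"]) # V1 plus its two correlated copies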
{r}# install.packages("party")
{r}# install.packages("matrixStats")
library(party)
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
# We use columns 1 to 11, which contain the predictors V1 to V10 and the response y
# (the duplicate columns are excluded).
model_4 <- cforest(y ~ ., data = simulated[, c(1:11)])
# checking variable importance for the conditional inference forest
# conditional = TRUE adjusts the importance for correlated predictors, which gives a fairer ordering
rfImp4 <- varimp(model_4, conditional = TRUE)
rfImp4
## V1 V2 V3 V4 V5 V6
## 5.335045520 5.312801960 0.034111050 6.679015663 1.146860652 -0.015403265
## V7 V8 V9 V10
## 0.009873139 -0.008375437 -0.012514552 -0.031028246
Comparing the traditional random forest with the conditional inference forest (cforest) shows a similar pattern in which variables matter to the model: V1, V2, and V4 dominate, while V6-V10 remain near zero.
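For comparison (a quick sketch of my own, not required by the exercise), the unconditional importance from the same cforest model behaves more like the traditional random forest measure:
varimp(model_4, conditional = FALSE)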
library(gbm)
## Loaded gbm 2.2.2
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
# Boosted tree model
# gbm (gradient boosting machines) fits the boosted model
# distribution = "gaussian" is used for a continuous response (squared-error loss, i.e. normally distributed errors)
# fit 1000 trees
model_5 = gbm(y ~ ., data = simulated[,c(1:11)], distribution = "gaussian", n.trees=1000)
summary(model_5)
## var rel.inf
## V4 V4 24.952144
## V1 V1 23.630928
## V2 V2 19.685740
## V5 V5 11.086713
## V3 V3 9.084071
## V6 V6 2.841103
## V7 V7 2.556823
## V9 V9 2.417430
## V8 V8 2.167407
## V10 V10 1.577642
# plotit = FALSE suppresses the graph
rfImp5 = summary(model_5, n.trees = 1000, plotit = FALSE)
rfImp5
## var rel.inf
## V4 V4 24.952144
## V1 V1 23.630928
## V2 V2 19.685740
## V5 V5 11.086713
## V3 V3 9.084071
## V6 V6 2.841103
## V7 V7 2.556823
## V9 V9 2.417430
## V8 V8 2.167407
## V10 V10 1.577642
library(Cubist)
#Cubist Model
model_6 = train(y ~ ., data = simulated[, c(1:11)], method = "cubist")
rfImp6 = varImp(model_6)
rfImp6
## cubist variable importance
##
## Overall
## V1 100.00
## V2 75.69
## V4 68.06
## V3 58.33
## V5 55.56
## V6 15.28
## V9 0.00
## V8 0.00
## V10 0.00
## V7 0.00
A similar pattern occurs in the Cubist model, where the uninformative predictors receive essentially zero importance, whereas the boosted model still assigns them a small amount of relative influence.
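As an illustrative side-by-side (my own sketch; the importance scales differ across methods, so only the orderings are comparable), the rankings from the four approaches can be lined up:
data.frame(
  randomForest = rownames(rfImp1)[order(rfImp1$Overall, decreasing = TRUE)],
  cforest      = names(sort(rfImp4, decreasing = TRUE)),
  boosted      = as.character(rfImp5$var),
  cubist       = rownames(rfImp6$importance)[order(rfImp6$importance$Overall, decreasing = TRUE)]
)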
8.2. Use a simulation to show tree bias with different granularities.
{r}# install.packages("rpart")
library(rpart)
# partykit is used to plot the rpart decision tree
library(partykit)
## Loading required package: libcoin
##
## Attaching package: 'partykit'
## The following objects are masked from 'package:party':
##
## cforest, ctree, ctree_control, edge_simple, mob, mob_control,
## node_barplot, node_bivplot, node_boxplot, node_inner, node_surv,
## node_terminal, varimp
# setting the seed so the simulation is reproducible
set.seed(419)
# Granularity refers to how finely a predictor's values are divided
A1 <- sample(1:10 / 10, 250, replace = TRUE)
# 250 values drawn from 10 possible values (0.1, 0.2, ..., 1.0)
B2 <- sample(1:100 / 100, 250, replace = TRUE)
# 250 values drawn from 100 possible values (0.01 to 1)
C3 <- sample(1:1000 / 1000, 250, replace = TRUE)
# 250 values drawn from 1000 possible values (0.001 to 1)
D4 <- sample(1:10000 / 10000, 250, replace = TRUE)
# 250 values drawn from 10000 possible values (0.0001 to 1)
# All simulated values lie between 0 and 1 and become more finely grained as the granularity increases.
# The response y is the row-wise sum of the four predictors,
# e.g. y = 0.1 + 0.03 + 0.006 + 0.0003 for one row
y = A1 + B2 + C3 + D4
# combine the predictors into a data frame
simulated_data <- data.frame(A1,B2,C3,D4)
# build a decision tree to predict y from the four predictors
# rpart will build a decision tree
model_7 <- rpart(y ~ ., data = simulated_data)
# plotting and finding the important variable
# gpar adjusts the font size
plot(as.party(model_7), gp = gpar(fontsize = 8))
#important variables
sort(model_7$variable.importance, decreasing = TRUE)
## D4 B2 A1 C3
## 24.83758 22.38069 19.57992 19.12128
Even though all four predictors contribute equally to y, the most finely grained predictor (D4) receives the highest importance in this simulation, which is consistent with the selection bias of recursive-partitioning trees toward predictors with more distinct values (more candidate split points).
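As a quick illustrative check (my own sketch), the number of distinct values each predictor offers as candidate split points can be counted directly:
sapply(simulated_data, function(x) length(unique(x)))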
8.3. In stochastic gradient boosting the bagging fraction and learning rate will govern the construction of the trees as they are guided by the gradient. Although the optimal values of these parameters should be obtained through the tuning process, it is helpful to understand how the magnitudes of these parameters affect magnitudes of variable importance. Figure 8.24 provides the variable importance plots for boosting using two extreme values for the bagging fraction (0.1 and 0.9) and the learning rate (0.1 and 0.9) for the solubility data. The left-hand plot has both parameters set to 0.1, and the right-hand plot has both set to 0.9:
Why does the model on the right focus its importance on just the first few predictors, whereas the model on the left spreads importance across more predictors?
The model on the left, with a bagging fraction of 0.1 and a learning rate of 0.1, uses only a small sample of the data for each tree and updates the model slowly, so it keeps exploring a wider set of predictors and the importance is spread across more of them. The model on the right, with a bagging fraction of 0.9 and a learning rate of 0.9, sees most of the data at each iteration and takes large steps, so it quickly settles on the dominant predictors and the importance of the remaining variables drops off.
Which model do you think would be more predictive of other samples?
The left model should be more predictive of new samples because it relies on a more diverse set of predictors rather than a few dominant variables. Its slower learning rate also helps generalization by reducing overfitting, whereas the larger bagging fraction and faster learning rate of the right-hand model make it more prone to overfitting.
How would increasing interaction depth affect the slope of predictor importance for either model in Fig.8.24?
Increasing the interaction depth makes each tree more complex and allows splits on predictors that were not used previously, so importance spreads across more variables and the slope of the predictor-importance plot becomes flatter for both models.
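The solubility data behind Fig. 8.24 is not loaded here, but as a rough sketch of the same idea (my own illustration, reusing the Friedman simulation from 8.1), the two extreme settings of shrinkage and bag.fraction can be compared directly:
gbm_low  <- gbm(y ~ ., data = simulated[, 1:11], distribution = "gaussian",
                n.trees = 1000, shrinkage = 0.1, bag.fraction = 0.1)
gbm_high <- gbm(y ~ ., data = simulated[, 1:11], distribution = "gaussian",
                n.trees = 1000, shrinkage = 0.9, bag.fraction = 0.9)
summary(gbm_low,  plotit = FALSE)   # relative influence with small shrinkage and bagging fraction
summary(gbm_high, plotit = FALSE)   # relative influence with large shrinkage and bagging fraction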
8.7. Refer to Exercises 6.3 and 7.5, which describe a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before and train several tree-based models:
library(AppliedPredictiveModeling)
data(ChemicalManufacturingProcess)
dim(ChemicalManufacturingProcess)
## [1] 176 58
# Filling in missing values by imputation
# pre-process the data with k-nearest neighbours to estimate the missing values
preprocessed_data = preProcess(ChemicalManufacturingProcess, method = "knnImpute") # knnImpute uses the k-nearest-neighbour approach to impute the data
# the predict function applies the imputation to the data
imputed_data = predict(preprocessed_data, newdata = ChemicalManufacturingProcess)
sum(is.na(imputed_data))
## [1] 0
dim(imputed_data)
## [1] 176 58
#### Splitting the data into a training and a test set
# Yield is the response variable that we want to predict
# we use an 80/20 train/test split
split_index = createDataPartition(imputed_data$Yield, p = 0.8, list = FALSE)
train_imputed = imputed_data[split_index, ]
test_imputed = imputed_data[-split_index, ]
# Using 10-fold cross-validation
training_ctrl2 = trainControl(method = "cv", number = 10)
# Single Tree Model
set.seed(123)
cart_model = train(Yield~.,
data=train_imputed,
method = "rpart",
tuneLength = 10,
trControl = training_ctrl2)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
## Predicting using the test data set
cart_pred = predict(cart_model, newdata = test_imputed)
cart_perf = postResample(pred = cart_pred, obs = test_imputed$Yield)
cart_perf
## RMSE Rsquared MAE
## 0.8283422 0.3393586 0.6079676
library(ipred)
# Bagged Tree Model
bagged_model = bagging(Yield~.,
data=train_imputed)
bagged_pred = predict(bagged_model, newdata = test_imputed)
bagged_perf = postResample(pred = bagged_pred, obs = test_imputed$Yield)
bagged_perf
## RMSE Rsquared MAE
## 0.4808078 0.7677526 0.3886573
# Random Forest
rando_model = randomForest(Yield~.,
data=train_imputed,
importance = TRUE,
ntree = 1000)
rando_pred = predict(rando_model, newdata = test_imputed)
rando_perf = postResample(pred = rando_pred, obs = test_imputed$Yield)
rando_perf
## RMSE Rsquared MAE
## 0.4812480 0.7975288 0.3662786
#Boosted Tree Model
boost_model = gbm(Yield~.,
data=train_imputed, distribution = "gaussian")
boost_pred = predict(boost_model, newdata = test_imputed)
## Using 100 trees...
boost_perf = postResample(pred = boost_pred, obs = test_imputed$Yield)
boost_perf
## RMSE Rsquared MAE
## 0.5725295 0.6552084 0.4577019
# Collecting all the model performance results
model_perf = rbind(
CART = cart_perf,
Bagged = bagged_perf,
RandomForest = rando_perf,
Boosted = boost_perf
)
model_perf
## RMSE Rsquared MAE
## CART 0.8283422 0.3393586 0.6079676
## Bagged 0.4808078 0.7677526 0.3886573
## RandomForest 0.4812480 0.7975288 0.3662786
## Boosted 0.5725295 0.6552084 0.4577019
Choosing the best model means looking for a low RMSE (how far predictions fall from the actual values, on the scale of the response), a low mean absolute error (MAE, the average size of the prediction errors), and an R-squared close to 1 (the share of variation in Yield that the model explains). By these criteria the random forest model fits best: its RMSE of about 0.481 is essentially tied with the bagged model's, and it has the lowest MAE (about 0.366) and the highest R-squared (about 0.80).
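As a small illustrative step (my own sketch), the comparison table can also be sorted by RMSE to make the ranking explicit:
model_perf[order(model_perf[, "RMSE"]), ]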
# Variable importance for the optimal tree-based regression model (random forest)
rfplot = varImpPlot(rando_model, type = 2, n.var = 10)
# Support Vector Machine Model (non-linear model)
set.seed(867)
svm_model = train(
Yield ~ .,
data = train_imputed,
method = "svmRadial",
preProcess = c("center", "scale"),
trControl = training_ctrl2,
tuneLength = 10
)
svm_model
## Support Vector Machines with Radial Basis Function Kernel
##
## 144 samples
## 57 predictor
##
## Pre-processing: centered (57), scaled (57)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 130, 128, 130, 130, 131, 129, ...
## Resampling results across tuning parameters:
##
## C RMSE Rsquared MAE
## 0.25 0.7724797 0.5127555 0.6344027
## 0.50 0.7110123 0.5432178 0.5846157
## 1.00 0.6569614 0.5812756 0.5429152
## 2.00 0.6276587 0.6028958 0.5209366
## 4.00 0.6300245 0.5942761 0.5164624
## 8.00 0.6274044 0.5986006 0.5125832
## 16.00 0.6243245 0.6028677 0.5101924
## 32.00 0.6243245 0.6028677 0.5101924
## 64.00 0.6243245 0.6028677 0.5101924
## 128.00 0.6243245 0.6028677 0.5101924
##
## Tuning parameter 'sigma' was held constant at a value of 0.01394861
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.01394861 and C = 16.
varImp(svm_model)
## loess r-squared variable importance
##
## only 20 most important variables shown (out of 57)
##
## Overall
## ManufacturingProcess13 100.00
## ManufacturingProcess32 95.02
## ManufacturingProcess09 84.23
## ManufacturingProcess17 82.26
## BiologicalMaterial06 80.14
## ManufacturingProcess36 73.80
## ManufacturingProcess06 73.03
## BiologicalMaterial03 69.33
## BiologicalMaterial12 63.14
## ManufacturingProcess31 63.06
## BiologicalMaterial02 61.52
## ManufacturingProcess11 56.73
## BiologicalMaterial04 49.80
## BiologicalMaterial11 46.41
## BiologicalMaterial08 46.05
## ManufacturingProcess30 41.82
## ManufacturingProcess33 40.62
## BiologicalMaterial01 38.21
## ManufacturingProcess29 34.82
## ManufacturingProcess25 34.43
plot(varImp(svm_model), 10, main = "SVM Important Variables ")
# Linear Regression Model (fit on the full imputed data set)
lm_model = train(
Yield ~ ., # Use all predictors to predict Yield
data = imputed_data ,
method = "lm", # Linear regression
trControl = training_ctrl2
)
varImp(lm_model)
## lm variable importance
##
## only 20 most important variables shown (out of 56)
##
## Overall
## ManufacturingProcess32 100.00
## ManufacturingProcess28 62.19
## ManufacturingProcess33 61.53
## ManufacturingProcess04 53.02
## ManufacturingProcess37 52.91
## ManufacturingProcess45 40.21
## ManufacturingProcess09 37.76
## ManufacturingProcess29 37.24
## BiologicalMaterial07 36.91
## ManufacturingProcess43 35.30
## BiologicalMaterial05 34.51
## ManufacturingProcess12 24.30
## BiologicalMaterial11 24.11
## ManufacturingProcess27 22.60
## ManufacturingProcess38 22.03
## BiologicalMaterial08 21.34
## ManufacturingProcess03 21.12
## BiologicalMaterial03 20.84
## ManufacturingProcess35 20.48
## ManufacturingProcess24 20.05
plot(varImp(lm_model), 10, main = "Linear Regression Important Variables")
ManufacturingProcess32 appears among the most significant variables in several of the models, and ManufacturingProcess13 also plays a significant role in both the single-tree model and the support vector machine model. Each model ranks the process and biological predictors differently: the random forest and linear regression models are dominated mostly by process variables, with some biological variables appearing further down the ranks, while the support vector machine model gives more weight to the biological variables, with BiologicalMaterial06 ranked fifth overall as its highest-ranked biological variable.
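As a rough side-by-side (my own sketch; the importance scales are not comparable across methods, only the orderings), the top ten predictors from the random forest, SVM, and linear models can be lined up:
rf_imp <- importance(rando_model, type = 2)
data.frame(
  RandomForest = rownames(rf_imp)[order(rf_imp[, 1], decreasing = TRUE)][1:10],
  SVM          = rownames(varImp(svm_model)$importance)[order(varImp(svm_model)$importance$Overall, decreasing = TRUE)][1:10],
  Linear       = rownames(varImp(lm_model)$importance)[order(varImp(lm_model)$importance$Overall, decreasing = TRUE)][1:10]
)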
plot(as.party(cart_model$finalModel), gp = gpar(fontsize = 8))
The single tree shows that ManufacturingProcess32 is its most important variable, since it forms the root split at the top of the tree. The closer a predictor sits to the root, the more important it is, and each branch shows how the different predictors relate to the response variable.
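As a quick check (my own sketch, mirroring the approach used in 8.2), rpart's own variable importance for the final single tree can be listed; it should place ManufacturingProcess32 near the top:
head(sort(cart_model$finalModel$variable.importance, decreasing = TRUE), 10)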