library(mlbench)
library(randomForest)
library(caret)
library(tidyverse)
library(party)
library(gbm)
library(Cubist)
library(rpart)
8.1)
Did the random forest model significantly use the uninformative predictors (V6 – V10)?
set.seed(200)
# Simulate the Friedman 1 benchmark data: V1-V5 are informative, V6-V10 are noise
simulated <- mlbench.friedman1(200, sd = 1)
simulated <- cbind(simulated$x, simulated$y)
simulated <- as.data.frame(simulated)
colnames(simulated)[ncol(simulated)] <- "y"
model1 <- randomForest(y ~ ., data = simulated, importance = TRUE, ntree = 1000)
rfImp1 <- varImp(model1, scale = FALSE)
rfImp1 %>% arrange(desc(Overall))
## Overall
## V1 8.732235404
## V4 7.615118809
## V2 6.415369387
## V5 2.023524577
## V3 0.763591825
## V6 0.165111172
## V7 -0.005961659
## V10 -0.074944788
## V9 -0.095292651
## V8 -0.166362581
No: the importance scores for V6–V10 are all essentially zero (roughly -0.17 to 0.17), far below those of the informative predictors V1–V5, so the random forest made little use of the uninformative predictors.

Fit another random forest model to these data. Did the importance score for V1 change? What happens when you add another predictor that is also highly correlated with V1?
# Add a new predictor that is highly correlated with V1:
simulated$duplicate1 <- simulated$V1 + rnorm(200) * .1
cor(simulated$duplicate1, simulated$V1)
## [1] 0.9460206
set.seed(123)
model_2 <- randomForest(y ~ .,
                        data = simulated,
                        importance = TRUE, ntree = 1000)
new_imp <- varImp(model_2, scale = FALSE)
new_imp %>% arrange(desc(Overall))
## Overall
## V4 6.84585204
## V2 6.17711385
## V1 5.62937024
## duplicate1 4.32662911
## V5 1.81782072
## V3 0.73469142
## V6 0.06863788
## V9 -0.04279069
## V7 -0.04750291
## V10 -0.04786506
## V8 -0.07015860
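Yes: V1's importance dropped from about 8.73 to 5.63, with the new duplicate1 predictor absorbing much of the importance the two correlated predictors now share. To see the shift directly, the two importance tables can be merged by row name (a minimal sketch; duplicate1 gets an NA for the first fit, where it did not yet exist):
# Side-by-side importances before and after adding the correlated predictor
comparison <- merge(rfImp1, new_imp, by = "row.names", all = TRUE)
names(comparison) <- c("predictor", "before_duplicate", "after_duplicate")
comparison[order(-comparison$after_duplicate), ]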
Use the cforest function in the party package to fit a random forest model using conditional inference trees. The party package function varimp can calculate predictor importance. The conditional argument of that function toggles between the traditional importance measure and the modified version described in Strobl et al. (2007). Do these importances show the same pattern as the traditional random forest model?
c_model <- cforest(y ~ ., data = simulated)
rank_conditional <- varimp(c_model, conditional = TRUE)
rank_unconditional <- varimp(c_model, conditional = FALSE)
# Combine all results into one data frame, indexing the varimp vectors by
# new_imp's row names so each row refers to a single predictor (sorting the
# vectors before cbind would silently misalign them with new_imp's rows):
rank_comparison <- cbind(new_imp,
                         rank_conditional = rank_conditional[rownames(new_imp)],
                         rank_unconditional = rank_unconditional[rownames(new_imp)])
# Rename the first column:
names(rank_comparison)[1] <- 'original_ranking'
rank_comparison
## original_ranking rank_conditional rank_unconditional
## V1 5.62937024 5.973580188 7.392804129
## V2 6.17711385 4.801569808 6.093296844
## V3 0.73469142 1.947654295 5.085813474
## V4 6.84585204 1.915734317 4.428669947
## V5 1.81782072 1.018403069 1.864273405
## V6 0.06863788 0.027304648 0.015577318
## V7 -0.04750291 0.006510864 -0.007276374
## V8 -0.07015860 0.006153603 -0.013143895
## V9 -0.04279069 0.001293016 -0.024148669
## V10 -0.04786506 -0.001297792 -0.029442569
## duplicate1 4.32662911 -0.008865821 -0.029797156
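The cforest fit above uses the package defaults; for a closer match to Strobl et al. (2007) one would typically fit with cforest_unbiased controls and more trees, which stabilizes the permutation-based importances (a sketch under those assumptions; output not shown):
# Conditional inference forest with unbiased controls and more trees
c_model2 <- cforest(y ~ ., data = simulated,
                    controls = cforest_unbiased(ntree = 1000, mtry = 3))
sort(varimp(c_model2, conditional = TRUE), decreasing = TRUE)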
Broadly, yes: under both the conditional and unconditional measures the informative predictors receive the largest scores while V6–V10 remain near zero; the main difference is that the conditional measure damps the importance shared by the correlated V1/duplicate1 pair, as Strobl et al. intend.

Repeat this process with different tree models, such as boosted trees and Cubist. Does the same pattern occur?
Using the boosted model:
set.seed(123)
# Remove the duplicate1 column:
simulated <- simulated %>% select(-duplicate1)
# Fit the boosted tree model:
og_gbm_model <- train(y ~ .,
                      data = simulated,
                      method = 'gbm',
                      distribution = "gaussian",
                      verbose = FALSE)
varImp(og_gbm_model)
## gbm variable importance
##
## Overall
## V4 100.0000
## V1 99.5369
## V2 93.1077
## V5 45.6778
## V3 29.8281
## V7 2.4474
## V8 2.1192
## V6 1.5888
## V10 0.2352
## V9 0.0000
Using the Cubist model:
set.seed(123)
simulated_x <- simulated %>% select(-y)
# Named cubist_model to avoid masking the Cubist package's cubist() function
cubist_model <- train(x = simulated_x, y = simulated$y,
                      method = 'cubist')
varImp(cubist_model)
## cubist variable importance
##
## Overall
## V1 100.00
## V2 75.69
## V4 68.06
## V3 58.33
## V5 55.56
## V6 15.28
## V8 0.00
## V10 0.00
## V7 0.00
## V9 0.00
Yes, the same pattern occurs: in both the boosted tree and Cubist models the informative predictors V1–V5 dominate the rankings while V6–V10 receive little or no importance.

8.2) Use a simulation to show tree bias with different granularities.
set.seed(3341)
# Repeat the process 1000 times:
n_iteration <- 1000
# Store the variable chosen for the first split of each tree:
root_vars <- character(n_iteration)
# Main loop for repeating the modeling 1000 times:
for (i in 1:n_iteration){
  # Predictors of low, mid, and high granularity; none is related to y:
  low <- sample(c('A', 'B'), 100, replace = TRUE)
  mid <- sample(letters[1:5], 100, replace = TRUE)
  high <- rnorm(100)
  y <- rnorm(100)
  df <- data.frame(y = y,
                   low = factor(low),
                   mid = factor(mid),
                   high = high)
  test_model <- rpart(y ~ ., data = df)
  # Extract the root node split variable:
  root_var <- as.character(test_model$frame$var[1])
  root_vars[i] <- root_var
}
# Count which variable is chosen at the root for each level of granularity:
table(root_vars)
## root_vars
## high low mid
## 591 53 356
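The continuous predictor high is chosen for the root split far more often than the two-level and five-level factors, even though none of the predictors are related to y: the tree is biased toward predictors with more distinct values, and hence more candidate split points. Expressed as proportions (a one-line follow-up on the counts above):
# Share of simulations in which each predictor is chosen at the root
round(prop.table(table(root_vars)), 3)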
8.3)
8.7. Refer to Exercises 6.3 and 7.5, which describe a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before and train several tree-based models:
library(AppliedPredictiveModeling)
data(ChemicalManufacturingProcess)
Split the data into training and test sets:
set.seed(123)
# Check for missing data:
is.na(ChemicalManufacturingProcess) %>% sum() # 106 missing values
## [1] 106
# Impute the missing values with k-nearest neighbors (knnImpute in caret also
# centers and scales the predictors):
imputed_data <- preProcess(ChemicalManufacturingProcess, method = 'knnImpute')
# Apply the imputation to the dataset:
imputed_chem <- predict(imputed_data, ChemicalManufacturingProcess)
imputed_chem %>% is.na() %>% sum() # 0 missing values
## [1] 0
# Split the data 70/30:
chem_train_idx <- createDataPartition(imputed_chem$Yield, p = 0.7, list = FALSE)
# Training set:
chem_train_x <- imputed_chem[chem_train_idx, ] %>% select(-Yield)
chem_train_y <- imputed_chem$Yield[chem_train_idx]
# Test set:
chem_test_x <- imputed_chem[-chem_train_idx, ] %>% select(-Yield)
chem_test_y <- imputed_chem$Yield[-chem_train_idx]
# Define the 10-fold cross-validation control:
cv <- trainControl(method = 'cv', number = 10)
Random forest model:
set.seed(50)
# Tune mtry:
rf_tune <- expand.grid(.mtry = 5:20)
rf_model <- train(x = chem_train_x,
                  y = chem_train_y,
                  method = 'rf',
                  tuneGrid = rf_tune,
                  trControl = cv,
                  ntree = 1000)
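For parity with the resampling plots shown for the other two models, the profile over mtry can be inspected as well (output not shown here):
# RMSE across the candidate mtry values from 10-fold cross-validation
plot(rf_model)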
Boosted tree model:
set.seed(50)
# Define the tuning grid:
gbm_tune <- expand.grid(interaction.depth = seq(1, 7, by = 2),
                        n.trees = seq(100, 1000, by = 50),
                        shrinkage = c(0.01, 0.1),
                        n.minobsinnode = 10)
gbm_model <- train(x = chem_train_x,
                   y = chem_train_y,
                   method = 'gbm',
                   tuneGrid = gbm_tune,
                   trControl = cv,
                   verbose = FALSE)
plot(gbm_model)
Cubist model:
set.seed(50)
cub_tune <- expand.grid(committees = 1:10,
                        neighbors = 1:4)
cub_model <- train(x = chem_train_x,
                   y = chem_train_y,
                   method = 'cubist',
                   trControl = cv,
                   tuneGrid = cub_tune)
plot(cub_model)
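The winning hyperparameter combination for each model can be read off caret's bestTune field (a quick check; output not shown here):
# Cross-validation-selected tuning parameters for each model
rf_model$bestTune
gbm_model$bestTune
cub_model$bestTune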
Compile the test-set predictions and compare the three models:
rf_pred <- predict(rf_model, chem_test_x)
gbm_pred <- predict(gbm_model, chem_test_x)
cub_pred <- predict(cub_model, chem_test_x)
list(randomForest = postResample(rf_pred, chem_test_y),
     Boosted = postResample(gbm_pred, chem_test_y),
     cubist = postResample(cub_pred, chem_test_y))
## $randomForest
## RMSE Rsquared MAE
## 0.4945301 0.7537722 0.3894710
##
## $Boosted
## RMSE Rsquared MAE
## 0.5707246 0.6443742 0.4609822
##
## $cubist
## RMSE Rsquared MAE
## 0.6228166 0.5759597 0.4516156
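Because all three models were tuned with the same seed and the same 10-fold CV control, their resampling distributions can also be compared directly with caret's resamples (a sketch; output not shown here):
# Collect and summarize the cross-validation results of the three models
res <- resamples(list(rf = rf_model, gbm = gbm_model, cubist = cub_model))
summary(res)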
The random forest model gives the best test-set performance (lowest RMSE and MAE, highest R-squared), so we examine its ten most important predictors:
plot(varImp(rf_model), top = 10)
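As a quick numeric follow-up to the plot, the top ten predictors can be tabulated by type (a sketch; predictor names in this data set begin with either "BiologicalMaterial" or "ManufacturingProcess", and the counts depend on the random split above):
# Rank predictors by random forest importance and tabulate the top 10 by type
imp <- varImp(rf_model)$importance
top10 <- imp[order(-imp$Overall), , drop = FALSE][1:10, , drop = FALSE]
table(ifelse(grepl("^Biological", rownames(top10)), "Biological", "Process"))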