library(party)
## Warning: package 'party' was built under R version 4.3.3
## Loading required package: grid
## Loading required package: mvtnorm
## Warning: package 'mvtnorm' was built under R version 4.3.3
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Warning: package 'strucchange' was built under R version 4.3.3
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
## Warning: package 'sandwich' was built under R version 4.3.3
8.1. Recreate the simulated data from Exercise 7.2, fit a random forest model to all of the predictors, and estimate the variable importance scores.
library(mlbench)
set.seed(200)
simulated <- mlbench.friedman1(200, sd = 1)
simulated <- cbind(simulated$x, simulated$y)
simulated <- as.data.frame(simulated)
colnames(simulated)[ncol(simulated)] <- "y"
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.3.3
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
library(caret)
## Warning: package 'caret' was built under R version 4.3.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.3.3
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
## Loading required package: lattice
model1 <- randomForest(y ~ ., data = simulated, importance = TRUE, ntree = 1000)
rfImp1 <- varImp(model1, scale = FALSE)
Did the random forest model significantly use the uninformative predictors (V6–V10)? No; those predictors receive negligible importance scores.
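A quick way to verify this is to sort the scores in rfImp1; a minimal check, assuming the single "Overall" column that caret's varImp typically returns for a randomForest regression fit (output not shown):
# Importance scores sorted from largest to smallest; V6-V10 should sit near zero
rfImp1[order(rfImp1$Overall, decreasing = TRUE), , drop = FALSE]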
simulated$duplicate1 <- simulated$V1 + rnorm(200) * .1
cor(simulated$duplicate1, simulated$V1)
## [1] 0.9418838
# Fit the random forest model
rf_model <- randomForest(V1 ~ ., data=simulated, importance=TRUE, ntree=500)
# Check the importance of each variable
importance(rf_model)
## %IncMSE IncNodePurity
## V2 5.7810426 0.5499149
## V3 -1.6816794 0.5352620
## V4 1.8694530 0.5611178
## V5 0.1128018 0.4020345
## V6 4.2167282 0.6533169
## V7 1.4100323 0.4049313
## V8 -2.1695949 0.4960107
## V9 -1.7873826 0.3981558
## V10 1.9310055 0.4771813
## y 15.5740843 2.1266941
## duplicate1 57.3870008 8.2622688
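Note that the model above treats V1 as the outcome, so y and duplicate1 appear as predictors. The exercise actually asks how the importance of V1, as a predictor of y, changes once the correlated duplicate is added; a minimal sketch of that comparison, refitting on y (output not shown):
# Refit the forest with y as the response so that V1 and duplicate1 compete as predictors
set.seed(200)
model2 <- randomForest(y ~ ., data = simulated, importance = TRUE, ntree = 1000)
rfImp2 <- varImp(model2, scale = FALSE)
# V1's importance should drop, with part of it shifting to duplicate1
rfImp1["V1", , drop = FALSE]
rfImp2[c("V1", "duplicate1"), , drop = FALSE]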
simulated$duplicate2 <- simulated$V1 + rnorm(nrow(simulated), mean = 0, sd = 0.1)
# Fit the random forest model to the data including the new variables
rf_model <- randomForest(V1 ~ ., data = simulated, importance = TRUE, ntree = 500)
# Print the importance of each variable
print(importance(rf_model))
## %IncMSE IncNodePurity
## V2 1.7924940 0.2282359
## V3 -2.0756508 0.2117414
## V4 1.6701836 0.2046090
## V5 -0.9929312 0.1481494
## V6 1.7144144 0.2946590
## V7 0.2351306 0.1833665
## V8 1.7045039 0.2452011
## V9 -2.2210955 0.1524607
## V10 -1.3595657 0.2027746
## y 9.9304256 1.2044355
## duplicate1 29.8599880 5.7568550
## duplicate2 31.9612410 6.1676491
simulated$duplicate1 <- simulated$V1 + rnorm(200) * .1
cor(simulated$duplicate1, simulated$V1)
## [1] 0.9401422
simulated$duplicate1 <- simulated$V1 + rnorm(nrow(simulated), mean = 0, sd = 0.1)
simulated$duplicate2 <- simulated$V1 + rnorm(nrow(simulated), mean = 0, sd = 0.1)
cf_model <- cforest(V1 ~ ., data = simulated, controls=cforest_unbiased(ntree=500, mtry=2))
traditional_importance <- varimp(cf_model, conditional = FALSE)
# Calculate the modified variable importance (Strobl et al. 2007)
conditional_importance <- varimp(cf_model, conditional = TRUE)
# Print both importance measures
print("Traditional Importance:")
## [1] "Traditional Importance:"
print(traditional_importance)
## V2 V3 V4 V5 V6
## 1.028139e-04 -2.836967e-04 1.298467e-04 -1.990490e-04 6.181554e-04
## V7 V8 V9 V10 y
## -5.720315e-05 2.230324e-04 -2.194198e-04 1.905222e-05 6.137399e-03
## duplicate1 duplicate2
## 3.921562e-02 3.823858e-02
print("Conditional Importance:")
## [1] "Conditional Importance:"
print(conditional_importance)
## V2 V3 V4 V5 V6
## -5.068608e-06 -4.544828e-05 8.024009e-05 -1.014944e-05 3.584621e-04
## V7 V8 V9 V10 y
## 4.577201e-05 -2.574821e-05 -4.233119e-05 -4.610501e-05 1.128660e-03
## duplicate1 duplicate2
## 1.590702e-02 1.262782e-02
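The same caveat applies to the cforest fit above. A sketch of the conditional versus traditional comparison with y as the outcome, using the party functions already loaded (output not shown):
# Conditional inference forest with y as the response
set.seed(200)
cf_y <- cforest(y ~ ., data = simulated, controls = cforest_unbiased(ntree = 500))
# Traditional permutation importance tends to inflate correlated predictors such as duplicate1
varimp(cf_y, conditional = FALSE)
# Conditional importance adjusts for the correlation structure among predictors
varimp(cf_y, conditional = TRUE)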
library(xgboost)
## Warning: package 'xgboost' was built under R version 4.3.3
# Prepare data for xgboost (needs a numeric predictor matrix and a numeric label vector)
data_matrix <- model.matrix(V1 ~ ., data = simulated)[,-1]
dtrain <- xgb.DMatrix(data = data_matrix, label = simulated$V1)
# Fit the model
xgb_model <- xgb.train(data = dtrain, nrounds = 100, objective = "reg:squarederror")
# Get importance
importance_xgb <- xgb.importance(model = xgb_model)
print(importance_xgb)
## Feature Gain Cover Frequency
## 1: duplicate1 0.882551022 0.15884171 0.08196721
## 2: duplicate2 0.089975991 0.13389035 0.08968177
## 3: V4 0.004524742 0.06265308 0.08486017
## 4: V5 0.004397571 0.07325169 0.08003857
## 5: V2 0.003960683 0.06265308 0.13404050
## 6: y 0.003681367 0.05356530 0.04435873
## 7: V6 0.002768246 0.06887948 0.07907425
## 8: V7 0.002097959 0.08847430 0.08775313
## 9: V9 0.001832303 0.08442257 0.08100289
## 10: V10 0.001758903 0.07006982 0.06750241
## 11: V3 0.001242511 0.07169509 0.09546770
## 12: V8 0.001208702 0.07160353 0.07425265
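As with the earlier fits, the boosted model above predicts V1; a sketch of the same xgboost workflow with y as the label, which is what this part of the exercise intends (output not shown):
# Rebuild the design matrix and DMatrix with y as the outcome
X_y <- model.matrix(y ~ ., data = simulated)[, -1]
dtrain_y <- xgb.DMatrix(data = X_y, label = simulated$y)
xgb_y <- xgb.train(data = dtrain_y, nrounds = 100, objective = "reg:squarederror")
# Gain-based importance for the y-outcome model
xgb.importance(model = xgb_y)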
library(Cubist)
## Warning: package 'Cubist' was built under R version 4.3.3
# Fit the Cubist model
cubist_model <- cubist(x = simulated[, !names(simulated) %in% "V1"], y = simulated$V1)
# Get summary which includes variable usage
cubist_summary <- summary(cubist_model)
print(cubist_summary)
##
## Call:
## cubist.default(x = simulated[, !names(simulated) %in% "V1"], y = simulated$V1)
##
##
## Cubist [Release 2.07 GPL Edition] Wed May 15 23:26:31 2024
## ---------------------------------
##
## Target attribute `outcome'
##
## Read 200 cases (13 attributes) from undefined.data
##
## Model:
##
## Rule 1: [200 cases, mean 0.4814613, range 0.002806243 to 0.9989924, est err 0.0564194]
##
## outcome = 0.0125057 + 0.521 duplicate1 + 0.45 duplicate2
##
##
## Evaluation on training data (200 cases):
##
## Average |error| 0.0732740
## Relative |error| 0.31
## Correlation coefficient 0.94
##
##
## Attribute usage:
## Conds Model
##
## 100% duplicate1
## 100% duplicate2
##
##
## Time: 0.0 secs
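Likewise, a Cubist fit with y as the outcome would show whether V1's contribution is shared with the duplicates; a minimal sketch (output not shown):
# Cubist model predicting y; drop the outcome column from the predictor set
cubist_y <- cubist(x = simulated[, !names(simulated) %in% "y"], y = simulated$y)
summary(cubist_y)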
8.2. Use a simulation to show tree bias with different granularities.
library(rpart)
set.seed(123)
# Generate data
n <- 1000
X1 <- runif(n, 0, 1)
Y <- 2 * X1 + rnorm(n)
# Create additional predictors with varying granularities
X2 <- X1 + rnorm(n, 0, 0.01) # Slightly noisy version
X3 <- X1 + rnorm(n, 0, 0.1) # More noisy version
X4 <- X1 + rnorm(n, 0, 0.5) # Even more noisy version
data <- data.frame(Y, X1, X2, X3, X4)
model_dt <- rpart(Y ~ ., data = data)
importance_dt <- as.data.frame(varImp(model_dt))
model_rf <- randomForest(Y ~ ., data = data, importance = TRUE)
importance_rf <- importance(model_rf)
data_matrix <- xgb.DMatrix(data = as.matrix(data[,-1]), label = data$Y)
params <- list(objective = "reg:squarederror", eta = 0.1)
model_xgb <- xgb.train(params, data_matrix, nrounds = 100)
importance_xgb <- xgb.importance(model = model_xgb)
print("Decision Tree Importance:")
## [1] "Decision Tree Importance:"
print(importance_dt)
## Overall
## X1 0.26932050
## X2 0.27318356
## X3 0.24034119
## X4 0.06637222
print("Random Forest Importance:")
## [1] "Random Forest Importance:"
print(importance_rf)
## %IncMSE IncNodePurity
## X1 25.10714 313.5791
## X2 20.17042 316.3687
## X3 14.19792 300.6970
## X4 11.97809 248.2279
print("Boosted Tree Importance:")
## [1] "Boosted Tree Importance:"
print(importance_xgb)
## Feature Gain Cover Frequency
## 1: X1 0.3010930 0.1866074 0.3341685
## 2: X2 0.2850099 0.2176759 0.1509552
## 3: X3 0.2134621 0.3238139 0.2662073
## 4: X4 0.2004350 0.2719028 0.2486690
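The predictors above differ mainly in how noisy they are rather than in granularity. A more direct demonstration of the bias, under the assumption that neither predictor is related to the response and they differ only in their number of distinct values, is sketched below; the tree should choose the finer-grained predictor for its first split far more often:
set.seed(456)
n_sims <- 200
first_split <- character(n_sims)
for (i in seq_len(n_sims)) {
  x_fine <- runif(500) # many distinct values
  x_coarse <- sample(1:4, 500, replace = TRUE) # only four distinct values
  y_noise <- rnorm(500) # response unrelated to either predictor
  fit <- rpart(y_noise ~ x_fine + x_coarse,
    data = data.frame(y_noise, x_fine, x_coarse),
    control = rpart.control(maxdepth = 1, cp = 0))
  first_split[i] <- as.character(fit$frame$var[1])
}
# Tally which predictor was selected at the root of each tree
table(first_split)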
8.3. In stochastic gradient boosting the bagging fraction and learning rate will govern the construction of the trees as they are guided by the gradient. Although the optimal values of these parameters should be obtained through the tuning process, it is helpful to understand how the magnitudes of these parameters affect the magnitudes of variable importance. Figure 8.24 provides the variable importance plots for boosting using two extreme values for the bagging fraction (0.1 and 0.9) and the learning rate (0.1 and 0.9) for the solubility data. The left-hand plot has both parameters set to 0.1, and the right-hand plot has both set to 0.9:
(a) Why does the model on the right focus its importance on just the first few predictors, whereas the model on the left spreads importance across more predictors?
(b) Which model do you think would be more predictive of other samples?
(c) How would increasing interaction depth affect the slope of predictor importance for either model in Fig. 8.24?
(a) The model on the right uses a high learning rate and a high bagging fraction, so each iteration takes large steps toward the gradient and sees nearly all of the training data. The few strongest predictors dominate the early trees and keep being reinforced, so the importance concentrates on just a handful of variables. The left-hand model, with a low learning rate and a low bagging fraction, learns slowly from many different subsamples, so more predictors get a chance to contribute and the importance is spread across them.
(b) The left-hand model (both parameters at 0.1) is likely to be more predictive of other samples: the slow learning rate and the randomness introduced by the small bagging fraction act as regularization and reduce overfitting to the dominant predictors.
(c) Increasing the interaction depth lets each tree use more predictors in its splits, so importance is spread across a larger set of variables and the slope of the importance profile becomes flatter for both models.
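A hedged sketch of how the contrast in Fig. 8.24 could be reproduced on the solubility data, assuming the gbm package and the solTrainXtrans / solTrainY objects from AppliedPredictiveModeling (tuning values chosen only to illustrate the two extremes, output not shown):
library(gbm)
library(AppliedPredictiveModeling)
data(solubility) # provides solTrainXtrans and solTrainY, among others
set.seed(100)
# Both bagging fraction and learning rate at 0.1 (left-hand plot)
gbm_low <- gbm.fit(solTrainXtrans, solTrainY, distribution = "gaussian",
  n.trees = 100, interaction.depth = 1, shrinkage = 0.1,
  bag.fraction = 0.1, verbose = FALSE)
# Both bagging fraction and learning rate at 0.9 (right-hand plot)
gbm_high <- gbm.fit(solTrainXtrans, solTrainY, distribution = "gaussian",
  n.trees = 100, interaction.depth = 1, shrinkage = 0.9,
  bag.fraction = 0.9, verbose = FALSE)
# Relative influence: the 0.9/0.9 fit should concentrate on a few predictors
head(summary(gbm_low, plotit = FALSE), 10)
head(summary(gbm_high, plotit = FALSE), 10)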
8.7. Refer to Exercises 6.3 and 7.5, which describe a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before and train several tree-based models:
(a) Which tree-based regression model gives the optimal resampling and test set performance?
(b) Which predictors are most important in the optimal tree-based regression model? Do either the biological or process variables dominate the list? How do the top 10 important predictors compare to the top 10 predictors from the optimal linear and nonlinear models?
(c) Plot the optimal single tree with the distribution of yield in the terminal nodes. Does this view of the data provide additional knowledge about the biological or process predictors and their relationship with yield?
library(AppliedPredictiveModeling)
## Warning: package 'AppliedPredictiveModeling' was built under R version 4.3.3
library(missForest)
## Warning: package 'missForest' was built under R version 4.3.3
library(randomForest)
library(Cubist)
library(caret)
data(ChemicalManufacturingProcess)
df <- ChemicalManufacturingProcess
# Impute missing values
df_imp1 <- missForest(df)$ximp
## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values. Are you sure you want to do
## regression?
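The exercise asks for the same imputation and pre-processing as Exercises 6.3 and 7.5. If those steps were based on caret's preProcess (an assumption here; missForest is used above instead), an equivalent alternative would be:
# Hypothetical alternative: bagged-tree imputation plus centering and scaling via caret
pp <- preProcess(df[, setdiff(names(df), "Yield")], method = c("bagImpute", "center", "scale"))
df_imp2 <- predict(pp, df[, setdiff(names(df), "Yield")])
df_imp2$Yield <- df$Yield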
# Split data into training and test sets
set.seed(123)
trainIndex <- createDataPartition(df_imp1$Yield, p = 0.75, list = FALSE)
trainData <- df_imp1[trainIndex, ]
testData <- df_imp1[-trainIndex, ]
# Define training control
trainControl <- trainControl(method = "cv", number = 10)
# Train a random forest model
model_rf <- train(Yield ~ ., data = trainData, method = "rf", trControl = trainControl)
# Train other models for comparison
model_lm <- train(Yield ~ ., data = trainData, method = "lm", trControl = trainControl)
## Warning in predict.lm(modelFit, newdata): prediction from rank-deficient fit;
## attr(*, "non-estim") has doubtful cases
## Warning in predict.lm(modelFit, newdata): prediction from rank-deficient fit;
## attr(*, "non-estim") has doubtful cases
# Evaluate models on test set
predictions_rf <- predict(model_rf, testData)
predictions_lm <- predict(model_lm, testData)
perf_rf <- postResample(predictions_rf, testData$Yield)
perf_lm <- postResample(predictions_lm, testData$Yield)
# Print model performances
print(perf_rf)
## RMSE Rsquared MAE
## 1.1910134 0.5352042 0.8415213
print(perf_lm)
## RMSE Rsquared MAE
## 1.5931977 0.4132146 1.3186301
# Train a Cubist model
model_cubist <- train(Yield ~ ., data = trainData, method = "cubist", trControl = trainControl)
# Evaluate the Cubist model on test set
predictions_cubist <- predict(model_cubist, testData)
perf_cubist <- postResample(predictions_cubist, testData$Yield)
# Print Cubist model performance
print(perf_cubist)
## RMSE Rsquared MAE
## 1.0022078 0.6733054 0.8074259
(a) The Cubist model gives the best test-set performance (RMSE 1.00, Rsquared 0.67), ahead of the random forest (RMSE 1.19, Rsquared 0.54) and the linear model (RMSE 1.59, Rsquared 0.41).
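Part (a) also asks for several tree-based models; for completeness, a sketch adding a single regression tree and a stochastic gradient boosting model through the same caret workflow (default tuning grids, output not shown):
set.seed(123)
model_rpart <- train(Yield ~ ., data = trainData, method = "rpart",
  trControl = trainControl, tuneLength = 10)
model_gbm <- train(Yield ~ ., data = trainData, method = "gbm",
  trControl = trainControl, tuneLength = 5, verbose = FALSE)
# Test-set performance for the additional models
postResample(predict(model_rpart, testData), testData$Yield)
postResample(predict(model_gbm, testData), testData$Yield)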
# Calculate variable importance
importance <- varImp(model_cubist, scale = FALSE)
# Print the importance of predictors
print(importance)
## cubist variable importance
##
## only 20 most important variables shown (out of 57)
##
## Overall
## ManufacturingProcess32 46.0
## ManufacturingProcess17 44.5
## ManufacturingProcess09 20.5
## ManufacturingProcess06 17.5
## ManufacturingProcess13 14.5
## BiologicalMaterial06 13.0
## BiologicalMaterial02 12.0
## ManufacturingProcess02 11.5
## ManufacturingProcess33 11.5
## ManufacturingProcess04 7.5
## ManufacturingProcess01 7.0
## ManufacturingProcess45 7.0
## ManufacturingProcess05 7.0
## BiologicalMaterial03 6.5
## ManufacturingProcess37 6.0
## ManufacturingProcess29 4.5
## ManufacturingProcess31 4.5
## BiologicalMaterial10 4.5
## ManufacturingProcess15 4.5
## BiologicalMaterial08 4.0
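For part (c), the optimal single tree can be refit and drawn with the distribution of Yield in its terminal nodes; a sketch using rpart and partykit (assuming partykit is installed; one could also prune at the cp value chosen by cross-validation):
library(rpart)
library(partykit)
# Fit a single regression tree to the training data
single_tree <- rpart(Yield ~ ., data = trainData)
# as.party() displays boxplots of Yield in the terminal nodes
plot(as.party(single_tree))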