library(party)
## Warning: package 'party' was built under R version 4.3.3
## Loading required package: grid
## Loading required package: mvtnorm
## Warning: package 'mvtnorm' was built under R version 4.3.3
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Warning: package 'strucchange' was built under R version 4.3.3
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
## Warning: package 'sandwich' was built under R version 4.3.3
8.1. Recreate the simulated data from Exercise 7.2, fit a random forest model to all of the predictors, and estimate the variable importance scores.
library(mlbench)
set.seed(200)
simulated <- mlbench.friedman1(200, sd = 1)
simulated <- cbind(simulated$x, simulated$y)
simulated <- as.data.frame(simulated)
colnames(simulated)[ncol(simulated)] <- "y"
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.3.3
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
library(caret)
## Warning: package 'caret' was built under R version 4.3.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.3.3
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
## Loading required package: lattice
model1 <- randomForest(y ~ ., data = simulated, importance = TRUE, ntree = 1000)
rfImp1 <- varImp(model1, scale = FALSE)
Did the random forest model significantly use the uninformative predictors (V6–V10)? No; those predictors receive negligible importance scores.
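A quick way to verify this is to sort the scores in rfImp1; a minimal check, assuming the single "Overall" column that caret's varImp typically returns for a randomForest regression fit (output not shown):
# Importance scores sorted from largest to smallest; V6-V10 should sit near zero
rfImp1[order(rfImp1$Overall, decreasing = TRUE), , drop = FALSE]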
simulated$duplicate1 <- simulated$V1 + rnorm(200) * .1
cor(simulated$duplicate1, simulated$V1)
## [1] 0.9418838
# Fit the random forest model
rf_model <- randomForest(V1 ~ ., data=simulated, importance=TRUE, ntree=500)
# Check the importance of each variable
importance(rf_model)
## %IncMSE IncNodePurity
## V2 5.7810426 0.5499149
## V3 -1.6816794 0.5352620
## V4 1.8694530 0.5611178
## V5 0.1128018 0.4020345
## V6 4.2167282 0.6533169
## V7 1.4100323 0.4049313
## V8 -2.1695949 0.4960107
## V9 -1.7873826 0.3981558
## V10 1.9310055 0.4771813
## y 15.5740843 2.1266941
## duplicate1 57.3870008 8.2622688
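Note that the model above treats V1 as the outcome, so y and duplicate1 appear as predictors. The exercise actually asks how the importance of V1, as a predictor of y, changes once the correlated duplicate is added; a minimal sketch of that comparison, refitting on y (output not shown):
# Refit the forest with y as the response so that V1 and duplicate1 compete as predictors
set.seed(200)
model2 <- randomForest(y ~ ., data = simulated, importance = TRUE, ntree = 1000)
rfImp2 <- varImp(model2, scale = FALSE)
# V1's importance should drop, with part of it shifting to duplicate1
rfImp1["V1", , drop = FALSE]
rfImp2[c("V1", "duplicate1"), , drop = FALSE]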
simulated$duplicate2 <- simulated$V1 + rnorm(nrow(simulated), mean = 0, sd = 0.1)
# Fit the random forest model to the data including the new variables
rf_model <- randomForest(V1 ~ ., data = simulated, importance = TRUE, ntree = 500)
# Print the importance of each variable
print(importance(rf_model))
## %IncMSE IncNodePurity
## V2 1.7924940 0.2282359
## V3 -2.0756508 0.2117414
## V4 1.6701836 0.2046090
## V5 -0.9929312 0.1481494
## V6 1.7144144 0.2946590
## V7 0.2351306 0.1833665
## V8 1.7045039 0.2452011
## V9 -2.2210955 0.1524607
## V10 -1.3595657 0.2027746
## y 9.9304256 1.2044355
## duplicate1 29.8599880 5.7568550
## duplicate2 31.9612410 6.1676491
simulated$duplicate1 <- simulated$V1 + rnorm(200) * .1
cor(simulated$duplicate1, simulated$V1)
## [1] 0.9401422
simulated$duplicate1 <- simulated$V1 + rnorm(nrow(simulated), mean = 0, sd = 0.1)
simulated$duplicate2 <- simulated$V1 + rnorm(nrow(simulated), mean = 0, sd = 0.1)
cf_model <- cforest(V1 ~ ., data = simulated, controls=cforest_unbiased(ntree=500, mtry=2))
traditional_importance <- varimp(cf_model, conditional = FALSE)
# Calculate the modified variable importance (Strobl et al. 2007)
conditional_importance <- varimp(cf_model, conditional = TRUE)
# Print both importance measures
print("Traditional Importance:")
## [1] "Traditional Importance:"
print(traditional_importance)
## V2 V3 V4 V5 V6
## 1.028139e-04 -2.836967e-04 1.298467e-04 -1.990490e-04 6.181554e-04
## V7 V8 V9 V10 y
## -5.720315e-05 2.230324e-04 -2.194198e-04 1.905222e-05 6.137399e-03
## duplicate1 duplicate2
## 3.921562e-02 3.823858e-02
print("Conditional Importance:")
## [1] "Conditional Importance:"
print(conditional_importance)
## V2 V3 V4 V5 V6
## -5.068608e-06 -4.544828e-05 8.024009e-05 -1.014944e-05 3.584621e-04
## V7 V8 V9 V10 y
## 4.577201e-05 -2.574821e-05 -4.233119e-05 -4.610501e-05 1.128660e-03
## duplicate1 duplicate2
## 1.590702e-02 1.262782e-02
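The same caveat applies to the cforest fit above. A sketch of the conditional versus traditional comparison with y as the outcome, using the party functions already loaded (output not shown):
# Conditional inference forest with y as the response
set.seed(200)
cf_y <- cforest(y ~ ., data = simulated, controls = cforest_unbiased(ntree = 500))
# Traditional permutation importance tends to inflate correlated predictors such as duplicate1
varimp(cf_y, conditional = FALSE)
# Conditional importance adjusts for the correlation structure among predictors
varimp(cf_y, conditional = TRUE)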
library(xgboost)
## Warning: package 'xgboost' was built under R version 4.3.3
# Prepare data for xgboost (needs a numeric predictor matrix and a numeric label vector)
data_matrix <- model.matrix(V1 ~ ., data = simulated)[,-1]
dtrain <- xgb.DMatrix(data = data_matrix, label = simulated$V1)
# Fit the model
xgb_model <- xgb.train(data = dtrain, nrounds = 100, objective = "reg:squarederror")
# Get importance
importance_xgb <- xgb.importance(model = xgb_model)
print(importance_xgb)
## Feature Gain Cover Frequency
## 1: duplicate1 0.882551022 0.15884171 0.08196721
## 2: duplicate2 0.089975991 0.13389035 0.08968177
## 3: V4 0.004524742 0.06265308 0.08486017
## 4: V5 0.004397571 0.07325169 0.08003857
## 5: V2 0.003960683 0.06265308 0.13404050
## 6: y 0.003681367 0.05356530 0.04435873
## 7: V6 0.002768246 0.06887948 0.07907425
## 8: V7 0.002097959 0.08847430 0.08775313
## 9: V9 0.001832303 0.08442257 0.08100289
## 10: V10 0.001758903 0.07006982 0.06750241
## 11: V3 0.001242511 0.07169509 0.09546770
## 12: V8 0.001208702 0.07160353 0.07425265
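As with the earlier fits, the boosted model above predicts V1; a sketch of the same xgboost workflow with y as the label, which is what this part of the exercise intends (output not shown):
# Rebuild the design matrix and DMatrix with y as the outcome
X_y <- model.matrix(y ~ ., data = simulated)[, -1]
dtrain_y <- xgb.DMatrix(data = X_y, label = simulated$y)
xgb_y <- xgb.train(data = dtrain_y, nrounds = 100, objective = "reg:squarederror")
# Gain-based importance for the y-outcome model
xgb.importance(model = xgb_y)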
library(Cubist)
## Warning: package 'Cubist' was built under R version 4.3.3
# Fit the Cubist model
cubist_model <- cubist(x = simulated[, !names(simulated) %in% "V1"], y = simulated$V1)
# Get summary which includes variable usage
cubist_summary <- summary(cubist_model)
print(cubist_summary)
##
## Call:
## cubist.default(x = simulated[, !names(simulated) %in% "V1"], y = simulated$V1)
##
##
## Cubist [Release 2.07 GPL Edition] Wed May 15 23:26:31 2024
## ---------------------------------
##
## Target attribute `outcome'
##
## Read 200 cases (13 attributes) from undefined.data
##
## Model:
##
## Rule 1: [200 cases, mean 0.4814613, range 0.002806243 to 0.9989924, est err 0.0564194]
##
## outcome = 0.0125057 + 0.521 duplicate1 + 0.45 duplicate2
##
##
## Evaluation on training data (200 cases):
##
## Average |error| 0.0732740
## Relative |error| 0.31
## Correlation coefficient 0.94
##
##
## Attribute usage:
## Conds Model
##
## 100% duplicate1
## 100% duplicate2
##
##
## Time: 0.0 secs
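Likewise, a Cubist fit with y as the outcome would show whether V1's contribution is shared with the duplicates; a minimal sketch (output not shown):
# Cubist model predicting y; drop the outcome column from the predictor set
cubist_y <- cubist(x = simulated[, !names(simulated) %in% "y"], y = simulated$y)
summary(cubist_y)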
8.2. Use a simulation to show tree bias with different granularities.
library(rpart)
set.seed(123)
# Generate data
n <- 1000
X1 <- runif(n, 0, 1)
Y <- 2 * X1 + rnorm(n)
# Create additional predictors with varying granularities
X2 <- X1 + rnorm(n, 0, 0.01) # Slightly noisy version
X3 <- X1 + rnorm(n, 0, 0.1) # More noisy version
X4 <- X1 + rnorm(n, 0, 0.5) # Even more noisy version
data <- data.frame(Y, X1, X2, X3, X4)
model_dt <- rpart(Y ~ ., data = data)
importance_dt <- as.data.frame(varImp(model_dt))
model_rf <- randomForest(Y ~ ., data = data, importance = TRUE)
importance_rf <- importance(model_rf)
data_matrix <- xgb.DMatrix(data = as.matrix(data[,-1]), label = data$Y)
params <- list(objective = "reg:squarederror", eta = 0.1)
model_xgb <- xgb.train(params, data_matrix, nrounds = 100)
importance_xgb <- xgb.importance(model = model_xgb)
print("Decision Tree Importance:")
## [1] "Decision Tree Importance:"
print(importance_dt)
## Overall
## X1 0.26932050
## X2 0.27318356
## X3 0.24034119
## X4 0.06637222
print("Random Forest Importance:")
## [1] "Random Forest Importance:"
print(importance_rf)
## %IncMSE IncNodePurity
## X1 25.10714 313.5791
## X2 20.17042 316.3687
## X3 14.19792 300.6970
## X4 11.97809 248.2279
print("Boosted Tree Importance:")
## [1] "Boosted Tree Importance:"
print(importance_xgb)
## Feature Gain Cover Frequency
## 1: X1 0.3010930 0.1866074 0.3341685
## 2: X2 0.2850099 0.2176759 0.1509552
## 3: X3 0.2134621 0.3238139 0.2662073
## 4: X4 0.2004350 0.2719028 0.2486690
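The predictors above differ mainly in how noisy they are rather than in granularity. A more direct demonstration of the bias, under the assumption that neither predictor is related to the response and they differ only in their number of distinct values, is sketched below; the tree should choose the finer-grained predictor for its first split far more often:
set.seed(456)
n_sims <- 200
first_split <- character(n_sims)
for (i in seq_len(n_sims)) {
  x_fine <- runif(500) # many distinct values
  x_coarse <- sample(1:4, 500, replace = TRUE) # only four distinct values
  y_noise <- rnorm(500) # response unrelated to either predictor
  fit <- rpart(y_noise ~ x_fine + x_coarse,
    data = data.frame(y_noise, x_fine, x_coarse),
    control = rpart.control(maxdepth = 1, cp = 0))
  first_split[i] <- as.character(fit$frame$var[1])
}
# Tally which predictor was selected at the root of each tree
table(first_split)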
8.3. In stochastic gradient boosting the bagging fraction and learning rate will govern the construction of the trees as they are guided by the gradient. Although the optimal values of these parameters should be obtained through the tuning process, it is helpful to understand how the magnitudes of these parameters affect the magnitudes of variable importance. Figure 8.24 provides the variable importance plots for boosting using two extreme values for the bagging fraction (0.1 and 0.9) and the learning rate (0.1 and 0.9) for the solubility data. The left-hand plot has both parameters set to 0.1, and the right-hand plot has both set to 0.9:
(a) Why does the model on the right focus its importance on just the first few predictors, whereas the model on the left spreads importance across more predictors?
(b) Which model do you think would be more predictive of other samples?
(c) How would increasing interaction depth affect the slope of predictor importance for either model in Fig. 8.24?
(a) The model on the right uses a high learning rate and a high bagging fraction, so each iteration takes large steps toward the gradient and sees nearly all of the training data. The few strongest predictors dominate the early trees and keep being reinforced, so the importance concentrates on just a handful of variables. The left-hand model, with a low learning rate and a low bagging fraction, learns slowly from many different subsamples, so more predictors get a chance to contribute and the importance is spread across them.
(b) The left-hand model (both parameters at 0.1) is likely to be more predictive of other samples: the slow learning rate and the randomness introduced by the small bagging fraction act as regularization and reduce overfitting to the dominant predictors.
(c) Increasing the interaction depth lets each tree use more predictors in its splits, so importance is spread across a larger set of variables and the slope of the importance profile becomes flatter for both models.
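A hedged sketch of how the contrast in Fig. 8.24 could be reproduced on the solubility data, assuming the gbm package and the solTrainXtrans / solTrainY objects from AppliedPredictiveModeling (tuning values chosen only to illustrate the two extremes, output not shown):
library(gbm)
library(AppliedPredictiveModeling)
data(solubility) # provides solTrainXtrans and solTrainY, among others
set.seed(100)
# Both bagging fraction and learning rate at 0.1 (left-hand plot)
gbm_low <- gbm.fit(solTrainXtrans, solTrainY, distribution = "gaussian",
  n.trees = 100, interaction.depth = 1, shrinkage = 0.1,
  bag.fraction = 0.1, verbose = FALSE)
# Both bagging fraction and learning rate at 0.9 (right-hand plot)
gbm_high <- gbm.fit(solTrainXtrans, solTrainY, distribution = "gaussian",
  n.trees = 100, interaction.depth = 1, shrinkage = 0.9,
  bag.fraction = 0.9, verbose = FALSE)
# Relative influence: the 0.9/0.9 fit should concentrate on a few predictors
head(summary(gbm_low, plotit = FALSE), 10)
head(summary(gbm_high, plotit = FALSE), 10)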
8.7. Refer to Exercises 6.3 and 7.5, which describe a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before and train several tree-based models:
(a) Which tree-based regression model gives the optimal resampling and test set performance?
(b) Which predictors are most important in the optimal tree-based regression model? Do either the biological or process variables dominate the list? How do the top 10 important predictors compare to the top 10 predictors from the optimal linear and nonlinear models?
(c) Plot the optimal single tree with the distribution of yield in the terminal nodes. Does this view of the data provide additional knowledge about the biological or process predictors and their relationship with yield?
library(AppliedPredictiveModeling)
## Warning: package 'AppliedPredictiveModeling' was built under R version 4.3.3
library(missForest)
## Warning: package 'missForest' was built under R version 4.3.3
library(randomForest)
library(Cubist)
library(caret)
data(ChemicalManufacturingProcess)
df <- ChemicalManufacturingProcess
# Impute missing values
df_imp1 <- missForest(df)$ximp
## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values. Are you sure you want to do
## regression?
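The exercise asks for the same imputation and pre-processing as Exercises 6.3 and 7.5. If those steps were based on caret's preProcess (an assumption here; missForest is used above instead), an equivalent alternative would be:
# Hypothetical alternative: bagged-tree imputation plus centering and scaling via caret
pp <- preProcess(df[, setdiff(names(df), "Yield")], method = c("bagImpute", "center", "scale"))
df_imp2 <- predict(pp, df[, setdiff(names(df), "Yield")])
df_imp2$Yield <- df$Yield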
# Split data into training and test sets
set.seed(123)
trainIndex <- createDataPartition(df_imp1$Yield, p = 0.75, list = FALSE)
trainData <- df_imp1[trainIndex, ]
testData <- df_imp1[-trainIndex, ]
# Define training control
trainControl <- trainControl(method = "cv", number = 10)
# Train a random forest model
model_rf <- train(Yield ~ ., data = trainData, method = "rf", trControl = trainControl)
# Train other models for comparison
model_lm <- train(Yield ~ ., data = trainData, method = "lm", trControl = trainControl)
## Warning in predict.lm(modelFit, newdata): prediction from rank-deficient fit;
## attr(*, "non-estim") has doubtful cases
## Warning in predict.lm(modelFit, newdata): prediction from rank-deficient fit;
## attr(*, "non-estim") has doubtful cases
# Evaluate models on test set
predictions_rf <- predict(model_rf, testData)
predictions_lm <- predict(model_lm, testData)
perf_rf <- postResample(predictions_rf, testData$Yield)
perf_lm <- postResample(predictions_lm, testData$Yield)
# Print model performances
print(perf_rf)
## RMSE Rsquared MAE
## 1.1910134 0.5352042 0.8415213
print(perf_lm)
## RMSE Rsquared MAE
## 1.5931977 0.4132146 1.3186301
# Train a Cubist model
model_cubist <- train(Yield ~ ., data = trainData, method = "cubist", trControl = trainControl)
# Evaluate the Cubist model on test set
predictions_cubist <- predict(model_cubist, testData)
perf_cubist <- postResample(predictions_cubist, testData$Yield)
# Print Cubist model performance
print(perf_cubist)
## RMSE Rsquared MAE
## 1.0022078 0.6733054 0.8074259
(a) The Cubist model gives the best test-set performance (RMSE 1.00, Rsquared 0.67), ahead of the random forest (RMSE 1.19, Rsquared 0.54) and the linear model (RMSE 1.59, Rsquared 0.41).
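Part (a) also asks for several tree-based models; for completeness, a sketch adding a single regression tree and a stochastic gradient boosting model through the same caret workflow (default tuning grids, output not shown):
set.seed(123)
model_rpart <- train(Yield ~ ., data = trainData, method = "rpart",
  trControl = trainControl, tuneLength = 10)
model_gbm <- train(Yield ~ ., data = trainData, method = "gbm",
  trControl = trainControl, tuneLength = 5, verbose = FALSE)
# Test-set performance for the additional models
postResample(predict(model_rpart, testData), testData$Yield)
postResample(predict(model_gbm, testData), testData$Yield)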
# Calculate variable importance
importance <- varImp(model_cubist, scale = FALSE)
# Print the importance of predictors
print(importance)
## cubist variable importance
##
## only 20 most important variables shown (out of 57)
##
## Overall
## ManufacturingProcess32 46.0
## ManufacturingProcess17 44.5
## ManufacturingProcess09 20.5
## ManufacturingProcess06 17.5
## ManufacturingProcess13 14.5
## BiologicalMaterial06 13.0
## BiologicalMaterial02 12.0
## ManufacturingProcess02 11.5
## ManufacturingProcess33 11.5
## ManufacturingProcess04 7.5
## ManufacturingProcess01 7.0
## ManufacturingProcess45 7.0
## ManufacturingProcess05 7.0
## BiologicalMaterial03 6.5
## ManufacturingProcess37 6.0
## ManufacturingProcess29 4.5
## ManufacturingProcess31 4.5
## BiologicalMaterial10 4.5
## ManufacturingProcess15 4.5
## BiologicalMaterial08 4.0
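For part (c), the optimal single tree can be refit and drawn with the distribution of Yield in its terminal nodes; a sketch using rpart and partykit (assuming partykit is installed; one could also prune at the cp value chosen by cross-validation):
library(rpart)
library(partykit)
# Fit a single regression tree to the training data
single_tree <- rpart(Yield ~ ., data = trainData)
# as.party() displays boxplots of Yield in the terminal nodes
plot(as.party(single_tree))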