All libraries needed for the Homework

library(fpp3)
## Warning: package 'fpp3' was built under R version 4.3.3
## Registered S3 method overwritten by 'tsibble':
##   method               from 
##   as_tibble.grouped_df dplyr
## -- Attaching packages -------------------------------------------- fpp3 1.0.0 --
## v tibble      3.2.1     v tsibble     1.1.5
## v dplyr       1.1.2     v tsibbledata 0.4.1
## v tidyr       1.3.0     v feasts      0.3.2
## v lubridate   1.9.2     v fable       0.3.4
## v ggplot2     3.5.1     v fabletools  0.4.2
## Warning: package 'ggplot2' was built under R version 4.3.3
## Warning: package 'tsibble' was built under R version 4.3.3
## Warning: package 'tsibbledata' was built under R version 4.3.3
## Warning: package 'feasts' was built under R version 4.3.3
## Warning: package 'fabletools' was built under R version 4.3.3
## Warning: package 'fable' was built under R version 4.3.3
## -- Conflicts ------------------------------------------------- fpp3_conflicts --
## x lubridate::date()    masks base::date()
## x dplyr::filter()      masks stats::filter()
## x tsibble::intersect() masks base::intersect()
## x tsibble::interval()  masks lubridate::interval()
## x dplyr::lag()         masks stats::lag()
## x tsibble::setdiff()   masks base::setdiff()
## x tsibble::union()     masks base::union()
library(forecast)
## Warning: package 'forecast' was built under R version 4.3.3
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(tidyverse)
## -- Attaching core tidyverse packages ------------------------ tidyverse 2.0.0 --
## v forcats 1.0.0     v readr   2.1.4
## v purrr   1.0.1     v stringr 1.5.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter()     masks stats::filter()
## x tsibble::interval() masks lubridate::interval()
## x dplyr::lag()        masks stats::lag()
## i Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(lubridate)
library(tsibble)
library(pracma)
## Warning: package 'pracma' was built under R version 4.3.3
## 
## Attaching package: 'pracma'
## 
## The following object is masked from 'package:purrr':
## 
##     cross
library(mlbench)
## Warning: package 'mlbench' was built under R version 4.3.3
library(corrplot)
## corrplot 0.92 loaded
library(e1071)
## Warning: package 'e1071' was built under R version 4.3.1
## 
## Attaching package: 'e1071'
## 
## The following object is masked from 'package:pracma':
## 
##     sigmoid
## 
## The following object is masked from 'package:fabletools':
## 
##     interpolate
library(psych)
## Warning: package 'psych' was built under R version 4.3.1
## 
## Attaching package: 'psych'
## 
## The following objects are masked from 'package:pracma':
## 
##     logit, polar
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(imputeTS)
## Warning: package 'imputeTS' was built under R version 4.3.3
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.3.1
## 
## Attaching package: 'gridExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
library(caret)
## Warning: package 'caret' was built under R version 4.3.1
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
## 
## The following objects are masked from 'package:fabletools':
## 
##     MAE, RMSE
library(mlbench)
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.3.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## 
## The following object is masked from 'package:gridExtra':
## 
##     combine
## 
## The following object is masked from 'package:psych':
## 
##     outlier
## 
## The following object is masked from 'package:ggplot2':
## 
##     margin
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
library(party)
## Warning: package 'party' was built under R version 4.3.3
## Loading required package: grid
## Loading required package: mvtnorm
## Warning: package 'mvtnorm' was built under R version 4.3.3
## Loading required package: modeltools
## Warning: package 'modeltools' was built under R version 4.3.1
## Loading required package: stats4
## 
## Attaching package: 'modeltools'
## 
## The following object is masked from 'package:fabletools':
## 
##     refit
## 
## Loading required package: strucchange
## Warning: package 'strucchange' was built under R version 4.3.3
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.3.1
## 
## Attaching package: 'zoo'
## 
## The following object is masked from 'package:imputeTS':
## 
##     na.locf
## 
## The following object is masked from 'package:tsibble':
## 
##     index
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Loading required package: sandwich
## Warning: package 'sandwich' was built under R version 4.3.2
## 
## Attaching package: 'strucchange'
## 
## The following object is masked from 'package:stringr':
## 
##     boundary
## 
## 
## Attaching package: 'party'
## 
## The following object is masked from 'package:fabletools':
## 
##     response
## 
## The following object is masked from 'package:dplyr':
## 
##     where
library(gbm)
## Warning: package 'gbm' was built under R version 4.3.3
## Loaded gbm 2.2.2
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
library(rpart)
## Warning: package 'rpart' was built under R version 4.3.3
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.3.3
library(ipred)
## Warning: package 'ipred' was built under R version 4.3.3
library(AppliedPredictiveModeling)    
## Warning: package 'AppliedPredictiveModeling' was built under R version 4.3.3

8.1. Recreate the simulated data from Exercise 7.2: > library(mlbench) > set.seed(200) > simulated <- mlbench.friedman1(200, sd = 1) > simulated <- cbind(simulated$x, simulated$y) > simulated <- as.data.frame(simulated) > colnames(simulated)[ncol(simulated)] <- "y"

(a). Fit a random forest model to all of the predictors, then estimate the variable importance scores: > library(randomForest) > library(caret) > model1 <- randomForest(y ~ ., data = simulated, + importance = TRUE, + ntree = 1000)

Did the random forest model significantly use the uninformative predictors (V6 – V10)?

  (b). Now add an additional predictor that is highly correlated with one of the informative predictors. Fit another random forest model to these data. Did the importance score for V1 change? What happens when you add another predictor that is also highly correlated with V1?

  (c). Use the cforest function in the party package to fit a random forest model using conditional inference trees. The party package function varimp can calculate predictor importance. The conditional argument of that function toggles between the traditional importance measure and the modified version described in Strobl et al. (2007). Do these importances show the same pattern as the traditional random forest model?

  (d). Repeat this process with different tree models, such as boosted trees and Cubist. Does the same pattern occur?

set.seed(200)
simulated <- mlbench.friedman1(200, sd = 1)
simulated <- cbind(simulated$x, simulated$y)
simulated <- as.data.frame(simulated)
colnames(simulated)[ncol(simulated)] <- "y"

(a). Fit a random forest model to all of the predictors

rf_model <- randomForest(y ~ ., data = simulated, 
                       importance = TRUE,
                       ntree = 1000)
rf_imp <- varImp(rf_model, scale = FALSE)

rf_imp
##          Overall
## V1   8.732235404
## V2   6.415369387
## V3   0.763591825
## V4   7.615118809
## V5   2.023524577
## V6   0.165111172
## V7  -0.005961659
## V8  -0.166362581
## V9  -0.095292651
## V10 -0.074944788

The random forest model does not make significant use of the uninformative predictors: the importance scores for V6-V10 are essentially zero (several are slightly negative), while V1-V5 account for virtually all of the importance.
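As a quick visual check (a sketch, assuming the rf_model object above is still in memory), the permutation importances can be plotted; V6-V10 cluster around zero while V1-V5 stand well apart.

# Sketch: plot the permutation importance (type = 1, %IncMSE) of rf_model;
# the uninformative predictors V6-V10 sit near zero.
varImpPlot(rf_model, type = 1, main = "Permutation importance (rf_model)")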

  (b). Now add an additional predictor that is highly correlated with one of the informative predictors. For example:
simulated$duplicate1 <- simulated$V1 + rnorm(200) * .1
cor(simulated$duplicate1, simulated$V1)
## [1] 0.9460206

Fit another random forest model to these data. Did the importance score for V1 change? What happens when you add another predictor that is also highly correlated with V1?

rf_model_2 <- randomForest(y ~ ., 
                      data = simulated,
                       importance = TRUE,
                       ntree = 1000)

rf_imp_2 <- varImp(rf_model_2 , scale = FALSE)

rf_imp_2 |>
  arrange(desc(Overall)) |>
  knitr::kable()
|           |    Overall|
|:----------|----------:|
|V4         |  7.0475224|
|V2         |  6.0689606|
|V1         |  5.6911997|
|duplicate1 |  4.2833158|
|V5         |  1.8723844|
|V3         |  0.6297022|
|V6         |  0.1356906|
|V10        |  0.0289481|
|V9         |  0.0084044|
|V7         | -0.0134564|
|V8         | -0.0437056|
simulated$duplicate2 <- simulated$V1 + rnorm(200) * .1
cor(simulated$duplicate2, simulated$V1)
## [1] 0.9408631
rf_model_3 <- randomForest(y ~ ., 
                      data = simulated,
                       importance = TRUE,
                       ntree = 1000)

rf_imp_3 <- varImp(rf_model_3 , scale = FALSE)

rf_imp_3 |>
  arrange(desc(Overall)) |>
  knitr::kable()
|           |    Overall|
|:----------|----------:|
|V4         |  7.0487092|
|V2         |  6.5281650|
|V1         |  4.9168733|
|duplicate1 |  3.8006823|
|V5         |  2.0311556|
|duplicate2 |  1.8772196|
|V3         |  0.5871155|
|V6         |  0.1421315|
|V7         |  0.1099199|
|V10        |  0.0923058|
|V9         | -0.0107503|
|V8         | -0.0840569|

From the table above we can see that the importance score for V1 dropped and that V2 and V4 now rank above it. Adding a second highly correlated predictor dilutes V1 further: the duplicates absorb part of the credit that previously went to V1, so V1's importance keeps decreasing while its near-copies pick up modest scores of their own.
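A minimal sketch of why this happens: V1, duplicate1, and duplicate2 carry nearly the same information, so the trees split the credit among them. Their pairwise correlations make this explicit.

# Sketch: pairwise correlations among V1 and its near-copies; all are close to 1,
# so the importance formerly attributed to V1 alone is now shared.
cor(simulated[, c("V1", "duplicate1", "duplicate2")])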

  (c). Use the cforest function in the party package to fit a random forest model using conditional inference trees. The party package function varimp can calculate predictor importance. The conditional argument of that function toggles between the traditional importance measure and the modified version described in Strobl et al. (2007). Do these importances show the same pattern as the traditional random forest model?
crf_model <- cforest(y ~ ., data = simulated[, c(1:11)])
crf_imp <- varimp(crf_model, conditional = TRUE)
crf_imp
##            V1            V2            V3            V4            V5 
##  5.699856e+00  5.190114e+00  6.219633e-03  6.741766e+00  1.182547e+00 
##            V6            V7            V8            V9           V10 
## -1.104739e-02  1.491985e-05 -8.383608e-03 -7.355064e-03 -2.342891e-02

The importance scores show a very similar overall pattern to the traditional random forest model: V1-V5 carry the importance and V6-V10 sit near zero. In the cforest fit, V4 is ranked most important and V1 second. For comparison with the traditional measure, see the sketch below.
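For comparison, the traditional (unconditional) permutation measure can also be computed from the same cforest fit; this is only a sketch, and the exact numbers will vary with the random seed.

# Sketch: traditional permutation importance from the same cforest fit,
# for comparison with the conditional version above.
varimp(crf_model, conditional = FALSE)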

  (d). Repeat this process with different tree models, such as boosted trees and Cubist. Does the same pattern occur?
                             #Boosted Tree model
bt_model <- gbm(y ~ .,
                    data = simulated[,c(1:11)],
                    distribution = "gaussian",
                    n.trees=1000)

summary(bt_model, plotit=F) |>
  dplyr::select(-var) |>
  knitr::kable()
|    |   rel.inf|
|:---|---------:|
|V4  | 24.250181|
|V1  | 22.715850|
|V2  | 20.431478|
|V5  | 11.471817|
|V3  |  9.355412|
|V7  |  3.262117|
|V6  |  2.643509|
|V8  |  1.972263|
|V9  |  1.969329|
|V10 |  1.928044|
                        #Cubist model

cb_model <- train(y ~ .,
                     data = simulated[,c(1:11)],
                     method = "cubist")

varImp(cb_model$finalModel, scale = FALSE) |>
  arrange(desc(Overall)) |>
  knitr::kable()
|    | Overall|
|:---|-------:|
|V1  |    72.0|
|V2  |    54.5|
|V4  |    49.0|
|V3  |    42.0|
|V5  |    40.0|
|V6  |    11.0|
|V7  |     0.0|
|V8  |     0.0|
|V9  |     0.0|
|V10 |     0.0|

The boosted tree shows a similar split between informative (V1-V5) and uninformative (V6-V10) predictors; the main difference is that V4 has a higher influence than V1. In the Cubist model, V1 is once again the most important predictor, V2 ranks above V4, V4 above V3, and V7-V10 receive zero importance. So the same broad pattern occurs across the tree models; the sketch below lines the rankings up side by side.
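A rough comparison sketch (it assumes the rf_imp, crf_imp, bt_model, and cb_model objects created above are still in memory) ranks V1-V10 by importance for each fit; rank 1 is most important.

# Sketch: rank V1-V10 by importance (1 = most important) for each model fitted above.
bt_imp <- summary(bt_model, plotit = FALSE)           # gbm relative influence
cb_imp <- varImp(cb_model$finalModel, scale = FALSE)  # Cubist usage-based importance
vars   <- paste0("V", 1:10)

data.frame(
  predictor    = vars,
  randomForest = rank(-rf_imp[vars, "Overall"]),
  cforest      = rank(-crf_imp[vars]),
  gbm          = rank(-bt_imp$rel.inf[match(vars, bt_imp$var)]),
  cubist       = rank(-cb_imp[vars, "Overall"])
)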

8.2. Use a simulation to show tree bias with different granularities.

set.seed(4)
a <- sample(1:10 / 10, 200, replace = TRUE)
b <- sample(1:100 / 100, 200, replace = TRUE)
c <- sample(1:1000 / 1000, 200, replace = TRUE)
d <- sample(1:10000 / 10000, 200, replace = TRUE)

y <- a + b + c + rnorm(200, mean=0, sd=5)

simulation_data<- data.frame(a, b, c, d, y) 

r_part_tree <- rpart(y ~ ., data = simulation_data)

rpart.plot(r_part_tree, 
           cex = 0.8,  # Adjust font size
           branch = 1,  # Adjust branch length
           split.cex = 0.7) 

Looking at the tree, the finer-grained predictors are selected in the top nodes even though they are no more informative than the coarser ones (d is pure noise). The simulation illustrates that regression trees are biased toward predictors with more distinct values, i.e., higher granularity; the sketch below quantifies this over repeated simulations.
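A small follow-up sketch (an addition, not part of the original assignment) makes the bias easier to see than a single tree: repeat the simulation and tally which predictor rpart uses at the root split.

# Sketch: repeat the simulation 100 times and record the root-split variable.
# If trees were unbiased, d (pure noise) should almost never be chosen.
set.seed(4)
first_split <- replicate(100, {
  a <- sample(1:10 / 10, 200, replace = TRUE)
  b <- sample(1:100 / 100, 200, replace = TRUE)
  c <- sample(1:1000 / 1000, 200, replace = TRUE)
  d <- sample(1:10000 / 10000, 200, replace = TRUE)
  y <- a + b + c + rnorm(200, mean = 0, sd = 5)
  fit <- rpart(y ~ ., data = data.frame(a, b, c, d, y))
  as.character(fit$frame$var[1])   # "<leaf>" if no split was made
})
table(first_split)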

8.3 - In stochastic gradient boosting the bagging fraction and learning rate will govern the construction of the trees as they are guided by the gradient. Although the optimal values of these parameters should be obtained through the tuning process, it is helpful to understand how the magnitudes of these parameters affect magnitudes of variable importance. Figure 8.24 provides the variable importance plots for boosting using two extreme values for the bagging fraction (0.1 and 0.9) and the learning rate (0.1 and 0.9) for the solubility data. The left-hand plot has both parameters set to 0.1, and the right-hand plot has both set to 0.9:

  (a). Why does the model on the right focus its importance on just the first few predictors, whereas the model on the left spreads importance across more predictors?
  (b). Which model do you think would be more predictive of other samples?
  (c). How would increasing interaction depth affect the slope of predictor importance for either model in Fig. 8.24?
library(imager)
## Warning: package 'imager' was built under R version 4.3.3
## Loading required package: magrittr
## 
## Attaching package: 'magrittr'
## The following objects are masked from 'package:pracma':
## 
##     and, mod, or
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract
## 
## Attaching package: 'imager'
## The following object is masked from 'package:magrittr':
## 
##     add
## The following object is masked from 'package:party':
## 
##     where
## The following object is masked from 'package:strucchange':
## 
##     boundary
## The following object is masked from 'package:grid':
## 
##     depth
## The following object is masked from 'package:randomForest':
## 
##     grow
## The following object is masked from 'package:stringr':
## 
##     boundary
## The following object is masked from 'package:tidyr':
## 
##     fill
## The following object is masked from 'package:dplyr':
## 
##     where
## The following objects are masked from 'package:stats':
## 
##     convolve, spectrum
## The following object is masked from 'package:graphics':
## 
##     frame
## The following object is masked from 'package:base':
## 
##     save.image
im = load.image("C:/Users/Staff/Dropbox/MS in Data Science/DATA 624 - Predictive Analytics/Week 12 - Trees and Rules/image.PNG")

plot(im)

  (a). Why does the model on the right focus its importance on just the first few predictors, whereas the model on the left spreads importance across more predictors?

The model on the right uses a high learning rate and a high bagging fraction (both 0.9). With a large learning rate, each tree contributes a large step toward the gradient, so the ensemble commits quickly to the few predictors chosen by the first trees; with a large bagging fraction, every tree sees most of the same data, so those same predictors keep getting selected. The model on the left (both parameters 0.1) learns more slowly from smaller random subsets of the data, so importance is spread across many more predictors.
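The effect can be sketched with the solubility data that ships with AppliedPredictiveModeling (an assumption: Fig. 8.24 in the text is based on this data set; the object names solTrainXtrans and solTrainY come from data(solubility), and the small n.trees is only to keep the sketch fast).

# Sketch: fit boosted models at the two extremes of Fig. 8.24 and compare
# how concentrated the relative influence is.
data(solubility)
sol <- cbind(solTrainXtrans, Solubility = solTrainY)

gbm_left  <- gbm(Solubility ~ ., data = sol, distribution = "gaussian",
                 n.trees = 100, shrinkage = 0.1, bag.fraction = 0.1,
                 interaction.depth = 1)
gbm_right <- gbm(Solubility ~ ., data = sol, distribution = "gaussian",
                 n.trees = 100, shrinkage = 0.9, bag.fraction = 0.9,
                 interaction.depth = 1)

head(summary(gbm_left,  plotit = FALSE), 10)   # influence spread over many predictors
head(summary(gbm_right, plotit = FALSE), 10)   # influence concentrated on a few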

  (b). Which model do you think would be more predictive of other samples?

Since the model on the left has the lower bagging fraction and learning rate, it is less likely to overfit and should therefore be more predictive of other samples; it will generalize better to new data than the model on the right.
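This can be checked on the held-out solubility test set (solTestXtrans / solTestY), reusing gbm_left and gbm_right from the sketch above; this is only a rough illustration, since neither model is tuned.

# Sketch: compare test-set accuracy of the two extreme models from the sketch above.
pred_left  <- predict(gbm_left,  solTestXtrans, n.trees = 100)
pred_right <- predict(gbm_right, solTestXtrans, n.trees = 100)
rbind(left_0.1  = postResample(pred_left,  solTestY),
      right_0.9 = postResample(pred_right, solTestY))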

  (c). How would increasing interaction depth affect the slope of predictor importance for either model in Fig. 8.24?

Interaction depth is the maximum number of splits (the depth) of each tree in the ensemble. Increasing it lets each tree involve more predictors, so importance is spread over more variables and the slope of the predictor-importance plot flattens for both models, with the smaller predictors contributing relatively more.
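A sketch of the effect, reusing the sol data frame built in the sketch above (again an assumption about the data set and settings): holding the other parameters fixed and increasing interaction.depth spreads non-zero influence across more predictors.

# Sketch: same boosting settings except for interaction.depth.
gbm_shallow <- gbm(Solubility ~ ., data = sol, distribution = "gaussian",
                   n.trees = 100, shrinkage = 0.1, bag.fraction = 0.5,
                   interaction.depth = 1)
gbm_deep    <- gbm(Solubility ~ ., data = sol, distribution = "gaussian",
                   n.trees = 100, shrinkage = 0.1, bag.fraction = 0.5,
                   interaction.depth = 10)

# Count predictors with non-zero relative influence in each fit.
c(shallow = sum(summary(gbm_shallow, plotit = FALSE)$rel.inf > 0),
  deep    = sum(summary(gbm_deep,    plotit = FALSE)$rel.inf > 0))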

8.7 - Refer to Exercises 6.3 and 7.5 which describe a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before and train several tree-based models:

  (a). Which tree-based regression model gives the optimal resampling and test set performance?

  (b). Which predictors are most important in the optimal tree-based regression model? Do either the biological or process variables dominate the list? How do the top 10 important predictors compare to the top 10 predictors from the optimal linear and nonlinear models?

  (c). Plot the optimal single tree with the distribution of yield in the terminal nodes. Does this view of the data provide additional knowledge about the biological or process predictors and their relationship with yield?


set.seed(10)

data(ChemicalManufacturingProcess)

# imputation
impute_chemical <- preProcess(ChemicalManufacturingProcess, method = "bagImpute")
Chemical <- predict(impute_chemical , ChemicalManufacturingProcess)

# filtering low frequencies
Chemical <- Chemical[, -nearZeroVar(Chemical)]

set.seed(45)

# index for training
index <- createDataPartition(Chemical$Yield, p = .8, list = FALSE)

# train 
train_x_chemical <- Chemical[index, -1]
train_y_chemical <- Chemical[index, 1]

# test
test_x_chemical <- Chemical[-index, -1]
test_y_chemical <- Chemical[-index, 1]
  (a). Which tree-based regression model gives the optimal resampling and test set performance?
                            #Cart model
cart_model <- train(train_x_chemical, train_y_chemical,
                  method = "rpart",
                  tuneLength = 10,
                  trControl = trainControl(method = "cv"))
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
cart_prediction <- predict(cart_model, test_x_chemical)

postResample(cart_prediction, test_y_chemical)
##      RMSE  Rsquared       MAE 
## 1.6093758 0.3143007 1.2096219
                         #Bagged Trees model


bagged_tree_model <- ipredbagg(train_y_chemical, train_x_chemical)
bagged_tree_prediction <- predict(bagged_tree_model, test_x_chemical)
postResample(bagged_tree_prediction, test_y_chemical)
##      RMSE  Rsquared       MAE 
## 1.4632904 0.4175478 1.1440904
                       #Random Forest model


rf_model <- randomForest(train_x_chemical, train_y_chemical, 
                        importance = TRUE,
                        ntree = 1000)

rf_prediction<- predict(rf_model, test_x_chemical)

postResample(rf_prediction, test_y_chemical)
##      RMSE  Rsquared       MAE 
## 1.3632890 0.4831346 1.0754396
                         #Cubist

cubist_model<- train(train_x_chemical , train_y_chemical , 
                     method = "cubist")

cubist_prediction <- predict(cubist_model, test_x_chemical)

postResample(cubist_prediction, test_y_chemical)
##      RMSE  Rsquared       MAE 
## 1.3197331 0.5541123 1.0088960
                  #comparing the models

rbind(cart = postResample(cart_prediction, test_y_chemical),
      bagged = postResample(bagged_tree_prediction, test_y_chemical),
      randomForest = postResample(rf_prediction, test_y_chemical),
      cubist = postResample(cubist_prediction , test_y_chemical))
##                  RMSE  Rsquared      MAE
## cart         1.609376 0.3143007 1.209622
## bagged       1.463290 0.4175478 1.144090
## randomForest 1.363289 0.4831346 1.075440
## cubist       1.319733 0.5541123 1.008896

Since the Cubist model has the highest R2 and lowest RMSE on the test set, it gives the best test set performance of the four tree-based models.
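The question also asks about resampling performance. A sketch of how to read it off for the two models trained through caret::train (the bagged tree and random forest above were fit directly, so they would need to be refit with train() for a full resampling comparison):

# Sketch: cross-validation / bootstrap estimates at the chosen tuning parameters.
merge(cart_model$bestTune,   cart_model$results)    # 10-fold CV estimates for CART
merge(cubist_model$bestTune, cubist_model$results)  # bootstrap estimates for Cubist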

  (b). Which predictors are most important in the optimal tree-based regression model? Do either the biological or process variables dominate the list? How do the top 10 important predictors compare to the top 10 predictors from the optimal linear and nonlinear models?

We examine the Cubist model, since it is the optimal tree-based regression model from the comparison above.

#10 most important predictors
varImp(cubist_model, scale = TRUE)
## cubist variable importance
## 
##   only 20 most important variables shown (out of 56)
## 
##                        Overall
## ManufacturingProcess32  100.00
## ManufacturingProcess13   79.05
## ManufacturingProcess17   48.57
## ManufacturingProcess09   46.67
## BiologicalMaterial03     30.48
## BiologicalMaterial12     30.48
## BiologicalMaterial06     24.76
## ManufacturingProcess04   20.95
## ManufacturingProcess24   19.05
## BiologicalMaterial05     19.05
## ManufacturingProcess33   17.14
## ManufacturingProcess11   16.19
## ManufacturingProcess27   16.19
## ManufacturingProcess37   14.29
## ManufacturingProcess06   12.38
## BiologicalMaterial04     12.38
## BiologicalMaterial11     11.43
## ManufacturingProcess19   11.43
## BiologicalMaterial10     11.43
## ManufacturingProcess28   10.48
plot(varImp(cubist_model), 10)

The top two predictors in the tree-based (Cubist) model, ManufacturingProcess32 and ManufacturingProcess13, are the same as in the nonlinear SVM model. BiologicalMaterial06, ranked fourth in the nonlinear model, also appears in the Cubist top 10 (seventh here), and ManufacturingProcess17 is likewise shared between the nonlinear and tree-based models. Process variables dominate the top 10: six of the ten are manufacturing process predictors and four are biological materials.
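For the comparison against the earlier models, a sketch of how the Cubist top 10 can be pulled out as a character vector; svm_top10 below is a hypothetical placeholder for the corresponding vector from the Exercise 7.5 model, which is not refit here.

# Sketch: extract the top 10 Cubist predictors by scaled importance.
cb_imp_chem  <- varImp(cubist_model)$importance
cubist_top10 <- rownames(cb_imp_chem)[order(-cb_imp_chem$Overall)][1:10]
cubist_top10
# intersect(cubist_top10, svm_top10)   # svm_top10: hypothetical vector from Exercise 7.5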

  (c). Plot the optimal single tree with the distribution of yield in the terminal nodes. Does this view of the data provide additional knowledge about the biological or process predictors and their relationship with yield?
single_tree <- rpart(Yield ~ ., 
                     data = Chemical[index, ])

rpart.plot(single_tree)

The top split is ManufacturingProcess32, which is consistent across all the other models and was expected. The single tree, however, relies on more biological predictors than the more robust ensemble models do. This view does add knowledge: the terminal nodes show how specific combinations of process and biological predictors partition the samples into groups with different average yields, and the single tree is easy to interpret.
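A short sketch (an addition, not in the original write-up) of how the yield distribution in each terminal node can be summarized numerically from the fitted tree, complementing the plot:

# Sketch: summarize the training-set yield within each terminal node of single_tree.
leaf_id <- single_tree$where   # terminal node (row of the tree frame) per training sample
tapply(train_y_chemical, leaf_id, summary)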