Libraries
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(mlbench)
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
library(vip)
##
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
##
## vi
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:randomForest':
##
## combine
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(partykit)
## Loading required package: grid
## Loading required package: libcoin
## Loading required package: mvtnorm
library(gbm)
## Loaded gbm 2.2.2
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
library(rpart)
library(AppliedPredictiveModeling)
8.1
set.seed(200)
simulated <- mlbench.friedman1(200, sd = 1)
simulated <- cbind(simulated$x, simulated$y)
simulated <- as.data.frame(simulated)
colnames(simulated)[ncol(simulated)] <- "y"
a.
model1 <- randomForest(y ~ ., data = simulated, importance = TRUE, ntree = 1000)
rfImp1 <- model1$importance #varImp(model1, scale = FALSE)
vip(model1, color = 'purple', fill='green') + ggtitle('Model1 Var Imp')

model1 <- randomForest(y ~ .,
                       data = simulated,
                       importance = TRUE,
                       ntree = 1000)
rfImp1 <- varImp(model1, scale = FALSE)
rfImp1 |>
  arrange(desc(Overall)) |>
  knitr::kable()
| Variable |    Overall |
|:---------|-----------:|
| V1       |  8.6895807 |
| V4       |  7.6869946 |
| V2       |  6.4296506 |
| V5       |  2.3690477 |
| V3       |  0.7471133 |
| V6       |  0.1099727 |
| V10      |  0.0489594 |
| V7       |  0.0308325 |
| V8       | -0.1077942 |
| V9       | -0.1281411 |
The random forest model did not make significant use of the uninformative predictors (V6-V10); their importance scores are all near zero.
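The table above is the permutation-based measure. As a quick cross-check (a minimal sketch, assuming model1 is still in memory), both importance measures stored by randomForest can be inspected directly, and only V1-V5 stand clearly above zero:
# model1 stores two importance measures when importance = TRUE:
# %IncMSE (permutation-based) and IncNodePurity (total decrease in node impurity)
round(randomForest::importance(model1, scale = FALSE), 3)
# Predictors whose permutation importance is clearly above zero
# (the cutoff of 1 is arbitrary and used only for illustration)
imp <- randomForest::importance(model1, scale = FALSE)[, "%IncMSE"]
names(imp)[imp > 1]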
b.
simulated$duplicate1 <- simulated$V1 + rnorm(200) * .1
cor(simulated$duplicate1, simulated$V1)
## [1] 0.9418838
model2 <- randomForest(y ~ ., data = simulated, importance = TRUE, ntree = 1000)
rfImp2 <- varImp(model2, scale = FALSE)
vip(model2, color = 'blue', fill='red') + ggtitle('Model2 Var Imp')

model2 <- randomForest(y ~ .,
                       data = simulated,
                       importance = TRUE,
                       ntree = 1000)
rfImp2 <- varImp(model2, scale = FALSE)
rfImp2 |>
  arrange(desc(Overall)) |>
  knitr::kable()
| Variable   |    Overall |
|:-----------|-----------:|
| V4         |  6.9156852 |
| V2         |  6.1651556 |
| V1         |  5.8027373 |
| duplicate1 |  3.9673109 |
| V5         |  2.2239289 |
| V3         |  0.6478974 |
| V6         |  0.2058123 |
| V7         |  0.0453553 |
| V10        |  0.0182078 |
| V9         | -0.0290585 |
| V8         | -0.0448675 |
The importance score for V1 decreased substantially because the forest now splits its credit between V1 and the highly correlated duplicate1. The scores for the other informative predictors also decreased slightly.
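If yet another predictor highly correlated with V1 is added, the dilution should continue; a minimal sketch (duplicate2 is a name introduced here only for illustration):
# Add a second near-copy of V1 and refit; V1's importance is expected
# to be shared across the three correlated columns
simulated$duplicate2 <- simulated$V1 + rnorm(200) * .1
model3 <- randomForest(y ~ .,
                       data = simulated,
                       importance = TRUE,
                       ntree = 1000)
varImp(model3, scale = FALSE) |>
  arrange(desc(Overall)) |>
  knitr::kable()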
c.
cforestModel <- cforest(y ~ .,
                        data = simulated[, c(1:11)])
data.frame(Importance = varimp(cforestModel)) |>
  arrange(desc(Importance)) |>
  knitr::kable()
| Variable | Importance |
|:---------|-----------:|
| V1       |  8.2808523 |
| V4       |  7.0858387 |
| V2       |  6.0391727 |
| V5       |  2.0483184 |
| V3       |  0.3150098 |
| V7       |  0.1048612 |
| V6       | -0.1314990 |
| V10      | -0.1365436 |
| V9       | -0.2769216 |
| V8       | -0.2912119 |
data.frame(Importance = varimp(cforestModel, conditional = TRUE)) |>
  arrange(desc(Importance)) |>
  knitr::kable()
| Variable | Importance |
|:---------|-----------:|
| V1       |  6.1154264 |
| V4       |  5.9824481 |
| V2       |  5.0077664 |
| V5       |  1.3581843 |
| V3       |  0.1073499 |
| V7       | -0.0794067 |
| V6       | -0.1869483 |
| V9       | -0.2486453 |
| V8       | -0.2500343 |
| V10      | -0.2707839 |
The ordering of the informative predictors is the same (V1, V4, V2, V5, V3), although V3 receives much less importance here than in the original random forest model, and the conditional scores are uniformly smaller than the unconditional ones. V6-V10 remain unimportant.
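As a further check (a sketch, not run here), the conditional inference forest can also be fit on the data that still contains the correlated copies of V1; conditional permutation should give duplicate1 far less credit than the unconditional measure does:
# Refit cforest on the data including the correlated copies of V1
cforestDup <- cforest(y ~ ., data = simulated)
# Unconditional importance: credit is shared between V1 and duplicate1
sort(varimp(cforestDup), decreasing = TRUE)
# Conditional importance permutes each predictor within groups of
# correlated predictors, so duplicate1's score should shrink toward zero
sort(varimp(cforestDup, conditional = TRUE), decreasing = TRUE)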
d.
# boosted model
boostedModel <- gbm(y ~ .,
                    data = simulated[, c(1:11)],
                    distribution = "gaussian",
                    n.trees = 1000)
summary(boostedModel, plotit = FALSE) |>
  dplyr::select(-var) |>
  knitr::kable()
| Variable |   rel.inf |
|:---------|----------:|
| V4       | 24.036116 |
| V1       | 23.283463 |
| V2       | 20.668396 |
| V5       | 11.380157 |
| V3       |  9.707534 |
| V7       |  2.413996 |
| V6       |  2.353621 |
| V10      |  2.138098 |
| V9       |  2.096317 |
| V8       |  1.922300 |
The boosted model identifies the same informative (V1-V5) and uninformative (V6-V10) predictors. However, the pattern of importance differs slightly, with V4 having a higher relative influence than V1.
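To see whether boosting shows the same dilution observed in part (b), the boosted model can be refit on the data that still contains the correlated copy of V1; a sketch:
# Refit the boosted model including duplicate1; if boosting behaves like
# the random forest, duplicate1 will absorb part of V1's relative influence
boostedDup <- gbm(y ~ .,
                  data = simulated,
                  distribution = "gaussian",
                  n.trees = 1000)
summary(boostedDup, plotit = FALSE) |>
  knitr::kable()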
# cubist model
cubistModel <- train(y ~ .,
                     data = simulated[, c(1:11)],
                     method = "cubist")
varImp(cubistModel$finalModel, scale = FALSE) |>
  arrange(desc(Overall)) |>
  knitr::kable()
| Variable | Overall |
|:---------|--------:|
| V1       |    72.0 |
| V2       |    54.5 |
| V4       |    49.0 |
| V3       |    42.0 |
| V5       |    40.0 |
| V6       |    11.0 |
| V7       |     0.0 |
| V8       |     0.0 |
| V9       |     0.0 |
| V10      |     0.0 |
In the Cubist model, V1 is again the most important variable, but here V2 is rated above V4 and V3 above V5. V7-V10 receive zero importance, while V6 has slightly more importance than in the other models, though it still pales in comparison to the informative predictors.
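The usage-based scores that varImp reports for Cubist can be traced back to the fitted rules; a quick way to see where each predictor enters the rule conditions and the terminal linear models (output omitted; the usage table is a component of recent Cubist model objects):
# Print the fitted rules themselves
summary(cubistModel$finalModel)
# Percentage of cases in which each predictor appears in a rule
# condition versus in a terminal linear model
cubistModel$finalModel$usage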
8.2
set.seed(415)
a <- sample(1:10 / 10, 200, replace = TRUE)
b <- sample(1:100 / 100, 200, replace = TRUE)
c <- sample(1:1000 / 1000, 200, replace = TRUE)
d <- sample(1:10000 / 10000, 200, replace = TRUE)
y <- a + b + c + rnorm(200, mean=0, sd=5)
simData <- data.frame(a, b, c, d, y)
rpartTree <- rpart(y ~ ., data = simData)
plot(as.party(rpartTree))

varImp(rpartTree)
## Overall
## a 0.3805726
## b 0.9263322
## c 0.7376280
## d 0.8044435
In this simulation, the regression tree gave higher importance to the predictors with more distinct values: d, which does not enter the formula for y at all, is still ranked above a, which has only ten distinct values. In the plotted tree, the first splits likewise use the finer-grained variables rather than a, reflecting the selection bias of exhaustive-search trees toward predictors with finer granularity.
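Conditional inference trees select split variables with statistical tests and are designed to avoid this granularity bias. As a quick contrast (a sketch using partykit, already loaded above), the same data can be fit with ctree():
# ctree() uses permutation tests for split selection, so a predictor is not
# favoured simply because it has many distinct values
ctreeModel <- ctree(y ~ ., data = simData)
ctreeModel   # print the tree to see which variables, if any, are chosen
plot(ctreeModel)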
8.3
a. The model on the right has a higher bagging fraction and a higher learning rate (0.9 for both). With most of the data used in every iteration and large steps taken toward the residuals, it quickly concentrates on the few strongest predictors, so its importance is dominated by the first few variables. The model on the left, with a bagging fraction and learning rate of 0.1, takes smaller steps on different subsamples of the data, so it incorporates information from, and spreads importance across, more predictors.
b. The left-hand model, with the lower bagging fraction and learning rate, is probably more predictive of new samples because it is less likely to overfit the training data. The right-hand model is likely overfit and less able to generalize to new data.
c. Increasing the interaction depth allows each tree to use more predictors, so it is likely to spread the variable importance over more of them, raising the importance of some of the less dominant variables.
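These effects can be illustrated on the simulated Friedman data from 8.1 by fitting two boosted models that mimic the two settings in the figure and comparing their relative influence; a sketch (n.minobsinnode is lowered because the simulated set is small, and the figure in the exercise uses the solubility data, so the numbers will differ):
# Left-hand settings: bagging fraction and learning rate of 0.1
set.seed(100)
gbmLeft <- gbm(y ~ ., data = simulated[, 1:11], distribution = "gaussian",
               n.trees = 1000, shrinkage = 0.1, bag.fraction = 0.1,
               interaction.depth = 2, n.minobsinnode = 5)
# Right-hand settings: bagging fraction and learning rate of 0.9
set.seed(100)
gbmRight <- gbm(y ~ ., data = simulated[, 1:11], distribution = "gaussian",
                n.trees = 1000, shrinkage = 0.9, bag.fraction = 0.9,
                interaction.depth = 2, n.minobsinnode = 5)
# The 0.9/0.9 fit should concentrate its relative influence on fewer
# predictors than the 0.1/0.1 fit
summary(gbmLeft, plotit = FALSE)
summary(gbmRight, plotit = FALSE)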
8.7
library(AppliedPredictiveModeling)
data(ChemicalManufacturingProcess)
cmp_predictors = as.matrix(ChemicalManufacturingProcess[,2:58])
cmp_yield = ChemicalManufacturingProcess[,1]
set.seed(100)
train_select <- createDataPartition(cmp_yield, p=0.75, list=F) #create train set
train_x <- ChemicalManufacturingProcess[train_select,-1]
train_y <- ChemicalManufacturingProcess[train_select,1]
test_x <- ChemicalManufacturingProcess[-train_select,-1]
test_y <- ChemicalManufacturingProcess[-train_select,1]
pre_process <- c("nzv", "corr", "center","scale", "medianImpute")
set.seed(200)
ctrl <- trainControl(method = "boot", number = 25)
a.
set.seed(123)
rpartGrid <- expand.grid(maxdepth= seq(1,10,by=1))
rp_model <- train(x = train_x, y = train_y,
                  method = "rpart2", metric = "Rsquared",
                  tuneGrid = rpartGrid,
                  trControl = ctrl, preProcess = pre_process)
set.seed(415)
rfGrid <- expand.grid(mtry=seq(2,38,by=3))
rf_model <- train(x = train_x, y = train_y,
                  method = "rf", tuneGrid = rfGrid,
                  metric = "Rsquared", importance = TRUE,
                  trControl = ctrl, preProcess = pre_process)
set.seed(123)
gbmGrid <- expand.grid(interaction.depth = seq(1, 6, by = 1),
                       n.trees = c(25, 50, 100, 200),
                       shrinkage = c(0.01, 0.05, 0.1, 0.2),
                       n.minobsinnode = 5)
gb_model <- train(x = train_x, y = train_y,
                  method = "gbm", metric = "Rsquared", verbose = FALSE,
                  tuneGrid = gbmGrid, trControl = ctrl, preProcess = pre_process)
set.seed(300)
cubistGrid <- expand.grid(committees = c(1, 5, 10, 20, 50, 100),
                          neighbors = c(0, 1, 3, 5, 7))
cubist_model <- train(x = train_x, y = train_y,
                      method = "cubist", verbose = FALSE,
                      metric = "Rsquared", tuneGrid = cubistGrid,
                      trControl = ctrl, preProcess = pre_process)
rp_model
## CART
##
## 132 samples
## 57 predictor
##
## Pre-processing: centered (47), scaled (47), median imputation (47), remove (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 132, 132, 132, 132, 132, 132, ...
## Resampling results across tuning parameters:
##
## maxdepth RMSE Rsquared MAE
## 1 1.690196 0.2453857 1.329880
## 2 1.639331 0.2983005 1.291676
## 3 1.629179 0.3219788 1.297111
## 4 1.616411 0.3402401 1.274203
## 5 1.611857 0.3522453 1.259721
## 6 1.604643 0.3692173 1.247175
## 7 1.608699 0.3672581 1.250057
## 8 1.612859 0.3683148 1.250426
## 9 1.609667 0.3745211 1.234120
## 10 1.612526 0.3710538 1.234737
##
## Rsquared was used to select the optimal model using the largest value.
## The final value used for the model was maxdepth = 9.
rf_model
## Random Forest
##
## 132 samples
## 57 predictor
##
## Pre-processing: centered (47), scaled (47), median imputation (47), remove (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 132, 132, 132, 132, 132, 132, ...
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared MAE
## 2 1.303028 0.6100732 1.0497760
## 5 1.225974 0.6392512 0.9847810
## 8 1.204323 0.6438640 0.9632693
## 11 1.188722 0.6490856 0.9506980
## 14 1.183323 0.6471495 0.9447251
## 17 1.182808 0.6443045 0.9414697
## 20 1.179384 0.6423053 0.9383393
## 23 1.180337 0.6388259 0.9405971
## 26 1.177250 0.6389037 0.9354435
## 29 1.180567 0.6340534 0.9387843
## 32 1.181884 0.6326042 0.9366325
## 35 1.182666 0.6284457 0.9380844
## 38 1.186828 0.6253370 0.9408241
##
## Rsquared was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 11.
gb_model
## Stochastic Gradient Boosting
##
## 132 samples
## 57 predictor
##
## Pre-processing: centered (47), scaled (47), median imputation (47), remove (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 132, 132, 132, 132, 132, 132, ...
## Resampling results across tuning parameters:
##
## shrinkage interaction.depth n.trees RMSE Rsquared MAE
## 0.01 1 25 1.778820 0.4250029 1.4382599
## 0.01 1 50 1.689642 0.4503775 1.3641241
## 0.01 1 100 1.560112 0.4870382 1.2613075
## 0.01 1 200 1.415944 0.5227382 1.1484883
## 0.01 2 25 1.743751 0.4721129 1.4123781
## 0.01 2 50 1.632455 0.4927882 1.3236075
## 0.01 2 100 1.479777 0.5252021 1.2011062
## 0.01 2 200 1.346657 0.5470629 1.0865569
## 0.01 3 25 1.728590 0.4890706 1.4005543
## 0.01 3 50 1.605972 0.5116278 1.3042885
## 0.01 3 100 1.447419 0.5399252 1.1757286
## 0.01 3 200 1.318261 0.5606073 1.0605653
## 0.01 4 25 1.715755 0.5168298 1.3915697
## 0.01 4 50 1.587248 0.5351980 1.2902573
## 0.01 4 100 1.428679 0.5529776 1.1610599
## 0.01 4 200 1.298732 0.5728234 1.0415164
## 0.01 5 25 1.715498 0.5254055 1.3918021
## 0.01 5 50 1.583384 0.5374533 1.2866235
## 0.01 5 100 1.418352 0.5566165 1.1502691
## 0.01 5 200 1.291630 0.5763051 1.0334618
## 0.01 6 25 1.710393 0.5195480 1.3875657
## 0.01 6 50 1.577584 0.5388127 1.2822880
## 0.01 6 100 1.417650 0.5548769 1.1515401
## 0.01 6 200 1.289347 0.5775145 1.0309699
## 0.05 1 25 1.512849 0.4866767 1.2219441
## 0.05 1 50 1.379024 0.5220269 1.1103620
## 0.05 1 100 1.307806 0.5410636 1.0389930
## 0.05 1 200 1.286970 0.5502744 1.0107437
## 0.05 2 25 1.435614 0.5206687 1.1660179
## 0.05 2 50 1.316195 0.5488065 1.0578949
## 0.05 2 100 1.280778 0.5561018 1.0105517
## 0.05 2 200 1.254143 0.5718693 0.9823317
## 0.05 3 25 1.398439 0.5348659 1.1345059
## 0.05 3 50 1.288210 0.5640321 1.0272896
## 0.05 3 100 1.249442 0.5784970 0.9806560
## 0.05 3 200 1.225145 0.5905359 0.9544796
## 0.05 4 25 1.387086 0.5422217 1.1257788
## 0.05 4 50 1.280060 0.5674988 1.0216455
## 0.05 4 100 1.237861 0.5852009 0.9729893
## 0.05 4 200 1.218023 0.5963110 0.9560371
## 0.05 5 25 1.368383 0.5485958 1.1072082
## 0.05 5 50 1.266979 0.5755483 1.0048438
## 0.05 5 100 1.228518 0.5902988 0.9619375
## 0.05 5 200 1.215467 0.5966888 0.9491315
## 0.05 6 25 1.362614 0.5539193 1.0964223
## 0.05 6 50 1.264381 0.5761423 1.0019225
## 0.05 6 100 1.225261 0.5929322 0.9624200
## 0.05 6 200 1.211567 0.6001751 0.9493778
## 0.10 1 25 1.366093 0.5307393 1.1041488
## 0.10 1 50 1.297141 0.5489580 1.0324421
## 0.10 1 100 1.287503 0.5520652 1.0138624
## 0.10 1 200 1.284574 0.5536828 1.0102073
## 0.10 2 25 1.331845 0.5363735 1.0677348
## 0.10 2 50 1.293520 0.5482392 1.0205492
## 0.10 2 100 1.272767 0.5597542 0.9948234
## 0.10 2 200 1.259311 0.5681329 0.9873656
## 0.10 3 25 1.319656 0.5362327 1.0393035
## 0.10 3 50 1.284653 0.5514494 0.9997362
## 0.10 3 100 1.255961 0.5699942 0.9737487
## 0.10 3 200 1.245709 0.5758247 0.9642348
## 0.10 4 25 1.294238 0.5529217 1.0343880
## 0.10 4 50 1.259429 0.5674862 0.9905934
## 0.10 4 100 1.243242 0.5759588 0.9716459
## 0.10 4 200 1.231833 0.5825864 0.9621759
## 0.10 5 25 1.302823 0.5474943 1.0307887
## 0.10 5 50 1.261601 0.5665345 0.9882995
## 0.10 5 100 1.246789 0.5748364 0.9730788
## 0.10 5 200 1.236241 0.5811695 0.9619466
## 0.10 6 25 1.297187 0.5486155 1.0266988
## 0.10 6 50 1.251588 0.5728344 0.9820789
## 0.10 6 100 1.229233 0.5861358 0.9612253
## 0.10 6 200 1.220481 0.5915938 0.9553760
## 0.20 1 25 1.346969 0.5116236 1.0733407
## 0.20 1 50 1.331752 0.5205984 1.0472223
## 0.20 1 100 1.327409 0.5248659 1.0441472
## 0.20 1 200 1.326649 0.5279554 1.0375076
## 0.20 2 25 1.310089 0.5367669 1.0408923
## 0.20 2 50 1.283335 0.5546175 1.0127880
## 0.20 2 100 1.289921 0.5541846 1.0090610
## 0.20 2 200 1.287574 0.5555275 1.0038774
## 0.20 3 25 1.332570 0.5106666 1.0537657
## 0.20 3 50 1.308465 0.5291638 1.0296329
## 0.20 3 100 1.299932 0.5360254 1.0206149
## 0.20 3 200 1.297973 0.5375716 1.0194229
## 0.20 4 25 1.311812 0.5307552 1.0306374
## 0.20 4 50 1.293451 0.5462173 1.0171186
## 0.20 4 100 1.290003 0.5504031 1.0140327
## 0.20 4 200 1.288437 0.5514447 1.0142673
## 0.20 5 25 1.273227 0.5571838 1.0091540
## 0.20 5 50 1.265032 0.5649241 0.9950279
## 0.20 5 100 1.260980 0.5675297 0.9903326
## 0.20 5 200 1.258175 0.5695635 0.9874677
## 0.20 6 25 1.290332 0.5458426 1.0036063
## 0.20 6 50 1.273412 0.5574859 0.9838748
## 0.20 6 100 1.268224 0.5614962 0.9783521
## 0.20 6 200 1.267447 0.5624443 0.9779844
##
## Tuning parameter 'n.minobsinnode' was held constant at a value of 5
## Rsquared was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 200, interaction.depth =
## 6, shrinkage = 0.05 and n.minobsinnode = 5.
cubist_model
## Cubist
##
## 132 samples
## 57 predictor
##
## Pre-processing: centered (47), scaled (47), median imputation (47), remove (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 132, 132, 132, 132, 132, 132, ...
## Resampling results across tuning parameters:
##
## committees neighbors RMSE Rsquared MAE
## 1 0 1.980957 0.3104154 1.4101285
## 1 1 1.972143 0.3307948 1.3835795
## 1 3 1.957554 0.3271925 1.3832304
## 1 5 1.964989 0.3220523 1.3917770
## 1 7 1.974917 0.3178011 1.4003046
## 5 0 1.340676 0.5212775 1.0285352
## 5 1 1.301089 0.5527658 0.9729775
## 5 3 1.310632 0.5430352 0.9918837
## 5 5 1.318788 0.5364905 1.0012430
## 5 7 1.326573 0.5312051 1.0093421
## 10 0 1.235722 0.5720334 0.9579426
## 10 1 1.184477 0.6089357 0.8953398
## 10 3 1.204713 0.5928486 0.9193069
## 10 5 1.212072 0.5873299 0.9310488
## 10 7 1.220169 0.5823344 0.9408028
## 20 0 1.185611 0.5989890 0.9304017
## 20 1 1.129469 0.6357279 0.8608391
## 20 3 1.148486 0.6218788 0.8825683
## 20 5 1.157873 0.6154258 0.8972738
## 20 7 1.167402 0.6098351 0.9091194
## 50 0 1.154450 0.6198562 0.8990872
## 50 1 1.095138 0.6565156 0.8244996
## 50 3 1.116165 0.6423499 0.8525089
## 50 5 1.127548 0.6352506 0.8671864
## 50 7 1.137879 0.6293057 0.8780492
## 100 0 1.143115 0.6281723 0.8903013
## 100 1 1.079676 0.6666712 0.8100236
## 100 3 1.103333 0.6515510 0.8425327
## 100 5 1.115153 0.6441812 0.8573995
## 100 7 1.124801 0.6387006 0.8676343
##
## Rsquared was used to select the optimal model using the largest value.
## The final values used for the model were committees = 100 and neighbors = 1.
Based on the resampling results above, the Cubist model (committees = 100, neighbors = 1) gives the best performance (RMSE of about 1.08, R-squared of about 0.667), with the tuned random forest (mtry = 11, R-squared of about 0.649) close behind; a side-by-side resampling and test-set comparison is sketched below.
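Since the four models were tuned with the same trainControl object, their resampling distributions can be compared directly, and the held-out test set gives the test-set check; a sketch (the different set.seed() calls above mean the bootstrap samples are not identical across models, so the resampling comparison is approximate):
# Compare bootstrap resampling performance across the tuned models
resamps <- resamples(list(CART = rp_model,
                          RF = rf_model,
                          GBM = gb_model,
                          Cubist = cubist_model))
summary(resamps)
# Test-set performance; na.action = na.pass lets the median-imputation
# preprocessing handle the missing predictor values in test_x
rbind(CART   = postResample(predict(rp_model, test_x, na.action = na.pass), test_y),
      RF     = postResample(predict(rf_model, test_x, na.action = na.pass), test_y),
      GBM    = postResample(predict(gb_model, test_x, na.action = na.pass), test_y),
      Cubist = postResample(predict(cubist_model, test_x, na.action = na.pass), test_y))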
b.
## Cubist model:
cubist_imp <- varImp(cubist_model, scale = FALSE)
plot(cubist_imp, top=15, scales = list(y = list(cex = 0.8)))

c.
plot(as.party(rp_model$finalModel),gp=gpar(fontsize=10))
