library(mlbench)
library(randomForest)
library(caret)
library(party)
library(gbm)
library(Cubist)

8.1

set.seed(200)
simulated <- mlbench.friedman1(200, sd=1)
simulated <- cbind(simulated$x, simulated$y)
simulated <- as.data.frame(simulated)
colnames(simulated)[ncol(simulated)] <- "y"
model1 <- randomForest(y~., data=simulated,
                       importance = TRUE,
                       ntree = 1000)
rfImp <- varImp(model1, scale=FALSE)
rfImp
##          Overall
## V1   8.732235404
## V2   6.415369387
## V3   0.763591825
## V4   7.615118809
## V5   2.023524577
## V6   0.165111172
## V7  -0.005961659
## V8  -0.166362581
## V9  -0.095292651
## V10 -0.074944788
simulated$duplicate1 <- simulated$V1 + rnorm(200) * 0.1
cor(simulated$duplicate1, simulated$V1)
## [1] 0.9460206
model2 <- randomForest(y~., data=simulated,
                       importance=TRUE,
                       ntree=1000)
rfImp2 <- varImp(model2, scale=FALSE)
rfImp2
##                Overall
## V1          5.69119973
## V2          6.06896061
## V3          0.62970218
## V4          7.04752238
## V5          1.87238438
## V6          0.13569065
## V7         -0.01345645
## V8         -0.04370565
## V9          0.00840438
## V10         0.02894814
## duplicate1  4.28331581

After adding duplicate1, a predictor highly correlated with V1 (r = 0.946), the importance of V1 decreased from about 8.7 to about 5.7; the importance is now split between V1 and duplicate1, while the remaining predictors are largely unaffected.
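
As a quick hedged follow-up (not run above), adding a second predictor correlated with V1 on a copy of the data should dilute V1's importance even further; simulated2 and duplicate2 below are names introduced only for this sketch.

# Sketch: add a second near-copy of V1 on a separate data frame so the
# models fit later in this exercise are unaffected, then refit and see
# how the importance of V1 is shared with duplicate1 and duplicate2.
simulated2 <- simulated
simulated2$duplicate2 <- simulated2$V1 + rnorm(200) * 0.1
model2b <- randomForest(y ~ ., data = simulated2,
                        importance = TRUE,
                        ntree = 1000)
varImp(model2b, scale = FALSE)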

Conditional Inference Forest (cforest)

model3 <- cforest(y~., data=simulated)
party::varimp(model3, conditional=FALSE)
##            V1            V2            V3            V4            V5 
##  4.6171158805  6.0579730772  0.0003116115  7.6223892727  1.7161194047 
##            V6            V7            V8            V9           V10 
## -0.0289427183  0.0465374951 -0.0380965511  0.0046062409 -0.0310326410 
##    duplicate1 
##  5.0941897280
party::varimp(model3, conditional=TRUE)
##           V1           V2           V3           V4           V5           V6 
##  1.807953145  4.688980360  0.012878752  6.190578351  1.051666850  0.028174759 
##           V7           V8           V9          V10   duplicate1 
## -0.011709437 -0.004356587  0.015118505 -0.022190587  1.926751729

The variable importances change between the unconditional and conditional measures: the conditional measure sharply reduces the importance assigned to the correlated pair V1 and duplicate1 (from roughly 4.6 and 5.1 down to about 1.8 and 1.9), while V2 and V4 remain the top predictors and the uninformative predictors stay near zero in both.
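
One way to see the difference directly (a sketch, not part of the original output) is to put the two importance vectors side by side; note that varimp is stochastic, so the numbers will not exactly match those above.

# Sketch: compare unconditional and conditional importances in one table,
# ordered by the unconditional measure.
imp_uncond <- party::varimp(model3, conditional = FALSE)
imp_cond   <- party::varimp(model3, conditional = TRUE)
imp_compare <- data.frame(unconditional = imp_uncond,
                          conditional   = imp_cond[names(imp_uncond)])
imp_compare[order(-imp_compare$unconditional), ]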

Cubist

# Note: simulated[1:10] selects V1-V10 only, so duplicate1 is not included in this fit.
model5 <- cubist(simulated[1:10], simulated$y)
varImp(model5)
##     Overall
## V1       50
## V2       50
## V4       50
## V5       50
## V3        0
## V6        0
## V7        0
## V8        0
## V9        0
## V10       0
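
The exercise also asks about boosted trees; a minimal sketch of the same check with gbm is below (the tuning values are illustrative assumptions, not values used above).

# Sketch: repeat the importance check with a boosted tree on the same
# simulated data (which still contains duplicate1).
model_gbm <- gbm(y ~ ., data = simulated,
                 distribution = "gaussian",
                 n.trees = 1000,
                 interaction.depth = 3,
                 shrinkage = 0.1,
                 verbose = FALSE)
# Relative influence of each predictor; the correlated pair V1/duplicate1
# should again share importance.
summary(model_gbm, plotit = FALSE)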

8.3

  1. The model with learning rate = 0.9 and bagging fraction = 0.9 concentrates its importance on a few predictors: each tree takes a large step toward the fit and sees nearly all of the training data, so the same few strong predictors dominate the early trees and little residual signal is left for the others.
  2. The model with learning rate = 0.1 and bagging fraction = 0.1 takes smaller steps, and each tree sees only a small random portion of the training data, so importance is spread across more predictors and the fit is less prone to overcommitting to a handful of them; it would likely predict new samples better, though a setting between the two extremes may be best (a sketch reproducing the contrast follows this list).
  3. Increasing the interaction depth allows more splits per tree, so more predictors get used and receive importance; the importance profile would become flatter (more level) for both models, most noticeably for the right-hand model, which currently concentrates its importance on a few predictors.
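
A minimal sketch of how the contrast between the two settings could be reproduced; it uses the simulated data from 8.1 rather than the data behind the book's figure, and n.minobsinnode is lowered only so the small bagging fraction works with 200 observations.

# Sketch (illustrative assumptions): fit boosted models with the two
# tuning settings from the figure and compare how concentrated the
# relative influence is in each.
gbm_high <- gbm(y ~ ., data = simulated, distribution = "gaussian",
                n.trees = 1000, shrinkage = 0.9, bag.fraction = 0.9,
                n.minobsinnode = 5, verbose = FALSE)
gbm_low  <- gbm(y ~ ., data = simulated, distribution = "gaussian",
                n.trees = 1000, shrinkage = 0.1, bag.fraction = 0.1,
                n.minobsinnode = 5, verbose = FALSE)
# The 0.9/0.9 fit should leave far less influence for the weaker
# predictors than the 0.1/0.1 fit.
summary(gbm_high, plotit = FALSE)
summary(gbm_low,  plotit = FALSE)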

8.7

library(AppliedPredictiveModeling)
data("ChemicalManufacturingProcess")
X <- ChemicalManufacturingProcess[2:58]
y <- ChemicalManufacturingProcess[1]
sum(is.na(X))
## [1] 106
prep <- preProcess(X, method=c("knnImpute"))
prep
## Created from 152 samples and 57 variables
## 
## Pre-processing:
##   - centered (57)
##   - ignored (0)
##   - 5 nearest neighbor imputation (57)
##   - scaled (57)
X_imputed <- predict(prep, X)
sum(is.na(X_imputed))
## [1] 0
# 70/30 train/test split (no seed was set, so the split is not reproducible)
trainRows <- sample(c(TRUE, FALSE), nrow(X_imputed), replace=TRUE, prob=c(0.7,0.3))
X_imputed_train <- X_imputed[trainRows, ]
X_imputed_test <- X_imputed[!trainRows, ]

y_train <- y[trainRows, ]
y_test <- y[!trainRows, ]
model6 <- randomForest(X_imputed_train, y_train, importance=TRUE, ntree=1000)  # argument is ntree; "ntrees" would be silently ignored
model7 <- gbm.fit(X_imputed_train, y_train, distribution = "gaussian")
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        3.7410             nan     0.0010    0.0030
##      2        3.7382             nan     0.0010    0.0030
##      3        3.7348             nan     0.0010    0.0031
##      4        3.7319             nan     0.0010    0.0033
##      5        3.7284             nan     0.0010    0.0031
##      6        3.7255             nan     0.0010    0.0030
##      7        3.7220             nan     0.0010    0.0033
##      8        3.7189             nan     0.0010    0.0033
##      9        3.7157             nan     0.0010    0.0032
##     10        3.7129             nan     0.0010    0.0030
##     20        3.6818             nan     0.0010    0.0033
##     40        3.6176             nan     0.0010    0.0031
##     60        3.5594             nan     0.0010    0.0026
##     80        3.5030             nan     0.0010    0.0028
##    100        3.4471             nan     0.0010    0.0027
# gbmGrid <- expand.grid(.interaction.depth=seq(1,7,by=2), .n.trees=seq(100,1000,by=50), .shrinkage=c(0.01,0.1))
# gbmTune <- train(X_imputed_train, y_train,
#                  method="gbm",
#                  tuneGrid=gbmGrid,
#                  verbose=FALSE)
model8 <- cubist(X_imputed_train, y_train)
rfPred <- predict(model6, newdata=X_imputed_test)
postResample(pred=rfPred, obs=y_test)
##      RMSE  Rsquared       MAE 
## 0.9729932 0.6183819 0.7772096
gbmPred <- predict(model7, newdata=X_imputed_test)
## Using 100 trees...
postResample(pred=gbmPred, obs=y_test)
##      RMSE  Rsquared       MAE 
## 1.5208723 0.3077065 1.2511209
cubPred <- predict(model8, newdata=X_imputed_test)
postResample(pred=cubPred, obs=y_test)
##      RMSE  Rsquared       MAE 
## 1.1389650 0.5507048 0.9143831

Random forest has the best test-set performance (RMSE 0.97, Rsquared 0.62), followed by Cubist (Rsquared 0.55) and then the untuned GBM (Rsquared 0.31).
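
For a compact comparison (a sketch, not in the original output), the three sets of test-set metrics can be stacked into one table:

# Sketch: collect the test-set metrics for the three models.
rbind(randomForest = postResample(pred = rfPred,  obs = y_test),
      gbm          = postResample(pred = gbmPred, obs = y_test),
      cubist       = postResample(pred = cubPred, obs = y_test))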

varImp(model6)
##                            Overall
## BiologicalMaterial01    3.46957805
## BiologicalMaterial02    7.25587149
## BiologicalMaterial03    8.89571390
## BiologicalMaterial04    4.72496196
## BiologicalMaterial05    4.45273599
## BiologicalMaterial06    9.13110172
## BiologicalMaterial07    1.00100150
## BiologicalMaterial08    4.46730174
## BiologicalMaterial09    3.78469247
## BiologicalMaterial10    2.71424922
## BiologicalMaterial11    6.37803220
## BiologicalMaterial12   10.58740240
## ManufacturingProcess01  4.20535511
## ManufacturingProcess02  3.54644577
## ManufacturingProcess03  1.87289651
## ManufacturingProcess04  0.43030516
## ManufacturingProcess05  0.59025824
## ManufacturingProcess06  4.14659867
## ManufacturingProcess07  0.24295675
## ManufacturingProcess08 -1.12903407
## ManufacturingProcess09  4.74726040
## ManufacturingProcess10  0.07719570
## ManufacturingProcess11  3.06288939
## ManufacturingProcess12  0.51524347
## ManufacturingProcess13  8.60317183
## ManufacturingProcess14  1.26972601
## ManufacturingProcess15  1.17458602
## ManufacturingProcess16  0.95218464
## ManufacturingProcess17 10.05866043
## ManufacturingProcess18  3.75453178
## ManufacturingProcess19  1.13851737
## ManufacturingProcess20  2.45374740
## ManufacturingProcess21  3.94953824
## ManufacturingProcess22  0.14887561
## ManufacturingProcess23  4.15559362
## ManufacturingProcess24  0.23883881
## ManufacturingProcess25  3.20575592
## ManufacturingProcess26  0.88195137
## ManufacturingProcess27  4.24304390
## ManufacturingProcess28  6.31570540
## ManufacturingProcess29  4.02894794
## ManufacturingProcess30  3.30838659
## ManufacturingProcess31  3.59374328
## ManufacturingProcess32 22.97443691
## ManufacturingProcess33  3.80718271
## ManufacturingProcess34  1.23928196
## ManufacturingProcess35 -0.67764375
## ManufacturingProcess36  5.22341382
## ManufacturingProcess37  1.70434810
## ManufacturingProcess38  0.35412762
## ManufacturingProcess39  6.37174168
## ManufacturingProcess40  0.76267013
## ManufacturingProcess41 -0.03627818
## ManufacturingProcess42  2.11360416
## ManufacturingProcess43  1.60683988
## ManufacturingProcess44 -0.50105098
## ManufacturingProcess45  1.79213330
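
To answer which predictors matter most, a short sketch (not in the original) that ranks the importances and counts how many of the top ten are biological versus process variables:

# Sketch: rank the random forest importances and check whether the
# biological or the manufacturing-process predictors dominate the top 10.
rfImp7 <- varImp(model6)
top10 <- rfImp7[order(-rfImp7$Overall), , drop = FALSE][1:10, , drop = FALSE]
top10
sum(grepl("^Biological", rownames(top10)))       # biological predictors in the top 10
sum(grepl("^Manufacturing", rownames(top10)))    # process predictors in the top 10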