library(mlbench)
library(randomForest)
library(caret)
library(party)
library(gbm)
library(Cubist)

8.1

set.seed(200)
simulated <- mlbench.friedman1(200, sd=1)
simulated <- cbind(simulated$x, simulated$y)
simulated <- as.data.frame(simulated)
colnames(simulated)[ncol(simulated)] <- "y"
model1 <- randomForest(y~., data=simulated,
                       importance = TRUE,
                       ntree = 1000)
rfImp <- varImp(model1, scale=FALSE)
rfImp
##          Overall
## V1   8.732235404
## V2   6.415369387
## V3   0.763591825
## V4   7.615118809
## V5   2.023524577
## V6   0.165111172
## V7  -0.005961659
## V8  -0.166362581
## V9  -0.095292651
## V10 -0.074944788
simulated$duplicate1 <- simulated$V1 + rnorm(200) * 0.1
cor(simulated$duplicate1, simulated$V1)
## [1] 0.9460206
model2 <- randomForest(y~., data=simulated,
                       importance=TRUE,
                       ntree=1000)
rfImp2 <- varImp(model2, scale=FALSE)
rfImp2
##                Overall
## V1          5.69119973
## V2          6.06896061
## V3          0.62970218
## V4          7.04752238
## V5          1.87238438
## V6          0.13569065
## V7         -0.01345645
## V8         -0.04370565
## V9          0.00840438
## V10         0.02894814
## duplicate1  4.28331581

After adding duplicate1, a predictor highly correlated with V1 (r = 0.946), the importance of V1 decreased from about 8.7 to about 5.7; the importance is now split between V1 and duplicate1, while the remaining predictors are largely unaffected.
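
As a quick hedged follow-up (not run above), adding a second predictor correlated with V1 on a copy of the data should dilute V1's importance even further; simulated2 and duplicate2 below are names introduced only for this sketch.

# Sketch: add a second near-copy of V1 on a separate data frame so the
# models fit later in this exercise are unaffected, then refit and see
# how the importance of V1 is shared with duplicate1 and duplicate2.
simulated2 <- simulated
simulated2$duplicate2 <- simulated2$V1 + rnorm(200) * 0.1
model2b <- randomForest(y ~ ., data = simulated2,
                        importance = TRUE,
                        ntree = 1000)
varImp(model2b, scale = FALSE)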

Conditional Inference Forest (cforest)

model3 <- cforest(y~., data=simulated)
party::varimp(model3, conditional=FALSE)
##            V1            V2            V3            V4            V5 
##  4.6171158805  6.0579730772  0.0003116115  7.6223892727  1.7161194047 
##            V6            V7            V8            V9           V10 
## -0.0289427183  0.0465374951 -0.0380965511  0.0046062409 -0.0310326410 
##    duplicate1 
##  5.0941897280
party::varimp(model3, conditional=TRUE)
##           V1           V2           V3           V4           V5           V6 
##  1.807953145  4.688980360  0.012878752  6.190578351  1.051666850  0.028174759 
##           V7           V8           V9          V10   duplicate1 
## -0.011709437 -0.004356587  0.015118505 -0.022190587  1.926751729

The variable importances change between the unconditional and conditional measures: the conditional measure sharply reduces the importance assigned to the correlated pair V1 and duplicate1 (from roughly 4.6 and 5.1 down to about 1.8 and 1.9), while V2 and V4 remain the top predictors and the uninformative predictors stay near zero in both.
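
One way to see the difference directly (a sketch, not part of the original output) is to put the two importance vectors side by side; note that varimp is stochastic, so the numbers will not exactly match those above.

# Sketch: compare unconditional and conditional importances in one table,
# ordered by the unconditional measure.
imp_uncond <- party::varimp(model3, conditional = FALSE)
imp_cond   <- party::varimp(model3, conditional = TRUE)
imp_compare <- data.frame(unconditional = imp_uncond,
                          conditional   = imp_cond[names(imp_uncond)])
imp_compare[order(-imp_compare$unconditional), ]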

Cubist

# Note: simulated[1:10] selects V1-V10 only, so duplicate1 is not included in this fit.
model5 <- cubist(simulated[1:10], simulated$y)
varImp(model5)
##     Overall
## V1       50
## V2       50
## V4       50
## V5       50
## V3        0
## V6        0
## V7        0
## V8        0
## V9        0
## V10       0
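
The exercise also asks about boosted trees; a minimal sketch of the same check with gbm is below (the tuning values are illustrative assumptions, not values used above).

# Sketch: repeat the importance check with a boosted tree on the same
# simulated data (which still contains duplicate1).
model_gbm <- gbm(y ~ ., data = simulated,
                 distribution = "gaussian",
                 n.trees = 1000,
                 interaction.depth = 3,
                 shrinkage = 0.1,
                 verbose = FALSE)
# Relative influence of each predictor; the correlated pair V1/duplicate1
# should again share importance.
summary(model_gbm, plotit = FALSE)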

8.3

  1. The model with learning rate = 0.9 and bagging fraction = 0.9 concentrates its importance on a few predictors: each tree takes a large step toward the fit and sees nearly all of the training data, so the same few strong predictors dominate the early trees and little residual signal is left for the others.
  2. The model with learning rate = 0.1 and bagging fraction = 0.1 takes smaller steps, and each tree sees only a small random portion of the training data, so importance is spread across more predictors and the fit is less prone to overcommitting to a handful of them; it would likely predict new samples better, though a setting between the two extremes may be best (a sketch reproducing the contrast follows this list).
  3. Increasing the interaction depth allows more splits per tree, so more predictors get used and receive importance; the importance profile would become flatter (more level) for both models, most noticeably for the right-hand model, which currently concentrates its importance on a few predictors.
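
A minimal sketch of how the contrast between the two settings could be reproduced; it uses the simulated data from 8.1 rather than the data behind the book's figure, and n.minobsinnode is lowered only so the small bagging fraction works with 200 observations.

# Sketch (illustrative assumptions): fit boosted models with the two
# tuning settings from the figure and compare how concentrated the
# relative influence is in each.
gbm_high <- gbm(y ~ ., data = simulated, distribution = "gaussian",
                n.trees = 1000, shrinkage = 0.9, bag.fraction = 0.9,
                n.minobsinnode = 5, verbose = FALSE)
gbm_low  <- gbm(y ~ ., data = simulated, distribution = "gaussian",
                n.trees = 1000, shrinkage = 0.1, bag.fraction = 0.1,
                n.minobsinnode = 5, verbose = FALSE)
# The 0.9/0.9 fit should leave far less influence for the weaker
# predictors than the 0.1/0.1 fit.
summary(gbm_high, plotit = FALSE)
summary(gbm_low,  plotit = FALSE)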

8.7

library(AppliedPredictiveModeling)
data("ChemicalManufacturingProcess")
X <- ChemicalManufacturingProcess[2:58]
y <- ChemicalManufacturingProcess[1]
sum(is.na(X))
## [1] 106
prep <- preProcess(X, method=c("knnImpute"))
prep
## Created from 152 samples and 57 variables
## 
## Pre-processing:
##   - centered (57)
##   - ignored (0)
##   - 5 nearest neighbor imputation (57)
##   - scaled (57)
X_imputed <- predict(prep, X)
sum(is.na(X_imputed))
## [1] 0
# 70/30 train/test split (no seed was set, so the split is not reproducible)
trainRows <- sample(c(TRUE, FALSE), nrow(X_imputed), replace=TRUE, prob=c(0.7,0.3))
X_imputed_train <- X_imputed[trainRows, ]
X_imputed_test <- X_imputed[!trainRows, ]

y_train <- y[trainRows, ]
y_test <- y[!trainRows, ]
model6 <- randomForest(X_imputed_train, y_train, importance=TRUE, ntree=1000)  # argument is ntree; "ntrees" would be silently ignored
model7 <- gbm.fit(X_imputed_train, y_train, distribution = "gaussian")
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        3.7410             nan     0.0010    0.0030
##      2        3.7382             nan     0.0010    0.0030
##      3        3.7348             nan     0.0010    0.0031
##      4        3.7319             nan     0.0010    0.0033
##      5        3.7284             nan     0.0010    0.0031
##      6        3.7255             nan     0.0010    0.0030
##      7        3.7220             nan     0.0010    0.0033
##      8        3.7189             nan     0.0010    0.0033
##      9        3.7157             nan     0.0010    0.0032
##     10        3.7129             nan     0.0010    0.0030
##     20        3.6818             nan     0.0010    0.0033
##     40        3.6176             nan     0.0010    0.0031
##     60        3.5594             nan     0.0010    0.0026
##     80        3.5030             nan     0.0010    0.0028
##    100        3.4471             nan     0.0010    0.0027
# gbmGrid <- expand.grid(.interaction.depth=seq(1,7,by=2), .n.trees=seq(100,1000,by=50), .shrinkage=c(0.01,0.1))
# gbmTune <- train(X_imputed_train, y_train,
#                  method="gbm",
#                  tuneGrid=gbmGrid,
#                  verbose=FALSE)
model8 <- cubist(X_imputed_train, y_train)
rfPred <- predict(model6, newdata=X_imputed_test)
postResample(pred=rfPred, obs=y_test)
##      RMSE  Rsquared       MAE 
## 0.9729932 0.6183819 0.7772096
gbmPred <- predict(model7, newdata=X_imputed_test)
## Using 100 trees...
postResample(pred=gbmPred, obs=y_test)
##      RMSE  Rsquared       MAE 
## 1.5208723 0.3077065 1.2511209
cubPred <- predict(model8, newdata=X_imputed_test)
postResample(pred=cubPred, obs=y_test)
##      RMSE  Rsquared       MAE 
## 1.1389650 0.5507048 0.9143831

Random forest has the best test-set performance (RMSE 0.97, Rsquared 0.62), followed by Cubist (Rsquared 0.55) and then the untuned GBM (Rsquared 0.31).
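
For a compact comparison (a sketch, not in the original output), the three sets of test-set metrics can be stacked into one table:

# Sketch: collect the test-set metrics for the three models.
rbind(randomForest = postResample(pred = rfPred,  obs = y_test),
      gbm          = postResample(pred = gbmPred, obs = y_test),
      cubist       = postResample(pred = cubPred, obs = y_test))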

varImp(model6)
##                            Overall
## BiologicalMaterial01    3.46957805
## BiologicalMaterial02    7.25587149
## BiologicalMaterial03    8.89571390
## BiologicalMaterial04    4.72496196
## BiologicalMaterial05    4.45273599
## BiologicalMaterial06    9.13110172
## BiologicalMaterial07    1.00100150
## BiologicalMaterial08    4.46730174
## BiologicalMaterial09    3.78469247
## BiologicalMaterial10    2.71424922
## BiologicalMaterial11    6.37803220
## BiologicalMaterial12   10.58740240
## ManufacturingProcess01  4.20535511
## ManufacturingProcess02  3.54644577
## ManufacturingProcess03  1.87289651
## ManufacturingProcess04  0.43030516
## ManufacturingProcess05  0.59025824
## ManufacturingProcess06  4.14659867
## ManufacturingProcess07  0.24295675
## ManufacturingProcess08 -1.12903407
## ManufacturingProcess09  4.74726040
## ManufacturingProcess10  0.07719570
## ManufacturingProcess11  3.06288939
## ManufacturingProcess12  0.51524347
## ManufacturingProcess13  8.60317183
## ManufacturingProcess14  1.26972601
## ManufacturingProcess15  1.17458602
## ManufacturingProcess16  0.95218464
## ManufacturingProcess17 10.05866043
## ManufacturingProcess18  3.75453178
## ManufacturingProcess19  1.13851737
## ManufacturingProcess20  2.45374740
## ManufacturingProcess21  3.94953824
## ManufacturingProcess22  0.14887561
## ManufacturingProcess23  4.15559362
## ManufacturingProcess24  0.23883881
## ManufacturingProcess25  3.20575592
## ManufacturingProcess26  0.88195137
## ManufacturingProcess27  4.24304390
## ManufacturingProcess28  6.31570540
## ManufacturingProcess29  4.02894794
## ManufacturingProcess30  3.30838659
## ManufacturingProcess31  3.59374328
## ManufacturingProcess32 22.97443691
## ManufacturingProcess33  3.80718271
## ManufacturingProcess34  1.23928196
## ManufacturingProcess35 -0.67764375
## ManufacturingProcess36  5.22341382
## ManufacturingProcess37  1.70434810
## ManufacturingProcess38  0.35412762
## ManufacturingProcess39  6.37174168
## ManufacturingProcess40  0.76267013
## ManufacturingProcess41 -0.03627818
## ManufacturingProcess42  2.11360416
## ManufacturingProcess43  1.60683988
## ManufacturingProcess44 -0.50105098
## ManufacturingProcess45  1.79213330
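
To answer which predictors matter most, a short sketch (not in the original) that ranks the importances and counts how many of the top ten are biological versus process variables:

# Sketch: rank the random forest importances and check whether the
# biological or the manufacturing-process predictors dominate the top 10.
rfImp7 <- varImp(model6)
top10 <- rfImp7[order(-rfImp7$Overall), , drop = FALSE][1:10, , drop = FALSE]
top10
sum(grepl("^Biological", rownames(top10)))       # biological predictors in the top 10
sum(grepl("^Manufacturing", rownames(top10)))    # process predictors in the top 10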