library(AppliedPredictiveModeling)
library(mice) # impute NAs
library(visdat) # missing value
library(e1071)
library(caret)
library(dplyr)
library(tidyr)
library(mlbench) # chp 7 and 8
library(kableExtra)
library(randomForest)
library(party)
library(gbm)
library(Cubist)
library(rpart)
library(RANN)
library(pls)
library(earth)
library(kernlab)

6.3

data("ChemicalManufacturingProcess")
Pharma_data <- ChemicalManufacturingProcess
str(Pharma_data)
## 'data.frame':    176 obs. of  58 variables:
##  $ Yield                 : num  38 42.4 42 41.4 42.5 ...
##  $ BiologicalMaterial01  : num  6.25 8.01 8.01 8.01 7.47 6.12 7.48 6.94 6.94 6.94 ...
##  $ BiologicalMaterial02  : num  49.6 61 61 61 63.3 ...
##  $ BiologicalMaterial03  : num  57 67.5 67.5 67.5 72.2 ...
##  $ BiologicalMaterial04  : num  12.7 14.7 14.7 14.7 14 ...
##  $ BiologicalMaterial05  : num  19.5 19.4 19.4 19.4 17.9 ...
##  $ BiologicalMaterial06  : num  43.7 53.1 53.1 53.1 54.7 ...
##  $ BiologicalMaterial07  : num  100 100 100 100 100 100 100 100 100 100 ...
##  $ BiologicalMaterial08  : num  16.7 19 19 19 18.2 ...
##  $ BiologicalMaterial09  : num  11.4 12.6 12.6 12.6 12.8 ...
##  $ BiologicalMaterial10  : num  3.46 3.46 3.46 3.46 3.05 3.78 3.04 3.85 3.85 3.85 ...
##  $ BiologicalMaterial11  : num  138 154 154 154 148 ...
##  $ BiologicalMaterial12  : num  18.8 21.1 21.1 21.1 21.1 ...
##  $ ManufacturingProcess01: num  NA 0 0 0 10.7 12 11.5 12 12 12 ...
##  $ ManufacturingProcess02: num  NA 0 0 0 0 0 0 0 0 0 ...
##  $ ManufacturingProcess03: num  NA NA NA NA NA NA 1.56 1.55 1.56 1.55 ...
##  $ ManufacturingProcess04: num  NA 917 912 911 918 924 933 929 928 938 ...
##  $ ManufacturingProcess05: num  NA 1032 1004 1015 1028 ...
##  $ ManufacturingProcess06: num  NA 210 207 213 206 ...
##  $ ManufacturingProcess07: num  NA 177 178 177 178 178 177 178 177 177 ...
##  $ ManufacturingProcess08: num  NA 178 178 177 178 178 178 178 177 177 ...
##  $ ManufacturingProcess09: num  43 46.6 45.1 44.9 45 ...
##  $ ManufacturingProcess10: num  NA NA NA NA NA NA 11.6 10.2 9.7 10.1 ...
##  $ ManufacturingProcess11: num  NA NA NA NA NA NA 11.5 11.3 11.1 10.2 ...
##  $ ManufacturingProcess12: num  NA 0 0 0 0 0 0 0 0 0 ...
##  $ ManufacturingProcess13: num  35.5 34 34.8 34.8 34.6 34 32.4 33.6 33.9 34.3 ...
##  $ ManufacturingProcess14: num  4898 4869 4878 4897 4992 ...
##  $ ManufacturingProcess15: num  6108 6095 6087 6102 6233 ...
##  $ ManufacturingProcess16: num  4682 4617 4617 4635 4733 ...
##  $ ManufacturingProcess17: num  35.5 34 34.8 34.8 33.9 33.4 33.8 33.6 33.9 35.3 ...
##  $ ManufacturingProcess18: num  4865 4867 4877 4872 4886 ...
##  $ ManufacturingProcess19: num  6049 6097 6078 6073 6102 ...
##  $ ManufacturingProcess20: num  4665 4621 4621 4611 4659 ...
##  $ ManufacturingProcess21: num  0 0 0 0 -0.7 -0.6 1.4 0 0 1 ...
##  $ ManufacturingProcess22: num  NA 3 4 5 8 9 1 2 3 4 ...
##  $ ManufacturingProcess23: num  NA 0 1 2 4 1 1 2 3 1 ...
##  $ ManufacturingProcess24: num  NA 3 4 5 18 1 1 2 3 4 ...
##  $ ManufacturingProcess25: num  4873 4869 4897 4892 4930 ...
##  $ ManufacturingProcess26: num  6074 6107 6116 6111 6151 ...
##  $ ManufacturingProcess27: num  4685 4630 4637 4630 4684 ...
##  $ ManufacturingProcess28: num  10.7 11.2 11.1 11.1 11.3 11.4 11.2 11.1 11.3 11.4 ...
##  $ ManufacturingProcess29: num  21 21.4 21.3 21.3 21.6 21.7 21.2 21.2 21.5 21.7 ...
##  $ ManufacturingProcess30: num  9.9 9.9 9.4 9.4 9 10.1 11.2 10.9 10.5 9.8 ...
##  $ ManufacturingProcess31: num  69.1 68.7 69.3 69.3 69.4 68.2 67.6 67.9 68 68.5 ...
##  $ ManufacturingProcess32: num  156 169 173 171 171 173 159 161 160 164 ...
##  $ ManufacturingProcess33: num  66 66 66 68 70 70 65 65 65 66 ...
##  $ ManufacturingProcess34: num  2.4 2.6 2.6 2.5 2.5 2.5 2.5 2.5 2.5 2.5 ...
##  $ ManufacturingProcess35: num  486 508 509 496 468 490 475 478 491 488 ...
##  $ ManufacturingProcess36: num  0.019 0.019 0.018 0.018 0.017 0.018 0.019 0.019 0.019 0.019 ...
##  $ ManufacturingProcess37: num  0.5 2 0.7 1.2 0.2 0.4 0.8 1 1.2 1.8 ...
##  $ ManufacturingProcess38: num  3 2 2 2 2 2 2 2 3 3 ...
##  $ ManufacturingProcess39: num  7.2 7.2 7.2 7.2 7.3 7.2 7.3 7.3 7.4 7.1 ...
##  $ ManufacturingProcess40: num  NA 0.1 0 0 0 0 0 0 0 0 ...
##  $ ManufacturingProcess41: num  NA 0.15 0 0 0 0 0 0 0 0 ...
##  $ ManufacturingProcess42: num  11.6 11.1 12 10.6 11 11.5 11.7 11.4 11.4 11.3 ...
##  $ ManufacturingProcess43: num  3 0.9 1 1.1 1.1 2.2 0.7 0.8 0.9 0.8 ...
##  $ ManufacturingProcess44: num  1.8 1.9 1.8 1.8 1.7 1.8 2 2 1.9 1.9 ...
##  $ ManufacturingProcess45: num  2.4 2.2 2.3 2.1 2.1 2 2.2 2.2 2.1 2.4 ...
visdat::vis_miss(Pharma_data, sort_miss = TRUE)

# Box-Cox transform, k-nearest-neighbor imputation, then center and scale;
# Yield is excluded so the response is never transformed or imputed
prepro_1 <- preProcess(subset(Pharma_data, select = -Yield),
                       method = c("BoxCox", "knnImpute", "center", "scale"))
pharma_imput <- predict(prepro_1, Pharma_data[, -1])


visdat::vis_miss(pharma_imput)

# Restore the response variable values to original
pharma_imput$Yield <- Pharma_data$Yield

# Remove highly correlated predictors (Yield, the last column, is excluded
# from the filter so it is never dropped and the indices line up with the
# predictor columns)
cor_matrix <- cor(pharma_imput[, -ncol(pharma_imput)])
high_cor <- findCorrelation(cor_matrix, cutoff = 0.90)
pharma_imput <- pharma_imput[, -high_cor]

# Remove near-zero variance predictors (guard against an empty index,
# which would otherwise drop every column)
nzv <- nearZeroVar(pharma_imput)
if (length(nzv) > 0) pharma_imput <- pharma_imput[, -nzv]
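
As a quick sanity check (a minimal sketch using the objects created above), the largest remaining pairwise correlation among the predictors should now fall below the 0.90 cutoff:

# Largest remaining off-diagonal predictor correlation; expect < 0.90
cm <- cor(pharma_imput[, names(pharma_imput) != "Yield"])
max(abs(cm[upper.tri(cm)]))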

# Split the data into a training and a test set
set.seed(123)
trainRows <- createDataPartition(pharma_imput$Yield, p = .80, list = FALSE)
training <- pharma_imput[trainRows,]
test <- pharma_imput[-trainRows,]

# Train a PLS model
set.seed(123)

ctrl <- trainControl(method = "cv", number = 5)
plsFit <- train(
  x = training[, -ncol(training)],
  y = training$Yield,
  method = "pls",
  tuneLength = 20,
  trControl = ctrl
)
print(plsFit)
## Partial Least Squares 
## 
## 144 samples
##  47 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 116, 116, 115, 116, 113 
## Resampling results across tuning parameters:
## 
##   ncomp  RMSE      Rsquared   MAE      
##    1     1.349780  0.4804392  1.1046642
##    2     1.175328  0.5889834  0.9475752
##    3     1.168439  0.6150759  0.9557336
##    4     1.176341  0.6176024  0.9600506
##    5     1.184559  0.6139388  0.9617742
##    6     1.205142  0.6080452  0.9796574
##    7     1.217894  0.6049307  0.9939552
##    8     1.259650  0.5882543  1.0269892
##    9     1.291723  0.5789170  1.0489907
##   10     1.317733  0.5657914  1.0676808
##   11     1.354709  0.5520161  1.0860524
##   12     1.394908  0.5360526  1.1197126
##   13     1.422760  0.5266737  1.1345927
##   14     1.473333  0.5128033  1.1583224
##   15     1.502401  0.5067104  1.1704796
##   16     1.514326  0.5064079  1.1748180
##   17     1.523976  0.5039993  1.1756099
##   18     1.535185  0.5019438  1.1802772
##   19     1.559650  0.4959072  1.1873298
##   20     1.581071  0.4913535  1.1900139
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 3.
plot(plsFit)

set.seed(123)
# Predict on the test set; caret's predict() uses the tuned final model (ncomp = 3)
pred_test <- predict(plsFit, newdata = test)

# Performance metrics for test set
test_perf <- postResample(pred_test, test$Yield)

# Performance metrics for training set
train_perf <- getTrainPerf(plsFit)

# Compare training and test performance
perf_df <- data.frame(c(train_perf[1:3], test_perf[1:3]))
names(perf_df) <- c("Training RMSE", "Training Rsquared", "Training MAE", "Test RMSE", "Test Rsquared", "Test MAE")
print(perf_df)
##   Training RMSE Training Rsquared Training MAE Test RMSE Test Rsquared Test MAE
## 1      1.168439         0.6150759    0.9557336  1.342466     0.4876693  1.14565
# Plot training-set residuals of the final PLS model (ncomp = 3)
plot(training$Yield - predict(plsFit, newdata = training))
abline(h = 0)

(e)

# Find important predictors
plot(caret::varImp(plsFit), top = 20)

# Important predictors (Yield is included so it survives the select below)
imp_predictors <- c('ManufacturingProcess32', 'ManufacturingProcess13', 'ManufacturingProcess09', 'ManufacturingProcess17', 'ManufacturingProcess36', 'Yield')

# Plot relationships
pharma_imput %>%
  select(all_of(imp_predictors)) %>%
  gather(var, val, -Yield) %>%
  ggplot(aes(x = val, y = Yield, color = var)) +
  geom_point() +
  facet_wrap(~var)

7.2

Simulating Data:

set.seed(123)
trainingData <- mlbench.friedman1(200, sd = 1)
trainingData$x <- data.frame(trainingData$x)


set.seed(123)
testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)

Visualize the training data

featurePlot(trainingData$x, trainingData$y)

KNN

set.seed(123)
knnModel <- train(
  x = trainingData$x,
  y = trainingData$y,
  method = "knn",
  preProc = c("center", "scale"),
  tuneLength = 10
)

knnModel
## k-Nearest Neighbors 
## 
## 200 samples
##  10 predictor
## 
## Pre-processing: centered (10), scaled (10) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ... 
## Resampling results across tuning parameters:
## 
##   k   RMSE      Rsquared   MAE     
##    5  3.360145  0.5186277  2.691602
##    7  3.200822  0.5652109  2.559310
##    9  3.146979  0.5885721  2.517311
##   11  3.115995  0.6038019  2.480000
##   13  3.114977  0.6142479  2.488102
##   15  3.102224  0.6306039  2.479184
##   17  3.103956  0.6407442  2.483608
##   19  3.114121  0.6439695  2.485436
##   21  3.140855  0.6459751  2.510892
##   23  3.158940  0.6507325  2.529683
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 15.
knnPred <- predict(knnModel, newdata = testData$x)
knn_results <- postResample(pred = knnPred, obs = testData$y)
print(knn_results)
##      RMSE  Rsquared       MAE 
## 3.2421805 0.6536034 2.5899776

MARS

set.seed(123)
mars_grid <- expand.grid(.degree = 1:2, .nprune = 2:38)
mars_model <- train(
  x = trainingData$x,
  y = trainingData$y,
  method = "earth",
  preProcess = c("center", "scale"),
  tuneGrid = mars_grid,
  trControl = trainControl(method = "cv")
)
mars_predict <- predict(mars_model, newdata = testData$x)
mars_results <- postResample(pred = mars_predict, obs = testData$y)
print(mars_results)
##      RMSE  Rsquared       MAE 
## 1.2193997 0.9392656 0.9587801
plot(varImp(mars_model))

SVM

set.seed(123)
svm_model <- train(
  x = trainingData$x,
  y = trainingData$y,
  method = "svmRadial",
  preProcess = c("center", "scale"),
  tuneLength = 20
)
svm_predict <- predict(svm_model, newdata = testData$x)
svm_results <- postResample(pred = svm_predict, obs = testData$y)
print(svm_results)
##      RMSE  Rsquared       MAE 
## 2.1679276 0.8083005 1.6333459

Neural Net

set.seed(123)
nnet_grid <- expand.grid(.decay = c(0, 0.01, 0.1), .size = c(1:10), .bag = FALSE)
ctrl <- trainControl(method = "cv")
nnet_model <- train(
  x = trainingData$x,
  y = trainingData$y,
  method = "avNNet",
  tuneGrid = nnet_grid,
  trControl = ctrl,
  preProcess = c("center", "scale"),
  linout = TRUE,
  trace = FALSE,
  maxit = 500,
  MaxNWts = 10 * (ncol(trainingData$x) + 1) + 10 + 1
)
nnet_predict <- predict(nnet_model, newdata = testData$x)
nnet_results <- postResample(pred = nnet_predict, obs = testData$y)
print(nnet_results)
##      RMSE  Rsquared       MAE 
## 2.2330230 0.7981146 1.6782098

Model Performance comparison

model_comparison <- data.frame(rbind(knn_results, mars_results, svm_results, nnet_results))
rownames(model_comparison) <- c("KNN", "MARS", "SVM", "Neural Network")
model_comparison <- model_comparison[order(model_comparison$RMSE), ]
print(model_comparison)
##                    RMSE  Rsquared       MAE
## MARS           1.219400 0.9392656 0.9587801
## SVM            2.167928 0.8083005 1.6333459
## Neural Network 2.233023 0.7981146 1.6782098
## KNN            3.242180 0.6536034 2.5899776

The MARS model clearly outperforms the other models on the simulated test set, with the lowest RMSE (1.22) and the highest R-squared (0.94). Its importance plot also identifies the informative Friedman predictors (the response depends only on X1–X5) while assigning little to no importance to the noise variables X6–X10; the check below confirms which terms the final model retained.
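
As a quick confirmation (a minimal sketch; evimp() is the earth package's own importance measure applied to caret's final model):

# Terms kept by the final MARS (earth) model, ranked by earth's own
# importance measure; only the informative X1-X5 should appear
earth::evimp(mars_model$finalModel)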

7.5

KNN

set.seed(123)
knnModel <- train(
  x = training[, -ncol(training)],
  y = training$Yield,
  method = "knn",
  tuneGrid = data.frame(.k = 1:20),
  trControl = trainControl(method = "cv")
)
knn_pred <- predict(knnModel, newdata = test[, -ncol(test)])
knn_results <- postResample(pred = knn_pred, obs = test$Yield)
print(knn_results)
##     RMSE Rsquared      MAE 
## 1.345935 0.460384 1.018750

MARS

set.seed(123)
marsGrid <- expand.grid(.degree = 1:2, .nprune = 2:38)
marsModel <- train(
  x = training[, -ncol(training)],
  y = training$Yield,
  method = "earth",
  tuneGrid = marsGrid,
  trControl = trainControl(method = "cv")
)
mars_pred <- predict(marsModel, newdata = test[, -ncol(test)])
mars_results <- postResample(pred = mars_pred, obs = test$Yield)
print(mars_results)
##      RMSE  Rsquared       MAE 
## 1.4118759 0.4163619 1.1226807

SVM

set.seed(123)
svmModel <- train(
  x = training[, -ncol(training)],
  y = training$Yield,
  method = "svmRadial",
  tuneLength = 14,
  trControl = trainControl(method = "cv")
)
svm_pred <- predict(svmModel, newdata = test[, -ncol(test)])
svm_results <- postResample(pred = svm_pred, obs = test$Yield)
print(svm_results)
##      RMSE  Rsquared       MAE 
## 1.1513032 0.6053502 0.9572238

Neural Net

set.seed(123)
nnetModel <- train(
  x = training[, -ncol(training)],
  y = training$Yield,
  method = "nnet",
  tuneGrid = expand.grid(size = 1:5, decay = c(0, 0.01, 0.1)),
  linout = TRUE,
  trace = FALSE,
  MaxNWts = 10 * (ncol(training[, -ncol(training)]) + 1) + 10 + 1,
  maxit = 500
)
nnet_pred <- predict(nnetModel, newdata = test[, -ncol(test)])
nnet_results <- postResample(pred = nnet_pred, obs = test$Yield)
print(nnet_results)
##       RMSE   Rsquared        MAE 
## 1.75833927 0.08372262 1.48089754

Model Performance comparison

model_comparison <- data.frame(
  Model = c("KNN", "Neural Network", "SVM", "MARS"),
  RMSE = c(knn_results[1], nnet_results[1], svm_results[1], mars_results[1]),
  Rsquared = c(knn_results[2], nnet_results[2], svm_results[2], mars_results[2]),
  MAE = c(knn_results[3], nnet_results[3], svm_results[3], mars_results[3])
)

model_comparison <- model_comparison[order(model_comparison$RMSE), ]
print(model_comparison)
##            Model     RMSE   Rsquared       MAE
## 3            SVM 1.151303 0.60535021 0.9572238
## 1            KNN 1.345935 0.46038400 1.0187500
## 4           MARS 1.411876 0.41636188 1.1226807
## 2 Neural Network 1.758339 0.08372262 1.4808975
vimp <- varImp(svmModel)
ggplot(vimp, top = 10) + labs(title = "Top 10 Important Variables - SVM")

ggplot(varImp(plsFit), top = 10) +
  labs(title = "Top 10 Important Variables - PLS")

Optimal nonlinear model: the SVM's importance ranking is dominated by manufacturing process variables.

Comparison with the linear model: the SVM and PLS models largely agree on the top features but rank them differently, as the intersection below shows.
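
A small sketch using the varImp objects above makes the overlap concrete:

# Overlap between the ten most important SVM and PLS predictors
svm_top10 <- head(rownames(vimp$importance)[order(-vimp$importance$Overall)], 10)
pls_imp   <- varImp(plsFit)$importance
pls_top10 <- head(rownames(pls_imp)[order(-pls_imp$Overall)], 10)
intersect(svm_top10, pls_top10)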

# Predictors importance
vimp_importance <- varImp(svmModel)$importance

# Top 5 predictors
top5_vars <- head(rownames(vimp_importance)[order(-vimp_importance$Overall)], 5)

# Plot the top predictors against Yield on the original (unimputed) scale
X <- ChemicalManufacturingProcess[, top5_vars]
Y <- ChemicalManufacturingProcess$Yield

featurePlot(X, Y)

8.1

set.seed(123)
simulated <- mlbench.friedman1(200, sd = 1)
simulated <- cbind(simulated$x, simulated$y)
simulated <- as.data.frame(simulated)
colnames(simulated)[ncol(simulated)] <- "y"
str(simulated)
## 'data.frame':    200 obs. of  11 variables:
##  $ V1 : num  0.288 0.788 0.409 0.883 0.94 ...
##  $ V2 : num  0.239 0.962 0.601 0.515 0.403 ...
##  $ V3 : num  0.986 0.137 0.905 0.576 0.395 ...
##  $ V4 : num  0.237 0.686 0.226 0.318 0.174 ...
##  $ V5 : num  0.471 0.366 0.121 0.047 0.263 ...
##  $ V6 : num  0.274 0.594 0.16 0.853 0.848 ...
##  $ V7 : num  0.859 0.887 0.489 0.718 0.487 ...
##  $ V8 : num  0.2332 0.2307 0.0617 0.4971 0.2441 ...
##  $ V9 : num  0.639 0.125 0.255 0.821 0.804 ...
##  $ V10: num  0.155 0.846 0.214 0.67 0.618 ...
##  $ y  : num  10.6 17.2 13.1 13.3 10 ...
set.seed(123)
model1 <- randomForest(y ~ ., data = simulated, importance = TRUE, ntree = 1000)
rfImp1 <- varImp(model1, scale = FALSE)
rfImp1
##         Overall
## V1   3.06124177
## V2   7.85943835
## V3   0.38056337
## V4  10.81444548
## V5   1.94286224
## V6   0.23845388
## V7   0.14670567
## V8   0.24935492
## V9  -0.07158366
## V10  0.20212279

Did the random forest model significantly use the uninformative predictors (V6–V10)? No: their importance scores are all close to zero (V9 is even slightly negative), as the sorted listing below confirms.
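
A minimal sketch on the rfImp1 object above:

# Importance scores sorted in decreasing order; V6-V10 sit near zero
rfImp1[order(-rfImp1$Overall), , drop = FALSE]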

set.seed(123)
simulated$duplicate1 <- simulated$V1 + rnorm(200) * .1
cor(simulated$duplicate1, simulated$V1)
## [1] 0.9449864
model2 <- randomForest(y ~ ., data = simulated, importance = TRUE, ntree = 1000)
rfImp2 <- varImp(model2, scale = FALSE)
rfImp2
##                Overall
## V1          2.70094025
## V2          7.82707084
## V3          0.48798079
## V4         10.48694160
## V5          1.79881087
## V6          0.35376124
## V7          0.03525398
## V8          0.13056769
## V9          0.01981366
## V10         0.12224580
## duplicate1  1.31970471

Fit another random forest model to these data. Did the importance score for V1 change? Yes. When another predictor that is highly correlated with V1 is added, V1's importance drops (from about 3.06 to 2.70) and most of the lost importance reappears on duplicate1: the score is effectively split between the two correlated predictors, as quantified below.
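
The dilution is easy to quantify (a minimal sketch using the two importance tables above):

# V1's importance before and after adding the correlated duplicate;
# most of the loss reappears on duplicate1
c(V1_before  = rfImp1["V1", "Overall"],
  V1_after   = rfImp2["V1", "Overall"],
  duplicate1 = rfImp2["duplicate1", "Overall"])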

set.seed(123)
model3 <- cforest(y ~ ., data = simulated)
ctrue <- varImp(model3, conditional = TRUE)
ctrue
##                 Overall
## V1          1.265164346
## V2          6.242188438
## V3          0.025008218
## V4          9.556471133
## V5          0.990179164
## V6          0.128781057
## V7          0.002201511
## V8          0.030530401
## V9         -0.029302913
## V10         0.023414696
## duplicate1  0.245946010
cfalse <- varImp(model3, conditional = FALSE)
cfalse
##                 Overall
## V1          1.984345833
## V2          8.185299913
## V3          0.023494492
## V4         12.121302755
## V5          1.350836279
## V6          0.151229594
## V7         -0.006018013
## V8          0.092381122
## V9          0.017947852
## V10         0.004090445
## duplicate1  0.940619520
result <- cbind(rfImp2, ctrue, cfalse)
colnames(result) <- c("trad_rf", "cf_cond", "cf_uncond")
result
##                trad_rf      cf_cond    cf_uncond
## V1          2.70094025  1.265164346  1.984345833
## V2          7.82707084  6.242188438  8.185299913
## V3          0.48798079  0.025008218  0.023494492
## V4         10.48694160  9.556471133 12.121302755
## V5          1.79881087  0.990179164  1.350836279
## V6          0.35376124  0.128781057  0.151229594
## V7          0.03525398  0.002201511 -0.006018013
## V8          0.13056769  0.030530401  0.092381122
## V9          0.01981366 -0.029302913  0.017947852
## V10         0.12224580  0.023414696  0.004090445
## duplicate1  1.31970471  0.245946010  0.940619520

V2 and V4 are the most important variables under all three measures (traditional random forest, conditional cforest, and unconditional cforest), and V6 through V10 keep low importance scores throughout. Note also that the conditional importance shrinks duplicate1's score far more than the unconditional one does, since it accounts for the correlation with V1. The normalized view below makes the comparison easier.
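
A minimal sketch on the combined result table (each column rescaled to its own maximum):

# Rescale each importance column by its maximum for a like-for-like view
round(sweep(result, 2, apply(result, 2, max), "/"), 3)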

GBM

set.seed(123)
gbmGrid <- expand.grid(interaction.depth = seq(1, 7, by = 2),
                       n.trees = seq(100, 1000, by = 50),
                       shrinkage = 0.1,
                       n.minobsinnode = 5)

gbmModel <- train(y ~ ., data = simulated,
                  method = "gbm",
                  tuneGrid = gbmGrid,
                  verbose = FALSE)

gbm_imp <- varImp(gbmModel)
gbm_imp
## gbm variable importance
## 
##             Overall
## V4         100.0000
## V2          69.2635
## V1          28.0542
## V5          25.0033
## V3          19.7353
## duplicate1   7.8523
## V7           2.2947
## V10          0.6253
## V8           0.5732
## V9           0.3744
## V6           0.0000

Cubist

set.seed(123)
cubistModel <- train(y ~ ., data = simulated, method = "cubist")
cubist_imp <- varImp(cubistModel)
cubist_imp
## cubist variable importance
## 
##            Overall
## V2         100.000
## V4          99.225
## V3          75.194
## V1          62.791
## V5          57.364
## duplicate1  31.783
## V8          18.605
## V7          13.178
## V6           5.426
## V10          2.326
## V9           0.000

V2 and V4 are again the two most important variables for both boosted trees and Cubist (GBM ranks V4 first, Cubist ranks V2 first), and V6 through V10 remain near the bottom in both models. The merged table below lines the two rankings up directly.
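
A small sketch merging the scaled importance tables from the two varImp objects above:

# Scaled importances from the two models, aligned by predictor name
merge(gbm_imp$importance, cubist_imp$importance,
      by = "row.names", suffixes = c(".gbm", ".cubist"))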