suppressPackageStartupMessages({
  library(caret)
  library(mlbench)
  library(ggplot2)
  library(earth)
  library(kernlab)
  library(randomForest)
  library(gbm)
  library(nnet)
  library(AppliedPredictiveModeling)
  library(tibble)
  library(dplyr)
})
theme_set(theme_minimal())

library(doParallel)
registerDoParallel(cores = parallel::detectCores()-1)

ctrl <- trainControl(method = "cv", number = 3, summaryFunction = defaultSummary, savePredictions = "final")
metric <- "RMSE"

Exercise 7.2 — Friedman (1991) Simulation

Data Simulation and Visualization

set.seed(200)
trainingData <- mlbench.friedman1(100, sd = 1)
trainingData$x <- as.data.frame(trainingData$x); names(trainingData$x) <- paste0("X", 1:10)
testData <- mlbench.friedman1(1000, sd = 1)
testData$x <- as.data.frame(testData$x); names(testData$x) <- paste0("X", 1:10)
pairs(trainingData$x[,1:5])

Interpretation:
By construction, only X1–X5 drive the response: y = 10*sin(pi*X1*X2) + 20*(X3 - 0.5)^2 + 10*X4 + 5*X5 + noise, so X1 and X2 enter through an interaction, X3 through a quadratic term, and X6–X10 are pure noise. The pairs plot itself only confirms that the predictors are roughly uniform and uncorrelated.
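
Because the pairs plot shows the predictors only against each other, plotting each predictor against the simulated response makes the informative variables easier to spot. A minimal sketch using caret's featurePlot (extra arguments are passed through to lattice):

# Scatter plot of each predictor against y with a smoother overlaid;
# X1-X5 should show visible trends while X6-X10 should look flat.
featurePlot(trainingData$x, trainingData$y,
            plot = "scatter", type = c("p", "smooth"),
            layout = c(5, 2))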

Model Training and Comparison

set.seed(200)
pre <- c("center","scale")
knnModel  <- train(x = trainingData$x, y = trainingData$y, method = "knn",
                   preProcess = pre, tuneLength = 5, trControl = ctrl, metric = metric)
marsModel <- train(x = trainingData$x, y = trainingData$y, method = "earth",
                   tuneLength = 5, trControl = ctrl, metric = metric)
svmModel  <- train(x = trainingData$x, y = trainingData$y, method = "svmRadial",
                   preProcess = pre, tuneLength = 5, trControl = ctrl, metric = metric)
rfModel   <- train(x = trainingData$x, y = trainingData$y, method = "rf",
                   tuneLength = 3, trControl = ctrl, metric = metric, importance = TRUE)
gbmModel  <- train(x = trainingData$x, y = trainingData$y, method = "gbm",
                   verbose = FALSE, tuneLength = 3, trControl = ctrl, metric = metric)
resamp <- resamples(list(KNN=knnModel, MARS=marsModel, SVM=svmModel, RF=rfModel, GBM=gbmModel))
dotplot(resamp, metric="RMSE")

summary(resamp)
## 
## Call:
## summary.resamples(object = resamp)
## 
## Models: KNN, MARS, SVM, RF, GBM 
## Number of resamples: 3 
## 
## MAE 
##          Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## KNN  2.645805 2.962941 3.280077 3.102446 3.330766 3.381455    0
## MARS 1.550732 1.629542 1.708351 1.664458 1.721320 1.734290    0
## SVM  1.875930 1.904770 1.933609 1.932866 1.961335 1.989060    0
## RF   2.010920 2.100923 2.190926 2.187631 2.275987 2.361048    0
## GBM  1.571274 1.614983 1.658693 1.708539 1.777172 1.895651    0
## 
## RMSE 
##          Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## KNN  3.215916 3.674246 4.132576 3.828073 4.134151 4.135727    0
## MARS 1.933126 2.048968 2.164809 2.098230 2.180782 2.196755    0
## SVM  2.278147 2.319452 2.360756 2.419144 2.489643 2.618530    0
## RF   2.535351 2.618155 2.700959 2.675254 2.745206 2.789454    0
## GBM  1.886296 1.928171 1.970046 2.089988 2.191834 2.413622    0
## 
## Rsquared 
##           Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## KNN  0.4520668 0.5253695 0.5986721 0.5538527 0.6047457 0.6108192    0
## MARS 0.8472413 0.8479259 0.8486105 0.8545421 0.8581926 0.8677746    0
## SVM  0.7649261 0.7861198 0.8073136 0.7943175 0.8090131 0.8107127    0
## RF   0.6575727 0.7329948 0.8084169 0.7861653 0.8504617 0.8925064    0
## GBM  0.7940180 0.8261251 0.8582323 0.8463024 0.8724447 0.8866570    0

Interpretation:
MARS gives the lowest cross-validated RMSE, with GBM close behind and the radial SVM a little further back. KNN performs worst, as the noise predictors X6–X10 degrade its distance-based neighborhoods.
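
With only three folds the resampled estimates are noisy, so a paired, fold-by-fold comparison is more informative than eyeballing the means. A minimal sketch using caret's diff method for resamples objects:

# Paired differences in RMSE between models, with Bonferroni-adjusted t-tests
rmseDiffs <- diff(resamp, metric = "RMSE")
summary(rmseDiffs)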

Test-Set Evaluation and Variable Importance

preds <- lapply(list(KNN=knnModel,MARS=marsModel,SVM=svmModel,RF=rfModel,GBM=gbmModel), predict, newdata=testData$x)
perf <- sapply(preds, function(p) postResample(p, testData$y))
t(perf)
##          RMSE  Rsquared      MAE
## KNN  3.439214 0.5910585 2.773497
## MARS 1.982857 0.8464011 1.521008
## SVM  2.517090 0.7517280 1.930327
## RF   2.869054 0.6955246 2.258341
## GBM  2.216743 0.8079666 1.721698
varImp(marsModel)$importance %>%
  tibble::rownames_to_column("Feature") %>% arrange(desc(Overall)) %>% head(10) %>%
  ggplot(aes(x=reorder(Feature, Overall), y=Overall)) +
  geom_col() + coord_flip() +
  labs(title="MARS Variable Importance — Friedman 1 (Fast Version)", x=NULL, y="Importance")
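
The original exercise also asks whether MARS selects the informative predictors X1–X5. The fitted earth object inside the caret wrapper reports this directly; a minimal sketch:

# Predictors retained by the selected MARS model, with earth's own
# importance measures (number of subsets, GCV, RSS)
earth::evimp(marsModel$finalModel)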


Exercise 7.5 — Chemical Manufacturing Process (Fast Knit)

7.5 (a) Model Performance

data("ChemicalManufacturingProcess")
cmp <- ChemicalManufacturingProcess
set.seed(200)
inTrain <- createDataPartition(cmp$Yield, p=0.8, list=FALSE)
trainX <- cmp[inTrain, -1]; trainY <- cmp$Yield[inTrain]
testX <- cmp[-inTrain, -1]; testY <- cmp$Yield[-inTrain]
pp <- preProcess(trainX, method=c("medianImpute","zv","center","scale"))
trainX <- predict(pp, trainX); testX <- predict(pp, testX)
set.seed(200)
svm_cmp  <- train(trainX, trainY, method="svmRadial", tuneLength=5, trControl=ctrl, metric=metric)
mars_cmp <- train(trainX, trainY, method="earth", tuneLength=5, trControl=ctrl, metric=metric)
rf_cmp   <- train(trainX, trainY, method="rf", tuneLength=3, trControl=ctrl, metric=metric, importance=TRUE)
gbm_cmp  <- train(trainX, trainY, method="gbm", tuneLength=3, trControl=ctrl, metric=metric, verbose=FALSE)
nnet_cmp <- train(trainX, trainY, method="nnet", linout=TRUE, trace=FALSE, tuneLength=3, trControl=ctrl, metric=metric)
res_cmp <- resamples(list(SVM=svm_cmp,MARS=mars_cmp,RF=rf_cmp,GBM=gbm_cmp,NNET=nnet_cmp))
dotplot(res_cmp, metric="RMSE")

summary(res_cmp)
## 
## Call:
## summary.resamples(object = res_cmp)
## 
## Models: SVM, MARS, RF, GBM, NNET 
## Number of resamples: 3 
## 
## MAE 
##           Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## SVM  0.8523109 0.8554566 0.8586023 0.8823431 0.8973592 0.9361160    0
## MARS 0.7965739 0.9728732 1.1491725 1.0642395 1.1980724 1.2469722    0
## RF   0.8356929 0.8552098 0.8747267 0.8955150 0.9254260 0.9761254    0
## GBM  0.8181439 0.8409093 0.8636748 0.8833286 0.9159210 0.9681672    0
## NNET 1.1651261 1.1866438 1.2081615 1.1984109 1.2150533 1.2219450    0
## 
## RMSE 
##           Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## SVM  1.0485325 1.090085 1.131638 1.121799 1.158432 1.185225    0
## MARS 0.9498128 1.191531 1.433249 1.328771 1.518250 1.603251    0
## RF   1.0493208 1.077820 1.106320 1.129179 1.169108 1.231896    0
## GBM  1.0264474 1.082407 1.138366 1.110495 1.152519 1.166672    0
## NNET 1.4504581 1.465980 1.481502 1.476003 1.488775 1.496048    0
## 
## Rsquared 
##           Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## SVM  0.5691201 0.5912098 0.6132995 0.6216060 0.6478490 0.6823985    0
## MARS 0.3431908 0.3691018 0.3950128 0.4554276 0.5115460 0.6280792    0
## RF   0.6143960 0.6209804 0.6275649 0.6446071 0.6597127 0.6918606    0
## GBM  0.6027174 0.6106278 0.6185382 0.6207064 0.6297008 0.6408634    0
## NNET 0.3384389 0.3516045 0.3647701 0.3898919 0.4156185 0.4664668    0
preds_cmp <- lapply(list(SVM=svm_cmp,MARS=mars_cmp,RF=rf_cmp,GBM=gbm_cmp,NNET=nnet_cmp), predict, newdata=testX)
perf_cmp <- sapply(preds_cmp, function(p) postResample(p, testY))
t(perf_cmp)
##          RMSE   Rsquared       MAE
## SVM  1.330960 0.61945385 1.0017224
## MARS 1.654897 0.32853139 1.2847871
## RF   1.320086 0.60641257 0.9244915
## GBM  1.240818 0.64554151 0.9251764
## NNET 6.546625 0.04155309 4.5325295

Interpretation:
On the test set GBM has the lowest RMSE (1.24), with RF (1.32) and SVM-RBF (1.33) close behind; MARS is noticeably weaker, and the small single-layer neural network fails outright (RMSE ≈ 6.5), so its result should be discounted under this fast tuning grid.
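
For reporting, the resampled and test-set RMSE values can be gathered into one table from the objects above (tibble and dplyr are already loaded); a sketch:

# Mean CV RMSE (from the resamples summary) alongside test-set RMSE
cv_rmse   <- summary(res_cmp)$statistics$RMSE[, "Mean"]
test_rmse <- perf_cmp["RMSE", ]
tibble(Model     = names(cv_rmse),
       CV_RMSE   = round(cv_rmse, 3),
       Test_RMSE = round(test_rmse[names(cv_rmse)], 3)) %>%
  arrange(Test_RMSE)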

7.5 (b) Variable Importance

set.seed(200)
glmnet_cmp <- train(trainX, trainY, method="glmnet", tuneLength=10, trControl=ctrl, metric=metric)
postResample(predict(glmnet_cmp, testX), testY)
##      RMSE  Rsquared       MAE 
## 3.1904518 0.1328109 1.4627602
# SVM-RBF is used as the nonlinear model for importance; its CV RMSE is essentially tied with GBM's.
# kernlab SVMs have no model-specific importance, so caret falls back to a filter-based
# (loess R^2) measure for each predictor.
bestModel <- svm_cmp
vi_nonlin <- varImp(bestModel)$importance %>%
  tibble::rownames_to_column("Feature") %>%
  arrange(desc(Overall)) %>%
  head(10)
ggplot(vi_nonlin, aes(x=reorder(Feature, Overall), y=Overall)) + geom_col() + coord_flip() +
  labs(title="Top 10 Variables — Optimal Nonlinear Model", x=NULL, y="Importance")

vi_linear <- varImp(glmnet_cmp)$importance %>% tibble::rownames_to_column("Feature") %>% arrange(desc(Overall)) %>% head(10)
ggplot(vi_linear, aes(x=reorder(Feature, Overall), y=Overall)) + geom_col() + coord_flip() +
  labs(title="Top 10 Variables — Linear Model (GLMNET)", x=NULL, y="Importance")
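
Part (b) also asks whether biological or process predictors dominate each list; counting the name prefixes in the two top-10 tables gives a quick answer. A sketch:

# Tally BiologicalMaterial vs ManufacturingProcess predictors in each top-10 list
list(Nonlinear = vi_nonlin$Feature, Linear = vi_linear$Feature) %>%
  lapply(function(f) table(ifelse(grepl("^Biological", f), "Biological", "Process")))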

7.5 (c) Predictor Relationships

unique_vars <- setdiff(vi_nonlin$Feature, vi_linear$Feature)
if(length(unique_vars)==0){
  cat("All top predictors overlap between models.")
} else {
  df_plot <- data.frame(Yield=trainY, trainX[,unique_vars,drop=FALSE])
  for(v in unique_vars){
    print(
      ggplot(df_plot, aes(x = .data[[v]], y = Yield)) +
      geom_point(alpha=0.6) +
      geom_smooth(method="loess", se=TRUE, col="steelblue") +
      labs(title=paste("Yield vs", v), subtitle="Unique nonlinear predictor")
    )
  }
}

Interpretation:
The predictors that only the nonlinear model ranks highly tend to show curved or threshold-like relationships with Yield that a linear model cannot capture. These plots point to operating ranges of the corresponding process variables where adjustments are most likely to change Yield.
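
A quick numerical complement to the plots: comparing Pearson and Spearman correlations for each uniquely nonlinear predictor can flag monotone-but-curved relationships (Spearman noticeably above Pearson). A sketch, assuming unique_vars is non-empty:

# Linear vs rank correlation of each uniquely nonlinear predictor with Yield
if (length(unique_vars) > 0) {
  sapply(trainX[, unique_vars, drop = FALSE], function(x)
    c(Pearson  = cor(x, trainY),
      Spearman = cor(x, trainY, method = "spearman")))
}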