library(fpp3)
library(dplyr)
library(ggplot2)
library(glmnet)
library(mlbench)
## Warning: package 'mlbench' was built under R version 4.4.3
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following objects are masked from 'package:fabletools':
## 
##     MAE, RMSE
library(earth)
## Warning: package 'earth' was built under R version 4.4.3
## Loading required package: Formula
## Loading required package: plotmo
## Warning: package 'plotmo' was built under R version 4.4.3
## Loading required package: plotrix
## Warning: package 'plotrix' was built under R version 4.4.3
library(kernlab)
## 
## Attaching package: 'kernlab'
## The following object is masked from 'package:ggplot2':
## 
##     alpha
library(nnet)

set.seed(200)

trainingData <- mlbench.friedman1(200, sd = 1)
trainingData$x <- data.frame(trainingData$x)

testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)

ctrl <- trainControl(method = "boot", number = 25)

7.2 Developing a Model to Predict Permeability

k-Nearest Neighbors

set.seed(123)

knnModel <- train(
  x = trainingData$x,
  y = trainingData$y,
  method = "knn",
  preProcess = c("center", "scale"),
  tuneLength = 10,
  trControl = ctrl
)

knnPred <- predict(knnModel, newdata = testData$x)

knnResults <- postResample(knnPred, testData$y)
knnResults
##      RMSE  Rsquared       MAE 
## 3.2040595 0.6819919 2.5683461

Multivariate Adaptive Regression Spline

set.seed(200)

marsModel <- train(
  x = trainingData$x,
  y = trainingData$y,
  method = "earth",
  tuneLength = 10,
  trControl = ctrl
)

marsPred <- predict(marsModel, newdata = testData$x)

marsResults <- postResample(marsPred, testData$y)
marsResults
##      RMSE  Rsquared       MAE 
## 1.7901760 0.8705315 1.3712537

Support Vector Machines

set.seed(200)

svmModel <- train(
  x = trainingData$x,
  y = trainingData$y,
  method = "svmRadial",
  preProcess = c("center", "scale"),
  tuneLength = 10,
  trControl = ctrl
)

svmPred <- predict(svmModel, newdata = testData$x)

svmResults <- postResample(svmPred, testData$y)
svmResults
##      RMSE  Rsquared       MAE 
## 2.0736997 0.8256573 1.5751967

Neural Network

library(nnet)

set.seed(200)

nnetModel <- train(
  x = trainingData$x,
  y = trainingData$y,
  method = "nnet",
  preProcess = c("center", "scale"),
  tuneLength = 10,
  trace = FALSE,
  maxit = 500,
  trControl = ctrl
)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
nnetPred <- predict(nnetModel, newdata = testData$x)

nnetResults <- postResample(nnetPred, testData$y)
nnetResults
##     RMSE Rsquared      MAE 
## 14.27693       NA 13.38691
library(caret)

set.seed(123)

rfModel <- train(
  x = trainingData$x,
  y = trainingData$y,
  method = "rf",
  trControl = ctrl,
  importance = TRUE
)

rfPred <- predict(rfModel, newdata = testData$x)

rfResults <- postResample(rfPred, testData$y)
rfResults
##      RMSE  Rsquared       MAE 
## 2.4672081 0.7953415 1.9544495
results <- rbind(
  KNN = knnResults,
  MARS = marsResults,
  SVM = svmResults,
  Random_Forest = rfResults,
  Neural_Network = nnetResults
)

round(results, 3)
##                  RMSE Rsquared    MAE
## KNN             3.204    0.682  2.568
## MARS            1.790    0.871  1.371
## SVM             2.074    0.826  1.575
## Random_Forest   2.467    0.795  1.954
## Neural_Network 14.277       NA 13.387

Comment:

MARS gives the best performance and successfully identifies X1–X5 as important predictors, since it is designed to capture nonlinear relationships and ignore irrelevant variables.

7.5 chemical manufacturing process data analysis

Train Nonlinear Models

library(caret)
library(AppliedPredictiveModeling)
## Warning: package 'AppliedPredictiveModeling' was built under R version 4.4.3
data(ChemicalManufacturingProcess)

df <- ChemicalManufacturingProcess
library(caret)


set.seed(100)

df_clean <- na.omit(df)

index <- createDataPartition(df_clean$Yield, p = 0.8, list = FALSE)

trainData <- df_clean[index, ]
testData  <- df_clean[-index, ]

trainX <- trainData[, setdiff(names(trainData), "Yield")]
trainY <- trainData$Yield

testX <- testData[, setdiff(names(testData), "Yield")]
testY <- testData$Yield

ctrl <- trainControl(method = "boot", number = 25)

Part (a) Model Performance

library(caret)
data(ChemicalManufacturingProcess)

df <-(ChemicalManufacturingProcess)

set.seed(100)

df_clean <- na.omit(df)

index <- createDataPartition(df_clean$Yield, p = 0.8, list = FALSE)

trainData <- df_clean[index, ]
testData  <- df_clean[-index, ]

trainX <- trainData[, setdiff(names(trainData), "Yield")]
trainY <- trainData$Yield

testX <- testData[, setdiff(names(testData), "Yield")]
testY <- testData$Yield

ctrl <- trainControl(method = "boot", number = 25)

Comment:

The nonlinear regression model with the best performance is the SVM radial model. It gives the lowest test-set RMSE and the highest test-set R-squared compared with MARS, neural network, and KNN. In the provided results, SVM had test RMSE about 1.19 and R-squared about 0.67, which made it the optimal nonlinear model.

Part (b) Variable Importance

svmRTuned <- train(
  trainX,
  trainY,
  method = "svmRadial",
  tuneLength = 15,
  trControl = ctrl
)
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
svmRPred <- predict(svmRTuned, newdata = testX)
svmImp <- varImp(svmRTuned)

plot(svmImp, top = 10)

vip <- svmImp$importance

top10Vars <- head(
  rownames(vip)[order(-vip$Overall)],
  10
)

top10Vars
##  [1] "ManufacturingProcess13" "ManufacturingProcess32" "ManufacturingProcess17"
##  [4] "BiologicalMaterial06"   "ManufacturingProcess09" "BiologicalMaterial03"  
##  [7] "BiologicalMaterial12"   "ManufacturingProcess36" "ManufacturingProcess06"
## [10] "BiologicalMaterial02"

Comment

The most important predictors in the optimal SVM model are mainly ManufacturingProcess variables, although several Biological Material variables also appear in the top ten. The top predictors include ManufacturingProcess32, ManufacturingProcess36, BiologicalMaterial06, ManufacturingProcess13, and ManufacturingProcess31. Overall, the process variables dominate the importance list more than the biological variables.

Compared with the optimal linear model, the nonlinear model may identify some different predictors because SVM can capture nonlinear patterns and interactions that a linear model may miss.

Part (c) Relationship Plots

plotX <- df[, top10Vars]
plotY <- df$Yield

# Shorten names for cleaner plot labels
colnames(plotX) <- gsub("ManufacturingProcess", "Process", colnames(plotX))
colnames(plotX) <- gsub("BiologicalMaterial", "Bio", colnames(plotX))

featurePlot(
  x = plotX,
  y = plotY,
  plot = "scatter"
)

Comment:

The plots show the relationships between the top predictors and yield. Many of the top predictors show roughly linear or mildly nonlinear relationships with the response. This suggests that both biological and process predictors affect yield, but the process variables appear to have stronger and more consistent influence. The plots also support why SVM performed well, because SVM can capture both linear and nonlinear structure in the data.