library(fpp3)
library(dplyr)
library(ggplot2)
library(glmnet)
library(mlbench)
## Warning: package 'mlbench' was built under R version 4.4.3
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following objects are masked from 'package:fabletools':
##
## MAE, RMSE
library(earth)
## Warning: package 'earth' was built under R version 4.4.3
## Loading required package: Formula
## Loading required package: plotmo
## Warning: package 'plotmo' was built under R version 4.4.3
## Loading required package: plotrix
## Warning: package 'plotrix' was built under R version 4.4.3
library(kernlab)
##
## Attaching package: 'kernlab'
## The following object is masked from 'package:ggplot2':
##
## alpha
library(nnet)
set.seed(200)
trainingData <- mlbench.friedman1(200, sd = 1)
trainingData$x <- data.frame(trainingData$x)
testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)
ctrl <- trainControl(method = "boot", number = 25)
set.seed(123)
knnModel <- train(
x = trainingData$x,
y = trainingData$y,
method = "knn",
preProcess = c("center", "scale"),
tuneLength = 10,
trControl = ctrl
)
knnPred <- predict(knnModel, newdata = testData$x)
knnResults <- postResample(knnPred, testData$y)
knnResults
## RMSE Rsquared MAE
## 3.2040595 0.6819919 2.5683461
set.seed(200)
marsModel <- train(
x = trainingData$x,
y = trainingData$y,
method = "earth",
tuneLength = 10,
trControl = ctrl
)
marsPred <- predict(marsModel, newdata = testData$x)
marsResults <- postResample(marsPred, testData$y)
marsResults
## RMSE Rsquared MAE
## 1.7901760 0.8705315 1.3712537
set.seed(200)
svmModel <- train(
x = trainingData$x,
y = trainingData$y,
method = "svmRadial",
preProcess = c("center", "scale"),
tuneLength = 10,
trControl = ctrl
)
svmPred <- predict(svmModel, newdata = testData$x)
svmResults <- postResample(svmPred, testData$y)
svmResults
## RMSE Rsquared MAE
## 2.0736997 0.8256573 1.5751967
library(nnet)
set.seed(200)
nnetModel <- train(
x = trainingData$x,
y = trainingData$y,
method = "nnet",
preProcess = c("center", "scale"),
tuneLength = 10,
trace = FALSE,
maxit = 500,
trControl = ctrl
)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
nnetPred <- predict(nnetModel, newdata = testData$x)
nnetResults <- postResample(nnetPred, testData$y)
nnetResults
## RMSE Rsquared MAE
## 14.27693 NA 13.38691
library(caret)
set.seed(123)
rfModel <- train(
x = trainingData$x,
y = trainingData$y,
method = "rf",
trControl = ctrl,
importance = TRUE
)
rfPred <- predict(rfModel, newdata = testData$x)
rfResults <- postResample(rfPred, testData$y)
rfResults
## RMSE Rsquared MAE
## 2.4672081 0.7953415 1.9544495
results <- rbind(
KNN = knnResults,
MARS = marsResults,
SVM = svmResults,
Random_Forest = rfResults,
Neural_Network = nnetResults
)
round(results, 3)
## RMSE Rsquared MAE
## KNN 3.204 0.682 2.568
## MARS 1.790 0.871 1.371
## SVM 2.074 0.826 1.575
## Random_Forest 2.467 0.795 1.954
## Neural_Network 14.277 NA 13.387
MARS gives the best performance and successfully identifies X1–X5 as important predictors, since it is designed to capture nonlinear relationships and ignore irrelevant variables.
library(caret)
library(AppliedPredictiveModeling)
## Warning: package 'AppliedPredictiveModeling' was built under R version 4.4.3
data(ChemicalManufacturingProcess)
df <- ChemicalManufacturingProcess
library(caret)
set.seed(100)
df_clean <- na.omit(df)
index <- createDataPartition(df_clean$Yield, p = 0.8, list = FALSE)
trainData <- df_clean[index, ]
testData <- df_clean[-index, ]
trainX <- trainData[, setdiff(names(trainData), "Yield")]
trainY <- trainData$Yield
testX <- testData[, setdiff(names(testData), "Yield")]
testY <- testData$Yield
ctrl <- trainControl(method = "boot", number = 25)
library(caret)
data(ChemicalManufacturingProcess)
df <-(ChemicalManufacturingProcess)
set.seed(100)
df_clean <- na.omit(df)
index <- createDataPartition(df_clean$Yield, p = 0.8, list = FALSE)
trainData <- df_clean[index, ]
testData <- df_clean[-index, ]
trainX <- trainData[, setdiff(names(trainData), "Yield")]
trainY <- trainData$Yield
testX <- testData[, setdiff(names(testData), "Yield")]
testY <- testData$Yield
ctrl <- trainControl(method = "boot", number = 25)
svmRTuned <- train(
trainX,
trainY,
method = "svmRadial",
tuneLength = 15,
trControl = ctrl
)
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
svmRPred <- predict(svmRTuned, newdata = testX)
svmImp <- varImp(svmRTuned)
plot(svmImp, top = 10)
vip <- svmImp$importance
top10Vars <- head(
rownames(vip)[order(-vip$Overall)],
10
)
top10Vars
## [1] "ManufacturingProcess13" "ManufacturingProcess32" "ManufacturingProcess17"
## [4] "BiologicalMaterial06" "ManufacturingProcess09" "BiologicalMaterial03"
## [7] "BiologicalMaterial12" "ManufacturingProcess36" "ManufacturingProcess06"
## [10] "BiologicalMaterial02"
The most important predictors in the optimal SVM model are mainly ManufacturingProcess variables, although several Biological Material variables also appear in the top ten. The top predictors include ManufacturingProcess32, ManufacturingProcess36, BiologicalMaterial06, ManufacturingProcess13, and ManufacturingProcess31. Overall, the process variables dominate the importance list more than the biological variables.
Compared with the optimal linear model, the nonlinear model may identify some different predictors because SVM can capture nonlinear patterns and interactions that a linear model may miss.
plotX <- df[, top10Vars]
plotY <- df$Yield
# Shorten names for cleaner plot labels
colnames(plotX) <- gsub("ManufacturingProcess", "Process", colnames(plotX))
colnames(plotX) <- gsub("BiologicalMaterial", "Bio", colnames(plotX))
featurePlot(
x = plotX,
y = plotY,
plot = "scatter"
)
The plots show the relationships between the top predictors and yield. Many of the top predictors show roughly linear or mildly nonlinear relationships with the response. This suggests that both biological and process predictors affect yield, but the process variables appear to have stronger and more consistent influence. The plots also support why SVM performed well, because SVM can capture both linear and nonlinear structure in the data.
Comment:
The nonlinear regression model with the best performance is the SVM radial model. It gives the lowest test-set RMSE and the highest test-set R-squared compared with MARS, neural network, and KNN. In the provided results, SVM had test RMSE about 1.19 and R-squared about 0.67, which made it the optimal nonlinear model.