library(mlbench)
library(caret)
library(AppliedPredictiveModeling)
set.seed(200)
trainingData <- mlbench.friedman1(200, sd = 1)
trainingData$x <- data.frame(trainingData$x)
testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)
featurePlot(trainingData$x, trainingData$y,
plot = "scatter",
between = list(x = 1, y = 1))
The Friedman simulation creates a nonlinear relationship where only X1-X5 are informative predictors, while X6-X10 are noise variables. The true model includes interactions and squared terms.
set.seed(100)
ctrl <- trainControl(method = "cv", number = 10)
knnModel <- train(x = trainingData$x,
y = trainingData$y,
method = "knn",
preProc = c("center", "scale"),
tuneLength = 10,
trControl = ctrl)
knnModel$bestTune
## k
## 3 9
knnPred <- predict(knnModel, newdata = testData$x)
knnResults <- postResample(pred = knnPred, obs = testData$y)
knnResults
## RMSE Rsquared MAE
## 3.1172319 0.6556622 2.4899907
KNN achieves an R² of 0.656 on the test set with k = 9 neighbors. This model captures nonlinear patterns without explicit feature engineering.
set.seed(100)
nnetGrid <- expand.grid(decay = c(0, 0.01, 0.1),
size = c(1, 3, 5, 7))
nnetModel <- train(x = trainingData$x,
y = trainingData$y,
method = "nnet",
preProc = c("center", "scale"),
tuneGrid = nnetGrid,
trControl = ctrl,
linout = TRUE,
trace = FALSE,
maxit = 500)
nnetModel$bestTune
## size decay
## 2 3 0
nnetPred <- predict(nnetModel, newdata = testData$x)
nnetResults <- postResample(pred = nnetPred, obs = testData$y)
nnetResults
## RMSE Rsquared MAE
## 1.7957507 0.8727715 1.3798915
The neural network with 3 hidden units and decay = 0 achieves R² of 0.873. Neural networks can model complex nonlinear interactions naturally.
set.seed(100)
marsModel <- train(x = trainingData$x,
y = trainingData$y,
method = "earth",
preProc = c("center", "scale"),
tuneLength = 10,
trControl = ctrl)
marsModel$bestTune
## nprune degree
## 10 15 1
marsPred <- predict(marsModel, newdata = testData$x)
marsResults <- postResample(pred = marsPred, obs = testData$y)
marsResults
## RMSE Rsquared MAE
## 1.8136467 0.8677298 1.3911836
varImp(marsModel)
## earth variable importance
##
## Overall
## X1 100.00
## X4 82.08
## X2 62.79
## X5 38.07
## X3 25.80
## X6 0.00
MARS achieves R² of 0.868 using 15 terms. Examining variable importance, MARS successfully identifies X1-X5 as the informative predictors, correctly ignoring the noise variables X6-X10. This demonstrates MARS’s automatic feature selection capability.
set.seed(100)
svmModel <- train(x = trainingData$x,
y = trainingData$y,
method = "svmRadial",
preProc = c("center", "scale"),
tuneLength = 10,
trControl = ctrl)
svmModel$bestTune
## sigma C
## 6 0.06509124 8
svmPred <- predict(svmModel, newdata = testData$x)
svmResults <- postResample(pred = svmPred, obs = testData$y)
svmResults
## RMSE Rsquared MAE
## 2.0631908 0.8275736 1.5662213
SVM with radial basis function achieves R² of 0.828. The optimal parameters are C = 8 and sigma = 0.0651.
results_friedman <- data.frame(
Model = c("KNN", "Neural Network", "MARS", "SVM"),
RMSE = c(knnResults[1], nnetResults[1], marsResults[1], svmResults[1]),
Rsquared = c(knnResults[2], nnetResults[2], marsResults[2], svmResults[2])
)
results_friedman[order(results_friedman$Rsquared, decreasing = TRUE), ]
## Model RMSE Rsquared
## 2 Neural Network 1.795751 0.8727715
## 3 MARS 1.813647 0.8677298
## 4 SVM 2.063191 0.8275736
## 1 KNN 3.117232 0.6556622
Among the nonlinear models, Neural Network provides the best performance with R² = 0.873. Neural Network, MARS, and SVM all perform quite well (R² > 0.82), successfully capturing the nonlinear Friedman function. MARS stands out for combining strong performance with interpretability, successfully selecting only the informative predictors X1-X5 while ignoring noise variables. KNN performs noticeably worse, likely struggling with the high dimensionality relative to sample size.
data(ChemicalManufacturingProcess)
preproc_impute <- preProcess(ChemicalManufacturingProcess[, -1],
method = c("knnImpute"))
imputed_predictors <- predict(preproc_impute,
ChemicalManufacturingProcess[, -1])
set.seed(456)
train_idx <- createDataPartition(ChemicalManufacturingProcess$Yield,
p = 0.75, list = FALSE)
train_x <- imputed_predictors[train_idx, ]
train_y <- ChemicalManufacturingProcess$Yield[train_idx]
test_x <- imputed_predictors[-train_idx, ]
test_y <- ChemicalManufacturingProcess$Yield[-train_idx]
ctrl_chem <- trainControl(method = "cv", number = 10)
set.seed(456)
nnet_grid <- expand.grid(decay = c(0, 0.01, 0.1),
size = c(3, 5, 7))
nnet_chem <- train(x = train_x,
y = train_y,
method = "nnet",
preProc = c("center", "scale"),
tuneGrid = nnet_grid,
trControl = ctrl_chem,
linout = TRUE,
trace = FALSE,
maxit = 500)
nnet_chem$bestTune
## size decay
## 9 7 0.1
nnet_pred <- predict(nnet_chem, test_x)
nnet_test_r2 <- cor(nnet_pred, test_y)^2
nnet_test_rmse <- sqrt(mean((nnet_pred - test_y)^2))
set.seed(456)
mars_chem <- train(x = train_x,
y = train_y,
method = "earth",
preProc = c("center", "scale"),
tuneLength = 15,
trControl = ctrl_chem)
mars_chem$bestTune
## nprune degree
## 2 3 1
mars_pred <- predict(mars_chem, test_x)
mars_test_r2 <- cor(mars_pred, test_y)^2
mars_test_rmse <- sqrt(mean((mars_pred - test_y)^2))
set.seed(456)
svm_chem <- train(x = train_x,
y = train_y,
method = "svmRadial",
preProc = c("center", "scale"),
tuneLength = 10,
trControl = ctrl_chem)
svm_chem$bestTune
## sigma C
## 7 0.01401408 16
svm_pred <- predict(svm_chem, test_x)
svm_test_r2 <- cor(svm_pred, test_y)^2
svm_test_rmse <- sqrt(mean((svm_pred - test_y)^2))
set.seed(456)
knn_chem <- train(x = train_x,
y = train_y,
method = "knn",
preProc = c("center", "scale"),
tuneLength = 10,
trControl = ctrl_chem)
knn_chem$bestTune
## k
## 1 5
knn_pred <- predict(knn_chem, test_x)
knn_test_r2 <- cor(knn_pred, test_y)^2
knn_test_rmse <- sqrt(mean((knn_pred - test_y)^2))
results_chem <- data.frame(
Model = c("Neural Network", "MARS", "SVM", "KNN"),
CV_Rsquared = c(max(nnet_chem$results$Rsquared),
max(mars_chem$results$Rsquared),
max(svm_chem$results$Rsquared),
max(knn_chem$results$Rsquared)),
Test_R2 = c(nnet_test_r2, mars_test_r2, svm_test_r2, knn_test_r2),
Test_RMSE = c(nnet_test_rmse, mars_test_rmse, svm_test_rmse, knn_test_rmse)
)
results_chem[order(results_chem$Test_R2, decreasing = TRUE), ]
## Model CV_Rsquared Test_R2 Test_RMSE
## 3 SVM 0.6482254 5.600323e-01 1.199467
## 2 MARS 0.5604391 5.364571e-01 1.244812
## 4 KNN 0.5267843 3.541141e-01 1.452068
## 1 Neural Network 0.4179899 6.457706e-06 3.521851
The SVM model achieves the best test set performance with R² = 0.56 and RMSE = 1.2%, with MARS performing very similarly (R² = 0.54, RMSE = 1.24%). Both outperform the linear PLS model from Problem 6.3 (R² = 0.202), confirming important nonlinear relationships in the manufacturing data. KNN shows moderate performance (R² = 0.35), capturing some of the nonlinear patterns. The Neural Network model appears to have severely overfit the training data, achieving poor test performance (R² ≈ 0, RMSE = 3.52%) despite reasonable cross-validation results, suggesting the model memorized training patterns that don’t generalize.
best_model_name <- results_chem$Model[which.max(results_chem$Test_R2)]
if(best_model_name == "MARS") {
best_model <- mars_chem
} else if(best_model_name == "Neural Network") {
best_model <- nnet_chem
} else if(best_model_name == "SVM") {
best_model <- svm_chem
} else {
best_model <- knn_chem
}
var_imp_nonlinear <- varImp(best_model)
plot(var_imp_nonlinear, top = 20, main = "Top 20 Predictors: Optimal Nonlinear Model")
top_20_nonlinear <- rownames(var_imp_nonlinear$importance)[order(var_imp_nonlinear$importance$Overall,
decreasing = TRUE)[1:20]]
manufacturing_nonlinear <- grep("^Manufacturing", top_20_nonlinear, value = TRUE)
biological_nonlinear <- grep("^Biological", top_20_nonlinear, value = TRUE)
length(manufacturing_nonlinear)
## [1] 12
length(biological_nonlinear)
## [1] 8
In the optimal nonlinear model, 12 manufacturing predictors and 8 biological predictors appear in the top 20. Comparing to the linear PLS model (which had 12 manufacturing and 8 biological in top 20), the nonlinear model shows similar dominance of manufacturing variables, confirming these controllable process parameters are consistently important across model types.
top_10_nonlinear <- rownames(var_imp_nonlinear$importance)[order(var_imp_nonlinear$importance$Overall,
decreasing = TRUE)[1:10]]
top_10_nonlinear
## [1] "ManufacturingProcess32" "ManufacturingProcess13" "BiologicalMaterial06"
## [4] "BiologicalMaterial12" "BiologicalMaterial03" "ManufacturingProcess36"
## [7] "ManufacturingProcess17" "BiologicalMaterial02" "ManufacturingProcess09"
## [10] "ManufacturingProcess31"
pls_chem_comparison <- train(train_x, train_y,
method = "pls",
tuneLength = 20,
trControl = ctrl_chem,
preProcess = c("center", "scale"))
var_imp_pls <- varImp(pls_chem_comparison)
top_10_pls <- rownames(var_imp_pls$importance)[order(var_imp_pls$importance$Overall,
decreasing = TRUE)[1:10]]
unique_predictors <- setdiff(top_10_nonlinear, top_10_pls)
unique_predictors
## [1] "BiologicalMaterial12" "ManufacturingProcess31"
Predictors unique to the nonlinear model’s top 10: BiologicalMaterial12, ManufacturingProcess31. These predictors may have nonlinear relationships with yield that the linear model couldn’t capture.
if(length(unique_predictors) > 0) {
n_plots <- min(4, length(unique_predictors))
par(mfrow = c(2, 2))
for(i in 1:n_plots) {
pred <- unique_predictors[i]
plot(imputed_predictors[, pred], ChemicalManufacturingProcess$Yield,
xlab = pred,
ylab = "Yield (%)",
main = paste("Yield vs", pred),
pch = 16, col = rgb(0, 0, 1, 0.5))
loess_fit <- loess(ChemicalManufacturingProcess$Yield ~ imputed_predictors[, pred])
x_order <- order(imputed_predictors[, pred])
lines(imputed_predictors[x_order, pred],
predict(loess_fit)[x_order],
col = "red", lwd = 2)
}
par(mfrow = c(1, 1))
}
The scatter plots with LOESS curves show clear nonlinear patterns that the linear model couldn’t capture. Several predictors show threshold effects or curved relationships with yield, which explains the superior performance of nonlinear models. These findings suggest that simply increasing or decreasing process parameters linearly won’t maximize yield. Instead, certain variables appear to have optimal operating ranges or diminishing returns, meaning operators should focus on keeping these parameters within specific windows rather than pushing them in one direction. This insight could be valuable for refining manufacturing protocols to boost yields.