library(ggplot2)
library(RANN)
library(magrittr)
library(caret)
## Loading required package: lattice
library(ggplot2)
library(reshape2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(nnet)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ✔ readr 2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::extract() masks magrittr::extract()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::lift() masks caret::lift()
## ✖ purrr::set_names() masks magrittr::set_names()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(MASS)
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
Question 7.2
library(mlbench)
set.seed(200)
trainingData <- mlbench.friedman1(200, sd = 1)
## We convert the 'x' data from a matrix to a data frame
## One reason is that this will give the columns names.
trainingData$x <- data.frame(trainingData$x)
## Look at the data using
featurePlot(trainingData$x, trainingData$y)

Setting up the test data
testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)
performance_log <- data.frame(matrix(ncol = 3, nrow = 0,
dimnames = list(NULL, c("RMSE", "Rsquared", "MAE"))),
stringsAsFactors = FALSE)
We use performance_log to organize and track each model’s
performance metrics, which simplifies comparing models based on RMSE,
R-squared, and MAE. By recording these metrics as each model is
evaluated, we avoid the need to re-run models to retrieve results later
or go back and forth in our code to compare results.
Tune several models on these data. For example:
knnModel <- train(x = trainingData$x,
y = trainingData$y,
method = "knn",
preProc = c("center", "scale"),
tuneLength = 10)
knnModel
## k-Nearest Neighbors
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 3.466085 0.5121775 2.816838
## 7 3.349428 0.5452823 2.727410
## 9 3.264276 0.5785990 2.660026
## 11 3.214216 0.6024244 2.603767
## 13 3.196510 0.6176570 2.591935
## 15 3.184173 0.6305506 2.577482
## 17 3.183130 0.6425367 2.567787
## 19 3.198752 0.6483184 2.592683
## 21 3.188993 0.6611428 2.588787
## 23 3.200458 0.6638353 2.604529
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 17.
knnPred <- predict(knnModel, newdata = testData$x)
performance_log <- rbind(performance_log, postResample(pred = knnPred, obs = testData$y))
We’ll also train a MARS model
marsGrid <- expand.grid(.degree = 1:2, .nprune = 2:38)
marsMod <- train(x=trainingData$x,
y=trainingData$y, method = "earth", tuneGrid = marsGrid,
trControl = trainControl(method = "cv"))
## Loading required package: earth
## Loading required package: Formula
## Loading required package: plotmo
## Loading required package: plotrix
marsMod
## Multivariate Adaptive Regression Spline
##
## 200 samples
## 10 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ...
## Resampling results across tuning parameters:
##
## degree nprune RMSE Rsquared MAE
## 1 2 4.462296 0.2176253 3.697979
## 1 3 3.720663 0.4673821 2.949121
## 1 4 2.680039 0.7094916 2.123848
## 1 5 2.333538 0.7781559 1.856629
## 1 6 2.367933 0.7754329 1.901509
## 1 7 1.809983 0.8656526 1.414985
## 1 8 1.692656 0.8838936 1.333678
## 1 9 1.704958 0.8845683 1.339517
## 1 10 1.688559 0.8842495 1.309838
## 1 11 1.669043 0.8886165 1.296522
## 1 12 1.645066 0.8892796 1.271981
## 1 13 1.655570 0.8886896 1.271232
## 1 14 1.666354 0.8879143 1.285545
## 1 15 1.666354 0.8879143 1.285545
## 1 16 1.666354 0.8879143 1.285545
## 1 17 1.666354 0.8879143 1.285545
## 1 18 1.666354 0.8879143 1.285545
## 1 19 1.666354 0.8879143 1.285545
## 1 20 1.666354 0.8879143 1.285545
## 1 21 1.666354 0.8879143 1.285545
## 1 22 1.666354 0.8879143 1.285545
## 1 23 1.666354 0.8879143 1.285545
## 1 24 1.666354 0.8879143 1.285545
## 1 25 1.666354 0.8879143 1.285545
## 1 26 1.666354 0.8879143 1.285545
## 1 27 1.666354 0.8879143 1.285545
## 1 28 1.666354 0.8879143 1.285545
## 1 29 1.666354 0.8879143 1.285545
## 1 30 1.666354 0.8879143 1.285545
## 1 31 1.666354 0.8879143 1.285545
## 1 32 1.666354 0.8879143 1.285545
## 1 33 1.666354 0.8879143 1.285545
## 1 34 1.666354 0.8879143 1.285545
## 1 35 1.666354 0.8879143 1.285545
## 1 36 1.666354 0.8879143 1.285545
## 1 37 1.666354 0.8879143 1.285545
## 1 38 1.666354 0.8879143 1.285545
## 2 2 4.440854 0.2204755 3.686796
## 2 3 3.697203 0.4714312 2.938566
## 2 4 2.664266 0.7149235 2.119566
## 2 5 2.313371 0.7837374 1.852172
## 2 6 2.335796 0.7875253 1.841919
## 2 7 1.833780 0.8622906 1.462210
## 2 8 1.688673 0.8883137 1.325754
## 2 9 1.557314 0.9002634 1.234207
## 2 10 1.463018 0.9133897 1.174354
## 2 11 1.350247 0.9265882 1.099432
## 2 12 1.305955 0.9344683 1.049853
## 2 13 1.261130 0.9357469 1.017123
## 2 14 1.286463 0.9315381 1.040156
## 2 15 1.337104 0.9297651 1.069602
## 2 16 1.337560 0.9294593 1.067973
## 2 17 1.318152 0.9322833 1.054436
## 2 18 1.324331 0.9319631 1.055971
## 2 19 1.324331 0.9319631 1.055971
## 2 20 1.324331 0.9319631 1.055971
## 2 21 1.324331 0.9319631 1.055971
## 2 22 1.324331 0.9319631 1.055971
## 2 23 1.324331 0.9319631 1.055971
## 2 24 1.324331 0.9319631 1.055971
## 2 25 1.324331 0.9319631 1.055971
## 2 26 1.324331 0.9319631 1.055971
## 2 27 1.324331 0.9319631 1.055971
## 2 28 1.324331 0.9319631 1.055971
## 2 29 1.324331 0.9319631 1.055971
## 2 30 1.324331 0.9319631 1.055971
## 2 31 1.324331 0.9319631 1.055971
## 2 32 1.324331 0.9319631 1.055971
## 2 33 1.324331 0.9319631 1.055971
## 2 34 1.324331 0.9319631 1.055971
## 2 35 1.324331 0.9319631 1.055971
## 2 36 1.324331 0.9319631 1.055971
## 2 37 1.324331 0.9319631 1.055971
## 2 38 1.324331 0.9319631 1.055971
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 13 and degree = 2.
marsPredict <- predict(marsMod, newdata = testData$x)
performance_log <- rbind(performance_log, postResample(pred = marsPredict, obs = testData$y))
We’ll also train a tuned SVM model:
svmMod <- train(x = trainingData$x,
y = trainingData$y,
method = "svmRadial",
preProc = c("center", "scale"),
tuneLength = 10,
trControl = trainControl(method = "cv"))
svmMod
## Support Vector Machines with Radial Basis Function Kernel
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ...
## Resampling results across tuning parameters:
##
## C RMSE Rsquared MAE
## 0.25 2.475865 0.7988016 1.992183
## 0.50 2.214599 0.8166629 1.774775
## 1.00 2.046869 0.8398392 1.623489
## 2.00 1.953012 0.8519284 1.552263
## 4.00 1.891812 0.8587464 1.523110
## 8.00 1.875668 0.8604860 1.532309
## 16.00 1.879129 0.8595239 1.542180
## 32.00 1.879024 0.8595396 1.542161
## 64.00 1.879024 0.8595396 1.542161
## 128.00 1.879024 0.8595396 1.542161
##
## Tuning parameter 'sigma' was held constant at a value of 0.06437208
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.06437208 and C = 8.
svmPredict <- predict(svmMod, newdata = testData$x)
performance_log <- rbind(performance_log, postResample(pred = svmPredict, obs = testData$y))
Lastly, we’ll fit a Neural Networks model:
NuerNetMod <- nnet(trainingData$x,
trainingData$y,
size = 5,
decay = 0.01,
linout = TRUE,
trace = FALSE,
maxit = 500,
MaxNWts = 5 * (ncol(trainingData$x) + 1) + 5 + 1)
NuerNetMod
## a 10-5-1 network with 61 weights
## options were - linear output units decay=0.01
nnPredict <- predict(NuerNetMod, newdata = testData$x)
performance_log <- rbind(performance_log, postResample(pred = nnPredict, obs = testData$y))
colnames(performance_log) <- c("RMSE","Rsquared","MAE")
performance_log$Name <- c("KNN", "MARS", "SVM", "Nueral Network")
performance_log <- performance_log %>% relocate("Name") %>% arrange(RMSE)
performance_log
## Name RMSE Rsquared MAE
## 1 MARS 1.280306 0.9335241 1.016867
## 2 SVM 2.059968 0.8280925 1.563584
## 3 Nueral Network 2.580505 0.7464773 1.963774
## 4 KNN 3.204059 0.6819919 2.568346
MARS has the lowest RMSE (1.280), highest R-squared
(0.934), and lowest MAE (1.017), indicating it is likely the
best-performing model in terms of accuracy. Neural
Network performs reasonably well, with RMSE of 1.855 and
R-squared of 0.865, but has higher error metrics compared to MARS.
SVM has an RMSE of 2.060 and R-squared of 0.828,
showing lower performance than MARS and Neural Network. Lastly,
KNN has the highest RMSE (3.204), lowest R-squared
(0.682), and highest MAE (2.568), making it the least accurate of the
four models. In conclusion, MARS is the most accurate model and KNN is
the least accurate based on our metrics.
Question 7.5
Exercise 6.3 describes data for a chemical manufacturing process.
Use the same data imputation, data splitting, and pre-processing steps
as before and train several nonlinear regression models.
library(AppliedPredictiveModeling)
## Warning: package 'AppliedPredictiveModeling' was built under R version 4.3.3
data("ChemicalManufacturingProcess")
chem <- ChemicalManufacturingProcess
preProcValues <- preProcess(chem, method = c("knnImpute"))
data_imp <- predict(preProcValues, chem)
set.seed(123)
index <- createDataPartition(data_imp$Yield, p=0.8, list=FALSE)
Train <- data_imp[index, ]
Test <- data_imp[-index, ]
trans_train <- preProcess(Train, method = c("center", "scale"))
trans_test <- preProcess(Test, method = c("center", "scale"))
Train_prep <- predict(trans_train, Train)
Test_prep <- predict(trans_test, Test)
performance2 <- data.frame(matrix(ncol = 3, nrow = 0,
dimnames = list(NULL, c("RMSE", "Rsquared", "MAE"))),
stringsAsFactors = FALSE)
KNN
KNN2 <- train(x = Train[, 2:58],
y = Train$Yield,
method = "knn",
preProc = c("center", "scale"),
tuneLength = 10)
KNNPR2 <- predict(KNN2, newdata = Test[, 2:58])
performance2 <- rbind(performance2, postResample(pred = KNNPR2, obs = Test$Yield))
rownames(performance2)[nrow(performance2)] <- "KNN"
MARS
MARS2<- earth(Train_prep[, 2:58], Train_prep$Yield)
MARSPR2 <- predict(MARS2, newdata = Test_prep[, 2:58])
performance2 <- rbind(performance2, postResample(pred = MARSPR2, obs = Test$Yield))
rownames(performance2)[nrow(performance2)] <- "MARS"
SVM
SVM2 <- train(x = Train[, 2:58],
y = Train$Yield,
method = "svmRadial",
preProc = c("center", "scale"),
tuneLength = 10,
trControl = trainControl(method = "cv"))
SVMPR2 <- predict(SVM2, newdata = Test[, 2:58])
performance2 <- rbind(performance2, postResample(pred = SVMPR2, obs = Test$Yield))
rownames(performance2)[nrow(performance2)] <- "SVM"
Nueral Networks
NN2 <- train(x = Train[, 2:58],
y = Train$Yield,
method = "nnet",
preProc = c("center", "scale"),
tuneLength = 5, # Reduced from 10 to 5
trControl = trainControl(method = "cv"),
linout = TRUE, trace = FALSE)
# Make predictions on the test set
NNPR2 <- predict(NN2, newdata = Test[, 2:58])
nn_performance <- postResample(pred = NNPR2, obs = Test$Yield)
performance2 <- rbind(performance2, nn_performance)
rownames(performance2)[nrow(performance2)] <- "Neural Network"
colnames(performance2) <- c("RMSE", "R-squared", "MAE")
print(performance2)
## RMSE R-squared MAE
## KNN 0.7585275 0.42841762 0.6270432
## MARS 1.7761006 0.05438788 1.2923367
## SVM 0.6665632 0.55360118 0.5630546
## Neural Network 0.7713767 0.49668489 0.6611830
b) Which predictors are most important in the
optimal nonlinear regression model? Do either the biological or process
variables dominate the list? How do the top ten important predictors
compare to the top ten predictors from the optimal linear model?
importance_scores<- varImp(SVM2, 10)
importance_scores
## loess r-squared variable importance
##
## only 20 most important variables shown (out of 57)
##
## Overall
## ManufacturingProcess32 100.00
## BiologicalMaterial06 94.06
## BiologicalMaterial03 81.27
## ManufacturingProcess13 80.63
## ManufacturingProcess36 79.17
## ManufacturingProcess31 76.84
## BiologicalMaterial02 76.04
## ManufacturingProcess17 75.92
## ManufacturingProcess09 73.04
## BiologicalMaterial12 69.48
## ManufacturingProcess06 66.28
## BiologicalMaterial11 59.72
## ManufacturingProcess33 58.60
## ManufacturingProcess29 54.77
## BiologicalMaterial04 53.93
## ManufacturingProcess11 49.55
## BiologicalMaterial01 45.62
## BiologicalMaterial08 44.93
## BiologicalMaterial09 40.88
## ManufacturingProcess30 40.31
The ranking shows that specific manufacturing processes greatly
impact product yield, with ManufacturingProcess32 being particularly
crucial for optimization. While biological predictors like
BiologicalMaterial02 and BiologicalMaterial06 are significant, they do
not rank in the top five, suggesting that manufacturing adjustments are
more influential on yield. In section 6.3, there was a more balanced
influence from both manufacturing and biological predictors, but the
current analysis emphasizes the importance of manufacturing processes as
the main drivers of yield.
c) Explore the relationships between the top
predictors and the response for the predictors that are unique to the
optimal nonlinear regression model.Do these plots reveal intuition about
the biological or process predictors and their relationship with
yield?
importance_scores <- varImp(SVM2, scale = FALSE)
importance_df <- as.data.frame(importance_scores$importance)
importance_df$feature <- rownames(importance_df)
importance_df <- importance_df[order(importance_df$Overall, decreasing = TRUE), ]
top_important_features <- head(importance_df$feature, 10)
correlation_matrix <- cor(data_imp[, c(top_important_features, "Yield")], use = "pairwise.complete.obs")
# Create a heatmap
melted_correlation_matrix <- melt(correlation_matrix)
ggplot(data = melted_correlation_matrix, aes(x = Var1, y = Var2, fill = value)) +
geom_tile(color = "white") +
scale_fill_gradient2(low = "blue", high = "red", mid = "white",
midpoint = 0, limit = c(-1, 1),
name = "Correlation") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Heatmap of Feature Correlations with Yield",
x = "Features",
y = "Features")

We can see that some features, like “BiologicalMaterial03” and
“ManufacturingProcess13,” have strong positive correlations with yield,
shown by the dark red color. These features may play a crucial role in
increasing yield. Conversely, features like “ManufacturingProcess31” and
“ManufacturingProcess36” are strongly negatively correlated with yield,
shown in darker blue, suggesting that they might inhibit yield.
Additionally, there are correlations between features themselves (not
involving yield), which can reveal relationships between materials and
processes within the manufacturing context.