library(ggplot2)
library(RANN)
library(magrittr)
library(caret)
## Loading required package: lattice
library(ggplot2)
library(reshape2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(nnet)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ✔ readr     2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::extract()   masks magrittr::extract()
## ✖ dplyr::filter()    masks stats::filter()
## ✖ dplyr::lag()       masks stats::lag()
## ✖ purrr::lift()      masks caret::lift()
## ✖ purrr::set_names() masks magrittr::set_names()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(MASS)
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select

Question 7.2

library(mlbench)
set.seed(200)
trainingData <- mlbench.friedman1(200, sd = 1)
## We convert the 'x' data from a matrix to a data frame
## One reason is that this will give the columns names.
trainingData$x <- data.frame(trainingData$x)
## Look at the data using
featurePlot(trainingData$x, trainingData$y)

Setting up the test data

testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)
performance_log <- data.frame(matrix(ncol = 3, nrow = 0,
                dimnames = list(NULL, c("RMSE", "Rsquared", "MAE"))),
                stringsAsFactors = FALSE)

We use performance_log to organize and track each model’s performance metrics, which simplifies comparing models based on RMSE, R-squared, and MAE. By recording these metrics as each model is evaluated, we avoid the need to re-run models to retrieve results later or go back and forth in our code to compare results.

Tune several models on these data. For example:

knnModel <- train(x = trainingData$x,
 y = trainingData$y,
 method = "knn", 
 preProc = c("center", "scale"),
 tuneLength = 10)

knnModel
## k-Nearest Neighbors 
## 
## 200 samples
##  10 predictor
## 
## Pre-processing: centered (10), scaled (10) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ... 
## Resampling results across tuning parameters:
## 
##   k   RMSE      Rsquared   MAE     
##    5  3.466085  0.5121775  2.816838
##    7  3.349428  0.5452823  2.727410
##    9  3.264276  0.5785990  2.660026
##   11  3.214216  0.6024244  2.603767
##   13  3.196510  0.6176570  2.591935
##   15  3.184173  0.6305506  2.577482
##   17  3.183130  0.6425367  2.567787
##   19  3.198752  0.6483184  2.592683
##   21  3.188993  0.6611428  2.588787
##   23  3.200458  0.6638353  2.604529
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 17.
knnPred <- predict(knnModel, newdata = testData$x)

performance_log <- rbind(performance_log, postResample(pred = knnPred, obs = testData$y)) 

We’ll also train a MARS model

marsGrid <- expand.grid(.degree = 1:2, .nprune = 2:38)
marsMod <- train(x=trainingData$x,
                   y=trainingData$y, method = "earth", tuneGrid = marsGrid,
                   trControl = trainControl(method = "cv"))
## Loading required package: earth
## Loading required package: Formula
## Loading required package: plotmo
## Loading required package: plotrix
marsMod
## Multivariate Adaptive Regression Spline 
## 
## 200 samples
##  10 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ... 
## Resampling results across tuning parameters:
## 
##   degree  nprune  RMSE      Rsquared   MAE     
##   1        2      4.462296  0.2176253  3.697979
##   1        3      3.720663  0.4673821  2.949121
##   1        4      2.680039  0.7094916  2.123848
##   1        5      2.333538  0.7781559  1.856629
##   1        6      2.367933  0.7754329  1.901509
##   1        7      1.809983  0.8656526  1.414985
##   1        8      1.692656  0.8838936  1.333678
##   1        9      1.704958  0.8845683  1.339517
##   1       10      1.688559  0.8842495  1.309838
##   1       11      1.669043  0.8886165  1.296522
##   1       12      1.645066  0.8892796  1.271981
##   1       13      1.655570  0.8886896  1.271232
##   1       14      1.666354  0.8879143  1.285545
##   1       15      1.666354  0.8879143  1.285545
##   1       16      1.666354  0.8879143  1.285545
##   1       17      1.666354  0.8879143  1.285545
##   1       18      1.666354  0.8879143  1.285545
##   1       19      1.666354  0.8879143  1.285545
##   1       20      1.666354  0.8879143  1.285545
##   1       21      1.666354  0.8879143  1.285545
##   1       22      1.666354  0.8879143  1.285545
##   1       23      1.666354  0.8879143  1.285545
##   1       24      1.666354  0.8879143  1.285545
##   1       25      1.666354  0.8879143  1.285545
##   1       26      1.666354  0.8879143  1.285545
##   1       27      1.666354  0.8879143  1.285545
##   1       28      1.666354  0.8879143  1.285545
##   1       29      1.666354  0.8879143  1.285545
##   1       30      1.666354  0.8879143  1.285545
##   1       31      1.666354  0.8879143  1.285545
##   1       32      1.666354  0.8879143  1.285545
##   1       33      1.666354  0.8879143  1.285545
##   1       34      1.666354  0.8879143  1.285545
##   1       35      1.666354  0.8879143  1.285545
##   1       36      1.666354  0.8879143  1.285545
##   1       37      1.666354  0.8879143  1.285545
##   1       38      1.666354  0.8879143  1.285545
##   2        2      4.440854  0.2204755  3.686796
##   2        3      3.697203  0.4714312  2.938566
##   2        4      2.664266  0.7149235  2.119566
##   2        5      2.313371  0.7837374  1.852172
##   2        6      2.335796  0.7875253  1.841919
##   2        7      1.833780  0.8622906  1.462210
##   2        8      1.688673  0.8883137  1.325754
##   2        9      1.557314  0.9002634  1.234207
##   2       10      1.463018  0.9133897  1.174354
##   2       11      1.350247  0.9265882  1.099432
##   2       12      1.305955  0.9344683  1.049853
##   2       13      1.261130  0.9357469  1.017123
##   2       14      1.286463  0.9315381  1.040156
##   2       15      1.337104  0.9297651  1.069602
##   2       16      1.337560  0.9294593  1.067973
##   2       17      1.318152  0.9322833  1.054436
##   2       18      1.324331  0.9319631  1.055971
##   2       19      1.324331  0.9319631  1.055971
##   2       20      1.324331  0.9319631  1.055971
##   2       21      1.324331  0.9319631  1.055971
##   2       22      1.324331  0.9319631  1.055971
##   2       23      1.324331  0.9319631  1.055971
##   2       24      1.324331  0.9319631  1.055971
##   2       25      1.324331  0.9319631  1.055971
##   2       26      1.324331  0.9319631  1.055971
##   2       27      1.324331  0.9319631  1.055971
##   2       28      1.324331  0.9319631  1.055971
##   2       29      1.324331  0.9319631  1.055971
##   2       30      1.324331  0.9319631  1.055971
##   2       31      1.324331  0.9319631  1.055971
##   2       32      1.324331  0.9319631  1.055971
##   2       33      1.324331  0.9319631  1.055971
##   2       34      1.324331  0.9319631  1.055971
##   2       35      1.324331  0.9319631  1.055971
##   2       36      1.324331  0.9319631  1.055971
##   2       37      1.324331  0.9319631  1.055971
##   2       38      1.324331  0.9319631  1.055971
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 13 and degree = 2.
marsPredict <- predict(marsMod, newdata = testData$x)
performance_log <- rbind(performance_log, postResample(pred = marsPredict, obs = testData$y))

We’ll also train a tuned SVM model:

svmMod <- train(x = trainingData$x,
                   y = trainingData$y,
                   method = "svmRadial",
                   preProc = c("center", "scale"),
                   tuneLength = 10,
                   trControl = trainControl(method = "cv"))



svmMod
## Support Vector Machines with Radial Basis Function Kernel 
## 
## 200 samples
##  10 predictor
## 
## Pre-processing: centered (10), scaled (10) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ... 
## Resampling results across tuning parameters:
## 
##   C       RMSE      Rsquared   MAE     
##     0.25  2.475865  0.7988016  1.992183
##     0.50  2.214599  0.8166629  1.774775
##     1.00  2.046869  0.8398392  1.623489
##     2.00  1.953012  0.8519284  1.552263
##     4.00  1.891812  0.8587464  1.523110
##     8.00  1.875668  0.8604860  1.532309
##    16.00  1.879129  0.8595239  1.542180
##    32.00  1.879024  0.8595396  1.542161
##    64.00  1.879024  0.8595396  1.542161
##   128.00  1.879024  0.8595396  1.542161
## 
## Tuning parameter 'sigma' was held constant at a value of 0.06437208
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.06437208 and C = 8.
svmPredict <- predict(svmMod, newdata = testData$x)
performance_log <- rbind(performance_log, postResample(pred = svmPredict, obs = testData$y))

Lastly, we’ll fit a Neural Networks model:

NuerNetMod <-  nnet(trainingData$x,
                trainingData$y,
                size = 5,
                decay = 0.01,
                linout = TRUE,
                trace = FALSE,
                maxit = 500,
                MaxNWts = 5 * (ncol(trainingData$x) + 1) + 5 + 1)
NuerNetMod
## a 10-5-1 network with 61 weights
## options were - linear output units  decay=0.01
nnPredict <- predict(NuerNetMod, newdata = testData$x)
performance_log <- rbind(performance_log, postResample(pred = nnPredict, obs = testData$y))
colnames(performance_log) <- c("RMSE","Rsquared","MAE")
performance_log$Name <- c("KNN", "MARS", "SVM", "Nueral Network")
performance_log <- performance_log %>% relocate("Name") %>% arrange(RMSE)
performance_log
##             Name     RMSE  Rsquared      MAE
## 1           MARS 1.280306 0.9335241 1.016867
## 2            SVM 2.059968 0.8280925 1.563584
## 3 Nueral Network 2.580505 0.7464773 1.963774
## 4            KNN 3.204059 0.6819919 2.568346

MARS has the lowest RMSE (1.280), highest R-squared (0.934), and lowest MAE (1.017), indicating it is likely the best-performing model in terms of accuracy. Neural Network performs reasonably well, with RMSE of 1.855 and R-squared of 0.865, but has higher error metrics compared to MARS. SVM has an RMSE of 2.060 and R-squared of 0.828, showing lower performance than MARS and Neural Network. Lastly, KNN has the highest RMSE (3.204), lowest R-squared (0.682), and highest MAE (2.568), making it the least accurate of the four models. In conclusion, MARS is the most accurate model and KNN is the least accurate based on our metrics.

Question 7.5

Exercise 6.3 describes data for a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before and train several nonlinear regression models.

library(AppliedPredictiveModeling)
## Warning: package 'AppliedPredictiveModeling' was built under R version 4.3.3
data("ChemicalManufacturingProcess")
chem <- ChemicalManufacturingProcess
preProcValues <- preProcess(chem, method = c("knnImpute"))
data_imp <- predict(preProcValues, chem)
set.seed(123)
index <- createDataPartition(data_imp$Yield, p=0.8, list=FALSE) 
Train <- data_imp[index, ]
Test <- data_imp[-index, ]


trans_train <- preProcess(Train, method = c("center", "scale"))
trans_test <- preProcess(Test, method = c("center", "scale"))

Train_prep <- predict(trans_train, Train)
Test_prep <- predict(trans_test, Test)
performance2 <- data.frame(matrix(ncol = 3, nrow = 0,
                                   dimnames = list(NULL, c("RMSE", "Rsquared", "MAE"))),
                            stringsAsFactors = FALSE)

KNN

KNN2 <- train(x = Train[, 2:58],
                  y = Train$Yield,
                  method = "knn",
                  preProc = c("center", "scale"),
                  tuneLength = 10)

KNNPR2 <- predict(KNN2, newdata = Test[, 2:58])
performance2 <- rbind(performance2, postResample(pred = KNNPR2, obs = Test$Yield))
rownames(performance2)[nrow(performance2)] <- "KNN"

MARS

MARS2<- earth(Train_prep[, 2:58], Train_prep$Yield)
      
MARSPR2 <- predict(MARS2, newdata = Test_prep[, 2:58])
performance2 <- rbind(performance2, postResample(pred = MARSPR2, obs = Test$Yield))
rownames(performance2)[nrow(performance2)] <- "MARS"

SVM

SVM2 <- train(x = Train[, 2:58],
                   y = Train$Yield,
                   method = "svmRadial",
                   preProc = c("center", "scale"),
                   tuneLength = 10,
                   trControl = trainControl(method = "cv"))

SVMPR2 <- predict(SVM2, newdata = Test[, 2:58])
performance2 <- rbind(performance2, postResample(pred = SVMPR2, obs = Test$Yield))
rownames(performance2)[nrow(performance2)] <- "SVM"

Nueral Networks

NN2 <- train(x = Train[, 2:58],
             y = Train$Yield,
             method = "nnet",
             preProc = c("center", "scale"),
             tuneLength = 5,  # Reduced from 10 to 5
             trControl = trainControl(method = "cv"),
             linout = TRUE, trace = FALSE)
# Make predictions on the test set
NNPR2 <- predict(NN2, newdata = Test[, 2:58])
nn_performance <- postResample(pred = NNPR2, obs = Test$Yield)
performance2 <- rbind(performance2, nn_performance)
rownames(performance2)[nrow(performance2)] <- "Neural Network"
colnames(performance2) <- c("RMSE", "R-squared", "MAE")


print(performance2)
##                     RMSE  R-squared       MAE
## KNN            0.7585275 0.42841762 0.6270432
## MARS           1.7761006 0.05438788 1.2923367
## SVM            0.6665632 0.55360118 0.5630546
## Neural Network 0.7713767 0.49668489 0.6611830

a) Which nonlinear regression model gives the optimal resampling and test set performance?

The SVM model gives the best performance with the lowest RMSE (0.6666) and the highest R-squared (0.5536) among the models evaluated. Thus, the SVM would be considered the optimal model based on these metrics.

b) Which predictors are most important in the optimal nonlinear regression model? Do either the biological or process variables dominate the list? How do the top ten important predictors compare to the top ten predictors from the optimal linear model?

importance_scores<- varImp(SVM2, 10)
importance_scores
## loess r-squared variable importance
## 
##   only 20 most important variables shown (out of 57)
## 
##                        Overall
## ManufacturingProcess32  100.00
## BiologicalMaterial06     94.06
## BiologicalMaterial03     81.27
## ManufacturingProcess13   80.63
## ManufacturingProcess36   79.17
## ManufacturingProcess31   76.84
## BiologicalMaterial02     76.04
## ManufacturingProcess17   75.92
## ManufacturingProcess09   73.04
## BiologicalMaterial12     69.48
## ManufacturingProcess06   66.28
## BiologicalMaterial11     59.72
## ManufacturingProcess33   58.60
## ManufacturingProcess29   54.77
## BiologicalMaterial04     53.93
## ManufacturingProcess11   49.55
## BiologicalMaterial01     45.62
## BiologicalMaterial08     44.93
## BiologicalMaterial09     40.88
## ManufacturingProcess30   40.31

The ranking shows that specific manufacturing processes greatly impact product yield, with ManufacturingProcess32 being particularly crucial for optimization. While biological predictors like BiologicalMaterial02 and BiologicalMaterial06 are significant, they do not rank in the top five, suggesting that manufacturing adjustments are more influential on yield. In section 6.3, there was a more balanced influence from both manufacturing and biological predictors, but the current analysis emphasizes the importance of manufacturing processes as the main drivers of yield.

c) Explore the relationships between the top predictors and the response for the predictors that are unique to the optimal nonlinear regression model.Do these plots reveal intuition about the biological or process predictors and their relationship with yield?

importance_scores <- varImp(SVM2, scale = FALSE)
importance_df <- as.data.frame(importance_scores$importance)
importance_df$feature <- rownames(importance_df)
importance_df <- importance_df[order(importance_df$Overall, decreasing = TRUE), ]
top_important_features <- head(importance_df$feature, 10)
correlation_matrix <- cor(data_imp[, c(top_important_features, "Yield")], use = "pairwise.complete.obs")
# Create a heatmap 
melted_correlation_matrix <- melt(correlation_matrix)

ggplot(data = melted_correlation_matrix, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile(color = "white") +
  scale_fill_gradient2(low = "blue", high = "red", mid = "white", 
                       midpoint = 0, limit = c(-1, 1), 
                       name = "Correlation") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Heatmap of Feature Correlations with Yield",
       x = "Features",
       y = "Features")

We can see that some features, like “BiologicalMaterial03” and “ManufacturingProcess13,” have strong positive correlations with yield, shown by the dark red color. These features may play a crucial role in increasing yield. Conversely, features like “ManufacturingProcess31” and “ManufacturingProcess36” are strongly negatively correlated with yield, shown in darker blue, suggesting that they might inhibit yield. Additionally, there are correlations between features themselves (not involving yield), which can reveal relationships between materials and processes within the manufacturing context.