\[7.2\]

library(ISLR)
library(ggplot2)
library(caret)
library(mlbench)

set.seed(200)
trainingData <- mlbench.friedman1(200, sd = 1)
## This creates a list with a vector 'y' and a matrix of predictors 'x'.
## We convert the 'x' data from a matrix to a data frame;
## one reason is that this gives the columns names.
trainingData$x <- data.frame(trainingData$x)
## Look at the data using featurePlot (or other methods).
featurePlot(trainingData$x, trainingData$y)

## Also simulate a large test set to estimate the true error rate with good precision:
testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)

knnModel <- train(x = trainingData$x, 
                  y = trainingData$y, 
                  method = "knn",
                  preProc = c("center", "scale"),
                  tuneLength = 10)
knnModel
## k-Nearest Neighbors 
## 
## 200 samples
##  10 predictor
## 
## Pre-processing: centered (10), scaled (10) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ... 
## Resampling results across tuning parameters:
## 
##   k   RMSE      Rsquared   MAE     
##    5  3.466085  0.5121775  2.816838
##    7  3.349428  0.5452823  2.727410
##    9  3.264276  0.5785990  2.660026
##   11  3.214216  0.6024244  2.603767
##   13  3.196510  0.6176570  2.591935
##   15  3.184173  0.6305506  2.577482
##   17  3.183130  0.6425367  2.567787
##   19  3.198752  0.6483184  2.592683
##   21  3.188993  0.6611428  2.588787
##   23  3.200458  0.6638353  2.604529
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 17.
knnPred <- predict(knnModel, newdata = testData$x)
## The function 'postResample' can be used to get the test set
## performance values
postResample(pred = knnPred, obs = testData$y)
##      RMSE  Rsquared       MAE 
## 3.2040595 0.6819919 2.5683461
## Neural Network Model:

findCorrelation(cor(trainingData$x), cutoff = 0.7) # no highly correlated variables
## integer(0)
nnetGrid <- expand.grid(.decay = c(0, 0.01, .1), 
                        .size = c(1:10))

set.seed(123)
nnetTune <- train(trainingData$x,
                  trainingData$y,
                  method = "nnet",
                  tuneGrid = nnetGrid,
                  trControl = trainControl(method = "cv"),
                  preProc = c("center", "scale"),
                  linout = TRUE,
                  trace = FALSE,
                  MaxNWts = 10 * (ncol(trainingData$x) + 1) + 10 + 1,
                  maxit = 500)

nnetTune
## Neural Network 
## 
## 200 samples
##  10 predictor
## 
## Pre-processing: centered (10), scaled (10) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ... 
## Resampling results across tuning parameters:
## 
##   decay  size  RMSE       Rsquared   MAE     
##   0.00    1     2.428469  0.7653147  1.881622
##   0.00    2     2.658319  0.7176523  2.123341
##   0.00    3     2.439662  0.7576737  1.909810
##   0.00    4     2.395450  0.7718850  1.870523
##   0.00    5     3.158842  0.6600032  2.306494
##   0.00    6     5.114760  0.5927134  2.878301
##   0.00    7     3.998873  0.5863680  2.807368
##   0.00    8    13.038224  0.3380882  6.216311
##   0.00    9     3.418244  0.6064786  2.750387
##   0.00   10    14.282253  0.3903329  5.817233
##   0.01    1     2.428178  0.7651061  1.880566
##   0.01    2     2.628711  0.7352039  2.081463
##   0.01    3     2.500025  0.7465002  1.952391
##   0.01    4     2.487535  0.7612713  1.893652
##   0.01    5     2.627321  0.7308586  2.175922
##   0.01    6     2.779170  0.7162527  2.154943
##   0.01    7     2.974175  0.6754817  2.306175
##   0.01    8     3.029658  0.6786905  2.368330
##   0.01    9     3.530934  0.6095407  2.864381
##   0.01   10     3.427628  0.6325805  2.755656
##   0.10    1     2.441856  0.7622291  1.892003
##   0.10    2     2.571867  0.7378116  1.967507
##   0.10    3     2.209160  0.8028016  1.797465
##   0.10    4     2.384313  0.7706745  1.946165
##   0.10    5     2.703109  0.7134765  2.109896
##   0.10    6     2.691025  0.7343891  2.178368
##   0.10    7     2.807927  0.7008760  2.202846
##   0.10    8     2.906717  0.7081779  2.361708
##   0.10    9     3.246571  0.6386602  2.586619
##   0.10   10     3.239180  0.6467128  2.555378
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were size = 3 and decay = 0.1.
nnetPred <- predict(nnetTune, testData$x)
postResample(nnetPred, testData$y)
##      RMSE  Rsquared       MAE 
## 2.4763089 0.7565333 1.8564599
## MARS Model:

marsGrid <- expand.grid(.degree = 1:2, .nprune = 2:38)

set.seed(123)
marsTuned <- train(trainingData$x,
                   trainingData$y,
                   method = "earth",
                   tuneGrid = marsGrid,
                   trControl = trainControl(method = "cv"))
marsTuned
## Multivariate Adaptive Regression Spline 
## 
## 200 samples
##  10 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ... 
## Resampling results across tuning parameters:
## 
##   degree  nprune  RMSE      Rsquared   MAE     
##   1        2      4.311247  0.2748122  3.603533
##   1        3      3.531005  0.5107259  2.857560
##   1        4      2.609132  0.7291471  2.109945
##   1        5      2.234494  0.8007350  1.788244
##   1        6      2.279819  0.7999357  1.803273
##   1        7      1.792708  0.8748522  1.398541
##   1        8      1.710582  0.8857656  1.323419
##   1        9      1.662155  0.8892531  1.291466
##   1       10      1.706154  0.8823897  1.306578
##   1       11      1.743116  0.8739494  1.360521
##   1       12      1.740790  0.8734421  1.357507
##   1       13      1.703492  0.8788657  1.326192
##   1       14      1.700604  0.8791430  1.324716
##   1       15      1.692444  0.8801290  1.317804
##   1       16      1.692444  0.8801290  1.317804
##   1       17      1.692444  0.8801290  1.317804
##   1       18      1.692444  0.8801290  1.317804
##   1       19      1.692444  0.8801290  1.317804
##   1       20      1.692444  0.8801290  1.317804
##   1       21      1.692444  0.8801290  1.317804
##   1       22      1.692444  0.8801290  1.317804
##   1       23      1.692444  0.8801290  1.317804
##   1       24      1.692444  0.8801290  1.317804
##   1       25      1.692444  0.8801290  1.317804
##   1       26      1.692444  0.8801290  1.317804
##   1       27      1.692444  0.8801290  1.317804
##   1       28      1.692444  0.8801290  1.317804
##   1       29      1.692444  0.8801290  1.317804
##   1       30      1.692444  0.8801290  1.317804
##   1       31      1.692444  0.8801290  1.317804
##   1       32      1.692444  0.8801290  1.317804
##   1       33      1.692444  0.8801290  1.317804
##   1       34      1.692444  0.8801290  1.317804
##   1       35      1.692444  0.8801290  1.317804
##   1       36      1.692444  0.8801290  1.317804
##   1       37      1.692444  0.8801290  1.317804
##   1       38      1.692444  0.8801290  1.317804
##   2        2      4.311247  0.2748122  3.603533
##   2        3      3.531005  0.5107259  2.857560
##   2        4      2.609132  0.7291471  2.109945
##   2        5      2.243508  0.7985944  1.788189
##   2        6      2.236723  0.7987764  1.770156
##   2        7      1.815177  0.8693557  1.425563
##   2        8      1.699050  0.8834064  1.317662
##   2        9      1.487692  0.9084049  1.182061
##   2       10      1.469496  0.9053535  1.160443
##   2       11      1.392318  0.9178210  1.085187
##   2       12      1.302695  0.9312685  1.032827
##   2       13      1.293800  0.9331208  1.033397
##   2       14      1.265082  0.9371588  1.012795
##   2       15      1.275804  0.9351561  1.019457
##   2       16      1.288843  0.9335588  1.031360
##   2       17      1.296439  0.9327583  1.035093
##   2       18      1.296439  0.9327583  1.035093
##   2       19      1.296439  0.9327583  1.035093
##   2       20      1.296439  0.9327583  1.035093
##   2       21      1.296439  0.9327583  1.035093
##   2       22      1.296439  0.9327583  1.035093
##   2       23      1.296439  0.9327583  1.035093
##   2       24      1.296439  0.9327583  1.035093
##   2       25      1.296439  0.9327583  1.035093
##   2       26      1.296439  0.9327583  1.035093
##   2       27      1.296439  0.9327583  1.035093
##   2       28      1.296439  0.9327583  1.035093
##   2       29      1.296439  0.9327583  1.035093
##   2       30      1.296439  0.9327583  1.035093
##   2       31      1.296439  0.9327583  1.035093
##   2       32      1.296439  0.9327583  1.035093
##   2       33      1.296439  0.9327583  1.035093
##   2       34      1.296439  0.9327583  1.035093
##   2       35      1.296439  0.9327583  1.035093
##   2       36      1.296439  0.9327583  1.035093
##   2       37      1.296439  0.9327583  1.035093
##   2       38      1.296439  0.9327583  1.035093
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 14 and degree = 2.
marsPred <- predict(marsTuned, testData$x)
postResample(marsPred, testData$y)
##      RMSE  Rsquared       MAE 
## 1.1722635 0.9448890 0.9324923
## SVM Model:

svmRTuned <- train(trainingData$x,
                   trainingData$y,
                   method = "svmRadial",
                   preProc = c("center", "scale"),
                   tuneLength = 14,
                   trControl = trainControl(method = "cv"))

svmRTuned
## Support Vector Machines with Radial Basis Function Kernel 
## 
## 200 samples
##  10 predictor
## 
## Pre-processing: centered (10), scaled (10) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ... 
## Resampling results across tuning parameters:
## 
##   C        RMSE      Rsquared   MAE     
##      0.25  2.483272  0.8002484  1.987944
##      0.50  2.222253  0.8176803  1.773054
##      1.00  2.053650  0.8394871  1.613834
##      2.00  1.906672  0.8593733  1.498131
##      4.00  1.820527  0.8683577  1.405055
##      8.00  1.795894  0.8714776  1.414387
##     16.00  1.799680  0.8713663  1.426459
##     32.00  1.800000  0.8713301  1.426565
##     64.00  1.800000  0.8713301  1.426565
##    128.00  1.800000  0.8713301  1.426565
##    256.00  1.800000  0.8713301  1.426565
##    512.00  1.800000  0.8713301  1.426565
##   1024.00  1.800000  0.8713301  1.426565
##   2048.00  1.800000  0.8713301  1.426565
## 
## Tuning parameter 'sigma' was held constant at a value of 0.05865089
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.05865089 and C = 8.
svmPred <- predict(svmRTuned, testData$x)
postResample(svmPred, testData$y)
##      RMSE  Rsquared       MAE 
## 2.0367520 0.8318292 1.5442180
rbind(knnMod = postResample(knnPred, testData$y),
      nnetMod = postResample(nnetPred, testData$y),
      marsMod = postResample(marsPred, testData$y),
      svmMod = postResample(svmPred, testData$y))
##             RMSE  Rsquared       MAE
## knnMod  3.204059 0.6819919 2.5683461
## nnetMod 2.476309 0.7565333 1.8564599
## marsMod 1.172263 0.9448890 0.9324923
## svmMod  2.036752 0.8318292 1.5442180

Comparing the test-set results, the MARS model performs best, with the highest R2 (0.94) and the lowest RMSE (1.17).
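As a quick visual check on that test-set fit, the short sketch below (assuming marsPred and testData from above are still in memory) plots MARS predictions against the observed responses; points falling near the dashed 45-degree line reflect the strong fit implied by the R2 above.

# plot MARS test-set predictions against the observed responses
# (predict() on the earth fit returns a one-column matrix, hence as.vector())
mars_pred_obs <- data.frame(observed = testData$y,
                            predicted = as.vector(marsPred))
ggplot(mars_pred_obs, aes(x = observed, y = predicted)) +
  geom_point(alpha = 0.4) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
  labs(x = "Observed y", y = "Predicted y")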

varImp(marsTuned)
## earth variable importance
## 
##    Overall
## X1  100.00
## X4   75.24
## X2   48.74
## X5   15.53
## X3    0.00

The MARS model does select the informative predictors: X1, X4, X2, and X5 receive the highest importance scores. However, X3, which is also an informative predictor in the Friedman simulation, has an overall importance of 0 in this model.
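This can also be checked from the earth fit itself: evimp() lists which predictors were actually retained in the pruned model, using earth's own GCV/RSS-based importance measures. A minimal sketch, assuming marsTuned$finalModel is the earth object fitted above:

library(earth)  # for evimp()
# variable importance as computed by the earth package for the final pruned model
evimp(marsTuned$finalModel)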

\[7.5\]

library(AppliedPredictiveModeling)
library(caret)
library(tidyverse)
library(corrplot)
library(Amelia)
# load data

data(ChemicalManufacturingProcess)

# data splitting and preprocessing from homework 7

imputations <- preProcess(ChemicalManufacturingProcess, 
               method = c("knnImpute"), 
               k=5)

chem_man_imputed <- predict(imputations, ChemicalManufacturingProcess)

chem_man_filtered <- chem_man_imputed[,-nearZeroVar(chem_man_imputed)]

set.seed(123)

# split into training and testing

train_indices <- sample(nrow(chem_man_filtered), nrow(chem_man_filtered)*.8, replace=F)
trainChem <- chem_man_filtered[train_indices,]
testChem <- chem_man_filtered[-train_indices,]
## a.

# KNN:

knnModel <- train(Yield ~ .,
                  data=trainChem,
                  method = "knn",
                  preProc = c("center", "scale"),
                  tuneLength = 10)
knnModel
## k-Nearest Neighbors 
## 
## 140 samples
##  56 predictor
## 
## Pre-processing: centered (56), scaled (56) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 140, 140, 140, 140, 140, 140, ... 
## Resampling results across tuning parameters:
## 
##   k   RMSE       Rsquared   MAE      
##    5  0.8051317  0.3751255  0.6326099
##    7  0.7915491  0.3944335  0.6275697
##    9  0.7880395  0.4009045  0.6242525
##   11  0.7944818  0.3876910  0.6294825
##   13  0.7924476  0.3959429  0.6270117
##   15  0.7989080  0.3895254  0.6323954
##   17  0.8069230  0.3801722  0.6391329
##   19  0.8107104  0.3832165  0.6418239
##   21  0.8166273  0.3804731  0.6487960
##   23  0.8179001  0.3835214  0.6491087
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 9.
knnPred <- predict(knnModel, testChem)
# Neural Network Model

trainChem_x <- trainChem |> 
  dplyr::select(-Yield)

trainChem_y <- trainChem |>
  dplyr::select(Yield)

testChem_x <- testChem |> 
  dplyr::select(-Yield)

testChem_y <- testChem |>
  dplyr::select(Yield)

corr_indices <- findCorrelation(cor(trainChem_x), cutoff = 0.7)

trainChemFiltered <- trainChem_x[, -corr_indices]
testChemFiltered <- testChem_x[, -corr_indices]

trainChemFiltered$Yield <- trainChem_y$Yield
testChemFiltered$Yield <- testChem_y$Yield # add Yield back after dropping the highly correlated predictors
nnetGrid <- expand.grid(.decay = c(0, 0.01, .1), 
                        .size = c(1:10))

set.seed(613)
nnetTune <- train(Yield ~ .,
                  data=trainChemFiltered,
                  method = "nnet",
                  tuneGrid = nnetGrid,
                  trControl = trainControl(method = "cv"),
                  preProc = c("center", "scale"),
                  linout = TRUE,
                  trace = FALSE,
                  MaxNWts = 10 * (ncol(trainChemFiltered)) + 10 + 1,
                  maxit = 500)

nnetTune
## Neural Network 
## 
## 140 samples
##  34 predictor
## 
## Pre-processing: centered (34), scaled (34) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 126, 127, 126, 124, 126, 126, ... 
## Resampling results across tuning parameters:
## 
##   decay  size  RMSE       Rsquared   MAE      
##   0.00    1    0.8860080  0.3276148  0.6908584
##   0.00    2    0.9284650  0.3004782  0.7192417
##   0.00    3    1.1724749  0.3008227  0.9426286
##   0.00    4    1.1434151  0.2525458  0.9164239
##   0.00    5    1.1022941  0.2552948  0.8771900
##   0.00    6    1.3843251  0.1782694  1.0832505
##   0.00    7    1.3482599  0.1682292  1.0402059
##   0.00    8    1.1045291  0.2407480  0.8767745
##   0.00    9    1.0053394  0.3015537  0.7864717
##   0.00   10    0.9630697  0.3471828  0.8022991
##   0.01    1    0.9399990  0.3016182  0.7591808
##   0.01    2    0.8752422  0.4118767  0.6787028
##   0.01    3    1.2722424  0.2416129  1.0132543
##   0.01    4    1.1520935  0.2662412  0.9329635
##   0.01    5    1.1449964  0.2719272  0.9525353
##   0.01    6    0.9270159  0.4164483  0.7405059
##   0.01    7    0.8722660  0.3954060  0.7248977
##   0.01    8    0.9235080  0.3203924  0.7340114
##   0.01    9    0.8737608  0.3763744  0.7044871
##   0.01   10    0.8120468  0.4439574  0.6569336
##   0.10    1    0.7972195  0.4000895  0.6467419
##   0.10    2    0.9275708  0.4067340  0.7693819
##   0.10    3    1.0285978  0.3172875  0.8188056
##   0.10    4    0.9351223  0.3453424  0.7437457
##   0.10    5    0.9246938  0.3467975  0.7554783
##   0.10    6    0.8913509  0.3563416  0.7393845
##   0.10    7    0.8117307  0.4388129  0.6655987
##   0.10    8    0.8766561  0.3540777  0.7129637
##   0.10    9    0.8334868  0.4145568  0.6885491
##   0.10   10    0.8019996  0.4253869  0.6555231
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were size = 1 and decay = 0.1.
nnetPred <- predict(nnetTune, testChemFiltered)
# MARS Model:

marsGrid <- expand.grid(.degree = 1:2, .nprune = 2:38)

set.seed(613)
marsTuned <- train(Yield ~ .,
                   data=trainChem,
                   method = "earth",
                   tuneGrid = marsGrid,
                   trControl = trainControl(method = "cv"))

marsTuned
## Multivariate Adaptive Regression Spline 
## 
## 140 samples
##  56 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 126, 127, 126, 124, 126, 126, ... 
## Resampling results across tuning parameters:
## 
##   degree  nprune  RMSE       Rsquared   MAE      
##   1        2      0.7447063  0.4439679  0.5915238
##   1        3      0.6376569  0.5767418  0.5178721
##   1        4      0.7048392  0.5182127  0.5385218
##   1        5      0.7422687  0.4712073  0.5580131
##   1        6      0.7868659  0.4560331  0.5872088
##   1        7      0.7982740  0.4581982  0.5971732
##   1        8      0.8310610  0.4165120  0.6211374
##   1        9      0.8083470  0.4278275  0.6082458
##   1       10      0.8070489  0.4412733  0.6070551
##   1       11      0.8098200  0.4334555  0.6150378
##   1       12      0.8270384  0.4550866  0.6138953
##   1       13      0.8105597  0.4500556  0.6135100
##   1       14      0.8288852  0.4335370  0.6314773
##   1       15      0.7878973  0.4512077  0.6057949
##   1       16      0.7797134  0.4605289  0.5975943
##   1       17      0.7815254  0.4537066  0.5965864
##   1       18      0.7799054  0.4555640  0.5944069
##   1       19      0.7799054  0.4555640  0.5944069
##   1       20      0.7799054  0.4555640  0.5944069
##   1       21      0.7799054  0.4555640  0.5944069
##   1       22      0.7799054  0.4555640  0.5944069
##   1       23      0.7799054  0.4555640  0.5944069
##   1       24      0.7799054  0.4555640  0.5944069
##   1       25      0.7799054  0.4555640  0.5944069
##   1       26      0.7799054  0.4555640  0.5944069
##   1       27      0.7799054  0.4555640  0.5944069
##   1       28      0.7799054  0.4555640  0.5944069
##   1       29      0.7799054  0.4555640  0.5944069
##   1       30      0.7799054  0.4555640  0.5944069
##   1       31      0.7799054  0.4555640  0.5944069
##   1       32      0.7799054  0.4555640  0.5944069
##   1       33      0.7799054  0.4555640  0.5944069
##   1       34      0.7799054  0.4555640  0.5944069
##   1       35      0.7799054  0.4555640  0.5944069
##   1       36      0.7799054  0.4555640  0.5944069
##   1       37      0.7799054  0.4555640  0.5944069
##   1       38      0.7799054  0.4555640  0.5944069
##   2        2      0.7447063  0.4439679  0.5915238
##   2        3      0.6684686  0.5256099  0.5320134
##   2        4      0.6435412  0.5555535  0.5149291
##   2        5      0.6204790  0.6019286  0.4901222
##   2        6      0.6192558  0.5960994  0.4989223
##   2        7      0.6118131  0.6132054  0.4882839
##   2        8      0.6114004  0.6195031  0.4773827
##   2        9      0.6418331  0.5782684  0.5082956
##   2       10      0.6315870  0.5965272  0.5064992
##   2       11      0.6335806  0.6003002  0.5070047
##   2       12      0.6655862  0.5924883  0.5408615
##   2       13      0.6662762  0.5955636  0.5419373
##   2       14      0.6742523  0.5964206  0.5443044
##   2       15      0.6898043  0.5875630  0.5479353
##   2       16      0.6934838  0.5905822  0.5546733
##   2       17      0.6912745  0.6012155  0.5510503
##   2       18      0.6955843  0.6013427  0.5541434
##   2       19      0.6974083  0.6048551  0.5499242
##   2       20      0.6891369  0.6099843  0.5475698
##   2       21      0.6896803  0.6161657  0.5567493
##   2       22      0.7104724  0.5999726  0.5750748
##   2       23      0.7051356  0.6014778  0.5707753
##   2       24      0.7138143  0.5932797  0.5803284
##   2       25      0.7138143  0.5932797  0.5803284
##   2       26      0.7138143  0.5932797  0.5803284
##   2       27      0.7138143  0.5932797  0.5803284
##   2       28      0.7138143  0.5932797  0.5803284
##   2       29      0.7138143  0.5932797  0.5803284
##   2       30      0.7138143  0.5932797  0.5803284
##   2       31      0.7138143  0.5932797  0.5803284
##   2       32      0.7138143  0.5932797  0.5803284
##   2       33      0.7138143  0.5932797  0.5803284
##   2       34      0.7138143  0.5932797  0.5803284
##   2       35      0.7138143  0.5932797  0.5803284
##   2       36      0.7138143  0.5932797  0.5803284
##   2       37      0.7138143  0.5932797  0.5803284
##   2       38      0.7138143  0.5932797  0.5803284
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 8 and degree = 2.
marsPred <- predict(marsTuned, testChem)
# SVM Model:

svmRTuned <- train(Yield ~ .,
                   data=trainChem,
                   method = "svmRadial",
                   preProc = c("center", "scale"),
                   tuneLength = 14,
                   trControl = trainControl(method = "cv"))

svmRTuned
## Support Vector Machines with Radial Basis Function Kernel 
## 
## 140 samples
##  56 predictor
## 
## Pre-processing: centered (56), scaled (56) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 126, 124, 128, 126, 127, 125, ... 
## Resampling results across tuning parameters:
## 
##   C        RMSE       Rsquared   MAE      
##      0.25  0.7683277  0.4751059  0.6232367
##      0.50  0.7055179  0.5313663  0.5727722
##      1.00  0.6588907  0.5844897  0.5343755
##      2.00  0.6232091  0.6277378  0.5064778
##      4.00  0.6223625  0.6223060  0.5064420
##      8.00  0.6183000  0.6168089  0.5089816
##     16.00  0.6125701  0.6235705  0.5034715
##     32.00  0.6125701  0.6235705  0.5034715
##     64.00  0.6125701  0.6235705  0.5034715
##    128.00  0.6125701  0.6235705  0.5034715
##    256.00  0.6125701  0.6235705  0.5034715
##    512.00  0.6125701  0.6235705  0.5034715
##   1024.00  0.6125701  0.6235705  0.5034715
##   2048.00  0.6125701  0.6235705  0.5034715
## 
## Tuning parameter 'sigma' was held constant at a value of 0.01220051
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.01220051 and C = 16.
svmPred <- predict(svmRTuned, testChem)
rbind(knnMod = postResample(knnPred, testChem$Yield),
      nnetMod = postResample(nnetPred, testChemFiltered$Yield),
      marsMod = postResample(marsPred, testChem$Yield),
      svmMod = postResample(svmPred, testChem$Yield))
##              RMSE  Rsquared       MAE
## knnMod  0.8164187 0.5549754 0.6749210
## nnetMod 0.7513991 0.5490702 0.5680688
## marsMod 0.7768123 0.5854577 0.5701455
## svmMod  0.6050492 0.7761230 0.4482863

The SVM model has the highest R2 (~0.78) and the lowest RMSE (0.61), so it is the best-performing model on the test set.
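A quick visual check of the SVM test-set fit (a minimal sketch, assuming svmPred and testChem from above are in memory; Yield is on the centered and scaled scale produced by the imputation preprocessing):

# plot SVM test-set predictions against the observed Yield values
data.frame(observed = testChem$Yield,
           predicted = as.vector(svmPred)) |>
  ggplot(aes(x = observed, y = predicted)) +
  geom_point() +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
  labs(x = "Observed Yield", y = "Predicted Yield")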

## b.

plot(varImp(svmRTuned), 10) # top 10 most important predictors for the SVM model

lasso_mod <- train(Yield ~ .,
                   data=trainChem,
                   method = "glmnet",
                   preProcess = c("center", "scale"),
                   trControl = trainControl(method = "cv"),
                   tuneGrid = expand.grid(.alpha = 1, .lambda = seq(0, 1, 0.05)))
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
plot(varImp(lasso_mod), 10)

ManufacturingProcess32 remains the most important variable in both models, but the composition and ordering of the rest of the top 10 differ between the two models, as the sketch below makes explicit.
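A small sketch for that comparison (assuming the two train objects above; the column names SVM and Lasso are just labels introduced here) joins caret's scaled importance tables and ranks them side by side:

# side-by-side comparison of caret's scaled variable importance for the two models
svm_imp <- varImp(svmRTuned)$importance |>
  tibble::rownames_to_column("Predictor") |>
  dplyr::rename(SVM = Overall)
lasso_imp <- varImp(lasso_mod)$importance |>
  tibble::rownames_to_column("Predictor") |>
  dplyr::rename(Lasso = Overall)
dplyr::full_join(svm_imp, lasso_imp, by = "Predictor") |>
  dplyr::arrange(dplyr::desc(SVM)) |>
  head(10)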

## c.

chem_man_filtered[,c("Yield", "BiologicalMaterial06", "ManufacturingProcess31", "BiologicalMaterial03", "BiologicalMaterial12")] |>
  cor() |>
  corrplot(method="shade",
           diag=FALSE,
           type="full",
           addCoef.col = "blue",
           number.cex=0.5)

BiologicalMaterial06 has the highest positive correlation with Yield, followed by BiologicalMaterial03 and BiologicalMaterial12. ManufacturingProcess31 has only a slight negative correlation with Yield.
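To see these relationships directly rather than only as correlation coefficients, a brief sketch (assuming chem_man_filtered from above) plots Yield against each of these predictors with a linear trend line:

# scatter plots of Yield against the predictors examined above
chem_man_filtered |>
  dplyr::select(Yield, BiologicalMaterial06, BiologicalMaterial03,
                BiologicalMaterial12, ManufacturingProcess31) |>
  tidyr::pivot_longer(-Yield, names_to = "Predictor", values_to = "Value") |>
  ggplot(aes(x = Value, y = Yield)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE) +
  facet_wrap(~ Predictor, scales = "free_x")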