Do problems 7.2 and 7.5 in Kuhn and Johnson. There are only two problems, but they have many parts. Please submit both a link to your RPubs and the .Rmd file.

library(mlbench)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(MASS)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ purrr::lift()   masks caret::lift()
## ✖ dplyr::select() masks MASS::select()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(TeachingDemos)
library(earth)
## Loading required package: Formula
## Loading required package: plotmo
## Loading required package: plotrix
library(kernlab)
## 
## Attaching package: 'kernlab'
## 
## The following object is masked from 'package:purrr':
## 
##     cross
## 
## The following object is masked from 'package:ggplot2':
## 
##     alpha
library(AppliedPredictiveModeling)
library(DataExplorer)
library(corrplot)
## corrplot 0.92 loaded

7.2. Friedman (1991) introduced several benchmark data sets created by simulation. One of these simulations used the following nonlinear equation to create data:

\(y = 10\sin(\pi x_1 x_2) + 20(x_3 - 0.5)^2 + 10x_4 + 5x_5 + N(0, \sigma^2)\)

where the x values are random variables uniformly distributed on [0, 1] (there are also five non-informative variables created in the simulation). The package mlbench contains a function called mlbench.friedman1 that simulates these data:

set.seed(200)
trainingData <- mlbench.friedman1(200, sd = 1)

## We convert the 'x' data from a matrix to a data frame.
## One reason is that this will give the columns names.

trainingData$x <- data.frame(trainingData$x)

## Look at the data using
featurePlot(trainingData$x, trainingData$y)
## or other methods.

## This creates a list with a vector 'y' and a matrix
## of predictors 'x'. Also simulate a large test set to
## estimate the true error rate with good precision:
testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)
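
Since the noise term has sd = 1, the irreducible error floor is an RMSE of roughly 1, a useful baseline for the models below. A quick sketch verifying this on the test set (mlbench.friedman1 uses the first five columns, X1-X5, as the informative predictors):

# Evaluate the true generating function on the test predictors; the RMSE
# against the noisy response should be close to the noise sd of 1.
f_true <- with(testData$x,
               10 * sin(pi * X1 * X2) + 20 * (X3 - 0.5)^2 + 10 * X4 + 5 * X5)
sqrt(mean((testData$y - f_true)^2))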

Tune several models on these data. For example:

set.seed(20)
knnModel <- train(x = trainingData$x,
                  y = trainingData$y,
                  method = "knn",
                  preProc = c("center", "scale"),
                  tuneLength = 10)


knnEval <- predict(knnModel, newdata = testData$x)
## The function 'postResample' can be used to get the test set 
## performance values

postResample(pred = knnEval, obs = testData$y)
##      RMSE  Rsquared       MAE 
## 3.2040595 0.6819919 2.5683461
netCorr <- findCorrelation(cor(trainingData$x), cutoff = .75)
netCorr
## integer(0)

Since there aren't any highly correlated predictor pairs at the 0.75 cutoff, all of the predictors will be included.
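
As a visual check, the pairwise correlations can also be plotted with corrplot (already loaded); a minimal sketch:

# Pairwise predictor correlations; all should be weak, since the
# simulated predictors are independent uniform draws.
corrplot(cor(trainingData$x))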

tr_ctrl <- trainControl(method = "cv", number = 10)

## Create a specific candidate set of models to evaluate:
nnetGrid <- expand.grid(.decay = c(0, 0.01, .1),
                        .size = c(1:10))
set.seed(80)
nnetTune <- train(trainingData$x, trainingData$y,
                  method = "nnet",
                  tuneGrid = nnetGrid,
                  trControl = tr_ctrl,
                  preProc = c("center", "scale"),
                  linout = TRUE,   # linear output unit for regression
                  trace = FALSE,
                  # upper bound on weights for the largest grid network:
                  # size*(p+1) + size + 1, with size = 10 and p = 10 predictors
                  MaxNWts = 10 * (ncol(trainingData$x) + 1) + 10 + 1,
                  maxit = 500)
nnEval <- predict(nnetTune, testData$x)

postResample(nnEval, testData$y)
##      RMSE  Rsquared       MAE 
## 1.8084487 0.8700858 1.3824598

Let’s run MARS (multivariate adaptive regression splines):

marsGrid <- expand.grid(.degree = 1:2, .nprune = 2:38)

set.seed(49)

# tune
marsTune <- train(trainingData$x, trainingData$y,
                  method = "earth",
                  tuneGrid = marsGrid,
                  trControl = trainControl(method = "cv"))

marsTune
## Multivariate Adaptive Regression Spline 
## 
## 200 samples
##  10 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ... 
## Resampling results across tuning parameters:
## 
##   degree  nprune  RMSE      Rsquared   MAE      
##   1        2      4.453298  0.2381048  3.7135188
##   1        3      3.716789  0.4534328  3.0174508
##   1        4      2.647399  0.7272873  2.1426995
##   1        5      2.437011  0.7612669  1.9662234
##   1        6      2.288726  0.7929477  1.8499270
##   1        7      1.846335  0.8599617  1.4531217
##   1        8      1.765444  0.8758672  1.4042664
##   1        9      1.687700  0.8830707  1.3437344
##   1       10      1.649443  0.8898077  1.2953918
##   1       11      1.638811  0.8920319  1.2992477
##   1       12      1.600789  0.8995394  1.2726796
##   1       13      1.587373  0.9024364  1.2541751
##   1       14      1.588057  0.9024416  1.2569983
##   1       15      1.612197  0.8996727  1.2797158
##   1       16      1.612197  0.8996727  1.2797158
##   1       17      1.612197  0.8996727  1.2797158
##   1       18      1.612197  0.8996727  1.2797158
##   1       19      1.612197  0.8996727  1.2797158
##   1       20      1.612197  0.8996727  1.2797158
##   1       21      1.612197  0.8996727  1.2797158
##   1       22      1.612197  0.8996727  1.2797158
##   1       23      1.612197  0.8996727  1.2797158
##   1       24      1.612197  0.8996727  1.2797158
##   1       25      1.612197  0.8996727  1.2797158
##   1       26      1.612197  0.8996727  1.2797158
##   1       27      1.612197  0.8996727  1.2797158
##   1       28      1.612197  0.8996727  1.2797158
##   1       29      1.612197  0.8996727  1.2797158
##   1       30      1.612197  0.8996727  1.2797158
##   1       31      1.612197  0.8996727  1.2797158
##   1       32      1.612197  0.8996727  1.2797158
##   1       33      1.612197  0.8996727  1.2797158
##   1       34      1.612197  0.8996727  1.2797158
##   1       35      1.612197  0.8996727  1.2797158
##   1       36      1.612197  0.8996727  1.2797158
##   1       37      1.612197  0.8996727  1.2797158
##   1       38      1.612197  0.8996727  1.2797158
##   2        2      4.453298  0.2381048  3.7135188
##   2        3      3.716789  0.4534328  3.0174508
##   2        4      2.640000  0.7297111  2.1125508
##   2        5      2.432251  0.7598140  1.9461738
##   2        6      2.271330  0.7962838  1.7995052
##   2        7      1.766331  0.8718004  1.4021638
##   2        8      1.681055  0.8863789  1.3096644
##   2        9      1.549945  0.9047549  1.2326606
##   2       10      1.368519  0.9237221  1.1115723
##   2       11      1.290260  0.9322441  1.0302311
##   2       12      1.250630  0.9363355  1.0001367
##   2       13      1.240721  0.9355636  0.9826562
##   2       14      1.216473  0.9376457  0.9504278
##   2       15      1.209608  0.9384868  0.9503953
##   2       16      1.202894  0.9394094  0.9496357
##   2       17      1.211619  0.9388804  0.9563377
##   2       18      1.211619  0.9388804  0.9563377
##   2       19      1.211619  0.9388804  0.9563377
##   2       20      1.211619  0.9388804  0.9563377
##   2       21      1.211619  0.9388804  0.9563377
##   2       22      1.211619  0.9388804  0.9563377
##   2       23      1.211619  0.9388804  0.9563377
##   2       24      1.211619  0.9388804  0.9563377
##   2       25      1.211619  0.9388804  0.9563377
##   2       26      1.211619  0.9388804  0.9563377
##   2       27      1.211619  0.9388804  0.9563377
##   2       28      1.211619  0.9388804  0.9563377
##   2       29      1.211619  0.9388804  0.9563377
##   2       30      1.211619  0.9388804  0.9563377
##   2       31      1.211619  0.9388804  0.9563377
##   2       32      1.211619  0.9388804  0.9563377
##   2       33      1.211619  0.9388804  0.9563377
##   2       34      1.211619  0.9388804  0.9563377
##   2       35      1.211619  0.9388804  0.9563377
##   2       36      1.211619  0.9388804  0.9563377
##   2       37      1.211619  0.9388804  0.9563377
##   2       38      1.211619  0.9388804  0.9563377
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 16 and degree = 2.
marsEval <- predict(marsTune, testData$x)

postResample(marsEval, testData$y)
##      RMSE  Rsquared       MAE 
## 1.1492504 0.9471145 0.9158382
set.seed(100)

# Radial-basis SVM; sigma is held constant at an analytic estimate,
# so only the cost parameter C is tuned across 14 values.
svmRTune <- train(trainingData$x, trainingData$y,
                  method = "svmRadial",
                  preProc = c("center", "scale"),
                  tuneLength = 14,
                  trControl = trainControl(method = "cv"))

svmRTune
## Support Vector Machines with Radial Basis Function Kernel 
## 
## 200 samples
##  10 predictor
## 
## Pre-processing: centered (10), scaled (10) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ... 
## Resampling results across tuning parameters:
## 
##   C        RMSE      Rsquared   MAE     
##      0.25  2.530787  0.7922715  2.013175
##      0.50  2.259539  0.8064569  1.789962
##      1.00  2.099807  0.8274221  1.656171
##      2.00  2.002943  0.8412934  1.583791
##      4.00  1.943618  0.8504425  1.546586
##      8.00  1.918690  0.8547623  1.532967
##     16.00  1.920708  0.8536069  1.536132
##     32.00  1.920708  0.8536069  1.536132
##     64.00  1.920708  0.8536069  1.536132
##    128.00  1.920708  0.8536069  1.536132
##    256.00  1.920708  0.8536069  1.536132
##    512.00  1.920708  0.8536069  1.536132
##   1024.00  1.920708  0.8536069  1.536132
##   2048.00  1.920708  0.8536069  1.536132
## 
## Tuning parameter 'sigma' was held constant at a value of 0.06509124
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.06509124 and C = 8.
svmEval <- predict(svmRTune, testData$x)

postResample(svmEval, testData$y)
##      RMSE  Rsquared       MAE 
## 2.0631908 0.8275736 1.5662213

Which models appear to give the best performance? Does MARS select the informative predictors (those named X1–X5)?

Let’s bring the test-set evaluation metrics together:

rbind(knn = postResample(knnEval, testData$y),
      neuralnet = postResample(nnEval, testData$y),
      mars = postResample(marsEval, testData$y),
      svmRadial = postResample(svmEval, testData$y))
##               RMSE  Rsquared       MAE
## knn       3.204059 0.6819919 2.5683461
## neuralnet 1.808449 0.8700858 1.3824598
## mars      1.149250 0.9471145 0.9158382
## svmRadial 2.063191 0.8275736 1.5662213

The MARS model performs best on the unseen test data, with an RMSE of 1.15 and an \(R^2\) of 94.7%, substantially outperforming every other model tested.
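
Before looking at variable importance, the hinge-function terms retained in the final pruned model can be listed directly from the underlying earth fit (output omitted):

# The selected basis functions reveal which predictors MARS kept.
summary(marsTune$finalModel)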

varImp(marsTune)
## earth variable importance
## 
##    Overall
## X1  100.00
## X4   75.23
## X2   48.73
## X5   15.52
## X3    0.00

MARS picks up four of the five informative predictors: X1, X4, X2, and X5 all carry importance, while X3 appears to have been pruned out of the final model (importance of zero).

7.5. Exercise 6.3 describes data for a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before and train several nonlinear regression models.

library(AppliedPredictiveModeling)

Preprocessing Data

data("ChemicalManufacturingProcess")

The data frame ChemicalManufacturingProcess contains the 57 predictors (12 describing the input biological material and 45 describing the process) for the 176 manufacturing runs, along with Yield, the percent yield for each run.
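
A quick dimension check confirms 176 runs and 58 columns (Yield plus the 57 predictors):

dim(ChemicalManufacturingProcess)  # 176 rows, 58 columns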

summary(ChemicalManufacturingProcess)
##      Yield       BiologicalMaterial01 BiologicalMaterial02 BiologicalMaterial03
##  Min.   :35.25   Min.   :4.580        Min.   :46.87        Min.   :56.97       
##  1st Qu.:38.75   1st Qu.:5.978        1st Qu.:52.68        1st Qu.:64.98       
##  Median :39.97   Median :6.305        Median :55.09        Median :67.22       
##  Mean   :40.18   Mean   :6.411        Mean   :55.69        Mean   :67.70       
##  3rd Qu.:41.48   3rd Qu.:6.870        3rd Qu.:58.74        3rd Qu.:70.43       
##  Max.   :46.34   Max.   :8.810        Max.   :64.75        Max.   :78.25       
##                                                                                
##  BiologicalMaterial04 BiologicalMaterial05 BiologicalMaterial06
##  Min.   : 9.38        Min.   :13.24        Min.   :40.60       
##  1st Qu.:11.24        1st Qu.:17.23        1st Qu.:46.05       
##  Median :12.10        Median :18.49        Median :48.46       
##  Mean   :12.35        Mean   :18.60        Mean   :48.91       
##  3rd Qu.:13.22        3rd Qu.:19.90        3rd Qu.:51.34       
##  Max.   :23.09        Max.   :24.85        Max.   :59.38       
##                                                                
##  BiologicalMaterial07 BiologicalMaterial08 BiologicalMaterial09
##  Min.   :100.0        Min.   :15.88        Min.   :11.44       
##  1st Qu.:100.0        1st Qu.:17.06        1st Qu.:12.60       
##  Median :100.0        Median :17.51        Median :12.84       
##  Mean   :100.0        Mean   :17.49        Mean   :12.85       
##  3rd Qu.:100.0        3rd Qu.:17.88        3rd Qu.:13.13       
##  Max.   :100.8        Max.   :19.14        Max.   :14.08       
##                                                                
##  BiologicalMaterial10 BiologicalMaterial11 BiologicalMaterial12
##  Min.   :1.770        Min.   :135.8        Min.   :18.35       
##  1st Qu.:2.460        1st Qu.:143.8        1st Qu.:19.73       
##  Median :2.710        Median :146.1        Median :20.12       
##  Mean   :2.801        Mean   :147.0        Mean   :20.20       
##  3rd Qu.:2.990        3rd Qu.:149.6        3rd Qu.:20.75       
##  Max.   :6.870        Max.   :158.7        Max.   :22.21       
##                                                                
##  ManufacturingProcess01 ManufacturingProcess02 ManufacturingProcess03
##  Min.   : 0.00          Min.   : 0.00          Min.   :1.47          
##  1st Qu.:10.80          1st Qu.:19.30          1st Qu.:1.53          
##  Median :11.40          Median :21.00          Median :1.54          
##  Mean   :11.21          Mean   :16.68          Mean   :1.54          
##  3rd Qu.:12.15          3rd Qu.:21.50          3rd Qu.:1.55          
##  Max.   :14.10          Max.   :22.50          Max.   :1.60          
##  NA's   :1              NA's   :3              NA's   :15            
##  ManufacturingProcess04 ManufacturingProcess05 ManufacturingProcess06
##  Min.   :911.0          Min.   : 923.0         Min.   :203.0         
##  1st Qu.:928.0          1st Qu.: 986.8         1st Qu.:205.7         
##  Median :934.0          Median : 999.2         Median :206.8         
##  Mean   :931.9          Mean   :1001.7         Mean   :207.4         
##  3rd Qu.:936.0          3rd Qu.:1008.9         3rd Qu.:208.7         
##  Max.   :946.0          Max.   :1175.3         Max.   :227.4         
##  NA's   :1              NA's   :1              NA's   :2             
##  ManufacturingProcess07 ManufacturingProcess08 ManufacturingProcess09
##  Min.   :177.0          Min.   :177.0          Min.   :38.89         
##  1st Qu.:177.0          1st Qu.:177.0          1st Qu.:44.89         
##  Median :177.0          Median :178.0          Median :45.73         
##  Mean   :177.5          Mean   :177.6          Mean   :45.66         
##  3rd Qu.:178.0          3rd Qu.:178.0          3rd Qu.:46.52         
##  Max.   :178.0          Max.   :178.0          Max.   :49.36         
##  NA's   :1              NA's   :1                                    
##  ManufacturingProcess10 ManufacturingProcess11 ManufacturingProcess12
##  Min.   : 7.500         Min.   : 7.500         Min.   :   0.0        
##  1st Qu.: 8.700         1st Qu.: 9.000         1st Qu.:   0.0        
##  Median : 9.100         Median : 9.400         Median :   0.0        
##  Mean   : 9.179         Mean   : 9.386         Mean   : 857.8        
##  3rd Qu.: 9.550         3rd Qu.: 9.900         3rd Qu.:   0.0        
##  Max.   :11.600         Max.   :11.500         Max.   :4549.0        
##  NA's   :9              NA's   :10             NA's   :1             
##  ManufacturingProcess13 ManufacturingProcess14 ManufacturingProcess15
##  Min.   :32.10          Min.   :4701           Min.   :5904          
##  1st Qu.:33.90          1st Qu.:4828           1st Qu.:6010          
##  Median :34.60          Median :4856           Median :6032          
##  Mean   :34.51          Mean   :4854           Mean   :6039          
##  3rd Qu.:35.20          3rd Qu.:4882           3rd Qu.:6061          
##  Max.   :38.60          Max.   :5055           Max.   :6233          
##                         NA's   :1                                    
##  ManufacturingProcess16 ManufacturingProcess17 ManufacturingProcess18
##  Min.   :   0           Min.   :31.30          Min.   :   0          
##  1st Qu.:4561           1st Qu.:33.50          1st Qu.:4813          
##  Median :4588           Median :34.40          Median :4835          
##  Mean   :4566           Mean   :34.34          Mean   :4810          
##  3rd Qu.:4619           3rd Qu.:35.10          3rd Qu.:4862          
##  Max.   :4852           Max.   :40.00          Max.   :4971          
##                                                                      
##  ManufacturingProcess19 ManufacturingProcess20 ManufacturingProcess21
##  Min.   :5890           Min.   :   0           Min.   :-1.8000       
##  1st Qu.:6001           1st Qu.:4553           1st Qu.:-0.6000       
##  Median :6022           Median :4582           Median :-0.3000       
##  Mean   :6028           Mean   :4556           Mean   :-0.1642       
##  3rd Qu.:6050           3rd Qu.:4610           3rd Qu.: 0.0000       
##  Max.   :6146           Max.   :4759           Max.   : 3.6000       
##                                                                      
##  ManufacturingProcess22 ManufacturingProcess23 ManufacturingProcess24
##  Min.   : 0.000         Min.   :0.000          Min.   : 0.000        
##  1st Qu.: 3.000         1st Qu.:2.000          1st Qu.: 4.000        
##  Median : 5.000         Median :3.000          Median : 8.000        
##  Mean   : 5.406         Mean   :3.017          Mean   : 8.834        
##  3rd Qu.: 8.000         3rd Qu.:4.000          3rd Qu.:14.000        
##  Max.   :12.000         Max.   :6.000          Max.   :23.000        
##  NA's   :1              NA's   :1              NA's   :1             
##  ManufacturingProcess25 ManufacturingProcess26 ManufacturingProcess27
##  Min.   :   0           Min.   :   0           Min.   :   0          
##  1st Qu.:4832           1st Qu.:6020           1st Qu.:4560          
##  Median :4855           Median :6047           Median :4587          
##  Mean   :4828           Mean   :6016           Mean   :4563          
##  3rd Qu.:4877           3rd Qu.:6070           3rd Qu.:4609          
##  Max.   :4990           Max.   :6161           Max.   :4710          
##  NA's   :5              NA's   :5              NA's   :5             
##  ManufacturingProcess28 ManufacturingProcess29 ManufacturingProcess30
##  Min.   : 0.000         Min.   : 0.00          Min.   : 0.000        
##  1st Qu.: 0.000         1st Qu.:19.70          1st Qu.: 8.800        
##  Median :10.400         Median :19.90          Median : 9.100        
##  Mean   : 6.592         Mean   :20.01          Mean   : 9.161        
##  3rd Qu.:10.750         3rd Qu.:20.40          3rd Qu.: 9.700        
##  Max.   :11.500         Max.   :22.00          Max.   :11.200        
##  NA's   :5              NA's   :5              NA's   :5             
##  ManufacturingProcess31 ManufacturingProcess32 ManufacturingProcess33
##  Min.   : 0.00          Min.   :143.0          Min.   :56.00         
##  1st Qu.:70.10          1st Qu.:155.0          1st Qu.:62.00         
##  Median :70.80          Median :158.0          Median :64.00         
##  Mean   :70.18          Mean   :158.5          Mean   :63.54         
##  3rd Qu.:71.40          3rd Qu.:162.0          3rd Qu.:65.00         
##  Max.   :72.50          Max.   :173.0          Max.   :70.00         
##  NA's   :5                                     NA's   :5             
##  ManufacturingProcess34 ManufacturingProcess35 ManufacturingProcess36
##  Min.   :2.300          Min.   :463.0          Min.   :0.01700       
##  1st Qu.:2.500          1st Qu.:490.0          1st Qu.:0.01900       
##  Median :2.500          Median :495.0          Median :0.02000       
##  Mean   :2.494          Mean   :495.6          Mean   :0.01957       
##  3rd Qu.:2.500          3rd Qu.:501.5          3rd Qu.:0.02000       
##  Max.   :2.600          Max.   :522.0          Max.   :0.02200       
##  NA's   :5              NA's   :5              NA's   :5             
##  ManufacturingProcess37 ManufacturingProcess38 ManufacturingProcess39
##  Min.   :0.000          Min.   :0.000          Min.   :0.000         
##  1st Qu.:0.700          1st Qu.:2.000          1st Qu.:7.100         
##  Median :1.000          Median :3.000          Median :7.200         
##  Mean   :1.014          Mean   :2.534          Mean   :6.851         
##  3rd Qu.:1.300          3rd Qu.:3.000          3rd Qu.:7.300         
##  Max.   :2.300          Max.   :3.000          Max.   :7.500         
##                                                                      
##  ManufacturingProcess40 ManufacturingProcess41 ManufacturingProcess42
##  Min.   :0.00000        Min.   :0.00000        Min.   : 0.00         
##  1st Qu.:0.00000        1st Qu.:0.00000        1st Qu.:11.40         
##  Median :0.00000        Median :0.00000        Median :11.60         
##  Mean   :0.01771        Mean   :0.02371        Mean   :11.21         
##  3rd Qu.:0.00000        3rd Qu.:0.00000        3rd Qu.:11.70         
##  Max.   :0.10000        Max.   :0.20000        Max.   :12.10         
##  NA's   :1              NA's   :1                                    
##  ManufacturingProcess43 ManufacturingProcess44 ManufacturingProcess45
##  Min.   : 0.0000        Min.   :0.000          Min.   :0.000         
##  1st Qu.: 0.6000        1st Qu.:1.800          1st Qu.:2.100         
##  Median : 0.8000        Median :1.900          Median :2.200         
##  Mean   : 0.9119        Mean   :1.805          Mean   :2.138         
##  3rd Qu.: 1.0250        3rd Qu.:1.900          3rd Qu.:2.300         
##  Max.   :11.0000        Max.   :2.100          Max.   :2.600         
## 
x_chem <- ChemicalManufacturingProcess |> select(-Yield)
y_chem <- ChemicalManufacturingProcess |> select(Yield)
1. A small percentage of cells in the predictor set contain missing values. Use an imputation function to fill in these missing values (e.g., see Sect. 3.8).
x_chem |>
  pivot_longer(cols = colnames(x_chem)) |>
  filter(is.na(value)) |>
  group_by(name) |>
  summarise(cnt = n()) |>
  arrange(-cnt)
## # A tibble: 28 × 2
##    name                     cnt
##    <chr>                  <int>
##  1 ManufacturingProcess03    15
##  2 ManufacturingProcess11    10
##  3 ManufacturingProcess10     9
##  4 ManufacturingProcess25     5
##  5 ManufacturingProcess26     5
##  6 ManufacturingProcess27     5
##  7 ManufacturingProcess28     5
##  8 ManufacturingProcess29     5
##  9 ManufacturingProcess30     5
## 10 ManufacturingProcess31     5
## # ℹ 18 more rows

This graphic shows the percentage of missing values by column, which is easier to review in one place than the row-level listing above.

plot_missing(x_chem, missing_only = TRUE,
                   ggtheme = theme_classic())

preProcess offers simpler imputation options (such as median imputation), but to better preserve the structure of the data, K-nearest neighbors imputation will be used here. Note that knnImpute also centers and scales the predictors, so the imputed data come back on the standardized scale.

Let’s apply KNN to find the nearest neighbors for imputation:

imputed <- preProcess(x_chem, method = c('center', 'scale', 'knnImpute'), k = 5)
x_chem_imp <- predict(imputed, x_chem)
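# Sanity check (added): no missing values should remain after imputation.
sum(is.na(x_chem_imp))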
set.seed(21)
chem_train <- caret::createDataPartition(y_chem$Yield, p = .70, list = FALSE)
x_train_chem <- x_chem_imp[chem_train,]
x_test_chem <- x_chem_imp[-chem_train,]
y_train_chem <- y_chem[chem_train,]
y_test_chem <- y_chem[-chem_train,]
2. Which nonlinear regression model gives the optimal resampling and test set performance?
knnModelChem <- train(x_train_chem, y_train_chem,
                  method = "knn",
                  preProc = c("center", "scale"),
                  tuneLength = 10)  # note: default bootstrap resampling (25 reps)

knnModelChem
## k-Nearest Neighbors 
## 
## 124 samples
##  57 predictor
## 
## Pre-processing: centered (57), scaled (57) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 124, 124, 124, 124, 124, 124, ... 
## Resampling results across tuning parameters:
## 
##   k   RMSE      Rsquared   MAE     
##    5  1.539353  0.3730672  1.187735
##    7  1.550228  0.3674418  1.203827
##    9  1.540725  0.3749625  1.203754
##   11  1.539332  0.3820396  1.211714
##   13  1.557893  0.3699454  1.237965
##   15  1.571715  0.3623136  1.251439
##   17  1.580508  0.3571565  1.256772
##   19  1.593748  0.3477022  1.269411
##   21  1.604783  0.3431912  1.279409
##   23  1.613204  0.3436896  1.286263
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 11.

Neural Network:

# Remove highly correlated predictors (cutoff 0.75) before fitting the
# neural network, which is sensitive to collinearity between predictors.
overCorr <- findCorrelation(cor(x_train_chem), cutoff = .75)

train_x_chem_nnet <- x_train_chem[, -overCorr]
test_x_chem_nnet <- x_test_chem[, -overCorr]
nnetGridChem <- expand.grid(.decay = c(0, 0.01, .1),
                        .size = c(1:10))
set.seed(81)

# tune
nnetTuneChem <- train(train_x_chem_nnet, y_train_chem,
                  method = "nnet",
                  tuneGrid = nnetGridChem,
                  trControl = tr_ctrl,
                  preProc = c("center", "scale"),
                  linout = TRUE,
                  trace = FALSE,
                  MaxNWts = 10 * (ncol(train_x_chem_nnet) + 1) + 10 + 1,
                  maxit = 500)

nnetTuneChem
## Neural Network 
## 
## 124 samples
##  39 predictor
## 
## Pre-processing: centered (39), scaled (39) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 112, 111, 112, 111, 112, 111, ... 
## Resampling results across tuning parameters:
## 
##   decay  size  RMSE      Rsquared   MAE     
##   0.00    1    7.844379  0.1826533  3.983008
##   0.00    2    1.858824  0.2189794  1.483899
##   0.00    3    3.133294  0.2058258  2.485250
##   0.00    4    3.655149  0.1574215  2.857236
##   0.00    5    3.294300  0.2210634  2.613794
##   0.00    6    3.285878  0.2218604  2.591461
##   0.00    7    3.465201  0.3287468  2.793021
##   0.00    8    3.604301  0.2678209  2.821120
##   0.00    9    3.229658  0.1458415  2.544217
##   0.00   10    6.159846  0.1201793  4.495554
##   0.01    1    1.867994  0.2969659  1.508758
##   0.01    2    2.285950  0.2543983  1.770648
##   0.01    3    2.781737  0.2199273  2.173680
##   0.01    4    2.740031  0.2947405  2.092941
##   0.01    5    2.258026  0.3590512  1.819064
##   0.01    6    3.094524  0.2360432  2.181921
##   0.01    7    2.389343  0.2738695  1.897760
##   0.01    8    2.544579  0.2597203  2.091573
##   0.01    9    2.502324  0.2675740  2.035874
##   0.01   10    2.916754  0.2207930  2.359329
##   0.10    1    1.661193  0.4068491  1.374987
##   0.10    2    2.603700  0.2955486  1.988476
##   0.10    3    2.397494  0.2876218  1.831526
##   0.10    4    2.888888  0.2121642  2.117608
##   0.10    5    2.685789  0.2991142  1.882285
##   0.10    6    2.112659  0.3458989  1.622275
##   0.10    7    2.367887  0.3481763  1.627867
##   0.10    8    2.106920  0.3096946  1.614022
##   0.10    9    2.522811  0.2175889  1.837074
##   0.10   10    2.072935  0.3847126  1.535177
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were size = 1 and decay = 0.1.

MARS (reusing marsGrid from Problem 7.2):

set.seed(92)

# tune
marsTuneChem <- train(x_train_chem, y_train_chem,
                  method = "earth",
                  tuneGrid = marsGrid,
                  trControl = trainControl(method = "cv"))

marsTuneChem
## Multivariate Adaptive Regression Spline 
## 
## 124 samples
##  57 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 112, 112, 112, 112, 111, 112, ... 
## Resampling results across tuning parameters:
## 
##   degree  nprune  RMSE      Rsquared   MAE      
##   1        2      1.386589  0.4927736  1.0920068
##   1        3      1.308517  0.5319293  1.0612891
##   1        4      1.209389  0.6102553  0.9658341
##   1        5      1.256809  0.5726219  1.0226502
##   1        6      1.232662  0.5924830  1.0239954
##   1        7      1.219399  0.6048860  1.0033693
##   1        8      1.200411  0.6107601  1.0169214
##   1        9      1.263842  0.5617176  1.0671156
##   1       10      1.288852  0.5461784  1.0445129
##   1       11      1.270242  0.5754780  1.0121190
##   1       12      1.256598  0.5864675  1.0150753
##   1       13      1.247105  0.6012662  1.0031781
##   1       14      1.255035  0.5991353  0.9936429
##   1       15      1.279054  0.5931904  1.0072173
##   1       16      1.266442  0.5992590  0.9999428
##   1       17      1.266442  0.5992590  0.9999428
##   1       18      1.282769  0.5884693  1.0106446
##   1       19      1.276720  0.5928197  1.0041556
##   1       20      1.276720  0.5928197  1.0041556
##   1       21      1.276720  0.5928197  1.0041556
##   1       22      1.276720  0.5928197  1.0041556
##   1       23      1.276720  0.5928197  1.0041556
##   1       24      1.276720  0.5928197  1.0041556
##   1       25      1.276720  0.5928197  1.0041556
##   1       26      1.276720  0.5928197  1.0041556
##   1       27      1.276720  0.5928197  1.0041556
##   1       28      1.276720  0.5928197  1.0041556
##   1       29      1.276720  0.5928197  1.0041556
##   1       30      1.276720  0.5928197  1.0041556
##   1       31      1.276720  0.5928197  1.0041556
##   1       32      1.276720  0.5928197  1.0041556
##   1       33      1.276720  0.5928197  1.0041556
##   1       34      1.276720  0.5928197  1.0041556
##   1       35      1.276720  0.5928197  1.0041556
##   1       36      1.276720  0.5928197  1.0041556
##   1       37      1.276720  0.5928197  1.0041556
##   1       38      1.276720  0.5928197  1.0041556
##   2        2      1.386589  0.4927736  1.0920068
##   2        3      1.413533  0.4506697  1.0995737
##   2        4      1.266432  0.5862261  0.9795224
##   2        5      1.202265  0.6014250  0.9559162
##   2        6      1.365030  0.4993974  1.1033808
##   2        7      1.357540  0.5252833  1.0762755
##   2        8      1.295411  0.5625124  1.0525017
##   2        9      1.431936  0.4787384  1.1551828
##   2       10      1.408014  0.4992009  1.1319615
##   2       11      1.374688  0.5389312  1.1253720
##   2       12      1.325663  0.5742589  1.0794703
##   2       13      1.297948  0.5967460  1.0666514
##   2       14      1.347467  0.5718709  1.0988817
##   2       15      1.314815  0.5888026  1.0596562
##   2       16      1.328870  0.5793341  1.0754231
##   2       17      1.341726  0.5791109  1.0856543
##   2       18      1.517880  0.5392842  1.1924067
##   2       19      1.628665  0.5141540  1.2521292
##   2       20      1.643833  0.5137175  1.2506165
##   2       21      1.633878  0.5174490  1.2462587
##   2       22      1.619016  0.5145373  1.2426204
##   2       23      1.602043  0.5145993  1.2380195
##   2       24      1.602043  0.5145993  1.2380195
##   2       25      1.602043  0.5145993  1.2380195
##   2       26      1.602043  0.5145993  1.2380195
##   2       27      1.602043  0.5145993  1.2380195
##   2       28      1.602043  0.5145993  1.2380195
##   2       29      1.602043  0.5145993  1.2380195
##   2       30      1.602043  0.5145993  1.2380195
##   2       31      1.602043  0.5145993  1.2380195
##   2       32      1.602043  0.5145993  1.2380195
##   2       33      1.602043  0.5145993  1.2380195
##   2       34      1.602043  0.5145993  1.2380195
##   2       35      1.602043  0.5145993  1.2380195
##   2       36      1.602043  0.5145993  1.2380195
##   2       37      1.602043  0.5145993  1.2380195
##   2       38      1.602043  0.5145993  1.2380195
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 8 and degree = 1.
set.seed(83)

svmRTuneChem <- train(x_train_chem, y_train_chem,
                  method = "svmRadial",
                  preProc = c("center", "scale"),
                  tuneLength = 14,
                  trControl = trainControl(method = "cv"))

svmRTuneChem
## Support Vector Machines with Radial Basis Function Kernel 
## 
## 124 samples
##  57 predictor
## 
## Pre-processing: centered (57), scaled (57) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 112, 111, 112, 112, 112, 112, ... 
## Resampling results across tuning parameters:
## 
##   C        RMSE      Rsquared   MAE      
##      0.25  1.510612  0.4682538  1.2342797
##      0.50  1.403611  0.5150776  1.1469636
##      1.00  1.318116  0.5733986  1.0789297
##      2.00  1.266319  0.5995068  1.0228569
##      4.00  1.242479  0.6008563  0.9927292
##      8.00  1.233571  0.5982649  0.9818278
##     16.00  1.233284  0.5969224  0.9776539
##     32.00  1.233284  0.5969224  0.9776539
##     64.00  1.233284  0.5969224  0.9776539
##    128.00  1.233284  0.5969224  0.9776539
##    256.00  1.233284  0.5969224  0.9776539
##    512.00  1.233284  0.5969224  0.9776539
##   1024.00  1.233284  0.5969224  0.9776539
##   2048.00  1.233284  0.5969224  0.9776539
## 
## Tuning parameter 'sigma' was held constant at a value of 0.01253516
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.01253516 and C = 16.
svmChemEval <- predict(svmRTuneChem, x_test_chem)
marsChemEval <- predict(marsTuneChem, x_test_chem)
nnetChemEval <- predict(nnetTuneChem, x_test_chem)
knnChemEval <- predict(knnModelChem, x_test_chem)

chemEvals <- rbind(postResample(svmChemEval, y_test_chem),
                   postResample(marsChemEval, y_test_chem),
                   postResample(nnetChemEval, y_test_chem),
                   postResample(knnChemEval, y_test_chem))
rownames(chemEvals) <- c('SVM - Radial','MARS','Neural Net','KNN')
chemEvals
##                   RMSE  Rsquared       MAE
## SVM - Radial 0.9876493 0.6828667 0.7464189
## MARS         1.1175141 0.5959044 0.8771180
## Neural Net   1.4834263 0.4127663 1.1407492
## KNN          1.1538981 0.5978144 0.9593298

The radial basis SVM performs best of all the nonlinear methods tested: it has the lowest test set RMSE and explains about 68.3% of the variation in yield on the test set.
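
For the resampling side of the question, each model's resampled performance at its chosen tuning parameters can be pulled together with caret's getTrainPerf(); a sketch (output omitted):

# Resampling estimates at the selected tuning parameters. Note that
# knnModelChem was resampled by bootstrap while the others used 10-fold
# CV, so these estimates are not perfectly comparable.
rbind(getTrainPerf(knnModelChem),
      getTrainPerf(nnetTuneChem),
      getTrainPerf(marsTuneChem),
      getTrainPerf(svmRTuneChem))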

3. Which predictors are most important in the optimal nonlinear regression model? Do either the biological or process variables dominate the list? How do the top ten important predictors compare to the top ten predictors from the optimal linear model?
set.seed(12)
# Grid search for optimized penalty lambdas
enet_gs <- expand.grid(.lambda = c(0, 0.01, .1), .fraction = seq(.05, 1, length = 20))

# tuning the penalized (elastic net) regression model from Exercise 6.3
enet_fit_chem <- train(x_train_chem, y_train_chem,
                       method = "enet",
                       tuneGrid = enet_gs,
                       trControl = tr_ctrl,
                       preProc = c("center", "scale"))

x_enet_pred_chem <- predict(enet_fit_chem, x_test_chem)  # linear-model test predictions (not used below)
varImp(enet_fit_chem)
## loess r-squared variable importance
## 
##   only 20 most important variables shown (out of 57)
## 
##                        Overall
## ManufacturingProcess32  100.00
## ManufacturingProcess13   69.74
## BiologicalMaterial06     57.49
## BiologicalMaterial03     54.28
## ManufacturingProcess17   53.61
## BiologicalMaterial02     50.70
## ManufacturingProcess36   46.85
## ManufacturingProcess09   46.28
## ManufacturingProcess06   43.46
## ManufacturingProcess29   42.11
## ManufacturingProcess31   41.70
## BiologicalMaterial12     40.91
## ManufacturingProcess33   40.34
## BiologicalMaterial11     35.19
## BiologicalMaterial04     35.10
## BiologicalMaterial09     34.22
## ManufacturingProcess02   30.31
## ManufacturingProcess30   27.60
## BiologicalMaterial01     26.26
## ManufacturingProcess16   25.92
varImp(svmRTuneChem)
## loess r-squared variable importance
## 
##   only 20 most important variables shown (out of 57)
## 
##                        Overall
## ManufacturingProcess32  100.00
## ManufacturingProcess13   69.74
## BiologicalMaterial06     57.49
## BiologicalMaterial03     54.28
## ManufacturingProcess17   53.61
## BiologicalMaterial02     50.70
## ManufacturingProcess36   46.85
## ManufacturingProcess09   46.28
## ManufacturingProcess06   43.46
## ManufacturingProcess29   42.11
## ManufacturingProcess31   41.70
## BiologicalMaterial12     40.91
## ManufacturingProcess33   40.34
## BiologicalMaterial11     35.19
## BiologicalMaterial04     35.10
## BiologicalMaterial09     34.22
## ManufacturingProcess02   30.31
## ManufacturingProcess30   27.60
## BiologicalMaterial01     26.26
## ManufacturingProcess16   25.92
# Note: plot(varImp(...)) returns lattice objects, so par(mfrow = c(1, 2))
# would have no effect; the two importance plots render separately.
plot(varImp(enet_fit_chem), top = 10)

plot(varImp(svmRTuneChem), top = 10)

The top 10 most important predictors are identical for the linear and nonlinear models. Note, however, that both tables report "loess r-squared variable importance": caret has no model-specific importance measure for either of these fits, so it falls back to the same model-free loess R-squared ranking in both cases, which is why the lists match exactly. Within the top ten, process variables slightly outnumber biological ones (six to four), so the manufacturing process predictors dominate somewhat.
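
That fallback can be reproduced directly with caret's filterVarImp(), which computes the same loess-based measure; a minimal sketch:

# Model-free importance: loess R-squared of each predictor against Yield.
loess_imp <- filterVarImp(x_train_chem, y_train_chem, nonpara = TRUE)
head(loess_imp[order(-loess_imp$Overall), , drop = FALSE], 10)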

4. Explore the relationships between the top predictors and the response for the predictors that are unique to the optimal nonlinear regression model. Do these plots reveal intuition about the biological or process predictors and their relationship with yield?

Since the top 10 predictors are the same for both the linear and nonlinear models, none are unique to the nonlinear model; instead, the relationships between the top predictors and the response are examined directly. The correlation plot and scatterplots below suggest that most of the top predictors have fairly linear relationships with Yield. ManufacturingProcess29 and 36 appear to have the weakest linear relationships among the most important predictors.

top10svm <- varImp(svmRTuneChem)$importance |>
  arrange(-Overall) |>
  head(10)

corrplot(cor(cbind(x_chem_imp, y_chem) |>
               select(c('Yield', rownames(top10svm)))))

# Top ten predictors (plus Yield), each plotted against Yield below
imp_chem_preds <- c('ManufacturingProcess32', 'ManufacturingProcess13',
                    'BiologicalMaterial06', 'BiologicalMaterial03',
                    'BiologicalMaterial02', 'ManufacturingProcess17',
                    'ManufacturingProcess36', 'ManufacturingProcess09',
                    'ManufacturingProcess06', 'ManufacturingProcess29',
                    'Yield')
par(mfrow=c(4,3))
par(mai=c(.3,.3,.3,.3))

plot(x = x_chem_imp$ManufacturingProcess32, y = y_chem$Yield, main = 'Manu_Process32 vs Yield')
plot(x = x_chem_imp$ManufacturingProcess36, y = y_chem$Yield, main = 'Manu_Process36 vs Yield')
plot(x = x_chem_imp$ManufacturingProcess17, y = y_chem$Yield, main = 'Manu_Process17 vs Yield')
plot(x = x_chem_imp$ManufacturingProcess13, y = y_chem$Yield, main = 'Manu_Process13 vs Yield')
plot(x = x_chem_imp$ManufacturingProcess06, y = y_chem$Yield, main = 'Manu_Process06 vs Yield')
plot(x = x_chem_imp$ManufacturingProcess29, y = y_chem$Yield, main = 'Manu_Process29 vs Yield')
plot(x = x_chem_imp$ManufacturingProcess09, y = y_chem$Yield, main = 'Manu_Process09 vs Yield')
plot(x = x_chem_imp$BiologicalMaterial03, y = y_chem$Yield, main = 'Bio_Material03 vs Yield')
plot(x = x_chem_imp$BiologicalMaterial02, y = y_chem$Yield, main = 'Bio_Material02 vs Yield')
plot(x = x_chem_imp$BiologicalMaterial06, y = y_chem$Yield, main = 'Bio_Material06 vs Yield')