Do problems 7.2 and 7.5 in Kuhn and Johnson. There are only two, but they have many parts. Please submit both a link to your Rpubs and the .rmd file.
library(mlbench)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(MASS)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::lift() masks caret::lift()
## ✖ dplyr::select() masks MASS::select()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(TeachingDemos)
library(earth)
## Loading required package: Formula
## Loading required package: plotmo
## Loading required package: plotrix
library(kernlab)
##
## Attaching package: 'kernlab'
##
## The following object is masked from 'package:purrr':
##
## cross
##
## The following object is masked from 'package:ggplot2':
##
## alpha
library(AppliedPredictiveModeling)
library(DataExplorer)
library(corrplot)
## corrplot 0.92 loaded
7.2. Friedman (1991) introduced several benchmark data sets created by simulation. One of these simulations used the following nonlinear equation to create data:
\(y = 10\sin(\pi x_1 x_2) + 20(x_3 - 0.5)^2 + 10x_4 + 5x_5 + N(0, \sigma^2)\)
where the x values are random variables uniformly distributed on [0, 1] (there are also five other non-informative variables created in the simulation). The package mlbench contains a function called mlbench.friedman1 that simulates these data:
set.seed(200)
trainingData <- mlbench.friedman1(200, sd = 1)
## We convert the 'x' data from a matrix to a data frame
## One reason is that this will give the columns names.
trainingData$x <- data.frame(trainingData$x)
## Look at the data using
featurePlot(trainingData$x, trainingData$y)
## or other methods.
## This creates a list with a vector 'y' and a matrix
## of predictors 'x'. Also simulate a large test set to
## estimate the true error rate with good precision:
testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)
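As a quick sanity check, we can plug the simulated predictors back into Friedman’s equation and confirm that the leftover noise has a standard deviation near the simulated sd = 1 (a minimal sketch; f_true is just a helper name introduced here):
# noiseless Friedman function evaluated on the simulated predictors
f_true <- function(x) {
  10 * sin(pi * x$X1 * x$X2) + 20 * (x$X3 - 0.5)^2 + 10 * x$X4 + 5 * x$X5
}
sd(trainingData$y - f_true(trainingData$x))  # should be close to 1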
Tune several models on these data. For example:
set.seed(20)
knnModel <- train(x = trainingData$x, y = trainingData$y,
                  method = "knn",
                  preProc = c("center", "scale"),
                  tuneLength = 10)
knnEval <- predict(knnModel, newdata = testData$x)
## The function 'postResample' can be used to get the test set
## performance values
postResample(pred = knnEval, obs = testData$y)
## RMSE Rsquared MAE
## 3.2040595 0.6819919 2.5683461
netCorr <- findCorrelation(cor(trainingData$x), cutoff = .75)
netCorr
## integer(0)
Since there aren’t any highly correlated predictors (findCorrelation returns none at the 0.75 cutoff), all of the predictors will be included.
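To back this up, the distribution of absolute pairwise correlations among the predictors can be summarized directly (a small sketch):
cors <- cor(trainingData$x)
summary(abs(cors[upper.tri(cors)]))  # expected to sit well below the 0.75 cutoff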
tr_ctrl <- trainControl(method = "cv", number = 10)
## Create a specific candidate set of models to evaluate:
nnetGrid <- expand.grid(.decay = c(0, 0.01, .1),
.size = c(1:10))
set.seed(80)
nnetTune <- train(trainingData$x, trainingData$y,
method = "nnet",
tuneGrid = nnetGrid,
trControl = tr_ctrl,
preProc = c("center", "scale"),
linout = TRUE,
trace = FALSE,
MaxNWts = 10 * (ncol(trainingData$x) + 1) + 10 + 1,
maxit = 500)
nnEval <- predict(nnetTune, testData$x)
postResample(nnEval, testData$y)
## RMSE Rsquared MAE
## 1.8084487 0.8700858 1.3824598
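Kuhn and Johnson also recommend averaging several networks with different random starting weights to reduce the variance of a single fit. A sketch of that variant using caret’s "avNNet" method (not run here; nnetAvGrid and avNnetTune are illustrative names, and bag = FALSE is an assumed setting):
nnetAvGrid <- expand.grid(.decay = c(0, 0.01, .1),
                          .size = c(1:10),
                          .bag = FALSE)
set.seed(80)
# averages several nnet fits per candidate (size, decay) pair
avNnetTune <- train(trainingData$x, trainingData$y,
                    method = "avNNet",
                    tuneGrid = nnetAvGrid,
                    trControl = tr_ctrl,
                    preProc = c("center", "scale"),
                    linout = TRUE,
                    trace = FALSE,
                    MaxNWts = 10 * (ncol(trainingData$x) + 1) + 10 + 1,
                    maxit = 500)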
Next, let’s run MARS (multivariate adaptive regression splines):
marsGrid <- expand.grid(.degree = 1:2, .nprune = 2:38)
set.seed(49)
# tune
marsTune <- train(trainingData$x, trainingData$y,
method = "earth",
tuneGrid = marsGrid,
trControl = trainControl(method = "cv"))
marsTune
## Multivariate Adaptive Regression Spline
##
## 200 samples
## 10 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ...
## Resampling results across tuning parameters:
##
## degree nprune RMSE Rsquared MAE
## 1 2 4.453298 0.2381048 3.7135188
## 1 3 3.716789 0.4534328 3.0174508
## 1 4 2.647399 0.7272873 2.1426995
## 1 5 2.437011 0.7612669 1.9662234
## 1 6 2.288726 0.7929477 1.8499270
## 1 7 1.846335 0.8599617 1.4531217
## 1 8 1.765444 0.8758672 1.4042664
## 1 9 1.687700 0.8830707 1.3437344
## 1 10 1.649443 0.8898077 1.2953918
## 1 11 1.638811 0.8920319 1.2992477
## 1 12 1.600789 0.8995394 1.2726796
## 1 13 1.587373 0.9024364 1.2541751
## 1 14 1.588057 0.9024416 1.2569983
## 1 15 1.612197 0.8996727 1.2797158
## 1 16 1.612197 0.8996727 1.2797158
## 1 17 1.612197 0.8996727 1.2797158
## 1 18 1.612197 0.8996727 1.2797158
## 1 19 1.612197 0.8996727 1.2797158
## 1 20 1.612197 0.8996727 1.2797158
## 1 21 1.612197 0.8996727 1.2797158
## 1 22 1.612197 0.8996727 1.2797158
## 1 23 1.612197 0.8996727 1.2797158
## 1 24 1.612197 0.8996727 1.2797158
## 1 25 1.612197 0.8996727 1.2797158
## 1 26 1.612197 0.8996727 1.2797158
## 1 27 1.612197 0.8996727 1.2797158
## 1 28 1.612197 0.8996727 1.2797158
## 1 29 1.612197 0.8996727 1.2797158
## 1 30 1.612197 0.8996727 1.2797158
## 1 31 1.612197 0.8996727 1.2797158
## 1 32 1.612197 0.8996727 1.2797158
## 1 33 1.612197 0.8996727 1.2797158
## 1 34 1.612197 0.8996727 1.2797158
## 1 35 1.612197 0.8996727 1.2797158
## 1 36 1.612197 0.8996727 1.2797158
## 1 37 1.612197 0.8996727 1.2797158
## 1 38 1.612197 0.8996727 1.2797158
## 2 2 4.453298 0.2381048 3.7135188
## 2 3 3.716789 0.4534328 3.0174508
## 2 4 2.640000 0.7297111 2.1125508
## 2 5 2.432251 0.7598140 1.9461738
## 2 6 2.271330 0.7962838 1.7995052
## 2 7 1.766331 0.8718004 1.4021638
## 2 8 1.681055 0.8863789 1.3096644
## 2 9 1.549945 0.9047549 1.2326606
## 2 10 1.368519 0.9237221 1.1115723
## 2 11 1.290260 0.9322441 1.0302311
## 2 12 1.250630 0.9363355 1.0001367
## 2 13 1.240721 0.9355636 0.9826562
## 2 14 1.216473 0.9376457 0.9504278
## 2 15 1.209608 0.9384868 0.9503953
## 2 16 1.202894 0.9394094 0.9496357
## 2 17 1.211619 0.9388804 0.9563377
## 2 18 1.211619 0.9388804 0.9563377
## 2 19 1.211619 0.9388804 0.9563377
## 2 20 1.211619 0.9388804 0.9563377
## 2 21 1.211619 0.9388804 0.9563377
## 2 22 1.211619 0.9388804 0.9563377
## 2 23 1.211619 0.9388804 0.9563377
## 2 24 1.211619 0.9388804 0.9563377
## 2 25 1.211619 0.9388804 0.9563377
## 2 26 1.211619 0.9388804 0.9563377
## 2 27 1.211619 0.9388804 0.9563377
## 2 28 1.211619 0.9388804 0.9563377
## 2 29 1.211619 0.9388804 0.9563377
## 2 30 1.211619 0.9388804 0.9563377
## 2 31 1.211619 0.9388804 0.9563377
## 2 32 1.211619 0.9388804 0.9563377
## 2 33 1.211619 0.9388804 0.9563377
## 2 34 1.211619 0.9388804 0.9563377
## 2 35 1.211619 0.9388804 0.9563377
## 2 36 1.211619 0.9388804 0.9563377
## 2 37 1.211619 0.9388804 0.9563377
## 2 38 1.211619 0.9388804 0.9563377
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 16 and degree = 2.
marsEval <- predict(marsTune, testData$x)
postResample(marsEval, testData$y)
## RMSE Rsquared MAE
## 1.1492504 0.9471145 0.9158382
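To inspect the hinge functions the final model selected, the underlying earth fit can be summarized (output omitted here):
summary(marsTune$finalModel)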
set.seed(100)
svmRTune <- train(trainingData$x, trainingData$y,
method = "svmRadial",
preProc = c("center", "scale"),
tuneLength = 14,
trControl = trainControl(method = "cv"))
svmRTune
## Support Vector Machines with Radial Basis Function Kernel
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ...
## Resampling results across tuning parameters:
##
## C RMSE Rsquared MAE
## 0.25 2.530787 0.7922715 2.013175
## 0.50 2.259539 0.8064569 1.789962
## 1.00 2.099807 0.8274221 1.656171
## 2.00 2.002943 0.8412934 1.583791
## 4.00 1.943618 0.8504425 1.546586
## 8.00 1.918690 0.8547623 1.532967
## 16.00 1.920708 0.8536069 1.536132
## 32.00 1.920708 0.8536069 1.536132
## 64.00 1.920708 0.8536069 1.536132
## 128.00 1.920708 0.8536069 1.536132
## 256.00 1.920708 0.8536069 1.536132
## 512.00 1.920708 0.8536069 1.536132
## 1024.00 1.920708 0.8536069 1.536132
## 2048.00 1.920708 0.8536069 1.536132
##
## Tuning parameter 'sigma' was held constant at a value of 0.06509124
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.06509124 and C = 8.
svmEval <- predict(svmRTune, testData$x)
postResample(svmEval, testData$y)
## RMSE Rsquared MAE
## 2.0631908 0.8275736 1.5662213
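Note that sigma was not part of the tuning grid: caret fixes it up front using kernlab’s sigest estimate on the training predictors. A sketch of that estimate (using sigest’s defaults; the averaging shown is only an approximation of caret’s internal rule):
set.seed(100)
srange <- sigest(as.matrix(trainingData$x), scaled = TRUE)
srange            # low / median / high estimates for sigma
mean(srange[-2])  # roughly how the constant sigma is chosen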
Which models appear to give the best performance? Does MARS select the informative predictors (those named X1–X5)?
Let’s bring the test-set evaluation metrics together:
rbind(knn = postResample(knnEval, testData$y),
neuralnet = postResample(nnEval, testData$y),
mars = postResample(marsEval, testData$y),
svmRadial = postResample(svmEval, testData$y))
## RMSE Rsquared MAE
## knn 3.204059 0.6819919 2.5683461
## neuralnet 1.808449 0.8700858 1.3824598
## mars 1.149250 0.9471145 0.9158382
## svmRadial 2.063191 0.8275736 1.5662213
The MARS model performs best on the unseen data, with an RMSE of 1.15 and an \(R^2\) of 94.7%, substantially outperforming every other model tested.
varImp(marsTune)
## earth variable importance
##
## Overall
## X1 100.00
## X4 75.23
## X2 48.73
## X5 15.52
## X3 0.00
MARS does select the informative predictors: X1, X2, X4, and X5 all carry importance in the final model, while X3 was effectively pruned (zero importance) and none of the non-informative predictors X6–X10 appear at all.
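To confirm exactly which variables enter the final model, caret’s predictors helper lists them; here it should return only a subset of X1–X5:
predictors(marsTune)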
7.5. Exercise 6.3 describes data for a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before and train several nonlinear regression models.
library(AppliedPredictiveModeling)
data("ChemicalManufacturingProcess")
The matrix processPredictors contains the 57 predictors (12 describing the input biological material and 45 describing the process predictors) for the 176 manufacturing runs. yield contains the percent yield for each run.
summary(ChemicalManufacturingProcess)
## Yield BiologicalMaterial01 BiologicalMaterial02 BiologicalMaterial03
## Min. :35.25 Min. :4.580 Min. :46.87 Min. :56.97
## 1st Qu.:38.75 1st Qu.:5.978 1st Qu.:52.68 1st Qu.:64.98
## Median :39.97 Median :6.305 Median :55.09 Median :67.22
## Mean :40.18 Mean :6.411 Mean :55.69 Mean :67.70
## 3rd Qu.:41.48 3rd Qu.:6.870 3rd Qu.:58.74 3rd Qu.:70.43
## Max. :46.34 Max. :8.810 Max. :64.75 Max. :78.25
##
## BiologicalMaterial04 BiologicalMaterial05 BiologicalMaterial06
## Min. : 9.38 Min. :13.24 Min. :40.60
## 1st Qu.:11.24 1st Qu.:17.23 1st Qu.:46.05
## Median :12.10 Median :18.49 Median :48.46
## Mean :12.35 Mean :18.60 Mean :48.91
## 3rd Qu.:13.22 3rd Qu.:19.90 3rd Qu.:51.34
## Max. :23.09 Max. :24.85 Max. :59.38
##
## BiologicalMaterial07 BiologicalMaterial08 BiologicalMaterial09
## Min. :100.0 Min. :15.88 Min. :11.44
## 1st Qu.:100.0 1st Qu.:17.06 1st Qu.:12.60
## Median :100.0 Median :17.51 Median :12.84
## Mean :100.0 Mean :17.49 Mean :12.85
## 3rd Qu.:100.0 3rd Qu.:17.88 3rd Qu.:13.13
## Max. :100.8 Max. :19.14 Max. :14.08
##
## BiologicalMaterial10 BiologicalMaterial11 BiologicalMaterial12
## Min. :1.770 Min. :135.8 Min. :18.35
## 1st Qu.:2.460 1st Qu.:143.8 1st Qu.:19.73
## Median :2.710 Median :146.1 Median :20.12
## Mean :2.801 Mean :147.0 Mean :20.20
## 3rd Qu.:2.990 3rd Qu.:149.6 3rd Qu.:20.75
## Max. :6.870 Max. :158.7 Max. :22.21
##
## ManufacturingProcess01 ManufacturingProcess02 ManufacturingProcess03
## Min. : 0.00 Min. : 0.00 Min. :1.47
## 1st Qu.:10.80 1st Qu.:19.30 1st Qu.:1.53
## Median :11.40 Median :21.00 Median :1.54
## Mean :11.21 Mean :16.68 Mean :1.54
## 3rd Qu.:12.15 3rd Qu.:21.50 3rd Qu.:1.55
## Max. :14.10 Max. :22.50 Max. :1.60
## NA's :1 NA's :3 NA's :15
## ManufacturingProcess04 ManufacturingProcess05 ManufacturingProcess06
## Min. :911.0 Min. : 923.0 Min. :203.0
## 1st Qu.:928.0 1st Qu.: 986.8 1st Qu.:205.7
## Median :934.0 Median : 999.2 Median :206.8
## Mean :931.9 Mean :1001.7 Mean :207.4
## 3rd Qu.:936.0 3rd Qu.:1008.9 3rd Qu.:208.7
## Max. :946.0 Max. :1175.3 Max. :227.4
## NA's :1 NA's :1 NA's :2
## ManufacturingProcess07 ManufacturingProcess08 ManufacturingProcess09
## Min. :177.0 Min. :177.0 Min. :38.89
## 1st Qu.:177.0 1st Qu.:177.0 1st Qu.:44.89
## Median :177.0 Median :178.0 Median :45.73
## Mean :177.5 Mean :177.6 Mean :45.66
## 3rd Qu.:178.0 3rd Qu.:178.0 3rd Qu.:46.52
## Max. :178.0 Max. :178.0 Max. :49.36
## NA's :1 NA's :1
## ManufacturingProcess10 ManufacturingProcess11 ManufacturingProcess12
## Min. : 7.500 Min. : 7.500 Min. : 0.0
## 1st Qu.: 8.700 1st Qu.: 9.000 1st Qu.: 0.0
## Median : 9.100 Median : 9.400 Median : 0.0
## Mean : 9.179 Mean : 9.386 Mean : 857.8
## 3rd Qu.: 9.550 3rd Qu.: 9.900 3rd Qu.: 0.0
## Max. :11.600 Max. :11.500 Max. :4549.0
## NA's :9 NA's :10 NA's :1
## ManufacturingProcess13 ManufacturingProcess14 ManufacturingProcess15
## Min. :32.10 Min. :4701 Min. :5904
## 1st Qu.:33.90 1st Qu.:4828 1st Qu.:6010
## Median :34.60 Median :4856 Median :6032
## Mean :34.51 Mean :4854 Mean :6039
## 3rd Qu.:35.20 3rd Qu.:4882 3rd Qu.:6061
## Max. :38.60 Max. :5055 Max. :6233
## NA's :1
## ManufacturingProcess16 ManufacturingProcess17 ManufacturingProcess18
## Min. : 0 Min. :31.30 Min. : 0
## 1st Qu.:4561 1st Qu.:33.50 1st Qu.:4813
## Median :4588 Median :34.40 Median :4835
## Mean :4566 Mean :34.34 Mean :4810
## 3rd Qu.:4619 3rd Qu.:35.10 3rd Qu.:4862
## Max. :4852 Max. :40.00 Max. :4971
##
## ManufacturingProcess19 ManufacturingProcess20 ManufacturingProcess21
## Min. :5890 Min. : 0 Min. :-1.8000
## 1st Qu.:6001 1st Qu.:4553 1st Qu.:-0.6000
## Median :6022 Median :4582 Median :-0.3000
## Mean :6028 Mean :4556 Mean :-0.1642
## 3rd Qu.:6050 3rd Qu.:4610 3rd Qu.: 0.0000
## Max. :6146 Max. :4759 Max. : 3.6000
##
## ManufacturingProcess22 ManufacturingProcess23 ManufacturingProcess24
## Min. : 0.000 Min. :0.000 Min. : 0.000
## 1st Qu.: 3.000 1st Qu.:2.000 1st Qu.: 4.000
## Median : 5.000 Median :3.000 Median : 8.000
## Mean : 5.406 Mean :3.017 Mean : 8.834
## 3rd Qu.: 8.000 3rd Qu.:4.000 3rd Qu.:14.000
## Max. :12.000 Max. :6.000 Max. :23.000
## NA's :1 NA's :1 NA's :1
## ManufacturingProcess25 ManufacturingProcess26 ManufacturingProcess27
## Min. : 0 Min. : 0 Min. : 0
## 1st Qu.:4832 1st Qu.:6020 1st Qu.:4560
## Median :4855 Median :6047 Median :4587
## Mean :4828 Mean :6016 Mean :4563
## 3rd Qu.:4877 3rd Qu.:6070 3rd Qu.:4609
## Max. :4990 Max. :6161 Max. :4710
## NA's :5 NA's :5 NA's :5
## ManufacturingProcess28 ManufacturingProcess29 ManufacturingProcess30
## Min. : 0.000 Min. : 0.00 Min. : 0.000
## 1st Qu.: 0.000 1st Qu.:19.70 1st Qu.: 8.800
## Median :10.400 Median :19.90 Median : 9.100
## Mean : 6.592 Mean :20.01 Mean : 9.161
## 3rd Qu.:10.750 3rd Qu.:20.40 3rd Qu.: 9.700
## Max. :11.500 Max. :22.00 Max. :11.200
## NA's :5 NA's :5 NA's :5
## ManufacturingProcess31 ManufacturingProcess32 ManufacturingProcess33
## Min. : 0.00 Min. :143.0 Min. :56.00
## 1st Qu.:70.10 1st Qu.:155.0 1st Qu.:62.00
## Median :70.80 Median :158.0 Median :64.00
## Mean :70.18 Mean :158.5 Mean :63.54
## 3rd Qu.:71.40 3rd Qu.:162.0 3rd Qu.:65.00
## Max. :72.50 Max. :173.0 Max. :70.00
## NA's :5 NA's :5
## ManufacturingProcess34 ManufacturingProcess35 ManufacturingProcess36
## Min. :2.300 Min. :463.0 Min. :0.01700
## 1st Qu.:2.500 1st Qu.:490.0 1st Qu.:0.01900
## Median :2.500 Median :495.0 Median :0.02000
## Mean :2.494 Mean :495.6 Mean :0.01957
## 3rd Qu.:2.500 3rd Qu.:501.5 3rd Qu.:0.02000
## Max. :2.600 Max. :522.0 Max. :0.02200
## NA's :5 NA's :5 NA's :5
## ManufacturingProcess37 ManufacturingProcess38 ManufacturingProcess39
## Min. :0.000 Min. :0.000 Min. :0.000
## 1st Qu.:0.700 1st Qu.:2.000 1st Qu.:7.100
## Median :1.000 Median :3.000 Median :7.200
## Mean :1.014 Mean :2.534 Mean :6.851
## 3rd Qu.:1.300 3rd Qu.:3.000 3rd Qu.:7.300
## Max. :2.300 Max. :3.000 Max. :7.500
##
## ManufacturingProcess40 ManufacturingProcess41 ManufacturingProcess42
## Min. :0.00000 Min. :0.00000 Min. : 0.00
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:11.40
## Median :0.00000 Median :0.00000 Median :11.60
## Mean :0.01771 Mean :0.02371 Mean :11.21
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:11.70
## Max. :0.10000 Max. :0.20000 Max. :12.10
## NA's :1 NA's :1
## ManufacturingProcess43 ManufacturingProcess44 ManufacturingProcess45
## Min. : 0.0000 Min. :0.000 Min. :0.000
## 1st Qu.: 0.6000 1st Qu.:1.800 1st Qu.:2.100
## Median : 0.8000 Median :1.900 Median :2.200
## Mean : 0.9119 Mean :1.805 Mean :2.138
## 3rd Qu.: 1.0250 3rd Qu.:1.900 3rd Qu.:2.300
## Max. :11.0000 Max. :2.100 Max. :2.600
##
x_chem <- ChemicalManufacturingProcess |> select(-Yield)
y_chem <- ChemicalManufacturingProcess |> select(Yield)
x_chem |>
  pivot_longer(cols = colnames(x_chem)) |>
  filter(is.na(value)) |>
  group_by(name) |>
  summarise(cnt = n()) |>
  arrange(-cnt)
## # A tibble: 28 × 2
## name cnt
## <chr> <int>
## 1 ManufacturingProcess03 15
## 2 ManufacturingProcess11 10
## 3 ManufacturingProcess10 9
## 4 ManufacturingProcess25 5
## 5 ManufacturingProcess26 5
## 6 ManufacturingProcess27 5
## 7 ManufacturingProcess28 5
## 8 ManufacturingProcess29 5
## 9 ManufacturingProcess30 5
## 10 ManufacturingProcess31 5
## # ℹ 18 more rows
The following graphic shows the percentage of missing values by column, which is helpful for visualizing everything in one place rather than reading the counts off the table above.
plot_missing(x_chem, missing_only = TRUE,
ggtheme = theme_classic())
preProcess offers a couple of easy imputation methods; to preserve the original shape of the data, the K-nearest-neighbors method will be used.
Let’s apply KNN to find the nearest neighbors for imputation:
imputed <- preProcess(x_chem, method = c('center', 'scale', 'knnImpute'), k = 5)
x_chem_imp <- predict(imputed, x_chem)
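A quick check that imputation left no missing values behind:
sum(is.na(x_chem_imp))  # expected to be 0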
set.seed(21)
chem_train <- caret::createDataPartition(y_chem$Yield, p = .70, list= FALSE)
x_train_chem <- x_chem_imp[chem_train,]
x_test_chem <- x_chem_imp[-chem_train,]
y_train_chem <- y_chem[chem_train,]
y_test_chem <- y_chem[-chem_train,]
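And a sanity check on the 70/30 split of the 176 runs (the training-set size of 124 matches the model summaries below):
c(train = nrow(x_train_chem), test = nrow(x_test_chem))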
knnModelChem <- train(x_train_chem, y_train_chem,
method = "knn",
preProc = c("center", "scale"),
tuneLength = 10)
knnModelChem
## k-Nearest Neighbors
##
## 124 samples
## 57 predictor
##
## Pre-processing: centered (57), scaled (57)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 124, 124, 124, 124, 124, 124, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 1.539353 0.3730672 1.187735
## 7 1.550228 0.3674418 1.203827
## 9 1.540725 0.3749625 1.203754
## 11 1.539332 0.3820396 1.211714
## 13 1.557893 0.3699454 1.237965
## 15 1.571715 0.3623136 1.251439
## 17 1.580508 0.3571565 1.256772
## 19 1.593748 0.3477022 1.269411
## 21 1.604783 0.3431912 1.279409
## 23 1.613204 0.3436896 1.286263
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 11.
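One caveat: the kNN fit above used caret’s default bootstrap resampling, while the remaining models use the 10-fold CV control defined earlier, so the resampled metrics are not strictly comparable (the test-set comparison later is unaffected). For consistency, the shared control could be passed instead, as in this sketch (knnModelChemCV is a hypothetical name; not run here):
knnModelChemCV <- train(x_train_chem, y_train_chem,
                        method = "knn",
                        preProc = c("center", "scale"),
                        tuneLength = 10,
                        trControl = tr_ctrl)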
Neural Network:
overCorr <- findCorrelation(cor(x_train_chem), cutoff = .75)
train_x_chem_nnet <- x_train_chem[, -overCorr]
test_x_chem_nnet <- x_test_chem[, -overCorr]
nnetGridChem <- expand.grid(.decay = c(0, 0.01, .1),
.size = c(1:10))
set.seed(81)
# tune
nnetTuneChem <- train(train_x_chem_nnet, y_train_chem,
method = "nnet",
tuneGrid = nnetGridChem,
trControl = tr_ctrl,
preProc = c("center", "scale"),
linout = TRUE,
trace = FALSE,
MaxNWts = 10 * (ncol(train_x_chem_nnet) + 1) + 10 + 1,
maxit = 500)
nnetTuneChem
## Neural Network
##
## 124 samples
## 39 predictor
##
## Pre-processing: centered (39), scaled (39)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 112, 111, 112, 111, 112, 111, ...
## Resampling results across tuning parameters:
##
## decay size RMSE Rsquared MAE
## 0.00 1 7.844379 0.1826533 3.983008
## 0.00 2 1.858824 0.2189794 1.483899
## 0.00 3 3.133294 0.2058258 2.485250
## 0.00 4 3.655149 0.1574215 2.857236
## 0.00 5 3.294300 0.2210634 2.613794
## 0.00 6 3.285878 0.2218604 2.591461
## 0.00 7 3.465201 0.3287468 2.793021
## 0.00 8 3.604301 0.2678209 2.821120
## 0.00 9 3.229658 0.1458415 2.544217
## 0.00 10 6.159846 0.1201793 4.495554
## 0.01 1 1.867994 0.2969659 1.508758
## 0.01 2 2.285950 0.2543983 1.770648
## 0.01 3 2.781737 0.2199273 2.173680
## 0.01 4 2.740031 0.2947405 2.092941
## 0.01 5 2.258026 0.3590512 1.819064
## 0.01 6 3.094524 0.2360432 2.181921
## 0.01 7 2.389343 0.2738695 1.897760
## 0.01 8 2.544579 0.2597203 2.091573
## 0.01 9 2.502324 0.2675740 2.035874
## 0.01 10 2.916754 0.2207930 2.359329
## 0.10 1 1.661193 0.4068491 1.374987
## 0.10 2 2.603700 0.2955486 1.988476
## 0.10 3 2.397494 0.2876218 1.831526
## 0.10 4 2.888888 0.2121642 2.117608
## 0.10 5 2.685789 0.2991142 1.882285
## 0.10 6 2.112659 0.3458989 1.622275
## 0.10 7 2.367887 0.3481763 1.627867
## 0.10 8 2.106920 0.3096946 1.614022
## 0.10 9 2.522811 0.2175889 1.837074
## 0.10 10 2.072935 0.3847126 1.535177
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were size = 1 and decay = 0.1.
MARS Method:
set.seed(92)
# tune
marsTuneChem <- train(x_train_chem, y_train_chem,
method = "earth",
tuneGrid = marsGrid,
trControl = trainControl(method = "cv"))
marsTuneChem
## Multivariate Adaptive Regression Spline
##
## 124 samples
## 57 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 112, 112, 112, 112, 111, 112, ...
## Resampling results across tuning parameters:
##
## degree nprune RMSE Rsquared MAE
## 1 2 1.386589 0.4927736 1.0920068
## 1 3 1.308517 0.5319293 1.0612891
## 1 4 1.209389 0.6102553 0.9658341
## 1 5 1.256809 0.5726219 1.0226502
## 1 6 1.232662 0.5924830 1.0239954
## 1 7 1.219399 0.6048860 1.0033693
## 1 8 1.200411 0.6107601 1.0169214
## 1 9 1.263842 0.5617176 1.0671156
## 1 10 1.288852 0.5461784 1.0445129
## 1 11 1.270242 0.5754780 1.0121190
## 1 12 1.256598 0.5864675 1.0150753
## 1 13 1.247105 0.6012662 1.0031781
## 1 14 1.255035 0.5991353 0.9936429
## 1 15 1.279054 0.5931904 1.0072173
## 1 16 1.266442 0.5992590 0.9999428
## 1 17 1.266442 0.5992590 0.9999428
## 1 18 1.282769 0.5884693 1.0106446
## 1 19 1.276720 0.5928197 1.0041556
## 1 20 1.276720 0.5928197 1.0041556
## 1 21 1.276720 0.5928197 1.0041556
## 1 22 1.276720 0.5928197 1.0041556
## 1 23 1.276720 0.5928197 1.0041556
## 1 24 1.276720 0.5928197 1.0041556
## 1 25 1.276720 0.5928197 1.0041556
## 1 26 1.276720 0.5928197 1.0041556
## 1 27 1.276720 0.5928197 1.0041556
## 1 28 1.276720 0.5928197 1.0041556
## 1 29 1.276720 0.5928197 1.0041556
## 1 30 1.276720 0.5928197 1.0041556
## 1 31 1.276720 0.5928197 1.0041556
## 1 32 1.276720 0.5928197 1.0041556
## 1 33 1.276720 0.5928197 1.0041556
## 1 34 1.276720 0.5928197 1.0041556
## 1 35 1.276720 0.5928197 1.0041556
## 1 36 1.276720 0.5928197 1.0041556
## 1 37 1.276720 0.5928197 1.0041556
## 1 38 1.276720 0.5928197 1.0041556
## 2 2 1.386589 0.4927736 1.0920068
## 2 3 1.413533 0.4506697 1.0995737
## 2 4 1.266432 0.5862261 0.9795224
## 2 5 1.202265 0.6014250 0.9559162
## 2 6 1.365030 0.4993974 1.1033808
## 2 7 1.357540 0.5252833 1.0762755
## 2 8 1.295411 0.5625124 1.0525017
## 2 9 1.431936 0.4787384 1.1551828
## 2 10 1.408014 0.4992009 1.1319615
## 2 11 1.374688 0.5389312 1.1253720
## 2 12 1.325663 0.5742589 1.0794703
## 2 13 1.297948 0.5967460 1.0666514
## 2 14 1.347467 0.5718709 1.0988817
## 2 15 1.314815 0.5888026 1.0596562
## 2 16 1.328870 0.5793341 1.0754231
## 2 17 1.341726 0.5791109 1.0856543
## 2 18 1.517880 0.5392842 1.1924067
## 2 19 1.628665 0.5141540 1.2521292
## 2 20 1.643833 0.5137175 1.2506165
## 2 21 1.633878 0.5174490 1.2462587
## 2 22 1.619016 0.5145373 1.2426204
## 2 23 1.602043 0.5145993 1.2380195
## 2 24 1.602043 0.5145993 1.2380195
## 2 25 1.602043 0.5145993 1.2380195
## 2 26 1.602043 0.5145993 1.2380195
## 2 27 1.602043 0.5145993 1.2380195
## 2 28 1.602043 0.5145993 1.2380195
## 2 29 1.602043 0.5145993 1.2380195
## 2 30 1.602043 0.5145993 1.2380195
## 2 31 1.602043 0.5145993 1.2380195
## 2 32 1.602043 0.5145993 1.2380195
## 2 33 1.602043 0.5145993 1.2380195
## 2 34 1.602043 0.5145993 1.2380195
## 2 35 1.602043 0.5145993 1.2380195
## 2 36 1.602043 0.5145993 1.2380195
## 2 37 1.602043 0.5145993 1.2380195
## 2 38 1.602043 0.5145993 1.2380195
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 8 and degree = 1.
set.seed(83)
svmRTuneChem <- train(x_train_chem, y_train_chem,
method = "svmRadial",
preProc = c("center", "scale"),
tuneLength = 14,
trControl = trainControl(method = "cv"))
svmRTuneChem
## Support Vector Machines with Radial Basis Function Kernel
##
## 124 samples
## 57 predictor
##
## Pre-processing: centered (57), scaled (57)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 112, 111, 112, 112, 112, 112, ...
## Resampling results across tuning parameters:
##
## C RMSE Rsquared MAE
## 0.25 1.510612 0.4682538 1.2342797
## 0.50 1.403611 0.5150776 1.1469636
## 1.00 1.318116 0.5733986 1.0789297
## 2.00 1.266319 0.5995068 1.0228569
## 4.00 1.242479 0.6008563 0.9927292
## 8.00 1.233571 0.5982649 0.9818278
## 16.00 1.233284 0.5969224 0.9776539
## 32.00 1.233284 0.5969224 0.9776539
## 64.00 1.233284 0.5969224 0.9776539
## 128.00 1.233284 0.5969224 0.9776539
## 256.00 1.233284 0.5969224 0.9776539
## 512.00 1.233284 0.5969224 0.9776539
## 1024.00 1.233284 0.5969224 0.9776539
## 2048.00 1.233284 0.5969224 0.9776539
##
## Tuning parameter 'sigma' was held constant at a value of 0.01253516
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.01253516 and C = 16.
svmChemEval <- predict(svmRTuneChem, x_test_chem)
marsChemEval <- predict(marsTuneChem, x_test_chem)
nnetChemEval <- predict(nnetTuneChem, x_test_chem)
knnChemEval <- predict(knnModelChem, x_test_chem)
chemEvals <- rbind(postResample(svmChemEval, y_test_chem),
                   postResample(marsChemEval, y_test_chem),
                   postResample(nnetChemEval, y_test_chem),
                   postResample(knnChemEval, y_test_chem))
rownames(chemEvals) <- c('SVM - Radial','MARS','Neural Net','KNN')
chemEvals
## RMSE Rsquared MAE
## SVM - Radial 0.9876493 0.6828667 0.7464189
## MARS 1.1175141 0.5959044 0.8771180
## Neural Net 1.4834263 0.4127663 1.1407492
## KNN 1.1538981 0.5978144 0.9593298
The radial basis SVM performs the best of all the nonlinear methods tested: it has the lowest test-set RMSE and explains about 68.3% of the variation in yield.
set.seed(12)
# Grid search for optimized penalty lambdas
enet_gs <- expand.grid(.lambda = c(0, 0.01, .1), .fraction = seq(.05, 1, length = 20))
# tuning penalized regression model
enet_fit_chem <- train(x_train_chem, y_train_chem,
                       method = "enet",
                       tuneGrid = enet_gs,
                       trControl = tr_ctrl,
                       preProc = c("center", "scale"))
x_enet_pred_chem <- predict(enet_fit_chem,x_test_chem)
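Before comparing variable importance, the linear benchmark’s test-set performance can be checked against the nonlinear models above (a sketch; output not shown):
postResample(x_enet_pred_chem, y_test_chem)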
varImp(enet_fit_chem)
## loess r-squared variable importance
##
## only 20 most important variables shown (out of 57)
##
## Overall
## ManufacturingProcess32 100.00
## ManufacturingProcess13 69.74
## BiologicalMaterial06 57.49
## BiologicalMaterial03 54.28
## ManufacturingProcess17 53.61
## BiologicalMaterial02 50.70
## ManufacturingProcess36 46.85
## ManufacturingProcess09 46.28
## ManufacturingProcess06 43.46
## ManufacturingProcess29 42.11
## ManufacturingProcess31 41.70
## BiologicalMaterial12 40.91
## ManufacturingProcess33 40.34
## BiologicalMaterial11 35.19
## BiologicalMaterial04 35.10
## BiologicalMaterial09 34.22
## ManufacturingProcess02 30.31
## ManufacturingProcess30 27.60
## BiologicalMaterial01 26.26
## ManufacturingProcess16 25.92
varImp(svmRTuneChem)
## loess r-squared variable importance
##
## only 20 most important variables shown (out of 57)
##
## Overall
## ManufacturingProcess32 100.00
## ManufacturingProcess13 69.74
## BiologicalMaterial06 57.49
## BiologicalMaterial03 54.28
## ManufacturingProcess17 53.61
## BiologicalMaterial02 50.70
## ManufacturingProcess36 46.85
## ManufacturingProcess09 46.28
## ManufacturingProcess06 43.46
## ManufacturingProcess29 42.11
## ManufacturingProcess31 41.70
## BiologicalMaterial12 40.91
## ManufacturingProcess33 40.34
## BiologicalMaterial11 35.19
## BiologicalMaterial04 35.10
## BiologicalMaterial09 34.22
## ManufacturingProcess02 30.31
## ManufacturingProcess30 27.60
## BiologicalMaterial01 26.26
## ManufacturingProcess16 25.92
par(mfrow = c(1, 2))
plot(varImp(enet_fit_chem),top=10)
plot(varImp(svmRTuneChem),top=10)
The top 10 most important variables are the same for the linear and nonlinear models. One caveat: both outputs are headed "loess r-squared variable importance", meaning caret fell back to the same model-free ranking for enet and svmRadial (neither has a model-specific importance measure), so identical lists are expected; even so, they identify the predictors most strongly related to yield.
Since the top 10 predictors are shared between the linear and nonlinear models, most of them appear to have fairly smooth, largely linear relationships with the response. ManufacturingProcess29 and ManufacturingProcess36 appear to have the weakest linear relationships among the most important predictors.
top10svm <- varImp(svmRTuneChem)$importance |> arrange(-Overall) |>
head(10)
corrplot(cor(cbind(x_chem_imp,y_chem) |> select(c('Yield',rownames(top10svm)))))
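For a numeric view of the same relationships, the correlation of each top predictor with Yield can also be printed directly (a sketch over the full imputed data):
sort(sapply(rownames(top10svm), function(p) cor(x_chem_imp[[p]], y_chem$Yield)))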
imp_chem_preds <- c('ManufacturingProcess32', 'ManufacturingProcess13',
                    'BiologicalMaterial06', 'BiologicalMaterial03',
                    'BiologicalMaterial02', 'ManufacturingProcess17',
                    'ManufacturingProcess36', 'ManufacturingProcess09',
                    'ManufacturingProcess06', 'ManufacturingProcess29')
par(mfrow = c(4, 3))
par(mai = c(.3, .3, .3, .3))
# scatterplot of each top-10 predictor against Yield
for (pred in imp_chem_preds) {
  plot(x = x_chem_imp[[pred]], y = y_chem$Yield, main = paste(pred, 'vs Yield'))
}