library(gmodels)
library(ggpubr)
## Loading required package: ggplot2
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(corrplot)
## corrplot 0.90 loaded
library(e1071)
library(lattice)
library(AppliedPredictiveModeling)
library(mlbench)
library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(reshape2)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v tibble  3.1.4     v purrr   0.3.4
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   2.0.1     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x psych::%+%()    masks ggplot2::%+%()
## x psych::alpha()  masks ggplot2::alpha()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(mice)
## 
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
## 
##     filter
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
library(caret)
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(caTools)
library(elasticnet)
## Loading required package: lars
## Loaded lars 1.2
## 
## Attaching package: 'lars'
## The following object is masked from 'package:psych':
## 
##     error.bars
library(pls)
## 
## Attaching package: 'pls'
## The following object is masked from 'package:caret':
## 
##     R2
## The following object is masked from 'package:corrplot':
## 
##     corrplot
## The following object is masked from 'package:stats':
## 
##     loadings
#rm(list=ls())

# Shared RNG seed passed to set.seed() so splits and resampling are reproducible.
seed <- 1234

Problem 6.2

Developing a model to predict permeability (see Sect. 1.4) could save significant resources for a pharmaceutical company, while at the same time more rapidly identifying molecules that have sufficient permeability to become a drug:

  1. Start R and use these commands to load the data:
# Attach the package that ships the data, then load the `permeability` data set
# (the `fingerprints` matrix inspected below comes from the same data() call).
library(AppliedPredictiveModeling)
data("permeability")
#head(permeability)
#head(fingerprints)

# Shape and class of the response and of the predictor matrix.
dim(permeability)
## [1] 165   1
dim(fingerprints)
## [1]  165 1107
class(permeability)
## [1] "matrix" "array"
class(fingerprints)
## [1] "matrix" "array"

The matrix fingerprints contains the 1107 binary molecular predictors for the 165 compounds, while permeability contains the permeability response.

  1. The fingerprint predictors indicate the presence or absence of substructures of a molecule and are often sparse, meaning that relatively few of the molecules contain each substructure. Filter out the predictors that have low frequencies using the nearZeroVar function from the caret package. How many predictors are left for modeling?
# Column indices of the near-zero-variance fingerprint predictors.
fp_out <- nearZeroVar(fingerprints)

# Drop the flagged columns. The empty case is guarded explicitly because
# `x[, -integer(0)]` selects zero columns rather than all of them, and
# `drop = FALSE` keeps the result a matrix even if a single column survives.
fingerprints1 <- if (length(fp_out) > 0) {
  fingerprints[, -fp_out, drop = FALSE]
} else {
  fingerprints
}

dim(fingerprints1)
## [1] 165 388
class(fingerprints1)
## [1] "matrix" "array"

DISCUSSION:

Of the original 1107 predictors, 388 remain after filtering.

  1. Split the data into a training and a test set, pre-process the data, and tune a PLS model. How many latent variables are optimal, and what is the corresponding resampled estimate of R^2?

Split into .75 /.25

# Seeded 75/25 partition of the row indices via caret::createDataPartition().
set.seed(seed)
Rows <- createDataPartition(y = permeability, p = 0.75, list = FALSE)

# Predictor subsets stay matrices.
train_X <- fingerprints1[Rows, ]
test_X <- fingerprints1[-Rows, ]

# Row-subsetting the one-column response drops it to a plain vector, which is
# why dim() reports NULL for train_Y / test_Y below.
train_Y <- permeability[Rows, ]
test_Y <- permeability[-Rows, ]

dim(train_X)
## [1] 125 388
dim(train_Y)
## NULL
dim(test_X)
## [1]  40 388
dim(test_Y)
## NULL

PLS Model, preprocess and tune

# Tune a PLS model on the training set: predictors are centered and scaled,
# 20 candidate values of ncomp are tried, and each is scored by 10-fold CV.
set.seed(seed)
plsfit <- train(
  x = train_X,
  y = train_Y,
  method = "pls",
  preProcess = c("center", "scale"),
  tuneLength = 20,
  trControl = trainControl(method = "cv", number = 10)
)
plsfit
## Partial Least Squares 
## 
## 125 samples
## 388 predictors
## 
## Pre-processing: centered (388), scaled (388) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 112, 112, 112, 113, 113, 113, ... 
## Resampling results across tuning parameters:
## 
##   ncomp  RMSE      Rsquared   MAE      
##    1     13.12852  0.3455755   9.962939
##    2     12.14546  0.4330069   8.633534
##    3     12.36872  0.3933098   9.303858
##    4     12.35652  0.3973752   9.354498
##    5     12.16572  0.4181501   9.214789
##    6     11.96498  0.4380348   9.068164
##    7     11.92518  0.4420047   9.280596
##    8     11.79796  0.4514234   9.416467
##    9     12.00177  0.4417509   9.650388
##   10     12.17286  0.4380764   9.881674
##   11     12.42101  0.4272782   9.999735
##   12     12.70978  0.4133552  10.140333
##   13     12.81560  0.4120447  10.164046
##   14     12.67963  0.4253213  10.078276
##   15     12.93839  0.4168035  10.284851
##   16     13.22580  0.4063959  10.537008
##   17     13.46258  0.4090001  10.880191
##   18     13.57954  0.4160205  10.901255
##   19     13.89109  0.4041376  11.164370
##   20     14.01144  0.4055658  11.049839
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 8.

ncomp RMSE Rsquared MAE
8 11.79796 0.4514234 9.416467

# Resampling profile of the tuned PLS model.
plot(plsfit, main = "Partial Least Squares Model")

# Resampled metrics for the selected model. The row is looked up by its
# `ncomp` value rather than by using that value as a row position, which would
# only be correct while the tuning grid happens to be exactly 1, 2, 3, ...
plsfit$results[plsfit$results$ncomp == plsfit$bestTune$ncomp, ]
##   ncomp     RMSE  Rsquared      MAE   RMSESD RsquaredSD    MAESD
## 8     8 11.79796 0.4514234 9.416467 2.818105  0.2461524 2.361153
  1. Predict the response for the test set. What is the test set estimate of R2?
# Predict the held-out compounds with the tuned PLS model.
plspredict <- predict(plsfit, newdata = test_X)

# Test-set RMSE, R^2 and MAE.
postResample(obs = test_Y, pred = plspredict)
##       RMSE   Rsquared        MAE 
## 10.9615763  0.5689258  8.1669353
  1. Try building other models discussed in this chapter. Do any have better predictive performance?

Let’s do a Ridge regression……

# Candidate ridge penalties: 15 evenly spaced values from 0 to 0.1.
ridgeGrid <- data.frame(.lambda = seq(from = 0, to = 0.1, length.out = 15))

# Tune ridge regression with 10-fold CV on the training set.
# NOTE(review): the warning below shows the lambda = 0 fit failing in one fold,
# so the resampled estimate for lambda = 0 rests on fewer folds than the others.
# NOTE(review): unlike the PLS fit, no preProcess is requested here; confirm
# that comparing the two models on this basis is intended.
set.seed(312)
ridgeRegFit <- train(
  x = train_X,
  y = train_Y,
  method = "ridge",
  tuneGrid = ridgeGrid,
  trControl = trainControl(method = "cv", number = 10)
)
## Warning: model fit failed for Fold08: lambda=0.000000 Error in if (zmin < gamhat) { : missing value where TRUE/FALSE needed
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
ridgeRegFit
## Ridge Regression 
## 
## 125 samples
## 388 predictors
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 112, 112, 113, 111, 113, 112, ... 
## Resampling results across tuning parameters:
## 
##   lambda       RMSE      Rsquared   MAE     
##   0.000000000  13.51017  0.4415335  10.08679
##   0.007142857  15.79046  0.3932941  11.74546
##   0.014285714  14.89143  0.4146741  11.10171
##   0.021428571  14.49589  0.4229997  10.88241
##   0.028571429  14.22930  0.4263777  10.73022
##   0.035714286  14.04348  0.4292697  10.60040
##   0.042857143  13.94078  0.4280588  10.57463
##   0.050000000  13.83755  0.4285155  10.53546
##   0.057142857  13.77773  0.4285733  10.51208
##   0.064285714  13.72227  0.4282230  10.49211
##   0.071428571  13.68807  0.4271481  10.48229
##   0.078571429  13.65750  0.4286293  10.47339
##   0.085714286  13.63030  0.4273175  10.46167
##   0.092857143  13.60496  0.4274678  10.45556
##   0.100000000  13.57020  0.4278376  10.42175
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was lambda = 0.
# Plot the tuning results stored in the ridge fit.
plot(ridgeRegFit)

lambda RMSE Rsquared MAE

0.000000000 13.51017 0.4415335 10.08679

#Let's try ridge on the test data.
#set.seed(1312)
#ridgeRegFit2 <- train(test_X, test_Y,
#method = 'ridge',
#tuneGrid = ridgeGrid,
#trControl = trainControl(method="cv"))

#ridgeRegFit2
#plot(ridgeRegFit2)
  1. Would you recommend any of your models to replace the permeability laboratory experiment?

The partial least squares model seems to perform better than the ridge method. However, with an R^2 in the neighborhood of .4, I don’t feel it is strong enough to replace the more expensive permeability assay (PAMPA). These are drugs treating human beings, and I feel there should be a more convincing model before replacing the expensive, but accurate, assays.

Problem 6.3

A chemical manufacturing process for a pharmaceutical product was discussed in Sect. 1.4. In this problem, the objective is to understand the relationship between biological measurements of the raw materials (predictors), measurements of the manufacturing process (predictors), and the response of the product yield. Biological predictors cannot be changed but can be used to assess the quality of the raw material before processing. On the other hand, manufacturing process predictors can be changed in the manufacturing process. Improving product yield by 1% will boost revenue by approximately one hundred thousand dollars per batch:

  1. Start R and use these commands to load the data:
#library(AppliedPredictiveModeling)
# Load the chemical manufacturing data set and report its shape
# (rows = runs, columns = response plus predictors).
data("ChemicalManufacturingProcess", package = "AppliedPredictiveModeling")
dim(ChemicalManufacturingProcess)
## [1] 176  58

The matrix processPredictors contains the 57 predictors (12 describing the input biological material and 45 describing the process predictors) for the 176 manufacturing runs; yield contains the percent yield for each run.

Note: the text is erroneous; the data come as a single data frame, ChemicalManufacturingProcess (see dim(ChemicalManufacturingProcess) above), which contains the 57 predictors and the “Yield” response.

# Show the column names, types and first few values of the data frame.
str(ChemicalManufacturingProcess)
## 'data.frame':    176 obs. of  58 variables:
##  $ Yield                 : num  38 42.4 42 41.4 42.5 ...
##  $ BiologicalMaterial01  : num  6.25 8.01 8.01 8.01 7.47 6.12 7.48 6.94 6.94 6.94 ...
##  $ BiologicalMaterial02  : num  49.6 61 61 61 63.3 ...
##  $ BiologicalMaterial03  : num  57 67.5 67.5 67.5 72.2 ...
##  $ BiologicalMaterial04  : num  12.7 14.6 14.6 14.6 14 ...
##  $ BiologicalMaterial05  : num  19.5 19.4 19.4 19.4 17.9 ...
##  $ BiologicalMaterial06  : num  43.7 53.1 53.1 53.1 54.7 ...
##  $ BiologicalMaterial07  : num  100 100 100 100 100 100 100 100 100 100 ...
##  $ BiologicalMaterial08  : num  16.7 19 19 19 18.2 ...
##  $ BiologicalMaterial09  : num  11.4 12.6 12.6 12.6 12.8 ...
##  $ BiologicalMaterial10  : num  3.46 3.46 3.46 3.46 3.05 3.78 3.04 3.85 3.85 3.85 ...
##  $ BiologicalMaterial11  : num  138 154 154 154 148 ...
##  $ BiologicalMaterial12  : num  18.8 21.1 21.1 21.1 21.1 ...
##  $ ManufacturingProcess01: num  NA 0 0 0 10.7 12 11.5 12 12 12 ...
##  $ ManufacturingProcess02: num  NA 0 0 0 0 0 0 0 0 0 ...
##  $ ManufacturingProcess03: num  NA NA NA NA NA NA 1.56 1.55 1.56 1.55 ...
##  $ ManufacturingProcess04: num  NA 917 912 911 918 924 933 929 928 938 ...
##  $ ManufacturingProcess05: num  NA 1032 1004 1015 1028 ...
##  $ ManufacturingProcess06: num  NA 210 207 213 206 ...
##  $ ManufacturingProcess07: num  NA 177 178 177 178 178 177 178 177 177 ...
##  $ ManufacturingProcess08: num  NA 178 178 177 178 178 178 178 177 177 ...
##  $ ManufacturingProcess09: num  43 46.6 45.1 44.9 45 ...
##  $ ManufacturingProcess10: num  NA NA NA NA NA NA 11.6 10.2 9.7 10.1 ...
##  $ ManufacturingProcess11: num  NA NA NA NA NA NA 11.5 11.3 11.1 10.2 ...
##  $ ManufacturingProcess12: num  NA 0 0 0 0 0 0 0 0 0 ...
##  $ ManufacturingProcess13: num  35.5 34 34.8 34.8 34.6 34 32.4 33.6 33.9 34.3 ...
##  $ ManufacturingProcess14: num  4898 4869 4878 4897 4992 ...
##  $ ManufacturingProcess15: num  6108 6095 6087 6102 6233 ...
##  $ ManufacturingProcess16: num  4682 4617 4617 4635 4733 ...
##  $ ManufacturingProcess17: num  35.5 34 34.8 34.8 33.9 33.4 33.8 33.6 33.9 35.3 ...
##  $ ManufacturingProcess18: num  4865 4867 4877 4872 4886 ...
##  $ ManufacturingProcess19: num  6049 6097 6078 6073 6102 ...
##  $ ManufacturingProcess20: num  4665 4621 4621 4611 4659 ...
##  $ ManufacturingProcess21: num  0 0 0 0 -0.7 -0.6 1.4 0 0 1 ...
##  $ ManufacturingProcess22: num  NA 3 4 5 8 9 1 2 3 4 ...
##  $ ManufacturingProcess23: num  NA 0 1 2 4 1 1 2 3 1 ...
##  $ ManufacturingProcess24: num  NA 3 4 5 18 1 1 2 3 4 ...
##  $ ManufacturingProcess25: num  4873 4869 4897 4892 4930 ...
##  $ ManufacturingProcess26: num  6074 6107 6116 6111 6151 ...
##  $ ManufacturingProcess27: num  4685 4630 4637 4630 4684 ...
##  $ ManufacturingProcess28: num  10.7 11.2 11.1 11.1 11.3 11.4 11.2 11.1 11.3 11.4 ...
##  $ ManufacturingProcess29: num  21 21.4 21.3 21.3 21.6 21.7 21.2 21.2 21.5 21.7 ...
##  $ ManufacturingProcess30: num  9.9 9.9 9.4 9.4 9 10.1 11.2 10.9 10.5 9.8 ...
##  $ ManufacturingProcess31: num  69.1 68.7 69.3 69.3 69.4 68.2 67.6 67.9 68 68.5 ...
##  $ ManufacturingProcess32: num  156 169 173 171 171 173 159 161 160 164 ...
##  $ ManufacturingProcess33: num  66 66 66 68 70 70 65 65 65 66 ...
##  $ ManufacturingProcess34: num  2.4 2.6 2.6 2.5 2.5 2.5 2.5 2.5 2.5 2.5 ...
##  $ ManufacturingProcess35: num  486 508 509 496 468 490 475 478 491 488 ...
##  $ ManufacturingProcess36: num  0.019 0.019 0.018 0.018 0.017 0.018 0.019 0.019 0.019 0.019 ...
##  $ ManufacturingProcess37: num  0.5 2 0.7 1.2 0.2 0.4 0.8 1 1.2 1.8 ...
##  $ ManufacturingProcess38: num  3 2 2 2 2 2 2 2 3 3 ...
##  $ ManufacturingProcess39: num  7.2 7.2 7.2 7.2 7.3 7.2 7.3 7.3 7.4 7.1 ...
##  $ ManufacturingProcess40: num  NA 0.1 0 0 0 0 0 0 0 0 ...
##  $ ManufacturingProcess41: num  NA 0.15 0 0 0 0 0 0 0 0 ...
##  $ ManufacturingProcess42: num  11.6 11.1 12 10.6 11 11.5 11.7 11.4 11.4 11.3 ...
##  $ ManufacturingProcess43: num  3 0.9 1 1.1 1.1 2.2 0.7 0.8 0.9 0.8 ...
##  $ ManufacturingProcess44: num  1.8 1.9 1.8 1.8 1.7 1.8 2 2 1.9 1.9 ...
##  $ ManufacturingProcess45: num  2.4 2.2 2.3 2.1 2.1 2 2.2 2.2 2.1 2.4 ...
  1. A small percentage of cells in the predictor set contain missing values. Use an imputation function to fill in these missing values (e.g., see Sect. 3.8).
# Tabulate the missing-data patterns across columns (md.pattern comes from mice).
md.pattern(ChemicalManufacturingProcess)

##     Yield BiologicalMaterial01 BiologicalMaterial02 BiologicalMaterial03
## 152     1                    1                    1                    1
## 6       1                    1                    1                    1
## 1       1                    1                    1                    1
## 7       1                    1                    1                    1
## 5       1                    1                    1                    1
## 2       1                    1                    1                    1
## 1       1                    1                    1                    1
## 1       1                    1                    1                    1
## 1       1                    1                    1                    1
##         0                    0                    0                    0
##     BiologicalMaterial04 BiologicalMaterial05 BiologicalMaterial06
## 152                    1                    1                    1
## 6                      1                    1                    1
## 1                      1                    1                    1
## 7                      1                    1                    1
## 5                      1                    1                    1
## 2                      1                    1                    1
## 1                      1                    1                    1
## 1                      1                    1                    1
## 1                      1                    1                    1
##                        0                    0                    0
##     BiologicalMaterial07 BiologicalMaterial08 BiologicalMaterial09
## 152                    1                    1                    1
## 6                      1                    1                    1
## 1                      1                    1                    1
## 7                      1                    1                    1
## 5                      1                    1                    1
## 2                      1                    1                    1
## 1                      1                    1                    1
## 1                      1                    1                    1
## 1                      1                    1                    1
##                        0                    0                    0
##     BiologicalMaterial10 BiologicalMaterial11 BiologicalMaterial12
## 152                    1                    1                    1
## 6                      1                    1                    1
## 1                      1                    1                    1
## 7                      1                    1                    1
## 5                      1                    1                    1
## 2                      1                    1                    1
## 1                      1                    1                    1
## 1                      1                    1                    1
## 1                      1                    1                    1
##                        0                    0                    0
##     ManufacturingProcess09 ManufacturingProcess13 ManufacturingProcess15
## 152                      1                      1                      1
## 6                        1                      1                      1
## 1                        1                      1                      1
## 7                        1                      1                      1
## 5                        1                      1                      1
## 2                        1                      1                      1
## 1                        1                      1                      1
## 1                        1                      1                      1
## 1                        1                      1                      1
##                          0                      0                      0
##     ManufacturingProcess16 ManufacturingProcess17 ManufacturingProcess18
## 152                      1                      1                      1
## 6                        1                      1                      1
## 1                        1                      1                      1
## 7                        1                      1                      1
## 5                        1                      1                      1
## 2                        1                      1                      1
## 1                        1                      1                      1
## 1                        1                      1                      1
## 1                        1                      1                      1
##                          0                      0                      0
##     ManufacturingProcess19 ManufacturingProcess20 ManufacturingProcess21
## 152                      1                      1                      1
## 6                        1                      1                      1
## 1                        1                      1                      1
## 7                        1                      1                      1
## 5                        1                      1                      1
## 2                        1                      1                      1
## 1                        1                      1                      1
## 1                        1                      1                      1
## 1                        1                      1                      1
##                          0                      0                      0
##     ManufacturingProcess32 ManufacturingProcess37 ManufacturingProcess38
## 152                      1                      1                      1
## 6                        1                      1                      1
## 1                        1                      1                      1
## 7                        1                      1                      1
## 5                        1                      1                      1
## 2                        1                      1                      1
## 1                        1                      1                      1
## 1                        1                      1                      1
## 1                        1                      1                      1
##                          0                      0                      0
##     ManufacturingProcess39 ManufacturingProcess42 ManufacturingProcess43
## 152                      1                      1                      1
## 6                        1                      1                      1
## 1                        1                      1                      1
## 7                        1                      1                      1
## 5                        1                      1                      1
## 2                        1                      1                      1
## 1                        1                      1                      1
## 1                        1                      1                      1
## 1                        1                      1                      1
##                          0                      0                      0
##     ManufacturingProcess44 ManufacturingProcess45 ManufacturingProcess01
## 152                      1                      1                      1
## 6                        1                      1                      1
## 1                        1                      1                      1
## 7                        1                      1                      1
## 5                        1                      1                      1
## 2                        1                      1                      1
## 1                        1                      1                      1
## 1                        1                      1                      1
## 1                        1                      1                      0
##                          0                      0                      1
##     ManufacturingProcess04 ManufacturingProcess05 ManufacturingProcess07
## 152                      1                      1                      1
## 6                        1                      1                      1
## 1                        1                      1                      1
## 7                        1                      1                      1
## 5                        1                      1                      1
## 2                        1                      1                      1
## 1                        1                      1                      1
## 1                        1                      1                      1
## 1                        0                      0                      0
##                          1                      1                      1
##     ManufacturingProcess08 ManufacturingProcess12 ManufacturingProcess14
## 152                      1                      1                      1
## 6                        1                      1                      1
## 1                        1                      1                      1
## 7                        1                      1                      1
## 5                        1                      1                      1
## 2                        1                      1                      1
## 1                        1                      1                      1
## 1                        1                      1                      0
## 1                        0                      0                      1
##                          1                      1                      1
##     ManufacturingProcess22 ManufacturingProcess23 ManufacturingProcess24
## 152                      1                      1                      1
## 6                        1                      1                      1
## 1                        1                      1                      1
## 7                        1                      1                      1
## 5                        1                      1                      1
## 2                        1                      1                      1
## 1                        1                      1                      1
## 1                        1                      1                      1
## 1                        0                      0                      0
##                          1                      1                      1
##     ManufacturingProcess40 ManufacturingProcess41 ManufacturingProcess06
## 152                      1                      1                      1
## 6                        1                      1                      1
## 1                        1                      1                      1
## 7                        1                      1                      1
## 5                        1                      1                      1
## 2                        1                      1                      1
## 1                        1                      1                      0
## 1                        1                      1                      1
## 1                        0                      0                      0
##                          1                      1                      2
##     ManufacturingProcess02 ManufacturingProcess25 ManufacturingProcess26
## 152                      1                      1                      1
## 6                        1                      1                      1
## 1                        1                      1                      1
## 7                        1                      1                      1
## 5                        1                      0                      0
## 2                        0                      1                      1
## 1                        1                      1                      1
## 1                        1                      1                      1
## 1                        0                      1                      1
##                          3                      5                      5
##     ManufacturingProcess27 ManufacturingProcess28 ManufacturingProcess29
## 152                      1                      1                      1
## 6                        1                      1                      1
## 1                        1                      1                      1
## 7                        1                      1                      1
## 5                        0                      0                      0
## 2                        1                      1                      1
## 1                        1                      1                      1
## 1                        1                      1                      1
## 1                        1                      1                      1
##                          5                      5                      5
##     ManufacturingProcess30 ManufacturingProcess31 ManufacturingProcess33
## 152                      1                      1                      1
## 6                        1                      1                      1
## 1                        1                      1                      1
## 7                        1                      1                      1
## 5                        0                      0                      0
## 2                        1                      1                      1
## 1                        1                      1                      1
## 1                        1                      1                      1
## 1                        1                      1                      1
##                          5                      5                      5
##     ManufacturingProcess34 ManufacturingProcess35 ManufacturingProcess36
## 152                      1                      1                      1
## 6                        1                      1                      1
## 1                        1                      1                      1
## 7                        1                      1                      1
## 5                        0                      0                      0
## 2                        1                      1                      1
## 1                        1                      1                      1
## 1                        1                      1                      1
## 1                        1                      1                      1
##                          5                      5                      5
##     ManufacturingProcess10 ManufacturingProcess11 ManufacturingProcess03    
## 152                      1                      1                      1   0
## 6                        1                      1                      0   1
## 1                        1                      0                      1   1
## 7                        0                      0                      0   3
## 5                        1                      1                      1  11
## 2                        1                      1                      1   1
## 1                        1                      1                      1   1
## 1                        0                      0                      0   4
## 1                        0                      0                      0  16
##                          9                     10                     15 106
# Number of missing values in each column.
countNA <- colSums(is.na(ChemicalManufacturingProcess))

# Predictor part of the data set: every column except the response. Selecting
# by name instead of by position stays correct if the column order changes.
pred <- ChemicalManufacturingProcess[
  , names(ChemicalManufacturingProcess) != "Yield", drop = FALSE
]

# Impute missing predictor values with k-nearest neighbours.
# NOTE(review): the imputation is estimated on all rows, before the train/test
# split further down, so test rows influence it; confirm this is acceptable.
Imp_pred <- preProcess(pred, method = "knnImpute")

# Apply the fitted imputation to the predictors.
pred1 <- predict(Imp_pred, pred)
  1. Split the data into a training and a test set, pre-process the data, and tune a model of your choice from this chapter. What is the optimal value of the performance metric?
# Center and scale the imputed predictors.
# NOTE(review): caret documents that "knnImpute" already centers and scales
# the data, so this step may be largely redundant; confirm before relying on it.
pred2 <- preProcess(x = pred1, method = c("center", "scale"))
pred3 <- predict(object = pred2, newdata = pred1)

Split .75/.25

# Split the data 75% / 25% into training and test sets.
# NOTE(review): `seed` must already be defined earlier in the script.
set.seed(seed)
trainingRows <- createDataPartition(ChemicalManufacturingProcess$Yield,
                                    p = 0.75, list = FALSE)

train_Y2 <- ChemicalManufacturingProcess$Yield[trainingRows]
test_Y2 <- ChemicalManufacturingProcess$Yield[-trainingRows]

# Pre-process AFTER splitting. The imputation model and the centering/scaling
# parameters are estimated from the training rows only and then applied
# unchanged to the test rows, so no information from the test set leaks into
# the training predictors. (Fitting them on the full data set, as was done
# before, makes the test-set error estimate optimistic.)
# Metrics knitted from the earlier whole-data pre-processing will therefore
# differ slightly once this chunk is re-run.
raw_train_X2 <- pred[trainingRows, ]
raw_test_X2 <- pred[-trainingRows, ]

# Stage 1: k-nearest-neighbour imputation, fit on the raw training predictors.
train_imputer <- preProcess(raw_train_X2, method = "knnImpute")
imp_train_X2 <- predict(train_imputer, newdata = raw_train_X2)
imp_test_X2 <- predict(train_imputer, newdata = raw_test_X2)

# Stage 2: center and scale using the training-set means and standard deviations.
train_scaler <- preProcess(imp_train_X2, method = c("center", "scale"))
train_X2 <- predict(train_scaler, newdata = imp_train_X2)
test_X2 <- predict(train_scaler, newdata = imp_test_X2)

Model choice - Elastic net

# Candidate tuning values for the elastic net: three values of `lambda`
# crossed with 20 evenly spaced values of `fraction` between 0.05 and 1.
# `length.out` is spelled out instead of relying on partial matching of
# `length`.
enetGrid <- expand.grid(.lambda = c(0, 0.01, 0.1),
                        .fraction = seq(0.05, 1, length.out = 20))

# Fix the RNG state so the cross-validation folds are reproducible.
set.seed(213)

# Tune the elastic net over the grid with cross-validation
# (method = "cv"; the printed summary reports 10 folds).
enetTune <- train(train_X2, train_Y2,
                  method = "enet",
                  tuneGrid = enetGrid,
                  trControl = trainControl(method = "cv"))

# Print the resampling results for every grid point and the selected model.
enetTune
## Elasticnet 
## 
## 132 samples
##  57 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 119, 119, 119, 119, 117, 119, ... 
## Resampling results across tuning parameters:
## 
##   lambda  fraction  RMSE      Rsquared   MAE      
##   0.00    0.05      1.196364  0.6604813  0.9515763
##   0.00    0.10      1.202265  0.6300705  0.9389656
##   0.00    0.15      1.273513  0.6157280  0.9943903
##   0.00    0.20      1.334883  0.6082306  1.0288518
##   0.00    0.25      1.767774  0.5316105  1.2074856
##   0.00    0.30      2.012139  0.5114541  1.2898516
##   0.00    0.35      2.320827  0.4910974  1.3898549
##   0.00    0.40      2.521302  0.4828169  1.4585595
##   0.00    0.45      2.772847  0.4439913  1.5457002
##   0.00    0.50      3.175853  0.4093572  1.6788568
##   0.00    0.55      3.400765  0.3957124  1.7580884
##   0.00    0.60      3.481987  0.3869828  1.7924444
##   0.00    0.65      3.491975  0.3794832  1.8067084
##   0.00    0.70      3.516570  0.3748153  1.8195658
##   0.00    0.75      3.918369  0.3692505  1.9368358
##   0.00    0.80      4.306222  0.3649000  2.0487443
##   0.00    0.85      4.683754  0.3605339  2.1580432
##   0.00    0.90      5.078900  0.3565629  2.2714625
##   0.00    0.95      5.457427  0.3529087  2.3798296
##   0.00    1.00      5.846113  0.3497191  2.4906457
##   0.01    0.05      1.511222  0.6191118  1.2347724
##   0.01    0.10      1.270047  0.6516676  1.0291793
##   0.01    0.15      1.151547  0.6528445  0.9217279
##   0.01    0.20      1.146955  0.6444170  0.9001146
##   0.01    0.25      1.205349  0.6183044  0.9382235
##   0.01    0.30      1.252879  0.6044556  0.9738452
##   0.01    0.35      1.274157  0.6006345  1.0002324
##   0.01    0.40      1.305733  0.5927875  1.0212191
##   0.01    0.45      1.358795  0.5791086  1.0443706
##   0.01    0.50      1.375807  0.5756555  1.0513349
##   0.01    0.55      1.467389  0.5374025  1.1019688
##   0.01    0.60      1.537284  0.5415406  1.1226000
##   0.01    0.65      1.784976  0.4883554  1.2101432
##   0.01    0.70      1.985310  0.4718883  1.2737385
##   0.01    0.75      2.110698  0.4645421  1.3159351
##   0.01    0.80      2.192962  0.4600051  1.3453379
##   0.01    0.85      2.256486  0.4564382  1.3689862
##   0.01    0.90      2.306465  0.4532957  1.3886582
##   0.01    0.95      2.372355  0.4494567  1.4125324
##   0.01    1.00      2.533233  0.4439807  1.4637142
##   0.10    0.05      1.657974  0.5625402  1.3572444
##   0.10    0.10      1.493139  0.6278935  1.2212737
##   0.10    0.15      1.350969  0.6511545  1.1037857
##   0.10    0.20      1.239960  0.6579016  1.0088209
##   0.10    0.25      1.168771  0.6579373  0.9414444
##   0.10    0.30      1.136492  0.6553682  0.9058205
##   0.10    0.35      1.138477  0.6484594  0.8961178
##   0.10    0.40      1.162564  0.6377300  0.9134589
##   0.10    0.45      1.199289  0.6249147  0.9380021
##   0.10    0.50      1.230626  0.6084067  0.9644756
##   0.10    0.55      1.275595  0.5911339  0.9960616
##   0.10    0.60      1.317275  0.5790389  1.0205009
##   0.10    0.65      1.326402  0.5804132  1.0262751
##   0.10    0.70      1.318145  0.5897091  1.0259770
##   0.10    0.75      1.318259  0.5921207  1.0282495
##   0.10    0.80      1.323812  0.5904224  1.0324935
##   0.10    0.85      1.337711  0.5818915  1.0539403
##   0.10    0.90      1.359091  0.5689503  1.0753589
##   0.10    0.95      1.388538  0.5525814  1.0962552
##   0.10    1.00      1.400830  0.5492931  1.1046290
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were fraction = 0.3 and lambda = 0.1.
# Plot the tuning results stored in enetTune (the resampled performance
# across the lambda / fraction grid tabulated above).
plot(enetTune)

The final values used for the model were fraction = 0.3 and lambda = 0.1.

lambda fraction RMSE Rsquared MAE
0.10 0.30 1.136492 0.6553682 0.9058205

  1. Predict the response for the test set. What is the value of the performance metric and how does this compare with the resampled performance metric on the training set?
# Predict yield for the held-out test predictors with the tuned model.
# Note: this reuses the name `pred`, replacing the predictor data frame
# created earlier.
pred <- predict(enetTune, newdata = test_X2)

# Test-set performance: RMSE, R-squared and MAE of predictions vs. observed.
postResample(pred = pred, obs = test_Y2)
##      RMSE  Rsquared       MAE 
## 1.2912274 0.5309323 1.0685611
  1. Which predictors are most important in the model you have trained? Do either the biological or process predictors dominate the list?
# Rank the predictors by importance. As the printed header shows, the scores
# here are "loess r-squared" values, i.e. a per-predictor measure against the
# response rather than something derived from the fitted enet coefficients.
varImp(enetTune)
## loess r-squared variable importance
## 
##   only 20 most important variables shown (out of 57)
## 
##                        Overall
## ManufacturingProcess13  100.00
## ManufacturingProcess32   93.67
## BiologicalMaterial03     92.86
## BiologicalMaterial06     86.68
## ManufacturingProcess17   80.34
## BiologicalMaterial12     76.76
## ManufacturingProcess09   76.15
## ManufacturingProcess36   75.95
## ManufacturingProcess06   63.29
## BiologicalMaterial02     59.13
## ManufacturingProcess11   54.38
## BiologicalMaterial11     53.65
## ManufacturingProcess31   51.81
## BiologicalMaterial04     48.83
## BiologicalMaterial09     47.43
## ManufacturingProcess18   43.88
## ManufacturingProcess30   42.31
## BiologicalMaterial08     40.33
## BiologicalMaterial01     40.08
## ManufacturingProcess33   38.33

Discussion

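Manufacturing process predictors take the top two spots, followed by biological material predictors in 3rd and 4th place.

Of the top 20 predictors, 11 are manufacturing process variables and 9 are biological material variables, so the process predictors dominate only slightly.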

  1. Explore the relationships between each of the top predictors and the response. How could this information be helpful in improving yield in future runs of the manufacturing process?
# Names of the 20 predictors with the highest importance score.
imp_scores <- varImp(enetTune)$importance
# Sort descending by Overall; negating the score mirrors desc() and keeps
# ties in their original order.
imp_sorted <- imp_scores[order(-imp_scores$Overall), , drop = FALSE]
varuse <- rownames(imp_sorted)[1:20]
varuse
##  [1] "ManufacturingProcess13" "ManufacturingProcess32" "BiologicalMaterial03"  
##  [4] "BiologicalMaterial06"   "ManufacturingProcess17" "BiologicalMaterial12"  
##  [7] "ManufacturingProcess09" "ManufacturingProcess36" "ManufacturingProcess06"
## [10] "BiologicalMaterial02"   "ManufacturingProcess11" "BiologicalMaterial11"  
## [13] "ManufacturingProcess31" "BiologicalMaterial04"   "BiologicalMaterial09"  
## [16] "ManufacturingProcess18" "ManufacturingProcess30" "BiologicalMaterial08"  
## [19] "BiologicalMaterial01"   "ManufacturingProcess33"
# Pre-processed predictors with the response appended as the last column.
# cbind() names that column after the expression that produced it, hence the
# back-quoted name `ChemicalManufacturingProcess$Yield`.
use1 <- pred3 %>% cbind(ChemicalManufacturingProcess$Yield)
yield_col <- "ChemicalManufacturingProcess$Yield"

# Correlation of each top-20 predictor with Yield, computed in one pass over
# `varuse` instead of twenty hand-written cor() calls, so the list stays in
# sync with the importance ranking if the model is re-tuned.
yield_cor <- vapply(
  varuse,
  function(v) cor(use1[[v]], use1[[yield_col]]),
  numeric(1)
)

# One row per predictor, in importance order.
data.frame(correlation = yield_cor)

# Correlations obtained in the original run, for reference:
#   ManufacturingProcess13  -0.5036797
#   ManufacturingProcess32   0.6083321
#   BiologicalMaterial03     0.445086
#   BiologicalMaterial06     0.4781634
#   ManufacturingProcess17  -0.4258069
#   BiologicalMaterial12     0.3674976
#   ManufacturingProcess09   0.5034705
#   ManufacturingProcess36  -0.5237389
#   ManufacturingProcess06   0.3918329
#   BiologicalMaterial02     0.4815158
#   ManufacturingProcess11   0.3525799
#   BiologicalMaterial11     0.3549143
#   ManufacturingProcess31  -0.07069156
#   BiologicalMaterial04     0.3798401
#   BiologicalMaterial09     0.09203649
#   ManufacturingProcess18  -0.05892925
#   ManufacturingProcess30   0.2304898
#   BiologicalMaterial08     0.3809402
#   BiologicalMaterial01     0.358938
#   ManufacturingProcess33   0.4249171

These pairwise correlations do not adjust for the other variables. They show how each predictor is related to yield on its own, with no other variables in the mix.

This could help in future decision making.