library(gmodels)
library(ggpubr)
## Loading required package: ggplot2
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(corrplot)
## corrplot 0.90 loaded
library(e1071)
library(lattice)
library(AppliedPredictiveModeling)
library(mlbench)
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(reshape2)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v tibble 3.1.4 v purrr 0.3.4
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 2.0.1 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x psych::%+%() masks ggplot2::%+%()
## x psych::alpha() masks ggplot2::alpha()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(mice)
##
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(caret)
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(caTools)
library(elasticnet)
## Loading required package: lars
## Loaded lars 1.2
##
## Attaching package: 'lars'
## The following object is masked from 'package:psych':
##
## error.bars
library(pls)
##
## Attaching package: 'pls'
## The following object is masked from 'package:caret':
##
## R2
## The following object is masked from 'package:corrplot':
##
## corrplot
## The following object is masked from 'package:stats':
##
## loadings
# Workspace reset intentionally left disabled: rm(list = ls())
# Fixed random seed, passed to set.seed(seed) further down the script.
seed <- 1234
Problem 6.2
Developing a model to predict permeability (see Sect. 1.4) could save significant resources for a pharmaceutical company, while at the same time more rapidly identifying molecules that have sufficient permeability to become a drug:
# Load the permeability data from AppliedPredictiveModeling; the calls below
# use both the `permeability` response and the `fingerprints` predictors.
library(AppliedPredictiveModeling)
data(permeability)
#head(permeability)
#head(fingerprints)
# Dimensions of the response and predictor objects (rows = compounds).
dim(permeability)
## [1] 165 1
dim(fingerprints)
## [1] 165 1107
# Both objects are plain matrices, not data frames.
class(permeability)
## [1] "matrix" "array"
class(fingerprints)
## [1] "matrix" "array"
The matrix fingerprints contains the 1107 binary molecular predictors for the 165 compounds, while permeability contains the permeability response.
# Remove near-zero-variance fingerprint columns.
# nearZeroVar() returns the positions of the flagged columns. If nothing is
# flagged it returns an empty vector, and `x[, -integer(0)]` would then select
# zero columns instead of keeping all of them, so only subset when there is
# something to remove. drop = FALSE keeps the result a matrix even if a single
# column survives.
fp_out <- nearZeroVar(fingerprints)
fingerprints1 <- if (length(fp_out) > 0) {
  fingerprints[, -fp_out, drop = FALSE]
} else {
  fingerprints
}
dim(fingerprints1)
## [1] 165 388
class(fingerprints1)
## [1] "matrix" "array"
DISCUSSION:
After removing the near-zero-variance predictors, 388 of the original 1107 predictors remain.
# Reproducible split: 75% of the compounds go to training, the rest to test.
set.seed(seed)
Rows <- createDataPartition(y = permeability, p = 0.75, list = FALSE)

# Predictors stay matrices; the one-column response collapses to a plain
# vector when row-subset, which is why its dim() is NULL below.
train_X <- fingerprints1[Rows, ]
test_X <- fingerprints1[-Rows, ]
train_Y <- permeability[Rows, ]
test_Y <- permeability[-Rows, ]

dim(train_X)
## [1] 125 388
dim(train_Y)
## NULL
dim(test_X)
## [1] 40 388
dim(test_Y)
## NULL
# Tune a partial least squares model on the training split: predictors are
# centered and scaled, 20 tuning values are tried, and performance is
# estimated by cross-validation.
set.seed(seed)
plsfit <- train(
  x = train_X,
  y = train_Y,
  method = "pls",
  preProcess = c("center", "scale"),
  tuneLength = 20,
  trControl = trainControl(method = "cv")
)
plsfit
## Partial Least Squares
##
## 125 samples
## 388 predictors
##
## Pre-processing: centered (388), scaled (388)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 112, 112, 112, 113, 113, 113, ...
## Resampling results across tuning parameters:
##
## ncomp RMSE Rsquared MAE
## 1 13.12852 0.3455755 9.962939
## 2 12.14546 0.4330069 8.633534
## 3 12.36872 0.3933098 9.303858
## 4 12.35652 0.3973752 9.354498
## 5 12.16572 0.4181501 9.214789
## 6 11.96498 0.4380348 9.068164
## 7 11.92518 0.4420047 9.280596
## 8 11.79796 0.4514234 9.416467
## 9 12.00177 0.4417509 9.650388
## 10 12.17286 0.4380764 9.881674
## 11 12.42101 0.4272782 9.999735
## 12 12.70978 0.4133552 10.140333
## 13 12.81560 0.4120447 10.164046
## 14 12.67963 0.4253213 10.078276
## 15 12.93839 0.4168035 10.284851
## 16 13.22580 0.4063959 10.537008
## 17 13.46258 0.4090001 10.880191
## 18 13.57954 0.4160205 10.901255
## 19 13.89109 0.4041376 11.164370
## 20 14.01144 0.4055658 11.049839
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 8.
ncomp RMSE Rsquared MAE
8 11.79796 0.4514234 9.416467
# Plot the resampling profile across the tuned number of components.
plot(plsfit, main = "Partial Least Squares Model")
# Resampling summary for the selected model. Match on the ncomp *value* rather
# than using it as a row position: the two only coincide when the tuning grid
# happens to be 1, 2, 3, ... in order.
plsfit$results[plsfit$results$ncomp == plsfit$bestTune$ncomp, ]
## ncomp RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 8 8 11.79796 0.4514234 9.416467 2.818105 0.2461524 2.361153
# Score the held-out compounds with the tuned PLS model, then compare the
# predictions with the observed permeability values.
plspredict <- predict(plsfit, newdata = test_X)
postResample(pred = plspredict, obs = test_Y)
## RMSE Rsquared MAE
## 10.9615763 0.5689258 8.1669353
Let’s try a ridge regression…
# Candidate ridge penalties: 15 evenly spaced values from 0 to 0.1.
ridgeGrid <- data.frame(.lambda = seq(from = 0, to = 0.1, length.out = 15))

# Cross-validated tuning of the ridge penalty on the training split.
# NOTE(review): the grid includes lambda = 0, and the log below records a
# failed fit for that value in one fold. With 388 predictors and 125 training
# samples, consider whether an unpenalized fit belongs in the grid.
set.seed(312)
ridgeRegFit <- train(
  x = train_X,
  y = train_Y,
  method = "ridge",
  tuneGrid = ridgeGrid,
  trControl = trainControl(method = "cv")
)
## Warning: model fit failed for Fold08: lambda=0.000000 Error in if (zmin < gamhat) { : missing value where TRUE/FALSE needed
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
ridgeRegFit
## Ridge Regression
##
## 125 samples
## 388 predictors
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 112, 112, 113, 111, 113, 112, ...
## Resampling results across tuning parameters:
##
## lambda RMSE Rsquared MAE
## 0.000000000 13.51017 0.4415335 10.08679
## 0.007142857 15.79046 0.3932941 11.74546
## 0.014285714 14.89143 0.4146741 11.10171
## 0.021428571 14.49589 0.4229997 10.88241
## 0.028571429 14.22930 0.4263777 10.73022
## 0.035714286 14.04348 0.4292697 10.60040
## 0.042857143 13.94078 0.4280588 10.57463
## 0.050000000 13.83755 0.4285155 10.53546
## 0.057142857 13.77773 0.4285733 10.51208
## 0.064285714 13.72227 0.4282230 10.49211
## 0.071428571 13.68807 0.4271481 10.48229
## 0.078571429 13.65750 0.4286293 10.47339
## 0.085714286 13.63030 0.4273175 10.46167
## 0.092857143 13.60496 0.4274678 10.45556
## 0.100000000 13.57020 0.4278376 10.42175
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was lambda = 0.
# Plot the resampling profile of the ridge fit across the lambda grid.
plot(ridgeRegFit)
lambda RMSE Rsquared MAE
0.100000000 12.62522 0.4660854 9.115865
#Let's try ridge on the test data.
#set.seed(1312)
#ridgeRegFit2 <- train(test_X, test_Y,
#method = 'ridge',
#tuneGrid = ridgeGrid,
#trControl = trainControl(method="cv"))
#ridgeRegFit2
#plot(ridgeRegFit2)
The partial least squares model seems to perform better than the ridge model. However, with an R^2 in the neighborhood of 0.4, I don’t feel it is strong enough to replace the more expensive permeability assays (PAMPA). These are drugs treating human beings, and I feel there should be a more convincing model before replacing the expensive but accurate assays.
A chemical manufacturing process for a pharmaceutical product was discussed in Sect. 1.4. In this problem, the objective is to understand the relationship between biological measurements of the raw materials (predictors), measurements of the manufacturing process (predictors), and the response of the product yield. Biological predictors cannot be changed but can be used to assess the quality of the raw material before processing. On the other hand, manufacturing process predictors can be changed in the manufacturing process. Improving product yield by 1% will boost revenue by approximately one hundred thousand dollars per batch:
#library(AppliedPredictiveModeling)
# Load the chemical manufacturing process data and report rows x columns.
data("ChemicalManufacturingProcess")
dim(ChemicalManufacturingProcess)
## [1] 176 58
The matrix processPredictors contains the 57 predictors (12 describing the input biological material and 45 describing the process predictors) for the 176 manufacturing runs; yield contains the percent yield for each run.
Note: the text is erroneous. The dataset is the single data frame ChemicalManufacturingProcess, which contains the 57 predictors and the “Yield” column.
# Show the type and leading values of every column; NA entries are visible in
# several ManufacturingProcess columns.
str(ChemicalManufacturingProcess)
## 'data.frame': 176 obs. of 58 variables:
## $ Yield : num 38 42.4 42 41.4 42.5 ...
## $ BiologicalMaterial01 : num 6.25 8.01 8.01 8.01 7.47 6.12 7.48 6.94 6.94 6.94 ...
## $ BiologicalMaterial02 : num 49.6 61 61 61 63.3 ...
## $ BiologicalMaterial03 : num 57 67.5 67.5 67.5 72.2 ...
## $ BiologicalMaterial04 : num 12.7 14.6 14.6 14.6 14 ...
## $ BiologicalMaterial05 : num 19.5 19.4 19.4 19.4 17.9 ...
## $ BiologicalMaterial06 : num 43.7 53.1 53.1 53.1 54.7 ...
## $ BiologicalMaterial07 : num 100 100 100 100 100 100 100 100 100 100 ...
## $ BiologicalMaterial08 : num 16.7 19 19 19 18.2 ...
## $ BiologicalMaterial09 : num 11.4 12.6 12.6 12.6 12.8 ...
## $ BiologicalMaterial10 : num 3.46 3.46 3.46 3.46 3.05 3.78 3.04 3.85 3.85 3.85 ...
## $ BiologicalMaterial11 : num 138 154 154 154 148 ...
## $ BiologicalMaterial12 : num 18.8 21.1 21.1 21.1 21.1 ...
## $ ManufacturingProcess01: num NA 0 0 0 10.7 12 11.5 12 12 12 ...
## $ ManufacturingProcess02: num NA 0 0 0 0 0 0 0 0 0 ...
## $ ManufacturingProcess03: num NA NA NA NA NA NA 1.56 1.55 1.56 1.55 ...
## $ ManufacturingProcess04: num NA 917 912 911 918 924 933 929 928 938 ...
## $ ManufacturingProcess05: num NA 1032 1004 1015 1028 ...
## $ ManufacturingProcess06: num NA 210 207 213 206 ...
## $ ManufacturingProcess07: num NA 177 178 177 178 178 177 178 177 177 ...
## $ ManufacturingProcess08: num NA 178 178 177 178 178 178 178 177 177 ...
## $ ManufacturingProcess09: num 43 46.6 45.1 44.9 45 ...
## $ ManufacturingProcess10: num NA NA NA NA NA NA 11.6 10.2 9.7 10.1 ...
## $ ManufacturingProcess11: num NA NA NA NA NA NA 11.5 11.3 11.1 10.2 ...
## $ ManufacturingProcess12: num NA 0 0 0 0 0 0 0 0 0 ...
## $ ManufacturingProcess13: num 35.5 34 34.8 34.8 34.6 34 32.4 33.6 33.9 34.3 ...
## $ ManufacturingProcess14: num 4898 4869 4878 4897 4992 ...
## $ ManufacturingProcess15: num 6108 6095 6087 6102 6233 ...
## $ ManufacturingProcess16: num 4682 4617 4617 4635 4733 ...
## $ ManufacturingProcess17: num 35.5 34 34.8 34.8 33.9 33.4 33.8 33.6 33.9 35.3 ...
## $ ManufacturingProcess18: num 4865 4867 4877 4872 4886 ...
## $ ManufacturingProcess19: num 6049 6097 6078 6073 6102 ...
## $ ManufacturingProcess20: num 4665 4621 4621 4611 4659 ...
## $ ManufacturingProcess21: num 0 0 0 0 -0.7 -0.6 1.4 0 0 1 ...
## $ ManufacturingProcess22: num NA 3 4 5 8 9 1 2 3 4 ...
## $ ManufacturingProcess23: num NA 0 1 2 4 1 1 2 3 1 ...
## $ ManufacturingProcess24: num NA 3 4 5 18 1 1 2 3 4 ...
## $ ManufacturingProcess25: num 4873 4869 4897 4892 4930 ...
## $ ManufacturingProcess26: num 6074 6107 6116 6111 6151 ...
## $ ManufacturingProcess27: num 4685 4630 4637 4630 4684 ...
## $ ManufacturingProcess28: num 10.7 11.2 11.1 11.1 11.3 11.4 11.2 11.1 11.3 11.4 ...
## $ ManufacturingProcess29: num 21 21.4 21.3 21.3 21.6 21.7 21.2 21.2 21.5 21.7 ...
## $ ManufacturingProcess30: num 9.9 9.9 9.4 9.4 9 10.1 11.2 10.9 10.5 9.8 ...
## $ ManufacturingProcess31: num 69.1 68.7 69.3 69.3 69.4 68.2 67.6 67.9 68 68.5 ...
## $ ManufacturingProcess32: num 156 169 173 171 171 173 159 161 160 164 ...
## $ ManufacturingProcess33: num 66 66 66 68 70 70 65 65 65 66 ...
## $ ManufacturingProcess34: num 2.4 2.6 2.6 2.5 2.5 2.5 2.5 2.5 2.5 2.5 ...
## $ ManufacturingProcess35: num 486 508 509 496 468 490 475 478 491 488 ...
## $ ManufacturingProcess36: num 0.019 0.019 0.018 0.018 0.017 0.018 0.019 0.019 0.019 0.019 ...
## $ ManufacturingProcess37: num 0.5 2 0.7 1.2 0.2 0.4 0.8 1 1.2 1.8 ...
## $ ManufacturingProcess38: num 3 2 2 2 2 2 2 2 3 3 ...
## $ ManufacturingProcess39: num 7.2 7.2 7.2 7.2 7.3 7.2 7.3 7.3 7.4 7.1 ...
## $ ManufacturingProcess40: num NA 0.1 0 0 0 0 0 0 0 0 ...
## $ ManufacturingProcess41: num NA 0.15 0 0 0 0 0 0 0 0 ...
## $ ManufacturingProcess42: num 11.6 11.1 12 10.6 11 11.5 11.7 11.4 11.4 11.3 ...
## $ ManufacturingProcess43: num 3 0.9 1 1.1 1.1 2.2 0.7 0.8 0.9 0.8 ...
## $ ManufacturingProcess44: num 1.8 1.9 1.8 1.8 1.7 1.8 2 2 1.9 1.9 ...
## $ ManufacturingProcess45: num 2.4 2.2 2.3 2.1 2.1 2 2.2 2.2 2.1 2.4 ...
# Tabulate the missing-data patterns across all columns.
md.pattern(ChemicalManufacturingProcess)
## Yield BiologicalMaterial01 BiologicalMaterial02 BiologicalMaterial03
## 152 1 1 1 1
## 6 1 1 1 1
## 1 1 1 1 1
## 7 1 1 1 1
## 5 1 1 1 1
## 2 1 1 1 1
## 1 1 1 1 1
## 1 1 1 1 1
## 1 1 1 1 1
## 0 0 0 0
## BiologicalMaterial04 BiologicalMaterial05 BiologicalMaterial06
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 1 1 1
## 2 1 1 1
## 1 1 1 1
## 1 1 1 1
## 1 1 1 1
## 0 0 0
## BiologicalMaterial07 BiologicalMaterial08 BiologicalMaterial09
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 1 1 1
## 2 1 1 1
## 1 1 1 1
## 1 1 1 1
## 1 1 1 1
## 0 0 0
## BiologicalMaterial10 BiologicalMaterial11 BiologicalMaterial12
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 1 1 1
## 2 1 1 1
## 1 1 1 1
## 1 1 1 1
## 1 1 1 1
## 0 0 0
## ManufacturingProcess09 ManufacturingProcess13 ManufacturingProcess15
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 1 1 1
## 2 1 1 1
## 1 1 1 1
## 1 1 1 1
## 1 1 1 1
## 0 0 0
## ManufacturingProcess16 ManufacturingProcess17 ManufacturingProcess18
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 1 1 1
## 2 1 1 1
## 1 1 1 1
## 1 1 1 1
## 1 1 1 1
## 0 0 0
## ManufacturingProcess19 ManufacturingProcess20 ManufacturingProcess21
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 1 1 1
## 2 1 1 1
## 1 1 1 1
## 1 1 1 1
## 1 1 1 1
## 0 0 0
## ManufacturingProcess32 ManufacturingProcess37 ManufacturingProcess38
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 1 1 1
## 2 1 1 1
## 1 1 1 1
## 1 1 1 1
## 1 1 1 1
## 0 0 0
## ManufacturingProcess39 ManufacturingProcess42 ManufacturingProcess43
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 1 1 1
## 2 1 1 1
## 1 1 1 1
## 1 1 1 1
## 1 1 1 1
## 0 0 0
## ManufacturingProcess44 ManufacturingProcess45 ManufacturingProcess01
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 1 1 1
## 2 1 1 1
## 1 1 1 1
## 1 1 1 1
## 1 1 1 0
## 0 0 1
## ManufacturingProcess04 ManufacturingProcess05 ManufacturingProcess07
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 1 1 1
## 2 1 1 1
## 1 1 1 1
## 1 1 1 1
## 1 0 0 0
## 1 1 1
## ManufacturingProcess08 ManufacturingProcess12 ManufacturingProcess14
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 1 1 1
## 2 1 1 1
## 1 1 1 1
## 1 1 1 0
## 1 0 0 1
## 1 1 1
## ManufacturingProcess22 ManufacturingProcess23 ManufacturingProcess24
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 1 1 1
## 2 1 1 1
## 1 1 1 1
## 1 1 1 1
## 1 0 0 0
## 1 1 1
## ManufacturingProcess40 ManufacturingProcess41 ManufacturingProcess06
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 1 1 1
## 2 1 1 1
## 1 1 1 0
## 1 1 1 1
## 1 0 0 0
## 1 1 2
## ManufacturingProcess02 ManufacturingProcess25 ManufacturingProcess26
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 1 0 0
## 2 0 1 1
## 1 1 1 1
## 1 1 1 1
## 1 0 1 1
## 3 5 5
## ManufacturingProcess27 ManufacturingProcess28 ManufacturingProcess29
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 0 0 0
## 2 1 1 1
## 1 1 1 1
## 1 1 1 1
## 1 1 1 1
## 5 5 5
## ManufacturingProcess30 ManufacturingProcess31 ManufacturingProcess33
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 0 0 0
## 2 1 1 1
## 1 1 1 1
## 1 1 1 1
## 1 1 1 1
## 5 5 5
## ManufacturingProcess34 ManufacturingProcess35 ManufacturingProcess36
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 0 0 0
## 2 1 1 1
## 1 1 1 1
## 1 1 1 1
## 1 1 1 1
## 5 5 5
## ManufacturingProcess10 ManufacturingProcess11 ManufacturingProcess03
## 152 1 1 1 0
## 6 1 1 0 1
## 1 1 0 1 1
## 7 0 0 0 3
## 5 1 1 1 11
## 2 1 1 1 1
## 1 1 1 1 1
## 1 0 0 0 4
## 1 0 0 0 16
## 9 10 15 106
# Number of missing values in each column.
countNA <- colSums(is.na(ChemicalManufacturingProcess))

# Predictors only: every column except the first one (Yield).
pred <- ChemicalManufacturingProcess[, -1]

# Fill in missing predictor values with k-nearest-neighbour imputation.
# NOTE(review): the imputation model is estimated on all rows before the
# train/test split below, so held-out rows contribute to it -- confirm this is
# intended.
Imp_pred <- preProcess(pred, method = "knnImpute")
pred1 <- predict(Imp_pred, newdata = pred)

# Center and scale the imputed predictors.
pred2 <- preProcess(pred1, method = c("center", "scale"))
pred3 <- predict(pred2, newdata = pred1)

# Reproducible split on Yield: 75% of the runs for training, the rest for test.
set.seed(seed)
trainingRows <- createDataPartition(
  y = ChemicalManufacturingProcess$Yield,
  p = 0.75,
  list = FALSE
)
train_X2 <- pred3[trainingRows, ]
test_X2 <- pred3[-trainingRows, ]
train_Y2 <- ChemicalManufacturingProcess$Yield[trainingRows]
test_Y2 <- ChemicalManufacturingProcess$Yield[-trainingRows]
# Elastic-net tuning grid: three lambda values crossed with 20 evenly spaced
# fraction values from 0.05 to 1.
enetGrid <- expand.grid(
  .lambda = c(0, 0.01, 0.1),
  .fraction = seq(from = 0.05, to = 1, length.out = 20)
)

# Cross-validated tuning over the grid on the training split.
set.seed(213)
enetTune <- train(
  x = train_X2,
  y = train_Y2,
  method = "enet",
  tuneGrid = enetGrid,
  trControl = trainControl(method = "cv")
)
enetTune
## Elasticnet
##
## 132 samples
## 57 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 119, 119, 119, 119, 117, 119, ...
## Resampling results across tuning parameters:
##
## lambda fraction RMSE Rsquared MAE
## 0.00 0.05 1.196364 0.6604813 0.9515763
## 0.00 0.10 1.202265 0.6300705 0.9389656
## 0.00 0.15 1.273513 0.6157280 0.9943903
## 0.00 0.20 1.334883 0.6082306 1.0288518
## 0.00 0.25 1.767774 0.5316105 1.2074856
## 0.00 0.30 2.012139 0.5114541 1.2898516
## 0.00 0.35 2.320827 0.4910974 1.3898549
## 0.00 0.40 2.521302 0.4828169 1.4585595
## 0.00 0.45 2.772847 0.4439913 1.5457002
## 0.00 0.50 3.175853 0.4093572 1.6788568
## 0.00 0.55 3.400765 0.3957124 1.7580884
## 0.00 0.60 3.481987 0.3869828 1.7924444
## 0.00 0.65 3.491975 0.3794832 1.8067084
## 0.00 0.70 3.516570 0.3748153 1.8195658
## 0.00 0.75 3.918369 0.3692505 1.9368358
## 0.00 0.80 4.306222 0.3649000 2.0487443
## 0.00 0.85 4.683754 0.3605339 2.1580432
## 0.00 0.90 5.078900 0.3565629 2.2714625
## 0.00 0.95 5.457427 0.3529087 2.3798296
## 0.00 1.00 5.846113 0.3497191 2.4906457
## 0.01 0.05 1.511222 0.6191118 1.2347724
## 0.01 0.10 1.270047 0.6516676 1.0291793
## 0.01 0.15 1.151547 0.6528445 0.9217279
## 0.01 0.20 1.146955 0.6444170 0.9001146
## 0.01 0.25 1.205349 0.6183044 0.9382235
## 0.01 0.30 1.252879 0.6044556 0.9738452
## 0.01 0.35 1.274157 0.6006345 1.0002324
## 0.01 0.40 1.305733 0.5927875 1.0212191
## 0.01 0.45 1.358795 0.5791086 1.0443706
## 0.01 0.50 1.375807 0.5756555 1.0513349
## 0.01 0.55 1.467389 0.5374025 1.1019688
## 0.01 0.60 1.537284 0.5415406 1.1226000
## 0.01 0.65 1.784976 0.4883554 1.2101432
## 0.01 0.70 1.985310 0.4718883 1.2737385
## 0.01 0.75 2.110698 0.4645421 1.3159351
## 0.01 0.80 2.192962 0.4600051 1.3453379
## 0.01 0.85 2.256486 0.4564382 1.3689862
## 0.01 0.90 2.306465 0.4532957 1.3886582
## 0.01 0.95 2.372355 0.4494567 1.4125324
## 0.01 1.00 2.533233 0.4439807 1.4637142
## 0.10 0.05 1.657974 0.5625402 1.3572444
## 0.10 0.10 1.493139 0.6278935 1.2212737
## 0.10 0.15 1.350969 0.6511545 1.1037857
## 0.10 0.20 1.239960 0.6579016 1.0088209
## 0.10 0.25 1.168771 0.6579373 0.9414444
## 0.10 0.30 1.136492 0.6553682 0.9058205
## 0.10 0.35 1.138477 0.6484594 0.8961178
## 0.10 0.40 1.162564 0.6377300 0.9134589
## 0.10 0.45 1.199289 0.6249147 0.9380021
## 0.10 0.50 1.230626 0.6084067 0.9644756
## 0.10 0.55 1.275595 0.5911339 0.9960616
## 0.10 0.60 1.317275 0.5790389 1.0205009
## 0.10 0.65 1.326402 0.5804132 1.0262751
## 0.10 0.70 1.318145 0.5897091 1.0259770
## 0.10 0.75 1.318259 0.5921207 1.0282495
## 0.10 0.80 1.323812 0.5904224 1.0324935
## 0.10 0.85 1.337711 0.5818915 1.0539403
## 0.10 0.90 1.359091 0.5689503 1.0753589
## 0.10 0.95 1.388538 0.5525814 1.0962552
## 0.10 1.00 1.400830 0.5492931 1.1046290
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were fraction = 0.3 and lambda = 0.1.
# Plot the resampling profile of the elastic-net fit across the tuning grid.
plot(enetTune)
The final values used for the model were fraction = 0.3 and lambda = 0.1.
lambda fraction RMSE Rsquared MAE
0.10 0.30 1.136492 0.6553682 0.9058205
# Predict yield for the held-out runs and summarise the test-set error.
# Note: this reuses the name `pred`, replacing the predictor data frame that
# was created earlier in the script.
pred <- predict(enetTune, newdata = test_X2)
postResample(pred = pred, obs = test_Y2)
## RMSE Rsquared MAE
## 1.2912274 0.5309323 1.0685611
# Rank the predictors by variable importance.
varImp(enetTune)
## loess r-squared variable importance
##
## only 20 most important variables shown (out of 57)
##
## Overall
## ManufacturingProcess13 100.00
## ManufacturingProcess32 93.67
## BiologicalMaterial03 92.86
## BiologicalMaterial06 86.68
## ManufacturingProcess17 80.34
## BiologicalMaterial12 76.76
## ManufacturingProcess09 76.15
## ManufacturingProcess36 75.95
## ManufacturingProcess06 63.29
## BiologicalMaterial02 59.13
## ManufacturingProcess11 54.38
## BiologicalMaterial11 53.65
## ManufacturingProcess31 51.81
## BiologicalMaterial04 48.83
## BiologicalMaterial09 47.43
## ManufacturingProcess18 43.88
## ManufacturingProcess30 42.31
## BiologicalMaterial08 40.33
## BiologicalMaterial01 40.08
## ManufacturingProcess33 38.33
Manufacturing process predictors take the top two spots, followed by biological material predictors in 3rd and 4th.
Eleven of the top 20 predictors are manufacturing process predictors.
# Names of the 20 predictors with the highest importance scores, ordered from
# most to least important.
varuse <- varImp(enetTune)$importance %>%
  arrange(desc(Overall)) %>%
  rownames() %>%
  .[1:20]
varuse
## [1] "ManufacturingProcess13" "ManufacturingProcess32" "BiologicalMaterial03"
## [4] "BiologicalMaterial06" "ManufacturingProcess17" "BiologicalMaterial12"
## [7] "ManufacturingProcess09" "ManufacturingProcess36" "ManufacturingProcess06"
## [10] "BiologicalMaterial02" "ManufacturingProcess11" "BiologicalMaterial11"
## [13] "ManufacturingProcess31" "BiologicalMaterial04" "BiologicalMaterial09"
## [16] "ManufacturingProcess18" "ManufacturingProcess30" "BiologicalMaterial08"
## [19] "BiologicalMaterial01" "ManufacturingProcess33"
# Append the observed yield to the preprocessed predictors. Because the column
# is passed as an expression, it ends up named
# "ChemicalManufacturingProcess$Yield".
use1 <- pred3 %>% cbind(ChemicalManufacturingProcess$Yield)
#m <- enetTune %>% select(rn) %>% cbind(ChemicalManufacturingProcess$Yield)

# Correlation between yield and each of the 20 most important predictors,
# printed in importance order. Looping over `varuse` replaces 20 copy-pasted
# cor() calls and keeps this list tied to the importance ranking.
yield_col <- "ChemicalManufacturingProcess$Yield"
for (predictor in varuse) {
  yield_r <- cor(use1[[predictor]], use1[[yield_col]])
  cat(predictor, ": ", format(yield_r), "\n", sep = "")
}
## ManufacturingProcess13: -0.5036797
## ManufacturingProcess32: 0.6083321
## BiologicalMaterial03: 0.445086
## BiologicalMaterial06: 0.4781634
## ManufacturingProcess17: -0.4258069
## BiologicalMaterial12: 0.3674976
## ManufacturingProcess09: 0.5034705
## ManufacturingProcess36: -0.5237389
## ManufacturingProcess06: 0.3918329
## BiologicalMaterial02: 0.4815158
## ManufacturingProcess11: 0.3525799
## BiologicalMaterial11: 0.3549143
## ManufacturingProcess31: -0.07069156
## BiologicalMaterial04: 0.3798401
## BiologicalMaterial09: 0.09203649
## ManufacturingProcess18: -0.05892925
## ManufacturingProcess30: 0.2304898
## BiologicalMaterial08: 0.3809402
## BiologicalMaterial01: 0.358938
## ManufacturingProcess33: 0.4249171
## [1] 0.4249171
The correlation does not adjust for other variables. This information is helpful for knowing how one variable is correlated with another when no other variables are in the mix.
This could help in future decision making.