# install.packages("RANN")
6.2. Developing a model to predict permeability (see Sect. 1.4) could save significant resources for a pharmaceutical company, while at the same time more rapidly identifying molecules that have a sufficient permeability to become a drug: (a) Start R and use these commands to load the data:
library(AppliedPredictiveModeling)
## Warning: package 'AppliedPredictiveModeling' was built under R version 4.1.3
# Load the permeability data from AppliedPredictiveModeling and preview the
# first six rows of the response.
data("permeability", package = "AppliedPredictiveModeling")
head(permeability, n = 6L)
## permeability
## 1 12.520
## 2 1.120
## 3 19.405
## 4 1.730
## 5 1.680
## 6 0.510
The matrix fingerprints contains the 1,107 binary molecular predictors for the 165 compounds, while permeability contains the permeability response. (b) The fingerprint predictors indicate the presence or absence of substructures of a molecule and are often sparse, meaning that relatively few of the molecules contain each substructure. Filter out the predictors that have low frequencies using the nearZeroVar function from the caret package. How many predictors are left for modeling?
# Column positions of the near-zero-variance fingerprint predictors.
colus <- nearZeroVar(fingerprints)

# Drop the flagged columns. The guard matters: when nothing is flagged `colus`
# is integer(0), and `x[, -integer(0)]` selects *zero* columns instead of
# keeping them all. `drop = FALSE` keeps the result a matrix even if only one
# column survives the filter.
dataframe1 <- if (length(colus) > 0) {
  fingerprints[, -colus, drop = FALSE]
} else {
  fingerprints
}

# Number of compounds (rows) and predictors left for modelling (columns).
dim(dataframe1)
## [1] 165 388
# Fix the RNG seed so the split and the resampling below are repeatable.
set.seed(888)

# Centre and scale every retained fingerprint column.
# NOTE(review): the centring/scaling statistics are estimated on all rows,
# i.e. before the train/test split below -- confirm this is intended.
transformed <- preProcess(dataframe1, method = c("center", "scale"))
dataframe1 <- predict(transformed, dataframe1)

# Centred and scaled copy of the response.
# NOTE(review): `y` is not referenced again in this section; the responses
# used for fitting and testing below come from the raw `permeability` matrix.
ytransf <- preProcess(permeability, method = c("center", "scale"))
y <- predict(ytransf, permeability)

# Train/test flags from sample.split() with SplitRatio = 0.75; rows flagged
# TRUE form the training set.
# NOTE(review): the variable name `sample` shadows base::sample as an object.
sample <- sample.split(permeability, SplitRatio = 0.75)
X_train <- subset(dataframe1, sample)
X_test <- subset(dataframe1, !sample)

# Pick the matching responses by row name so predictors and response stay
# aligned.
X_train_i <- rownames(X_train)
X_test_i <- rownames(X_test)
y_train <- permeability[X_train_i, ]
y_test <- permeability[X_test_i, ]

# Tune a partial least squares model; tuneLength = 100 evaluates
# ncomp = 1, ..., 100 (see the results table printed below).
pFIT <- train(
  x = X_train,
  y = y_train,
  method = "pls",
  tuneLength = 100
)

# Plot the tuning results.
plot(pFIT)

# head() on the fitted object prints its first six components
# (method, modelInfo, modelType, results, pred, bestTune).
head(pFIT)
## $method
## [1] "pls"
##
## $modelInfo
## $modelInfo$label
## [1] "Partial Least Squares"
##
## $modelInfo$library
## [1] "pls"
##
## $modelInfo$type
## [1] "Regression" "Classification"
##
## $modelInfo$parameters
## parameter class label
## 1 ncomp numeric #Components
##
## $modelInfo$grid
## function (x, y, len = NULL, search = "grid")
## {
## if (search == "grid") {
## out <- data.frame(ncomp = seq(1, min(ncol(x) - 1, len),
## by = 1))
## }
## else {
## out <- data.frame(ncomp = unique(sample(1:ncol(x), replace = TRUE)))
## }
## out
## }
##
## $modelInfo$loop
## function (grid)
## {
## grid <- grid[order(grid$ncomp, decreasing = TRUE), , drop = FALSE]
## loop <- grid[1, , drop = FALSE]
## submodels <- list(grid[-1, , drop = FALSE])
## list(loop = loop, submodels = submodels)
## }
##
## $modelInfo$fit
## function (x, y, wts, param, lev, last, classProbs, ...)
## {
## ncomp <- min(ncol(x), param$ncomp)
## out <- if (is.factor(y)) {
## plsda(x, y, method = "oscorespls", ncomp = ncomp, ...)
## }
## else {
## dat <- if (is.data.frame(x))
## x
## else as.data.frame(x, stringsAsFactors = TRUE)
## dat$.outcome <- y
## pls::plsr(.outcome ~ ., data = dat, method = "oscorespls",
## ncomp = ncomp, ...)
## }
## out
## }
## <bytecode: 0x00000000296e5788>
##
## $modelInfo$predict
## function (modelFit, newdata, submodels = NULL)
## {
## out <- if (modelFit$problemType == "Classification") {
## if (!is.matrix(newdata))
## newdata <- as.matrix(newdata)
## out <- predict(modelFit, newdata, type = "class")
## }
## else as.vector(pls:::predict.mvr(modelFit, newdata, ncomp = max(modelFit$ncomp)))
## if (!is.null(submodels)) {
## tmp <- vector(mode = "list", length = nrow(submodels))
## if (modelFit$problemType == "Classification") {
## if (length(submodels$ncomp) > 1) {
## tmp <- as.list(predict(modelFit, newdata, ncomp = submodels$ncomp))
## }
## else tmp <- list(predict(modelFit, newdata, ncomp = submodels$ncomp))
## }
## else {
## tmp <- as.list(as.data.frame(apply(predict(modelFit,
## newdata, ncomp = submodels$ncomp), 3, function(x) list(x))))
## }
## out <- c(list(out), tmp)
## }
## out
## }
## <bytecode: 0x0000000026026fd0>
##
## $modelInfo$prob
## function (modelFit, newdata, submodels = NULL)
## {
## if (!is.matrix(newdata))
## newdata <- as.matrix(newdata)
## out <- predict(modelFit, newdata, type = "prob", ncomp = modelFit$tuneValue$ncomp)
## if (length(dim(out)) == 3) {
## if (dim(out)[1] > 1) {
## out <- out[, , 1]
## }
## else {
## out <- as.data.frame(t(out[, , 1]), stringsAsFactors = TRUE)
## }
## }
## if (!is.null(submodels)) {
## tmp <- vector(mode = "list", length = nrow(submodels) +
## 1)
## tmp[[1]] <- out
## for (j in seq(along = submodels$ncomp)) {
## tmpProb <- predict(modelFit, newdata, type = "prob",
## ncomp = submodels$ncomp[j])
## if (length(dim(tmpProb)) == 3) {
## if (dim(tmpProb)[1] > 1) {
## tmpProb <- tmpProb[, , 1]
## }
## else {
## tmpProb <- as.data.frame(t(tmpProb[, , 1]),
## stringsAsFactors = TRUE)
## }
## }
## tmp[[j + 1]] <- as.data.frame(tmpProb[, modelFit$obsLevels])
## }
## out <- tmp
## }
## out
## }
##
## $modelInfo$varImp
## function (object, estimate = NULL, ...)
## {
## library(pls)
## modelCoef <- coef(object, intercept = FALSE, comps = 1:object$ncomp)
## perf <- pls:::MSEP.mvr(object)$val
## nms <- dimnames(perf)
## if (length(nms$estimate) > 1) {
## pIndex <- if (is.null(estimate))
## 1
## else which(nms$estimate == estimate)
## perf <- perf[pIndex, , , drop = FALSE]
## }
## numResp <- dim(modelCoef)[2]
## if (numResp <= 2) {
## modelCoef <- modelCoef[, 1, , drop = FALSE]
## perf <- perf[, 1, ]
## delta <- -diff(perf)
## delta <- delta/sum(delta)
## out <- data.frame(Overall = apply(abs(modelCoef), 1,
## weighted.mean, w = delta))
## }
## else {
## if (dim(perf)[3] <= 2) {
## perf <- -t(t(apply(perf[1, , ], 1, diff)))
## perf <- t(t(apply(perf, 1, function(u) u/sum(u))))
## }
## else {
## perf <- -t(apply(perf[1, , ], 1, diff))
## perf <- t(apply(perf, 1, function(u) u/sum(u)))
## }
## out <- matrix(NA, ncol = numResp, nrow = dim(modelCoef)[1])
## for (i in 1:numResp) {
## tmp <- abs(modelCoef[, i, , drop = FALSE])
## out[, i] <- apply(tmp, 1, weighted.mean, w = perf[i,
## ])
## }
## colnames(out) <- dimnames(modelCoef)[[2]]
## rownames(out) <- dimnames(modelCoef)[[1]]
## }
## as.data.frame(out, stringsAsFactors = TRUE)
## }
##
## $modelInfo$predictors
## function (x, ...)
## rownames(x$projection)
##
## $modelInfo$levels
## function (x)
## x$obsLevels
##
## $modelInfo$tags
## [1] "Partial Least Squares" "Feature Extraction" "Linear Classifier"
## [4] "Linear Regression"
##
## $modelInfo$sort
## function (x)
## x[order(x[, 1]), ]
##
##
## $modelType
## [1] "Regression"
##
## $results
## ncomp RMSE Rsquared MAE RMSESD RsquaredSD
## 1 1 1.366139e+01 0.25131059 1.001715e+01 1.494556e+00 0.13856158
## 2 2 1.301676e+01 0.32663942 9.120971e+00 1.343893e+00 0.12657074
## 3 3 1.315546e+01 0.32723696 9.464251e+00 1.297240e+00 0.10636635
## 4 4 1.322623e+01 0.32672881 9.469037e+00 1.483368e+00 0.10936973
## 5 5 1.322792e+01 0.33500488 9.371653e+00 1.540210e+00 0.12060956
## 6 6 1.319335e+01 0.34593758 9.426308e+00 1.723772e+00 0.12791854
## 7 7 1.313151e+01 0.35521759 9.415516e+00 1.693123e+00 0.11968673
## 8 8 1.327325e+01 0.35227560 9.546441e+00 1.738245e+00 0.12003325
## 9 9 1.332618e+01 0.35671783 9.581289e+00 1.856224e+00 0.12313457
## 10 10 1.344960e+01 0.35667302 9.712811e+00 1.785315e+00 0.12351388
## 11 11 1.362459e+01 0.35284820 9.832980e+00 1.777208e+00 0.12267584
## 12 12 1.380111e+01 0.34790630 9.963278e+00 1.756458e+00 0.12495588
## 13 13 1.414861e+01 0.33461517 1.017093e+01 1.872781e+00 0.13084999
## 14 14 1.443927e+01 0.32540147 1.038425e+01 1.961534e+00 0.13155166
## 15 15 1.468080e+01 0.32149650 1.061372e+01 1.961063e+00 0.13080418
## 16 16 1.502717e+01 0.31012656 1.085969e+01 2.011367e+00 0.13118795
## 17 17 1.520971e+01 0.30374857 1.100863e+01 1.929629e+00 0.12387258
## 18 18 1.539022e+01 0.29859394 1.114569e+01 1.875672e+00 0.12185028
## 19 19 1.554127e+01 0.29304452 1.132839e+01 1.883619e+00 0.12292268
## 20 20 1.565734e+01 0.28978691 1.139249e+01 1.847385e+00 0.12050633
## 21 21 1.584247e+01 0.28273409 1.154598e+01 1.804376e+00 0.11511195
## 22 22 1.603669e+01 0.27857529 1.166604e+01 1.772259e+00 0.11145302
## 23 23 1.631657e+01 0.27086895 1.182542e+01 1.790726e+00 0.10919948
## 24 24 1.653329e+01 0.26559656 1.200488e+01 1.836196e+00 0.10869744
## 25 25 1.683476e+01 0.25912724 1.223108e+01 1.889293e+00 0.10707886
## 26 26 1.712511e+01 0.25249645 1.241565e+01 2.043840e+00 0.10621206
## 27 27 1.740468e+01 0.24582064 1.259348e+01 2.164216e+00 0.10394466
## 28 28 1.763806e+01 0.24185847 1.275817e+01 2.267230e+00 0.10449976
## 29 29 1.785595e+01 0.23674687 1.288404e+01 2.314394e+00 0.10280187
## 30 30 1.805814e+01 0.23247542 1.299872e+01 2.358688e+00 0.10126963
## 31 31 1.828494e+01 0.22702663 1.313915e+01 2.413140e+00 0.09939894
## 32 32 1.847292e+01 0.22276482 1.323848e+01 2.460724e+00 0.09794891
## 33 33 1.869244e+01 0.21762803 1.338230e+01 2.568145e+00 0.09827529
## 34 34 1.894520e+01 0.21181188 1.354918e+01 2.720023e+00 0.09762345
## 35 35 1.919961e+01 0.20591380 1.371123e+01 2.841790e+00 0.09611718
## 36 36 1.941875e+01 0.20096655 1.385223e+01 2.973174e+00 0.09512574
## 37 37 1.963968e+01 0.19653400 1.400003e+01 3.149265e+00 0.09512594
## 38 38 1.992651e+01 0.19126114 1.418800e+01 3.368589e+00 0.09646213
## 39 39 2.019549e+01 0.18731589 1.437575e+01 3.529225e+00 0.09822148
## 40 40 2.041501e+01 0.18425731 1.452298e+01 3.703418e+00 0.09901582
## 41 41 2.061244e+01 0.18158844 1.467149e+01 3.909549e+00 0.10055614
## 42 42 2.084574e+01 0.17817568 1.483795e+01 4.171014e+00 0.10220347
## 43 43 2.100213e+01 0.17606839 1.494276e+01 4.355455e+00 0.10375766
## 44 44 2.113925e+01 0.17450296 1.504124e+01 4.541700e+00 0.10502651
## 45 45 2.123975e+01 0.17281261 1.511255e+01 4.634345e+00 0.10644041
## 46 46 2.130317e+01 0.17190541 1.516064e+01 4.727032e+00 0.10686872
## 47 47 2.137245e+01 0.17146440 1.521041e+01 4.761278e+00 0.10766294
## 48 48 2.146030e+01 0.17090272 1.525973e+01 4.890932e+00 0.10873171
## 49 49 2.153340e+01 0.17024502 1.530390e+01 4.984644e+00 0.10970741
## 50 50 2.159339e+01 0.16961161 1.534679e+01 5.053186e+00 0.10990536
## 51 51 2.164087e+01 0.16928906 1.538644e+01 5.093831e+00 0.11001924
## 52 52 2.168194e+01 0.16924906 1.542193e+01 5.120292e+00 0.11047438
## 53 53 2.171008e+01 0.16962416 1.544847e+01 5.143950e+00 0.11096440
## 54 54 2.173052e+01 0.16974311 1.546427e+01 5.158114e+00 0.11118934
## 55 55 2.175165e+01 0.16962344 1.547327e+01 5.164727e+00 0.11127212
## 56 56 2.177206e+01 0.16969945 1.548721e+01 5.193989e+00 0.11163018
## 57 57 2.178573e+01 0.16966911 1.550240e+01 5.201556e+00 0.11181249
## 58 58 2.180157e+01 0.16953271 1.552101e+01 5.196648e+00 0.11184174
## 59 59 2.181326e+01 0.16938214 1.553564e+01 5.199180e+00 0.11193532
## 60 60 2.181678e+01 0.16938591 1.553802e+01 5.193087e+00 0.11214021
## 61 61 2.181113e+01 0.16980613 1.552823e+01 5.194674e+00 0.11282318
## 62 62 2.180992e+01 0.16992417 1.552661e+01 5.196026e+00 0.11312259
## 63 63 3.950852e+13 0.16612040 3.238785e+13 1.975426e+14 0.11710398
## 64 64 1.408718e+14 0.16261956 1.134091e+14 4.905484e+14 0.12078581
## 65 65 2.825276e+14 0.16139984 2.305011e+14 6.852701e+14 0.12257050
## 66 66 6.015004e+14 0.15260080 4.832592e+14 1.046951e+15 0.12349060
## 67 67 9.401741e+14 0.13182138 7.591330e+14 1.352257e+15 0.12404405
## 68 68 1.280338e+15 0.10681486 1.014845e+15 1.634759e+15 0.12125819
## 69 69 1.637432e+15 0.09643310 1.324474e+15 1.817861e+15 0.12150837
## 70 70 1.941094e+15 0.07590977 1.551880e+15 1.931397e+15 0.10201507
## 71 71 2.325233e+15 0.06126394 1.854832e+15 1.922697e+15 0.09518010
## 72 72 2.698351e+15 0.04177078 2.149663e+15 1.941660e+15 0.05791162
## 73 73 3.040729e+15 0.04777799 2.440140e+15 2.111857e+15 0.07035465
## 74 74 3.339466e+15 0.03852203 2.683187e+15 2.218786e+15 0.06824209
## 75 75 3.552012e+15 0.03630920 2.840604e+15 2.282257e+15 0.07463548
## 76 76 3.728098e+15 0.03457849 2.979864e+15 2.368975e+15 0.07930430
## 77 77 3.864808e+15 0.03356840 3.096955e+15 2.461975e+15 0.07930738
## 78 78 3.958098e+15 0.03280492 3.176569e+15 2.511265e+15 0.07720315
## 79 79 4.050290e+15 0.03232299 3.257448e+15 2.603163e+15 0.07627755
## 80 80 4.131895e+15 0.03256417 3.330901e+15 2.673289e+15 0.07542778
## 81 81 4.199134e+15 0.03274999 3.391076e+15 2.741234e+15 0.07469742
## 82 82 4.273011e+15 0.03340186 3.450389e+15 2.815742e+15 0.07459216
## 83 83 4.326022e+15 0.03385092 3.494313e+15 2.873041e+15 0.07456767
## 84 84 4.367699e+15 0.03505334 3.530284e+15 2.919662e+15 0.07480580
## 85 85 4.399072e+15 0.03600681 3.554023e+15 2.946626e+15 0.07492630
## 86 86 4.432952e+15 0.03635510 3.581078e+15 2.977731e+15 0.07513522
## 87 87 4.464412e+15 0.03633696 3.600213e+15 3.016430e+15 0.07502430
## 88 88 4.491555e+15 0.03616316 3.619503e+15 3.047404e+15 0.07477347
## 89 89 4.514463e+15 0.03589927 3.636105e+15 3.072982e+15 0.07437338
## 90 90 4.534445e+15 0.03560313 3.649539e+15 3.096074e+15 0.07380011
## 91 91 4.551703e+15 0.03537793 3.662604e+15 3.110506e+15 0.07355336
## 92 92 4.570826e+15 0.03509556 3.677844e+15 3.125654e+15 0.07295716
## 93 93 4.591474e+15 0.03479295 3.694877e+15 3.146871e+15 0.07277954
## 94 94 4.610679e+15 0.03452206 3.710204e+15 3.168146e+15 0.07232652
## 95 95 4.622197e+15 0.03427336 3.719372e+15 3.178051e+15 0.07214005
## 96 96 4.634026e+15 0.03414774 3.729064e+15 3.191355e+15 0.07182174
## 97 97 4.642404e+15 0.03397280 3.737756e+15 3.203634e+15 0.07128266
## 98 98 4.655040e+15 0.03401845 3.748953e+15 3.223080e+15 0.07107886
## 99 99 4.661766e+15 0.03395983 3.754091e+15 3.234085e+15 0.07062621
## 100 100 4.667819e+15 0.03388396 3.760129e+15 3.242195e+15 0.07032378
## MAESD
## 1 1.210441e+00
## 2 9.855809e-01
## 3 9.933712e-01
## 4 1.160809e+00
## 5 1.244921e+00
## 6 1.385141e+00
## 7 1.344532e+00
## 8 1.437491e+00
## 9 1.480055e+00
## 10 1.461433e+00
## 11 1.405062e+00
## 12 1.345016e+00
## 13 1.386543e+00
## 14 1.418501e+00
## 15 1.459707e+00
## 16 1.531334e+00
## 17 1.548394e+00
## 18 1.522068e+00
## 19 1.532515e+00
## 20 1.508873e+00
## 21 1.452002e+00
## 22 1.390460e+00
## 23 1.311491e+00
## 24 1.265230e+00
## 25 1.233505e+00
## 26 1.291823e+00
## 27 1.334802e+00
## 28 1.388839e+00
## 29 1.408025e+00
## 30 1.440459e+00
## 31 1.453951e+00
## 32 1.509110e+00
## 33 1.594725e+00
## 34 1.671398e+00
## 35 1.754682e+00
## 36 1.862277e+00
## 37 1.978835e+00
## 38 2.159814e+00
## 39 2.301606e+00
## 40 2.431969e+00
## 41 2.540304e+00
## 42 2.694317e+00
## 43 2.781752e+00
## 44 2.876293e+00
## 45 2.928887e+00
## 46 2.979674e+00
## 47 2.988302e+00
## 48 3.056781e+00
## 49 3.121854e+00
## 50 3.172625e+00
## 51 3.212660e+00
## 52 3.228589e+00
## 53 3.257600e+00
## 54 3.271707e+00
## 55 3.277580e+00
## 56 3.289953e+00
## 57 3.286989e+00
## 58 3.278409e+00
## 59 3.276193e+00
## 60 3.270056e+00
## 61 3.266269e+00
## 62 3.265899e+00
## 63 1.619392e+14
## 64 3.936432e+14
## 65 5.547202e+14
## 66 8.371535e+14
## 67 1.083805e+15
## 68 1.276154e+15
## 69 1.422585e+15
## 70 1.497481e+15
## 71 1.482608e+15
## 72 1.473365e+15
## 73 1.641696e+15
## 74 1.734798e+15
## 75 1.770011e+15
## 76 1.838143e+15
## 77 1.920349e+15
## 78 1.964031e+15
## 79 2.050760e+15
## 80 2.114280e+15
## 81 2.173792e+15
## 82 2.235384e+15
## 83 2.280749e+15
## 84 2.323960e+15
## 85 2.346615e+15
## 86 2.372753e+15
## 87 2.395794e+15
## 88 2.418820e+15
## 89 2.440534e+15
## 90 2.457077e+15
## 91 2.468810e+15
## 92 2.483448e+15
## 93 2.503684e+15
## 94 2.521401e+15
## 95 2.532181e+15
## 96 2.545169e+15
## 97 2.558518e+15
## 98 2.574906e+15
## 99 2.583503e+15
## 100 2.591840e+15
##
## $pred
## NULL
##
## $bestTune
## ncomp
## 2 2
# Predict the held-out compounds with the tuned PLS model, pair the
# predictions with the observed permeability, and report RMSE / R-squared / MAE.
lmPredict <- predict(pFIT, newdata = X_test)
lmpredVal1 <- data.frame(
  obs = y_test,
  pred = lmPredict
)
defaultSummary(lmpredVal1)
## RMSE Rsquared MAE
## 11.8071806 0.4945307 8.2450456
# Elastic net tuning grid: three ridge penalties (lambda) crossed with twenty
# evenly spaced values of the fraction parameter between 0.05 and 1.
# `length.out` is spelled out rather than relying on partial matching of
# `length`.
eNGrid <- expand.grid(
  .lambda = c(0, 0.01, 0.1),
  .fraction = seq(0.05, 1, length.out = 20)
)

# Same predictors/response as the PLS fit, with a fixed seed for resampling.
set.seed(999)

# Warnings are muffled for this one call only. The previous
# `options(warn = -1)` switched warnings off for the rest of the session,
# which would also hide problems in every later step of the script.
eTune <- suppressWarnings(
  train(
    X_train, y_train,
    method = "enet",
    tuneGrid = eNGrid
  )
)

# Test-set predictions and accuracy summary for the elastic net model.
lmPRED2 <- predict(eTune, X_test)
lmVal2 <- data.frame(obs = y_test, pred = lmPRED2)
defaultSummary(lmVal2)
## RMSE Rsquared MAE
## 11.4344891 0.5748558 7.9721040
Yes. On the held-out test set the elastic net model outperformed the PLS model: RMSE 11.43 vs. 11.81, R-squared 0.575 vs. 0.495, and MAE 7.97 vs. 8.25.
6.3. A chemical manufacturing process for a pharmaceutical product was discussed in Sect. 1.4. In this problem, the objective is to understand the relationship between biological measurements of the raw materials (predictors), measurements of the manufacturing process (predictors), and the response of product yield. Biological predictors cannot be changed but can be used to assess the quality of the raw material before processing. On the other hand, manufacturing process predictors can be changed in the manufacturing process. Improving product yield by 1% will boost revenue by approximately one hundred thousand dollars per batch: (a) Start R and use these commands to load the data:
library(AppliedPredictiveModeling)
# Load the chemical manufacturing data from AppliedPredictiveModeling and
# preview the first six runs (Yield plus the biological and process columns).
data("ChemicalManufacturingProcess", package = "AppliedPredictiveModeling")
head(ChemicalManufacturingProcess, n = 6L)
## Yield BiologicalMaterial01 BiologicalMaterial02 BiologicalMaterial03
## 1 38.00 6.25 49.58 56.97
## 2 42.44 8.01 60.97 67.48
## 3 42.03 8.01 60.97 67.48
## 4 41.42 8.01 60.97 67.48
## 5 42.49 7.47 63.33 72.25
## 6 43.57 6.12 58.36 65.31
## BiologicalMaterial04 BiologicalMaterial05 BiologicalMaterial06
## 1 12.74 19.51 43.73
## 2 14.65 19.36 53.14
## 3 14.65 19.36 53.14
## 4 14.65 19.36 53.14
## 5 14.02 17.91 54.66
## 6 15.17 21.79 51.23
## BiologicalMaterial07 BiologicalMaterial08 BiologicalMaterial09
## 1 100 16.66 11.44
## 2 100 19.04 12.55
## 3 100 19.04 12.55
## 4 100 19.04 12.55
## 5 100 18.22 12.80
## 6 100 18.30 12.13
## BiologicalMaterial10 BiologicalMaterial11 BiologicalMaterial12
## 1 3.46 138.09 18.83
## 2 3.46 153.67 21.05
## 3 3.46 153.67 21.05
## 4 3.46 153.67 21.05
## 5 3.05 147.61 21.05
## 6 3.78 151.88 20.76
## ManufacturingProcess01 ManufacturingProcess02 ManufacturingProcess03
## 1 NA NA NA
## 2 0.0 0 NA
## 3 0.0 0 NA
## 4 0.0 0 NA
## 5 10.7 0 NA
## 6 12.0 0 NA
## ManufacturingProcess04 ManufacturingProcess05 ManufacturingProcess06
## 1 NA NA NA
## 2 917 1032.2 210.0
## 3 912 1003.6 207.1
## 4 911 1014.6 213.3
## 5 918 1027.5 205.7
## 6 924 1016.8 208.9
## ManufacturingProcess07 ManufacturingProcess08 ManufacturingProcess09
## 1 NA NA 43.00
## 2 177 178 46.57
## 3 178 178 45.07
## 4 177 177 44.92
## 5 178 178 44.96
## 6 178 178 45.32
## ManufacturingProcess10 ManufacturingProcess11 ManufacturingProcess12
## 1 NA NA NA
## 2 NA NA 0
## 3 NA NA 0
## 4 NA NA 0
## 5 NA NA 0
## 6 NA NA 0
## ManufacturingProcess13 ManufacturingProcess14 ManufacturingProcess15
## 1 35.5 4898 6108
## 2 34.0 4869 6095
## 3 34.8 4878 6087
## 4 34.8 4897 6102
## 5 34.6 4992 6233
## 6 34.0 4985 6222
## ManufacturingProcess16 ManufacturingProcess17 ManufacturingProcess18
## 1 4682 35.5 4865
## 2 4617 34.0 4867
## 3 4617 34.8 4877
## 4 4635 34.8 4872
## 5 4733 33.9 4886
## 6 4786 33.4 4862
## ManufacturingProcess19 ManufacturingProcess20 ManufacturingProcess21
## 1 6049 4665 0.0
## 2 6097 4621 0.0
## 3 6078 4621 0.0
## 4 6073 4611 0.0
## 5 6102 4659 -0.7
## 6 6115 4696 -0.6
## ManufacturingProcess22 ManufacturingProcess23 ManufacturingProcess24
## 1 NA NA NA
## 2 3 0 3
## 3 4 1 4
## 4 5 2 5
## 5 8 4 18
## 6 9 1 1
## ManufacturingProcess25 ManufacturingProcess26 ManufacturingProcess27
## 1 4873 6074 4685
## 2 4869 6107 4630
## 3 4897 6116 4637
## 4 4892 6111 4630
## 5 4930 6151 4684
## 6 4871 6128 4687
## ManufacturingProcess28 ManufacturingProcess29 ManufacturingProcess30
## 1 10.7 21.0 9.9
## 2 11.2 21.4 9.9
## 3 11.1 21.3 9.4
## 4 11.1 21.3 9.4
## 5 11.3 21.6 9.0
## 6 11.4 21.7 10.1
## ManufacturingProcess31 ManufacturingProcess32 ManufacturingProcess33
## 1 69.1 156 66
## 2 68.7 169 66
## 3 69.3 173 66
## 4 69.3 171 68
## 5 69.4 171 70
## 6 68.2 173 70
## ManufacturingProcess34 ManufacturingProcess35 ManufacturingProcess36
## 1 2.4 486 0.019
## 2 2.6 508 0.019
## 3 2.6 509 0.018
## 4 2.5 496 0.018
## 5 2.5 468 0.017
## 6 2.5 490 0.018
## ManufacturingProcess37 ManufacturingProcess38 ManufacturingProcess39
## 1 0.5 3 7.2
## 2 2.0 2 7.2
## 3 0.7 2 7.2
## 4 1.2 2 7.2
## 5 0.2 2 7.3
## 6 0.4 2 7.2
## ManufacturingProcess40 ManufacturingProcess41 ManufacturingProcess42
## 1 NA NA 11.6
## 2 0.1 0.15 11.1
## 3 0.0 0.00 12.0
## 4 0.0 0.00 10.6
## 5 0.0 0.00 11.0
## 6 0.0 0.00 11.5
## ManufacturingProcess43 ManufacturingProcess44 ManufacturingProcess45
## 1 3.0 1.8 2.4
## 2 0.9 1.9 2.2
## 3 1.0 1.8 2.3
## 4 1.1 1.8 2.1
## 5 1.1 1.7 2.1
## 6 2.2 1.8 2.0
The matrix processPredictors contains the 57 predictors (12 describing the input biological material and 45 describing the process predictors) for the 176 manufacturing runs. yield contains the percent yield for each run.
# Plot the missingness map of the raw data.
missmap(ChemicalManufacturingProcess)
# Impute the missing values with k-nearest neighbours (knnImpute); the recipe
# also centres and scales the columns, as its printed summary shows.
# NOTE(review): the recipe is estimated on all 58 columns, Yield included, so
# the response is standardised and takes part in the imputation -- confirm
# this is intended.
CHEM_knn_impute <- preProcess(ChemicalManufacturingProcess, method = "knnImpute")
CHEM_knn_impute
## Created from 152 samples and 58 variables
##
## Pre-processing:
## - centered (58)
## - ignored (0)
## - 5 nearest neighbor imputation (58)
## - scaled (58)
# Apply the recipe to obtain the imputed, standardised data frame, then check
# the per-column summaries.
CHEM_dataframe <- predict(CHEM_knn_impute, newdata = ChemicalManufacturingProcess)
summary(CHEM_dataframe)
## Yield BiologicalMaterial01 BiologicalMaterial02
## Min. :-2.6692 Min. :-2.5653 Min. :-2.1858
## 1st Qu.:-0.7716 1st Qu.:-0.6078 1st Qu.:-0.7457
## Median :-0.1119 Median :-0.1491 Median :-0.1484
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.7035 3rd Qu.: 0.6423 3rd Qu.: 0.7557
## Max. : 3.3394 Max. : 3.3597 Max. : 2.2459
## BiologicalMaterial03 BiologicalMaterial04 BiologicalMaterial05
## Min. :-2.6830 Min. :-1.6731 Min. :-2.90576
## 1st Qu.:-0.6811 1st Qu.:-0.6222 1st Qu.:-0.73944
## Median :-0.1212 Median :-0.1405 Median :-0.05891
## Mean : 0.0000 Mean : 0.0000 Mean : 0.00000
## 3rd Qu.: 0.6804 3rd Qu.: 0.4907 3rd Qu.: 0.70568
## Max. : 2.6355 Max. : 6.0523 Max. : 3.38985
## BiologicalMaterial06 BiologicalMaterial07 BiologicalMaterial08
## Min. :-2.2184 Min. :-0.1313 Min. :-2.38535
## 1st Qu.:-0.7622 1st Qu.:-0.1313 1st Qu.:-0.64225
## Median :-0.1202 Median :-0.1313 Median : 0.02249
## Mean : 0.0000 Mean : 0.0000 Mean : 0.00000
## 3rd Qu.: 0.6499 3rd Qu.:-0.1313 3rd Qu.: 0.56906
## Max. : 2.7948 Max. : 7.5723 Max. : 2.43034
## BiologicalMaterial09 BiologicalMaterial10 BiologicalMaterial11
## Min. :-3.39629 Min. :-1.7202 Min. :-2.3116
## 1st Qu.:-0.59627 1st Qu.:-0.5685 1st Qu.:-0.6505
## Median :-0.03627 Median :-0.1513 Median :-0.1811
## Mean : 0.00000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.67428 3rd Qu.: 0.3161 3rd Qu.: 0.5491
## Max. : 2.96246 Max. : 6.7920 Max. : 2.4431
## BiologicalMaterial12 ManufacturingProcess01 ManufacturingProcess02
## Min. :-2.3914 Min. :-6.149703 Min. :-1.969253
## 1st Qu.:-0.6074 1st Qu.:-0.223563 1st Qu.: 0.308956
## Median :-0.1033 Median : 0.105667 Median : 0.509627
## Mean : 0.0000 Mean : 0.001224 Mean : 0.009518
## 3rd Qu.: 0.7112 3rd Qu.: 0.503487 3rd Qu.: 0.568648
## Max. : 2.5986 Max. : 1.587202 Max. : 0.686690
## ManufacturingProcess03 ManufacturingProcess04 ManufacturingProcess05
## Min. :-3.10582 Min. :-3.323233 Min. :-2.577803
## 1st Qu.:-0.42705 1st Qu.:-0.613828 1st Qu.:-0.487046
## Median : 0.37658 Median : 0.342432 Median :-0.086583
## Mean : 0.04123 Mean : 0.003213 Mean :-0.002534
## 3rd Qu.: 0.46587 3rd Qu.: 0.661186 3rd Qu.: 0.230347
## Max. : 2.69818 Max. : 2.254953 Max. : 5.686954
## ManufacturingProcess06 ManufacturingProcess07 ManufacturingProcess08
## Min. :-1.630631 Min. :-0.9580199 Min. :-1.111973
## 1st Qu.:-0.630408 1st Qu.:-0.9580199 1st Qu.:-1.111973
## Median :-0.222910 Median :-0.9580199 Median : 0.894164
## Mean :-0.006574 Mean :-0.0009072 Mean :-0.001759
## 3rd Qu.: 0.480950 3rd Qu.: 1.0378549 3rd Qu.: 0.894164
## Max. : 7.408415 Max. : 1.0378549 Max. : 0.894164
## ManufacturingProcess09 ManufacturingProcess10 ManufacturingProcess11
## Min. :-4.37787 Min. :-2.18999 Min. :-2.63442
## 1st Qu.:-0.49799 1st Qu.:-0.62482 1st Qu.:-0.53867
## Median : 0.04519 Median :-0.10310 Median : 0.02020
## Mean : 0.00000 Mean : 0.02156 Mean : 0.03163
## 3rd Qu.: 0.55281 3rd Qu.: 0.54906 3rd Qu.: 0.71878
## Max. : 2.39252 Max. : 3.15768 Max. : 2.95425
## ManufacturingProcess12 ManufacturingProcess13 ManufacturingProcess14
## Min. :-0.480694 Min. :-2.37172 Min. :-2.803712
## 1st Qu.:-0.480694 1st Qu.:-0.59881 1st Qu.:-0.488202
## Median :-0.480694 Median : 0.09066 Median : 0.029921
## Mean :-0.002731 Mean : 0.00000 Mean :-0.004071
## 3rd Qu.:-0.480694 3rd Qu.: 0.68163 3rd Qu.: 0.520534
## Max. : 2.068439 Max. : 4.03046 Max. : 3.688885
## ManufacturingProcess15 ManufacturingProcess16 ManufacturingProcess17
## Min. :-2.3137 Min. :-12.98219 Min. :-2.43850
## 1st Qu.:-0.4960 1st Qu.: -0.01436 1st Qu.:-0.67597
## Median :-0.1273 Median : 0.06312 Median : 0.04507
## Mean : 0.0000 Mean : 0.00000 Mean : 0.00000
## 3rd Qu.: 0.3786 3rd Qu.: 0.15126 3rd Qu.: 0.60587
## Max. : 3.3283 Max. : 0.81376 Max. : 4.53150
## ManufacturingProcess18 ManufacturingProcess19 ManufacturingProcess20
## Min. :-13.08836 Min. :-3.0321 Min. :-13.05542
## 1st Qu.: 0.00903 1st Qu.:-0.6022 1st Qu.: -0.01063
## Median : 0.06890 Median :-0.1360 Median : 0.07318
## Mean : 0.00000 Mean : 0.0000 Mean : 0.00000
## 3rd Qu.: 0.14237 3rd Qu.: 0.4838 3rd Qu.: 0.15197
## Max. : 0.43899 Max. : 2.5846 Max. : 0.58033
## ManufacturingProcess21 ManufacturingProcess22 ManufacturingProcess23
## Min. :-2.1018 Min. :-1.6230324 Min. :-1.814768
## 1st Qu.:-0.5599 1st Qu.:-0.7223009 1st Qu.:-0.611797
## Median :-0.1745 Median :-0.1218132 Median :-0.010311
## Mean : 0.0000 Mean : 0.0003314 Mean : 0.004726
## 3rd Qu.: 0.2110 3rd Qu.: 0.7789183 3rd Qu.: 0.591175
## Max. : 4.8365 Max. : 1.9798937 Max. : 1.794146
## ManufacturingProcess24 ManufacturingProcess25 ManufacturingProcess26
## Min. :-1.523304 Min. :-12.927496 Min. :-12.940454
## 1st Qu.:-0.833580 1st Qu.: 0.002208 1st Qu.: 0.008505
## Median :-0.143857 Median : 0.069146 Median : 0.063251
## Mean : 0.005061 Mean : 0.000105 Mean : 0.000404
## 3rd Qu.: 0.890729 3rd Qu.: 0.128720 3rd Qu.: 0.115417
## Max. : 2.442608 Max. : 0.433287 Max. : 0.312785
## ManufacturingProcess27 ManufacturingProcess28 ManufacturingProcess29
## Min. :-12.888994 Min. :-1.25583 Min. :-12.026718
## 1st Qu.: 0.000681 1st Qu.:-1.25583 1st Qu.: -0.186978
## Median : 0.066362 Median : 0.72551 Median : -0.066778
## Mean : 0.001121 Mean :-0.02868 Mean : -0.003263
## 3rd Qu.: 0.131337 3rd Qu.: 0.78266 3rd Qu.: 0.233723
## Max. : 0.416660 Max. : 0.93507 Max. : 1.195326
## ManufacturingProcess30 ManufacturingProcess31 ManufacturingProcess32
## Min. :-9.38589 Min. :-12.632749 Min. :-2.86552
## 1st Qu.:-0.37026 1st Qu.: -0.015263 1st Qu.:-0.64216
## Median : 0.03954 Median : 0.110732 Median :-0.08632
## Mean : 0.01114 Mean : -0.001722 Mean : 0.00000
## 3rd Qu.: 0.55179 3rd Qu.: 0.218728 3rd Qu.: 0.65480
## Max. : 2.08855 Max. : 0.416720 Max. : 2.69287
## ManufacturingProcess33 ManufacturingProcess34 ManufacturingProcess35
## Min. :-3.037737 Min. :-3.558813 Min. :-3.01270
## 1st Qu.:-0.621676 1st Qu.: 0.118269 1st Qu.:-0.51725
## Median : 0.183677 Median : 0.118269 Median :-0.05513
## Mean :-0.005764 Mean :-0.009176 Mean :-0.02362
## 3rd Qu.: 0.586354 3rd Qu.: 0.118269 3rd Qu.: 0.49941
## Max. : 2.599738 Max. : 1.956810 Max. : 2.44032
## ManufacturingProcess36 ManufacturingProcess37 ManufacturingProcess38
## Min. :-2.944307 Min. :-2.27741 Min. :-3.9024
## 1st Qu.:-0.655777 1st Qu.:-0.70467 1st Qu.:-0.8225
## Median :-0.083645 Median :-0.03064 Median : 0.7175
## Mean :-0.008228 Mean : 0.00000 Mean : 0.0000
## 3rd Qu.: 0.488487 3rd Qu.: 0.64339 3rd Qu.: 0.7175
## Max. : 2.777017 Max. : 2.89017 Max. : 0.7175
## ManufacturingProcess39 ManufacturingProcess40 ManufacturingProcess41
## Min. :-4.5508 Min. :-0.4626528 Min. :-0.440588
## 1st Qu.: 0.1653 1st Qu.:-0.4626528 1st Qu.:-0.440588
## Median : 0.2317 Median :-0.4626528 Median :-0.440588
## Mean : 0.0000 Mean : 0.0003392 Mean :-0.000392
## 3rd Qu.: 0.2982 3rd Qu.:-0.4626528 3rd Qu.:-0.440588
## Max. : 0.4310 Max. : 2.1490969 Max. : 3.275213
## ManufacturingProcess42 ManufacturingProcess43 ManufacturingProcess44
## Min. :-5.77163 Min. :-1.0506 Min. :-5.60583
## 1st Qu.: 0.09979 1st Qu.:-0.3594 1st Qu.:-0.01588
## Median : 0.20280 Median :-0.1290 Median : 0.29467
## Mean : 0.00000 Mean : 0.0000 Mean : 0.00000
## 3rd Qu.: 0.25430 3rd Qu.: 0.1303 3rd Qu.: 0.29467
## Max. : 0.46031 Max. :11.6224 Max. : 0.91578
## ManufacturingProcess45
## Min. :-5.25447
## 1st Qu.:-0.09356
## Median : 0.15220
## Mean : 0.00000
## 3rd Qu.: 0.39796
## Max. : 1.13523
# Size of the imputed data before filtering.
dim(CHEM_dataframe)
## [1] 176 58
# Remove near-zero-variance columns. The index is computed first and checked:
# if nothing were flagged, `df[, -integer(0)]` would return a data frame with
# *no* columns rather than leaving the data untouched. `drop = FALSE` keeps
# the result a data frame.
nzv_cols <- nearZeroVar(CHEM_dataframe)
CHEM_dataframe2 <- if (length(nzv_cols) > 0) {
  CHEM_dataframe[, -nzv_cols, drop = FALSE]
} else {
  CHEM_dataframe
}
# Size after filtering.
dim(CHEM_dataframe2)
## [1] 176 57
# Fix the RNG seed so the partition and the CV folds are repeatable.
set.seed(555)

# Row indices of the training set (p = 0.80), partitioned on Yield.
select_train <- createDataPartition(
  y = CHEM_dataframe2$Yield,
  times = 1,
  p = 0.80,
  list = FALSE
)

# Split the rows once, then separate predictors from the response.
# Column 1 is dropped from the predictors; Yield is the first column here.
train_rows <- CHEM_dataframe2[select_train, ]
test_rows <- CHEM_dataframe2[-select_train, ]
train_x2 <- train_rows[, -1]
test_x2 <- test_rows[, -1]
train_y2 <- train_rows$Yield
test_y2 <- test_rows$Yield

# Tune PLS over ncomp = 1..25 with 10-fold cross-validation, choosing the
# number of components by R-squared.
# NOTE(review): the data were already centred and scaled by the knnImpute
# recipe, so the centre/scale step here looks redundant -- confirm.
cv_ctrl <- trainControl(method = "cv", number = 10)
P_fit2 <- train(
  x = train_x2,
  y = train_y2,
  method = "pls",
  metric = "Rsquared",
  tuneLength = 25,
  trControl = cv_ctrl,
  preProcess = c("center", "scale")
)
P_fit2
## Partial Least Squares
##
## 144 samples
## 56 predictor
##
## Pre-processing: centered (56), scaled (56)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 129, 129, 130, 130, 129, 130, ...
## Resampling results across tuning parameters:
##
## ncomp RMSE Rsquared MAE
## 1 0.7338982 0.4317348 0.6175189
## 2 0.7722558 0.4684310 0.5812528
## 3 0.8134501 0.5194408 0.5823906
## 4 0.9540440 0.5193093 0.6360902
## 5 1.0721588 0.5241915 0.6733250
## 6 1.1436786 0.5185259 0.6912503
## 7 1.2458840 0.5085213 0.7365470
## 8 1.3875640 0.4976078 0.7926451
## 9 1.4661458 0.4849316 0.8268420
## 10 1.5879129 0.4644183 0.8643309
## 11 1.6495018 0.4598224 0.8756271
## 12 1.7627131 0.4554407 0.8976769
## 13 1.7487823 0.4650528 0.8931955
## 14 1.6777520 0.4656269 0.8684202
## 15 1.6377169 0.4705341 0.8508950
## 16 1.6194606 0.4693782 0.8400870
## 17 1.5915410 0.4743053 0.8205206
## 18 1.5932854 0.4752533 0.8122477
## 19 1.6520158 0.4751787 0.8387607
## 20 1.6905995 0.4728315 0.8552220
## 21 1.7800104 0.4698293 0.8932960
## 22 1.8218081 0.4683798 0.9101214
## 23 1.9167962 0.4630918 0.9435827
## 24 1.9756004 0.4611352 0.9652468
## 25 2.0143881 0.4592907 0.9780208
##
## Rsquared was used to select the optimal model using the largest value.
## The final value used for the model was ncomp = 5.
# Plot the cross-validated tuning results.
plot(P_fit2)
# Predict the held-out runs and report RMSE / R-squared / MAE. Yield was
# centred and scaled upstream, so these errors are in standardised units.
P_predict2 <- predict(P_fit2, newdata = test_x2)
postResample(pred = P_predict2, obs = test_y2)
## RMSE Rsquared MAE
## 0.6344353 0.6718573 0.5241329
# Unscaled variable importance, showing the 20 highest-ranked predictors with
# slightly smaller axis labels.
pls_importance <- varImp(P_fit2, scale = FALSE)
plot(pls_importance, top = 20, scales = list(y = list(cex = 0.8)))
As the variable-importance plot shows, the top predictors are ManufacturingProcess36, ManufacturingProcess32, and ManufacturingProcess13.
The correlation plot below shows that ManufacturingProcess32 and ManufacturingProcess36 are negatively correlated with each other. ManufacturingProcess32 has a strong positive correlation with Yield, so increasing it would be expected to increase the yield. ManufacturingProcess36 and ManufacturingProcess13 are moderately correlated with Yield.
# Pairwise correlations between the three top-ranked process predictors and
# Yield. Columns are picked with base indexing rather than a bare `select()`,
# which requires dplyr to be attached and is easily masked by another
# package's `select` (e.g. MASS); the selected columns and their order are
# unchanged.
key_vars <- c(
  "ManufacturingProcess32",
  "ManufacturingProcess36",
  "ManufacturingProcess13",
  "Yield"
)
correlation <- cor(CHEM_dataframe2[, key_vars])

# Upper-triangle correlation plot drawn with squares.
corrplot::corrplot(correlation, method = "square", type = "upper")