# install.packages("RANN")

R Markdown

6.2. Developing a model to predict permeability (see Sect. 1.4) could save significant resources for a pharmaceutical company, while at the same time more rapidly identifying molecules that have a sufficient permeability to become a drug: (a) Start R and use these commands to load the data:

# Attach the package that ships the permeability data set.
library(AppliedPredictiveModeling)
## Warning: package 'AppliedPredictiveModeling' was built under R version 4.1.3
# Load the data set; per the exercise text this provides both the
# `permeability` response and the `fingerprints` predictor matrix.
data(permeability)
# Print the first six permeability values.
head(permeability)
##   permeability
## 1       12.520
## 2        1.120
## 3       19.405
## 4        1.730
## 5        1.680
## 6        0.510

The matrix fingerprints contains the 1,107 binary molecular predictors for the 165 compounds, while permeability contains the permeability response. (b) The fingerprint predictors indicate the presence or absence of substructures of a molecule and are often sparse, meaning that relatively few of the molecules contain each substructure. Filter out the predictors that have low frequencies using the nearZeroVar function from the caret package. How many predictors are left for modeling?

# Indices of the fingerprint columns flagged by caret's nearZeroVar(), i.e.
# substructures that are (almost) constant across the compounds.
colus <- nearZeroVar(fingerprints)

# Keep every column that was not flagged. Selecting the columns to keep,
# rather than negating `colus`, stays correct when nothing is flagged:
# `fingerprints[, -integer(0)]` would silently drop *all* columns.
# `drop = FALSE` guarantees the result is still a matrix.
dataframe1 <- fingerprints[, setdiff(seq_len(ncol(fingerprints)), colus),
                           drop = FALSE]

# Number of compounds (rows) and remaining predictors (columns).
dim(dataframe1)
## [1] 165 388
  (c) Split the data into a training and a test set, pre-process the data, and tune a PLS model. How many latent variables are optimal and what is the corresponding resampled estimate of $R^2$?
# Seed the RNG so the random train/test partition below is reproducible.
set.seed(888)

# Centre and scale every remaining fingerprint column.
# NOTE(review): the centring/scaling parameters are estimated on all rows,
# before the split, so the held-out rows influence the pre-processing.
# Fitting preProcess() on the training rows only would be cleaner, but it
# would change the results recorded further down.
transformed <- preProcess(dataframe1, method = c("center", "scale"))
dataframe1 <- predict(transformed, dataframe1)

# Centred and scaled copy of the response.
# NOTE(review): `y` is not referenced again in the code shown here; the
# models below are fitted on the untransformed permeability values.
ytransf <- preProcess(permeability, method = c("center", "scale"))
y <- predict(ytransf, permeability)

# Logical mask from sample.split() with a 0.75 split ratio: TRUE marks a
# training row, FALSE a test row.
sample <- sample.split(permeability, SplitRatio = 0.75)
X_train <- dataframe1[sample, , drop = FALSE]
X_test <- dataframe1[!sample, , drop = FALSE]

# Row names of each partition, used to pull the matching responses.
X_train_i <- rownames(X_train)
X_test_i <- rownames(X_test)

y_train <- permeability[X_train_i, ]
y_test <- permeability[X_test_i, ]

# Tune a PLS model; tuneLength = 100 evaluates 1 to 100 components.
# No trControl is given, so caret's default resampling scheme is used.
pFIT <- train(
  X_train, y_train,
  method = "pls",
  tuneLength = 100
)

# Resampled performance profile across the candidate component counts.
plot(pFIT)

# Print the first six components of the fitted train object: the method
# name, the full modelInfo definition, the model type, the resampling
# results table, the (empty) held-out predictions and the best tuning value.
# NOTE(review): `pFIT$results` and `pFIT$bestTune` alone would answer the
# question with far less output; consider switching on the next re-knit.
head(pFIT)
## $method
## [1] "pls"
## 
## $modelInfo
## $modelInfo$label
## [1] "Partial Least Squares"
## 
## $modelInfo$library
## [1] "pls"
## 
## $modelInfo$type
## [1] "Regression"     "Classification"
## 
## $modelInfo$parameters
##   parameter   class       label
## 1     ncomp numeric #Components
## 
## $modelInfo$grid
## function (x, y, len = NULL, search = "grid") 
## {
##     if (search == "grid") {
##         out <- data.frame(ncomp = seq(1, min(ncol(x) - 1, len), 
##             by = 1))
##     }
##     else {
##         out <- data.frame(ncomp = unique(sample(1:ncol(x), replace = TRUE)))
##     }
##     out
## }
## 
## $modelInfo$loop
## function (grid) 
## {
##     grid <- grid[order(grid$ncomp, decreasing = TRUE), , drop = FALSE]
##     loop <- grid[1, , drop = FALSE]
##     submodels <- list(grid[-1, , drop = FALSE])
##     list(loop = loop, submodels = submodels)
## }
## 
## $modelInfo$fit
## function (x, y, wts, param, lev, last, classProbs, ...) 
## {
##     ncomp <- min(ncol(x), param$ncomp)
##     out <- if (is.factor(y)) {
##         plsda(x, y, method = "oscorespls", ncomp = ncomp, ...)
##     }
##     else {
##         dat <- if (is.data.frame(x)) 
##             x
##         else as.data.frame(x, stringsAsFactors = TRUE)
##         dat$.outcome <- y
##         pls::plsr(.outcome ~ ., data = dat, method = "oscorespls", 
##             ncomp = ncomp, ...)
##     }
##     out
## }
## <bytecode: 0x00000000296e5788>
## 
## $modelInfo$predict
## function (modelFit, newdata, submodels = NULL) 
## {
##     out <- if (modelFit$problemType == "Classification") {
##         if (!is.matrix(newdata)) 
##             newdata <- as.matrix(newdata)
##         out <- predict(modelFit, newdata, type = "class")
##     }
##     else as.vector(pls:::predict.mvr(modelFit, newdata, ncomp = max(modelFit$ncomp)))
##     if (!is.null(submodels)) {
##         tmp <- vector(mode = "list", length = nrow(submodels))
##         if (modelFit$problemType == "Classification") {
##             if (length(submodels$ncomp) > 1) {
##                 tmp <- as.list(predict(modelFit, newdata, ncomp = submodels$ncomp))
##             }
##             else tmp <- list(predict(modelFit, newdata, ncomp = submodels$ncomp))
##         }
##         else {
##             tmp <- as.list(as.data.frame(apply(predict(modelFit, 
##                 newdata, ncomp = submodels$ncomp), 3, function(x) list(x))))
##         }
##         out <- c(list(out), tmp)
##     }
##     out
## }
## <bytecode: 0x0000000026026fd0>
## 
## $modelInfo$prob
## function (modelFit, newdata, submodels = NULL) 
## {
##     if (!is.matrix(newdata)) 
##         newdata <- as.matrix(newdata)
##     out <- predict(modelFit, newdata, type = "prob", ncomp = modelFit$tuneValue$ncomp)
##     if (length(dim(out)) == 3) {
##         if (dim(out)[1] > 1) {
##             out <- out[, , 1]
##         }
##         else {
##             out <- as.data.frame(t(out[, , 1]), stringsAsFactors = TRUE)
##         }
##     }
##     if (!is.null(submodels)) {
##         tmp <- vector(mode = "list", length = nrow(submodels) + 
##             1)
##         tmp[[1]] <- out
##         for (j in seq(along = submodels$ncomp)) {
##             tmpProb <- predict(modelFit, newdata, type = "prob", 
##                 ncomp = submodels$ncomp[j])
##             if (length(dim(tmpProb)) == 3) {
##                 if (dim(tmpProb)[1] > 1) {
##                   tmpProb <- tmpProb[, , 1]
##                 }
##                 else {
##                   tmpProb <- as.data.frame(t(tmpProb[, , 1]), 
##                     stringsAsFactors = TRUE)
##                 }
##             }
##             tmp[[j + 1]] <- as.data.frame(tmpProb[, modelFit$obsLevels])
##         }
##         out <- tmp
##     }
##     out
## }
## 
## $modelInfo$varImp
## function (object, estimate = NULL, ...) 
## {
##     library(pls)
##     modelCoef <- coef(object, intercept = FALSE, comps = 1:object$ncomp)
##     perf <- pls:::MSEP.mvr(object)$val
##     nms <- dimnames(perf)
##     if (length(nms$estimate) > 1) {
##         pIndex <- if (is.null(estimate)) 
##             1
##         else which(nms$estimate == estimate)
##         perf <- perf[pIndex, , , drop = FALSE]
##     }
##     numResp <- dim(modelCoef)[2]
##     if (numResp <= 2) {
##         modelCoef <- modelCoef[, 1, , drop = FALSE]
##         perf <- perf[, 1, ]
##         delta <- -diff(perf)
##         delta <- delta/sum(delta)
##         out <- data.frame(Overall = apply(abs(modelCoef), 1, 
##             weighted.mean, w = delta))
##     }
##     else {
##         if (dim(perf)[3] <= 2) {
##             perf <- -t(t(apply(perf[1, , ], 1, diff)))
##             perf <- t(t(apply(perf, 1, function(u) u/sum(u))))
##         }
##         else {
##             perf <- -t(apply(perf[1, , ], 1, diff))
##             perf <- t(apply(perf, 1, function(u) u/sum(u)))
##         }
##         out <- matrix(NA, ncol = numResp, nrow = dim(modelCoef)[1])
##         for (i in 1:numResp) {
##             tmp <- abs(modelCoef[, i, , drop = FALSE])
##             out[, i] <- apply(tmp, 1, weighted.mean, w = perf[i, 
##                 ])
##         }
##         colnames(out) <- dimnames(modelCoef)[[2]]
##         rownames(out) <- dimnames(modelCoef)[[1]]
##     }
##     as.data.frame(out, stringsAsFactors = TRUE)
## }
## 
## $modelInfo$predictors
## function (x, ...) 
## rownames(x$projection)
## 
## $modelInfo$levels
## function (x) 
## x$obsLevels
## 
## $modelInfo$tags
## [1] "Partial Least Squares" "Feature Extraction"    "Linear Classifier"    
## [4] "Linear Regression"    
## 
## $modelInfo$sort
## function (x) 
## x[order(x[, 1]), ]
## 
## 
## $modelType
## [1] "Regression"
## 
## $results
##     ncomp         RMSE   Rsquared          MAE       RMSESD RsquaredSD
## 1       1 1.366139e+01 0.25131059 1.001715e+01 1.494556e+00 0.13856158
## 2       2 1.301676e+01 0.32663942 9.120971e+00 1.343893e+00 0.12657074
## 3       3 1.315546e+01 0.32723696 9.464251e+00 1.297240e+00 0.10636635
## 4       4 1.322623e+01 0.32672881 9.469037e+00 1.483368e+00 0.10936973
## 5       5 1.322792e+01 0.33500488 9.371653e+00 1.540210e+00 0.12060956
## 6       6 1.319335e+01 0.34593758 9.426308e+00 1.723772e+00 0.12791854
## 7       7 1.313151e+01 0.35521759 9.415516e+00 1.693123e+00 0.11968673
## 8       8 1.327325e+01 0.35227560 9.546441e+00 1.738245e+00 0.12003325
## 9       9 1.332618e+01 0.35671783 9.581289e+00 1.856224e+00 0.12313457
## 10     10 1.344960e+01 0.35667302 9.712811e+00 1.785315e+00 0.12351388
## 11     11 1.362459e+01 0.35284820 9.832980e+00 1.777208e+00 0.12267584
## 12     12 1.380111e+01 0.34790630 9.963278e+00 1.756458e+00 0.12495588
## 13     13 1.414861e+01 0.33461517 1.017093e+01 1.872781e+00 0.13084999
## 14     14 1.443927e+01 0.32540147 1.038425e+01 1.961534e+00 0.13155166
## 15     15 1.468080e+01 0.32149650 1.061372e+01 1.961063e+00 0.13080418
## 16     16 1.502717e+01 0.31012656 1.085969e+01 2.011367e+00 0.13118795
## 17     17 1.520971e+01 0.30374857 1.100863e+01 1.929629e+00 0.12387258
## 18     18 1.539022e+01 0.29859394 1.114569e+01 1.875672e+00 0.12185028
## 19     19 1.554127e+01 0.29304452 1.132839e+01 1.883619e+00 0.12292268
## 20     20 1.565734e+01 0.28978691 1.139249e+01 1.847385e+00 0.12050633
## 21     21 1.584247e+01 0.28273409 1.154598e+01 1.804376e+00 0.11511195
## 22     22 1.603669e+01 0.27857529 1.166604e+01 1.772259e+00 0.11145302
## 23     23 1.631657e+01 0.27086895 1.182542e+01 1.790726e+00 0.10919948
## 24     24 1.653329e+01 0.26559656 1.200488e+01 1.836196e+00 0.10869744
## 25     25 1.683476e+01 0.25912724 1.223108e+01 1.889293e+00 0.10707886
## 26     26 1.712511e+01 0.25249645 1.241565e+01 2.043840e+00 0.10621206
## 27     27 1.740468e+01 0.24582064 1.259348e+01 2.164216e+00 0.10394466
## 28     28 1.763806e+01 0.24185847 1.275817e+01 2.267230e+00 0.10449976
## 29     29 1.785595e+01 0.23674687 1.288404e+01 2.314394e+00 0.10280187
## 30     30 1.805814e+01 0.23247542 1.299872e+01 2.358688e+00 0.10126963
## 31     31 1.828494e+01 0.22702663 1.313915e+01 2.413140e+00 0.09939894
## 32     32 1.847292e+01 0.22276482 1.323848e+01 2.460724e+00 0.09794891
## 33     33 1.869244e+01 0.21762803 1.338230e+01 2.568145e+00 0.09827529
## 34     34 1.894520e+01 0.21181188 1.354918e+01 2.720023e+00 0.09762345
## 35     35 1.919961e+01 0.20591380 1.371123e+01 2.841790e+00 0.09611718
## 36     36 1.941875e+01 0.20096655 1.385223e+01 2.973174e+00 0.09512574
## 37     37 1.963968e+01 0.19653400 1.400003e+01 3.149265e+00 0.09512594
## 38     38 1.992651e+01 0.19126114 1.418800e+01 3.368589e+00 0.09646213
## 39     39 2.019549e+01 0.18731589 1.437575e+01 3.529225e+00 0.09822148
## 40     40 2.041501e+01 0.18425731 1.452298e+01 3.703418e+00 0.09901582
## 41     41 2.061244e+01 0.18158844 1.467149e+01 3.909549e+00 0.10055614
## 42     42 2.084574e+01 0.17817568 1.483795e+01 4.171014e+00 0.10220347
## 43     43 2.100213e+01 0.17606839 1.494276e+01 4.355455e+00 0.10375766
## 44     44 2.113925e+01 0.17450296 1.504124e+01 4.541700e+00 0.10502651
## 45     45 2.123975e+01 0.17281261 1.511255e+01 4.634345e+00 0.10644041
## 46     46 2.130317e+01 0.17190541 1.516064e+01 4.727032e+00 0.10686872
## 47     47 2.137245e+01 0.17146440 1.521041e+01 4.761278e+00 0.10766294
## 48     48 2.146030e+01 0.17090272 1.525973e+01 4.890932e+00 0.10873171
## 49     49 2.153340e+01 0.17024502 1.530390e+01 4.984644e+00 0.10970741
## 50     50 2.159339e+01 0.16961161 1.534679e+01 5.053186e+00 0.10990536
## 51     51 2.164087e+01 0.16928906 1.538644e+01 5.093831e+00 0.11001924
## 52     52 2.168194e+01 0.16924906 1.542193e+01 5.120292e+00 0.11047438
## 53     53 2.171008e+01 0.16962416 1.544847e+01 5.143950e+00 0.11096440
## 54     54 2.173052e+01 0.16974311 1.546427e+01 5.158114e+00 0.11118934
## 55     55 2.175165e+01 0.16962344 1.547327e+01 5.164727e+00 0.11127212
## 56     56 2.177206e+01 0.16969945 1.548721e+01 5.193989e+00 0.11163018
## 57     57 2.178573e+01 0.16966911 1.550240e+01 5.201556e+00 0.11181249
## 58     58 2.180157e+01 0.16953271 1.552101e+01 5.196648e+00 0.11184174
## 59     59 2.181326e+01 0.16938214 1.553564e+01 5.199180e+00 0.11193532
## 60     60 2.181678e+01 0.16938591 1.553802e+01 5.193087e+00 0.11214021
## 61     61 2.181113e+01 0.16980613 1.552823e+01 5.194674e+00 0.11282318
## 62     62 2.180992e+01 0.16992417 1.552661e+01 5.196026e+00 0.11312259
## 63     63 3.950852e+13 0.16612040 3.238785e+13 1.975426e+14 0.11710398
## 64     64 1.408718e+14 0.16261956 1.134091e+14 4.905484e+14 0.12078581
## 65     65 2.825276e+14 0.16139984 2.305011e+14 6.852701e+14 0.12257050
## 66     66 6.015004e+14 0.15260080 4.832592e+14 1.046951e+15 0.12349060
## 67     67 9.401741e+14 0.13182138 7.591330e+14 1.352257e+15 0.12404405
## 68     68 1.280338e+15 0.10681486 1.014845e+15 1.634759e+15 0.12125819
## 69     69 1.637432e+15 0.09643310 1.324474e+15 1.817861e+15 0.12150837
## 70     70 1.941094e+15 0.07590977 1.551880e+15 1.931397e+15 0.10201507
## 71     71 2.325233e+15 0.06126394 1.854832e+15 1.922697e+15 0.09518010
## 72     72 2.698351e+15 0.04177078 2.149663e+15 1.941660e+15 0.05791162
## 73     73 3.040729e+15 0.04777799 2.440140e+15 2.111857e+15 0.07035465
## 74     74 3.339466e+15 0.03852203 2.683187e+15 2.218786e+15 0.06824209
## 75     75 3.552012e+15 0.03630920 2.840604e+15 2.282257e+15 0.07463548
## 76     76 3.728098e+15 0.03457849 2.979864e+15 2.368975e+15 0.07930430
## 77     77 3.864808e+15 0.03356840 3.096955e+15 2.461975e+15 0.07930738
## 78     78 3.958098e+15 0.03280492 3.176569e+15 2.511265e+15 0.07720315
## 79     79 4.050290e+15 0.03232299 3.257448e+15 2.603163e+15 0.07627755
## 80     80 4.131895e+15 0.03256417 3.330901e+15 2.673289e+15 0.07542778
## 81     81 4.199134e+15 0.03274999 3.391076e+15 2.741234e+15 0.07469742
## 82     82 4.273011e+15 0.03340186 3.450389e+15 2.815742e+15 0.07459216
## 83     83 4.326022e+15 0.03385092 3.494313e+15 2.873041e+15 0.07456767
## 84     84 4.367699e+15 0.03505334 3.530284e+15 2.919662e+15 0.07480580
## 85     85 4.399072e+15 0.03600681 3.554023e+15 2.946626e+15 0.07492630
## 86     86 4.432952e+15 0.03635510 3.581078e+15 2.977731e+15 0.07513522
## 87     87 4.464412e+15 0.03633696 3.600213e+15 3.016430e+15 0.07502430
## 88     88 4.491555e+15 0.03616316 3.619503e+15 3.047404e+15 0.07477347
## 89     89 4.514463e+15 0.03589927 3.636105e+15 3.072982e+15 0.07437338
## 90     90 4.534445e+15 0.03560313 3.649539e+15 3.096074e+15 0.07380011
## 91     91 4.551703e+15 0.03537793 3.662604e+15 3.110506e+15 0.07355336
## 92     92 4.570826e+15 0.03509556 3.677844e+15 3.125654e+15 0.07295716
## 93     93 4.591474e+15 0.03479295 3.694877e+15 3.146871e+15 0.07277954
## 94     94 4.610679e+15 0.03452206 3.710204e+15 3.168146e+15 0.07232652
## 95     95 4.622197e+15 0.03427336 3.719372e+15 3.178051e+15 0.07214005
## 96     96 4.634026e+15 0.03414774 3.729064e+15 3.191355e+15 0.07182174
## 97     97 4.642404e+15 0.03397280 3.737756e+15 3.203634e+15 0.07128266
## 98     98 4.655040e+15 0.03401845 3.748953e+15 3.223080e+15 0.07107886
## 99     99 4.661766e+15 0.03395983 3.754091e+15 3.234085e+15 0.07062621
## 100   100 4.667819e+15 0.03388396 3.760129e+15 3.242195e+15 0.07032378
##            MAESD
## 1   1.210441e+00
## 2   9.855809e-01
## 3   9.933712e-01
## 4   1.160809e+00
## 5   1.244921e+00
## 6   1.385141e+00
## 7   1.344532e+00
## 8   1.437491e+00
## 9   1.480055e+00
## 10  1.461433e+00
## 11  1.405062e+00
## 12  1.345016e+00
## 13  1.386543e+00
## 14  1.418501e+00
## 15  1.459707e+00
## 16  1.531334e+00
## 17  1.548394e+00
## 18  1.522068e+00
## 19  1.532515e+00
## 20  1.508873e+00
## 21  1.452002e+00
## 22  1.390460e+00
## 23  1.311491e+00
## 24  1.265230e+00
## 25  1.233505e+00
## 26  1.291823e+00
## 27  1.334802e+00
## 28  1.388839e+00
## 29  1.408025e+00
## 30  1.440459e+00
## 31  1.453951e+00
## 32  1.509110e+00
## 33  1.594725e+00
## 34  1.671398e+00
## 35  1.754682e+00
## 36  1.862277e+00
## 37  1.978835e+00
## 38  2.159814e+00
## 39  2.301606e+00
## 40  2.431969e+00
## 41  2.540304e+00
## 42  2.694317e+00
## 43  2.781752e+00
## 44  2.876293e+00
## 45  2.928887e+00
## 46  2.979674e+00
## 47  2.988302e+00
## 48  3.056781e+00
## 49  3.121854e+00
## 50  3.172625e+00
## 51  3.212660e+00
## 52  3.228589e+00
## 53  3.257600e+00
## 54  3.271707e+00
## 55  3.277580e+00
## 56  3.289953e+00
## 57  3.286989e+00
## 58  3.278409e+00
## 59  3.276193e+00
## 60  3.270056e+00
## 61  3.266269e+00
## 62  3.265899e+00
## 63  1.619392e+14
## 64  3.936432e+14
## 65  5.547202e+14
## 66  8.371535e+14
## 67  1.083805e+15
## 68  1.276154e+15
## 69  1.422585e+15
## 70  1.497481e+15
## 71  1.482608e+15
## 72  1.473365e+15
## 73  1.641696e+15
## 74  1.734798e+15
## 75  1.770011e+15
## 76  1.838143e+15
## 77  1.920349e+15
## 78  1.964031e+15
## 79  2.050760e+15
## 80  2.114280e+15
## 81  2.173792e+15
## 82  2.235384e+15
## 83  2.280749e+15
## 84  2.323960e+15
## 85  2.346615e+15
## 86  2.372753e+15
## 87  2.395794e+15
## 88  2.418820e+15
## 89  2.440534e+15
## 90  2.457077e+15
## 91  2.468810e+15
## 92  2.483448e+15
## 93  2.503684e+15
## 94  2.521401e+15
## 95  2.532181e+15
## 96  2.545169e+15
## 97  2.558518e+15
## 98  2.574906e+15
## 99  2.583503e+15
## 100 2.591840e+15
## 
## $pred
## NULL
## 
## $bestTune
##   ncomp
## 2     2
  (d) Predict the response for the test set. What is the test set estimate of $R^2$?
# Test-set predictions from the tuned PLS model.
lmPredict <- predict(pFIT, newdata = X_test)

# Pair observed and predicted values in the obs/pred layout that
# defaultSummary() reads, then report RMSE, R-squared and MAE.
lmpredVal1 <- data.frame(
  obs = y_test,
  pred = lmPredict
)
defaultSummary(lmpredVal1)
##       RMSE   Rsquared        MAE 
## 11.8071806  0.4945307  8.2450456
  (e) Try building other models discussed in this chapter. Do any have better predictive performance?
# Elastic-net tuning grid: three values of `lambda` crossed with 20 evenly
# spaced values of `fraction` between 0.05 and 1. `length.out` is spelled
# out in full instead of relying on partial matching of `length`.
eNGrid <- expand.grid(
  .lambda = c(0, 0.01, 0.1),
  .fraction = seq(0.05, 1, length.out = 20)
)

set.seed(999)
# Silence warnings for this fit only. The previous `options(warn = -1)`
# switched warnings off for the rest of the session and never restored them,
# which could hide genuine problems in later code.
eTune <- suppressWarnings(
  train(X_train, y_train,
        method = "enet",
        tuneGrid = eNGrid)
)

# Test-set predictions and their RMSE, R-squared and MAE.
lmPRED2 <- predict(eTune, X_test)

lmVal2 <- data.frame(obs = y_test, pred = lmPRED2)
defaultSummary(lmVal2)
##       RMSE   Rsquared        MAE 
## 11.4344891  0.5748558  7.9721040

Yes. The elastic net model performed better on the test set ($R^2 \approx 0.57$, RMSE $\approx 11.43$) than the PLS model ($R^2 \approx 0.49$, RMSE $\approx 11.81$).

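  (f) Would you recommend any of your models to replace the permeability laboratory experiment? No. The test-set $R^2$ values are relatively low (about 0.49 for PLS and 0.57 for the elastic net), so the predictions are not accurate enough to stand in for the laboratory measurement.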

6.3. A chemical manufacturing process for a pharmaceutical product was discussed in Sect. 1.4. In this problem, the objective is to understand the relationship between biological measurements of the raw materials (predictors), measurements of the manufacturing process (predictors), and the response of product yield. Biological predictors cannot be changed but can be used to assess the quality of the raw material before processing. On the other hand, manufacturing process predictors can be changed in the manufacturing process. Improving product yield by 1 % will boost revenue by approximately one hundred thousand dollars per batch: (a) Start R and use these commands to load the data:

# Attach the package that ships the chemical manufacturing data set
# (it was already attached earlier in this document; repeating the call is
# harmless).
library(AppliedPredictiveModeling)
# Load the ChemicalManufacturingProcess data.
data(ChemicalManufacturingProcess)
# Preview the first six manufacturing runs.
head(ChemicalManufacturingProcess)
##   Yield BiologicalMaterial01 BiologicalMaterial02 BiologicalMaterial03
## 1 38.00                 6.25                49.58                56.97
## 2 42.44                 8.01                60.97                67.48
## 3 42.03                 8.01                60.97                67.48
## 4 41.42                 8.01                60.97                67.48
## 5 42.49                 7.47                63.33                72.25
## 6 43.57                 6.12                58.36                65.31
##   BiologicalMaterial04 BiologicalMaterial05 BiologicalMaterial06
## 1                12.74                19.51                43.73
## 2                14.65                19.36                53.14
## 3                14.65                19.36                53.14
## 4                14.65                19.36                53.14
## 5                14.02                17.91                54.66
## 6                15.17                21.79                51.23
##   BiologicalMaterial07 BiologicalMaterial08 BiologicalMaterial09
## 1                  100                16.66                11.44
## 2                  100                19.04                12.55
## 3                  100                19.04                12.55
## 4                  100                19.04                12.55
## 5                  100                18.22                12.80
## 6                  100                18.30                12.13
##   BiologicalMaterial10 BiologicalMaterial11 BiologicalMaterial12
## 1                 3.46               138.09                18.83
## 2                 3.46               153.67                21.05
## 3                 3.46               153.67                21.05
## 4                 3.46               153.67                21.05
## 5                 3.05               147.61                21.05
## 6                 3.78               151.88                20.76
##   ManufacturingProcess01 ManufacturingProcess02 ManufacturingProcess03
## 1                     NA                     NA                     NA
## 2                    0.0                      0                     NA
## 3                    0.0                      0                     NA
## 4                    0.0                      0                     NA
## 5                   10.7                      0                     NA
## 6                   12.0                      0                     NA
##   ManufacturingProcess04 ManufacturingProcess05 ManufacturingProcess06
## 1                     NA                     NA                     NA
## 2                    917                 1032.2                  210.0
## 3                    912                 1003.6                  207.1
## 4                    911                 1014.6                  213.3
## 5                    918                 1027.5                  205.7
## 6                    924                 1016.8                  208.9
##   ManufacturingProcess07 ManufacturingProcess08 ManufacturingProcess09
## 1                     NA                     NA                  43.00
## 2                    177                    178                  46.57
## 3                    178                    178                  45.07
## 4                    177                    177                  44.92
## 5                    178                    178                  44.96
## 6                    178                    178                  45.32
##   ManufacturingProcess10 ManufacturingProcess11 ManufacturingProcess12
## 1                     NA                     NA                     NA
## 2                     NA                     NA                      0
## 3                     NA                     NA                      0
## 4                     NA                     NA                      0
## 5                     NA                     NA                      0
## 6                     NA                     NA                      0
##   ManufacturingProcess13 ManufacturingProcess14 ManufacturingProcess15
## 1                   35.5                   4898                   6108
## 2                   34.0                   4869                   6095
## 3                   34.8                   4878                   6087
## 4                   34.8                   4897                   6102
## 5                   34.6                   4992                   6233
## 6                   34.0                   4985                   6222
##   ManufacturingProcess16 ManufacturingProcess17 ManufacturingProcess18
## 1                   4682                   35.5                   4865
## 2                   4617                   34.0                   4867
## 3                   4617                   34.8                   4877
## 4                   4635                   34.8                   4872
## 5                   4733                   33.9                   4886
## 6                   4786                   33.4                   4862
##   ManufacturingProcess19 ManufacturingProcess20 ManufacturingProcess21
## 1                   6049                   4665                    0.0
## 2                   6097                   4621                    0.0
## 3                   6078                   4621                    0.0
## 4                   6073                   4611                    0.0
## 5                   6102                   4659                   -0.7
## 6                   6115                   4696                   -0.6
##   ManufacturingProcess22 ManufacturingProcess23 ManufacturingProcess24
## 1                     NA                     NA                     NA
## 2                      3                      0                      3
## 3                      4                      1                      4
## 4                      5                      2                      5
## 5                      8                      4                     18
## 6                      9                      1                      1
##   ManufacturingProcess25 ManufacturingProcess26 ManufacturingProcess27
## 1                   4873                   6074                   4685
## 2                   4869                   6107                   4630
## 3                   4897                   6116                   4637
## 4                   4892                   6111                   4630
## 5                   4930                   6151                   4684
## 6                   4871                   6128                   4687
##   ManufacturingProcess28 ManufacturingProcess29 ManufacturingProcess30
## 1                   10.7                   21.0                    9.9
## 2                   11.2                   21.4                    9.9
## 3                   11.1                   21.3                    9.4
## 4                   11.1                   21.3                    9.4
## 5                   11.3                   21.6                    9.0
## 6                   11.4                   21.7                   10.1
##   ManufacturingProcess31 ManufacturingProcess32 ManufacturingProcess33
## 1                   69.1                    156                     66
## 2                   68.7                    169                     66
## 3                   69.3                    173                     66
## 4                   69.3                    171                     68
## 5                   69.4                    171                     70
## 6                   68.2                    173                     70
##   ManufacturingProcess34 ManufacturingProcess35 ManufacturingProcess36
## 1                    2.4                    486                  0.019
## 2                    2.6                    508                  0.019
## 3                    2.6                    509                  0.018
## 4                    2.5                    496                  0.018
## 5                    2.5                    468                  0.017
## 6                    2.5                    490                  0.018
##   ManufacturingProcess37 ManufacturingProcess38 ManufacturingProcess39
## 1                    0.5                      3                    7.2
## 2                    2.0                      2                    7.2
## 3                    0.7                      2                    7.2
## 4                    1.2                      2                    7.2
## 5                    0.2                      2                    7.3
## 6                    0.4                      2                    7.2
##   ManufacturingProcess40 ManufacturingProcess41 ManufacturingProcess42
## 1                     NA                     NA                   11.6
## 2                    0.1                   0.15                   11.1
## 3                    0.0                   0.00                   12.0
## 4                    0.0                   0.00                   10.6
## 5                    0.0                   0.00                   11.0
## 6                    0.0                   0.00                   11.5
##   ManufacturingProcess43 ManufacturingProcess44 ManufacturingProcess45
## 1                    3.0                    1.8                    2.4
## 2                    0.9                    1.9                    2.2
## 3                    1.0                    1.8                    2.3
## 4                    1.1                    1.8                    2.1
## 5                    1.1                    1.7                    2.1
## 6                    2.2                    1.8                    2.0

The matrix processPredictors contains the 57 predictors (12 describing the input biological material and 45 describing the process predictors) for the 176 manufacturing runs. yield contains the percent yield for each run.

  (b) A small percentage of cells in the predictor set contain missing values. Use an imputation function to fill in these missing values (e.g., see Sect. 3.8).
# Plot a missingness map of the data set (presumably Amelia's missmap()).
missmap(ChemicalManufacturingProcess)

# Fit a k-nearest-neighbour imputation recipe and print its summary.
# NOTE(review): the recipe is built on the whole data frame, so the Yield
# response is centred, scaled and included in the imputation alongside the
# 57 predictors. Confirm this is intended.
CHEM_knn_impute <- preProcess(
  ChemicalManufacturingProcess,
  method = "knnImpute"
)
CHEM_knn_impute
## Created from 152 samples and 58 variables
## 
## Pre-processing:
##   - centered (58)
##   - ignored (0)
##   - 5 nearest neighbor imputation (58)
##   - scaled (58)
# Apply the fitted imputation (together with the centering and scaling that
# come with it) and inspect the per-column summaries of the result.
CHEM_dataframe <- predict(
  CHEM_knn_impute,
  newdata = ChemicalManufacturingProcess
)
summary(CHEM_dataframe)
##      Yield         BiologicalMaterial01 BiologicalMaterial02
##  Min.   :-2.6692   Min.   :-2.5653      Min.   :-2.1858     
##  1st Qu.:-0.7716   1st Qu.:-0.6078      1st Qu.:-0.7457     
##  Median :-0.1119   Median :-0.1491      Median :-0.1484     
##  Mean   : 0.0000   Mean   : 0.0000      Mean   : 0.0000     
##  3rd Qu.: 0.7035   3rd Qu.: 0.6423      3rd Qu.: 0.7557     
##  Max.   : 3.3394   Max.   : 3.3597      Max.   : 2.2459     
##  BiologicalMaterial03 BiologicalMaterial04 BiologicalMaterial05
##  Min.   :-2.6830      Min.   :-1.6731      Min.   :-2.90576    
##  1st Qu.:-0.6811      1st Qu.:-0.6222      1st Qu.:-0.73944    
##  Median :-0.1212      Median :-0.1405      Median :-0.05891    
##  Mean   : 0.0000      Mean   : 0.0000      Mean   : 0.00000    
##  3rd Qu.: 0.6804      3rd Qu.: 0.4907      3rd Qu.: 0.70568    
##  Max.   : 2.6355      Max.   : 6.0523      Max.   : 3.38985    
##  BiologicalMaterial06 BiologicalMaterial07 BiologicalMaterial08
##  Min.   :-2.2184      Min.   :-0.1313      Min.   :-2.38535    
##  1st Qu.:-0.7622      1st Qu.:-0.1313      1st Qu.:-0.64225    
##  Median :-0.1202      Median :-0.1313      Median : 0.02249    
##  Mean   : 0.0000      Mean   : 0.0000      Mean   : 0.00000    
##  3rd Qu.: 0.6499      3rd Qu.:-0.1313      3rd Qu.: 0.56906    
##  Max.   : 2.7948      Max.   : 7.5723      Max.   : 2.43034    
##  BiologicalMaterial09 BiologicalMaterial10 BiologicalMaterial11
##  Min.   :-3.39629     Min.   :-1.7202      Min.   :-2.3116     
##  1st Qu.:-0.59627     1st Qu.:-0.5685      1st Qu.:-0.6505     
##  Median :-0.03627     Median :-0.1513      Median :-0.1811     
##  Mean   : 0.00000     Mean   : 0.0000      Mean   : 0.0000     
##  3rd Qu.: 0.67428     3rd Qu.: 0.3161      3rd Qu.: 0.5491     
##  Max.   : 2.96246     Max.   : 6.7920      Max.   : 2.4431     
##  BiologicalMaterial12 ManufacturingProcess01 ManufacturingProcess02
##  Min.   :-2.3914      Min.   :-6.149703      Min.   :-1.969253     
##  1st Qu.:-0.6074      1st Qu.:-0.223563      1st Qu.: 0.308956     
##  Median :-0.1033      Median : 0.105667      Median : 0.509627     
##  Mean   : 0.0000      Mean   : 0.001224      Mean   : 0.009518     
##  3rd Qu.: 0.7112      3rd Qu.: 0.503487      3rd Qu.: 0.568648     
##  Max.   : 2.5986      Max.   : 1.587202      Max.   : 0.686690     
##  ManufacturingProcess03 ManufacturingProcess04 ManufacturingProcess05
##  Min.   :-3.10582       Min.   :-3.323233      Min.   :-2.577803     
##  1st Qu.:-0.42705       1st Qu.:-0.613828      1st Qu.:-0.487046     
##  Median : 0.37658       Median : 0.342432      Median :-0.086583     
##  Mean   : 0.04123       Mean   : 0.003213      Mean   :-0.002534     
##  3rd Qu.: 0.46587       3rd Qu.: 0.661186      3rd Qu.: 0.230347     
##  Max.   : 2.69818       Max.   : 2.254953      Max.   : 5.686954     
##  ManufacturingProcess06 ManufacturingProcess07 ManufacturingProcess08
##  Min.   :-1.630631      Min.   :-0.9580199     Min.   :-1.111973     
##  1st Qu.:-0.630408      1st Qu.:-0.9580199     1st Qu.:-1.111973     
##  Median :-0.222910      Median :-0.9580199     Median : 0.894164     
##  Mean   :-0.006574      Mean   :-0.0009072     Mean   :-0.001759     
##  3rd Qu.: 0.480950      3rd Qu.: 1.0378549     3rd Qu.: 0.894164     
##  Max.   : 7.408415      Max.   : 1.0378549     Max.   : 0.894164     
##  ManufacturingProcess09 ManufacturingProcess10 ManufacturingProcess11
##  Min.   :-4.37787       Min.   :-2.18999       Min.   :-2.63442      
##  1st Qu.:-0.49799       1st Qu.:-0.62482       1st Qu.:-0.53867      
##  Median : 0.04519       Median :-0.10310       Median : 0.02020      
##  Mean   : 0.00000       Mean   : 0.02156       Mean   : 0.03163      
##  3rd Qu.: 0.55281       3rd Qu.: 0.54906       3rd Qu.: 0.71878      
##  Max.   : 2.39252       Max.   : 3.15768       Max.   : 2.95425      
##  ManufacturingProcess12 ManufacturingProcess13 ManufacturingProcess14
##  Min.   :-0.480694      Min.   :-2.37172       Min.   :-2.803712     
##  1st Qu.:-0.480694      1st Qu.:-0.59881       1st Qu.:-0.488202     
##  Median :-0.480694      Median : 0.09066       Median : 0.029921     
##  Mean   :-0.002731      Mean   : 0.00000       Mean   :-0.004071     
##  3rd Qu.:-0.480694      3rd Qu.: 0.68163       3rd Qu.: 0.520534     
##  Max.   : 2.068439      Max.   : 4.03046       Max.   : 3.688885     
##  ManufacturingProcess15 ManufacturingProcess16 ManufacturingProcess17
##  Min.   :-2.3137        Min.   :-12.98219      Min.   :-2.43850      
##  1st Qu.:-0.4960        1st Qu.: -0.01436      1st Qu.:-0.67597      
##  Median :-0.1273        Median :  0.06312      Median : 0.04507      
##  Mean   : 0.0000        Mean   :  0.00000      Mean   : 0.00000      
##  3rd Qu.: 0.3786        3rd Qu.:  0.15126      3rd Qu.: 0.60587      
##  Max.   : 3.3283        Max.   :  0.81376      Max.   : 4.53150      
##  ManufacturingProcess18 ManufacturingProcess19 ManufacturingProcess20
##  Min.   :-13.08836      Min.   :-3.0321        Min.   :-13.05542     
##  1st Qu.:  0.00903      1st Qu.:-0.6022        1st Qu.: -0.01063     
##  Median :  0.06890      Median :-0.1360        Median :  0.07318     
##  Mean   :  0.00000      Mean   : 0.0000        Mean   :  0.00000     
##  3rd Qu.:  0.14237      3rd Qu.: 0.4838        3rd Qu.:  0.15197     
##  Max.   :  0.43899      Max.   : 2.5846        Max.   :  0.58033     
##  ManufacturingProcess21 ManufacturingProcess22 ManufacturingProcess23
##  Min.   :-2.1018        Min.   :-1.6230324     Min.   :-1.814768     
##  1st Qu.:-0.5599        1st Qu.:-0.7223009     1st Qu.:-0.611797     
##  Median :-0.1745        Median :-0.1218132     Median :-0.010311     
##  Mean   : 0.0000        Mean   : 0.0003314     Mean   : 0.004726     
##  3rd Qu.: 0.2110        3rd Qu.: 0.7789183     3rd Qu.: 0.591175     
##  Max.   : 4.8365        Max.   : 1.9798937     Max.   : 1.794146     
##  ManufacturingProcess24 ManufacturingProcess25 ManufacturingProcess26
##  Min.   :-1.523304      Min.   :-12.927496     Min.   :-12.940454    
##  1st Qu.:-0.833580      1st Qu.:  0.002208     1st Qu.:  0.008505    
##  Median :-0.143857      Median :  0.069146     Median :  0.063251    
##  Mean   : 0.005061      Mean   :  0.000105     Mean   :  0.000404    
##  3rd Qu.: 0.890729      3rd Qu.:  0.128720     3rd Qu.:  0.115417    
##  Max.   : 2.442608      Max.   :  0.433287     Max.   :  0.312785    
##  ManufacturingProcess27 ManufacturingProcess28 ManufacturingProcess29
##  Min.   :-12.888994     Min.   :-1.25583       Min.   :-12.026718    
##  1st Qu.:  0.000681     1st Qu.:-1.25583       1st Qu.: -0.186978    
##  Median :  0.066362     Median : 0.72551       Median : -0.066778    
##  Mean   :  0.001121     Mean   :-0.02868       Mean   : -0.003263    
##  3rd Qu.:  0.131337     3rd Qu.: 0.78266       3rd Qu.:  0.233723    
##  Max.   :  0.416660     Max.   : 0.93507       Max.   :  1.195326    
##  ManufacturingProcess30 ManufacturingProcess31 ManufacturingProcess32
##  Min.   :-9.38589       Min.   :-12.632749     Min.   :-2.86552      
##  1st Qu.:-0.37026       1st Qu.: -0.015263     1st Qu.:-0.64216      
##  Median : 0.03954       Median :  0.110732     Median :-0.08632      
##  Mean   : 0.01114       Mean   : -0.001722     Mean   : 0.00000      
##  3rd Qu.: 0.55179       3rd Qu.:  0.218728     3rd Qu.: 0.65480      
##  Max.   : 2.08855       Max.   :  0.416720     Max.   : 2.69287      
##  ManufacturingProcess33 ManufacturingProcess34 ManufacturingProcess35
##  Min.   :-3.037737      Min.   :-3.558813      Min.   :-3.01270      
##  1st Qu.:-0.621676      1st Qu.: 0.118269      1st Qu.:-0.51725      
##  Median : 0.183677      Median : 0.118269      Median :-0.05513      
##  Mean   :-0.005764      Mean   :-0.009176      Mean   :-0.02362      
##  3rd Qu.: 0.586354      3rd Qu.: 0.118269      3rd Qu.: 0.49941      
##  Max.   : 2.599738      Max.   : 1.956810      Max.   : 2.44032      
##  ManufacturingProcess36 ManufacturingProcess37 ManufacturingProcess38
##  Min.   :-2.944307      Min.   :-2.27741       Min.   :-3.9024       
##  1st Qu.:-0.655777      1st Qu.:-0.70467       1st Qu.:-0.8225       
##  Median :-0.083645      Median :-0.03064       Median : 0.7175       
##  Mean   :-0.008228      Mean   : 0.00000       Mean   : 0.0000       
##  3rd Qu.: 0.488487      3rd Qu.: 0.64339       3rd Qu.: 0.7175       
##  Max.   : 2.777017      Max.   : 2.89017       Max.   : 0.7175       
##  ManufacturingProcess39 ManufacturingProcess40 ManufacturingProcess41
##  Min.   :-4.5508        Min.   :-0.4626528     Min.   :-0.440588     
##  1st Qu.: 0.1653        1st Qu.:-0.4626528     1st Qu.:-0.440588     
##  Median : 0.2317        Median :-0.4626528     Median :-0.440588     
##  Mean   : 0.0000        Mean   : 0.0003392     Mean   :-0.000392     
##  3rd Qu.: 0.2982        3rd Qu.:-0.4626528     3rd Qu.:-0.440588     
##  Max.   : 0.4310        Max.   : 2.1490969     Max.   : 3.275213     
##  ManufacturingProcess42 ManufacturingProcess43 ManufacturingProcess44
##  Min.   :-5.77163       Min.   :-1.0506        Min.   :-5.60583      
##  1st Qu.: 0.09979       1st Qu.:-0.3594        1st Qu.:-0.01588      
##  Median : 0.20280       Median :-0.1290        Median : 0.29467      
##  Mean   : 0.00000       Mean   : 0.0000        Mean   : 0.00000      
##  3rd Qu.: 0.25430       3rd Qu.: 0.1303        3rd Qu.: 0.29467      
##  Max.   : 0.46031       Max.   :11.6224        Max.   : 0.91578      
##  ManufacturingProcess45
##  Min.   :-5.25447      
##  1st Qu.:-0.09356      
##  Median : 0.15220      
##  Mean   : 0.00000      
##  3rd Qu.: 0.39796      
##  Max.   : 1.13523
  1. Split the data into a training and a test set, pre-process the data, and tune a model of your choice from this chapter. What is the optimal value of the performance metric?
# Dimensions before filtering (rows x columns, response included).
dim(CHEM_dataframe)
## [1] 176  58
# Drop near-zero-variance columns. Guard the empty case: when nearZeroVar()
# flags nothing it returns integer(0), and indexing with -integer(0) would
# select zero columns instead of keeping them all. drop = FALSE keeps the
# result a data frame regardless of how many columns survive.
nzv_cols <- nearZeroVar(CHEM_dataframe)
CHEM_dataframe2 <- if (length(nzv_cols) > 0) {
  CHEM_dataframe[, -nzv_cols, drop = FALSE]
} else {
  CHEM_dataframe
}
dim(CHEM_dataframe2)
## [1] 176  57
# Fix the RNG state so the partition and the CV folds are reproducible.
set.seed(555)

# 80/20 split of the rows, driven by the response column.
select_train <- createDataPartition(
  CHEM_dataframe2$Yield,
  times = 1,
  p = .80,
  list = FALSE
)

# Slice the rows once per partition, then separate the response from the
# predictors (Yield is the first column).
chem_train <- CHEM_dataframe2[select_train, ]
chem_test <- CHEM_dataframe2[-select_train, ]

train_x2 <- chem_train[, -1]
test_x2 <- chem_test[, -1]
train_y2 <- chem_train$Yield
test_y2 <- chem_test$Yield

# Tune PLS over 25 candidate component counts with 10-fold cross-validation,
# selecting the winner by R-squared; printing the fit shows the full profile.
cv_control <- trainControl(method = "cv", number = 10)
P_fit2 <- train(
  x = train_x2,
  y = train_y2,
  method = "pls",
  metric = "Rsquared",
  tuneLength = 25,
  trControl = cv_control,
  preProcess = c("center", "scale")
)
P_fit2
## Partial Least Squares 
## 
## 144 samples
##  56 predictor
## 
## Pre-processing: centered (56), scaled (56) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 129, 129, 130, 130, 129, 130, ... 
## Resampling results across tuning parameters:
## 
##   ncomp  RMSE       Rsquared   MAE      
##    1     0.7338982  0.4317348  0.6175189
##    2     0.7722558  0.4684310  0.5812528
##    3     0.8134501  0.5194408  0.5823906
##    4     0.9540440  0.5193093  0.6360902
##    5     1.0721588  0.5241915  0.6733250
##    6     1.1436786  0.5185259  0.6912503
##    7     1.2458840  0.5085213  0.7365470
##    8     1.3875640  0.4976078  0.7926451
##    9     1.4661458  0.4849316  0.8268420
##   10     1.5879129  0.4644183  0.8643309
##   11     1.6495018  0.4598224  0.8756271
##   12     1.7627131  0.4554407  0.8976769
##   13     1.7487823  0.4650528  0.8931955
##   14     1.6777520  0.4656269  0.8684202
##   15     1.6377169  0.4705341  0.8508950
##   16     1.6194606  0.4693782  0.8400870
##   17     1.5915410  0.4743053  0.8205206
##   18     1.5932854  0.4752533  0.8122477
##   19     1.6520158  0.4751787  0.8387607
##   20     1.6905995  0.4728315  0.8552220
##   21     1.7800104  0.4698293  0.8932960
##   22     1.8218081  0.4683798  0.9101214
##   23     1.9167962  0.4630918  0.9435827
##   24     1.9756004  0.4611352  0.9652468
##   25     2.0143881  0.4592907  0.9780208
## 
## Rsquared was used to select the optimal model using the largest value.
## The final value used for the model was ncomp = 5.
# Plot the tuning results stored in the fitted caret model (P_fit2).
plot(P_fit2)

  1. Predict the response for the test set. What is the value of the performance metric and how does this compare with the resampled performance metric on the training set?
# Score the held-out rows with the tuned model, then compare predictions with
# the observed response (RMSE, R-squared, MAE).
P_predict2 <- predict(P_fit2, newdata = test_x2)
holdout_metrics <- postResample(pred = P_predict2, obs = test_y2)
holdout_metrics
##      RMSE  Rsquared       MAE 
## 0.6344353 0.6718573 0.5241329
  1. Which predictors are most important in the model you have trained? Do either the biological or process predictors dominate the list?
# Compute unscaled variable importance for the tuned model and plot the 20
# highest-ranked predictors, shrinking the y-axis labels so they fit.
pls_importance <- varImp(P_fit2, scale = FALSE)
plot(pls_importance, top = 20, scales = list(y = list(cex = 0.8)))

  1. Explore the relationships between each of the top predictors and the response. How could this information be helpful in improving yield in future runs of the manufacturing process?

As the variable-importance plot shows, the top predictors are ManufacturingProcess36, ManufacturingProcess32, and ManufacturingProcess13.

The correlation plot shows that ManufacturingProcess32 and ManufacturingProcess36 are negatively correlated with each other. ManufacturingProcess32 has a strong positive correlation with Yield, which suggests that increasing it in future runs could increase the yield. ManufacturingProcess36 and ManufacturingProcess13 are moderately correlated with Yield.

# Pairwise correlations between three of the top-ranked predictors and Yield,
# drawn as the upper triangle of a square-glyph correlation plot.
top_vars <- c(
  "ManufacturingProcess32", "ManufacturingProcess36",
  "ManufacturingProcess13", "Yield"
)
correlation <- cor(CHEM_dataframe2[, top_vars])
corrplot::corrplot(correlation, method = "square", type = "upper")