library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(pls)
## 
## Attaching package: 'pls'
## The following object is masked from 'package:caret':
## 
##     R2
## The following object is masked from 'package:stats':
## 
##     loadings

6.2. Developing a model to predict permeability (see Sect. 1.4) could save significant resources for a pharmaceutical company, while at the same time more rapidly identifying molecules that have a sufficient permeability to become a drug: (a) Start R and use these commands to load the data:

library(AppliedPredictiveModeling)
data(permeability)

The matrix fingerprints contains the 1,107 binary molecular predictors for the 165 compounds, while permeability contains the permeability response.

(b) The fingerprint predictors indicate the presence or absence of substructures of a molecule and are often sparse, meaning that relatively few of the molecules contain each substructure. Filter out the predictors that have low frequencies using the nearZeroVar function from the caret package. How many predictors are left for modeling?

388 out of the 1,107 predictors are left for modeling.

dim(fingerprints)
## [1]  165 1107
removed_nzv <- nearZeroVar(fingerprints)
fingerprints <- fingerprints[,-removed_nzv]
dim(fingerprints)
## [1] 165 388
(c) Split the data into a training and a test set, pre-process the data, and tune a PLS model. How many latent variables are optimal and what is the corresponding resampled estimate of R2?
set.seed(1234)
CDP <- createDataPartition(permeability, p=0.8, list=FALSE)

trainperm <- permeability[CDP, ]
trainfp <- fingerprints[CDP, ]
testperm <- permeability[-CDP, ]
testfp <- fingerprints[-CDP, ]

ctrl <- trainControl(method = "cv", number = 10)

plsq62 <- train(trainfp, trainperm, method = "pls", metric = "Rsquared",
             tuneLength = 10, trControl = ctrl, preProc = c( "center","scale"))
summary(plsq62)
## Data:    X dimension: 133 388 
##  Y dimension: 133 1
## Fit method: oscorespls
## Number of components considered: 3
## TRAINING: % variance explained
##           1 comps  2 comps  3 comps
## X           21.80    34.52    40.94
## .outcome    37.19    56.68    63.31
plsq62
## Partial Least Squares 
## 
## 133 samples
## 388 predictors
## 
## Pre-processing: centered (388), scaled (388) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 119, 119, 120, 120, 120, 121, ... 
## Resampling results across tuning parameters:
## 
##   ncomp  RMSE      Rsquared   MAE     
##    1     12.76548  0.3949848  9.782056
##    2     11.08419  0.5662412  7.827891
##    3     11.02409  0.5704308  8.285652
##    4     11.16727  0.5556966  8.423255
##    5     11.09218  0.5573860  8.252132
##    6     10.96320  0.5653475  8.090110
##    7     11.06634  0.5592164  8.455477
##    8     11.21343  0.5559115  8.601945
##    9     11.24153  0.5635851  8.559953
##   10     11.63386  0.5463035  8.841897
## 
## Rsquared was used to select the optimal model using the largest value.
## The final value used for the model was ncomp = 3.
plot(plsq62) 
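Three latent variables are optimal (ncomp = 3), with a corresponding resampled R2 of about 0.57.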

(d) Predict the response for the test set. What is the test set estimate of R2?
predictfp <- predict(plsq62, testfp)

predictfp
##  [1] -0.3081976 18.9261210 -0.7682259 32.8805599 10.3847898  7.7071572
##  [7]  5.3310798  2.8180950  7.1394817  4.6036139  8.2376065 18.6263432
## [13] 18.6166469  1.0238976 17.9175998  2.7033874  5.9829826  8.9856387
## [19] 17.8772989  6.4573699 14.6579928 11.6839201  5.2417549  0.8956647
## [25]  2.5096314 34.2559252 35.6846760 -0.5010380 35.6395026 10.4015385
## [31]  7.3868950  2.4427300
postResample(pred = predictfp, obs = testperm)
##      RMSE  Rsquared       MAE 
## 14.033165  0.138877  9.495030
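The test set estimate of R2 is about 0.14 (RMSE 14.03), well below the resampled estimate of 0.57.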
(e) Try building other models discussed in this chapter. Do any have better predictive performance?
enetGrid <- expand.grid(.lambda = c(0, 0.01, .1), .fraction = seq(.05, 1, length = 20)) #p.136
enetTune <- train(trainfp, trainperm, method = "enet",
                   tuneGrid = enetGrid,
                   trControl = ctrl,
                   preProc = c("center", "scale"))
## Warning: model fit failed for Fold02: lambda=0.00, fraction=1 Error in if (zmin < gamhat) { : missing value where TRUE/FALSE needed
## Warning: model fit failed for Fold03: lambda=0.00, fraction=1 Error in if (zmin < gamhat) { : missing value where TRUE/FALSE needed
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
enetpred <- predict(enetTune,newdata=testfp)
postResample(pred = enetpred, obs = testperm)
##       RMSE   Rsquared        MAE 
## 12.1507997  0.2489824  8.8411062
set.seed(100)
lmFit1 <- train(trainfp,trainperm, method = "lm", trControl = ctrl)
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

(The warning above is repeated for each of the 10 cross-validation folds.)
lmFit1
## Linear Regression 
## 
## 133 samples
## 388 predictors
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 120, 120, 119, 121, 120, 119, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   23.22215  0.2238306  15.71695
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE
(f) Would you recommend any of your models to replace the permeability laboratory experiment?

The elastic net (enet) model, since it has the highest R2 and lowest RMSE of the lm, enet, and PLS models.
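For a side-by-side view, the test-set metrics for the three models can be collected into one table. A minimal sketch reusing the predictfp and enetpred objects from above; lmpred is introduced here only for illustration (predicting with lmFit1 will again emit rank-deficiency warnings):

# Test-set RMSE / Rsquared / MAE, one row per model
lmpred <- predict(lmFit1, newdata = testfp)
rbind(pls  = postResample(pred = predictfp, obs = testperm),
      enet = postResample(pred = enetpred, obs = testperm),
      lm   = postResample(pred = lmpred, obs = testperm))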

6.3. A chemical manufacturing process for a pharmaceutical product was discussed in Sect. 1.4. In this problem, the objective is to understand the relationship between biological measurements of the raw materials (predictors), measurements of the manufacturing process (predictors), and the response of product yield. Biological predictors cannot be changed but can be used to assess the quality of the raw material before processing. On the other hand, manufacturing process predictors can be changed in the manufacturing process. Improving product yield by 1 % will boost revenue by approximately one hundred thousand dollars per batch: (a) Start R and use these commands to load the data:

library(AppliedPredictiveModeling)
data(ChemicalManufacturingProcess)

The matrix processPredictors contains the 57 predictors (12 describing the input biological material and 45 describing the process predictors) for the 176 manufacturing runs. yield contains the percent yield for each run.

(b) A small percentage of cells in the predictor set contain missing values. Use an imputation function to fill in these missing values (e.g., see Sect. 3.8).
fillmissing <- preProcess(ChemicalManufacturingProcess, method = "medianImpute")
chem <- predict(fillmissing, ChemicalManufacturingProcess)
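As a quick check (not part of the original output), the imputation can be verified by counting missing cells before and after:

sum(is.na(ChemicalManufacturingProcess))  # the raw data contains some missing cells
sum(is.na(chem))                          # should be 0 after median imputation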
(c) Split the data into a training and a test set, pre-process the data, and tune a model of your choice from this chapter. What is the optimal value of the performance metric?
chem <- chem[, -nearZeroVar(chem)]

set.seed(2345)
CDP2 <- createDataPartition(chem$Yield, p=0.8, list=FALSE)

chemperm <- chem[CDP2, ]
testperm <- chem[-CDP2, ]
set.seed(1)

plsTune <- train(Yield ~ ., data = chem, method = "pls", 
             tuneLength = 20, trControl = ctrl, preProc = c("center", "scale"))

plot(plsTune) 

set.seed(2)

enetTune <- train(Yield ~ ., data = chem, method = "enet", 
                  tuneGrid = enetGrid, trControl = ctrl, preProc = c("center", "scale"))

plot(enetTune)

set.seed(3)
lmFit2 <- lm(Yield ~ ., data = chem)

plot(lmFit2)

## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced

## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced

(d) Predict the response for the test set. What is the value of the performance metric and how does this compare with the resampled performance metric on the training set?

The elastic net (enet) model performs best on the test set, since its R2 is the highest of the models tried.

plspred <- predict(plsTune,testperm[ ,-1])
postResample(plspred,testperm[ ,1])
##      RMSE  Rsquared       MAE 
## 1.0735972 0.6641533 0.8544773
enetpred <- predict(enetTune,testperm[ ,-1])
postResample(enetpred,testperm[ ,1])
##      RMSE  Rsquared       MAE 
## 1.0163292 0.7121069 0.7974288
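To see how these test-set values compare with the resampled performance on the training data, the 10-fold CV estimates at the chosen tuning parameters can be read straight off the train objects with caret's getTrainPerf (a sketch; output not shown here):

getTrainPerf(plsTune)   # resampled RMSE, Rsquared and MAE at the selected ncomp
getTrainPerf(enetTune)  # resampled RMSE, Rsquared and MAE at the selected lambda/fraction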
summary(lmFit2)
## 
## Call:
## lm(formula = Yield ~ ., data = chem)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.04308 -0.52815  0.00162  0.48679  2.08807 
## 
## Coefficients: (1 not defined because of singularities)
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            -1.553e+01  8.765e+01  -0.177  0.85962    
## BiologicalMaterial01    1.582e-01  3.298e-01   0.480  0.63228    
## BiologicalMaterial02   -1.232e-01  1.297e-01  -0.950  0.34403    
## BiologicalMaterial03    1.524e-01  2.382e-01   0.640  0.52346    
## BiologicalMaterial04   -1.739e-02  5.154e-01  -0.034  0.97314    
## BiologicalMaterial05    1.542e-01  1.064e-01   1.450  0.14971    
## BiologicalMaterial06    3.254e-03  3.047e-01   0.011  0.99150    
## BiologicalMaterial08    3.858e-01  6.368e-01   0.606  0.54582    
## BiologicalMaterial09   -5.469e-01  1.361e+00  -0.402  0.68864    
## BiologicalMaterial10   -8.237e-02  1.350e+00  -0.061  0.95146    
## BiologicalMaterial11   -9.334e-02  8.297e-02  -1.125  0.26283    
## BiologicalMaterial12    3.648e-01  6.441e-01   0.566  0.57218    
## ManufacturingProcess01  4.514e-02  9.816e-02   0.460  0.64642    
## ManufacturingProcess02  3.501e-03  3.998e-02   0.088  0.93038    
## ManufacturingProcess03 -4.136e+00  5.184e+00  -0.798  0.42660    
## ManufacturingProcess04  6.882e-02  2.974e-02   2.314  0.02237 *  
## ManufacturingProcess05  2.348e-04  3.901e-03   0.060  0.95210    
## ManufacturingProcess06  2.991e-02  4.384e-02   0.682  0.49639    
## ManufacturingProcess07 -1.671e-01  2.150e-01  -0.777  0.43853    
## ManufacturingProcess08 -6.274e-02  2.554e-01  -0.246  0.80637    
## ManufacturingProcess09  2.281e-01  1.832e-01   1.245  0.21543    
## ManufacturingProcess10 -5.414e-02  5.460e-01  -0.099  0.92119    
## ManufacturingProcess11  1.561e-01  6.761e-01   0.231  0.81776    
## ManufacturingProcess12  5.877e-05  1.003e-04   0.586  0.55895    
## ManufacturingProcess13 -3.266e-01  3.878e-01  -0.842  0.40136    
## ManufacturingProcess14  3.284e-03  1.128e-02   0.291  0.77138    
## ManufacturingProcess15 -1.748e-04  9.490e-03  -0.018  0.98534    
## ManufacturingProcess16  1.296e-04  3.910e-04   0.332  0.74081    
## ManufacturingProcess17 -1.171e-01  3.021e-01  -0.388  0.69887    
## ManufacturingProcess18  3.769e-03  4.521e-03   0.834  0.40604    
## ManufacturingProcess19 -4.363e-04  7.791e-03  -0.056  0.95543    
## ManufacturingProcess20 -3.912e-03  4.776e-03  -0.819  0.41442    
## ManufacturingProcess21         NA         NA      NA       NA    
## ManufacturingProcess22 -1.099e-02  4.272e-02  -0.257  0.79738    
## ManufacturingProcess23 -3.669e-02  8.408e-02  -0.436  0.66334    
## ManufacturingProcess24 -1.645e-02  2.352e-02  -0.699  0.48572    
## ManufacturingProcess25 -7.842e-03  1.379e-02  -0.569  0.57055    
## ManufacturingProcess26  8.298e-03  1.051e-02   0.790  0.43116    
## ManufacturingProcess27 -7.932e-03  7.811e-03  -1.016  0.31190    
## ManufacturingProcess28 -7.918e-02  3.062e-02  -2.586  0.01092 *  
## ManufacturingProcess29  1.194e+00  8.665e-01   1.378  0.17073    
## ManufacturingProcess30 -2.107e-01  5.625e-01  -0.374  0.70870    
## ManufacturingProcess31  4.528e-02  1.218e-01   0.372  0.71065    
## ManufacturingProcess32  2.960e-01  6.270e-02   4.720 6.44e-06 ***
## ManufacturingProcess33 -3.456e-01  1.241e-01  -2.785  0.00621 ** 
## ManufacturingProcess34 -3.704e-01  2.723e+00  -0.136  0.89201    
## ManufacturingProcess35 -1.144e-02  1.701e-02  -0.673  0.50252    
## ManufacturingProcess36  1.356e+02  2.976e+02   0.456  0.64942    
## ManufacturingProcess37 -5.871e-01  2.905e-01  -2.021  0.04553 *  
## ManufacturingProcess38 -1.966e-01  2.435e-01  -0.807  0.42110    
## ManufacturingProcess39  5.246e-02  1.321e-01   0.397  0.69197    
## ManufacturingProcess40  1.027e+00  6.597e+00   0.156  0.87652    
## ManufacturingProcess41 -2.164e-01  4.761e+00  -0.045  0.96382    
## ManufacturingProcess42  2.571e-02  2.159e-01   0.119  0.90541    
## ManufacturingProcess43  2.054e-01  1.208e-01   1.700  0.09169 .  
## ManufacturingProcess44 -4.387e-01  1.211e+00  -0.362  0.71768    
## ManufacturingProcess45  8.946e-01  5.497e-01   1.628  0.10623    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.051 on 120 degrees of freedom
## Multiple R-squared:  0.7777, Adjusted R-squared:  0.6758 
## F-statistic: 7.633 on 55 and 120 DF,  p-value: < 2.2e-16
(e) Which predictors are most important in the model you have trained? Do either the biological or process predictors dominate the list?
varImp(enetTune)
## loess r-squared variable importance
## 
##   only 20 most important variables shown (out of 56)
## 
##                        Overall
## ManufacturingProcess32  100.00
## ManufacturingProcess13   90.03
## BiologicalMaterial06     84.56
## ManufacturingProcess17   74.89
## ManufacturingProcess36   74.50
## BiologicalMaterial03     73.54
## ManufacturingProcess09   70.37
## BiologicalMaterial12     67.98
## BiologicalMaterial02     65.33
## ManufacturingProcess31   61.69
## ManufacturingProcess06   57.59
## BiologicalMaterial11     48.12
## ManufacturingProcess33   48.05
## BiologicalMaterial04     47.13
## BiologicalMaterial08     41.89
## BiologicalMaterial01     39.15
## ManufacturingProcess12   33.34
## BiologicalMaterial09     32.43
## ManufacturingProcess11   28.43
## ManufacturingProcess25   23.09
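The manufacturing process predictors dominate the list: 11 of the top 20 (including the two most important, ManufacturingProcess32 and ManufacturingProcess13) are process variables, against 9 biological material variables. They can be tallied directly from the varImp object; a minimal sketch (imp and top20 are names introduced here for illustration):

# Count process vs. biological predictors among the 20 most important variables
imp <- varImp(enetTune)$importance
top20 <- rownames(imp)[order(imp$Overall, decreasing = TRUE)][1:20]
table(ifelse(grepl("^Manufacturing", top20), "Process", "Biological"))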