#install.packages("AppliedPredictiveModeling")
library(AppliedPredictiveModeling)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(mice)
## 
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
## 
##     filter
## The following objects are masked from 'package:base':
## 
##     cbind, rbind

Introduction

In this homework assignment I will be submitting exercises 6.2 and 6.3 from the Kuhn and Johnson Applied Predictive Modeling Book.

Question 6.2

Developing a model to predict permeability (see Sect.1.4) could save significant resources for a pharmaceutical company, while at the same time more rapidly identifying molecules that have a sufficient permeability to become a drug:

A.

Start R and use these commands to load the data: > library(AppliedPredictiveModeling) > data(permeability) The matrix fingerprints contains the 1,107 binary molecular predic tors for the 16

data(permeability)

B.

The fingerprint predictors indicate the presence or absence of substructures of a molecule and are often sparse meaning that relatively few of the molecules contain each substructure. Filter out the predictors that have low frequencies using the nearZeroVar function from the caret package. How many predictors are left for modeling?

nzv <- nearZeroVar(fingerprints)
remaining_predictors <- fingerprints[,-nzv]
ncol(remaining_predictors)
## [1] 388

There are 719 predictors with near zero variance and that leaves us with 388 predictors for analysis.

C.

Split the data into a training and a test set, preprocess the data, and tune a partial least squares model. How many latent variables are optimal and what is the corresponding resampled estimate of R2?

# Splitting data into training and test sets
set.seed(123)
split <- createDataPartition(permeability, p = 0.75, list = FALSE)

train_fingerprints <- remaining_predictors[split,]
train_permeability <- permeability[split,]

test_fingerprints <- remaining_predictors[-split,]
test_permeability <- permeability[-split,]

# Tuning partial least squares model
ctrl <- trainControl(method = "cv")

plsTune <- train(x = train_fingerprints, 
                 y = train_permeability,
                 method = "pls",
                 metric='Rsquared',
                 tuneLength = 20,
                 trControl = ctrl,
                 preProcess=c('nzv','center', 'scale')
                 )
plsTune
## Partial Least Squares 
## 
## 125 samples
## 388 predictors
## 
## Pre-processing: centered (365), scaled (365), remove (23) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 113, 113, 112, 111, 112, 113, ... 
## Resampling results across tuning parameters:
## 
##   ncomp  RMSE      Rsquared   MAE      
##    1     13.21619  0.3605993  10.216130
##    2     11.71133  0.4794051   8.394055
##    3     11.54039  0.5000623   8.825300
##    4     11.70562  0.5007565   9.254354
##    5     11.30838  0.5260096   8.577434
##    6     11.56847  0.5042516   8.575659
##    7     11.59341  0.5195490   8.691616
##    8     11.70339  0.5116854   8.853431
##    9     11.90364  0.5102987   8.945206
##   10     12.22900  0.4950605   9.139799
##   11     12.57732  0.4694433   9.238885
##   12     12.77770  0.4684327   9.305484
##   13     12.86567  0.4649233   9.350928
##   14     12.86467  0.4613197   9.423307
##   15     13.05438  0.4467381   9.583466
##   16     13.19220  0.4319748   9.552599
##   17     13.35761  0.4249477   9.701367
##   18     13.37855  0.4230528   9.787294
##   19     13.44356  0.4164976   9.882355
##   20     13.54368  0.4092856  10.020209
## 
## Rsquared was used to select the optimal model using the largest value.
## The final value used for the model was ncomp = 5.
plot(plsTune)

The partial least squares shows that the optimal number of variables that maximizes R squared is 5. The estimated R squared is 0.5260096

D.

Predict the response for the test set. What is the test set estimate of R2?

prediction <- predict(object = plsTune, newdata = test_fingerprints)

stats_test <- postResample(pred =  prediction, obs = test_permeability)
stats_test
##       RMSE   Rsquared        MAE 
## 11.4429972  0.3845902  7.5284816

For our predictions with plsTune compared to the observed values in test_permeability, our Rsquared is relatively low showing at .32, this shows that the majority of variability observed is not accounted for in our model. # E. Try building other models discussed in this chapter. Do any have better predictive performance?

I will try to use a penalizing ridge-regression model

ridgeGrid <- expand.grid(.lambda = c(0, 0.01, .1), .fraction = seq(.05, 1, length = 20))

ridge_fit <- train(train_fingerprints,
                   train_permeability, 
                   method="enet",
                   tuneGrid = ridgeGrid,
                   trControl = ctrl,
                   preProcess=c('nzv','center', 'scale'))
## Warning: model fit failed for Fold01: lambda=0.00, fraction=1 Error in if (zmin < gamhat) { : missing value where TRUE/FALSE needed
## Warning: model fit failed for Fold04: lambda=0.00, fraction=1 Error in if (zmin < gamhat) { : missing value where TRUE/FALSE needed
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
ridge_pred <- predict(ridge_fit, newdata = test_fingerprints)

postResample(pred = ridge_pred, obs = test_permeability)
##       RMSE   Rsquared        MAE 
## 10.7406158  0.4617487  7.1390818

I did not get a better predictive performance using the ridge regression model. the Rsquared value was 0.305798, it was worse than our pls model.

F.

Would you recommend any of your models to replace the permeability laboratory experiment?

I would not recommend any of the models above to replace the laboratory experiment. Based on the R squared values, both the models above only account for 32 or less percent of the variability seen in the data. 32% is not sufficient enough, whereas if we had seen an R squared value of 90+ we can consider the model to replace laboratory experiement.

Question 6.3

A chemical manufacturing process for a pharmaceutical product was discussed in Sect.1.4. In this problem, the objective is to understand the relationship between biological measurements of the raw materials(predictors),6.5 Computing 139 measurements of the manufacturing process (predictors), and the response of product yield. Biological predictors cannot be changed but can be used to assess the quality of the raw material before processing. On the other hand, manufacturing process predictors can be changed in the manufacturing process. Improving product yield by 1% will boost revenue by approximately one hundred thousand dollars per batch:

A.

Start R and use these commands to load the data: > library(AppliedPredictiveModeling) > data(chemicalManufacturingProcess) The matrix processPredictors contains the 57 predictors (12 describing the input biological material and 45 describing the process predictors) for the 176 manufacturing runs. yield contains the percent yield for each run.

data("ChemicalManufacturingProcess")
head(ChemicalManufacturingProcess)
##   Yield BiologicalMaterial01 BiologicalMaterial02 BiologicalMaterial03
## 1 38.00                 6.25                49.58                56.97
## 2 42.44                 8.01                60.97                67.48
## 3 42.03                 8.01                60.97                67.48
## 4 41.42                 8.01                60.97                67.48
## 5 42.49                 7.47                63.33                72.25
## 6 43.57                 6.12                58.36                65.31
##   BiologicalMaterial04 BiologicalMaterial05 BiologicalMaterial06
## 1                12.74                19.51                43.73
## 2                14.65                19.36                53.14
## 3                14.65                19.36                53.14
## 4                14.65                19.36                53.14
## 5                14.02                17.91                54.66
## 6                15.17                21.79                51.23
##   BiologicalMaterial07 BiologicalMaterial08 BiologicalMaterial09
## 1                  100                16.66                11.44
## 2                  100                19.04                12.55
## 3                  100                19.04                12.55
## 4                  100                19.04                12.55
## 5                  100                18.22                12.80
## 6                  100                18.30                12.13
##   BiologicalMaterial10 BiologicalMaterial11 BiologicalMaterial12
## 1                 3.46               138.09                18.83
## 2                 3.46               153.67                21.05
## 3                 3.46               153.67                21.05
## 4                 3.46               153.67                21.05
## 5                 3.05               147.61                21.05
## 6                 3.78               151.88                20.76
##   ManufacturingProcess01 ManufacturingProcess02 ManufacturingProcess03
## 1                     NA                     NA                     NA
## 2                    0.0                      0                     NA
## 3                    0.0                      0                     NA
## 4                    0.0                      0                     NA
## 5                   10.7                      0                     NA
## 6                   12.0                      0                     NA
##   ManufacturingProcess04 ManufacturingProcess05 ManufacturingProcess06
## 1                     NA                     NA                     NA
## 2                    917                 1032.2                  210.0
## 3                    912                 1003.6                  207.1
## 4                    911                 1014.6                  213.3
## 5                    918                 1027.5                  205.7
## 6                    924                 1016.8                  208.9
##   ManufacturingProcess07 ManufacturingProcess08 ManufacturingProcess09
## 1                     NA                     NA                  43.00
## 2                    177                    178                  46.57
## 3                    178                    178                  45.07
## 4                    177                    177                  44.92
## 5                    178                    178                  44.96
## 6                    178                    178                  45.32
##   ManufacturingProcess10 ManufacturingProcess11 ManufacturingProcess12
## 1                     NA                     NA                     NA
## 2                     NA                     NA                      0
## 3                     NA                     NA                      0
## 4                     NA                     NA                      0
## 5                     NA                     NA                      0
## 6                     NA                     NA                      0
##   ManufacturingProcess13 ManufacturingProcess14 ManufacturingProcess15
## 1                   35.5                   4898                   6108
## 2                   34.0                   4869                   6095
## 3                   34.8                   4878                   6087
## 4                   34.8                   4897                   6102
## 5                   34.6                   4992                   6233
## 6                   34.0                   4985                   6222
##   ManufacturingProcess16 ManufacturingProcess17 ManufacturingProcess18
## 1                   4682                   35.5                   4865
## 2                   4617                   34.0                   4867
## 3                   4617                   34.8                   4877
## 4                   4635                   34.8                   4872
## 5                   4733                   33.9                   4886
## 6                   4786                   33.4                   4862
##   ManufacturingProcess19 ManufacturingProcess20 ManufacturingProcess21
## 1                   6049                   4665                    0.0
## 2                   6097                   4621                    0.0
## 3                   6078                   4621                    0.0
## 4                   6073                   4611                    0.0
## 5                   6102                   4659                   -0.7
## 6                   6115                   4696                   -0.6
##   ManufacturingProcess22 ManufacturingProcess23 ManufacturingProcess24
## 1                     NA                     NA                     NA
## 2                      3                      0                      3
## 3                      4                      1                      4
## 4                      5                      2                      5
## 5                      8                      4                     18
## 6                      9                      1                      1
##   ManufacturingProcess25 ManufacturingProcess26 ManufacturingProcess27
## 1                   4873                   6074                   4685
## 2                   4869                   6107                   4630
## 3                   4897                   6116                   4637
## 4                   4892                   6111                   4630
## 5                   4930                   6151                   4684
## 6                   4871                   6128                   4687
##   ManufacturingProcess28 ManufacturingProcess29 ManufacturingProcess30
## 1                   10.7                   21.0                    9.9
## 2                   11.2                   21.4                    9.9
## 3                   11.1                   21.3                    9.4
## 4                   11.1                   21.3                    9.4
## 5                   11.3                   21.6                    9.0
## 6                   11.4                   21.7                   10.1
##   ManufacturingProcess31 ManufacturingProcess32 ManufacturingProcess33
## 1                   69.1                    156                     66
## 2                   68.7                    169                     66
## 3                   69.3                    173                     66
## 4                   69.3                    171                     68
## 5                   69.4                    171                     70
## 6                   68.2                    173                     70
##   ManufacturingProcess34 ManufacturingProcess35 ManufacturingProcess36
## 1                    2.4                    486                  0.019
## 2                    2.6                    508                  0.019
## 3                    2.6                    509                  0.018
## 4                    2.5                    496                  0.018
## 5                    2.5                    468                  0.017
## 6                    2.5                    490                  0.018
##   ManufacturingProcess37 ManufacturingProcess38 ManufacturingProcess39
## 1                    0.5                      3                    7.2
## 2                    2.0                      2                    7.2
## 3                    0.7                      2                    7.2
## 4                    1.2                      2                    7.2
## 5                    0.2                      2                    7.3
## 6                    0.4                      2                    7.2
##   ManufacturingProcess40 ManufacturingProcess41 ManufacturingProcess42
## 1                     NA                     NA                   11.6
## 2                    0.1                   0.15                   11.1
## 3                    0.0                   0.00                   12.0
## 4                    0.0                   0.00                   10.6
## 5                    0.0                   0.00                   11.0
## 6                    0.0                   0.00                   11.5
##   ManufacturingProcess43 ManufacturingProcess44 ManufacturingProcess45
## 1                    3.0                    1.8                    2.4
## 2                    0.9                    1.9                    2.2
## 3                    1.0                    1.8                    2.3
## 4                    1.1                    1.8                    2.1
## 5                    1.1                    1.7                    2.1
## 6                    2.2                    1.8                    2.0

B.

A small percentage of cells in the predictor set contain missing values. Use an imputation function to fill in these missing values (e.g., see Sect.3.8).

Imputing with MICE

chemical_imputed <- mice(data = ChemicalManufacturingProcess,m=5,maxit=40,meth='pmm',seed=500,printFlag = FALSE)
## Warning: Number of logged events: 5400
data_imputed <- complete(chemical_imputed,1)

C.

Split the data into a training and a test set, pre-process the data, and tune a model of your choice from this chapter. What is the optimal value of the performance metric?

# Removing near zero variance predictors

nzv <- nearZeroVar(data_imputed)
remaining_predictors <- data_imputed[,-nzv]
ncol(remaining_predictors)
## [1] 57

Biological Material 07 removed for near zero variance

# Removing highly correlated predictors

high_correlation <- findCorrelation(cor(remaining_predictors[,2:57]), cutoff = .9, exact = TRUE)
length(high_correlation)
## [1] 10

There are 10 highly correlated predictors (>0.90) I will be removing

filtered_remaining <- remaining_predictors[,-high_correlation]
ncol(filtered_remaining)
## [1] 47
split <- createDataPartition(filtered_remaining$Yield, p = 0.75, list = FALSE)

train.p <- filtered_remaining[split, -1]
train.y <- filtered_remaining[split, 1]

test.p <- filtered_remaining[-split, -1]
test.y <- filtered_remaining[-split, 1]

p_pro <- c("nzv", "center","scale")
tr_ctrl =trainControl(method = "boot", number=5)

pls_fit <-train(train.p, train.y, method="pls", tuneLength = 20, preProcess=p_pro,trControl=tr_ctrl)

pls_fit
## Partial Least Squares 
## 
## 132 samples
##  46 predictor
## 
## Pre-processing: centered (46), scaled (46) 
## Resampling: Bootstrapped (5 reps) 
## Summary of sample sizes: 132, 132, 132, 132, 132 
## Resampling results across tuning parameters:
## 
##   ncomp  RMSE      Rsquared   MAE     
##    1     1.723519  0.3542672  1.153243
##    2     2.497541  0.3521729  1.185811
##    3     2.148459  0.4268663  1.118852
##    4     2.106323  0.4298076  1.126935
##    5     2.135111  0.3945931  1.161670
##    6     2.206572  0.3669931  1.193088
##    7     2.156421  0.3488718  1.201320
##    8     2.430114  0.2979331  1.236656
##    9     2.661789  0.2718286  1.279640
##   10     2.827014  0.2685592  1.314875
##   11     3.027437  0.2481485  1.380082
##   12     3.182947  0.2336435  1.443144
##   13     3.180484  0.2464016  1.434462
##   14     3.191236  0.2260686  1.446116
##   15     3.338413  0.1686651  1.480003
##   16     3.527641  0.1502366  1.507497
##   17     3.812876  0.1618443  1.543920
##   18     4.401968  0.1647776  1.628589
##   19     5.052953  0.1515627  1.735406
##   20     5.871303  0.1863467  1.849039
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 1.
plot(pls_fit)

D.

Predict the response for the test set. What is the value of the performance metric and how does this compare with the resampled performance metric on the training set?

pls_predict <- predict(pls_fit, test.p)
postResample(pred = pls_predict, obs = test.y)
##      RMSE  Rsquared       MAE 
## 1.5735779 0.3935452 1.2471327

The R squared found was 0.12. Our pls model does not account for much of the variablity in the yield seen in the data.

E.

Which predictors are most important in the model you have trained? Do either the biological or process predictors dominate the list?

# using varImp function

impv <- varImp(pls_fit)
## 
## Attaching package: 'pls'
## The following object is masked from 'package:caret':
## 
##     R2
## The following object is masked from 'package:stats':
## 
##     loadings
impv
## pls variable importance
## 
##   only 20 most important variables shown (out of 46)
## 
##                        Overall
## ManufacturingProcess32  100.00
## ManufacturingProcess09   91.46
## ManufacturingProcess36   87.56
## ManufacturingProcess13   84.83
## BiologicalMaterial02     81.88
## BiologicalMaterial06     79.55
## ManufacturingProcess06   70.57
## ManufacturingProcess11   64.77
## BiologicalMaterial04     64.59
## ManufacturingProcess33   64.42
## BiologicalMaterial08     63.70
## BiologicalMaterial12     60.05
## ManufacturingProcess29   54.45
## ManufacturingProcess31   47.36
## ManufacturingProcess12   44.07
## ManufacturingProcess10   38.59
## ManufacturingProcess04   35.66
## ManufacturingProcess35   35.55
## ManufacturingProcess34   35.39
## BiologicalMaterial10     34.76

The manufacturing processes dominate the list. The top four most important predictors are all manufacturing processes.

F.

Explore the relationships between each of the top predictors and the response. How could this information be helpful in improving the yield in future runs of the manufacturing process?

toppredictors <- filtered_remaining[,c("ManufacturingProcess32", "ManufacturingProcess36", "ManufacturingProcess13","ManufacturingProcess09","BiologicalMaterial02","BiologicalMaterial06")]

target_column <- filtered_remaining$Yield
head(toppredictors)
##   ManufacturingProcess32 ManufacturingProcess36 ManufacturingProcess13
## 1                    156                  0.019                   35.5
## 2                    169                  0.019                   34.0
## 3                    173                  0.018                   34.8
## 4                    171                  0.018                   34.8
## 5                    171                  0.017                   34.6
## 6                    173                  0.018                   34.0
##   ManufacturingProcess09 BiologicalMaterial02 BiologicalMaterial06
## 1                  43.00                49.58                43.73
## 2                  46.57                60.97                53.14
## 3                  45.07                60.97                53.14
## 4                  44.92                60.97                53.14
## 5                  44.96                63.33                54.66
## 6                  45.32                58.36                51.23
correlations <- sapply(toppredictors, function(column) cor(target_column, column))
print(correlations)
## ManufacturingProcess32 ManufacturingProcess36 ManufacturingProcess13 
##              0.6083321             -0.5179827             -0.5036797 
## ManufacturingProcess09   BiologicalMaterial02   BiologicalMaterial06 
##              0.5034705              0.4815158              0.4781634

Exploring the correlations of the top predictors versus the yeild shows us that process 36 and 13 have strong negative correlations, and so to improve the yield these processes should be avoided where it can be. The other processes and materials listed in the top predictors should all be maximized in replacement of other less efficient processes or materials.