Load Packages

library(AppliedPredictiveModeling)
library(caret)
library(corrplot)
library(elasticnet)
library(penalized)
library(pls)
library(grplasso)
library(lars)
library(MASS)
library(stats)
library(tidyverse)
library(VIM)

Exercise 2

Developing a model to predict permeability could save significant resources for a pharmaceutical company, while at the same time more rapidly identifying molecules that have a sufficient permeability to become a drug:

Start R and use these commands to load the data:

data(permeability)

The matrix fingerprints contains the 1,107 binary molecular predictors for the 165 compounds, while permeability contains permeability response.

The fingerprint predictors indicate the presence or absence of substructures of a molecule and are often sparse meaning that relatively few of the molecules contain each substructure. Filter out the predictors that have low frequencies using the nearZeroVar function from the caret package. How many predictors are left for modeling?

str(fingerprints)
##  num [1:165, 1:1107] 0 0 0 0 0 0 0 0 0 0 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:165] "1" "2" "3" "4" ...
##   ..$ : chr [1:1107] "X1" "X2" "X3" "X4" ...
low_freq_preds <- nearZeroVar(fingerprints)
low_freq_preds
##   [1]    7    8    9   10   13   14   17   18   19   22   23   24   30   31   32
##  [16]   33   34   45   77   81   82   83   84   85   89   90   91   92   95  100
##  [31]  104  105  106  107  109  110  112  113  114  115  116  117  119  120  122
##  [46]  123  124  128  131  132  134  135  136  137  139  140  144  145  147  148
##  [61]  149  151  155  160  161  164  165  166  216  217  218  219  220  222  243
##  [76]  252  259  273  275  277  282  283  287  288  289  292  346  347  348  349
##  [91]  350  351  352  353  354  363  364  365  369  375  379  384  391  393  397
## [106]  399  402  404  405  407  408  409  410  411  412  413  414  415  416  417
## [121]  418  419  420  421  422  423  424  425  426  427  428  429  430  431  432
## [136]  433  434  435  436  437  438  439  440  441  442  443  444  445  446  447
## [151]  448  449  450  451  452  453  454  455  456  457  458  459  460  461  462
## [166]  463  464  465  466  467  468  469  470  471  472  473  474  475  476  477
## [181]  478  479  480  481  482  483  484  485  486  487  488  489  490  491  492
## [196]  493  494  495  498  500  501  502  513  523  525  526  527  528  530  531
## [211]  532  533  534  535  536  537  538  539  540  541  542  543  544  545  546
## [226]  547  548  550  552  555  562  563  564  566  567  569  570  572  575  578
## [241]  579  580  581  582  583  584  585  586  587  588  589  596  605  606  607
## [256]  608  609  610  611  612  614  615  616  617  618  619  620  622  623  624
## [271]  625  626  627  628  629  630  631  632  633  634  635  636  637  638  639
## [286]  640  641  642  643  644  645  646  647  648  649  650  651  652  653  654
## [301]  655  656  657  658  659  660  661  662  663  664  665  666  667  668  669
## [316]  670  671  672  673  674  675  676  677  678  680  681  682  683  684  685
## [331]  686  687  688  689  690  691  692  693  694  695  696  697  706  707  708
## [346]  709  710  711  712  713  714  715  716  717  718  720  721  722  723  724
## [361]  725  726  727  728  729  730  731  734  735  736  737  738  739  740  741
## [376]  742  743  744  745  746  747  748  749  756  757  758  759  760  761  762
## [391]  763  764  765  766  767  768  769  770  771  772  777  778  779  781  783
## [406]  784  785  786  787  788  789  790  791  794  796  797  799  802  803  804
## [421]  807  808  809  810  811  814  815  816  817  818  819  820  821  822  823
## [436]  824  825  826  827  828  829  830  831  832  833  834  835  836  837  838
## [451]  839  840  841  842  843  844  845  846  847  848  849  850  851  852  853
## [466]  854  855  856  857  858  859  860  861  862  863  864  865  866  867  868
## [481]  869  870  871  872  873  874  875  876  877  878  879  880  881  882  883
## [496]  884  885  886  887  888  889  890  891  892  893  894  895  896  897  898
## [511]  899  900  901  902  903  904  905  906  907  908  909  910  911  912  913
## [526]  914  915  916  917  918  919  920  921  922  923  924  925  926  927  928
## [541]  929  930  931  932  933  934  935  936  937  938  939  940  941  942  943
## [556]  944  945  946  947  948  949  950  951  952  953  954  955  956  957  958
## [571]  959  960  961  962  963  964  965  966  967  968  969  970  971  972  973
## [586]  974  975  976  977  978  979  980  981  982  983  984  985  986  987  988
## [601]  989  990  991  992  993  994  995  996  997  998  999 1000 1001 1002 1003
## [616] 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018
## [631] 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033
## [646] 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048
## [661] 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063
## [676] 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078
## [691] 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093
## [706] 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107
length(low_freq_preds)
## [1] 719
fp_filtered <- fingerprints[,-low_freq_preds]
str(fp_filtered)
##  num [1:165, 1:388] 0 0 0 0 0 0 0 0 0 0 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:165] "1" "2" "3" "4" ...
##   ..$ : chr [1:388] "X1" "X2" "X3" "X4" ...

As can be seen from the results above, 719 of the predictors had low frequencies and were thus filtered out. Leaving us with 388 predictors for modeling.

Split the data into a training and a test set, pre-process the data, and tune a PLS model. How many latent variables are optimal and what is the corresponding resampled estimate of R^2?

set.seed(71)
split <- permeability %>%
  createDataPartition(p = 0.8, list = FALSE, times = 1)

Xtrain.data  <- fp_filtered[split, ]
xtest.data <- fp_filtered[-split, ]


Ytrain.data  <- permeability[split, ]
ytest.data <- permeability[-split, ]

ctrl <- trainControl(method = "cv", number = 10)
pls_model <- train(x = Xtrain.data,
                   y = Ytrain.data,
                   method = "pls",
                   tuneLength = 20,
                   trControl = ctrl,
                   preProc = c("center", "scale"))
pls_model
## Partial Least Squares 
## 
## 133 samples
## 388 predictors
## 
## Pre-processing: centered (388), scaled (388) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 120, 119, 120, 120, 120, 120, ... 
## Resampling results across tuning parameters:
## 
##   ncomp  RMSE      Rsquared   MAE      
##    1     13.40963  0.3566085  10.300982
##    2     12.03158  0.4983520   8.531048
##    3     11.98115  0.5306144   8.919622
##    4     11.77743  0.5214277   8.807726
##    5     11.56287  0.5194512   8.436684
##    6     11.30635  0.5199872   8.170254
##    7     11.33887  0.5198030   8.487785
##    8     11.73040  0.5018390   8.799399
##    9     12.06584  0.4859098   9.050962
##   10     12.16738  0.4847391   9.087767
##   11     12.30778  0.4840229   9.145742
##   12     12.73094  0.4729775   9.366431
##   13     12.90488  0.4649557   9.620829
##   14     13.09871  0.4533665   9.788464
##   15     13.36499  0.4410856   9.946836
##   16     13.81829  0.4124786  10.205286
##   17     14.16176  0.3989711  10.373191
##   18     14.27418  0.3914947  10.387093
##   19     14.55635  0.3824931  10.688243
##   20     14.64989  0.3787505  10.834724
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 6.

As can be seen in the results above, anywhere from 3 to 6 components could prove optimal. A number of 3 components gives us our highest value for R^2, 0.5306144, and a number of 6 components leads to our lowest values of RMSE and MAE, 11.30635 and 8.170254, respectively.

Predict the response for the test set. What is the test set estimate of R^2?

pls_prediction <- pls_model %>%
  predict(xtest.data)

cbind(
  RMSE = RMSE(pls_prediction, ytest.data),
  "R^2" = caret::R2(pls_prediction, ytest.data)
)
##          RMSE       R^2
## [1,] 12.94361 0.2213123

The test set estimate of R^2 is 0.2213123.

Try building other models discussed in this chapter. Do any have better predictive performance?

set.seed(72)
linear_model <- train(x = Xtrain.data,
                      y = Ytrain.data,
                      method = "lm",
                      trControl = ctrl)

linear_prediction <- linear_model %>%
  predict(xtest.data)

cbind(RMSE = RMSE(linear_prediction, ytest.data),
      "R^2" = caret::R2(linear_prediction, ytest.data))
##          RMSE        R^2
## [1,] 22.85912 0.06861545
set.seed(73)
ridge_grid <- data.frame(.lambda = seq(0.001, 0.5, length = 25))
ridge_model <- train(x = Xtrain.data,
                     y = Ytrain.data,
                     method = "ridge",
                     metric = "Rsquared",
                     tuneGrid = ridge_grid,
                     trControl = ctrl,
                     preProc = c("center", "scale"))

ridge_prediction <- ridge_model %>%
  predict(xtest.data)

cbind(RMSE = RMSE(ridge_prediction, ytest.data),
      "R^2" = caret::R2(ridge_prediction, ytest.data))
##          RMSE       R^2
## [1,] 13.29316 0.3486511

Above, I tried the linear and ridge models. Between the two of them, the ridge model had a better predictive performance than the pls model with an RMSE of 13.29316 and an R^2 of 0.3486511.

Would you recommend any of your models to replace the permeability laboratory experiment?

Yes, given that the ridge model outperformed the pls model, I would definitely choose that one to replace the permeability laboratory experiment.

Exercise 3

A chemical manufacturing process for a pharmaceutical product was discussed in Sect. 1.4. In this problem, the objective is the understand the relationship between biological measurements of the raw materials (predictors), measurements of the manufacturing process (predictors), and the response of product yield. Biological predictors cannot be changed but can be used to assess the quality of the raw material before processing. On the other hand, manufacturing process predictors can be changed in the manufacturing process. Improving product yield by 1 % will boost revenue by approximately one hundred thousand dollars per batch.

Start R and use these commands to load the data:

data(ChemicalManufacturingProcess)
str(ChemicalManufacturingProcess)
## 'data.frame':    176 obs. of  58 variables:
##  $ Yield                 : num  38 42.4 42 41.4 42.5 ...
##  $ BiologicalMaterial01  : num  6.25 8.01 8.01 8.01 7.47 6.12 7.48 6.94 6.94 6.94 ...
##  $ BiologicalMaterial02  : num  49.6 61 61 61 63.3 ...
##  $ BiologicalMaterial03  : num  57 67.5 67.5 67.5 72.2 ...
##  $ BiologicalMaterial04  : num  12.7 14.6 14.6 14.6 14 ...
##  $ BiologicalMaterial05  : num  19.5 19.4 19.4 19.4 17.9 ...
##  $ BiologicalMaterial06  : num  43.7 53.1 53.1 53.1 54.7 ...
##  $ BiologicalMaterial07  : num  100 100 100 100 100 100 100 100 100 100 ...
##  $ BiologicalMaterial08  : num  16.7 19 19 19 18.2 ...
##  $ BiologicalMaterial09  : num  11.4 12.6 12.6 12.6 12.8 ...
##  $ BiologicalMaterial10  : num  3.46 3.46 3.46 3.46 3.05 3.78 3.04 3.85 3.85 3.85 ...
##  $ BiologicalMaterial11  : num  138 154 154 154 148 ...
##  $ BiologicalMaterial12  : num  18.8 21.1 21.1 21.1 21.1 ...
##  $ ManufacturingProcess01: num  NA 0 0 0 10.7 12 11.5 12 12 12 ...
##  $ ManufacturingProcess02: num  NA 0 0 0 0 0 0 0 0 0 ...
##  $ ManufacturingProcess03: num  NA NA NA NA NA NA 1.56 1.55 1.56 1.55 ...
##  $ ManufacturingProcess04: num  NA 917 912 911 918 924 933 929 928 938 ...
##  $ ManufacturingProcess05: num  NA 1032 1004 1015 1028 ...
##  $ ManufacturingProcess06: num  NA 210 207 213 206 ...
##  $ ManufacturingProcess07: num  NA 177 178 177 178 178 177 178 177 177 ...
##  $ ManufacturingProcess08: num  NA 178 178 177 178 178 178 178 177 177 ...
##  $ ManufacturingProcess09: num  43 46.6 45.1 44.9 45 ...
##  $ ManufacturingProcess10: num  NA NA NA NA NA NA 11.6 10.2 9.7 10.1 ...
##  $ ManufacturingProcess11: num  NA NA NA NA NA NA 11.5 11.3 11.1 10.2 ...
##  $ ManufacturingProcess12: num  NA 0 0 0 0 0 0 0 0 0 ...
##  $ ManufacturingProcess13: num  35.5 34 34.8 34.8 34.6 34 32.4 33.6 33.9 34.3 ...
##  $ ManufacturingProcess14: num  4898 4869 4878 4897 4992 ...
##  $ ManufacturingProcess15: num  6108 6095 6087 6102 6233 ...
##  $ ManufacturingProcess16: num  4682 4617 4617 4635 4733 ...
##  $ ManufacturingProcess17: num  35.5 34 34.8 34.8 33.9 33.4 33.8 33.6 33.9 35.3 ...
##  $ ManufacturingProcess18: num  4865 4867 4877 4872 4886 ...
##  $ ManufacturingProcess19: num  6049 6097 6078 6073 6102 ...
##  $ ManufacturingProcess20: num  4665 4621 4621 4611 4659 ...
##  $ ManufacturingProcess21: num  0 0 0 0 -0.7 -0.6 1.4 0 0 1 ...
##  $ ManufacturingProcess22: num  NA 3 4 5 8 9 1 2 3 4 ...
##  $ ManufacturingProcess23: num  NA 0 1 2 4 1 1 2 3 1 ...
##  $ ManufacturingProcess24: num  NA 3 4 5 18 1 1 2 3 4 ...
##  $ ManufacturingProcess25: num  4873 4869 4897 4892 4930 ...
##  $ ManufacturingProcess26: num  6074 6107 6116 6111 6151 ...
##  $ ManufacturingProcess27: num  4685 4630 4637 4630 4684 ...
##  $ ManufacturingProcess28: num  10.7 11.2 11.1 11.1 11.3 11.4 11.2 11.1 11.3 11.4 ...
##  $ ManufacturingProcess29: num  21 21.4 21.3 21.3 21.6 21.7 21.2 21.2 21.5 21.7 ...
##  $ ManufacturingProcess30: num  9.9 9.9 9.4 9.4 9 10.1 11.2 10.9 10.5 9.8 ...
##  $ ManufacturingProcess31: num  69.1 68.7 69.3 69.3 69.4 68.2 67.6 67.9 68 68.5 ...
##  $ ManufacturingProcess32: num  156 169 173 171 171 173 159 161 160 164 ...
##  $ ManufacturingProcess33: num  66 66 66 68 70 70 65 65 65 66 ...
##  $ ManufacturingProcess34: num  2.4 2.6 2.6 2.5 2.5 2.5 2.5 2.5 2.5 2.5 ...
##  $ ManufacturingProcess35: num  486 508 509 496 468 490 475 478 491 488 ...
##  $ ManufacturingProcess36: num  0.019 0.019 0.018 0.018 0.017 0.018 0.019 0.019 0.019 0.019 ...
##  $ ManufacturingProcess37: num  0.5 2 0.7 1.2 0.2 0.4 0.8 1 1.2 1.8 ...
##  $ ManufacturingProcess38: num  3 2 2 2 2 2 2 2 3 3 ...
##  $ ManufacturingProcess39: num  7.2 7.2 7.2 7.2 7.3 7.2 7.3 7.3 7.4 7.1 ...
##  $ ManufacturingProcess40: num  NA 0.1 0 0 0 0 0 0 0 0 ...
##  $ ManufacturingProcess41: num  NA 0.15 0 0 0 0 0 0 0 0 ...
##  $ ManufacturingProcess42: num  11.6 11.1 12 10.6 11 11.5 11.7 11.4 11.4 11.3 ...
##  $ ManufacturingProcess43: num  3 0.9 1 1.1 1.1 2.2 0.7 0.8 0.9 0.8 ...
##  $ ManufacturingProcess44: num  1.8 1.9 1.8 1.8 1.7 1.8 2 2 1.9 1.9 ...
##  $ ManufacturingProcess45: num  2.4 2.2 2.3 2.1 2.1 2 2.2 2.2 2.1 2.4 ...

The matrix processPredictors contains the 57 predictors (12 describing the input biological material and 45 describing the process predictors) for the 176 manufacturing runs. yield contains the percent yield for each run.

A small percentage of cells in the predictor set contain missing values. Use an imputation function to fill in these missing values.

sum(is.na(ChemicalManufacturingProcess))
## [1] 106
ChemicalManufacturingProcess %>% 
  summarize_all(funs(sum(is.na(.)))) %>% 
  pivot_longer(everything(),
               names_to = 'Predictor',
               values_to = 'Number of Missing Values')
## # A tibble: 58 × 2
##    Predictor            `Number of Missing Values`
##    <chr>                                     <int>
##  1 Yield                                         0
##  2 BiologicalMaterial01                          0
##  3 BiologicalMaterial02                          0
##  4 BiologicalMaterial03                          0
##  5 BiologicalMaterial04                          0
##  6 BiologicalMaterial05                          0
##  7 BiologicalMaterial06                          0
##  8 BiologicalMaterial07                          0
##  9 BiologicalMaterial08                          0
## 10 BiologicalMaterial09                          0
## # ℹ 48 more rows
cmp_imputed <- kNN(ChemicalManufacturingProcess, k = 5) %>% 
  select(c(1:58))

sum(is.na(cmp_imputed))
## [1] 0
cmp_imputed %>% 
  summarize_all(funs(sum(is.na(.)))) %>% 
  pivot_longer(everything(),
               names_to = 'Predictor',
               values_to = 'Number of Missing Values')
## # A tibble: 58 × 2
##    Predictor            `Number of Missing Values`
##    <chr>                                     <int>
##  1 Yield                                         0
##  2 BiologicalMaterial01                          0
##  3 BiologicalMaterial02                          0
##  4 BiologicalMaterial03                          0
##  5 BiologicalMaterial04                          0
##  6 BiologicalMaterial05                          0
##  7 BiologicalMaterial06                          0
##  8 BiologicalMaterial07                          0
##  9 BiologicalMaterial08                          0
## 10 BiologicalMaterial09                          0
## # ℹ 48 more rows

Split the data into a training and a test set, pre-process the data, and tune a model of your choice from this chapter. What is the optimal value of the performance metric?

cmp_prepared <- cmp_imputed %>%
  select(!nearZeroVar(.))

set.seed(74)
split1 <- createDataPartition(cmp_prepared$Yield, p = 0.8, list = FALSE)

train.data  <- cmp_prepared[split1, ]
test.data <- cmp_prepared[-split1, ]

ctrl1 <- trainControl(method = "cv", number = 10)

ridge_grid1 <- data.frame(.lambda = seq(0.001, 0.5, length = 25))
ridge_model1 <- train(Yield ~ .,
                     data = train.data,
                     method = "ridge",
                     metric = "Rsquared",
                     tuneGrid = ridge_grid1,
                     trControl = ctrl1,
                     preProc = c("center", "scale"))

ridge_model1$results[ridge_model1$results$RMSE == min(ridge_model1$results$RMSE),]
##       lambda     RMSE Rsquared      MAE   RMSESD RsquaredSD     MAESD
## 12 0.2297083 1.697949 0.500405 1.178778 0.940023  0.2332219 0.4508482

Given the results above, it seems that the optimal value of the performance metric occurs at ncomp = 12 with an RMSE of 1.697949 and an R^2 of 0.500405.

Predict the response for the test set. What is the value of the performance metric and how does this compare with the resampled performance metric on the training set?

postResample(pred = predict(ridge_model1, test.data), obs =test.data$Yield)
##      RMSE  Rsquared       MAE 
## 1.4486942 0.6122705 1.1432855

Predicting the response for the test set using our ridge model renders the above values for the performance metrics. The RMSE, R^2, and MAE have values of 1.4486942, 0.6122705, and 1.1432855, respectively. Compared to the resampled performance metrics on the training set, it seems to have done better with lower error rate and a higher R^2.

Which predictors are most important in the model you have trained? Do either the biological or process predictors dominate the list?

plot(varImp(ridge_model1), top = 10)

Above is a plot illustrating the predictors which are most important in the previous ridge model I’ve trained. As can be seen, the process predictors dominate the list.

Explore the relationships between each of the top predictors and the response. How could this information be helpful in improving yield in future runs of the manufacturing process?

cmp_cor <- cor(cmp_prepared %>% 
                       select(ManufacturingProcess32, ManufacturingProcess13,
                              ManufacturingProcess36, BiologicalMaterial06,
                              ManufacturingProcess31, ManufacturingProcess17,
                              BiologicalMaterial02, ManufacturingProcess09,
                              BiologicalMaterial12, BiologicalMaterial03,
                              Yield))
corrplot::corrplot(cmp_cor)

Strictly with regards to the response, or yield given the predictor, this information could be helpful in improving yield in future runs of the manufacturing process by pouring more time and resources into the processes which have a higher correlation with yield, meaning that these predictors lead to higher outcome. When it comes to inter-predictor relationships, identifying which variables have high correlations to each other could indicate the need for removal in order to optimize resources.