#install.packages('elasticnet')
library(corrplot)
## corrplot 0.84 loaded
library(mlbench)
library(e1071)
library(caret) 
## Loading required package: lattice
## Loading required package: ggplot2
library(plyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(missMDA)
library(DMwR)
## Loading required package: grid
## 
## Attaching package: 'DMwR'
## The following object is masked from 'package:plyr':
## 
##     join
library(pls)
## 
## Attaching package: 'pls'
## The following object is masked from 'package:caret':
## 
##     R2
## The following object is masked from 'package:corrplot':
## 
##     corrplot
## The following object is masked from 'package:stats':
## 
##     loadings
library(elasticnet)
## Loading required package: lars
## Loaded lars 1.2

Question 6.2

Developing a model to predict permeability (see Sect. 1.4) could save significant resources for a pharmaceutical company, while at the same time more rapidly identifying molecules that have a sufficient permeability to become a drug:

  1. Start R and use these commands to load the data:
# Attach the package that ships the data, then load the permeability data set
# (the text below notes that this also provides the `fingerprints` matrix).
library("AppliedPredictiveModeling")
data("permeability", package = "AppliedPredictiveModeling")
# Distribution summary of the permeability response.
summary(object = permeability)
##   permeability  
##  Min.   : 0.06  
##  1st Qu.: 1.55  
##  Median : 4.91  
##  Mean   :12.24  
##  3rd Qu.:15.47  
##  Max.   :55.60
# Show the first six response values.
head(permeability, n = 6L)
##   permeability
## 1       12.520
## 2        1.120
## 3       19.405
## 4        1.730
## 5        1.680
## 6        0.510

The matrix fingerprints contains the 1,107 binary molecular predictors for the 165 compounds, while permeability contains permeability response.

  1. The fingerprint predictors indicate the presence or absence of substructures of a molecule and are often sparse, meaning that relatively few of the molecules contain each substructure. Filter out the predictors that have low frequencies using the nearZeroVar function from the caret package. How many predictors are left for modeling?
# Column positions of the fingerprint predictors that caret flags as having
# near-zero variance (i.e. the sparse, low-frequency substructures).
nz <- nearZeroVar(fingerprints)
length(nz)
## [1] 719
# Filter predictors.
# Guard the negative index: when nothing is flagged, `-nz` is an empty index
# and `fingerprints[, -nz]` would select zero columns instead of keeping all.
# `drop = FALSE` keeps the result a matrix even if a single column remains.
fp_df <- if (length(nz) > 0) {
  fingerprints[, -nz, drop = FALSE]
} else {
  fingerprints
}

nearZeroVar flags 719 of the 1,107 predictors as near-zero variance; removing them leaves 388 predictors for modeling.

(c) Split the data into a training and a test set, pre-process the data, and tune a PLS model. How many latent variables are optimal and what is the corresponding resampled estimate of R2?

Split data into train and test set

## The training set receives 75% of the rows; the rest is held out
smp_size <- floor(nrow(fp_df) * 0.75)

## Fix the RNG seed so the same rows are drawn on every run
set.seed(123)
train_ind <- sample.int(nrow(fp_df), size = smp_size)

## Predictors: sampled rows go to training, all remaining rows to testing
train <- fp_df[train_ind, ]
test_fp <- test <- fp_df[-train_ind, ]

## Response values, split with the same row indices as the predictors
permeability_train <- permeability[train_ind, ]
permeability_test <- permeability[-train_ind, ]


# Tune a partial least squares model on the training fingerprints using
# cross-validation, letting caret try 10 candidate values of the tuning
# parameter (`tuneLength = 10`).
pls_model <- train(
  x = train,
  y = permeability_train,
  method = "pls",
  trControl = trainControl(method = "cv"),
  tuneLength = 10
)

# Print the resampling results for every candidate.
pls_model
## Partial Least Squares 
## 
## 123 samples
## 388 predictors
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 110, 111, 111, 111, 111, 111, ... 
## Resampling results across tuning parameters:
## 
##   ncomp  RMSE      Rsquared   MAE     
##    1     12.62475  0.2900612  9.443021
##    2     11.27070  0.4976929  7.873413
##    3     10.85735  0.5183636  8.077933
##    4     10.78911  0.5252378  8.375595
##    5     10.57609  0.5543138  8.029613
##    6     10.35604  0.5628768  7.731702
##    7     10.19970  0.5711496  7.485803
##    8     10.24760  0.5704241  7.586381
##    9     10.27141  0.5612497  7.611205
##   10     10.52644  0.5380197  7.665872
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 7.

The optimal model uses 7 latent variables (ncomp = 7), with resampled estimates of RMSE = 10.19970, Rsquared = 0.5711496, and MAE = 7.485803.

  1. Predict the response for the test set. What is the test set estimate of R2?
# Score the held-out compounds with the tuned PLS model.
lmPred1_fp <- predict(pls_model, newdata = test)
# Pair observed and predicted values in the `obs`/`pred` columns that
# defaultSummary() reads.
lmValues1 <- data.frame(
  obs = permeability_test,
  pred = lmPred1_fp
)
# Test-set RMSE, R-squared and MAE.
defaultSummary(data = lmValues1)
##      RMSE  Rsquared       MAE 
## 12.878365  0.496656  9.308729

On the test set, the RMSE is 12.88, Rsquared is 0.50, and MAE is 9.31.

  1. Try building other models discussed in this chapter. Do any have better predictive performance?

Let’s try to build the Ridge regression model

library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
# 10-fold cross-validation used for the ridge tuning run.
ctrl <- trainControl(method = "cv", number = 10)

# Direct elasticnet fit with a fixed ridge penalty (lambda = 0.001); the caret
# tuning run below does not reference this object.
ridgeModel <- enet(x = as.matrix(train), y = permeability_train, lambda = 0.001)

# Candidate penalties: 15 evenly spaced values from 0 to 0.1.
# Argument names are spelled out in full (`length.out`, `preProcess`) instead
# of relying on R's partial argument matching.
ridgeGrid <- data.frame(.lambda = seq(0, 0.1, length.out = 15))

# Tune ridge regression over the grid; predictors are centered and scaled
# inside the resampling loop.
ridgeRegFit <- train(
  x = train,
  y = permeability_train,
  method = "ridge",
  preProcess = c("center", "scale"),
  tuneGrid = ridgeGrid,
  trControl = ctrl
)
ridgeRegFit
## Ridge Regression 
## 
## 123 samples
## 388 predictors
## 
## Pre-processing: centered (388), scaled (388) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 111, 111, 111, 111, 111, 110, ... 
## Resampling results across tuning parameters:
## 
##   lambda       RMSE        Rsquared   MAE        
##   0.000000000    10.70911  0.5389154     7.677982
##   0.007142857  4481.47248  0.3111252  3007.946093
##   0.014285714   224.54490  0.4327764   148.895417
##   0.021428571    11.31681  0.4434589     8.190686
##   0.028571429    11.07845  0.4576801     8.074427
##   0.035714286    10.90095  0.4706866     7.997531
##   0.042857143    10.79583  0.4795058     7.967427
##   0.050000000    10.72215  0.4869756     7.947439
##   0.057142857    10.65453  0.4935822     7.927341
##   0.064285714    10.63009  0.4980133     7.937547
##   0.071428571    10.59211  0.5024700     7.939073
##   0.078571429    10.56938  0.5061914     7.941613
##   0.085714286    10.56359  0.5085101     7.959751
##   0.092857143    10.53339  0.5128997     7.950083
##   0.100000000    10.52636  0.5151716     7.963822
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was lambda = 0.1.
# Score the held-out compounds with the tuned ridge model.
rgPred1_fp <- predict(ridgeRegFit, newdata = test_fp)
# Pair observed and predicted values in the `obs`/`pred` columns that
# defaultSummary() reads.
rgValues1 <- data.frame(
  obs = permeability_test,
  pred = rgPred1_fp
)
# Test-set RMSE, R-squared and MAE.
defaultSummary(data = rgValues1)
##       RMSE   Rsquared        MAE 
## 13.3108143  0.4749396  9.6809970

On the test set, the ridge model gives RMSE = 13.3108143, Rsquared = 0.4749396, and MAE = 9.6809970.

  1. Would you recommend any of your models to replace the permeability laboratory experiment? We can see that the PLS model performs better than the ridge regression on the test set (RMSE 12.88 vs. 13.31; Rsquared 0.50 vs. 0.47).

Question 6.3

6.3. A chemical manufacturing process for a pharmaceutical product was discussed in Sect. 1.4. In this problem, the objective is to understand the relationship between biological measurements of the raw materials (predictors), measurements of the manufacturing process (predictors), and the response of product yield. Biological predictors cannot be changed but can be used to assess the quality of the raw material before processing. On the other hand, manufacturing process predictors can be changed in the manufacturing process. Improving product yield by 1% will boost revenue by approximately one hundred thousand dollars per batch:

  1. Start R and use these commands to load the data
# Attach the package that ships the data and load the manufacturing data set.
library("AppliedPredictiveModeling")
data("ChemicalManufacturingProcess", package = "AppliedPredictiveModeling")
# Show the first six rows: Yield followed by the biological and
# manufacturing-process predictors.
head(ChemicalManufacturingProcess, n = 6L)
##   Yield BiologicalMaterial01 BiologicalMaterial02 BiologicalMaterial03
## 1 38.00                 6.25                49.58                56.97
## 2 42.44                 8.01                60.97                67.48
## 3 42.03                 8.01                60.97                67.48
## 4 41.42                 8.01                60.97                67.48
## 5 42.49                 7.47                63.33                72.25
## 6 43.57                 6.12                58.36                65.31
##   BiologicalMaterial04 BiologicalMaterial05 BiologicalMaterial06
## 1                12.74                19.51                43.73
## 2                14.65                19.36                53.14
## 3                14.65                19.36                53.14
## 4                14.65                19.36                53.14
## 5                14.02                17.91                54.66
## 6                15.17                21.79                51.23
##   BiologicalMaterial07 BiologicalMaterial08 BiologicalMaterial09
## 1                  100                16.66                11.44
## 2                  100                19.04                12.55
## 3                  100                19.04                12.55
## 4                  100                19.04                12.55
## 5                  100                18.22                12.80
## 6                  100                18.30                12.13
##   BiologicalMaterial10 BiologicalMaterial11 BiologicalMaterial12
## 1                 3.46               138.09                18.83
## 2                 3.46               153.67                21.05
## 3                 3.46               153.67                21.05
## 4                 3.46               153.67                21.05
## 5                 3.05               147.61                21.05
## 6                 3.78               151.88                20.76
##   ManufacturingProcess01 ManufacturingProcess02 ManufacturingProcess03
## 1                     NA                     NA                     NA
## 2                    0.0                      0                     NA
## 3                    0.0                      0                     NA
## 4                    0.0                      0                     NA
## 5                   10.7                      0                     NA
## 6                   12.0                      0                     NA
##   ManufacturingProcess04 ManufacturingProcess05 ManufacturingProcess06
## 1                     NA                     NA                     NA
## 2                    917                 1032.2                  210.0
## 3                    912                 1003.6                  207.1
## 4                    911                 1014.6                  213.3
## 5                    918                 1027.5                  205.7
## 6                    924                 1016.8                  208.9
##   ManufacturingProcess07 ManufacturingProcess08 ManufacturingProcess09
## 1                     NA                     NA                  43.00
## 2                    177                    178                  46.57
## 3                    178                    178                  45.07
## 4                    177                    177                  44.92
## 5                    178                    178                  44.96
## 6                    178                    178                  45.32
##   ManufacturingProcess10 ManufacturingProcess11 ManufacturingProcess12
## 1                     NA                     NA                     NA
## 2                     NA                     NA                      0
## 3                     NA                     NA                      0
## 4                     NA                     NA                      0
## 5                     NA                     NA                      0
## 6                     NA                     NA                      0
##   ManufacturingProcess13 ManufacturingProcess14 ManufacturingProcess15
## 1                   35.5                   4898                   6108
## 2                   34.0                   4869                   6095
## 3                   34.8                   4878                   6087
## 4                   34.8                   4897                   6102
## 5                   34.6                   4992                   6233
## 6                   34.0                   4985                   6222
##   ManufacturingProcess16 ManufacturingProcess17 ManufacturingProcess18
## 1                   4682                   35.5                   4865
## 2                   4617                   34.0                   4867
## 3                   4617                   34.8                   4877
## 4                   4635                   34.8                   4872
## 5                   4733                   33.9                   4886
## 6                   4786                   33.4                   4862
##   ManufacturingProcess19 ManufacturingProcess20 ManufacturingProcess21
## 1                   6049                   4665                    0.0
## 2                   6097                   4621                    0.0
## 3                   6078                   4621                    0.0
## 4                   6073                   4611                    0.0
## 5                   6102                   4659                   -0.7
## 6                   6115                   4696                   -0.6
##   ManufacturingProcess22 ManufacturingProcess23 ManufacturingProcess24
## 1                     NA                     NA                     NA
## 2                      3                      0                      3
## 3                      4                      1                      4
## 4                      5                      2                      5
## 5                      8                      4                     18
## 6                      9                      1                      1
##   ManufacturingProcess25 ManufacturingProcess26 ManufacturingProcess27
## 1                   4873                   6074                   4685
## 2                   4869                   6107                   4630
## 3                   4897                   6116                   4637
## 4                   4892                   6111                   4630
## 5                   4930                   6151                   4684
## 6                   4871                   6128                   4687
##   ManufacturingProcess28 ManufacturingProcess29 ManufacturingProcess30
## 1                   10.7                   21.0                    9.9
## 2                   11.2                   21.4                    9.9
## 3                   11.1                   21.3                    9.4
## 4                   11.1                   21.3                    9.4
## 5                   11.3                   21.6                    9.0
## 6                   11.4                   21.7                   10.1
##   ManufacturingProcess31 ManufacturingProcess32 ManufacturingProcess33
## 1                   69.1                    156                     66
## 2                   68.7                    169                     66
## 3                   69.3                    173                     66
## 4                   69.3                    171                     68
## 5                   69.4                    171                     70
## 6                   68.2                    173                     70
##   ManufacturingProcess34 ManufacturingProcess35 ManufacturingProcess36
## 1                    2.4                    486                  0.019
## 2                    2.6                    508                  0.019
## 3                    2.6                    509                  0.018
## 4                    2.5                    496                  0.018
## 5                    2.5                    468                  0.017
## 6                    2.5                    490                  0.018
##   ManufacturingProcess37 ManufacturingProcess38 ManufacturingProcess39
## 1                    0.5                      3                    7.2
## 2                    2.0                      2                    7.2
## 3                    0.7                      2                    7.2
## 4                    1.2                      2                    7.2
## 5                    0.2                      2                    7.3
## 6                    0.4                      2                    7.2
##   ManufacturingProcess40 ManufacturingProcess41 ManufacturingProcess42
## 1                     NA                     NA                   11.6
## 2                    0.1                   0.15                   11.1
## 3                    0.0                   0.00                   12.0
## 4                    0.0                   0.00                   10.6
## 5                    0.0                   0.00                   11.0
## 6                    0.0                   0.00                   11.5
##   ManufacturingProcess43 ManufacturingProcess44 ManufacturingProcess45
## 1                    3.0                    1.8                    2.4
## 2                    0.9                    1.9                    2.2
## 3                    1.0                    1.8                    2.3
## 4                    1.1                    1.8                    2.1
## 5                    1.1                    1.7                    2.1
## 6                    2.2                    1.8                    2.0
  1. A small percentage of cells in the predictor set contain missing values. Use an imputation function to fill in these missing values (e.g., see Sect. 3.8).
# Per-column summaries; columns with missing values report an NA's count.
summary(object = ChemicalManufacturingProcess)
##      Yield       BiologicalMaterial01 BiologicalMaterial02
##  Min.   :35.25   Min.   :4.580        Min.   :46.87       
##  1st Qu.:38.75   1st Qu.:5.978        1st Qu.:52.68       
##  Median :39.97   Median :6.305        Median :55.09       
##  Mean   :40.18   Mean   :6.411        Mean   :55.69       
##  3rd Qu.:41.48   3rd Qu.:6.870        3rd Qu.:58.74       
##  Max.   :46.34   Max.   :8.810        Max.   :64.75       
##                                                           
##  BiologicalMaterial03 BiologicalMaterial04 BiologicalMaterial05
##  Min.   :56.97        Min.   : 9.38        Min.   :13.24       
##  1st Qu.:64.98        1st Qu.:11.24        1st Qu.:17.23       
##  Median :67.22        Median :12.10        Median :18.49       
##  Mean   :67.70        Mean   :12.35        Mean   :18.60       
##  3rd Qu.:70.43        3rd Qu.:13.22        3rd Qu.:19.90       
##  Max.   :78.25        Max.   :23.09        Max.   :24.85       
##                                                                
##  BiologicalMaterial06 BiologicalMaterial07 BiologicalMaterial08
##  Min.   :40.60        Min.   :100.0        Min.   :15.88       
##  1st Qu.:46.05        1st Qu.:100.0        1st Qu.:17.06       
##  Median :48.46        Median :100.0        Median :17.51       
##  Mean   :48.91        Mean   :100.0        Mean   :17.49       
##  3rd Qu.:51.34        3rd Qu.:100.0        3rd Qu.:17.88       
##  Max.   :59.38        Max.   :100.8        Max.   :19.14       
##                                                                
##  BiologicalMaterial09 BiologicalMaterial10 BiologicalMaterial11
##  Min.   :11.44        Min.   :1.770        Min.   :135.8       
##  1st Qu.:12.60        1st Qu.:2.460        1st Qu.:143.8       
##  Median :12.84        Median :2.710        Median :146.1       
##  Mean   :12.85        Mean   :2.801        Mean   :147.0       
##  3rd Qu.:13.13        3rd Qu.:2.990        3rd Qu.:149.6       
##  Max.   :14.08        Max.   :6.870        Max.   :158.7       
##                                                                
##  BiologicalMaterial12 ManufacturingProcess01 ManufacturingProcess02
##  Min.   :18.35        Min.   : 0.00          Min.   : 0.00         
##  1st Qu.:19.73        1st Qu.:10.80          1st Qu.:19.30         
##  Median :20.12        Median :11.40          Median :21.00         
##  Mean   :20.20        Mean   :11.21          Mean   :16.68         
##  3rd Qu.:20.75        3rd Qu.:12.15          3rd Qu.:21.50         
##  Max.   :22.21        Max.   :14.10          Max.   :22.50         
##                       NA's   :1              NA's   :3             
##  ManufacturingProcess03 ManufacturingProcess04 ManufacturingProcess05
##  Min.   :1.47           Min.   :911.0          Min.   : 923.0        
##  1st Qu.:1.53           1st Qu.:928.0          1st Qu.: 986.8        
##  Median :1.54           Median :934.0          Median : 999.2        
##  Mean   :1.54           Mean   :931.9          Mean   :1001.7        
##  3rd Qu.:1.55           3rd Qu.:936.0          3rd Qu.:1008.9        
##  Max.   :1.60           Max.   :946.0          Max.   :1175.3        
##  NA's   :15             NA's   :1              NA's   :1             
##  ManufacturingProcess06 ManufacturingProcess07 ManufacturingProcess08
##  Min.   :203.0          Min.   :177.0          Min.   :177.0         
##  1st Qu.:205.7          1st Qu.:177.0          1st Qu.:177.0         
##  Median :206.8          Median :177.0          Median :178.0         
##  Mean   :207.4          Mean   :177.5          Mean   :177.6         
##  3rd Qu.:208.7          3rd Qu.:178.0          3rd Qu.:178.0         
##  Max.   :227.4          Max.   :178.0          Max.   :178.0         
##  NA's   :2              NA's   :1              NA's   :1             
##  ManufacturingProcess09 ManufacturingProcess10 ManufacturingProcess11
##  Min.   :38.89          Min.   : 7.500         Min.   : 7.500        
##  1st Qu.:44.89          1st Qu.: 8.700         1st Qu.: 9.000        
##  Median :45.73          Median : 9.100         Median : 9.400        
##  Mean   :45.66          Mean   : 9.179         Mean   : 9.386        
##  3rd Qu.:46.52          3rd Qu.: 9.550         3rd Qu.: 9.900        
##  Max.   :49.36          Max.   :11.600         Max.   :11.500        
##                         NA's   :9              NA's   :10            
##  ManufacturingProcess12 ManufacturingProcess13 ManufacturingProcess14
##  Min.   :   0.0         Min.   :32.10          Min.   :4701          
##  1st Qu.:   0.0         1st Qu.:33.90          1st Qu.:4828          
##  Median :   0.0         Median :34.60          Median :4856          
##  Mean   : 857.8         Mean   :34.51          Mean   :4854          
##  3rd Qu.:   0.0         3rd Qu.:35.20          3rd Qu.:4882          
##  Max.   :4549.0         Max.   :38.60          Max.   :5055          
##  NA's   :1                                     NA's   :1             
##  ManufacturingProcess15 ManufacturingProcess16 ManufacturingProcess17
##  Min.   :5904           Min.   :   0           Min.   :31.30         
##  1st Qu.:6010           1st Qu.:4561           1st Qu.:33.50         
##  Median :6032           Median :4588           Median :34.40         
##  Mean   :6039           Mean   :4566           Mean   :34.34         
##  3rd Qu.:6061           3rd Qu.:4619           3rd Qu.:35.10         
##  Max.   :6233           Max.   :4852           Max.   :40.00         
##                                                                      
##  ManufacturingProcess18 ManufacturingProcess19 ManufacturingProcess20
##  Min.   :   0           Min.   :5890           Min.   :   0          
##  1st Qu.:4813           1st Qu.:6001           1st Qu.:4553          
##  Median :4835           Median :6022           Median :4582          
##  Mean   :4810           Mean   :6028           Mean   :4556          
##  3rd Qu.:4862           3rd Qu.:6050           3rd Qu.:4610          
##  Max.   :4971           Max.   :6146           Max.   :4759          
##                                                                      
##  ManufacturingProcess21 ManufacturingProcess22 ManufacturingProcess23
##  Min.   :-1.8000        Min.   : 0.000         Min.   :0.000         
##  1st Qu.:-0.6000        1st Qu.: 3.000         1st Qu.:2.000         
##  Median :-0.3000        Median : 5.000         Median :3.000         
##  Mean   :-0.1642        Mean   : 5.406         Mean   :3.017         
##  3rd Qu.: 0.0000        3rd Qu.: 8.000         3rd Qu.:4.000         
##  Max.   : 3.6000        Max.   :12.000         Max.   :6.000         
##                         NA's   :1              NA's   :1             
##  ManufacturingProcess24 ManufacturingProcess25 ManufacturingProcess26
##  Min.   : 0.000         Min.   :   0           Min.   :   0          
##  1st Qu.: 4.000         1st Qu.:4832           1st Qu.:6020          
##  Median : 8.000         Median :4855           Median :6047          
##  Mean   : 8.834         Mean   :4828           Mean   :6016          
##  3rd Qu.:14.000         3rd Qu.:4877           3rd Qu.:6070          
##  Max.   :23.000         Max.   :4990           Max.   :6161          
##  NA's   :1              NA's   :5              NA's   :5             
##  ManufacturingProcess27 ManufacturingProcess28 ManufacturingProcess29
##  Min.   :   0           Min.   : 0.000         Min.   : 0.00         
##  1st Qu.:4560           1st Qu.: 0.000         1st Qu.:19.70         
##  Median :4587           Median :10.400         Median :19.90         
##  Mean   :4563           Mean   : 6.592         Mean   :20.01         
##  3rd Qu.:4609           3rd Qu.:10.750         3rd Qu.:20.40         
##  Max.   :4710           Max.   :11.500         Max.   :22.00         
##  NA's   :5              NA's   :5              NA's   :5             
##  ManufacturingProcess30 ManufacturingProcess31 ManufacturingProcess32
##  Min.   : 0.000         Min.   : 0.00          Min.   :143.0         
##  1st Qu.: 8.800         1st Qu.:70.10          1st Qu.:155.0         
##  Median : 9.100         Median :70.80          Median :158.0         
##  Mean   : 9.161         Mean   :70.18          Mean   :158.5         
##  3rd Qu.: 9.700         3rd Qu.:71.40          3rd Qu.:162.0         
##  Max.   :11.200         Max.   :72.50          Max.   :173.0         
##  NA's   :5              NA's   :5                                    
##  ManufacturingProcess33 ManufacturingProcess34 ManufacturingProcess35
##  Min.   :56.00          Min.   :2.300          Min.   :463.0         
##  1st Qu.:62.00          1st Qu.:2.500          1st Qu.:490.0         
##  Median :64.00          Median :2.500          Median :495.0         
##  Mean   :63.54          Mean   :2.494          Mean   :495.6         
##  3rd Qu.:65.00          3rd Qu.:2.500          3rd Qu.:501.5         
##  Max.   :70.00          Max.   :2.600          Max.   :522.0         
##  NA's   :5              NA's   :5              NA's   :5             
##  ManufacturingProcess36 ManufacturingProcess37 ManufacturingProcess38
##  Min.   :0.01700        Min.   :0.000          Min.   :0.000         
##  1st Qu.:0.01900        1st Qu.:0.700          1st Qu.:2.000         
##  Median :0.02000        Median :1.000          Median :3.000         
##  Mean   :0.01957        Mean   :1.014          Mean   :2.534         
##  3rd Qu.:0.02000        3rd Qu.:1.300          3rd Qu.:3.000         
##  Max.   :0.02200        Max.   :2.300          Max.   :3.000         
##  NA's   :5                                                           
##  ManufacturingProcess39 ManufacturingProcess40 ManufacturingProcess41
##  Min.   :0.000          Min.   :0.00000        Min.   :0.00000       
##  1st Qu.:7.100          1st Qu.:0.00000        1st Qu.:0.00000       
##  Median :7.200          Median :0.00000        Median :0.00000       
##  Mean   :6.851          Mean   :0.01771        Mean   :0.02371       
##  3rd Qu.:7.300          3rd Qu.:0.00000        3rd Qu.:0.00000       
##  Max.   :7.500          Max.   :0.10000        Max.   :0.20000       
##                         NA's   :1              NA's   :1             
##  ManufacturingProcess42 ManufacturingProcess43 ManufacturingProcess44
##  Min.   : 0.00          Min.   : 0.0000        Min.   :0.000         
##  1st Qu.:11.40          1st Qu.: 0.6000        1st Qu.:1.800         
##  Median :11.60          Median : 0.8000        Median :1.900         
##  Mean   :11.21          Mean   : 0.9119        Mean   :1.805         
##  3rd Qu.:11.70          3rd Qu.: 1.0250        3rd Qu.:1.900         
##  Max.   :12.10          Max.   :11.0000        Max.   :2.100         
##                                                                      
##  ManufacturingProcess45
##  Min.   :0.000         
##  1st Qu.:2.100         
##  Median :2.200         
##  Mean   :2.138         
##  3rd Qu.:2.300         
##  Max.   :2.600         
## 

We clearly see that some predictors have missing values; let’s impute them with k-nearest neighbors (knn).

# Impute the predictors only: the response column is left out of the call to
# knnImputation() and the original Yield values are attached again afterwards.
# NOTE(review): imputation uses every row, i.e. it runs before the train/test
# split further down — confirm this is acceptable for the test-set estimates.
ChemicalManuf_df <- knnImputation(
  ChemicalManufacturingProcess[, names(ChemicalManufacturingProcess) != "Yield"]
)
# Check that no missing values remain after imputation.
anyNA(ChemicalManuf_df)
## [1] FALSE
ChemicalManuf_df$Yield <- ChemicalManufacturingProcess$Yield
  1. Split the data into a training and a test set, pre-process the data, and tune a model of your choice from this chapter. What is the optimal value of the performance metric?

Split data into train and test set

## The training set receives 75% of the rows; the rest is held out
smp_size <- floor(nrow(ChemicalManuf_df) * 0.75)

## Fix the RNG seed so the same rows are drawn on every run
set.seed(123)
train_ind <- sample.int(nrow(ChemicalManuf_df), size = smp_size)

## Sampled rows go to training, all remaining rows to testing
test <- ChemicalManuf_df[-train_ind, ]
train <- ChemicalManuf_df[train_ind, ]

Since we already imputed missing values we don’t have much left to pre-process

Let’s try using ordinary linear regression

# Ordinary least squares of Yield on every other column of the training set.
lmFitAllPredictors <- lm(formula = Yield ~ ., data = train)
# Coefficient table and overall fit statistics.
summary(object = lmFitAllPredictors)
## 
## Call:
## lm(formula = Yield ~ ., data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.77835 -0.49030 -0.03638  0.44151  2.16912 
## 
## Coefficients: (1 not defined because of singularities)
##                          Estimate Std. Error t value Pr(>|t|)   
## (Intercept)             2.770e+02  1.576e+02   1.758  0.08286 . 
## BiologicalMaterial01    3.168e-01  4.241e-01   0.747  0.45740   
## BiologicalMaterial02   -7.270e-02  1.544e-01  -0.471  0.63907   
## BiologicalMaterial03    7.080e-01  3.306e-01   2.141  0.03549 * 
## BiologicalMaterial04   -1.195e-01  6.703e-01  -0.178  0.85895   
## BiologicalMaterial05    2.656e-01  1.352e-01   1.964  0.05318 . 
## BiologicalMaterial06   -7.302e-01  4.366e-01  -1.673  0.09857 . 
## BiologicalMaterial07   -1.394e+00  1.141e+00  -1.222  0.22572   
## BiologicalMaterial08    1.490e+00  8.851e-01   1.683  0.09645 . 
## BiologicalMaterial09   -3.322e+00  1.922e+00  -1.729  0.08798 . 
## BiologicalMaterial10   -4.643e-01  1.771e+00  -0.262  0.79391   
## BiologicalMaterial11   -9.107e-02  1.008e-01  -0.904  0.36897   
## BiologicalMaterial12    7.872e-01  7.897e-01   0.997  0.32203   
## ManufacturingProcess01  1.898e-01  1.396e-01   1.360  0.17795   
## ManufacturingProcess02 -5.480e-02  6.184e-02  -0.886  0.37836   
## ManufacturingProcess03  1.652e-01  6.872e+00   0.024  0.98089   
## ManufacturingProcess04  3.383e-02  4.228e-02   0.800  0.42621   
## ManufacturingProcess05  3.062e-03  4.465e-03   0.686  0.49486   
## ManufacturingProcess06  1.207e-01  8.263e-02   1.460  0.14839   
## ManufacturingProcess07 -1.552e-01  2.726e-01  -0.569  0.57099   
## ManufacturingProcess08 -2.466e-01  3.308e-01  -0.745  0.45838   
## ManufacturingProcess09  3.161e-01  2.404e-01   1.315  0.19254   
## ManufacturingProcess10 -6.921e-01  7.025e-01  -0.985  0.32767   
## ManufacturingProcess11  4.460e-02  8.668e-01   0.051  0.95910   
## ManufacturingProcess12  3.869e-05  1.436e-04   0.269  0.78836   
## ManufacturingProcess13 -4.554e-01  4.738e-01  -0.961  0.33955   
## ManufacturingProcess14 -6.512e-03  1.327e-02  -0.491  0.62508   
## ManufacturingProcess15  2.713e-03  1.159e-02   0.234  0.81554   
## ManufacturingProcess16 -2.433e-05  3.747e-04  -0.065  0.94841   
## ManufacturingProcess17 -6.513e-02  3.907e-01  -0.167  0.86803   
## ManufacturingProcess18  7.537e-03  5.537e-03   1.361  0.17751   
## ManufacturingProcess19 -1.508e-02  1.077e-02  -1.401  0.16546   
## ManufacturingProcess20 -7.739e-03  5.836e-03  -1.326  0.18880   
## ManufacturingProcess21         NA         NA      NA       NA   
## ManufacturingProcess22 -7.860e-02  5.510e-02  -1.427  0.15782   
## ManufacturingProcess23  1.338e-01  1.104e-01   1.213  0.22899   
## ManufacturingProcess24 -6.122e-02  3.100e-02  -1.975  0.05197 . 
## ManufacturingProcess25 -5.284e-03  1.832e-02  -0.288  0.77378   
## ManufacturingProcess26  1.483e-03  1.337e-02   0.111  0.91195   
## ManufacturingProcess27 -3.732e-03  9.485e-03  -0.393  0.69512   
## ManufacturingProcess28 -1.051e-01  4.166e-02  -2.524  0.01373 * 
## ManufacturingProcess29  2.233e+00  1.017e+00   2.196  0.03120 * 
## ManufacturingProcess30 -1.131e+00  7.711e-01  -1.467  0.14660   
## ManufacturingProcess31  7.921e-03  1.340e-01   0.059  0.95303   
## ManufacturingProcess32  2.794e-01  8.577e-02   3.258  0.00169 **
## ManufacturingProcess33 -2.789e-01  1.629e-01  -1.713  0.09089 . 
## ManufacturingProcess34  1.871e-01  3.726e+00   0.050  0.96008   
## ManufacturingProcess35 -1.756e-02  2.471e-02  -0.710  0.47969   
## ManufacturingProcess36  5.057e+02  4.177e+02   1.211  0.22981   
## ManufacturingProcess37 -8.858e-01  4.078e-01  -2.172  0.03303 * 
## ManufacturingProcess38 -1.191e-01  2.939e-01  -0.405  0.68633   
## ManufacturingProcess39 -2.072e-03  1.718e-01  -0.012  0.99041   
## ManufacturingProcess40  3.166e+00  8.926e+00   0.355  0.72380   
## ManufacturingProcess41 -6.858e-01  6.972e+00  -0.098  0.92190   
## ManufacturingProcess42  2.395e-01  2.685e-01   0.892  0.37511   
## ManufacturingProcess43  4.897e-01  4.674e-01   1.048  0.29812   
## ManufacturingProcess44 -1.838e+00  1.558e+00  -1.179  0.24199   
## ManufacturingProcess45  1.137e+00  7.006e-01   1.623  0.10889   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.082 on 75 degrees of freedom
## Multiple R-squared:  0.8036, Adjusted R-squared:  0.657 
## F-statistic: 5.481 on 56 and 75 DF,  p-value: 1.127e-11
# Plot the fitted model object; lmFitAllPredictors is created earlier in the
# file (presumably the lm fit whose summary is printed above -- confirm), in
# which case this draws the standard regression diagnostic plots.
plot(lmFitAllPredictors)

Adjusted Rsquared is 0.657

We see a lot of insignificant predictors so we should probably apply a PLS or reduce the collinearity

We can also see that there is some non-linearity in our data set, and that the residuals are not normally distributed around the mean. So let’s try a different model.

Let’s check for other model parameters

# Separate the response from the predictors in the training frame.
solTrainY <- train[, "Yield"]
solTrainXtrans <- train[, setdiff(names(train), "Yield")]

# 10-fold cross-validation settings for the caret fits.
ctrl <- trainControl(method = "cv", number = 10)

# Seed immediately before fitting so the fold assignment is reproducible.
set.seed(100)
lmFit1 <- train(
  x = solTrainXtrans,
  y = solTrainY,
  method = "lm",
  trControl = ctrl
)
lmFit1
## Linear Regression 
## 
## 132 samples
##  57 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 118, 118, 117, 119, 119, 119, ... 
## Resampling results:
## 
##   RMSE      Rsquared  MAE     
##   3.922679  0.332014  1.981117
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE

We can see we have an RMSE of 3.922 and Rsquared of 0.33 and MAE of 1.98. Tuning parameter ‘intercept’ was held constant at a value of TRUE

Let’s find the highly correlated predictors and see what difference removing them makes to the Rsquared. We will filter out the collinear predictors.

# Remove predictors whose pairwise correlation exceeds corThresh.
# The correlation matrix is built from the predictors only: if the response
# were included it could itself be flagged for removal (breaking the Yield
# extraction below) and it would influence which predictor of a correlated
# pair gets dropped.
corThresh <- .9
predictorNames <- setdiff(names(train), "Yield")
corrPred <- predictorNames[
  findCorrelation(cor(train[, predictorNames]), cutoff = corThresh)
]
# Column positions of the flagged predictors within `train`.
tooHigh <- match(corrPred, names(train))

# Select by name rather than by negative position: this still keeps every
# column when nothing is flagged, and does not rely on `test` having the same
# column order as `train`.
keepCols <- setdiff(names(train), corrPred)
trainXfiltered <- train[, keepCols]
testXfiltered <- test[, keepCols]
solTrainXtransfilter <- trainXfiltered[, setdiff(keepCols, "Yield")]
solTrainYfilter <- trainXfiltered[, "Yield"]


# Refit the linear model on the correlation-filtered predictors, seeding
# right before the call so the resampling is reproducible.
set.seed(100)
lmFiltered <- train(
  x = solTrainXtransfilter,
  y = solTrainYfilter,
  method = "lm",
  trControl = ctrl
)
lmFiltered
## Linear Regression 
## 
## 132 samples
##  48 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 118, 118, 117, 119, 119, 119, ... 
## Resampling results:
## 
##   RMSE     Rsquared   MAE     
##   6.30165  0.3542627  2.564754
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE

We can see that the RMSE increased to 6.301, the Rsquared increased slightly to 0.35, and the MAE increased to 2.564.

# Partial least squares fit of Yield on all predictors in the training frame.
# scale = TRUE standardises each predictor first, so components are not driven
# by the predictors with the largest raw variance; this also matches the
# center/scale pre-processing used for the caret PLS fits on the same data.
plsFit <- plsr(Yield ~ ., data = train, scale = TRUE)
summary(plsFit)
## Data:    X dimension: 132 57 
##  Y dimension: 132 1
## Fit method: kernelpls
## Number of components considered: 57
## TRAINING: % variance explained
##        1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps
## X        73.17    81.78    93.57    97.91    99.86    99.90    99.94
## Yield    10.91    11.71    12.69    17.53    23.64    38.97    44.04
##        8 comps  9 comps  10 comps  11 comps  12 comps  13 comps  14 comps
## X        99.96    99.99     99.99     99.99     99.99    100.00    100.00
## Yield    45.83    47.30     54.78     60.73     63.66     64.61     66.03
##        15 comps  16 comps  17 comps  18 comps  19 comps  20 comps
## X         100.0    100.00     100.0    100.00    100.00    100.00
## Yield      70.4     71.01      72.3     72.96     73.25     73.95
##        21 comps  22 comps  23 comps  24 comps  25 comps  26 comps
## X        100.00    100.00    100.00    100.00    100.00    100.00
## Yield     74.48     74.85     75.09     75.73     76.46     76.83
##        27 comps  28 comps  29 comps  30 comps  31 comps  32 comps
## X        100.00    100.00    100.00    100.00    100.00    100.00
## Yield     76.98     77.13     77.41     77.91     78.32     78.56
##        33 comps  34 comps  35 comps  36 comps  37 comps  38 comps
## X         100.0    100.00     100.0    100.00    100.00    100.00
## Yield      78.9     79.12      79.2     79.35     79.46     79.65
##        39 comps  40 comps  41 comps  42 comps  43 comps  44 comps
## X        100.00    100.00    100.00     100.0    100.00    100.00
## Yield     79.78     79.83     79.87      79.9     79.93     79.95
##        45 comps  46 comps  47 comps  48 comps  49 comps  50 comps
## X        100.00    100.00    100.00    100.00    100.00    100.00
## Yield     79.96     79.97     79.98     79.98     79.98     79.98
##        51 comps  52 comps  53 comps  54 comps  55 comps  56 comps
## X        100.00    100.00    100.00    100.00    100.00    100.00
## Yield     79.98     79.98     79.98     79.98     79.98     80.36
##        57 comps
## X        100.00
## Yield     80.36
# Plot the plsr fit. No plot type is requested, so this is whatever the pls
# package draws by default for a fitted model (presumably predicted vs.
# measured Yield -- confirm against the pls documentation).
plot(plsFit)

# Tune PLS (up to 20 components) with 10-fold CV on the full predictor set.
# The seed is set before each fit, as for the linear models, so the folds are
# reproducible and identical across models, which makes the resampled metrics
# directly comparable. `ctrl` is the shared 10-fold CV control object.
set.seed(100)
pls_model <- train(solTrainXtrans, solTrainY,
                   method = "pls",
                   tuneLength = 20,
                   trControl = ctrl,
                   preProc = c("center", "scale"))

# Same tuning on the correlation-filtered predictor set.
set.seed(100)
pls_model_filtered <- train(solTrainXtransfilter, solTrainYfilter,
                            method = "pls",
                            tuneLength = 20,
                            trControl = ctrl,
                            preProc = c("center", "scale"))
  1. Predict the response for the test set. What is the value of the performance metric and how does this compare with the resampled performance metric on the training set?
# Evaluate the full linear model on the held-out test set.
solTestY <- test[, "Yield"]
solTestXtrans <- test[, setdiff(names(test), "Yield")]

# Predict, pair the predictions with the observed values, and summarise.
lmPred1 <- predict(lmFit1, newdata = solTestXtrans)
lmValues1 <- data.frame(obs = solTestY, pred = lmPred1)
defaultSummary(lmValues1)
##      RMSE  Rsquared       MAE 
## 1.4409881 0.5114828 1.1757253

The test-set result for OLS (RMSE 1.44) looks better than the resampled (cross-validated) performance metric from the training set (RMSE 3.92).

  1. Which predictors are most important in the model you have trained? Do either the biological or process predictors dominate the list?

The most important predictors can be seen by looking at the results of the linear model output. They are the terms significant at the 0.05 level (estimate, standard error, t value, p-value):

- ManufacturingProcess37: -8.858e-01, 4.078e-01, -2.172, 0.03303
- ManufacturingProcess32: 2.794e-01, 8.577e-02, 3.258, 0.00169
- ManufacturingProcess28: -1.051e-01, 4.166e-02, -2.524, 0.01373
- ManufacturingProcess29: 2.233e+00, 1.017e+00, 2.196, 0.03120
- BiologicalMaterial03: 7.080e-01, 3.306e-01, 2.141, 0.03549

We clearly see the manufacturingProcess dominates the list

  1. Explore the relationships between each of the top predictors and the response. How could this information be helpful in improving yield in future runs of the manufacturing process?
#
# Scatter each of the top predictors against Yield and overlay the simple
# least-squares line (red). A single loop replaces the repeated plot/abline
# pairs and labels the axes with the variable names instead of the deparsed
# `ChemicalManuf_df$...` expressions.
topPredictors <- c(
  "ManufacturingProcess37",
  "ManufacturingProcess32",
  "ManufacturingProcess28",
  "ManufacturingProcess29",
  "BiologicalMaterial03"
)

for (predictor in topPredictors) {
  plot(
    ChemicalManuf_df[[predictor]], ChemicalManuf_df$Yield,
    xlab = predictor, ylab = "Yield"
  )
  # Yield ~ <predictor>, fitted on the same data frame that is plotted.
  abline(
    lm(reformulate(predictor, response = "Yield"), data = ChemicalManuf_df),
    col = "red"
  )
}

We can say that ManufacturingProcess37 has a slight negative correlation with yield, while ManufacturingProcess32 and BiologicalMaterial03 have a decent positive correlation with yield. If we increase ManufacturingProcess32 and BiologicalMaterial03, we would expect the yield to increase.