A chemical manufacturing process for a pharmaceutical product was discussed in Sect. 1.4. In this problem, the objective is to understand the relationship between biological measurements of the raw materials (predictors), measurements of the manufacturing process (predictors), and the response of product yield. Biological predictors cannot be changed but can be used to assess the quality of the raw material before processing. On the other hand, manufacturing process predictors can be changed in the manufacturing process. Improving product yield by 1% will boost revenue by approximately one hundred thousand dollars per batch.
#install.packages("AppliedPredictiveModeling")
library(AppliedPredictiveModeling)
data(ChemicalManufacturingProcess)
library(caret)
chmp <- ChemicalManufacturingProcess
predictors <- chmp[,-1]
# Correlations - Find most correlated predictors
corr <- cor(predictors, use='complete.obs')
topcorr <- findCorrelation(corr)
pairs(predictors[,topcorr])
# Zero to Near-zero variance predictors check
nzv <- nearZeroVar(predictors)
# Final predictors to be considered for modeling (minus top correlated and near-zero variance)
predictors <- predictors[,-c(nzv, topcorr)]
yield <- as.data.frame(chmp[,1])
colnames(yield) <- c("yield")
The matrix \(processPredictors\) contains the 57 predictors (12 describing the input biological material, and 45 describing the process predictors) for the 176 manufacturing runs. \(yield\) contains the percent yield for each run.
library(mice)
# Number of observations with at least one missing value
nrow(chmp[!complete.cases(chmp),])
## [1] 24
# Missing values ranking by observation
miss_obs <- apply(chmp, 1, function(x) sum(is.na(x)))
sort(miss_obs,decreasing=TRUE)
## 1 172 173 174 175 176 23 2 3 4 5 6 22 24 15 16 17 18
## 16 11 11 11 11 11 4 3 3 3 3 3 3 3 1 1 1 1
## 19 20 90 98 134 139 7 8 9 10 11 12 13 14 21 25 26 27
## 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0
## 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 82 83 84 85 86 87 88 89 91 92 93 94 95 96 97 99 100 101
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 120 121 122 123 124 125 126 127 128 129 130 131 132 133 135 136 137 138
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 158 159 160 161 162 163 164 165 166 167 168 169 170 171
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# Missing values ranking by column
miss_pred <- apply(chmp, 2, function(x) sum(is.na(x)))
top_miss_pred <- row.names(data.frame(sort(miss_pred,decreasing=TRUE)[1:28]))
top_miss_pred
## [1] "ManufacturingProcess03" "ManufacturingProcess11"
## [3] "ManufacturingProcess10" "ManufacturingProcess25"
## [5] "ManufacturingProcess26" "ManufacturingProcess27"
## [7] "ManufacturingProcess28" "ManufacturingProcess29"
## [9] "ManufacturingProcess30" "ManufacturingProcess31"
## [11] "ManufacturingProcess33" "ManufacturingProcess34"
## [13] "ManufacturingProcess35" "ManufacturingProcess36"
## [15] "ManufacturingProcess02" "ManufacturingProcess06"
## [17] "ManufacturingProcess01" "ManufacturingProcess04"
## [19] "ManufacturingProcess05" "ManufacturingProcess07"
## [21] "ManufacturingProcess08" "ManufacturingProcess12"
## [23] "ManufacturingProcess14" "ManufacturingProcess22"
## [25] "ManufacturingProcess23" "ManufacturingProcess24"
## [27] "ManufacturingProcess40" "ManufacturingProcess41"
# Data Imputation Analysis with MICE
#md.pattern(chmp_new)
library(VIM)
mice_plot <- aggr(chmp, col=c('blue','red'), numbers=TRUE)
## Warning: Number of logged events: 675
# Splitting Train and Test datasets
library(caret)
set.seed(500)
train <- createDataPartition(yield$yield, p = 0.7, list = FALSE)
trainPredictors <- predictors[train,]
trainYield <- yield[train,]
testPredictors <- predictors[-train,]
testYield <- yield[-train,]
# Pre-processing
transtrain <- preProcess(trainPredictors, method=c("BoxCox","center","scale", "knnImpute"))
transtest <- preProcess(testPredictors, method=c("BoxCox","center","scale", "knnImpute"))
transTrainPredictors <- predict(transtrain,trainPredictors)
transTestPredictors <- predict(transtest,testPredictors)
# Modeling
ctrl <- trainControl(method = "boot", number = 25)
pls_tune <- train(x = transTrainPredictors, y = trainYield,
method = "pls",
tuneLength = 15,
trControl = ctrl)
pls_tune
## Partial Least Squares
##
## 124 samples
## 47 predictor
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 124, 124, 124, 124, 124, 124, ...
## Resampling results across tuning parameters:
##
## ncomp RMSE Rsquared MAE
## 1 1.351010 0.4485164 1.102844
## 2 1.260323 0.5154151 1.016047
## 3 1.273289 0.5153561 1.028123
## 4 1.309761 0.4989806 1.059869
## 5 1.347540 0.4830462 1.084036
## 6 1.382643 0.4660665 1.108433
## 7 1.426869 0.4437381 1.147465
## 8 1.466832 0.4273872 1.179911
## 9 1.522083 0.4010375 1.228305
## 10 1.569062 0.3827168 1.264325
## 11 1.621665 0.3653264 1.296599
## 12 1.659835 0.3524162 1.318829
## 13 1.703314 0.3384623 1.343085
## 14 1.744742 0.3275454 1.364165
## 15 1.794207 0.3150951 1.391282
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 2.
## ncomp
## 2 2
# Training set responses predicted and performance
plsTrain <- data.frame(obs=trainYield,pred=predict(pls_tune,transTrainPredictors))
defaultSummary(plsTrain)
## RMSE Rsquared MAE
## 1.0855583 0.6339191 0.8762254
# Test set responses predicted and performance
plsTest <- data.frame(obs=testYield,pred=predict(pls_tune,transTestPredictors))
defaultSummary(plsTest)
## RMSE Rsquared MAE
## 1.3105629 0.5551553 1.0267944
topPred = rownames(plsImp$importance)[order(abs(plsImp$importance),decreasing=TRUE)]
kable(topPred[1:25], col.names = "Top 25 Predictors")
Top 25 Predictors |
---|
ManufacturingProcess32 |
ManufacturingProcess36 |
ManufacturingProcess17 |
ManufacturingProcess13 |
ManufacturingProcess33 |
ManufacturingProcess09 |
BiologicalMaterial06 |
ManufacturingProcess06 |
ManufacturingProcess12 |
BiologicalMaterial08 |
BiologicalMaterial01 |
BiologicalMaterial11 |
ManufacturingProcess04 |
ManufacturingProcess28 |
ManufacturingProcess02 |
BiologicalMaterial10 |
ManufacturingProcess11 |
ManufacturingProcess30 |
ManufacturingProcess15 |
ManufacturingProcess24 |
ManufacturingProcess37 |
ManufacturingProcess19 |
ManufacturingProcess34 |
BiologicalMaterial05 |
ManufacturingProcess01 |
## [,1]
## ManufacturingProcess32 0.6377788
## ManufacturingProcess36 -0.5298740
## ManufacturingProcess17 -0.4268190
## ManufacturingProcess13 -0.4442430
## ManufacturingProcess33 0.4841659