Load libraries
library(tidyverse)
library(fpp3)
library(caret)
library(RANN)
library(corrplot)
# Exercise 6.2.
Developing a model to predict permeability (see Sect. 1.4) could save significant resources for a pharmaceutical company, while at the same time more rapidly identifying molecules that have a sufficient permeability to become a drug:
library(AppliedPredictiveModeling)
data(permeability)
dim(fingerprints) # 165 compounds by 1107 predictors
## [1] 165 1107
dim(permeability) # 165 by 1
## [1] 165 1
The matrix fingerprints contains the 1,107 binary molecular predictors for the 165 compounds, while permeability contains permeability response.
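Since the fingerprints are described as binary, a quick sanity check (a minimal sketch, not part of the original workflow) can confirm that every value is 0 or 1:
# Verify that the fingerprint matrix contains only 0/1 values
all(fingerprints %in% c(0, 1))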
After removing the near-zero variance predictors below, 388 predictors remain for modeling.
# Find near-zero variance predictors (variables with little or no variance in the dataset)
near_zero <- nearZeroVar(fingerprints)
# Remove near zero variance predictors
remove <- fingerprints[, -near_zero]
# Number of predictors left
predictor_left <- ncol(remove)
predictor_left
## [1] 388
The optimal number of latent variables is 6, with a cross-validated R-squared of 0.5310.
# Split data into training (80%) and test (20%) sets
set.seed(123)
trainindex <- createDataPartition(permeability, p = 0.8, list = FALSE)
#create training set
train_perm <- permeability[trainindex,] # train response
train_fig <- remove[trainindex,] # train predictor
#create testing set
test_perm <- permeability[-trainindex,] # test response
test_fig <- remove[-trainindex,] #test predictor
# Preprocess: center and scale (standardize) the predictors
pre_process <- preProcess(train_fig, method = c("center", "scale"))
train_fig <- predict(pre_process, train_fig)
test_fig <- predict(pre_process, test_fig)
# Tune PLS model
plsTune <- train(train_fig, train_perm, method = "pls", tuneLength = 20, trControl = trainControl(method = "cv"))
plot(plsTune)
plsTune
## Partial Least Squares
##
## 133 samples
## 388 predictors
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 121, 121, 118, 119, 119, 119, ...
## Resampling results across tuning parameters:
##
## ncomp RMSE Rsquared MAE
## 1 13.36940 0.3397706 10.315252
## 2 11.78929 0.4816147 8.575238
## 3 11.90097 0.4849485 9.105076
## 4 12.00362 0.4962113 9.407429
## 5 11.76339 0.5219280 9.024291
## 6 11.55813 0.5310320 8.646010
## 7 11.56404 0.5294165 8.761596
## 8 11.83939 0.5159611 9.222387
## 9 11.97478 0.5174448 9.199224
## 10 12.57694 0.4792439 9.678215
## 11 12.73620 0.4725734 9.714152
## 12 13.03145 0.4514959 9.968887
## 13 13.06366 0.4396748 9.834200
## 14 13.38089 0.4168034 10.057344
## 15 13.60993 0.4011096 10.200724
## 16 13.74283 0.3964039 10.201961
## 17 13.97149 0.3844652 10.412445
## 18 14.21149 0.3695810 10.612168
## 19 14.23455 0.3709200 10.593149
## 20 14.31085 0.3732814 10.675872
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 6.
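The selected number of components and its resampled metrics can also be read directly off the fitted train object; a small sketch using the bestTune and results fields that caret stores:
# Chosen tuning parameter and its cross-validated metrics
plsTune$bestTune
subset(plsTune$results, ncomp == plsTune$bestTune$ncomp)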
The test set estimate of R2 is 0.3244542.
# predict the response for test set
preds <- predict(plsTune, test_fig)
head(preds)
## [1] 3.747929 19.035835 5.102938 4.498555 3.572688 1.200594
# estimate of R2
results <- data.frame(obs = test_perm, pred = preds)
defaultSummary(results)
## RMSE Rsquared MAE
## 12.3486900 0.3244542 8.2881075
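As a cross-check, the same test-set metrics can be computed directly from the prediction and observation vectors with caret's postResample (a small sketch equivalent to defaultSummary above):
# RMSE, Rsquared, and MAE computed from the raw vectors
postResample(pred = preds, obs = test_perm)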
Of the three models fit below, the lars model has the highest R-squared and the lowest MAE and RMSE on the test set.
# PCR model (Principal Component Regression)
pcr_model <- train(train_fig, train_perm, method = "pcr", tuneLength = 20, trControl = trainControl(method = "cv", number = 20))
plot(pcr_model)
pcr_model
## Principal Component Analysis
##
## 133 samples
## 388 predictors
##
## No pre-processing
## Resampling: Cross-Validated (20 fold)
## Summary of sample sizes: 126, 126, 129, 125, 126, 126, ...
## Resampling results across tuning parameters:
##
## ncomp RMSE Rsquared MAE
## 1 15.12855 0.2388250 11.990303
## 2 15.16469 0.2298260 12.100808
## 3 14.03954 0.4338455 11.129120
## 4 13.81125 0.4296606 11.083226
## 5 11.99261 0.5429695 8.881204
## 6 12.14973 0.5252840 9.001258
## 7 12.25458 0.5095772 9.099907
## 8 11.94226 0.5400248 8.868113
## 9 11.85584 0.5583869 8.883140
## 10 11.84765 0.5549366 9.082185
## 11 11.69525 0.5697105 8.791421
## 12 11.86217 0.5613899 9.001192
## 13 11.83871 0.5628608 8.865340
## 14 11.82985 0.5645557 8.850505
## 15 11.97295 0.5548194 9.068947
## 16 11.98592 0.5622066 9.184922
## 17 11.97040 0.5485688 9.063915
## 18 11.94176 0.5460955 9.120031
## 19 11.75008 0.5453340 8.903375
## 20 11.83378 0.5339776 9.022563
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 11.
The optimal number of components is 11, with a cross-validated R-squared of 0.5697.
# predict the response for test set
preds2 <- predict(pcr_model, test_fig)
head(preds2)
## [1] -0.6525311 14.5857230 -2.9854313 10.7756507 4.2846161 7.5407337
# estimate of R2
results2 <- data.frame(obs = test_perm, pred = preds2)
defaultSummary(results2)
## RMSE Rsquared MAE
## 12.2076841 0.2922108 8.2251123
The test set estimate of R2 is 0.2922108.
# LARS model (Least Angle Regression)
lar_model <- train(train_fig, train_perm, method = "lars", metric = "Rsquared", tuneLength = 20, trControl = trainControl(method = "cv", number = 20))
plot(lar_model)
lar_model
## Least Angle Regression
##
## 133 samples
## 388 predictors
##
## No pre-processing
## Resampling: Cross-Validated (20 fold)
## Summary of sample sizes: 127, 128, 127, 128, 126, 126, ...
## Resampling results across tuning parameters:
##
## fraction RMSE Rsquared MAE
## 0.05 11.27255 0.5424020 8.586401
## 0.10 11.16726 0.5657316 8.307219
## 0.15 11.39324 0.5576667 8.588468
## 0.20 11.99930 0.5251099 9.047357
## 0.25 12.29932 0.5068996 9.202691
## 0.30 12.57987 0.4942214 9.395214
## 0.35 12.82884 0.4842143 9.606889
## 0.40 13.29239 0.4634959 9.926427
## 0.45 13.84808 0.4454349 10.283586
## 0.50 14.52629 0.4251082 10.732905
## 0.55 15.10370 0.4026966 11.091553
## 0.60 15.80534 0.3830601 11.551521
## 0.65 16.66899 0.3590691 12.039681
## 0.70 17.49872 0.3450479 12.586979
## 0.75 18.16186 0.3398105 12.947754
## 0.80 18.94232 0.3317843 13.378403
## 0.85 19.55980 0.3283266 13.749623
## 0.90 20.29479 0.3326077 14.283189
## 0.95 21.13430 0.3325196 14.904031
## 1.00 21.98077 0.3296205 15.403904
##
## Rsquared was used to select the optimal model using the largest value.
## The final value used for the model was fraction = 0.1.
The optimal fraction is 0.10, with a cross-validated R-squared of 0.5657.
# predict the response for test set
lar2 <- predict(lar_model, test_fig)
head(lar2)
## 2 3 6 10 12 17
## 5.38869507 16.98476482 -0.04816209 3.66560248 6.11701658 0.27608381
# estimate of R2
results3 <- data.frame(obs = test_perm, pred = lar2)
defaultSummary(results3)
## RMSE Rsquared MAE
## 11.0018629 0.3887295 7.3843906
The test set estimate of R2 is 0.3887295.
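To make the comparison across the three models explicit, the test-set summaries computed above can be stacked into one table (a small sketch reusing the results, results2, and results3 data frames):
# Collect test-set metrics for PLS, PCR, and LARS side by side
rbind(PLS = defaultSummary(results),
      PCR = defaultSummary(results2),
      LARS = defaultSummary(results3))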
I would recommend the lars model as a replacement for the permeability laboratory experiment, since it has the best predictive performance of the three models: the highest R-squared and the lowest RMSE and MAE on the test set.
# Exercise 6.3.
A chemical manufacturing process for a pharmaceutical product was discussed in Sect. 1.4. In this problem, the objective is to understand the relationship between biological measurements of the raw materials (predictors), measurements of the manufacturing process (predictors), and the response of product yield. Biological predictors cannot be changed but can be used to assess the quality of the raw material before processing. On the other hand, manufacturing process predictors can be changed in the manufacturing process. Improving product yield by 1% will boost revenue by approximately one hundred thousand dollars per batch:
Start R and use these commands to load the data:
library(AppliedPredictiveModeling)
data(ChemicalManufacturingProcess)
dim(ChemicalManufacturingProcess) # 176 manufacturing runs by 58 columns (Yield plus 57 predictors)
## [1] 176 58
The matrix processPredictors contains the 57 predictors (12 describing the input biological material and 45 describing the process predictors) for the 176 manufacturing runs. yield contains the percent yield for each run.
#separate out predictor and response
predictor <- ChemicalManufacturingProcess[,-1]
response <- ChemicalManufacturingProcess$Yield
# Count missing values
sum(is.na(predictor))
## [1] 106
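A per-column breakdown of those 106 missing values (a small base R sketch, run before imputation) shows which predictors actually contain gaps:
# Count missing values per predictor and keep only the columns that have any
miss_counts <- colSums(is.na(predictor))
sort(miss_counts[miss_counts > 0], decreasing = TRUE)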
# Impute missing values with kNN, then center and scale
pre_process <- preProcess(predictor, method = c("center", "scale", "knnImpute"))
pred_che <- predict(pre_process, predictor)
# any missing data left
sum(is.na(pred_che))
## [1] 0
The optimal number of latent variables is 5, with a cross-validated R-squared of 0.6557.
# Find near-zero variance predictors (variables with little or no variance in the dataset)
near_zero2 <- nearZeroVar(pred_che)
# Remove near zero variance predictors
remove2 <- pred_che[, -near_zero2]
# Number of predictors left
predictor_left2 <- ncol(remove2)
predictor_left2
## [1] 56
# Split data into training (80%) and test (20%) sets
set.seed(123)
trainindex2 <- createDataPartition(response, p = 0.8, list = FALSE)
#create training set
train_rep <- response[trainindex2] # train response
train_pre <- remove2[trainindex2,] # train predictor
#create testing set
test_rep <- response[-trainindex2] # test response
test_pre <- remove2[-trainindex2,] # Test predictor
# Preprocess: center and scale (standardize) the predictors
pre_process2 <- preProcess(train_pre, method = c("center", "scale"))
train_pre <- predict(pre_process2, train_pre)
test_pre <- predict(pre_process2, test_pre)
# Tune PLS model
plsTune2 <- train(train_pre, train_rep, method = "pls", tuneLength = 20, trControl = trainControl(method = "cv")) # cross-validation
plot(plsTune2)
plsTune2
## Partial Least Squares
##
## 144 samples
## 56 predictors
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 131, 130, 130, 129, 131, 129, ...
## Resampling results across tuning parameters:
##
## ncomp RMSE Rsquared MAE
## 1 1.366841 0.4661172 1.1216565
## 2 1.161328 0.5939247 0.9411430
## 3 1.110515 0.6425731 0.9170970
## 4 1.107444 0.6576468 0.9172662
## 5 1.104609 0.6556819 0.9081022
## 6 1.110936 0.6462348 0.9127275
## 7 1.136026 0.6313198 0.9408114
## 8 1.171716 0.6164293 0.9702732
## 9 1.219901 0.5965118 1.0081905
## 10 1.263800 0.5834937 1.0357096
## 11 1.336965 0.5631760 1.0788515
## 12 1.429478 0.5407312 1.1327482
## 13 1.524420 0.5195852 1.1787326
## 14 1.582126 0.5067425 1.2092136
## 15 1.627323 0.5016396 1.2284634
## 16 1.675593 0.5017653 1.2422633
## 17 1.694754 0.5057723 1.2389459
## 18 1.736670 0.5027049 1.2522984
## 19 1.785086 0.5002747 1.2663508
## 20 1.848662 0.4963180 1.2918495
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 5.
The test-set R-squared is 0.4538, which is lower than the resampled training R-squared of 0.6557. This gap suggests some overfitting, so the model may need further tuning before it can be trusted for forecasting.
# predict the response for test set
prediction <- predict(plsTune2, test_pre)
# estimate of R2
res <- data.frame(obs = test_rep, pred = prediction)
defaultSummary(res)
## RMSE Rsquared MAE
## 1.4230432 0.4538408 1.2393205
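To visualize the gap described above, an observed-versus-predicted plot for the test set (a minimal sketch with ggplot2, already loaded via tidyverse) shows how far the holdout predictions fall from the ideal 45-degree line:
# Observed vs. predicted yield on the test set
ggplot(res, aes(x = pred, y = obs)) +
  geom_point() +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
  labs(x = "Predicted yield", y = "Observed yield")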
The most important predictor in the model is ManufacturingProcess32. The process predictors dominate the list: seven of the ten most important predictors are manufacturing process variables, while only three are biological materials.
# Extract variable importance
importance <- varImp(plsTune2, scale = FALSE)
# Convert the importance scores to a data frame (reusing the varImp object computed above)
importance_df <- as.data.frame(importance$importance)
importance_df$Predictor <- rownames(importance_df) # Add predictor names as a column
# Sort by importance in descending order
sort_imp <- importance_df[order(importance_df$Overall, decreasing = TRUE), ]
# Print importance
print(sort_imp)
## Overall Predictor
## ManufacturingProcess32 0.116973335 ManufacturingProcess32
## ManufacturingProcess17 0.102170282 ManufacturingProcess17
## ManufacturingProcess09 0.100497915 ManufacturingProcess09
## ManufacturingProcess13 0.100059387 ManufacturingProcess13
## ManufacturingProcess36 0.097984752 ManufacturingProcess36
## ManufacturingProcess06 0.082660964 ManufacturingProcess06
## ManufacturingProcess33 0.076345540 ManufacturingProcess33
## BiologicalMaterial06 0.074229074 BiologicalMaterial06
## BiologicalMaterial03 0.073790055 BiologicalMaterial03
## BiologicalMaterial08 0.071754007 BiologicalMaterial08
## BiologicalMaterial02 0.070893975 BiologicalMaterial02
## ManufacturingProcess11 0.070747932 ManufacturingProcess11
## BiologicalMaterial12 0.066182653 BiologicalMaterial12
## BiologicalMaterial11 0.066160876 BiologicalMaterial11
## BiologicalMaterial01 0.062830275 BiologicalMaterial01
## ManufacturingProcess28 0.061528542 ManufacturingProcess28
## BiologicalMaterial04 0.060774883 BiologicalMaterial04
## ManufacturingProcess12 0.057706356 ManufacturingProcess12
## ManufacturingProcess37 0.057426715 ManufacturingProcess37
## ManufacturingProcess04 0.055723240 ManufacturingProcess04
## BiologicalMaterial10 0.052673573 BiologicalMaterial10
## ManufacturingProcess02 0.051307150 ManufacturingProcess02
## ManufacturingProcess15 0.047536679 ManufacturingProcess15
## ManufacturingProcess34 0.044655263 ManufacturingProcess34
## ManufacturingProcess30 0.044623508 ManufacturingProcess30
## ManufacturingProcess19 0.041132346 ManufacturingProcess19
## ManufacturingProcess44 0.039685105 ManufacturingProcess44
## ManufacturingProcess10 0.037959336 ManufacturingProcess10
## BiologicalMaterial05 0.034849211 BiologicalMaterial05
## ManufacturingProcess43 0.034435154 ManufacturingProcess43
## ManufacturingProcess45 0.033768398 ManufacturingProcess45
## BiologicalMaterial09 0.031799513 BiologicalMaterial09
## ManufacturingProcess29 0.029839098 ManufacturingProcess29
## ManufacturingProcess21 0.029762493 ManufacturingProcess21
## ManufacturingProcess39 0.029012879 ManufacturingProcess39
## ManufacturingProcess01 0.027974595 ManufacturingProcess01
## ManufacturingProcess05 0.027626352 ManufacturingProcess05
## ManufacturingProcess24 0.025562425 ManufacturingProcess24
## ManufacturingProcess35 0.025222364 ManufacturingProcess35
## ManufacturingProcess42 0.022901533 ManufacturingProcess42
## ManufacturingProcess23 0.020232471 ManufacturingProcess23
## ManufacturingProcess31 0.020162875 ManufacturingProcess31
## ManufacturingProcess18 0.018575777 ManufacturingProcess18
## ManufacturingProcess20 0.018423762 ManufacturingProcess20
## ManufacturingProcess14 0.017317646 ManufacturingProcess14
## ManufacturingProcess38 0.015726649 ManufacturingProcess38
## ManufacturingProcess26 0.014466832 ManufacturingProcess26
## ManufacturingProcess41 0.013560462 ManufacturingProcess41
## ManufacturingProcess25 0.012121834 ManufacturingProcess25
## ManufacturingProcess27 0.011725872 ManufacturingProcess27
## ManufacturingProcess40 0.011691928 ManufacturingProcess40
## ManufacturingProcess08 0.010747792 ManufacturingProcess08
## ManufacturingProcess03 0.010719672 ManufacturingProcess03
## ManufacturingProcess22 0.009317751 ManufacturingProcess22
## ManufacturingProcess07 0.007629938 ManufacturingProcess07
## ManufacturingProcess16 0.005913221 ManufacturingProcess16
# Print the names of the top 10 predictors by importance
names <- head(sort_imp$Predictor, 10)
print(names)
## [1] "ManufacturingProcess32" "ManufacturingProcess17" "ManufacturingProcess09"
## [4] "ManufacturingProcess13" "ManufacturingProcess36" "ManufacturingProcess06"
## [7] "ManufacturingProcess33" "BiologicalMaterial06" "BiologicalMaterial03"
## [10] "BiologicalMaterial08"
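The same ranking can also be visualized; a minimal sketch using caret's plot method for varImp objects, limited to the ten most important predictors:
# Dot plot of the 10 most important predictors
plot(importance, top = 10)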
This information clarifies the relationship between each top predictor and the response. In the correlation matrix, ManufacturingProcess32 has the strongest correlation with Yield, which points to the predictors that would be most useful to adjust in order to optimize the yield.
# Select the top 10 predictors from the full predictor set
top <- select(predictor, all_of(names))
# extract yield
top$Yield <- ChemicalManufacturingProcess$Yield
#correlation data
correlation_data <- cor(select_if(top, is.numeric), use = "complete.obs")
# correlation Plot
corrplot::corrplot(correlation_data, method = 'number', type = 'upper', number.cex = .70, tl.cex= .70)
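As a follow-up to the correlation matrix, the individual relationships can be inspected directly; a sketch (using the top data frame built above and tidyverse functions) that facets each top predictor against yield:
# Scatter each top predictor against yield, one panel per predictor
top %>%
  drop_na() %>%
  pivot_longer(-Yield, names_to = "Predictor", values_to = "Value") %>%
  ggplot(aes(x = Value, y = Yield)) +
  geom_point(alpha = 0.6) +
  facet_wrap(~ Predictor, scales = "free_x")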