library(tidymodels)
library(caret)
library(mice)
library(dlookr)
library(flextable)
library(tidyverse)
library(plsmod)
library(glmnet)
library(vip)
library(AppliedPredictiveModeling) # datasets
This report is a response to problems 6.2 and 6.3 in Applied Predictive Predictive Modeling (Kuhn and Johnson). Tidymodels will be used for model development and evaluation.
6.2. Developing a model to predict permeability (see Sect. 1.4) could save significant resources for a pharmaceutical company, while at the same time more rapidly identifying molecules that have a sufficient permeability to become a drug.
data(permeability)
The matrix fingerprints
contains the 1,107 binary molecular predictors for the 165 compounds, while permeability
contains permeability response.
#Create working matrices
<-permeability
response
<-fingerprints covariates
nearZeroVar
function from the caret
package. How many predictors are left for modeling?#collect near zero variance covariates -- adapted from https://stackoverflow.com/questions/28043393/nearzerovar-function-in-caret
<-nearZeroVar(covariates)
zvar
if(length(zvar) > 0) {
<- covariates[, -zvar]}
nozero
<-dim(nozero)[2] #388
num
str_glue('There are {num} predictors left after removing removing those with near zero variance. This can also be accomplished as a recipe step (using step_nzv()) in tidymodels.')
## There are 388 predictors left after removing removing those with near zero variance. This can also be accomplished as a recipe step (using step_nzv()) in tidymodels.
PLS
model. How many latent variables are optimal and what is the corresponding resampled estimate of R2?As an initial step, I will split the data and create resamples using 10-fold cross-validation.
# combine response and predictors into single dataset
<-cbind(response, covariates)%>%as_tibble()
perm
# create test and train sets
set.seed(10)
# create random data split -- data will not include variable for computer programs
<- initial_split(perm)
perm.split
# create train and test sets from split
<- training(perm.split) # 123, 1108
perm.train
<- testing(perm.split) # 42, 1108
perm.test
# create resamples for model tuning
set.seed(20)
<-
cv_foldsvfold_cv(perm.train, v=10)
Now I specify the PLS regression model, create a preprocessing recipe, and build a workflow.
Note that the step_nzv() function removes unbalanced and near_zero variables (see code block): 400 covariates remained after applying the recipe to the train set.
#this codeblock is adapted from: https://stackoverflow.com/questions/64582463/how-do-i-specify-a-pls-model-in-tidy-models
# create preprocessing recipe
<-
model.recipe recipe(permeability ~., data=perm.train)%>%
step_nzv(all_predictors())%>%
step_normalize(all_numeric_predictors()) # scale and center
# extract and review processed data -- this was problematic
<-prep(model.recipe)
perm_juiced
<-juice(perm_juiced)
perm.juiced
# specify pls regression model
<- pls(num_comp = tune()) %>%
pls.spec set_mode("regression") %>%
set_engine("mixOmics")
# build model workflow and fit training model
<-
pls.wf workflow() %>%
add_model(pls.spec) %>%
add_recipe(model.recipe)
#create grid
<- tibble(num_comp = seq(from = 1, to = 20, by = 1)) # evaluates 20 different components
pls.grid
# select fit metrics
<-metric_set(rmse, rsq)
pls.metrics
# tune
set.seed(30)
<-
pls.tune %>%
pls.wftune_grid(
cv_folds,grid = pls.grid,
metrics = pls.metrics
)
autoplot(pls.tune)+theme_light()+labs(title='Figure 1. Hyperparameter Tuning for PLS Model')
#show_best(wm.dec.fit)
<-select_best(pls.tune, 'rmse') # num_com = 2 best.pls
The next step is to update the PLS model with the selected hyper-parameter (num_comp) based on RMSE. That parameter value = 2.
# update PLS model
<-finalize_model(pls.spec, select_best(pls.tune, 'rmse')) #with tuning specs pls.updated
The test set RSQ estimate is 0.71.
# fit updated model to test set
<-last_fit(pls.updated, permeability ~., split = perm.split)
pls.testfit
# report fit metrics
collect_metrics(pls.testfit) %>%
filter(.metric == "rsq") %>%
mutate(model = "Partial Least Squares") %>%
::select(model, .metric, .estimate) %>%
dplyrrename(c(Metric = .metric, Estimate = .estimate))%>%
flextable()%>%
set_caption('Table 2. Goodness of Fit (RSQ) for Partial Least Squares')
model | Metric | Estimate |
Partial Least Squares | rsq | 0.7149671 |
I will fit models for Ridge Regression, Lasso Regression, and Elastic Net Regession.
Using RMSE scores to compare model fit (Table 3), it is clear that PLS outperformed the other models for this dataset.
# specify ridge regression model
<- linear_reg(penalty = tune(), mixture = 0) %>% #note mixture is equivalent to alpha in glmnet, L2 only
ridge.spec set_mode("regression") %>%
set_engine("glmnet")
# specify lasso regression model
<- linear_reg(penalty = tune(), mixture = 1) %>% #note 1 = L1 only
lasso.spec set_mode("regression") %>%
set_engine("glmnet")
# specify elastic net model
<- linear_reg(penalty = tune(), mixture = .5) %>% #note combo of L1 & L2
elastic.spec set_mode("regression") %>%
set_engine("glmnet")
# build ridge workflow
<-
ridge.wf workflow() %>%
add_model(ridge.spec) %>%
add_recipe(model.recipe)
# build lasso workflow
<-
lasso.wf workflow() %>%
add_model(lasso.spec) %>%
add_recipe(model.recipe)
#build elastic net workflow
<-
elastic.wf workflow() %>%
add_model(elastic.spec) %>%
add_recipe(model.recipe)
#create grid
<- grid_regular(penalty(), levels=5)
models.grid
# select fit metrics
<-metric_set(rmse, rsq)
models.metrics
# tune the models
set.seed(40)
<-
ridge.tune %>%
ridge.wftune_grid(
cv_folds,grid = models.grid,
metrics = models.metrics
)
<-
lasso.tune %>%
lasso.wftune_grid(
cv_folds,grid = models.grid,
metrics = models.metrics
)
<-
elastic.tune %>%
elastic.wftune_grid(
cv_folds,grid = models.grid,
metrics = models.metrics
)
# collect best CV fits
<-select_best(ridge.tune, 'rmse')
best.ridge
<-select_best(lasso.tune, 'rmse')
best.lasso
<-select_best(elastic.tune, 'rmse')
best.elastic
# update models with best parameter fit
<-finalize_model(ridge.spec, select_best(ridge.tune, 'rmse')) #with tuning specs
ridge.updated
<-finalize_model(lasso.spec, select_best(lasso.tune, 'rmse'))
lasso.updated
<-finalize_model(elastic.spec, select_best(elastic.tune, 'rmse'))
elastic.updated
# fit updated model to test set
<-last_fit(ridge.updated, permeability ~., split = perm.split)
ridge.testfit
<-last_fit(lasso.updated, permeability ~., split = perm.split)
lasso.testfit
<-last_fit(elastic.updated, permeability ~., split = perm.split)
elastic.testfit
# report fit metrics
collect_metrics(ridge.testfit) %>%
bind_rows(collect_metrics(lasso.testfit)) %>%
bind_rows(collect_metrics(elastic.testfit)) %>%
bind_rows(collect_metrics(pls.testfit)) %>%
filter(.metric == c("rsq")) %>%
mutate(model = c("Ridge", "Lasso", "Elastic", "PLS")) %>%
::select(model, .metric, .estimate) %>%
dplyrrename(c(Metric = .metric, Estimate = .estimate))%>%
flextable()%>%
set_caption('Table 3. Goodness of Fit Comparisons for Regularization Models')
model | Metric | Estimate |
Ridge | rsq | 0.2884277 |
Lasso | rsq | 0.2502109 |
Elastic | rsq | 0.2804801 |
PLS | rsq | 0.7149671 |
Of the four models, I would recommend PLS on the basis of it’s RMSE score and visual assessment of model fit (Figures 1-3). However, it is not clear to me yet whether a nonlinear model might have more predictive power in this experiment.
# plot scatterplots
%>%
ridge.testfitcollect_predictions() %>%
ggplot(aes(permeability, .pred)) +
#geom_abline(slope = 1, lty = 2, color = "gray50", alpha = 0.5) +
geom_point(alpha = 0.6, color = "midnightblue") +
labs(title='Figure 1. Ridge Regression Model', subtitle='Permeability', x='Observed', y='Predicted')+
theme_light()
%>%
lasso.testfitcollect_predictions() %>%
ggplot(aes(permeability, .pred)) +
#geom_abline(slope = 1, lty = 2, color = "gray50", alpha = 0.5) +
geom_point(alpha = 0.6, color = "midnightblue") +
labs(title='Figure 2. Lasso Regression Model', subtitle='Permeability', x='Observed', y='Predicted')+
theme_light()
%>%
elastic.testfitcollect_predictions() %>%
ggplot(aes(permeability, .pred)) +
#geom_abline(slope = 1, lty = 2, color = "gray50", alpha = 0.5) +
geom_point(alpha = 0.6, color = "midnightblue") +
labs(title='Figure 3. Elastic Net Regression Model (50% L1 & L2)', subtitle='Permeability', x='Observed', y='Predicted')+
theme_light()
%>%
pls.testfitcollect_predictions() %>%
ggplot(aes(permeability, .pred)) +
#geom_abline(slope = 1, lty = 2, color = "gray50", alpha = 0.5) +
geom_point(alpha = 0.6, color = "midnightblue") +
labs(title='Figure 4. PLS Regression Model', subtitle='Permeability', x='Observed', y='Predicted')+
theme_light()
6.3. A chemical manufacturing process for a pharmaceutical product was discussed in Sect. 1.4. In this problem, the objective is to understand the relationship between biological measurements of the raw materials (predictors), measurements of the manufacturing process (predictors), and the response of product yield. Biological predictors cannot be changed but can be used to assess the quality of the raw material before processing. On the other hand, manufacturing process predictors can be changed in the manufacturing process. Improving product yield by 1 % will boost revenue by approximately one hundred thousand dollars per batch:
data(ChemicalManufacturingProcess)
<- ChemicalManufacturingProcess # dim = 176, 58 chem
The matrix processPredictors
contains the 57 predictors (12 describing the input biological material and 45 describing the process predictors) for the 176 manufacturing runs. yield
contains the percent yield for each run.
There are 26 covariates with missing data. The greatest percent of missingness is ~ 8.5% (Table 4). Given the low percentage of missing observations, I am not overly concerned with nonrandom NA values and will impute using KNN as part of a tidymodel’s recipe steps.
#calculate percentage of NA in each column
<-chem%>%
missing::map(~ 100*mean(is.na(.)))
purrr
# create a table for missingness
%>%
chemdiagnose()%>%
::select(-unique_count, -unique_rate)%>%
dplyrfilter(missing_count>0)%>%
arrange(desc(missing_count))%>%
flextable()%>%
set_caption("Table 4. Missing Data Summary")
variables | types | missing_count | missing_percent |
ManufacturingProcess03 | numeric | 15 | 8.5227273 |
ManufacturingProcess11 | numeric | 10 | 5.6818182 |
ManufacturingProcess10 | numeric | 9 | 5.1136364 |
ManufacturingProcess25 | numeric | 5 | 2.8409091 |
ManufacturingProcess26 | numeric | 5 | 2.8409091 |
ManufacturingProcess27 | numeric | 5 | 2.8409091 |
ManufacturingProcess28 | numeric | 5 | 2.8409091 |
ManufacturingProcess29 | numeric | 5 | 2.8409091 |
ManufacturingProcess30 | numeric | 5 | 2.8409091 |
ManufacturingProcess31 | numeric | 5 | 2.8409091 |
ManufacturingProcess33 | numeric | 5 | 2.8409091 |
ManufacturingProcess34 | numeric | 5 | 2.8409091 |
ManufacturingProcess35 | numeric | 5 | 2.8409091 |
ManufacturingProcess36 | numeric | 5 | 2.8409091 |
ManufacturingProcess02 | numeric | 3 | 1.7045455 |
ManufacturingProcess06 | numeric | 2 | 1.1363636 |
ManufacturingProcess01 | numeric | 1 | 0.5681818 |
ManufacturingProcess04 | numeric | 1 | 0.5681818 |
ManufacturingProcess05 | numeric | 1 | 0.5681818 |
ManufacturingProcess07 | numeric | 1 | 0.5681818 |
ManufacturingProcess08 | numeric | 1 | 0.5681818 |
ManufacturingProcess12 | numeric | 1 | 0.5681818 |
ManufacturingProcess14 | numeric | 1 | 0.5681818 |
ManufacturingProcess22 | numeric | 1 | 0.5681818 |
ManufacturingProcess23 | numeric | 1 | 0.5681818 |
ManufacturingProcess24 | numeric | 1 | 0.5681818 |
ManufacturingProcess40 | numeric | 1 | 0.5681818 |
ManufacturingProcess41 | numeric | 1 | 0.5681818 |
# impute missing data
#temp<-mice(chem,m=5,maxit=50,meth='pmm',seed=500) # method is predictive mean matching
# create new dataframe from imputation
#chem.2<-complete(temp,1)
# assess numerics
%>%
chemdiagnose_numeric()%>%
flextable()%>%
set_caption("Table 5. Univariate Summary for Imputed Dataset")
variables | min | Q1 | mean | median | Q3 | max | zero | minus | outlier |
Yield | 35.250 | 38.7525 | 40.17653409 | 39.970 | 41.4750 | 46.340 | 0 | 0 | 1 |
BiologicalMaterial01 | 4.580 | 5.9775 | 6.41142045 | 6.305 | 6.8700 | 8.810 | 0 | 0 | 4 |
BiologicalMaterial02 | 46.870 | 52.6800 | 55.68875000 | 55.090 | 58.7375 | 64.750 | 0 | 0 | 0 |
BiologicalMaterial03 | 56.970 | 64.9800 | 67.70500000 | 67.220 | 70.4275 | 78.250 | 0 | 0 | 0 |
BiologicalMaterial04 | 9.380 | 11.2450 | 12.34926136 | 12.100 | 13.2200 | 23.090 | 0 | 0 | 3 |
BiologicalMaterial05 | 13.240 | 17.2350 | 18.59863636 | 18.490 | 19.9000 | 24.850 | 0 | 0 | 1 |
BiologicalMaterial06 | 40.600 | 46.0550 | 48.91039773 | 48.460 | 51.3450 | 59.380 | 0 | 0 | 1 |
BiologicalMaterial07 | 100.000 | 100.0000 | 100.01414773 | 100.000 | 100.0000 | 100.830 | 0 | 0 | 3 |
BiologicalMaterial08 | 15.880 | 17.0600 | 17.49477273 | 17.510 | 17.8800 | 19.140 | 0 | 0 | 1 |
BiologicalMaterial09 | 11.440 | 12.6025 | 12.85005682 | 12.835 | 13.1300 | 14.080 | 0 | 0 | 3 |
BiologicalMaterial10 | 1.770 | 2.4600 | 2.80062500 | 2.710 | 2.9900 | 6.870 | 0 | 0 | 11 |
BiologicalMaterial11 | 135.810 | 143.8175 | 146.95318182 | 146.080 | 149.6000 | 158.730 | 0 | 0 | 6 |
BiologicalMaterial12 | 18.350 | 19.7300 | 20.19988636 | 20.120 | 20.7500 | 22.210 | 0 | 0 | 0 |
ManufacturingProcess01 | 0.000 | 10.8000 | 11.20742857 | 11.400 | 12.1500 | 14.100 | 3 | 0 | 4 |
ManufacturingProcess02 | 0.000 | 19.3000 | 16.68265896 | 21.000 | 21.5000 | 22.500 | 35 | 0 | 35 |
ManufacturingProcess03 | 1.470 | 1.5300 | 1.53956522 | 1.540 | 1.5500 | 1.600 | 0 | 0 | 14 |
ManufacturingProcess04 | 911.000 | 928.0000 | 931.85142857 | 934.000 | 936.0000 | 946.000 | 0 | 0 | 2 |
ManufacturingProcess05 | 923.000 | 986.7500 | 1,001.69314286 | 999.200 | 1,008.8500 | 1,175.300 | 0 | 0 | 10 |
ManufacturingProcess06 | 203.000 | 205.7000 | 207.40172414 | 206.800 | 208.7000 | 227.400 | 0 | 0 | 6 |
ManufacturingProcess07 | 177.000 | 177.0000 | 177.48000000 | 177.000 | 178.0000 | 178.000 | 0 | 0 | 0 |
ManufacturingProcess08 | 177.000 | 177.0000 | 177.55428571 | 178.000 | 178.0000 | 178.000 | 0 | 0 | 0 |
ManufacturingProcess09 | 38.890 | 44.8900 | 45.66011364 | 45.730 | 46.5150 | 49.360 | 0 | 0 | 5 |
ManufacturingProcess10 | 7.500 | 8.7000 | 9.17904192 | 9.100 | 9.5500 | 11.600 | 0 | 0 | 8 |
ManufacturingProcess11 | 7.500 | 9.0000 | 9.38554217 | 9.400 | 9.9000 | 11.500 | 0 | 0 | 5 |
ManufacturingProcess12 | 0.000 | 0.0000 | 857.81142857 | 0.000 | 0.0000 | 4,549.000 | 142 | 0 | 33 |
ManufacturingProcess13 | 32.100 | 33.9000 | 34.50795455 | 34.600 | 35.2000 | 38.600 | 0 | 0 | 4 |
ManufacturingProcess14 | 4,701.000 | 4,828.0000 | 4,853.86857143 | 4,856.000 | 4,882.5000 | 5,055.000 | 0 | 0 | 10 |
ManufacturingProcess15 | 5,904.000 | 6,010.0000 | 6,038.92045455 | 6,031.500 | 6,061.0000 | 6,233.000 | 0 | 0 | 20 |
ManufacturingProcess16 | 0.000 | 4,560.7500 | 4,565.80113636 | 4,588.000 | 4,619.0000 | 4,852.000 | 1 | 0 | 14 |
ManufacturingProcess17 | 31.300 | 33.5000 | 34.34375000 | 34.400 | 35.1000 | 40.000 | 0 | 0 | 4 |
ManufacturingProcess18 | 0.000 | 4,813.0000 | 4,809.68181818 | 4,835.000 | 4,862.0000 | 4,971.000 | 1 | 0 | 8 |
ManufacturingProcess19 | 5,890.000 | 6,000.7500 | 6,028.19886364 | 6,022.000 | 6,050.2500 | 6,146.000 | 0 | 0 | 5 |
ManufacturingProcess20 | 0.000 | 4,552.7500 | 4,556.46022727 | 4,582.000 | 4,609.5000 | 4,759.000 | 1 | 0 | 7 |
ManufacturingProcess21 | -1.800 | -0.6000 | -0.16420455 | -0.300 | 0.0000 | 3.600 | 45 | 101 | 17 |
ManufacturingProcess22 | 0.000 | 3.0000 | 5.40571429 | 5.000 | 8.0000 | 12.000 | 4 | 0 | 0 |
ManufacturingProcess23 | 0.000 | 2.0000 | 3.01714286 | 3.000 | 4.0000 | 6.000 | 5 | 0 | 0 |
ManufacturingProcess24 | 0.000 | 4.0000 | 8.83428571 | 8.000 | 14.0000 | 23.000 | 4 | 0 | 0 |
ManufacturingProcess25 | 0.000 | 4,832.0000 | 4,828.17543860 | 4,855.000 | 4,877.0000 | 4,990.000 | 1 | 0 | 7 |
ManufacturingProcess26 | 0.000 | 6,019.5000 | 6,015.59649123 | 6,047.000 | 6,070.5000 | 6,161.000 | 1 | 0 | 9 |
ManufacturingProcess27 | 0.000 | 4,560.0000 | 4,562.50877193 | 4,587.000 | 4,609.0000 | 4,710.000 | 1 | 0 | 14 |
ManufacturingProcess28 | 0.000 | 0.0000 | 6.59181287 | 10.400 | 10.7500 | 11.500 | 66 | 0 | 0 |
ManufacturingProcess29 | 0.000 | 19.7000 | 20.01111111 | 19.900 | 20.4000 | 22.000 | 1 | 0 | 11 |
ManufacturingProcess30 | 0.000 | 8.8000 | 9.16140351 | 9.100 | 9.7000 | 11.200 | 1 | 0 | 2 |
ManufacturingProcess31 | 0.000 | 70.1000 | 70.18479532 | 70.800 | 71.4000 | 72.500 | 1 | 0 | 5 |
ManufacturingProcess32 | 143.000 | 155.0000 | 158.46590909 | 158.000 | 162.0000 | 173.000 | 0 | 0 | 3 |
ManufacturingProcess33 | 56.000 | 62.0000 | 63.54385965 | 64.000 | 65.0000 | 70.000 | 0 | 0 | 4 |
ManufacturingProcess34 | 2.300 | 2.5000 | 2.49356725 | 2.500 | 2.5000 | 2.600 | 0 | 0 | 48 |
ManufacturingProcess35 | 463.000 | 490.0000 | 495.59649123 | 495.000 | 501.5000 | 522.000 | 0 | 0 | 9 |
ManufacturingProcess36 | 0.017 | 0.0190 | 0.01957310 | 0.020 | 0.0200 | 0.022 | 0 | 0 | 3 |
ManufacturingProcess37 | 0.000 | 0.7000 | 1.01363636 | 1.000 | 1.3000 | 2.300 | 3 | 0 | 2 |
ManufacturingProcess38 | 0.000 | 2.0000 | 2.53409091 | 3.000 | 3.0000 | 3.000 | 5 | 0 | 5 |
ManufacturingProcess39 | 0.000 | 7.1000 | 6.85113636 | 7.200 | 7.3000 | 7.500 | 8 | 0 | 9 |
ManufacturingProcess40 | 0.000 | 0.0000 | 0.01771429 | 0.000 | 0.0000 | 0.100 | 144 | 0 | 31 |
ManufacturingProcess41 | 0.000 | 0.0000 | 0.02371429 | 0.000 | 0.0000 | 0.200 | 143 | 0 | 32 |
ManufacturingProcess42 | 0.000 | 11.4000 | 11.20625000 | 11.600 | 11.7000 | 12.100 | 5 | 0 | 11 |
ManufacturingProcess43 | 0.000 | 0.6000 | 0.91193182 | 0.800 | 1.0250 | 11.000 | 1 | 0 | 6 |
ManufacturingProcess44 | 0.000 | 1.8000 | 1.80511364 | 1.900 | 1.9000 | 2.100 | 5 | 0 | 9 |
ManufacturingProcess45 | 0.000 | 2.1000 | 2.13806818 | 2.200 | 2.3000 | 2.600 | 5 | 0 | 17 |
I will also assess multicollinearity among the covariates. From the correlation coefficients presented in Table 6, it is clear that multicollinearity is a significant issue in this dataset.
# Assess collinearity in the absence of zero values in numerical cols
%>%
chemcorrelate()%>%
filter(coef_corr > .8 | coef_corr < -.8)%>% # set thresholds to limit output
arrange(desc(coef_corr))%>%
flextable()%>%
set_caption("Table 6. Pairwise Correlation: Numerical Covariates")
var1 | var2 | coef_corr |
ManufacturingProcess26 | ManufacturingProcess25 | 0.9975404 |
ManufacturingProcess25 | ManufacturingProcess26 | 0.9975404 |
ManufacturingProcess27 | ManufacturingProcess26 | 0.9960838 |
ManufacturingProcess26 | ManufacturingProcess27 | 0.9960838 |
ManufacturingProcess27 | ManufacturingProcess25 | 0.9935080 |
ManufacturingProcess25 | ManufacturingProcess27 | 0.9935080 |
ManufacturingProcess20 | ManufacturingProcess18 | 0.9917474 |
ManufacturingProcess18 | ManufacturingProcess20 | 0.9917474 |
ManufacturingProcess31 | ManufacturingProcess25 | 0.9713394 |
ManufacturingProcess25 | ManufacturingProcess31 | 0.9713394 |
ManufacturingProcess31 | ManufacturingProcess26 | 0.9606791 |
ManufacturingProcess26 | ManufacturingProcess31 | 0.9606791 |
BiologicalMaterial06 | BiologicalMaterial02 | 0.9543113 |
BiologicalMaterial02 | BiologicalMaterial06 | 0.9543113 |
ManufacturingProcess31 | ManufacturingProcess27 | 0.9542377 |
ManufacturingProcess27 | ManufacturingProcess31 | 0.9542377 |
ManufacturingProcess29 | ManufacturingProcess26 | 0.9519100 |
ManufacturingProcess26 | ManufacturingProcess29 | 0.9519100 |
ManufacturingProcess29 | ManufacturingProcess27 | 0.9511622 |
ManufacturingProcess27 | ManufacturingProcess29 | 0.9511622 |
ManufacturingProcess44 | ManufacturingProcess42 | 0.9487495 |
ManufacturingProcess42 | ManufacturingProcess44 | 0.9487495 |
ManufacturingProcess29 | ManufacturingProcess25 | 0.9374560 |
ManufacturingProcess25 | ManufacturingProcess29 | 0.9374560 |
ManufacturingProcess41 | ManufacturingProcess40 | 0.9244205 |
ManufacturingProcess40 | ManufacturingProcess41 | 0.9244205 |
BiologicalMaterial10 | BiologicalMaterial04 | 0.9205870 |
BiologicalMaterial04 | BiologicalMaterial10 | 0.9205870 |
BiologicalMaterial12 | BiologicalMaterial11 | 0.9037209 |
BiologicalMaterial11 | BiologicalMaterial12 | 0.9037209 |
ManufacturingProcess45 | ManufacturingProcess42 | 0.8938911 |
ManufacturingProcess42 | ManufacturingProcess45 | 0.8938911 |
ManufacturingProcess45 | ManufacturingProcess44 | 0.8772865 |
ManufacturingProcess44 | ManufacturingProcess45 | 0.8772865 |
BiologicalMaterial06 | BiologicalMaterial03 | 0.8723637 |
BiologicalMaterial03 | BiologicalMaterial06 | 0.8723637 |
ManufacturingProcess33 | ManufacturingProcess32 | 0.8613947 |
ManufacturingProcess32 | ManufacturingProcess33 | 0.8613947 |
BiologicalMaterial03 | BiologicalMaterial02 | 0.8607901 |
BiologicalMaterial02 | BiologicalMaterial03 | 0.8607901 |
ManufacturingProcess15 | ManufacturingProcess14 | 0.8558909 |
ManufacturingProcess14 | ManufacturingProcess15 | 0.8558909 |
ManufacturingProcess31 | ManufacturingProcess29 | 0.8438927 |
ManufacturingProcess29 | ManufacturingProcess31 | 0.8438927 |
BiologicalMaterial04 | BiologicalMaterial01 | 0.8196394 |
BiologicalMaterial01 | BiologicalMaterial04 | 0.8196394 |
BiologicalMaterial12 | BiologicalMaterial06 | 0.8128540 |
BiologicalMaterial06 | BiologicalMaterial12 | 0.8128540 |
BiologicalMaterial11 | BiologicalMaterial08 | 0.8003035 |
BiologicalMaterial08 | BiologicalMaterial11 | 0.8003035 |
ManufacturingProcess18 | ManufacturingProcess11 | -0.8023514 |
ManufacturingProcess11 | ManufacturingProcess18 | -0.8023514 |
ManufacturingProcess14 | ManufacturingProcess10 | -0.8501753 |
ManufacturingProcess10 | ManufacturingProcess14 | -0.8501753 |
# create test and train sets
set.seed(100)
# create random data split -- data will not include variable for computer programs
<- initial_split(chem)
chem.split
# create train and test sets from split
<- training(chem.split) # 123, 1108
chem.train
<- testing(chem.split) # 42, 1108
chem.test
# create resamples for model tuning
set.seed(101)
<-
chem_foldsvfold_cv(chem.train, v=10)
I will evaluate an elastic net model for this purposes of this exercise.
First the build the recipe, workflow and tuning grid, train the model, and then evaluate the model on the test set. Performance metrics for the training set are listed in Table 7.
# create apreprocessing recipe
#based on https://stackoverflow.com/questions/64254295/predictor-importance-for-pls-model-trained-with-tidymodels
<-
chem.recipe recipe(Yield ~., data=chem.train)%>%
step_impute_knn(all_predictors()) %>%
step_BoxCox(all_predictors())%>%
step_normalize(all_predictors())%>%
step_nzv(all_predictors()) %>%
step_corr(all_predictors()) # Removes variables that have large absolute correlations with other variables
# specify elastic net model
<- linear_reg(penalty = tune(), mixture = .5) %>% #note combo of L1 & L2
elastic.chem.spec set_mode("regression") %>%
set_engine("glmnet")
#build elastic net workflow
<-
elastic.chem.wf workflow() %>%
add_model(elastic.chem.spec) %>%
add_recipe(chem.recipe)
#create elastic grid
<- grid_regular(penalty(), levels=5)
elastic.grid
# select fit metrics
<-metric_set(rmse, rsq)
models.metrics
# tune the model
set.seed(101)
<-
elastic.chem.tune %>%
elastic.chem.wftune_grid(
chem_folds,grid = elastic.grid,
metrics = models.metrics
)
# collect best CV fits
<-select_best(elastic.chem.tune, 'rmse')
best.chem.elastic
# collect training metrics
<-collect_metrics(elastic.chem.tune)%>%
enet.train.metricsfilter(.config %in% 'Preprocessor1_Model5')%>%
::select(c(.metric, mean))%>%
dplyrmutate(model = 'Elastic Net')
%>%
enet.train.metricspivot_wider(names_from = .metric, values_from = mean)%>%
flextable()%>%
set_caption('Table 7. Performance Metrics for Elastic Net Training Model')
model | rmse | rsq |
Elastic Net | 1.422341 | 0.6037253 |
The optimal values for the Elastic Net performance metrics is as follows:
The test performance metric for the Elastic Net model is included in Table 8. The rmse score is lower on the test set than on the training set (see Table 7). While the reverse is true for rsq scores. The predicted vs. observed values for the test set are plotted in Figure 5.
# update models with best parameter fit
<-finalize_model(elastic.chem.spec, best.chem.elastic)
elastic.chem.updated
# fit updated model to test set
<-last_fit(elastic.chem.updated, Yield ~., split = chem.split)
elastic.chem.testfit
# report fit metrics
<-elastic.chem.testfit%>%
m1collect_metrics()%>%
::select(.metric, .estimate)%>%
dplyrpivot_wider(names_from = .metric, values_from = .estimate)%>%
mutate(model="Elastic Net")%>%
relocate(model, .before=rmse)
%>%
m1flextable()%>%
set_caption('Table 8. Goodness of Fit for Elastic Net')
model | rmse | rsq |
Elastic Net | 1.248739 | 0.4491427 |
#plot obs vs. predicted
%>%
elastic.chem.testfitcollect_predictions() %>%
ggplot(aes(Yield, .pred)) +
geom_point(alpha = 0.6, color = "midnightblue") +
labs(title='Figure 5. Elastic Net Regression Model (50% L1 & L2): Manufacturing Dataset', subtitle='Response Variable = Yield', x='Observed', y='Predicted')+
theme_light()
Manufacturing process predictors dominate the list. These include process predictors 32, 09, and 13.
#adapted from https://stackoverflow.com/questions/64254295/predictor-importance-for-pls-model-trained-with-tidymodels
# specify model based on the optimized parameter for penalty
<- linear_reg(penalty = 1, mixture = .5) %>% #note combo of L1 & L2
e.spec set_mode("regression") %>%
set_engine("glmnet")
#build workflow for e.spec
<-
tidy.wfworkflow()%>%
add_model(e.spec)%>%
add_recipe(chem.recipe)
<- fit(tidy.wf, chem.train)
fit.elastic
# collect loadings
<- fit.elastic%>%
info pull_workflow_fit()%>%
tidy()
# plot variable importance
<-
var.imp%>%
info filter(term != "Y") %>%
::select(term, estimate)%>%
dplyrslice(2:50)%>%
arrange(desc(abs(estimate)))%>%
filter(estimate != 0)
%>%
var.impggplot(aes(estimate, fct_reorder(term, estimate))) +
geom_col(show.legend = FALSE, fill='midnightblue') +
labs(title='Figure 7. Variable Importance Elastic Net Model', y = NULL)+
theme_classic()
From Figure 7, its clear that ManufacturingProcess32 has a large positive influence on Yield (estimate=0.36), while the opposite is true for ManufacturingProcess13 (estimate=-.24). It follows that scaling back processes with negative coefficient values can serve to increase Yield, as can investing in processes with positive coefficient estimates. That said, it may be worthwhile to investigate any interactions (among covariates) that could countervail these outcomes.