library(tidymodels)
library(finetune) # customize grid search for speed
library(earth) # MARS model
library(mlbench) # data
library(DALEXtra)
library(tidyverse)
library(dlookr)
library(magrittr)
library(flextable)
library(patchwork)
library(AppliedPredictiveModeling) # dataset
#library(caret)
#library(mice)
#library(dlookr)
#
#library(glmnet)
#library(vip)
setwd("C:/Users/seanc/OneDrive/Desktop/624/HW8")
This report focuses on the training and evaluation of nonlinear regression models (e.g., neural networks, MARS, SVM, KNN) and is a response to questions 7.2 and 7.5 from Kuhn and Johnson’s Applied Predictive Modeling.
Friedman (1991) introduced several benchmark data sets created by simulation. One of these simulations used the following nonlinear equation to create data:
\(y = 10\sin(\pi x_{1}x_{2}) + 20(x_{3}-0.5)^{2} + 10x_{4} + 5x_{5} + N(0,\sigma^{2})\)
where the x values are random variables uniformly distributed on [0, 1]; five additional non-informative predictors are also created in the simulation.
Tune several models on these data.
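For reference, the data-generating equation can be written out directly. The sketch below uses an illustrative function name (simulate_friedman1) and is assumed to be equivalent to the mlbench.friedman1() simulator used in the next chunk.
# minimal sketch of the data-generating equation (illustrative only; mlbench.friedman1 is used below)
simulate_friedman1 <- function(n, sd = 1) {
  x <- matrix(runif(n * 10), ncol = 10) # ten uniform predictors; x6-x10 are non-informative
  y <- 10 * sin(pi * x[, 1] * x[, 2]) + 20 * (x[, 3] - 0.5)^2 +
    10 * x[, 4] + 5 * x[, 5] + rnorm(n, sd = sd) # N(0, sigma^2) noise
  data.frame(y = y, x) # columns named y, x.1, ..., x.10, as in the mlbench output
}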
The first five rows of the simulated dataset (2,000 observations) are shown in Table 1.
# load data
fxn <- mlbench.friedman1

# simulate a dataset of 2000 observations
set.seed(01)
data <- fxn(2000) %>%
  as.data.frame() %>%
  relocate(y, .before = x.1)

# view data head
data %>%
  head(5) %>%
  flextable() %>%
  set_caption('Table 1. First Five Rows of Dataset.')
y | x.1 | x.2 | x.3 | x.4 | x.5 | x.6 | x.7 | x.8 | x.9 | x.10 |
16.13531 | 0.2655087 | 0.8718050 | 0.18776846 | 0.77008139 | 0.1282652 | 0.06471249 | 0.26801514 | 0.09251632 | 0.60399576 | 0.11177927 |
15.53217 | 0.3721239 | 0.9671970 | 0.50475902 | 0.69026253 | 0.1276915 | 0.67661240 | 0.10177984 | 0.44674406 | 0.97408409 | 0.07662839 |
23.82663 | 0.5728534 | 0.8669163 | 0.02728685 | 0.65047706 | 0.7777323 | 0.73537169 | 0.13362410 | 0.82944637 | 0.20342439 | 0.17201583 |
11.14839 | 0.9082078 | 0.4377153 | 0.49629785 | 0.07465596 | 0.4201504 | 0.11129967 | 0.79935258 | 0.81237396 | 0.02060101 | 0.84205713 |
17.32047 | 0.2016819 | 0.1919378 | 0.94735171 | 0.90258066 | 0.7159019 | 0.04665462 | 0.01499073 | 0.80510669 | 0.07192066 | 0.53641651 |
Scatterplots of y (response) versus the numerical predictors show a range of relationships: approximately linear (x.4 and x.5), nonlinear (x.1-x.3), and no clear pattern (x.6-x.9).
# plot scatterplots of each predictor against the response
response = names(data)[1] # response variable (y)
response = purrr::set_names(response)

explain <- names(data)[2:10] # explanatory variables x.1 - x.9
explain = purrr::set_names(explain)

scatter = function(x) {
  ggplot(data, aes_string(x = x, y = 'y')) +
    geom_point(color = 'midnightblue', alpha = 0.05, show.legend = FALSE) +
    theme_classic()
}

plots <- map(explain, ~scatter(.x)) # creates a list of plots using purrr

# layout with patchwork
plot(reduce(plots, `+`))
Analysis of the numerical variables indicates an absence of highly correlated covariates, so multicollinearity is not a concern (Table 2).
data %>%
  #filter_if(is.numeric) %>%
  correlate() %>%
  filter(coef_corr > .3 | coef_corr < -.3) %>% # set thresholds to limit output
  flextable() %>%
  set_caption("Table 2. Pairwise Correlation: |coef_corr| > .3")
var1 | var2 | coef_corr |
x.1 | y | 0.3596485 |
x.2 | y | 0.4059489 |
x.4 | y | 0.5826163 |
x.5 | y | 0.3035609 |
y | x.1 | 0.3596485 |
y | x.2 | 0.4059489 |
y | x.4 | 0.5826163 |
y | x.5 | 0.3035609 |
Similarly, there are no missing values that would require preprocessing prior to model development (Table 3).
# collect summary statistics
data %>%
  diagnose_numeric() %>%
  flextable() %>%
  set_caption('Table 3. Univariate Profile for Simulated Data')
variables | min | Q1 | mean | median | Q3 | max | zero | minus | outlier |
y | 0.1292454886 | 10.6468213 | 14.2126215 | 14.1280737 | 17.8936889 | 28.3625462 | 0 | 0 | 0 |
x.1 | 0.0006052661 | 0.2408959 | 0.4949939 | 0.4821248 | 0.7542811 | 0.9999306 | 0 | 0 | 0 |
x.2 | 0.0005705222 | 0.2361910 | 0.4941252 | 0.4878184 | 0.7676816 | 0.9998635 | 0 | 0 | 0 |
x.3 | 0.0002003808 | 0.2727782 | 0.5118419 | 0.5107860 | 0.7622727 | 0.9987511 | 0 | 0 | 0 |
x.4 | 0.0001064336 | 0.2555151 | 0.5031762 | 0.5044569 | 0.7583430 | 0.9989099 | 0 | 0 | 0 |
x.5 | 0.0001206559 | 0.2499833 | 0.4967026 | 0.4972826 | 0.7453419 | 0.9982637 | 0 | 0 | 0 |
x.6 | 0.0006133218 | 0.2564903 | 0.5024327 | 0.5036092 | 0.7482330 | 0.9978924 | 0 | 0 | 0 |
x.7 | 0.0011065297 | 0.2369706 | 0.4950864 | 0.4953294 | 0.7436251 | 0.9998552 | 0 | 0 | 0 |
x.8 | 0.0008246463 | 0.2628807 | 0.5037971 | 0.5045182 | 0.7484474 | 0.9992230 | 0 | 0 | 0 |
x.9 | 0.0002707751 | 0.2410258 | 0.4904802 | 0.4871111 | 0.7326808 | 0.9996335 | 0 | 0 | 0 |
x.10 | 0.0005944150 | 0.2560461 | 0.5091579 | 0.5119832 | 0.7613344 | 0.9996333 | 0 | 0 | 0 |
Prior to modeling, we create training and test sets and set up 10-fold cross-validation.
# create training and test sets
set.seed(02)

# create random data split (default 75/25)
data.split <- initial_split(data)

# create train and test sets from split
data.train <- training(data.split) # 1500 x 11
data.test <- testing(data.split)   # 500 x 11

# create resamples for model tuning
set.seed(03)
data.folds <- vfold_cv(data.train, v = 10)
Tidymodels will be used to construct and compare the following predictive algorithms for this dataset: MARS, SVM (radial basis function), KNN, and a single-hidden-layer neural network (NNET).
Hyperparameters are tuned via cross-validation over a 25-point grid, with racing (via the finetune package) used to discard clearly inferior candidates early.
Model results are saved to a local directory as an .rda file for easy retrieval.
# create preprocessing recipe - we will use the same recipe for all models except MARS
# based on https://www.tmwr.org/workflow-sets.html
data.recipe <-
  recipe(y ~ ., data = data.train) %>%
  step_normalize(all_numeric_predictors()) # scale and center

# workflow for MARS will use dplyr selectors - scale and center not required
mars.recipe <-
  workflow_variables(outcomes = y,
                     predictors = everything())

# extract and review processed data
data.juice <- juice(prep(data.recipe)) # note that the response does not get scaled or centered
# SPECIFY MODELS

# mars
mars.spec <- mars(
  prod_degree = tune()) %>% # <- use GCV to choose terms; tune the highest interaction degree
  set_engine("earth") %>%
  set_mode("regression")

# svm
svm.spec <- svm_rbf(
  cost = tune(),
  rbf_sigma = tune()) %>%
  set_engine("kernlab") %>%
  set_mode("regression")

# knn
knn.spec <- nearest_neighbor(
  neighbors = tune(),
  dist_power = tune(),
  weight_func = tune()) %>%
  set_engine("kknn") %>%
  set_mode("regression")

# single-hidden-layer feed-forward neural network
nnet.spec <- mlp(
  hidden_units = tune(),
  penalty = tune(),
  epochs = tune()) %>%
  set_engine("nnet", MaxNWts = 2600) %>%
  set_mode("regression")
# a neural network should have up to 27 hidden units in a layer (Kuhn & Johnson)
# the following is adapted from https://www.tmwr.org/workflow-sets.html
nnet.param <-
  nnet.spec %>%
  extract_parameter_set_dials() %>%
  update(hidden_units = hidden_units(c(1, 27)))
# Build two workflow sets (needed when models use different recipes)
processed.wf <-
  workflow_set(
    preproc = list(normalized = data.recipe),
    models = list(SVM.radial = svm.spec, KNN = knn.spec, NNet = nnet.spec)
  )

unprocessed.wf <-
  workflow_set(
    preproc = list(simple = mars.recipe),
    models = list(MARS = mars.spec)
  )

# combine workflows
combined.wf <-
  bind_rows(processed.wf, unprocessed.wf) %>%
  mutate(wflow_id = gsub("(simple_)|(normalized_)", "", wflow_id))
# build grids and tune the models - use racing via finetune to speed analyses
race.ctrl <-
  control_race(
    save_pred = TRUE,
    parallel_over = "everything",
    save_workflow = TRUE
  )

grid.results <-
  combined.wf %>%
  workflow_map( # applies a function to all workflows in the set; tune_grid() is the default
    'tune_race_anova',
    seed = 1503,
    resamples = data.folds,
    grid = 25, # grid size of 25
    control = race.ctrl
  )

# review and save tuning results
grid.results %<>% # option column will be "tune[x]" in case of failure
  mutate(wflow_id = ifelse(wflow_id %in% 'unprocessed_MARS', 'MARS', wflow_id))
save(grid.results, file = 'grid_results.rda')
Which models appear to give the best performance?
The models, listed in order of increasing performance (decreasing RMSE), are KNN, SVM, NNET, and MARS (Figure 2 and Table 4):
# load saved models
load('grid_results.rda') # models saved as grid.results

# count the number of model configurations that were evaluated
number.models <- nrow(collect_metrics(grid.results, summarize = FALSE))

# the following works for control_grid but not race results
# grid.results %>%
#   rank_results() %>%
#   filter(.metric == "rmse") %>%
#   select(model, .config, rmse = mean, rank)

# clean up id name -- this could be part of combined.wf
# grid.results %<>%
#   mutate(wflow_id = gsub("(simple_)|(normalized_)", "", wflow_id))
# plot results by rmse rank
autoplot(
  grid.results,
  rank_metric = "rmse",  # <- how to order models
  metric = "rmse",       # <- which metric to visualize
  select_best = TRUE     # <- one point per workflow
) +
  geom_text(aes(y = mean - 1/2, label = wflow_id), angle = 0, vjust = -5) +
  lims(y = c(0.5, 2.6)) +
  labs(title = 'Figure 2. Estimated RMSE for Best Model Configuration', x = '') +
  theme_light() +
  theme(legend.position = "none") # applied after theme_light() so it is not overridden
# list best rmse results by model
collect_metrics(grid.results) %>%
  filter(.metric == "rmse") %>%
  dplyr::select(model, .metric, mean) %>%
  group_by(model) %>%
  slice(which.min(mean)) %>%
  rename(Model = model, Estimate = mean) %>%
  mutate(Metric = 'RMSE') %>%
  relocate(Metric, .after = Model) %>%
  mutate(Model = ifelse(Model %in% 'nearest_neighbor', 'knn', Model)) %>%
  mutate(Model = ifelse(Model %in% 'svm_rbf', 'svm', Model)) %>%
  mutate(Model = ifelse(Model %in% 'mlp', 'nnet', Model)) %>%
  mutate(Model = toupper(Model)) %>%
  arrange(desc(Estimate)) %>%
  flextable() %>%
  set_caption('Table 4. Model RMSE Scores: select_best()')
Model | Metric | .metric | Estimate |
KNN | RMSE | rmse | 2.464895 |
SVM | RMSE | rmse | 1.494128 |
NNET | RMSE | rmse | 1.086632 |
MARS | RMSE | rmse | 1.063130 |
The best cross-validated MARS configuration (Preprocessor1_Model1) will be used for the final fit and evaluation on the test set.
The tuning results (prod_degree = 2) are shown in Figure 3, and plots of test-set observations versus predictions and residuals are shown in Figures 4 and 5, respectively. The residual plot indicates approximately constant variance for this model.
# Extract the MARS results and select the best parameter setting
mars.best <-
  grid.results %>%
  extract_workflow_set_result("MARS") %>%
  select_best(metric = "rmse")

# Finalize the workflow and fit/evaluate on the test data
mars.test <-
  grid.results %>%
  extract_workflow("MARS") %>%
  finalize_workflow(mars.best) %>%
  last_fit(split = data.split)

# collect_metrics(mars.test)
# collect_predictions(mars.test)

# plot tuning parameter results for MARS (see also: collect_predictions(), collect_metrics())
autoplot(grid.results, id = "MARS", metric = "rmse") +
  labs(title = 'Figure 3. Tuning Parameter Results for MARS Model') +
  theme_classic()
# plot predicted and observed
mars.test %>%
  collect_predictions() %>%
  ggplot(aes(x = y, y = .pred)) +
  geom_abline(color = "gray50", lty = 2) +
  geom_point(color = 'steelblue', alpha = 0.5) +
  coord_obs_pred() +
  labs(title = 'Figure 4. Predicted vs. Observed: MARS Model',
       subtitle = 'Simulation Data', x = "observed", y = "predicted") +
  theme_minimal()
# collect diagnostic output and plot residuals
output <- extract_fit_engine(mars.test) %>% # underlying earth fit
  summary()

residuals <- output[11] # pull the residuals component from the earth summary object

residuals %<>%
  as.data.frame() %>%
  select(..y) %>%
  rename(resids = '..y')

pred.obs <- mars.test %>%
  collect_predictions() %>%
  select(.pred, y) %>%
  as.data.frame() %>%
  rename(fitted = .pred, observed = y)

diagnostics <- cbind(pred.obs, residuals)

diagnostics %>%
  ggplot(aes(x = observed, y = resids)) +
  geom_point(color = 'steelblue', alpha = .4) +
  labs(title = 'Figure 5. Observed Values vs. Residuals: MARS Model') +
  theme_classic()
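A more direct way to obtain held-out residuals is to compute them from the collected predictions; the sketch below shows this alternative.
# alternative: compute test-set residuals directly from the collected predictions
mars.test %>%
  collect_predictions() %>%
  mutate(resid = y - .pred) %>%
  ggplot(aes(x = y, y = resid)) +
  geom_point(color = 'steelblue', alpha = .4) +
  geom_hline(yintercept = 0, lty = 2, color = 'gray50') +
  labs(x = 'observed', y = 'residual') +
  theme_classic()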
Does MARS select the informative predictors (those named X1–X5)?
Yes. The MARS model selects only the informative predictors X1-X5 (5 of 10 predictors), as indicated by the model summary below.
# collect model coeffs
summary(output)
## Call: earth(formula=..y~., data=data, keepxy=TRUE, degree=~2L)
##
## coefficients
## (Intercept) 13.055969
## h(0.485481-x.1) -14.145744
## h(x.1-0.485481) 10.371967
## h(0.497739-x.2) -9.236100
## h(x.2-0.497739) 8.361107
## h(x.3-0.238349) 9.536660
## h(0.48801-x.3) 15.628943
## h(x.3-0.48801) -3.264807
## h(x.3-0.839676) 12.966092
## h(0.398123-x.4) -10.218658
## h(x.4-0.398123) 9.910356
## h(0.920768-x.5) -4.985451
## h(x.5-0.920768) 8.235699
## h(0.28075-x.1) * h(0.497739-x.2) 29.838202
## h(x.1-0.28075) * h(0.497739-x.2) -20.738344
## h(0.504505-x.1) * h(x.2-0.497739) -13.719511
## h(x.1-0.504505) * h(x.2-0.497739) -45.144849
## h(x.1-0.759543) * h(x.2-0.497739) -34.963705
##
## Selected 18 of 18 terms, and 5 of 10 predictors
## Termination condition: Reached nk 21
## Importance: x.4, x.1, x.2, x.5, x.3, x.6-unused, x.7-unused, x.8-unused, ...
## Number of terms at each degree of interaction: 1 12 5
## GCV 1.159366 RSS 1639.646 GRSq 0.9553108 RSq 0.9578089
#coefficients<-output[12]
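As a cross-check, earth's built-in variable importance can be queried from the finalized fit; a short sketch using the same extraction as above:
# variable importance from the underlying earth fit (only x.1 - x.5 should appear)
mars.engine <- extract_fit_engine(mars.test)
evimp(mars.engine) # nsubsets / GCV / RSS importance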
Exercise 6.3 describes data for a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before and train several nonlinear regression models.
The following models will be trained for this exercise: MARS, SVM, KNN, and NNET. The KNN model produced the best fit to the training data (based on RMSE; Figure 6 and Table 5), with its cross-validated tuning parameters shown in Table 6.
Test-set observations are plotted against model predictions in Figure 7.
# load data
data(ChemicalManufacturingProcess)
chem <- ChemicalManufacturingProcess

head(chem)
# CREATE SPLITS AND CV FOLDS
# test and train sets
set.seed(100)

# create random data split (default 75/25)
chem.split <- initial_split(chem)

# create train and test sets from split
chem.train <- training(chem.split)
chem.test <- testing(chem.split)

# CREATE RESAMPLES
set.seed(101)
chem.folds <- vfold_cv(chem.train, v = 10)
# BUILD RECIPES
chem.recipe <-
  recipe(Yield ~ ., data = chem.train) %>%
  step_impute_knn(all_predictors()) %>%
  step_normalize(all_predictors())

chem.juice <- juice(prep(chem.recipe))

# MARS does not require centering and scaling
mchem.recipe <-
  recipe(Yield ~ ., data = chem.train) %>%
  step_impute_knn(all_predictors())

mchem.juice <- juice(prep(mchem.recipe))
# SPECIFY MODELS

# mars
mars2.spec <- mars(
  prod_degree = tune()) %>% # <- use GCV to choose terms
  set_engine("earth") %>%
  set_mode("regression")

# svm
svm2.spec <- svm_rbf(
  cost = tune(),
  rbf_sigma = tune()) %>%
  set_engine("kernlab") %>%
  set_mode("regression")

# knn
knn2.spec <- nearest_neighbor(
  neighbors = tune(),
  dist_power = tune(),
  weight_func = tune()) %>%
  set_engine("kknn") %>%
  set_mode("regression")

# single-hidden-layer feed-forward neural network
nnet2.spec <- mlp(
  hidden_units = tune(),
  penalty = tune(),
  epochs = tune()) %>%
  set_engine("nnet", MaxNWts = 2600) %>%
  set_mode("regression")

# a neural network should have up to 27 hidden units in a layer (Kuhn & Johnson)
# the following is adapted from https://www.tmwr.org/workflow-sets.html
nnet2.param <-
  nnet2.spec %>%
  extract_parameter_set_dials() %>%
  update(hidden_units = hidden_units(c(1, 27)))
# Build two workflow sets (needed when models use different recipes)
chem.wf <-
  workflow_set(
    preproc = list(normalized = chem.recipe),
    models = list(SVM.radial = svm2.spec, KNN = knn2.spec, NNet = nnet2.spec)
  ) %>%
  mutate(wflow_id = gsub("(normalized_)", "", wflow_id))

mchem.wf <-
  workflow_set(
    preproc = list(simple = mchem.recipe),
    models = list(MARS = mars2.spec)
  ) %>%
  mutate(wflow_id = gsub("(simple_)", "", wflow_id))

# combine workflows
combined2.wf <- bind_rows(chem.wf, mchem.wf)
# build grids and tune the models - use racing via finetune to speed analyses
race2.ctrl <-
  control_race(
    save_pred = TRUE,
    parallel_over = "everything",
    save_workflow = TRUE
  )

grid2.results <-
  combined2.wf %>%
  workflow_map( # applies a function to all workflows in the set; tune_grid() is the default
    'tune_race_anova',
    seed = 1511,
    resamples = chem.folds,
    grid = 25, # grid size of 25
    control = race2.ctrl
  )

# save model results to local directory
save(grid2.results, file = 'grid2_results.rda')
# plot results by rmse rank
autoplot(
  grid2.results,
  rank_metric = "rmse",  # <- how to order models
  metric = "rmse",       # <- which metric to visualize
  select_best = TRUE     # <- one point per workflow
) +
  geom_text(aes(y = mean - 1/2, label = wflow_id), angle = 0, vjust = -5) +
  lims(y = c(0.5, 2.6)) +
  labs(title = 'Figure 6. Estimated RMSE for Best Model Configuration', x = '') +
  theme_light() +
  theme(legend.position = "none") # applied after theme_light() so it is not overridden
# list best rmse results by model
collect_metrics(grid2.results) %>%
  filter(.metric == "rmse") %>%
  dplyr::select(model, .metric, mean) %>%
  group_by(model) %>%
  slice(which.min(mean)) %>%
  rename(Model = model, Estimate = mean) %>%
  mutate(Metric = 'RMSE') %>%
  relocate(Metric, .after = Model) %>%
  mutate(Model = ifelse(Model %in% 'nearest_neighbor', 'knn', Model)) %>%
  mutate(Model = ifelse(Model %in% 'svm_rbf', 'svm', Model)) %>%
  mutate(Model = ifelse(Model %in% 'mlp', 'nnet', Model)) %>%
  mutate(Model = toupper(Model)) %>%
  arrange(desc(Estimate)) %>%
  flextable() %>%
  set_caption('Table 5. Model RMSE Scores: select_best()')
Model | Metric | .metric | Estimate |
NNET | RMSE | rmse | 1.614150 |
MARS | RMSE | rmse | 1.379640 |
SVM | RMSE | rmse | 1.165347 |
KNN | RMSE | rmse | 1.152328 |
# Extract the KNN results and select the best parameter setting
knn.best <-
  grid2.results %>%
  extract_workflow_set_result("KNN") %>%
  select_best(metric = "rmse")

knn.best %>%
  flextable() %>%
  set_caption('Table 6. KNN Best-Fit Parameters from Cross-Validation')
neighbors | weight_func | dist_power | .config |
9 | biweight | 0.5845462 | Preprocessor1_Model01 |
# Finalize the workflow and fit/evaluate on the test data
knn.test <-
  grid2.results %>%
  extract_workflow("KNN") %>%
  finalize_workflow(knn.best) %>%
  last_fit(split = chem.split)

# plot predicted and observed
knn.test %>%
  collect_predictions() %>%
  ggplot(aes(x = Yield, y = .pred)) +
  geom_abline(color = "gray50", lty = 2) +
  geom_point(color = 'steelblue', alpha = 0.5) +
  coord_obs_pred() +
  labs(title = 'Figure 7. Predicted vs. Observed: KNN Model',
       subtitle = 'Chemical Manufacturing Data', x = "observed", y = "predicted") +
  theme_minimal()
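The held-out performance of the finalized KNN workflow can be summarized with collect_metrics() (a quick check; values depend on the random split):
# test-set RMSE and R-squared for the finalized KNN workflow
collect_metrics(knn.test)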
The top five predictors in the optimal nonlinear regression model are ManufacturingProcesses 24, 38, 28, 39, and 29 (see the feature importance plot below). None of the biological variables ranked highly in variable importance.
These predictors differ from those ranked highest by the optimal linear model (elastic net), indicating that each model responds to different structure in the data. The differences may also reflect how multicollinearity affects each model.
For example, data preprocessing for the elastic net model included selective removal of highly correlated predictors (as well as those with near-zero variance); that step was not included in the KNN workflow reported here.
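For comparison, a recipe with those additional filters would look like the sketch below (illustrative only; it was not used for the KNN model above, and the 0.9 correlation threshold is an assumed value).
# hypothetical recipe adding the elastic-net style filters (not used in the KNN workflow above)
chem.filtered.recipe <-
  recipe(Yield ~ ., data = chem.train) %>%
  step_impute_knn(all_predictors()) %>%
  step_nzv(all_predictors()) %>%                   # drop near-zero-variance predictors
  step_corr(all_predictors(), threshold = 0.9) %>% # drop highly correlated predictors
  step_normalize(all_predictors())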
# https://github.com/ModelOriented/DALEXtra/issues/65
chem2.recipe <-
  recipe(Yield ~ ., data = chem) %>% # using full chem dataset
  step_impute_knn(all_predictors()) %>%
  step_normalize(all_predictors())

# set up spec for best KNN model
knn3.spec <- nearest_neighbor(
  neighbors = 9,
  dist_power = 0.5845462,
  weight_func = 'biweight') %>%
  set_engine("kknn") %>%
  set_mode("regression")

# build new knn workflow
knn3.wf <-
  workflow() %>%
  add_model(knn3.spec) %>%
  add_recipe(chem2.recipe) # could have added fit here but ran into trouble with vip_features

knn3.fit <-
  knn3.wf %>%
  fit(data = chem)

feature_data <-
  knn3.fit %>%
  extract_mold() %>% # extracts preprocessing info
  pluck("predictors") # pluck enables deep indexing

outcome_data <-
  knn3.fit %>%
  extract_mold() %>%
  pluck("outcomes") %>%
  pluck(1)

vip_features <-
  explain_tidymodels(
    knn3.fit,
    data = feature_data,
    y = outcome_data
  ) %>%
  model_parts()
## Preparation of a new explainer is initiated
##   -> model label       : workflow (default)
##   -> data              : 176 rows 57 cols
##   -> data              : tibble converted into a data.frame
##   -> target variable   : 176 values
##   -> predict function  : yhat.workflow will be used (default)
##   -> predicted values  : No value for predict function target column. (default)
##   -> model_info        : package tidymodels, ver. 0.1.4, task regression (default)
##   -> predicted values  : numerical, min = 39.76276, mean = 39.83788, max = 39.88954
##   -> residual function : difference between y and yhat (default)
##   -> residuals         : numerical, min = -4.625692, mean = 0.3386529, max = 6.492782
## A new explainer has been created!
plot(vip_features, max_vars=10)
Do these plots reveal intuition about the biological or process predictors and their relationship with yield?
The plots do reveal distinct patterns with respect to Yield. For example, several processes display a negative relationship with Yield (MP24, MP38, MP42, and MP36 in the scatterplots below), while for other processes (MP28, MP39, MP29, MP30, MP43) the relationship is obscured by the presence of zero values.
# identify the top ten predictors by permutation importance and pull them from chem for plotting
top.vars <- vip_features %>%
  as.data.frame() %>%
  group_by(variable) %>%
  summarise(loss = mean(dropout_loss)) %>%
  arrange(desc(loss)) %>%
  head(10) %>%
  select(variable)

ten <- chem %>%
  select(c(Yield, ManufacturingProcess28, ManufacturingProcess24, ManufacturingProcess38,
           ManufacturingProcess42, ManufacturingProcess39, ManufacturingProcess29,
           ManufacturingProcess30, ManufacturingProcess36, ManufacturingProcess22,
           ManufacturingProcess43)) %>%
  na.omit()

colnames(ten) <- gsub("ManufacturingProcess", "M_Process", colnames(ten))
# Plot scatterplots of Yield against the top process variables
response = names(ten)[1] # response variable (Yield)
response = purrr::set_names(response)

explain <- names(ten)[2:11] # explanatory variables
explain = purrr::set_names(explain)

scatter = function(x) {
  ggplot(ten, aes_string(x = x, y = 'Yield')) +
    geom_point(color = 'midnightblue', alpha = 0.4, show.legend = FALSE) +
    theme_classic()
}

plots <- map(explain, ~scatter(.x)) # creates a list of plots using purrr

# layout with patchwork
plot(reduce(plots, `+`))
With these zero values removed, Yield increases with MP28, MP39, and MP29; decreases with MP43; and spans a range of values at the single remaining level of MP30 (see below).
# evaluate the same processes after dropping zero values
zero <- chem %>%
  select(c(Yield, ManufacturingProcess28, ManufacturingProcess24, ManufacturingProcess38,
           ManufacturingProcess42, ManufacturingProcess39, ManufacturingProcess29,
           ManufacturingProcess30, ManufacturingProcess36, ManufacturingProcess22,
           ManufacturingProcess43)) %>%
  mutate(across(everything(), ~na_if(., 0))) %>% # recode zeros to NA
  na.omit()

colnames(zero) <- gsub("ManufacturingProcess", "M_Process", colnames(zero))

# Plot scatterplots of Yield against the same predictors, zero values removed
response = names(zero)[1] # response variable (Yield)
response = purrr::set_names(response)

explain <- names(zero)[2:11] # explanatory variables
explain = purrr::set_names(explain)

scatter = function(x) {
  ggplot(zero, aes_string(x = x, y = 'Yield')) +
    geom_point(color = 'midnightblue', alpha = 0.4, show.legend = FALSE) +
    theme_classic()
}

plots <- map(explain, ~scatter(.x)) # creates a list of plots using purrr

# layout with patchwork
plot(reduce(plots, `+`))