library(tidymodels)
library(finetune) # racing methods to speed up grid search
library(earth) # MARS model
library(mlbench) # data 
library(DALEXtra)

library(tidyverse)
library(dlookr)
library(magrittr)
library(flextable)
library(patchwork)


library(AppliedPredictiveModeling) # dataset



setwd("C:/Users/seanc/OneDrive/Desktop/624/HW8")

1 Background

This report focuses on the training and evaluation of nonlinear regression models (e.g., neural networks, MARS, SVM, KNN) and is a response to questions 7.2 and 7.5 from Kuhn and Johnson’s Applied Predictive Modeling.

2 Exercise 7.2

Friedman (1991) introduced several benchmark data sets created by simulation. One of these simulations used the following nonlinear equation to create data:

\(y = 10\sin(\pi x_{1}x_{2}) + 20(x_{3}-0.5)^{2} + 10x_{4} + 5x_{5} + N(0,\sigma^{2})\)

where the x values are random variables uniformly distributed on [0, 1] (the simulation also creates five additional non-informative predictors).

Tune several models on these data.

2.1 Data Acquisition and Exploratory Analysis

The first five rows of the dataset are shown in Table 1. There are 2000 observations in the simulated dataset.

# load data

fxn<- mlbench.friedman1

# create a dataset of nrow = 2000

set.seed(01)

data<-fxn(2000)%>%
  as.data.frame()%>%
  relocate(y, .before = x.1)

# view data head

data%>%head(5)%>%
  flextable()%>%
  set_caption('Table 1. First Five Rows of Dataset.')

2.2 Pairwise Comparisons

Scatterplots of y (response) vs. the numerical predictors show various forms of relationship, from roughly linear (x.4, x.5) and nonlinear (x.1-x.3) to indeterminate (x.6-x.9).

# plot scatterplots

response = names(data)[1] # response variable (y)
response = purrr::set_names(response)


explain <- names(data)[2:10] #explanatory variables
explain = purrr::set_names(explain)

scatter = function(x) {
  ggplot(data, aes(x = .data[[x]], y = y)) + # .data pronoun replaces the deprecated aes_string()
  geom_point(color='midnightblue', alpha = 0.05, show.legend = FALSE)+
  theme_classic()

}

plots<-map(explain, ~scatter(.x)) #creates a list of plots using purrr

# layout with patchwork

plot(reduce(plots,`+`))

2.3 Multicollinearity

Analysis of the numerical variables indicates an absence of highly correlated covariates, obviating concerns about multicollinearity (Table 2).

data%>%
    #filter_if(is.numeric)%>%
    correlate()%>%
    filter(coef_corr > .3 | coef_corr < -.3)%>% # set thresholds to limit output 
    flextable()%>%
    set_caption("Table 1. Pairwise Correlation: coeff_corr > .3")

Similarly, there are no missing values that would require imputation prior to model development (Table 3).

# collect statistics

data%>%
  diagnose_numeric()%>%
  flextable()%>%
  set_caption('Table 3. Univariate Profile for Simulated Data')

2.4 Model Splits

Prior to modeling, we create training and test sets as well as set up 10-fold cross validation.

# create training and test sets

set.seed(02)

# create random data split (default is 3/4 training, 1/4 testing)

data.split <- initial_split(data)

# create train and test sets from split

data.train <- training(data.split) # dim 1500 x 11

data.test  <- testing(data.split) # dim 500 x 11

# create resamples for model tuning

set.seed(03)

data.folds<-
  vfold_cv(data.train, v=10) 

2.5 Model Development

Tidymodels will be used to construct and compare the following predictive algorithms for this dataset:

  1. MARS - multivariate adaptive regression splines
  2. SVM - support vector machine
  3. KNN - K-nearest neighbor
  4. NNET - a single-hidden-layer feed-forward neural network (up to 27 hidden units)

Hyper-parameters for the models are tuned via cross-validation over a 25-point grid, using ANOVA-based racing (finetune) to discard clearly inferior candidates early.

Model results are saved to a local directory as an .rda file for easy retrieval.

# create preprocessing recipe - we will use the same recipe for all models except MARS
# based on https://www.tmwr.org/workflow-sets.html

data.recipe <- 
  recipe(y ~., data=data.train)%>% 
  step_normalize(all_numeric_predictors()) # scale and center

# workflow for MARS will use dplyr selectors - scale and center not required

mars.recipe <- 
   workflow_variables(outcomes = y, 
                      predictors = everything())

# extract and review processed data
    
data.juice <-juice(prep(data.recipe)) # note that the response is not scaled and centered

# SPECIFY MODELS

# mars

mars.spec<-mars(
  prod_degree = tune()) %>%  #<- use GCV to choose terms, highest possible interaction
  set_engine("earth") %>% 
  set_mode("regression")

#svm

svm.spec<-svm_rbf(
  cost = tune(), 
  rbf_sigma = tune()) %>% 
  set_engine("kernlab") %>% 
  set_mode("regression")


# knn

knn.spec<-nearest_neighbor(
  neighbors = tune(), 
  dist_power = tune(), 
  weight_func = tune()) %>% 
  set_engine("kknn") %>% 
  set_mode("regression")

# single feed forward layer neural network

nnet.spec<-mlp(
  hidden_units = tune(), 
  penalty = tune(),
  epochs = tune()) %>% 
  set_engine("nnet", MaxNWts = 2600) %>% 
  set_mode("regression")

# per Kuhn and Johnson, the network should have up to 27 hidden units in its single hidden layer

# the following is adopted from https://www.tmwr.org/workflow-sets.html

nnet.param <- 
   nnet.spec %>% 
   extract_parameter_set_dials() %>% 
   update(hidden_units = hidden_units(c(1, 27)))

#Build two workflow sets (can use when there are different recipes)

processed.wf<-
  workflow_set(
    preproc=list(normalized = data.recipe),
    models = list(SVM.radial = svm.spec, KNN = knn.spec, NNet = nnet.spec)
  )%>%
  option_add(param_info = nnet.param, id = "normalized_NNet") # pass the 1-27 hidden_units range to tuning

unprocessed.wf<-
  workflow_set(
    preproc=list(simple = mars.recipe),
    models = list(MARS = mars.spec)
  )

#combine workflows 

combined.wf<-
  bind_rows(processed.wf, unprocessed.wf)%>%
  mutate(wflow_id = gsub("(simple_)|(normalized_)", "", wflow_id))


# build grids and tune the models - use racing via finetune to speed analyses

race.ctrl <-
   control_race(
      save_pred = TRUE,
      parallel_over = "everything",
      save_workflow = TRUE
   )
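
# note: parallel_over = "everything" only has an effect if a parallel backend
# is registered first; a minimal sketch (assumes the doParallel package is
# installed), left commented so it is not run here:
# library(doParallel)
# cl <- parallel::makePSOCKcluster(parallel::detectCores(logical = FALSE))
# registerDoParallel(cl)
# ... run workflow_map(), then stopCluster(cl)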

grid.results <-
   combined.wf %>%
   workflow_map(     # applies fxn to all workflows in set, tune_grid() is the default
      'tune_race_anova',
      seed = 1503,
      resamples = data.folds,
      grid = 25, # setting up grid size of 25
      control = race.ctrl
   )

# review and save tuning results

grid.results%<>% # safeguard: ensure a clean MARS id (ids were already renamed when the set was built)
  mutate(wflow_id=ifelse(wflow_id %in% 'unprocessed_MARS', 'MARS', wflow_id))
  
  
save(grid.results, file='grid_results.rda')

Which models appear to give the best performance?

The models are listed in descending order of performance (based on RMSE) as follows (Figure 2 and Table 4):

  • MARS
  • KNN
  • SVM
  • NNET
# load saved models

load('grid_results.rda') # models saved as grid.results

# sum the number of models that were run

number.models <- nrow(collect_metrics(grid.results, summarize = FALSE))

# the following ranking works for control_grid results but did not run cleanly
# for these racing results:

#grid.results %>%
#  rank_results() %>%
#  filter(.metric == "rmse")

#plot results by rmse rank

autoplot(
  grid.results,
  rank_metric = "rmse",  # <- how to order models
  metric = "rmse",       # <- which metric to visualize
  select_best = TRUE) +     # <- one point per workflow
  geom_text(aes(y = mean - 1/2, label = wflow_id), angle = 0, vjust = -5) +
  lims(y = c(0.5, 2.6)) +
  labs(title='Figure 2. Estimated RMSE for Best Model Configuration', x='')+
  theme(legend.position = "none")+
  theme_light()

 # list best rmse results by model

collect_metrics(grid.results)%>%
  filter(.metric == "rmse")%>%
  dplyr::select(model, .metric, mean) %>%
  group_by(model)%>%
  slice(which.min(mean))%>%
  rename(c(Model = model, Estimate = 'mean'))%>%
  mutate(Metric = 'RMSE')%>%
  relocate(Metric, .after=Model)%>%
  mutate(Model = ifelse(Model %in% 'nearest_neighbor', 'knn', Model))%>%
  mutate(Model = ifelse(Model %in% 'svm_rbf', 'svm', Model))%>%
  mutate(Model = ifelse(Model %in% 'mlp', 'nnet', Model))%>%
  mutate(Model = toupper(Model))%>%
  arrange(desc(Estimate))%>%
  flextable()%>%
  set_caption('Table 4. Model RMSE Scores via select_best()')

2.6 Model Evaluation: MARS

The best cross-validated configuration for MARS (Preprocessor1_Model1) is used for the final fit and evaluation on the test set.

The parameter-tuning results (best prod_degree = 2) are shown in Figure 3, and plots of test-set observations vs. predictions and of residuals are shown in Figures 4 and 5, respectively. The latter indicates approximately constant variance for this model.

# Extract the best MARS tuning parameters from the race results

mars.best <- 
   grid.results %>% 
   extract_workflow_set_result("MARS") %>% 
   select_best(metric = "rmse")

# Finalize the workflow and evaluate it on the test split

mars.test<-
  grid.results%>%
  extract_workflow("MARS") %>% 
  finalize_workflow(mars.best) %>% 
  last_fit(split = data.split)

#collect_metrics(mars.test)
#collect_predictions(mars.test)

# plot tuning parameter results for MARS (see also: collect_predictions(), collect_metrics())
  
autoplot(grid.results, id = "MARS", metric = "rmse", color='blue')+
  labs(title='Figure 3. Tuning Parameter Results for MARS Model')+
  theme_classic()

# plot predicted and observed

mars.test %>%
  collect_predictions() %>% 
  ggplot(aes(x = y, y = .pred)) + 
  geom_abline(color = "gray50", lty = 2) + 
  geom_point(color='steelblue', alpha = 0.5) + 
  coord_obs_pred() + 
  labs(title= 'Figure 4. Predicted vs. Observed: MARS Model', subtitle='Simulation Data', x = "observed", y = "predicted")+
  theme_minimal()

# collect diagnostic output and plot resids


output<-extract_fit_engine(mars.test)%>%
  summary()

residuals<-output[11] # positional index into the earth summary; output$residuals is the named equivalent

residuals%<>%
  as.data.frame()%>%
  select(..y)%>%
  rename(resids = '..y')

pred.obs<-mars.test%>%
  collect_predictions()%>%
  select(.pred, y)%>%
  as.data.frame()%>%
  rename(fitted=.pred, observed=y)

diagnostics<-cbind(pred.obs, residuals)

diagnostics%>%
  ggplot(aes(x=observed, y=resids))+
  geom_point(color='steelblue', alpha=.4)+
  labs(title='Figure 5. Observed Values vs. Residuals: MARS Model')+
  theme_classic()

Does MARS select the informative predictors (those named X1–X5)?

Yes. The MARS model selects only the informative predictors x.1-x.5 (5 of 10 predictors), as indicated by the model summary below.

# collect model coeffs

summary(output)
## Call: earth(formula=..y~., data=data, keepxy=TRUE, degree=~2L)
## 
##                                   coefficients
## (Intercept)                          13.055969
## h(0.485481-x.1)                     -14.145744
## h(x.1-0.485481)                      10.371967
## h(0.497739-x.2)                      -9.236100
## h(x.2-0.497739)                       8.361107
## h(x.3-0.238349)                       9.536660
## h(0.48801-x.3)                       15.628943
## h(x.3-0.48801)                       -3.264807
## h(x.3-0.839676)                      12.966092
## h(0.398123-x.4)                     -10.218658
## h(x.4-0.398123)                       9.910356
## h(0.920768-x.5)                      -4.985451
## h(x.5-0.920768)                       8.235699
## h(0.28075-x.1) * h(0.497739-x.2)     29.838202
## h(x.1-0.28075) * h(0.497739-x.2)    -20.738344
## h(0.504505-x.1) * h(x.2-0.497739)   -13.719511
## h(x.1-0.504505) * h(x.2-0.497739)   -45.144849
## h(x.1-0.759543) * h(x.2-0.497739)   -34.963705
## 
## Selected 18 of 18 terms, and 5 of 10 predictors
## Termination condition: Reached nk 21
## Importance: x.4, x.1, x.2, x.5, x.3, x.6-unused, x.7-unused, x.8-unused, ...
## Number of terms at each degree of interaction: 1 12 5
## GCV 1.159366    RSS 1639.646    GRSq 0.9553108    RSq 0.9578089
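As a quick cross-check (an aside, not part of the original workflow), earth's built-in evimp() function ranks predictor importance for the fitted MARS model and should flag the same five informative variables:

# variable importance from the earth fit (nsubsets, gcv, and rss criteria)
mars.engine<-extract_fit_engine(mars.test)
evimp(mars.engine)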

3 Exercise 7.5

Exercise 6.3 describes data for a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before and train several nonlinear regression models.

3.1 Model Development

  1. Which nonlinear regression model gives the optimal resampling and test set performance?

The following models are trained for this exercise: MARS, SVM, KNN, and NNET. The KNN model produced the best fit to the training data (based on RMSE; Figure 6 and Table 5), with its cross-validated best parameters shown in Table 6.

Test set observations are plotted against model predictions in Figure 7.

# load data

data(ChemicalManufacturingProcess)

chem<- ChemicalManufacturingProcess

head(chem)
# CREATE SPLITS AND CV-FOLD

# test and train sets

set.seed(100)

# random data split (default is 3/4 training, 1/4 testing)

chem.split <- initial_split(chem)

# create train and test sets from split

chem.train <- training(chem.split) # dim 132 x 58

chem.test  <- testing(chem.split) # dim 44 x 58

# CREATE RESAMPLES

set.seed(101)

chem.folds<-
  vfold_cv(chem.train, v=10)

# BUILD RECIPES

chem.recipe <- 
  recipe(Yield ~., data=chem.train)%>% 
  step_impute_knn(all_predictors()) %>% 
  step_normalize(all_predictors())


chem.juice <-juice(prep(chem.recipe))

mchem.recipe<-
  recipe(Yield ~., data=chem.train)%>% # does not require center and scale
  step_impute_knn(all_predictors())

mchem.juice <-juice(prep(mchem.recipe))

# SPECIFY MODELS

# mars

mars2.spec<-mars(
  prod_degree = tune()) %>%  #<- use GCV to choose terms
  set_engine("earth") %>% 
  set_mode("regression")

#svm

svm2.spec<-svm_rbf(
  cost = tune(), 
  rbf_sigma = tune()) %>% 
  set_engine("kernlab") %>% 
  set_mode("regression")


# knn

knn2.spec<-nearest_neighbor(
  neighbors = tune(), 
  dist_power = tune(), 
  weight_func = tune()) %>% 
  set_engine("kknn") %>% 
  set_mode("regression")

# single feed forward layer neural network

nnet2.spec<-mlp(
  hidden_units = tune(), 
  penalty = tune(),
  epochs = tune()) %>% 
  set_engine("nnet", MaxNWts = 2600) %>% 
  set_mode("regression")

# per Kuhn and Johnson, the network should have up to 27 hidden units in its single hidden layer

# the following is adopted from https://www.tmwr.org/workflow-sets.html

nnet2.param <- 
   nnet2.spec %>% 
   extract_parameter_set_dials() %>% 
   update(hidden_units = hidden_units(c(1, 27)))

#Build two workflow sets (can use when there are different recipes)

chem.wf<-
  workflow_set(
    preproc=list(normalized = chem.recipe),
    models = list(SVM.radial = svm2.spec, KNN = knn2.spec, NNet = nnet2.spec)
  )%>%
  option_add(param_info = nnet2.param, id = "normalized_NNet")%>% # pass the 1-27 hidden_units range to tuning
  mutate(wflow_id = gsub("(normalized_)", "", wflow_id))

mchem.wf<-
  workflow_set(
    preproc=list(simple = mchem.recipe),
    models = list(MARS = mars2.spec)
  )%>%
  mutate(wflow_id = gsub("(simple_)", "", wflow_id))

#combine workflows 

combined2.wf<-
  bind_rows(chem.wf, mchem.wf)

  


# build grids and tune the models - use racing via finetune to speed analyses

race2.ctrl <-
   control_race(
      save_pred = TRUE,
      parallel_over = "everything",
      save_workflow = TRUE
   )

grid2.results <-
   combined2.wf %>%
   workflow_map(     # applies fxn to all workflows in set, tune_grid() is the default
      'tune_race_anova',
      seed = 1511,
      resamples = chem.folds,
      grid = 25, # setting up grid size of 25
      control = race2.ctrl
   )

# save model results to local directory

save(grid2.results, file='grid2_results.rda')

#plot results by rmse rank

autoplot(
  grid2.results,
  rank_metric = "rmse",  # <- how to order models
  metric = "rmse",       # <- which metric to visualize
  select_best = TRUE) +     # <- one point per workflow
  geom_text(aes(y = mean - 1/2, label = wflow_id), angle = 0, vjust = -5) +
  lims(y = c(0.5, 2.6)) +
  labs(title='Figure 6. Estimated RMSE for Best Model Configuration', x='')+
  theme(legend.position = "none")+
  theme_light()

 # list best rmse results by model

collect_metrics(grid2.results)%>%
  filter(.metric == "rmse")%>%
  dplyr::select(model, .metric, mean) %>%
  group_by(model)%>%
  slice(which.min(mean))%>%
  rename(c(Model = model, Estimate = 'mean'))%>%
  mutate(Metric = 'RMSE')%>%
  relocate(Metric, .after=Model)%>%
  mutate(Model = ifelse(Model %in% 'nearest_neighbor', 'knn', Model))%>%
  mutate(Model = ifelse(Model %in% 'svm_rbf', 'svm', Model))%>%
  mutate(Model = ifelse(Model %in% 'mlp', 'nnet', Model))%>%
  mutate(Model = toupper(Model))%>%
  arrange(desc(Estimate))%>%
  flextable()%>%
  set_caption('Table 5. Model RMSE Scores via select_best()')
# Extract the best KNN tuning parameters from the race results

knn.best <- 
   grid2.results %>% 
   extract_workflow_set_result("KNN") %>% 
   select_best(metric = "rmse")

knn.best%>%
  flextable()%>%
  set_caption('Table 6. KNN Best-Fit Parameters from Cross-Validation')
# Finalize the workflow and evaluate it on the test split

knn.test<-
  grid2.results%>%
  extract_workflow("KNN") %>% 
  finalize_workflow(knn.best) %>% 
  last_fit(split = chem.split)

# plot predicted and observed

knn.test %>%
  collect_predictions() %>% 
  ggplot(aes(x = Yield, y = .pred)) + 
  geom_abline(color = "gray50", lty = 2) + 
  geom_point(color='steelblue', alpha = 0.5) + 
  coord_obs_pred() + 
  labs(title= 'Figure 7. Predicted vs. Observed: KNN Model', subtitle='Chemical Manufacturing Data', x = "observed", y = "predicted")+
  theme_minimal()

##   neighbors  weight_func  dist_power  .config
##           9  biweight      0.5845462  Preprocessor1_Model01
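
The exercise also asks about test set performance; test-set RMSE and R-squared for the finalized workflow can be pulled directly from the last_fit() object (a quick check, output not shown):

# collect test-set metrics for the finalized KNN model
collect_metrics(knn.test)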

3.2 Variable Importance

  1. Which predictors are most important in the optimal nonlinear regression model? Do either the biological or process variables dominate the list? How do the top ten important predictors compare to the top ten predictors from the optimal linear model?

The top five predictors in the optimal nonlinear regression model are Manufacturing Processes 24, 38, 28, 39, and 29 (see the feature importance plot below). In this analysis, none of the biological variables ranked highly in variable importance.

These predictors differ from those ranked highly by the optimal linear model (elastic net), indicating that each model responds to different structure in the data. The differences may also reflect how each model is affected by multicollinearity.

For example, preprocessing for the elastic net model included selective removal of highly correlated predictors (as well as those with near-zero variance). That step was not included in the KNN workflow reported here; a sketch of what it would look like follows.
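
As an illustration only (not run in this analysis), those filtering steps could be added to the KNN recipe with the standard recipes steps step_nzv() and step_corr(); the 0.9 correlation threshold below is an assumed, illustrative value:

# sketch: recipe variant mirroring the elastic net preprocessing
chem.filtered.recipe <-
  recipe(Yield ~., data=chem.train)%>%
  step_impute_knn(all_predictors()) %>%
  step_nzv(all_predictors()) %>% # drop near-zero-variance predictors
  step_corr(all_numeric_predictors(), threshold = 0.9) %>% # drop highly correlated predictors
  step_normalize(all_predictors())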

#https://github.com/ModelOriented/DALEXtra/issues/65


chem2.recipe <- 
  recipe(Yield ~., data=chem)%>% # using full chem dataset
  step_impute_knn(all_predictors()) %>% 
  step_normalize(all_predictors())

# set up spec for best KNN model

knn3.spec<-nearest_neighbor(
  neighbors = 9, 
  dist_power = 0.5845462, 
  weight_func = 'biweight') %>% 
  set_engine("kknn") %>% 
  set_mode("regression")

# build new knn workflow

knn3.wf <- 
  workflow() %>% 
  add_model(knn3.spec) %>% 
  add_recipe(chem2.recipe) # could have added fit here but ran into trouble with vip_features

knn3.fit <-
  knn3.wf%>%
  fit(data = chem)

feature_data <- 
  knn3.fit%>% 
  extract_mold() %>% # extracts preprocessing info
  pluck("predictors") # enables deep indexing

outcome_data <- 
  knn3.fit %>% 
  extract_mold() %>% 
  pluck("outcomes") %>% 
  pluck(1)

vip_features <- 
  explain_tidymodels(
    knn3.fit, 
    data = feature_data, 
    y = outcome_data
  ) %>% 
  model_parts()
## Preparation of a new explainer is initiated
##   -> model label       :  workflow  (  default  )
##   -> data              :  176  rows  57  cols 
##   -> data              :  tibble converted into a data.frame 
##   -> target variable   :  176  values 
##   -> predict function  :  yhat.workflow  will be used (  default  )
##   -> predicted values  :  No value for predict function target column. (  default  )
##   -> model_info        :  package tidymodels , ver. 0.1.4 , task regression (  default  ) 
##   -> predicted values  :  numerical, min =  39.76276 , mean =  39.83788 , max =  39.88954  
##   -> residual function :  difference between y and yhat (  default  )
##   -> residuals         :  numerical, min =  -4.625692 , mean =  0.3386529 , max =  6.492782  
##   A new explainer has been created! 
plot(vip_features, max_vars=10)

3.3 Relationships to Yield

  1. Explore the relationships between the top predictors and the response for the predictors that are unique to the optimal nonlinear regression model.

Do these plots reveal intuition about the biological or process predictors and their relationship with yield?

The plots do reveal distinct patterns with regard to Yield. For example, several processes display a negative relationship with yield (see below: MP24, MP38, MP42, and MP36), while for other processes (MP28, MP39, MP29, MP30, MP43) the relationship is obscured by the presence of zero values.

# combine the top-ranked variables with the response to make plots

top.vars<-vip_features%>%
  as.data.frame()%>%
  filter(!variable %in% c('_full_model_', '_baseline_'))%>% # drop model_parts pseudo-rows
  group_by(variable)%>%
  summarise(loss=mean(dropout_loss))%>%
  arrange(desc(loss))%>%
  head(10)%>%
  select(variable)

ten<-chem%>%
  select(c(Yield, ManufacturingProcess28,ManufacturingProcess24,ManufacturingProcess38,ManufacturingProcess42,ManufacturingProcess39,
           ManufacturingProcess29,ManufacturingProcess30,ManufacturingProcess36,ManufacturingProcess22,ManufacturingProcess43))%>%
  na.omit()
  
colnames(ten)<-gsub("ManufacturingProcess","M_Process",colnames(ten))

# Plot scatterplots

response = names(ten)[1] # response variable (Yield)
response = purrr::set_names(response)


explain <- names(ten)[2:11] #explanatory variables
explain = purrr::set_names(explain)

scatter = function(x) {

     ggplot(ten, aes(x = .data[[x]], y = Yield)) + # .data pronoun replaces the deprecated aes_string()
     geom_point(color='midnightblue', alpha = 0.4, show.legend = FALSE)+
     theme_classic()

}

plots<-map(explain, ~scatter(.x)) #creates a list of plots using purrr

# layout with patchwork

plot(reduce(plots,`+`))

Absent these zero values, Yield increases with MP28, MP39, and MP29; decreases with MP43; and shows a range of values at the single recorded level of MP30 (see below).

# re-plot after converting zero values to NA and dropping those rows

zero<-chem%>%
  mutate(across(everything(), ~na_if(., 0)))%>%
  na.omit()
  
colnames(zero)<-gsub("ManufacturingProcess","M_Process",colnames(zero))

# Plot scatterplots

response = names(zero)[1] # response variable (Yield)
response = purrr::set_names(response)


explain <- names(zero)[2:11] #explanatory variables
explain = purrr::set_names(explain)

scatter = function(x) {

     ggplot(zero, aes(x = .data[[x]], y = Yield)) + # .data pronoun replaces the deprecated aes_string()
     geom_point(color='midnightblue', alpha = 0.4, show.legend = FALSE)+
     theme_classic()

}

plots<-map(explain, ~scatter(.x)) #creates a list of plots using purrr

# layout with patchwork

plot(reduce(plots,`+`))