Modeling

Methodology

Since the response variable is continuous, we will use a series of regression models as candidates. We will train a random forest and XGboost because they have proven historically to be excellent at modeling continuous variables. Our EDA pointed us towards some potential clustering in the principle components, so we will train a radial SVM. Our EDA didn’t point us towards any terribly strong linear correlations, and the potential clustering from the EDA makes us want to throw in a KNN model just to be safe.

Data Preparation

From the Missing Data section, we have decided to drop MFR and listwise delete observations for Filler Speed. Some preliminary modeling showed that Brand Code was very responsive to imputation as well. Since the dataset is fairly large and the remaining datapoints seem missing at random, we will drop the remaining observations with missing data.

Our EDA showed that a large number of the predictors were skewed, so we will center and scale the data for models that do not handle skew data or unscaled predictors well, like KNN or SVM. For the ensemble method, we will leave the data as is.

Partitioning the Data

library(tidymodels)

## Warning: package 'tidymodels' was built under R version 4.1.3

## -- Attaching packages -------------------------------------- tidymodels 0.2.0 --

## v broom        0.8.0     v recipes      0.2.0
## v dials        0.1.1     v rsample      0.1.1
## v dplyr        1.0.8     v tibble       3.1.6
## v ggplot2      3.3.5     v tidyr        1.2.0
## v infer        1.0.0     v tune         0.2.0
## v modeldata    0.1.1     v workflows    0.2.6
## v parsnip      0.2.1     v workflowsets 0.2.1
## v purrr        0.3.4     v yardstick    0.0.9

## Warning: package 'broom' was built under R version 4.1.3

## Warning: package 'dials' was built under R version 4.1.3

## Warning: package 'dplyr' was built under R version 4.1.3

## Warning: package 'modeldata' was built under R version 4.1.3

## Warning: package 'parsnip' was built under R version 4.1.3

## Warning: package 'recipes' was built under R version 4.1.3

## Warning: package 'rsample' was built under R version 4.1.2

## Warning: package 'tibble' was built under R version 4.1.3

## Warning: package 'tidyr' was built under R version 4.1.3

## Warning: package 'tune' was built under R version 4.1.3

## Warning: package 'workflows' was built under R version 4.1.3

## Warning: package 'workflowsets' was built under R version 4.1.3

## Warning: package 'yardstick' was built under R version 4.1.2

## -- Conflicts ----------------------------------------- tidymodels_conflicts() --
## x purrr::discard() masks scales::discard()
## x dplyr::filter()  masks stats::filter()
## x dplyr::lag()     masks stats::lag()
## x recipes::step()  masks stats::step()
## * Use tidymodels_prefer() to resolve common conflicts.

library(janitor)

## 
## Attaching package: 'janitor'

## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test

library(readxl)
train_data<-as.data.frame(read_xlsx('C:\\Program Files\\GitHub\\predicting_ph\\data\\StudentData.xlsx'))
#removing MFR
train_data<-train_data%>%select(-MFR)%>%mutate(`Brand Code` = as.factor(`Brand Code`))
train_data<-train_data%>%clean_names()
test_data<-read_xlsx('C:\\Program Files\\GitHub\\predicting_ph\\data\\StudentEvaluation.xlsx')
test_data<-test_data%>%select(-MFR)
test_data<-test_data%>%clean_names()
ph_split<-initial_split(train_data%>%na.omit())
ph_train<-training(ph_split)
ph_test<-testing(ph_split)

Data Preprocessing

# create base recipe with missing data handling
base_rec<-
  recipe(ph~., data = ph_train)%>%
  #imputing brand code
  
  step_impute_bag(brand_code)%>%
  step_unknown(brand_code)%>%
  step_dummy(brand_code)#%>%
  #omitting remaining missing data
  #step_naomit(all_predictors())
temp<-juice(prep(base_rec))
# create centered and scaled data
cs_rec<-
  base_rec%>%
  step_center%>%
  step_scale()

#create resamples
ph_folds<-vfold_cv(ph_train)

Create Models for Tuning

svm_r_spec<-
svm_rbf(cost = tune(), rbf_sigma = tune())%>%
  set_engine('kernlab')%>%
  set_mode('regression')
knn_spec<-
  nearest_neighbor(neighbors = tune(), dist_power = tune(), weight_func = tune())%>%
  set_engine('kknn')%>%
  set_mode('regression')
boost_spec<-
  boost_tree(mtry = tune(), min_n = tune(), trees = 1000)%>%
  set_mode('regression')
rf_spec<-
  rand_forest(mtry = tune(), min_n = tune(), trees = 1000)%>%
  set_engine('ranger', importance = 'impurity')%>%
  set_mode('regression')

Create the workflow set

cc<-
  workflow_set(
    preproc = list(center_scale = cs_rec),
    models  = list(SVM_radial = svm_r_spec, knn_spec)
  )
no_pre_proc<-
  workflow_set(
    preproc = list(base = base_rec),
    models = list(boost = boost_spec, rf = rf_spec)
  )
all_workflows<-
  bind_rows(no_pre_proc, cc)%>%
  mutate(wflow_id = gsub('(center_scale_)|(base_)', '',wflow_id))

Tuning and Evaluating

We will use a racing method to tune the models. This will help us limit the overall computational burden by removing candidate tuning parameters quickly, therefore reducing the total amount of tuned models.

library(finetune)

## Warning: package 'finetune' was built under R version 4.1.3

race_ctrl<-
  control_race(
    save_pred = TRUE, 
    parallel_over = 'everything',
    save_workflow = TRUE
  )
race_results_time<-
  system.time(
    race_results<-
      all_workflows%>%
      workflow_map(
        'tune_race_anova',
        seed = 123,
        resamples = ph_folds,
        control = race_ctrl,
        verbose = TRUE
      )
  )

## i 1 of 4 tuning:     boost

## i Creating pre-processing data to finalize unknown parameter: mtry

## Warning: package 'xgboost' was built under R version 4.1.3

## Warning: package 'rlang' was built under R version 4.1.3

## v 1 of 4 tuning:     boost (3m 34.9s)

## i 2 of 4 tuning:     rf

## i Creating pre-processing data to finalize unknown parameter: mtry

## v 2 of 4 tuning:     rf (2m 56.5s)

## i 3 of 4 tuning:     SVM_radial

## ! Fold06: preprocessor 1/1, model 1/10: Variable(s) `' constant. Cannot scale data.

## ! Fold06: preprocessor 1/1, model 2/10: Variable(s) `' constant. Cannot scale data.

## ! Fold06: preprocessor 1/1, model 3/10: Variable(s) `' constant. Cannot scale data.

## ! Fold06: preprocessor 1/1, model 4/10: Variable(s) `' constant. Cannot scale data.

## ! Fold06: preprocessor 1/1, model 5/10: Variable(s) `' constant. Cannot scale data.

## ! Fold06: preprocessor 1/1, model 6/10: Variable(s) `' constant. Cannot scale data.

## ! Fold06: preprocessor 1/1, model 7/10: Variable(s) `' constant. Cannot scale data.

## ! Fold06: preprocessor 1/1, model 8/10: Variable(s) `' constant. Cannot scale data.

## ! Fold06: preprocessor 1/1, model 9/10: Variable(s) `' constant. Cannot scale data.

## ! Fold06: preprocessor 1/1, model 10/10: Variable(s) `' constant. Cannot scale data.

## ! Fold01: preprocessor 1/1, model 1/10: Variable(s) `' constant. Cannot scale data.

## ! Fold01: preprocessor 1/1, model 2/10: Variable(s) `' constant. Cannot scale data.

## ! Fold01: preprocessor 1/1, model 3/10: Variable(s) `' constant. Cannot scale data.

## ! Fold01: preprocessor 1/1, model 4/10: Variable(s) `' constant. Cannot scale data.

## ! Fold01: preprocessor 1/1, model 5/10: Variable(s) `' constant. Cannot scale data.

## ! Fold01: preprocessor 1/1, model 6/10: Variable(s) `' constant. Cannot scale data.

## ! Fold01: preprocessor 1/1, model 7/10: Variable(s) `' constant. Cannot scale data.

## ! Fold01: preprocessor 1/1, model 8/10: Variable(s) `' constant. Cannot scale data.

## ! Fold01: preprocessor 1/1, model 9/10: Variable(s) `' constant. Cannot scale data.

## ! Fold01: preprocessor 1/1, model 10/10: Variable(s) `' constant. Cannot scale data.

## ! Fold03: preprocessor 1/1, model 1/10: Variable(s) `' constant. Cannot scale data.

## ! Fold03: preprocessor 1/1, model 2/10: Variable(s) `' constant. Cannot scale data.

## ! Fold03: preprocessor 1/1, model 3/10: Variable(s) `' constant. Cannot scale data.

## ! Fold03: preprocessor 1/1, model 4/10: Variable(s) `' constant. Cannot scale data.

## ! Fold03: preprocessor 1/1, model 5/10: Variable(s) `' constant. Cannot scale data.

## ! Fold03: preprocessor 1/1, model 6/10: Variable(s) `' constant. Cannot scale data.

## ! Fold03: preprocessor 1/1, model 7/10: Variable(s) `' constant. Cannot scale data.

## ! Fold03: preprocessor 1/1, model 8/10: Variable(s) `' constant. Cannot scale data.

## ! Fold03: preprocessor 1/1, model 9/10: Variable(s) `' constant. Cannot scale data.

## ! Fold03: preprocessor 1/1, model 10/10: Variable(s) `' constant. Cannot scale data.

## ! Fold10: preprocessor 1/1, model 1/2: Variable(s) `' constant. Cannot scale data.

## ! Fold10: preprocessor 1/1, model 2/2: Variable(s) `' constant. Cannot scale data.

## ! Fold07: preprocessor 1/1, model 1/2: Variable(s) `' constant. Cannot scale data.

## ! Fold07: preprocessor 1/1, model 2/2: Variable(s) `' constant. Cannot scale data.

## ! Fold09: preprocessor 1/1, model 1/2: Variable(s) `' constant. Cannot scale data.

## ! Fold09: preprocessor 1/1, model 2/2: Variable(s) `' constant. Cannot scale data.

## ! Fold02: preprocessor 1/1, model 1/2: Variable(s) `' constant. Cannot scale data.

## ! Fold02: preprocessor 1/1, model 2/2: Variable(s) `' constant. Cannot scale data.

## ! Fold04: preprocessor 1/1, model 1/2: Variable(s) `' constant. Cannot scale data.

## ! Fold04: preprocessor 1/1, model 2/2: Variable(s) `' constant. Cannot scale data.

## ! Fold08: preprocessor 1/1, model 1/2: Variable(s) `' constant. Cannot scale data.

## ! Fold08: preprocessor 1/1, model 2/2: Variable(s) `' constant. Cannot scale data.

## ! Fold05: preprocessor 1/1, model 1/2: Variable(s) `' constant. Cannot scale data.

## ! Fold05: preprocessor 1/1, model 2/2: Variable(s) `' constant. Cannot scale data.

## v 3 of 4 tuning:     SVM_radial (47.2s)

## i 4 of 4 tuning:     nearest_neighbor

## Warning: package 'kknn' was built under R version 4.1.3

## v 4 of 4 tuning:     nearest_neighbor (2m 57.2s)

race_results

## # A workflow set/tibble: 4 x 4
##   wflow_id         info             option    result   
##   <chr>            <list>           <list>    <list>   
## 1 boost            <tibble [1 x 4]> <opts[2]> <race[+]>
## 2 rf               <tibble [1 x 4]> <opts[2]> <race[+]>
## 3 SVM_radial       <tibble [1 x 4]> <opts[2]> <race[+]>
## 4 nearest_neighbor <tibble [1 x 4]> <opts[2]> <race[+]>

autoplot(race_results)

collect_metrics(race_results)

## # A tibble: 80 x 9
##    wflow_id .config         preproc model .metric .estimator  mean     n std_err
##    <chr>    <chr>           <chr>   <chr> <chr>   <chr>      <dbl> <int>   <dbl>
##  1 boost    Preprocessor1_~ recipe  boos~ rmse    standard   0.102    10 0.00264
##  2 boost    Preprocessor1_~ recipe  boos~ rsq     standard   0.645    10 0.0180 
##  3 boost    Preprocessor1_~ recipe  boos~ rmse    standard   0.104    10 0.00223
##  4 boost    Preprocessor1_~ recipe  boos~ rsq     standard   0.638    10 0.0173 
##  5 boost    Preprocessor1_~ recipe  boos~ rmse    standard   0.103    10 0.00225
##  6 boost    Preprocessor1_~ recipe  boos~ rsq     standard   0.639    10 0.0168 
##  7 boost    Preprocessor1_~ recipe  boos~ rmse    standard   0.105     5 0.00527
##  8 boost    Preprocessor1_~ recipe  boos~ rsq     standard   0.644     5 0.0328 
##  9 boost    Preprocessor1_~ recipe  boos~ rmse    standard   0.102     3 0.00452
## 10 boost    Preprocessor1_~ recipe  boos~ rsq     standard   0.649     3 0.00361
## # ... with 70 more rows

best_results<-
  race_results%>%
  extract_workflow_set_result('nearest_neighbor')%>%
  select_best(metric = 'rmse')
best_results

## # A tibble: 1 x 4
##   neighbors weight_func  dist_power .config              
##       <int> <chr>             <dbl> <chr>                
## 1         8 epanechnikov      0.468 Preprocessor1_Model03

knn_test_results<-
  race_results%>%
  extract_workflow('nearest_neighbor')%>%
  finalize_workflow(best_results)%>%
  last_fit(split = ph_split)
collect_metrics(knn_test_results)

## # A tibble: 2 x 4
##   .metric .estimator .estimate .config             
##   <chr>   <chr>          <dbl> <chr>               
## 1 rmse    standard      0.0983 Preprocessor1_Model1
## 2 rsq     standard      0.674  Preprocessor1_Model1

knn_test_results%>%
  collect_predictions()%>%
  ggplot(aes(x = ph, y = .pred))+
  geom_abline(color = 'gray50', lty = 2)+
  geom_point(alpha = .5)+
  coord_obs_pred()+
  labs(x = 'observed', y = 'predicted')