Since the response variable is continuous, we will use a series of regression models as candidates. We will train a random forest and XGboost because they have proven historically to be excellent at modeling continuous variables. Our EDA pointed us towards some potential clustering in the principle components, so we will train a radial SVM. Our EDA didn’t point us towards any terribly strong linear correlations, and the potential clustering from the EDA makes us want to throw in a KNN model just to be safe.
From the Missing Data section, we have decided to drop MFR and listwise delete observations for Filler Speed. Some preliminary modeling showed that Brand Code was very responsive to imputation as well. Since the dataset is fairly large and the remaining datapoints seem missing at random, we will drop the remaining observations with missing data.
Our EDA showed that a large number of the predictors were skewed, so we will center and scale the data for models that do not handle skew data or unscaled predictors well, like KNN or SVM. For the ensemble method, we will leave the data as is.
library(tidymodels)
## Warning: package 'tidymodels' was built under R version 4.1.3
## -- Attaching packages -------------------------------------- tidymodels 0.2.0 --
## v broom 0.8.0 v recipes 0.2.0
## v dials 0.1.1 v rsample 0.1.1
## v dplyr 1.0.8 v tibble 3.1.6
## v ggplot2 3.3.5 v tidyr 1.2.0
## v infer 1.0.0 v tune 0.2.0
## v modeldata 0.1.1 v workflows 0.2.6
## v parsnip 0.2.1 v workflowsets 0.2.1
## v purrr 0.3.4 v yardstick 0.0.9
## Warning: package 'broom' was built under R version 4.1.3
## Warning: package 'dials' was built under R version 4.1.3
## Warning: package 'dplyr' was built under R version 4.1.3
## Warning: package 'modeldata' was built under R version 4.1.3
## Warning: package 'parsnip' was built under R version 4.1.3
## Warning: package 'recipes' was built under R version 4.1.3
## Warning: package 'rsample' was built under R version 4.1.2
## Warning: package 'tibble' was built under R version 4.1.3
## Warning: package 'tidyr' was built under R version 4.1.3
## Warning: package 'tune' was built under R version 4.1.3
## Warning: package 'workflows' was built under R version 4.1.3
## Warning: package 'workflowsets' was built under R version 4.1.3
## Warning: package 'yardstick' was built under R version 4.1.2
## -- Conflicts ----------------------------------------- tidymodels_conflicts() --
## x purrr::discard() masks scales::discard()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x recipes::step() masks stats::step()
## * Use tidymodels_prefer() to resolve common conflicts.
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(readxl)
train_data<-as.data.frame(read_xlsx('C:\\Program Files\\GitHub\\predicting_ph\\data\\StudentData.xlsx'))
#removing MFR
train_data<-train_data%>%select(-MFR)%>%mutate(`Brand Code` = as.factor(`Brand Code`))
train_data<-train_data%>%clean_names()
test_data<-read_xlsx('C:\\Program Files\\GitHub\\predicting_ph\\data\\StudentEvaluation.xlsx')
test_data<-test_data%>%select(-MFR)
test_data<-test_data%>%clean_names()
ph_split<-initial_split(train_data%>%na.omit())
ph_train<-training(ph_split)
ph_test<-testing(ph_split)
# create base recipe with missing data handling
base_rec<-
recipe(ph~., data = ph_train)%>%
#imputing brand code
step_impute_bag(brand_code)%>%
step_unknown(brand_code)%>%
step_dummy(brand_code)#%>%
#omitting remaining missing data
#step_naomit(all_predictors())
temp<-juice(prep(base_rec))
# create centered and scaled data
cs_rec<-
base_rec%>%
step_center%>%
step_scale()
#create resamples
ph_folds<-vfold_cv(ph_train)
svm_r_spec<-
svm_rbf(cost = tune(), rbf_sigma = tune())%>%
set_engine('kernlab')%>%
set_mode('regression')
knn_spec<-
nearest_neighbor(neighbors = tune(), dist_power = tune(), weight_func = tune())%>%
set_engine('kknn')%>%
set_mode('regression')
boost_spec<-
boost_tree(mtry = tune(), min_n = tune(), trees = 1000)%>%
set_mode('regression')
rf_spec<-
rand_forest(mtry = tune(), min_n = tune(), trees = 1000)%>%
set_engine('ranger', importance = 'impurity')%>%
set_mode('regression')
cc<-
workflow_set(
preproc = list(center_scale = cs_rec),
models = list(SVM_radial = svm_r_spec, knn_spec)
)
no_pre_proc<-
workflow_set(
preproc = list(base = base_rec),
models = list(boost = boost_spec, rf = rf_spec)
)
all_workflows<-
bind_rows(no_pre_proc, cc)%>%
mutate(wflow_id = gsub('(center_scale_)|(base_)', '',wflow_id))
We will use a racing method to tune the models. This will help us limit the overall computational burden by removing candidate tuning parameters quickly, therefore reducing the total amount of tuned models.
library(finetune)
## Warning: package 'finetune' was built under R version 4.1.3
race_ctrl<-
control_race(
save_pred = TRUE,
parallel_over = 'everything',
save_workflow = TRUE
)
race_results_time<-
system.time(
race_results<-
all_workflows%>%
workflow_map(
'tune_race_anova',
seed = 123,
resamples = ph_folds,
control = race_ctrl,
verbose = TRUE
)
)
## i 1 of 4 tuning: boost
## i Creating pre-processing data to finalize unknown parameter: mtry
## Warning: package 'xgboost' was built under R version 4.1.3
## Warning: package 'rlang' was built under R version 4.1.3
## v 1 of 4 tuning: boost (3m 34.9s)
## i 2 of 4 tuning: rf
## i Creating pre-processing data to finalize unknown parameter: mtry
## v 2 of 4 tuning: rf (2m 56.5s)
## i 3 of 4 tuning: SVM_radial
## ! Fold06: preprocessor 1/1, model 1/10: Variable(s) `' constant. Cannot scale data.
## ! Fold06: preprocessor 1/1, model 2/10: Variable(s) `' constant. Cannot scale data.
## ! Fold06: preprocessor 1/1, model 3/10: Variable(s) `' constant. Cannot scale data.
## ! Fold06: preprocessor 1/1, model 4/10: Variable(s) `' constant. Cannot scale data.
## ! Fold06: preprocessor 1/1, model 5/10: Variable(s) `' constant. Cannot scale data.
## ! Fold06: preprocessor 1/1, model 6/10: Variable(s) `' constant. Cannot scale data.
## ! Fold06: preprocessor 1/1, model 7/10: Variable(s) `' constant. Cannot scale data.
## ! Fold06: preprocessor 1/1, model 8/10: Variable(s) `' constant. Cannot scale data.
## ! Fold06: preprocessor 1/1, model 9/10: Variable(s) `' constant. Cannot scale data.
## ! Fold06: preprocessor 1/1, model 10/10: Variable(s) `' constant. Cannot scale data.
## ! Fold01: preprocessor 1/1, model 1/10: Variable(s) `' constant. Cannot scale data.
## ! Fold01: preprocessor 1/1, model 2/10: Variable(s) `' constant. Cannot scale data.
## ! Fold01: preprocessor 1/1, model 3/10: Variable(s) `' constant. Cannot scale data.
## ! Fold01: preprocessor 1/1, model 4/10: Variable(s) `' constant. Cannot scale data.
## ! Fold01: preprocessor 1/1, model 5/10: Variable(s) `' constant. Cannot scale data.
## ! Fold01: preprocessor 1/1, model 6/10: Variable(s) `' constant. Cannot scale data.
## ! Fold01: preprocessor 1/1, model 7/10: Variable(s) `' constant. Cannot scale data.
## ! Fold01: preprocessor 1/1, model 8/10: Variable(s) `' constant. Cannot scale data.
## ! Fold01: preprocessor 1/1, model 9/10: Variable(s) `' constant. Cannot scale data.
## ! Fold01: preprocessor 1/1, model 10/10: Variable(s) `' constant. Cannot scale data.
## ! Fold03: preprocessor 1/1, model 1/10: Variable(s) `' constant. Cannot scale data.
## ! Fold03: preprocessor 1/1, model 2/10: Variable(s) `' constant. Cannot scale data.
## ! Fold03: preprocessor 1/1, model 3/10: Variable(s) `' constant. Cannot scale data.
## ! Fold03: preprocessor 1/1, model 4/10: Variable(s) `' constant. Cannot scale data.
## ! Fold03: preprocessor 1/1, model 5/10: Variable(s) `' constant. Cannot scale data.
## ! Fold03: preprocessor 1/1, model 6/10: Variable(s) `' constant. Cannot scale data.
## ! Fold03: preprocessor 1/1, model 7/10: Variable(s) `' constant. Cannot scale data.
## ! Fold03: preprocessor 1/1, model 8/10: Variable(s) `' constant. Cannot scale data.
## ! Fold03: preprocessor 1/1, model 9/10: Variable(s) `' constant. Cannot scale data.
## ! Fold03: preprocessor 1/1, model 10/10: Variable(s) `' constant. Cannot scale data.
## ! Fold10: preprocessor 1/1, model 1/2: Variable(s) `' constant. Cannot scale data.
## ! Fold10: preprocessor 1/1, model 2/2: Variable(s) `' constant. Cannot scale data.
## ! Fold07: preprocessor 1/1, model 1/2: Variable(s) `' constant. Cannot scale data.
## ! Fold07: preprocessor 1/1, model 2/2: Variable(s) `' constant. Cannot scale data.
## ! Fold09: preprocessor 1/1, model 1/2: Variable(s) `' constant. Cannot scale data.
## ! Fold09: preprocessor 1/1, model 2/2: Variable(s) `' constant. Cannot scale data.
## ! Fold02: preprocessor 1/1, model 1/2: Variable(s) `' constant. Cannot scale data.
## ! Fold02: preprocessor 1/1, model 2/2: Variable(s) `' constant. Cannot scale data.
## ! Fold04: preprocessor 1/1, model 1/2: Variable(s) `' constant. Cannot scale data.
## ! Fold04: preprocessor 1/1, model 2/2: Variable(s) `' constant. Cannot scale data.
## ! Fold08: preprocessor 1/1, model 1/2: Variable(s) `' constant. Cannot scale data.
## ! Fold08: preprocessor 1/1, model 2/2: Variable(s) `' constant. Cannot scale data.
## ! Fold05: preprocessor 1/1, model 1/2: Variable(s) `' constant. Cannot scale data.
## ! Fold05: preprocessor 1/1, model 2/2: Variable(s) `' constant. Cannot scale data.
## v 3 of 4 tuning: SVM_radial (47.2s)
## i 4 of 4 tuning: nearest_neighbor
## Warning: package 'kknn' was built under R version 4.1.3
## v 4 of 4 tuning: nearest_neighbor (2m 57.2s)
race_results
## # A workflow set/tibble: 4 x 4
## wflow_id info option result
## <chr> <list> <list> <list>
## 1 boost <tibble [1 x 4]> <opts[2]> <race[+]>
## 2 rf <tibble [1 x 4]> <opts[2]> <race[+]>
## 3 SVM_radial <tibble [1 x 4]> <opts[2]> <race[+]>
## 4 nearest_neighbor <tibble [1 x 4]> <opts[2]> <race[+]>
autoplot(race_results)
collect_metrics(race_results)
## # A tibble: 80 x 9
## wflow_id .config preproc model .metric .estimator mean n std_err
## <chr> <chr> <chr> <chr> <chr> <chr> <dbl> <int> <dbl>
## 1 boost Preprocessor1_~ recipe boos~ rmse standard 0.102 10 0.00264
## 2 boost Preprocessor1_~ recipe boos~ rsq standard 0.645 10 0.0180
## 3 boost Preprocessor1_~ recipe boos~ rmse standard 0.104 10 0.00223
## 4 boost Preprocessor1_~ recipe boos~ rsq standard 0.638 10 0.0173
## 5 boost Preprocessor1_~ recipe boos~ rmse standard 0.103 10 0.00225
## 6 boost Preprocessor1_~ recipe boos~ rsq standard 0.639 10 0.0168
## 7 boost Preprocessor1_~ recipe boos~ rmse standard 0.105 5 0.00527
## 8 boost Preprocessor1_~ recipe boos~ rsq standard 0.644 5 0.0328
## 9 boost Preprocessor1_~ recipe boos~ rmse standard 0.102 3 0.00452
## 10 boost Preprocessor1_~ recipe boos~ rsq standard 0.649 3 0.00361
## # ... with 70 more rows
best_results<-
race_results%>%
extract_workflow_set_result('nearest_neighbor')%>%
select_best(metric = 'rmse')
best_results
## # A tibble: 1 x 4
## neighbors weight_func dist_power .config
## <int> <chr> <dbl> <chr>
## 1 8 epanechnikov 0.468 Preprocessor1_Model03
knn_test_results<-
race_results%>%
extract_workflow('nearest_neighbor')%>%
finalize_workflow(best_results)%>%
last_fit(split = ph_split)
collect_metrics(knn_test_results)
## # A tibble: 2 x 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 rmse standard 0.0983 Preprocessor1_Model1
## 2 rsq standard 0.674 Preprocessor1_Model1
knn_test_results%>%
collect_predictions()%>%
ggplot(aes(x = ph, y = .pred))+
geom_abline(color = 'gray50', lty = 2)+
geom_point(alpha = .5)+
coord_obs_pred()+
labs(x = 'observed', y = 'predicted')