Load Packages

library(tidytext)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.8
## v tidyr   1.2.0     v stringr 1.4.0
## v readr   2.1.2     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(readxl)
library(openxlsx)
library(finetune)
## Loading required package: tune
library(visdat)
library(naniar)
library(tidymodels)
## -- Attaching packages -------------------------------------- tidymodels 0.2.0 --
## v broom        0.7.12     v recipes      0.2.0 
## v dials        0.1.1      v rsample      0.1.1 
## v infer        1.0.0      v workflows    0.2.6 
## v modeldata    0.1.1      v workflowsets 0.2.1 
## v parsnip      0.2.1      v yardstick    0.0.9
## -- Conflicts ----------------------------------------- tidymodels_conflicts() --
## x scales::discard() masks purrr::discard()
## x dplyr::filter()   masks stats::filter()
## x recipes::fixed()  masks stringr::fixed()
## x dplyr::lag()      masks stats::lag()
## x yardstick::spec() masks readr::spec()
## x recipes::step()   masks stats::step()
## * Learn how to get started at https://www.tidymodels.org/start/
library(janitor)
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:scales':
## 
##     alpha, rescale
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:purrr':
## 
##     some
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following objects are masked from 'package:yardstick':
## 
##     precision, recall, sensitivity, specificity
## The following object is masked from 'package:purrr':
## 
##     lift

Load Files

train_data<-openxlsx::read.xlsx('https://github.com/JackJosephWright/predicting_ph/blob/main/data/StudentData.xlsx?raw=true', sep.names=' ')

test_data<-openxlsx::read.xlsx('https://github.com/JackJosephWright/predicting_ph/blob/main/data/StudentEvaluation.xlsx?raw=true', sep.names=' ')

train_data%>%head()
##   Brand Code Carb Volume Fill Ounces PC Volume Carb Pressure Carb Temp   PSC
## 1          B    5.340000    23.96667 0.2633333          68.2     141.2 0.104
## 2          A    5.426667    24.00667 0.2386667          68.4     139.6 0.124
## 3          B    5.286667    24.06000 0.2633333          70.8     144.8 0.090
## 4          A    5.440000    24.00667 0.2933333          63.0     132.6    NA
## 5          A    5.486667    24.31333 0.1113333          67.2     136.8 0.026
## 6          A    5.380000    23.92667 0.2693333          66.6     138.4 0.090
##   PSC Fill PSC CO2 Mnf Flow Carb Pressure1 Fill Pressure Hyd Pressure1
## 1     0.26    0.04     -100          118.8          46.0             0
## 2     0.22    0.04     -100          121.6          46.0             0
## 3     0.34    0.16     -100          120.2          46.0             0
## 4     0.42    0.04     -100          115.2          46.4             0
## 5     0.16    0.12     -100          118.4          45.8             0
## 6     0.24    0.04     -100          119.6          45.6             0
##   Hyd Pressure2 Hyd Pressure3 Hyd Pressure4 Filler Level Filler Speed
## 1            NA            NA           118        121.2         4002
## 2            NA            NA           106        118.6         3986
## 3            NA            NA            82        120.0         4020
## 4             0             0            92        117.8         4012
## 5             0             0            92        118.6         4010
## 6             0             0           116        120.2         4014
##   Temperature Usage cont Carb Flow Density   MFR Balling Pressure Vacuum   PH
## 1        66.0      16.18      2932    0.88 725.0   1.398            -4.0 8.36
## 2        67.6      19.90      3144    0.92 726.8   1.498            -4.0 8.26
## 3        67.0      17.76      2914    1.58 735.0   3.142            -3.8 8.94
## 4        65.6      17.42      3062    1.54 730.6   3.042            -4.4 8.24
## 5        65.6      17.68      3054    1.54 722.8   3.042            -4.4 8.26
## 6        66.2      23.82      2948    1.52 738.8   2.992            -4.4 8.32
##   Oxygen Filler Bowl Setpoint Pressure Setpoint Air Pressurer Alch Rel Carb Rel
## 1         0.022           120              46.4         142.6     6.58     5.32
## 2         0.026           120              46.8         143.0     6.56     5.30
## 3         0.024           120              46.6         142.0     7.66     5.84
## 4         0.030           120              46.0         146.2     7.14     5.42
## 5         0.030           120              46.0         146.2     7.14     5.44
## 6         0.024           120              46.0         146.6     7.16     5.44
##   Balling Lvl
## 1        1.48
## 2        1.56
## 3        3.28
## 4        3.04
## 5        3.04
## 6        3.02

Missing Data

vis_dat(train_data)

The missing data does not look random. Let's take a closer look at how it is distributed.

gg_miss_var(train_data)

MFR and Brand Code contain most of the missing data.

Little’s Missing Completely at Random (MCAR) test will give us some insight into whether there are patterns in the missing data.

mcar_test(train_data)
## Warning in norm::prelim.norm(data): NAs introduced by coercion to integer range
## # A tibble: 1 x 4
##   statistic    df p.value missing.patterns
##       <dbl> <dbl>   <dbl>            <int>
## 1     7507.  3451       0              115

With a high test statistic and a near-zero p-value, we reject the hypothesis that the data are missing completely at random: there is structure to the missing data.

Remove the predictors with the most missing data and retest for MCAR.

mcar_test(train_data%>%select(-c(MFR,`Brand Code`,`Filler Speed`)))
## # A tibble: 1 x 4
##   statistic    df p.value missing.patterns
##       <dbl> <dbl>   <dbl>            <int>
## 1     6377.  2012       0               74

Even with those predictors removed, there is still some structure to the missing data.

MFR Missing Data

cor(train_data%>%select(-`Brand Code`)%>%na.omit())%>%
  as.data.frame()%>%
  arrange(desc(MFR))%>%
  select(MFR)%>%
  head()
##                        MFR
## MFR             1.00000000
## Filler Speed    0.95142239
## Filler Level    0.06405259
## Balling         0.05382864
## Pressure Vacuum 0.05159662
## Hyd Pressure2   0.04521297

MFR, our predictor with the most missing values, has a 0.95 correlation with Filler Speed.

train_data%>%
  mutate(both_miss = is.na(MFR) & is.na(`Filler Speed`))%>%
  summarize("both missing" = sum(both_miss), "Filler Speed missing" = sum(is.na(`Filler Speed`)))
##   both missing Filler Speed missing
## 1           54                   57

In 54 of the 57 cases (about 95%) where Filler Speed is missing, MFR is also missing. The best option is to drop MFR and listwise delete the rows where Filler Speed is missing, as sketched below.
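
A quick sketch of that plan (illustrative only; the actual cleanup is applied later, in the Data Preparation section, and train_data_clean is just a placeholder name):

#drop MFR entirely, then listwise delete rows where Filler Speed is missing
train_data_clean<-train_data%>%
  select(-MFR)%>%
  filter(!is.na(`Filler Speed`))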

Missing Data in Brand Code

cat_impute_df<-train_data%>%na.omit()%>%select(-c(MFR, PH))%>%mutate(`Brand Code` = as.factor(`Brand Code`))

code_count<-
  cat_impute_df%>%
  count(`Brand Code`)

code_count
##   Brand Code    n
## 1          A  224
## 2          B 1050
## 3          C  252
## 4          D  512
cat_impute_df<-clean_names(cat_impute_df)
brand_split<-initial_split(cat_impute_df)
brand_train<-training(brand_split)
brand_test<-testing(brand_split)

Note that there is a class imbalance in the Brand Codes, so it would be wise to stratify the split so each brand is represented proportionally, as sketched below.
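
As a sketch, only the strata argument changes (the results below were produced with the unstratified split above):

#stratified version of the split on brand_code
brand_split_strat<-initial_split(cat_impute_df, strata = brand_code)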

tree_spec<-decision_tree()%>%
  set_engine('rpart')%>%
  set_mode('classification')
tune_spec<-
  decision_tree(
    cost_complexity = tune(),
    tree_depth = tune()
  )%>%
  set_engine("rpart") %>% 
  set_mode("classification")
tree_grid<- grid_regular(cost_complexity(),
                         tree_depth(),
                         levels = 5)
brand_folds<-vfold_cv(brand_train , v = 10)
brand_folds
## #  10-fold cross-validation 
## # A tibble: 10 x 2
##    splits             id    
##    <list>             <chr> 
##  1 <split [1375/153]> Fold01
##  2 <split [1375/153]> Fold02
##  3 <split [1375/153]> Fold03
##  4 <split [1375/153]> Fold04
##  5 <split [1375/153]> Fold05
##  6 <split [1375/153]> Fold06
##  7 <split [1375/153]> Fold07
##  8 <split [1375/153]> Fold08
##  9 <split [1376/152]> Fold09
## 10 <split [1376/152]> Fold10
set.seed(123)
tree_wf<-workflow()%>%
  add_model(tune_spec)%>%
  add_formula(brand_code~.)
tree_res<-
  tree_wf%>%
  tune_grid(
    resamples = brand_folds,
    grid = tree_grid
  )
best_tree<-tree_res%>%
  select_best(metric = 'accuracy')

final_wf<-
  tree_wf%>%
  finalize_workflow(best_tree)
final_fit<-
  final_wf%>%
  last_fit(brand_split)
final_fit%>%
  collect_metrics()
## # A tibble: 2 x 4
##   .metric  .estimator .estimate .config             
##   <chr>    <chr>          <dbl> <chr>               
## 1 accuracy multiclass     0.941 Preprocessor1_Model1
## 2 roc_auc  hand_till      0.967 Preprocessor1_Model1

Brand Code can be predicted with high accuracy (about 94%) from the other predictors. It would be advisable to impute the missing Brand Code values before modeling.

Remaining Missing Data

Dropping MFR and imputing Brand Code accounts for 90% of the missing data; the most conservative approach would be to listwise delete the rest. However, we did note that there is missing data in the test set. To minimize leakage between the sets, imputation values should be learned from the training set and then applied to the test set, as sketched below.
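
A leakage-safe sketch of that imputation with recipes (illustrative only; the modeling recipes below use bagged imputation instead of means):

#learn the imputation values on the training set, then apply them to the test set
impute_rec<-recipe(PH~., data = train_data)%>%
  step_impute_mean(all_numeric_predictors())
test_imputed<-bake(prep(impute_rec, training = train_data), new_data = test_data)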

Data Structure

glimpse(train_data)
## Rows: 2,571
## Columns: 33
## $ `Brand Code`        <chr> "B", "A", "B", "A", "A", "A", "A", "B", "B", "B", ~
## $ `Carb Volume`       <dbl> 5.340000, 5.426667, 5.286667, 5.440000, 5.486667, ~
## $ `Fill Ounces`       <dbl> 23.96667, 24.00667, 24.06000, 24.00667, 24.31333, ~
## $ `PC Volume`         <dbl> 0.2633333, 0.2386667, 0.2633333, 0.2933333, 0.1113~
## $ `Carb Pressure`     <dbl> 68.2, 68.4, 70.8, 63.0, 67.2, 66.6, 64.2, 67.6, 64~
## $ `Carb Temp`         <dbl> 141.2, 139.6, 144.8, 132.6, 136.8, 138.4, 136.8, 1~
## $ PSC                 <dbl> 0.104, 0.124, 0.090, NA, 0.026, 0.090, 0.128, 0.15~
## $ `PSC Fill`          <dbl> 0.26, 0.22, 0.34, 0.42, 0.16, 0.24, 0.40, 0.34, 0.~
## $ `PSC CO2`           <dbl> 0.04, 0.04, 0.16, 0.04, 0.12, 0.04, 0.04, 0.04, 0.~
## $ `Mnf Flow`          <dbl> -100, -100, -100, -100, -100, -100, -100, -100, -1~
## $ `Carb Pressure1`    <dbl> 118.8, 121.6, 120.2, 115.2, 118.4, 119.6, 122.2, 1~
## $ `Fill Pressure`     <dbl> 46.0, 46.0, 46.0, 46.4, 45.8, 45.6, 51.8, 46.8, 46~
## $ `Hyd Pressure1`     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ `Hyd Pressure2`     <dbl> NA, NA, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ `Hyd Pressure3`     <dbl> NA, NA, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ `Hyd Pressure4`     <dbl> 118, 106, 82, 92, 92, 116, 124, 132, 90, 108, 94, ~
## $ `Filler Level`      <dbl> 121.2, 118.6, 120.0, 117.8, 118.6, 120.2, 123.4, 1~
## $ `Filler Speed`      <dbl> 4002, 3986, 4020, 4012, 4010, 4014, NA, 1004, 4014~
## $ Temperature         <dbl> 66.0, 67.6, 67.0, 65.6, 65.6, 66.2, 65.8, 65.2, 65~
## $ `Usage cont`        <dbl> 16.18, 19.90, 17.76, 17.42, 17.68, 23.82, 20.74, 1~
## $ `Carb Flow`         <dbl> 2932, 3144, 2914, 3062, 3054, 2948, 30, 684, 2902,~
## $ Density             <dbl> 0.88, 0.92, 1.58, 1.54, 1.54, 1.52, 0.84, 0.84, 0.~
## $ MFR                 <dbl> 725.0, 726.8, 735.0, 730.6, 722.8, 738.8, NA, NA, ~
## $ Balling             <dbl> 1.398, 1.498, 3.142, 3.042, 3.042, 2.992, 1.298, 1~
## $ `Pressure Vacuum`   <dbl> -4.0, -4.0, -3.8, -4.4, -4.4, -4.4, -4.4, -4.4, -4~
## $ PH                  <dbl> 8.36, 8.26, 8.94, 8.24, 8.26, 8.32, 8.40, 8.38, 8.~
## $ `Oxygen Filler`     <dbl> 0.022, 0.026, 0.024, 0.030, 0.030, 0.024, 0.066, 0~
## $ `Bowl Setpoint`     <dbl> 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, ~
## $ `Pressure Setpoint` <dbl> 46.4, 46.8, 46.6, 46.0, 46.0, 46.0, 46.0, 46.0, 46~
## $ `Air Pressurer`     <dbl> 142.6, 143.0, 142.0, 146.2, 146.2, 146.6, 146.2, 1~
## $ `Alch Rel`          <dbl> 6.58, 6.56, 7.66, 7.14, 7.14, 7.16, 6.54, 6.52, 6.~
## $ `Carb Rel`          <dbl> 5.32, 5.30, 5.84, 5.42, 5.44, 5.44, 5.38, 5.34, 5.~
## $ `Balling Lvl`       <dbl> 1.48, 1.56, 3.28, 3.04, 3.04, 3.02, 1.44, 1.44, 1.~

There are 2,571 observations of 32 predictors plus the response (PH). All variables are continuous except Brand Code, which is unordered categorical.

Exploring the Response

train_data<-train_data%>%na.omit()
train_data%>%summarize(mean = mean(PH), "standard deviation" = sd(PH))
##       mean standard deviation
## 1 8.552669          0.1707606
ggplot(train_data, aes(x = PH))+geom_histogram()+ggtitle('Distribution of Response')
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Exploring the Predictors

description<-psych::describe(train_data)

Normality

description%>%
  filter((abs(skew)>=.5))%>%nrow()
## [1] 21
skew_list<-as.list(description%>%
  filter((abs(skew)>=.5))%>%rownames())

21 of the 32 numeric variables are skewed (|skew| >= 0.5).

The skewed predictors are collected in the list skew_list.

ggplot(description, aes(x=range))+geom_histogram()+ggtitle('Ranges of Predictors')
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Most of the predictors have a range near 1, but a minority are much larger. Depending on the model, the data will need to be centered and scaled, as sketched below.
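
With recipes, centering and scaling can be expressed in a single step (a sketch of what the center/scale recipe used later does; norm_rec is just an illustrative name):

#normalize = center and scale every numeric predictor
norm_rec<-recipe(PH~., data = train_data)%>%
  step_normalize(all_numeric_predictors())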

Highly Correlated Predictor Pairs

correlated_predictors<-findCorrelation(cor(train_data%>%select(-'Brand Code')), names = TRUE)
correlated_predictors
## [1] "Balling"       "Hyd Pressure3" "Alch Rel"      "Balling Lvl"  
## [5] "Bowl Setpoint" "Filler Speed"

There are six highly correlated predictors; they are listed in correlated_predictors.
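
Within a recipes pipeline the same filter is available as step_corr(), which drops enough predictors to keep all pairwise correlations below a threshold (a sketch; 0.9 matches the default cutoff of caret::findCorrelation()):

#recipes equivalent of caret::findCorrelation
corr_rec<-recipe(PH~., data = train_data)%>%
  step_corr(all_numeric_predictors(), threshold = 0.9)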

PCA

pca_rec<-recipe(~., train_data%>%select(-'Brand Code'))%>%
  update_role(PH, new_role = 'id')%>%
  step_normalize(all_predictors())%>%
  step_pca(all_predictors())
pca_prep<-prep(pca_rec)
tidied_pca<-tidy(pca_prep, 2)

tidied_pca%>%
  filter(component %in% paste0('PC', 1:5))%>%
  mutate(component = fct_inorder(component))%>%
  ggplot(aes(value,terms,fill = terms))+
  geom_col(show.legend = FALSE)+
  facet_wrap(~component,nrow=1)+
  labs(y = NULL)

summary(prcomp(train_data%>%select(-`Brand Code`), scale = TRUE))
## Importance of components:
##                           PC1    PC2     PC3     PC4     PC5     PC6     PC7
## Standard deviation     2.5555 2.4508 1.48330 1.47574 1.31191 1.24571 1.15438
## Proportion of Variance 0.2041 0.1877 0.06876 0.06806 0.05378 0.04849 0.04164
## Cumulative Proportion  0.2041 0.3918 0.46053 0.52859 0.58237 0.63087 0.67251
##                            PC8     PC9    PC10    PC11    PC12    PC13    PC14
## Standard deviation     1.05272 1.01942 0.99613 0.93879 0.90755 0.88793 0.85451
## Proportion of Variance 0.03463 0.03248 0.03101 0.02754 0.02574 0.02464 0.02282
## Cumulative Proportion  0.70714 0.73962 0.77063 0.79817 0.82391 0.84855 0.87136
##                           PC15   PC16   PC17    PC18    PC19    PC20    PC21
## Standard deviation     0.81951 0.7652 0.7066 0.67666 0.66979 0.58281 0.52245
## Proportion of Variance 0.02099 0.0183 0.0156 0.01431 0.01402 0.01061 0.00853
## Cumulative Proportion  0.89235 0.9106 0.9263 0.94056 0.95458 0.96519 0.97372
##                           PC22    PC23    PC24   PC25   PC26    PC27    PC28
## Standard deviation     0.43970 0.40154 0.39113 0.3297 0.2655 0.22208 0.19871
## Proportion of Variance 0.00604 0.00504 0.00478 0.0034 0.0022 0.00154 0.00123
## Cumulative Proportion  0.97976 0.98480 0.98958 0.9930 0.9952 0.99672 0.99796
##                           PC29    PC30    PC31    PC32
## Standard deviation     0.17726 0.14081 0.10490 0.05591
## Proportion of Variance 0.00098 0.00062 0.00034 0.00010
## Cumulative Proportion  0.99894 0.99956 0.99990 1.00000

Looking at the percent of variance explained, we want to limit the total number of principal components used while maintaining a high amount of variance explained, so we will set the threshold for modeling to 90%.
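
Reading the cumulative proportions above, that threshold is first crossed at the 16th component; programmatically:

pve<-summary(prcomp(train_data%>%select(-`Brand Code`), scale = TRUE))$importance
#smallest number of components whose cumulative variance explained reaches 90%
which(pve['Cumulative Proportion',]>=0.90)[1]
## PC16 
##   16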

tidied_pca %>%
  filter(component %in% paste0("PC", 1:4)) %>%
  group_by(component) %>%
  top_n(8, abs(value)) %>%
  ungroup() %>%
  mutate(terms = reorder_within(terms, abs(value), component)) %>%
  ggplot(aes(abs(value), terms, fill = value > 0)) +
  geom_col() +
  facet_wrap(~component, scales = "free_y") +
  scale_y_reordered() +
  labs(
    x = "Absolute value of contribution",
    y = NULL, fill = "Positive?"
  )

PC1 looks like it has a lot to do with pressure: the positive elements are Mnf Flow, Hyd Pressure2, Fill Pressure, and Pressure Setpoint, and the negative elements are Pressure Vacuum and Filler Level. In this context, positive means that as the variable increases we move in the positive direction along the principal component, and negative means that as the variable increases we move in the negative direction along it.

PC2 looks like it has a lot to do with the carbonation-related predictors.

Now let's look at how the observations are distributed in the plane of the first two principal components.

temp<-juice(pca_prep)%>%mutate(case_mean = PH > mean(PH))
temp%>%
  ggplot(aes(PC1,PC2, label = PH, color = case_mean))+
  geom_point(alpha = .2)+
  geom_text(check_overlap = TRUE, family = 'IBMPlexSans')+
  ggtitle('PH higher or lower than the mean along PC1 vs PC2')

There does seem to be some grouping in the PC1 vs PC2 plane; we could try a radial SVM to model this.

Methodology

Since the response variable is continuous, we will use a series of regression models as candidates. We will train a random forest and an XGBoost model because they have historically proven excellent at modeling continuous responses. Our EDA pointed towards some potential clustering in the principal components, so we will also train a radial SVM. The EDA did not reveal any particularly strong linear correlations, and the potential clustering makes a KNN model worth including as well.

Data Preparation

From the Missing Data section, we decided to drop MFR and listwise delete observations missing Filler Speed. Some preliminary modeling showed that Brand Code was very responsive to imputation as well. Since the dataset is fairly large and the remaining missingness is modest, we will drop the remaining observations with missing data.

Our EDA showed that a large number of the predictors were skewed, so we will center and scale the data for models that do not handle skewed or unscaled predictors well, such as KNN and SVM. For the tree-based ensembles, we will leave the data as is.

Partitioning the Data

#removing MFR
train_data<-train_data%>%select(-MFR)%>%mutate(`Brand Code` = as.factor(`Brand Code`))
train_data<-train_data%>%clean_names()
test_data<-test_data%>%select(-MFR)
test_data<-test_data%>%clean_names()
ph_split<-initial_split(train_data%>%na.omit())
ph_train<-training(ph_split)
ph_test<-testing(ph_split)

Data Preprocessing

# create base recipe with missing data handling
base_rec<-
  recipe(ph~., data = ph_train)%>%
  #imputing brand code
  
  step_impute_bag(brand_code)%>%
  step_unknown(brand_code)%>%
  step_dummy(brand_code)%>%
  step_impute_bag(all_predictors())
  #omitting remaining missing data
  #step_naomit(all_predictors())
#check that the base recipe preps cleanly
temp<-juice(prep(base_rec))
# create centered and scaled data
cs_rec<-
  base_rec%>%
  step_center(all_numeric_predictors())%>%
  step_scale(all_numeric_predictors())
pca_rec<-
  base_rec%>%
  step_normalize(all_numeric_predictors())%>%
  step_pca(all_numeric_predictors(), threshold = .90)

#create resamples
ph_folds<-vfold_cv(ph_train,v = 5)

Create Models for Tuning

svm_r_spec<-
svm_rbf(cost = tune(), rbf_sigma = tune())%>%
  set_engine('kernlab')%>%
  set_mode('regression')
knn_spec<-
  nearest_neighbor(neighbors = tune(), dist_power = tune(), weight_func = tune())%>%
  set_engine('kknn')%>%
  set_mode('regression')
boost_spec<-
  boost_tree(mtry = tune(), min_n = tune(), trees = 100)%>%
  set_mode('regression')
rf_spec<-
  rand_forest(mtry = tune(), min_n = tune(), trees = 100)%>%
  set_engine('ranger', importance = 'impurity')%>%
  set_mode('regression')

Create the Workflow Set

cc<-
  workflow_set(
    preproc = list(center_scale = cs_rec),
    models  = list(SVM_radial = svm_r_spec, knn_spec)
  )
no_pre_proc<-
  workflow_set(
    preproc = list(base = base_rec),
    models = list(boost = boost_spec, rf = rf_spec)
  )
#note: these PCA workflows are not added to all_workflows below;
#PCA is assessed separately via the PCR model later on
pca_proc<-
  workflow_set(
    preproc = list(pca = pca_rec),
    models = list(svm_pca = svm_r_spec , boost_pca = boost_spec , rf_pca = rf_spec , knn_pca = knn_spec)
  )
all_workflows<-
  bind_rows(no_pre_proc, cc)%>%
  mutate(wflow_id = gsub('(center_scale_)|(base_)|(pca_)', '',wflow_id))

Tuning and Evaluating

We will use a racing method to tune the models. Racing limits the overall computational burden by eliminating poor candidate tuning parameters early, which reduces the total number of models that must be fully fit.

race_ctrl<-
  control_race(
    save_pred = TRUE, 
    parallel_over = 'everything',
    save_workflow = TRUE
  )
race_results_time<-
  system.time(
    race_results<-
      all_workflows%>%
      workflow_map(
        'tune_race_anova',
        seed = 123,
        resamples = ph_folds,
        control = race_ctrl,
        verbose = TRUE
      )
  )
## i 1 of 4 tuning:     boost
## i Creating pre-processing data to finalize unknown parameter: mtry
## v 1 of 4 tuning:     boost (15m 36.2s)
## i 2 of 4 tuning:     rf
## i Creating pre-processing data to finalize unknown parameter: mtry
## v 2 of 4 tuning:     rf (12m 47.8s)
## i 3 of 4 tuning:     SVM_radial
## ! Fold1: preprocessor 1/1, model 1/10: Variable(s) `' constant. Cannot scale data.
## (the same "Cannot scale data" warning repeats for the remaining model candidates and folds)
## v 3 of 4 tuning:     SVM_radial (8m 46.6s)
## i 4 of 4 tuning:     nearest_neighbor
## v 4 of 4 tuning:     nearest_neighbor (13m 4.6s)
race_results
## # A workflow set/tibble: 4 x 4
##   wflow_id         info             option    result   
##   <chr>            <list>           <list>    <list>   
## 1 boost            <tibble [1 x 4]> <opts[2]> <race[+]>
## 2 rf               <tibble [1 x 4]> <opts[2]> <race[+]>
## 3 SVM_radial       <tibble [1 x 4]> <opts[2]> <race[+]>
## 4 nearest_neighbor <tibble [1 x 4]> <opts[2]> <race[+]>
autoplot(race_results)

collect_metrics(race_results)
## # A tibble: 80 x 9
##    wflow_id .config         preproc model .metric .estimator  mean     n std_err
##    <chr>    <chr>           <chr>   <chr> <chr>   <chr>      <dbl> <int>   <dbl>
##  1 boost    Preprocessor1_~ recipe  boos~ rmse    standard   0.105     3 0.00185
##  2 boost    Preprocessor1_~ recipe  boos~ rsq     standard   0.620     3 0.0182 
##  3 boost    Preprocessor1_~ recipe  boos~ rmse    standard   0.104     5 0.00299
##  4 boost    Preprocessor1_~ recipe  boos~ rsq     standard   0.619     5 0.0221 
##  5 boost    Preprocessor1_~ recipe  boos~ rmse    standard   0.104     5 0.00210
##  6 boost    Preprocessor1_~ recipe  boos~ rsq     standard   0.626     5 0.0149 
##  7 boost    Preprocessor1_~ recipe  boos~ rmse    standard   0.103     5 0.00280
##  8 boost    Preprocessor1_~ recipe  boos~ rsq     standard   0.628     5 0.0200 
##  9 boost    Preprocessor1_~ recipe  boos~ rmse    standard   0.102     5 0.00338
## 10 boost    Preprocessor1_~ recipe  boos~ rsq     standard   0.638     5 0.0221 
## # ... with 70 more rows
best_results_rf<-
  race_results%>%
  extract_workflow_set_result('rf')%>%
  select_best(metric = 'rmse')
best_results_rf
## # A tibble: 1 x 3
##    mtry min_n .config              
##   <int> <int> <chr>                
## 1    21     8 Preprocessor1_Model04
rf_test_results<-
  race_results%>%
  extract_workflow('rf')%>%
  finalize_workflow(best_results_rf)%>%
  last_fit(split = ph_split)
collect_metrics(rf_test_results)
## # A tibble: 2 x 4
##   .metric .estimator .estimate .config             
##   <chr>   <chr>          <dbl> <chr>               
## 1 rmse    standard       0.103 Preprocessor1_Model1
## 2 rsq     standard       0.664 Preprocessor1_Model1
best_results_boost<-
  race_results%>%
  extract_workflow_set_result('boost')%>%
  select_best(metric = 'rmse')
boost_test_results<-
  race_results%>%
  extract_workflow('boost')%>%
  finalize_workflow(best_results_boost)%>%
  last_fit(split = ph_split)
collect_metrics(boost_test_results)
## # A tibble: 2 x 4
##   .metric .estimator .estimate .config             
##   <chr>   <chr>          <dbl> <chr>               
## 1 rmse    standard       0.106 Preprocessor1_Model1
## 2 rsq     standard       0.633 Preprocessor1_Model1
best_results_knn<-
  race_results%>%
  extract_workflow_set_result('nearest_neighbor')%>%
  select_best(metric = 'rmse')
knn_test_results<-
  race_results%>%
  extract_workflow('nearest_neighbor')%>%
  finalize_workflow(best_results_knn)%>%
  last_fit(split = ph_split)
collect_metrics(knn_test_results)
## # A tibble: 2 x 4
##   .metric .estimator .estimate .config             
##   <chr>   <chr>          <dbl> <chr>               
## 1 rmse    standard      0.0993 Preprocessor1_Model1
## 2 rsq     standard      0.681  Preprocessor1_Model1
best_results_svm<-
  race_results%>%
  extract_workflow_set_result('SVM_radial')%>%
  select_best(metric = 'rmse')
svm_test_results<-
  race_results%>%
  extract_workflow('SVM_radial')%>%
  finalize_workflow(best_results_svm)%>%
  last_fit(split = ph_split)
## ! train/test split: preprocessor 1/1, model 1/1: Variable(s) `' constant. Cannot scale data.
collect_metrics(svm_test_results)
## # A tibble: 2 x 4
##   .metric .estimator .estimate .config             
##   <chr>   <chr>          <dbl> <chr>               
## 1 rmse    standard       0.140 Preprocessor1_Model1
## 2 rsq     standard       0.369 Preprocessor1_Model1
rf_test_results%>%
  collect_predictions()%>%
  ggplot(aes(x = ph, y = .pred))+
  geom_abline(color = 'gray50', lty = 2)+
  geom_point(alpha = .5)+
  coord_obs_pred()+
  labs(x = 'observed', y = 'predicted')

Linear Regression

#baseline OLS model, dropping brand_code and the highly correlated predictors
high_corr = findCorrelation(cor(ph_train%>%select(-'brand_code')), names = TRUE)
linear_model = lm(ph~.,ph_train%>%select(-c('brand_code',all_of(high_corr))))

PCR

#project new data onto the training PCA basis and reattach the response
apply_pca = function(pca_model,data){
  pca_data = data.frame(predict(pca_model,data))
  pca_data$ph = data['ph'][[1]]
  return(pca_data)
}
pca_model = prcomp(ph_train%>%select(-c(brand_code,ph)),scale=TRUE)

ph_pca_train = apply_pca(pca_model, ph_train)
pcr_model = lm(ph~.,data = ph_pca_train)

Predictions Against Test Data

predictions = data.frame(ph = ph_test$ph)
wkflow_models = race_results$wflow_id

#refit each finalized workflow on the full training set, then predict the test split
for (model in wkflow_models){
  best_model = race_results%>%
  extract_workflow_set_result(model)%>%
  select_best(metric = 'rmse')
 predictions[model] = race_results%>%
  extract_workflow(model)%>%
  finalize_workflow(best_model)%>%
  fit(ph_train)%>%
  predict(ph_test) 
}
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
predictions$linear_regression = predict(linear_model,ph_test)
predictions$PCR = predict(pcr_model,apply_pca(pca_model,ph_test))

predictions%>%head()
##     ph    boost       rf SVM_radial nearest_neighbor linear_regression      PCR
## 1 8.50 8.635003 8.599861   8.566188         8.487538          8.655354 8.638479
## 2 8.34 8.470973 8.495883   8.521006         8.478933          8.629985 8.605489
## 3 8.42 8.415807 8.490117   8.386269         8.420386          8.423893 8.406048
## 4 8.40 8.470255 8.496905   8.473017         8.419836          8.549044 8.509634
## 5 8.36 8.472733 8.447204   8.447105         8.418773          8.515768 8.482093
## 6 8.38 8.546646 8.480599   8.537699         8.412661          8.618094 8.584302

Based on RMSE against the held-out test split, the KNN model performed best.

performance = list('model' = c(), 'rmse' = c())
models_used = names(predictions%>%select(-ph))

for (model in models_used){
  rmse = RMSE(predictions['ph'][[1]],predictions[model][[1]])
  performance$model = c(performance$model,model)
  performance$rmse = c(performance$rmse,rmse)
}

performance_df = data.frame(performance)%>%
  arrange(rmse)
performance_df
##               model       rmse
## 1  nearest_neighbor 0.09933587
## 2                rf 0.10295946
## 3             boost 0.10636273
## 4        SVM_radial 0.13969716
## 5               PCR 0.14347318
## 6 linear_regression 0.14532893
knn_model = race_results%>%
  extract_workflow('nearest_neighbor')%>%
  finalize_workflow(best_results_knn)%>%
  fit(ph_train)

Final Prediction

Since KNN performed the best, we will use that model to generate predictions on the evaluation data.

test_data['ph'] = predict(knn_model, test_data)
test_data%>%head()
##   brand_code carb_volume fill_ounces pc_volume carb_pressure carb_temp   psc
## 1          D    5.480000    24.03333 0.2700000          65.4     134.6 0.236
## 2          A    5.393333    23.95333 0.2266667          63.2     135.0 0.042
## 3          B    5.293333    23.92000 0.3033333          66.4     140.4 0.068
## 4          B    5.266667    23.94000 0.1860000          64.8     139.0 0.004
## 5          B    5.406667    24.20000 0.1600000          69.4     142.2 0.040
## 6          B    5.286667    24.10667 0.2120000          73.4     147.2 0.078
##   psc_fill psc_co2 mnf_flow carb_pressure1 fill_pressure hyd_pressure1
## 1     0.40    0.04     -100          116.6          46.0             0
## 2     0.22    0.08     -100          118.8          46.2             0
## 3     0.10    0.02     -100          120.2          45.8             0
## 4     0.20    0.02     -100          124.8          40.0             0
## 5     0.30    0.06     -100          115.0          51.4             0
## 6     0.22      NA     -100          118.6          46.4             0
##   hyd_pressure2 hyd_pressure3 hyd_pressure4 filler_level filler_speed
## 1            NA            NA            96        129.4         3986
## 2             0             0           112        120.0         4012
## 3             0             0            98        119.4         4010
## 4             0             0           132        120.2           NA
## 5             0             0            94        116.0         4018
## 6             0             0            94        120.4         4010
##   temperature usage_cont carb_flow density balling pressure_vacuum       ph
## 1        66.0      21.66      2950    0.88   1.398            -3.8 8.762235
## 2        65.6      17.60      2916    1.50   2.942            -4.4 8.405939
## 3        65.6      24.18      3056    0.90   1.448            -4.2 8.467141
## 4        74.4      18.12        28    0.74   1.056            -4.0 8.638058
## 5        66.4      21.32      3214    0.88   1.398            -4.0 8.430360
## 6        66.6      18.00      3064    0.84   1.298            -3.8 8.500947
##   oxygen_filler bowl_setpoint pressure_setpoint air_pressurer alch_rel carb_rel
## 1         0.022           130              45.2         142.6     6.56     5.34
## 2         0.030           120              46.0         147.2     7.14     5.58
## 3         0.046           120              46.0         146.6     6.52     5.34
## 4            NA           120              46.0         146.4     6.48     5.50
## 5         0.082           120              50.0         145.8     6.50     5.38
## 6         0.064           120              46.0         146.0     6.50     5.42
##   balling_lvl
## 1        1.48
## 2        3.04
## 3        1.46
## 4        1.48
## 5        1.46
## 6        1.44
write.csv(test_data,'StudentPredictions.csv',row.names = FALSE)