library(tidytext)
## Warning: package 'tidytext' was built under R version 4.1.3
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.1.3
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.8
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.1.2 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(readxl)
library(openxlsx)
## Warning: package 'openxlsx' was built under R version 4.1.3
library(finetune)
## Warning: package 'finetune' was built under R version 4.1.3
## Loading required package: tune
## Warning: package 'tune' was built under R version 4.1.3
library(visdat)
## Warning: package 'visdat' was built under R version 4.1.3
library(naniar)
## Warning: package 'naniar' was built under R version 4.1.3
library(tidymodels)
## Warning: package 'tidymodels' was built under R version 4.1.3
## -- Attaching packages -------------------------------------- tidymodels 0.2.0 --
## v broom 0.7.12 v recipes 0.2.0
## v dials 0.1.1 v rsample 0.1.1
## v infer 1.0.0 v workflows 0.2.6
## v modeldata 0.1.1 v workflowsets 0.2.1
## v parsnip 0.2.1 v yardstick 0.0.9
## Warning: package 'dials' was built under R version 4.1.3
## Warning: package 'infer' was built under R version 4.1.3
## Warning: package 'modeldata' was built under R version 4.1.3
## Warning: package 'parsnip' was built under R version 4.1.3
## Warning: package 'recipes' was built under R version 4.1.3
## Warning: package 'rsample' was built under R version 4.1.3
## Warning: package 'workflows' was built under R version 4.1.3
## Warning: package 'workflowsets' was built under R version 4.1.3
## Warning: package 'yardstick' was built under R version 4.1.3
## -- Conflicts ----------------------------------------- tidymodels_conflicts() --
## x scales::discard() masks purrr::discard()
## x dplyr::filter() masks stats::filter()
## x recipes::fixed() masks stringr::fixed()
## x dplyr::lag() masks stats::lag()
## x yardstick::spec() masks readr::spec()
## x recipes::step() masks stats::step()
## * Learn how to get started at https://www.tidymodels.org/start/
library(janitor)
## Warning: package 'janitor' was built under R version 4.1.3
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(psych)
## Warning: package 'psych' was built under R version 4.1.3
##
## Attaching package: 'psych'
## The following objects are masked from 'package:scales':
##
## alpha, rescale
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(car)
## Warning: package 'car' was built under R version 4.1.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.1.3
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
library(caret)
## Warning: package 'caret' was built under R version 4.1.3
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following objects are masked from 'package:yardstick':
##
## precision, recall, sensitivity, specificity
## The following object is masked from 'package:purrr':
##
## lift
train_data<-openxlsx::read.xlsx('https://github.com/JackJosephWright/predicting_ph/blob/main/data/StudentData.xlsx?raw=true', sep.names=' ')
test_data = openxlsx::read.xlsx('https://github.com/JackJosephWright/predicting_ph/blob/main/data/StudentEvaluation.xlsx?raw=true', sep.names=' ')
train_data%>%head()
## Brand Code Carb Volume Fill Ounces PC Volume Carb Pressure Carb Temp PSC
## 1 B 5.340000 23.96667 0.2633333 68.2 141.2 0.104
## 2 A 5.426667 24.00667 0.2386667 68.4 139.6 0.124
## 3 B 5.286667 24.06000 0.2633333 70.8 144.8 0.090
## 4 A 5.440000 24.00667 0.2933333 63.0 132.6 NA
## 5 A 5.486667 24.31333 0.1113333 67.2 136.8 0.026
## 6 A 5.380000 23.92667 0.2693333 66.6 138.4 0.090
## PSC Fill PSC CO2 Mnf Flow Carb Pressure1 Fill Pressure Hyd Pressure1
## 1 0.26 0.04 -100 118.8 46.0 0
## 2 0.22 0.04 -100 121.6 46.0 0
## 3 0.34 0.16 -100 120.2 46.0 0
## 4 0.42 0.04 -100 115.2 46.4 0
## 5 0.16 0.12 -100 118.4 45.8 0
## 6 0.24 0.04 -100 119.6 45.6 0
## Hyd Pressure2 Hyd Pressure3 Hyd Pressure4 Filler Level Filler Speed
## 1 NA NA 118 121.2 4002
## 2 NA NA 106 118.6 3986
## 3 NA NA 82 120.0 4020
## 4 0 0 92 117.8 4012
## 5 0 0 92 118.6 4010
## 6 0 0 116 120.2 4014
## Temperature Usage cont Carb Flow Density MFR Balling Pressure Vacuum PH
## 1 66.0 16.18 2932 0.88 725.0 1.398 -4.0 8.36
## 2 67.6 19.90 3144 0.92 726.8 1.498 -4.0 8.26
## 3 67.0 17.76 2914 1.58 735.0 3.142 -3.8 8.94
## 4 65.6 17.42 3062 1.54 730.6 3.042 -4.4 8.24
## 5 65.6 17.68 3054 1.54 722.8 3.042 -4.4 8.26
## 6 66.2 23.82 2948 1.52 738.8 2.992 -4.4 8.32
## Oxygen Filler Bowl Setpoint Pressure Setpoint Air Pressurer Alch Rel Carb Rel
## 1 0.022 120 46.4 142.6 6.58 5.32
## 2 0.026 120 46.8 143.0 6.56 5.30
## 3 0.024 120 46.6 142.0 7.66 5.84
## 4 0.030 120 46.0 146.2 7.14 5.42
## 5 0.030 120 46.0 146.2 7.14 5.44
## 6 0.024 120 46.0 146.6 7.16 5.44
## Balling Lvl
## 1 1.48
## 2 1.56
## 3 3.28
## 4 3.04
## 5 3.04
## 6 3.02
vis_dat(train_data)
## Warning: `gather_()` was deprecated in tidyr 1.2.0.
## Please use `gather()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
The missing data does not look random. Lets take a closer look at how the missing data is distributed
gg_miss_var(train_data)
## Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please
## use `guide = "none"` instead.
MFR and Brand Code contain most of the missing data.
Little’s Missing Completely at Random test will give us some insight if there are patterns in the missing data.
mcar_test(train_data)
## Warning in norm::prelim.norm(data): NAs introduced by coercion to integer range
## # A tibble: 1 x 4
## statistic df p.value missing.patterns
## <dbl> <dbl> <dbl> <int>
## 1 7507. 3451 0 115
With a high test statistic and low p-value we can conclude that there is structure to the missing data.
Remove the two most missing, and retest for MCAR
mcar_test(train_data%>%select(-c(MFR,`Brand Code`,`Filler Speed`)))
## # A tibble: 1 x 4
## statistic df p.value missing.patterns
## <dbl> <dbl> <dbl> <int>
## 1 6377. 2012 0 74
We can conclude that there is some structure to the missing data
cor(train_data%>%select(-`Brand Code`)%>%na.omit())%>%as.data.frame()%>%arrange(desc(MFR))%>%select(MFR)%>%head()
## MFR
## MFR 1.00000000
## Filler Speed 0.95142239
## Filler Level 0.06405259
## Balling 0.05382864
## Pressure Vacuum 0.05159662
## Hyd Pressure2 0.04521297
Our most missing predictor MFR has a 95% correlation with Filler Speed.
train_data%>%
mutate(both_miss = case_when(
(is.na(MFR) & is.na(`Filler Speed`) )~TRUE,
TRUE~FALSE)
)%>%
summarize("both missing" = sum(both_miss), 'MFR missing' = sum(is.na(`Filler Speed`)))
## both missing MFR missing
## 1 54 57
94% of the time when Filler Speed is missing, MFR is also missing. The best option is to drop MFR and listwise delete the missing values from Filler Speed
Brand Codecat_impute_df<-train_data%>%na.omit()%>%select(-c(MFR, PH))%>%mutate(`Brand Code` = as.factor(`Brand Code`))
brand_split<-split
code_count<-
cat_impute_df%>%
count(`Brand Code`)
code_count
## Brand Code n
## 1 A 224
## 2 B 1050
## 3 C 252
## 4 D 512
cat_impute_df<-clean_names(cat_impute_df)
brand_split<-initial_split(cat_impute_df)
brand_train<-training(brand_split)
brand_test<-testing(brand_split)
Note that there is a class imbalance in the Brand Codes so it is important to stratify before predicting
tree_spec<-decision_tree()%>%
set_engine('rpart')%>%
set_mode('classification')
tune_spec<-
decision_tree(
cost_complexity = tune(),
tree_depth = tune()
)%>%
set_engine("rpart") %>%
set_mode("classification")
tree_grid<- grid_regular(cost_complexity(),
tree_depth(),
levels = 5)
brand_folds<-vfold_cv(brand_train , v = 10)
brand_folds
## # 10-fold cross-validation
## # A tibble: 10 x 2
## splits id
## <list> <chr>
## 1 <split [1375/153]> Fold01
## 2 <split [1375/153]> Fold02
## 3 <split [1375/153]> Fold03
## 4 <split [1375/153]> Fold04
## 5 <split [1375/153]> Fold05
## 6 <split [1375/153]> Fold06
## 7 <split [1375/153]> Fold07
## 8 <split [1375/153]> Fold08
## 9 <split [1376/152]> Fold09
## 10 <split [1376/152]> Fold10
set.seed(123)
tree_wf<-workflow()%>%
add_model(tune_spec)%>%
add_formula(brand_code~.)
tree_res<-
tree_wf%>%
tune_grid(
resamples = brand_folds,
grid = tree_grid
)
best_tree<-tree_res%>%
select_best('accuracy')
final_wf<-
tree_wf%>%
finalize_workflow(best_tree)
final_fit<-
final_wf%>%
last_fit(brand_split)
final_fit%>%
collect_metrics()
## # A tibble: 2 x 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 accuracy multiclass 0.941 Preprocessor1_Model1
## 2 roc_auc hand_till 0.967 Preprocessor1_Model1
Brand Code can be predicted with a high accuracy using the other predictors. It would be advisable to impute the missing Brand Code before modeling.
dropping MFR and imputing Brand Codes accounts for 90% of the missing data, the most conservative method would be to listwise delete the remaining missing data. However, we did note that there is missing data in the test set. In order to minimize leakage of bias from the training to the test set, we will impute the training set with the mean values of the test set.
glimpse(train_data)
## Rows: 2,571
## Columns: 33
## $ `Brand Code` <chr> "B", "A", "B", "A", "A", "A", "A", "B", "B", "B", ~
## $ `Carb Volume` <dbl> 5.340000, 5.426667, 5.286667, 5.440000, 5.486667, ~
## $ `Fill Ounces` <dbl> 23.96667, 24.00667, 24.06000, 24.00667, 24.31333, ~
## $ `PC Volume` <dbl> 0.2633333, 0.2386667, 0.2633333, 0.2933333, 0.1113~
## $ `Carb Pressure` <dbl> 68.2, 68.4, 70.8, 63.0, 67.2, 66.6, 64.2, 67.6, 64~
## $ `Carb Temp` <dbl> 141.2, 139.6, 144.8, 132.6, 136.8, 138.4, 136.8, 1~
## $ PSC <dbl> 0.104, 0.124, 0.090, NA, 0.026, 0.090, 0.128, 0.15~
## $ `PSC Fill` <dbl> 0.26, 0.22, 0.34, 0.42, 0.16, 0.24, 0.40, 0.34, 0.~
## $ `PSC CO2` <dbl> 0.04, 0.04, 0.16, 0.04, 0.12, 0.04, 0.04, 0.04, 0.~
## $ `Mnf Flow` <dbl> -100, -100, -100, -100, -100, -100, -100, -100, -1~
## $ `Carb Pressure1` <dbl> 118.8, 121.6, 120.2, 115.2, 118.4, 119.6, 122.2, 1~
## $ `Fill Pressure` <dbl> 46.0, 46.0, 46.0, 46.4, 45.8, 45.6, 51.8, 46.8, 46~
## $ `Hyd Pressure1` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ `Hyd Pressure2` <dbl> NA, NA, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ `Hyd Pressure3` <dbl> NA, NA, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ `Hyd Pressure4` <dbl> 118, 106, 82, 92, 92, 116, 124, 132, 90, 108, 94, ~
## $ `Filler Level` <dbl> 121.2, 118.6, 120.0, 117.8, 118.6, 120.2, 123.4, 1~
## $ `Filler Speed` <dbl> 4002, 3986, 4020, 4012, 4010, 4014, NA, 1004, 4014~
## $ Temperature <dbl> 66.0, 67.6, 67.0, 65.6, 65.6, 66.2, 65.8, 65.2, 65~
## $ `Usage cont` <dbl> 16.18, 19.90, 17.76, 17.42, 17.68, 23.82, 20.74, 1~
## $ `Carb Flow` <dbl> 2932, 3144, 2914, 3062, 3054, 2948, 30, 684, 2902,~
## $ Density <dbl> 0.88, 0.92, 1.58, 1.54, 1.54, 1.52, 0.84, 0.84, 0.~
## $ MFR <dbl> 725.0, 726.8, 735.0, 730.6, 722.8, 738.8, NA, NA, ~
## $ Balling <dbl> 1.398, 1.498, 3.142, 3.042, 3.042, 2.992, 1.298, 1~
## $ `Pressure Vacuum` <dbl> -4.0, -4.0, -3.8, -4.4, -4.4, -4.4, -4.4, -4.4, -4~
## $ PH <dbl> 8.36, 8.26, 8.94, 8.24, 8.26, 8.32, 8.40, 8.38, 8.~
## $ `Oxygen Filler` <dbl> 0.022, 0.026, 0.024, 0.030, 0.030, 0.024, 0.066, 0~
## $ `Bowl Setpoint` <dbl> 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, ~
## $ `Pressure Setpoint` <dbl> 46.4, 46.8, 46.6, 46.0, 46.0, 46.0, 46.0, 46.0, 46~
## $ `Air Pressurer` <dbl> 142.6, 143.0, 142.0, 146.2, 146.2, 146.6, 146.2, 1~
## $ `Alch Rel` <dbl> 6.58, 6.56, 7.66, 7.14, 7.14, 7.16, 6.54, 6.52, 6.~
## $ `Carb Rel` <dbl> 5.32, 5.30, 5.84, 5.42, 5.44, 5.44, 5.38, 5.34, 5.~
## $ `Balling Lvl` <dbl> 1.48, 1.56, 3.28, 3.04, 3.04, 3.02, 1.44, 1.44, 1.~
There are 2571 observations of 32 predictors and the response (PH). All data are continuous besides the “Brand Code” which is unordered Categorical.
train_data<-train_data%>%na.omit()
train_data%>%na.omit()%>%summarize(mean = mean(PH), "standard deviation" = sd(PH))
## mean standard deviation
## 1 8.552669 0.1707606
ggplot(train_data, aes(x = PH))+geom_histogram()+ggtitle('Distribution of Response')
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
description<-psych::describe(train_data)
description%>%
filter((abs(skew)>=.5))%>%nrow()
## [1] 21
skew_list<-as.list(description%>%
filter((abs(skew)>=.5))%>%rownames())
21 of the 32 numeric predictors are skewed.
skewed predictors can be found in the list skew_list
ggplot(description, aes(x=range))+geom_histogram()+ggtitle('Ranges of Predictors')
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Most of the predictors range is 1, but a minority are much larger. Depending on the model this data will need to be centered and scaled
pca_rec<-recipe(~., train_data%>%select(-'Brand Code'))%>%
update_role(PH, new_role = 'id')%>%
step_normalize(all_predictors())%>%
step_pca(all_predictors())
pca_prep<-prep(pca_rec)
tidied_pca<-tidy(pca_prep, 2)
#error here component does not exist
tidied_pca%>%
filter(component %in% paste0('PC', 1:5))%>%
mutate(component = fct_inorder(component))%>%
ggplot(aes(value,terms,fill = terms))+
geom_col(show.legend = FALSE)+
facet_wrap(~component,nrow=1)+
labs(y = NULL)
summary(prcomp(train_data%>%select(-`Brand Code`), scale = TRUE))
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.5555 2.4508 1.48330 1.47574 1.31191 1.24571 1.15438
## Proportion of Variance 0.2041 0.1877 0.06876 0.06806 0.05378 0.04849 0.04164
## Cumulative Proportion 0.2041 0.3918 0.46053 0.52859 0.58237 0.63087 0.67251
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 1.05272 1.01942 0.99613 0.93879 0.90755 0.88793 0.85451
## Proportion of Variance 0.03463 0.03248 0.03101 0.02754 0.02574 0.02464 0.02282
## Cumulative Proportion 0.70714 0.73962 0.77063 0.79817 0.82391 0.84855 0.87136
## PC15 PC16 PC17 PC18 PC19 PC20 PC21
## Standard deviation 0.81951 0.7652 0.7066 0.67666 0.66979 0.58281 0.52245
## Proportion of Variance 0.02099 0.0183 0.0156 0.01431 0.01402 0.01061 0.00853
## Cumulative Proportion 0.89235 0.9106 0.9263 0.94056 0.95458 0.96519 0.97372
## PC22 PC23 PC24 PC25 PC26 PC27 PC28
## Standard deviation 0.43970 0.40154 0.39113 0.3297 0.2655 0.22208 0.19871
## Proportion of Variance 0.00604 0.00504 0.00478 0.0034 0.0022 0.00154 0.00123
## Cumulative Proportion 0.97976 0.98480 0.98958 0.9930 0.9952 0.99672 0.99796
## PC29 PC30 PC31 PC32
## Standard deviation 0.17726 0.14081 0.10490 0.05591
## Proportion of Variance 0.00098 0.00062 0.00034 0.00010
## Cumulative Proportion 0.99894 0.99956 0.99990 1.00000
Looking at the percent of variance explained, we want to limit the total number of principle components used while maintaining a high amount of variance explained, so we will set the threshold for modeling to 90%.
tidied_pca %>%
filter(component %in% paste0("PC", 1:4)) %>%
group_by(component) %>%
top_n(8, abs(value)) %>%
ungroup() %>%
mutate(terms = reorder_within(terms, abs(value), component)) %>%
ggplot(aes(abs(value), terms, fill = value > 0)) +
geom_col() +
facet_wrap(~component, scales = "free_y") +
scale_y_reordered() +
labs(
x = "Absolute value of contribution",
y = NULL, fill = "Positive?"
)
PCA 1 looks like it has a lot to do with pressure, as the positive elements are Mnf Flow Hyd Pressure2 Fill Pressure and Pressure Setpoint , the negative elements are Pressure Vaccumand Filler Level. In this context positive means that as that variable increases we move in a positive direction along this principle component, while negative means that when that variable increases we move in the negative direction along that principle component.
PCA 2 looks like it has a lot to do with the Carb,
Now lets look at how these features are distributed in the plane of the first two Principle Components
temp<-juice(pca_prep)%>%mutate(case_mean = case_when(PH > mean(PH)~TRUE, TRUE ~FALSE))
temp%>%
ggplot(aes(PC1,PC2, label = PH, color = case_mean))+
geom_point(alpha = .2)+
geom_text(check_overlap = TRUE, family = 'IBMlexSans')+
ggtitle('PH higher or lower than the mean along PC1 vs PC2')
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
There does seem to be some grouping in the PC1 vs PC2 plane, we could try some sort of radial SVM to model this.
Since the response variable is continuous, we will use a series of regression models as candidates. We will train a random forest and XGboost because they have proven historically to be excellent at modeling continuous variables. Our EDA pointed us towards some potential clustering in the principle components, so we will train a radial SVM. Our EDA didn’t point us towards any terribly strong linear correlations, and the potential clustering from the EDA makes us want to throw in a KNN model just to be safe.
From the Missing Data section, we have decided to drop MFR and listwise delete observations for Filler Speed. Some preliminary modeling showed that Brand Code was very responsive to imputation as well. Since the dataset is fairly large and the remaining datapoints seem missing at random, we will drop the remaining observations with missing data.
Our EDA showed that a large number of the predictors were skewed, so we will center and scale the data for models that do not handle skew data or unscaled predictors well, like KNN or SVM. For the ensemble method, we will leave the data as is.
#removing MFR
train_data<-train_data%>%select(-MFR)%>%mutate(`Brand Code` = as.factor(`Brand Code`))
train_data<-train_data%>%clean_names()
test_data<-test_data%>%select(-MFR)
test_data<-test_data%>%clean_names()
ph_split<-initial_split(train_data%>%na.omit())
ph_train<-training(ph_split)
ph_test<-testing(ph_split)
# create base recipe with missing data handling
base_rec<-
recipe(ph~., data = ph_train)%>%
#imputing brand code
step_impute_bag(brand_code)%>%
step_unknown(brand_code)%>%
step_dummy(brand_code)%>%
step_impute_bag(all_predictors())
#omitting remaining missing data
#step_naomit(all_predictors())
temp<-juice(prep(base_rec))
# create centered and scaled data
cs_rec<-
base_rec%>%
step_center%>%
step_scale()
pca_rec<-
base_rec%>%
step_pca(threshold = .90)
#create resamples
ph_folds<-vfold_cv(ph_train,v = 5)
svm_r_spec<-
svm_rbf(cost = tune(), rbf_sigma = tune())%>%
set_engine('kernlab')%>%
set_mode('regression')
knn_spec<-
nearest_neighbor(neighbors = tune(), dist_power = tune(), weight_func = tune())%>%
set_engine('kknn')%>%
set_mode('regression')
boost_spec<-
boost_tree(mtry = tune(), min_n = tune(), trees = 100)%>%
set_mode('regression')
rf_spec<-
rand_forest(mtry = tune(), min_n = tune(), trees = 100)%>%
set_engine('ranger', importance = 'impurity')%>%
set_mode('regression')
cc<-
workflow_set(
preproc = list(center_scale = cs_rec),
models = list(SVM_radial = svm_r_spec, knn_spec)
)
no_pre_proc<-
workflow_set(
preproc = list(base = base_rec),
models = list(boost = boost_spec, rf = rf_spec)
)
pca_proc<-
workflow_set(
preproc = list(pca = pca_rec),
models = list(svm_pca = svm_r_spec , boost_pca = boost_spec , rf_pca = rf_spec , knn_pca = knn_spec)
)
all_workflows<-
bind_rows(no_pre_proc, cc)%>%
mutate(wflow_id = gsub('(center_scale_)|(base_)|(pca_)', '',wflow_id))
We will use a racing method to tune the models. This will help us limit the overall computational burden by removing candidate tuning parameters quickly, therefore reducing the total amount of tuned models.
race_ctrl<-
control_race(
save_pred = TRUE,
parallel_over = 'everything',
save_workflow = TRUE
)
race_results_time<-
system.time(
race_results<-
all_workflows%>%
workflow_map(
'tune_race_anova',
seed = 123,
resamples = ph_folds,
control = race_ctrl,
verbose = TRUE
)
)
## i 1 of 4 tuning: boost
## i Creating pre-processing data to finalize unknown parameter: mtry
## Warning: package 'xgboost' was built under R version 4.1.3
## Warning: package 'rlang' was built under R version 4.1.3
## v 1 of 4 tuning: boost (15m 36.2s)
## i 2 of 4 tuning: rf
## i Creating pre-processing data to finalize unknown parameter: mtry
## Warning: package 'ranger' was built under R version 4.1.3
## v 2 of 4 tuning: rf (12m 47.8s)
## i 3 of 4 tuning: SVM_radial
## Warning: package 'kernlab' was built under R version 4.1.3
## ! Fold1: preprocessor 1/1, model 1/10: Variable(s) `' constant. Cannot scale data.
## ! Fold1: preprocessor 1/1, model 2/10: Variable(s) `' constant. Cannot scale data.
## ! Fold1: preprocessor 1/1, model 3/10: Variable(s) `' constant. Cannot scale data.
## ! Fold1: preprocessor 1/1, model 4/10: Variable(s) `' constant. Cannot scale data.
## ! Fold1: preprocessor 1/1, model 5/10: Variable(s) `' constant. Cannot scale data.
## ! Fold1: preprocessor 1/1, model 6/10: Variable(s) `' constant. Cannot scale data.
## ! Fold1: preprocessor 1/1, model 7/10: Variable(s) `' constant. Cannot scale data.
## ! Fold1: preprocessor 1/1, model 8/10: Variable(s) `' constant. Cannot scale data.
## ! Fold1: preprocessor 1/1, model 9/10: Variable(s) `' constant. Cannot scale data.
## ! Fold1: preprocessor 1/1, model 10/10: Variable(s) `' constant. Cannot scale data.
## ! Fold3: preprocessor 1/1, model 1/10: Variable(s) `' constant. Cannot scale data.
## ! Fold3: preprocessor 1/1, model 2/10: Variable(s) `' constant. Cannot scale data.
## ! Fold3: preprocessor 1/1, model 3/10: Variable(s) `' constant. Cannot scale data.
## ! Fold3: preprocessor 1/1, model 4/10: Variable(s) `' constant. Cannot scale data.
## ! Fold3: preprocessor 1/1, model 5/10: Variable(s) `' constant. Cannot scale data.
## ! Fold3: preprocessor 1/1, model 6/10: Variable(s) `' constant. Cannot scale data.
## ! Fold3: preprocessor 1/1, model 7/10: Variable(s) `' constant. Cannot scale data.
## ! Fold3: preprocessor 1/1, model 8/10: Variable(s) `' constant. Cannot scale data.
## ! Fold3: preprocessor 1/1, model 9/10: Variable(s) `' constant. Cannot scale data.
## ! Fold3: preprocessor 1/1, model 10/10: Variable(s) `' constant. Cannot scale data.
## ! Fold2: preprocessor 1/1, model 1/10: Variable(s) `' constant. Cannot scale data.
## ! Fold2: preprocessor 1/1, model 2/10: Variable(s) `' constant. Cannot scale data.
## ! Fold2: preprocessor 1/1, model 3/10: Variable(s) `' constant. Cannot scale data.
## ! Fold2: preprocessor 1/1, model 4/10: Variable(s) `' constant. Cannot scale data.
## ! Fold2: preprocessor 1/1, model 5/10: Variable(s) `' constant. Cannot scale data.
## ! Fold2: preprocessor 1/1, model 6/10: Variable(s) `' constant. Cannot scale data.
## ! Fold2: preprocessor 1/1, model 7/10: Variable(s) `' constant. Cannot scale data.
## ! Fold2: preprocessor 1/1, model 8/10: Variable(s) `' constant. Cannot scale data.
## ! Fold2: preprocessor 1/1, model 9/10: Variable(s) `' constant. Cannot scale data.
## ! Fold2: preprocessor 1/1, model 10/10: Variable(s) `' constant. Cannot scale data.
## ! Fold4: preprocessor 1/1, model 1/1: Variable(s) `' constant. Cannot scale data.
## ! Fold5: preprocessor 1/1, model 1/1: Variable(s) `' constant. Cannot scale data.
## v 3 of 4 tuning: SVM_radial (8m 46.6s)
## i 4 of 4 tuning: nearest_neighbor
## Warning: package 'kknn' was built under R version 4.1.3
## v 4 of 4 tuning: nearest_neighbor (13m 4.6s)
race_results
## # A workflow set/tibble: 4 x 4
## wflow_id info option result
## <chr> <list> <list> <list>
## 1 boost <tibble [1 x 4]> <opts[2]> <race[+]>
## 2 rf <tibble [1 x 4]> <opts[2]> <race[+]>
## 3 SVM_radial <tibble [1 x 4]> <opts[2]> <race[+]>
## 4 nearest_neighbor <tibble [1 x 4]> <opts[2]> <race[+]>
autoplot(race_results)
collect_metrics(race_results)
## # A tibble: 80 x 9
## wflow_id .config preproc model .metric .estimator mean n std_err
## <chr> <chr> <chr> <chr> <chr> <chr> <dbl> <int> <dbl>
## 1 boost Preprocessor1_~ recipe boos~ rmse standard 0.105 3 0.00185
## 2 boost Preprocessor1_~ recipe boos~ rsq standard 0.620 3 0.0182
## 3 boost Preprocessor1_~ recipe boos~ rmse standard 0.104 5 0.00299
## 4 boost Preprocessor1_~ recipe boos~ rsq standard 0.619 5 0.0221
## 5 boost Preprocessor1_~ recipe boos~ rmse standard 0.104 5 0.00210
## 6 boost Preprocessor1_~ recipe boos~ rsq standard 0.626 5 0.0149
## 7 boost Preprocessor1_~ recipe boos~ rmse standard 0.103 5 0.00280
## 8 boost Preprocessor1_~ recipe boos~ rsq standard 0.628 5 0.0200
## 9 boost Preprocessor1_~ recipe boos~ rmse standard 0.102 5 0.00338
## 10 boost Preprocessor1_~ recipe boos~ rsq standard 0.638 5 0.0221
## # ... with 70 more rows
race_results
## # A workflow set/tibble: 4 x 4
## wflow_id info option result
## <chr> <list> <list> <list>
## 1 boost <tibble [1 x 4]> <opts[2]> <race[+]>
## 2 rf <tibble [1 x 4]> <opts[2]> <race[+]>
## 3 SVM_radial <tibble [1 x 4]> <opts[2]> <race[+]>
## 4 nearest_neighbor <tibble [1 x 4]> <opts[2]> <race[+]>
best_results_rf<-
race_results%>%
extract_workflow_set_result('rf')%>%
select_best(metric = 'rmse')
best_results_rf
## # A tibble: 1 x 3
## mtry min_n .config
## <int> <int> <chr>
## 1 21 8 Preprocessor1_Model04
best_results_rf<-
race_results%>%
extract_workflow_set_result('rf')%>%
select_best(metric = 'rmse')
rf_test_results<-
race_results%>%
extract_workflow('rf')%>%
finalize_workflow(best_results_rf)%>%
last_fit(split = ph_split)
collect_metrics(rf_test_results)
## # A tibble: 2 x 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 rmse standard 0.103 Preprocessor1_Model1
## 2 rsq standard 0.664 Preprocessor1_Model1
best_results_boost<-
race_results%>%
extract_workflow_set_result('boost')%>%
select_best(metric = 'rmse')
boost_test_results<-
race_results%>%
extract_workflow('boost')%>%
finalize_workflow(best_results_boost)%>%
last_fit(split = ph_split)
collect_metrics(boost_test_results)
## # A tibble: 2 x 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 rmse standard 0.106 Preprocessor1_Model1
## 2 rsq standard 0.633 Preprocessor1_Model1
best_results_knn<-
race_results%>%
extract_workflow_set_result('nearest_neighbor')%>%
select_best(metric = 'rmse')
knn_test_results<-
race_results%>%
extract_workflow('nearest_neighbor')%>%
finalize_workflow(best_results_knn)%>%
last_fit(split = ph_split)
collect_metrics(knn_test_results)
## # A tibble: 2 x 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 rmse standard 0.0993 Preprocessor1_Model1
## 2 rsq standard 0.681 Preprocessor1_Model1
best_results_svm<-
race_results%>%
extract_workflow_set_result('SVM_radial')%>%
select_best(metric = 'rmse')
svm_test_results<-
race_results%>%
extract_workflow('SVM_radial')%>%
finalize_workflow(best_results_svm)%>%
last_fit(split = ph_split)
## ! train/test split: preprocessor 1/1, model 1/1: Variable(s) `' constant. Cannot scale data.
collect_metrics(svm_test_results)
## # A tibble: 2 x 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 rmse standard 0.140 Preprocessor1_Model1
## 2 rsq standard 0.369 Preprocessor1_Model1
rf_test_results%>%
collect_predictions()%>%
ggplot(aes(x = ph, y = .pred))+
geom_abline(color = 'gray50', lty = 2)+
geom_point(alpha = .5)+
coord_obs_pred()+
labs(x = 'observed', y = 'predicted')
Linear Regression
high_corr = findCorrelation(cor(ph_train%>%select(-'brand_code')), names = TRUE)
linear_model = lm(ph~.,ph_train%>%select(-c('brand_code',all_of(high_corr))))
PCR
apply_pca = function(pca_model,data){
pca_data = data.frame(predict(pca_model,data))
pca_data$ph = data['ph'][[1]]
return(pca_data)
}
pca_model = prcomp(ph_train%>%select(-c(brand_code,ph)),scale=TRUE)
ph_pca_train = apply_pca(pca_model, ph_train)
pcr_model = lm(ph~.,data = ph_pca_train)
Predictions against test data
predictions = data.frame(ph = ph_test$ph)
wkflow_models = race_results$wflow_id
for (model in wkflow_models){
best_model = race_results%>%
extract_workflow_set_result(model)%>%
select_best(metric = 'rmse')
predictions[model] = race_results%>%
extract_workflow(model)%>%
finalize_workflow(best_model)%>%
fit(ph_train)%>%
predict(ph_test)
}
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
predictions$linear_regression = predict(linear_model,ph_test)
predictions$PCR = predict(pcr_model,apply_pca(pca_model,ph_test))
predictions%>%head()
## ph boost rf SVM_radial nearest_neighbor linear_regression PCR
## 1 8.50 8.635003 8.599861 8.566188 8.487538 8.655354 8.638479
## 2 8.34 8.470973 8.495883 8.521006 8.478933 8.629985 8.605489
## 3 8.42 8.415807 8.490117 8.386269 8.420386 8.423893 8.406048
## 4 8.40 8.470255 8.496905 8.473017 8.419836 8.549044 8.509634
## 5 8.36 8.472733 8.447204 8.447105 8.418773 8.515768 8.482093
## 6 8.38 8.546646 8.480599 8.537699 8.412661 8.618094 8.584302
Based on RMSE, KNN model performed the best on the data
performance = list('model' = c(), 'rmse' = c())
models_used = names(predictions%>%select(-ph))
for (model in models_used){
rmse = RMSE(predictions['ph'][[1]],predictions[model][[1]])
performance$model = c(performance$model,model)
performance$rmse = c(performance$rmse,rmse)
}
performance_df = data.frame(performance)%>%
arrange(rmse)
performance_df
## model rmse
## 1 nearest_neighbor 0.09933587
## 2 rf 0.10295946
## 3 boost 0.10636273
## 4 SVM_radial 0.13969716
## 5 PCR 0.14347318
## 6 linear_regression 0.14532893
knn_model = race_results%>%
extract_workflow('nearest_neighbor')%>%
finalize_workflow(best_results_knn)%>%
fit(ph_train)
Since KNN performed the best, we will be using that model to predict on the test data
test_data['ph'] = predict(knn_model, test_data)
test_data%>%head()
## brand_code carb_volume fill_ounces pc_volume carb_pressure carb_temp psc
## 1 D 5.480000 24.03333 0.2700000 65.4 134.6 0.236
## 2 A 5.393333 23.95333 0.2266667 63.2 135.0 0.042
## 3 B 5.293333 23.92000 0.3033333 66.4 140.4 0.068
## 4 B 5.266667 23.94000 0.1860000 64.8 139.0 0.004
## 5 B 5.406667 24.20000 0.1600000 69.4 142.2 0.040
## 6 B 5.286667 24.10667 0.2120000 73.4 147.2 0.078
## psc_fill psc_co2 mnf_flow carb_pressure1 fill_pressure hyd_pressure1
## 1 0.40 0.04 -100 116.6 46.0 0
## 2 0.22 0.08 -100 118.8 46.2 0
## 3 0.10 0.02 -100 120.2 45.8 0
## 4 0.20 0.02 -100 124.8 40.0 0
## 5 0.30 0.06 -100 115.0 51.4 0
## 6 0.22 NA -100 118.6 46.4 0
## hyd_pressure2 hyd_pressure3 hyd_pressure4 filler_level filler_speed
## 1 NA NA 96 129.4 3986
## 2 0 0 112 120.0 4012
## 3 0 0 98 119.4 4010
## 4 0 0 132 120.2 NA
## 5 0 0 94 116.0 4018
## 6 0 0 94 120.4 4010
## temperature usage_cont carb_flow density balling pressure_vacuum ph
## 1 66.0 21.66 2950 0.88 1.398 -3.8 8.762235
## 2 65.6 17.60 2916 1.50 2.942 -4.4 8.405939
## 3 65.6 24.18 3056 0.90 1.448 -4.2 8.467141
## 4 74.4 18.12 28 0.74 1.056 -4.0 8.638058
## 5 66.4 21.32 3214 0.88 1.398 -4.0 8.430360
## 6 66.6 18.00 3064 0.84 1.298 -3.8 8.500947
## oxygen_filler bowl_setpoint pressure_setpoint air_pressurer alch_rel carb_rel
## 1 0.022 130 45.2 142.6 6.56 5.34
## 2 0.030 120 46.0 147.2 7.14 5.58
## 3 0.046 120 46.0 146.6 6.52 5.34
## 4 NA 120 46.0 146.4 6.48 5.50
## 5 0.082 120 50.0 145.8 6.50 5.38
## 6 0.064 120 46.0 146.0 6.50 5.42
## balling_lvl
## 1 1.48
## 2 3.04
## 3 1.46
## 4 1.48
## 5 1.46
## 6 1.44
write.csv(test_data,'StudentPredictions.csv',row.names = FALSE)