library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.4     ✓ purrr   0.3.4
## ✓ tibble  3.1.2     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(tidymodels)
## Registered S3 method overwritten by 'tune':
##   method                   from   
##   required_pkgs.model_spec parsnip
## ── Attaching packages ────────────────────────────────────── tidymodels 0.1.3 ──
## ✓ broom        0.7.7      ✓ rsample      0.1.0 
## ✓ dials        0.0.9      ✓ tune         0.1.5 
## ✓ infer        0.5.4      ✓ workflows    0.2.2 
## ✓ modeldata    0.1.0      ✓ workflowsets 0.0.2 
## ✓ parsnip      0.1.6      ✓ yardstick    0.0.8 
## ✓ recipes      0.1.16
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## x scales::discard() masks purrr::discard()
## x dplyr::filter()   masks stats::filter()
## x recipes::fixed()  masks stringr::fixed()
## x dplyr::lag()      masks stats::lag()
## x yardstick::spec() masks readr::spec()
## x recipes::step()   masks stats::step()
## • Use tidymodels_prefer() to resolve common conflicts.
path = '/home/kenan/Documents/learning/masters/CUNY-SPS-Masters-DS/DATA_624/project/project1/'
df <- read.csv(paste0(path, 'StudentData-TO_MODEL.csv'))

Explore the data

skimr::skim(df)
Data summary
Name df
Number of rows 2571
Number of columns 33
_______________________
Column type frequency:
character 1
numeric 32
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
Brand.Code 0 1 0 1 120 5 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Carb.Volume 10 1.00 5.37 0.11 5.04 5.29 5.35 5.45 5.70 ▁▆▇▅▁
Fill.Ounces 38 0.99 23.97 0.09 23.63 23.92 23.97 24.03 24.32 ▁▂▇▂▁
PC.Volume 39 0.98 0.28 0.06 0.08 0.24 0.27 0.31 0.48 ▁▃▇▂▁
Carb.Pressure 27 0.99 68.19 3.54 57.00 65.60 68.20 70.60 79.40 ▁▅▇▃▁
Carb.Temp 26 0.99 141.09 4.04 128.60 138.40 140.80 143.80 154.00 ▁▅▇▃▁
PSC 33 0.99 0.08 0.05 0.00 0.05 0.08 0.11 0.27 ▆▇▃▁▁
PSC.Fill 23 0.99 0.20 0.12 0.00 0.10 0.18 0.26 0.62 ▆▇▃▁▁
PSC.CO2 39 0.98 0.06 0.04 0.00 0.02 0.04 0.08 0.24 ▇▅▂▁▁
Mnf.Flow 2 1.00 24.57 119.48 -100.20 -100.00 65.20 140.80 229.40 ▇▁▁▇▂
Carb.Pressure1 32 0.99 122.59 4.74 105.60 119.00 123.20 125.40 140.20 ▁▃▇▂▁
Fill.Pressure 22 0.99 47.92 3.18 34.60 46.00 46.40 50.00 60.40 ▁▁▇▂▁
Hyd.Pressure1 11 1.00 12.44 12.43 -0.80 0.00 11.40 20.20 58.00 ▇▅▂▁▁
Hyd.Pressure2 15 0.99 20.96 16.39 0.00 0.00 28.60 34.60 59.40 ▇▂▇▅▁
Hyd.Pressure3 15 0.99 20.46 15.98 -1.20 0.00 27.60 33.40 50.00 ▇▁▃▇▁
Hyd.Pressure4 30 0.99 96.29 13.12 52.00 86.00 96.00 102.00 142.00 ▁▃▇▂▁
Filler.Level 20 0.99 109.25 15.70 55.80 98.30 118.40 120.00 161.20 ▁▃▅▇▁
Filler.Speed 57 0.98 3687.20 770.82 998.00 3888.00 3982.00 3998.00 4030.00 ▁▁▁▁▇
Temperature 14 0.99 65.97 1.38 63.60 65.20 65.60 66.40 76.20 ▇▃▁▁▁
Usage.cont 5 1.00 20.99 2.98 12.08 18.36 21.79 23.75 25.90 ▁▃▅▃▇
Carb.Flow 2 1.00 2468.35 1073.70 26.00 1144.00 3028.00 3186.00 5104.00 ▂▅▆▇▁
Density 1 1.00 1.17 0.38 0.24 0.90 0.98 1.62 1.92 ▁▅▇▂▆
MFR 212 0.92 704.05 73.90 31.40 706.30 724.00 731.00 868.60 ▁▁▁▂▇
Balling 1 1.00 2.20 0.93 -0.17 1.50 1.65 3.29 4.01 ▁▇▇▁▇
Pressure.Vacuum 0 1.00 -5.22 0.57 -6.60 -5.60 -5.40 -5.00 -3.60 ▂▇▆▂▁
PH 4 1.00 8.55 0.17 7.88 8.44 8.54 8.68 9.36 ▁▅▇▂▁
Oxygen.Filler 12 1.00 0.05 0.05 0.00 0.02 0.03 0.06 0.40 ▇▁▁▁▁
Bowl.Setpoint 2 1.00 109.33 15.30 70.00 100.00 120.00 120.00 140.00 ▁▂▃▇▁
Pressure.Setpoint 12 1.00 47.62 2.04 44.00 46.00 46.00 50.00 52.00 ▁▇▁▆▁
Air.Pressurer 0 1.00 142.83 1.21 140.80 142.20 142.60 143.00 148.20 ▅▇▁▁▁
Alch.Rel 9 1.00 6.90 0.51 5.28 6.54 6.56 7.24 8.62 ▁▇▂▃▁
Carb.Rel 10 1.00 5.44 0.13 4.96 5.34 5.40 5.54 6.06 ▁▇▇▂▁
Balling.Lvl 1 1.00 2.05 0.87 0.00 1.38 1.48 3.14 3.66 ▁▇▂▁▆
df %>% ggplot(aes(x=PH)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 4 rows containing non-finite values (stat_bin).

Drop rows where the label is missing

means <- skimr::skim(df %>% select(-c('PH')))[10]
df <- df %>% drop_na()

Split the data into training and testing

set.seed(72)
df_split <- initial_split(df, strata=PH)
df_train <- training(df_split)
df_test <- testing(df_split)

Data preprocessing

df_recipe <- recipe(PH ~., data=df_train) %>% 
  update_role(Brand.Code, new_role='code')

Build model specification

model_spec <- rand_forest(mtry = tune(), trees=1000, min_n=tune()) %>%
  set_mode('regression') %>%
  set_engine('ranger')

Create model workflow for the recipe and the model

train_wf <- workflow() %>%
  add_recipe(df_recipe) %>%
  add_model(model_spec)

Hyper parameter tuning

set.seed(42)
folds <- vfold_cv(df_train)

param_grid <- grid_regular(
  mtry(range=c(10,30)),
  min_n(range=c(1, 10)),
  levels=5
)

doParallel::registerDoParallel()
tune_params <- tune_grid(train_wf, resamples = folds, grid=param_grid)

View the relationship between mtry and min_n for rmse and rsq

tune_params %>% 
  collect_metrics() %>% 
  select(c('mtry', 'min_n', '.metric', 'mean')) %>% 
  pivot_longer(mtry:min_n, values_to = 'val', names_to='params') %>% 
  ggplot(aes(x=val, y=mean, color=params)) + 
  geom_line(show.legend=TRUE) + 
  facet_wrap(~.metric, scales='free_y')

Select the best parameters and update the model

best_params <- select_best(tune_params, 'rmse')

model <- finalize_model(
  model_spec,
  best_params
)

Determine the most important variables

library(vip)
## 
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
## 
##     vi
model %>% 
  set_engine('ranger', importance='permutation') %>%
  fit(PH~., data=juice(df_recipe %>% prep) %>% select(-c('Brand.Code'))) %>%
  vip(geom='col')

Finally fit on the testing data

final_wf <- workflow() %>% 
  add_recipe(df_recipe) %>%
  add_model(model)

final_result <- final_wf %>%
  last_fit(df_split)

How did we perform on the test set

final_result %>% collect_metrics()
## # A tibble: 2 x 4
##   .metric .estimator .estimate .config             
##   <chr>   <chr>          <dbl> <chr>               
## 1 rmse    standard       0.103 Preprocessor1_Model1
## 2 rsq     standard       0.682 Preprocessor1_Model1

Our predictions were about in line

final_result %>% collect_predictions() %>% ggplot(aes(x=.pred, y=PH)) + geom_point() + geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

final_result %>% 
  collect_predictions() %>%
  bind_cols(df_test)
## New names:
## * PH -> PH...4
## * PH -> PH...31
## # A tibble: 533 x 38
##    id    .pred  .row PH...4 .config Brand.Code Carb.Volume Fill.Ounces PC.Volume
##    <chr> <dbl> <int>  <dbl> <chr>   <chr>            <dbl>       <dbl>     <dbl>
##  1 trai…  8.57     1   8.26 Prepro… A                 5.49        24.3     0.111
##  2 trai…  8.56     8   8.4  Prepro… B                 5.27        23.9     0.334
##  3 trai…  8.59    12   8.44 Prepro… B                 5.37        24.1     0.261
##  4 trai…  8.46    18   8.42 Prepro… B                 5.36        24.0     0.278
##  5 trai…  8.43    21   8.44 Prepro… B                 5.39        24.1     0.212
##  6 trai…  8.46    25   8.38 Prepro… B                 5.36        24.1     0.155
##  7 trai…  8.47    30   8.28 Prepro… C                 5.27        24.0     0.311
##  8 trai…  8.52    34   8.42 Prepro… D                 5.49        24.1     0.203
##  9 trai…  8.49    37   8.4  Prepro… A                 5.43        24.1     0.239
## 10 trai…  8.50    38   8.5  Prepro… A                 5.44        24.2     0.251
## # … with 523 more rows, and 29 more variables: Carb.Pressure <dbl>,
## #   Carb.Temp <dbl>, PSC <dbl>, PSC.Fill <dbl>, PSC.CO2 <dbl>, Mnf.Flow <dbl>,
## #   Carb.Pressure1 <dbl>, Fill.Pressure <dbl>, Hyd.Pressure1 <dbl>,
## #   Hyd.Pressure2 <dbl>, Hyd.Pressure3 <dbl>, Hyd.Pressure4 <dbl>,
## #   Filler.Level <dbl>, Filler.Speed <dbl>, Temperature <dbl>,
## #   Usage.cont <dbl>, Carb.Flow <dbl>, Density <dbl>, MFR <dbl>, Balling <dbl>,
## #   Pressure.Vacuum <dbl>, PH...31 <dbl>, Oxygen.Filler <dbl>,
## #   Bowl.Setpoint <dbl>, Pressure.Setpoint <dbl>, Air.Pressurer <dbl>,
## #   Alch.Rel <dbl>, Carb.Rel <dbl>, Balling.Lvl <dbl>
final_model <- fit(final_wf, data=df_train)

Making prediction on testing set

testing_df <- read.csv(paste0(path, 'StudentEvaluation-TO_PREDICT.csv'))
testing_df <- testing_df %>% select(-c('PH'))

Fill missing values with mean from the training set

#missing_df <- data.frame(names=testing_df %>% colnames(), values=means %>% drop_na())

testing_df <- testing_df %>% replace_na(list(Carb.Volume=5.37019784,Fill.Ounces=23.97475457,PC.Volume=0.27711875,Carb.Pressure=68.18957547,Carb.Temp=141.09492338,PSC=0.08457368,PSC.Fill=0.19536892,PSC.CO2=0.05641390,Mnf.Flow=24.56893733,Carb.Pressure1=122.58637259,Fill.Pressure=47.92216556,Hyd.Pressure1=12.43757812,Hyd.Pressure2=20.96103286,Hyd.Pressure3=20.45845070,Hyd.Pressure4=96.28886265,Filler.Level=109.25237162,Filler.Speed=3687.19888624,Temperature=65.96754009,Usage.cont=20.99296181,Carb.Flow=2468.35422343,Density=1.17364981,MFR=704.04925816,Balling=2.19776965,Pressure.Vacuum=-5.21610268,Oxygen.Filler=0.04684259,Bowl.Setpoint=109.32658622,Pressure.Setpoint=47.61539664,Air.Pressurer=142.83399455,Alch.Rel=6.89741608,Carb.Rel=5.43678251,Balling.Lvl=2.05000778))

Make predictions

testing_df$PH = predict(final_model, testing_df)
write.csv(testing_df, 'StudenEvaluation-PREDICTED.csv')