library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.4 ✓ purrr 0.3.4
## ✓ tibble 3.1.2 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(tidymodels)
## Registered S3 method overwritten by 'tune':
## method from
## required_pkgs.model_spec parsnip
## ── Attaching packages ────────────────────────────────────── tidymodels 0.1.3 ──
## ✓ broom 0.7.7 ✓ rsample 0.1.0
## ✓ dials 0.0.9 ✓ tune 0.1.5
## ✓ infer 0.5.4 ✓ workflows 0.2.2
## ✓ modeldata 0.1.0 ✓ workflowsets 0.0.2
## ✓ parsnip 0.1.6 ✓ yardstick 0.0.8
## ✓ recipes 0.1.16
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## x scales::discard() masks purrr::discard()
## x dplyr::filter() masks stats::filter()
## x recipes::fixed() masks stringr::fixed()
## x dplyr::lag() masks stats::lag()
## x yardstick::spec() masks readr::spec()
## x recipes::step() masks stats::step()
## • Use tidymodels_prefer() to resolve common conflicts.
path = '/home/kenan/Documents/learning/masters/CUNY-SPS-Masters-DS/DATA_624/project/project1/'
df <- read.csv(paste0(path, 'StudentData-TO_MODEL.csv'))
Explore the data
skimr::skim(df)
| Name | df |
| Number of rows | 2571 |
| Number of columns | 33 |
| _______________________ | |
| Column type frequency: | |
| character | 1 |
| numeric | 32 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| Brand.Code | 0 | 1 | 0 | 1 | 120 | 5 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| Carb.Volume | 10 | 1.00 | 5.37 | 0.11 | 5.04 | 5.29 | 5.35 | 5.45 | 5.70 | ▁▆▇▅▁ |
| Fill.Ounces | 38 | 0.99 | 23.97 | 0.09 | 23.63 | 23.92 | 23.97 | 24.03 | 24.32 | ▁▂▇▂▁ |
| PC.Volume | 39 | 0.98 | 0.28 | 0.06 | 0.08 | 0.24 | 0.27 | 0.31 | 0.48 | ▁▃▇▂▁ |
| Carb.Pressure | 27 | 0.99 | 68.19 | 3.54 | 57.00 | 65.60 | 68.20 | 70.60 | 79.40 | ▁▅▇▃▁ |
| Carb.Temp | 26 | 0.99 | 141.09 | 4.04 | 128.60 | 138.40 | 140.80 | 143.80 | 154.00 | ▁▅▇▃▁ |
| PSC | 33 | 0.99 | 0.08 | 0.05 | 0.00 | 0.05 | 0.08 | 0.11 | 0.27 | ▆▇▃▁▁ |
| PSC.Fill | 23 | 0.99 | 0.20 | 0.12 | 0.00 | 0.10 | 0.18 | 0.26 | 0.62 | ▆▇▃▁▁ |
| PSC.CO2 | 39 | 0.98 | 0.06 | 0.04 | 0.00 | 0.02 | 0.04 | 0.08 | 0.24 | ▇▅▂▁▁ |
| Mnf.Flow | 2 | 1.00 | 24.57 | 119.48 | -100.20 | -100.00 | 65.20 | 140.80 | 229.40 | ▇▁▁▇▂ |
| Carb.Pressure1 | 32 | 0.99 | 122.59 | 4.74 | 105.60 | 119.00 | 123.20 | 125.40 | 140.20 | ▁▃▇▂▁ |
| Fill.Pressure | 22 | 0.99 | 47.92 | 3.18 | 34.60 | 46.00 | 46.40 | 50.00 | 60.40 | ▁▁▇▂▁ |
| Hyd.Pressure1 | 11 | 1.00 | 12.44 | 12.43 | -0.80 | 0.00 | 11.40 | 20.20 | 58.00 | ▇▅▂▁▁ |
| Hyd.Pressure2 | 15 | 0.99 | 20.96 | 16.39 | 0.00 | 0.00 | 28.60 | 34.60 | 59.40 | ▇▂▇▅▁ |
| Hyd.Pressure3 | 15 | 0.99 | 20.46 | 15.98 | -1.20 | 0.00 | 27.60 | 33.40 | 50.00 | ▇▁▃▇▁ |
| Hyd.Pressure4 | 30 | 0.99 | 96.29 | 13.12 | 52.00 | 86.00 | 96.00 | 102.00 | 142.00 | ▁▃▇▂▁ |
| Filler.Level | 20 | 0.99 | 109.25 | 15.70 | 55.80 | 98.30 | 118.40 | 120.00 | 161.20 | ▁▃▅▇▁ |
| Filler.Speed | 57 | 0.98 | 3687.20 | 770.82 | 998.00 | 3888.00 | 3982.00 | 3998.00 | 4030.00 | ▁▁▁▁▇ |
| Temperature | 14 | 0.99 | 65.97 | 1.38 | 63.60 | 65.20 | 65.60 | 66.40 | 76.20 | ▇▃▁▁▁ |
| Usage.cont | 5 | 1.00 | 20.99 | 2.98 | 12.08 | 18.36 | 21.79 | 23.75 | 25.90 | ▁▃▅▃▇ |
| Carb.Flow | 2 | 1.00 | 2468.35 | 1073.70 | 26.00 | 1144.00 | 3028.00 | 3186.00 | 5104.00 | ▂▅▆▇▁ |
| Density | 1 | 1.00 | 1.17 | 0.38 | 0.24 | 0.90 | 0.98 | 1.62 | 1.92 | ▁▅▇▂▆ |
| MFR | 212 | 0.92 | 704.05 | 73.90 | 31.40 | 706.30 | 724.00 | 731.00 | 868.60 | ▁▁▁▂▇ |
| Balling | 1 | 1.00 | 2.20 | 0.93 | -0.17 | 1.50 | 1.65 | 3.29 | 4.01 | ▁▇▇▁▇ |
| Pressure.Vacuum | 0 | 1.00 | -5.22 | 0.57 | -6.60 | -5.60 | -5.40 | -5.00 | -3.60 | ▂▇▆▂▁ |
| PH | 4 | 1.00 | 8.55 | 0.17 | 7.88 | 8.44 | 8.54 | 8.68 | 9.36 | ▁▅▇▂▁ |
| Oxygen.Filler | 12 | 1.00 | 0.05 | 0.05 | 0.00 | 0.02 | 0.03 | 0.06 | 0.40 | ▇▁▁▁▁ |
| Bowl.Setpoint | 2 | 1.00 | 109.33 | 15.30 | 70.00 | 100.00 | 120.00 | 120.00 | 140.00 | ▁▂▃▇▁ |
| Pressure.Setpoint | 12 | 1.00 | 47.62 | 2.04 | 44.00 | 46.00 | 46.00 | 50.00 | 52.00 | ▁▇▁▆▁ |
| Air.Pressurer | 0 | 1.00 | 142.83 | 1.21 | 140.80 | 142.20 | 142.60 | 143.00 | 148.20 | ▅▇▁▁▁ |
| Alch.Rel | 9 | 1.00 | 6.90 | 0.51 | 5.28 | 6.54 | 6.56 | 7.24 | 8.62 | ▁▇▂▃▁ |
| Carb.Rel | 10 | 1.00 | 5.44 | 0.13 | 4.96 | 5.34 | 5.40 | 5.54 | 6.06 | ▁▇▇▂▁ |
| Balling.Lvl | 1 | 1.00 | 2.05 | 0.87 | 0.00 | 1.38 | 1.48 | 3.14 | 3.66 | ▁▇▂▁▆ |
df %>% ggplot(aes(x=PH)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 4 rows containing non-finite values (stat_bin).
Drop rows where the label is missing
means <- skimr::skim(df %>% select(-c('PH')))[10]
df <- df %>% drop_na()
Split the data into training and testing
set.seed(72)
df_split <- initial_split(df, strata=PH)
df_train <- training(df_split)
df_test <- testing(df_split)
Data preprocessing
df_recipe <- recipe(PH ~., data=df_train) %>%
update_role(Brand.Code, new_role='code')
Build model specification
model_spec <- rand_forest(mtry = tune(), trees=1000, min_n=tune()) %>%
set_mode('regression') %>%
set_engine('ranger')
Create model workflow for the recipe and the model
train_wf <- workflow() %>%
add_recipe(df_recipe) %>%
add_model(model_spec)
Hyper parameter tuning
set.seed(42)
folds <- vfold_cv(df_train)
param_grid <- grid_regular(
mtry(range=c(10,30)),
min_n(range=c(1, 10)),
levels=5
)
doParallel::registerDoParallel()
tune_params <- tune_grid(train_wf, resamples = folds, grid=param_grid)
View the relationship between mtry and min_n for rmse and rsq
tune_params %>%
collect_metrics() %>%
select(c('mtry', 'min_n', '.metric', 'mean')) %>%
pivot_longer(mtry:min_n, values_to = 'val', names_to='params') %>%
ggplot(aes(x=val, y=mean, color=params)) +
geom_line(show.legend=TRUE) +
facet_wrap(~.metric, scales='free_y')
Select the best parameters and update the model
best_params <- select_best(tune_params, 'rmse')
model <- finalize_model(
model_spec,
best_params
)
Determine the most important variables
library(vip)
##
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
##
## vi
model %>%
set_engine('ranger', importance='permutation') %>%
fit(PH~., data=juice(df_recipe %>% prep) %>% select(-c('Brand.Code'))) %>%
vip(geom='col')
Finally fit on the testing data
final_wf <- workflow() %>%
add_recipe(df_recipe) %>%
add_model(model)
final_result <- final_wf %>%
last_fit(df_split)
How did we perform on the test set
final_result %>% collect_metrics()
## # A tibble: 2 x 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 rmse standard 0.103 Preprocessor1_Model1
## 2 rsq standard 0.682 Preprocessor1_Model1
Our predictions were about in line
final_result %>% collect_predictions() %>% ggplot(aes(x=.pred, y=PH)) + geom_point() + geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
final_result %>%
collect_predictions() %>%
bind_cols(df_test)
## New names:
## * PH -> PH...4
## * PH -> PH...31
## # A tibble: 533 x 38
## id .pred .row PH...4 .config Brand.Code Carb.Volume Fill.Ounces PC.Volume
## <chr> <dbl> <int> <dbl> <chr> <chr> <dbl> <dbl> <dbl>
## 1 trai… 8.57 1 8.26 Prepro… A 5.49 24.3 0.111
## 2 trai… 8.56 8 8.4 Prepro… B 5.27 23.9 0.334
## 3 trai… 8.59 12 8.44 Prepro… B 5.37 24.1 0.261
## 4 trai… 8.46 18 8.42 Prepro… B 5.36 24.0 0.278
## 5 trai… 8.43 21 8.44 Prepro… B 5.39 24.1 0.212
## 6 trai… 8.46 25 8.38 Prepro… B 5.36 24.1 0.155
## 7 trai… 8.47 30 8.28 Prepro… C 5.27 24.0 0.311
## 8 trai… 8.52 34 8.42 Prepro… D 5.49 24.1 0.203
## 9 trai… 8.49 37 8.4 Prepro… A 5.43 24.1 0.239
## 10 trai… 8.50 38 8.5 Prepro… A 5.44 24.2 0.251
## # … with 523 more rows, and 29 more variables: Carb.Pressure <dbl>,
## # Carb.Temp <dbl>, PSC <dbl>, PSC.Fill <dbl>, PSC.CO2 <dbl>, Mnf.Flow <dbl>,
## # Carb.Pressure1 <dbl>, Fill.Pressure <dbl>, Hyd.Pressure1 <dbl>,
## # Hyd.Pressure2 <dbl>, Hyd.Pressure3 <dbl>, Hyd.Pressure4 <dbl>,
## # Filler.Level <dbl>, Filler.Speed <dbl>, Temperature <dbl>,
## # Usage.cont <dbl>, Carb.Flow <dbl>, Density <dbl>, MFR <dbl>, Balling <dbl>,
## # Pressure.Vacuum <dbl>, PH...31 <dbl>, Oxygen.Filler <dbl>,
## # Bowl.Setpoint <dbl>, Pressure.Setpoint <dbl>, Air.Pressurer <dbl>,
## # Alch.Rel <dbl>, Carb.Rel <dbl>, Balling.Lvl <dbl>
final_model <- fit(final_wf, data=df_train)
Making prediction on testing set
testing_df <- read.csv(paste0(path, 'StudentEvaluation-TO_PREDICT.csv'))
testing_df <- testing_df %>% select(-c('PH'))
Fill missing values with mean from the training set
#missing_df <- data.frame(names=testing_df %>% colnames(), values=means %>% drop_na())
testing_df <- testing_df %>% replace_na(list(Carb.Volume=5.37019784,Fill.Ounces=23.97475457,PC.Volume=0.27711875,Carb.Pressure=68.18957547,Carb.Temp=141.09492338,PSC=0.08457368,PSC.Fill=0.19536892,PSC.CO2=0.05641390,Mnf.Flow=24.56893733,Carb.Pressure1=122.58637259,Fill.Pressure=47.92216556,Hyd.Pressure1=12.43757812,Hyd.Pressure2=20.96103286,Hyd.Pressure3=20.45845070,Hyd.Pressure4=96.28886265,Filler.Level=109.25237162,Filler.Speed=3687.19888624,Temperature=65.96754009,Usage.cont=20.99296181,Carb.Flow=2468.35422343,Density=1.17364981,MFR=704.04925816,Balling=2.19776965,Pressure.Vacuum=-5.21610268,Oxygen.Filler=0.04684259,Bowl.Setpoint=109.32658622,Pressure.Setpoint=47.61539664,Air.Pressurer=142.83399455,Alch.Rel=6.89741608,Carb.Rel=5.43678251,Balling.Lvl=2.05000778))
Make predictions
testing_df$PH = predict(final_model, testing_df)
write.csv(testing_df, 'StudenEvaluation-PREDICTED.csv')