Project 2

library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──

## ✓ ggplot2 3.3.4     ✓ purrr   0.3.4
## ✓ tibble  3.1.2     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(tidymodels)

## Registered S3 method overwritten by 'tune':
##   method                   from   
##   required_pkgs.model_spec parsnip

## ── Attaching packages ────────────────────────────────────── tidymodels 0.1.3 ──

## ✓ broom        0.7.7      ✓ rsample      0.1.0 
## ✓ dials        0.0.9      ✓ tune         0.1.5 
## ✓ infer        0.5.4      ✓ workflows    0.2.2 
## ✓ modeldata    0.1.0      ✓ workflowsets 0.0.2 
## ✓ parsnip      0.1.6      ✓ yardstick    0.0.8 
## ✓ recipes      0.1.16

## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## x scales::discard() masks purrr::discard()
## x dplyr::filter()   masks stats::filter()
## x recipes::fixed()  masks stringr::fixed()
## x dplyr::lag()      masks stats::lag()
## x yardstick::spec() masks readr::spec()
## x recipes::step()   masks stats::step()
## • Use tidymodels_prefer() to resolve common conflicts.

path = '/home/kenan/Documents/learning/masters/CUNY-SPS-Masters-DS/DATA_624/project/project1/'
df <- read.csv(paste0(path, 'StudentData-TO_MODEL.csv'))

Explore the data

skimr::skim(df)

Data summary
Name	df
Number of rows	2571
Number of columns	33
_______________________
Column type frequency:
character	1
numeric	32
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	empty	n_unique	whitespace
Brand.Code	0	1	0	1	120	5	0

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
Carb.Volume	10	1.00	5.37	0.11	5.04	5.29	5.35	5.45	5.70	▁▆▇▅▁
Fill.Ounces	38	0.99	23.97	0.09	23.63	23.92	23.97	24.03	24.32	▁▂▇▂▁
PC.Volume	39	0.98	0.28	0.06	0.08	0.24	0.27	0.31	0.48	▁▃▇▂▁
Carb.Pressure	27	0.99	68.19	3.54	57.00	65.60	68.20	70.60	79.40	▁▅▇▃▁
Carb.Temp	26	0.99	141.09	4.04	128.60	138.40	140.80	143.80	154.00	▁▅▇▃▁
PSC	33	0.99	0.08	0.05	0.00	0.05	0.08	0.11	0.27	▆▇▃▁▁
PSC.Fill	23	0.99	0.20	0.12	0.00	0.10	0.18	0.26	0.62	▆▇▃▁▁
PSC.CO2	39	0.98	0.06	0.04	0.00	0.02	0.04	0.08	0.24	▇▅▂▁▁
Mnf.Flow	2	1.00	24.57	119.48	-100.20	-100.00	65.20	140.80	229.40	▇▁▁▇▂
Carb.Pressure1	32	0.99	122.59	4.74	105.60	119.00	123.20	125.40	140.20	▁▃▇▂▁
Fill.Pressure	22	0.99	47.92	3.18	34.60	46.00	46.40	50.00	60.40	▁▁▇▂▁
Hyd.Pressure1	11	1.00	12.44	12.43	-0.80	0.00	11.40	20.20	58.00	▇▅▂▁▁
Hyd.Pressure2	15	0.99	20.96	16.39	0.00	0.00	28.60	34.60	59.40	▇▂▇▅▁
Hyd.Pressure3	15	0.99	20.46	15.98	-1.20	0.00	27.60	33.40	50.00	▇▁▃▇▁
Hyd.Pressure4	30	0.99	96.29	13.12	52.00	86.00	96.00	102.00	142.00	▁▃▇▂▁
Filler.Level	20	0.99	109.25	15.70	55.80	98.30	118.40	120.00	161.20	▁▃▅▇▁
Filler.Speed	57	0.98	3687.20	770.82	998.00	3888.00	3982.00	3998.00	4030.00	▁▁▁▁▇
Temperature	14	0.99	65.97	1.38	63.60	65.20	65.60	66.40	76.20	▇▃▁▁▁
Usage.cont	5	1.00	20.99	2.98	12.08	18.36	21.79	23.75	25.90	▁▃▅▃▇
Carb.Flow	2	1.00	2468.35	1073.70	26.00	1144.00	3028.00	3186.00	5104.00	▂▅▆▇▁
Density	1	1.00	1.17	0.38	0.24	0.90	0.98	1.62	1.92	▁▅▇▂▆
MFR	212	0.92	704.05	73.90	31.40	706.30	724.00	731.00	868.60	▁▁▁▂▇
Balling	1	1.00	2.20	0.93	-0.17	1.50	1.65	3.29	4.01	▁▇▇▁▇
Pressure.Vacuum	0	1.00	-5.22	0.57	-6.60	-5.60	-5.40	-5.00	-3.60	▂▇▆▂▁
PH	4	1.00	8.55	0.17	7.88	8.44	8.54	8.68	9.36	▁▅▇▂▁
Oxygen.Filler	12	1.00	0.05	0.05	0.00	0.02	0.03	0.06	0.40	▇▁▁▁▁
Bowl.Setpoint	2	1.00	109.33	15.30	70.00	100.00	120.00	120.00	140.00	▁▂▃▇▁
Pressure.Setpoint	12	1.00	47.62	2.04	44.00	46.00	46.00	50.00	52.00	▁▇▁▆▁
Air.Pressurer	0	1.00	142.83	1.21	140.80	142.20	142.60	143.00	148.20	▅▇▁▁▁
Alch.Rel	9	1.00	6.90	0.51	5.28	6.54	6.56	7.24	8.62	▁▇▂▃▁
Carb.Rel	10	1.00	5.44	0.13	4.96	5.34	5.40	5.54	6.06	▁▇▇▂▁
Balling.Lvl	1	1.00	2.05	0.87	0.00	1.38	1.48	3.14	3.66	▁▇▂▁▆

df %>% ggplot(aes(x=PH)) + geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 4 rows containing non-finite values (stat_bin).

Drop rows where the label is missing

means <- skimr::skim(df %>% select(-c('PH')))[10]
df <- df %>% drop_na()

Split the data into training and testing

set.seed(72)
df_split <- initial_split(df, strata=PH)
df_train <- training(df_split)
df_test <- testing(df_split)

Data preprocessing

df_recipe <- recipe(PH ~., data=df_train) %>% 
  update_role(Brand.Code, new_role='code')

Build model specification

model_spec <- rand_forest(mtry = tune(), trees=1000, min_n=tune()) %>%
  set_mode('regression') %>%
  set_engine('ranger')

Create model workflow for the recipe and the model

train_wf <- workflow() %>%
  add_recipe(df_recipe) %>%
  add_model(model_spec)

Hyper parameter tuning

set.seed(42)
folds <- vfold_cv(df_train)

param_grid <- grid_regular(
  mtry(range=c(10,30)),
  min_n(range=c(1, 10)),
  levels=5
)

doParallel::registerDoParallel()
tune_params <- tune_grid(train_wf, resamples = folds, grid=param_grid)

View the relationship between mtry and min_n for rmse and rsq

tune_params %>% 
  collect_metrics() %>% 
  select(c('mtry', 'min_n', '.metric', 'mean')) %>% 
  pivot_longer(mtry:min_n, values_to = 'val', names_to='params') %>% 
  ggplot(aes(x=val, y=mean, color=params)) + 
  geom_line(show.legend=TRUE) + 
  facet_wrap(~.metric, scales='free_y')

Select the best parameters and update the model

best_params <- select_best(tune_params, 'rmse')

model <- finalize_model(
  model_spec,
  best_params
)

Determine the most important variables

library(vip)

## 
## Attaching package: 'vip'

## The following object is masked from 'package:utils':
## 
##     vi

model %>% 
  set_engine('ranger', importance='permutation') %>%
  fit(PH~., data=juice(df_recipe %>% prep) %>% select(-c('Brand.Code'))) %>%
  vip(geom='col')

Finally fit on the testing data

final_wf <- workflow() %>% 
  add_recipe(df_recipe) %>%
  add_model(model)

final_result <- final_wf %>%
  last_fit(df_split)

How did we perform on the test set

final_result %>% collect_metrics()

## # A tibble: 2 x 4
##   .metric .estimator .estimate .config             
##   <chr>   <chr>          <dbl> <chr>               
## 1 rmse    standard       0.103 Preprocessor1_Model1
## 2 rsq     standard       0.682 Preprocessor1_Model1

Our predictions were about in line

final_result %>% collect_predictions() %>% ggplot(aes(x=.pred, y=PH)) + geom_point() + geom_smooth()

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

final_result %>% 
  collect_predictions() %>%
  bind_cols(df_test)

## New names:
## * PH -> PH...4
## * PH -> PH...31

## # A tibble: 533 x 38
##    id    .pred  .row PH...4 .config Brand.Code Carb.Volume Fill.Ounces PC.Volume
##    <chr> <dbl> <int>  <dbl> <chr>   <chr>            <dbl>       <dbl>     <dbl>
##  1 trai…  8.57     1   8.26 Prepro… A                 5.49        24.3     0.111
##  2 trai…  8.56     8   8.4  Prepro… B                 5.27        23.9     0.334
##  3 trai…  8.59    12   8.44 Prepro… B                 5.37        24.1     0.261
##  4 trai…  8.46    18   8.42 Prepro… B                 5.36        24.0     0.278
##  5 trai…  8.43    21   8.44 Prepro… B                 5.39        24.1     0.212
##  6 trai…  8.46    25   8.38 Prepro… B                 5.36        24.1     0.155
##  7 trai…  8.47    30   8.28 Prepro… C                 5.27        24.0     0.311
##  8 trai…  8.52    34   8.42 Prepro… D                 5.49        24.1     0.203
##  9 trai…  8.49    37   8.4  Prepro… A                 5.43        24.1     0.239
## 10 trai…  8.50    38   8.5  Prepro… A                 5.44        24.2     0.251
## # … with 523 more rows, and 29 more variables: Carb.Pressure <dbl>,
## #   Carb.Temp <dbl>, PSC <dbl>, PSC.Fill <dbl>, PSC.CO2 <dbl>, Mnf.Flow <dbl>,
## #   Carb.Pressure1 <dbl>, Fill.Pressure <dbl>, Hyd.Pressure1 <dbl>,
## #   Hyd.Pressure2 <dbl>, Hyd.Pressure3 <dbl>, Hyd.Pressure4 <dbl>,
## #   Filler.Level <dbl>, Filler.Speed <dbl>, Temperature <dbl>,
## #   Usage.cont <dbl>, Carb.Flow <dbl>, Density <dbl>, MFR <dbl>, Balling <dbl>,
## #   Pressure.Vacuum <dbl>, PH...31 <dbl>, Oxygen.Filler <dbl>,
## #   Bowl.Setpoint <dbl>, Pressure.Setpoint <dbl>, Air.Pressurer <dbl>,
## #   Alch.Rel <dbl>, Carb.Rel <dbl>, Balling.Lvl <dbl>

final_model <- fit(final_wf, data=df_train)

Making prediction on testing set

testing_df <- read.csv(paste0(path, 'StudentEvaluation-TO_PREDICT.csv'))
testing_df <- testing_df %>% select(-c('PH'))

Fill missing values with mean from the training set

#missing_df <- data.frame(names=testing_df %>% colnames(), values=means %>% drop_na())

testing_df <- testing_df %>% replace_na(list(Carb.Volume=5.37019784,Fill.Ounces=23.97475457,PC.Volume=0.27711875,Carb.Pressure=68.18957547,Carb.Temp=141.09492338,PSC=0.08457368,PSC.Fill=0.19536892,PSC.CO2=0.05641390,Mnf.Flow=24.56893733,Carb.Pressure1=122.58637259,Fill.Pressure=47.92216556,Hyd.Pressure1=12.43757812,Hyd.Pressure2=20.96103286,Hyd.Pressure3=20.45845070,Hyd.Pressure4=96.28886265,Filler.Level=109.25237162,Filler.Speed=3687.19888624,Temperature=65.96754009,Usage.cont=20.99296181,Carb.Flow=2468.35422343,Density=1.17364981,MFR=704.04925816,Balling=2.19776965,Pressure.Vacuum=-5.21610268,Oxygen.Filler=0.04684259,Bowl.Setpoint=109.32658622,Pressure.Setpoint=47.61539664,Air.Pressurer=142.83399455,Alch.Rel=6.89741608,Carb.Rel=5.43678251,Balling.Lvl=2.05000778))

Make predictions

testing_df$PH = predict(final_model, testing_df)
write.csv(testing_df, 'StudenEvaluation-PREDICTED.csv')

Project 2

Kenan Sooklall

7/8/2021