Load packages

library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.6.2
## -- Attaching packages ---------------------------------------------------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.2.1     v purrr   0.3.2
## v tibble  2.1.3     v dplyr   0.8.3
## v tidyr   1.0.2     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0
## Warning: package 'tidyr' was built under R version 3.6.2
## -- Conflicts ------------------------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(tidymodels)
## -- Attaching packages --------------------------------------------------------------------------------------------------------------------------- tidymodels 0.0.3 --
## v broom     0.5.4     v recipes   0.1.9
## v dials     0.0.4     v rsample   0.0.5
## v infer     0.5.1     v yardstick 0.0.5
## v parsnip   0.0.5
## Warning: package 'broom' was built under R version 3.6.2
## Warning: package 'parsnip' was built under R version 3.6.2
## Warning: package 'recipes' was built under R version 3.6.2
## Warning: package 'yardstick' was built under R version 3.6.2
## -- Conflicts ------------------------------------------------------------------------------------------------------------------------------ tidymodels_conflicts() --
## x scales::discard() masks purrr::discard()
## x dplyr::filter()   masks stats::filter()
## x recipes::fixed()  masks stringr::fixed()
## x dplyr::lag()      masks stats::lag()
## x dials::margin()   masks ggplot2::margin()
## x yardstick::spec() masks readr::spec()
## x recipes::step()   masks stats::step()
library(janitor)
## Warning: package 'janitor' was built under R version 3.6.2
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test

Load data

df <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-10-15/big_epa_cars.csv")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   drive = col_character(),
##   eng_dscr = col_character(),
##   fuelType = col_character(),
##   fuelType1 = col_character(),
##   make = col_character(),
##   model = col_character(),
##   mpgData = col_character(),
##   phevBlended = col_logical(),
##   trany = col_character(),
##   VClass = col_character(),
##   guzzler = col_logical(),
##   trans_dscr = col_character(),
##   tCharger = col_logical(),
##   sCharger = col_character(),
##   atvType = col_character(),
##   fuelType2 = col_logical(),
##   rangeA = col_logical(),
##   evMotor = col_logical(),
##   mfrCode = col_logical(),
##   c240Dscr = col_logical()
##   # ... with 4 more columns
## )
## See spec(...) for full column specifications.
## Warning: 26930 parsing failures.
##  row     col           expected actual                                                                                                         file
## 4430 guzzler 1/0/T/F/TRUE/FALSE      G 'https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-10-15/big_epa_cars.csv'
## 4431 guzzler 1/0/T/F/TRUE/FALSE      G 'https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-10-15/big_epa_cars.csv'
## 4432 guzzler 1/0/T/F/TRUE/FALSE      G 'https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-10-15/big_epa_cars.csv'
## 4433 guzzler 1/0/T/F/TRUE/FALSE      G 'https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-10-15/big_epa_cars.csv'
## 4442 guzzler 1/0/T/F/TRUE/FALSE      G 'https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-10-15/big_epa_cars.csv'
## .... ....... .................. ...... ............................................................................................................
## See problems(...) for more details.

Remove a lot of excess data

cars <- df %>% select(make, model, year, VClass, trany, city08, comb08, co2TailpipeGpm, cylinders, displ, drive, fuelType, highway08, guzzler) %>% 
  #remove co2 output = 0, this will be our target
  filter(co2TailpipeGpm != 0) %>% 
  mutate(cylinders = as_factor(cylinders))

Split the data

set.seed(999)

cars_split <- initial_split(cars, strata = co2TailpipeGpm)

cars_training <- training(cars_split)

cars_testing <- testing(cars_split)

EDA

Let’s look at the distribution of the training data

cars_training %>% 
  select_if(is_numeric) %>% 
  select(-cylinders) %>% 
  pivot_longer(everything(),names_to = "names", values_to = "values") %>% 
  ggplot(aes(values)) +
  geom_histogram() + 
  facet_wrap(~names, scales="free") 
## Warning: Deprecated

## Warning: Deprecated

## Warning: Deprecated

## Warning: Deprecated

## Warning: Deprecated

## Warning: Deprecated

## Warning: Deprecated

## Warning: Deprecated

## Warning: Deprecated

## Warning: Deprecated

## Warning: Deprecated

## Warning: Deprecated

## Warning: Deprecated

## Warning: Deprecated
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1 rows containing non-finite values (stat_bin).

Remove non-numeric to simplify model this time

cars_training_final <- cars_training %>% select_if(is_numeric)
## Warning: Deprecated

## Warning: Deprecated

## Warning: Deprecated

## Warning: Deprecated

## Warning: Deprecated

## Warning: Deprecated

## Warning: Deprecated

## Warning: Deprecated

## Warning: Deprecated

## Warning: Deprecated

## Warning: Deprecated

## Warning: Deprecated

## Warning: Deprecated

## Warning: Deprecated
cars_testing_final <- cars_testing %>% select_if(is_numeric)
## Warning: Deprecated

## Warning: Deprecated

## Warning: Deprecated

## Warning: Deprecated

## Warning: Deprecated

## Warning: Deprecated

## Warning: Deprecated

## Warning: Deprecated

## Warning: Deprecated

## Warning: Deprecated

## Warning: Deprecated

## Warning: Deprecated

## Warning: Deprecated

## Warning: Deprecated

Prepare a recipe

Impute Individual transformations for skewness and other issues Discretize (if needed and if you have no other choice) Create dummy variables Create interactions Normalization steps (center, scale, range, etc) Multivariate transformation (e.g. PCA, spatial sign, etc)

processed <- recipe(co2TailpipeGpm ~., data =  cars_training_final) %>%
  step_knnimpute(all_predictors()) %>% 
  step_dummy(cylinders) %>% 
  step_BoxCox(-co2TailpipeGpm) %>% 
  step_normalize(-co2TailpipeGpm) %>% 
  prep()

juiced_df <- juice(processed)

Visualise the juiced df

juiced_df %>% 
  select_if(is_numeric) %>%
  select(-starts_with("cylinders")) %>% 
  pivot_longer(everything(),names_to = "names", values_to = "values") %>% 
  ggplot(aes(values)) +
  geom_histogram() + 
  facet_wrap(~names, scales="free")
## Warning: Deprecated

## Warning: Deprecated

## Warning: Deprecated

## Warning: Deprecated

## Warning: Deprecated

## Warning: Deprecated

## Warning: Deprecated

## Warning: Deprecated

## Warning: Deprecated

## Warning: Deprecated

## Warning: Deprecated

## Warning: Deprecated

## Warning: Deprecated

## Warning: Deprecated
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Bake the test data

baked_df <- bake(processed, cars_testing_final)

Create a random forest spec

rf_spec <- rand_forest() %>% 
  set_engine("ranger") %>% 
  set_mode("regression")

Fit the model to training

rf_fit <- rf_spec %>% 
  fit(co2TailpipeGpm ~., data = juiced_df)

rf_training_results <- rf_fit %>% 
                        predict(juiced_df) 

  rf_training_results %>% 
  cbind(juiced_df$co2TailpipeGpm) %>% 
  ggplot(aes(juiced_df$co2TailpipeGpm, .pred)) +
  geom_point(aes(alpha = 0.1)) +
  geom_abline() +
  xlab("Actual") +
  ylab("Predicted")  

  rf_training_results %>% 
  bind_cols(juiced_df) %>% 
    metrics(truth = co2TailpipeGpm, estimate = .pred)
## # A tibble: 3 x 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard      11.4  
## 2 rsq     standard       0.992
## 3 mae     standard       6.07
  #  .metric .estimator .estimate
  # <chr>   <chr>          <dbl>
# 1 rmse    standard      11.4  
# 2 rsq     standard       0.992
# 3 mae     standard       6.07 

Fit the model to testing

rf_testing_results <- rf_fit %>% 
                        predict(baked_df) 

  rf_testing_results %>% 
  cbind(baked_df$co2TailpipeGpm) %>% 
  ggplot(aes(baked_df$co2TailpipeGpm, .pred)) +
  geom_point(aes(alpha = 0.1)) +
  geom_abline() +
  xlab("Actual") +
  ylab("Predicted") 

  rf_testing_results %>% 
  bind_cols(baked_df) %>% 
    metrics(truth = co2TailpipeGpm, estimate = .pred)
## # A tibble: 3 x 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard      13.6  
## 2 rsq     standard       0.989
## 3 mae     standard       6.55
  #  .metric .estimator .estimate
  # <chr>   <chr>          <dbl>
# 1 rmse    standard      13.6  
# 2 rsq     standard       0.989
# 3 mae     standard       6.55 

Use cross validation to evaulate the model on 10 folds

(this doesn’t tune the model)

juiced_df_folds <- vfold_cv(juiced_df)

# the fit resamples function is part of tune which doesn't seem to be loaded by tidyverse? (currently)
library(tune)

rf_fit_res <- fit_resamples(co2TailpipeGpm ~.,
              rf_spec,
              juiced_df_folds)

rf_fit_res %>% collect_metrics()
## # A tibble: 2 x 5
##   .metric .estimator   mean     n  std_err
##   <chr>   <chr>       <dbl> <int>    <dbl>
## 1 rmse    standard   12.3      10 0.429   
## 2 rsq     standard    0.990    10 0.000665
#  .metric .estimator   mean     n  std_err
#  <chr>   <chr>       <dbl> <int>    <dbl>
# 1 rmse    standard   12.1      10 0.195   
# 2 rsq     standard    0.991    10 0.000318

Use cross validation to tune a model

#create a new spec that can accept tuned values for mtry and min_n
new_spec <- rand_forest(mode = "regression",
                        mtry = tune(),
                        min_n = tune()) %>% 
                        set_engine("ranger")

# grid tune using this new spec on folds
tuned_model <- tune_grid(co2TailpipeGpm ~.,
                model = new_spec,
                resamples = juiced_df_folds)
## i Creating pre-processing data to finalize unknown parameter: mtry
# select the best hyperparamets from the different models, in this case by rmse
show_best(tuned_model, metric = "rmse", maximize = F)
## # A tibble: 5 x 7
##    mtry min_n .metric .estimator  mean     n std_err
##   <int> <int> <chr>   <chr>      <dbl> <int>   <dbl>
## 1     7     6 rmse    standard    7.90    10   0.572
## 2     9     5 rmse    standard    7.92    10   0.564
## 3    10    11 rmse    standard    8.27    10   0.547
## 4     6    21 rmse    standard    8.64    10   0.521
## 5     7    28 rmse    standard    8.90    10   0.510
autoplot(tuned_model, metric = "rmse")

best_settings <- select_best(tuned_model, metric = "rmse", maximize = F)

# apply the best hyperprameters (selected above) to the new spec with open values
tuned_spec <- finalize_model(new_spec, best_settings)

# use the finalised tuned spec to fit the juiced (training) data
tuned_fit <- tuned_spec %>% 
             fit(co2TailpipeGpm ~., data = juiced_df)

Predict using the tuned fit

tuned_training_results <- tuned_fit %>% 
                        predict(juiced_df) 


tuned_training_results %>% 
  cbind(juiced_df$co2TailpipeGpm) %>% 
  ggplot(aes(juiced_df$co2TailpipeGpm, .pred)) +
  geom_point(aes(alpha = 0.1)) +
  geom_abline() +
  xlab("Actual") +
  ylab("Predicted")  

  tuned_training_results %>% 
  bind_cols(juiced_df) %>% 
    metrics(truth = co2TailpipeGpm, estimate = .pred)
## # A tibble: 3 x 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard       5.18 
## 2 rsq     standard       0.998
## 3 mae     standard       1.25
#    .metric .estimator .estimate
#  <chr>   <chr>          <dbl>
# 1 rmse    standard       6.45 
# 2 rsq     standard       0.997
# 3 mae     standard       2.06 
  
  
  
tuned_testing_results <- tuned_fit %>% 
                        predict(baked_df) 


tuned_testing_results %>% 
  cbind(baked_df$co2TailpipeGpm) %>% 
  ggplot(aes(baked_df$co2TailpipeGpm, .pred)) +
  geom_point(aes(alpha = 0.1)) +
  geom_abline() +
  xlab("Actual") +
  ylab("Predicted")  

  tuned_testing_results %>% 
  bind_cols(baked_df) %>% 
    metrics(truth = co2TailpipeGpm, estimate = .pred)
## # A tibble: 3 x 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard       9.74 
## 2 rsq     standard       0.993
## 3 mae     standard       2.06
#   .metric .estimator .estimate
#  <chr>   <chr>          <dbl>
# 1 rmse    standard      10.1  
# 2 rsq     standard       0.993
# 3 mae     standard       2.69