library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.6.2
## -- Attaching packages ---------------------------------------------------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.2.1 v purrr 0.3.2
## v tibble 2.1.3 v dplyr 0.8.3
## v tidyr 1.0.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## Warning: package 'tidyr' was built under R version 3.6.2
## -- Conflicts ------------------------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(tidymodels)
## -- Attaching packages --------------------------------------------------------------------------------------------------------------------------- tidymodels 0.0.3 --
## v broom 0.5.4 v recipes 0.1.9
## v dials 0.0.4 v rsample 0.0.5
## v infer 0.5.1 v yardstick 0.0.5
## v parsnip 0.0.5
## Warning: package 'broom' was built under R version 3.6.2
## Warning: package 'parsnip' was built under R version 3.6.2
## Warning: package 'recipes' was built under R version 3.6.2
## Warning: package 'yardstick' was built under R version 3.6.2
## -- Conflicts ------------------------------------------------------------------------------------------------------------------------------ tidymodels_conflicts() --
## x scales::discard() masks purrr::discard()
## x dplyr::filter() masks stats::filter()
## x recipes::fixed() masks stringr::fixed()
## x dplyr::lag() masks stats::lag()
## x dials::margin() masks ggplot2::margin()
## x yardstick::spec() masks readr::spec()
## x recipes::step() masks stats::step()
library(janitor)
## Warning: package 'janitor' was built under R version 3.6.2
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
df <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-10-15/big_epa_cars.csv")
## Parsed with column specification:
## cols(
## .default = col_double(),
## drive = col_character(),
## eng_dscr = col_character(),
## fuelType = col_character(),
## fuelType1 = col_character(),
## make = col_character(),
## model = col_character(),
## mpgData = col_character(),
## phevBlended = col_logical(),
## trany = col_character(),
## VClass = col_character(),
## guzzler = col_logical(),
## trans_dscr = col_character(),
## tCharger = col_logical(),
## sCharger = col_character(),
## atvType = col_character(),
## fuelType2 = col_logical(),
## rangeA = col_logical(),
## evMotor = col_logical(),
## mfrCode = col_logical(),
## c240Dscr = col_logical()
## # ... with 4 more columns
## )
## See spec(...) for full column specifications.
## Warning: 26930 parsing failures.
## row col expected actual file
## 4430 guzzler 1/0/T/F/TRUE/FALSE G 'https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-10-15/big_epa_cars.csv'
## 4431 guzzler 1/0/T/F/TRUE/FALSE G 'https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-10-15/big_epa_cars.csv'
## 4432 guzzler 1/0/T/F/TRUE/FALSE G 'https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-10-15/big_epa_cars.csv'
## 4433 guzzler 1/0/T/F/TRUE/FALSE G 'https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-10-15/big_epa_cars.csv'
## 4442 guzzler 1/0/T/F/TRUE/FALSE G 'https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-10-15/big_epa_cars.csv'
## .... ....... .................. ...... ............................................................................................................
## See problems(...) for more details.
cars <- df %>% select(make, model, year, VClass, trany, city08, comb08, co2TailpipeGpm, cylinders, displ, drive, fuelType, highway08, guzzler) %>%
#remove co2 output = 0, this will be our target
filter(co2TailpipeGpm != 0) %>%
mutate(cylinders = as_factor(cylinders))
set.seed(999)
cars_split <- initial_split(cars, strata = co2TailpipeGpm)
cars_training <- training(cars_split)
cars_testing <- testing(cars_split)
Let’s look at the distribution of the training data
cars_training %>%
select_if(is_numeric) %>%
select(-cylinders) %>%
pivot_longer(everything(),names_to = "names", values_to = "values") %>%
ggplot(aes(values)) +
geom_histogram() +
facet_wrap(~names, scales="free")
## Warning: Deprecated
## Warning: Deprecated
## Warning: Deprecated
## Warning: Deprecated
## Warning: Deprecated
## Warning: Deprecated
## Warning: Deprecated
## Warning: Deprecated
## Warning: Deprecated
## Warning: Deprecated
## Warning: Deprecated
## Warning: Deprecated
## Warning: Deprecated
## Warning: Deprecated
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1 rows containing non-finite values (stat_bin).
cars_training_final <- cars_training %>% select_if(is_numeric)
## Warning: Deprecated
## Warning: Deprecated
## Warning: Deprecated
## Warning: Deprecated
## Warning: Deprecated
## Warning: Deprecated
## Warning: Deprecated
## Warning: Deprecated
## Warning: Deprecated
## Warning: Deprecated
## Warning: Deprecated
## Warning: Deprecated
## Warning: Deprecated
## Warning: Deprecated
cars_testing_final <- cars_testing %>% select_if(is_numeric)
## Warning: Deprecated
## Warning: Deprecated
## Warning: Deprecated
## Warning: Deprecated
## Warning: Deprecated
## Warning: Deprecated
## Warning: Deprecated
## Warning: Deprecated
## Warning: Deprecated
## Warning: Deprecated
## Warning: Deprecated
## Warning: Deprecated
## Warning: Deprecated
## Warning: Deprecated
Impute Individual transformations for skewness and other issues Discretize (if needed and if you have no other choice) Create dummy variables Create interactions Normalization steps (center, scale, range, etc) Multivariate transformation (e.g. PCA, spatial sign, etc)
processed <- recipe(co2TailpipeGpm ~., data = cars_training_final) %>%
step_knnimpute(all_predictors()) %>%
step_dummy(cylinders) %>%
step_BoxCox(-co2TailpipeGpm) %>%
step_normalize(-co2TailpipeGpm) %>%
prep()
juiced_df <- juice(processed)
juiced_df %>%
select_if(is_numeric) %>%
select(-starts_with("cylinders")) %>%
pivot_longer(everything(),names_to = "names", values_to = "values") %>%
ggplot(aes(values)) +
geom_histogram() +
facet_wrap(~names, scales="free")
## Warning: Deprecated
## Warning: Deprecated
## Warning: Deprecated
## Warning: Deprecated
## Warning: Deprecated
## Warning: Deprecated
## Warning: Deprecated
## Warning: Deprecated
## Warning: Deprecated
## Warning: Deprecated
## Warning: Deprecated
## Warning: Deprecated
## Warning: Deprecated
## Warning: Deprecated
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
baked_df <- bake(processed, cars_testing_final)
rf_spec <- rand_forest() %>%
set_engine("ranger") %>%
set_mode("regression")
rf_fit <- rf_spec %>%
fit(co2TailpipeGpm ~., data = juiced_df)
rf_training_results <- rf_fit %>%
predict(juiced_df)
rf_training_results %>%
cbind(juiced_df$co2TailpipeGpm) %>%
ggplot(aes(juiced_df$co2TailpipeGpm, .pred)) +
geom_point(aes(alpha = 0.1)) +
geom_abline() +
xlab("Actual") +
ylab("Predicted")
rf_training_results %>%
bind_cols(juiced_df) %>%
metrics(truth = co2TailpipeGpm, estimate = .pred)
## # A tibble: 3 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 11.4
## 2 rsq standard 0.992
## 3 mae standard 6.07
# .metric .estimator .estimate
# <chr> <chr> <dbl>
# 1 rmse standard 11.4
# 2 rsq standard 0.992
# 3 mae standard 6.07
rf_testing_results <- rf_fit %>%
predict(baked_df)
rf_testing_results %>%
cbind(baked_df$co2TailpipeGpm) %>%
ggplot(aes(baked_df$co2TailpipeGpm, .pred)) +
geom_point(aes(alpha = 0.1)) +
geom_abline() +
xlab("Actual") +
ylab("Predicted")
rf_testing_results %>%
bind_cols(baked_df) %>%
metrics(truth = co2TailpipeGpm, estimate = .pred)
## # A tibble: 3 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 13.6
## 2 rsq standard 0.989
## 3 mae standard 6.55
# .metric .estimator .estimate
# <chr> <chr> <dbl>
# 1 rmse standard 13.6
# 2 rsq standard 0.989
# 3 mae standard 6.55
(this doesn’t tune the model)
juiced_df_folds <- vfold_cv(juiced_df)
# the fit resamples function is part of tune which doesn't seem to be loaded by tidyverse? (currently)
library(tune)
rf_fit_res <- fit_resamples(co2TailpipeGpm ~.,
rf_spec,
juiced_df_folds)
rf_fit_res %>% collect_metrics()
## # A tibble: 2 x 5
## .metric .estimator mean n std_err
## <chr> <chr> <dbl> <int> <dbl>
## 1 rmse standard 12.3 10 0.429
## 2 rsq standard 0.990 10 0.000665
# .metric .estimator mean n std_err
# <chr> <chr> <dbl> <int> <dbl>
# 1 rmse standard 12.1 10 0.195
# 2 rsq standard 0.991 10 0.000318
#create a new spec that can accept tuned values for mtry and min_n
new_spec <- rand_forest(mode = "regression",
mtry = tune(),
min_n = tune()) %>%
set_engine("ranger")
# grid tune using this new spec on folds
tuned_model <- tune_grid(co2TailpipeGpm ~.,
model = new_spec,
resamples = juiced_df_folds)
## i Creating pre-processing data to finalize unknown parameter: mtry
# select the best hyperparamets from the different models, in this case by rmse
show_best(tuned_model, metric = "rmse", maximize = F)
## # A tibble: 5 x 7
## mtry min_n .metric .estimator mean n std_err
## <int> <int> <chr> <chr> <dbl> <int> <dbl>
## 1 7 6 rmse standard 7.90 10 0.572
## 2 9 5 rmse standard 7.92 10 0.564
## 3 10 11 rmse standard 8.27 10 0.547
## 4 6 21 rmse standard 8.64 10 0.521
## 5 7 28 rmse standard 8.90 10 0.510
autoplot(tuned_model, metric = "rmse")
best_settings <- select_best(tuned_model, metric = "rmse", maximize = F)
# apply the best hyperprameters (selected above) to the new spec with open values
tuned_spec <- finalize_model(new_spec, best_settings)
# use the finalised tuned spec to fit the juiced (training) data
tuned_fit <- tuned_spec %>%
fit(co2TailpipeGpm ~., data = juiced_df)
tuned_training_results <- tuned_fit %>%
predict(juiced_df)
tuned_training_results %>%
cbind(juiced_df$co2TailpipeGpm) %>%
ggplot(aes(juiced_df$co2TailpipeGpm, .pred)) +
geom_point(aes(alpha = 0.1)) +
geom_abline() +
xlab("Actual") +
ylab("Predicted")
tuned_training_results %>%
bind_cols(juiced_df) %>%
metrics(truth = co2TailpipeGpm, estimate = .pred)
## # A tibble: 3 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 5.18
## 2 rsq standard 0.998
## 3 mae standard 1.25
# .metric .estimator .estimate
# <chr> <chr> <dbl>
# 1 rmse standard 6.45
# 2 rsq standard 0.997
# 3 mae standard 2.06
tuned_testing_results <- tuned_fit %>%
predict(baked_df)
tuned_testing_results %>%
cbind(baked_df$co2TailpipeGpm) %>%
ggplot(aes(baked_df$co2TailpipeGpm, .pred)) +
geom_point(aes(alpha = 0.1)) +
geom_abline() +
xlab("Actual") +
ylab("Predicted")
tuned_testing_results %>%
bind_cols(baked_df) %>%
metrics(truth = co2TailpipeGpm, estimate = .pred)
## # A tibble: 3 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 9.74
## 2 rsq standard 0.993
## 3 mae standard 2.06
# .metric .estimator .estimate
# <chr> <chr> <dbl>
# 1 rmse standard 10.1
# 2 rsq standard 0.993
# 3 mae standard 2.69