1 Case study

1.1 Set up libraries

pacman::p_load(
  tidyverse, 
  tidymodels,
  AppliedPredictiveModeling, 
  skimr, 
  janitor, 
  corrplot, 
  vip
)
## also installing the dependencies 'plotrix', 'CORElearn', 'ellipse'
## Warning: unable to access index for repository http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.4:
##   cannot open URL 'http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.4/PACKAGES'
## package 'plotrix' successfully unpacked and MD5 sums checked
## package 'CORElearn' successfully unpacked and MD5 sums checked
## package 'ellipse' successfully unpacked and MD5 sums checked
## package 'AppliedPredictiveModeling' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\yyz\AppData\Local\Temp\RtmpsbBF9P\downloaded_packages
## 
## AppliedPredictiveModeling installed
## Warning: package 'AppliedPredictiveModeling' was built under R version 4.4.3
## also installing the dependency 'snakecase'
## Warning: unable to access index for repository http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.4:
##   cannot open URL 'http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.4/PACKAGES'
## package 'snakecase' successfully unpacked and MD5 sums checked
## package 'janitor' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\yyz\AppData\Local\Temp\RtmpsbBF9P\downloaded_packages
## 
## janitor installed
## Warning: package 'janitor' was built under R version 4.4.3
## Warning: unable to access index for repository http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.4:
##   cannot open URL 'http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.4/PACKAGES'
## package 'corrplot' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\yyz\AppData\Local\Temp\RtmpsbBF9P\downloaded_packages
## 
## corrplot installed
## Warning: package 'corrplot' was built under R version 4.4.3

1.2 Read in data

data(FuelEconomy, package = "AppliedPredictiveModeling")

1.3 Pre-processing

1.3.1 Convert to tibble

cars2010 <- as_tibble(cars2010)

1.3.2 Clean names and select predictors

cars2010 <- clean_names(cars2010)
cars2010 <- 
  cars2010 %>% 
  select(
    eng_displ, 
    num_cyl, 
    FE = fe, 
    num_gears, 
    drive_desc
  )
cars2010

1.3.3 Recode the drive predictor

cars2010 %>% 
  count(drive_desc)
cars2010<- 
  cars2010 %>% 
  mutate(
    drive = case_when(
      str_detect(drive_desc, "Front") ~ "front", 
      str_detect(drive_desc, "Rear") ~ "rear", 
      TRUE ~ "4WD"
    ))
cars2010 %>% 
  count(drive_desc, drive)
cars2010 <- 
  cars2010 %>% 
  select(-drive_desc)
cars2010

1.3.4 Convert character into factor

cars2010 <- 
  cars2010 %>% 
  mutate_if(
    is.character, factor
  )
cars2010

1.3.5 Split into training and testing

set.seed(2020)
car_split <- initial_split(cars2010, strata = FE)
car_train <- training(car_split)
car_test <- testing(car_split)

1.4 EDA

1.4.1 Overview

skim(car_train)
Data summary
Name car_train
Number of rows 829
Number of columns 5
_______________________
Column type frequency:
factor 1
numeric 4
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
drive 0 1 FALSE 3 fro: 287, 4WD: 279, rea: 263

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
eng_displ 0 1 3.51 1.33 1.0 2.40 3.5 4.3 8.40 ▅▇▃▂▁
num_cyl 0 1 5.99 1.96 2.0 4.00 6.0 8.0 16.00 ▇▇▅▁▁
FE 0 1 34.71 7.61 17.5 29.14 34.5 39.2 69.64 ▃▇▃▁▁
num_gears 0 1 5.26 1.40 1.0 5.00 6.0 6.0 8.00 ▁▁▆▇▂

1.4.2 Look at Fuel efficiency

car_train %>% 
  ggplot(aes(FE)) + 
  geom_histogram(col = "black", fill = "orange")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

1.4.3 Bivariate relationships

car_train %>% 
  ggplot(aes(eng_displ, FE)) + 
  geom_point()

car_train %>% 
  ggplot(aes(factor(num_cyl), FE)) + 
  geom_boxplot()

car_train %>% 
  ggplot(aes(factor(num_gears), FE)) + 
  geom_boxplot()

car_train %>% 
  ggplot(aes(drive, FE)) + geom_boxplot()

1.4.4 Relationship between predictors

car_train %>% 
  select_if(is.numeric) %>% 
  cor() %>% 
  corrplot()

1.4.5 Preprocessing with recipes

car_recipe <- 
  recipe(FE ~ ., data = car_train) %>% 
  step_poly(eng_displ, degree = 2, options = list(raw = TRUE))
car_recipe
## 
## ── Recipe ──────────────────────────────────────────────────────────────────────
## 
## ── Inputs
## Number of variables by role
## outcome:   1
## predictor: 4
## 
## ── Operations
## • Orthogonal polynomials on: eng_displ
car_recipe <- car_recipe %>% prep()
car_recipe
## 
## ── Recipe ──────────────────────────────────────────────────────────────────────
## 
## ── Inputs
## Number of variables by role
## outcome:   1
## predictor: 4
## 
## ── Training information
## Training data contained 829 data points and no incomplete rows.
## 
## ── Operations
## • Orthogonal polynomials on: eng_displ | Trained
car_train <- car_recipe %>% juice()
car_train
car_test <- car_recipe %>% bake(car_test)
car_test

1.5 Fit model

car_lm <- 
  linear_reg() %>% 
  set_engine("lm") %>% 
  fit(FE ~ ., data = car_train)
car_lm
## parsnip model object
## 
## 
## Call:
## stats::lm(formula = FE ~ ., data = data)
## 
## Coefficients:
##      (Intercept)           num_cyl         num_gears        drivefront  
##         54.90897          -0.55180           0.06216           5.25074  
##        driverear  eng_displ_poly_1  eng_displ_poly_2  
##          2.24564          -8.28921           0.66325

1.6 Use model

1.6.1 Model fit

car_train %>% 
  add_column(
    predict(car_lm, new_data = car_train)
  ) %>% 
  yardstick::rmse(truth = FE, estimate = .pred)
car_test %>% 
  add_column(
    predict(car_lm, new_data = car_test)
  ) %>% 
  yardstick::rmse(truth = FE, estimate = .pred)

1.6.2 Check assumptions

car_res <- augment(car_lm$fit)
car_res
car_res %>% 
  ggplot(aes(.fitted, .resid)) + 
  geom_point() + 
  geom_hline(yintercept = 0)

car_res %>% 
  ggplot(aes(sample = .resid)) + 
  stat_qq() + 
  stat_qq_line()

car_res %>% 
  ggplot(aes(.hat, .std.resid)) +
  geom_vline(size = 2, colour = "white", xintercept = 0) +
  geom_hline(size = 2, colour = "white", yintercept = 0) +
  geom_point(aes(size = .cooksd)) + geom_smooth(se = FALSE)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

1.6.3 Variable importance

vi(car_lm)
vip(car_lm, 
    include_type = TRUE, 
    geom = 'point',
    mapping = aes_string(col = "Sign"), size = 5) + 
  scale_color_brewer(palette = "Set1")
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

1.6.4 Inference

tidy(car_lm$fit)
tidy(car_lm$fit) %>% 
  arrange(p.value)

1.6.5 Prediction

new_data <- tibble(
  num_cyl = 4, 
  num_gears = 6, 
  eng_displ = 5, 
  drive = "front"
) 
new_data
new_data <- bake(car_recipe, new_data = new_data)
## Warning: ! There was 1 column that was a factor when the recipe was prepped:
## • `drive`
## ℹ This may cause errors when processing new data.
new_data
predict(car_lm, new_data = new_data)
predict(car_lm, new_data = new_data, type = "pred_int")