1 Case study

1.1 Set up libraries

pacman::p_load(
  tidyverse, 
  tidymodels,
  AppliedPredictiveModeling, 
  skimr, 
  janitor, 
  corrplot, 
  vip
)

## also installing the dependencies 'plotrix', 'CORElearn', 'ellipse'

## Warning: unable to access index for repository http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.4:
##   cannot open URL 'http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.4/PACKAGES'

## package 'plotrix' successfully unpacked and MD5 sums checked
## package 'CORElearn' successfully unpacked and MD5 sums checked
## package 'ellipse' successfully unpacked and MD5 sums checked
## package 'AppliedPredictiveModeling' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\yyz\AppData\Local\Temp\RtmpsbBF9P\downloaded_packages

## 
## AppliedPredictiveModeling installed

## Warning: package 'AppliedPredictiveModeling' was built under R version 4.4.3

## also installing the dependency 'snakecase'

## Warning: unable to access index for repository http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.4:
##   cannot open URL 'http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.4/PACKAGES'

## package 'snakecase' successfully unpacked and MD5 sums checked
## package 'janitor' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\yyz\AppData\Local\Temp\RtmpsbBF9P\downloaded_packages

## 
## janitor installed

## Warning: package 'janitor' was built under R version 4.4.3
## Warning: unable to access index for repository http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.4:
##   cannot open URL 'http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.4/PACKAGES'

## package 'corrplot' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\yyz\AppData\Local\Temp\RtmpsbBF9P\downloaded_packages

## 
## corrplot installed

## Warning: package 'corrplot' was built under R version 4.4.3

1.2 Read in data

data(FuelEconomy, package = "AppliedPredictiveModeling")

1.3 Pre-processing

1.3.1 Convert to tibble

cars2010 <- as_tibble(cars2010)

1.3.2 Clean names and select predictors

cars2010 <- clean_names(cars2010)
cars2010 <- 
  cars2010 %>% 
  select(
    eng_displ, 
    num_cyl, 
    FE = fe, 
    num_gears, 
    drive_desc
  )
cars2010

1.3.3 Recode the drive predictor

cars2010 %>% 
  count(drive_desc)

cars2010<- 
  cars2010 %>% 
  mutate(
    drive = case_when(
      str_detect(drive_desc, "Front") ~ "front", 
      str_detect(drive_desc, "Rear") ~ "rear", 
      TRUE ~ "4WD"
    ))
cars2010 %>% 
  count(drive_desc, drive)

cars2010 <- 
  cars2010 %>% 
  select(-drive_desc)
cars2010

1.3.4 Convert character into factor

cars2010 <- 
  cars2010 %>% 
  mutate_if(
    is.character, factor
  )
cars2010

1.3.5 Split into training and testing

set.seed(2020)
car_split <- initial_split(cars2010, strata = FE)
car_train <- training(car_split)
car_test <- testing(car_split)

1.4 EDA

1.4.1 Overview

skim(car_train)

Data summary
Name	car_train
Number of rows	829
Number of columns	5
_______________________
Column type frequency:
factor	1
numeric	4
________________________
Group variables	None

Variable type: factor

skim_variable	n_missing	complete_rate	ordered	n_unique	top_counts
drive	0	1	FALSE	3	fro: 287, 4WD: 279, rea: 263

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
eng_displ	1	3.51	1.33	1.0	2.40	3.5	4.3	8.40	▅▇▃▂▁
num_cyl	1	5.99	1.96	2.0	4.00	6.0	8.0	16.00	▇▇▅▁▁
FE	1	34.71	7.61	17.5	29.14	34.5	39.2	69.64	▃▇▃▁▁
num_gears	1	5.26	1.40	1.0	5.00	6.0	6.0	8.00	▁▁▆▇▂

1.4.2 Look at Fuel efficiency

car_train %>% 
  ggplot(aes(FE)) + 
  geom_histogram(col = "black", fill = "orange")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

1.4.3 Bivariate relationships

car_train %>% 
  ggplot(aes(eng_displ, FE)) + 
  geom_point()

car_train %>% 
  ggplot(aes(factor(num_cyl), FE)) + 
  geom_boxplot()

car_train %>% 
  ggplot(aes(factor(num_gears), FE)) + 
  geom_boxplot()

car_train %>% 
  ggplot(aes(drive, FE)) + geom_boxplot()

1.4.4 Relationship between predictors

car_train %>% 
  select_if(is.numeric) %>% 
  cor() %>% 
  corrplot()

1.4.5 Preprocessing with recipes

car_recipe <- 
  recipe(FE ~ ., data = car_train) %>% 
  step_poly(eng_displ, degree = 2, options = list(raw = TRUE))
car_recipe

##

## ── Recipe ──────────────────────────────────────────────────────────────────────

##

## ── Inputs

## Number of variables by role

## outcome:   1
## predictor: 4

##

## ── Operations

## • Orthogonal polynomials on: eng_displ

car_recipe <- car_recipe %>% prep()
car_recipe

##

## ── Recipe ──────────────────────────────────────────────────────────────────────

##

## ── Inputs

## Number of variables by role

## outcome:   1
## predictor: 4

##

## ── Training information

## Training data contained 829 data points and no incomplete rows.

##

## ── Operations

## • Orthogonal polynomials on: eng_displ | Trained

car_train <- car_recipe %>% juice()
car_train

car_test <- car_recipe %>% bake(car_test)
car_test

1.5 Fit model

car_lm <- 
  linear_reg() %>% 
  set_engine("lm") %>% 
  fit(FE ~ ., data = car_train)
car_lm

## parsnip model object
## 
## 
## Call:
## stats::lm(formula = FE ~ ., data = data)
## 
## Coefficients:
##      (Intercept)           num_cyl         num_gears        drivefront  
##         54.90897          -0.55180           0.06216           5.25074  
##        driverear  eng_displ_poly_1  eng_displ_poly_2  
##          2.24564          -8.28921           0.66325

1.6 Use model

1.6.1 Model fit

car_train %>% 
  add_column(
    predict(car_lm, new_data = car_train)
  ) %>% 
  yardstick::rmse(truth = FE, estimate = .pred)

car_test %>% 
  add_column(
    predict(car_lm, new_data = car_test)
  ) %>% 
  yardstick::rmse(truth = FE, estimate = .pred)

1.6.2 Check assumptions

car_res <- augment(car_lm$fit)
car_res

car_res %>% 
  ggplot(aes(.fitted, .resid)) + 
  geom_point() + 
  geom_hline(yintercept = 0)

car_res %>% 
  ggplot(aes(sample = .resid)) + 
  stat_qq() + 
  stat_qq_line()

car_res %>% 
  ggplot(aes(.hat, .std.resid)) +
  geom_vline(size = 2, colour = "white", xintercept = 0) +
  geom_hline(size = 2, colour = "white", yintercept = 0) +
  geom_point(aes(size = .cooksd)) + geom_smooth(se = FALSE)

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

1.6.3 Variable importance

vi(car_lm)

vip(car_lm, 
    include_type = TRUE, 
    geom = 'point',
    mapping = aes_string(col = "Sign"), size = 5) + 
  scale_color_brewer(palette = "Set1")

## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

1.6.4 Inference

tidy(car_lm$fit)

tidy(car_lm$fit) %>% 
  arrange(p.value)

1.6.5 Prediction

new_data <- tibble(
  num_cyl = 4, 
  num_gears = 6, 
  eng_displ = 5, 
  drive = "front"
) 
new_data

new_data <- bake(car_recipe, new_data = new_data)

## Warning: ! There was 1 column that was a factor when the recipe was prepped:
## • `drive`
## ℹ This may cause errors when processing new data.

new_data

predict(car_lm, new_data = new_data)

predict(car_lm, new_data = new_data, type = "pred_int")

Linear Regression: Fuel Economy

Max Glonek

Trimester 2, 2025

1 Case study

1.1 Set up libraries

1.2 Read in data

1.3 Pre-processing

1.3.1 Convert to tibble

1.3.2 Clean names and select predictors

1.3.3 Recode the drive predictor

1.3.4 Convert character into factor

1.3.5 Split into training and testing

1.4 EDA

1.4.1 Overview

1.4.2 Look at Fuel efficiency

1.4.3 Bivariate relationships

1.4.4 Relationship between predictors

1.4.5 Preprocessing with recipes

1.5 Fit model

1.6 Use model

1.6.1 Model fit

1.6.2 Check assumptions

1.6.3 Variable importance

1.6.4 Inference

1.6.5 Prediction