Set up libraries
pacman::p_load(
tidyverse,
tidymodels,
AppliedPredictiveModeling,
skimr,
janitor,
corrplot,
vip
)
## also installing the dependencies 'plotrix', 'CORElearn', 'ellipse'
## Warning: unable to access index for repository http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.4:
## cannot open URL 'http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.4/PACKAGES'
## package 'plotrix' successfully unpacked and MD5 sums checked
## package 'CORElearn' successfully unpacked and MD5 sums checked
## package 'ellipse' successfully unpacked and MD5 sums checked
## package 'AppliedPredictiveModeling' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\yyz\AppData\Local\Temp\RtmpsbBF9P\downloaded_packages
##
## AppliedPredictiveModeling installed
## Warning: package 'AppliedPredictiveModeling' was built under R version 4.4.3
## also installing the dependency 'snakecase'
## Warning: unable to access index for repository http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.4:
## cannot open URL 'http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.4/PACKAGES'
## package 'snakecase' successfully unpacked and MD5 sums checked
## package 'janitor' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\yyz\AppData\Local\Temp\RtmpsbBF9P\downloaded_packages
##
## janitor installed
## Warning: package 'janitor' was built under R version 4.4.3
## Warning: unable to access index for repository http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.4:
## cannot open URL 'http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.4/PACKAGES'
## package 'corrplot' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\yyz\AppData\Local\Temp\RtmpsbBF9P\downloaded_packages
##
## corrplot installed
## Warning: package 'corrplot' was built under R version 4.4.3
Pre-processing
Convert to
tibble
cars2010 <- as_tibble(cars2010)
Clean names and
select predictors
cars2010 <- clean_names(cars2010)
cars2010 <-
cars2010 %>%
select(
eng_displ,
num_cyl,
FE = fe,
num_gears,
drive_desc
)
cars2010
Recode the drive
predictor
cars2010 %>%
count(drive_desc)
cars2010<-
cars2010 %>%
mutate(
drive = case_when(
str_detect(drive_desc, "Front") ~ "front",
str_detect(drive_desc, "Rear") ~ "rear",
TRUE ~ "4WD"
))
cars2010 %>%
count(drive_desc, drive)
cars2010 <-
cars2010 %>%
select(-drive_desc)
cars2010
Convert character
into factor
cars2010 <-
cars2010 %>%
mutate_if(
is.character, factor
)
cars2010
Split into training
and testing
set.seed(2020)
car_split <- initial_split(cars2010, strata = FE)
car_train <- training(car_split)
car_test <- testing(car_split)
EDA
Overview
skim(car_train)
Data summary
| Name |
car_train |
| Number of rows |
829 |
| Number of columns |
5 |
| _______________________ |
|
| Column type frequency: |
|
| factor |
1 |
| numeric |
4 |
| ________________________ |
|
| Group variables |
None |
Variable type: factor
| drive |
0 |
1 |
FALSE |
3 |
fro: 287, 4WD: 279, rea: 263 |
Variable type: numeric
| eng_displ |
0 |
1 |
3.51 |
1.33 |
1.0 |
2.40 |
3.5 |
4.3 |
8.40 |
▅▇▃▂▁ |
| num_cyl |
0 |
1 |
5.99 |
1.96 |
2.0 |
4.00 |
6.0 |
8.0 |
16.00 |
▇▇▅▁▁ |
| FE |
0 |
1 |
34.71 |
7.61 |
17.5 |
29.14 |
34.5 |
39.2 |
69.64 |
▃▇▃▁▁ |
| num_gears |
0 |
1 |
5.26 |
1.40 |
1.0 |
5.00 |
6.0 |
6.0 |
8.00 |
▁▁▆▇▂ |
Look at Fuel
efficiency
car_train %>%
ggplot(aes(FE)) +
geom_histogram(col = "black", fill = "orange")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Bivariate
relationships
car_train %>%
ggplot(aes(eng_displ, FE)) +
geom_point()

car_train %>%
ggplot(aes(factor(num_cyl), FE)) +
geom_boxplot()

car_train %>%
ggplot(aes(factor(num_gears), FE)) +
geom_boxplot()

car_train %>%
ggplot(aes(drive, FE)) + geom_boxplot()

Relationship
between predictors
car_train %>%
select_if(is.numeric) %>%
cor() %>%
corrplot()

Preprocessing with
recipes
car_recipe <-
recipe(FE ~ ., data = car_train) %>%
step_poly(eng_displ, degree = 2, options = list(raw = TRUE))
car_recipe
##
## ── Recipe ──────────────────────────────────────────────────────────────────────
##
## ── Inputs
## Number of variables by role
## outcome: 1
## predictor: 4
##
## ── Operations
## • Orthogonal polynomials on: eng_displ
car_recipe <- car_recipe %>% prep()
car_recipe
##
## ── Recipe ──────────────────────────────────────────────────────────────────────
##
## ── Inputs
## Number of variables by role
## outcome: 1
## predictor: 4
##
## ── Training information
## Training data contained 829 data points and no incomplete rows.
##
## ── Operations
## • Orthogonal polynomials on: eng_displ | Trained
car_train <- car_recipe %>% juice()
car_train
car_test <- car_recipe %>% bake(car_test)
car_test
Fit model
car_lm <-
linear_reg() %>%
set_engine("lm") %>%
fit(FE ~ ., data = car_train)
car_lm
## parsnip model object
##
##
## Call:
## stats::lm(formula = FE ~ ., data = data)
##
## Coefficients:
## (Intercept) num_cyl num_gears drivefront
## 54.90897 -0.55180 0.06216 5.25074
## driverear eng_displ_poly_1 eng_displ_poly_2
## 2.24564 -8.28921 0.66325
Use model
Model fit
car_train %>%
add_column(
predict(car_lm, new_data = car_train)
) %>%
yardstick::rmse(truth = FE, estimate = .pred)
car_test %>%
add_column(
predict(car_lm, new_data = car_test)
) %>%
yardstick::rmse(truth = FE, estimate = .pred)
Check
assumptions
car_res <- augment(car_lm$fit)
car_res
car_res %>%
ggplot(aes(.fitted, .resid)) +
geom_point() +
geom_hline(yintercept = 0)

car_res %>%
ggplot(aes(sample = .resid)) +
stat_qq() +
stat_qq_line()

car_res %>%
ggplot(aes(.hat, .std.resid)) +
geom_vline(size = 2, colour = "white", xintercept = 0) +
geom_hline(size = 2, colour = "white", yintercept = 0) +
geom_point(aes(size = .cooksd)) + geom_smooth(se = FALSE)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

Variable
importance
vi(car_lm)
vip(car_lm,
include_type = TRUE,
geom = 'point',
mapping = aes_string(col = "Sign"), size = 5) +
scale_color_brewer(palette = "Set1")
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Inference
tidy(car_lm$fit)
tidy(car_lm$fit) %>%
arrange(p.value)
Prediction
new_data <- tibble(
num_cyl = 4,
num_gears = 6,
eng_displ = 5,
drive = "front"
)
new_data
new_data <- bake(car_recipe, new_data = new_data)
## Warning: ! There was 1 column that was a factor when the recipe was prepped:
## • `drive`
## ℹ This may cause errors when processing new data.
new_data
predict(car_lm, new_data = new_data)
predict(car_lm, new_data = new_data, type = "pred_int")