Prerequisites:
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom 1.0.6 ✔ recipes 1.1.0
## ✔ dials 1.3.0 ✔ rsample 1.2.1
## ✔ dplyr 1.1.4 ✔ tibble 3.2.1
## ✔ ggplot2 3.5.1 ✔ tidyr 1.3.1
## ✔ infer 1.0.7 ✔ tune 1.2.1
## ✔ modeldata 1.4.0 ✔ workflows 1.1.4
## ✔ parsnip 1.2.1 ✔ workflowsets 1.1.0
## ✔ purrr 1.0.2 ✔ yardstick 1.3.1
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ recipes::step() masks stats::step()
## • Search for functions across packages at https://www.tidymodels.org/find/
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ lubridate 1.9.3 ✔ stringr 1.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ readr::col_factor() masks scales::col_factor()
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ stringr::fixed() masks recipes::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ readr::spec() masks yardstick::spec()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(vip)
##
## Attaching package: 'vip'
##
## The following object is masked from 'package:utils':
##
## vi
Part 1: Simple Linear Regression
- Import the boston housing data set.
boston <- readr::read_csv("Data/boston.csv")
## Rows: 506 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (16): lon, lat, cmedv, crim, zn, indus, chas, nox, rm, age, dis, rad, ta...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
- Split the data into a training set and test set using a 70-30%
split.
set.seed(123)
split <- initial_split(boston, prop = 0.7, strata = cmedv)
train <- training(split)
test <- testing(split)
- Using the training data, assess the correlation between cmedv and
all the predictor variables. Which predictor variable has the strongest
postivite/negative correlation?
cor_train <- cor(train)
cmedv_cor <- cor_train[ , c("cmedv")] # rm has the strongest positive correlation, and lstat has the strongest negative
- Based on the results in #3, plot the relationship between cmedv and
rm. Add a “best fit” regression line to show the linear relationship.
Does this line accurately capture the realtionship
ggplot(train, aes(rm, cmedv)) +
geom_point(alpha = 0.3) +
geom_smooth(method = "lm", se = FALSE) +
scale_y_continuous(labels = scales::comma) +
scale_x_continuous(labels = scales::comma)
## `geom_smooth()` using formula = 'y ~ x'

- Train a simple linear regression model with the predictor variables
identified above as having the strongest positive correlation. Explain
the coefficient estimate for this predictor variable. Is the coefficient
statistically different than zero?
mod_1 <- linear_reg() %>%
fit(cmedv ~ rm, data = train)
tidy(mod_1)
## # A tibble: 2 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -35.4 3.11 -11.4 8.70e-26
## 2 rm 9.22 0.491 18.8 7.46e-55
- Compute the generalization RMSE for the above model. Interpret the
results
residual <- mod_1 %>%
predict(train) %>%
bind_cols(train) %>%
select(rm, cmedv, .pred) %>%
mutate(residual = cmedv - .pred)
residual %>%
mutate(squared_residuals = residual^2) %>%
summarize(sum_of_squared_residuals = sum(squared_residuals))
## # A tibble: 1 × 1
## sum_of_squared_residuals
## <dbl>
## 1 14751.
mod_1 %>%
predict(test) %>%
bind_cols(test) %>%
rmse(truth = cmedv, estimate = .pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 6.83
Part 2: Multiple Linear Regression
- Fit a linear regression model using all available features to
predict cmedv. Which predictor variables are statistically different
than zero at the p.value < 0.05 level? Interpret the lstat
coefficient.
mod_2 <- linear_reg() %>%
fit(cmedv ~ ., data = train)
tidy(mod_2) %>%
arrange(desc(p.value)) # lstat coefficient = -0.479
## # A tibble: 16 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 indus -0.0143 0.0738 -0.193 8.47e- 1
## 2 age -0.00921 0.0156 -0.591 5.55e- 1
## 3 lat 5.54 4.24 1.31 1.93e- 1
## 4 lon -5.65 3.86 -1.46 1.45e- 1
## 5 (Intercept) -608. 342. -1.78 7.64e- 2
## 6 zn 0.0332 0.0165 2.01 4.56e- 2
## 7 crim -0.0830 0.0396 -2.10 3.65e- 2
## 8 chas 2.28 1.05 2.17 3.06e- 2
## 9 nox -11.7 4.74 -2.46 1.44e- 2
## 10 tax -0.0121 0.00436 -2.78 5.78e- 3
## 11 rad 0.272 0.0790 3.44 6.47e- 4
## 12 b 0.0123 0.00310 3.97 8.72e- 5
## 13 dis -1.26 0.244 -5.17 4.06e- 7
## 14 ptratio -0.874 0.163 -5.37 1.48e- 7
## 15 lstat -0.479 0.0637 -7.51 5.24e-13
## 16 rm 4.37 0.516 8.46 8.13e-16
- Compute the generalization RMSE for the model with all predictor
variables. What is the test set RMSE? Is this better than the model that
just used the rm predictor variable?
mod_2 %>%
predict(test) %>%
bind_cols(test) %>%
rmse(truth = cmedv, estimate = .pred) # This is better than the previous model
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 4.83
- Identify the top 5 most influential predictor variables using the
full model
mod_2 %>%
vip::vip() # 5 most influential predictor variables: rm, lstat, ptratio, dis, b
