Prerequisites:

library(tidymodels)

## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──

## ✔ broom        1.0.6     ✔ recipes      1.1.0
## ✔ dials        1.3.0     ✔ rsample      1.2.1
## ✔ dplyr        1.1.4     ✔ tibble       3.2.1
## ✔ ggplot2      3.5.1     ✔ tidyr        1.3.1
## ✔ infer        1.0.7     ✔ tune         1.2.1
## ✔ modeldata    1.4.0     ✔ workflows    1.1.4
## ✔ parsnip      1.2.1     ✔ workflowsets 1.1.0
## ✔ purrr        1.0.2     ✔ yardstick    1.3.1

## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter()  masks stats::filter()
## ✖ dplyr::lag()     masks stats::lag()
## ✖ recipes::step()  masks stats::step()
## • Search for functions across packages at https://www.tidymodels.org/find/

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.5
## ✔ lubridate 1.9.3     ✔ stringr   1.5.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ readr::col_factor() masks scales::col_factor()
## ✖ purrr::discard()    masks scales::discard()
## ✖ dplyr::filter()     masks stats::filter()
## ✖ stringr::fixed()    masks recipes::fixed()
## ✖ dplyr::lag()        masks stats::lag()
## ✖ readr::spec()       masks yardstick::spec()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(vip)

## 
## Attaching package: 'vip'
## 
## The following object is masked from 'package:utils':
## 
##     vi

Part 1: Simple Linear Regression

Import the boston housing data set.

boston <- readr::read_csv("Data/boston.csv")

## Rows: 506 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (16): lon, lat, cmedv, crim, zn, indus, chas, nox, rm, age, dis, rad, ta...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Split the data into a training set and test set using a 70-30% split.

set.seed(123)
split <- initial_split(boston, prop = 0.7, strata = cmedv)
train <- training(split)
test <- testing(split)

Using the training data, assess the correlation between cmedv and all the predictor variables. Which predictor variable has the strongest postivite/negative correlation?

cor_train <- cor(train)
cmedv_cor <- cor_train[ , c("cmedv")] # rm has the strongest positive correlation, and lstat has the strongest negative

Based on the results in #3, plot the relationship between cmedv and rm. Add a “best fit” regression line to show the linear relationship. Does this line accurately capture the realtionship

ggplot(train, aes(rm, cmedv)) +
  geom_point(alpha = 0.3) +
  geom_smooth(method = "lm", se = FALSE) +
  scale_y_continuous(labels = scales::comma) +
  scale_x_continuous(labels = scales::comma)

## `geom_smooth()` using formula = 'y ~ x'

Train a simple linear regression model with the predictor variables identified above as having the strongest positive correlation. Explain the coefficient estimate for this predictor variable. Is the coefficient statistically different than zero?

mod_1 <- linear_reg() %>%
  fit(cmedv ~ rm, data = train)

tidy(mod_1)

## # A tibble: 2 × 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)   -35.4      3.11      -11.4 8.70e-26
## 2 rm              9.22     0.491      18.8 7.46e-55

Compute the generalization RMSE for the above model. Interpret the results

residual <- mod_1 %>%
  predict(train) %>%
  bind_cols(train) %>%
  select(rm, cmedv, .pred) %>%
  mutate(residual = cmedv - .pred)

residual %>%
  mutate(squared_residuals = residual^2) %>%
  summarize(sum_of_squared_residuals = sum(squared_residuals))

## # A tibble: 1 × 1
##   sum_of_squared_residuals
##                      <dbl>
## 1                   14751.

mod_1 %>%
  predict(test) %>%
  bind_cols(test) %>%
  rmse(truth = cmedv, estimate = .pred)

## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        6.83

Part 2: Multiple Linear Regression

Fit a linear regression model using all available features to predict cmedv. Which predictor variables are statistically different than zero at the p.value < 0.05 level? Interpret the lstat coefficient.

mod_2 <- linear_reg() %>%
  fit(cmedv ~ ., data = train)

tidy(mod_2) %>%
  arrange(desc(p.value)) # lstat coefficient = -0.479

## # A tibble: 16 × 5
##    term          estimate std.error statistic  p.value
##    <chr>            <dbl>     <dbl>     <dbl>    <dbl>
##  1 indus         -0.0143    0.0738     -0.193 8.47e- 1
##  2 age           -0.00921   0.0156     -0.591 5.55e- 1
##  3 lat            5.54      4.24        1.31  1.93e- 1
##  4 lon           -5.65      3.86       -1.46  1.45e- 1
##  5 (Intercept) -608.      342.         -1.78  7.64e- 2
##  6 zn             0.0332    0.0165      2.01  4.56e- 2
##  7 crim          -0.0830    0.0396     -2.10  3.65e- 2
##  8 chas           2.28      1.05        2.17  3.06e- 2
##  9 nox          -11.7       4.74       -2.46  1.44e- 2
## 10 tax           -0.0121    0.00436    -2.78  5.78e- 3
## 11 rad            0.272     0.0790      3.44  6.47e- 4
## 12 b              0.0123    0.00310     3.97  8.72e- 5
## 13 dis           -1.26      0.244      -5.17  4.06e- 7
## 14 ptratio       -0.874     0.163      -5.37  1.48e- 7
## 15 lstat         -0.479     0.0637     -7.51  5.24e-13
## 16 rm             4.37      0.516       8.46  8.13e-16

Compute the generalization RMSE for the model with all predictor variables. What is the test set RMSE? Is this better than the model that just used the rm predictor variable?

mod_2 %>%
  predict(test) %>%
  bind_cols(test) %>%
  rmse(truth = cmedv, estimate = .pred) # This is better than the previous model

## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        4.83

Identify the top 5 most influential predictor variables using the full model

mod_2 %>%
  vip::vip() # 5 most influential predictor variables: rm, lstat, ptratio, dis, b

Module 9 Lab

Andrew Thiery

2024-10-24

Prerequisites:

Part 1: Simple Linear Regression

Part 2: Multiple Linear Regression