Prerequisites

library(tidymodels)
library(tidyverse)
library(here)

Part 1: Simple Linear Regression

1 - 3)

# Build the path to data/boston.csv with here() and read it into a tibble.
path <- here("data", "boston.csv")
boston <- path %>% read_csv()

## Rows: 506 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (16): lon, lat, cmedv, crim, zn, indus, chas, nox, rm, age, dis, rad, ta...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Seed the RNG so the random partition below is repeatable.
set.seed(123)

# Put 70% of the rows in the training set, sampling stratified on cmedv;
# the remaining rows form the test set.
split <- boston %>%
  initial_split(prop = 0.7, strata = cmedv)
train <- split %>% training()
test <- split %>% testing()

# Correlate cmedv with every other column of the training set.
correlation <- train %>%
  select(-cmedv) %>%
  cor(x = train$cmedv, y = .)

# Reorder the columns so the coefficients print from most positive to most
# negative.
correlation[, order(correlation, decreasing = TRUE)]

##           rm            b           zn          dis         chas          lat 
##  0.708152619  0.358302318  0.344272023  0.271455846  0.164575979 -0.002024587 
##          lon         crim          rad          age          nox          tax 
## -0.315456520 -0.384298342 -0.396999035 -0.398915864 -0.439021014 -0.479383777 
##        indus      ptratio        lstat 
## -0.489537963 -0.500927283 -0.742823016

4)

# Scatter plot of cmedv against rm in the training set, overlaid with a
# least-squares line (confidence band switched off).
ggplot(data = train, mapping = aes(x = rm, y = cmedv)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

5)

# Simple linear regression: cmedv explained by rm only, fit on the training set.
lml <- fit(linear_reg(), cmedv ~ rm, data = train)

# Coefficient table (estimate, std.error, statistic, p.value) for the fit.
lml %>% tidy()

## # A tibble: 2 × 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)   -35.4      3.11      -11.4 8.70e-26
## 2 rm              9.22     0.491      18.8 7.46e-55

6)

# Test-set RMSE of the single-predictor model: predict on `test`, attach the
# observed cmedv column, then compare observed vs. predicted.
rmse(
  bind_cols(predict(lml, new_data = test), select(test, cmedv)),
  truth = cmedv,
  estimate = .pred
)

## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        6.83

Part 2: Multiple Linear Regression

7)

# Multiple linear regression: cmedv explained by every other column in the
# training set.
lml2 <- linear_reg() %>%
  fit(cmedv ~ ., data = train)

# Keep only the terms whose p-value is below 0.05. tidy() already returns
# p.value as a double column, so no coercion is needed before comparing.
tidy(lml2) %>%
  filter(p.value < 0.05)

## # A tibble: 11 × 5
##    term    estimate std.error statistic  p.value
##    <chr>      <dbl>     <dbl>     <dbl>    <dbl>
##  1 crim     -0.0830   0.0396      -2.10 3.65e- 2
##  2 zn        0.0332   0.0165       2.01 4.56e- 2
##  3 chas      2.28     1.05         2.17 3.06e- 2
##  4 nox     -11.7      4.74        -2.46 1.44e- 2
##  5 rm        4.37     0.516        8.46 8.13e-16
##  6 dis      -1.26     0.244       -5.17 4.06e- 7
##  7 rad       0.272    0.0790       3.44 6.47e- 4
##  8 tax      -0.0121   0.00436     -2.78 5.78e- 3
##  9 ptratio  -0.874    0.163       -5.37 1.48e- 7
## 10 b         0.0123   0.00310      3.97 8.72e- 5
## 11 lstat    -0.479    0.0637      -7.51 5.24e-13

8)

# Test-set RMSE of the all-predictor model: predict on `test`, attach the
# observed cmedv column, then compare observed vs. predicted.
rmse(
  bind_cols(predict(lml2, new_data = test), select(test, cmedv)),
  truth = cmedv,
  estimate = .pred
)

## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        4.83

9)

# Variable-importance plot for the all-predictor model, limited to five
# features.
vip::vip(lml2, num_features = 5)

Module 9 Lab

Daniel Plotkin

2022-10-18

Prerequisites

Part 1: Simple Linear Regression

1 - 3)

4)

5)

6)

Part 2: Multiple Linear Regression

7)

8)

9)