1 - 3)
# Build the path to the Boston housing CSV with here() and load it with
# readr, which reports the guessed column types as a message.
path <- here("data", "boston.csv")
boston <- read_csv(file = path)
## Rows: 506 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (16): lon, lat, cmedv, crim, zn, indus, chas, nox, rm, age, dis, rad, ta...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Fix the RNG seed so the random partition below is reproducible.
set.seed(123)

# Hold out 30% of the rows for testing; stratifying on cmedv keeps the
# outcome distribution comparable between the two partitions.
split <- boston %>%
  initial_split(prop = 0.7, strata = cmedv)

train <- split %>% training()
test <- split %>% testing()
# Correlate the outcome (cmedv) with every other column of the training set.
# The result is a single-row matrix with one column per predictor.
correlation <- cor(x = train$cmedv, y = select(train, -cmedv))

# Print the coefficients from most positive to most negative.
correlation[1, order(correlation, decreasing = TRUE)]
## rm b zn dis chas lat
## 0.708152619 0.358302318 0.344272023 0.271455846 0.164575979 -0.002024587
## lon crim rad age nox tax
## -0.315456520 -0.384298342 -0.396999035 -0.398915864 -0.439021014 -0.479383777
## indus ptratio lstat
## -0.489537963 -0.500927283 -0.742823016
4)
# Scatter plot of cmedv against rm on the training set, overlaid with a
# least-squares line (no confidence ribbon).
ggplot(data = train, mapping = aes(x = rm, y = cmedv)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'

5)
# Fit a one-predictor linear regression (cmedv explained by rm) on the
# training data using a parsnip model specification.
lml <- fit(linear_reg(), formula = cmedv ~ rm, data = train)

# Coefficient table: estimate, standard error, statistic and p-value.
tidy(lml)
## # A tibble: 2 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -35.4 3.11 -11.4 8.70e-26
## 2 rm 9.22 0.491 18.8 7.46e-55
6)
# Evaluate the fitted model on the held-out test set: pair each observed
# cmedv with its prediction (.pred) and compute the root mean squared error.
test %>%
  select(cmedv) %>%
  bind_cols(predict(lml, new_data = test)) %>%
  rmse(truth = cmedv, estimate = .pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 6.83