library(tidymodels)
library(tidyverse)
boston <- readr::read_csv('boston.csv')
## Rows: 506 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (16): lon, lat, cmedv, crim, zn, indus, chas, nox, rm, age, dis, rad, ta...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
set.seed(123)
split <- initial_split(boston, prop = 0.7, strata = cmedv)
train <- training(split)
test <- testing(split)
correlation_data <- cor(train)
corr_cmedv <- correlation_data["cmedv", ]
corr_cmedv
## lon lat cmedv crim zn indus
## -0.315456520 -0.002024587 1.000000000 -0.384298342 0.344272023 -0.489537963
## chas nox rm age dis rad
## 0.164575979 -0.439021014 0.708152619 -0.398915864 0.271455846 -0.396999035
## tax ptratio b lstat
## -0.479383777 -0.500927283 0.358302318 -0.742823016
strongest positive relationship: rm
strongest negative relationship: lstat
ggplot(train, aes(cmedv, rm)) +
geom_point(size = 1.5, alpha = 0.5) +
geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula = 'y ~ x'
regModel <- linear_reg() %>%
fit(cmedv ~ rm, data = train)
tidy(regModel)
The coefficient estimate: 9.217 which means for every one unit
increase of rm we see an average increase of 9.217 cmedv
The coefficient is statistically different than zero because the p-value
is practically 0 along with the test statistic being 18.76 which is
greater than 2.
residuals <- regModel %>%
predict(train) %>%
bind_cols(train) %>%
select(cmedv, rm, .pred) %>%
mutate(residual = cmedv - .pred)
residuals %>%
mutate(squared_residuals = residual ^ 2) %>%
summarize(sum_of_squared_residuals = sum(squared_residuals))
regModel %>%
predict(test) %>%
bind_cols(test) %>%
rmse(truth = cmedv, estimate = .pred)
The RMSE is 6.83
multiRegModel <- linear_reg() %>%
fit(cmedv ~ ., data = train)
tidy(multiRegModel) %>%
filter(p.value < 0.05) %>%
arrange(p.value)
variables statistically different than zero: rm , lstat, ptratio, dis, b, rad, tax, nox, chas, crim, zn
The lstat coefficient is -0.479 which shows that the cmedv will decrease by 0.479 with one unit change of lstat
multiRegModel %>%
predict(test) %>%
bind_cols(test) %>%
rmse(truth = cmedv, estimate = .pred)
The RMSE is 4.83. This model is better than just using rm as the predictor variable.
multiRegModel %>%
vip::vip()
Top 5 most influential predictor variables: rm, lstat, ptratio, dis,
b