library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom 1.0.5 ✔ rsample 1.2.0
## ✔ dials 1.2.0 ✔ tune 1.1.2
## ✔ infer 1.0.5 ✔ workflows 1.1.3
## ✔ modeldata 1.2.0 ✔ workflowsets 1.0.1
## ✔ parsnip 1.1.1 ✔ yardstick 1.2.0
## ✔ recipes 1.0.8
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Use tidymodels_prefer() to resolve common conflicts.
library(here)
## here() starts at C:/Users/bdipp/OneDrive/Documents/Data Mining
library(kknn)
library(rsample)
library(ggplot2)
library(readr)
library(vip)
##
## Attaching package: 'vip'
##
## The following object is masked from 'package:utils':
##
## vi
# Read in the Boston housing data
boston <- readr::read_csv("boston.csv")
## Rows: 506 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (16): lon, lat, cmedv, crim, zn, indus, chas, nox, rm, age, dis, rad, ta...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Reproducible 70/30 train/test split, stratified on the outcome cmedv
set.seed(123)
boston_split <- initial_split(boston, prop = 0.7, strata = cmedv)
boston_train <- training(boston_split)
boston_test <- testing(boston_split)
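As a quick sanity check on the split (a minimal sketch; exact row counts depend on how the stratified bins round):

# Confirm the roughly 70/30 split
nrow(boston_train)
nrow(boston_test)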
# Correlation matrix on the training data; keep only the cmedv column
correlation <- cor(boston_train)
cmedv_correlation <- correlation[, "cmedv"]
cmedv_correlation
## lon lat cmedv crim zn indus
## -0.315456520 -0.002024587 1.000000000 -0.384298342 0.344272023 -0.489537963
## chas nox rm age dis rad
## 0.164575979 -0.439021014 0.708152619 -0.398915864 0.271455846 -0.396999035
## tax ptratio b lstat
## -0.479383777 -0.500927283 0.358302318 -0.742823016
As we can see, the strongest positive correlation with cmedv is rm (r = 0.708), and the strongest negative correlation is lstat (r = -0.743).
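We can confirm this ranking programmatically by sorting the correlations by absolute value; a minimal sketch using the cmedv_correlation vector from above:

# Rank predictors by absolute correlation with cmedv, excluding cmedv itself
sort(abs(cmedv_correlation[names(cmedv_correlation) != "cmedv"]), decreasing = TRUE)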
ggplot(boston_train, aes(rm, cmedv)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  scale_y_continuous(labels = scales::comma) +
  scale_x_continuous(labels = scales::comma)
## `geom_smooth()` using formula = 'y ~ x'
Yes, the fitted line appears to capture the positive linear trend between rm and cmedv.
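As an informal check on linearity, we could overlay a flexible smoother and see whether it departs from the straight line; the loess curve below is an illustrative addition, not part of the original plot:

# Compare the linear fit (blue) against a loess smoother (red)
ggplot(boston_train, aes(rm, cmedv)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  geom_smooth(method = "loess", se = FALSE, color = "red")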
# Simple linear regression of cmedv on rm
model1 <- linear_reg() %>%
  fit(cmedv ~ rm, data = boston_train)
tidy(model1)
## # A tibble: 2 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -35.4 3.11 -11.4 8.70e-26
## 2 rm 9.22 0.491 18.8 7.46e-55
For every one-unit increase in rm, cmedv increases by 9.22 units on average. Since the p-value is far below 0.05, we can confidently say the coefficient is statistically different from zero.
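To see the slope in action, we can predict at two rm values one unit apart; the predictions should differ by exactly the 9.22 coefficient (a minimal sketch with illustrative rm values):

# Predictions at rm = 6 and rm = 7 differ by the slope, 9.22
predict(model1, new_data = tibble(rm = c(6, 7)))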
# Training-set predictions and residuals for model1
residuals <- model1 %>%
  predict(boston_train) %>%
  bind_cols(boston_train) %>%
  select(rm, cmedv, .pred) %>%
  mutate(residual = cmedv - .pred)

# Sum of squared residuals on the training set
residuals %>%
  mutate(squared_residuals = residual^2) %>%
  summarize(sum_of_squared_residuals = sum(squared_residuals))
## # A tibble: 1 × 1
## sum_of_squared_residuals
## <dbl>
## 1 14751.
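This sum of squared residuals connects directly to the training RMSE, since RMSE = sqrt(SSR / n); a minimal sketch confirming that with the residuals computed above:

# Training RMSE derived from the residuals
residuals %>%
  summarize(train_rmse = sqrt(mean(residual^2)))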
# Evaluate model1 on the held-out test set
model1 %>%
  predict(boston_test) %>%
  bind_cols(boston_test) %>%
  rmse(truth = cmedv, estimate = .pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 6.83
On average, our simple linear regression model's test predictions are off by about 6.83 units of cmedv.
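If we want more context than RMSE alone, yardstick's metric_set() reports several metrics in one pass; a minimal sketch (adding mae and rsq is our choice, not required by the assignment):

# Report RMSE, MAE, and R-squared on the test set together
reg_metrics <- metric_set(rmse, mae, rsq)
model1 %>%
  predict(boston_test) %>%
  bind_cols(boston_test) %>%
  reg_metrics(truth = cmedv, estimate = .pred)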
# Multiple linear regression using all available predictors
model2 <- linear_reg() %>%
  fit(cmedv ~ ., data = boston_train)

tidy(model2) %>%
  arrange(desc(p.value))
## # A tibble: 16 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 indus -0.0143 0.0738 -0.193 8.47e- 1
## 2 age -0.00921 0.0156 -0.591 5.55e- 1
## 3 lat 5.54 4.24 1.31 1.93e- 1
## 4 lon -5.65 3.86 -1.46 1.45e- 1
## 5 (Intercept) -608. 342. -1.78 7.64e- 2
## 6 zn 0.0332 0.0165 2.01 4.56e- 2
## 7 crim -0.0830 0.0396 -2.10 3.65e- 2
## 8 chas 2.28 1.05 2.17 3.06e- 2
## 9 nox -11.7 4.74 -2.46 1.44e- 2
## 10 tax -0.0121 0.00436 -2.78 5.78e- 3
## 11 rad 0.272 0.0790 3.44 6.47e- 4
## 12 b 0.0123 0.00310 3.97 8.72e- 5
## 13 dis -1.26 0.244 -5.17 4.06e- 7
## 14 ptratio -0.874 0.163 -5.37 1.48e- 7
## 15 lstat -0.479 0.0637 -7.51 5.24e-13
## 16 rm 4.37 0.516 8.46 8.13e-16
Variables with a p-value less than 0.05: zn, crim, chas, nox, tax, rad, b, dis, ptratio, lstat, and rm.
Holding the other predictors constant, cmedv decreases by about 0.479 units on average for every one-unit increase in lstat.
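One way to see this partial effect is to predict at a typical observation and at the same observation with lstat shifted up by one unit; the two predictions should differ by the lstat coefficient. A minimal sketch (using column means as the "typical" row is an illustrative assumption):

# Predictions for a mean observation and the same row with lstat + 1;
# the difference equals the lstat coefficient, about -0.479
typical <- boston_train %>% summarize(across(everything(), mean))
shifted <- typical %>% mutate(lstat = lstat + 1)
predict(model2, new_data = bind_rows(typical, shifted))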
# Evaluate the full model on the test set
model2 %>%
  predict(boston_test) %>%
  bind_cols(boston_test) %>%
  rmse(truth = cmedv, estimate = .pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 4.83
Our new RMSE is 4.83, which is about 2 units better than the model that used rm alone.
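To make the comparison explicit, the two test RMSEs can be collected into one small table; a minimal sketch reusing the fitted models:

# Side-by-side test RMSE for the simple and full models
bind_rows(
  model1 %>% predict(boston_test) %>% bind_cols(boston_test) %>%
    rmse(truth = cmedv, estimate = .pred) %>% mutate(model = "rm only"),
  model2 %>% predict(boston_test) %>% bind_cols(boston_test) %>%
    rmse(truth = cmedv, estimate = .pred) %>% mutate(model = "all predictors")
)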
# Variable importance plot for the full model
model2 %>%
  vip::vip(num_features = 20)
The five most important variables are rm, lstat, ptratio, dis, and b.
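The same ranking is available as a table via vip::vi(), which for an lm fit scores each predictor by the absolute value of its t-statistic (a minimal sketch; we assume vi() accepts the parsnip fit the same way vip() does):

# Importance scores as a tibble rather than a plot
vip::vi(model2)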