library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom        1.0.5     ✔ rsample      1.2.0
## ✔ dials        1.2.0     ✔ tune         1.1.2
## ✔ infer        1.0.5     ✔ workflows    1.1.3
## ✔ modeldata    1.2.0     ✔ workflowsets 1.0.1
## ✔ parsnip      1.1.1     ✔ yardstick    1.2.0
## ✔ recipes      1.0.8     
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Use tidymodels_prefer() to resolve common conflicts.
library(here)
## here() starts at C:/Users/bdipp/OneDrive/Documents/Data Mining
library(kknn)
library(rsample)
library(ggplot2)
library(readr)
library(vip)
## 
## Attaching package: 'vip'
## 
## The following object is masked from 'package:utils':
## 
##     vi

1.

boston <- readr::read_csv('boston.csv')
## Rows: 506 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (16): lon, lat, cmedv, crim, zn, indus, chas, nox, rm, age, dis, rad, ta...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

2.

set.seed(123)
boston_split <- initial_split(boston, prop = 0.7, strata = cmedv)
boston_train <- training(boston_split)
boston_test <- testing(boston_split)

3.

correlation <- cor(boston_train)

cmedv_correlation <- correlation[ , c("cmedv")]  

cmedv_correlation
##          lon          lat        cmedv         crim           zn        indus 
## -0.315456520 -0.002024587  1.000000000 -0.384298342  0.344272023 -0.489537963 
##         chas          nox           rm          age          dis          rad 
##  0.164575979 -0.439021014  0.708152619 -0.398915864  0.271455846 -0.396999035 
##          tax      ptratio            b        lstat 
## -0.479383777 -0.500927283  0.358302318 -0.742823016

As we can see the strongest positive correlation is rm with r = 0.708 . We also see the strongest negative correlation is lstat with r = -0.743

4.

ggplot(boston_train, aes(rm, cmedv)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  scale_y_continuous(labels = scales::comma) +
  scale_x_continuous(labels= scales::comma)
## `geom_smooth()` using formula = 'y ~ x'

Yes, the line seems to capture the trend between rm and cmedv.

5.

model1 <- linear_reg() %>%
  fit(cmedv ~ rm, data = boston_train)

tidy(model1)
## # A tibble: 2 × 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)   -35.4      3.11      -11.4 8.70e-26
## 2 rm              9.22     0.491      18.8 7.46e-55

For every one unit increase in rm, there is a 9.22 increase on average in the cmedv. Since the p-value is less than 0.05 we can confidently say the coefficient is statistically different than zero.

6.

residuals <- model1 %>%
   predict(boston_train) %>%
   bind_cols(boston_train) %>%
   select(rm, cmedv, .pred) %>%
   mutate(residual = cmedv - .pred)
residuals %>%
   mutate(squared_residuals = residual^2) %>%
   summarize(sum_of_squared_residuals = sum(squared_residuals))
## # A tibble: 1 × 1
##   sum_of_squared_residuals
##                      <dbl>
## 1                   14751.
model1 %>%
   predict(boston_test) %>%
   bind_cols(boston_test) %>%
   rmse(truth = cmedv, estimate = .pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        6.83

On average our simple linear regression model is off by 6.83 units.

7.

model2 <- linear_reg() %>%
   fit(cmedv ~ ., data = boston_train) 

tidy(model2) %>%
  arrange(desc(p.value))
## # A tibble: 16 × 5
##    term          estimate std.error statistic  p.value
##    <chr>            <dbl>     <dbl>     <dbl>    <dbl>
##  1 indus         -0.0143    0.0738     -0.193 8.47e- 1
##  2 age           -0.00921   0.0156     -0.591 5.55e- 1
##  3 lat            5.54      4.24        1.31  1.93e- 1
##  4 lon           -5.65      3.86       -1.46  1.45e- 1
##  5 (Intercept) -608.      342.         -1.78  7.64e- 2
##  6 zn             0.0332    0.0165      2.01  4.56e- 2
##  7 crim          -0.0830    0.0396     -2.10  3.65e- 2
##  8 chas           2.28      1.05        2.17  3.06e- 2
##  9 nox          -11.7       4.74       -2.46  1.44e- 2
## 10 tax           -0.0121    0.00436    -2.78  5.78e- 3
## 11 rad            0.272     0.0790      3.44  6.47e- 4
## 12 b              0.0123    0.00310     3.97  8.72e- 5
## 13 dis           -1.26      0.244      -5.17  4.06e- 7
## 14 ptratio       -0.874     0.163      -5.37  1.48e- 7
## 15 lstat         -0.479     0.0637     -7.51  5.24e-13
## 16 rm             4.37      0.516       8.46  8.13e-16

Variables that have a p-value less than 0.05: zn, crim, chas, nox, tax, rad, b, dis, ptratio, lstat, rm

lstat will decrease by 0.4786 units on average for every unit increase of cmedv.

8.

model2 %>%
   predict(boston_test) %>%
   bind_cols(boston_test) %>%
   rmse(truth = cmedv, estimate = .pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        4.83

Our new rmse is 4.83 which is 2 points better than using a model that just uses rm.

9.

model2 %>%
   vip::vip(num_features = 20)

rm, lstat, ptratio, dis, and b