# Import the Boston housing data set (boston.csv).
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom        1.0.5     ✔ rsample      1.2.0
## ✔ dials        1.2.0     ✔ tune         1.1.2
## ✔ infer        1.0.5     ✔ workflows    1.1.3
## ✔ modeldata    1.2.0     ✔ workflowsets 1.0.1
## ✔ parsnip      1.1.1     ✔ yardstick    1.2.0
## ✔ recipes      1.0.8     
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Dig deeper into tidy modeling with R at https://www.tmwr.org
# readr is already attached by the tidyverse, so no separate library(readr) call is needed
boston <- read_csv("~/Documents/boston.csv")
## Rows: 506 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (16): lon, lat, cmedv, crim, zn, indus, chas, nox, rm, age, dis, rad, ta...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
View(boston)

# Split the data into a training set and test set using a 70-30% split. Be sure to include set.seed(123) so that your train and test sets are the same as mine, and stratify by cmedv.

set.seed(123)
split <- initial_split(boston, prop = 0.7, strata = cmedv)
train <- training(split)
test <- testing(split)
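
# Quick sanity check on the split (a sketch; with stratification the proportion
# is approximate rather than exactly 70/30):
nrow(train)
nrow(test)
nrow(train) / nrow(boston)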

# Using the training data, assess the correlation between cmedv and all the predictor variables. Hint: you can generate a correlation matrix for all variables at once with cor(). Which predictor variable has the strongest positive correlation? Which has the strongest negative correlation?

corr_train <- cor(train)

# Correlations of every variable with cmedv
cmedv_cor <- corr_train[, "cmedv"]
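
# To answer the question directly, sort those correlations (cmedv itself sits
# at the top with a correlation of 1 and should be ignored):
sort(cmedv_cor, decreasing = TRUE)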

# Based on the results in #3, plot the relationship between cmedv and the predictor variable with the strongest positive correlation. Add a “best fit” regression line that illustrates the linear relationship. Do you feel like this line accurately captures the relationship?

ggplot(train, aes(rm, cmedv)) +
  geom_point(alpha = 0.3) +
  geom_smooth(method = "lm", se = FALSE) +
  scale_y_continuous(labels = scales::comma) +
  scale_x_continuous(labels = scales::comma)
## `geom_smooth()` using formula = 'y ~ x'
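
# To judge how well the straight line fits, one option (not required by the
# prompt) is to overlay a loess smoother for comparison:
ggplot(train, aes(rm, cmedv)) +
  geom_point(alpha = 0.3) +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  geom_smooth(method = "loess", se = FALSE)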

# Train a simple linear regression model with the predictor variable identified above as having the strongest positive correlation. Explain the coefficient estimate for this predictor variable. Is the coefficient statistically different than zero?

# Simple linear regression: cmedv on rm (the strongest positive correlate)
mod1 <- linear_reg() %>%
  fit(cmedv ~ rm, data = train)

tidy(mod1)
## # A tibble: 2 × 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)   -35.4      3.11      -11.4 8.70e-26
## 2 rm              9.22     0.491      18.8 7.46e-55
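# Reading the output: the rm estimate of ~9.22 means each additional room is
# associated with a ~9.22-unit increase in cmedv (about $9,220, assuming cmedv
# is recorded in $1000s as in the standard Boston data). With a p-value of
# ~7.5e-55, the coefficient is statistically different from zero.
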
# Compute the generalization RMSE for the above model. Interpret the results.

# Training-set residuals, kept for reference; the generalization RMSE below is
# computed on the test set
residual <- mod1 %>%
   predict(train) %>%
   bind_cols(train) %>%
   select(rm, cmedv, .pred) %>%
   mutate(residual = cmedv - .pred)

# Sum of squared residuals on the training set
residual %>%
   mutate(squared_residuals = residual^2) %>%
   summarize(sum_of_squared_residuals = sum(squared_residuals))
## # A tibble: 1 × 1
##   sum_of_squared_residuals
##                      <dbl>
## 1                   14751.
# Generalization RMSE: predict on the held-out test set
mod1 %>%
   predict(test) %>%
   bind_cols(test) %>%
   rmse(truth = cmedv, estimate = .pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        6.83
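# For comparison, one could also compute the training RMSE (a sketch; a test
# RMSE far above the training RMSE would suggest overfitting):
mod1 %>%
   predict(train) %>%
   bind_cols(train) %>%
   rmse(truth = cmedv, estimate = .pred)
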
# Fit a linear regression model using all available features to predict cmedv. Which predictor variables are statistically different than zero at the p.value < 0.05 level? Interpret the lstat coefficient.

# Full model: regress cmedv on all remaining predictors
mod3 <- linear_reg() %>%
   fit(cmedv ~ ., data = train)

tidy(mod3) %>%
  arrange(desc(p.value))
## # A tibble: 16 × 5
##    term          estimate std.error statistic  p.value
##    <chr>            <dbl>     <dbl>     <dbl>    <dbl>
##  1 indus         -0.0143    0.0738     -0.193 8.47e- 1
##  2 age           -0.00921   0.0156     -0.591 5.55e- 1
##  3 lat            5.54      4.24        1.31  1.93e- 1
##  4 lon           -5.65      3.86       -1.46  1.45e- 1
##  5 (Intercept) -608.      342.         -1.78  7.64e- 2
##  6 zn             0.0332    0.0165      2.01  4.56e- 2
##  7 crim          -0.0830    0.0396     -2.10  3.65e- 2
##  8 chas           2.28      1.05        2.17  3.06e- 2
##  9 nox          -11.7       4.74       -2.46  1.44e- 2
## 10 tax           -0.0121    0.00436    -2.78  5.78e- 3
## 11 rad            0.272     0.0790      3.44  6.47e- 4
## 12 b              0.0123    0.00310     3.97  8.72e- 5
## 13 dis           -1.26      0.244      -5.17  4.06e- 7
## 14 ptratio       -0.874     0.163      -5.37  1.48e- 7
## 15 lstat         -0.479     0.0637     -7.51  5.24e-13
## 16 rm             4.37      0.516       8.46  8.13e-16
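# From the table: all terms except indus, age, lat, lon, and the intercept are
# statistically different from zero at p < 0.05. The lstat estimate of ~-0.479
# means that, holding the other predictors constant, a one-unit increase in
# lstat is associated with a ~0.479-unit drop in cmedv (about $479, assuming
# cmedv is in $1000s).
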
# Compute the generalization RMSE for the model with all predictor variables. What is the test set RMSE? Is this better than the model that just used the rm predictor variable?

# Generalization RMSE for the full model
mod3 %>%
   predict(test) %>%
   bind_cols(test) %>%
   rmse(truth = cmedv, estimate = .pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        4.83
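# The full model's test RMSE (~4.83) is noticeably lower than the rm-only
# model's (~6.83), so using all predictors generalizes better.
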
# Using the full model with all predictor variables, identify the top 5 most influential predictor variables in our model.
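
# One way to rank influence: order terms by the absolute value of the
# t-statistic (a minimal sketch; the vip package provides a similar ranking):
tidy(mod3) %>%
  filter(term != "(Intercept)") %>%
  arrange(desc(abs(statistic))) %>%
  slice_head(n = 5)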