# PACKAGES

# Data wrangling & visualization packages
library(tidyverse)

# Modeling packages
library(tidymodels)

# IMPORT

library(readr)
boston <- read_csv("data/boston-1.csv")
head(boston)
## # A tibble: 6 × 16
##     lon   lat cmedv    crim    zn indus  chas   nox    rm   age   dis   rad
##   <dbl> <dbl> <dbl>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 -71.0  42.3  24   0.00632    18  2.31     0 0.538  6.58  65.2  4.09     1
## 2 -71.0  42.3  21.6 0.0273      0  7.07     0 0.469  6.42  78.9  4.97     2
## 3 -70.9  42.3  34.7 0.0273      0  7.07     0 0.469  7.18  61.1  4.97     2
## 4 -70.9  42.3  33.4 0.0324      0  2.18     0 0.458  7.00  45.8  6.06     3
## 5 -70.9  42.3  36.2 0.0690      0  2.18     0 0.458  7.15  54.2  6.06     3
## 6 -70.9  42.3  28.7 0.0298      0  2.18     0 0.458  6.43  58.7  6.06     3
## # … with 4 more variables: tax <dbl>, ptratio <dbl>, b <dbl>, lstat <dbl>
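
For a fuller view of the column types before modeling, glimpse() gives a compact summary (an optional quick check).

# Compact overview of all 16 columns and their types
glimpse(boston)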

#2. Split the data into a training set and test set using a 70-30% split. Be sure to include set.seed(123) so that your train and test sets match mine, and stratify by cmedv.

# 70/30 split, stratified by cmedv so the train and test sets have similar cmedv distributions
set.seed(123)
split <- initial_split(boston, prop = 0.7, strata = cmedv)
train <- training(split)
test <- testing(split)
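
A quick check that the split behaves as expected (a minimal sketch; the exact counts depend on the stratified sampling):

# Proportion of rows in the training and test sets (should be close to 0.7 and 0.3)
nrow(train) / nrow(boston)
nrow(test) / nrow(boston)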

#3. Using the training data, assess the correlation between cmedv and all the predictor variables. Hint: you can generate a correlation matrix for all variables at one time with cor(). Which predictor variable has the strongest positive correlation? Which predictor variable has the strongest negative correlation?

# Correlation matrix for all variables in the training set
correlation_train <- cor(train)

# Correlations of every variable with cmedv
cmedv_cor <- correlation_train[, "cmedv"]
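
One way to identify the strongest positive and negative correlations is to sort the cmedv column of the correlation matrix (a minimal sketch using the cmedv_cor vector above).

# Rank correlations with cmedv; cmedv itself appears with a correlation of 1 and can be ignored
sort(cmedv_cor)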

The strongest positive correlation with cmedv is rm; the strongest negative correlation is lstat.

#4. Based on the results in #3, plot the relationship between cmedv and the predictor variable with the strongest positive correlation. Add a “best fit” regression line that illustrates the linear relationship. Do you feel like this line accurately captures the relationship?

Yes; the relationship between rm and cmedv looks roughly linear, so the fitted line captures it reasonably well.

# Scatterplot of cmedv against rm with a linear best-fit line
ggplot(train, aes(rm, cmedv)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  scale_y_continuous(labels = scales::comma) +
  scale_x_continuous(labels = scales::comma)
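
As an informal check on whether a straight line is adequate, a flexible loess smooth can be overlaid for comparison (a sketch; both smoothers use their default settings).

# Compare the linear fit (dashed) with a loess smooth (red)
ggplot(train, aes(rm, cmedv)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE, linetype = "dashed") +
  geom_smooth(method = "loess", se = FALSE, color = "red")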

#5. Train a simple linear regression model with the predictor variable identified above as having the strongest positive correlation. Explain the coefficient estimate for this predictor variable. Is the coefficient statistically different than zero?

# Simple linear regression of cmedv on rm
model1 <- linear_reg() %>%
  fit(cmedv ~ rm, data = train)

tidy(model1)
## # A tibble: 2 × 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)   -35.4      3.11      -11.4 8.70e-26
## 2 rm              9.22     0.491      18.8 7.46e-55

For every one-unit increase in rm (one additional room per dwelling, on average), cmedv increases by about 9.22, i.e., roughly $9,220 in median home value since cmedv is recorded in $1,000s. Yes, the coefficient is statistically different than zero: its p-value (7.46e-55) is essentially 0.
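
A confidence interval makes the same point: if it excludes zero, the coefficient is statistically different from zero (a small sketch using the underlying lm object stored in the parsnip fit).

# 95% confidence interval for the intercept and the rm coefficient
confint(model1$fit)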

#6. Compute the generalization RMSE for the above model. Interpret the results.

# Predictions and residuals on the training data
residuals <- model1 %>%
   predict(train) %>%
   bind_cols(train) %>%
   select(rm, cmedv, .pred) %>%
   mutate(residual = cmedv - .pred)

# Sum of squared residuals on the training data
residuals %>%
   mutate(squared_residuals = residual^2) %>%
   summarize(sum_of_squared_residuals = sum(squared_residuals))
## # A tibble: 1 × 1
##   sum_of_squared_residuals
##                      <dbl>
## 1                   14751.

# Generalization (test set) RMSE for the rm-only model
model1 %>%
   predict(test) %>%
   bind_cols(test) %>%
   rmse(truth = cmedv, estimate = .pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        6.83
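
The test RMSE of about 6.83 means that, on average, the rm-only model's predictions miss the actual median home value by roughly 6.8 (about $6,800, since cmedv is recorded in $1,000s). For context, the training RMSE can be computed from the residuals data frame built above (a small sketch reusing earlier objects).

# Training RMSE, for comparison with the test (generalization) RMSE above
residuals %>%
   rmse(truth = cmedv, estimate = .pred)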

#7. Fit a linear regression model using all available features to predict cmedv. Which predictor variables are statistically different than zero at the p.value < 0.05 level? Interpret the lstat coefficient.


# Multiple linear regression using all available predictors
model3 <- linear_reg() %>%
   fit(cmedv ~ ., data = train)

tidy(model3) %>%
  arrange(desc(p.value))
## # A tibble: 16 × 5
##    term          estimate std.error statistic  p.value
##    <chr>            <dbl>     <dbl>     <dbl>    <dbl>
##  1 indus         -0.0143    0.0738     -0.193 8.47e- 1
##  2 age           -0.00921   0.0156     -0.591 5.55e- 1
##  3 lat            5.54      4.24        1.31  1.93e- 1
##  4 lon           -5.65      3.86       -1.46  1.45e- 1
##  5 (Intercept) -608.      342.         -1.78  7.64e- 2
##  6 zn             0.0332    0.0165      2.01  4.56e- 2
##  7 crim          -0.0830    0.0396     -2.10  3.65e- 2
##  8 chas           2.28      1.05        2.17  3.06e- 2
##  9 nox          -11.7       4.74       -2.46  1.44e- 2
## 10 tax           -0.0121    0.00436    -2.78  5.78e- 3
## 11 rad            0.272     0.0790      3.44  6.47e- 4
## 12 b              0.0123    0.00310     3.97  8.72e- 5
## 13 dis           -1.26      0.244      -5.17  4.06e- 7
## 14 ptratio       -0.874     0.163      -5.37  1.48e- 7
## 15 lstat         -0.479     0.0637     -7.51  5.24e-13
## 16 rm             4.37      0.516       8.46  8.13e-16
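
The statistically significant terms can be pulled out programmatically by filtering the tidy output (a small sketch).

# Predictors with p-values below 0.05, excluding the intercept
tidy(model3) %>%
  filter(p.value < 0.05, term != "(Intercept)") %>%
  arrange(p.value)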

Predictors with p-values less than 0.05: zn, crim, chas, nox, tax, rad, b, dis, ptratio, lstat, and rm.

Holding the other predictors constant, a one-unit increase in lstat is associated with a decrease of about 0.479 in cmedv (roughly $479 in median home value).

#8. Compute the generalization RMSE for the model with all predictor variables. What is the test set RMSE? Is this better than the model that just used the rm predictor variable?

# Generalization (test set) RMSE for the full model
model3 %>%
   predict(test) %>%
   bind_cols(test) %>%
   rmse(truth = cmedv, estimate = .pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        4.83

The test set RMSE is about 4.83. Yes, this is better than the rm-only model: roughly 2 points lower than its test RMSE of 6.83.
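
To make the comparison explicit, both test RMSEs can be collected into one small table (a sketch reusing model1 and model3 from above).

# Test RMSE for the rm-only model and the full model, side by side
bind_rows(
  `rm only` = model1 %>% predict(test) %>% bind_cols(test) %>%
    rmse(truth = cmedv, estimate = .pred),
  `all predictors` = model3 %>% predict(test) %>% bind_cols(test) %>%
    rmse(truth = cmedv, estimate = .pred),
  .id = "model"
)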

#9. Using the full model with all predictor variables, identify the top 5 most influential predictor variables in our model.

# Variable importance plot for the full model (vip ranks lm predictors by the absolute t-statistic)
model3 %>%
   vip::vip(num_features = 20)
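
The same importance scores can be returned as a table rather than a plot, which makes it easy to read off the top 5 (a sketch; for an lm fit, vip ranks predictors by the absolute value of the t-statistic).

# Ranked variable importance; the first five rows are the top 5 predictors
vip::vi(model3) %>%
  slice_head(n = 5)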