Loading Packages

library(tidymodels)
library(tidyverse)
library(readr)

Inserting Data

boston <- read_csv("boston.csv")

Split the data into a training set and test set using a 70-30% split

set.seed(123)
split <- initial_split(boston, prop = 0.7, strata = "cmedv")

train <- training(split)
test <- testing(split)
# Calculate the correlation matrix
cor_matrix <- cor(train %>% select(-c(cmedv)))

# Find the correlation between cmedv and other variables
cor_cmedv <- cor(train$cmedv, train %>% select(-c(cmedv)))

# Show the correlation of cmedv with other variables
cor_cmedv
##             lon          lat       crim       zn     indus     chas       nox
## [1,] -0.3154565 -0.002024587 -0.3842983 0.344272 -0.489538 0.164576 -0.439021
##             rm        age       dis       rad        tax    ptratio         b
## [1,] 0.7081526 -0.3989159 0.2714558 -0.396999 -0.4793838 -0.5009273 0.3583023
##          lstat
## [1,] -0.742823
# Plot the relationship using aes()
train %>%
  ggplot(aes(x=rm,y=cmedv))+geom_point(alpha=0.2)+geom_smooth(method='lm',se=FALSE)

Train a simple linear regression with rm. Explain the coefficient estimate is statistically different from 0

model_rm<-linear_reg() %>%
  fit(cmedv~rm,data=train)
tidy(model_rm)
## # A tibble: 2 × 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)   -35.4      3.11      -11.4 8.70e-26
## 2 rm              9.22     0.491      18.8 7.46e-55

Compute the generallization RMSE for the model and intepret the results.

model_rm %>%
  predict(test) %>%
  bind_cols(test %>% select(cmedv)) %>%
  rmse(truth=cmedv,estimate=.pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        6.83

Fit a linear regression model using all available features

# Fit a multiple linear regression model
lm_full_model <- lm(cmedv ~ ., data = train)

# Summary of the model to see the coefficients and p-values
summary(lm_full_model)
## 
## Call:
## lm(formula = cmedv ~ ., data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -16.4058  -2.4591  -0.7128   1.4936  25.8239 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -6.080e+02  3.420e+02  -1.778 0.076351 .  
## lon         -5.648e+00  3.864e+00  -1.462 0.144690    
## lat          5.537e+00  4.243e+00   1.305 0.192741    
## crim        -8.304e-02  3.956e-02  -2.099 0.036540 *  
## zn           3.318e-02  1.654e-02   2.007 0.045592 *  
## indus       -1.426e-02  7.379e-02  -0.193 0.846841    
## chas         2.282e+00  1.051e+00   2.171 0.030606 *  
## nox         -1.165e+01  4.738e+00  -2.459 0.014435 *  
## rm           4.369e+00  5.162e-01   8.464 8.13e-16 ***
## age         -9.215e-03  1.559e-02  -0.591 0.554928    
## dis         -1.259e+00  2.436e-01  -5.168 4.06e-07 ***
## rad          2.721e-01  7.903e-02   3.444 0.000647 ***
## tax         -1.212e-02  4.362e-03  -2.778 0.005784 ** 
## ptratio     -8.739e-01  1.627e-01  -5.369 1.48e-07 ***
## b            1.231e-02  3.099e-03   3.972 8.72e-05 ***
## lstat       -4.786e-01  6.369e-02  -7.514 5.24e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.683 on 336 degrees of freedom
## Multiple R-squared:  0.7509, Adjusted R-squared:  0.7398 
## F-statistic: 67.54 on 15 and 336 DF,  p-value: < 2.2e-16

Compute the generalization RMSE for the full model

# Predict on the test set using the full model
full_model_predictions <- predict(lm_full_model, newdata = test)

# Compute RMSE for the full model
rmse_full_model <- sqrt(mean((full_model_predictions - test$cmedv)^2))
rmse_full_model
## [1] 4.829261

Identify the top 5 most influential predictor variables

# Get the t-statistics for each variable
summary(lm_full_model)$coefficients[, "t value"]
## (Intercept)         lon         lat        crim          zn       indus 
##  -1.7777422  -1.4619475   1.3051333  -2.0992513   2.0066086  -0.1932994 
##        chas         nox          rm         age         dis         rad 
##   2.1713288  -2.4590256   8.4635327  -0.5909847  -5.1683784   3.4435320 
##         tax     ptratio           b       lstat 
##  -2.7776005  -5.3694233   3.9721307  -7.5141152
# Get the absolute value of t-statistics and order by influence
t_values <- abs(summary(lm_full_model)$coefficients[, "t value"])
top_5_predictors <- sort(t_values, decreasing = TRUE)[1:5]
top_5_predictors
##       rm    lstat  ptratio      dis        b 
## 8.463533 7.514115 5.369423 5.168378 3.972131