Loading Packages
library(tidymodels)
library(tidyverse)
library(readr)
Inserting Data
boston <- read_csv("boston.csv")
Split the data into a training set and test set using a 70-30%
split
set.seed(123)
split <- initial_split(boston, prop = 0.7, strata = "cmedv")
train <- training(split)
test <- testing(split)
# Calculate the correlation matrix
cor_matrix <- cor(train %>% select(-c(cmedv)))
# Find the correlation between cmedv and other variables
cor_cmedv <- cor(train$cmedv, train %>% select(-c(cmedv)))
# Show the correlation of cmedv with other variables
cor_cmedv
## lon lat crim zn indus chas nox
## [1,] -0.3154565 -0.002024587 -0.3842983 0.344272 -0.489538 0.164576 -0.439021
## rm age dis rad tax ptratio b
## [1,] 0.7081526 -0.3989159 0.2714558 -0.396999 -0.4793838 -0.5009273 0.3583023
## lstat
## [1,] -0.742823
# Plot the relationship using aes()
train %>%
ggplot(aes(x=rm,y=cmedv))+geom_point(alpha=0.2)+geom_smooth(method='lm',se=FALSE)

Train a simple linear regression with rm. Explain the coefficient
estimate is statistically different from 0
model_rm<-linear_reg() %>%
fit(cmedv~rm,data=train)
tidy(model_rm)
## # A tibble: 2 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -35.4 3.11 -11.4 8.70e-26
## 2 rm 9.22 0.491 18.8 7.46e-55
Compute the generallization RMSE for the model and intepret the
results.
model_rm %>%
predict(test) %>%
bind_cols(test %>% select(cmedv)) %>%
rmse(truth=cmedv,estimate=.pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 6.83
Fit a linear regression model using all available features
# Fit a multiple linear regression model
lm_full_model <- lm(cmedv ~ ., data = train)
# Summary of the model to see the coefficients and p-values
summary(lm_full_model)
##
## Call:
## lm(formula = cmedv ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.4058 -2.4591 -0.7128 1.4936 25.8239
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.080e+02 3.420e+02 -1.778 0.076351 .
## lon -5.648e+00 3.864e+00 -1.462 0.144690
## lat 5.537e+00 4.243e+00 1.305 0.192741
## crim -8.304e-02 3.956e-02 -2.099 0.036540 *
## zn 3.318e-02 1.654e-02 2.007 0.045592 *
## indus -1.426e-02 7.379e-02 -0.193 0.846841
## chas 2.282e+00 1.051e+00 2.171 0.030606 *
## nox -1.165e+01 4.738e+00 -2.459 0.014435 *
## rm 4.369e+00 5.162e-01 8.464 8.13e-16 ***
## age -9.215e-03 1.559e-02 -0.591 0.554928
## dis -1.259e+00 2.436e-01 -5.168 4.06e-07 ***
## rad 2.721e-01 7.903e-02 3.444 0.000647 ***
## tax -1.212e-02 4.362e-03 -2.778 0.005784 **
## ptratio -8.739e-01 1.627e-01 -5.369 1.48e-07 ***
## b 1.231e-02 3.099e-03 3.972 8.72e-05 ***
## lstat -4.786e-01 6.369e-02 -7.514 5.24e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.683 on 336 degrees of freedom
## Multiple R-squared: 0.7509, Adjusted R-squared: 0.7398
## F-statistic: 67.54 on 15 and 336 DF, p-value: < 2.2e-16
Compute the generalization RMSE for the full model
# Predict on the test set using the full model
full_model_predictions <- predict(lm_full_model, newdata = test)
# Compute RMSE for the full model
rmse_full_model <- sqrt(mean((full_model_predictions - test$cmedv)^2))
rmse_full_model
## [1] 4.829261
Identify the top 5 most influential predictor variables
# Get the t-statistics for each variable
summary(lm_full_model)$coefficients[, "t value"]
## (Intercept) lon lat crim zn indus
## -1.7777422 -1.4619475 1.3051333 -2.0992513 2.0066086 -0.1932994
## chas nox rm age dis rad
## 2.1713288 -2.4590256 8.4635327 -0.5909847 -5.1683784 3.4435320
## tax ptratio b lstat
## -2.7776005 -5.3694233 3.9721307 -7.5141152
# Get the absolute value of t-statistics and order by influence
t_values <- abs(summary(lm_full_model)$coefficients[, "t value"])
top_5_predictors <- sort(t_values, decreasing = TRUE)[1:5]
top_5_predictors
## rm lstat ptratio dis b
## 8.463533 7.514115 5.369423 5.168378 3.972131