Loading Libraries

First, let’s load the libraries needed for the analysis.

# Required libraries
library(ggplot2)
library(corrplot)
## corrplot 0.92 loaded
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.1     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(glmnet)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## Loaded glmnet 4.1-7
library(readr)
library(lmtest) # for bptest()
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(car) # for vif()
## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some
real_estate_data <- read_csv("/Users/daviddrums180/Downloads/real_estate_data.csv")
## Rows: 506 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): medv, cmedv, crim, zn, indus, chas, nox, rm, age, dis, rad, tax, p...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Drop cmedv (the corrected version of medv) so that medv is the only response variable
real_estate_data$cmedv <- NULL
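
Before fitting any models, a quick structural check of the data can be useful. This is a minimal sketch and not part of the original output; it only uses functions from packages already loaded above.

# Quick look at column types and summary statistics
glimpse(real_estate_data)
summary(real_estate_data)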

Models and Accuracy

1. Multiple Regression Analysis

Building the Multiple Regression Model

Model_multiple <- lm(medv ~ rm + lstat + indus, data = real_estate_data)
summary(Model_multiple)
## 
## Call:
## lm(formula = medv ~ rm + lstat + indus, data = real_estate_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -17.3179  -3.5006  -0.9387   2.0762  28.8816 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.96865    3.18168  -0.304    0.761    
## rm           5.07379    0.44428  11.420   <2e-16 ***
## lstat       -0.60671    0.05046 -12.025   <2e-16 ***
## indus       -0.06364    0.04506  -1.412    0.159    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.535 on 502 degrees of freedom
## Multiple R-squared:   0.64,  Adjusted R-squared:  0.6378 
## F-statistic: 297.5 on 3 and 502 DF,  p-value: < 2.2e-16

Accuracy for Multiple Regression Model

# Accuracy for Multiple Regression Model

# RMSE for Multiple Regression
RMSE_multiple <- RMSE(Model_multiple$fitted.values, real_estate_data$medv)

# Rsquared for Multiple Regression - directly extracted from the model summary
r_squared_multiple <- summary(Model_multiple)$r.squared

# AIC for Multiple Regression
AIC_multiple <- AIC(Model_multiple)

# Consolidating the metrics into a single dataframe
data.frame(
  RMSE = RMSE_multiple,
  Rsquare = r_squared_multiple,
  AIC = AIC_multiple
)
##      RMSE   Rsquare      AIC
## 1 5.51287 0.6399917 3173.536

2. Polynomial Regression Analysis

Building the Polynomial Regression Model

Model_polynomial <- lm(medv ~ I(rm^2) + I(lstat^2) + I(indus^2), data = real_estate_data)
summary(Model_polynomial)
## 
## Call:
## lm(formula = medv ~ I(rm^2) + I(lstat^2) + I(indus^2), data = real_estate_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -21.812  -3.073  -0.840   2.449  34.251 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.397421   1.536384   2.862  0.00438 ** 
## I(rm^2)      0.537562   0.033300  16.143  < 2e-16 ***
## I(lstat^2)  -0.011466   0.001398  -8.202 1.99e-15 ***
## I(indus^2)  -0.005509   0.001748  -3.151  0.00172 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.774 on 502 degrees of freedom
## Multiple R-squared:  0.6082, Adjusted R-squared:  0.6058 
## F-statistic: 259.7 on 3 and 502 DF,  p-value: < 2.2e-16
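
A common alternative specification, shown here only as a sketch (it is not part of the analysis above), keeps the first-order terms alongside the squared ones via poly(..., raw = TRUE), which usually makes a quadratic fit easier to interpret:

# Hypothetical variant: quadratic terms together with the linear terms
Model_poly2 <- lm(medv ~ poly(rm, 2, raw = TRUE) + poly(lstat, 2, raw = TRUE) + poly(indus, 2, raw = TRUE),
                  data = real_estate_data)
summary(Model_poly2)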

Accuracy for Polynomial Regression Model

# Accuracy for Polynomial Model

# RMSE for Polynomial
RMSE_poly <- RMSE(Model_polynomial$fitted.values, real_estate_data$medv)

# Rsquared for Polynomial - directly extracted from the model summary
r_squared_polynomial <- summary(Model_polynomial)$r.squared

# AIC for Polynomial
AIC_poly <- AIC(Model_polynomial)

# Consolidating the metrics into a single dataframe
data.frame(
  RMSE = RMSE_poly,
  Rsquare = r_squared_polynomial,
  AIC = AIC_poly
)
##       RMSE   Rsquare     AIC
## 1 5.751214 0.6081895 3216.37

3. Lasso Regression Analysis

Preparing the Data for Lasso and Ridge

# Split the data into train and test set
set.seed(123)
training.samples <- real_estate_data$medv %>%
  createDataPartition(p = 0.75, list = FALSE)

train.data <- real_estate_data[training.samples, ]  # Training data 
test.data <- real_estate_data[-training.samples, ]  # Testing data
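
As a quick sanity check (a sketch, not in the original output), the split sizes can be confirmed; roughly 75% of the 506 rows should fall in the training set.

nrow(train.data)
nrow(test.data)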

Building the Lasso Regression Model

# Independent variables (using all variables other than medv)
x <- model.matrix(medv ~ ., train.data)[,-1]

# Dependent variable
y <- train.data$medv

# Estimate the LASSO regression
# Cross-validation to find the optimal lambda
set.seed(123)
cv.lasso <- cv.glmnet(x, y, alpha = 1) # alpha = 1 for LASSO

# Display the best lambda value
cv.lasso$lambda.min
## [1] 0.02028847
# Fit the final LASSO model on the training data using the best lambda
lassomodel <- glmnet(x, y, alpha = 1, lambda = cv.lasso$lambda.min)

# Display regression coefficients
coef(lassomodel)
## 14 x 1 sparse Matrix of class "dgCMatrix"
##                        s0
## (Intercept)  31.324069189
## crim         -0.072856499
## zn            0.024428950
## indus        -0.013363553
## chas          1.461360444
## nox         -15.260051785
## rm            4.065903517
## age           .          
## dis          -1.059160501
## rad           0.224296254
## tax          -0.011638803
## ptratio      -0.922663770
## b             0.007987449
## lstat        -0.424509047
# Predict on test data
x.test <- model.matrix(medv ~ ., test.data)[,-1]
lasso_predictions <- predict(lassomodel, newx = x.test) %>% as.vector()
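
To see how the cross-validated error behaves across the lambda grid, glmnet provides a plot method for cv.glmnet objects. This is a sketch only; the plot is not part of the original output.

# Mean cross-validated MSE versus log(lambda), with lambda.min and lambda.1se marked
plot(cv.lasso)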

Accuracy for Lasso Regression Model

# Model Accuracy

# AIC-style criterion on the test set: 2k + n*log(SSE/n),
# where k is the number of non-zero coefficients (intercept included)
residuals_lasso <- test.data$medv - lasso_predictions
SSE <- sum(residuals_lasso^2)
n <- nrow(test.data)
Tr <- sum(coef(lassomodel) != 0)
AIC_lasso <- 2*Tr + n*log(SSE/n)

# RMSE, Rsquare and AIC
data.frame(
  RMSE = RMSE(lasso_predictions, test.data$medv),
  Rsquare = R2(lasso_predictions, test.data$medv),
  AIC = AIC_lasso
)
##       RMSE   Rsquare      AIC
## 1 6.456171 0.6762786 492.2591

4. Ridge Regression Analysis

Building the Ridge Regression Model

# Convert data to model.matrix format for Ridge regression
x <- model.matrix(medv ~ ., train.data)[,-1]
y <- train.data$medv

# Define the best lambda using cross-validation
set.seed(123)
cv.ridge <- cv.glmnet(x, y, alpha = 0)  # alpha = 0 for Ridge regression

# Display the best lambda value
cv.ridge$lambda.min
## [1] 0.6490823
# Fit the Ridge model on the training data
ridgemodel <- glmnet(x, y, alpha = 0, lambda = cv.ridge$lambda.min)

# Display regression coefficients
coef(ridgemodel)
## 14 x 1 sparse Matrix of class "dgCMatrix"
##                        s0
## (Intercept)  25.994735184
## crim         -0.066393301
## zn            0.018062258
## indus        -0.058721267
## chas          1.738033967
## nox         -11.213169927
## rm            4.092962823
## age          -0.003856820
## dis          -0.849835447
## rad           0.118962860
## tax          -0.006818129
## ptratio      -0.842092676
## b             0.007931751
## lstat        -0.385975629
# Predict on test data
x.test <- model.matrix(medv ~ ., test.data)[,-1]
ridge_predictions <- predict(ridgemodel, newx = x.test) %>% as.vector()
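
The same cross-validation curve can be inspected for the Ridge fit (a sketch, not part of the original output):

plot(cv.ridge)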

Accuracy for Ridge Regression Model

# Model Accuracy

# AIC-style criterion on the test set: 2k + n*log(SSE/n),
# where k is the number of non-zero coefficients (intercept included)
residuals_ridge <- test.data$medv - ridge_predictions
SSE <- sum(residuals_ridge^2)
n <- nrow(test.data)
Tr <- sum(coef(ridgemodel) != 0)
AIC_ridge <- 2*Tr + n*log(SSE/n)

# RMSE, Rsquare and AIC
data.frame(
  RMSE = RMSE(ridge_predictions, test.data$medv),
  Rsquare = R2(ridge_predictions, test.data$medv),
  AIC = AIC_ridge
)
##       RMSE   Rsquare      AIC
## 1 6.635525 0.6626213 501.1095

Diagnostic Tests per Model

a. Multiple Linear Regression Model

1. Multicollinearity (VIF):

vif(Model_multiple)
##       rm    lstat    indus 
## 1.606319 2.140129 1.575512

2. Heteroscedasticity (Breusch-Pagan test):

# Using the bptest
bptest(Model_multiple)
## 
##  studentized Breusch-Pagan test
## 
## data:  Model_multiple
## BP = 16.416, df = 3, p-value = 0.0009317
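
Since the Breusch-Pagan test rejects homoscedasticity (p < 0.001), a residuals-versus-fitted plot in the style used below for the Lasso and Ridge models can show where the variance changes. This plot is a sketch and does not appear in the original output.

ggplot() +
  geom_point(aes(x = Model_multiple$fitted.values, y = Model_multiple$residuals),
             shape = 1, color = "black", size = 2.5) +
  geom_hline(yintercept = 0, color = "red", linetype = "dashed") +
  labs(title = "Multiple Regression: Residuals vs Fitted",
       x = "Fitted values",
       y = "Residuals") +
  theme_minimal()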

3. Normality of residuals:

hist(Model_multiple$residuals, main="Histogram of Residuals for Multiple Linear Regression", xlab="Residuals", border="blue", col="lightgray")
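
A Q-Q plot and a Shapiro-Wilk test give a more formal view of normality than the histogram alone. Neither appears in the original output; this is a minimal sketch using base R.

qqnorm(Model_multiple$residuals, main = "Q-Q Plot of Residuals (Multiple Regression)")
qqline(Model_multiple$residuals, col = "red")
shapiro.test(Model_multiple$residuals)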

b. Polynomial Regression Model

1. Multicollinearity (VIF):

# VIF for Polynomial Model
vif(Model_polynomial)
##    I(rm^2) I(lstat^2) I(indus^2) 
##   1.384632   1.649561   1.355109

2. Heteroscedasticity (Breusch-Pagan test):

# BPTest for Polynomial Model
bptest(Model_polynomial)
## 
##  studentized Breusch-Pagan test
## 
## data:  Model_polynomial
## BP = 21.228, df = 3, p-value = 9.44e-05

3. Normality of residuals:

# Histogram of Residuals for Polynomial Model
hist(Model_polynomial$residuals, main="Histogram of Residuals for Polynomial Regression", xlab="Residuals", border="blue", col="lightgray")

c. Lasso Model

1. Multicollinearity (VIF):

# Fit an ordinary least-squares model on the same predictors to obtain VIF values (vif() does not work on glmnet objects)
lm_for_vif_lasso <- lm(medv ~ ., data = train.data)
vif(lm_for_vif_lasso)
##     crim       zn    indus     chas      nox       rm      age      dis 
## 1.844756 2.289531 3.962531 1.074133 4.376679 1.991743 3.198481 3.894466 
##      rad      tax  ptratio        b    lstat 
## 7.629755 8.908685 1.740579 1.351500 3.455653

2. Heteroscedasticity (Breusch-Pagan test):

lasso_residuals <- residuals_lasso
bptest(lasso_residuals ~ lasso_predictions)
## 
##  studentized Breusch-Pagan test
## 
## data:  lasso_residuals ~ lasso_predictions
## BP = 0.13451, df = 1, p-value = 0.7138
ggplot() +
  geom_point(aes(x = lasso_predictions, y = residuals_lasso), shape = 1, color = "black", size = 2.5) +
  geom_hline(yintercept = 0, color = "red", linetype = "dashed") +
  labs(title = "Lasso Regression: Residuals vs Predicted",
       x = "Predicted values",
       y = "Residuals") +
  theme_minimal()

3. Normality of residuals:

hist(lasso_residuals, main="Histogram of Residuals for Lasso Regression", xlab="Residuals", border="green", col="lightgray")

d. Ridge Model

1. Multicollinearity (VIF):

# Fit an ordinary least-squares model on the same predictors to obtain VIF values (as above, since Ridge uses the same predictor set)
lm_for_vif_ridge <- lm(medv ~ ., data = train.data)
vif(lm_for_vif_ridge)
##     crim       zn    indus     chas      nox       rm      age      dis 
## 1.844756 2.289531 3.962531 1.074133 4.376679 1.991743 3.198481 3.894466 
##      rad      tax  ptratio        b    lstat 
## 7.629755 8.908685 1.740579 1.351500 3.455653

2. Heteroscedasticity (Breusch-Pagan test):

ridge_residuals <- residuals_ridge
bptest(ridge_residuals ~ ridge_predictions)
## 
##  studentized Breusch-Pagan test
## 
## data:  ridge_residuals ~ ridge_predictions
## BP = 0.039756, df = 1, p-value = 0.842
ggplot() +
  geom_point(aes(x = ridge_predictions, y = residuals_ridge), shape = 1, color = "black", size = 2.5) +
  geom_hline(yintercept = 0, color = "red", linetype = "dashed") +
  labs(title = "Ridge Regression: Residuals vs Predicted",
       x = "Predicted values",
       y = "Residuals") +
  theme_minimal()

3. Normality of residuals:

hist(ridge_residuals, main="Histogram of Residuals for Ridge Regression", xlab="Residuals", border="blue", col="lightgray")
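
As a possible wrap-up (a sketch, not part of the original report), the accuracy metrics computed above can be gathered into one table. The multiple and polynomial models are evaluated in-sample while Lasso and Ridge are evaluated on the 25% test set, and the hand-computed AIC is on a different scale than AIC() for lm objects, so AIC is omitted and the comparison is indicative only.

data.frame(
  Model   = c("Multiple", "Polynomial", "Lasso", "Ridge"),
  RMSE    = c(RMSE_multiple, RMSE_poly,
              RMSE(lasso_predictions, test.data$medv),
              RMSE(ridge_predictions, test.data$medv)),
  Rsquare = c(r_squared_multiple, r_squared_polynomial,
              R2(lasso_predictions, test.data$medv),
              R2(ridge_predictions, test.data$medv))
)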