First, let's load the required libraries.
# Required libraries
library(ggplot2)
library(corrplot)
## corrplot 0.92 loaded
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.1 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Loaded glmnet 4.1-7
library(readr)
library(lmtest) # for bptest()
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(car) # for vif()
## Loading required package: carData
##
## Attaching package: 'car'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
real_estate_data <- read_csv("/Users/daviddrums180/Downloads/real_estate_data.csv")
## Rows: 506 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): medv, cmedv, crim, zn, indus, chas, nox, rm, age, dis, rad, tax, p...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Drop cmedv, an alternative (corrected) version of the response variable medv
real_estate_data$cmedv <- NULL
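Before fitting any models, it helps to glance at the variable types and a few example values. A quick sketch using glimpse(), which the tidyverse loaded above already provides:
# Overview of column types and sample values
glimpse(real_estate_data)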
Model_multiple <- lm(medv ~ rm + lstat + indus, data = real_estate_data)
summary(Model_multiple)
##
## Call:
## lm(formula = medv ~ rm + lstat + indus, data = real_estate_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.3179 -3.5006 -0.9387 2.0762 28.8816
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.96865 3.18168 -0.304 0.761
## rm 5.07379 0.44428 11.420 <2e-16 ***
## lstat -0.60671 0.05046 -12.025 <2e-16 ***
## indus -0.06364 0.04506 -1.412 0.159
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.535 on 502 degrees of freedom
## Multiple R-squared: 0.64, Adjusted R-squared: 0.6378
## F-statistic: 297.5 on 3 and 502 DF, p-value: < 2.2e-16
# Accuracy for Multiple Regression Model
# RMSE for Multiple Regression
RMSE_multiple <- RMSE(Model_multiple$fitted.values, real_estate_data$medv)
# Rsquared for Multiple Regression - directly extracted from the model summary
r_squared_multiple <- summary(Model_multiple)$r.squared
# AIC for Multiple Regression
AIC_multiple <- AIC(Model_multiple)
# Consolidating the metrics into a single dataframe
data.frame(
RMSE = RMSE_multiple,
Rsquare = r_squared_multiple,
AIC = AIC_multiple
)
## RMSE Rsquare AIC
## 1 5.51287 0.6399917 3173.536
Model_polynomial <- lm(medv ~ I(rm^2) + I(lstat^2) + I(indus^2), data = real_estate_data)
summary(Model_polynomial)
##
## Call:
## lm(formula = medv ~ I(rm^2) + I(lstat^2) + I(indus^2), data = real_estate_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21.812 -3.073 -0.840 2.449 34.251
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.397421 1.536384 2.862 0.00438 **
## I(rm^2) 0.537562 0.033300 16.143 < 2e-16 ***
## I(lstat^2) -0.011466 0.001398 -8.202 1.99e-15 ***
## I(indus^2) -0.005509 0.001748 -3.151 0.00172 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.774 on 502 degrees of freedom
## Multiple R-squared: 0.6082, Adjusted R-squared: 0.6058
## F-statistic: 259.7 on 3 and 502 DF, p-value: < 2.2e-16
# Accuracy for Polynomial Model
# RMSE for Polynomial
RMSE_poly <- RMSE(Model_polynomial$fitted.values, real_estate_data$medv)
# Rsquared for Polynomial - directly extracted from the model summary
r_squared_polynomial <- summary(Model_polynomial)$r.squared
# AIC for Polynomial
AIC_poly <- AIC(Model_polynomial)
# Consolidating the metrics into a single dataframe
data.frame(
RMSE = RMSE_poly,
Rsquare = r_squared_polynomial,
AIC = AIC_poly
)
## RMSE Rsquare AIC
## 1 5.751214 0.6081895 3216.37
# Split the data into train and test set
set.seed(123)
training.samples <- real_estate_data$medv %>%
createDataPartition(p = 0.75, list = FALSE)
train.data <- real_estate_data[training.samples, ] # Training data
test.data <- real_estate_data[-training.samples, ] # Testing data
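As a quick sanity check on the split, the row counts of each set can be inspected; with p = 0.75, roughly three quarters of the 506 observations should land in the training set:
# Confirm the approximate 75/25 split
nrow(train.data)
nrow(test.data)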
# Independent variables (using all variables other than medv)
x <- model.matrix(medv ~ ., train.data)[,-1]
# Dependent variable
y <- train.data$medv
# Estimate the LASSO regression
# Cross-validation to find the optimal lambda
set.seed(123)
cv.lasso <- cv.glmnet(x, y, alpha = 1) # alpha = 1 for LASSO
# Display the best lambda value
cv.lasso$lambda.min
## [1] 0.02028847
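The cross-validation curve can also be inspected to see how the estimated error changes with lambda; a quick sketch using the built-in plot method for cv.glmnet objects:
# Cross-validation error as a function of log(lambda)
plot(cv.lasso)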
# Fit the final LASSO model on the training data using the best lambda
lassomodel <- glmnet(x, y, alpha = 1, lambda = cv.lasso$lambda.min)
# Display regression coefficients
coef(lassomodel)
## 14 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) 31.324069189
## crim -0.072856499
## zn 0.024428950
## indus -0.013363553
## chas 1.461360444
## nox -15.260051785
## rm 4.065903517
## age .
## dis -1.059160501
## rad 0.224296254
## tax -0.011638803
## ptratio -0.922663770
## b 0.007987449
## lstat -0.424509047
# Predict on test data
x.test <- model.matrix(medv ~ ., test.data)[,-1]
lasso_predictions <- predict(lassomodel, newx = x.test) %>% as.vector()
# Model Accuracy
# Calculating the AIC from the test-set residuals
residuals_lasso <- test.data$medv - lasso_predictions
SSE <- sum(residuals_lasso^2)
n <- nrow(test.data)
Tr <- sum(coef(lassomodel) != 0) # number of non-zero parameters, including the intercept
AIC_lasso <- 2*Tr + n*log(SSE/n)
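# The value above is the Gaussian AIC up to an additive constant:
#   AIC ≈ n * log(SSE / n) + 2 * k
# where k (Tr) is the number of estimated (non-zero) parameters.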
# RMSE, Rsquare and AIC
data.frame(
RMSE = RMSE(lasso_predictions, test.data$medv),
Rsquare = R2(lasso_predictions, test.data$medv),
AIC = AIC_lasso
)
## RMSE Rsquare AIC
## 1 6.456171 0.6762786 468.2591
# Convert data to model.matrix format for Ridge regression
x <- model.matrix(medv ~ ., train.data)[,-1]
y <- train.data$medv
# Define the best lambda using cross-validation
set.seed(123)
cv.ridge <- cv.glmnet(x, y, alpha = 0) # alpha = 0 for Ridge regression
# Display the best lambda value
cv.ridge$lambda.min
## [1] 0.6490823
# Fit the Ridge model on the training data
ridgemodel <- glmnet(x, y, alpha = 0, lambda = cv.ridge$lambda.min)
# Display regression coefficients
coef(ridgemodel)
## 14 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) 25.994735184
## crim -0.066393301
## zn 0.018062258
## indus -0.058721267
## chas 1.738033967
## nox -11.213169927
## rm 4.092962823
## age -0.003856820
## dis -0.849835447
## rad 0.118962860
## tax -0.006818129
## ptratio -0.842092676
## b 0.007931751
## lstat -0.385975629
# Predict on test data
x.test <- model.matrix(medv ~ ., test.data)[,-1]
ridge_predictions <- predict(ridgemodel, newx = x.test) %>% as.vector()
# Model Accuracy
# Calculating the AIC from the test-set residuals
residuals_ridge <- test.data$medv - ridge_predictions
SSE <- sum(residuals_ridge^2)
n <- nrow(test.data)
Tr <- sum(coef(ridgemodel) != 0) # ridge shrinks but does not zero out coefficients, so this counts all parameters
AIC_ridge <- 2*Tr + n*log(SSE/n)
# RMSE, Rsquare and AIC
data.frame(
RMSE = RMSE(ridge_predictions, test.data$medv),
Rsquare = R2(ridge_predictions, test.data$medv),
AIC = AIC_ridge
)
## RMSE Rsquare AIC
## 1 6.635525 0.6626213 475.1095
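To compare the four models side by side, the metrics computed above can be gathered into one table. A minimal sketch, assuming the objects created earlier are still in the workspace; note that the multiple and polynomial metrics are in-sample, while the Lasso and Ridge metrics come from the held-out test set:
# Consolidate the accuracy metrics of all four models
data.frame(
  Model = c("Multiple", "Polynomial", "Lasso", "Ridge"),
  RMSE = c(RMSE_multiple, RMSE_poly,
           RMSE(lasso_predictions, test.data$medv),
           RMSE(ridge_predictions, test.data$medv)),
  Rsquare = c(r_squared_multiple, r_squared_polynomial,
              R2(lasso_predictions, test.data$medv),
              R2(ridge_predictions, test.data$medv)),
  AIC = c(AIC_multiple, AIC_poly, AIC_lasso, AIC_ridge)
)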
vif(Model_multiple)
## rm lstat indus
## 1.606319 2.140129 1.575512
# Breusch-Pagan test for heteroscedasticity
bptest(Model_multiple)
##
## studentized Breusch-Pagan test
##
## data: Model_multiple
## BP = 16.416, df = 3, p-value = 0.0009317
hist(Model_multiple$residuals, main="Histogram of Residuals for Multiple Linear Regression", xlab="Residuals", border="blue", col="lightgray")
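For consistency with the Lasso and Ridge diagnostics further below, a residuals-versus-fitted plot can also be drawn for the multiple regression model; a sketch reusing the same ggplot2 pattern:
# Residuals vs fitted values for the multiple regression model
ggplot() +
  geom_point(aes(x = Model_multiple$fitted.values, y = Model_multiple$residuals), shape = 1, color = "black", size = 2.5) +
  geom_hline(yintercept = 0, color = "red", linetype = "dashed") +
  labs(title = "Multiple Regression: Residuals vs Fitted",
       x = "Fitted values",
       y = "Residuals") +
  theme_minimal()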
# VIF for Polynomial Model
vif(Model_polynomial)
## I(rm^2) I(lstat^2) I(indus^2)
## 1.384632 1.649561 1.355109
# BPTest for Polynomial Model
bptest(Model_polynomial)
##
## studentized Breusch-Pagan test
##
## data: Model_polynomial
## BP = 21.228, df = 3, p-value = 9.44e-05
# Histogram of Residuals for Polynomial Model
hist(Model_polynomial$residuals, main="Histogram of Residuals for Polynomial Regression", xlab="Residuals", border="blue", col="lightgray")
# Fit an ordinary least squares model on the same predictors supplied to the Lasso, solely for calculating VIF
lm_for_vif_lasso <- lm(medv ~ ., data = train.data)
vif(lm_for_vif_lasso)
## crim zn indus chas nox rm age dis
## 1.844756 2.289531 3.962531 1.074133 4.376679 1.991743 3.198481 3.894466
## rad tax ptratio b lstat
## 7.629755 8.908685 1.740579 1.351500 3.455653
lasso_residuals <- residuals_lasso
bptest(lasso_residuals ~ lasso_predictions)
##
## studentized Breusch-Pagan test
##
## data: lasso_residuals ~ lasso_predictions
## BP = 0.13451, df = 1, p-value = 0.7138
ggplot() +
geom_point(aes(x = lasso_predictions, y = residuals_lasso), shape = 1, color = "black", size = 2.5) +
geom_hline(yintercept = 0, color = "red", linetype = "dashed") +
labs(title = "Lasso Regression: Residuals vs Predicted",
x = "Predicted values",
y = "Residuals") +
theme_minimal()
hist(lasso_residuals, main="Histogram of Residuals for Lasso Regression", xlab="Residuals", border="green", col="lightgray")
# Fit an ordinary least squares model on the same predictors supplied to the Ridge model, solely for calculating VIF
lm_for_vif_ridge <- lm(medv ~ ., data = train.data)
vif(lm_for_vif_ridge)
## crim zn indus chas nox rm age dis
## 1.844756 2.289531 3.962531 1.074133 4.376679 1.991743 3.198481 3.894466
## rad tax ptratio b lstat
## 7.629755 8.908685 1.740579 1.351500 3.455653
ridge_residuals <- residuals_ridge
bptest(ridge_residuals ~ ridge_predictions)
##
## studentized Breusch-Pagan test
##
## data: ridge_residuals ~ ridge_predictions
## BP = 0.039756, df = 1, p-value = 0.842
ggplot() +
geom_point(aes(x = ridge_predictions, y = residuals_ridge), shape = 1, color = "black", size = 2.5) +
geom_hline(yintercept = 0, color = "red", linetype = "dashed") +
labs(title = "Ridge Regression: Residuals vs Predicted",
x = "Predicted values",
y = "Residuals") +
theme_minimal()
hist(ridge_residuals, main="Histogram of Residuals for Ridge Regression", xlab="Residuals", border="blue", col="lightgray")