# Import BD
bd<- read.csv("C:\\Users\\Silva\\Downloads\\real_estate_data.csv")
# Load libraries
# library(psych)
library(tidyverse)
library(ggplot2)
library(corrplot)
library(gmodels)
library(effects)
library(stargazer)
library(olsrr)
library(kableExtra)
library(jtools)
library(fastmap)
library(dlookr)
library(Hmisc)
library(naniar)
library(glmnet)
library(caret)
library(car)
library(lmtest)
Estimate 3 different linear regression models
## Model 1: Multiple linear regression
modelol <- lm(medv ~ indus + nox + rad + chas + tax + rm + crim + dis + lstat, data = bd)
summary(modelol)
##
## Call:
## lm(formula = medv ~ indus + nox + rad + chas + tax + rm + crim +
## dis + lstat, data = bd)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.922 -3.164 -1.019 2.128 27.880
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.037486 3.996667 3.512 0.000485 ***
## indus -0.102369 0.065751 -1.557 0.120127
## nox -9.128940 3.797284 -2.404 0.016579 *
## rad 0.135520 0.069856 1.940 0.052947 .
## chas 3.602529 0.934492 3.855 0.000131 ***
## tax -0.010006 0.004004 -2.499 0.012790 *
## rm 4.603069 0.431139 10.677 < 2e-16 ***
## crim -0.095412 0.035470 -2.690 0.007388 **
## dis -1.071326 0.184055 -5.821 1.05e-08 ***
## lstat -0.575164 0.051430 -11.184 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.186 on 496 degrees of freedom
## Multiple R-squared: 0.6878, Adjusted R-squared: 0.6821
## F-statistic: 121.4 on 9 and 496 DF, p-value: < 2.2e-16
## Model 2: Linear model (logarithmic)
modelo2 <- lm(log(medv) ~ indus + nox + rad + chas + tax + rm + crim + dis + lstat, data = bd)
summary(modelo2)
##
## Call:
## lm(formula = log(medv) ~ indus + nox + rad + chas + tax + rm +
## crim + dis + lstat, data = bd)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.75215 -0.11530 -0.02392 0.11428 0.85862
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.2723333 0.1583533 20.665 < 2e-16 ***
## indus -0.0019455 0.0026051 -0.747 0.455539
## nox -0.4302928 0.1504535 -2.860 0.004415 **
## rad 0.0079011 0.0027678 2.855 0.004489 **
## chas 0.1369810 0.0370258 3.700 0.000240 ***
## tax -0.0005753 0.0001587 -3.626 0.000318 ***
## rm 0.1187548 0.0170823 6.952 1.14e-11 ***
## crim -0.0100107 0.0014054 -7.123 3.73e-12 ***
## dis -0.0386950 0.0072925 -5.306 1.69e-07 ***
## lstat -0.0308904 0.0020377 -15.159 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2055 on 496 degrees of freedom
## Multiple R-squared: 0.7519, Adjusted R-squared: 0.7474
## F-statistic: 167 on 9 and 496 DF, p-value: < 2.2e-16
## Model 3: Polynomial multiple linear regression
polymodel = lm(log(medv) ~ indus + nox + rad + chas + rm + I(rm^2) + dis + tax + crim + lstat, data=bd)
summary(polymodel)
##
## Call:
## lm(formula = log(medv) ~ indus + nox + rad + chas + rm + I(rm^2) +
## dis + tax + crim + lstat, data = bd)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.96456 -0.10702 -0.00925 0.09530 0.88754
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.8502983 0.4015860 17.058 < 2e-16 ***
## indus 0.0006635 0.0024114 0.275 0.783318
## nox -0.4980196 0.1385524 -3.594 0.000358 ***
## rad 0.0076340 0.0025457 2.999 0.002847 **
## chas 0.1183804 0.0341080 3.471 0.000564 ***
## rm -1.0108149 0.1191905 -8.481 2.60e-16 ***
## I(rm^2) 0.0876424 0.0091672 9.560 < 2e-16 ***
## dis -0.0300090 0.0067681 -4.434 1.14e-05 ***
## tax -0.0005642 0.0001459 -3.867 0.000125 ***
## crim -0.0108652 0.0012956 -8.386 5.26e-16 ***
## lstat -0.0313507 0.0018747 -16.723 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.189 on 495 degrees of freedom
## Multiple R-squared: 0.7905, Adjusted R-squared: 0.7863
## F-statistic: 186.8 on 10 and 495 DF, p-value: < 2.2e-16
# Lasso regression model
## Creation of data samples for train & test
set.seed(123)
training.samples<-bd$medv %>%
createDataPartition(p=0.75,list=FALSE)
train.data<-bd[training.samples, ]
test.data<-bd[-training.samples, ]
selected_model = lm(log(medv) ~ indus + nox + rad + chas + rm + I(rm^2) + dis + tax + crim + lstat, data=bd)
summary(selected_model)
##
## Call:
## lm(formula = log(medv) ~ indus + nox + rad + chas + rm + I(rm^2) +
## dis + tax + crim + lstat, data = bd)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.96456 -0.10702 -0.00925 0.09530 0.88754
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.8502983 0.4015860 17.058 < 2e-16 ***
## indus 0.0006635 0.0024114 0.275 0.783318
## nox -0.4980196 0.1385524 -3.594 0.000358 ***
## rad 0.0076340 0.0025457 2.999 0.002847 **
## chas 0.1183804 0.0341080 3.471 0.000564 ***
## rm -1.0108149 0.1191905 -8.481 2.60e-16 ***
## I(rm^2) 0.0876424 0.0091672 9.560 < 2e-16 ***
## dis -0.0300090 0.0067681 -4.434 1.14e-05 ***
## tax -0.0005642 0.0001459 -3.867 0.000125 ***
## crim -0.0108652 0.0012956 -8.386 5.26e-16 ***
## lstat -0.0313507 0.0018747 -16.723 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.189 on 495 degrees of freedom
## Multiple R-squared: 0.7905, Adjusted R-squared: 0.7863
## F-statistic: 186.8 on 10 and 495 DF, p-value: < 2.2e-16
RMSE(selected_model$fitted.values,test.data$medv)
## Warning in pred - obs: longer object length is not a multiple of shorter
## object length
## [1] 23.18747
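The warning above occurs because selected_model was fit on the full sample and its fitted values are on the log(medv) scale, while test.data$medv is on the original scale and has fewer rows, so the two vectors are not directly comparable. A minimal sketch of a like-for-like check (an illustrative assumption, not part of the original output: predict on the test set and back-transform with exp()) would be:
# Sketch only: predict log(medv) for the held-out test set, back-transform to
# the medv scale, and compare against the observed test-set values.
test.pred <- exp(predict(selected_model, newdata = test.data))
RMSE(test.pred, test.data$medv)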
# Creation of the lasso model
x = model.matrix(log(medv) ~ indus + nox + rad + chas + rm + I(rm^2) + dis + tax + crim + lstat, train.data)[,-1]
y = train.data$medv
set.seed(123)
cv.lasso<-cv.glmnet(x,y,alpha=1)
cv.lasso$lambda.min
## [1] 0.002175466
lassomodel<-glmnet(x,y,alpha=1,lambda=cv.lasso$lambda.min)
coef(lassomodel)
## 11 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) 113.02252202
## indus -0.02689376
## nox -11.19359043
## rad 0.07845893
## chas 1.55776739
## rm -27.20726443
## I(rm^2) 2.47732516
## dis -0.63931869
## tax -0.01046509
## crim -0.10082665
## lstat -0.49493861
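For reference, the coefficients at the more conservative lambda.1se (marked in blue on the plot below) can be inspected the same way; a minimal sketch, output not reproduced here:
# Sketch only: lambda.1se applies a stronger penalty than lambda.min and
# typically shrinks more coefficients toward zero.
coef(cv.lasso, s = "lambda.1se")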
x.test<-model.matrix(log(medv) ~ indus + nox + rad + chas + rm + I(rm^2) + dis + tax + crim + lstat,test.data)[,-1]
lassopredictions <- lassomodel %>% predict(x.test) %>% as.vector()
data.frame(
RMSE = RMSE(lassopredictions, test.data$medv),
Rsquare = R2(lassopredictions, test.data$medv))
## RMSE Rsquare
## 1 6.190498 0.7101066
# Lasso model graph
# Helper: label each coefficient path with its variable name
lbs_fun <- function(fit, offset_x = 1, ...) {
  L <- length(fit$lambda)              # index of the last (smallest) lambda
  x <- log(fit$lambda[L]) + offset_x   # x position at that end of the paths
  y <- fit$beta[, L]                   # coefficient values at that lambda
  labs <- names(y)                     # variable names
  text(x, y, labels = labs, ...)
}
lasso<-glmnet(scale(x),y,alpha=1)
plot(lasso,xvar="lambda",label=T)
lbs_fun(lasso)
abline(v=cv.lasso$lambda.min,col="red",lty=2)
abline(v=cv.lasso$lambda.1se,col="blue",lty=2)
# Show the level of accuracy for each linear regression model
# Model 1 - level of accuracy
AIC(modelol)
## [1] 3113.49
# Model 2 - level of accuracy
AIC(modelo2)
## [1] -153.6378
# Model 3 - level of accuracy
AIC(polymodel)
## [1] -237.3781
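One caveat: AIC values are only directly comparable when the models share the same response, and Models 2 and 3 use log(medv) while Model 1 uses medv. A hedged sketch of the standard Jacobian adjustment, which puts the log-response AICs on the medv scale (values not reproduced here):
# Sketch only: add 2 * sum(log(medv)) to the AIC of each log-response model
# so it can be compared with the AIC of Model 1.
AIC(modelo2) + 2 * sum(log(bd$medv))
AIC(polymodel) + 2 * sum(log(bd$medv))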
# Model 1:
## multicollinearity
vif(modelol)
## indus nox rad chas tax rm crim dis
## 3.821185 3.636213 6.948218 1.058039 8.554215 1.723358 1.748175 2.820945
## lstat
## 2.533120
## heteroscedasticity
bptest(modelol)
##
## studentized Breusch-Pagan test
##
## data: modelol
## BP = 51.207, df = 9, p-value = 6.382e-08
## normality of residuals
histogram(modelol$residuals)
# Model 2:
## multicollinearity
vif(modelo2)
## indus nox rad chas tax rm crim dis
## 3.821185 3.636213 6.948218 1.058039 8.554215 1.723358 1.748175 2.820945
## lstat
## 2.533120
## heteroscedasticity
bptest(modelo2)
##
## studentized Breusch-Pagan test
##
## data: modelo2
## BP = 58.896, df = 9, p-value = 2.187e-09
## normality of residuals
histogram(modelo2$residuals)
# Model 3:
## multicollinearity
vif(polymodel)
## indus nox rad chas rm I(rm^2) dis tax
## 3.870759 3.645744 6.949055 1.061493 99.192665 97.984932 2.872709 8.554751
## crim lstat
## 1.756534 2.534792
## heteroscedasticity
bptest(polymodel)
##
## studentized Breusch-Pagan test
##
## data: polymodel
## BP = 60.406, df = 10, p-value = 3.036e-09
## normality of residuals
histogram(polymodel$residuals)
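Since the Breusch-Pagan test rejects homoscedasticity for all three models, one possible follow-up (not part of the original analysis) is to report heteroscedasticity-consistent standard errors. A minimal sketch, assuming the sandwich package is installed alongside the lmtest package already loaded above:
# Sketch only: HC1 robust standard errors for the polynomial model.
library(sandwich)
coeftest(polymodel, vcov = vcovHC(polymodel, type = "HC1"))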
# Multicollinearity
## Eliminate the variable that causes multicollinearity in Model 3 and estimate the model again
## Model 3 (re-estimated): Polynomial multiple linear regression
polymodel = lm(log(medv) ~ indus + nox + rad + chas + I(rm^2) + dis + tax + crim + lstat, data=bd)
summary(polymodel)
##
## Call:
## lm(formula = log(medv) ~ indus + nox + rad + chas + I(rm^2) +
## dis + tax + crim + lstat, data = bd)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.73893 -0.11458 -0.01841 0.10837 0.84763
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.5732309 0.1168883 30.570 < 2e-16 ***
## indus -0.0012317 0.0025670 -0.480 0.631561
## nox -0.4419377 0.1479584 -2.987 0.002958 **
## rad 0.0075824 0.0027216 2.786 0.005540 **
## chas 0.1324163 0.0364221 3.636 0.000306 ***
## I(rm^2) 0.0105767 0.0012918 8.187 2.27e-15 ***
## dis -0.0365508 0.0071887 -5.084 5.23e-07 ***
## tax -0.0005672 0.0001560 -3.636 0.000306 ***
## crim -0.0101261 0.0013820 -7.327 9.60e-13 ***
## lstat -0.0298361 0.0019951 -14.955 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.202 on 496 degrees of freedom
## Multiple R-squared: 0.7601, Adjusted R-squared: 0.7557
## F-statistic: 174.6 on 9 and 496 DF, p-value: < 2.2e-16
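An alternative to dropping the linear rm term (illustrative only, not used in the comparison below) is to fit orthogonal polynomial terms, which keep both the linear and quadratic effects of rm while avoiding the near-perfect correlation between rm and I(rm^2) that inflated the VIFs:
# Sketch only: poly(rm, 2) builds orthogonal linear and quadratic terms.
polymodel_orth <- lm(log(medv) ~ indus + nox + rad + chas + poly(rm, 2) +
                       dis + tax + crim + lstat, data = bd)
vif(polymodel_orth)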
# Model comparison
stargazer(modelol,modelo2,polymodel,type="text",title="OLS Regression Results",single.row=TRUE,ci=FALSE,ci.level=0.9)
##
## OLS Regression Results
## ======================================================================================
## Dependent variable:
## -------------------------------------------------------
## medv log(medv)
## (1) (2) (3)
## --------------------------------------------------------------------------------------
## indus -0.102 (0.066) -0.002 (0.003) -0.001 (0.003)
## nox -9.129** (3.797) -0.430*** (0.150) -0.442*** (0.148)
## rad 0.136* (0.070) 0.008*** (0.003) 0.008*** (0.003)
## chas 3.603*** (0.934) 0.137*** (0.037) 0.132*** (0.036)
## I(rm2) 0.011*** (0.001)
## tax -0.010** (0.004) -0.001*** (0.0002) -0.001*** (0.0002)
## rm 4.603*** (0.431) 0.119*** (0.017)
## crim -0.095*** (0.035) -0.010*** (0.001) -0.010*** (0.001)
## dis -1.071*** (0.184) -0.039*** (0.007) -0.037*** (0.007)
## lstat -0.575*** (0.051) -0.031*** (0.002) -0.030*** (0.002)
## Constant 14.037*** (3.997) 3.272*** (0.158) 3.573*** (0.117)
## --------------------------------------------------------------------------------------
## Observations 506 506 506
## R2 0.688 0.752 0.760
## Adjusted R2 0.682 0.747 0.756
## Residual Std. Error (df = 496) 5.186 0.205 0.202
## F Statistic (df = 9; 496) 121.396*** 166.981*** 174.612***
## ======================================================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
Diagnostic tests play an important role in modern analytics by helping to assess and improve the accuracy of linear regression results and predictive models. Examining the trends and correlations between the variables helps determine the causes behind an outcome and tells the story of what happened. According to the consultancy firm Gartner, "predictive analytics is a form of advanced analytics that examines data or content to answer the question: what is likely to happen in the future?"; in other words, it uses the data collected to produce a likely outcome or prediction about an assumption. In the same way, diagnostic analysis helps us extract value from the data, answer questions about the relationships among the variables, and gain a deeper view that allows decisions to be made with greater certainty.
Regression diagnostics. (n.d.). Bumc.bu.edu. Retrieved August 22, 2023, from https://sphweb.bumc.bu.edu/otlt/MPH-Modules/BS/R/R5_Correlation-Regression/R5_Correlation-Regression7.html
Iberdrola. (2021, April 22). Predictive analytics. https://www.iberdrola.com/innovation/predictive-analytics