# Import the database
bd <- read.csv("C:\\Users\\Silva\\Downloads\\real_estate_data.csv")
# Loading libraries
# library(psych)
library(tidyverse)
library(ggplot2)
library(corrplot)
library(gmodels)
library(effects)
library(stargazer)
library(olsrr)
library(kableExtra)
library(jtools)
library(fastmap)
library(dlookr)
library(Hmisc)
library(naniar)
library(glmnet)
library(caret)
library(car)
library(lmtest)
# Estimate three different linear regression models
## Model 1: Multiple linear regression
modelo1 <- lm(medv ~ indus + nox + rad + chas + tax + rm + crim + dis + lstat, data = bd)
summary(modelo1)
##
## Call:
## lm(formula = medv ~ indus + nox + rad + chas + tax + rm + crim +
## dis + lstat, data = bd)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.922 -3.164 -1.019 2.128 27.880
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.037486 3.996667 3.512 0.000485 ***
## indus -0.102369 0.065751 -1.557 0.120127
## nox -9.128940 3.797284 -2.404 0.016579 *
## rad 0.135520 0.069856 1.940 0.052947 .
## chas 3.602529 0.934492 3.855 0.000131 ***
## tax -0.010006 0.004004 -2.499 0.012790 *
## rm 4.603069 0.431139 10.677 < 2e-16 ***
## crim -0.095412 0.035470 -2.690 0.007388 **
## dis -1.071326 0.184055 -5.821 1.05e-08 ***
## lstat -0.575164 0.051430 -11.184 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.186 on 496 degrees of freedom
## Multiple R-squared: 0.6878, Adjusted R-squared: 0.6821
## F-statistic: 121.4 on 9 and 496 DF, p-value: < 2.2e-16
## Model 2: Multiple linear regression with log-transformed response
modelo2 <- lm(log(medv) ~ indus + nox + rad + chas + tax + rm + crim + dis + lstat, data = bd)
summary(modelo2)
##
## Call:
## lm(formula = log(medv) ~ indus + nox + rad + chas + tax + rm +
## crim + dis + lstat, data = bd)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.75215 -0.11530 -0.02392 0.11428 0.85862
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.2723333 0.1583533 20.665 < 2e-16 ***
## indus -0.0019455 0.0026051 -0.747 0.455539
## nox -0.4302928 0.1504535 -2.860 0.004415 **
## rad 0.0079011 0.0027678 2.855 0.004489 **
## chas 0.1369810 0.0370258 3.700 0.000240 ***
## tax -0.0005753 0.0001587 -3.626 0.000318 ***
## rm 0.1187548 0.0170823 6.952 1.14e-11 ***
## crim -0.0100107 0.0014054 -7.123 3.73e-12 ***
## dis -0.0386950 0.0072925 -5.306 1.69e-07 ***
## lstat -0.0308904 0.0020377 -15.159 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2055 on 496 degrees of freedom
## Multiple R-squared: 0.7519, Adjusted R-squared: 0.7474
## F-statistic: 167 on 9 and 496 DF, p-value: < 2.2e-16
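Because model 2 uses log(medv) as the response, each slope is roughly the proportional change in medv for a one-unit change in the regressor. A minimal sketch (not part of the original script) that converts the estimates into exact percentage effects:

# Exact percentage change in medv per one-unit change in each regressor,
# valid for a log-level model such as modelo2
round((exp(coef(modelo2)) - 1) * 100, 2)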
## Model 3: Polynomial multiple linear regression (quadratic term in rm)
polymodel = lm(log(medv) ~ indus + nox + rad + chas + rm + I(rm^2) + dis + tax + crim + lstat, data=bd)
summary(polymodel)
##
## Call:
## lm(formula = log(medv) ~ indus + nox + rad + chas + rm + I(rm^2) +
## dis + tax + crim + lstat, data = bd)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.96456 -0.10702 -0.00925 0.09530 0.88754
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.8502983 0.4015860 17.058 < 2e-16 ***
## indus 0.0006635 0.0024114 0.275 0.783318
## nox -0.4980196 0.1385524 -3.594 0.000358 ***
## rad 0.0076340 0.0025457 2.999 0.002847 **
## chas 0.1183804 0.0341080 3.471 0.000564 ***
## rm -1.0108149 0.1191905 -8.481 2.60e-16 ***
## I(rm^2) 0.0876424 0.0091672 9.560 < 2e-16 ***
## dis -0.0300090 0.0067681 -4.434 1.14e-05 ***
## tax -0.0005642 0.0001459 -3.867 0.000125 ***
## crim -0.0108652 0.0012956 -8.386 5.26e-16 ***
## lstat -0.0313507 0.0018747 -16.723 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.189 on 495 degrees of freedom
## Multiple R-squared: 0.7905, Adjusted R-squared: 0.7863
## F-statistic: 186.8 on 10 and 495 DF, p-value: < 2.2e-16
# Lasso regression model
## Create training and test samples
set.seed(123)
training.samples <- bd$medv %>%
  createDataPartition(p = 0.75, list = FALSE)
train.data <- bd[training.samples, ]
test.data <- bd[-training.samples, ]
# Selected specification (identical to model 3), fitted on the full dataset
selected_model <- lm(log(medv) ~ indus + nox + rad + chas + rm + I(rm^2) + dis + tax + crim + lstat, data = bd)
summary(selected_model)
##
## Call:
## lm(formula = log(medv) ~ indus + nox + rad + chas + rm + I(rm^2) +
## dis + tax + crim + lstat, data = bd)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.96456 -0.10702 -0.00925 0.09530 0.88754
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.8502983 0.4015860 17.058 < 2e-16 ***
## indus 0.0006635 0.0024114 0.275 0.783318
## nox -0.4980196 0.1385524 -3.594 0.000358 ***
## rad 0.0076340 0.0025457 2.999 0.002847 **
## chas 0.1183804 0.0341080 3.471 0.000564 ***
## rm -1.0108149 0.1191905 -8.481 2.60e-16 ***
## I(rm^2) 0.0876424 0.0091672 9.560 < 2e-16 ***
## dis -0.0300090 0.0067681 -4.434 1.14e-05 ***
## tax -0.0005642 0.0001459 -3.867 0.000125 ***
## crim -0.0108652 0.0012956 -8.386 5.26e-16 ***
## lstat -0.0313507 0.0018747 -16.723 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.189 on 495 degrees of freedom
## Multiple R-squared: 0.7905, Adjusted R-squared: 0.7863
## F-statistic: 186.8 on 10 and 495 DF, p-value: < 2.2e-16
RMSE(selected_model$fitted.values,test.data$medv)
## Warning in pred - obs: longer object length is not a multiple of shorter
## object length
## [1] 23.18747
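The warning above appears because selected_model was fitted on the full dataset and its fitted values are on the log(medv) scale, so they cannot be matched element-wise with test.data$medv, and the reported RMSE of 23.19 is therefore not meaningful. A minimal sketch of one way to obtain a comparable hold-out RMSE, refitting the same specification on the training split, predicting on the test split and back-transforming with exp(); the names train_model and test_pred are illustrative, not part of the original script:

# Refit the selected specification on the training split only
train_model <- lm(log(medv) ~ indus + nox + rad + chas + rm + I(rm^2) + dis + tax + crim + lstat, data = train.data)
# Predict on the hold-out observations and back-transform to the medv scale
test_pred <- exp(predict(train_model, newdata = test.data))
# Hold-out RMSE on the same scale as medv (comparable to the lasso RMSE below)
RMSE(test_pred, test.data$medv)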
# Creation of the lasso model
# Design matrix from the training split; model.matrix() drops the response,
# so the lasso below is fitted on medv in its original (untransformed) scale
x <- model.matrix(log(medv) ~ indus + nox + rad + chas + rm + I(rm^2) + dis + tax + crim + lstat, train.data)[,-1]
y <- train.data$medv
set.seed(123)
cv.lasso<-cv.glmnet(x,y,alpha=1)
cv.lasso$lambda.min
## [1] 0.002175466
lassomodel<-glmnet(x,y,alpha=1,lambda=cv.lasso$lambda.min)
coef(lassomodel)
## 11 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) 113.02252202
## indus -0.02689376
## nox -11.19359043
## rad 0.07845893
## chas 1.55776739
## rm -27.20726443
## I(rm^2) 2.47732516
## dis -0.63931869
## tax -0.01046509
## crim -0.10082665
## lstat -0.49493861
x.test<-model.matrix(log(medv) ~ indus + nox + rad + chas + rm + I(rm^2) + dis + tax + crim + lstat,test.data)[,-1]
lassopredictions <- lassomodel %>% predict(x.test) %>% as.vector()
data.frame(
RMSE = RMSE(lassopredictions, test.data$medv),
Rsquare = R2(lassopredictions, test.data$medv))
## RMSE Rsquare
## 1 6.190498 0.7101066
# Lasso model graph
# Helper: label each coefficient path of a glmnet fit at the smallest lambda
# in the path, shifted horizontally by offset_x
lbs_fun <- function(fit, offset_x = 1, ...) {
  L <- length(fit$lambda)
  x <- log(fit$lambda[L]) + offset_x
  y <- fit$beta[, L]
  labs <- names(y)
  text(x, y, labels = labs, ...)
}
lasso<-glmnet(scale(x),y,alpha=1)
plot(lasso,xvar="lambda",label=T)
lbs_fun(lasso)
abline(v=cv.lasso$lambda.min,col="red",lty=2)
abline(v=cv.lasso$lambda.1se,col="blue",lty=2)
# Show the goodness of fit (AIC) for each linear regression model
# Model 1 - AIC
AIC(modelo1)
## [1] 3113.49
# Model 2 - AIC
AIC(modelo2)
## [1] -153.6378
# Model 3 - AIC
AIC(polymodel)
## [1] -237.3781
# Model 1:
## multicollinearity
vif(modelo1)
## indus nox rad chas tax rm crim dis
## 3.821185 3.636213 6.948218 1.058039 8.554215 1.723358 1.748175 2.820945
## lstat
## 2.533120
## heteroscedasticity
bptest(modelo1)
##
## studentized Breusch-Pagan test
##
## data: modelo1
## BP = 51.207, df = 9, p-value = 6.382e-08
## normality of residuals
histogram(modelo1$residuals)
# Model 2:
## multicollinearity
vif(modelo2)
## indus nox rad chas tax rm crim dis
## 3.821185 3.636213 6.948218 1.058039 8.554215 1.723358 1.748175 2.820945
## lstat
## 2.533120
## heteroscedasticity
bptest(modelo2)
##
## studentized Breusch-Pagan test
##
## data: modelo2
## BP = 58.896, df = 9, p-value = 2.187e-09
## normality of residuals
histogram(modelo2$residuals)
# Model 3:
## multicollinearity
vif(polymodel)
## indus nox rad chas rm I(rm^2) dis tax
## 3.870759 3.645744 6.949055 1.061493 99.192665 97.984932 2.872709 8.554751
## crim lstat
## 1.756534 2.534792
## heteroscedasticity
bptest(polymodel)
##
## studentized Breusch-Pagan test
##
## data: polymodel
## BP = 60.406, df = 10, p-value = 3.036e-09
## normality of residuals
histogram(polymodel$residuals)
# Multicollinearity
## Eliminate the variable that causes multicollinearity in model 3 and re-estimate
## Model 3 (re-estimated): polynomial multiple linear regression without rm
polymodel <- lm(log(medv) ~ indus + nox + rad + chas + I(rm^2) + dis + tax + crim + lstat, data = bd)
summary(polymodel)
##
## Call:
## lm(formula = log(medv) ~ indus + nox + rad + chas + I(rm^2) +
## dis + tax + crim + lstat, data = bd)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.73893 -0.11458 -0.01841 0.10837 0.84763
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.5732309 0.1168883 30.570 < 2e-16 ***
## indus -0.0012317 0.0025670 -0.480 0.631561
## nox -0.4419377 0.1479584 -2.987 0.002958 **
## rad 0.0075824 0.0027216 2.786 0.005540 **
## chas 0.1324163 0.0364221 3.636 0.000306 ***
## I(rm^2) 0.0105767 0.0012918 8.187 2.27e-15 ***
## dis -0.0365508 0.0071887 -5.084 5.23e-07 ***
## tax -0.0005672 0.0001560 -3.636 0.000306 ***
## crim -0.0101261 0.0013820 -7.327 9.60e-13 ***
## lstat -0.0298361 0.0019951 -14.955 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.202 on 496 degrees of freedom
## Multiple R-squared: 0.7601, Adjusted R-squared: 0.7557
## F-statistic: 174.6 on 9 and 496 DF, p-value: < 2.2e-16
AIC(polymodel)
## [1] -170.7324
# Model comparison
stargazer(modelo1, modelo2, polymodel, type = "text", title = "OLS Regression Results", single.row = TRUE, ci = FALSE, ci.level = 0.9)
##
## OLS Regression Results
## ======================================================================================
## Dependent variable:
## -------------------------------------------------------
## medv log(medv)
## (1) (2) (3)
## --------------------------------------------------------------------------------------
## indus -0.102 (0.066) -0.002 (0.003) -0.001 (0.003)
## nox -9.129** (3.797) -0.430*** (0.150) -0.442*** (0.148)
## rad 0.136* (0.070) 0.008*** (0.003) 0.008*** (0.003)
## chas 3.603*** (0.934) 0.137*** (0.037) 0.132*** (0.036)
## I(rm2) 0.011*** (0.001)
## tax -0.010** (0.004) -0.001*** (0.0002) -0.001*** (0.0002)
## rm 4.603*** (0.431) 0.119*** (0.017)
## crim -0.095*** (0.035) -0.010*** (0.001) -0.010*** (0.001)
## dis -1.071*** (0.184) -0.039*** (0.007) -0.037*** (0.007)
## lstat -0.575*** (0.051) -0.031*** (0.002) -0.030*** (0.002)
## Constant 14.037*** (3.997) 3.272*** (0.158) 3.573*** (0.117)
## --------------------------------------------------------------------------------------
## Observations 506 506 506
## R2 0.688 0.752 0.760
## Adjusted R2 0.682 0.747 0.756
## Residual Std. Error (df = 496) 5.186 0.205 0.202
## F Statistic (df = 9; 496) 121.396*** 166.981*** 174.612***
## ======================================================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
According to the previous analysis, made in assignment 1, model 3 has the highest R^2 and the lowest AIC (the AIC carries more weight than the R^2 when choosing between models). The purpose of assignment 2 is to use diagnostic tests to check for multicollinearity. Multicollinearity occurs when two or more independent variables in the model are highly correlated, which makes it difficult to estimate the individual effect of each correlated variable on the dependent variable. Diagnostic tests in regression analysis can detect this problem and guide its correction. The Variance Inflation Factor (VIF) is the tool used here: if a VIF exceeds 10 (or, under a stricter rule, 5), multicollinearity is considered high. The VIF results for model 3 show that rm and I(rm^2) are strongly collinear (VIF of roughly 99 and 98), which is why rm was removed and the model re-estimated above.
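As an illustration of what the VIF measures, each VIF equals 1 / (1 - R^2_j), where R^2_j comes from regressing predictor j on all the other predictors. A minimal sketch (the name aux_rm is illustrative, not part of the original script) that reproduces the VIF of rm in the original model 3 specification:

# Auxiliary regression of rm on the other regressors of the original model 3
# (the same specification stored above as selected_model)
aux_rm <- lm(rm ~ indus + nox + rad + chas + I(rm^2) + dis + tax + crim + lstat, data = bd)
# VIF of rm: 1 / (1 - R^2 of the auxiliary regression); this should match
# the value of roughly 99 reported by vif() for model 3 above
1 / (1 - summary(aux_rm)$r.squared)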
Model 3 remains the best model: after removing rm, its R^2 decreased slightly (from 0.79 to 0.76) and its AIC rose from -237.4 to -170.7, but this is still the lowest AIC of the three models.
With the diagnostic tests we see that (the same conclusions as in assignment 1):

- All the variables used to estimate the model are significant at up to the 99% confidence level, except "indus" (this continues to hold after the diagnostic tests).
- The rooms variable (rm) has the largest percentage impact on the model and on the value of the houses. On its own, its coefficient is negative, which would mean that the value decreases as the number of rooms grows.
- The impact of rm is not linear, since its square is also significant and points to a positive impact, opposite in sign to the untransformed variable. This pattern could already be seen in the data exploration in graph 4 (see the sketch after this list for the implied turning point).
- Variables with a significant negative impact: nox, tax, dis, crim and lstat.
- Variables with a significant positive impact: rad, chas and I(rm^2).
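One way to see where the fitted effect of rm switches sign is to compute the vertex of the quadratic from the coefficients of the specification that includes both rm and I(rm^2), stored above as selected_model; given the estimates reported there, this works out to roughly 5.8 rooms. A minimal sketch:

# Vertex of the quadratic in rm: log(medv) is decreasing in rm below this
# number of rooms and increasing above it
b <- coef(selected_model)
-b["rm"] / (2 * b["I(rm^2)"])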
With the diagnostic tests we confirm that model 3 is the most appropriate for the given real estate database, once the VIF values of its variables are taken into account.
Heteroscedasticity was found in all three models. This problem may come from the nature of the data: the database does not include the area or region of the United States as a variable, so it was not possible to segment the data and control for that source of variation.
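Heteroscedasticity does not bias the OLS coefficients, but it does invalidate the usual standard errors. A minimal sketch of one common remedy that was not applied above, heteroscedasticity-consistent (HC) standard errors, assuming the sandwich package is installed in addition to the already-loaded lmtest:

# Re-test the coefficients of the re-estimated model 3 with robust (HC1) standard errors
library(sandwich)
coeftest(polymodel, vcov = vcovHC(polymodel, type = "HC1"))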
Diagnostic tests play an important role in modern analytics by helping to identify problems and improve the accuracy of linear regression results and predictive models. They examine trends and correlations between the variables to determine causes and understand the story of what happened. According to the consultancy firm Gartner, "predictive analytics is a form of advanced analytics that examines data or content to answer the question: what is likely to happen in the future?"; in other words, it uses the data collected to produce a possible outcome or prediction about an assumption. In the same way, diagnostic analysis helps to extract value from the data, answer questions about the relationships between the variables, and provide the deeper view that allows decisions to be made with greater certainty.
# References
Regression diagnostics. (n.d.). Bumc.bu.edu. Retrieved August 22, 2023, from https://sphweb.bumc.bu.edu/otlt/MPH-Modules/BS/R/R5_Correlation-Regression/R5_Correlation-Regression7.html
Corporativa, I. (2021, April 22). Predictive analytics. Iberdrola. https://www.iberdrola.com/innovation/predictive-analytics