library(foreign)
library(dplyr) # data manipulation
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(forcats) # to work with categorical variables
library(ggplot2) # data visualization
library(readr) # read specific csv files
library(janitor) # data exploration and cleaning
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(Hmisc) # several useful functions for data analysis
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
library(psych) # functions for multivariate analysis
##
## Attaching package: 'psych'
## The following object is masked from 'package:Hmisc':
##
## describe
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(naniar) # summaries and visualization of missing values NAs
#library(dlookr) # summaries and visualization of missing values NAs
library(corrplot) # correlation plots
## corrplot 0.92 loaded
library(jtools) # presentation of regression analysis
##
## Attaching package: 'jtools'
## The following object is masked from 'package:Hmisc':
##
## %nin%
library(lmtest) # diagnostic checks - linear regression analysis
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(car) # diagnostic checks - linear regression analysis
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
## The following object is masked from 'package:dplyr':
##
## recode
library(olsrr) # diagnostic checks - linear regression analysis
##
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
##
## rivers
library(naniar) # identifying missing values
#library(funModeling) # create frequency tables
library(stargazer) # create publication quality tables
##
## Please cite as:
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
#library(effects) # displays for linear and other regression models
data = read.csv("real_estate_data.csv")
head(data)
## medv cmedv crim zn indus chas nox rm age dis rad tax ptratio
## 1 24.0 24.0 0.00632 18 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3
## 2 21.6 21.6 0.02731 0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8
## 3 34.7 34.7 0.02729 0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8
## 4 33.4 33.4 0.03237 0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7
## 5 36.2 36.2 0.06905 0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7
## 6 28.7 28.7 0.02985 0 2.18 0 0.458 6.430 58.7 6.0622 3 222 18.7
## b lstat
## 1 396.90 4.98
## 2 396.90 9.14
## 3 392.83 4.03
## 4 394.63 2.94
## 5 396.90 5.33
## 6 394.12 5.21
#Renombrando variables
colnames(data) <- c("median_value", "cmedian_value", "crime_rate","residential_land", "non_retail_business", "river_view", "nitric_oxid", "rooms", "age", "distance", "access_to_highways", "property_tax_rate", "school", "b", "low_status_pop")
# Detect Heteroscedasticity
boxplot(formula=median_value ~ crime_rate, data=data, col=cm.colors(3))

modelo_multiple_1 <- lm(formula = median_value ~ crime_rate + residential_land + non_retail_business + river_view + rooms + distance + access_to_highways + property_tax_rate + school + b + low_status_pop, data = data)
summary(modelo_multiple_1)
##
## Call:
## lm(formula = median_value ~ crime_rate + residential_land + non_retail_business +
## river_view + rooms + distance + access_to_highways + property_tax_rate +
## school + b + low_status_pop, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.8808 -2.8171 -0.7587 1.7176 26.6875
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 23.108557 4.354055 5.307 1.68e-07 ***
## crime_rate -0.098378 0.033498 -2.937 0.003471 **
## residential_land 0.050975 0.013881 3.672 0.000267 ***
## non_retail_business -0.060498 0.060393 -1.002 0.316957
## river_view 2.486994 0.877634 2.834 0.004789 **
## rooms 3.894948 0.417198 9.336 < 2e-16 ***
## distance -1.121279 0.179465 -6.248 8.99e-10 ***
## access_to_highways 0.265392 0.067009 3.961 8.58e-05 ***
## property_tax_rate -0.013877 0.003824 -3.629 0.000315 ***
## school -0.750648 0.126124 -5.952 5.04e-09 ***
## b 0.010097 0.002732 3.696 0.000244 ***
## low_status_pop -0.564127 0.047926 -11.771 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.846 on 494 degrees of freedom
## Multiple R-squared: 0.7284, Adjusted R-squared: 0.7224
## F-statistic: 120.5 on 11 and 494 DF, p-value: < 2.2e-16
AIC (modelo_multiple_1)
## [1] 3046.858
#-- Identify the presence of heteroscedasticity --
bptest(modelo_multiple_1)
##
## studentized Breusch-Pagan test
##
## data: modelo_multiple_1
## BP = 60.495, df = 11, p-value = 7.502e-09
#-- Identify the presence of multicollinearity --
vif(modelo_multiple_1)
## crime_rate residential_land non_retail_business river_view
## 1.785523 2.253931 3.691762 1.068674
## rooms distance access_to_highways property_tax_rate
## 1.847962 3.071328 7.321554 8.934464
## school b low_status_pop
## 1.603462 1.338170 2.519030
#-- Identify the presence of normality of residuals --
ols_test_normality(modelo_multiple_1)
## -----------------------------------------------
## Test Statistic pvalue
## -----------------------------------------------
## Shapiro-Wilk 0.8968 0.0000
## Kolmogorov-Smirnov 0.1198 0.0000
## Cramer-von Mises 41.0715 0.0000
## Anderson-Darling 11.3748 0.0000
## -----------------------------------------------
hist(modelo_multiple_1$residuals)

modelo_multiple_2 <- lm(formula = log(median_value) ~ crime_rate + residential_land + non_retail_business + river_view + rooms + distance + access_to_highways + property_tax_rate + school + b + low_status_pop, data = data)
summary(modelo_multiple_2)
##
## Call:
## lm(formula = log(median_value) ~ crime_rate + residential_land +
## non_retail_business + river_view + rooms + distance + access_to_highways +
## property_tax_rate + school + b + low_status_pop, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.74427 -0.11107 -0.01225 0.08947 0.96890
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.5216289 0.1749117 20.134 < 2e-16 ***
## crime_rate -0.0098555 0.0013457 -7.324 9.87e-13 ***
## residential_land 0.0013458 0.0005576 2.413 0.01617 *
## non_retail_business -0.0010203 0.0024261 -0.421 0.67425
## river_view 0.0928912 0.0352564 2.635 0.00868 **
## rooms 0.0956772 0.0167597 5.709 1.97e-08 ***
## distance -0.0346369 0.0072095 -4.804 2.06e-06 ***
## access_to_highways 0.0124489 0.0026919 4.625 4.80e-06 ***
## property_tax_rate -0.0006906 0.0001536 -4.495 8.67e-06 ***
## school -0.0294347 0.0050667 -5.809 1.12e-08 ***
## b 0.0004497 0.0001098 4.097 4.89e-05 ***
## low_status_pop -0.0304928 0.0019253 -15.838 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1947 on 494 degrees of freedom
## Multiple R-squared: 0.7781, Adjusted R-squared: 0.7732
## F-statistic: 157.5 on 11 and 494 DF, p-value: < 2.2e-16
AIC (modelo_multiple_2)
## [1] -206.2981
#-- Identify the presence of heteroscedasticity --
bptest(modelo_multiple_2)
##
## studentized Breusch-Pagan test
##
## data: modelo_multiple_2
## BP = 69.312, df = 11, p-value = 1.65e-10
#-- Identify the presence of multicollinearity --
vif(modelo_multiple_2)
## crime_rate residential_land non_retail_business river_view
## 1.785523 2.253931 3.691762 1.068674
## rooms distance access_to_highways property_tax_rate
## 1.847962 3.071328 7.321554 8.934464
## school b low_status_pop
## 1.603462 1.338170 2.519030
#-- Identify the presence of normality of residuals --
ols_test_normality(modelo_multiple_2)
## -----------------------------------------------
## Test Statistic pvalue
## -----------------------------------------------
## Shapiro-Wilk 0.9421 0.0000
## Kolmogorov-Smirnov 0.0818 0.0023
## Cramer-von Mises 118.6904 0.0000
## Anderson-Darling 6.5815 0.0000
## -----------------------------------------------
hist(modelo_multiple_2$residuals)

modelo_multiple_3 <- lm(formula = log(median_value) ~ crime_rate + residential_land + non_retail_business + river_view + I(rooms^2) + distance + access_to_highways + property_tax_rate + school + b + low_status_pop, data = data)
summary(modelo_multiple_3)
##
## Call:
## lm(formula = log(median_value) ~ crime_rate + residential_land +
## non_retail_business + river_view + I(rooms^2) + distance +
## access_to_highways + property_tax_rate + school + b + low_status_pop,
## data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.74384 -0.10483 -0.01286 0.08970 0.95760
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.7274107 0.1340314 27.810 < 2e-16 ***
## crime_rate -0.0099001 0.0013289 -7.450 4.20e-13 ***
## residential_land 0.0012356 0.0005515 2.241 0.02550 *
## non_retail_business -0.0006142 0.0023987 -0.256 0.79801
## river_view 0.0903880 0.0348289 2.595 0.00973 **
## I(rooms^2) 0.0086600 0.0012794 6.769 3.70e-11 ***
## distance -0.0320597 0.0071567 -4.480 9.30e-06 ***
## access_to_highways 0.0119458 0.0026615 4.488 8.94e-06 ***
## property_tax_rate -0.0006772 0.0001518 -4.462 1.01e-05 ***
## school -0.0281128 0.0050180 -5.602 3.52e-08 ***
## b 0.0004521 0.0001083 4.174 3.53e-05 ***
## low_status_pop -0.0296588 0.0018905 -15.688 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1923 on 494 degrees of freedom
## Multiple R-squared: 0.7836, Adjusted R-squared: 0.7788
## F-statistic: 162.6 on 11 and 494 DF, p-value: < 2.2e-16
AIC (modelo_multiple_3)
## [1] -218.8483
#-- Identify the presence of heteroscedasticity --
bptest(modelo_multiple_3)
##
## studentized Breusch-Pagan test
##
## data: modelo_multiple_3
## BP = 70.556, df = 11, p-value = 9.577e-11
#-- Identify the presence of multicollinearity --
vif(modelo_multiple_3)
## crime_rate residential_land non_retail_business river_view
## 1.784934 2.259964 3.699395 1.069104
## I(rooms^2) distance access_to_highways property_tax_rate
## 1.843563 3.102491 7.336924 8.937588
## school b low_status_pop
## 1.612336 1.335837 2.489849
#-- Identify the presence of normality of residuals --
ols_test_normality(modelo_multiple_3)
## -----------------------------------------------
## Test Statistic pvalue
## -----------------------------------------------
## Shapiro-Wilk 0.9357 0.0000
## Kolmogorov-Smirnov 0.09 6e-04
## Cramer-von Mises 119.7145 0.0000
## Anderson-Darling 7.3549 0.0000
## -----------------------------------------------
hist(modelo_multiple_3$residuals)
