library(foreign)
library(dplyr)        # data manipulation 
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(forcats)      # to work with categorical variables
library(ggplot2)      # data visualization 
library(readr)        # read specific csv files
library(janitor)      # data exploration and cleaning 
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(Hmisc)        # several useful functions for data analysis 
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, units
library(psych)        # functions for multivariate analysis 
## 
## Attaching package: 'psych'
## The following object is masked from 'package:Hmisc':
## 
##     describe
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(naniar)       # summaries and visualization of missing values NAs
#library(dlookr)       # summaries and visualization of missing values NAs
library(corrplot)     # correlation plots
## corrplot 0.92 loaded
library(jtools)       # presentation of regression analysis 
## 
## Attaching package: 'jtools'
## The following object is masked from 'package:Hmisc':
## 
##     %nin%
library(lmtest)       # diagnostic checks - linear regression analysis 
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(car)          # diagnostic checks - linear regression analysis
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
## The following object is masked from 'package:dplyr':
## 
##     recode
library(olsrr)        # diagnostic checks - linear regression analysis 
## 
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
## 
##     rivers
library(naniar)       # identifying missing values
#library(funModeling)  # create frequency tables
library(stargazer)    # create publication quality tables
## 
## Please cite as:
##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
#library(effects)      # displays for linear and other regression models
data = read.csv("real_estate_data.csv") 
head(data)
##   medv cmedv    crim zn indus chas   nox    rm  age    dis rad tax ptratio
## 1 24.0  24.0 0.00632 18  2.31    0 0.538 6.575 65.2 4.0900   1 296    15.3
## 2 21.6  21.6 0.02731  0  7.07    0 0.469 6.421 78.9 4.9671   2 242    17.8
## 3 34.7  34.7 0.02729  0  7.07    0 0.469 7.185 61.1 4.9671   2 242    17.8
## 4 33.4  33.4 0.03237  0  2.18    0 0.458 6.998 45.8 6.0622   3 222    18.7
## 5 36.2  36.2 0.06905  0  2.18    0 0.458 7.147 54.2 6.0622   3 222    18.7
## 6 28.7  28.7 0.02985  0  2.18    0 0.458 6.430 58.7 6.0622   3 222    18.7
##        b lstat
## 1 396.90  4.98
## 2 396.90  9.14
## 3 392.83  4.03
## 4 394.63  2.94
## 5 396.90  5.33
## 6 394.12  5.21
#Renombrando variables

colnames(data) <- c("median_value", "cmedian_value", "crime_rate","residential_land", "non_retail_business", "river_view", "nitric_oxid", "rooms", "age", "distance", "access_to_highways", "property_tax_rate", "school", "b", "low_status_pop")
# Detect Heteroscedasticity 
boxplot(formula=median_value ~ crime_rate, data=data, col=cm.colors(3))

modelo_multiple_1 <- lm(formula = median_value ~ crime_rate + residential_land + non_retail_business +  river_view + rooms +  distance + access_to_highways + property_tax_rate + school + b + low_status_pop, data = data)
summary(modelo_multiple_1)
## 
## Call:
## lm(formula = median_value ~ crime_rate + residential_land + non_retail_business + 
##     river_view + rooms + distance + access_to_highways + property_tax_rate + 
##     school + b + low_status_pop, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -16.8808  -2.8171  -0.7587   1.7176  26.6875 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         23.108557   4.354055   5.307 1.68e-07 ***
## crime_rate          -0.098378   0.033498  -2.937 0.003471 ** 
## residential_land     0.050975   0.013881   3.672 0.000267 ***
## non_retail_business -0.060498   0.060393  -1.002 0.316957    
## river_view           2.486994   0.877634   2.834 0.004789 ** 
## rooms                3.894948   0.417198   9.336  < 2e-16 ***
## distance            -1.121279   0.179465  -6.248 8.99e-10 ***
## access_to_highways   0.265392   0.067009   3.961 8.58e-05 ***
## property_tax_rate   -0.013877   0.003824  -3.629 0.000315 ***
## school              -0.750648   0.126124  -5.952 5.04e-09 ***
## b                    0.010097   0.002732   3.696 0.000244 ***
## low_status_pop      -0.564127   0.047926 -11.771  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.846 on 494 degrees of freedom
## Multiple R-squared:  0.7284, Adjusted R-squared:  0.7224 
## F-statistic: 120.5 on 11 and 494 DF,  p-value: < 2.2e-16
AIC (modelo_multiple_1)
## [1] 3046.858
#-- Identify the presence of heteroscedasticity --
bptest(modelo_multiple_1)
## 
##  studentized Breusch-Pagan test
## 
## data:  modelo_multiple_1
## BP = 60.495, df = 11, p-value = 7.502e-09
#-- Identify the presence of multicollinearity --
vif(modelo_multiple_1)
##          crime_rate    residential_land non_retail_business          river_view 
##            1.785523            2.253931            3.691762            1.068674 
##               rooms            distance  access_to_highways   property_tax_rate 
##            1.847962            3.071328            7.321554            8.934464 
##              school                   b      low_status_pop 
##            1.603462            1.338170            2.519030
#-- Identify the presence of normality of residuals --
ols_test_normality(modelo_multiple_1)
## -----------------------------------------------
##        Test             Statistic       pvalue  
## -----------------------------------------------
## Shapiro-Wilk              0.8968         0.0000 
## Kolmogorov-Smirnov        0.1198         0.0000 
## Cramer-von Mises         41.0715         0.0000 
## Anderson-Darling         11.3748         0.0000 
## -----------------------------------------------
hist(modelo_multiple_1$residuals)

modelo_multiple_2 <- lm(formula = log(median_value) ~ crime_rate + residential_land + non_retail_business +  river_view + rooms +  distance + access_to_highways + property_tax_rate + school + b + low_status_pop, data = data)
summary(modelo_multiple_2)
## 
## Call:
## lm(formula = log(median_value) ~ crime_rate + residential_land + 
##     non_retail_business + river_view + rooms + distance + access_to_highways + 
##     property_tax_rate + school + b + low_status_pop, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.74427 -0.11107 -0.01225  0.08947  0.96890 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          3.5216289  0.1749117  20.134  < 2e-16 ***
## crime_rate          -0.0098555  0.0013457  -7.324 9.87e-13 ***
## residential_land     0.0013458  0.0005576   2.413  0.01617 *  
## non_retail_business -0.0010203  0.0024261  -0.421  0.67425    
## river_view           0.0928912  0.0352564   2.635  0.00868 ** 
## rooms                0.0956772  0.0167597   5.709 1.97e-08 ***
## distance            -0.0346369  0.0072095  -4.804 2.06e-06 ***
## access_to_highways   0.0124489  0.0026919   4.625 4.80e-06 ***
## property_tax_rate   -0.0006906  0.0001536  -4.495 8.67e-06 ***
## school              -0.0294347  0.0050667  -5.809 1.12e-08 ***
## b                    0.0004497  0.0001098   4.097 4.89e-05 ***
## low_status_pop      -0.0304928  0.0019253 -15.838  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1947 on 494 degrees of freedom
## Multiple R-squared:  0.7781, Adjusted R-squared:  0.7732 
## F-statistic: 157.5 on 11 and 494 DF,  p-value: < 2.2e-16
AIC (modelo_multiple_2)
## [1] -206.2981
#-- Identify the presence of heteroscedasticity --
bptest(modelo_multiple_2)
## 
##  studentized Breusch-Pagan test
## 
## data:  modelo_multiple_2
## BP = 69.312, df = 11, p-value = 1.65e-10
#-- Identify the presence of multicollinearity --
vif(modelo_multiple_2)
##          crime_rate    residential_land non_retail_business          river_view 
##            1.785523            2.253931            3.691762            1.068674 
##               rooms            distance  access_to_highways   property_tax_rate 
##            1.847962            3.071328            7.321554            8.934464 
##              school                   b      low_status_pop 
##            1.603462            1.338170            2.519030
#-- Identify the presence of normality of residuals --
ols_test_normality(modelo_multiple_2)
## -----------------------------------------------
##        Test             Statistic       pvalue  
## -----------------------------------------------
## Shapiro-Wilk              0.9421         0.0000 
## Kolmogorov-Smirnov        0.0818         0.0023 
## Cramer-von Mises         118.6904        0.0000 
## Anderson-Darling          6.5815         0.0000 
## -----------------------------------------------
hist(modelo_multiple_2$residuals)

modelo_multiple_3 <- lm(formula = log(median_value) ~ crime_rate + residential_land + non_retail_business +  river_view + I(rooms^2) +  distance + access_to_highways + property_tax_rate + school + b + low_status_pop, data = data)
summary(modelo_multiple_3)
## 
## Call:
## lm(formula = log(median_value) ~ crime_rate + residential_land + 
##     non_retail_business + river_view + I(rooms^2) + distance + 
##     access_to_highways + property_tax_rate + school + b + low_status_pop, 
##     data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.74384 -0.10483 -0.01286  0.08970  0.95760 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          3.7274107  0.1340314  27.810  < 2e-16 ***
## crime_rate          -0.0099001  0.0013289  -7.450 4.20e-13 ***
## residential_land     0.0012356  0.0005515   2.241  0.02550 *  
## non_retail_business -0.0006142  0.0023987  -0.256  0.79801    
## river_view           0.0903880  0.0348289   2.595  0.00973 ** 
## I(rooms^2)           0.0086600  0.0012794   6.769 3.70e-11 ***
## distance            -0.0320597  0.0071567  -4.480 9.30e-06 ***
## access_to_highways   0.0119458  0.0026615   4.488 8.94e-06 ***
## property_tax_rate   -0.0006772  0.0001518  -4.462 1.01e-05 ***
## school              -0.0281128  0.0050180  -5.602 3.52e-08 ***
## b                    0.0004521  0.0001083   4.174 3.53e-05 ***
## low_status_pop      -0.0296588  0.0018905 -15.688  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1923 on 494 degrees of freedom
## Multiple R-squared:  0.7836, Adjusted R-squared:  0.7788 
## F-statistic: 162.6 on 11 and 494 DF,  p-value: < 2.2e-16
AIC (modelo_multiple_3)
## [1] -218.8483
#-- Identify the presence of heteroscedasticity --
bptest(modelo_multiple_3)
## 
##  studentized Breusch-Pagan test
## 
## data:  modelo_multiple_3
## BP = 70.556, df = 11, p-value = 9.577e-11
#-- Identify the presence of multicollinearity --
vif(modelo_multiple_3)
##          crime_rate    residential_land non_retail_business          river_view 
##            1.784934            2.259964            3.699395            1.069104 
##          I(rooms^2)            distance  access_to_highways   property_tax_rate 
##            1.843563            3.102491            7.336924            8.937588 
##              school                   b      low_status_pop 
##            1.612336            1.335837            2.489849
#-- Identify the presence of normality of residuals --
ols_test_normality(modelo_multiple_3)
## -----------------------------------------------
##        Test             Statistic       pvalue  
## -----------------------------------------------
## Shapiro-Wilk              0.9357         0.0000 
## Kolmogorov-Smirnov         0.09           6e-04 
## Cramer-von Mises         119.7145        0.0000 
## Anderson-Darling          7.3549         0.0000 
## -----------------------------------------------
hist(modelo_multiple_3$residuals)