library(foreign)
library(dplyr) # data manipulation
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(forcats) # to work with categorical variables
library(ggplot2) # data visualization
library(readr) # read specific csv files
library(janitor) # data exploration and cleaning
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(Hmisc) # several useful functions for data analysis
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
library(psych) # functions for multivariate analysis
##
## Attaching package: 'psych'
## The following object is masked from 'package:Hmisc':
##
## describe
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(naniar) # summaries and visualization of missing values NA's
library(dlookr) # summaries and visualization of missing values NA's
##
## Attaching package: 'dlookr'
## The following object is masked from 'package:psych':
##
## describe
## The following object is masked from 'package:Hmisc':
##
## describe
## The following object is masked from 'package:base':
##
## transform
library(corrplot) # correlation plots
## corrplot 0.92 loaded
library(jtools) # presentation of regression analysis
##
## Attaching package: 'jtools'
## The following object is masked from 'package:Hmisc':
##
## %nin%
library(lmtest) # diagnostic checks - linear regression analysis
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(car) # diagnostic checks - linear regression analysis
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
## The following object is masked from 'package:dplyr':
##
## recode
library(olsrr) # diagnostic checks - linear regression analysis
##
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
##
## rivers
library(naniar) # identifying missing values
#library(funModeling) # create frequency tables
library(stargazer) # create publication quality tables
##
## Please cite as:
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
library(effects) # displays for linear and other regression models
## Registered S3 method overwritten by 'survey':
## method from
## summary.pps dlookr
## lattice theme set by effectsTheme()
## See ?effectsTheme for details.
library(tidyverse) # collection of R packages designed for data science
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.1 ✔ tidyr 1.3.0
## ✔ stringr 1.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ psych::%+%() masks ggplot2::%+%()
## ✖ psych::alpha() masks ggplot2::alpha()
## ✖ tidyr::extract() masks dlookr::extract()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ car::recode() masks dplyr::recode()
## ✖ purrr::some() masks car::some()
## ✖ Hmisc::src() masks dplyr::src()
## ✖ Hmisc::summarize() masks dplyr::summarize()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret) # Classification and Regression Training
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Loaded glmnet 4.1-7
library(ggplot2)
library(car)
library(lmtest)
#file.choose()
dataset <- read.csv("/Users/genarorodriguezalcantara/Desktop/Tec/Introduction to econometrics/real_estate_data.csv")
##A()
# Identify the missing values:
#is.na(dataset)
sum(is.na(dataset))
## [1] 0
#gg_miss_var(dataset)
# Show the structure of the dataset:
str(dataset)
## 'data.frame': 506 obs. of 15 variables:
## $ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
## $ cmedv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 22.1 16.5 18.9 ...
## $ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ chas : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ rm : num 6.58 6.42 7.18 7 7.15 ...
## $ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ dis : num 4.09 4.97 4.97 6.06 6.06 ...
## $ rad : int 1 2 2 3 3 3 5 5 5 5 ...
## $ tax : int 296 242 242 222 222 222 311 311 311 311 ...
## $ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ b : num 397 397 393 395 397 ...
## $ lstat : num 4.98 9.14 4.03 2.94 5.33 ...
# Complement with a descriptive analysis:
summary(dataset)
## medv cmedv crim zn
## Min. : 5.00 Min. : 5.00 Min. : 0.00632 Min. : 0.00
## 1st Qu.:17.02 1st Qu.:17.02 1st Qu.: 0.08205 1st Qu.: 0.00
## Median :21.20 Median :21.20 Median : 0.25651 Median : 0.00
## Mean :22.53 Mean :22.53 Mean : 3.61352 Mean : 11.36
## 3rd Qu.:25.00 3rd Qu.:25.00 3rd Qu.: 3.67708 3rd Qu.: 12.50
## Max. :50.00 Max. :50.00 Max. :88.97620 Max. :100.00
## indus chas nox rm
## Min. : 0.46 Min. :0.00000 Min. :0.3850 Min. :3.561
## 1st Qu.: 5.19 1st Qu.:0.00000 1st Qu.:0.4490 1st Qu.:5.886
## Median : 9.69 Median :0.00000 Median :0.5380 Median :6.208
## Mean :11.14 Mean :0.06917 Mean :0.5547 Mean :6.285
## 3rd Qu.:18.10 3rd Qu.:0.00000 3rd Qu.:0.6240 3rd Qu.:6.623
## Max. :27.74 Max. :1.00000 Max. :0.8710 Max. :8.780
## age dis rad tax
## Min. : 2.90 Min. : 1.130 Min. : 1.000 Min. :187.0
## 1st Qu.: 45.02 1st Qu.: 2.100 1st Qu.: 4.000 1st Qu.:279.0
## Median : 77.50 Median : 3.207 Median : 5.000 Median :330.0
## Mean : 68.57 Mean : 3.795 Mean : 9.549 Mean :408.2
## 3rd Qu.: 94.08 3rd Qu.: 5.188 3rd Qu.:24.000 3rd Qu.:666.0
## Max. :100.00 Max. :12.127 Max. :24.000 Max. :711.0
## ptratio b lstat
## Min. :12.60 Min. : 0.32 Min. : 1.73
## 1st Qu.:17.40 1st Qu.:375.38 1st Qu.: 6.95
## Median :19.05 Median :391.44 Median :11.36
## Mean :18.46 Mean :356.67 Mean :12.65
## 3rd Qu.:20.20 3rd Qu.:396.23 3rd Qu.:16.95
## Max. :22.00 Max. :396.90 Max. :37.97
statistics_dt <- summary(dataset)
# Extract the desired statistics (summary() rows: 1 = Min., 2 = 1st Qu., 3 = Median, 4 = Mean, 5 = 3rd Qu., 6 = Max.)
# Note: summary() does not report the standard deviation; see the sketch after this block.
mean_value <- statistics_dt[4, ]
median_value <- statistics_dt[3, ]
minimum_value <- statistics_dt[1, ]
maximum_value <- statistics_dt[6, ]
# Print the descriptive statistics
cat("Mean:", mean_value, "\n")
## Mean: Mean :22.53 Mean :22.53 Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917 Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795 Mean : 9.549 Mean :408.2 Mean :18.46 Mean :356.67 Mean :12.65
cat("Median:", median_value, "\n")
## Median: Median :21.20 Median :21.20 Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000 Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207 Median : 5.000 Median :330.0 Median :19.05 Median :391.44 Median :11.36
cat("Minimum:", minimum_value, "\n")
## Minimum: Min. : 5.00 Min. : 5.00 Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000 Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130 Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 0.32 Min. : 1.73
cat("Maximum:", maximum_value, "\n")
## Maximum: Max. :50.00 Max. :50.00 Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000 Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127 Max. :24.000 Max. :711.0 Max. :22.00 Max. :396.90 Max. :37.97
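# A minimal sketch of how the standard deviation of each variable could be obtained, since
# summary() does not report it (this assumes all columns of dataset are numeric, as str() shows above):
standard_deviation <- sapply(dataset, sd)
cat("Standard Deviation:", round(standard_deviation, 3), "\n")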
#b()
#Pair-Wised Graphs
#1
ggplot(data = dataset, aes(x = medv, y = rm)) +
geom_point() +
labs(x = "medv", y = "Dependent Variable") +
ggtitle("Scatter Plot: Median Value (medv) vs. Rooms (rm)")
#2
ggplot(data = dataset, aes(x = medv, y = lstat)) +
geom_point() +
labs(x = "Median Value (medv)", y = "Percentage of lower status (lstat)") +
ggtitle("Scatter Plot: Median Value (medv) vs. Percentage of lower status (lstat)")
# Histogram of the dependent variable
hist(dataset$medv)
#Correlation plot
dataset_cor1 <- dataset %>% select(-chas, -rad)
summary(dataset_cor1)
## medv cmedv crim zn
## Min. : 5.00 Min. : 5.00 Min. : 0.00632 Min. : 0.00
## 1st Qu.:17.02 1st Qu.:17.02 1st Qu.: 0.08205 1st Qu.: 0.00
## Median :21.20 Median :21.20 Median : 0.25651 Median : 0.00
## Mean :22.53 Mean :22.53 Mean : 3.61352 Mean : 11.36
## 3rd Qu.:25.00 3rd Qu.:25.00 3rd Qu.: 3.67708 3rd Qu.: 12.50
## Max. :50.00 Max. :50.00 Max. :88.97620 Max. :100.00
## indus nox rm age
## Min. : 0.46 Min. :0.3850 Min. :3.561 Min. : 2.90
## 1st Qu.: 5.19 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02
## Median : 9.69 Median :0.5380 Median :6.208 Median : 77.50
## Mean :11.14 Mean :0.5547 Mean :6.285 Mean : 68.57
## 3rd Qu.:18.10 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08
## Max. :27.74 Max. :0.8710 Max. :8.780 Max. :100.00
## dis tax ptratio b
## Min. : 1.130 Min. :187.0 Min. :12.60 Min. : 0.32
## 1st Qu.: 2.100 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38
## Median : 3.207 Median :330.0 Median :19.05 Median :391.44
## Mean : 3.795 Mean :408.2 Mean :18.46 Mean :356.67
## 3rd Qu.: 5.188 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23
## Max. :12.127 Max. :711.0 Max. :22.00 Max. :396.90
## lstat
## Min. : 1.73
## 1st Qu.: 6.95
## Median :11.36
## Mean :12.65
## 3rd Qu.:16.95
## Max. :37.97
corrplot(cor(dataset_cor1),type = "upper",order = "hclust",addCoef.col = "black")
# Multiple Linear Regression Analysis
dmodel1<-lm(medv ~ rm+nox+lstat+crim+tax+indus,data=dataset_cor1)
summary(dmodel1)
##
## Call:
## lm(formula = medv ~ rm + nox + lstat + crim + tax + indus, data = dataset_cor1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.782 -3.553 -1.137 1.765 30.464
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.481981 3.299447 -0.752 0.45226
## rm 5.213899 0.444913 11.719 < 2e-16 ***
## nox 3.850583 3.474274 1.108 0.26826
## lstat -0.560857 0.053869 -10.412 < 2e-16 ***
## crim -0.059567 0.035764 -1.666 0.09643 .
## tax -0.006609 0.002392 -2.763 0.00593 **
## indus 0.010882 0.063367 0.172 0.86372
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.459 on 499 degrees of freedom
## Multiple R-squared: 0.6519, Adjusted R-squared: 0.6477
## F-statistic: 155.8 on 6 and 499 DF, p-value: < 2.2e-16
dmodel2<-lm(log(medv) ~ rm+nox+lstat+crim+tax+indus,data=dataset_cor1)
summary(dmodel2)
##
## Call:
## lm(formula = log(medv) ~ rm + nox + lstat + crim + tax + indus,
## data = dataset_cor1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.74079 -0.13175 -0.01677 0.10572 0.93981
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.626e+00 1.305e-01 20.118 < 2e-16 ***
## rm 1.432e-01 1.760e-02 8.136 3.26e-15 ***
## nox 5.965e-02 1.374e-01 0.434 0.664477
## lstat -3.031e-02 2.131e-03 -14.223 < 2e-16 ***
## crim -8.308e-03 1.415e-03 -5.872 7.87e-09 ***
## tax -3.153e-04 9.461e-05 -3.332 0.000926 ***
## indus 1.576e-03 2.507e-03 0.629 0.529846
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.216 on 499 degrees of freedom
## Multiple R-squared: 0.7242, Adjusted R-squared: 0.7209
## F-statistic: 218.4 on 6 and 499 DF, p-value: < 2.2e-16
# Note: inside a formula, (cmedv^2) without I() is interpreted as cmedv itself and the duplicated
# nox term is dropped, so the fitted model is log(medv) ~ cmedv + nox + rm + lstat + crim
# (as the coefficient table below shows); use I(cmedv^2) to include a true quadratic term.
dmodel3<-lm(log(medv) ~ cmedv+(cmedv^2)+nox+rm+nox+lstat+crim,data=dataset_cor1)
summary(dmodel3) #STAR
##
## Call:
## lm(formula = log(medv) ~ cmedv + (cmedv^2) + nox + rm + nox +
## lstat + crim, data = dataset_cor1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.51105 -0.02841 0.00692 0.04302 0.39759
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.7011883 0.0557313 48.468 < 2e-16 ***
## cmedv 0.0361864 0.0007583 47.720 < 2e-16 ***
## nox -0.0862931 0.0452409 -1.907 0.057 .
## rm -0.0453519 0.0084520 -5.366 1.23e-07 ***
## lstat -0.0098867 0.0010037 -9.850 < 2e-16 ***
## crim -0.0066225 0.0005573 -11.883 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.09261 on 500 degrees of freedom
## Multiple R-squared: 0.9492, Adjusted R-squared: 0.9487
## F-statistic: 1868 on 5 and 500 DF, p-value: < 2.2e-16
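# Optional sketch: stargazer is loaded above, so the three specifications can also be compared
# side by side in one text table (assumes dmodel1, dmodel2 and dmodel3 as estimated above).
stargazer(dmodel1, dmodel2, dmodel3, type = "text",
          column.labels = c("levels", "log-level", "log-level + cmedv"))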
### Diagnostic Tests
library(car)
library(lmtest)
# Model: dmodel1
# Multicollinearity
vif_values_dmodel1 <- car::vif(dmodel1)
# Note: 1/VIF is the tolerance, so the value stored below is the largest tolerance rather than a
# true condition number (see the kappa() sketch after these diagnostics).
condition_number_dmodel1 <- 1 / min(vif_values_dmodel1)
# Heteroscedasticity
plot(dmodel1, which = 1) # Residuals vs. Fitted plot
bp_test_dmodel1 <- lmtest::bptest(dmodel1)
# Normality of Residuals
qqnorm(dmodel1$residuals)
qqline(dmodel1$residuals)
shapiro_test_dmodel1 <- shapiro.test(dmodel1$residuals)
# Print the results
cat("Model: dmodel1\n")
## Model: dmodel1
print(vif_values_dmodel1)
## rm nox lstat crim tax indus
## 1.656208 2.746972 2.507990 1.603871 2.753383 3.202960
print(condition_number_dmodel1)
## [1] 0.6234914
print(bp_test_dmodel1)
##
## studentized Breusch-Pagan test
##
## data: dmodel1
## BP = 33.193, df = 6, p-value = 9.628e-06
print(shapiro_test_dmodel1)
##
## Shapiro-Wilk normality test
##
## data: dmodel1$residuals
## W = 0.87497, p-value < 2.2e-16
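# Sketch: a design-matrix condition number can be computed with base R's kappa(); a common rule of
# thumb flags values above roughly 30 when the predictors are scaled (assumes dmodel1 from above).
kappa(scale(model.matrix(dmodel1)[, -1]), exact = TRUE)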
# Model: dmodel2
# Multicollinearity
vif_values_dmodel2 <- car::vif(dmodel2)
condition_number_dmodel2 <- 1 / min(vif_values_dmodel2)
# Heteroscedasticity
plot(dmodel2, which = 1) # Residuals vs. Fitted plot
bp_test_dmodel2 <- lmtest::bptest(dmodel2)
# Normality of Residuals
qqnorm(dmodel2$residuals)
qqline(dmodel2$residuals)
shapiro_test_dmodel2 <- shapiro.test(dmodel2$residuals)
# Print the results
cat("\nModel: dmodel2\n")
##
## Model: dmodel2
print(vif_values_dmodel2)
## rm nox lstat crim tax indus
## 1.656208 2.746972 2.507990 1.603871 2.753383 3.202960
print(condition_number_dmodel2)
## [1] 0.6234914
print(bp_test_dmodel2)
##
## studentized Breusch-Pagan test
##
## data: dmodel2
## BP = 48.152, df = 6, p-value = 1.102e-08
print(shapiro_test_dmodel2)
##
## Shapiro-Wilk normality test
##
## data: dmodel2$residuals
## W = 0.94569, p-value = 1.164e-12
# Model: dmodel3
# Multicollinearity
vif_values_dmodel3 <- car::vif(dmodel3)
condition_number_dmodel3 <- 1 / min(vif_values_dmodel3)
# Heteroscedasticity
plot(dmodel3, which = 1) # Residuals vs. Fitted plot
bp_test_dmodel3 <- lmtest::bptest(dmodel3)
# Normality of Residuals
qqnorm(dmodel3$residuals)
qqline(dmodel3$residuals)
shapiro_test_dmodel3 <- shapiro.test(dmodel3$residuals)
# Print the results
cat("\nModel: dmodel3\n")
##
## Model: dmodel3
print(vif_values_dmodel3)
## cmedv nox rm lstat crim
## 2.854814 1.618314 2.076620 3.024992 1.353156
print(condition_number_dmodel3)
## [1] 0.739013
print(bp_test_dmodel3)
##
## studentized Breusch-Pagan test
##
## data: dmodel3
## BP = 95.83, df = 5, p-value < 2.2e-16
print(shapiro_test_dmodel3)
##
## Shapiro-Wilk normality test
##
## data: dmodel3$residuals
## W = 0.87256, p-value < 2.2e-16
# Create polynomial terms
#dataset$independent_variable1_squared <- dataset$independent_variable1^2
#dataset$independent_variable2_cubed <- dataset$independent_variable2^3
# Fit polynomial - multiple linear regression model
#model_poly <- lm(dependent_variable ~ independent_variable1 + independent_variable2 + independent_variable1_squared + independent_variable2_cubed, data = dataset)
#summary(model_poly)
#Polynomial - Multiple linear regression
dmodel4 <- lm(log(medv) ~ rm + lstat + I(lstat^2) + indus + I(indus^3) + tax, data = dataset)
summary(dmodel4)
##
## Call:
## lm(formula = log(medv) ~ rm + lstat + I(lstat^2) + indus + I(indus^3) +
## tax, data = dataset)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.88145 -0.12352 -0.00767 0.11651 0.90151
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.055e+00 1.364e-01 22.407 < 2e-16 ***
## rm 1.186e-01 1.787e-02 6.636 8.42e-11 ***
## lstat -6.316e-02 5.736e-03 -11.011 < 2e-16 ***
## I(lstat^2) 8.763e-04 1.548e-04 5.661 2.54e-08 ***
## indus 6.401e-03 4.425e-03 1.446 0.149
## I(indus^3) -1.494e-06 6.348e-06 -0.235 0.814
## tax -5.361e-04 8.689e-05 -6.169 1.42e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2164 on 499 degrees of freedom
## Multiple R-squared: 0.723, Adjusted R-squared: 0.7197
## F-statistic: 217.1 on 6 and 499 DF, p-value: < 2.2e-16
vif(dmodel4)
## rm lstat I(lstat^2) indus I(indus^3) tax
## 1.700761 18.090760 14.398831 9.938230 6.686220 2.312533
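# The very high VIFs for lstat and I(lstat^2) are expected when raw polynomial terms are entered.
# A sketch of two standard remedies under the same specification as dmodel4: centering lstat before
# squaring it, or using orthogonal polynomials via poly().
dmodel4_centered <- lm(log(medv) ~ rm + I(lstat - mean(lstat)) + I((lstat - mean(lstat))^2) +
                         indus + I(indus^3) + tax, data = dataset)
car::vif(dmodel4_centered)
dmodel4_poly <- lm(log(medv) ~ rm + poly(lstat, 2) + indus + I(indus^3) + tax, data = dataset)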
bptest(dmodel4)
##
## studentized Breusch-Pagan test
##
## data: dmodel4
## BP = 66.714, df = 6, p-value = 1.926e-12
AIC(dmodel4)
## [1] -104.0672
# Model: dmodel4
# Multicollinearity
vif_values_dmodel4 <- car::vif(dmodel4)
condition_number_dmodel4 <- 1 / min(vif_values_dmodel4)
# Heteroscedasticity
plot(dmodel4, which = 1) # Residuals vs. Fitted plot
bp_test_dmodel4 <- lmtest::bptest(dmodel4)
# Normality of Residuals
qqnorm(dmodel4$residuals)
qqline(dmodel4$residuals)
shapiro_test_dmodel4 <- shapiro.test(dmodel4$residuals)
# Print the results
cat("\nModel: dmodel4\n")
##
## Model: dmodel4
print(vif_values_dmodel4)
## rm lstat I(lstat^2) indus I(indus^3) tax
## 1.700761 18.090760 14.398831 9.938230 6.686220 2.312533
print(condition_number_dmodel4)
## [1] 0.5879722
print(bp_test_dmodel4)
##
## studentized Breusch-Pagan test
##
## data: dmodel4
## BP = 66.714, df = 6, p-value = 1.926e-12
print(shapiro_test_dmodel4)
##
## Shapiro-Wilk normality test
##
## data: dmodel4$residuals
## W = 0.9666, p-value = 2.542e-09
# Detect Heteroscedasticity
boxplot(formula=medv ~ rm, data=dataset_cor1, col=cm.colors(3))
boxplot(formula=medv ~ nox, data=dataset_cor1, col=cm.colors(3))
boxplot(formula=medv ~ lstat, data=dataset_cor1, col=cm.colors(3))
boxplot(formula=medv ~ crim, data=dataset_cor1, col=cm.colors(3))
boxplot(formula=medv ~ cmedv, data=dataset_cor1, col=cm.colors(3))
#_______________
### Split the Data into Training Data vs Test Data
# Let's randomly split the data into train and test sets
#library(caret)
set.seed(123) ### sets the random seed for reproducibility of results
training.samples<-dataset_cor1$medv %>%
createDataPartition(p=0.75,list=FALSE) ### use 75% of the data to build the predictive model
# Model Accuracy
# RMSE - Root Mean Square Error: a larger RMSE indicates a larger gap between predicted and observed
# values (a poorer fit); a smaller RMSE indicates a better model (see the sketch after the train/test split below).
train.data<-dataset_cor1[training.samples, ] ### training data to fit the linear regression model
test.data<-dataset_cor1[-training.samples, ] ### testing data to test the linear regression model
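# A minimal sketch of the definition behind caret::RMSE (root of the mean squared prediction error):
rmse_manual <- function(pred, obs) sqrt(mean((pred - obs)^2))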
# Note: this model is fitted on test.data rather than train.data; fitting on train.data and
# predicting on test.data would give an out-of-sample error comparable to the LASSO/Ridge results below.
selected_model<-lm(log(medv) ~ rm+nox+lstat+crim,data=test.data)
summary(selected_model)
##
## Call:
## lm(formula = log(medv) ~ rm + nox + lstat + crim, data = test.data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.64115 -0.16352 -0.03409 0.10367 0.83148
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.388632 0.325050 7.349 2.67e-11 ***
## rm 0.168362 0.042596 3.953 0.000131 ***
## nox 0.173243 0.233199 0.743 0.458994
## lstat -0.034048 0.004417 -7.708 4.10e-12 ***
## crim -0.013377 0.002809 -4.762 5.42e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2586 on 120 degrees of freedom
## Multiple R-squared: 0.6859, Adjusted R-squared: 0.6755
## F-statistic: 65.52 on 4 and 120 DF, p-value: < 2.2e-16
# Note: fitted.values are on the log(medv) scale, so this compares log-scale predictions with level
# medv and overstates the error; exponentiate the fitted values (or use log(test.data$medv)) for a scale-consistent figure.
RMSE(selected_model$fitted.values,test.data$medv) ### Root Mean Square Error
## [1] 23.06438
#################################
### LASSO REGRESSION ANALYSIS ###
#################################
# glmnet only accepts a numeric predictor matrix, so the predictors are converted with model.matrix().
# Independent variables
x<-model.matrix(log(medv) ~ rm+nox+lstat+crim,train.data)[,-1] ### predictor matrix (only the right-hand side of the formula matters here)
y<-train.data$medv ### dependent variable (on the level scale)
# In estimating a LASSO regression it is important to choose the lambda that minimizes the cross-validated prediction error.
# Cross-validation ensures that every observation in the training data has a chance of appearing in both the fitting and validation folds.
# Find the best lambda using cross-validation.
set.seed(123)
cv.lasso<-cv.glmnet(x,y,alpha=1) # alpha = 1 for LASSO
# Display the best lambda value
cv.lasso$lambda.min ### lambda: a numeric value defining the amount of shrinkage; lambda.min minimizes the cross-validated error (the larger the lambda, the stronger the penalization)
## [1] 0.03545468
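# Sketch (assuming cv.lasso from above): inspect the cross-validation curve and the more conservative
# lambda.1se, the largest lambda whose error is within one standard error of the minimum.
plot(cv.lasso)
cv.lasso$lambda.1se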
# Fit the final model on the training data
lassomodel<-glmnet(x,y,alpha=1,lambda=cv.lasso$lambda.min)
# Display regression coefficients
coef(lassomodel)
## 5 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) -0.17268014
## rm 4.94801372
## nox -3.54323664
## lstat -0.51037898
## crim -0.09951622
# Make predictions on the test data
x.test<-model.matrix(log(medv) ~ rm+nox+lstat+crim,test.data)[,-1] ### test-set predictor matrix
lassopredictions <- lassomodel %>% predict(x.test) %>% as.vector()
# Model Accuracy
data.frame(
RMSE = RMSE(lassopredictions, test.data$medv),
Rsquare = R2(lassopredictions, test.data$medv))
## RMSE Rsquare
## 1 7.198582 0.5994947
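# Sketch of an OLS baseline fitted on train.data and evaluated on test.data at the level scale, so
# that its accuracy is directly comparable to the LASSO figures above (same predictors assumed).
ols_baseline <- lm(medv ~ rm + nox + lstat + crim, data = train.data)
ols_pred <- predict(ols_baseline, newdata = test.data)
data.frame(RMSE = RMSE(ols_pred, test.data$medv),
           Rsquare = R2(ols_pred, test.data$medv))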
### Visualizing LASSO regression results
# Helper: label each coefficient path with its variable name at the end of the path (smallest lambda)
lbs_fun <- function(fit, offset_x=1, ...) {
  L <- length(fit$lambda)              # index of the smallest lambda in the path
  x <- log(fit$lambda[L]) + offset_x   # x position (log-lambda scale) for the labels
  y <- fit$beta[, L]                   # coefficient values at that lambda
  labs <- names(y)
  text(x, y, labels=labs, ...)
}
lasso<-glmnet(scale(x),y,alpha=1)
plot(lasso,xvar="lambda",label=T)
lbs_fun(lasso)
abline(v=cv.lasso$lambda.min,col="red",lty=2)
abline(v=cv.lasso$lambda.1se,col="blue",lty=2)
# Model: lassomodel
# Multicollinearity
#vif_values_lassomodel <- car::vif(lassomodel)
#condition_number_lassomodel <- 1 / min(vif_values_lassomodel)
# Heteroscedasticity
#plot(lassomodel, which = 1) # Residuals vs. Fitted plot
#bp_test_lassomodel <- lmtest::bptest(lassomodel)
# Normality of Residuals
#qqnorm(lassomodel$residuals)
#qqline(lassomodel$residuals)
#shapiro_test_lassomodel <- shapiro.test(lassomodel$residuals)
# Print the results
#cat("\nModel: lassomodel\n")
#print(vif_values_lassomodel)
#print(condition_number_lassomodel)
#print(bp_test_lassomodel)
#print(shapiro_test_lassomodel)
#################################
### RIDGE REGRESSION ANALYSIS ###
#################################
# Find the best lambda using cross-validation
set.seed(123) # x: independent variables | y: dependent variable
# Note: alpha = 0.1 fits an elastic net in this cross-validation step; pure ridge regression
# corresponds to alpha = 0, as used for the final model below.
cv.ridge <- cv.glmnet(x,y,alpha=0.1)
# Display the best lambda value
cv.ridge$lambda.min # lambda: a numeric value defining the amount of shrinkage; lambda.min minimizes the cross-validated error (the larger the lambda, the stronger the penalization)
## [1] 0.3230498
# Fit the final model on the training data
ridgemodel<-glmnet(x,y,alpha=0,lambda=cv.ridge$lambda.min)
# Display regression coefficients
coef(ridgemodel)
## 5 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) 0.4190616
## rm 4.8925589
## nox -4.4568886
## lstat -0.4884040
## crim -0.1045264
# Make predictions on the test data
x.test<-model.matrix(log(medv) ~ rm+nox+lstat+crim,test.data)[,-1]
ridgepredictions<-ridgemodel %>% predict(x.test) %>% as.vector()
# Model Accuracy
data.frame(
RMSE = RMSE(ridgepredictions, test.data$medv),
Rsquare = R2(ridgepredictions, test.data$medv)
)
## RMSE Rsquare
## 1 7.241001 0.5966794
### visualizing ridge regression results
ridge<-glmnet(scale(x),y,alpha=0)
plot(ridge, xvar = "lambda", label=T)
lbs_fun(ridge)
abline(v=cv.ridge$lambda.min, col = "red", lty=2)
abline(v=cv.ridge$lambda.1se, col="blue", lty=2)
# Summary of model accuracy, using the figures reported above: the linear regression RMSE is the
# log-scale comparison noted earlier and its R2 is the in-sample multiple R-squared; the Lasso and
# Ridge values are the test-set RMSE / R-squared results shown above.
tab <- matrix(c(23.06, 7.20, 7.24, 0.69, 0.60, 0.60), ncol=2, byrow=FALSE)
colnames(tab) <- c('RMSE','R2')
rownames(tab) <- c('Linear Regression','Lasso','Ridge')
tab <- as.table(tab)
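# Display the comparison table
print(tab)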
# Model: ridgemodel
# Multicollinearity
#vif_values_ridgemodel <- car::vif(ridgemodel)
#condition_number_ridgemodel <- 1 / min(vif_values_ridgemodel)
# Heteroscedasticity
#plot(ridgemodel, which = 1) # Residuals vs. Fitted plot
#bp_test_ridgemodel <- lmtest::bptest(ridgemodel)
# Normality of Residuals
#qqnorm(ridgemodel$residuals)
#qqline(ridgemodel$residuals)
#shapiro_test_ridgemodel <- shapiro.test(ridgemodel$residuals)
# Print the results
#cat("\nModel: ridgemodel\n")
#print(vif_values_ridgemodel)
#print(condition_number_ridgemodel)
#print(bp_test_ridgemodel)
#print(shapiro_test_ridgemodel)
To compare the results and choose the regression model that best fits the data, let us analyze the two options provided. Option 1 uses Ridge regression. In this option, the best lambda value (the amount of shrinkage) is determined using cross-validation; the lambda value is 0.3230498. The final model is fitted with this lambda and the regression coefficients are displayed. Model accuracy is evaluated with the RMSE (root mean square error) and R-squared values. In this case, the RMSE is 7.241001 and the R-squared is 0.5966794.
Option 2 uses linear regression. The model formula includes several variables such as cmedv, nox, rm, lstat and crim. The coefficients, standard errors, t values and p-values of each variable are displayed. Model accuracy is evaluated with the residual standard error, multiple R-squared, adjusted R-squared and F-statistic. In this case, the residual standard error is 0.09261, the multiple R-squared is 0.9492 and the adjusted R-squared is 0.9487.
To choose the best model, several factors must be considered, such as model accuracy, interpretability and the specific requirements of the problem at hand. Both options have strengths and weaknesses.
Option 1 (Ridge regression) yields a lower RMSE than Option 2 (linear regression). However, the R-squared of Option 1 is 0.5966794, indicating that the model explains only about 59.67% of the variance in the data. On the other hand, Option 2 (linear regression) has a higher R-squared of 0.9492, suggesting that the model explains approximately 94.92% of the variance in the data. In addition, Option 2 provides p-values for each coefficient, which allows us to assess the statistical significance of the variables. This can be useful for interpreting the impact of each variable on the target variable.
Considering the higher R-squared and the ability to assess the statistical significance of the variables, Option 2 (linear regression) appears to fit the data better. However, it is important to note that the final decision should be based on the specific requirements and context of the problem being solved.