library(foreign)
library(dplyr) # data manipulation
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(forcats) # to work with categorical variables
library(ggplot2) # data visualization
library(readr) # read specific csv files
library(janitor) # data exploration and cleaning
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(Hmisc) # several useful functions for data analysis
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
library(psych) # functions for multivariate analysis
##
## Attaching package: 'psych'
## The following object is masked from 'package:Hmisc':
##
## describe
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(naniar) # summaries and visualization of missing values NA's
library(corrplot) # correlation plots
## corrplot 0.92 loaded
library(jtools) # presentation of regression analysis
##
## Attaching package: 'jtools'
## The following object is masked from 'package:Hmisc':
##
## %nin%
library(lmtest) # diagnostic checks - linear regression analysis
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(car) # diagnostic checks - linear regression analysis
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
## The following object is masked from 'package:dplyr':
##
## recode
library(olsrr) # diagnostic checks - linear regression analysis
##
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
##
## rivers
library(naniar) # identifying missing values
library(stargazer) # create publication quality tables
##
## Please cite as:
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
library(effects) # displays for linear and other regression models
## lattice theme set by effectsTheme()
## See ?effectsTheme for details.
library(tidyverse) # collection of R packages designed for data science
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.1 ✔ tidyr 1.3.0
## ✔ stringr 1.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ psych::%+%() masks ggplot2::%+%()
## ✖ psych::alpha() masks ggplot2::alpha()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ car::recode() masks dplyr::recode()
## ✖ purrr::some() masks car::some()
## ✖ Hmisc::src() masks dplyr::src()
## ✖ Hmisc::summarize() masks dplyr::summarize()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret) # Classification and Regression Training
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(glmnet) # methods for prediction and plotting, and functions for cross-validation
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Loaded glmnet 4.1-7
bd = read_csv("C:\\Users\\maria\\OneDrive\\Desktop\\AD 2023\\ECONOMETRICS\\real_estate_data.csv")
## Rows: 506 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): medv, cmedv, crim, zn, indus, chas, nox, rm, age, dis, rad, tax, p...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
bd # Confirm that the dataset has been loaded correctly.
## # A tibble: 506 × 15
## medv cmedv crim zn indus chas nox rm age dis rad tax
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 24 24 0.00632 18 2.31 0 0.538 6.58 65.2 4.09 1 296
## 2 21.6 21.6 0.0273 0 7.07 0 0.469 6.42 78.9 4.97 2 242
## 3 34.7 34.7 0.0273 0 7.07 0 0.469 7.18 61.1 4.97 2 242
## 4 33.4 33.4 0.0324 0 2.18 0 0.458 7.00 45.8 6.06 3 222
## 5 36.2 36.2 0.0690 0 2.18 0 0.458 7.15 54.2 6.06 3 222
## 6 28.7 28.7 0.0298 0 2.18 0 0.458 6.43 58.7 6.06 3 222
## 7 22.9 22.9 0.0883 12.5 7.87 0 0.524 6.01 66.6 5.56 5 311
## 8 27.1 22.1 0.145 12.5 7.87 0 0.524 6.17 96.1 5.95 5 311
## 9 16.5 16.5 0.211 12.5 7.87 0 0.524 5.63 100 6.08 5 311
## 10 18.9 18.9 0.170 12.5 7.87 0 0.524 6.00 85.9 6.59 5 311
## # ℹ 496 more rows
## # ℹ 3 more variables: ptratio <dbl>, b <dbl>, lstat <dbl>
# We can observe that the dataset has 506 observations and 15 variables.
# View(bd)
head(bd)
## # A tibble: 6 × 15
## medv cmedv crim zn indus chas nox rm age dis rad tax
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 24 24 0.00632 18 2.31 0 0.538 6.58 65.2 4.09 1 296
## 2 21.6 21.6 0.0273 0 7.07 0 0.469 6.42 78.9 4.97 2 242
## 3 34.7 34.7 0.0273 0 7.07 0 0.469 7.18 61.1 4.97 2 242
## 4 33.4 33.4 0.0324 0 2.18 0 0.458 7.00 45.8 6.06 3 222
## 5 36.2 36.2 0.0690 0 2.18 0 0.458 7.15 54.2 6.06 3 222
## 6 28.7 28.7 0.0298 0 2.18 0 0.458 6.43 58.7 6.06 3 222
## # ℹ 3 more variables: ptratio <dbl>, b <dbl>, lstat <dbl>
str(bd)
## spc_tbl_ [506 × 15] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ medv : num [1:506] 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
## $ cmedv : num [1:506] 24 21.6 34.7 33.4 36.2 28.7 22.9 22.1 16.5 18.9 ...
## $ crim : num [1:506] 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ zn : num [1:506] 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ indus : num [1:506] 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ chas : num [1:506] 0 0 0 0 0 0 0 0 0 0 ...
## $ nox : num [1:506] 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ rm : num [1:506] 6.58 6.42 7.18 7 7.15 ...
## $ age : num [1:506] 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ dis : num [1:506] 4.09 4.97 4.97 6.06 6.06 ...
## $ rad : num [1:506] 1 2 2 3 3 3 5 5 5 5 ...
## $ tax : num [1:506] 296 242 242 222 222 222 311 311 311 311 ...
## $ ptratio: num [1:506] 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ b : num [1:506] 397 397 393 395 397 ...
## $ lstat : num [1:506] 4.98 9.14 4.03 2.94 5.33 ...
## - attr(*, "spec")=
## .. cols(
## .. medv = col_double(),
## .. cmedv = col_double(),
## .. crim = col_double(),
## .. zn = col_double(),
## .. indus = col_double(),
## .. chas = col_double(),
## .. nox = col_double(),
## .. rm = col_double(),
## .. age = col_double(),
## .. dis = col_double(),
## .. rad = col_double(),
## .. tax = col_double(),
## .. ptratio = col_double(),
## .. b = col_double(),
## .. lstat = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
# 1.A. Identify the name of the variables.
colnames(bd)
## [1] "medv" "cmedv" "crim" "zn" "indus" "chas" "nox"
## [8] "rm" "age" "dis" "rad" "tax" "ptratio" "b"
## [15] "lstat"
# We need to know what each variable means:
# 1. medv = median value of owner-occupied homes in USD 1000's
# 2. cmedv = corrections to the variable MEDV but with more reliable information
# 3. crim = crime rate per capita by town
# 4. zn = proportion of residential land zoned for lots over 25,000 sq.ft
# 5. indus = proportion of non-retail business acres per town.
# 6. chas = Charles River dummy variable (= 1 if the neighborhood bounds the river; 0 otherwise)
# 7. nox = nitric oxides concentration (parts per 10 million)
# 8. rm = average number of rooms per dwelling
# 9. age = proportion of owner-occupied units built prior to 1940
# 10. dis = weighted distances to five Boston employment centers
# 11. rad = index of accessibility to radial highways
# 12. tax = full-value property-tax rate per USD 10,000
# 13. ptratio = pupil-teacher ratio by town
# 14. b = percentage of African American population living in the zone
# 15. lstat = percentage of lower status of the population
# 1.B. Identify missing values
# Method 1.
missing_values <- sum(is.na(bd))
missing_values
## [1] 0
# Method 2.
missing_values_2 <- anyNA(bd)
missing_values_2
## [1] FALSE
# We can conclude that there are no missing values in our dataset.
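# Since naniar is already loaded, a per-variable missing-value summary can also be
# produced; this is only an optional sketch (output omitted).
miss_var_summary(bd) # count and percentage of NA's per variable
gg_miss_var(bd) # visual check of missingness by variable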
# 1.C. Display the structure of the dataset
# The next code is to know the type of structure of each variable:
structure_medv = class (bd$medv)
structure_cmedv = class (bd$cmedv)
structure_crim = class (bd$crim)
structure_zn = class (bd$zn)
structure_indus = class (bd$indus)
structure_chas = class (bd$chas)
structure_nox = class (bd$nox)
structure_rm = class (bd$rm)
structure_age = class (bd$age)
structure_dis = class (bd$dis)
structure_rad = class (bd$rad)
structure_tax = class (bd$tax)
structure_ptratio = class (bd$ptratio)
structure_b = class (bd$b)
structure_lstat = class (bd$lstat)
structure_medv
## [1] "numeric"
structure_cmedv
## [1] "numeric"
structure_crim
## [1] "numeric"
structure_zn
## [1] "numeric"
structure_indus
## [1] "numeric"
structure_chas
## [1] "numeric"
structure_nox
## [1] "numeric"
structure_rm
## [1] "numeric"
structure_age
## [1] "numeric"
structure_dis
## [1] "numeric"
structure_rad
## [1] "numeric"
structure_tax
## [1] "numeric"
structure_ptratio
## [1] "numeric"
structure_b
## [1] "numeric"
structure_lstat
## [1] "numeric"
# We can see that every variable in the data frame is stored as a numeric type.
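# The same check can be written more compactly; a minimal sketch that leaves the
# code above untouched (output omitted).
sapply(bd, class) # one call over every column instead of fifteen separate class() calls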
# 1.D.1. Include descriptive statistics (mean, median, standard deviation, minimum, maximum)
# This code shows basic descriptive statistics:
descriptive_statistics <- summary(bd)
descriptive_statistics
## medv cmedv crim zn
## Min. : 5.00 Min. : 5.00 Min. : 0.00632 Min. : 0.00
## 1st Qu.:17.02 1st Qu.:17.02 1st Qu.: 0.08205 1st Qu.: 0.00
## Median :21.20 Median :21.20 Median : 0.25651 Median : 0.00
## Mean :22.53 Mean :22.53 Mean : 3.61352 Mean : 11.36
## 3rd Qu.:25.00 3rd Qu.:25.00 3rd Qu.: 3.67708 3rd Qu.: 12.50
## Max. :50.00 Max. :50.00 Max. :88.97620 Max. :100.00
## indus chas nox rm
## Min. : 0.46 Min. :0.00000 Min. :0.3850 Min. :3.561
## 1st Qu.: 5.19 1st Qu.:0.00000 1st Qu.:0.4490 1st Qu.:5.886
## Median : 9.69 Median :0.00000 Median :0.5380 Median :6.208
## Mean :11.14 Mean :0.06917 Mean :0.5547 Mean :6.285
## 3rd Qu.:18.10 3rd Qu.:0.00000 3rd Qu.:0.6240 3rd Qu.:6.623
## Max. :27.74 Max. :1.00000 Max. :0.8710 Max. :8.780
## age dis rad tax
## Min. : 2.90 Min. : 1.130 Min. : 1.000 Min. :187.0
## 1st Qu.: 45.02 1st Qu.: 2.100 1st Qu.: 4.000 1st Qu.:279.0
## Median : 77.50 Median : 3.207 Median : 5.000 Median :330.0
## Mean : 68.57 Mean : 3.795 Mean : 9.549 Mean :408.2
## 3rd Qu.: 94.08 3rd Qu.: 5.188 3rd Qu.:24.000 3rd Qu.:666.0
## Max. :100.00 Max. :12.127 Max. :24.000 Max. :711.0
## ptratio b lstat
## Min. :12.60 Min. : 0.32 Min. : 1.73
## 1st Qu.:17.40 1st Qu.:375.38 1st Qu.: 6.95
## Median :19.05 Median :391.44 Median :11.36
## Mean :18.46 Mean :356.67 Mean :12.65
## 3rd Qu.:20.20 3rd Qu.:396.23 3rd Qu.:16.95
## Max. :22.00 Max. :396.90 Max. :37.97
descriptive_statistics_log <- summary(log(bd$medv))
descriptive_statistics_log
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.609 2.835 3.054 3.035 3.219 3.912
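# Since stargazer is already loaded for publication-quality tables, the descriptive
# statistics could also be exported as a formatted table. This is only a sketch
# (output omitted); stargazer() needs a plain data frame, hence as.data.frame().
stargazer(as.data.frame(bd), type = "text", title = "Descriptive statistics")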
describe <- describe(bd)
describe
## vars n mean sd median trimmed mad min max range skew
## medv 1 506 22.53 9.20 21.20 21.56 5.93 5.00 50.00 45.00 1.10
## cmedv 2 506 22.53 9.18 21.20 21.56 5.93 5.00 50.00 45.00 1.10
## crim 3 506 3.61 8.60 0.26 1.68 0.33 0.01 88.98 88.97 5.19
## zn 4 506 11.36 23.32 0.00 5.08 0.00 0.00 100.00 100.00 2.21
## indus 5 506 11.14 6.86 9.69 10.93 9.37 0.46 27.74 27.28 0.29
## chas 6 506 0.07 0.25 0.00 0.00 0.00 0.00 1.00 1.00 3.39
## nox 7 506 0.55 0.12 0.54 0.55 0.13 0.38 0.87 0.49 0.72
## rm 8 506 6.28 0.70 6.21 6.25 0.51 3.56 8.78 5.22 0.40
## age 9 506 68.57 28.15 77.50 71.20 28.98 2.90 100.00 97.10 -0.60
## dis 10 506 3.80 2.11 3.21 3.54 1.91 1.13 12.13 11.00 1.01
## rad 11 506 9.55 8.71 5.00 8.73 2.97 1.00 24.00 23.00 1.00
## tax 12 506 408.24 168.54 330.00 400.04 108.23 187.00 711.00 524.00 0.67
## ptratio 13 506 18.46 2.16 19.05 18.66 1.70 12.60 22.00 9.40 -0.80
## b 14 506 356.67 91.29 391.44 383.17 8.09 0.32 396.90 396.58 -2.87
## lstat 15 506 12.65 7.14 11.36 11.90 7.11 1.73 37.97 36.24 0.90
## kurtosis se
## medv 1.45 0.41
## cmedv 1.47 0.41
## crim 36.60 0.38
## zn 3.95 1.04
## indus -1.24 0.30
## chas 9.48 0.01
## nox -0.09 0.01
## rm 1.84 0.03
## age -0.98 1.25
## dis 0.46 0.09
## rad -0.88 0.39
## tax -1.15 7.49
## ptratio -0.30 0.10
## b 7.10 4.06
## lstat 0.46 0.32
# We did not find any inconsistencies, except that "chas" is a categorical yes/no variable. This means that
# in the next part of the analysis we need to transform it into a factor.
# 1.E. Transform variables if required
bd$chas<-as.factor(bd$chas)
# The variable has now been transformed so that it can be analyzed as a categorical variable.
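# A quick optional check that the conversion worked (output omitted).
class(bd$chas) # expected: "factor"
table(bd$chas) # counts of neighborhoods bounding (1) or not bounding (0) the river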
# 2.1 Build at least 2 pair-wise graphs between the dependent variable and independent variables
bd1<-bd %>% select(cmedv,chas,rm,rad,tax) %>% group_by(chas)
bd1
## # A tibble: 506 × 5
## # Groups: chas [2]
## cmedv chas rm rad tax
## <dbl> <fct> <dbl> <dbl> <dbl>
## 1 24 0 6.58 1 296
## 2 21.6 0 6.42 2 242
## 3 34.7 0 7.18 2 242
## 4 33.4 0 7.00 3 222
## 5 36.2 0 7.15 3 222
## 6 28.7 0 6.43 3 222
## 7 22.9 0 6.01 5 311
## 8 22.1 0 6.17 5 311
## 9 16.5 0 5.63 5 311
## 10 18.9 0 6.00 5 311
## # ℹ 496 more rows
ggplot(data=bd1,aes(x=reorder(chas,cmedv),y=cmedv,fill=rad)) +
geom_bar(stat="identity") + coord_flip()
# In this bar graph, we can visualize that properties with access to radial highways are more expensive than the ones that don't have it. Also, if the neighborhood bounds the river, the properties are much less expensive and fall in a smaller range than those that don't.
bd %>% mutate(indus_intervals=cut(indus,breaks=c(0,5,10,15,20,25,30))) %>%
ggplot(aes(x=reorder(indus_intervals,cmedv),y=cmedv,fill=tax)) +
geom_bar(stat="identity") + coord_flip()
# In this bar graph, we can visualize that neighborhoods with a 15-20 proportion of non-retail business acres per town pay much higher taxes than most of the sample. In general, the larger the proportion of non-retail business acres per town, the higher the taxes, with a clear spike in the 15-20 interval.
bd %>% mutate(rm_intervals=cut(rm,breaks=c(3,4,5,6,7,8,9))) %>%
ggplot(aes(x=reorder(rm_intervals,cmedv),y=cmedv,fill=lstat)) +
geom_bar(stat="identity") + coord_flip()
# In this bar graph, we can visualize that properties with the highest number of rooms (7-9) have the lowest percentage of lower-status population, whereas middle-class homes are more exposed to the lower-status population.
# 2.2 Display a histogram of the dependent variable
hist(bd$cmedv)
hist(log(bd$cmedv))
# In these histograms we can see that cmedv is right-skewed, while the log-transformed values follow an approximately normal distribution.
# 2.3 Display a correlation plot
corr_bd<- bd %>% select(-chas) # let's remove the qualitative variable
corrplot(cor(corr_bd),type='upper',order='hclust',addCoef.col='black')
# 3. Briefly describe at least 3 hypotheses which you would like to explore through regression analysis.
# 4.1. Estimate 3 different linear regression models by using:
# - multiple linear regression
# - polynomial – multiple linear regression
# - lasso regression model
# - ridge regression model
# 4.2. Show the level of accuracy for each linear regression model
# 4.3. Select the regression model that better fits the data
# 4.4. Interpret the main results of the selected regression model
# Linear regression
linear_regresion_2<-lm(cmedv ~ crim+zn+indus+chas+nox+rm+age+dis+rad+tax+ptratio+b+lstat,data=bd) # linear regression
summary(linear_regresion_2)
##
## Call:
## lm(formula = cmedv ~ crim + zn + indus + chas + nox + rm + age +
## dis + rad + tax + ptratio + b + lstat, data = bd)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.5651 -2.6908 -0.5352 1.8446 26.1319
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.637e+01 5.058e+00 7.191 2.40e-12 ***
## crim -1.062e-01 3.257e-02 -3.261 0.001189 **
## zn 4.772e-02 1.360e-02 3.508 0.000493 ***
## indus 2.325e-02 6.094e-02 0.382 0.702970
## chas1 2.692e+00 8.539e-01 3.152 0.001718 **
## nox -1.774e+01 3.785e+00 -4.687 3.59e-06 ***
## rm 3.789e+00 4.142e-01 9.149 < 2e-16 ***
## age 5.749e-04 1.309e-02 0.044 0.964989
## dis -1.502e+00 1.977e-01 -7.598 1.53e-13 ***
## rad 3.038e-01 6.575e-02 4.620 4.91e-06 ***
## tax -1.270e-02 3.727e-03 -3.409 0.000706 ***
## ptratio -9.239e-01 1.297e-01 -7.126 3.70e-12 ***
## b 9.228e-03 2.662e-03 3.467 0.000573 ***
## lstat -5.307e-01 5.026e-02 -10.558 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.703 on 492 degrees of freedom
## Multiple R-squared: 0.7444, Adjusted R-squared: 0.7377
## F-statistic: 110.2 on 13 and 492 DF, p-value: < 2.2e-16
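# Since jtools is already loaded for presenting regression results, the same model
# can optionally be displayed in a more compact format; a sketch (output omitted).
summ(linear_regresion_2, digits = 3) # compact coefficient table with model fit statistics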
# Polynomial regression
polynomial_regresion2 <- lm(cmedv ~ crim+I(crim^2)+zn+indus+chas+nox+rm+age+dis+rad+tax+ptratio+b+lstat, data=bd )
summary(polynomial_regresion2)
##
## Call:
## lm(formula = cmedv ~ crim + I(crim^2) + zn + indus + chas + nox +
## rm + age + dis + rad + tax + ptratio + b + lstat, data = bd)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.4761 -2.6682 -0.5622 1.7314 26.2607
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.602e+01 5.081e+00 7.088 4.77e-12 ***
## crim -3.850e-02 9.478e-02 -0.406 0.684781
## I(crim^2) -9.496e-04 1.248e-03 -0.761 0.447231
## zn 4.656e-02 1.370e-02 3.400 0.000729 ***
## indus 2.503e-02 6.102e-02 0.410 0.681798
## chas1 2.722e+00 8.551e-01 3.183 0.001550 **
## nox -1.765e+01 3.789e+00 -4.659 4.09e-06 ***
## rm 3.816e+00 4.158e-01 9.177 < 2e-16 ***
## age 6.367e-04 1.310e-02 0.049 0.961249
## dis -1.483e+00 1.994e-01 -7.436 4.66e-13 ***
## rad 2.808e-01 7.235e-02 3.882 0.000118 ***
## tax -1.266e-02 3.729e-03 -3.394 0.000745 ***
## ptratio -9.184e-01 1.299e-01 -7.070 5.35e-12 ***
## b 9.502e-03 2.687e-03 3.536 0.000445 ***
## lstat -5.390e-01 5.146e-02 -10.475 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.705 on 491 degrees of freedom
## Multiple R-squared: 0.7447, Adjusted R-squared: 0.7375
## F-statistic: 102.3 on 14 and 491 DF, p-value: < 2.2e-16
# Lasso regression model
### Split the Data in Training Data vs Test Data
# Let's randomly split the data into train and test sets
set.seed(123) ### sets the random seed for reproducibility of results
training.samples<-bd$cmedv %>%
createDataPartition(p=0.75,list=FALSE) ### Let's use 75% of the data to build the predictive model
train.data<-bd[training.samples, ] ### training data to fit the linear regression model
test.data<-bd[-training.samples, ] ### testing data to test the linear regression model
#################################
### LASSO REGRESSION ANALYSIS ###
#################################
# LASSO regression via the glmnet package can only take numerical inputs, so the dataset is transformed with model.matrix().
# Independent variables
x<-model.matrix(log(cmedv) ~ crim+zn+indus+chas+nox+rm+age+dis+rad+tax+ptratio+b+lstat,train.data)[,-1] ### design matrix of predictors (the left-hand side is ignored by model.matrix)
# x<-model.matrix(Weekly_Sales~.,train.data)[,-1] ### matrix of independent variables X's
y<-train.data$cmedv ### dependent variable
# In estimating LASSO regression it is important to define the lambda that minimizes the prediction error rate.
# Cross-validation ensures that every observation from the original dataset has a chance of appearing in the training and validation folds.
# Find the best lambda using cross-validation.
set.seed(123)
cv.lasso <- cv.glmnet(x,y,alpha=1) # alpha = 1 for LASSO
# This cannot be used with only a single predictor variable
# Display the best lambda value
cv.lasso$lambda.min ### lambda: a numeric value defining the amount of shrinkage. Why min? The higher the value of lambda, the more penalization there is.
## [1] 0.007628727
# Fit the final model on the training data
lassomodel<-glmnet(x,y,alpha=1,lambda=cv.lasso$lambda.min)
lassomodel
##
## Call: glmnet(x = x, y = y, alpha = 1, lambda = cv.lasso$lambda.min)
##
## Df %Dev Lambda
## 1 13 78.02 0.007629
# Display regression coefficients
coef(lassomodel)
## 14 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) 30.507771133
## crim -0.075322963
## zn 0.027211748
## indus 0.016590129
## chas1 1.279261773
## nox -15.423179147
## rm 4.511284919
## age -0.015525359
## dis -1.242892035
## rad 0.241103964
## tax -0.012687811
## ptratio -0.955863019
## b 0.009410302
## lstat -0.413182367
# Make predictions on the test data
x.test<-model.matrix(log(cmedv) ~
crim+zn+indus+chas+nox+rm+age+dis+rad+tax+ptratio+b+lstat,test.data)[,-1] ### OLS model specification
# x.test<-model.matrix(Weekly_Sales~.,test.data)[,-1]
lassopredictions <- lassomodel %>% predict(x.test) %>% as.vector()
# Model Accuracy
data.frame(
RMSE = RMSE(lassopredictions, test.data$cmedv),
Rsquare = R2(lassopredictions, test.data$cmedv))
## RMSE Rsquare
## 1 6.016507 0.6241085
### visualizing lasso regression results
lbs_fun <- function(fit, offset_x=1, ...) { # helper: add variable-name labels to a glmnet coefficient-path plot
L <- length(fit$lambda)
x <- log(fit$lambda[L])+ offset_x
y <- fit$beta[, L]
labs <- names(y)
text(x, y, labels=labs, ...)
}
lasso<-glmnet(scale(x),y,alpha=1)
plot(lasso,xvar="lambda",label=T)
lbs_fun(lasso)
abline(v=cv.lasso$lambda.min,col="red",lty=2)
abline(v=cv.lasso$lambda.1se,col="blue",lty=2)
#################################
### RIDGE REGRESSION ANALYSIS ###
#################################
# Find the best lambda using cross-validation
set.seed(123) # x: independent variables | y: dependent variable
cv.ridge <- cv.glmnet(x,y,alpha=0) # alpha = 0 for RIDGE
# Display the best lambda value
cv.ridge$lambda.min # lambda: a numeric value defining the amount of shrinkage. Why min? The higher the value of lambda, the more penalization there is.
## [1] 0.1008473
# Fit the final model on the training data
ridgemodel<-glmnet(x,y,alpha=0,lambda=cv.ridge$lambda.min)
# Display regression coefficients
coef(ridgemodel)
## 14 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) 29.559647000
## crim -0.074314702
## zn 0.026122102
## indus 0.010936647
## chas1 1.349831660
## nox -14.803259756
## rm 4.523410858
## age -0.016144172
## dis -1.205157032
## rad 0.220860317
## tax -0.011698216
## ptratio -0.944809185
## b 0.009438998
## lstat -0.407789289
# Make predictions on the test data
x.test<-model.matrix(log(cmedv) ~ crim+zn+indus+chas+nox+rm+age+dis+rad+tax+ptratio+b+lstat,test.data)[,-1]
ridgepredictions<-ridgemodel %>% predict(x.test) %>% as.vector()
# Model Accuracy
data.frame(
RMSE = RMSE(ridgepredictions, test.data$cmedv),
Rsquare = R2(ridgepredictions, test.data$cmedv)
)
## RMSE Rsquare
## 1 6.032134 0.6223725
### visualizing ridge regression results
ridge<-glmnet(scale(x),y,alpha=0)
plot(ridge, xvar = "lambda", label=T)
lbs_fun(ridge)
abline(v=cv.ridge$lambda.min, col = "red", lty=2)
abline(v=cv.ridge$lambda.1se, col="blue", lty=2)
### Diagnostic Tests - Hypothesis 2
# Linear regression:
vif(linear_regresion_2)
## crim zn indus chas nox rm age dis
## 1.792192 2.298758 3.991596 1.073995 4.393720 1.933744 3.100826 3.955945
## rad tax ptratio b lstat
## 7.484496 9.008554 1.799084 1.348521 2.941491
bptest(linear_regresion_2)
##
## studentized Breusch-Pagan test
##
## data: linear_regresion_2
## BP = 66.062, df = 13, p-value = 4.226e-09
AIC(linear_regresion_2)
## [1] 3018.491
histogram(linear_regresion_2$residuals)
# Polynomial regression:
vif(polynomial_regresion2)
## crim I(crim^2) zn indus chas nox rm age
## 15.164099 9.366630 2.327595 3.997472 1.076317 4.397853 1.947053 3.100945
## dis rad tax ptratio b lstat
## 4.021025 9.054725 9.011262 1.804598 1.373120 3.080401
bptest(polynomial_regresion2)
##
## studentized Breusch-Pagan test
##
## data: polynomial_regresion2
## BP = 68.026, df = 14, p-value = 4.387e-09
AIC(polynomial_regresion2)
## [1] 3019.895
histogram(polynomial_regresion2$residuals)
# In both the linear and polynomial regressions, the Breusch-Pagan test indicates heteroscedasticity, since the p-value is less than 0.05.
# This means that we reject the null hypothesis of homoscedasticity (H0: the error variance is constant).
# Also, it is worth mentioning that the residuals behave very similarly in both histograms, and the AIC values are close. From this we can conclude
# that the difference between the two prediction models is small and not significant.
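# Given the heteroscedasticity detected above, one common remedy is to report
# heteroscedasticity-robust (HC3) standard errors. This is only a sketch using the
# already-loaded lmtest and car packages; it does not change the fitted coefficients.
coeftest(linear_regresion_2, vcov. = hccm(linear_regresion_2, type = "hc3"))
coeftest(polynomial_regresion2, vcov. = hccm(polynomial_regresion2, type = "hc3"))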
# We create a comparison table in order to assess the quality and reliability of each of the regression models.
tab <- matrix(c(4.703,4.684,6.0165,6.0321,0.737,0.737, 0.624, 0.622), ncol=2, byrow=FALSE)
colnames(tab) <- c('RMSE','R2')
rownames(tab) <- c('Linear Regression','Polynomial','Lasso','Ridge')
tab <- as.table(tab)
tab
## RMSE R2
## Linear Regression 4.7030 0.7370
## Polynomial 4.6840 0.7370
## Lasso 6.0165 0.6240
## Ridge 6.0321 0.6220
# With this table we can conclude that the model that best fits our data and is most reliable is the polynomial regression,
# since it shows the minimum RMSE while maintaining the highest R^2.
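# Note that the OLS figures above are in-sample residual standard errors, while the lasso
# and ridge figures are test-set RMSE. As an optional check, the two OLS models could be
# refit on the training split and evaluated on the same test split; this is only a sketch
# (output omitted), assuming the train.data / test.data objects created earlier.
lm_train <- lm(cmedv ~ crim+zn+indus+chas+nox+rm+age+dis+rad+tax+ptratio+b+lstat, data = train.data)
poly_train <- update(lm_train, . ~ . + I(crim^2)) # add the squared crime term
data.frame(
  model = c("Linear", "Polynomial"),
  RMSE = c(RMSE(predict(lm_train, test.data), test.data$cmedv),
           RMSE(predict(poly_train, test.data), test.data$cmedv)),
  R2 = c(R2(predict(lm_train, test.data), test.data$cmedv),
         R2(predict(poly_train, test.data), test.data$cmedv)))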
# With our model selected, we print its summary in order to see the coefficients and test our hypotheses.
summary(polynomial_regresion2)
##
## Call:
## lm(formula = cmedv ~ crim + I(crim^2) + zn + indus + chas + nox +
## rm + age + dis + rad + tax + ptratio + b + lstat, data = bd)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.4761 -2.6682 -0.5622 1.7314 26.2607
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.602e+01 5.081e+00 7.088 4.77e-12 ***
## crim -3.850e-02 9.478e-02 -0.406 0.684781
## I(crim^2) -9.496e-04 1.248e-03 -0.761 0.447231
## zn 4.656e-02 1.370e-02 3.400 0.000729 ***
## indus 2.503e-02 6.102e-02 0.410 0.681798
## chas1 2.722e+00 8.551e-01 3.183 0.001550 **
## nox -1.765e+01 3.789e+00 -4.659 4.09e-06 ***
## rm 3.816e+00 4.158e-01 9.177 < 2e-16 ***
## age 6.367e-04 1.310e-02 0.049 0.961249
## dis -1.483e+00 1.994e-01 -7.436 4.66e-13 ***
## rad 2.808e-01 7.235e-02 3.882 0.000118 ***
## tax -1.266e-02 3.729e-03 -3.394 0.000745 ***
## ptratio -9.184e-01 1.299e-01 -7.070 5.35e-12 ***
## b 9.502e-03 2.687e-03 3.536 0.000445 ***
## lstat -5.390e-01 5.146e-02 -10.475 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.705 on 491 degrees of freedom
## Multiple R-squared: 0.7447, Adjusted R-squared: 0.7375
## F-statistic: 102.3 on 14 and 491 DF, p-value: < 2.2e-16
Hypothesis 1: We accept our hypothesis, since rad has a positive coefficient that is significant at the 99% level. This means that as there is more access to radial highways, the median value of a home increases.
Hypothesis 2: We reject the hypothesis, since the results show that the b variable has a positive effect on the value of the home. It is worth mentioning that even though the coefficient is positive, its impact is small compared to other variables. This means that, according to the model, the value of the home increases with this variable, but not much causality should be read into it; other variables have a greater impact.
Hypothesis 3: We completely reject the hypothesis, since the variable does not have a significant impact on the dependent variable. In the results, this variable has no asterisks, which means it does not significantly affect the value of the home.
Linear regression analysis can contribute to improving predictive analytics because it shows how each individual variable does or does not affect the dependent variable. With this, we can see which variables to focus on according to their impact on the dependent variable. The coefficients can be used to build an algebraic expression that predicts the dependent variable from the values of the independent variables. As we gain access to more data, the regression models will become more reliable and give us better predictions.
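# As an illustration of using the selected model for prediction, the sketch below
# predicts cmedv for one hypothetical tract; the input values are made up purely
# for illustration (output omitted).
new_tract <- data.frame(
  crim = 0.1, zn = 0, indus = 8, chas = factor(0, levels = c(0, 1)),
  nox = 0.5, rm = 6.2, age = 70, dis = 4, rad = 5, tax = 300,
  ptratio = 18, b = 390, lstat = 10)
predict(polynomial_regresion2, newdata = new_tract) # predicted median home value (in USD 1000's)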