library(foreign)
library(dplyr) # data manipulation
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(forcats) # to work with categorical variables
library(ggplot2) # data visualization
library(readr) # read specific csv files
library(janitor) # data exploration and cleaning
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(Hmisc) # several useful functions for data analysis
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
library(psych) # functions for multivariate analysis
##
## Attaching package: 'psych'
## The following object is masked from 'package:Hmisc':
##
## describe
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(naniar) # summaries and visualization of missing values NA's
library(dlookr) # summaries and visualization of missing values NA's
##
## Attaching package: 'dlookr'
## The following object is masked from 'package:psych':
##
## describe
## The following object is masked from 'package:Hmisc':
##
## describe
## The following object is masked from 'package:base':
##
## transform
library(corrplot) # correlation plots
## corrplot 0.92 loaded
library(jtools) # presentation of regression analysis
##
## Attaching package: 'jtools'
## The following object is masked from 'package:Hmisc':
##
## %nin%
library(lmtest) # diagnostic checks - linear regression analysis
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(car) # diagnostic checks - linear regression analysis
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
## The following object is masked from 'package:dplyr':
##
## recode
library(olsrr) # diagnostic checks - linear regression analysis
##
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
##
## rivers
library(naniar) # identifying missing values
library(stargazer) # create publication quality tables
##
## Please cite as:
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
library(effects) # displays for linear and other regression models
## Registered S3 method overwritten by 'survey':
## method from
## summary.pps dlookr
## lattice theme set by effectsTheme()
## See ?effectsTheme for details.
library(tidyverse) # collection of R packages designed for data science
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.1 ✔ tidyr 1.3.0
## ✔ stringr 1.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ psych::%+%() masks ggplot2::%+%()
## ✖ psych::alpha() masks ggplot2::alpha()
## ✖ tidyr::extract() masks dlookr::extract()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ car::recode() masks dplyr::recode()
## ✖ purrr::some() masks car::some()
## ✖ Hmisc::src() masks dplyr::src()
## ✖ Hmisc::summarize() masks dplyr::summarize()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret) # Classification and Regression Training
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(glmnet) # methods for prediction and plotting, and functions for cross-validation
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Loaded glmnet 4.1-7
library(corrplot)
#Por si los necesitan instalar.
#install.packages("janitor")
#install.packages("lmtest")
#install.packages("tidyverse")
#install.packages("glmnet")
#install.packages("missRanger")
#install.packages("corrplot")
#install.packages("missForest")
#install.packages("ggplot2")
# A) Exploratory Data Analysis
#Here we load the dataset
datains <- read_csv("/Users/valeriacantulobo/Downloads/real_estate_data.csv")
## Rows: 506 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): medv, cmedv, crim, zn, indus, chas, nox, rm, age, dis, rad, tax, p...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#Here we analyze the first 6 rows and the columns
head(datains)
## # A tibble: 6 × 15
## medv cmedv crim zn indus chas nox rm age dis rad tax
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 24 24 0.00632 18 2.31 0 0.538 6.58 65.2 4.09 1 296
## 2 21.6 21.6 0.0273 0 7.07 0 0.469 6.42 78.9 4.97 2 242
## 3 34.7 34.7 0.0273 0 7.07 0 0.469 7.18 61.1 4.97 2 242
## 4 33.4 33.4 0.0324 0 2.18 0 0.458 7.00 45.8 6.06 3 222
## 5 36.2 36.2 0.0690 0 2.18 0 0.458 7.15 54.2 6.06 3 222
## 6 28.7 28.7 0.0298 0 2.18 0 0.458 6.43 58.7 6.06 3 222
## # … with 3 more variables: ptratio <dbl>, b <dbl>, lstat <dbl>
#Here we see the structure of the dataset
str(datains)
## spc_tbl_ [506 × 15] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ medv : num [1:506] 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
## $ cmedv : num [1:506] 24 21.6 34.7 33.4 36.2 28.7 22.9 22.1 16.5 18.9 ...
## $ crim : num [1:506] 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ zn : num [1:506] 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ indus : num [1:506] 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ chas : num [1:506] 0 0 0 0 0 0 0 0 0 0 ...
## $ nox : num [1:506] 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ rm : num [1:506] 6.58 6.42 7.18 7 7.15 ...
## $ age : num [1:506] 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ dis : num [1:506] 4.09 4.97 4.97 6.06 6.06 ...
## $ rad : num [1:506] 1 2 2 3 3 3 5 5 5 5 ...
## $ tax : num [1:506] 296 242 242 222 222 222 311 311 311 311 ...
## $ ptratio: num [1:506] 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ b : num [1:506] 397 397 393 395 397 ...
## $ lstat : num [1:506] 4.98 9.14 4.03 2.94 5.33 ...
## - attr(*, "spec")=
## .. cols(
## .. medv = col_double(),
## .. cmedv = col_double(),
## .. crim = col_double(),
## .. zn = col_double(),
## .. indus = col_double(),
## .. chas = col_double(),
## .. nox = col_double(),
## .. rm = col_double(),
## .. age = col_double(),
## .. dis = col_double(),
## .. rad = col_double(),
## .. tax = col_double(),
## .. ptratio = col_double(),
## .. b = col_double(),
## .. lstat = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
#Here we try to identify if there are some missing values
sum(is.na(datains))
## [1] 0
gg_miss_var(datains)

#Here we use the descriptive stadistic
summary(datains)
## medv cmedv crim zn
## Min. : 5.00 Min. : 5.00 Min. : 0.00632 Min. : 0.00
## 1st Qu.:17.02 1st Qu.:17.02 1st Qu.: 0.08205 1st Qu.: 0.00
## Median :21.20 Median :21.20 Median : 0.25651 Median : 0.00
## Mean :22.53 Mean :22.53 Mean : 3.61352 Mean : 11.36
## 3rd Qu.:25.00 3rd Qu.:25.00 3rd Qu.: 3.67708 3rd Qu.: 12.50
## Max. :50.00 Max. :50.00 Max. :88.97620 Max. :100.00
## indus chas nox rm
## Min. : 0.46 Min. :0.00000 Min. :0.3850 Min. :3.561
## 1st Qu.: 5.19 1st Qu.:0.00000 1st Qu.:0.4490 1st Qu.:5.886
## Median : 9.69 Median :0.00000 Median :0.5380 Median :6.208
## Mean :11.14 Mean :0.06917 Mean :0.5547 Mean :6.285
## 3rd Qu.:18.10 3rd Qu.:0.00000 3rd Qu.:0.6240 3rd Qu.:6.623
## Max. :27.74 Max. :1.00000 Max. :0.8710 Max. :8.780
## age dis rad tax
## Min. : 2.90 Min. : 1.130 Min. : 1.000 Min. :187.0
## 1st Qu.: 45.02 1st Qu.: 2.100 1st Qu.: 4.000 1st Qu.:279.0
## Median : 77.50 Median : 3.207 Median : 5.000 Median :330.0
## Mean : 68.57 Mean : 3.795 Mean : 9.549 Mean :408.2
## 3rd Qu.: 94.08 3rd Qu.: 5.188 3rd Qu.:24.000 3rd Qu.:666.0
## Max. :100.00 Max. :12.127 Max. :24.000 Max. :711.0
## ptratio b lstat
## Min. :12.60 Min. : 0.32 Min. : 1.73
## 1st Qu.:17.40 1st Qu.:375.38 1st Qu.: 6.95
## Median :19.05 Median :391.44 Median :11.36
## Mean :18.46 Mean :356.67 Mean :12.65
## 3rd Qu.:20.20 3rd Qu.:396.23 3rd Qu.:16.95
## Max. :22.00 Max. :396.90 Max. :37.97
# Here since we did not obtain the standard deviation with the summary we go individually pero numeric column
standard_deviationmedv <- sd(datains$medv)
standard_deviationcmed <- sd(datains$cmedv)
standard_deviationcrim <- sd(datains$crim)
standard_deviationzn <- sd(datains$zn)
standard_deviationindus <- sd(datains$indus)
standard_deviationchas <- sd(datains$chas)
standard_deviationnox <- sd(datains$nox)
standard_deviationrm <- sd(datains$rm)
standard_deviationage <- sd(datains$age)
standard_deviationdis <- sd(datains$dis)
standard_deviationrad <- sd(datains$rad)
standard_deviationtax <- sd(datains$tax)
standard_deviatinpratio <- sd(datains$ptratio)
standard_deviationb <- sd(datains$b)
standard_deviationistat <- sd(datains$lstat)
print(standard_deviationmedv)
## [1] 9.197104
print(standard_deviationcmed)
## [1] 9.182176
print(standard_deviationcrim)
## [1] 8.601545
print(standard_deviationzn)
## [1] 23.32245
print(standard_deviationindus)
## [1] 6.860353
print(standard_deviationchas)
## [1] 0.253994
print(standard_deviationnox)
## [1] 0.1158777
print(standard_deviationrm)
## [1] 0.7026171
print(standard_deviationage)
## [1] 28.14886
print(standard_deviationdis)
## [1] 2.10571
print(standard_deviationrad)
## [1] 8.707259
print(standard_deviationtax)
## [1] 168.5371
print(standard_deviatinpratio)
## [1] 2.164946
print(standard_deviationb)
## [1] 91.29486
print(standard_deviationistat)
## [1] 7.141062
# B) Data Visualization
#2 Graphs
# median home value compared to crime rate.
plot_crime <- ggplot(datains, aes(x = crim, y = medv)) +
geom_point() +
labs(x = "Crime Rate", y = "Median Value")
print(plot_crime)

# median home value compared to rooms in the home.
plot_rooms <- ggplot(datains, aes(x = rm, y = medv)) +
geom_point() +
labs(x = "Number of rooms", y = "Median Value")
print(plot_rooms)

# Here we display a histogram of the dependent variable.
hist(datains$medv)

hist(log(datains$medv))

# Here we display a correlation plot, without qualitative variables.
cor_matrix <- cor(datains %>% select(-chas))
corrplot(cor_matrix, method = "circle", type = "upper", order = "hclust", addCoef.col = "black")

# Hipotesis 1: Entre más vieja y mejor ubicada este la casa, más caro será el precio de ella
#################################
### LASSO REGRESSION ANALYSIS ###
#################################
set.seed(123) ### sets the random seed for reproducibility of results
training.samples<-datains$medv %>%
createDataPartition(p=0.75,list=FALSE) ### Lets consider 75% of the data to build a predictive model
train.data<-datains[training.samples, ] ### training data to fit the linear regression model
test.data<-datains[-training.samples, ] ### testing data to test the linear regression model
# LASSO regression via glmnet package can only take numerical observations. Then, the dataset is transformed to model.matrix() format.
# Independent variables
x<-model.matrix(log(medv) ~ age+dis,train.data)[,-1] ### OLS model specification
# x<-model.matrix(Weekly_Sales~.,train.data)[,-1] ### matrix of independent variables X's
y<-train.data$medv ### dependent variable
# In estimating LASSO regression it is important to define the lambda that minimizes the prediction error rate.
# Cross-validation ensures that every data / observation from the original dataset (datains) has a chance of appearing in train and test datasets.
# Find the best lambda using cross-validation.
set.seed(123)
cv.lasso<-cv.glmnet(x,y,alpha=1) # alpha = 1 for LASSO
# Display the best lambda value
cv.lasso$lambda.min ### lambda: a numeric value defining the amount of shrinkage. Why min? the higher the value of ?? , the more penalization there is
## [1] 0.122487
# Fit the final model on the training data
lassomodel<-glmnet(x,y,alpha=1,lambda=cv.lasso$lambda.min)
# Display regression coefficients
coef(lassomodel)
## 3 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) 30.3328641
## age -0.1202913
## dis .
# Make predictions on the test data
x.test<-model.matrix(log(medv) ~ age+dis,test.data)[,-1] ### OLS model specification
# x.test<-model.matrix(Weekly_Sales~.,test.data)[,-1]
lassopredictions <- lassomodel %>% predict(x.test) %>% as.vector()
# Model Accuracy
data.frame(
RMSE = RMSE(lassopredictions, test.data$medv),
Rsquare = R2(lassopredictions, test.data$medv))
## RMSE Rsquare
## 1 10.40051 0.107037
### visualizing lasso regression results
lbs_fun <- function(fit, offset_x=1, ...) {
L <- length(fit$lambda)
x <- log(fit$lambda[L])+ offset_x
y <- fit$beta[, L]
labs <- names(y)
text(x, y, labels=labs, ...)
}
lasso<-glmnet(scale(x),y,alpha=1)
plot(lasso,xvar="lambda",label=T)
lbs_fun(lasso)
abline(v=cv.lasso$lambda.min,col="red",lty=2)
abline(v=cv.lasso$lambda.1se,col="blue",lty=2)

print(lassomodel)
##
## Call: glmnet(x = x, y = y, alpha = 1, lambda = cv.lasso$lambda.min)
##
## Df %Dev Lambda
## 1 1 16.62 0.1225
## Interpretación Hipotesis 1 Modelo Lasso
### Despues de haber llevado a cabo la regressión LASSO, hemos podido elaborar diferentes tipos de insights.
#Primeramente, la gráfica que hemos creado nos enseña claramente que los coefficientes son negativos y cerca de cero.
#Lo que esto nos dice es que para la variable de edad de la propiedad, mientas más vieja sea menor será el precio de ella.
#Esto cumple con nuestra hipotesis ya que podemos ver que mientas mas nueva sea la propiedad mayor efecto tiene en el medv.
#Por el lado de la distancia al boston work center, podemos ver que la variable tiene un coeficiente muy cercano al cero.
#El mismo analisis nos permite decri que no hay relación, o hay una muy muy minima entre estas dos variables.
#La variable de distancia no tiene un efecto negativo o positivo entre medv, no hay relación.
#######HYPOTHESIS 2: entre mayor distancia a centros de trabajo, mayor crimen por la zona y mas vieja la casa esta sera menos cara #########
#################################
### LASSO REGRESSION ANALYSIS ###
#################################
### Split the Data in Training Data vs Test Data
# Lets randomly split the data into train and test set
set.seed(123) ### sets the random seed for reproducibility of results
training.samples<-datains$medv %>%
createDataPartition(p=0.75,list=FALSE) ### Lets consider 75% of the data to build a predictive model
train.data<-datains[training.samples, ] ### training data to fit the linear regression model
test.data<-datains[-training.samples, ] ### testing data to test the linear regression model
# LASSO regression via glmnet package can only take numerical observations. Then, the dataset is transformed to model.matrix() format.
# Independent variables
x<-model.matrix(log(medv) ~ crim+age+dis,train.data)[,-1] ### OLS model specification
# x<-model.matrix(Weekly_Sales~.,train.data)[,-1] ### matrix of independent variables X's
y<-train.data$medv ### dependent variable
# In estimating LASSO regression it is important to define the lambda that minimizes the prediction error rate.
# Cross-validation ensures that every data / observation from the original dataset (datains) has a chance of appearing in train and test datasets.
# Find the best lambda using cross-validation.
set.seed(123)
cv.lasso<-cv.glmnet(x,y,alpha=1) # alpha = 1 for LASSO
# Display the best lambda value
cv.lasso$lambda.min ### lambda: a numeric value defining the amount of shrinkage. Why min? the higher the value of ?? , the more penalization there is
## [1] 0.05572061
# Fit the final model on the training data
lassomodel<-glmnet(x,y,alpha=1,lambda=cv.lasso$lambda.min)
# Display regression coefficients
coef(lassomodel)
## 4 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) 31.8613624
## crim -0.3336781
## age -0.1054565
## dis -0.3404518
# Make predictions on the test data
x.test<-model.matrix(log(medv) ~ crim+age+dis,test.data)[,-1] ### OLS model specification
# x.test<-model.matrix(Weekly_Sales~.,test.data)[,-1]
lassopredictions <- lassomodel %>% predict(x.test) %>% as.vector()
# Model Accuracy
data.frame(
RMSE = RMSE(lassopredictions, test.data$medv),
Rsquare = R2(lassopredictions, test.data$medv))
## RMSE Rsquare
## 1 10.06598 0.1596282
### visualizing lasso regression results
lbs_fun <- function(fit, offset_x=1, ...) {
L <- length(fit$lambda)
x <- log(fit$lambda[L])+ offset_x
y <- fit$beta[, L]
labs <- names(y)
text(x, y, labels=labs, ...)
}
lasso<-glmnet(scale(x),y,alpha=1)
plot(lasso,xvar="lambda",label=T)
lbs_fun(lasso)
abline(v=cv.lasso$lambda.min,col="red",lty=2)
abline(v=cv.lasso$lambda.1se,col="blue",lty=2)

### Interpretación: lo primero que se puede ver en esta grafica es como los coeficientes son negativos, hablando de
#la variable "age" vemos que mientras mas vieja el precio si sera menor por lo que esta parte de nuestra hipotesis es correcta,
#pasando a la siguiente variable la cual es "crime" vemos que igual estamos correctos en nuestra hipotesis que dice que al tener
#mayor crimen en la zona la casa tendra un menor valor y finalmente con nuestra variable de "distance" tiene menor relacion
#ya que esta mucho mas cerca del 0 y aqui no tiene gran relevancia en la hipotesis hablando acerca de la distancia que la casa sea mas barata.
#Por lo que en general la hipotesis seria rechazada ya que una de las 3 variables que usamos fue incorrecta.
#Hypothesis 3: Entre mas area para habitar, que tenga vista al rio, menor distancia a centros de trabajos y que entre mas cuartos tenga mayor sera la el costo de la casa.
# Multiple Linear Regression Analysis
model1<-lm(medv ~ zn+chas+rm+dis,data=datains)
summary(model1)
##
## Call:
## lm(formula = medv ~ zn + chas + rm + dis, data = datains)
##
## Residuals:
## Min 1Q Median 3Q Max
## -24.591 -3.351 0.097 2.688 39.693
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -30.96429 2.71155 -11.419 < 2e-16 ***
## zn 0.05690 0.01678 3.391 0.000751 ***
## chas 4.61757 1.13091 4.083 5.17e-05 ***
## rm 8.26060 0.42797 19.302 < 2e-16 ***
## dis 0.16237 0.18125 0.896 0.370762
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.381 on 501 degrees of freedom
## Multiple R-squared: 0.5225, Adjusted R-squared: 0.5187
## F-statistic: 137 on 4 and 501 DF, p-value: < 2.2e-16
model2<-lm(log(medv) ~ zn+chas+rm+dis,data=datains)
summary(model2)
##
## Call:
## lm(formula = log(medv) ~ zn + chas + rm + dis, data = datains)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.1104 -0.1338 0.0420 0.1629 1.4281
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.8130409 0.1275211 6.376 4.14e-10 ***
## zn 0.0010743 0.0007892 1.361 0.174
## chas 0.2088265 0.0531853 3.926 9.83e-05 ***
## rm 0.3257937 0.0201269 16.187 < 2e-16 ***
## dis 0.0388205 0.0085238 4.554 6.61e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3001 on 501 degrees of freedom
## Multiple R-squared: 0.4653, Adjusted R-squared: 0.461
## F-statistic: 109 on 4 and 501 DF, p-value: < 2.2e-16
model3<-lm(log(medv) ~ zn+I(zn^2)+chas+rm+dis,data=datains)
summary(model3)
##
## Call:
## lm(formula = log(medv) ~ zn + I(zn^2) + chas + rm + dis, data = datains)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.11130 -0.14435 0.04365 0.16144 1.42502
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.332e-01 1.284e-01 6.487 2.11e-10 ***
## zn 3.621e-03 2.162e-03 1.675 0.094628 .
## I(zn^2) -3.151e-05 2.491e-05 -1.265 0.206484
## chas 2.079e-01 5.316e-02 3.912 0.000104 ***
## rm 3.231e-01 2.022e-02 15.978 < 2e-16 ***
## dis 3.588e-02 8.831e-03 4.062 5.64e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2999 on 500 degrees of freedom
## Multiple R-squared: 0.467, Adjusted R-squared: 0.4617
## F-statistic: 87.62 on 5 and 500 DF, p-value: < 2.2e-16
### Diagnostic Tests
# Model 1:
vif(model1)
## zn chas rm dis
## 1.899862 1.023388 1.121511 1.806664
bptest(model1)
##
## studentized Breusch-Pagan test
##
## data: model1
## BP = 38.041, df = 4, p-value = 1.099e-07
AIC(model1)
## [1] 3318.475
histogram(model1$residuals)

# Model 2:
vif(model2)
## zn chas rm dis
## 1.899862 1.023388 1.121511 1.806664
bptest(model2)
##
## studentized Breusch-Pagan test
##
## data: model2
## BP = 57.68, df = 4, p-value = 8.906e-12
AIC(model2)
## [1] 224.7979
histogram(model2$residuals)

# Model 3:
vif(model3)
## zn I(zn^2) chas rm dis
## 14.275978 11.583736 1.023563 1.133692 1.941621
bptest(model3)
##
## studentized Breusch-Pagan test
##
## data: model3
## BP = 56.788, df = 5, p-value = 5.592e-11
AIC(model3)
## [1] 225.1812
histogram(model3$residuals)

# Detect Heteroscedasticity
boxplot(formula=medv ~ chas, data=datains, col=cm.colors(3))

#Interpretación
# En este caso el modelo que mejor se acopla es el modelo 2, pues tiene un valor AIC de 224, el más bajo.
# Podemos observar que los valores que tienen más peso son los de vista al río, cantidad de cuartos y distancia al trabajo.
# Definitivamente las habitaciones son el valor que más incrementa el precio de la casa, por cada habitación extra el valor sube .32 unidades.
# Tenemos algo similar en el caso de la vista al río, incrementa .21 unidades pero no hay muchas propiedades con esta característica.
# Finalmente las variables de tamaño del lote residencial y distancia al trabajo no afectan tanto como las otras variables el precio de la casa.
# Podemos concluir que el modelo confirma nuestra hipotesis, todos estos valores sí incrementan el precio de la propiedad, pero algunos como cantidad de habitaciones y vista al río tienen un peso más grande.
#E: Results discussion
#El analisis de regresion lineal nos ayuda a mejorar la predeccion al identificar patrones o tendencias que tengan
#las variables seleccionadas, asi logrando poder tener una mejor prediccion sobre como estas variables se comportaran en un futuro y
#sobre como se afecatn entre ellas, tambien se puede ver la relevancia de las variables seleccionadas y esto lo pudimos
#ver en los 3 ejemplos que hicimos con las hipotesis creadas donde vimos que las variables que seleccionamos como se comportaban
#y que tan relevantes eran para nuestra variable dependiente.
library(htmlwidgets)