Assigment 1

library(foreign)
library(dplyr)        # data manipulation

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(forcats)      # to work with categorical variables
library(ggplot2)      # data visualization 
library(readr)        # read specific csv files
library(janitor)      # data exploration and cleaning

## 
## Attaching package: 'janitor'

## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test

library(Hmisc)        # several useful functions for data analysis

## 
## Attaching package: 'Hmisc'

## The following objects are masked from 'package:dplyr':
## 
##     src, summarize

## The following objects are masked from 'package:base':
## 
##     format.pval, units

library(psych)        # functions for multivariate analysis

## 
## Attaching package: 'psych'

## The following object is masked from 'package:Hmisc':
## 
##     describe

## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha

library(naniar)       # summaries and visualization of missing values NA's
library(dlookr)       # summaries and visualization of missing values NA's

## 
## Attaching package: 'dlookr'

## The following object is masked from 'package:psych':
## 
##     describe

## The following object is masked from 'package:Hmisc':
## 
##     describe

## The following object is masked from 'package:base':
## 
##     transform

library(corrplot)     # correlation plots

## corrplot 0.92 loaded

library(jtools)       # presentation of regression analysis

## 
## Attaching package: 'jtools'

## The following object is masked from 'package:Hmisc':
## 
##     %nin%

library(lmtest)       # diagnostic checks - linear regression analysis

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

library(car)          # diagnostic checks - linear regression analysis

## Loading required package: carData

## 
## Attaching package: 'car'

## The following object is masked from 'package:psych':
## 
##     logit

## The following object is masked from 'package:dplyr':
## 
##     recode

library(olsrr)        # diagnostic checks - linear regression analysis

## 
## Attaching package: 'olsrr'

## The following object is masked from 'package:datasets':
## 
##     rivers

library(naniar)       # identifying missing values
library(stargazer)    # create publication quality tables

## 
## Please cite as:

##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.

##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer

library(effects)      # displays for linear and other regression models

## Registered S3 method overwritten by 'survey':
##   method      from  
##   summary.pps dlookr

## lattice theme set by effectsTheme()
## See ?effectsTheme for details.

library(tidyverse)    # collection of R packages designed for data science

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.1     ✔ tidyr     1.3.0
## ✔ stringr   1.5.0

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ psych::%+%()       masks ggplot2::%+%()
## ✖ psych::alpha()     masks ggplot2::alpha()
## ✖ tidyr::extract()   masks dlookr::extract()
## ✖ dplyr::filter()    masks stats::filter()
## ✖ dplyr::lag()       masks stats::lag()
## ✖ car::recode()      masks dplyr::recode()
## ✖ purrr::some()      masks car::some()
## ✖ Hmisc::src()       masks dplyr::src()
## ✖ Hmisc::summarize() masks dplyr::summarize()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(caret)        # Classification and Regression Training

## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift

library(glmnet)       # methods for prediction and plotting, and functions for cross-validation

## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## Loaded glmnet 4.1-7

library(corrplot)

#Por si los necesitan instalar.
#install.packages("janitor")
#install.packages("lmtest")
#install.packages("tidyverse")
#install.packages("glmnet")
#install.packages("missRanger")
#install.packages("corrplot")
#install.packages("missForest")
#install.packages("ggplot2")

# A) Exploratory Data Analysis

#Here we load the dataset
datains <- read_csv("/Users/valeriacantulobo/Downloads/real_estate_data.csv")

## Rows: 506 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): medv, cmedv, crim, zn, indus, chas, nox, rm, age, dis, rad, tax, p...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

#Here we analyze the first 6 rows and the columns
head(datains)

## # A tibble: 6 × 15
##    medv cmedv    crim    zn indus  chas   nox    rm   age   dis   rad   tax
##   <dbl> <dbl>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1  24    24   0.00632    18  2.31     0 0.538  6.58  65.2  4.09     1   296
## 2  21.6  21.6 0.0273      0  7.07     0 0.469  6.42  78.9  4.97     2   242
## 3  34.7  34.7 0.0273      0  7.07     0 0.469  7.18  61.1  4.97     2   242
## 4  33.4  33.4 0.0324      0  2.18     0 0.458  7.00  45.8  6.06     3   222
## 5  36.2  36.2 0.0690      0  2.18     0 0.458  7.15  54.2  6.06     3   222
## 6  28.7  28.7 0.0298      0  2.18     0 0.458  6.43  58.7  6.06     3   222
## # … with 3 more variables: ptratio <dbl>, b <dbl>, lstat <dbl>

#Here we see the structure of the dataset
str(datains)

## spc_tbl_ [506 × 15] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ medv   : num [1:506] 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
##  $ cmedv  : num [1:506] 24 21.6 34.7 33.4 36.2 28.7 22.9 22.1 16.5 18.9 ...
##  $ crim   : num [1:506] 0.00632 0.02731 0.02729 0.03237 0.06905 ...
##  $ zn     : num [1:506] 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
##  $ indus  : num [1:506] 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
##  $ chas   : num [1:506] 0 0 0 0 0 0 0 0 0 0 ...
##  $ nox    : num [1:506] 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
##  $ rm     : num [1:506] 6.58 6.42 7.18 7 7.15 ...
##  $ age    : num [1:506] 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
##  $ dis    : num [1:506] 4.09 4.97 4.97 6.06 6.06 ...
##  $ rad    : num [1:506] 1 2 2 3 3 3 5 5 5 5 ...
##  $ tax    : num [1:506] 296 242 242 222 222 222 311 311 311 311 ...
##  $ ptratio: num [1:506] 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
##  $ b      : num [1:506] 397 397 393 395 397 ...
##  $ lstat  : num [1:506] 4.98 9.14 4.03 2.94 5.33 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   medv = col_double(),
##   ..   cmedv = col_double(),
##   ..   crim = col_double(),
##   ..   zn = col_double(),
##   ..   indus = col_double(),
##   ..   chas = col_double(),
##   ..   nox = col_double(),
##   ..   rm = col_double(),
##   ..   age = col_double(),
##   ..   dis = col_double(),
##   ..   rad = col_double(),
##   ..   tax = col_double(),
##   ..   ptratio = col_double(),
##   ..   b = col_double(),
##   ..   lstat = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

#Here we try to identify if there are some missing values
sum(is.na(datains))

## [1] 0

gg_miss_var(datains)

#Here we use the descriptive stadistic
summary(datains)

##       medv           cmedv            crim                zn        
##  Min.   : 5.00   Min.   : 5.00   Min.   : 0.00632   Min.   :  0.00  
##  1st Qu.:17.02   1st Qu.:17.02   1st Qu.: 0.08205   1st Qu.:  0.00  
##  Median :21.20   Median :21.20   Median : 0.25651   Median :  0.00  
##  Mean   :22.53   Mean   :22.53   Mean   : 3.61352   Mean   : 11.36  
##  3rd Qu.:25.00   3rd Qu.:25.00   3rd Qu.: 3.67708   3rd Qu.: 12.50  
##  Max.   :50.00   Max.   :50.00   Max.   :88.97620   Max.   :100.00  
##      indus            chas              nox               rm       
##  Min.   : 0.46   Min.   :0.00000   Min.   :0.3850   Min.   :3.561  
##  1st Qu.: 5.19   1st Qu.:0.00000   1st Qu.:0.4490   1st Qu.:5.886  
##  Median : 9.69   Median :0.00000   Median :0.5380   Median :6.208  
##  Mean   :11.14   Mean   :0.06917   Mean   :0.5547   Mean   :6.285  
##  3rd Qu.:18.10   3rd Qu.:0.00000   3rd Qu.:0.6240   3rd Qu.:6.623  
##  Max.   :27.74   Max.   :1.00000   Max.   :0.8710   Max.   :8.780  
##       age              dis              rad              tax       
##  Min.   :  2.90   Min.   : 1.130   Min.   : 1.000   Min.   :187.0  
##  1st Qu.: 45.02   1st Qu.: 2.100   1st Qu.: 4.000   1st Qu.:279.0  
##  Median : 77.50   Median : 3.207   Median : 5.000   Median :330.0  
##  Mean   : 68.57   Mean   : 3.795   Mean   : 9.549   Mean   :408.2  
##  3rd Qu.: 94.08   3rd Qu.: 5.188   3rd Qu.:24.000   3rd Qu.:666.0  
##  Max.   :100.00   Max.   :12.127   Max.   :24.000   Max.   :711.0  
##     ptratio            b              lstat      
##  Min.   :12.60   Min.   :  0.32   Min.   : 1.73  
##  1st Qu.:17.40   1st Qu.:375.38   1st Qu.: 6.95  
##  Median :19.05   Median :391.44   Median :11.36  
##  Mean   :18.46   Mean   :356.67   Mean   :12.65  
##  3rd Qu.:20.20   3rd Qu.:396.23   3rd Qu.:16.95  
##  Max.   :22.00   Max.   :396.90   Max.   :37.97

# Here since we did not obtain the standard deviation with the summary we go individually pero numeric column
standard_deviationmedv <- sd(datains$medv)
standard_deviationcmed <- sd(datains$cmedv)
standard_deviationcrim <- sd(datains$crim)
standard_deviationzn <- sd(datains$zn)
standard_deviationindus <- sd(datains$indus)
standard_deviationchas <- sd(datains$chas)
standard_deviationnox <- sd(datains$nox)
standard_deviationrm <- sd(datains$rm)
standard_deviationage <- sd(datains$age)
standard_deviationdis <- sd(datains$dis)
standard_deviationrad <- sd(datains$rad)
standard_deviationtax <- sd(datains$tax)
standard_deviatinpratio <- sd(datains$ptratio)
standard_deviationb <- sd(datains$b)
standard_deviationistat <- sd(datains$lstat)

print(standard_deviationmedv)

## [1] 9.197104

print(standard_deviationcmed)

## [1] 9.182176

print(standard_deviationcrim)

## [1] 8.601545

print(standard_deviationzn)

## [1] 23.32245

print(standard_deviationindus)

## [1] 6.860353

print(standard_deviationchas)

## [1] 0.253994

print(standard_deviationnox)

## [1] 0.1158777

print(standard_deviationrm)

## [1] 0.7026171

print(standard_deviationage)

## [1] 28.14886

print(standard_deviationdis)

## [1] 2.10571

print(standard_deviationrad)

## [1] 8.707259

print(standard_deviationtax)

## [1] 168.5371

print(standard_deviatinpratio)

## [1] 2.164946

print(standard_deviationb)

## [1] 91.29486

print(standard_deviationistat)

## [1] 7.141062

# B) Data Visualization

#2 Graphs

# median home value compared to crime rate.
plot_crime <- ggplot(datains, aes(x = crim, y = medv)) +
  geom_point() +
  labs(x = "Crime Rate", y = "Median Value")
print(plot_crime)

# median home value compared to rooms in the home.
plot_rooms <- ggplot(datains, aes(x = rm, y = medv)) +
  geom_point() +
  labs(x = "Number of rooms", y = "Median Value")
print(plot_rooms)

# Here we display a histogram of the dependent variable.
hist(datains$medv)

hist(log(datains$medv))

# Here we display a correlation plot, without qualitative variables.
cor_matrix <- cor(datains %>% select(-chas))
corrplot(cor_matrix, method = "circle", type = "upper", order = "hclust", addCoef.col = "black")

# Hipotesis 1: Entre más vieja y mejor ubicada este la casa, más caro será el precio de ella

#################################
### LASSO REGRESSION ANALYSIS ###
#################################

set.seed(123)                                  ### sets the random seed for reproducibility of results
training.samples<-datains$medv %>%
  createDataPartition(p=0.75,list=FALSE)       ### Lets consider 75% of the data to build a predictive model

train.data<-datains[training.samples, ]  ### training data to fit the linear regression model 
test.data<-datains[-training.samples, ]  ### testing data to test the linear regression model         


# LASSO regression via glmnet package can only take numerical observations. Then, the dataset is transformed to model.matrix() format. 
# Independent variables
x<-model.matrix(log(medv) ~ age+dis,train.data)[,-1] ### OLS model specification
# x<-model.matrix(Weekly_Sales~.,train.data)[,-1] ### matrix of independent variables X's
y<-train.data$medv ### dependent variable 

# In estimating LASSO regression it is important to define the lambda that minimizes the prediction error rate. 
# Cross-validation ensures that every data / observation from the original dataset (datains) has a chance of appearing in train and test datasets.
# Find the best lambda using cross-validation.
set.seed(123) 
cv.lasso<-cv.glmnet(x,y,alpha=1) # alpha = 1 for LASSO

# Display the best lambda value
cv.lasso$lambda.min                      ### lambda: a numeric value defining the amount of shrinkage. Why min? the higher the value of ?? , the more penalization there is

## [1] 0.122487

# Fit the final model on the training data
lassomodel<-glmnet(x,y,alpha=1,lambda=cv.lasso$lambda.min)

# Display regression coefficients
coef(lassomodel)

## 3 x 1 sparse Matrix of class "dgCMatrix"
##                     s0
## (Intercept) 30.3328641
## age         -0.1202913
## dis          .

# Make predictions on the test data
x.test<-model.matrix(log(medv) ~ age+dis,test.data)[,-1] ### OLS model specification
# x.test<-model.matrix(Weekly_Sales~.,test.data)[,-1]
lassopredictions <- lassomodel %>% predict(x.test) %>% as.vector()

# Model Accuracy
data.frame(
  RMSE = RMSE(lassopredictions, test.data$medv),
  Rsquare = R2(lassopredictions, test.data$medv))

##       RMSE  Rsquare
## 1 10.40051 0.107037

### visualizing lasso regression results 
lbs_fun <- function(fit, offset_x=1, ...) {
  L <- length(fit$lambda)
  x <- log(fit$lambda[L])+ offset_x
  y <- fit$beta[, L]
  labs <- names(y)
  text(x, y, labels=labs, ...)
}

lasso<-glmnet(scale(x),y,alpha=1)

plot(lasso,xvar="lambda",label=T)
lbs_fun(lasso)
abline(v=cv.lasso$lambda.min,col="red",lty=2)
abline(v=cv.lasso$lambda.1se,col="blue",lty=2)

print(lassomodel)

## 
## Call:  glmnet(x = x, y = y, alpha = 1, lambda = cv.lasso$lambda.min) 
## 
##   Df  %Dev Lambda
## 1  1 16.62 0.1225

## Interpretación Hipotesis 1 Modelo Lasso 
### Despues de haber llevado a cabo la regressión LASSO, hemos podido elaborar diferentes tipos de insights. 
#Primeramente, la gráfica que hemos creado nos enseña claramente que los coefficientes son negativos y cerca de cero. 
#Lo que esto nos dice es que para la variable de edad de la propiedad, mientas más vieja sea menor será el precio de ella. 
#Esto cumple con nuestra hipotesis ya que podemos ver que mientas mas nueva sea la propiedad mayor efecto tiene en el medv. 
#Por el lado de la distancia al boston work center, podemos ver que la variable tiene un coeficiente muy cercano al cero. 
#El mismo analisis nos permite decri que no hay relación, o hay una muy muy minima entre estas dos variables. 
#La variable de distancia no tiene un efecto negativo o positivo entre medv, no hay relación.

#######HYPOTHESIS 2: entre mayor distancia a centros de trabajo, mayor crimen por la zona y mas vieja la casa esta sera menos cara #########

#################################
### LASSO REGRESSION ANALYSIS ###
#################################

### Split the Data in Training Data vs Test Data
# Lets randomly split the data into train and test set
set.seed(123)                                  ### sets the random seed for reproducibility of results
training.samples<-datains$medv %>%
  createDataPartition(p=0.75,list=FALSE)       ### Lets consider 75% of the data to build a predictive model

train.data<-datains[training.samples, ]  ### training data to fit the linear regression model 
test.data<-datains[-training.samples, ]  ### testing data to test the linear regression model   

# LASSO regression via glmnet package can only take numerical observations. Then, the dataset is transformed to model.matrix() format. 
# Independent variables
x<-model.matrix(log(medv) ~ crim+age+dis,train.data)[,-1] ### OLS model specification
# x<-model.matrix(Weekly_Sales~.,train.data)[,-1] ### matrix of independent variables X's
y<-train.data$medv ### dependent variable 

# In estimating LASSO regression it is important to define the lambda that minimizes the prediction error rate. 
# Cross-validation ensures that every data / observation from the original dataset (datains) has a chance of appearing in train and test datasets.
# Find the best lambda using cross-validation.
set.seed(123) 
cv.lasso<-cv.glmnet(x,y,alpha=1) # alpha = 1 for LASSO

# Display the best lambda value
cv.lasso$lambda.min                      ### lambda: a numeric value defining the amount of shrinkage. Why min? the higher the value of ?? , the more penalization there is

## [1] 0.05572061

# Fit the final model on the training data
lassomodel<-glmnet(x,y,alpha=1,lambda=cv.lasso$lambda.min)

# Display regression coefficients
coef(lassomodel)

## 4 x 1 sparse Matrix of class "dgCMatrix"
##                     s0
## (Intercept) 31.8613624
## crim        -0.3336781
## age         -0.1054565
## dis         -0.3404518

# Make predictions on the test data
x.test<-model.matrix(log(medv) ~ crim+age+dis,test.data)[,-1] ### OLS model specification
# x.test<-model.matrix(Weekly_Sales~.,test.data)[,-1]
lassopredictions <- lassomodel %>% predict(x.test) %>% as.vector()


# Model Accuracy
data.frame(
  RMSE = RMSE(lassopredictions, test.data$medv),
  Rsquare = R2(lassopredictions, test.data$medv))

##       RMSE   Rsquare
## 1 10.06598 0.1596282

### visualizing lasso regression results 
lbs_fun <- function(fit, offset_x=1, ...) {
  L <- length(fit$lambda)
  x <- log(fit$lambda[L])+ offset_x
  y <- fit$beta[, L]
  labs <- names(y)
  text(x, y, labels=labs, ...)
}

lasso<-glmnet(scale(x),y,alpha=1)

plot(lasso,xvar="lambda",label=T)
lbs_fun(lasso)
abline(v=cv.lasso$lambda.min,col="red",lty=2)
abline(v=cv.lasso$lambda.1se,col="blue",lty=2)

### Interpretación: lo primero que se puede ver en esta grafica es como los coeficientes son negativos, hablando de 
#la variable "age" vemos que mientras mas vieja el precio si sera menor por lo que esta parte de nuestra hipotesis es correcta, 
#pasando a la siguiente variable la cual es "crime" vemos que igual estamos correctos en nuestra hipotesis que dice que al tener 
#mayor crimen en la zona la casa tendra un menor valor y finalmente con nuestra variable de "distance" tiene menor relacion 
#ya que esta mucho mas cerca del 0 y aqui no tiene gran relevancia en la hipotesis hablando acerca de la distancia que la casa sea mas barata. 
#Por lo que en general la hipotesis seria rechazada ya que una de las 3 variables que usamos fue incorrecta.

#Hypothesis 3: Entre mas area para habitar, que tenga vista al rio, menor distancia a centros de trabajos y que entre mas cuartos tenga mayor sera la el costo de la casa.

# Multiple Linear Regression Analysis 
model1<-lm(medv ~ zn+chas+rm+dis,data=datains)
summary(model1)

## 
## Call:
## lm(formula = medv ~ zn + chas + rm + dis, data = datains)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -24.591  -3.351   0.097   2.688  39.693 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -30.96429    2.71155 -11.419  < 2e-16 ***
## zn            0.05690    0.01678   3.391 0.000751 ***
## chas          4.61757    1.13091   4.083 5.17e-05 ***
## rm            8.26060    0.42797  19.302  < 2e-16 ***
## dis           0.16237    0.18125   0.896 0.370762    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.381 on 501 degrees of freedom
## Multiple R-squared:  0.5225, Adjusted R-squared:  0.5187 
## F-statistic:   137 on 4 and 501 DF,  p-value: < 2.2e-16

model2<-lm(log(medv) ~ zn+chas+rm+dis,data=datains)
summary(model2)

## 
## Call:
## lm(formula = log(medv) ~ zn + chas + rm + dis, data = datains)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.1104 -0.1338  0.0420  0.1629  1.4281 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.8130409  0.1275211   6.376 4.14e-10 ***
## zn          0.0010743  0.0007892   1.361    0.174    
## chas        0.2088265  0.0531853   3.926 9.83e-05 ***
## rm          0.3257937  0.0201269  16.187  < 2e-16 ***
## dis         0.0388205  0.0085238   4.554 6.61e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3001 on 501 degrees of freedom
## Multiple R-squared:  0.4653, Adjusted R-squared:  0.461 
## F-statistic:   109 on 4 and 501 DF,  p-value: < 2.2e-16

model3<-lm(log(medv) ~ zn+I(zn^2)+chas+rm+dis,data=datains)
summary(model3)

## 
## Call:
## lm(formula = log(medv) ~ zn + I(zn^2) + chas + rm + dis, data = datains)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.11130 -0.14435  0.04365  0.16144  1.42502 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  8.332e-01  1.284e-01   6.487 2.11e-10 ***
## zn           3.621e-03  2.162e-03   1.675 0.094628 .  
## I(zn^2)     -3.151e-05  2.491e-05  -1.265 0.206484    
## chas         2.079e-01  5.316e-02   3.912 0.000104 ***
## rm           3.231e-01  2.022e-02  15.978  < 2e-16 ***
## dis          3.588e-02  8.831e-03   4.062 5.64e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2999 on 500 degrees of freedom
## Multiple R-squared:  0.467,  Adjusted R-squared:  0.4617 
## F-statistic: 87.62 on 5 and 500 DF,  p-value: < 2.2e-16

### Diagnostic Tests
# Model 1:
vif(model1)

##       zn     chas       rm      dis 
## 1.899862 1.023388 1.121511 1.806664

bptest(model1)

## 
##  studentized Breusch-Pagan test
## 
## data:  model1
## BP = 38.041, df = 4, p-value = 1.099e-07

AIC(model1)

## [1] 3318.475

histogram(model1$residuals)

# Model 2:
vif(model2)

##       zn     chas       rm      dis 
## 1.899862 1.023388 1.121511 1.806664

bptest(model2)

## 
##  studentized Breusch-Pagan test
## 
## data:  model2
## BP = 57.68, df = 4, p-value = 8.906e-12

AIC(model2)

## [1] 224.7979

histogram(model2$residuals)

# Model 3:
vif(model3)

##        zn   I(zn^2)      chas        rm       dis 
## 14.275978 11.583736  1.023563  1.133692  1.941621

bptest(model3)

## 
##  studentized Breusch-Pagan test
## 
## data:  model3
## BP = 56.788, df = 5, p-value = 5.592e-11

AIC(model3)

## [1] 225.1812

histogram(model3$residuals)

# Detect Heteroscedasticity 

boxplot(formula=medv ~ chas, data=datains, col=cm.colors(3))

#Interpretación
# En este caso el modelo que mejor se acopla es el modelo 2, pues tiene un valor AIC de 224, el más bajo.
# Podemos observar que los valores que tienen más peso son los de vista al río, cantidad de cuartos y distancia al trabajo.
# Definitivamente las habitaciones son el valor que más incrementa el precio de la casa, por cada habitación extra el valor sube .32 unidades.
# Tenemos algo similar en el caso de la vista al río, incrementa .21 unidades pero no hay muchas propiedades con esta característica.
# Finalmente las variables de tamaño del lote residencial y distancia al trabajo no afectan tanto como las otras variables el precio de la casa.
# Podemos concluir que el modelo confirma nuestra hipotesis, todos estos valores sí incrementan el precio de la propiedad, pero algunos como cantidad de habitaciones y vista al río tienen un peso más grande.

#E: Results discussion
#El analisis de regresion lineal nos ayuda a mejorar la predeccion al identificar patrones o tendencias que tengan 
#las variables seleccionadas, asi logrando poder tener una mejor prediccion sobre como estas variables se comportaran en un futuro y 
#sobre como se afecatn entre ellas, tambien se puede ver la relevancia de las variables seleccionadas y esto lo pudimos 
#ver en los 3 ejemplos que hicimos con las hipotesis creadas donde vimos que las variables que seleccionamos como se comportaban 
#y que tan relevantes eran para nuestra variable dependiente.

library(htmlwidgets)

Assigment 1

equipo 2

2023-09-03