Objetivos

Análisis exploratorio

# cargando las librerias
library(tidyverse)
## Warning: package 'tibble' was built under R version 4.0.5
## Warning: package 'tidyr' was built under R version 4.0.5
## Warning: package 'purrr' was built under R version 4.0.5
## Warning: package 'dplyr' was built under R version 4.0.5
## Warning: package 'stringr' was built under R version 4.0.5
## -- Attaching core tidyverse packages ------------------------ tidyverse 2.0.0 --
## v dplyr     1.0.8     v readr     2.1.4
## v forcats   1.0.0     v stringr   1.4.0
## v ggplot2   3.4.2     v tibble    3.1.6
## v lubridate 1.9.2     v tidyr     1.2.0
## v purrr     0.3.4     
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## i Use the [conflicted package](http://conflicted.r-lib.org/) to force all conflicts to become errors
library(lmtest)
## Warning: package 'lmtest' was built under R version 4.0.5
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.0.5
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(ggfortify)
head(df)
## # A tibble: 6 x 21
##   id         date                  price bedrooms bathrooms sqft_living sqft_lot
##   <chr>      <dttm>                <dbl>    <dbl>     <dbl>       <dbl>    <dbl>
## 1 7129300520 2014-10-13 00:00:00  221900        3      1           1180     5650
## 2 6414100192 2014-12-09 00:00:00  538000        3      2.25        2570     7242
## 3 5631500400 2015-02-25 00:00:00  180000        2      1            770    10000
## 4 2487200875 2014-12-09 00:00:00  604000        4      3           1960     5000
## 5 1954400510 2015-02-18 00:00:00  510000        3      2           1680     8080
## 6 7237550310 2014-05-12 00:00:00 1225000        4      4.5         5420   101930
## # i 14 more variables: floors <dbl>, waterfront <dbl>, view <dbl>,
## #   condition <dbl>, grade <dbl>, sqft_above <dbl>, sqft_basement <dbl>,
## #   yr_built <dbl>, yr_renovated <dbl>, zipcode <dbl>, lat <dbl>, long <dbl>,
## #   sqft_living15 <dbl>, sqft_lot15 <dbl>

Filtrando datos por zipcode 98178

df=df %>% filter(zipcode=="98178") %>% 
  select(-c(id, zipcode, date))

Explorando los datos

ggplot(df, aes(x = price)) +
  geom_histogram() +
  labs(title = "Histograma de price")
## 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(df, aes(x = bathrooms)) +
  geom_bar() +
  labs(title = "Gráfico de barras de bathrooms")

ggplot(df, aes(x = bedrooms)) +
  geom_bar() +
  labs(title = "Gráfico de barras de bedrooms")

ggplot(df, aes(x = sqft_living)) +
  geom_histogram() +
  labs(title = "Histograma de sqft_living")
## 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(df, aes(x = sqft_lot)) +
  geom_histogram() +
  labs(title = "Histograma de sqft_lot")
## 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(df, aes(x = floors)) +
  geom_bar() +
  labs(title = "Histograma de floors")

ggplot(df, aes(x = waterfront)) +
  geom_bar() +
  labs(title = "Histograma de waterfront")

ggplot(df, aes(x = view)) +
  geom_bar() +
  labs(title = "Histograma de view",
       x = "Price",
       y = "Frecuencia") 

ggplot(df, aes(x = sqft_living15)) +
  geom_histogram() +
  labs(title = "Histograma de sqft_living15",
       x = "Price",
       y = "Frecuencia") 
## 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(df, aes(x = sqft_lot15)) +
  geom_histogram() +
  labs(title = "Histograma de sqft_lot15")
## 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(df, aes(x = yr_built)) +
  geom_histogram() +
  labs(title = "Histograma de yr_built")
## 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

numerical_variables= df %>% select(price, sqft_living, sqft_lot, sqft_above, sqft_basement, sqft_living15, sqft_living15, sqft_lot15)
autoplot(cor(numerical_variables))
## 
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.

modelo de regresión lineal simple

modelo1=lm(price~sqft_living,data= df)

summary(modelo1)
## 
## Call:
## lm(formula = price ~ sqft_living, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -174111  -61000  -12016   27040 1190976 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 75701.18   21186.09   3.573  0.00042 ***
## sqft_living   135.84      11.34  11.975  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 129500 on 260 degrees of freedom
## Multiple R-squared:  0.3555, Adjusted R-squared:  0.353 
## F-statistic: 143.4 on 1 and 260 DF,  p-value: < 2.2e-16

Diagnostico del modelo

autoplot(modelo1)

shapiro.test(modelo1$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  modelo1$residuals
## W = 0.63742, p-value < 2.2e-16
bptest(modelo1)
## 
##  studentized Breusch-Pagan test
## 
## data:  modelo1
## BP = 7.8787, df = 1, p-value = 0.005002

Visualización de recta de regresión del modelo ajustado

intervalos_prediccion = predict(modelo1, interval = "prediction")
## Warning in predict.lm(modelo1, interval = "prediction"): predictions on current data refer to _future_ responses
df_1 <- cbind(df,intervalos_prediccion)

# Tu código existente
ggplot(data = df_1, aes(x = sqft_living, y = price)) + 
  geom_point() + 
  geom_smooth(method = 'lm', se = TRUE) +
  geom_line(aes(y = lwr), linetype = "dashed") +
  geom_line(aes(y = upr), linetype = "dashed")
## 
## `geom_smooth()` using formula = 'y ~ x'

confint(modelo1)
##                  2.5 %      97.5 %
## (Intercept) 33983.0176 117419.3449
## sqft_living   113.5017    158.1743
sigma(modelo1)
## [1] 129537.5

Predicciones con el modelo de regresión lineal simple

df_filtrado=df[c(1,10,30,50,100,25,228),]

df_filtrado
## # A tibble: 7 x 18
##     price bedrooms bathrooms sqft_living sqft_lot floors waterfront  view
##     <dbl>    <dbl>     <dbl>       <dbl>    <dbl>  <dbl>      <dbl> <dbl>
## 1  221900        3      1           1180     5650      1          0     0
## 2  290900        2      2           1610    17600      2          0     0
## 3  240000        4      1.5         1920     7973      1          0     0
## 4  100000        2      1            790     6426      1          0     0
## 5  210000        4      1.75        2090     6485      1          0     0
## 6 1700000        4      3.75        3190    17186      2          1     4
## 7   90000        2      1            580     7500      1          0     0
## # i 10 more variables: condition <dbl>, grade <dbl>, sqft_above <dbl>,
## #   sqft_basement <dbl>, yr_built <dbl>, yr_renovated <dbl>, lat <dbl>,
## #   long <dbl>, sqft_living15 <dbl>, sqft_lot15 <dbl>
new_data=data.frame(sqft_living=df_filtrado$sqft_living)


predicciones=predict(modelo1, newdata = new_data, interval= "confidence")

cbind(df_filtrado$price, predicciones)
##                fit      lwr      upr
## 1  221900 235990.0 216017.5 255962.5
## 2  290900 294400.3 278417.8 310382.9
## 3  240000 336510.1 320186.2 352834.0
## 4  100000 183013.2 156772.7 209253.7
## 5  210000 359602.6 341904.3 377300.8
## 6 1700000 509024.4 472792.3 545256.4
## 7   90000 154487.2 124364.1 184610.3
new_data=data.frame(sqft_living=df_filtrado$sqft_living)


predicciones=predict(modelo1, newdata = new_data, interval= "prediction")

cbind(df_filtrado$price, predicciones)
##                fit        lwr      upr
## 1  221900 235990.0  -19866.86 491846.9
## 2  290900 294400.3   38823.97 549976.7
## 3  240000 336510.1   80912.18 592108.0
## 4  100000 183013.2  -73409.12 439435.5
## 5  210000 359602.6  103913.18 615292.0
## 6 1700000 509024.4  251387.80 766660.9
## 7   90000 154487.2 -102361.45 411335.9

Datos atipicos

autoplot(modelo1, which=4)

Reajustando el modelo

df_sin_atipicos=  subset(df, !(row.names(df) == "25"))
modelo2=lm(price~sqft_living, data=df_sin_atipicos)
summary(modelo2)
## 
## Call:
## lm(formula = price ~ sqft_living, data = df_sin_atipicos)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -159831  -53215   -9442   28230  615500 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 94605.159  17431.362   5.427 1.31e-07 ***
## sqft_living   122.224      9.367  13.048  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 106100 on 259 degrees of freedom
## Multiple R-squared:  0.3966, Adjusted R-squared:  0.3943 
## F-statistic: 170.2 on 1 and 259 DF,  p-value: < 2.2e-16
autoplot(modelo2)

shapiro.test(modelo2$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  modelo2$residuals
## W = 0.73529, p-value < 2.2e-16
bptest(modelo2)
## 
##  studentized Breusch-Pagan test
## 
## data:  modelo2
## BP = 4.5122, df = 1, p-value = 0.03365

Visualizando recta de regresión del modelo reajustado

intervalos_prediccion = predict(modelo2, interval = "prediction")
## Warning in predict.lm(modelo2, interval = "prediction"): predictions on current data refer to _future_ responses
df_sin_atipicos <- cbind(df_sin_atipicos,intervalos_prediccion)

# Tu código existente
ggplot(data = df_sin_atipicos, aes(x = sqft_living, y = price)) + 
  geom_point() + 
  geom_smooth(method = 'lm', se = TRUE) +
  geom_line(aes(y = lwr), linetype = "dashed") +
  geom_line(aes(y = upr), linetype = "dashed")
## 
## `geom_smooth()` using formula = 'y ~ x'

Datos atipicos

autoplot(modelo2, which = 4)

Reajustando el modelo

df_sin_atipicos2=  subset(df_sin_atipicos, !(row.names(df_sin_atipicos) == "68"))
modelo3= lm(price~sqft_living, data=df_sin_atipicos2)

summary(modelo3)
## 
## Call:
## lm(formula = price ~ sqft_living, data = df_sin_atipicos2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -155511  -51763   -9919   28324  588123 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.046e+05  1.634e+04   6.399 7.31e-10 ***
## sqft_living 1.150e+02  8.815e+00  13.050  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 98990 on 258 degrees of freedom
## Multiple R-squared:  0.3976, Adjusted R-squared:  0.3953 
## F-statistic: 170.3 on 1 and 258 DF,  p-value: < 2.2e-16
autoplot(modelo3)

shapiro.test(modelo3$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  modelo3$residuals
## W = 0.75762, p-value < 2.2e-16
bptest(modelo3)
## 
##  studentized Breusch-Pagan test
## 
## data:  modelo3
## BP = 1.4389, df = 1, p-value = 0.2303
df_sin_atipicos2=df_sin_atipicos2 %>% select(-c(fit, lwr, upr))
dim(df_sin_atipicos2)
## [1] 260  18
dim(intervalos_prediccion)
## [1] 261   3

Visualizando la recta de regresión del modelo reajustado

intervalos_prediccion = predict(modelo3, interval = "prediction")
## Warning in predict.lm(modelo3, interval = "prediction"): predictions on current data refer to _future_ responses
df_sin_atipicos2 <- cbind(df_sin_atipicos2,intervalos_prediccion)

# Tu código existente
ggplot(data = df_sin_atipicos2, aes(x = sqft_living, y = price)) + 
  geom_point() + 
  geom_smooth(method = 'lm', se = TRUE) +
  geom_line(aes(y = lwr), linetype = "dashed") +
  geom_line(aes(y = upr), linetype = "dashed")
## 
## `geom_smooth()` using formula = 'y ~ x'

autoplot(modelo3, which = 4)

Predicciones con modelo de regresión lineal simple reajustado

predicciones=predict(modelo3, newdata = new_data, interval= "confidence")

cbind(df_filtrado$price, predicciones)
##                fit      lwr      upr
## 1  221900 240327.9 225050.0 255605.8
## 2  290900 289795.2 277560.8 302029.6
## 3  240000 325457.7 312870.4 338045.0
## 4  100000 195462.1 175319.5 215604.7
## 5  210000 345014.6 331309.4 358719.8
## 6 1700000 471559.0 443292.9 499825.1
## 7   90000 171303.6 148141.7 194465.5
predicciones=predict(modelo3, newdata = new_data, interval= "prediction")

cbind(df_filtrado$price, predicciones)
##                fit         lwr      upr
## 1  221900 240327.9  44790.9213 435864.8
## 2  290900 289795.2  94472.5143 485117.9
## 3  240000 325457.7 130112.6119 520802.9
## 4  100000 195462.1   -514.9383 391439.1
## 5  210000 345014.6 149594.2607 540434.9
## 6 1700000 471559.0 274581.2314 668536.8
## 7   90000 171303.6 -25006.7189 367614.0

Modelo de regresión lineal multiple

# od el modelo lineal inicial
modelo_inicial <- lm(price ~ ., data = df)


intercept_only <- lm(price ~ 1, data=df)
 
all <- lm(price ~ ., data=df)
 
forward <- step(intercept_only, direction='forward', scope=formula(all), trace=0, k=3.84,criteria= "AIC")
 
formula(forward)
## price ~ view + waterfront + sqft_living + grade + sqft_above + 
##     bathrooms + condition
backward <- step(all, direction='backward', scope=formula(all), trace=0, k=3.84,criteria= "AIC")
 
formula(backward)
## price ~ bathrooms + sqft_living + waterfront + view + condition + 
##     grade + sqft_above
both <- step(intercept_only, direction='both', scope=formula(all), trace=0, k=3.84,criteria= "AIC")
 
formula(both)
## price ~ view + waterfront + sqft_living + grade + sqft_above + 
##     bathrooms + condition
# Realizar la selección de variables paso a paso con AIC
modelo_seleccionado <- both
summary(modelo_seleccionado)
## 
## Call:
## lm(formula = price ~ view + waterfront + sqft_living + grade + 
##     sqft_above + bathrooms + condition, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -203698  -33413   -4616   32805  546391 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -179330.58   52022.79  -3.447 0.000663 ***
## view          34262.31    5152.63   6.649 1.79e-10 ***
## waterfront   452127.51   28071.11  16.107  < 2e-16 ***
## sqft_living      32.04      11.41   2.807 0.005391 ** 
## grade         39312.52    8222.29   4.781 2.95e-06 ***
## sqft_above       42.87      12.74   3.364 0.000888 ***
## bathrooms     19744.01    8990.44   2.196 0.028988 *  
## condition     12552.54    6444.26   1.948 0.052533 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 65350 on 254 degrees of freedom
## Multiple R-squared:  0.8397, Adjusted R-squared:  0.8353 
## F-statistic: 190.1 on 7 and 254 DF,  p-value: < 2.2e-16

diagnostico del modelo

autoplot(modelo_seleccionado)

shapiro.test(modelo_seleccionado$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  modelo_seleccionado$residuals
## W = 0.86209, p-value = 1.44e-14
bptest(modelo_seleccionado)
## 
##  studentized Breusch-Pagan test
## 
## data:  modelo_seleccionado
## BP = 68.734, df = 7, p-value = 2.662e-12

Predicciones con el modelo seleccionado

df_filtrado=df[c(1,10,30,50,100,25,228),]

df_filtrado
## # A tibble: 7 x 18
##     price bedrooms bathrooms sqft_living sqft_lot floors waterfront  view
##     <dbl>    <dbl>     <dbl>       <dbl>    <dbl>  <dbl>      <dbl> <dbl>
## 1  221900        3      1           1180     5650      1          0     0
## 2  290900        2      2           1610    17600      2          0     0
## 3  240000        4      1.5         1920     7973      1          0     0
## 4  100000        2      1            790     6426      1          0     0
## 5  210000        4      1.75        2090     6485      1          0     0
## 6 1700000        4      3.75        3190    17186      2          1     4
## 7   90000        2      1            580     7500      1          0     0
## # i 10 more variables: condition <dbl>, grade <dbl>, sqft_above <dbl>,
## #   sqft_basement <dbl>, yr_built <dbl>, yr_renovated <dbl>, lat <dbl>,
## #   long <dbl>, sqft_living15 <dbl>, sqft_lot15 <dbl>
new_data=data.frame(bathrooms= df_filtrado$bathrooms,sqft_living=df_filtrado$sqft_living,waterfront= df_filtrado$waterfront, view= df_filtrado$view,condition= df_filtrado$condition, grade= df_filtrado$grade, sqft_above= df_filtrado$sqft_above)


predicciones=predict(modelo_seleccionado, newdata = new_data, interval= "confidence")

cbind(price=df_filtrado$price,predicciones)
##     price       fit        lwr       upr
## 1  221900  241643.8  225132.99  258154.7
## 2  290900  254283.5  235053.37  273513.6
## 3  240000  346256.3  321661.15  370851.5
## 4  100000  173119.3  159019.72  187218.8
## 5  210000  289891.8  275933.88  303849.7
## 6 1700000 1153608.5 1095691.19 1211525.9
## 7   90000  118077.2   95076.08  141078.3
predicciones=predict(modelo_seleccionado, newdata = new_data, interval= "prediction")
cbind(price=df_filtrado$price,predicciones)
##     price       fit        lwr       upr
## 1  221900  241643.8  111889.55  371398.1
## 2  290900  254283.5  124155.20  384411.7
## 3  240000  346256.3  215227.74  477284.9
## 4  100000  173119.3   43649.72  302588.8
## 5  210000  289891.8  160437.59  419346.0
## 6 1700000 1153608.5 1012477.44 1294739.6
## 7   90000  118077.2  -12661.55  248815.9

Datos atipicos

autoplot(modelo_seleccionado, which=4)

Reajustando el modelo seleccionado

df=  subset(df, !(row.names(df) == "25"))
modelo_seleccionado2= lm(formula = price ~ view + waterfront + sqft_living + grade + 
    sqft_above + bathrooms + condition, data = df)
autoplot(modelo_seleccionado2)

shapiro.test(modelo_seleccionado2$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  modelo_seleccionado2$residuals
## W = 0.98259, p-value = 0.002808
bptest(modelo_seleccionado2)
## 
##  studentized Breusch-Pagan test
## 
## data:  modelo_seleccionado2
## BP = 39.433, df = 7, p-value = 1.616e-06

Datos atipicos

autoplot(modelo_seleccionado2, which = 4)

Reajustando el modelo seleccionado

df=  subset(df, !(row.names(df) == "68"))
modelo_seleccionado3= lm(price ~ view + waterfront + sqft_living + grade + 
    sqft_above + bathrooms + condition, data = df)
summary(modelo_seleccionado3)
## 
## Call:
## lm(formula = price ~ view + waterfront + sqft_living + grade + 
##     sqft_above + bathrooms + condition, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -121865  -32499   -2335   32676  207970 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -65696.068  42496.700  -1.546 0.123381    
## view         37213.943   4111.922   9.050  < 2e-16 ***
## waterfront  349237.425  24498.609  14.255  < 2e-16 ***
## sqft_living     53.282      9.258   5.755 2.51e-08 ***
## grade        23105.355   6693.149   3.452 0.000652 ***
## sqft_above      20.300     10.340   1.963 0.050715 .  
## bathrooms    12771.285   7194.592   1.775 0.077085 .  
## condition    12745.708   5134.596   2.482 0.013705 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 52060 on 252 degrees of freedom
## Multiple R-squared:  0.8373, Adjusted R-squared:  0.8328 
## F-statistic: 185.2 on 7 and 252 DF,  p-value: < 2.2e-16
autoplot(modelo_seleccionado3)

shapiro.test(modelo_seleccionado3$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  modelo_seleccionado3$residuals
## W = 0.98103, p-value = 0.001533
bptest(modelo_seleccionado3)
## 
##  studentized Breusch-Pagan test
## 
## data:  modelo_seleccionado3
## BP = 27.892, df = 7, p-value = 0.00023

Predicciones con el modelo seleccionado reajustado

new_data=data.frame(bathrooms= df_filtrado$bathrooms,sqft_living=df_filtrado$sqft_living,waterfront= df_filtrado$waterfront, view= df_filtrado$view,condition= df_filtrado$condition, grade= df_filtrado$grade, sqft_above= df_filtrado$sqft_above)


predicciones=predict(modelo_seleccionado3, newdata = new_data, interval= "confidence")

cbind(price=df_filtrado$price,predicciones)
##     price      fit      lwr       upr
## 1  221900 233877.2 220640.8  247113.6
## 2  290900 255183.6 239814.5  270552.7
## 3  240000 317819.2 297692.6  337945.8
## 4  100000 182074.7 170732.9  193416.5
## 5  210000 293972.5 282833.3  305111.7
## 6 1700000 984308.3 929292.0 1039324.7
## 7   90000 143517.0 124735.9  162298.0
predicciones=predict(modelo_seleccionado3, newdata = new_data, interval= "prediction")
cbind(price=df_filtrado$price,predicciones)
##     price      fit       lwr       upr
## 1  221900 233877.2 130498.56  337255.8
## 2  290900 255183.6 151510.33  358856.9
## 3  240000 317819.2 213334.71  422303.8
## 4  100000 182074.7  78921.49  285227.8
## 5  210000 293972.5 190841.44  397103.6
## 6 1700000 984308.3 867952.26 1100664.4
## 7   90000 143517.0  39283.26  247750.7

Datos atipicos

autoplot(modelo_seleccionado3, which = 4)

`