Cargando datos

library(tidyverse)
## Warning: package 'tibble' was built under R version 4.0.5
## Warning: package 'tidyr' was built under R version 4.0.5
## Warning: package 'purrr' was built under R version 4.0.5
## Warning: package 'dplyr' was built under R version 4.0.5
## Warning: package 'stringr' was built under R version 4.0.5
## -- Attaching core tidyverse packages ------------------------ tidyverse 2.0.0 --
## v dplyr     1.0.8     v readr     2.1.4
## v forcats   1.0.0     v stringr   1.4.0
## v ggplot2   3.4.2     v tibble    3.1.6
## v lubridate 1.9.2     v tidyr     1.2.0
## v purrr     0.3.4     
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## i Use the [conflicted package](http://conflicted.r-lib.org/) to force all conflicts to become errors
library(ggfortify)
head(df)
## # A tibble: 6 x 21
##   id         date                  price bedrooms bathrooms sqft_living sqft_lot
##   <chr>      <dttm>                <dbl>    <dbl>     <dbl>       <dbl>    <dbl>
## 1 7129300520 2014-10-13 00:00:00  221900        3      1           1180     5650
## 2 6414100192 2014-12-09 00:00:00  538000        3      2.25        2570     7242
## 3 5631500400 2015-02-25 00:00:00  180000        2      1            770    10000
## 4 2487200875 2014-12-09 00:00:00  604000        4      3           1960     5000
## 5 1954400510 2015-02-18 00:00:00  510000        3      2           1680     8080
## 6 7237550310 2014-05-12 00:00:00 1225000        4      4.5         5420   101930
## # i 14 more variables: floors <dbl>, waterfront <dbl>, view <dbl>,
## #   condition <dbl>, grade <dbl>, sqft_above <dbl>, sqft_basement <dbl>,
## #   yr_built <dbl>, yr_renovated <dbl>, zipcode <dbl>, lat <dbl>, long <dbl>,
## #   sqft_living15 <dbl>, sqft_lot15 <dbl>

filtrando por zicode 98001, 98030,98122

df=df %>% filter(zipcode %in% c(98001, 98030,98122))
ggplot(df, aes(x=sqft_living, y=price, color=as.factor(zipcode)))+
    geom_point()+
    geom_smooth(method=lm, se=TRUE)
## 
## `geom_smooth()` using formula = 'y ~ x'

modelo de regresion price sobre sqft living

modelo=lm(price~sqft_living, data=df)
autoplot(modelo)

summary(modelo)
## 
## Call:
## lm(formula = price ~ sqft_living, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -336931 -133404  -83927  113826 1346557 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 103258.00   22232.84   4.644 3.92e-06 ***
## sqft_living    156.71      11.18  14.019  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 217500 on 906 degrees of freedom
## Multiple R-squared:  0.1783, Adjusted R-squared:  0.1774 
## F-statistic: 196.5 on 1 and 906 DF,  p-value: < 2.2e-16

modelo de regresion price sobre sqft_living + zipcode

modelo2=lm(price~sqft_living+zipcode, data=df)
summary(modelo2)
## 
## Call:
## lm(formula = price ~ sqft_living + zipcode, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -459584  -75180  -10187   56200 1058883 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -3.189e+08  8.573e+06  -37.19   <2e-16 ***
## sqft_living  1.805e+02  7.061e+00   25.57   <2e-16 ***
## zipcode      3.253e+03  8.742e+01   37.20   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 136800 on 905 degrees of freedom
## Multiple R-squared:  0.6751, Adjusted R-squared:  0.6744 
## F-statistic: 940.4 on 2 and 905 DF,  p-value: < 2.2e-16
autoplot(modelo2)

modelo price sobre sqft_living filtrando solo para zipcode 98001

df_98001= df %>% filter(zipcode==98001)


ggplot(df_98001, aes(x=sqft_living, y=price, color=as.factor(zipcode)))+
    geom_point()+
    geom_smooth(method=lm, se=TRUE)
## 
## `geom_smooth()` using formula = 'y ~ x'

modelo3=lm(price~sqft_living, data=df_98001)
summary(modelo3)
## 
## Call:
## lm(formula = price ~ sqft_living, data = df_98001)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -159825  -38217   -4387   26114  239049 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 46556.758   9255.509    5.03 7.75e-07 ***
## sqft_living   123.233      4.604   26.77  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 57320 on 360 degrees of freedom
## Multiple R-squared:  0.6656, Adjusted R-squared:  0.6646 
## F-statistic: 716.5 on 1 and 360 DF,  p-value: < 2.2e-16
autoplot(modelo3)

Se puede observar como Residual standard error ha disminuido respecto a los modelos anteriores

modelo mas complejo de price filtrando solo para zipcode 98001

modelo4=lm(price~sqft_living+bathrooms+sqft_lot+bedrooms+floors+grade+yr_renovated, data=df_98001)

summary(modelo4)
## 
## Call:
## lm(formula = price ~ sqft_living + bathrooms + sqft_lot + bedrooms + 
##     floors + grade + yr_renovated, data = df_98001)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -137751  -26456   -1337   26673  182126 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -1.712e+05  2.436e+04  -7.027 1.09e-11 ***
## sqft_living   7.719e+01  6.551e+00  11.783  < 2e-16 ***
## bathrooms     8.348e+03  6.047e+03   1.381   0.1683    
## sqft_lot      1.095e+00  1.323e-01   8.280 2.55e-15 ***
## bedrooms     -7.325e+03  4.083e+03  -1.794   0.0737 .  
## floors        8.253e+03  6.390e+03   1.292   0.1974    
## grade         3.899e+04  3.865e+03  10.088  < 2e-16 ***
## yr_renovated  1.702e+01  8.491e+00   2.004   0.0458 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 46600 on 354 degrees of freedom
## Multiple R-squared:  0.7826, Adjusted R-squared:  0.7783 
## F-statistic: 182.1 on 7 and 354 DF,  p-value: < 2.2e-16
autoplot(modelo4)

Comparación de los intervalos de predicción entre los 4 modelos

new_data=head(df,10)
new_data_98001=new_data %>% filter(zipcode==98001)
new_data_98001 %>% select(price)
## # A tibble: 4 x 1
##    price
##    <dbl>
## 1 240000
## 2 287000
## 3 480000
## 4 465000
predict(modelo, new_data_98001, interval="prediction")
##        fit        lwr       upr
## 1 294439.7 -132959.88  721839.2
## 2 454280.1   27054.28  881505.9
## 3 609419.3  181242.47 1037596.1
## 4 528558.9  101015.09  956102.6
predict(modelo2, new_data_98001, interval="prediction")
##        fit        lwr      upr
## 1 126403.3 -142619.33 395426.0
## 2 310530.9   41656.68 579405.2
## 3 489243.0  219802.81 758683.1
## 4 396096.1  127038.01 665154.2
predict(modelo3, new_data_98001, interval = "prediction")
##        fit       lwr      upr
## 1 196900.8  83854.52 309947.1
## 2 322598.3 209678.46 435518.2
## 3 444598.8 331081.06 558116.6
## 4 381010.7 267892.75 494128.6
predict(modelo4, new_data_98001, interval = "prediction")
##        fit       lwr      upr
## 1 192009.0  99332.69 284685.2
## 2 287761.4 195598.47 379924.3
## 3 454771.7 362220.18 547323.2
## 4 424201.3 331629.53 516773.0

Gráfico de los intervalos de confianza considerando la visualización de price y sqft

# Predicciones y intervalos de predicción para modelo3
predictions_modelo3 <- predict(modelo3, newdata = df_98001, interval = "prediction")
df_pred_modelo3 <- cbind(df_98001, predictions_modelo3)

# Predicciones y intervalos de predicción para modelo2
predictions_modelo2 <- predict(modelo2, newdata = df_98001, interval = "prediction")
df_pred_modelo2 <- cbind(df_98001, predictions_modelo2)

# Crear el gráfico de dispersión con intervalos de predicción de ambos modelos
library(ggplot2)

ggplot() +
  geom_point(data = df, aes(x = sqft_living, y = price, color=factor(zipcode))) +
  geom_ribbon(aes(x = sqft_living, ymin = lwr, ymax = upr, fill = "Modelo3"), 
              data = df_pred_modelo3, alpha = 0.3) +
  geom_ribbon(aes(x = sqft_living, ymin = lwr, ymax = upr, fill = "Modelo2"), 
              data = df_pred_modelo2, alpha = 0.3) +
  labs(title = "Gráfico de Dispersión con Intervalos de Predicción Modelo2 vs Modelo3",
       x = "sqft_living",
       y = "price") +
  scale_fill_manual(values = c("Modelo3" = "red", "Modelo2" = "blue"),
                    name = "Modelos",
                    breaks = c("Modelo3", "Modelo2"))+
  
  scale_color_manual(values = c("98001" = "black", "other" = "gray95"),
                     name = "Zipcode",
                     breaks = c("98001", "other"))

# Predicciones y intervalos de predicción para modelo3
predictions_modelo3 <- predict(modelo3, newdata = df_98001, interval = "prediction")
df_pred_modelo3 <- cbind(df_98001, predictions_modelo3)

# Predicciones y intervalos de predicción para modelo4
predictions_modelo4 <- predict(modelo4, newdata = df_98001, interval = "prediction")
df_pred_modelo4 <- cbind(df_98001, predictions_modelo4)

# Crear el gráfico de dispersión con intervalos de predicción de ambos modelos
library(ggplot2)

ggplot() +
  geom_point(data = df_pred_modelo3, aes(x = sqft_living, y = price)) +
  geom_ribbon(aes(x = sqft_living, ymin = lwr, ymax = upr, fill = "Modelo3"), 
              data = df_pred_modelo3, alpha = 0.3) +
  geom_ribbon(aes(x = sqft_living, ymin = lwr, ymax = upr, fill = "Modelo4"), 
              data = df_pred_modelo4, alpha = 0.3) +
  labs(title = "Gráfico de Dispersión con Intervalos de Predicción Modelo3 vs Modelo4",
       x = "sqft_living",
       y = "price") +
  scale_fill_manual(values = c("Modelo3" = "red", "Modelo4" = "blue"),
                    name = "Modelos",
                    breaks = c("Modelo3", "Modelo4"))