library(tidyverse)
## Warning: package 'tibble' was built under R version 4.0.5
## Warning: package 'tidyr' was built under R version 4.0.5
## Warning: package 'purrr' was built under R version 4.0.5
## Warning: package 'dplyr' was built under R version 4.0.5
## Warning: package 'stringr' was built under R version 4.0.5
## -- Attaching core tidyverse packages ------------------------ tidyverse 2.0.0 --
## v dplyr 1.0.8 v readr 2.1.4
## v forcats 1.0.0 v stringr 1.4.0
## v ggplot2 3.4.2 v tibble 3.1.6
## v lubridate 1.9.2 v tidyr 1.2.0
## v purrr 0.3.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## i Use the [conflicted package](http://conflicted.r-lib.org/) to force all conflicts to become errors
library(ggfortify)
head(df)
## # A tibble: 6 x 21
## id date price bedrooms bathrooms sqft_living sqft_lot
## <chr> <dttm> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 7129300520 2014-10-13 00:00:00 221900 3 1 1180 5650
## 2 6414100192 2014-12-09 00:00:00 538000 3 2.25 2570 7242
## 3 5631500400 2015-02-25 00:00:00 180000 2 1 770 10000
## 4 2487200875 2014-12-09 00:00:00 604000 4 3 1960 5000
## 5 1954400510 2015-02-18 00:00:00 510000 3 2 1680 8080
## 6 7237550310 2014-05-12 00:00:00 1225000 4 4.5 5420 101930
## # i 14 more variables: floors <dbl>, waterfront <dbl>, view <dbl>,
## # condition <dbl>, grade <dbl>, sqft_above <dbl>, sqft_basement <dbl>,
## # yr_built <dbl>, yr_renovated <dbl>, zipcode <dbl>, lat <dbl>, long <dbl>,
## # sqft_living15 <dbl>, sqft_lot15 <dbl>
df=df %>% filter(zipcode %in% c(98001, 98030,98122))
ggplot(df, aes(x=sqft_living, y=price, color=as.factor(zipcode)))+
geom_point()+
geom_smooth(method=lm, se=TRUE)
##
## `geom_smooth()` using formula = 'y ~ x'
modelo=lm(price~sqft_living, data=df)
autoplot(modelo)
summary(modelo)
##
## Call:
## lm(formula = price ~ sqft_living, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -336931 -133404 -83927 113826 1346557
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 103258.00 22232.84 4.644 3.92e-06 ***
## sqft_living 156.71 11.18 14.019 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 217500 on 906 degrees of freedom
## Multiple R-squared: 0.1783, Adjusted R-squared: 0.1774
## F-statistic: 196.5 on 1 and 906 DF, p-value: < 2.2e-16
modelo2=lm(price~sqft_living+zipcode, data=df)
summary(modelo2)
##
## Call:
## lm(formula = price ~ sqft_living + zipcode, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -459584 -75180 -10187 56200 1058883
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.189e+08 8.573e+06 -37.19 <2e-16 ***
## sqft_living 1.805e+02 7.061e+00 25.57 <2e-16 ***
## zipcode 3.253e+03 8.742e+01 37.20 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 136800 on 905 degrees of freedom
## Multiple R-squared: 0.6751, Adjusted R-squared: 0.6744
## F-statistic: 940.4 on 2 and 905 DF, p-value: < 2.2e-16
autoplot(modelo2)
df_98001= df %>% filter(zipcode==98001)
ggplot(df_98001, aes(x=sqft_living, y=price, color=as.factor(zipcode)))+
geom_point()+
geom_smooth(method=lm, se=TRUE)
##
## `geom_smooth()` using formula = 'y ~ x'
modelo3=lm(price~sqft_living, data=df_98001)
summary(modelo3)
##
## Call:
## lm(formula = price ~ sqft_living, data = df_98001)
##
## Residuals:
## Min 1Q Median 3Q Max
## -159825 -38217 -4387 26114 239049
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 46556.758 9255.509 5.03 7.75e-07 ***
## sqft_living 123.233 4.604 26.77 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 57320 on 360 degrees of freedom
## Multiple R-squared: 0.6656, Adjusted R-squared: 0.6646
## F-statistic: 716.5 on 1 and 360 DF, p-value: < 2.2e-16
autoplot(modelo3)
Se puede observar como Residual standard error ha disminuido respecto a los modelos anteriores
modelo4=lm(price~sqft_living+bathrooms+sqft_lot+bedrooms+floors+grade+yr_renovated, data=df_98001)
summary(modelo4)
##
## Call:
## lm(formula = price ~ sqft_living + bathrooms + sqft_lot + bedrooms +
## floors + grade + yr_renovated, data = df_98001)
##
## Residuals:
## Min 1Q Median 3Q Max
## -137751 -26456 -1337 26673 182126
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.712e+05 2.436e+04 -7.027 1.09e-11 ***
## sqft_living 7.719e+01 6.551e+00 11.783 < 2e-16 ***
## bathrooms 8.348e+03 6.047e+03 1.381 0.1683
## sqft_lot 1.095e+00 1.323e-01 8.280 2.55e-15 ***
## bedrooms -7.325e+03 4.083e+03 -1.794 0.0737 .
## floors 8.253e+03 6.390e+03 1.292 0.1974
## grade 3.899e+04 3.865e+03 10.088 < 2e-16 ***
## yr_renovated 1.702e+01 8.491e+00 2.004 0.0458 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 46600 on 354 degrees of freedom
## Multiple R-squared: 0.7826, Adjusted R-squared: 0.7783
## F-statistic: 182.1 on 7 and 354 DF, p-value: < 2.2e-16
autoplot(modelo4)
new_data=head(df,10)
new_data_98001=new_data %>% filter(zipcode==98001)
new_data_98001 %>% select(price)
## # A tibble: 4 x 1
## price
## <dbl>
## 1 240000
## 2 287000
## 3 480000
## 4 465000
predict(modelo, new_data_98001, interval="prediction")
## fit lwr upr
## 1 294439.7 -132959.88 721839.2
## 2 454280.1 27054.28 881505.9
## 3 609419.3 181242.47 1037596.1
## 4 528558.9 101015.09 956102.6
predict(modelo2, new_data_98001, interval="prediction")
## fit lwr upr
## 1 126403.3 -142619.33 395426.0
## 2 310530.9 41656.68 579405.2
## 3 489243.0 219802.81 758683.1
## 4 396096.1 127038.01 665154.2
predict(modelo3, new_data_98001, interval = "prediction")
## fit lwr upr
## 1 196900.8 83854.52 309947.1
## 2 322598.3 209678.46 435518.2
## 3 444598.8 331081.06 558116.6
## 4 381010.7 267892.75 494128.6
predict(modelo4, new_data_98001, interval = "prediction")
## fit lwr upr
## 1 192009.0 99332.69 284685.2
## 2 287761.4 195598.47 379924.3
## 3 454771.7 362220.18 547323.2
## 4 424201.3 331629.53 516773.0
# Predicciones y intervalos de predicción para modelo3
predictions_modelo3 <- predict(modelo3, newdata = df_98001, interval = "prediction")
df_pred_modelo3 <- cbind(df_98001, predictions_modelo3)
# Predicciones y intervalos de predicción para modelo2
predictions_modelo2 <- predict(modelo2, newdata = df_98001, interval = "prediction")
df_pred_modelo2 <- cbind(df_98001, predictions_modelo2)
# Crear el gráfico de dispersión con intervalos de predicción de ambos modelos
library(ggplot2)
ggplot() +
geom_point(data = df, aes(x = sqft_living, y = price, color=factor(zipcode))) +
geom_ribbon(aes(x = sqft_living, ymin = lwr, ymax = upr, fill = "Modelo3"),
data = df_pred_modelo3, alpha = 0.3) +
geom_ribbon(aes(x = sqft_living, ymin = lwr, ymax = upr, fill = "Modelo2"),
data = df_pred_modelo2, alpha = 0.3) +
labs(title = "Gráfico de Dispersión con Intervalos de Predicción Modelo2 vs Modelo3",
x = "sqft_living",
y = "price") +
scale_fill_manual(values = c("Modelo3" = "red", "Modelo2" = "blue"),
name = "Modelos",
breaks = c("Modelo3", "Modelo2"))+
scale_color_manual(values = c("98001" = "black", "other" = "gray95"),
name = "Zipcode",
breaks = c("98001", "other"))
# Predicciones y intervalos de predicción para modelo3
predictions_modelo3 <- predict(modelo3, newdata = df_98001, interval = "prediction")
df_pred_modelo3 <- cbind(df_98001, predictions_modelo3)
# Predicciones y intervalos de predicción para modelo4
predictions_modelo4 <- predict(modelo4, newdata = df_98001, interval = "prediction")
df_pred_modelo4 <- cbind(df_98001, predictions_modelo4)
# Crear el gráfico de dispersión con intervalos de predicción de ambos modelos
library(ggplot2)
ggplot() +
geom_point(data = df_pred_modelo3, aes(x = sqft_living, y = price)) +
geom_ribbon(aes(x = sqft_living, ymin = lwr, ymax = upr, fill = "Modelo3"),
data = df_pred_modelo3, alpha = 0.3) +
geom_ribbon(aes(x = sqft_living, ymin = lwr, ymax = upr, fill = "Modelo4"),
data = df_pred_modelo4, alpha = 0.3) +
labs(title = "Gráfico de Dispersión con Intervalos de Predicción Modelo3 vs Modelo4",
x = "sqft_living",
y = "price") +
scale_fill_manual(values = c("Modelo3" = "red", "Modelo4" = "blue"),
name = "Modelos",
breaks = c("Modelo3", "Modelo4"))