# cargando las librerias
library(tidyverse)
## Warning: package 'tibble' was built under R version 4.0.5
## Warning: package 'tidyr' was built under R version 4.0.5
## Warning: package 'purrr' was built under R version 4.0.5
## Warning: package 'dplyr' was built under R version 4.0.5
## Warning: package 'stringr' was built under R version 4.0.5
## -- Attaching core tidyverse packages ------------------------ tidyverse 2.0.0 --
## v dplyr 1.0.8 v readr 2.1.4
## v forcats 1.0.0 v stringr 1.4.0
## v ggplot2 3.4.2 v tibble 3.1.6
## v lubridate 1.9.2 v tidyr 1.2.0
## v purrr 0.3.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## i Use the [conflicted package](http://conflicted.r-lib.org/) to force all conflicts to become errors
library(lmtest)
## Warning: package 'lmtest' was built under R version 4.0.5
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.0.5
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(ggfortify)
head(df)
## # A tibble: 6 x 21
## id date price bedrooms bathrooms sqft_living sqft_lot
## <chr> <dttm> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 7129300520 2014-10-13 00:00:00 221900 3 1 1180 5650
## 2 6414100192 2014-12-09 00:00:00 538000 3 2.25 2570 7242
## 3 5631500400 2015-02-25 00:00:00 180000 2 1 770 10000
## 4 2487200875 2014-12-09 00:00:00 604000 4 3 1960 5000
## 5 1954400510 2015-02-18 00:00:00 510000 3 2 1680 8080
## 6 7237550310 2014-05-12 00:00:00 1225000 4 4.5 5420 101930
## # i 14 more variables: floors <dbl>, waterfront <dbl>, view <dbl>,
## # condition <dbl>, grade <dbl>, sqft_above <dbl>, sqft_basement <dbl>,
## # yr_built <dbl>, yr_renovated <dbl>, zipcode <dbl>, lat <dbl>, long <dbl>,
## # sqft_living15 <dbl>, sqft_lot15 <dbl>
df=df %>% filter(zipcode=="98178") %>%
select(-c(id, zipcode, date))
ggplot(df, aes(x = price)) +
geom_histogram() +
labs(title = "Histograma de price")
##
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(df, aes(x = bathrooms)) +
geom_bar() +
labs(title = "Gráfico de barras de bathrooms")
ggplot(df, aes(x = bedrooms)) +
geom_bar() +
labs(title = "Gráfico de barras de bedrooms")
ggplot(df, aes(x = sqft_living)) +
geom_histogram() +
labs(title = "Histograma de sqft_living")
##
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(df, aes(x = sqft_lot)) +
geom_histogram() +
labs(title = "Histograma de sqft_lot")
##
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(df, aes(x = floors)) +
geom_bar() +
labs(title = "Histograma de floors")
ggplot(df, aes(x = waterfront)) +
geom_bar() +
labs(title = "Histograma de waterfront")
ggplot(df, aes(x = view)) +
geom_bar() +
labs(title = "Histograma de view",
x = "Price",
y = "Frecuencia")
ggplot(df, aes(x = sqft_living15)) +
geom_histogram() +
labs(title = "Histograma de sqft_living15",
x = "Price",
y = "Frecuencia")
##
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(df, aes(x = sqft_lot15)) +
geom_histogram() +
labs(title = "Histograma de sqft_lot15")
##
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(df, aes(x = yr_built)) +
geom_histogram() +
labs(title = "Histograma de yr_built")
##
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
numerical_variables= df %>% select(price, sqft_living, sqft_lot, sqft_above, sqft_basement, sqft_living15, sqft_living15, sqft_lot15)
autoplot(cor(numerical_variables))
##
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
modelo1=lm(price~sqft_living,data= df)
summary(modelo1)
##
## Call:
## lm(formula = price ~ sqft_living, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -174111 -61000 -12016 27040 1190976
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 75701.18 21186.09 3.573 0.00042 ***
## sqft_living 135.84 11.34 11.975 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 129500 on 260 degrees of freedom
## Multiple R-squared: 0.3555, Adjusted R-squared: 0.353
## F-statistic: 143.4 on 1 and 260 DF, p-value: < 2.2e-16
autoplot(modelo1)
shapiro.test(modelo1$residuals)
##
## Shapiro-Wilk normality test
##
## data: modelo1$residuals
## W = 0.63742, p-value < 2.2e-16
bptest(modelo1)
##
## studentized Breusch-Pagan test
##
## data: modelo1
## BP = 7.8787, df = 1, p-value = 0.005002
intervalos_prediccion = predict(modelo1, interval = "prediction")
## Warning in predict.lm(modelo1, interval = "prediction"): predictions on current data refer to _future_ responses
df_1 <- cbind(df,intervalos_prediccion)
# Tu código existente
ggplot(data = df_1, aes(x = sqft_living, y = price)) +
geom_point() +
geom_smooth(method = 'lm', se = TRUE) +
geom_line(aes(y = lwr), linetype = "dashed") +
geom_line(aes(y = upr), linetype = "dashed")
##
## `geom_smooth()` using formula = 'y ~ x'
confint(modelo1)
## 2.5 % 97.5 %
## (Intercept) 33983.0176 117419.3449
## sqft_living 113.5017 158.1743
sigma(modelo1)
## [1] 129537.5
df_filtrado=df[c(1,10,30,50,100,25,228),]
df_filtrado
## # A tibble: 7 x 18
## price bedrooms bathrooms sqft_living sqft_lot floors waterfront view
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 221900 3 1 1180 5650 1 0 0
## 2 290900 2 2 1610 17600 2 0 0
## 3 240000 4 1.5 1920 7973 1 0 0
## 4 100000 2 1 790 6426 1 0 0
## 5 210000 4 1.75 2090 6485 1 0 0
## 6 1700000 4 3.75 3190 17186 2 1 4
## 7 90000 2 1 580 7500 1 0 0
## # i 10 more variables: condition <dbl>, grade <dbl>, sqft_above <dbl>,
## # sqft_basement <dbl>, yr_built <dbl>, yr_renovated <dbl>, lat <dbl>,
## # long <dbl>, sqft_living15 <dbl>, sqft_lot15 <dbl>
new_data=data.frame(sqft_living=df_filtrado$sqft_living)
predicciones=predict(modelo1, newdata = new_data, interval= "confidence")
cbind(df_filtrado$price, predicciones)
## fit lwr upr
## 1 221900 235990.0 216017.5 255962.5
## 2 290900 294400.3 278417.8 310382.9
## 3 240000 336510.1 320186.2 352834.0
## 4 100000 183013.2 156772.7 209253.7
## 5 210000 359602.6 341904.3 377300.8
## 6 1700000 509024.4 472792.3 545256.4
## 7 90000 154487.2 124364.1 184610.3
new_data=data.frame(sqft_living=df_filtrado$sqft_living)
predicciones=predict(modelo1, newdata = new_data, interval= "prediction")
cbind(df_filtrado$price, predicciones)
## fit lwr upr
## 1 221900 235990.0 -19866.86 491846.9
## 2 290900 294400.3 38823.97 549976.7
## 3 240000 336510.1 80912.18 592108.0
## 4 100000 183013.2 -73409.12 439435.5
## 5 210000 359602.6 103913.18 615292.0
## 6 1700000 509024.4 251387.80 766660.9
## 7 90000 154487.2 -102361.45 411335.9
autoplot(modelo1, which=4)
df_sin_atipicos= subset(df, !(row.names(df) == "25"))
modelo2=lm(price~sqft_living, data=df_sin_atipicos)
summary(modelo2)
##
## Call:
## lm(formula = price ~ sqft_living, data = df_sin_atipicos)
##
## Residuals:
## Min 1Q Median 3Q Max
## -159831 -53215 -9442 28230 615500
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 94605.159 17431.362 5.427 1.31e-07 ***
## sqft_living 122.224 9.367 13.048 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 106100 on 259 degrees of freedom
## Multiple R-squared: 0.3966, Adjusted R-squared: 0.3943
## F-statistic: 170.2 on 1 and 259 DF, p-value: < 2.2e-16
autoplot(modelo2)
shapiro.test(modelo2$residuals)
##
## Shapiro-Wilk normality test
##
## data: modelo2$residuals
## W = 0.73529, p-value < 2.2e-16
bptest(modelo2)
##
## studentized Breusch-Pagan test
##
## data: modelo2
## BP = 4.5122, df = 1, p-value = 0.03365
intervalos_prediccion = predict(modelo2, interval = "prediction")
## Warning in predict.lm(modelo2, interval = "prediction"): predictions on current data refer to _future_ responses
df_sin_atipicos <- cbind(df_sin_atipicos,intervalos_prediccion)
# Tu código existente
ggplot(data = df_sin_atipicos, aes(x = sqft_living, y = price)) +
geom_point() +
geom_smooth(method = 'lm', se = TRUE) +
geom_line(aes(y = lwr), linetype = "dashed") +
geom_line(aes(y = upr), linetype = "dashed")
##
## `geom_smooth()` using formula = 'y ~ x'
autoplot(modelo2, which = 4)
df_sin_atipicos2= subset(df_sin_atipicos, !(row.names(df_sin_atipicos) == "68"))
modelo3= lm(price~sqft_living, data=df_sin_atipicos2)
summary(modelo3)
##
## Call:
## lm(formula = price ~ sqft_living, data = df_sin_atipicos2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -155511 -51763 -9919 28324 588123
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.046e+05 1.634e+04 6.399 7.31e-10 ***
## sqft_living 1.150e+02 8.815e+00 13.050 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 98990 on 258 degrees of freedom
## Multiple R-squared: 0.3976, Adjusted R-squared: 0.3953
## F-statistic: 170.3 on 1 and 258 DF, p-value: < 2.2e-16
autoplot(modelo3)
shapiro.test(modelo3$residuals)
##
## Shapiro-Wilk normality test
##
## data: modelo3$residuals
## W = 0.75762, p-value < 2.2e-16
bptest(modelo3)
##
## studentized Breusch-Pagan test
##
## data: modelo3
## BP = 1.4389, df = 1, p-value = 0.2303
df_sin_atipicos2=df_sin_atipicos2 %>% select(-c(fit, lwr, upr))
dim(df_sin_atipicos2)
## [1] 260 18
dim(intervalos_prediccion)
## [1] 261 3
intervalos_prediccion = predict(modelo3, interval = "prediction")
## Warning in predict.lm(modelo3, interval = "prediction"): predictions on current data refer to _future_ responses
df_sin_atipicos2 <- cbind(df_sin_atipicos2,intervalos_prediccion)
# Tu código existente
ggplot(data = df_sin_atipicos2, aes(x = sqft_living, y = price)) +
geom_point() +
geom_smooth(method = 'lm', se = TRUE) +
geom_line(aes(y = lwr), linetype = "dashed") +
geom_line(aes(y = upr), linetype = "dashed")
##
## `geom_smooth()` using formula = 'y ~ x'
autoplot(modelo3, which = 4)
predicciones=predict(modelo3, newdata = new_data, interval= "confidence")
cbind(df_filtrado$price, predicciones)
## fit lwr upr
## 1 221900 240327.9 225050.0 255605.8
## 2 290900 289795.2 277560.8 302029.6
## 3 240000 325457.7 312870.4 338045.0
## 4 100000 195462.1 175319.5 215604.7
## 5 210000 345014.6 331309.4 358719.8
## 6 1700000 471559.0 443292.9 499825.1
## 7 90000 171303.6 148141.7 194465.5
predicciones=predict(modelo3, newdata = new_data, interval= "prediction")
cbind(df_filtrado$price, predicciones)
## fit lwr upr
## 1 221900 240327.9 44790.9213 435864.8
## 2 290900 289795.2 94472.5143 485117.9
## 3 240000 325457.7 130112.6119 520802.9
## 4 100000 195462.1 -514.9383 391439.1
## 5 210000 345014.6 149594.2607 540434.9
## 6 1700000 471559.0 274581.2314 668536.8
## 7 90000 171303.6 -25006.7189 367614.0
# od el modelo lineal inicial
modelo_inicial <- lm(price ~ ., data = df)
intercept_only <- lm(price ~ 1, data=df)
all <- lm(price ~ ., data=df)
forward <- step(intercept_only, direction='forward', scope=formula(all), trace=0, k=3.84,criteria= "AIC")
formula(forward)
## price ~ view + waterfront + sqft_living + grade + sqft_above +
## bathrooms + condition
backward <- step(all, direction='backward', scope=formula(all), trace=0, k=3.84,criteria= "AIC")
formula(backward)
## price ~ bathrooms + sqft_living + waterfront + view + condition +
## grade + sqft_above
both <- step(intercept_only, direction='both', scope=formula(all), trace=0, k=3.84,criteria= "AIC")
formula(both)
## price ~ view + waterfront + sqft_living + grade + sqft_above +
## bathrooms + condition
# Realizar la selección de variables paso a paso con AIC
modelo_seleccionado <- both
summary(modelo_seleccionado)
##
## Call:
## lm(formula = price ~ view + waterfront + sqft_living + grade +
## sqft_above + bathrooms + condition, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -203698 -33413 -4616 32805 546391
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -179330.58 52022.79 -3.447 0.000663 ***
## view 34262.31 5152.63 6.649 1.79e-10 ***
## waterfront 452127.51 28071.11 16.107 < 2e-16 ***
## sqft_living 32.04 11.41 2.807 0.005391 **
## grade 39312.52 8222.29 4.781 2.95e-06 ***
## sqft_above 42.87 12.74 3.364 0.000888 ***
## bathrooms 19744.01 8990.44 2.196 0.028988 *
## condition 12552.54 6444.26 1.948 0.052533 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 65350 on 254 degrees of freedom
## Multiple R-squared: 0.8397, Adjusted R-squared: 0.8353
## F-statistic: 190.1 on 7 and 254 DF, p-value: < 2.2e-16
autoplot(modelo_seleccionado)
shapiro.test(modelo_seleccionado$residuals)
##
## Shapiro-Wilk normality test
##
## data: modelo_seleccionado$residuals
## W = 0.86209, p-value = 1.44e-14
bptest(modelo_seleccionado)
##
## studentized Breusch-Pagan test
##
## data: modelo_seleccionado
## BP = 68.734, df = 7, p-value = 2.662e-12
df_filtrado=df[c(1,10,30,50,100,25,228),]
df_filtrado
## # A tibble: 7 x 18
## price bedrooms bathrooms sqft_living sqft_lot floors waterfront view
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 221900 3 1 1180 5650 1 0 0
## 2 290900 2 2 1610 17600 2 0 0
## 3 240000 4 1.5 1920 7973 1 0 0
## 4 100000 2 1 790 6426 1 0 0
## 5 210000 4 1.75 2090 6485 1 0 0
## 6 1700000 4 3.75 3190 17186 2 1 4
## 7 90000 2 1 580 7500 1 0 0
## # i 10 more variables: condition <dbl>, grade <dbl>, sqft_above <dbl>,
## # sqft_basement <dbl>, yr_built <dbl>, yr_renovated <dbl>, lat <dbl>,
## # long <dbl>, sqft_living15 <dbl>, sqft_lot15 <dbl>
new_data=data.frame(bathrooms= df_filtrado$bathrooms,sqft_living=df_filtrado$sqft_living,waterfront= df_filtrado$waterfront, view= df_filtrado$view,condition= df_filtrado$condition, grade= df_filtrado$grade, sqft_above= df_filtrado$sqft_above)
predicciones=predict(modelo_seleccionado, newdata = new_data, interval= "confidence")
cbind(price=df_filtrado$price,predicciones)
## price fit lwr upr
## 1 221900 241643.8 225132.99 258154.7
## 2 290900 254283.5 235053.37 273513.6
## 3 240000 346256.3 321661.15 370851.5
## 4 100000 173119.3 159019.72 187218.8
## 5 210000 289891.8 275933.88 303849.7
## 6 1700000 1153608.5 1095691.19 1211525.9
## 7 90000 118077.2 95076.08 141078.3
predicciones=predict(modelo_seleccionado, newdata = new_data, interval= "prediction")
cbind(price=df_filtrado$price,predicciones)
## price fit lwr upr
## 1 221900 241643.8 111889.55 371398.1
## 2 290900 254283.5 124155.20 384411.7
## 3 240000 346256.3 215227.74 477284.9
## 4 100000 173119.3 43649.72 302588.8
## 5 210000 289891.8 160437.59 419346.0
## 6 1700000 1153608.5 1012477.44 1294739.6
## 7 90000 118077.2 -12661.55 248815.9
autoplot(modelo_seleccionado, which=4)
df= subset(df, !(row.names(df) == "25"))
modelo_seleccionado2= lm(formula = price ~ view + waterfront + sqft_living + grade +
sqft_above + bathrooms + condition, data = df)
autoplot(modelo_seleccionado2)
shapiro.test(modelo_seleccionado2$residuals)
##
## Shapiro-Wilk normality test
##
## data: modelo_seleccionado2$residuals
## W = 0.98259, p-value = 0.002808
bptest(modelo_seleccionado2)
##
## studentized Breusch-Pagan test
##
## data: modelo_seleccionado2
## BP = 39.433, df = 7, p-value = 1.616e-06
autoplot(modelo_seleccionado2, which = 4)
df= subset(df, !(row.names(df) == "68"))
modelo_seleccionado3= lm(price ~ view + waterfront + sqft_living + grade +
sqft_above + bathrooms + condition, data = df)
summary(modelo_seleccionado3)
##
## Call:
## lm(formula = price ~ view + waterfront + sqft_living + grade +
## sqft_above + bathrooms + condition, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -121865 -32499 -2335 32676 207970
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -65696.068 42496.700 -1.546 0.123381
## view 37213.943 4111.922 9.050 < 2e-16 ***
## waterfront 349237.425 24498.609 14.255 < 2e-16 ***
## sqft_living 53.282 9.258 5.755 2.51e-08 ***
## grade 23105.355 6693.149 3.452 0.000652 ***
## sqft_above 20.300 10.340 1.963 0.050715 .
## bathrooms 12771.285 7194.592 1.775 0.077085 .
## condition 12745.708 5134.596 2.482 0.013705 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 52060 on 252 degrees of freedom
## Multiple R-squared: 0.8373, Adjusted R-squared: 0.8328
## F-statistic: 185.2 on 7 and 252 DF, p-value: < 2.2e-16
autoplot(modelo_seleccionado3)
shapiro.test(modelo_seleccionado3$residuals)
##
## Shapiro-Wilk normality test
##
## data: modelo_seleccionado3$residuals
## W = 0.98103, p-value = 0.001533
bptest(modelo_seleccionado3)
##
## studentized Breusch-Pagan test
##
## data: modelo_seleccionado3
## BP = 27.892, df = 7, p-value = 0.00023
new_data=data.frame(bathrooms= df_filtrado$bathrooms,sqft_living=df_filtrado$sqft_living,waterfront= df_filtrado$waterfront, view= df_filtrado$view,condition= df_filtrado$condition, grade= df_filtrado$grade, sqft_above= df_filtrado$sqft_above)
predicciones=predict(modelo_seleccionado3, newdata = new_data, interval= "confidence")
cbind(price=df_filtrado$price,predicciones)
## price fit lwr upr
## 1 221900 233877.2 220640.8 247113.6
## 2 290900 255183.6 239814.5 270552.7
## 3 240000 317819.2 297692.6 337945.8
## 4 100000 182074.7 170732.9 193416.5
## 5 210000 293972.5 282833.3 305111.7
## 6 1700000 984308.3 929292.0 1039324.7
## 7 90000 143517.0 124735.9 162298.0
predicciones=predict(modelo_seleccionado3, newdata = new_data, interval= "prediction")
cbind(price=df_filtrado$price,predicciones)
## price fit lwr upr
## 1 221900 233877.2 130498.56 337255.8
## 2 290900 255183.6 151510.33 358856.9
## 3 240000 317819.2 213334.71 422303.8
## 4 100000 182074.7 78921.49 285227.8
## 5 210000 293972.5 190841.44 397103.6
## 6 1700000 984308.3 867952.26 1100664.4
## 7 90000 143517.0 39283.26 247750.7
autoplot(modelo_seleccionado3, which = 4)
`