library(MASS)
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.6.3
## -- Attaching packages -------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.2.1 v purrr 0.3.3
## v tibble 2.1.3 v dplyr 0.8.4
## v tidyr 1.0.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## Warning: package 'forcats' was built under R version 3.6.3
## -- Conflicts ----------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x dplyr::select() masks MASS::select()
library(ISLR)
## Warning: package 'ISLR' was built under R version 3.6.3
library(ggplot2)
library(knitr)
# Load the Boston data set into the workspace and keep a working copy
# under the name used by the rest of the analysis.
data(Boston)
datos <- Boston
# Show the first rows as a formatted table.
datos %>%
  head() %>%
  kable()
| crim | zn | indus | chas | nox | rm | age | dis | rad | tax | ptratio | black | lstat | medv |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0.00632 | 18 | 2.31 | 0 | 0.538 | 6.575 | 65.2 | 4.0900 | 1 | 296 | 15.3 | 396.90 | 4.98 | 24.0 |
| 0.02731 | 0 | 7.07 | 0 | 0.469 | 6.421 | 78.9 | 4.9671 | 2 | 242 | 17.8 | 396.90 | 9.14 | 21.6 |
| 0.02729 | 0 | 7.07 | 0 | 0.469 | 7.185 | 61.1 | 4.9671 | 2 | 242 | 17.8 | 392.83 | 4.03 | 34.7 |
| 0.03237 | 0 | 2.18 | 0 | 0.458 | 6.998 | 45.8 | 6.0622 | 3 | 222 | 18.7 | 394.63 | 2.94 | 33.4 |
| 0.06905 | 0 | 2.18 | 0 | 0.458 | 7.147 | 54.2 | 6.0622 | 3 | 222 | 18.7 | 396.90 | 5.33 | 36.2 |
| 0.02985 | 0 | 2.18 | 0 | 0.458 | 6.430 | 58.7 | 6.0622 | 3 | 222 | 18.7 | 394.12 | 5.21 | 28.7 |
# Show the last rows as a formatted table.
datos %>%
  tail() %>%
  kable()
| | crim | zn | indus | chas | nox | rm | age | dis | rad | tax | ptratio | black | lstat | medv |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 501 | 0.22438 | 0 | 9.69 | 0 | 0.585 | 6.027 | 79.7 | 2.4982 | 6 | 391 | 19.2 | 396.90 | 14.33 | 16.8 |
| 502 | 0.06263 | 0 | 11.93 | 0 | 0.573 | 6.593 | 69.1 | 2.4786 | 1 | 273 | 21.0 | 391.99 | 9.67 | 22.4 |
| 503 | 0.04527 | 0 | 11.93 | 0 | 0.573 | 6.120 | 76.7 | 2.2875 | 1 | 273 | 21.0 | 396.90 | 9.08 | 20.6 |
| 504 | 0.06076 | 0 | 11.93 | 0 | 0.573 | 6.976 | 91.0 | 2.1675 | 1 | 273 | 21.0 | 396.90 | 5.64 | 23.9 |
| 505 | 0.10959 | 0 | 11.93 | 0 | 0.573 | 6.794 | 89.3 | 2.3889 | 1 | 273 | 21.0 | 393.45 | 6.48 | 22.0 |
| 506 | 0.04741 | 0 | 11.93 | 0 | 0.573 | 6.030 | 80.8 | 2.5050 | 1 | 273 | 21.0 | 396.90 | 7.88 | 11.9 |
# Compact overview of the data frame: dimensions and the type of each column.
str(object = datos)
## 'data.frame': 506 obs. of 14 variables:
## $ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ chas : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ rm : num 6.58 6.42 7.18 7 7.15 ...
## $ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ dis : num 4.09 4.97 4.97 6.06 6.06 ...
## $ rad : int 1 2 2 3 3 3 5 5 5 5 ...
## $ tax : num 296 242 242 222 222 222 311 311 311 311 ...
## $ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ black : num 397 397 393 395 397 ...
## $ lstat : num 4.98 9.14 4.03 2.94 5.33 ...
## $ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
# Per-column descriptive statistics (min, quartiles, mean, max).
summary(object = datos)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08204 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio black
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 0.32
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38
## Median : 5.000 Median :330.0 Median :19.05 Median :391.44
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :356.67
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :396.90
## lstat medv
## Min. : 1.73 Min. : 5.00
## 1st Qu.: 6.95 1st Qu.:17.02
## Median :11.36 Median :21.20
## Mean :12.65 Mean :22.53
## 3rd Qu.:16.95 3rd Qu.:25.00
## Max. :37.97 Max. :50.00
X = lstat: porcentaje de la población en condición de pobreza. Y = medv: valor mediano de las casas ocupadas por sus dueños, en miles de dólares ($1000s).
# Scatter plot of medv against lstat with a red smoothed trend line.
# The smoothing method is left to geom_smooth()'s default choice.
ggplot(datos, aes(lstat, medv)) +
  geom_point() +
  geom_smooth(colour = "red")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
−0.75 = Correlación negativa considerable.
−0.50 = Correlación negativa media.
# Correlation coefficient between lstat and medv (cor()'s default method).
with(datos, cor(lstat, medv))
## [1] -0.7376627
# Simple linear regression of medv on lstat.
modelo <- lm(medv ~ lstat, data = datos)
# Coefficient table, residual summary and goodness-of-fit statistics.
summary(modelo)
##
## Call:
## lm(formula = medv ~ lstat, data = datos)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.168 -3.990 -1.318 2.034 24.500
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 34.55384 0.56263 61.41 <2e-16 ***
## lstat -0.95005 0.03873 -24.53 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.216 on 504 degrees of freedom
## Multiple R-squared: 0.5441, Adjusted R-squared: 0.5432
## F-statistic: 601.6 on 1 and 504 DF, p-value: < 2.2e-16
# Observed data (points) with the fitted values of `modelo` drawn as a red line.
# The predictions are bound to the data as column `ajuste`, so every aesthetic
# refers to a column by name instead of reaching into `datos$...` inside aes().
ggplot(
  data = cbind(datos, ajuste = predict(modelo, newdata = datos)),
  mapping = aes(x = lstat, y = medv)
) +
  geom_point() +
  geom_line(aes(y = ajuste), color = "red")
# Quadratic fit: medv regressed on a degree-2 polynomial basis of lstat
# built with poly().
modelo.pol2 <- lm(medv ~ poly(lstat, 2), data = datos)
# Coefficient table, residual summary and goodness-of-fit statistics.
summary(modelo.pol2)
##
## Call:
## lm(formula = medv ~ poly(lstat, 2), data = datos)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.2834 -3.8313 -0.5295 2.3095 25.4148
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22.5328 0.2456 91.76 <2e-16 ***
## poly(lstat, 2)1 -152.4595 5.5237 -27.60 <2e-16 ***
## poly(lstat, 2)2 64.2272 5.5237 11.63 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.524 on 503 degrees of freedom
## Multiple R-squared: 0.6407, Adjusted R-squared: 0.6393
## F-statistic: 448.5 on 2 and 503 DF, p-value: < 2.2e-16
# Base-graphics view of the quadratic fit: observed values in grey, fitted
# values of `modelo.pol2` in red.
# Columns are resolved through with() rather than attach(), so the search path
# is left untouched and no masking occurs; the figure is drawn once.
with(datos, {
  plot(x = lstat, y = medv, main = "medv vs lstat", pch = 20, col = "grey30")
  points(lstat, fitted(modelo.pol2), col = "red", pch = 20)
})
# Observed data (points) with the fitted values of `modelo.pol2` drawn as a
# red line. The predictions are bound to the data as column `ajuste`, so every
# aesthetic refers to a column by name instead of using `datos$...` inside aes().
ggplot(
  data = cbind(datos, ajuste = predict(modelo.pol2, newdata = datos)),
  mapping = aes(x = lstat, y = medv)
) +
  geom_point() +
  geom_line(aes(y = ajuste), color = "red")
#### A la hora de comparar dos modelos se pueden evaluar sus R². En este caso, el modelo cuadrático es capaz de explicar un 64% de la variabilidad, frente al 54% del modelo lineal.
# Degree-5 fit: medv regressed on a degree-5 polynomial basis of lstat
# built with poly().
modelo.pol5 <- lm(medv ~ poly(lstat, 5), data = datos)
# Coefficient table, residual summary and goodness-of-fit statistics.
summary(modelo.pol5)
##
## Call:
## lm(formula = medv ~ poly(lstat, 5), data = datos)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.5433 -3.1039 -0.7052 2.0844 27.1153
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22.5328 0.2318 97.197 < 2e-16 ***
## poly(lstat, 5)1 -152.4595 5.2148 -29.236 < 2e-16 ***
## poly(lstat, 5)2 64.2272 5.2148 12.316 < 2e-16 ***
## poly(lstat, 5)3 -27.0511 5.2148 -5.187 3.10e-07 ***
## poly(lstat, 5)4 25.4517 5.2148 4.881 1.42e-06 ***
## poly(lstat, 5)5 -19.2524 5.2148 -3.692 0.000247 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.215 on 500 degrees of freedom
## Multiple R-squared: 0.6817, Adjusted R-squared: 0.6785
## F-statistic: 214.2 on 5 and 500 DF, p-value: < 2.2e-16
# Observed data (points) with the fitted values of `modelo.pol5` drawn as a
# red line. The predictions are bound to the data as column `ajuste`, so every
# aesthetic refers to a column by name instead of using `datos$...` inside aes().
ggplot(
  data = cbind(datos, ajuste = predict(modelo.pol5, newdata = datos)),
  mapping = aes(x = lstat, y = medv)
) +
  geom_point() +
  geom_line(aes(y = ajuste), color = "red")
# Degree-10 fit: medv regressed on a degree-10 polynomial basis of lstat
# built with poly().
modelo.pol10 <- lm(medv ~ poly(lstat, 10), data = datos)
# Coefficient table, residual summary and goodness-of-fit statistics.
summary(modelo.pol10)
##
## Call:
## lm(formula = medv ~ poly(lstat, 10), data = datos)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.5340 -3.0286 -0.7507 2.0437 26.4738
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22.5328 0.2311 97.488 < 2e-16 ***
## poly(lstat, 10)1 -152.4595 5.1993 -29.323 < 2e-16 ***
## poly(lstat, 10)2 64.2272 5.1993 12.353 < 2e-16 ***
## poly(lstat, 10)3 -27.0511 5.1993 -5.203 2.88e-07 ***
## poly(lstat, 10)4 25.4517 5.1993 4.895 1.33e-06 ***
## poly(lstat, 10)5 -19.2524 5.1993 -3.703 0.000237 ***
## poly(lstat, 10)6 6.5088 5.1993 1.252 0.211211
## poly(lstat, 10)7 1.9416 5.1993 0.373 0.708977
## poly(lstat, 10)8 -6.7299 5.1993 -1.294 0.196133
## poly(lstat, 10)9 8.4168 5.1993 1.619 0.106116
## poly(lstat, 10)10 -7.3351 5.1993 -1.411 0.158930
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.199 on 495 degrees of freedom
## Multiple R-squared: 0.6867, Adjusted R-squared: 0.6804
## F-statistic: 108.5 on 10 and 495 DF, p-value: < 2.2e-16
# Observed data (points) with the fitted values of `modelo.pol10` drawn as a
# red line. The predictions are bound to the data as column `ajuste`, so every
# aesthetic refers to a column by name instead of using `datos$...` inside aes().
ggplot(
  data = cbind(datos, ajuste = predict(modelo.pol10, newdata = datos)),
  mapping = aes(x = lstat, y = medv)
) +
  geom_point() +
  geom_line(aes(y = ajuste), color = "red")