# Load the housing data set with read.table() defaults (whitespace-separated,
# no header row expected), then assign the 14 column names.
# NOTE(review): the absolute path is machine-specific; consider a path
# relative to the project directory.
housing <- read.table(
  "C:/Users/christian.figueroa/Desktop/Actividad 2 regresion/housing.csv",
  quote = "\"",
  comment.char = ""
)
# A previous names() assignment written with typographic quotes was removed:
# R cannot parse curly quotes as string delimiters, and its result was
# overwritten by this assignment anyway.
names(housing) <- c(
  "CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE", "DIS", "RAD", "TAX",
  "PTRATIO", "B", "LSTAT", "MEDV"
)
# Print per-column summary statistics (min, quartiles, mean, max) for housing.
summary(housing)
## CRIM ZN INDUS CHAS
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08205 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## NOX RM AGE DIS
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## RAD TAX PTRATIO B
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 0.32
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38
## Median : 5.000 Median :330.0 Median :19.05 Median :391.44
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :356.67
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :396.90
## LSTAT MEDV
## Min. : 1.73 Min. : 5.00
## 1st Qu.: 6.95 1st Qu.:17.02
## Median :11.36 Median :21.20
## Mean :12.65 Mean :22.53
## 3rd Qu.:16.95 3rd Qu.:25.00
## Max. :37.97 Max. :50.00
A continuación se presenta un análisis exploratorio de la variable MEDV, que corresponde al valor medio comercial de casas de uso residencial en miles de dólares.
# NOTE(review): no dplyr function is called in the visible part of this script.
library(dplyr)
# eda() is presumably provided by PASWR2 — confirm.
library(PASWR2)
# Exploratory summary of the response variable MEDV.
eda(housing$MEDV)
## Size (n) Missing Minimum 1st Qu Mean Median TrMean 3rd Qu
## 506.000 0.000 5.000 17.025 22.533 21.200 21.906 25.000
## Max Stdev Var SE Mean I.Q.R. Range Kurtosis Skewness
## 50.000 9.197 84.587 0.409 7.975 45.000 1.451 1.102
## SW p-val
## 0.000
Se realiza una matriz de correlación con el fin de verificar qué variables podrían arrojarnos a futuro un mejor modelo; las correlaciones más fuertes con la variable de interés MEDV se obtienen con las variables RM (0.70) y LSTAT (-0.74).
# Correlation matrix across all columns of housing, displayed to 2 decimals.
housing.cor <- cor(x = housing)
round(x = housing.cor, digits = 2)
## CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO
## CRIM 1.00 -0.20 0.41 -0.06 0.42 -0.22 0.35 -0.38 0.63 0.58 0.29
## ZN -0.20 1.00 -0.53 -0.04 -0.52 0.31 -0.57 0.66 -0.31 -0.31 -0.39
## INDUS 0.41 -0.53 1.00 0.06 0.76 -0.39 0.64 -0.71 0.60 0.72 0.38
## CHAS -0.06 -0.04 0.06 1.00 0.09 0.09 0.09 -0.10 -0.01 -0.04 -0.12
## NOX 0.42 -0.52 0.76 0.09 1.00 -0.30 0.73 -0.77 0.61 0.67 0.19
## RM -0.22 0.31 -0.39 0.09 -0.30 1.00 -0.24 0.21 -0.21 -0.29 -0.36
## AGE 0.35 -0.57 0.64 0.09 0.73 -0.24 1.00 -0.75 0.46 0.51 0.26
## DIS -0.38 0.66 -0.71 -0.10 -0.77 0.21 -0.75 1.00 -0.49 -0.53 -0.23
## RAD 0.63 -0.31 0.60 -0.01 0.61 -0.21 0.46 -0.49 1.00 0.91 0.46
## TAX 0.58 -0.31 0.72 -0.04 0.67 -0.29 0.51 -0.53 0.91 1.00 0.46
## PTRATIO 0.29 -0.39 0.38 -0.12 0.19 -0.36 0.26 -0.23 0.46 0.46 1.00
## B -0.39 0.18 -0.36 0.05 -0.38 0.13 -0.27 0.29 -0.44 -0.44 -0.18
## LSTAT 0.46 -0.41 0.60 -0.05 0.59 -0.61 0.60 -0.50 0.49 0.54 0.37
## MEDV -0.39 0.36 -0.48 0.18 -0.43 0.70 -0.38 0.25 -0.38 -0.47 -0.51
## B LSTAT MEDV
## CRIM -0.39 0.46 -0.39
## ZN 0.18 -0.41 0.36
## INDUS -0.36 0.60 -0.48
## CHAS 0.05 -0.05 0.18
## NOX -0.38 0.59 -0.43
## RM 0.13 -0.61 0.70
## AGE -0.27 0.60 -0.38
## DIS 0.29 -0.50 0.25
## RAD -0.44 0.49 -0.38
## TAX -0.44 0.54 -0.47
## PTRATIO -0.18 0.37 -0.51
## B 1.00 -0.37 0.33
## LSTAT -0.37 1.00 -0.74
## MEDV 0.33 -0.74 1.00
# Plot the correlation matrix computed above with corrplot().
library(corrplot)
## corrplot 0.90 loaded
corrplot(housing.cor)
Se hace el análisis paso a paso y se van descartando variables de acuerdo a la correlación.
# Modelo_1: MEDV regressed on all 13 predictors.
Modelo_1 <- lm(
  MEDV ~ CRIM + ZN + INDUS + CHAS + NOX + RM + AGE + DIS + RAD + TAX +
    PTRATIO + B + LSTAT,
  data = housing
)
summary(Modelo_1)
##
## Call:
## lm(formula = MEDV ~ CRIM + ZN + INDUS + CHAS + NOX + RM + AGE +
## DIS + RAD + TAX + PTRATIO + B + LSTAT, data = housing)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.595 -2.730 -0.518 1.777 26.199
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.646e+01 5.103e+00 7.144 3.28e-12 ***
## CRIM -1.080e-01 3.286e-02 -3.287 0.001087 **
## ZN 4.642e-02 1.373e-02 3.382 0.000778 ***
## INDUS 2.056e-02 6.150e-02 0.334 0.738288
## CHAS 2.687e+00 8.616e-01 3.118 0.001925 **
## NOX -1.777e+01 3.820e+00 -4.651 4.25e-06 ***
## RM 3.810e+00 4.179e-01 9.116 < 2e-16 ***
## AGE 6.922e-04 1.321e-02 0.052 0.958229
## DIS -1.476e+00 1.995e-01 -7.398 6.01e-13 ***
## RAD 3.060e-01 6.635e-02 4.613 5.07e-06 ***
## TAX -1.233e-02 3.760e-03 -3.280 0.001112 **
## PTRATIO -9.527e-01 1.308e-01 -7.283 1.31e-12 ***
## B 9.312e-03 2.686e-03 3.467 0.000573 ***
## LSTAT -5.248e-01 5.072e-02 -10.347 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.745 on 492 degrees of freedom
## Multiple R-squared: 0.7406, Adjusted R-squared: 0.7338
## F-statistic: 108.1 on 13 and 492 DF, p-value: < 2.2e-16
# Modelo_2: Modelo_1 without CHAS.
Modelo_2 <- lm(
  MEDV ~ CRIM + ZN + INDUS + NOX + RM + AGE + DIS + RAD + TAX + PTRATIO +
    B + LSTAT,
  data = housing
)
summary(Modelo_2)
##
## Call:
## lm(formula = MEDV ~ CRIM + ZN + INDUS + NOX + RM + AGE + DIS +
## RAD + TAX + PTRATIO + B + LSTAT, data = housing)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.3968 -2.8103 -0.6455 1.9141 26.3755
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36.891960 5.146516 7.168 2.79e-12 ***
## CRIM -0.113139 0.033113 -3.417 0.000686 ***
## ZN 0.047052 0.013847 3.398 0.000734 ***
## INDUS 0.040311 0.061707 0.653 0.513889
## NOX -17.366999 3.851224 -4.509 8.13e-06 ***
## RM 3.850492 0.421402 9.137 < 2e-16 ***
## AGE 0.002784 0.013309 0.209 0.834407
## DIS -1.485374 0.201187 -7.383 6.64e-13 ***
## RAD 0.328311 0.066542 4.934 1.10e-06 ***
## TAX -0.013756 0.003766 -3.653 0.000287 ***
## PTRATIO -0.990958 0.131399 -7.542 2.25e-13 ***
## B 0.009741 0.002706 3.600 0.000351 ***
## LSTAT -0.534158 0.051072 -10.459 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.787 on 493 degrees of freedom
## Multiple R-squared: 0.7355, Adjusted R-squared: 0.7291
## F-statistic: 114.3 on 12 and 493 DF, p-value: < 2.2e-16
# Modelo_3: Modelo_2 without AGE.
Modelo_3 <- lm(
  MEDV ~ CRIM + ZN + INDUS + NOX + RM + DIS + RAD + TAX + PTRATIO + B +
    LSTAT,
  data = housing
)
summary(Modelo_3)
##
## Call:
## lm(formula = MEDV ~ CRIM + ZN + INDUS + NOX + RM + DIS + RAD +
## TAX + PTRATIO + B + LSTAT, data = housing)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.4332 -2.8060 -0.6283 1.9330 26.3996
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36.802482 5.123739 7.183 2.53e-12 ***
## CRIM -0.113134 0.033081 -3.420 0.000678 ***
## ZN 0.046705 0.013733 3.401 0.000726 ***
## INDUS 0.040394 0.061646 0.655 0.512613
## NOX -17.151648 3.707457 -4.626 4.76e-06 ***
## RM 3.868889 0.411721 9.397 < 2e-16 ***
## DIS -1.497683 0.192200 -7.792 3.91e-14 ***
## RAD 0.327325 0.066311 4.936 1.09e-06 ***
## TAX -0.013737 0.003761 -3.652 0.000287 ***
## PTRATIO -0.988928 0.130913 -7.554 2.06e-13 ***
## B 0.009779 0.002697 3.625 0.000319 ***
## LSTAT -0.530535 0.047999 -11.053 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.782 on 494 degrees of freedom
## Multiple R-squared: 0.7355, Adjusted R-squared: 0.7296
## F-statistic: 124.9 on 11 and 494 DF, p-value: < 2.2e-16
# Modelo_4: Modelo_3 without INDUS.
Modelo_4 <- lm(
  MEDV ~ CRIM + ZN + NOX + RM + DIS + RAD + TAX + PTRATIO + B + LSTAT,
  data = housing
)
summary(Modelo_4)
##
## Call:
## lm(formula = MEDV ~ CRIM + ZN + NOX + RM + DIS + RAD + TAX +
## PTRATIO + B + LSTAT, data = housing)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.3716 -2.7943 -0.5508 1.8942 26.3982
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36.620311 5.113241 7.162 2.90e-12 ***
## CRIM -0.114056 0.033032 -3.453 0.000602 ***
## ZN 0.045742 0.013647 3.352 0.000864 ***
## NOX -16.469153 3.556086 -4.631 4.65e-06 ***
## RM 3.844639 0.409818 9.381 < 2e-16 ***
## DIS -1.526099 0.187136 -8.155 2.89e-15 ***
## RAD 0.315531 0.063785 4.947 1.04e-06 ***
## TAX -0.012674 0.003391 -3.737 0.000208 ***
## PTRATIO -0.978442 0.129857 -7.535 2.34e-13 ***
## B 0.009730 0.002695 3.611 0.000337 ***
## LSTAT -0.528103 0.047827 -11.042 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.78 on 495 degrees of freedom
## Multiple R-squared: 0.7353, Adjusted R-squared: 0.7299
## F-statistic: 137.5 on 10 and 495 DF, p-value: < 2.2e-16
# Modelo_5: Modelo_4 without DIS.
Modelo_5 <- lm(
  MEDV ~ CRIM + ZN + NOX + RM + RAD + TAX + PTRATIO + B + LSTAT,
  data = housing
)
summary(Modelo_5)
##
## Call:
## lm(formula = MEDV ~ CRIM + ZN + NOX + RM + RAD + TAX + PTRATIO +
## B + LSTAT, data = housing)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.8442 -3.1261 -0.8713 1.6651 29.0643
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 19.006578 4.931226 3.854 0.000131 ***
## CRIM -0.083018 0.034911 -2.378 0.017787 *
## ZN -0.009998 0.012567 -0.796 0.426685
## NOX -2.356363 3.305363 -0.713 0.476249
## RM 4.493694 0.427738 10.506 < 2e-16 ***
## RAD 0.279772 0.067705 4.132 4.22e-05 ***
## TAX -0.010091 0.003593 -2.809 0.005169 **
## PTRATIO -1.033125 0.137981 -7.487 3.23e-13 ***
## B 0.010588 0.002865 3.695 0.000244 ***
## LSTAT -0.494430 0.050698 -9.753 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.086 on 496 degrees of freedom
## Multiple R-squared: 0.6997, Adjusted R-squared: 0.6942
## F-statistic: 128.4 on 9 and 496 DF, p-value: < 2.2e-16
# Modelo_6: Modelo_4 without B (DIS is kept, listed after PTRATIO).
Modelo_6 <- lm(
  MEDV ~ CRIM + ZN + NOX + RM + RAD + TAX + PTRATIO + DIS + LSTAT,
  data = housing
)
summary(Modelo_6)
##
## Call:
## lm(formula = MEDV ~ CRIM + ZN + NOX + RM + RAD + TAX + PTRATIO +
## DIS + LSTAT, data = housing)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.8021 -2.7866 -0.6165 2.0405 26.6133
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 42.003257 4.950052 8.485 2.50e-16 ***
## CRIM -0.128304 0.033191 -3.866 0.000126 ***
## ZN 0.046100 0.013811 3.338 0.000908 ***
## NOX -17.346284 3.590566 -4.831 1.81e-06 ***
## RM 3.712126 0.413094 8.986 < 2e-16 ***
## RAD 0.300012 0.064407 4.658 4.11e-06 ***
## TAX -0.013267 0.003428 -3.870 0.000124 ***
## PTRATIO -0.963988 0.131360 -7.339 8.89e-13 ***
## DIS -1.552476 0.189249 -8.203 2.02e-15 ***
## LSTAT -0.553587 0.047874 -11.563 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.837 on 496 degrees of freedom
## Multiple R-squared: 0.7283, Adjusted R-squared: 0.7234
## F-statistic: 147.7 on 9 and 496 DF, p-value: < 2.2e-16
# Modelo_7: Modelo_6 without ZN.
Modelo_7 <- lm(
  MEDV ~ CRIM + NOX + RM + RAD + TAX + PTRATIO + DIS + LSTAT,
  data = housing
)
summary(Modelo_7)
##
## Call:
## lm(formula = MEDV ~ CRIM + NOX + RM + RAD + TAX + PTRATIO + DIS +
## LSTAT, data = housing)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.1090 -2.8685 -0.8488 1.9821 27.0231
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 42.735025 4.995395 8.555 < 2e-16 ***
## CRIM -0.118109 0.033386 -3.538 0.000441 ***
## NOX -18.617731 3.606548 -5.162 3.54e-07 ***
## RM 3.913724 0.412804 9.481 < 2e-16 ***
## RAD 0.284536 0.064892 4.385 1.42e-05 ***
## TAX -0.011026 0.003396 -3.247 0.001246 **
## PTRATIO -1.114662 0.124613 -8.945 < 2e-16 ***
## DIS -1.236015 0.165451 -7.471 3.62e-13 ***
## LSTAT -0.553077 0.048360 -11.437 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.886 on 497 degrees of freedom
## Multiple R-squared: 0.7222, Adjusted R-squared: 0.7177
## F-statistic: 161.5 on 8 and 497 DF, p-value: < 2.2e-16
# Modelo_8: Modelo_7 without TAX.
Modelo_8 <- lm(
  MEDV ~ CRIM + NOX + RM + RAD + PTRATIO + DIS + LSTAT,
  data = housing
)
summary(Modelo_8)
##
## Call:
## lm(formula = MEDV ~ CRIM + NOX + RM + RAD + PTRATIO + DIS + LSTAT,
## data = housing)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.4735 -3.0732 -0.5928 1.9530 26.9299
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 41.26050 5.02214 8.216 1.83e-15 ***
## CRIM -0.11776 0.03370 -3.494 0.000519 ***
## NOX -21.21373 3.55032 -5.975 4.38e-09 ***
## RM 4.04451 0.41475 9.752 < 2e-16 ***
## RAD 0.12043 0.04108 2.931 0.003530 **
## PTRATIO -1.15452 0.12519 -9.222 < 2e-16 ***
## DIS -1.24006 0.16702 -7.424 4.94e-13 ***
## LSTAT -0.56033 0.04877 -11.490 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.933 on 498 degrees of freedom
## Multiple R-squared: 0.7163, Adjusted R-squared: 0.7123
## F-statistic: 179.6 on 7 and 498 DF, p-value: < 2.2e-16
# Modelo_9: Modelo_8 without RAD.
Modelo_9 <- lm(
  MEDV ~ CRIM + NOX + RM + PTRATIO + DIS + LSTAT,
  data = housing
)
summary(Modelo_9)
##
## Call:
## lm(formula = MEDV ~ CRIM + NOX + RM + PTRATIO + DIS + LSTAT,
## data = housing)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.0803 -2.9917 -0.7166 1.9493 28.0304
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 35.54553 4.66335 7.622 1.27e-13 ***
## CRIM -0.07202 0.03010 -2.393 0.0171 *
## NOX -17.00665 3.27192 -5.198 2.95e-07 ***
## RM 4.25196 0.41177 10.326 < 2e-16 ***
## PTRATIO -1.00080 0.11454 -8.738 < 2e-16 ***
## DIS -1.20335 0.16782 -7.171 2.71e-12 ***
## LSTAT -0.55353 0.04908 -11.277 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.97 on 499 degrees of freedom
## Multiple R-squared: 0.7114, Adjusted R-squared: 0.7079
## F-statistic: 205 on 6 and 499 DF, p-value: < 2.2e-16
# Modelo_10: Modelo_9 without CRIM.
Modelo_10 <- lm(
  MEDV ~ NOX + RM + PTRATIO + DIS + LSTAT,
  data = housing
)
summary(Modelo_10)
##
## Call:
## lm(formula = MEDV ~ NOX + RM + PTRATIO + DIS + LSTAT, data = housing)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.7765 -3.0186 -0.6481 1.9752 27.7625
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 37.49920 4.61295 8.129 3.43e-15 ***
## NOX -17.99657 3.26095 -5.519 5.49e-08 ***
## RM 4.16331 0.41203 10.104 < 2e-16 ***
## PTRATIO -1.04577 0.11352 -9.212 < 2e-16 ***
## DIS -1.18466 0.16842 -7.034 6.64e-12 ***
## LSTAT -0.58108 0.04794 -12.122 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.994 on 500 degrees of freedom
## Multiple R-squared: 0.7081, Adjusted R-squared: 0.7052
## F-statistic: 242.6 on 5 and 500 DF, p-value: < 2.2e-16
# Modelo_11: Modelo_10 without NOX.
Modelo_11 <- lm(
  MEDV ~ RM + PTRATIO + DIS + LSTAT,
  data = housing
)
summary(Modelo_11)
##
## Call:
## lm(formula = MEDV ~ RM + PTRATIO + DIS + LSTAT, data = housing)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.4172 -3.0971 -0.6397 1.8727 27.1088
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 24.47136 4.07802 6.001 3.77e-09 ***
## RM 4.22379 0.42382 9.966 < 2e-16 ***
## PTRATIO -0.97365 0.11603 -8.391 4.94e-16 ***
## DIS -0.55193 0.12695 -4.348 1.67e-05 ***
## LSTAT -0.66544 0.04675 -14.233 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.139 on 501 degrees of freedom
## Multiple R-squared: 0.6903, Adjusted R-squared: 0.6878
## F-statistic: 279.2 on 4 and 501 DF, p-value: < 2.2e-16
# Modelo_12: Modelo_10 without DIS.
Modelo_12 <- lm(
  MEDV ~ NOX + RM + PTRATIO + LSTAT,
  data = housing
)
summary(Modelo_12)
##
## Call:
## lm(formula = MEDV ~ NOX + RM + PTRATIO + LSTAT, data = housing)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.9962 -3.0676 -0.8475 1.8366 30.1291
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 19.39584 4.00911 4.838 1.75e-06 ***
## NOX -2.38280 2.50166 -0.952 0.341
## RM 4.55168 0.42761 10.644 < 2e-16 ***
## PTRATIO -0.93376 0.11771 -7.933 1.41e-14 ***
## LSTAT -0.54643 0.04994 -10.942 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.23 on 501 degrees of freedom
## Multiple R-squared: 0.6792, Adjusted R-squared: 0.6766
## F-statistic: 265.2 on 4 and 501 DF, p-value: < 2.2e-16
# Modelo_13: Modelo_10 without PTRATIO.
Modelo_13 <- lm(
  MEDV ~ NOX + RM + DIS + LSTAT,
  data = housing
)
summary(Modelo_13)
##
## Call:
## lm(formula = MEDV ~ NOX + RM + DIS + LSTAT, data = housing)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.072 -3.228 -0.907 1.968 26.402
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.03600 3.99033 3.016 0.00269 **
## NOX -14.53791 3.49991 -4.154 3.84e-05 ***
## RM 4.86340 0.43754 11.115 < 2e-16 ***
## DIS -0.96699 0.18018 -5.367 1.23e-07 ***
## LSTAT -0.65865 0.05099 -12.917 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.396 on 501 degrees of freedom
## Multiple R-squared: 0.6585, Adjusted R-squared: 0.6558
## F-statistic: 241.6 on 4 and 501 DF, p-value: < 2.2e-16
# Modelo_14: Modelo_10 without RM.
Modelo_14 <- lm(
  MEDV ~ NOX + PTRATIO + DIS + LSTAT,
  data = housing
)
summary(Modelo_14)
##
## Call:
## lm(formula = MEDV ~ NOX + PTRATIO + DIS + LSTAT, data = housing)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.8891 -3.2217 -0.8456 1.9936 24.8884
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 72.11463 3.38663 21.294 < 2e-16 ***
## NOX -18.87303 3.57360 -5.281 1.92e-07 ***
## PTRATIO -1.25733 0.12231 -10.280 < 2e-16 ***
## DIS -1.41272 0.18297 -7.721 6.32e-14 ***
## LSTAT -0.83355 0.04485 -18.585 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.475 on 501 degrees of freedom
## Multiple R-squared: 0.6485, Adjusted R-squared: 0.6457
## F-statistic: 231.1 on 4 and 501 DF, p-value: < 2.2e-16
# Modelo_15: Modelo_10 without LSTAT.
Modelo_15 <- lm(
  MEDV ~ NOX + RM + PTRATIO + DIS,
  data = housing
)
summary(Modelo_15)
##
## Call:
## lm(formula = MEDV ~ NOX + RM + PTRATIO + DIS, data = housing)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.274 -3.143 -0.525 2.102 38.543
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 24.4422 5.0970 4.795 2.14e-06 ***
## NOX -30.5997 3.5122 -8.712 < 2e-16 ***
## RM 6.7664 0.3996 16.934 < 2e-16 ***
## PTRATIO -1.2875 0.1270 -10.138 < 2e-16 ***
## DIS -0.9748 0.1904 -5.121 4.35e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.675 on 501 degrees of freedom
## Multiple R-squared: 0.6223, Adjusted R-squared: 0.6193
## F-statistic: 206.4 on 4 and 501 DF, p-value: < 2.2e-16
# Modelo_16: three-predictor model (Modelo_10 without NOX and DIS).
Modelo_16 <- lm(
  MEDV ~ RM + PTRATIO + LSTAT,
  data = housing
)
summary(Modelo_16)
##
## Call:
## lm(formula = MEDV ~ RM + PTRATIO + LSTAT, data = housing)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.4871 -3.1047 -0.7976 1.8129 29.6559
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 18.56711 3.91320 4.745 2.73e-06 ***
## RM 4.51542 0.42587 10.603 < 2e-16 ***
## PTRATIO -0.93072 0.11765 -7.911 1.64e-14 ***
## LSTAT -0.57181 0.04223 -13.540 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.229 on 502 degrees of freedom
## Multiple R-squared: 0.6786, Adjusted R-squared: 0.6767
## F-statistic: 353.3 on 3 and 502 DF, p-value: < 2.2e-16
Se toman los modelos del 10 al 16 para hacer las validaciones y seleccionar el mejor modelo.
# Compare candidate models 10-16 by AIC and then by BIC (lower is better).
# In the recorded output below, Modelo_10 has the smallest AIC and BIC.
AIC(Modelo_10)
## [1] 3071.439
AIC(Modelo_11)
## [1] 3099.359
AIC(Modelo_12)
## [1] 3117.182
AIC(Modelo_13)
## [1] 3148.768
AIC(Modelo_14)
## [1] 3163.461
AIC(Modelo_15)
## [1] 3199.802
AIC(Modelo_16)
## [1] 3116.097
BIC(Modelo_10)
## [1] 3101.024
BIC(Modelo_11)
## [1] 3124.718
BIC(Modelo_12)
## [1] 3142.541
BIC(Modelo_13)
## [1] 3174.127
BIC(Modelo_14)
## [1] 3188.82
BIC(Modelo_15)
## [1] 3225.161
BIC(Modelo_16)
## [1] 3137.23
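Una vez revisados los R² y los valores de BIC y AIC, se determina que el mejor modelo es el 16, que incluye las variables (RM, PTRATIO, LSTAT). Cabe notar que el Modelo 10 presenta los menores valores de AIC y BIC (3071.4 y 3101.0), por lo que la elección del Modelo 16 responde a que es el modelo con menos variables.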
# Diagnostic plots for the selected model (Modelo_16) on a 2 x 2 grid.
options(repr.plot.width = 6, repr.plot.height = 6)
# par() returns the previous settings; keep them so the layout is restored
# afterwards and later plots are not squeezed into the 2 x 2 grid.
old_par <- par(mfrow = c(2, 2))
plot(Modelo_16)
par(old_par)

# Residuals of the selected model and their mean.
residuos <- residuals(Modelo_16)
mean(residuos)
## [1] 1.828488e-17
Teniendo en cuenta el gráfico Q-Q, se puede establecer que los residuos del modelo no siguen una distribución normal.
# Durbin-Watson test on Modelo_16 (checks for autocorrelation).
library(lmtest)
dwtest(Modelo_16)
##
## Durbin-Watson test
##
## data: Modelo_16
## DW = 0.90124, p-value < 2.2e-16
## alternative hypothesis: true autocorrelation is greater than 0
Se rechaza la hipótesis nula; por lo tanto, podemos afirmar que existe autocorrelación positiva en los residuos.
# Studentized Breusch-Pagan test on Modelo_16 (checks for heteroscedasticity).
bptest(Modelo_16)
##
## studentized Breusch-Pagan test
##
## data: Modelo_16
## BP = 1.6223, df = 3, p-value = 0.6543
No rechazamos la hipótesis nula, por lo que no hay evidencia de heterocedasticidad y podemos asumir que existe homocedasticidad en el modelo.
En el gráfico a continuación se evidencia que existen datos atípicos en el valor de las viviendas y en el número de habitaciones, mientras que en la variable proporción alumno-maestro por ciudad no se encuentran muchos datos atípicos.
# Horizontal boxplots to inspect outliers in MEDV, RM and PTRATIO.
# Axis labels carry each variable's own unit: only MEDV is a monetary value
# (thousands of dollars); RM counts rooms and PTRATIO is a pupil-teacher ratio.
boxplot(x = housing$MEDV, xlab = "MEDV (miles de $)", horizontal = TRUE)
boxplot(x = housing$RM, xlab = "RM (habitaciones)", horizontal = TRUE)
boxplot(
  x = housing$PTRATIO,
  xlab = "PTRATIO (alumnos por maestro)",
  horizontal = TRUE
)
El modelo seleccionado es el que se encontró con menos variables y mejores resultados en los diferentes parámetros, lo que nos puede indicar que las variables porcentaje de estatus más bajo de la población, número de habitaciones y proporción alumno-maestro son las que de mejor manera pueden explicar la variable valor medio de las viviendas ocupadas por el propietario, en miles de dólares.
# Confidence intervals (2.5 % and 97.5 % bounds) for Modelo_16's coefficients.
confint(Modelo_16)
## 2.5 % 97.5 %
## (Intercept) 10.8788409 26.2553821
## RM 3.6787108 5.3521311
## PTRATIO -1.1618769 -0.6995682
## LSTAT -0.6547754 -0.4888359