# Load the housing data set into a data frame.
# NOTE(review): the path is absolute and machine-specific, so the script only
# runs where this exact file exists.
housing <- read.table("C:/Users/christian.figueroa/Desktop/Actividada 1 Regresion/archive/archive/housing.csv", quote = "\"", comment.char = "")
# Name the 14 columns. An earlier duplicate of this assignment was written with
# typographic quotes, which R cannot parse as string delimiters, so only this
# plain-ASCII version is kept.
names(housing) <- c(
  "CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE",
  "DIS", "RAD", "TAX", "PTRATIO", "B", "LSTAT", "MEDV"
)
# Print per-column descriptive statistics.
summary(housing)
## CRIM ZN INDUS CHAS
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08205 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## NOX RM AGE DIS
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## RAD TAX PTRATIO B
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 0.32
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38
## Median : 5.000 Median :330.0 Median :19.05 Median :391.44
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :356.67
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :396.90
## LSTAT MEDV
## Min. : 1.73 Min. : 5.00
## 1st Qu.: 6.95 1st Qu.:17.02
## Median :11.36 Median :21.20
## Mean :12.65 Mean :22.53
## 3rd Qu.:16.95 3rd Qu.:25.00
## Max. :37.97 Max. :50.00
A continuación se presenta un análisis exploratorio de la variable MEDV, que corresponde al valor medio comercial de las casas de uso residencial, en miles de dólares.
# NOTE(review): no dplyr function appears in the visible part of this analysis;
# confirm whether this package is still needed.
library(dplyr)
# Provides eda(). Presumably also makes ggplot() available for the scatter plot
# drawn later, since ggplot2 is never attached explicitly - TODO confirm.
library(PASWR2)
# Exploratory data analysis of the response variable MEDV.
eda(housing$MEDV)
## Size (n) Missing Minimum 1st Qu Mean Median TrMean 3rd Qu
## 506.000 0.000 5.000 17.025 22.533 21.200 21.906 25.000
## Max Stdev Var SE Mean I.Q.R. Range Kurtosis Skewness
## 50.000 9.197 84.587 0.409 7.975 45.000 1.451 1.102
## SW p-val
## 0.000
Se calcula una matriz de correlación con el fin de verificar qué variables podrían arrojar a futuro un mejor modelo. Las correlaciones más fuertes con la variable de interés MEDV se obtienen con las variables RM (0.70) y LSTAT (-0.74).
# Pairwise correlation matrix across every column of the data frame.
cor(x = housing)
## CRIM ZN INDUS CHAS NOX
## CRIM 1.00000000 -0.20046922 0.40658341 -0.055891582 0.42097171
## ZN -0.20046922 1.00000000 -0.53382819 -0.042696719 -0.51660371
## INDUS 0.40658341 -0.53382819 1.00000000 0.062938027 0.76365145
## CHAS -0.05589158 -0.04269672 0.06293803 1.000000000 0.09120281
## NOX 0.42097171 -0.51660371 0.76365145 0.091202807 1.00000000
## RM -0.21924670 0.31199059 -0.39167585 0.091251225 -0.30218819
## AGE 0.35273425 -0.56953734 0.64477851 0.086517774 0.73147010
## DIS -0.37967009 0.66440822 -0.70802699 -0.099175780 -0.76923011
## RAD 0.62550515 -0.31194783 0.59512927 -0.007368241 0.61144056
## TAX 0.58276431 -0.31456332 0.72076018 -0.035586518 0.66802320
## PTRATIO 0.28994558 -0.39167855 0.38324756 -0.121515174 0.18893268
## B -0.38506394 0.17552032 -0.35697654 0.048788485 -0.38005064
## LSTAT 0.45562148 -0.41299457 0.60379972 -0.053929298 0.59087892
## MEDV -0.38830461 0.36044534 -0.48372516 0.175260177 -0.42732077
## RM AGE DIS RAD TAX PTRATIO
## CRIM -0.21924670 0.35273425 -0.37967009 0.625505145 0.58276431 0.2899456
## ZN 0.31199059 -0.56953734 0.66440822 -0.311947826 -0.31456332 -0.3916785
## INDUS -0.39167585 0.64477851 -0.70802699 0.595129275 0.72076018 0.3832476
## CHAS 0.09125123 0.08651777 -0.09917578 -0.007368241 -0.03558652 -0.1215152
## NOX -0.30218819 0.73147010 -0.76923011 0.611440563 0.66802320 0.1889327
## RM 1.00000000 -0.24026493 0.20524621 -0.209846668 -0.29204783 -0.3555015
## AGE -0.24026493 1.00000000 -0.74788054 0.456022452 0.50645559 0.2615150
## DIS 0.20524621 -0.74788054 1.00000000 -0.494587930 -0.53443158 -0.2324705
## RAD -0.20984667 0.45602245 -0.49458793 1.000000000 0.91022819 0.4647412
## TAX -0.29204783 0.50645559 -0.53443158 0.910228189 1.00000000 0.4608530
## PTRATIO -0.35550149 0.26151501 -0.23247054 0.464741179 0.46085304 1.0000000
## B 0.12806864 -0.27353398 0.29151167 -0.444412816 -0.44180801 -0.1773833
## LSTAT -0.61380827 0.60233853 -0.49699583 0.488676335 0.54399341 0.3740443
## MEDV 0.69535995 -0.37695457 0.24992873 -0.381626231 -0.46853593 -0.5077867
## B LSTAT MEDV
## CRIM -0.38506394 0.4556215 -0.3883046
## ZN 0.17552032 -0.4129946 0.3604453
## INDUS -0.35697654 0.6037997 -0.4837252
## CHAS 0.04878848 -0.0539293 0.1752602
## NOX -0.38005064 0.5908789 -0.4273208
## RM 0.12806864 -0.6138083 0.6953599
## AGE -0.27353398 0.6023385 -0.3769546
## DIS 0.29151167 -0.4969958 0.2499287
## RAD -0.44441282 0.4886763 -0.3816262
## TAX -0.44180801 0.5439934 -0.4685359
## PTRATIO -0.17738330 0.3740443 -0.5077867
## B 1.00000000 -0.3660869 0.3334608
## LSTAT -0.36608690 1.0000000 -0.7376627
## MEDV 0.33346082 -0.7376627 1.0000000
Luego de generar el modelo para cada variable, se considera que el mejor modelo corresponde a la variable LSTAT (% de población de menor estatus). La decisión se toma principalmente por el valor de R² y su nivel de predicción de la variable MEDV, además del criterio de información BIC.
# Fit the simple linear regression of MEDV on LSTAT.
modelo_lstat <- lm(formula = MEDV ~ LSTAT, data = housing)
# Plot-size options; presumably read by a notebook front end - TODO confirm.
options(repr.plot.height = 4, repr.plot.width = 4)
# Scatter plot of the data with the fitted least-squares line (no error band).
ggplot(data = housing, mapping = aes(x = LSTAT, y = MEDV)) +
  stat_smooth(method = "lm", se = FALSE) +
  geom_point()
# Coefficient table, residual summary and goodness-of-fit statistics.
summary(object = modelo_lstat)
##
## Call:
## lm(formula = MEDV ~ LSTAT, data = housing)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.168 -3.990 -1.318 2.034 24.500
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 34.55384 0.56263 61.41 <2e-16 ***
## LSTAT -0.95005 0.03873 -24.53 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.216 on 504 degrees of freedom
## Multiple R-squared: 0.5441, Adjusted R-squared: 0.5432
## F-statistic: 601.6 on 1 and 504 DF, p-value: < 2.2e-16
# Akaike information criterion of the fitted model.
AIC(object = modelo_lstat)
## [1] 3288.975
# Bayesian information criterion of the fitted model.
BIC(object = modelo_lstat)
## [1] 3301.655
# Plot-size options; presumably read by a notebook front end - TODO confirm.
options(repr.plot.width = 6, repr.plot.height = 6)
# Draw the lm diagnostic plots in a 2 x 2 grid. par() returns the previous
# settings, which are restored afterwards so that later plots (such as the
# MEDV boxplot) are not squeezed into a single cell of the grid.
old_par <- par(mfrow = c(2, 2))
plot(modelo_lstat)
par(old_par)
# Residuals of the fitted model; their mean is printed as a check that it is
# numerically zero.
residuos <- residuals(modelo_lstat)
mean(residuos)
## [1] 6.600194e-16
# lmtest supplies the regression diagnostic tests used from here on.
library(lmtest)
# Durbin-Watson test on the model residuals, with the one-sided alternative
# of positive autocorrelation spelled out.
dwtest(formula = modelo_lstat, alternative = "greater")
##
## Durbin-Watson test
##
## data: modelo_lstat
## DW = 0.8915, p-value < 2.2e-16
## alternative hypothesis: true autocorrelation is greater than 0
# Studentized Breusch-Pagan test for heteroscedasticity of the residuals.
bptest(formula = modelo_lstat, studentize = TRUE)
##
## studentized Breusch-Pagan test
##
## data: modelo_lstat
## BP = 15.497, df = 1, p-value = 8.262e-05
En el gráfico a continuación se evidencia que existen datos atípicos en el valor de las viviendas.
# Horizontal boxplot of MEDV to expose outlying home values. The axis label
# states the unit as thousands of dollars, matching how MEDV is described in
# the report, instead of a bare "$".
boxplot(x = housing$MEDV, xlab = "MEDV (miles de USD)", horizontal = TRUE)
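Teniendo en cuenta la relación negativa que existe entre las variables MEDV y LSTAT, y los resultados del modelo, podemos afirmar que a menor porcentaje de población de menor estatus, mayor es el costo de la vivienda. De igual forma, se puede afirmar que esta variable es la que mejor predice el comportamiento del costo de la vivienda. No obstante, las pruebas de Durbin-Watson (DW = 0.8915) y de Breusch-Pagan (p = 8.262e-05) rechazan los supuestos de independencia y homocedasticidad de los residuos, por lo que la inferencia del modelo debe interpretarse con cautela.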