Estas preguntas deben responderse utilizando el conjunto de datos de Carseats.
library(MASS)
library(ISLR)
# Preview the first six rows of the Carseats data to check columns and types.
head(Carseats, n = 6L)
## Sales CompPrice Income Advertising Population Price ShelveLoc Age Education
## 1 9.50 138 73 11 276 120 Bad 42 17
## 2 11.22 111 48 16 260 83 Good 65 10
## 3 10.06 113 35 10 269 80 Medium 59 12
## 4 7.40 117 100 4 466 97 Medium 55 14
## 5 4.15 141 64 3 340 128 Bad 38 13
## 6 10.81 124 113 13 501 72 Bad 78 16
## Urban US
## 1 Yes Yes
## 2 Yes Yes
## 3 Yes Yes
## 4 Yes Yes
## 5 Yes No
## 6 No Yes
# Simple linear regression: Sales explained by Price alone.
lm.fita <- lm(formula = Sales ~ Price, data = Carseats)
# Coefficient table, residual summary and overall fit statistics.
summary(lm.fita)
##
## Call:
## lm(formula = Sales ~ Price, data = Carseats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.5224 -1.8442 -0.1459 1.6503 7.5108
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.641915 0.632812 21.558 <2e-16 ***
## Price -0.053073 0.005354 -9.912 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.532 on 398 degrees of freedom
## Multiple R-squared: 0.198, Adjusted R-squared: 0.196
## F-statistic: 98.25 on 1 and 398 DF, p-value: < 2.2e-16
# El coeficiente de Price es -0.053073; esto significa que hay una relación negativa: por cada unidad que aumenta el precio, las ventas estimadas disminuyen en 0.053073 unidades.
# Intercepto: 13.641915, que es el valor estimado de las ventas cuando el precio es cero.
# 95% confidence intervals for the intercept and the Price slope.
confint(lm.fita, level = 0.95)
## 2.5 % 97.5 %
## (Intercept) 12.3978438 14.88598655
## Price -0.0635995 -0.04254653
# Multiple regression: Sales on Price plus the Yes/No factors Urban and US.
lm.fitm <- lm(
  formula = Sales ~ Price + Urban + US,
  data = Carseats
)
summary(lm.fitm)
##
## Call:
## lm(formula = Sales ~ Price + Urban + US, data = Carseats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.9206 -1.6220 -0.0564 1.5786 7.0581
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.043469 0.651012 20.036 < 2e-16 ***
## Price -0.054459 0.005242 -10.389 < 2e-16 ***
## UrbanYes -0.021916 0.271650 -0.081 0.936
## USYes 1.200573 0.259042 4.635 4.86e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.472 on 396 degrees of freedom
## Multiple R-squared: 0.2393, Adjusted R-squared: 0.2335
## F-statistic: 41.52 on 3 and 396 DF, p-value: < 2.2e-16
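# Price -0.054459: relación negativa con Sales (significativa, p < 2e-16).
# UrbanYes -0.021916: el signo es negativo, pero el coeficiente no es estadísticamente significativo (p = 0.936), por lo que no hay evidencia de una relación entre Urban y Sales.
# USYes 1.200573: relación positiva (significativa); las ventas estimadas son mayores si la tienda está en US.
# S = 13.043469 - 0.054459*P - 0.021916*Ur + 1.200573*US
# Ur y US son variables dummy: solo pueden tomar el valor 1 (Yes) o 0 (No).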
# Reduced model: same regression with Urban left out, keeping Price and US.
lm.fitm2 <- lm(
  formula = Sales ~ Price + US,
  data = Carseats
)
summary(lm.fitm2)
##
## Call:
## lm(formula = Sales ~ Price + US, data = Carseats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.9269 -1.6286 -0.0574 1.5766 7.0515
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.03079 0.63098 20.652 < 2e-16 ***
## Price -0.05448 0.00523 -10.416 < 2e-16 ***
## USYes 1.19964 0.25846 4.641 4.71e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.469 on 397 degrees of freedom
## Multiple R-squared: 0.2393, Adjusted R-squared: 0.2354
## F-statistic: 62.43 on 2 and 397 DF, p-value: < 2.2e-16
# 95% confidence intervals for the coefficients of the reduced model.
confint(lm.fitm2, level = 0.95)
## 2.5 % 97.5 %
## (Intercept) 11.79032020 14.27126531
## Price -0.06475984 -0.04419543
## USYes 0.69151957 1.70776632
Utilizando el conjunto de datos de Boston, ajuste los modelos de clasificación para predecir si un suburbio determinado tiene una tasa de criminalidad superior o inferior a la mediana. Explore el modelo de regresión logística utilizando varios subconjuntos de predictores. Describa sus hallazgos.
# Preview the first six rows of the Boston data.
head(Boston, n = 6L)
## crim zn indus chas nox rm age dis rad tax ptratio black lstat
## 1 0.00632 18 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90 4.98
## 2 0.02731 0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90 9.14
## 3 0.02729 0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83 4.03
## 4 0.03237 0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63 2.94
## 5 0.06905 0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90 5.33
## 6 0.02985 0 2.18 0 0.458 6.430 58.7 6.0622 3 222 18.7 394.12 5.21
## medv
## 1 24.0
## 2 21.6
## 3 34.7
## 4 33.4
## 5 36.2
## 6 28.7
# Per-column five-number summaries and means; the crim median is read from here.
summary(object = Boston)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08205 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio black
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 0.32
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38
## Median : 5.000 Median :330.0 Median :19.05 Median :391.44
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :356.67
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :396.90
## lstat medv
## Min. : 1.73 Min. : 5.00
## 1st Qu.: 6.95 1st Qu.:17.02
## Median :11.36 Median :21.20
## Mean :12.65 Mean :22.53
## 3rd Qu.:16.95 3rd Qu.:25.00
## Max. :37.97 Max. :50.00
Se obtiene la mediana de la tasa de criminalidad (crim) a partir del resumen anterior: 0.25651
# Binary response for the logistic model: 1 when the suburb's crime rate is
# above the sample median, 0 otherwise. The cut-off is computed from the data
# instead of being typed in from the rounded summary() printout, so the split
# does not depend on print rounding.
Boston$mediana_crim <- as.numeric(Boston$crim > median(Boston$crim))

# Logistic regression of the above-median indicator on every original
# predictor. crim itself is left out because the response is derived from it.
glm.fit1 <- glm(
  mediana_crim ~ zn + indus + chas + nox + rm + age + dis + rad + tax +
    ptratio + black + lstat + medv,
  data = Boston,
  family = binomial
)
summary(glm.fit1)
##
## Call:
## glm(formula = mediana_crim ~ zn + indus + chas + nox + rm + age +
## dis + rad + tax + ptratio + black + lstat + medv, family = binomial,
## data = Boston)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -34.103704 6.530014 -5.223 1.76e-07 ***
## zn -0.079918 0.033731 -2.369 0.01782 *
## indus -0.059389 0.043722 -1.358 0.17436
## chas 0.785327 0.728930 1.077 0.28132
## nox 48.523782 7.396497 6.560 5.37e-11 ***
## rm -0.425596 0.701104 -0.607 0.54383
## age 0.022172 0.012221 1.814 0.06963 .
## dis 0.691400 0.218308 3.167 0.00154 **
## rad 0.656465 0.152452 4.306 1.66e-05 ***
## tax -0.006412 0.002689 -2.385 0.01709 *
## ptratio 0.368716 0.122136 3.019 0.00254 **
## black -0.013524 0.006536 -2.069 0.03853 *
## lstat 0.043862 0.048981 0.895 0.37052
## medv 0.167130 0.066940 2.497 0.01254 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 701.46 on 505 degrees of freedom
## Residual deviance: 211.93 on 492 degrees of freedom
## AIC: 239.93
##
## Number of Fisher Scoring iterations: 9
# Fitted probabilities for the same observations used to fit the model
# (no newdata is supplied), thresholded at 0.5 into "más" / "menos" labels.
glm.probs <- predict(object = glm.fit1, type = "response")
glm.pred <- ifelse(test = glm.probs > 0.5, yes = "más", no = "menos")
# Keep the labels in the data frame next to the observed indicator
# (mediana_crim) so the two can be compared row by row.
Boston$prediccion <- glm.pred
# Confusion matrix: predicted label (rows) vs. observed mediana_crim (columns).
table(glm.pred, Boston$mediana_crim)
##
## glm.pred 0 1
## más 19 229
## menos 234 24
En este primer intento se encuentra que las únicas variables significativas para el modelo (al nivel del 5 %) son: zn, nox, dis, rad, tax, ptratio, black y medv.
# Rebuild the above-median crime indicator so this step is self-contained.
# The cut-off is computed from the data rather than hard-coded from the
# rounded summary() printout.
Boston$mediana_crim <- as.numeric(Boston$crim > median(Boston$crim))

# Refit the logistic regression using only the predictors that were
# significant in the full model. Note that this overwrites glm.fit1.
glm.fit1 <- glm(
  mediana_crim ~ zn + nox + dis + rad + tax + ptratio + black + medv,
  data = Boston,
  family = binomial
)
summary(glm.fit1)
##
## Call:
## glm(formula = mediana_crim ~ zn + nox + dis + rad + tax + ptratio +
## black + medv, family = binomial, data = Boston)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -28.347709 5.569863 -5.089 3.59e-07 ***
## zn -0.074499 0.029975 -2.485 0.01294 *
## nox 44.180443 6.289746 7.024 2.15e-12 ***
## dis 0.489849 0.194930 2.513 0.01197 *
## rad 0.692116 0.137842 5.021 5.14e-07 ***
## tax -0.007448 0.002428 -3.067 0.00216 **
## ptratio 0.272145 0.107311 2.536 0.01121 *
## black -0.013484 0.006331 -2.130 0.03317 *
## medv 0.087913 0.030787 2.856 0.00430 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 701.46 on 505 degrees of freedom
## Residual deviance: 221.78 on 497 degrees of freedom
## AIC: 239.78
##
## Number of Fisher Scoring iterations: 9
# Fitted probabilities from the reduced model for the same observations used
# to fit it (no newdata is supplied), thresholded at 0.5 into "más" / "menos".
glm.probs <- predict(object = glm.fit1, type = "response")
glm.pred <- ifelse(test = glm.probs > 0.5, yes = "más", no = "menos")
# Replace the stored labels with the reduced model's predictions.
Boston$prediccion <- glm.pred
# Confusion matrix: predicted label (rows) vs. observed mediana_crim (columns).
table(glm.pred, Boston$mediana_crim)
##
## glm.pred 0 1
## más 24 223
## menos 229 30