library(AER)
## Loading required package: car
## Loading required package: carData
## Loading required package: lmtest
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
## Loading required package: survival
data("CreditCard")
head(CreditCard)
## card reports age income share expenditure owner selfemp dependents
## 1 yes 0 37.66667 4.5200 0.033269910 124.983300 yes no 3
## 2 yes 0 33.25000 2.4200 0.005216942 9.854167 no no 3
## 3 yes 0 33.66667 4.5000 0.004155556 15.000000 yes no 4
## 4 yes 0 30.50000 2.5400 0.065213780 137.869200 no no 0
## 5 yes 0 32.16667 9.7867 0.067050590 546.503300 yes no 2
## 6 yes 0 23.25000 2.5000 0.044438400 91.996670 no no 0
## months majorcards active
## 1 54 1 12
## 2 34 1 13
## 3 58 1 5
## 4 25 1 7
## 5 64 1 5
## 6 54 1 1
for (i in names(CreditCard)) {
print(i)
plot(CreditCard$card ~CreditCard[[i]], main = paste("Card vs", i), ylab = i)
}
## [1] "card"
## [1] "reports"
## [1] "age"
## [1] "income"
## [1] "share"
## [1] "expenditure"
## [1] "owner"
## [1] "selfemp"
## [1] "dependents"
## [1] "months"
## [1] "majorcards"
## [1] "active"
1.
model <- glm(card~reports+income+owner+dependents+selfemp, data=CreditCard, family=binomial(link='logit'))
summary(model)
##
## Call:
## glm(formula = card ~ reports + income + owner + dependents +
## selfemp, family = binomial(link = "logit"), data = CreditCard)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.04186 0.19141 5.443 5.24e-08 ***
## reports -1.36457 0.11424 -11.945 < 2e-16 ***
## income 0.26557 0.06364 4.173 3.01e-05 ***
## owneryes 0.78625 0.18026 4.362 1.29e-05 ***
## dependents -0.26301 0.06679 -3.938 8.23e-05 ***
## selfempyes -0.67449 0.28051 -2.404 0.0162 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1404.6 on 1318 degrees of freedom
## Residual deviance: 1052.5 on 1313 degrees of freedom
## AIC: 1064.5
##
## Number of Fisher Scoring iterations: 6
\[\ln \left( \frac{p}{1-p} \right)= 1.04186 -1.36457 \cdot reports+0.26557 \cdot income + 0.78625 \cdot owneryes -0.26301 \cdot dependents -0.67449 \cdot selfempyes\] * Los factores que favorecen: 1. Income 2. Owner * Los factores que desfavorecen: 1. Reports 2. Dependents 3. Selfemp
Considermos dos escenarios, 1. La persona es self-employed 2. La persnoa no es self-employed
income = c(5,5)
owner= c("yes", "yes")
reports= c(5, 5)
dependents = c(3, 3)
selfemp= c("yes", "no")
info_person <- data.frame(
income = income,
owner = owner,
reports = reports,
dependents = dependents,
selfemp= selfemp
)
prediccion <- predict(model, newdata = info_person, type = "response")
print(prediccion)
## 1 2
## 0.005879631 0.011476861
Es decir 1. Si la persona es self-employed, la probabilidad de que le acepten la solicitud de tarjeta de credito es de 0.559% 2. Si la persona no es self-employed, la probabilidad de que le acepten la solicitud de tarjeta de credito es de 1.14%
Una vez mas considermos dos escenarios, 1. La persona es self-employed 2. La persnoa no es self-employed
income = c(2,2)
owner= c("no", "no")
reports= c(5,5)
dependents = c(1,1)
selfemp= c("yes", "no")
info_person_2 <- data.frame(
income = income,
owner = owner,
reports = reports,
dependents = dependents,
selfemp= selfemp
)
prediccion_2 <- predict(model, newdata = info_person_2, type = "response")
print(prediccion_2)
## 1 2
## 0.002051114 0.004018445
Es decir 1. Si la persona es self-employed, la probabilidad de que le acepten la solicitud de tarjeta de credito es de 0.205% 2. Si la persona no es self-employed, la probabilidad de que le acepten la solicitud de tarjeta de credito es de 0.401%
data("ShipAccidents")
head(ShipAccidents)
## type construction operation service incidents
## 1 A 1960-64 1960-74 127 0
## 2 A 1960-64 1975-79 63 0
## 3 A 1965-69 1960-74 1095 3
## 4 A 1965-69 1975-79 1095 4
## 5 A 1970-74 1960-74 1512 6
## 6 A 1970-74 1975-79 3353 18
model2 <- glm(incidents ~ type + construction + operation +service, data= ShipAccidents,family =poisson )
summary(model2)
##
## Call:
## glm(formula = incidents ~ type + construction + operation + service,
## family = poisson, data = ShipAccidents)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 5.492e-04 2.787e-01 0.002 0.998427
## typeB 5.933e-01 2.163e-01 2.743 0.006092 **
## typeC -1.190e+00 3.275e-01 -3.635 0.000278 ***
## typeD -8.210e-01 2.877e-01 -2.854 0.004321 **
## typeE -2.900e-01 2.351e-01 -1.233 0.217466
## construction1965-69 1.148e+00 1.793e-01 6.403 1.53e-10 ***
## construction1970-74 1.596e+00 2.242e-01 7.122 1.06e-12 ***
## construction1975-79 5.670e-01 2.809e-01 2.018 0.043557 *
## operation1975-79 8.619e-01 1.317e-01 6.546 5.92e-11 ***
## service 7.270e-05 8.488e-06 8.565 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 730.253 on 39 degrees of freedom
## Residual deviance: 99.793 on 30 degrees of freedom
## AIC: 217.66
##
## Number of Fisher Scoring iterations: 5
(exp(0.0000727)-1)*100
## [1] 0.007270264
Por cada unidad adicional de servicio el número esperado de incidentes aumenta %0.007270264
(exp(8.619e-01)-1)*100
## [1] 136.7655
Cada vez que de la operacion es 1975-79 el número esperado de incidentes aumenta %136.7655
summary(model2)
##
## Call:
## glm(formula = incidents ~ type + construction + operation + service,
## family = poisson, data = ShipAccidents)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 5.492e-04 2.787e-01 0.002 0.998427
## typeB 5.933e-01 2.163e-01 2.743 0.006092 **
## typeC -1.190e+00 3.275e-01 -3.635 0.000278 ***
## typeD -8.210e-01 2.877e-01 -2.854 0.004321 **
## typeE -2.900e-01 2.351e-01 -1.233 0.217466
## construction1965-69 1.148e+00 1.793e-01 6.403 1.53e-10 ***
## construction1970-74 1.596e+00 2.242e-01 7.122 1.06e-12 ***
## construction1975-79 5.670e-01 2.809e-01 2.018 0.043557 *
## operation1975-79 8.619e-01 1.317e-01 6.546 5.92e-11 ***
## service 7.270e-05 8.488e-06 8.565 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 730.253 on 39 degrees of freedom
## Residual deviance: 99.793 on 30 degrees of freedom
## AIC: 217.66
##
## Number of Fisher Scoring iterations: 5
dispersiontest(model2)
##
## Overdispersion test
##
## data: model2
## z = 3.3419, p-value = 0.000416
## alternative hypothesis: true dispersion is greater than 1
## sample estimates:
## dispersion
## 2.372137
Esto indica que la varianza de los incidentes es aproximadamente 2.37 veces mayor que la media esperada y este valor es significativo.
Para el ejercicio del punto anterior, ajusta un modelo de regresión Poisson inflado en cero, utilizando solo las variables explicativas necesarias. Escriba la fórmula de ambas ecuación. ¿Cuál es la probabilidad de no sufrir ningún accidente para un barco tipo B, de construcción 1960-64, operación 1960-74 y con Service = 1000?
library(pscl)
## Classes and Methods for R originally developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University (2002-2015),
## by and under the direction of Simon Jackman.
## hurdle and zeroinfl functions by Achim Zeileis.
hist(ShipAccidents$incidents)
model3 <- zeroinfl(
incidents ~ type + construction + operation|type + construction + operation,
data = ShipAccidents,
dist = "poisson"
)
summary(model3)
##
## Call:
## zeroinfl(formula = incidents ~ type + construction + operation | type +
## construction + operation, data = ShipAccidents, dist = "poisson")
##
## Pearson residuals:
## Min 1Q Median 3Q Max
## -3.4515 -0.7053 -0.3227 0.1853 3.8935
##
## Count model coefficients (poisson with log link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.87367 0.21785 8.601 < 2e-16 ***
## typeB 1.48866 0.17083 8.714 < 2e-16 ***
## typeC -1.62079 0.32947 -4.919 8.68e-07 ***
## typeD -0.21886 0.29408 -0.744 0.45674
## typeE -0.27617 0.23595 -1.170 0.24181
## construction1965-69 0.31988 0.15244 2.098 0.03588 *
## construction1970-74 0.08175 0.15817 0.517 0.60527
## construction1975-79 -0.59599 0.21833 -2.730 0.00634 **
## operation1975-79 0.27915 0.11234 2.485 0.01296 *
##
## Zero-inflation model coefficients (binomial with logit link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.8818 1.8028 1.599 0.1099
## typeB -2.5707 1.8500 -1.390 0.1647
## typeC -15.6831 682.3506 -0.023 0.9817
## typeD 2.6843 1.9192 1.399 0.1619
## typeE 0.5809 1.4895 0.390 0.6965
## construction1965-69 -3.8635 1.9111 -2.022 0.0432 *
## construction1970-74 -21.3676 2355.0486 -0.009 0.9928
## construction1975-79 -1.4496 1.4883 -0.974 0.3300
## operation1975-79 -2.3143 1.4093 -1.642 0.1006
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Number of iterations in BFGS optimization: 38
## Log-likelihood: -98 on 18 Df
Las ecuaciones del problema serian: \[ \begin{aligned} \log(\hat{\lambda}) &= 1.87367 + 1.48866\cdot \text{typeB} - 1.62079 \cdot \text{typeC} - 0.21886\cdot \text{typeD} - 0.27617\cdot \text{typeE} \\ &\quad + 0.31988\cdot \text{construction1965-69} + 0.08175\cdot \text{construction1970-74} \\ &\quad - 0.59599 \cdot \text{construction1975-79} + 0.27915\cdot \text{operation1975-79} \end{aligned} \]
\[ \begin{aligned} \log\left(\frac{\hat{\pi}}{1-\hat{\pi}}\right) &= 2.8818 - 2.5707\cdot \text{typeB} - 15.6831\cdot \text{typeC} + 2.6843\cdot \text{typeD} + 0.5809\cdot \text{typeE} \\ &\quad - 3.8635\cdot \text{construction65-69} - 21.3676\cdot \text{construction70-74} \\ &\quad - 1.4496\cdot \text{construction75-79} - 2.3143\cdot \text{operation75-79} \end{aligned} \] Dejamos por fuera la variable service, ya que no logramos obtener un modelo que logre dar resultados por esta variable:
El error de la consola
modelo_fallido <- zeroinfl(
incidents ~ type + construction + operation + service |type + construction + operation,
data = ShipAccidents,
dist = "poisson"
)
## Warning in value[[3L]](cond): system is computationally singular: reciprocal
## condition number = 5.03099e-36FALSE
summary(modelo_fallido)
##
## Call:
## zeroinfl(formula = incidents ~ type + construction + operation + service |
## type + construction + operation, data = ShipAccidents, dist = "poisson")
##
## Pearson residuals:
## Min 1Q Median 3Q Max
## -1.5665 -0.6356 -0.2333 0.5411 3.6577
##
## Count model coefficients (poisson with log link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 7.760e-01 NA NA NA
## typeB 7.274e-01 NA NA NA
## typeC -1.414e+00 NA NA NA
## typeD -3.786e-01 NA NA NA
## typeE -2.270e-01 NA NA NA
## construction1965-69 8.134e-01 NA NA NA
## construction1970-74 1.008e+00 NA NA NA
## construction1975-79 3.064e-01 NA NA NA
## operation1975-79 6.628e-01 NA NA NA
## service 5.223e-05 NA NA NA
##
## Zero-inflation model coefficients (binomial with logit link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.8030 NA NA NA
## typeB -2.5313 NA NA NA
## typeC -16.9764 NA NA NA
## typeD 2.6607 NA NA NA
## typeE 0.5997 NA NA NA
## construction1965-69 -3.8064 NA NA NA
## construction1970-74 -21.3676 NA NA NA
## construction1975-79 -1.4066 NA NA NA
## operation1975-79 -2.2740 NA NA NA
##
## Number of iterations in BFGS optimization: 40
## Log-likelihood: -79.11 on 19 Df
No da valores para los p valores.
¿Cuál es la probabilidad de no sufrir ningún accidente para un barco tipo B, de construcción 1960-64, operación 1960-74 y con Service = 1000?
nuevo_barco <- data.frame(
type = "B",
construction = "1960-64",
operation = "1960-74",
service = 1000
)
predict(
model3,
newdata = nuevo_barco,
type = "prob"
)
## 0 1 2 3 4 5
## 1 0.5771342 3.583224e-12 5.169945e-11 4.972865e-10 3.587473e-09 2.07043e-08
## 6 7 8 9 10 11
## 1 9.957525e-08 4.104832e-07 1.480633e-06 4.747302e-06 1.3699e-05 3.593669e-05
## 12 13 14 15 16 17
## 1 8.641693e-05 0.0001918216 0.000395377 0.0007606103 0.001371779 0.002328505
## 18 19 20 21 22 23
## 1 0.003732903 0.005669374 0.008179883 0.01124009 0.01474311 0.01849708
## 24 25 26 27 28 29 30
## 1 0.02223996 0.0256706 0.0284908 0.03044969 0.031381 0.0312256 0.03003527
## 31 32 33 34 35 36 37
## 1 0.02795837 0.0252118 0.02204611 0.01871091 0.01542655 0.01236541 0.009643811
## 38 39 40 41 42 43
## 1 0.007323304 0.005418567 0.003909007 0.002751215 0.001890241 0.001268501
## 44 45 46 47 48 49
## 1 0.0008319172 0.0005334695 0.0003346523 0.000205465 0.0001235204 7.274188e-05
## 50 51 52 53 54 55
## 1 4.198136e-05 2.375354e-05 1.318157e-05 7.176837e-06 3.835141e-06 2.012151e-06
## 56 57 58
## 1 1.036847e-06 5.249062e-07 2.611534e-07
La probabilidad de que ocurran 0 incidentes es de 57%