set.seed(993)
edad <- abs(round(rnorm(n = 1000,
mean = 67,
sd = 2)))
dap <- abs(round(rnorm(n = 1000,
mean = 30,
sd = 3), 1)) #diámetro a la altura del pecho
hibrido <- factor(rbinom(n = 1000,
size = 1,
prob = 0.6),
labels = c('h1', 'h2'))
rto <- abs(round(rnorm(n = 1000,
mean = 80,
sd = 5), 1)) #Rendimiento
cloA <- abs(round(rnorm(n = 1000,
mean = 320,
sd = 10)))
z <- 0.22 * edad - 0.12 * cloA + dap -8 #Variable artificial
pr <- 1/(1+exp(-z)) # Probabilidad de aborto
y = rbinom(1000,1,pr) # Abortos
abortos <- factor(rbinom(1000, 1, pr),
labels = c('Si', 'No'))
data <- data.frame(edad,
dap,
hibrido,
rto,
cloA,
abortos)
## Modificados
library(faux)
##
## ************
## Welcome to faux. For support and examples visit:
## https://debruine.github.io/faux/
## - Get and set global package options with: faux_options()
## ************
dfa <- rnorm_multi(n = 1000,
mu = c(67, 30, 30, 320),
sd = c(2, 3, 5, 10),
varnames = c('Edad1', 'dap1', 'rto1', 'clolA'),
r = c(0.4, 0.6, 0.5, 0.6, 0.7, 0.8))
dfa$hibrido1 <- round(runif(n = 1000, min = 0,max = 1.2))
w <- 0.5 * dfa$clolA - 0.01 * dfa$dap - 0.6 * dfa$rto - 0.02 * dfa$Edad
dfa$abortos1 <- ifelse(w > 140, 1, 0)
univariable_clolA <- glm(abortos1 ~ clolA, family = binomial, data = dfa)
summary(univariable_clolA)
##
## Call:
## glm(formula = abortos1 ~ clolA, family = binomial, data = dfa)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.53166 -0.61869 0.07893 0.60462 2.64112
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -81.07545 5.11724 -15.84 <2e-16 ***
## clolA 0.25415 0.01603 15.86 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1383.99 on 999 degrees of freedom
## Residual deviance: 797.03 on 998 degrees of freedom
## AIC: 801.03
##
## Number of Fisher Scoring iterations: 6
univariable_dap1 <- glm(abortos1 ~ dap1, family = binomial, data = dfa)
summary(univariable_dap1)
##
## Call:
## glm(formula = abortos1 ~ dap1, family = binomial, data = dfa)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3202 -0.9783 0.4409 0.9761 2.0610
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -10.84041 0.84950 -12.76 <2e-16 ***
## dap1 0.36744 0.02851 12.89 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1384 on 999 degrees of freedom
## Residual deviance: 1159 on 998 degrees of freedom
## AIC: 1163
##
## Number of Fisher Scoring iterations: 4
univariable_edad1 <- glm(abortos1 ~ Edad1, family = binomial, data = dfa)
summary(univariable_edad1)
##
## Call:
## glm(formula = abortos1 ~ Edad1, family = binomial, data = dfa)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5344 -1.1951 0.9266 1.1231 1.5628
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.81875 2.10959 -4.180 2.91e-05 ***
## Edad1 0.13328 0.03153 4.227 2.37e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1384.0 on 999 degrees of freedom
## Residual deviance: 1365.6 on 998 degrees of freedom
## AIC: 1369.6
##
## Number of Fisher Scoring iterations: 4
univariable_rto1 <- glm(abortos1 ~ rto1, family = binomial, data = dfa)
summary(univariable_rto1)
##
## Call:
## glm(formula = abortos1 ~ rto1, family = binomial, data = dfa)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8287 -1.1448 0.7274 1.0823 1.8173
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.23180 0.42657 -7.576 3.56e-14 ***
## rto1 0.11134 0.01413 7.878 3.32e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1384.0 on 999 degrees of freedom
## Residual deviance: 1315.2 on 998 degrees of freedom
## AIC: 1319.2
##
## Number of Fisher Scoring iterations: 4
univariable_h1 <- glm(abortos1 ~ hibrido1, family = binomial, data = dfa)
summary(univariable_h1)
##
## Call:
## glm(formula = abortos1 ~ hibrido1, family = binomial, data = dfa)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.229 -1.212 1.127 1.144 1.144
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.1201 0.1002 1.199 0.230
## hibrido1 -0.0401 0.1293 -0.310 0.756
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1384.0 on 999 degrees of freedom
## Residual deviance: 1383.9 on 998 degrees of freedom
## AIC: 1387.9
##
## Number of Fisher Scoring iterations: 3
mod1 <- glm(abortos1 ~ dfa$Edad1 + dfa$dap1 + dfa$rto1 + dfa$clolA + dfa$hibrido1, data = dfa)
summary(mod1)
##
## Call:
## glm(formula = abortos1 ~ dfa$Edad1 + dfa$dap1 + dfa$rto1 + dfa$clolA +
## dfa$hibrido1, data = dfa)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.84987 -0.26673 -0.00386 0.26092 0.78326
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -15.916822 0.569729 -27.938 <2e-16 ***
## dfa$Edad1 -0.010902 0.005810 -1.876 0.0609 .
## dfa$dap1 0.003374 0.004496 0.750 0.4531
## dfa$rto1 -0.071728 0.003500 -20.491 <2e-16 ***
## dfa$clolA 0.060121 0.001772 33.923 <2e-16 ***
## dfa$hibrido1 -0.014645 0.019649 -0.745 0.4562
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.0924694)
##
## Null deviance: 249.424 on 999 degrees of freedom
## Residual deviance: 91.915 on 994 degrees of freedom
## AIC: 464.98
##
## Number of Fisher Scoring iterations: 2
mod2 <- glm(abortos1 ~ dfa$Edad1 + dfa$dap1 + dfa$rto1 + dfa$clolA, data = dfa)
summary(mod2)
##
## Call:
## glm(formula = abortos1 ~ dfa$Edad1 + dfa$dap1 + dfa$rto1 + dfa$clolA,
## data = dfa)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.8548 -0.2653 -0.0036 0.2678 0.7919
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -15.936103 0.569014 -28.007 <2e-16 ***
## dfa$Edad1 -0.010711 0.005803 -1.846 0.0652 .
## dfa$dap1 0.003342 0.004494 0.744 0.4573
## dfa$rto1 -0.071749 0.003500 -20.502 <2e-16 ***
## dfa$clolA 0.060118 0.001772 33.929 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.09242809)
##
## Null deviance: 249.424 on 999 degrees of freedom
## Residual deviance: 91.966 on 995 degrees of freedom
## AIC: 463.54
##
## Number of Fisher Scoring iterations: 2
mod3 <- glm(abortos1 ~ dfa$Edad1 + dfa$rto1 + dfa$clolA, data = dfa)
summary(mod3)
##
## Call:
## glm(formula = abortos1 ~ dfa$Edad1 + dfa$rto1 + dfa$clolA, data = dfa)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.85662 -0.26769 -0.00415 0.26760 0.79602
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -16.062917 0.542733 -29.596 <2e-16 ***
## dfa$Edad1 -0.010386 0.005785 -1.795 0.0729 .
## dfa$rto1 -0.071628 0.003495 -20.495 <2e-16 ***
## dfa$clolA 0.060748 0.001557 39.028 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.0923866)
##
## Null deviance: 249.424 on 999 degrees of freedom
## Residual deviance: 92.017 on 996 degrees of freedom
## AIC: 462.1
##
## Number of Fisher Scoring iterations: 2
del_coef1 <- abs((coef(mod2)-coef(mod1)[-6]))
round(del_coef1, 3)
## (Intercept) dfa$Edad1 dfa$dap1 dfa$rto1 dfa$clolA
## 0.019 0.000 0.000 0.000 0.000
del_coef2 <- abs((coef(mod3)-coef(mod2)[-3]))
round(del_coef2, 3)
## (Intercept) dfa$Edad1 dfa$rto1 dfa$clolA
## 0.127 0.000 0.000 0.001
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
lrtest(mod3, mod2)
## Likelihood ratio test
##
## Model 1: abortos1 ~ dfa$Edad1 + dfa$rto1 + dfa$clolA
## Model 2: abortos1 ~ dfa$Edad1 + dfa$dap1 + dfa$rto1 + dfa$clolA
## #Df LogLik Df Chisq Pr(>Chisq)
## 1 5 -226.05
## 2 6 -225.77 1 0.5555 0.4561
anova(mod3, mod2, test = 'Chisq')
## Analysis of Deviance Table
##
## Model 1: abortos1 ~ dfa$Edad1 + dfa$rto1 + dfa$clolA
## Model 2: abortos1 ~ dfa$Edad1 + dfa$dap1 + dfa$rto1 + dfa$clolA
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 996 92.017
## 2 995 91.966 1 0.051105 0.4571
pred <- fitted.values(mod3)
par(mfrow = c(2, 3))
scatter.smooth(dfa$rto1, pred,cex = 0.5)
scatter.smooth(dfa$clolA, pred, cex = 0.5)
scatter.smooth(dfa$dap1, pred, cex = 0.5)
model_inter1 <- glm(abortos1 ~ dfa$dap1 + dfa$rto1 + dfa$Edad1 + dfa$dap1:dfa$rto1,family = 'binomial' , data = dfa)
summary(model_inter1)
##
## Call:
## glm(formula = abortos1 ~ dfa$dap1 + dfa$rto1 + dfa$Edad1 + dfa$dap1:dfa$rto1,
## family = "binomial", data = dfa)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3150 -0.9896 0.4335 0.9655 2.0857
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.332609 5.569535 -1.496 0.13463
## dfa$dap1 0.452018 0.166482 2.715 0.00663 **
## dfa$rto1 0.071292 0.162367 0.439 0.66061
## dfa$Edad1 -0.077380 0.043124 -1.794 0.07275 .
## dfa$dap1:dfa$rto1 -0.002210 0.005401 -0.409 0.68234
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1384 on 999 degrees of freedom
## Residual deviance: 1155 on 995 degrees of freedom
## AIC: 1165
##
## Number of Fisher Scoring iterations: 4
model_inter2 <- glm(abortos1 ~ dfa$dap1 + dfa$rto1 + dfa$Edad1 + dfa$dap1:dfa$Edad1,family = 'binomial',data = dfa)
summary(model_inter2)
##
## Call:
## glm(formula = abortos1 ~ dfa$dap1 + dfa$rto1 + dfa$Edad1 + dfa$dap1:dfa$Edad1,
## family = "binomial", data = dfa)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3422 -0.9867 0.4248 0.9730 2.0253
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.844415 27.342977 0.104 0.917
## dfa$dap1 0.077855 0.913870 0.085 0.932
## dfa$rto1 0.005202 0.020488 0.254 0.800
## dfa$Edad1 -0.215136 0.409552 -0.525 0.599
## dfa$dap1:dfa$Edad1 0.004601 0.013662 0.337 0.736
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1384.0 on 999 degrees of freedom
## Residual deviance: 1155.1 on 995 degrees of freedom
## AIC: 1165.1
##
## Number of Fisher Scoring iterations: 4
*El modelo 2 indica que la interacción dap:edad tampoco tiene efecto sobre abortos florales
model_inter3 <- glm(abortos1 ~ dfa$dap1 + dfa$rto1 + dfa$Edad1 + dfa$rto1:dfa$Edad1,family = 'binomial',data = dfa)
summary(model_inter3)
##
## Call:
## glm(formula = abortos1 ~ dfa$dap1 + dfa$rto1 + dfa$Edad1 + dfa$rto1:dfa$Edad1,
## family = "binomial", data = dfa)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3230 -0.9846 0.4244 0.9744 2.0171
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.229090 13.107060 0.017 0.986
## dfa$dap1 0.384990 0.034408 11.189 <2e-16 ***
## dfa$rto1 -0.213651 0.430020 -0.497 0.619
## dfa$Edad1 -0.175959 0.196887 -0.894 0.371
## dfa$rto1:dfa$Edad1 0.003274 0.006421 0.510 0.610
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1384 on 999 degrees of freedom
## Residual deviance: 1155 on 995 degrees of freedom
## AIC: 1165
##
## Number of Fisher Scoring iterations: 4
*El modelo 3 indica que la interacción rto:edad tampoco tiene efecto sobre abortos florales
model_inter4 <- glm(abortos1 ~ dfa$Edad1 + dfa$rto1 + dfa$dap1 + dfa$Edad1:dfa$rto1 + dfa$Edad1:dfa$dap1 + dfa$rto1:dfa$dap1, family = 'binomial', data = dfa)
summary(model_inter4)
##
## Call:
## glm(formula = abortos1 ~ dfa$Edad1 + dfa$rto1 + dfa$dap1 + dfa$Edad1:dfa$rto1 +
## dfa$Edad1:dfa$dap1 + dfa$rto1:dfa$dap1, family = "binomial",
## data = dfa)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3180 -0.9828 0.4526 0.9713 2.0613
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 15.575968 31.552157 0.494 0.622
## dfa$Edad1 -0.499783 0.525355 -0.951 0.341
## dfa$rto1 -0.155055 0.554442 -0.280 0.780
## dfa$dap1 0.021989 1.217443 0.018 0.986
## dfa$Edad1:dfa$rto1 0.005543 0.008610 0.644 0.520
## dfa$Edad1:dfa$dap1 0.008609 0.019369 0.444 0.657
## dfa$rto1:dfa$dap1 -0.007074 0.007257 -0.975 0.330
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1384 on 999 degrees of freedom
## Residual deviance: 1154 on 993 degrees of freedom
## AIC: 1168
##
## Number of Fisher Scoring iterations: 4
library(lmtest)
lrtest(model_inter4, model_inter1)
## Likelihood ratio test
##
## Model 1: abortos1 ~ dfa$Edad1 + dfa$rto1 + dfa$dap1 + dfa$Edad1:dfa$rto1 +
## dfa$Edad1:dfa$dap1 + dfa$rto1:dfa$dap1
## Model 2: abortos1 ~ dfa$dap1 + dfa$rto1 + dfa$Edad1 + dfa$dap1:dfa$rto1
## #Df LogLik Df Chisq Pr(>Chisq)
## 1 7 -577.01
## 2 5 -577.52 -2 1.0313 0.5971
lrtest(model_inter4, model_inter2)
## Likelihood ratio test
##
## Model 1: abortos1 ~ dfa$Edad1 + dfa$rto1 + dfa$dap1 + dfa$Edad1:dfa$rto1 +
## dfa$Edad1:dfa$dap1 + dfa$rto1:dfa$dap1
## Model 2: abortos1 ~ dfa$dap1 + dfa$rto1 + dfa$Edad1 + dfa$dap1:dfa$Edad1
## #Df LogLik Df Chisq Pr(>Chisq)
## 1 7 -577.01
## 2 5 -577.55 -2 1.085 0.5813
lrtest(model_inter4, model_inter3)
## Likelihood ratio test
##
## Model 1: abortos1 ~ dfa$Edad1 + dfa$rto1 + dfa$dap1 + dfa$Edad1:dfa$rto1 +
## dfa$Edad1:dfa$dap1 + dfa$rto1:dfa$dap1
## Model 2: abortos1 ~ dfa$dap1 + dfa$rto1 + dfa$Edad1 + dfa$rto1:dfa$Edad1
## #Df LogLik Df Chisq Pr(>Chisq)
## 1 7 -577.01
## 2 5 -577.48 -2 0.9376 0.6258
*La interacción en el modelo no es importante entonces tomamos el modelo sencillo
library(lmtest)
library(magrittr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
rtaB <- ifelse(mod3$fitted.values>1, 1, ifelse(mod3$fitted.values<0,0,mod3$fitted.values))
prop_ab <- rtaB*100
cat_Edad1 <- cut(dfa$Edad1,breaks = 4)
cat_clolA <- cut(dfa$clolA,breaks=4)
data_2 <- data.frame(cat_Edad1, cat_clolA, prop_ab)
tips2 <- data_2 %>%
group_by(cat_clolA, cat_Edad1) %>%
summarise(media_prop_abortos = mean(prop_ab))
## `summarise()` has grouped output by 'cat_clolA'. You can override using the
## `.groups` argument.
#Graficando
library(ggplot2)
tips2$tip_groups
## Warning: Unknown or uninitialised column: `tip_groups`.
## NULL
ggplot(data = tips2) +
aes(x = cat_Edad1, y = media_prop_abortos, color = cat_clolA) +
geom_line(aes(group = cat_clolA))
rta2 = rtaB
prop_ab2 <- rta2*100
cat_Edad2 <- cut(dfa$Edad1,breaks = 4)
cat_rto1 <- cut(dfa$rto1,breaks=4)
data_3 <- data.frame(cat_Edad2, cat_rto1, prop_ab2)
tips3 <- data_3 %>%
group_by(cat_rto1, cat_Edad2) %>%
summarise(media_prop_abortos2 = mean(prop_ab2))
## `summarise()` has grouped output by 'cat_rto1'. You can override using the
## `.groups` argument.
#Graficando
library(ggplot2)
tips3$tip_groups
## Warning: Unknown or uninitialised column: `tip_groups`.
## NULL
ggplot(data = tips3) +
aes(x = cat_Edad2, y = media_prop_abortos2, color = cat_rto1) +
geom_line(aes(group = cat_rto1))
rta3 = rtaB
prop_ab3 <- rta3*100
cat_clolA3 <- cut(dfa$clolA,breaks = 4)
cat_rto2 <- cut(dfa$rto1,breaks=4)
data_4 <- data.frame(cat_clolA3, cat_rto2, prop_ab3)
tips4 <- data_4 %>%
group_by(cat_rto2, cat_clolA3) %>%
summarise(media_prop_abortos3 = mean(prop_ab3))
## `summarise()` has grouped output by 'cat_rto2'. You can override using the
## `.groups` argument.
#Graficando
library(ggplot2)
tips4$tip_groups
## Warning: Unknown or uninitialised column: `tip_groups`.
## NULL
ggplot(data = tips4) +
aes(x = cat_clolA3, y = media_prop_abortos3, color = cat_rto2) +
geom_line(aes(group = cat_rto2))
library(ResourceSelection)
## ResourceSelection 0.3-5 2019-07-22
cut_prob1 <- ifelse(fitted(mod3) > 0.5, 1, 0)
table(mod3$y, cut_prob1)
## cut_prob1
## 0 1
## 0 476 0
## 1 11 513
hoslem.test(mod3$y, fitted(mod3))
##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: mod3$y, fitted(mod3)
## X-squared = 250.86, df = 8, p-value < 2.2e-16
Predprob1<-predict(mod3,type="response")
plot(Predprob1,jitter(as.numeric(dfa$abortos1),0.5), cex=0.5, ylab="Abortos", xlim = c(0,1), xlab = 'Predicción de probabilidad de abortos')
abline(v = 0.5, col = 'red')
text(x = 0.8, y = 0.8, 'alta probabilidad de abortos, \n predicha y observada')
text(x = 0.2, y = 0.2, 'alta probabilidad de no abortos, \n predicha y observada')
*Este gráfico confirma que el modelo es muy bueno, nos indica que la probabilidad de abortar entre los datos predichos y los observados es alta, por lo tanto se concluye que el modelo representa la probabilidad de abortos florales en palmas.