Práctica

#Importar los datos:
bat.df <- read.table('http://www.uib.no/People/nzlkj/statkey/data/bat.csv',header=T)
attach(bat.df)
head(bat.df,15)
##    occurrence temp
## 1           0  8.0
## 2           0  8.5
## 3           0  9.0
## 4           0  9.0
## 5           0  9.0
## 6           0  9.0
## 7           0  9.5
## 8           0  9.5
## 9           0  9.5
## 10          0 10.0
## 11          0 10.0
## 12          0 10.0
## 13          0 10.0
## 14          0 10.0
## 15          0 10.5
#Ejecutar el modelo:
bat.glm <- glm(occurrence~temp, binomial, data=bat.df)
summary(bat.glm)
## 
## Call:
## glm(formula = occurrence ~ temp, family = binomial, data = bat.df)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.60282  -0.78282  -0.01836   0.59162   1.63226  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -28.3926     4.1007  -6.924 4.40e-12 ***
## temp          2.1893     0.3163   6.921 4.49e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 253.25  on 182  degrees of freedom
## Residual deviance: 144.48  on 181  degrees of freedom
## AIC: 148.48
## 
## Number of Fisher Scoring iterations: 6
#Observe que si hay diferencias sifnificativas.
anova(bat.glm, test='Chisq')
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: occurrence
## 
## Terms added sequentially (first to last)
## 
## 
##      Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                   182     253.25              
## temp  1   108.77       181     144.47 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Modelos nulos,
bat.glm1 <- glm(occurrence~1, binomial) #Estimación del intercepto mediante un efecto aleatorio.
summary(bat.glm1, test="Chisq")
## 
## Call:
## glm(formula = occurrence ~ 1, family = binomial)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.136  -1.136  -1.136   1.219   1.219  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.09844    0.14802  -0.665    0.506
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 253.25  on 182  degrees of freedom
## Residual deviance: 253.25  on 182  degrees of freedom
## AIC: 255.25
## 
## Number of Fisher Scoring iterations: 3
anova(bat.glm,bat.glm1, test='Chi')
## Analysis of Deviance Table
## 
## Model 1: occurrence ~ temp
## Model 2: occurrence ~ 1
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1       181     144.47                          
## 2       182     253.25 -1  -108.77 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
bat.glm2 <- glm(occurrence~1 +temp, binomial)#Estimación del intercepto mediante un efecto aleatorio, mas la variable independiente
summary(bat.glm2)
## 
## Call:
## glm(formula = occurrence ~ 1 + temp, family = binomial)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.60282  -0.78282  -0.01836   0.59162   1.63226  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -28.3926     4.1007  -6.924 4.40e-12 ***
## temp          2.1893     0.3163   6.921 4.49e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 253.25  on 182  degrees of freedom
## Residual deviance: 144.48  on 181  degrees of freedom
## AIC: 148.48
## 
## Number of Fisher Scoring iterations: 6
#Comparamos los modelos
anova(bat.glm,bat.glm2, test='Chis')
## Analysis of Deviance Table
## 
## Model 1: occurrence ~ temp
## Model 2: occurrence ~ 1 + temp
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1       181     144.47                     
## 2       181     144.47  0        0
anova(bat.glm2, test='Chis')
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: occurrence
## 
## Terms added sequentially (first to last)
## 
## 
##      Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                   182     253.25              
## temp  1   108.77       181     144.47 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Mostrar el efecto de forma gráfica
library(effects)
plot(allEffects(bat.glm))

library(popbio)
logi.hist.plot(temp,occurrence,boxp=F,type="hist",col="gray", xlab="temp")

# o sin uso de paquetes
plot(temp, occurrence)
tempvals <- seq(8,15.1, .1)
lines(tempvals, predict(bat.glm, list(temp=tempvals), type='response'))

Práctica

#Desarrolle modelos de glm
set.seed(2016)
bodysize<-rnorm(20,30,5) # genera 20 valores, con media of 30 & s.d.=2
set.seed(4)
age<-rnorm(20,5,1.5)
bodysize<-sort(bodysize) # ordena de forma ascendente. 
survive<-c(0,0,0,0,0,1,0,1,0,0,1,1,0,1,1,1,0,1,1,1) # el vector sobrevivenvia, se supone que la mortalidad ocurre en individuos de bodysize menores
dat<-as.data.frame(cbind(bodysize,survive,age)) # crea un data frame.
head(dat, 8)
##   bodysize survive      age
## 1 16.04265       0 5.325132
## 2 25.42629       0 4.186261
## 3 25.47205       0 6.336717
## 4 25.61454       0 5.893971
## 5 26.18246       0 7.453427
## 6 26.49067       1 6.033913
## 7 26.50730       0 3.078130
## 8 26.57495       1 4.680283
library(popbio)
logi.hist.plot(age,survive,boxp=F,type="hist",col="gray", xlab="bodysize")

#Modelo glm
glm.s<-glm(survive~bodysize,family=binomial,dat)
summary(glm.s)
## 
## Call:
## glm(formula = survive ~ bodysize, family = binomial, data = dat)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.96476  -0.82351   0.08846   0.88483   1.54749  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)  -8.9608     4.6317  -1.935   0.0530 .
## bodysize      0.3066     0.1583   1.937   0.0527 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 27.726  on 19  degrees of freedom
## Residual deviance: 21.656  on 18  degrees of freedom
## AIC: 25.656
## 
## Number of Fisher Scoring iterations: 5
library(effects)
plot(allEffects(glm.s))

#Genere un modelo que incluya la edad "age" y compare contra todos los modelos que puede ejeuctar.
#Piense, en mode,os simples, multiples, de efectos aleatorios

glm.s2<-glm(survive~bodysize+age,family=binomial,dat)
summary(glm.s2)
## 
## Call:
## glm(formula = survive ~ bodysize + age, family = binomial, data = dat)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.59619  -0.59853   0.02848   0.76485   1.70778  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)  -4.9583     5.3666  -0.924   0.3555  
## bodysize      0.3066     0.1675   1.830   0.0672 .
## age          -0.7220     0.5045  -1.431   0.1524  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 27.726  on 19  degrees of freedom
## Residual deviance: 19.171  on 17  degrees of freedom
## AIC: 25.171
## 
## Number of Fisher Scoring iterations: 5
library(effects)
plot(allEffects(glm.s2))

summary(glm.s2)
## 
## Call:
## glm(formula = survive ~ bodysize + age, family = binomial, data = dat)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.59619  -0.59853   0.02848   0.76485   1.70778  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)  -4.9583     5.3666  -0.924   0.3555  
## bodysize      0.3066     0.1675   1.830   0.0672 .
## age          -0.7220     0.5045  -1.431   0.1524  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 27.726  on 19  degrees of freedom
## Residual deviance: 19.171  on 17  degrees of freedom
## AIC: 25.171
## 
## Number of Fisher Scoring iterations: 5
#Analice su conclusión
#Comparación de modelos
anova(glm.s,glm.s2, test="Chisq")
## Analysis of Deviance Table
## 
## Model 1: survive ~ bodysize
## Model 2: survive ~ bodysize + age
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1        18     21.655                     
## 2        17     19.171  1   2.4846    0.115
glm.s3<-glm(survive~1 +bodysize,family=binomial,dat)
library(effects)
plot(allEffects(glm.s3))

summary(glm.s3)
## 
## Call:
## glm(formula = survive ~ 1 + bodysize, family = binomial, data = dat)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.96476  -0.82351   0.08846   0.88483   1.54749  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)  -8.9608     4.6317  -1.935   0.0530 .
## bodysize      0.3066     0.1583   1.937   0.0527 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 27.726  on 19  degrees of freedom
## Residual deviance: 21.656  on 18  degrees of freedom
## AIC: 25.656
## 
## Number of Fisher Scoring iterations: 5
anova(glm.s, glm.s3, test="Chisq")
## Analysis of Deviance Table
## 
## Model 1: survive ~ bodysize
## Model 2: survive ~ 1 + bodysize
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1        18     21.655                     
## 2        18     21.655  0        0
seeds.df <-read.table('http://www.uib.no/People/nzlkj/statkey/data/seeds.csv',header=T)
attach(seeds.df)
head(seeds.df)
##   Hamster Seed.type Not.eaten Eaten
## 1       1         B        48     2
## 2       2         B        46     4
## 3       3         A        41     9
## 4       4         B        46     4
## 5       5         A        40    10
## 6       6         A        39    11
#Crear plot para cada variable dependiente:
plot(Seed.type, Eaten)

plot(Seed.type, Not.eaten)

#Generar el modelo:
fit1.glm <- glm(cbind(Eaten,Not.eaten)~Seed.type, binomial)
summary(fit1.glm)
## 
## Call:
## glm(formula = cbind(Eaten, Not.eaten) ~ Seed.type, family = binomial)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.13219  -0.98042  -0.07752   0.98500   1.85766  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.0460     0.1075  -9.733  < 2e-16 ***
## Seed.typeB   -2.0221     0.2527  -8.001 1.24e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 117.193  on 17  degrees of freedom
## Residual deviance:  28.802  on 16  degrees of freedom
## AIC: 88.967
## 
## Number of Fisher Scoring iterations: 5
anova(fit1.glm, test='Chisq')
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: cbind(Eaten, Not.eaten)
## 
## Terms added sequentially (first to last)
## 
## 
##           Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                         17    117.193              
## Seed.type  1   88.391        16     28.802 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
fit2.glm <- glm(cbind(Eaten,Not.eaten)~Seed.type, quasibinomial)
anova(fit2.glm, test='F')
## Analysis of Deviance Table
## 
## Model: quasibinomial, link: logit
## 
## Response: cbind(Eaten, Not.eaten)
## 
## Terms added sequentially (first to last)
## 
## 
##           Df Deviance Resid. Df Resid. Dev     F    Pr(>F)    
## NULL                         17    117.193                    
## Seed.type  1   88.391        16     28.802 56.77 1.196e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(fit2.glm)
## 
## Call:
## glm(formula = cbind(Eaten, Not.eaten) ~ Seed.type, family = quasibinomial)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.13219  -0.98042  -0.07752   0.98500   1.85766  
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -1.0460     0.1341  -7.800 7.69e-07 ***
## Seed.typeB   -2.0221     0.3154  -6.412 8.60e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for quasibinomial family taken to be 1.557005)
## 
##     Null deviance: 117.193  on 17  degrees of freedom
## Residual deviance:  28.802  on 16  degrees of freedom
## AIC: NA
## 
## Number of Fisher Scoring iterations: 5
#Para calcular la probabilidad de depredación para la semilla "A"? 
exp(-1.0460)/(1+exp(0.1341)) #se obtien el resultado me da la probabilidad.
## [1] 0.1639091
#Para calcular la probabilidad de depredación para la semilla "B"? 
exp(-2.0221)/(1+exp(0.3154)) # se obtien el resultado me da la probabilidad de depredación para la semilla "B"
## [1] 0.05583633

No hay necesidad de comparar los modelos, pero si fijarnos en si existe sobredispersión, resultado que se obtiene al hacer un modelo “quasi”

library(effects)
plot(allEffects(fit2.glm))

#Visual gráfica de las predicciones
fit.pre <- predict(fit1.glm, type='response', se.fit=T)
plot(Seed.type, fit.pre$fit, xlab='Seed type', ylab='Probability of predation', ylim=c(0,1))

overdispersion

#En la base de datos se está interesado en saber si el nacimiento de niños con bajo peso <2.5kg (low), depende de la edad (age) y la raza (race) de la madre
library(MASS) 
head(birthwt)
##    low age lwt race smoke ptl ht ui ftv  bwt
## 85   0  19 182    2     0   0  0  1   0 2523
## 86   0  33 155    3     0   0  0  0   3 2551
## 87   0  20 105    1     1   0  0  0   1 2557
## 88   0  21 108    1     1   0  0  1   2 2594
## 89   0  18 107    1     1   0  0  1   0 2600
## 91   0  21 124    3     0   0  0  0   0 2622
nacimiento <- glm(low ~ age + race, family = binomial, data = birthwt) 
summary(nacimiento) #Interprete el resulatado.
## 
## Call:
## glm(formula = low ~ age + race, family = binomial, data = birthwt)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.1054  -0.8820  -0.7607   1.3466   1.8182  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.33286    0.86012  -0.387    0.699
## age         -0.04355    0.03219  -1.353    0.176
## race         0.28600    0.17413   1.642    0.101
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 234.67  on 188  degrees of freedom
## Residual deviance: 229.21  on 186  degrees of freedom
## AIC: 235.21
## 
## Number of Fisher Scoring iterations: 4
#Para saber si mi modelo presenta sobredispersión, se puede aplicar la siguiente función.
phi <- sum(resid(nacimiento, type = "pearson")^2) / df.residual(nacimiento) 
phi
## [1] 1.011612
#Lo que hay que buscar que es que el valor no sobrepase 1.2. Para modelos con pocos datos es preferible que no pase de 1.5, para que no exista sobredispersión.
#valores menores a uno no habría problema.
#Este parámetro nos indica cuántas veces mayor es la varianza de la media.
#dispersión fue menor que uno, resulta que la varianza condicional es en realidad menor que la media condicional, por lo que se interpreta que no hay sobredispersión.
#o aplicar un modelo "quasi()".
nacimiento2 <- glm(low ~ age + race, family = quasibinomial, data = birthwt) 
summary(nacimiento2) #Haga lectura de los parámetros de dispersión.
## 
## Call:
## glm(formula = low ~ age + race, family = quasibinomial, data = birthwt)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.1054  -0.8820  -0.7607   1.3466   1.8182  
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.33286    0.86510  -0.385    0.701
## age         -0.04355    0.03237  -1.345    0.180
## race         0.28600    0.17514   1.633    0.104
## 
## (Dispersion parameter for quasibinomial family taken to be 1.011613)
## 
##     Null deviance: 234.67  on 188  degrees of freedom
## Residual deviance: 229.21  on 186  degrees of freedom
## AIC: NA
## 
## Number of Fisher Scoring iterations: 4
nacimiento2$coefficients 
## (Intercept)         age        race 
## -0.33285936 -0.04354883  0.28599803
pi.hat<- predict.glm(nacimiento2, data.frame(age=25, race=3), type="response", se.fit=TRUE)
pi.hat
## $fit
##         1 
## 0.3627181 
## 
## $se.fit
##          1 
## 0.06073225 
## 
## $residual.scale
## [1] 1.00579
library(effects)
plot(allEffects(nacimiento2))

fit.pre <- predict(nacimiento2, type='response', se.fit=T)
plot(birthwt$age, fit.pre$fit, xlab='age', ylab='Probabilidad de nacimientos <25', ylim=c(0,1))

fit.pre <- predict(nacimiento2, type='response', se.fit=T)
plot(birthwt$race, fit.pre$fit, xlab='raza', ylab='Probabilidad de nacimientos <25', ylim=c(0,1))

nacimiento3<-glm(low ~age+ smoke, data = birthwt, family = binomial)
summary(nacimiento3)
## 
## Call:
## glm(formula = low ~ age + smoke, family = binomial, data = birthwt)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.1589  -0.8668  -0.7470   1.2821   1.7925  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)  0.06091    0.75732   0.080   0.9359  
## age         -0.04978    0.03197  -1.557   0.1195  
## smoke        0.69185    0.32181   2.150   0.0316 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 234.67  on 188  degrees of freedom
## Residual deviance: 227.28  on 186  degrees of freedom
## AIC: 233.28
## 
## Number of Fisher Scoring iterations: 4
fit.pre <- predict(nacimiento3, type='response', se.fit=T)
plot(birthwt$smoke, fit.pre$fit, xlab='smoke', ylab='Probabilidad de nacimientos <25', ylim=c(0,1))

nacimiento4<-glm(low ~smoke, data = birthwt, family = binomial)
summary(nacimiento4)
## 
## Call:
## glm(formula = low ~ smoke, family = binomial, data = birthwt)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.0197  -0.7623  -0.7623   1.3438   1.6599  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.0871     0.2147  -5.062 4.14e-07 ***
## smoke         0.7041     0.3196   2.203   0.0276 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 234.67  on 188  degrees of freedom
## Residual deviance: 229.80  on 187  degrees of freedom
## AIC: 233.8
## 
## Number of Fisher Scoring iterations: 4
exp(0.7041)/(1+exp(0.3196))#La prob de tener nacimientos de niños con pesos <25 de mujeres fumadoras es de 85.08%
## [1] 0.8508144
exp(-1.0871)/(1+exp(0.2147))#La prob de tener nacimientos de niños con pesos <25, pára  mujeres NO fumadoras es de 15.05%
## [1] 0.1505668
library(effects)
plot(allEffects(nacimiento4))

Observación: Drop1= para diseños no ortogonales, produce resultados (Type III SS). Anova= para comparar modelos, para distribucion de datos binomial, se requiere en la función Test=“Chisq”.