Práctica

#Importar los datos:
bat.df <- read.table('http://www.uib.no/People/nzlkj/statkey/data/bat.csv',header=T)
attach(bat.df)
head(bat.df,15)

##    occurrence temp
## 1           0  8.0
## 2           0  8.5
## 3           0  9.0
## 4           0  9.0
## 5           0  9.0
## 6           0  9.0
## 7           0  9.5
## 8           0  9.5
## 9           0  9.5
## 10          0 10.0
## 11          0 10.0
## 12          0 10.0
## 13          0 10.0
## 14          0 10.0
## 15          0 10.5

#Ejecutar el modelo:
bat.glm <- glm(occurrence~temp, binomial, data=bat.df)
summary(bat.glm)

## 
## Call:
## glm(formula = occurrence ~ temp, family = binomial, data = bat.df)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.60282  -0.78282  -0.01836   0.59162   1.63226  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -28.3926     4.1007  -6.924 4.40e-12 ***
## temp          2.1893     0.3163   6.921 4.49e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 253.25  on 182  degrees of freedom
## Residual deviance: 144.48  on 181  degrees of freedom
## AIC: 148.48
## 
## Number of Fisher Scoring iterations: 6

#Observe que si hay diferencias sifnificativas.
anova(bat.glm, test='Chisq')

## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: occurrence
## 
## Terms added sequentially (first to last)
## 
## 
##      Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                   182     253.25              
## temp  1   108.77       181     144.47 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#Modelos nulos,
bat.glm1 <- glm(occurrence~1, binomial) #Estimación del intercepto mediante un efecto aleatorio.
summary(bat.glm1, test="Chisq")

## 
## Call:
## glm(formula = occurrence ~ 1, family = binomial)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.136  -1.136  -1.136   1.219   1.219  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.09844    0.14802  -0.665    0.506
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 253.25  on 182  degrees of freedom
## Residual deviance: 253.25  on 182  degrees of freedom
## AIC: 255.25
## 
## Number of Fisher Scoring iterations: 3

anova(bat.glm,bat.glm1, test='Chi')

## Analysis of Deviance Table
## 
## Model 1: occurrence ~ temp
## Model 2: occurrence ~ 1
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1       181     144.47                          
## 2       182     253.25 -1  -108.77 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

bat.glm2 <- glm(occurrence~1 +temp, binomial)#Estimación del intercepto mediante un efecto aleatorio, mas la variable independiente
summary(bat.glm2)

## 
## Call:
## glm(formula = occurrence ~ 1 + temp, family = binomial)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.60282  -0.78282  -0.01836   0.59162   1.63226  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -28.3926     4.1007  -6.924 4.40e-12 ***
## temp          2.1893     0.3163   6.921 4.49e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 253.25  on 182  degrees of freedom
## Residual deviance: 144.48  on 181  degrees of freedom
## AIC: 148.48
## 
## Number of Fisher Scoring iterations: 6

#Comparamos los modelos
anova(bat.glm,bat.glm2, test='Chis')

## Analysis of Deviance Table
## 
## Model 1: occurrence ~ temp
## Model 2: occurrence ~ 1 + temp
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1       181     144.47                     
## 2       181     144.47  0        0

anova(bat.glm2, test='Chis')

## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: occurrence
## 
## Terms added sequentially (first to last)
## 
## 
##      Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                   182     253.25              
## temp  1   108.77       181     144.47 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#Mostrar el efecto de forma gráfica
library(effects)
plot(allEffects(bat.glm))

library(popbio)
logi.hist.plot(temp,occurrence,boxp=F,type="hist",col="gray", xlab="temp")

# o sin uso de paquetes
plot(temp, occurrence)
tempvals <- seq(8,15.1, .1)
lines(tempvals, predict(bat.glm, list(temp=tempvals), type='response'))

Práctica

#Desarrolle modelos de glm
set.seed(2016)
bodysize<-rnorm(20,30,5) # genera 20 valores, con media of 30 & s.d.=2
set.seed(4)
age<-rnorm(20,5,1.5)
bodysize<-sort(bodysize) # ordena de forma ascendente. 
survive<-c(0,0,0,0,0,1,0,1,0,0,1,1,0,1,1,1,0,1,1,1) # el vector sobrevivenvia, se supone que la mortalidad ocurre en individuos de bodysize menores
dat<-as.data.frame(cbind(bodysize,survive,age)) # crea un data frame.
head(dat, 8)

##   bodysize survive      age
## 1 16.04265       0 5.325132
## 2 25.42629       0 4.186261
## 3 25.47205       0 6.336717
## 4 25.61454       0 5.893971
## 5 26.18246       0 7.453427
## 6 26.49067       1 6.033913
## 7 26.50730       0 3.078130
## 8 26.57495       1 4.680283

library(popbio)
logi.hist.plot(age,survive,boxp=F,type="hist",col="gray", xlab="bodysize")

#Modelo glm
glm.s<-glm(survive~bodysize,family=binomial,dat)
summary(glm.s)

## 
## Call:
## glm(formula = survive ~ bodysize, family = binomial, data = dat)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.96476  -0.82351   0.08846   0.88483   1.54749  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)  -8.9608     4.6317  -1.935   0.0530 .
## bodysize      0.3066     0.1583   1.937   0.0527 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 27.726  on 19  degrees of freedom
## Residual deviance: 21.656  on 18  degrees of freedom
## AIC: 25.656
## 
## Number of Fisher Scoring iterations: 5

library(effects)
plot(allEffects(glm.s))

#Genere un modelo que incluya la edad "age" y compare contra todos los modelos que puede ejeuctar.
#Piense, en mode,os simples, multiples, de efectos aleatorios

glm.s2<-glm(survive~bodysize+age,family=binomial,dat)
summary(glm.s2)

## 
## Call:
## glm(formula = survive ~ bodysize + age, family = binomial, data = dat)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.59619  -0.59853   0.02848   0.76485   1.70778  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)  -4.9583     5.3666  -0.924   0.3555  
## bodysize      0.3066     0.1675   1.830   0.0672 .
## age          -0.7220     0.5045  -1.431   0.1524  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 27.726  on 19  degrees of freedom
## Residual deviance: 19.171  on 17  degrees of freedom
## AIC: 25.171
## 
## Number of Fisher Scoring iterations: 5

library(effects)
plot(allEffects(glm.s2))

summary(glm.s2)

## 
## Call:
## glm(formula = survive ~ bodysize + age, family = binomial, data = dat)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.59619  -0.59853   0.02848   0.76485   1.70778  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)  -4.9583     5.3666  -0.924   0.3555  
## bodysize      0.3066     0.1675   1.830   0.0672 .
## age          -0.7220     0.5045  -1.431   0.1524  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 27.726  on 19  degrees of freedom
## Residual deviance: 19.171  on 17  degrees of freedom
## AIC: 25.171
## 
## Number of Fisher Scoring iterations: 5

#Analice su conclusión
#Comparación de modelos
anova(glm.s,glm.s2, test="Chisq")

## Analysis of Deviance Table
## 
## Model 1: survive ~ bodysize
## Model 2: survive ~ bodysize + age
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1        18     21.655                     
## 2        17     19.171  1   2.4846    0.115

glm.s3<-glm(survive~1 +bodysize,family=binomial,dat)
library(effects)
plot(allEffects(glm.s3))

summary(glm.s3)

## 
## Call:
## glm(formula = survive ~ 1 + bodysize, family = binomial, data = dat)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.96476  -0.82351   0.08846   0.88483   1.54749  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)  -8.9608     4.6317  -1.935   0.0530 .
## bodysize      0.3066     0.1583   1.937   0.0527 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 27.726  on 19  degrees of freedom
## Residual deviance: 21.656  on 18  degrees of freedom
## AIC: 25.656
## 
## Number of Fisher Scoring iterations: 5

anova(glm.s, glm.s3, test="Chisq")

## Analysis of Deviance Table
## 
## Model 1: survive ~ bodysize
## Model 2: survive ~ 1 + bodysize
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1        18     21.655                     
## 2        18     21.655  0        0

seeds.df <-read.table('http://www.uib.no/People/nzlkj/statkey/data/seeds.csv',header=T)
attach(seeds.df)
head(seeds.df)

##   Hamster Seed.type Not.eaten Eaten
## 1       1         B        48     2
## 2       2         B        46     4
## 3       3         A        41     9
## 4       4         B        46     4
## 5       5         A        40    10
## 6       6         A        39    11

#Crear plot para cada variable dependiente:
plot(Seed.type, Eaten)

plot(Seed.type, Not.eaten)

#Generar el modelo:
fit1.glm <- glm(cbind(Eaten,Not.eaten)~Seed.type, binomial)
summary(fit1.glm)

## 
## Call:
## glm(formula = cbind(Eaten, Not.eaten) ~ Seed.type, family = binomial)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.13219  -0.98042  -0.07752   0.98500   1.85766  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.0460     0.1075  -9.733  < 2e-16 ***
## Seed.typeB   -2.0221     0.2527  -8.001 1.24e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 117.193  on 17  degrees of freedom
## Residual deviance:  28.802  on 16  degrees of freedom
## AIC: 88.967
## 
## Number of Fisher Scoring iterations: 5

anova(fit1.glm, test='Chisq')

## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: cbind(Eaten, Not.eaten)
## 
## Terms added sequentially (first to last)
## 
## 
##           Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                         17    117.193              
## Seed.type  1   88.391        16     28.802 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

fit2.glm <- glm(cbind(Eaten,Not.eaten)~Seed.type, quasibinomial)
anova(fit2.glm, test='F')

## Analysis of Deviance Table
## 
## Model: quasibinomial, link: logit
## 
## Response: cbind(Eaten, Not.eaten)
## 
## Terms added sequentially (first to last)
## 
## 
##           Df Deviance Resid. Df Resid. Dev     F    Pr(>F)    
## NULL                         17    117.193                    
## Seed.type  1   88.391        16     28.802 56.77 1.196e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

summary(fit2.glm)

## 
## Call:
## glm(formula = cbind(Eaten, Not.eaten) ~ Seed.type, family = quasibinomial)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.13219  -0.98042  -0.07752   0.98500   1.85766  
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -1.0460     0.1341  -7.800 7.69e-07 ***
## Seed.typeB   -2.0221     0.3154  -6.412 8.60e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for quasibinomial family taken to be 1.557005)
## 
##     Null deviance: 117.193  on 17  degrees of freedom
## Residual deviance:  28.802  on 16  degrees of freedom
## AIC: NA
## 
## Number of Fisher Scoring iterations: 5

#Para calcular la probabilidad de depredación para la semilla "A"? 
exp(-1.0460)/(1+exp(0.1341)) #se obtien el resultado me da la probabilidad.

## [1] 0.1639091

#Para calcular la probabilidad de depredación para la semilla "B"? 
exp(-2.0221)/(1+exp(0.3154)) # se obtien el resultado me da la probabilidad de depredación para la semilla "B"

## [1] 0.05583633

No hay necesidad de comparar los modelos, pero si fijarnos en si existe sobredispersión, resultado que se obtiene al hacer un modelo “quasi”

library(effects)
plot(allEffects(fit2.glm))

#Visual gráfica de las predicciones
fit.pre <- predict(fit1.glm, type='response', se.fit=T)
plot(Seed.type, fit.pre$fit, xlab='Seed type', ylab='Probability of predation', ylim=c(0,1))

overdispersion

#En la base de datos se está interesado en saber si el nacimiento de niños con bajo peso <2.5kg (low), depende de la edad (age) y la raza (race) de la madre
library(MASS) 
head(birthwt)

##    low age lwt race smoke ptl ht ui ftv  bwt
## 85   0  19 182    2     0   0  0  1   0 2523
## 86   0  33 155    3     0   0  0  0   3 2551
## 87   0  20 105    1     1   0  0  0   1 2557
## 88   0  21 108    1     1   0  0  1   2 2594
## 89   0  18 107    1     1   0  0  1   0 2600
## 91   0  21 124    3     0   0  0  0   0 2622

nacimiento <- glm(low ~ age + race, family = binomial, data = birthwt) 
summary(nacimiento) #Interprete el resulatado.

## 
## Call:
## glm(formula = low ~ age + race, family = binomial, data = birthwt)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.1054  -0.8820  -0.7607   1.3466   1.8182  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.33286    0.86012  -0.387    0.699
## age         -0.04355    0.03219  -1.353    0.176
## race         0.28600    0.17413   1.642    0.101
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 234.67  on 188  degrees of freedom
## Residual deviance: 229.21  on 186  degrees of freedom
## AIC: 235.21
## 
## Number of Fisher Scoring iterations: 4

#Para saber si mi modelo presenta sobredispersión, se puede aplicar la siguiente función.
phi <- sum(resid(nacimiento, type = "pearson")^2) / df.residual(nacimiento) 
phi

## [1] 1.011612

#Lo que hay que buscar que es que el valor no sobrepase 1.2. Para modelos con pocos datos es preferible que no pase de 1.5, para que no exista sobredispersión.
#valores menores a uno no habría problema.
#Este parámetro nos indica cuántas veces mayor es la varianza de la media.
#dispersión fue menor que uno, resulta que la varianza condicional es en realidad menor que la media condicional, por lo que se interpreta que no hay sobredispersión.

#o aplicar un modelo "quasi()".
nacimiento2 <- glm(low ~ age + race, family = quasibinomial, data = birthwt) 
summary(nacimiento2) #Haga lectura de los parámetros de dispersión.

## 
## Call:
## glm(formula = low ~ age + race, family = quasibinomial, data = birthwt)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.1054  -0.8820  -0.7607   1.3466   1.8182  
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.33286    0.86510  -0.385    0.701
## age         -0.04355    0.03237  -1.345    0.180
## race         0.28600    0.17514   1.633    0.104
## 
## (Dispersion parameter for quasibinomial family taken to be 1.011613)
## 
##     Null deviance: 234.67  on 188  degrees of freedom
## Residual deviance: 229.21  on 186  degrees of freedom
## AIC: NA
## 
## Number of Fisher Scoring iterations: 4

nacimiento2$coefficients

## (Intercept)         age        race 
## -0.33285936 -0.04354883  0.28599803

pi.hat<- predict.glm(nacimiento2, data.frame(age=25, race=3), type="response", se.fit=TRUE)
pi.hat

## $fit
##         1 
## 0.3627181 
## 
## $se.fit
##          1 
## 0.06073225 
## 
## $residual.scale
## [1] 1.00579

library(effects)
plot(allEffects(nacimiento2))

fit.pre <- predict(nacimiento2, type='response', se.fit=T)
plot(birthwt$age, fit.pre$fit, xlab='age', ylab='Probabilidad de nacimientos <25', ylim=c(0,1))

fit.pre <- predict(nacimiento2, type='response', se.fit=T)
plot(birthwt$race, fit.pre$fit, xlab='raza', ylab='Probabilidad de nacimientos <25', ylim=c(0,1))

nacimiento3<-glm(low ~age+ smoke, data = birthwt, family = binomial)
summary(nacimiento3)

## 
## Call:
## glm(formula = low ~ age + smoke, family = binomial, data = birthwt)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.1589  -0.8668  -0.7470   1.2821   1.7925  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)  0.06091    0.75732   0.080   0.9359  
## age         -0.04978    0.03197  -1.557   0.1195  
## smoke        0.69185    0.32181   2.150   0.0316 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 234.67  on 188  degrees of freedom
## Residual deviance: 227.28  on 186  degrees of freedom
## AIC: 233.28
## 
## Number of Fisher Scoring iterations: 4

fit.pre <- predict(nacimiento3, type='response', se.fit=T)
plot(birthwt$smoke, fit.pre$fit, xlab='smoke', ylab='Probabilidad de nacimientos <25', ylim=c(0,1))

nacimiento4<-glm(low ~smoke, data = birthwt, family = binomial)
summary(nacimiento4)

## 
## Call:
## glm(formula = low ~ smoke, family = binomial, data = birthwt)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.0197  -0.7623  -0.7623   1.3438   1.6599  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.0871     0.2147  -5.062 4.14e-07 ***
## smoke         0.7041     0.3196   2.203   0.0276 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 234.67  on 188  degrees of freedom
## Residual deviance: 229.80  on 187  degrees of freedom
## AIC: 233.8
## 
## Number of Fisher Scoring iterations: 4

exp(0.7041)/(1+exp(0.3196))#La prob de tener nacimientos de niños con pesos <25 de mujeres fumadoras es de 85.08%

## [1] 0.8508144

exp(-1.0871)/(1+exp(0.2147))#La prob de tener nacimientos de niños con pesos <25, pára  mujeres NO fumadoras es de 15.05%

## [1] 0.1505668

library(effects)
plot(allEffects(nacimiento4))

Observación: Drop1= para diseños no ortogonales, produce resultados (Type III SS). Anova= para comparar modelos, para distribucion de datos binomial, se requiere en la función Test=“Chisq”.

Practica Regresion Logistica

Oscar Ramírez-Alán

20 de mayo de 2016

Práctica

Práctica

No hay necesidad de comparar los modelos, pero si fijarnos en si existe sobredispersión, resultado que se obtiene al hacer un modelo “quasi”

overdispersion