## Homework 7: Generalized Linear Models (GLMs)

a. Poisson model (3 points)

# load data
data <- dvisits %>% filter(hospdays > 0)

# explore the response variable
hist(data$hospdays)

# fit poisson regression model
poisson_model <- glm(hospdays ~ age + sex + illness, data = data, family = poisson)
summary(poisson_model)
## 
## Call:
## glm(formula = hospdays ~ age + sex + illness, family = poisson, 
##     data = data)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  1.412074   0.034064  41.453  < 2e-16 ***
## age          1.738076   0.061077  28.457  < 2e-16 ***
## sex         -0.115799   0.025219  -4.592 4.40e-06 ***
## illness      0.059204   0.007921   7.474 7.76e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 8641.7  on 698  degrees of freedom
## Residual deviance: 7577.5  on 695  degrees of freedom
## AIC: 10060
## 
## Number of Fisher Scoring iterations: 5

b. Negative-Binomial model (3 points)

# fit negative binomial model
negbin_model <- glm.nb(hospdays ~ age + sex + illness, data = data)
summary(negbin_model)
## 
## Call:
## glm.nb(formula = hospdays ~ age + sex + illness, data = data, 
##     init.theta = 1.111056706, link = log)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  1.42968    0.09470  15.096  < 2e-16 ***
## age          1.75183    0.18785   9.325  < 2e-16 ***
## sex         -0.20117    0.07962  -2.527  0.01152 *  
## illness      0.07109    0.02622   2.711  0.00671 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(1.1111) family taken to be 1)
## 
##     Null deviance: 850.19  on 698  degrees of freedom
## Residual deviance: 740.03  on 695  degrees of freedom
## AIC: 4577.9
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  1.1111 
##           Std. Err.:  0.0616 
## 
##  2 x log-likelihood:  -4567.8660

c. Gamma model (3 points)

# fit gamma model
gamma_model <- glm(hospdays ~ age + sex + illness, data = data, family = Gamma(link = "log"))
summary(gamma_model)
## 
## Call:
## glm(formula = hospdays ~ age + sex + illness, family = Gamma(link = "log"), 
##     data = data)
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.43080    0.13509  10.592  < 2e-16 ***
## age          1.75668    0.27138   6.473 1.81e-10 ***
## sex         -0.20901    0.11476  -1.821   0.0690 .  
## illness      0.07159    0.03808   1.880   0.0606 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Gamma family taken to be 2.115837)
## 
##     Null deviance: 902.32  on 698  degrees of freedom
## Residual deviance: 791.16  on 695  degrees of freedom
## AIC: 4510.7
## 
## Number of Fisher Scoring iterations: 7

d. Best model (1 point)

# compare AIC across models
AIC(poisson_model, negbin_model, gamma_model)
##               df       AIC
## poisson_model  4 10059.539
## negbin_model   5  4577.866
## gamma_model    5  4510.712
# compare BIC across models
BIC(poisson_model, negbin_model, gamma_model)
##               df       BIC
## poisson_model  4 10077.738
## negbin_model   5  4600.614
## gamma_model    5  4533.461

The Gamma model has the lowest AIC/BIC, indicating it is the best model and suggesting that a continuous distribution fits better.