Generalized Linear Models, Part 2

# Load libraries
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.0
## ✔ readr     2.1.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggthemes)
library(ggrepel)
library(broom)
library(lindia)
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:purrr':
## 
##     some
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
data("midwest")

Build a linear (or generalized linear) model as you like : Using binary classification problem (logistic regression)

model <- glm(inmetro ~ popdensity +area, family = binomial, data=midwest)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(model)
## 
## Call:
## glm(formula = inmetro ~ popdensity + area, family = binomial, 
##     data = midwest)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.2928  -0.5013  -0.3476   0.0721   2.3296  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -3.7491502  0.5201275  -7.208 5.67e-13 ***
## popdensity   0.0016608  0.0001845   9.003  < 2e-16 ***
## area         4.1267087 11.5626826   0.357    0.721    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 562.13  on 436  degrees of freedom
## Residual deviance: 283.96  on 434  degrees of freedom
## AIC: 289.96
## 
## Number of Fisher Scoring iterations: 8

Interpret at least one of the coefficients : Extract the coefficients and odds ratios using coef:

coef(model)      # Coefficients, beta
##  (Intercept)   popdensity         area 
## -3.749150167  0.001660791  4.126708675
exp(coef(model)) # Odds ratios
## (Intercept)  popdensity        area 
##  0.02353774  1.00166217 61.97361158

Use the tools from previous weeks to diagnose the model

glm_model1 <- glm(inmetro ~ popdensity, data = midwest, family = binomial(link = 'logit'))
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(glm_model1)
## 
## Call:
## glm(formula = inmetro ~ popdensity, family = binomial(link = "logit"), 
##     data = midwest)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.3060  -0.4998  -0.3455   0.0724   2.3596  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -3.6050145  0.3193561 -11.288   <2e-16 ***
## popdensity   0.0016530  0.0001827   9.048   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 562.13  on 436  degrees of freedom
## Residual deviance: 284.08  on 435  degrees of freedom
## AIC: 288.08
## 
## Number of Fisher Scoring iterations: 8
glm_model2 <- glm(inmetro ~ area, data = midwest, family = binomial(link = 'logit'))
summary(glm_model2)
## 
## Call:
## glm(formula = inmetro ~ area, family = binomial(link = "logit"), 
##     data = midwest)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.0681  -0.9578  -0.8603   1.3996   1.9133  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)  -0.1152     0.2599  -0.443   0.6577  
## area        -16.3883     7.4958  -2.186   0.0288 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 562.13  on 436  degrees of freedom
## Residual deviance: 557.01  on 435  degrees of freedom
## AIC: 561.01
## 
## Number of Fisher Scoring iterations: 4
paste("Model 1 Deviance", round(glm_model1$deviance, 1))
## [1] "Model 1 Deviance 284.1"
paste("Model 2 Deviance", round(glm_model2$deviance, 1))
## [1] "Model 2 Deviance 557"
paste("Model 1 AIC", round(glm_model1$aic, 1))
## [1] "Model 1 AIC 288.1"
paste("Model 2 AIC", round(glm_model2$aic, 1))
## [1] "Model 2 AIC 561"
paste("Model 1 BIC", round(BIC(glm_model1), 2))
## [1] "Model 1 BIC 296.24"
paste("Model 2 BIC", round(BIC(glm_model2), 2))
## [1] "Model 2 BIC 569.17"
# baseline model
model0 <- glm(poptotal ~ 1, data = midwest, 
             family = poisson(link = 'log'))

model1 <- glm(poptotal ~ popdensity, data = midwest, 
             family = poisson(link = 'log'))

model2 <- glm(poptotal ~ popdensity + area, data = midwest, 
             family = poisson(link = 'log'))
# T
anova(model0, model1, model2, test = "Chisq")
## Analysis of Deviance Table
## 
## Model 1: poptotal ~ 1
## Model 2: poptotal ~ popdensity
## Model 3: poptotal ~ popdensity + area
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1       436   96248700                          
## 2       435   26510219  1 69738481 < 2.2e-16 ***
## 3       434   26474771  1    35448 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1