library(readxl) # reading excel files
library(skimr) # summary statistics
library(tidyverse) # Pack of useful tools
library(DataExplorer) # Exploratory data analysis
library(MASS) # Negative binomial regression
library(vcd) # Godness of fit parameters
library(car) # Goodness of fit
library(rcompanion) # Goodness of fit
library(popbio) # calculate elasticities

1 Import dataset

dataset <- read_excel("TDM_GZLM_CALMICH_Example.xlsx")
view(dataset)

dataset <- read_excel("TDM_GZLM_CALMICH_Example.xlsx", skip = 5) #skipping the first 5 rows
view(dataset)

2 Explore data

2.1 Summary Statistics

df <- dataset
str(df)
## tibble [84 × 6] (S3: tbl_df/tbl/data.frame)
##  $ STATE   : num [1:84] 0 0 0 0 0 0 0 0 0 0 ...
##  $ ACCIDENT: num [1:84] 0 0 0 0 2 8 2 4 3 12 ...
##  $ AADT1   : num [1:84] 6633 6633 6633 6633 12700 ...
##  $ AADT2   : num [1:84] 180 51 100 51 21 1700 51 701 45 65 ...
##  $ MEDIAN  : num [1:84] 16 16 16 16 0 0 0 0 0 0 ...
##  $ DRIVE   : num [1:84] 1 1 0 1 13 7 15 12 8 14 ...
summary(df)
##      STATE           ACCIDENT          AADT1           AADT2       
##  Min.   :0.0000   Min.   : 0.000   Min.   : 2367   Min.   :  15.0  
##  1st Qu.:0.0000   1st Qu.: 0.000   1st Qu.: 7307   1st Qu.: 101.0  
##  Median :0.0000   Median : 1.000   Median :12050   Median : 348.5  
##  Mean   :0.2857   Mean   : 2.619   Mean   :12870   Mean   : 595.9  
##  3rd Qu.:1.0000   3rd Qu.: 4.000   3rd Qu.:16658   3rd Qu.: 917.5  
##  Max.   :1.0000   Max.   :13.000   Max.   :33058   Max.   :3001.0  
##      MEDIAN           DRIVE       
##  Min.   : 0.000   Min.   : 0.000  
##  1st Qu.: 0.000   1st Qu.: 0.000  
##  Median : 0.000   Median : 1.000  
##  Mean   : 3.798   Mean   : 3.095  
##  3rd Qu.: 6.000   3rd Qu.: 5.250  
##  Max.   :36.000   Max.   :15.000

2.2 Preparing the data

# STATE <- categorical norminal variable

df$STATE <- factor(df$STATE, label = c("California", "Michigan")) #0-Cali, 1-Michigan
table(df$STATE)
## 
## California   Michigan 
##         60         24
skim(df)
Data summary
Name df
Number of rows 84
Number of columns 6
_______________________
Column type frequency:
factor 1
numeric 5
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
STATE 0 1 FALSE 2 Cal: 60, Mic: 24

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
ACCIDENT 0 1 2.62 3.36 0 0.00 1.0 4.00 13 ▇▂▁▁▁
AADT1 0 1 12869.71 6797.85 2367 7307.25 12050.0 16658.50 33058 ▇▇▅▂▁
AADT2 0 1 595.86 679.27 15 101.00 348.5 917.50 3001 ▇▂▁▁▁
MEDIAN 0 1 3.80 6.09 0 0.00 0.0 6.00 36 ▇▁▁▁▁
DRIVE 0 1 3.10 3.90 0 0.00 1.0 5.25 15 ▇▂▂▁▁
plot_histogram(df, ncol = 3)

3 Generalised Linear Models

3.1 Density function

# Let’s start by plotting the density function of the dependent variable
plot(density(df$ACCIDENT), main = "Density estimate of ACCIDENTS")

# As the dependent variable is “count data”, and has discrete values (it is not continuous), then a Poisson distribution should be more adequate.

3.2 Poisson Assumption

# mean 
mean(df$ACCIDENT)
## [1] 2.619048
# variance (phuong sai)
var(df$ACCIDENT)
## [1] 11.29891
var(df$ACCIDENT)/mean(df$ACCIDENT)
## [1] 4.314129
# If the coefficient of variance > 1, you have overdispersion

3.3 Goodness of fit

# Estimate goodness of fit parameters for the PDF (Ham mat do xac suat) of ACCIDENT. This test analyses the equality between the mean and the variance through Poisson Regression Standard against the alternative of the variance exceeding the mean (Negative Binomial).

gf<-goodfit(df$ACCIDENT,  type= "poisson", method= "ML") #Maximum Likelihood method
summary(gf)
## 
##   Goodness-of-fit test for poisson distribution
## 
##                       X^2 df     P(> X^2)
## Likelihood Ratio 155.7106 11 1.012305e-27
# ## HO: The null hypothesis is that it is a Poisson distribution
##   Goodness-of-fit test for poisson distribution
## 
##                       X^2 df     P(> X^2)
## Likelihood Ratio 155.7106 11 1.012305e-27

### Therefore, for it to be a Poisson distribution, the pvalue > 0.05.

3.4 Different models

3.4.1 Poisson model

3.4.1.1 glm Model

model1 = glm(
  ACCIDENT ~ STATE + AADT2 + MEDIAN + DRIVE + offset(log(AADT1)),
  family = poisson(link = "log"),
  data = df,
  method = "glm.fit"
)
summary(model1)
## 
## Call:
## glm(formula = ACCIDENT ~ STATE + AADT2 + MEDIAN + DRIVE + offset(log(AADT1)), 
##     family = poisson(link = "log"), data = df, method = "glm.fit")
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.1238  -1.2665  -0.5184   0.3952   3.7088  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -8.997e+00  1.656e-01 -54.320  < 2e-16 ***
## STATEMichigan -2.000e-01  1.590e-01  -1.258   0.2084    
## AADT2          5.136e-04  7.406e-05   6.935 4.06e-12 ***
## MEDIAN        -5.212e-02  2.147e-02  -2.427   0.0152 *  
## DRIVE          6.552e-02  1.634e-02   4.011 6.04e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 255.07  on 83  degrees of freedom
## Residual deviance: 167.31  on 79  degrees of freedom
## AIC: 339.29
## 
## Number of Fisher Scoring iterations: 5
## What can we say about the residuals and degrees of freedom?
# residuals > degrees of freedom -> overdispersion
# residuals < degrees of freedom -> underdispersion
# residuals = degrees of freedom -> mean = variance

## In overdispersion, the estimates are reliable but the standard errors tend to be smaller.

3.4.1.2 Pseudo-Rsquare and Log-likelihood ratio test

nagelkerke(model1)
## $Models
##                                                                                                                   
## Model: "glm, ACCIDENT ~ STATE + AADT2 + MEDIAN + DRIVE + offset(log(AADT1)), poisson(link = \"log\"), df, glm.fit"
## Null:  "glm, ACCIDENT ~ 1, poisson(link = \"log\"), df, glm.fit"                                                  
## 
## $Pseudo.R.squared.for.model.vs.null
##                              Pseudo.R.squared
## McFadden                             0.331213
## Cox and Snell (ML)                   0.856499
## Nagelkerke (Cragg and Uhler)         0.858945
## 
## $Likelihood.ratio.test
##  Df.diff LogLik.diff  Chisq    p.value
##       -4      -81.54 163.08 3.1953e-34
## 
## $Number.of.observations
##          
## Model: 84
## Null:  84
## 
## $Messages
## [1] "Note: For models fit with REML, these statistics are based on refitting with ML"
## 
## $Warnings
## [1] "None"
# The likelihood ratio test (Omnibus test) compares the fitted model (“Model”) with the only-intercept model (“Null”). This test verifies if the explained variance is higher than the the unexplained variance.

# Note: (h_0) There is no overdispersion in the model. Therefore, if pvalue < 0.05, there is overdispersion, and we should choose to use a Negative Binomial model.

3.4.1.3 Wald test

# Type III test
Anova(model1, type = "III", test = "Wald")
## Analysis of Deviance Table (Type III tests)
## 
## Response: ACCIDENT
##             Df     Chisq Pr(>Chisq)    
## (Intercept)  1 2950.6671  < 2.2e-16 ***
## STATE        1    1.5825    0.20841    
## AADT2        1   48.0944  4.062e-12 ***
## MEDIAN       1    5.8922    0.01521 *  
## DRIVE        1   16.0891  6.043e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

3.4.2 Overdispersed Poisson Model

## $Likelihood.ratio.test
##  Df.diff LogLik.diff  Chisq    p.value
##       -4      -81.54 163.08 3.1953e-34

### h_0) There is no overdispersion in the model;  pvalue (3.1953e-34) < 0.05, there is overdispersion, and we should choose to use a Negative Binomial model.

# Let us correct the standard errors with an overdispersed poisson model.

#  quasipoisson

model2 = glm(
  ACCIDENT ~ STATE + AADT2 + MEDIAN + DRIVE + offset(log(AADT1)),
  family = quasipoisson(link = "log"),
  data = df,
  method = "glm.fit"
)
summary(model2)
## 
## Call:
## glm(formula = ACCIDENT ~ STATE + AADT2 + MEDIAN + DRIVE + offset(log(AADT1)), 
##     family = quasipoisson(link = "log"), data = df, method = "glm.fit")
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.1238  -1.2665  -0.5184   0.3952   3.7088  
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -8.9972283  0.2387734 -37.681  < 2e-16 ***
## STATEMichigan -0.2000225  0.2292178  -0.873  0.38551    
## AADT2          0.0005136  0.0001068   4.811 7.08e-06 ***
## MEDIAN        -0.0521227  0.0309546  -1.684  0.09616 .  
## DRIVE          0.0655221  0.0235482   2.782  0.00675 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for quasipoisson family taken to be 2.07814)
## 
##     Null deviance: 255.07  on 83  degrees of freedom
## Residual deviance: 167.31  on 79  degrees of freedom
## AIC: NA
## 
## Number of Fisher Scoring iterations: 5
# The estimates are the same, but the standard errors have increased because they are adjusted by the scale parameter.

3.4.3 Negative Binomial distribution

3.4.3.1 glm Model

model3 = glm.nb(ACCIDENT ~ STATE + AADT2 + MEDIAN + DRIVE + offset(log(AADT1)),
                data = df)
summary(model3)
## 
## Call:
## glm.nb(formula = ACCIDENT ~ STATE + AADT2 + MEDIAN + DRIVE + 
##     offset(log(AADT1)), data = df, init.theta = 2.19225058, link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2604  -0.9918  -0.3235   0.2587   2.3277  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -8.9883516  0.2416863 -37.190  < 2e-16 ***
## STATEMichigan -0.2736628  0.2574693  -1.063   0.2878    
## AADT2          0.0005680  0.0001419   4.004 6.23e-05 ***
## MEDIAN        -0.0633165  0.0301447  -2.100   0.0357 *  
## DRIVE          0.0597809  0.0279771   2.137   0.0326 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(2.1923) family taken to be 1)
## 
##     Null deviance: 125.481  on 83  degrees of freedom
## Residual deviance:  87.601  on 79  degrees of freedom
## AIC: 313.72
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  2.192 
##           Std. Err.:  0.762 
## 
##  2 x log-likelihood:  -301.719

3.4.3.2 Pseudo-Rsquare and Log-likelihood ratio test

nagelkerke(model3)
## $Models
##                                                                                                     
## Model: "glm.nb, ACCIDENT ~ STATE + AADT2 + MEDIAN + DRIVE + offset(log(AADT1)), df, 2.19225058, log"
## Null:  "glm.nb, ACCIDENT ~ 1, df, 0.6629432212, log"                                                
## 
## $Pseudo.R.squared.for.model.vs.null
##                              Pseudo.R.squared
## McFadden                             0.150312
## Cox and Snell (ML)                   0.470285
## Nagelkerke (Cragg and Uhler)         0.477249
## 
## $Likelihood.ratio.test
##  Df.diff LogLik.diff  Chisq    p.value
##       -4     -26.687 53.375 7.1132e-11
## 
## $Number.of.observations
##          
## Model: 84
## Null:  84
## 
## $Messages
## [1] "Note: For models fit with REML, these statistics are based on refitting with ML"
## 
## $Warnings
## [1] "None"

3.4.3.3 Wald test

Anova(model3, type = "III", test = "Wald")
## Analysis of Deviance Table (Type III tests)
## 
## Response: ACCIDENT
##             Df     Chisq Pr(>Chisq)    
## (Intercept)  1 1383.1076  < 2.2e-16 ***
## STATE        1    1.1297    0.28783    
## AADT2        1   16.0325  6.227e-05 ***
## MEDIAN       1    4.4118    0.03569 *  
## DRIVE        1    4.5658    0.03262 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

3.4.4 Comparing model 1 and model 3

### AIC and BIC
# Akaike’s Information Criteria (AIC) and Bayesian Information Criteria (BIC) evaluates the quality of a finite set of models.
# AIC and BIC consider the maximum likelihood and the number of parameters in assessing the quality of the models. Nonetheless, the difference between both methods is that the BIC takes into account the number of observations of the dataset.
# Calculate the AIC and the BIC.

aic <- data.frame(model1 = AIC(model1), model3 = AIC(model3))
bic <- data.frame(model1 = BIC(model1), model3 = BIC(model3))

# Note: The smaller the values of AIC and BIC, the better the model.

3.4.5 Elasticities

# Calculate the elasticities of the negative binomial model
el1 <- as.numeric(model3$coefficients["AADT1"] * mean(df$AADT1)/mean(df$ACCIDENT))
el2 <- as.numeric(model3$coefficients["AADT2"] * mean(df$AADT2)/mean(df$ACCIDENT))
el3 <- as.numeric(model3$coefficients["MEDIAN"] * mean(df$MEDIAN)/mean(df$ACCIDENT))
el4 <- as.numeric(model3$coefficients["DRIVE"] * mean(df$DRIVE)/mean(df$ACCIDENT))

elasticity <- data.frame(variable = c("AADT1", "AADT2", "MEDIAN", "DRIVE"),
                         elasticity = c(el1, el2, el3, el4))

# Note: AADT1 does not have a value because it is the offset of the model. Note: STATE is a categorical variable. We would need to calculate the pseudo-elasticities in this case. Follow the same logic of the code and try to calculate for yourselves.