Example 1: Number of Death due to AIDS

Whyte, et al.,1987 reported the number of deaths due to AIDS in Australia per 3 month period from January 1983 - June 1986.

Create data set

#x: Time point (quarter)
x <- seq(1,14,1)
#y: number of deaths
y <- c(0,1,2,3,1,4,9,18,23,31,20,25,37,45)
death.dat <- data.frame(x,y)
death.dat
##     x  y
## 1   1  0
## 2   2  1
## 3   3  2
## 4   4  3
## 5   5  1
## 6   6  4
## 7   7  9
## 8   8 18
## 9   9 23
## 10 10 31
## 11 11 20
## 12 12 25
## 13 13 37
## 14 14 45

Plot scatter plot between x and y versus histogram of y

par(mfrow=c(1,2))
plot(x,y,,pch = 19, col="navy")
hist(y,ylab = "count", col = "navy",border = "grey")

Plot scatter plot between x and y versus x and ln(y)

par(mfrow=c(1,2))
plot(x,y,pch = 19, col="navy")
plot(x,log(y),pch = 19, col="purple")

Fit Poisson regression with log link function

poi1 <- glm(y~x, data = death.dat, family = poisson(link = "log") )
summary(poi1)
## 
## Call:
## glm(formula = y ~ x, family = poisson(link = "log"), data = death.dat)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.33963    0.25119   1.352    0.176    
## x            0.25652    0.02204  11.639   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 207.272  on 13  degrees of freedom
## Residual deviance:  29.654  on 12  degrees of freedom
## AIC: 86.581
## 
## Number of Fisher Scoring iterations: 5

Plot Observed vs Fitted line

par(mfrow=c(1,2))
plot(x,log(y),pch = 19, col="navy", main = "Observed vs Fitted line of Log(y)")
abline(poi1,col = "blue",lwd=1)

plot(x,y,pch = 19, col="navy", main = "Observed vs Fitted line of y")
lines(fitted(poi1)~ x, col="red")

Fit Poisson regression with transform explanatory variable

poi.fit <- glm(y~log(x), data = death.dat, family = poisson(link = "log") )
summary(poi.fit)
## 
## Call:
## glm(formula = y ~ log(x), family = poisson(link = "log"), data = death.dat)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.9442     0.5116   -3.80 0.000145 ***
## log(x)        2.1748     0.2150   10.11  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 207.272  on 13  degrees of freedom
## Residual deviance:  17.092  on 12  degrees of freedom
## AIC: 74.019
## 
## Number of Fisher Scoring iterations: 4

Fit poisson regression model (intercept only)

poi.null <- glm(y~1, data = death.dat, family = poisson(link = "log") )
summary(poi.null)
## 
## Call:
## glm(formula = y ~ 1, family = poisson(link = "log"), data = death.dat)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  2.75001    0.06757    40.7   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 207.27  on 13  degrees of freedom
## Residual deviance: 207.27  on 13  degrees of freedom
## AIC: 262.2
## 
## Number of Fisher Scoring iterations: 5

Compare fitted model with null model

anova(poi.null,poi.fit, test = "Chisq")
## Analysis of Deviance Table
## 
## Model 1: y ~ 1
## Model 2: y ~ log(x)
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1        13    207.272                          
## 2        12     17.092  1   190.18 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Construct confidence intervals for regression coefficient parameter

confint(poi.fit)
## Waiting for profiling to be done...
##                 2.5 %     97.5 %
## (Intercept) -2.999299 -0.9925442
## log(x)       1.771631  2.6151271
exp(confint(poi.fit))
## Waiting for profiling to be done...
##                  2.5 %     97.5 %
## (Intercept) 0.04982198  0.3706325
## log(x)      5.88043832 13.6689531

Fit the saturated model

poi.sat <- glm(y~as.factor(1:length(x)), data = death.dat, family = poisson(link = "log") )
summary(poi.sat)
## 
## Call:
## glm(formula = y ~ as.factor(1:length(x)), family = poisson(link = "log"), 
##     data = death.dat)
## 
## Coefficients:
##                          Estimate Std. Error z value Pr(>|z|)
## (Intercept)                -22.30   42247.17  -0.001        1
## as.factor(1:length(x))2     22.30   42247.17   0.001        1
## as.factor(1:length(x))3     23.00   42247.17   0.001        1
## as.factor(1:length(x))4     23.40   42247.17   0.001        1
## as.factor(1:length(x))5     22.30   42247.17   0.001        1
## as.factor(1:length(x))6     23.69   42247.17   0.001        1
## as.factor(1:length(x))7     24.50   42247.17   0.001        1
## as.factor(1:length(x))8     25.19   42247.17   0.001        1
## as.factor(1:length(x))9     25.44   42247.17   0.001        1
## as.factor(1:length(x))10    25.74   42247.17   0.001        1
## as.factor(1:length(x))11    25.30   42247.17   0.001        1
## as.factor(1:length(x))12    25.52   42247.17   0.001        1
## as.factor(1:length(x))13    25.91   42247.17   0.001        1
## as.factor(1:length(x))14    26.11   42247.17   0.001        1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 2.0727e+02  on 13  degrees of freedom
## Residual deviance: 4.1223e-10  on  0  degrees of freedom
## AIC: 80.927
## 
## Number of Fisher Scoring iterations: 20

Compute the log-likelihood of saturated model, fitted model and null model, then calculate the Psuedo R-squared.

ll.s <- as.numeric(logLik(poi.sat))
ll.fit <- as.numeric(logLik(poi.fit))
ll.0 <- as.numeric(logLik(poi.null))
ll.s; ll.fit; ll.0
## [1] -26.46359
## [1] -35.00942
## [1] -130.0997
#Pseudo R-square
pr2 <- (ll.0-ll.fit)/ll.0
pr2
## [1] 0.7309032

Compute the Deviance statistic (G-squared) and its p-value

G.sq <- deviance(poi.fit); G.sq
## [1] 17.09166
1-pchisq(G.sq, df = poi.fit$df.residual)
## [1] 0.1461819

Calculate the Pearson goodness of fit statistic and its p-value

Pearson <- sum((death.dat$y - poi.fit$fitted.values)^2 
               / poi.fit$fitted.values)
Pearson
## [1] 15.98845
1-pchisq(Pearson, df = poi.fit$df.residual)
## [1] 0.1917658

Plot scatter plot of observed values versus fitted line

pred <- predict.glm(poi.fit, data = death.dat, type = "response")
plot(y~log(x),xlab="log(x)", ylab="y", pch=19, col = "navy")
lines(fitted(poi.fit)~ log(x), col="red")

Estimate the probabilities that the number of deaths equals y for particular x = 3

mu_3 <- as.numeric(exp(poi.fit$coefficients[1]+(poi.fit$coefficients[2]*log(3)))); mu_3
## [1] 1.560612
p_y0 <- dpois(0,lambda = mu_3); p_y0
## [1] 0.2100075
p_y1 <- dpois(1,lambda = mu_3); p_y1
## [1] 0.3277402

Estimate the dispersion parameter

#Dispersion parameter estimated
Pearson/poi.fit$df.residual
## [1] 1.332371
deviance(poi.fit)/poi.fit$df.residual
## [1] 1.424305

Example 2: Rate of Lung cancer

Lung cancer incidence in four Danish cities 1968-1971. This data set contains counts of incident lung cancer cases and population size in four neighbouring Danish cities by age group.
A data frame with 24 observations on the following 4 variables:
city a factor with levels Fredericia, Horsens, Kolding, and Vejle.
age a factor with levels 40-54, 55-59, 60-64, 65-69,70-74, and 75+.
pop a numeric vector, number of inhabitants.
cases a numeric vector, number of lung cancer cases.

#install.packages("ISwR")
library(ISwR)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
data(eba1977)
cancer.data = eba1977
head(cancer.data)
##         city   age  pop cases
## 1 Fredericia 40-54 3059    11
## 2    Horsens 40-54 2879    13
## 3    Kolding 40-54 3142     4
## 4      Vejle 40-54 2520     5
## 5 Fredericia 55-59  800    11
## 6    Horsens 55-59 1083     6
levels(cancer.data$city)
## [1] "Fredericia" "Horsens"    "Kolding"    "Vejle"
levels(cancer.data$age)
## [1] "40-54" "55-59" "60-64" "65-69" "70-74" "75+"

Response variable: Number of cases of lung cancer

hist(cancer.data$cases, xlab = "cases", main = "Histogram of cases")

mean(cancer.data$cases)
## [1] 9.333333
var(cancer.data$cases)
## [1] 9.971014
# add the log(pop) values to the dataframe
cancer.data <- cancer.data %>% mutate(logpop = log(pop))
head(cancer.data)
##         city   age  pop cases   logpop
## 1 Fredericia 40-54 3059    11 8.025843
## 2    Horsens 40-54 2879    13 7.965198
## 3    Kolding 40-54 3142     4 8.052615
## 4      Vejle 40-54 2520     5 7.832014
## 5 Fredericia 55-59  800    11 6.684612
## 6    Horsens 55-59 1083     6 6.987490
poisson.model.rate <- glm(cases ~ city + age, offset = logpop, family = poisson(link = "log"), data = cancer.data)
#or
poisson.model.rate <- glm(cases ~ city + age + offset(logpop), family = poisson(link = "log"), data = cancer.data)
summary(poisson.model.rate)
## 
## Call:
## glm(formula = cases ~ city + age + offset(logpop), family = poisson(link = "log"), 
##     data = cancer.data)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -5.6321     0.2003 -28.125  < 2e-16 ***
## cityHorsens  -0.3301     0.1815  -1.818   0.0690 .  
## cityKolding  -0.3715     0.1878  -1.978   0.0479 *  
## cityVejle    -0.2723     0.1879  -1.450   0.1472    
## age55-59      1.1010     0.2483   4.434 9.23e-06 ***
## age60-64      1.5186     0.2316   6.556 5.53e-11 ***
## age65-69      1.7677     0.2294   7.704 1.31e-14 ***
## age70-74      1.8569     0.2353   7.891 3.00e-15 ***
## age75+        1.4197     0.2503   5.672 1.41e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 129.908  on 23  degrees of freedom
## Residual deviance:  23.447  on 15  degrees of freedom
## AIC: 137.84
## 
## Number of Fisher Scoring iterations: 5
fitted(poisson.model.rate)/cancer.data$pop
##           1           2           3           4           5           6 
## 0.003581174 0.002574437 0.002469818 0.002727466 0.010769357 0.007741882 
##           7           8           9          10          11          12 
## 0.007427272 0.008202074 0.016351230 0.011754582 0.011276906 0.012453297 
##          13          14          15          16          17          18 
## 0.020976379 0.015079512 0.014466720 0.015975868 0.022932476 0.016485712 
##          19          20          21          22          23          24 
## 0.015815776 0.017465655 0.014810616 0.010647064 0.010214395 0.011279946
pred <- predict.glm(poisson.model.rate, type = "response")
pred.rate <- pred/cancer.data$pop; pred.rate
##           1           2           3           4           5           6 
## 0.003581174 0.002574437 0.002469818 0.002727466 0.010769357 0.007741882 
##           7           8           9          10          11          12 
## 0.007427272 0.008202074 0.016351230 0.011754582 0.011276906 0.012453297 
##          13          14          15          16          17          18 
## 0.020976379 0.015079512 0.014466720 0.015975868 0.022932476 0.016485712 
##          19          20          21          22          23          24 
## 0.015815776 0.017465655 0.014810616 0.010647064 0.010214395 0.011279946

Create a test dataframe containing new values of variables

test.data <- data.frame(city = "Kolding", age = "40-54", pop = 1000, logpop = log(1000))

predict outcomes (responses) using ‘predict()’

predicted.value <- predict(poisson.model.rate, test.data, type = "response")
predicted.value/test.data$pop
##           1 
## 0.002469818
#or
poisson.model.rate$coefficients
## (Intercept) cityHorsens cityKolding   cityVejle    age55-59    age60-64 
##  -5.6320645  -0.3300600  -0.3715462  -0.2723177   1.1010140   1.5186123 
##    age65-69    age70-74      age75+ 
##   1.7677062   1.8568633   1.4196534
exp(poisson.model.rate$coefficients[1]+poisson.model.rate$coefficients[3])
## (Intercept) 
## 0.002469818
#define new variable 'mid.age' using mutate() and case_when()
cancer.data <- cancer.data %>%
                    mutate(mid.age = case_when(cancer.data$age == "40-54" ~ 47,
                             cancer.data$age == "55-59" ~ 57,
                             cancer.data$age == "60-64" ~ 62,
                             cancer.data$age == "65-69" ~ 67,
                             cancer.data$age == "70-74" ~ 72,
                             cancer.data$age == "75+" ~ 75))
head(cancer.data)
##         city   age  pop cases   logpop mid.age
## 1 Fredericia 40-54 3059    11 8.025843      47
## 2    Horsens 40-54 2879    13 7.965198      47
## 3    Kolding 40-54 3142     4 8.052615      47
## 4      Vejle 40-54 2520     5 7.832014      47
## 5 Fredericia 55-59  800    11 6.684612      57
## 6    Horsens 55-59 1083     6 6.987490      57