Whyte, et al.,1987 reported the number of deaths due to AIDS in Australia per 3 month period from January 1983 - June 1986.
Create data set
#x: Time point (quarter)
x <- seq(1,14,1)
#y: number of deaths
y <- c(0,1,2,3,1,4,9,18,23,31,20,25,37,45)
death.dat <- data.frame(x,y)
death.dat
## x y
## 1 1 0
## 2 2 1
## 3 3 2
## 4 4 3
## 5 5 1
## 6 6 4
## 7 7 9
## 8 8 18
## 9 9 23
## 10 10 31
## 11 11 20
## 12 12 25
## 13 13 37
## 14 14 45
Plot scatter plot between x and y versus histogram of y
par(mfrow=c(1,2))
plot(x,y,,pch = 19, col="navy")
hist(y,ylab = "count", col = "navy",border = "grey")
Plot scatter plot between x and y versus x and ln(y)
par(mfrow=c(1,2))
plot(x,y,pch = 19, col="navy")
plot(x,log(y),pch = 19, col="purple")
Fit Poisson regression with log link function
poi1 <- glm(y~x, data = death.dat, family = poisson(link = "log") )
summary(poi1)
##
## Call:
## glm(formula = y ~ x, family = poisson(link = "log"), data = death.dat)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.33963 0.25119 1.352 0.176
## x 0.25652 0.02204 11.639 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 207.272 on 13 degrees of freedom
## Residual deviance: 29.654 on 12 degrees of freedom
## AIC: 86.581
##
## Number of Fisher Scoring iterations: 5
Plot Observed vs Fitted line
par(mfrow=c(1,2))
plot(x,log(y),pch = 19, col="navy", main = "Observed vs Fitted line of Log(y)")
abline(poi1,col = "blue",lwd=1)
plot(x,y,pch = 19, col="navy", main = "Observed vs Fitted line of y")
lines(fitted(poi1)~ x, col="red")
Fit Poisson regression with transform explanatory variable
poi.fit <- glm(y~log(x), data = death.dat, family = poisson(link = "log") )
summary(poi.fit)
##
## Call:
## glm(formula = y ~ log(x), family = poisson(link = "log"), data = death.dat)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.9442 0.5116 -3.80 0.000145 ***
## log(x) 2.1748 0.2150 10.11 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 207.272 on 13 degrees of freedom
## Residual deviance: 17.092 on 12 degrees of freedom
## AIC: 74.019
##
## Number of Fisher Scoring iterations: 4
Fit poisson regression model (intercept only)
poi.null <- glm(y~1, data = death.dat, family = poisson(link = "log") )
summary(poi.null)
##
## Call:
## glm(formula = y ~ 1, family = poisson(link = "log"), data = death.dat)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.75001 0.06757 40.7 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 207.27 on 13 degrees of freedom
## Residual deviance: 207.27 on 13 degrees of freedom
## AIC: 262.2
##
## Number of Fisher Scoring iterations: 5
Compare fitted model with null model
anova(poi.null,poi.fit, test = "Chisq")
## Analysis of Deviance Table
##
## Model 1: y ~ 1
## Model 2: y ~ log(x)
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 13 207.272
## 2 12 17.092 1 190.18 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Construct confidence intervals for regression coefficient parameter
confint(poi.fit)
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) -2.999299 -0.9925442
## log(x) 1.771631 2.6151271
exp(confint(poi.fit))
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) 0.04982198 0.3706325
## log(x) 5.88043832 13.6689531
Fit the saturated model
poi.sat <- glm(y~as.factor(1:length(x)), data = death.dat, family = poisson(link = "log") )
summary(poi.sat)
##
## Call:
## glm(formula = y ~ as.factor(1:length(x)), family = poisson(link = "log"),
## data = death.dat)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -22.30 42247.17 -0.001 1
## as.factor(1:length(x))2 22.30 42247.17 0.001 1
## as.factor(1:length(x))3 23.00 42247.17 0.001 1
## as.factor(1:length(x))4 23.40 42247.17 0.001 1
## as.factor(1:length(x))5 22.30 42247.17 0.001 1
## as.factor(1:length(x))6 23.69 42247.17 0.001 1
## as.factor(1:length(x))7 24.50 42247.17 0.001 1
## as.factor(1:length(x))8 25.19 42247.17 0.001 1
## as.factor(1:length(x))9 25.44 42247.17 0.001 1
## as.factor(1:length(x))10 25.74 42247.17 0.001 1
## as.factor(1:length(x))11 25.30 42247.17 0.001 1
## as.factor(1:length(x))12 25.52 42247.17 0.001 1
## as.factor(1:length(x))13 25.91 42247.17 0.001 1
## as.factor(1:length(x))14 26.11 42247.17 0.001 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 2.0727e+02 on 13 degrees of freedom
## Residual deviance: 4.1223e-10 on 0 degrees of freedom
## AIC: 80.927
##
## Number of Fisher Scoring iterations: 20
Compute the log-likelihood of saturated model, fitted model and null model, then calculate the Psuedo R-squared.
ll.s <- as.numeric(logLik(poi.sat))
ll.fit <- as.numeric(logLik(poi.fit))
ll.0 <- as.numeric(logLik(poi.null))
ll.s; ll.fit; ll.0
## [1] -26.46359
## [1] -35.00942
## [1] -130.0997
#Pseudo R-square
pr2 <- (ll.0-ll.fit)/ll.0
pr2
## [1] 0.7309032
Compute the Deviance statistic (G-squared) and its p-value
G.sq <- deviance(poi.fit); G.sq
## [1] 17.09166
1-pchisq(G.sq, df = poi.fit$df.residual)
## [1] 0.1461819
Calculate the Pearson goodness of fit statistic and its p-value
Pearson <- sum((death.dat$y - poi.fit$fitted.values)^2
/ poi.fit$fitted.values)
Pearson
## [1] 15.98845
1-pchisq(Pearson, df = poi.fit$df.residual)
## [1] 0.1917658
Plot scatter plot of observed values versus fitted line
pred <- predict.glm(poi.fit, data = death.dat, type = "response")
plot(y~log(x),xlab="log(x)", ylab="y", pch=19, col = "navy")
lines(fitted(poi.fit)~ log(x), col="red")
Estimate the probabilities that the number of deaths equals y for particular x = 3
mu_3 <- as.numeric(exp(poi.fit$coefficients[1]+(poi.fit$coefficients[2]*log(3)))); mu_3
## [1] 1.560612
p_y0 <- dpois(0,lambda = mu_3); p_y0
## [1] 0.2100075
p_y1 <- dpois(1,lambda = mu_3); p_y1
## [1] 0.3277402
Estimate the dispersion parameter
#Dispersion parameter estimated
Pearson/poi.fit$df.residual
## [1] 1.332371
deviance(poi.fit)/poi.fit$df.residual
## [1] 1.424305
Lung cancer incidence in four Danish cities 1968-1971. This data set
contains counts of incident lung cancer cases and population size in
four neighbouring Danish cities by age group.
A data frame with 24 observations on the following 4 variables:
city a factor with levels Fredericia, Horsens, Kolding, and Vejle.
age a factor with levels 40-54, 55-59, 60-64, 65-69,70-74, and
75+.
pop a numeric vector, number of inhabitants.
cases a numeric vector, number of lung cancer cases.
#install.packages("ISwR")
library(ISwR)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
data(eba1977)
cancer.data = eba1977
head(cancer.data)
## city age pop cases
## 1 Fredericia 40-54 3059 11
## 2 Horsens 40-54 2879 13
## 3 Kolding 40-54 3142 4
## 4 Vejle 40-54 2520 5
## 5 Fredericia 55-59 800 11
## 6 Horsens 55-59 1083 6
levels(cancer.data$city)
## [1] "Fredericia" "Horsens" "Kolding" "Vejle"
levels(cancer.data$age)
## [1] "40-54" "55-59" "60-64" "65-69" "70-74" "75+"
Response variable: Number of cases of lung cancer
hist(cancer.data$cases, xlab = "cases", main = "Histogram of cases")
mean(cancer.data$cases)
## [1] 9.333333
var(cancer.data$cases)
## [1] 9.971014
# add the log(pop) values to the dataframe
cancer.data <- cancer.data %>% mutate(logpop = log(pop))
head(cancer.data)
## city age pop cases logpop
## 1 Fredericia 40-54 3059 11 8.025843
## 2 Horsens 40-54 2879 13 7.965198
## 3 Kolding 40-54 3142 4 8.052615
## 4 Vejle 40-54 2520 5 7.832014
## 5 Fredericia 55-59 800 11 6.684612
## 6 Horsens 55-59 1083 6 6.987490
poisson.model.rate <- glm(cases ~ city + age, offset = logpop, family = poisson(link = "log"), data = cancer.data)
#or
poisson.model.rate <- glm(cases ~ city + age + offset(logpop), family = poisson(link = "log"), data = cancer.data)
summary(poisson.model.rate)
##
## Call:
## glm(formula = cases ~ city + age + offset(logpop), family = poisson(link = "log"),
## data = cancer.data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.6321 0.2003 -28.125 < 2e-16 ***
## cityHorsens -0.3301 0.1815 -1.818 0.0690 .
## cityKolding -0.3715 0.1878 -1.978 0.0479 *
## cityVejle -0.2723 0.1879 -1.450 0.1472
## age55-59 1.1010 0.2483 4.434 9.23e-06 ***
## age60-64 1.5186 0.2316 6.556 5.53e-11 ***
## age65-69 1.7677 0.2294 7.704 1.31e-14 ***
## age70-74 1.8569 0.2353 7.891 3.00e-15 ***
## age75+ 1.4197 0.2503 5.672 1.41e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 129.908 on 23 degrees of freedom
## Residual deviance: 23.447 on 15 degrees of freedom
## AIC: 137.84
##
## Number of Fisher Scoring iterations: 5
fitted(poisson.model.rate)/cancer.data$pop
## 1 2 3 4 5 6
## 0.003581174 0.002574437 0.002469818 0.002727466 0.010769357 0.007741882
## 7 8 9 10 11 12
## 0.007427272 0.008202074 0.016351230 0.011754582 0.011276906 0.012453297
## 13 14 15 16 17 18
## 0.020976379 0.015079512 0.014466720 0.015975868 0.022932476 0.016485712
## 19 20 21 22 23 24
## 0.015815776 0.017465655 0.014810616 0.010647064 0.010214395 0.011279946
pred <- predict.glm(poisson.model.rate, type = "response")
pred.rate <- pred/cancer.data$pop; pred.rate
## 1 2 3 4 5 6
## 0.003581174 0.002574437 0.002469818 0.002727466 0.010769357 0.007741882
## 7 8 9 10 11 12
## 0.007427272 0.008202074 0.016351230 0.011754582 0.011276906 0.012453297
## 13 14 15 16 17 18
## 0.020976379 0.015079512 0.014466720 0.015975868 0.022932476 0.016485712
## 19 20 21 22 23 24
## 0.015815776 0.017465655 0.014810616 0.010647064 0.010214395 0.011279946
Create a test dataframe containing new values of variables
test.data <- data.frame(city = "Kolding", age = "40-54", pop = 1000, logpop = log(1000))
predict outcomes (responses) using ‘predict()’
predicted.value <- predict(poisson.model.rate, test.data, type = "response")
predicted.value/test.data$pop
## 1
## 0.002469818
#or
poisson.model.rate$coefficients
## (Intercept) cityHorsens cityKolding cityVejle age55-59 age60-64
## -5.6320645 -0.3300600 -0.3715462 -0.2723177 1.1010140 1.5186123
## age65-69 age70-74 age75+
## 1.7677062 1.8568633 1.4196534
exp(poisson.model.rate$coefficients[1]+poisson.model.rate$coefficients[3])
## (Intercept)
## 0.002469818
#define new variable 'mid.age' using mutate() and case_when()
cancer.data <- cancer.data %>%
mutate(mid.age = case_when(cancer.data$age == "40-54" ~ 47,
cancer.data$age == "55-59" ~ 57,
cancer.data$age == "60-64" ~ 62,
cancer.data$age == "65-69" ~ 67,
cancer.data$age == "70-74" ~ 72,
cancer.data$age == "75+" ~ 75))
head(cancer.data)
## city age pop cases logpop mid.age
## 1 Fredericia 40-54 3059 11 8.025843 47
## 2 Horsens 40-54 2879 13 7.965198 47
## 3 Kolding 40-54 3142 4 8.052615 47
## 4 Vejle 40-54 2520 5 7.832014 47
## 5 Fredericia 55-59 800 11 6.684612 57
## 6 Horsens 55-59 1083 6 6.987490 57