##################################################
# Lab 8: Generalized Linear Model
# Zero-inflated models
# Franz Mueter
# Last modified: 10-09-2024
##################################################
#### Chum salmon example
# Nenana River chum salmon catches
# Data courtesy of Parker Bradley
# The data consist of the counts of chum salmon (and 2 other species) collected in two types of nets (Fyke net, plane trap) along two sides of the Nenana River (river left (L) and river right (R)), as well as in mid-channel (M) during the summer of 2011
# Our goal for this exercise is to estimate the trend in mean count of chum salmon over the course of the season to identify any seasonal patterns of migration
# Import and examine data:
Nenana <- read.csv("Nenana fish.csv")
head(Nenana)
## Date Gear Side Volume Chum LNS Lamprey
## 1 5/12/2011 Fyke R 2.55150 48 12 2
## 2 5/13/2011 Fyke R 2.65356 80 16 5
## 3 5/14/2011 Fyke R 1.55736 18 6 2
## 4 5/15/2011 Fyke R 1.46880 6 4 0
## 5 5/16/2011 Fyke R 2.51712 2 0 0
## 6 5/16/2011 Fyke R 2.56680 5 5 1
# Convert dates to Julian day for plotting and modeling
Nenana$Julian <- strptime(Nenana$Date,"%m/%d/%Y")$yday
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.3
# CPUE by Julian day (overall, by gear and by location in river):
(p1 <- ggplot(Nenana, aes(Julian, Chum)) + geom_point())

p1 + facet_wrap(~Gear)

p1 + facet_wrap(~Side)

# For this analysis, we restrict data to early season to eliminate excessive zeros(only two fish caught after July 10 or Julian day 190)
Chum <- Nenana[Nenana$Julian <= 190, c(1:5,8)]
head(Chum)
## Date Gear Side Volume Chum Julian
## 1 5/12/2011 Fyke R 2.55150 48 131
## 2 5/13/2011 Fyke R 2.65356 80 132
## 3 5/14/2011 Fyke R 1.55736 18 133
## 4 5/15/2011 Fyke R 1.46880 6 134
## 5 5/16/2011 Fyke R 2.51712 2 135
## 6 5/16/2011 Fyke R 2.56680 5 135
(p2 <- ggplot(Chum, aes(Julian, Chum)) + geom_point())

p2 + facet_wrap(~Gear)

p2 + facet_wrap(~Side)

(p3 <- ggplot(Chum) + geom_histogram(aes(Chum), binwidth=5))

p3 + facet_wrap(~Side)

# The histograms is very skewed with many small counts (including zeros) and a few large ones (right-skewed)
# Note that fyke nets were used on the left and right side of the river, while plane traps were used in the middle of the river:
table(Chum$Gear, Chum$Side)
##
## L M R
## Fyke 79 0 106
## Plane Trap 0 48 0
# Catches are not directly comparable between the gears hence we focus on the fyke net catches. Since fyke nets were deployed later on the left hand side of the river (see scatterplot by Julian day above), the catches on the left-hand side missed some of the early part of the outmigration.
# Therefore, and for simplicity, We focus on a single sampling site on the right side of the river (R). First, we'll save the subset as a separate data frame for convenience
Chum.R <- Chum[Chum$Side == "R",]
table(Chum.R$Gear)
##
## Fyke
## 106
ggplot(Chum.R, aes(Julian, Chum)) + geom_point() +
ggtitle("Chum salmon counts, Nenana River, by Julian day, river right")

# Exercise 1: Examine the number of zero counts in 'Chum.R'
# Use 'table' or 'tabulate' (or 'dplyr' functions) to tally the number of zeros in the data and compute the proportion of zeros in the daily Chum salmon counts (on river right)
Chum.R
## Date Gear Side Volume Chum Julian
## 1 5/12/2011 Fyke R 2.551500 48 131
## 2 5/13/2011 Fyke R 2.653560 80 132
## 3 5/14/2011 Fyke R 1.557360 18 133
## 4 5/15/2011 Fyke R 1.468800 6 134
## 5 5/16/2011 Fyke R 2.517120 2 135
## 6 5/16/2011 Fyke R 2.566800 5 135
## 7 5/17/2011 Fyke R 3.130380 2 136
## 8 5/17/2011 Fyke R 3.586275 10 136
## 9 5/18/2011 Fyke R 2.527200 11 137
## 10 5/18/2011 Fyke R 2.777040 13 137
## 11 5/18/2011 Fyke R 2.620800 11 137
## 12 5/19/2011 Fyke R 1.456866 12 138
## 14 5/21/2011 Fyke R 3.474150 55 140
## 15 5/21/2011 Fyke R 2.692800 13 140
## 17 5/21/2011 Fyke R 1.911600 39 140
## 18 5/23/2011 Fyke R 1.156680 8 142
## 20 5/24/2011 Fyke R 1.968192 29 143
## 22 5/24/2011 Fyke R 1.927800 15 143
## 24 5/25/2011 Fyke R 1.215000 12 144
## 26 5/25/2011 Fyke R 1.543464 4 144
## 27 5/25/2011 Fyke R 1.440000 10 144
## 29 5/26/2011 Fyke R 1.640520 7 145
## 31 5/26/2011 Fyke R 1.663200 11 145
## 34 5/27/2011 Fyke R 1.247400 13 146
## 36 5/27/2011 Fyke R 1.051785 11 146
## 38 5/28/2011 Fyke R 1.716660 13 147
## 40 5/28/2011 Fyke R 1.350639 11 147
## 42 5/30/2011 Fyke R 2.408256 7 149
## 44 5/30/2011 Fyke R 1.648512 13 149
## 46 5/30/2011 Fyke R 1.130679 6 149
## 48 5/31/2011 Fyke R 1.559376 3 150
## 50 5/31/2011 Fyke R 1.767762 9 150
## 52 5/31/2011 Fyke R 2.040102 3 150
## 53 6/1/2011 Fyke R 2.430000 12 151
## 55 6/2/2011 Fyke R 2.608200 8 152
## 57 6/3/2011 Fyke R 1.692900 9 153
## 59 6/3/2011 Fyke R 1.320660 3 153
## 61 6/4/2011 Fyke R 0.959310 2 154
## 63 6/4/2011 Fyke R 1.053000 2 154
## 65 6/4/2011 Fyke R 0.420021 0 154
## 67 6/5/2011 Fyke R 1.646127 1 155
## 69 6/5/2011 Fyke R 1.848852 1 155
## 71 6/5/2011 Fyke R 1.620675 0 155
## 73 6/7/2011 Fyke R 1.690308 0 157
## 75 6/7/2011 Fyke R 2.104893 1 157
## 77 6/7/2011 Fyke R 1.421244 1 157
## 79 6/8/2011 Fyke R 1.029600 1 158
## 81 6/8/2011 Fyke R 0.871416 0 158
## 83 6/9/2011 Fyke R 0.754110 0 159
## 85 6/9/2011 Fyke R 0.887040 0 159
## 87 6/10/2011 Fyke R 2.440800 1 160
## 89 6/10/2011 Fyke R 2.864160 0 160
## 91 6/10/2011 Fyke R 2.233440 0 160
## 93 6/11/2011 Fyke R 2.749500 3 161
## 95 6/11/2011 Fyke R 2.574000 3 161
## 97 6/11/2011 Fyke R 1.890000 1 161
## 99 6/13/2011 Fyke R 1.607040 1 163
## 101 6/13/2011 Fyke R 1.508832 3 163
## 103 6/14/2011 Fyke R 1.256283 1 164
## 105 6/14/2011 Fyke R 0.956250 0 164
## 107 6/15/2011 Fyke R 1.290573 0 165
## 109 6/15/2011 Fyke R 1.259496 1 165
## 111 6/16/2011 Fyke R 1.023615 1 166
## 113 6/16/2011 Fyke R 1.740960 2 166
## 115 6/16/2011 Fyke R 1.140480 3 166
## 117 6/17/2011 Fyke R 1.706670 6 167
## 119 6/18/2011 Fyke R 1.985544 8 168
## 121 6/18/2011 Fyke R 0.993600 3 168
## 123 6/20/2011 Fyke R 0.669708 0 170
## 125 6/21/2011 Fyke R 1.306800 1 171
## 127 6/21/2011 Fyke R 0.940500 0 171
## 129 6/21/2011 Fyke R 0.841500 1 171
## 131 6/22/2011 Fyke R 0.540000 0 172
## 133 6/23/2011 Fyke R 0.617760 0 173
## 135 6/23/2011 Fyke R 0.926640 0 173
## 137 6/24/2011 Fyke R 0.487296 2 174
## 139 6/27/2011 Fyke R 1.122300 0 177
## 141 6/28/2011 Fyke R 1.593000 0 178
## 143 6/28/2011 Fyke R 0.509355 0 178
## 145 6/28/2011 Fyke R 0.261414 0 178
## 147 6/29/2011 Fyke R 0.629640 0 179
## 149 6/29/2011 Fyke R 0.049113 0 179
## 151 6/29/2011 Fyke R 0.192600 0 179
## 153 6/30/2011 Fyke R 0.221130 0 180
## 154 6/30/2011 Fyke R 1.929600 0 180
## 156 6/30/2011 Fyke R 2.693250 2 180
## 158 6/30/2011 Fyke R 1.532520 0 180
## 160 7/1/2011 Fyke R 2.577825 0 181
## 162 7/1/2011 Fyke R 1.741680 0 181
## 164 7/2/2011 Fyke R 1.759680 0 182
## 166 7/2/2011 Fyke R 1.994814 1 182
## 167 7/5/2011 Fyke R 1.267200 0 185
## 168 7/6/2011 Fyke R 2.335545 2 186
## 169 7/6/2011 Fyke R 1.352736 0 186
## 170 7/6/2011 Fyke R 2.300292 0 186
## 171 7/7/2011 Fyke R 1.304478 1 187
## 172 7/7/2011 Fyke R 1.688310 1 187
## 173 7/7/2011 Fyke R 1.638846 0 187
## 174 7/8/2011 Fyke R 1.149120 0 188
## 175 7/8/2011 Fyke R 1.385910 0 188
## 176 7/9/2011 Fyke R 0.888894 0 189
## 178 7/9/2011 Fyke R 2.126250 0 189
## 179 7/9/2011 Fyke R 1.393200 0 189
## 180 7/10/2011 Fyke R 2.076192 0 190
## 182 7/10/2011 Fyke R 1.602180 0 190
## 184 7/10/2011 Fyke R 1.544634 0 190
?table
## Help on topic 'table' was found in the following packages:
##
## Package Library
## vctrs /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/library
## base /Library/Frameworks/R.framework/Resources/library
##
##
## Using the first match ...
table(Chum.R$Chum == 0)
##
## FALSE TRUE
## 66 40
mean(Chum.R$Chum == 0)
## [1] 0.3773585
# Does there seem to be an excess of zeros?
#yes 37% are zeroes
# Use a binomial model with 'Julian' as the only predictor variable to examine the trend in the proportion of positive catches over time! To do so, you can simply use 'Chum > 0' as your response variable on the left-hand side in the model formula!
# Thus your response will consist of zeros and ones depending on whether any chum were caught (1) or not (0):
with(Chum.R, Chum > 0)
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [13] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [25] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [37] TRUE TRUE TRUE FALSE TRUE TRUE FALSE FALSE TRUE TRUE TRUE FALSE
## [49] FALSE FALSE TRUE FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE FALSE
## [61] FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE FALSE TRUE
## [73] FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [85] FALSE TRUE FALSE FALSE FALSE FALSE TRUE FALSE TRUE FALSE FALSE TRUE
## [97] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
# Use 'glm' to fit the logistic regression model and save the result as object 'fit'
fit <- glm(Chum > 0 ~ Julian, data = Chum.R, family=binomial)# specify formula & other arguments as needed
# Examine the standard diagnostic plot, which is not particularly useful in this case with a binomial response and a single predictor.
par(mfrow=c(2,2))
plot(fit)

# With your fitted model ('fit'), you should be able to plot the estimated trend in the proportion of positive catches over time:
plot(Chum.R$Julian, fitted(fit, type="response"), type="l")
# Note that connecting the fitted values with a line works only because the Julian days (values on x-axis) are ordered in the data frame from first to last. The predicted values returned by 'fitted' are in the same order
# Use one method of your choice to plot the fitted model with a 95% confidence band [Options include: (1) compute CI 'by hand' and add it to the above plot; (2) use 'visreg()' from the visreg package; (3) use 'ggplot()' with 'geom_smooth()', which allows you to specify the model to fit. I will include all 3 options in the solutions for you to examine.
#install.packages("visreg")
library(visreg)
## Warning: package 'visreg' was built under R version 4.3.3
?visreg
?geom_smooth
ggplot(Chum.R, aes(Julian, as.numeric(Chum > 0))) +
geom_point() + geom_smooth(method="glm", method.args=list(family="binomial"))
## `geom_smooth()` using formula = 'y ~ x'
visreg(fit, scale ="response")

### Modeling counts rather than presence / absence
# We first model the trend in counts over time using a Poisson regression with a log link (For the moment, we won't worry about extra zeros!). This is often called a log-linear model.
fit1 <- glm(Chum ~ Julian, data = Chum.R, family = poisson)
summary(fit1)
##
## Call:
## glm(formula = Chum ~ Julian, family = poisson, data = Chum.R)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 15.792078 0.567408 27.83 <2e-16 ***
## Julian -0.093436 0.003972 -23.52 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 1373.43 on 105 degrees of freedom
## Residual deviance: 478.04 on 104 degrees of freedom
## AIC: 706.59
##
## Number of Fisher Scoring iterations: 5
# Assess goodness of fit:
# Remember that the null and residual deviances are the equivalent to the total sum of squares and residual sum of squares in a linear model. There is no R^2 in GLM models, but we can use an equivalent metric based on the likelihood, which is the proportion of deviance explained, defined as:
# null deviance - residual deviance
# 100 X ------------------------------------
# null deviance
null.dev <- summary(fit1)$null.deviance
resid.dev <- deviance(fit1)
100 * (null.dev - resid.dev) / null.dev
## [1] 65.194
# About 65% of the variability (as measured by deviance) in the counts (on the logit scale) is accounted for by the estimated trend
### Visualizing the fitted values
# For illustration (as well as to deal with effort standardization as shown below) we plot the fitted model with confidence bands 'manually' by computing the predicted values and standard errors using the model coefficients or the 'predict' function and adding lines for the lower and upper limits of the confidence bands
# First plot the raw data:
plot(Chum ~ Julian, data=Chum.R, col=2)
# We then compute predicted values on the log-scale for each sampling day using the estimated coefficients.
(cf <- coef(fit1)) # Extract coefficients
## (Intercept) Julian
## 15.79207824 -0.09343609
y.hat <- cf[1] + cf[2] * Chum.R$Julian
# predicted value = 15.7920782 - 0.0934361 * Julian
# Remember that the Poisson model is fit on the scale of the log-transformed response (using the 'log-link'). Therefore, to back-transform to the scale of the response variable we take the exponent of the predicted values and then add add them to the plot:
lines(Chum.R$Julian, exp(y.hat))
# The curvature in the fitted line (an exponential decline) is directly related to the fact that we are using a log-link (this fit is the equivalent of a simple linear regression in a linear model)
# Alternatively, we can simply use the 'predict() function to compute predicted values on the scale of the response:
predict(fit1, type="response")
## 1 2 3 4 5 6 7
## 34.8812708 31.7697296 28.9357497 26.3545715 24.0036441 24.0036441 21.8624284
## 8 9 10 11 12 14 15
## 21.8624284 19.9122172 19.9122172 19.9122172 18.1359722 15.0446913 15.0446913
## 17 18 20 22 24 26 27
## 15.0446913 12.4803200 11.3670283 11.3670283 10.3530465 10.3530465 10.3530465
## 29 31 34 36 38 40 42
## 9.4295156 9.4295156 8.5883672 8.5883672 7.8222524 7.8222524 6.4889476
## 44 46 48 50 52 53 55
## 6.4889476 6.4889476 5.9101089 5.9101089 5.9101089 5.3829049 4.9027294
## 57 59 61 63 65 67 69
## 4.4653874 4.4653874 4.0670580 4.0670580 4.0670580 3.7042611 3.7042611
## 71 73 75 77 79 81 83
## 3.7042611 3.0728689 3.0728689 3.0728689 2.7987574 2.7987574 2.5490976
## 85 87 89 91 93 95 97
## 2.5490976 2.3217085 2.3217085 2.3217085 2.1146033 2.1146033 2.1146033
## 99 101 103 105 107 109 111
## 1.7541687 1.7541687 1.5976902 1.5976902 1.4551702 1.4551702 1.3253635
## 113 115 117 119 121 123 125
## 1.3253635 1.3253635 1.2071361 1.0994550 1.0994550 0.9120526 0.8306941
## 127 129 131 133 135 137 139
## 0.8306941 0.8306941 0.7565930 0.6891021 0.6891021 0.6276316 0.4742075
## 141 143 145 147 149 151 153
## 0.4319064 0.4319064 0.4319064 0.3933787 0.3933787 0.3933787 0.3582879
## 154 156 158 160 162 164 166
## 0.3582879 0.3582879 0.3582879 0.3263272 0.3263272 0.2972176 0.2972176
## 167 168 169 170 171 172 173
## 0.2245630 0.2045311 0.2045311 0.2045311 0.1862862 0.1862862 0.1862862
## 174 175 176 178 179 180 182
## 0.1696687 0.1696687 0.1545337 0.1545337 0.1545337 0.1407487 0.1407487
## 184
## 0.1407487
# The values are identical to the back-transformed 'y.hat' values that we computed above, as you can easily verify:
predict(fit1, type="response") - exp(y.hat)
## 1 2 3 4 5
## -2.842171e-14 0.000000e+00 -2.486900e-14 0.000000e+00 -2.131628e-14
## 6 7 8 9 10
## -2.131628e-14 0.000000e+00 0.000000e+00 -7.105427e-15 -7.105427e-15
## 11 12 14 15 17
## -7.105427e-15 1.065814e-14 7.105427e-15 7.105427e-15 7.105427e-15
## 18 20 22 24 26
## 5.329071e-15 -5.329071e-15 -5.329071e-15 5.329071e-15 5.329071e-15
## 27 29 31 34 36
## 5.329071e-15 -5.329071e-15 -5.329071e-15 3.552714e-15 3.552714e-15
## 38 40 42 44 46
## -3.552714e-15 -3.552714e-15 -8.881784e-16 -8.881784e-16 -8.881784e-16
## 48 50 52 53 55
## 4.440892e-15 4.440892e-15 4.440892e-15 -8.881784e-16 3.552714e-15
## 57 59 61 63 65
## -8.881784e-16 -8.881784e-16 2.664535e-15 2.664535e-15 2.664535e-15
## 67 69 71 73 75
## -4.440892e-16 -4.440892e-16 -4.440892e-16 0.000000e+00 0.000000e+00
## 77 79 81 83 85
## 0.000000e+00 2.220446e-15 2.220446e-15 0.000000e+00 0.000000e+00
## 87 89 91 93 95
## 2.220446e-15 2.220446e-15 2.220446e-15 0.000000e+00 0.000000e+00
## 97 99 101 103 105
## 0.000000e+00 2.220446e-16 2.220446e-16 -1.332268e-15 -1.332268e-15
## 107 109 111 113 115
## 2.220446e-16 2.220446e-16 -8.881784e-16 -8.881784e-16 -8.881784e-16
## 117 119 121 123 125
## 2.220446e-16 -6.661338e-16 -6.661338e-16 -5.551115e-16 2.220446e-16
## 127 129 131 133 135
## 2.220446e-16 2.220446e-16 9.992007e-16 2.220446e-16 2.220446e-16
## 137 139 141 143 145
## -3.330669e-16 2.220446e-16 -2.220446e-16 -2.220446e-16 -2.220446e-16
## 147 149 151 153 154
## -4.996004e-16 -4.996004e-16 -4.996004e-16 4.996004e-16 4.996004e-16
## 156 158 160 162 164
## 4.996004e-16 4.996004e-16 2.220446e-16 2.220446e-16 -5.551115e-17
## 166 167 168 169 170
## -5.551115e-17 1.665335e-16 -5.551115e-17 -5.551115e-17 -5.551115e-17
## 171 172 173 174 175
## -1.942890e-16 -1.942890e-16 -1.942890e-16 3.053113e-16 3.053113e-16
## 176 178 179 180 182
## 1.387779e-16 1.387779e-16 1.387779e-16 0.000000e+00 0.000000e+00
## 184
## 0.000000e+00
### We generally want to acknowledge and ideally visualize uncertainty in the fitted regression line, hence let's add 95% confidence bands:
# First compute CIs on the log-scale (the scale of the linear predictor or the 'link' scale):
Fitted <- predict(fit1, type="link", se=T)
names(Fitted)
## [1] "fit" "se.fit" "residual.scale"
# The result is a list with three components:
# 'fit' - fitted values on the scale of the link function
# 'se.fit' - standard errors for each fitted value
# 'residual scale'- is the scale parameter in the GLM, which is assumed to be 1 for the Poisson
# Taking the exponent of the fitted values gives us the predicted values on the 'response' scale ('y.hat', corresponding to predicted counts) as we saw above:
exp(Fitted$fit)
## 1 2 3 4 5 6 7
## 34.8812708 31.7697296 28.9357497 26.3545715 24.0036441 24.0036441 21.8624284
## 8 9 10 11 12 14 15
## 21.8624284 19.9122172 19.9122172 19.9122172 18.1359722 15.0446913 15.0446913
## 17 18 20 22 24 26 27
## 15.0446913 12.4803200 11.3670283 11.3670283 10.3530465 10.3530465 10.3530465
## 29 31 34 36 38 40 42
## 9.4295156 9.4295156 8.5883672 8.5883672 7.8222524 7.8222524 6.4889476
## 44 46 48 50 52 53 55
## 6.4889476 6.4889476 5.9101089 5.9101089 5.9101089 5.3829049 4.9027294
## 57 59 61 63 65 67 69
## 4.4653874 4.4653874 4.0670580 4.0670580 4.0670580 3.7042611 3.7042611
## 71 73 75 77 79 81 83
## 3.7042611 3.0728689 3.0728689 3.0728689 2.7987574 2.7987574 2.5490976
## 85 87 89 91 93 95 97
## 2.5490976 2.3217085 2.3217085 2.3217085 2.1146033 2.1146033 2.1146033
## 99 101 103 105 107 109 111
## 1.7541687 1.7541687 1.5976902 1.5976902 1.4551702 1.4551702 1.3253635
## 113 115 117 119 121 123 125
## 1.3253635 1.3253635 1.2071361 1.0994550 1.0994550 0.9120526 0.8306941
## 127 129 131 133 135 137 139
## 0.8306941 0.8306941 0.7565930 0.6891021 0.6891021 0.6276316 0.4742075
## 141 143 145 147 149 151 153
## 0.4319064 0.4319064 0.4319064 0.3933787 0.3933787 0.3933787 0.3582879
## 154 156 158 160 162 164 166
## 0.3582879 0.3582879 0.3582879 0.3263272 0.3263272 0.2972176 0.2972176
## 167 168 169 170 171 172 173
## 0.2245630 0.2045311 0.2045311 0.2045311 0.1862862 0.1862862 0.1862862
## 174 175 176 178 179 180 182
## 0.1696687 0.1696687 0.1545337 0.1545337 0.1545337 0.1407487 0.1407487
## 184
## 0.1407487
# We can compute (asymptotic) 95% confidence intervals as usual, on the scale of the linear predictor (log scale):
lwr <- Fitted$fit - 1.96 * Fitted$se.fit
upr <- Fitted$fit + 1.96 * Fitted$se.fit
# To obtain correct confidence intervals on the back-transformed scale, we need to take the exponent of the lower and upper bounds (just as we did for the predicted values):
lower <- exp(lwr)
upper <- exp(upr)
# Let's add them to the plot:
lines(Chum.R$Julian, upper, lty=2)
lines(Chum.R$Julian, lower, lty=2)

# Voila! A simple Poisson regression with a 95% confidence band
# Of course, we can do all that in a simple call to 'visreg':
visreg(fit1, scale="response")
# Verify that the confidence bands are the same as those computed above:
lines(Chum.R$Julian, upper, lty=2)
lines(Chum.R$Julian, lower, lty=2)

# Note that in this case case, the rug plots at the top and bottom show the distribution (by Julian day) of positive (at the top) and negative (at the bottom) residuals.
## Accounting for fishing effort (as measured by volume filtered)
# In the analysis so far we assumed that counts are comparable, i.e., that each count represents the same amount of effort. However, each set filtered a different volume of water. We can account for the volume filtered by using an 'offset':
# log(catch/volume) = log(catch) - log(volume) = a + b*Julian
# log(catch) = a + b*Julian + log(volume)
# where 'log(volume)' (which has no regression coefficient) is the 'offset', it is simply a known value for each sample that is added to the right-hand side to account for effort
# Note that the link function (log) is applied to the response but NOT to the offset, hence the formula to apply the offset uses the un-transformed values for 'Chum' but log(Volume) for the offset:
fit2 <- glm(Chum ~ Julian + offset(log(Volume)),
family=poisson, data = Chum.R)
summary(fit2)
##
## Call:
## glm(formula = Chum ~ Julian + offset(log(Volume)), family = poisson,
## data = Chum.R)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 12.527967 0.545408 22.97 <2e-16 ***
## Julian -0.075515 0.003817 -19.78 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 1078.38 on 105 degrees of freedom
## Residual deviance: 463.31 on 104 degrees of freedom
## AIC: 691.86
##
## Number of Fisher Scoring iterations: 5
# You can check to make sure that the offset is applied correctly by predicting counts for a given day (say day 180) and for a range of volume filtered:
vol <- seq(0.05,3.5, by=0.01)
new.x <- data.frame(Julian = 180, Volume=vol)
p <- predict(fit2, newdata = new.x, type="response")
# The predicted counts should increase linearly with the volume filtered (starting at 0), all else being equal, which is indeed the case:
plot(vol,p)

# Model diagnostics
par(mfrow=c(2,2))
plot(fit2)

# Note the clear heteroscedasticity evident in the first plot, which is likely related to overdispersion, which we will examine below! There is one very influential point with a large residual and high leverage (the observation labeled "2", see 4th plot):
Chum.R["2",]
## Date Gear Side Volume Chum Julian
## 2 5/13/2011 Fyke R 2.65356 80 132
max(Chum.R$Chum)
## [1] 80
# The outlier corresponds to the largest observed count of chum salmon.
# The predicted values are a little more tricky now as they depend on the amount of water filtered. Therefore, we can only visualize a smooth trend in catch-per-unit-effort (CPUE) for a given amount of water filtered.
# Let's take a look at volume filtered:
plot(Volume ~ Julian, data=Chum.R)
abline(h=mean(Chum.R$Vol), col=4, lty=2)
# Note that volume filtered decreased over the course of the season, which may explain part of the decline in observed catches!
# Instead of the raw counts, we can plot counts adjusted to mean volume to show the decline in standardized CPUE over time. The adjusted counts are simply the catch-per-unit-volume multiplied by the average volume filtered
plot(Chum/Volume * mean(Volume) ~ Julian, data=Chum, col=2)
# To visualize predicted values, we first need to compute predicted values based on the mean volume filtered. We set up a data frame that contains a range of Julian days and the mean Volume:
dat <- data.frame(Julian = 130:190, Volume = mean(Chum.R$Vol))
# We now predict catches assuming that the volume filtered is constant (at the mean) as specified in the 'dat' data frame:
Fitted <- predict(fit2, newdata=dat, se=T)
mu <- exp(Fitted$fit) # predicted values on response scale
lower <- exp(Fitted$fit - 1.96 * Fitted$se.fit)
upper <- exp(Fitted$fit + 1.96 * Fitted$se.fit)
x <- dat$Julian
lines(x, mu)
lines(x, upper, lty=2)
lines(x, lower, lty=2)
# Evaluating Poisson fit:
summary(fit2)
##
## Call:
## glm(formula = Chum ~ Julian + offset(log(Volume)), family = poisson,
## data = Chum.R)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 12.527967 0.545408 22.97 <2e-16 ***
## Julian -0.075515 0.003817 -19.78 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 1078.38 on 105 degrees of freedom
## Residual deviance: 463.31 on 104 degrees of freedom
## AIC: 691.86
##
## Number of Fisher Scoring iterations: 5
# The counts are clearly overdispersed as is evident by comparing the residual deviance to residual d.f., which should be roughly the same if the Poisson assumption is valid.
# To try to account for the apparent overdispersion, we fit a negative binomial model. 'glm' does not currently accommodate the negative binomial, hence we use function 'glm.nb' from the package MASS:
library(MASS)
fit3 <- glm.nb(Chum ~ Julian + offset(log(Volume)),
data = Chum.R)
summary(fit3)
##
## Call:
## glm.nb(formula = Chum ~ Julian + offset(log(Volume)), data = Chum.R,
## init.theta = 1.574571029, link = log)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 13.463046 1.158013 11.63 <2e-16 ***
## Julian -0.081607 0.007576 -10.77 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(1.5746) family taken to be 1)
##
## Null deviance: 256.27 on 105 degrees of freedom
## Residual deviance: 101.62 on 104 degrees of freedom
## AIC: 433.92
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 1.575
## Std. Err.: 0.363
##
## 2 x log-likelihood: -427.922
# Note that the residual deviance is now very similar to the d.f. and that there is still a highly significant decrease in CPUE over time (no surprise there!)
# Model diagnostics:
plot(fit3,c(1,3:5))

# Note that the distribution of the standardized residuals looks much better although there are a few relatively large and somewhat influential values, in particular observation '119', which has the largest standardized residual and the largest Cook's Distance. This observation corresponds to a count of 8 chum salmon that occurs later in the time series (when the mean count is generally very low)
# While the difference in the models here is striking and the negative binomial results in an obvious improvement over the Poisson based on visual inspection of residuals and comparison of the deviance, we can also conveniently compare the negative binomial model fit and the Poisson fit, which is nested in the negative binomial model, using a likelihood ratio test:
# Unfortunately, our usual approach using the 'anova' function does not work here as it does not know how to handle the 'glm.nb' output (it does not recognize the additional parameter estimated in the glm.nb model and no test is returned):
anova(fit2,fit3, test="LRT")
## Analysis of Deviance Table
##
## Model 1: Chum ~ Julian + offset(log(Volume))
## Model 2: Chum ~ Julian + offset(log(Volume))
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 104 463.31
## 2 104 101.62 0 361.69
# You can compute the LRT by hand as we've done before (using 'logLik' to extract the likelihood from each model), or we can use a convenient function in the 'lmtest' package:
library(lmtest)
## Warning: package 'lmtest' was built under R version 4.3.3
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.3.3
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
lrtest(fit2, fit3)
## Likelihood ratio test
##
## Model 1: Chum ~ Julian + offset(log(Volume))
## Model 2: Chum ~ Julian + offset(log(Volume))
## #Df LogLik Df Chisq Pr(>Chisq)
## 1 2 -343.93
## 2 3 -213.96 1 259.94 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# You can verify the observed Chisq value or the LRT statistic:
-2 * (logLik(fit2) - logLik(fit3))
## 'log Lik.' 259.9414 (df=2)
# Based on the reduction in the negative log-likelihood achieved by adding a single parameter (the 'theta' parameter of 'glm.nb'), the negative binomial model provides a better fit, and the difference is highly significant. This is consistent with our earlier observation that the Poisson counts are highly over-dispersed!
# A considerable difference in the fits is also readily apparent when we compare the AIC values:
AIC(fit2, fit3)
## df AIC
## fit2 2 691.8639
## fit3 3 433.9224
# Here lower is better, and the substantial reduction in AIC
# (corresponding to a substantial difference in log-likelihoods, adjusted for the number of extra parameters) also provides strong support for the negative binomial model.

# Exercise 2: Follow the template for fit2 above to plot counts over time (adjusted to mean volume) and superimpose the predicted values from the NB model!
plot(Chum/Volume * mean(Volume) ~ Julian, data=Chum, col=2)
dat <- data.frame(Julian = 130:190, Volume = mean(Chum.R$Vol))
Fitted <- predict(fit3, newdata=dat, se=T)
mu <- exp(Fitted$fit) # predicted values on response scale
lower <- exp(Fitted$fit - 1.96 * Fitted$se.fit)
upper <- exp(Fitted$fit + 1.96 * Fitted$se.fit)
x <- dat$Julian
lines(x, mu)
lines(x, upper, lty=2)
lines(x, lower, lty=2)

# Compare the confidence bands of the two models!
#the confidence bands for fit3 are much wider from julian day 130-140.
### Zero-inflated models
# We are now ready to address the potential for extra zeros in the data, which can also contribute to the overdisperion problem
# Recall the large number of zeros, which are clearly evident in a barplot that shows the number of sets (days) that caught a given number of fish:
x <- barplot(tabulate(Chum.R$Chum+1))
axis(1, at=x, lab=0:max(Chum.R$Chum))

# Note that most of the zeros occur later in the season when the mean counts are very low, hence many of them may be a natural part of the NB distribution, which has more zeros when the mean count is low (rather than reflecting "extra" zeros)
# Whether there are extra zeros or not is difficult to evaluate, but we can fit a hurdle model or a zero-inflated model and compare the fits:
# We will use the 'pscl' package, which contains functions for fitting hurdle models and zero-inflated models
if(!("pscl" %in% installed.packages())) {
install.packages("pscl")
}
library(pscl)
## Warning: package 'pscl' was built under R version 4.3.3
## Classes and Methods for R originally developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University (2002-2015),
## by and under the direction of Simon Jackman.
## hurdle and zeroinfl functions by Achim Zeileis.
# We fit a hurdle model that uses a negative binomial model for the counts, a binomial model for modeling the probability of zeros (which is the default, with logit link) and we use an offset as before. The same variables ('Julian' and the offset) are included as predictor variables for both the binomial and NB components (by default):
fit4 <- hurdle(Chum ~ Julian + offset(log(Volume)),
dist="negbin", data = Chum.R)
summary(fit4)
##
## Call:
## hurdle(formula = Chum ~ Julian + offset(log(Volume)), data = Chum.R,
## dist = "negbin")
##
## Pearson residuals:
## Min 1Q Median 3Q Max
## -1.1024 -0.5946 -0.3383 0.3631 3.3372
##
## Count model coefficients (truncated negbin with log link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 12.76400 1.63324 7.815 5.49e-15 ***
## Julian -0.07692 0.01102 -6.980 2.96e-12 ***
## Log(theta) 0.37225 0.31283 1.190 0.234
## Zero hurdle model coefficients (binomial with logit link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 17.97077 3.54111 5.075 3.88e-07 ***
## Julian -0.10738 0.02104 -5.105 3.31e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Theta: count = 1.451
## Number of iterations in BFGS optimization: 9
## Log-likelihood: -214 on 5 Df
# (Note: If you use the argument 'offset = log(Volume)' instead of the offset in the model formula, the offset is only applied to the count portion of the model, but not to the binomial or 'hurdle' portion. However, we should apply the offset to both as the 'odds-ratio' or the probability of catching chum versus not catching chum should increase linearly with volume filtered. In other words, if we filter twice as much water, we have twice the chance of catching something)
# Note that the output contains two sets of coefficients, one for the hurdle component and one for the truncated count component (modeled via NB)! The NB component includes an estimate of the theta parameter (on the log-scale). Note that both coefficients for Julian day are negative and significantly different from zero, implying that there is a significant decrease over time in both the probability of catching chums (fewer 'successes' as the season progresses) and in the number of chum salmon being caught.
# Let's visualize the 'hurdle' component first (that is, the proportion of positive catches):
d <- 130:190
(cf <- coef(fit4)[3:4]) # Coefficients for hurdle component
## zero_(Intercept) zero_Julian
## 17.9707734 -0.1073826
p.logit <- cf[1] + cf[2] * d
plot(Chum > 0 ~ Julian, data = Chum.R, pch="|", cex=2)
lines(d, exp(p.logit) / (1 + exp(p.logit)), type="l", lwd=3)

# The output also contains the estimate (plus standard errors) of the overdispersion parameter 'theta' of the negative binomial:
# Var(y) = mu + mu^2 / theta
# where a larger theta implies a smaller variance
thetafit4<- fit4$theta
thetafit2 <-fit2$theta
# Exercise 3: What is the amount of "extra variance" relative to the Poisson variance (mu) at mean counts of 5 and 10? As per the above formula, the variance depends on both the mean count (mu) and on the 'theta' parameter.
#Var(y) = mu + mu^2/theta
5+5^2/thetafit4
## count
## 22.22954
10+10^2/thetafit4
## count
## 78.91814
#variance increases as the mean counts increase, and its higher for both mean counts with fit4 as expected and as shown in the CI bands for both models
# Let's once more visualize the fitted model, this time the predicted values based on the hurdle model:
# First, plot mean-adjusted chum salmon counts
plot(Chum/Volume * mean(Volume) ~ Julian,
data=Chum, col=2, xlab = "Julian Day",
ylab = "Counts adjusted to mean volume")
# Compute predicted values based on the average volume filtered:
dat <- data.frame(Julian = 130:190, Volume = mean(Chum.R$Vol))
# Currently, the 'predict' function for hurdle models does not compute standard errors, hence we only plot the fitted values
# (We could compute standard errors, but it's a bit more work!)
Fitted <- predict(fit4, newdata=dat, type="response")
lines(dat$Julian, Fitted, col=4)

# Compare the fit of the negative binomial without extra zeros to the hurdle model:
AIC(fit3, fit4)
## df AIC
## fit3 3 433.9224
## fit4 5 438.0458
### Which model fits better according to the AIC criterion?
#**********fit 3 (433, vs 438 fit4)
# Note that 'plot' does not have a method for hurdle models, so we can't get our usual basic diagnostic plots either, but we can extract residuals' to look at some relevant plots ('Deviance' residuals are not available, hence we use 'Pearson' residuals)
r <- resid(fit4, type="pearson")
# Residuals versus fitted values:
plot(fitted(fit4), r); abline(h=0, lty=2, col=4)

# No strong heteroscedasticity, but 2 fairly large outliers (> 3)
hist(r) # Outliers are also evident in the histogram

# We might want to refit the model without these outliers to evaluate their influence on the fitted model.
# Exercise 4: Read the help file for 'zeroinfl' and fit an equivalent model using a zero-inflated negative binomial distribution.
?zeroinfl
fit5 <- zeroinfl(Chum ~ Julian + offset(log(Volume)), dist="negbin", data=Chum.R)
summary(fit5)
##
## Call:
## zeroinfl(formula = Chum ~ Julian + offset(log(Volume)), data = Chum.R,
## dist = "negbin")
##
## Pearson residuals:
## Min 1Q Median 3Q Max
## -1.1584 -0.6282 -0.3766 0.3578 3.5287
##
## Count model coefficients (negbin with log link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 13.12139 1.50140 8.739 < 2e-16 ***
## Julian -0.07927 0.01007 -7.871 3.53e-15 ***
## Log(theta) 0.46988 0.25568 1.838 0.0661 .
##
## Zero-inflation model coefficients (binomial with logit link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -28.7855 89.9408 -0.32 0.749
## Julian 0.1435 0.4789 0.30 0.764
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Theta = 1.5998
## Number of iterations in BFGS optimization: 34
## Log-likelihood: -213.9 on 5 Df
AIC(fit3, fit4, fit5)
## df AIC
## fit3 3 433.9224
## fit4 5 438.0458
## fit5 5 437.7616
# Compare results from the zero-inflated model to those from the hurdle model!
#zero inflated model is very similar to the hurdle model according to AIC scores.
# Does the model suggest a significant amount of "zero-inflation"? Why or why not?
#no, fit3 has the best score so that suggest the negative binomial model already deals wiht the zeroes well enough and the hurdle & zero inflation models are not necessary here. also, the zero inflated coefficients
###########
# So far, we have not worried about the time series nature of the data but it is very likely that there may be temporal autocorrelation in the counts
# Exercise 5: Plot the residuals from 'fit4' over time (Julian day) and connect residuals with a line (which makes it easier to spot time trends)
plot(Chum.R$Julian, resid(fit4, type="pearson"), type="b")
abline(h=0, lty=2)

# Is there evidence of auto-correlation?
#yes a little, there are clumps of negative residuals at 150-170 and then some high resids right after
# We will tackle autocorrelation in a future lab!