Needed libraries
library(car)
library(MASS)
library(faraway)
library(nortest)
library(mfp)
library(ggplot2)
library(mgcv)
# library(glmmADMB)
Load an example data set
data(iris)
see the data
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
str(iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
summary(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.30 Min. :2.00 Min. :1.00 Min. :0.1
## 1st Qu.:5.10 1st Qu.:2.80 1st Qu.:1.60 1st Qu.:0.3
## Median :5.80 Median :3.00 Median :4.35 Median :1.3
## Mean :5.84 Mean :3.06 Mean :3.76 Mean :1.2
## 3rd Qu.:6.40 3rd Qu.:3.30 3rd Qu.:5.10 3rd Qu.:1.8
## Max. :7.90 Max. :4.40 Max. :6.90 Max. :2.5
## Species
## setosa :50
## versicolor:50
## virginica :50
##
##
##
Scatter plots to explore the data and relations between variables
scatterplotMatrix(~Sepal.Length + Sepal.Width + Petal.Length + Petal.Width |
Species, data = iris)
Exploratory plots with ggplot2
ggplot(iris, aes(Sepal.Length, Sepal.Width)) + geom_point() + facet_grid(Species ~
.) + stat_smooth(method = "lm")
The null model
fit1 <- lm(Petal.Width ~ 1, data = iris)
summary(fit1)
##
## Call:
## lm(formula = Petal.Width ~ 1, data = iris)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.099 -0.899 0.101 0.601 1.301
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.1993 0.0622 19.3 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.762 on 149 degrees of freedom
A univariate linear model
fit2 <- lm(Petal.Width ~ Sepal.Length, data = iris)
summary(fit2)
##
## Call:
## lm(formula = Petal.Width ~ Sepal.Length, data = iris)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.9667 -0.3594 -0.0179 0.2839 1.2333
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.2002 0.2569 -12.5 <2e-16 ***
## Sepal.Length 0.7529 0.0435 17.3 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.44 on 148 degrees of freedom
## Multiple R-squared: 0.669, Adjusted R-squared: 0.667
## F-statistic: 299 on 1 and 148 DF, p-value: <2e-16
The complete multivariate simple effects model
fit3 <- lm(Petal.Width ~ Sepal.Length + Petal.Length + Sepal.Width, data = iris)
summary(fit3)
##
## Call:
## lm(formula = Petal.Width ~ Sepal.Length + Petal.Length + Sepal.Width,
## data = iris)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.6096 -0.1013 -0.0109 0.0983 0.6069
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.2403 0.1784 -1.35 0.18
## Sepal.Length -0.2073 0.0475 -4.36 2.4e-05 ***
## Petal.Length 0.5241 0.0245 21.40 < 2e-16 ***
## Sepal.Width 0.2228 0.0489 4.55 1.1e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.192 on 146 degrees of freedom
## Multiple R-squared: 0.938, Adjusted R-squared: 0.937
## F-statistic: 734 on 3 and 146 DF, p-value: <2e-16
Categorical variables are treated as dummy variables
contrasts(iris$Species)
## versicolor virginica
## setosa 0 0
## versicolor 1 0
## virginica 0 1
fit4 <- lm(Petal.Width ~ Sepal.Length + Petal.Length + Sepal.Width + Species,
data = iris)
summary(fit4)
##
## Call:
## lm(formula = Petal.Width ~ Sepal.Length + Petal.Length + Sepal.Width +
## Species, data = iris)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.5924 -0.0829 -0.0135 0.0877 0.4524
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.4731 0.1766 -2.68 0.0082 **
## Sepal.Length -0.0929 0.0446 -2.08 0.0389 *
## Petal.Length 0.2422 0.0488 4.96 2.0e-06 ***
## Sepal.Width 0.2422 0.0478 5.07 1.2e-06 ***
## Speciesversicolor 0.6481 0.1231 5.26 5.0e-07 ***
## Speciesvirginica 1.0464 0.1655 6.32 3.0e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.167 on 144 degrees of freedom
## Multiple R-squared: 0.954, Adjusted R-squared: 0.952
## F-statistic: 595 on 5 and 144 DF, p-value: <2e-16
anova(fit4)
## Analysis of Variance Table
##
## Response: Petal.Width
## Df Sum Sq Mean Sq F value Pr(>F)
## Sepal.Length 1 57.9 57.9 2086.3 < 2e-16 ***
## Petal.Length 1 22.5 22.5 810.8 < 2e-16 ***
## Sepal.Width 1 0.8 0.8 27.5 5.4e-07 ***
## Species 2 1.4 0.7 24.9 5.1e-10 ***
## Residuals 144 4.0 0.0
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Manually comparing nested models:
anova(fit3, fit4, test = "LRT")
## Analysis of Variance Table
##
## Model 1: Petal.Width ~ Sepal.Length + Petal.Length + Sepal.Width
## Model 2: Petal.Width ~ Sepal.Length + Petal.Length + Sepal.Width + Species
## Res.Df RSS Df Sum of Sq Pr(>Chi)
## 1 146 5.38
## 2 144 4.00 2 1.38 1.5e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Automatic variable selection using AIC…
step(fit4)
## Start: AIC=-531.7
## Petal.Width ~ Sepal.Length + Petal.Length + Sepal.Width + Species
##
## Df Sum of Sq RSS AIC
## <none> 4.00 -532
## - Sepal.Length 1 0.121 4.12 -529
## - Petal.Length 1 0.683 4.68 -510
## - Sepal.Width 1 0.714 4.71 -509
## - Species 2 1.383 5.38 -491
##
## Call:
## lm(formula = Petal.Width ~ Sepal.Length + Petal.Length + Sepal.Width +
## Species, data = iris)
##
## Coefficients:
## (Intercept) Sepal.Length Petal.Length
## -0.4731 -0.0929 0.2422
## Sepal.Width Speciesversicolor Speciesvirginica
## 0.2422 0.6481 1.0464
A highly complex model with complex interactions, but not necessarily the best approach
fit5 <- lm(Petal.Width ~ Sepal.Length * Petal.Length * Sepal.Width * Species,
data = iris)
summary(fit5)
Automatic AIC methods can easily result in complex and over-fitted models
fit6 <- step(fit5)
summary(fit6)
Load the data
saw <- read.table("Saw_FL.csv", header = TRUE, sep = ",", dec = ".")
head(saw, 10)
## Region Depthm Sizecm
## 1 5 0.9 30.5
## 2 3 3.0 30.5
## 3 3 0.3 38.1
## 4 4 0.3 45.0
## 5 2 0.9 45.0
## 6 5 0.2 45.7
## 7 5 0.2 45.7
## 8 5 0.2 45.7
## 9 4 0.2 45.7
## 10 4 0.3 45.7
str(saw)
## 'data.frame': 2841 obs. of 3 variables:
## $ Region: int 5 3 3 4 2 5 5 5 4 4 ...
## $ Depthm: num 0.9 3 0.3 0.3 0.9 0.2 0.2 0.2 0.2 0.3 ...
## $ Sizecm: num 30.5 30.5 38.1 45 45 45.7 45.7 45.7 45.7 45.7 ...
Define region as a factor
saw$Region <- factor(saw$Region)
str(saw)
## 'data.frame': 2841 obs. of 3 variables:
## $ Region: Factor w/ 8 levels "1","2","3","4",..: 5 3 3 4 2 5 5 5 4 4 ...
## $ Depthm: num 0.9 3 0.3 0.3 0.9 0.2 0.2 0.2 0.2 0.3 ...
## $ Sizecm: num 30.5 30.5 38.1 45 45 45.7 45.7 45.7 45.7 45.7 ...
contrasts(saw$Region)
## 2 3 4 5 6 7 8
## 1 0 0 0 0 0 0 0
## 2 1 0 0 0 0 0 0
## 3 0 1 0 0 0 0 0
## 4 0 0 1 0 0 0 0
## 5 0 0 0 1 0 0 0
## 6 0 0 0 0 1 0 0
## 7 0 0 0 0 0 1 0
## 8 0 0 0 0 0 0 1
Different set of contrasts to have another region as the baseline
saw$Region <- factor(saw$Region, levels = c("8", "1", "2", "3", "4", "5", "6",
"7"))
contrasts(saw$Region)
## 1 2 3 4 5 6 7
## 8 0 0 0 0 0 0 0
## 1 1 0 0 0 0 0 0
## 2 0 1 0 0 0 0 0
## 3 0 0 1 0 0 0 0
## 4 0 0 0 1 0 0 0
## 5 0 0 0 0 1 0 0
## 6 0 0 0 0 0 1 0
## 7 0 0 0 0 0 0 1
Reset the regions as they were originally
saw$Region <- factor(saw$Region, levels = c("1", "2", "3", "4", "5", "6", "7",
"8"))
Let's see the effects of Region and Depth on the specimens sizes
qplot(Region, Sizecm, geom = c("boxplot"), data = saw)
qplot(Sizecm, Depthm, data = saw) + geom_smooth()
qplot(Sizecm, Depthm, data = saw, colour = Region) + facet_grid(. ~ Region)
Let's check if the response is Normally distributed
qplot(Sizecm, data = saw)
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust
## this.
lillie.test(saw$Sizecm)
##
## Lilliefors (Kolmogorov-Smirnov) normality test
##
## data: saw$Sizecm
## D = 0.1516, p-value < 2.2e-16
Compare a LM with a GLM with Normal errors and identity link function
fit7 <- lm(Sizecm ~ Depthm + Region, data = saw)
fit8 <- glm(Sizecm ~ Depthm + Region, family = gaussian, data = saw)
cbind(fit7$coef, fit8$coef)
## [,1] [,2]
## (Intercept) 152.882525 152.882525
## Depthm 3.869187 3.869187
## Region2 29.617951 29.617951
## Region3 3.871427 3.871427
## Region4 -0.008396 -0.008396
## Region5 78.619636 78.619636
## Region6 122.880689 122.880689
## Region7 140.456897 140.456897
## Region8 106.097652 106.097652
What about the residuals of the linear model…
plot(fitted(fit8), rstandard(fit8))
fit9 <- glm(Sizecm ~ Depthm + Region, family = Gamma(link = log), data = saw)
summary(fit9)
##
## Call:
## glm(formula = Sizecm ~ Depthm + Region, family = Gamma(link = log),
## data = saw)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.716 -0.494 -0.144 0.245 1.657
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.03955 0.12460 40.44 < 2e-16 ***
## Depthm 0.01935 0.00133 14.57 < 2e-16 ***
## Region2 0.12470 0.15261 0.82 0.41393
## Region3 0.01403 0.12551 0.11 0.91101
## Region4 -0.01120 0.12651 -0.09 0.92947
## Region5 0.37984 0.12622 3.01 0.00264 **
## Region6 0.49194 0.13974 3.52 0.00044 ***
## Region7 0.56939 0.13850 4.11 4.1e-05 ***
## Region8 0.43265 0.24040 1.80 0.07202 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Gamma family taken to be 0.2948)
##
## Null deviance: 994.99 on 2840 degrees of freedom
## Residual deviance: 773.37 on 2832 degrees of freedom
## AIC: 33523
##
## Number of Fisher Scoring iterations: 8
Pseudo-R2 (Proportion of Deviance explained)
1 - (773.37/2832)/(994.99/2840) #R2 = 22.05%
## [1] 0.2205
Let's check this assumption visually with a GAM
plot(gam(Sizecm ~ s(Depthm), family = Gamma(link = log), data = saw))
Scale and functional form transformations with fractional polynomials
fit10 <- mfp(Sizecm ~ fp(Depthm) + Region, family = Gamma(log), data = saw,
maxit = 1000)
summary(fit10)
##
## Call:
## glm(formula = Sizecm ~ Region + I(((Depthm + 0.1)/10)^-1) + I(((Depthm +
## 0.1)/10)^-1 * log(((Depthm + 0.1)/10))), family = Gamma(log),
## data = saw)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.4674 -0.3689 -0.0905 0.2070 2.5594
##
## Coefficients:
## Estimate Std. Error
## (Intercept) 5.63643 0.11010
## Region2 0.04519 0.13378
## Region3 0.03389 0.11030
## Region4 0.21484 0.11145
## Region5 0.41198 0.11064
## Region6 0.43642 0.12184
## Region7 0.53688 0.12113
## Region8 0.37538 0.21014
## I(((Depthm + 0.1)/10)^-1) -0.12998 0.00668
## I(((Depthm + 0.1)/10)^-1 * log(((Depthm + 0.1)/10))) -0.02604 0.00177
## t value Pr(>|t|)
## (Intercept) 51.19 < 2e-16 ***
## Region2 0.34 0.73553
## Region3 0.31 0.75866
## Region4 1.93 0.05400 .
## Region5 3.72 0.00020 ***
## Region6 3.58 0.00035 ***
## Region7 4.43 9.7e-06 ***
## Region8 1.79 0.07415 .
## I(((Depthm + 0.1)/10)^-1) -19.45 < 2e-16 ***
## I(((Depthm + 0.1)/10)^-1 * log(((Depthm + 0.1)/10))) -14.68 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Gamma family taken to be 0.2255)
##
## Null deviance: 994.99 on 2840 degrees of freedom
## Residual deviance: 556.82 on 2831 degrees of freedom
## AIC: 32556
##
## Number of Fisher Scoring iterations: 5
Transformations in Depth to make it a linear predictor
saw$Depthm1 <- I(((saw$Depthm + 0.1)/10)^-1)
saw$Depthm2 <- I(((saw$Depthm + 0.1)/10)^-1 * log(((saw$Depthm + 0.1)/10)))
Transformations in Depth to make it a linear predictor
fit11 <- glm(Sizecm ~ Depthm1 + Depthm2 + Region, family = Gamma(link = log),
data = saw)
summary(fit11)
##
## Call:
## glm(formula = Sizecm ~ Depthm1 + Depthm2 + Region, family = Gamma(link = log),
## data = saw)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.4674 -0.3689 -0.0905 0.2070 2.5594
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.63643 0.11010 51.19 < 2e-16 ***
## Depthm1 -0.12998 0.00668 -19.45 < 2e-16 ***
## Depthm2 -0.02604 0.00177 -14.68 < 2e-16 ***
## Region2 0.04519 0.13378 0.34 0.73553
## Region3 0.03389 0.11030 0.31 0.75866
## Region4 0.21484 0.11145 1.93 0.05400 .
## Region5 0.41198 0.11064 3.72 0.00020 ***
## Region6 0.43642 0.12184 3.58 0.00035 ***
## Region7 0.53688 0.12113 4.43 9.7e-06 ***
## Region8 0.37538 0.21014 1.79 0.07415 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Gamma family taken to be 0.2255)
##
## Null deviance: 994.99 on 2840 degrees of freedom
## Residual deviance: 556.82 on 2831 degrees of freedom
## AIC: 32556
##
## Number of Fisher Scoring iterations: 5
1 - (fit11$deviance/fit11$df.residual)/(fit11$null.deviance/fit11$df.null) #R2 = 43.9%
## [1] 0.4386
Goodness-of-fit comparisons
AIC(fit9, fit11)
## df AIC
## fit9 10 33523
## fit11 11 32556
BIC(fit9, fit11)
## df BIC
## fit9 10 33583
## fit11 11 32621
anova(fit9, fit11, test = "LRT")
## Analysis of Deviance Table
##
## Model 1: Sizecm ~ Depthm + Region
## Model 2: Sizecm ~ Depthm1 + Depthm2 + Region
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 2832 773
## 2 2831 557 1 217 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual analysis: Still some outliers and trends, but much better than a LM
par(mfrow = c(2, 2))
plot(fit11)
We could have used a GAM instead of a GLM with transformations in the variables
fit12 <- gam(Sizecm ~ s(Depthm) + Region, family = Gamma(link = log), data = saw)
summary(fit12)
##
## Family: Gamma
## Link function: log
##
## Formula:
## Sizecm ~ s(Depthm) + Region
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.0337 0.1053 47.81 < 2e-16 ***
## Region2 0.0265 0.1289 0.21 0.83717
## Region3 0.0217 0.1060 0.21 0.83756
## Region4 0.1446 0.1071 1.35 0.17710
## Region5 0.4026 0.1067 3.77 0.00016 ***
## Region6 0.4456 0.1203 3.71 0.00022 ***
## Region7 0.5360 0.1175 4.56 5.3e-06 ***
## Region8 0.3433 0.2045 1.68 0.09327 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(Depthm) 8.82 8.99 118 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.381 Deviance explained = 40.3%
## GCV score = 0.21148 Scale est. = 0.21023 n = 2841
Prediction with the GLM vs. GAM: Predict the size of 1 specimen captured in region 7 at 1m of depth
# GLM
Depthm1.pred <- I(((1 + 0.1)/10)^-1)
Depthm2.pred <- I(((1 + 0.1)/10)^-1 * log(((1 + 0.1)/10)))
Depthm1.pred # 1st transformed depth parameter
## [1] 9.091
Depthm2.pred # 2nd transformed depth parameter
## [1] -20.07
x.glm <- data.frame(Depthm1 = 9.090909, Depthm2 = -20.06614, Region = "7")
pred.glm <- predict(fit11, new = x.glm, se = T, type = "response")
pred.glm$fit # predicted size is 248.2 cm
## 1
## 248.2
pred.glm$se.fit # SE=13.3
## 1
## 13.29
# GLM
x.gam <- data.frame(Depthm = 1, Region = "7")
pred.gam <- predict.gam(fit12, newdata = x.gam, se.fit = T, type = "response")
pred.gam$fit # Predicted size is 227.7 cm
## 1
## 227.7
pred.gam$se.fit # SE=12.0
## 1
## 12.04
#load dataset
cpue <- read.table("cpue.csv", header = T, sep = ",", dec = ".", na.strings = "-9999")
# factors
cpue$Vessel <- factor(cpue$Vessel)
cpue$Gear <- factor(cpue$Gear)
cpue$Season <- factor(cpue$Season)
cpue$Year <- factor(cpue$Year)
str(cpue)
## 'data.frame': 473 obs. of 9 variables:
## $ Vessel: Factor w/ 2 levels "1","2": 1 1 1 1 1 1 1 1 1 1 ...
## $ Hookn : int 1360 1360 1360 1360 1360 1360 1360 1360 1360 1360 ...
## $ Gear : Factor w/ 2 levels "1","2": 1 1 1 1 1 1 1 1 1 1 ...
## $ Season: Factor w/ 4 levels "1","2","3","4": 1 1 1 1 1 1 1 1 1 1 ...
## $ Year : Factor w/ 4 levels "2008","2009",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ n : int 7 19 13 22 25 13 17 15 7 14 ...
## $ cpue : num 5.15 13.97 9.56 16.18 18.38 ...
## $ chlo : num 0.0921 0.1056 0.0794 0.097 0.1107 ...
## $ sst : num 27.5 27.7 27.7 28 28.5 ...
How many sets do we have with zero catches
cpue$PositiveSet <- ifelse(cpue$n >= 1, 1, 0)
table(cpue$PositiveSet)
##
## 0 1
## 8 465
8/465 * 100 #1.7% of fishing sets with 0 catches
## [1] 1.72
Is the response variable Normally distributed?
qplot(cpue, data = cpue) # and don't forget that we also have some zeros
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust
## this.
fit13 <- glm(cpue ~ Gear + Season + Year + chlo + sst, family = Gamma(link = log),
data = cpue)
## Error: non-positive values not allowed for the gamma family
Why do we have this error? What is the log of zero?
log(0)
## [1] -Inf
We can add a “small” constant because the obs=0 are not that many
fit14 <- glm(cpue + 1 ~ Gear + Season + Year + chlo + sst, family = Gamma(link = log),
data = cpue)
summary(fit14)
##
## Call:
## glm(formula = cpue + 1 ~ Gear + Season + Year + chlo + sst, family = Gamma(link = log),
## data = cpue)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.281 -0.442 -0.147 0.229 1.743
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.5652 0.3414 7.51 3.0e-13 ***
## Gear2 0.8002 0.0829 9.65 < 2e-16 ***
## Season2 -0.3293 0.1366 -2.41 0.01634 *
## Season3 0.5076 0.1224 4.15 4.0e-05 ***
## Season4 0.2134 0.1018 2.10 0.03653 *
## Year2009 0.3643 0.1068 3.41 0.00071 ***
## Year2010 1.0972 0.1210 9.07 < 2e-16 ***
## Year2011 0.8195 0.1062 7.71 7.6e-14 ***
## chlo 0.5185 0.1689 3.07 0.00227 **
## sst -0.0460 0.0119 -3.87 0.00012 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Gamma family taken to be 0.3682)
##
## Null deviance: 362.58 on 472 degrees of freedom
## Residual deviance: 160.46 on 463 degrees of freedom
## AIC: 3408
##
## Number of Fisher Scoring iterations: 7
The residuals are not that bad
par(mfrow = c(2, 2))
plot(fit14)
Evaluate possible Collinearity
vif(fit14) # variance inflation factors
## Gear2 Season2 Season3 Season4 Year2009 Year2010 Year2011 chlo
## 1.831 1.469 2.223 3.324 3.272 2.510 3.339 1.467
## sst
## 2.041
Here we should test for linearity, but let's keep moving as if everything was linear with the predictors
let's try to add one interaction
fit15 <- glm(cpue + 1 ~ Gear + Season + Year + chlo + sst + Gear:sst, family = Gamma(link = log),
data = cpue)
summary(fit15)
##
## Call:
## glm(formula = cpue + 1 ~ Gear + Season + Year + chlo + sst +
## Gear:sst, family = Gamma(link = log), data = cpue)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.285 -0.458 -0.154 0.220 1.740
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.9367 0.6999 2.77 0.0059 **
## Gear2 1.5711 0.7372 2.13 0.0336 *
## Season2 -0.3217 0.1368 -2.35 0.0191 *
## Season3 0.5284 0.1243 4.25 2.6e-05 ***
## Season4 0.2220 0.1025 2.17 0.0307 *
## Year2009 0.3096 0.1221 2.54 0.0115 *
## Year2010 1.0292 0.1384 7.44 5.1e-13 ***
## Year2011 0.7545 0.1250 6.04 3.2e-09 ***
## chlo 0.5117 0.1692 3.02 0.0026 **
## sst -0.0203 0.0278 -0.73 0.4670
## Gear2:sst -0.0298 0.0283 -1.05 0.2935
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Gamma family taken to be 0.3692)
##
## Null deviance: 362.58 on 472 degrees of freedom
## Residual deviance: 160.09 on 462 degrees of freedom
## AIC: 3409
##
## Number of Fisher Scoring iterations: 8
How do we know that more complex is better (in this case it is not…)
anova(fit14, fit15, test = "LRT")
## Analysis of Deviance Table
##
## Model 1: cpue + 1 ~ Gear + Season + Year + chlo + sst
## Model 2: cpue + 1 ~ Gear + Season + Year + chlo + sst + Gear:sst
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 463 160
## 2 462 160 1 0.371 0.32
AIC(fit14, fit15)
## df AIC
## fit14 11 3408
## fit15 12 3409
BIC(fit14, fit15)
## df BIC
## fit14 11 3454
## fit15 12 3459
# we should also test for other interactions, but let's keep things simple
Some residuals that I like to do and see
res.dev.fit15 <- residuals(fit15, type = "deviance")
par(mfcol = c(2, 3))
hist(res.dev.fit15, main = "1) Histogram", col = "gray88", xlab = "Deviance residuals")
plot(fit15, which = 2, main = "2) QQ Plot")
plot(fit15, which = 1, main = "3) Residuals")
halfnorm(res.dev.fit15, main = "4) Residuals halfnormal")
plot(cooks.distance(fit15), ylab = "Cook distances", main = "5) Cook distances")
halfnorm(cooks.distance(fit15), main = "6) Cook distances halfnormal")
fit16 <- glm(n ~ Gear + Season + Year + chlo + sst + offset(log(Hookn)), family = poisson(link = log),
data = cpue)
summary(fit16)
##
## Call:
## glm(formula = n ~ Gear + Season + Year + chlo + sst + offset(log(Hookn)),
## family = poisson(link = log), data = cpue)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -9.858 -2.019 -0.505 1.225 13.806
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.34123 0.10695 -40.59 < 2e-16 ***
## Gear2 0.86826 0.03725 23.31 < 2e-16 ***
## Season2 0.04566 0.06087 0.75 0.45
## Season3 0.65423 0.03738 17.50 < 2e-16 ***
## Season4 0.23571 0.03189 7.39 1.4e-13 ***
## Year2009 0.42694 0.04615 9.25 < 2e-16 ***
## Year2010 1.33073 0.04529 29.38 < 2e-16 ***
## Year2011 0.90496 0.04358 20.77 < 2e-16 ***
## chlo 0.33672 0.04482 7.51 5.8e-14 ***
## sst -0.05718 0.00343 -16.66 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 9414.5 on 472 degrees of freedom
## Residual deviance: 4102.2 on 463 degrees of freedom
## AIC: 6337
##
## Number of Fisher Scoring iterations: 5
But the residuals don't look so good
par(mfcol = c(2, 2))
plot(fit16)
The Variance increases with the predicted values, but in Poisson variance should be constant, ie, the dispersion parameter should be 1
sum(residuals(fit16, type = "pearson")^2/fit16$df.res) #dispersion parameter is 9.7 so our data is over-dispersed
## [1] 9.67
fit17 <- glm.nb(n ~ Gear + Season + Year + chlo + sst + offset(log(Hookn)),
link = log, data = cpue)
summary(fit17)
##
## Call:
## glm.nb(formula = n ~ Gear + Season + Year + chlo + sst + offset(log(Hookn)),
## data = cpue, link = log, init.theta = 3.221256186)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.238 -0.786 -0.246 0.421 3.023
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.4956 0.3346 -13.43 < 2e-16 ***
## Gear2 0.8951 0.0855 10.47 < 2e-16 ***
## Season2 -0.3318 0.1479 -2.24 0.02488 *
## Season3 0.5340 0.1201 4.45 8.7e-06 ***
## Season4 0.2166 0.0999 2.17 0.03016 *
## Year2009 0.4028 0.1091 3.69 0.00022 ***
## Year2010 1.1887 0.1208 9.84 < 2e-16 ***
## Year2011 0.8867 0.1072 8.27 < 2e-16 ***
## chlo 0.5329 0.1627 3.27 0.00106 **
## sst -0.0482 0.0115 -4.17 3.0e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(3.221) family taken to be 1)
##
## Null deviance: 1157.91 on 472 degrees of freedom
## Residual deviance: 518.98 on 463 degrees of freedom
## AIC: 3694
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 3.221
## Std. Err.: 0.248
##
## 2 x log-likelihood: -3671.951
And the residuals look much better
par(mfcol = c(2, 2))
plot(fit17)
Let's compare the models so far
AIC(fit15, fit16, fit17)
## df AIC
## fit15 12 3409
## fit16 10 6337
## fit17 11 3694
fit18 <- glm(cpue + 1 ~ Gear + Season + Year + chlo + sst + Vessel, family = Gamma(link = log),
data = cpue)
summary(fit18)
##
## Call:
## glm(formula = cpue + 1 ~ Gear + Season + Year + chlo + sst +
## Vessel, family = Gamma(link = log), data = cpue)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.271 -0.450 -0.131 0.198 1.764
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.8948 0.3983 7.27 1.6e-12 ***
## Gear2 0.8136 0.0830 9.80 < 2e-16 ***
## Season2 -0.3732 0.1365 -2.73 0.00652 **
## Season3 0.4440 0.1298 3.42 0.00068 ***
## Season4 0.1768 0.1041 1.70 0.09023 .
## Year2009 0.3663 0.1068 3.43 0.00066 ***
## Year2010 1.0778 0.1209 8.91 < 2e-16 ***
## Year2011 0.7300 0.1158 6.31 6.7e-10 ***
## chlo 0.4152 0.1747 2.38 0.01785 *
## sst -0.0587 0.0144 -4.08 5.4e-05 ***
## Vessel2 0.1800 0.1054 1.71 0.08840 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Gamma family taken to be 0.3675)
##
## Null deviance: 362.58 on 472 degrees of freedom
## Residual deviance: 159.31 on 462 degrees of freedom
## AIC: 3406
##
## Number of Fisher Scoring iterations: 11
The vessel effect could be important. In this example there are only 2 vessels, but what if we had sampled many more? We would need n_Vessels-1 df to calculate parameters, and the model could be highly complex to interpret
fit19 <- glmmPQL(cpue + 1 ~ Gear + Season + Year + chlo + sst, random = ~1 |
Vessel, family = Gamma(link = log), data = cpue)
## Loading required package: nlme
## iteration 1
summary(fit19)
## Linear mixed-effects model fit by maximum likelihood
## Data: cpue
## AIC BIC logLik
## NA NA NA
##
## Random effects:
## Formula: ~1 | Vessel
## (Intercept) Residual
## StdDev: 2.144e-05 0.6003
##
## Variance function:
## Structure: fixed weights
## Formula: ~invwt
## Fixed effects: cpue + 1 ~ Gear + Season + Year + chlo + sst
## Value Std.Error DF t-value p-value
## (Intercept) 2.5652 0.3414 462 7.513 0.0000
## Gear2 0.8002 0.0829 462 9.654 0.0000
## Season2 -0.3292 0.1366 462 -2.410 0.0164
## Season3 0.5076 0.1224 462 4.147 0.0000
## Season4 0.2134 0.1018 462 2.097 0.0365
## Year2009 0.3643 0.1068 462 3.409 0.0007
## Year2010 1.0972 0.1210 462 9.067 0.0000
## Year2011 0.8195 0.1062 462 7.713 0.0000
## chlo 0.5186 0.1689 462 3.070 0.0023
## sst -0.0460 0.0119 462 -3.871 0.0001
## Correlation:
## (Intr) Gear2 Seasn2 Seasn3 Seasn4 Yr2009 Yr2010 Yr2011 chlo
## Gear2 -0.131
## Season2 -0.051 -0.130
## Season3 -0.121 -0.414 0.404
## Season4 -0.666 -0.391 0.285 0.474
## Year2009 -0.208 -0.377 -0.134 0.041 0.343
## Year2010 -0.214 -0.432 0.162 0.292 0.442 0.653
## Year2011 -0.243 -0.477 0.131 0.223 0.452 0.748 0.726
## chlo -0.169 -0.023 -0.154 -0.351 0.125 0.000 0.057 0.085
## sst -0.952 0.194 -0.019 0.044 0.519 -0.004 -0.009 0.007 0.122
##
## Standardized Within-Group Residuals:
## Min Q1 Med Q3 Max
## -1.6190 -0.6317 -0.2337 0.4110 4.7863
##
## Number of Observations: 473
## Number of Groups: 2
And the residuals still look acceptable
par(mfcol = c(2, 2))
plot(fit19)
PQL is a fast way to calculate mixed models, but may be inaccurate, and there is no likelihood function. A more powerful way is to use Laplace approximations. Need some extra software (ADMB) for this. For complex GLMM we need MCMC and Bayesian statistics…
library(glmmADMB)
## Loading required package: R2admb
## Attaching package: 'glmmADMB'
## The following object(s) are masked from 'package:MASS':
##
## stepAIC
## The following object(s) are masked from 'package:stats':
##
## step
fit21 <- glmmadmb(cpue + 1 ~ Gear + Season + Year + chlo + sst, random = ~1 |
Vessel, family = "gamma", link = "log", data = cpue)
summary(fit21)
##
## Call:
## glmmadmb(formula = cpue + 1 ~ Gear + Season + Year + chlo + sst,
## data = cpue, family = "gamma", link = "log", random = ~1 |
## Vessel)
##
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.5652 0.3150 8.14 3.9e-16 ***
## Gear2 0.8002 0.0804 9.96 < 2e-16 ***
## Season2 -0.3292 0.1367 -2.41 0.01603 *
## Season3 0.5076 0.1124 4.51 6.3e-06 ***
## Season4 0.2134 0.0926 2.30 0.02125 *
## Year2009 0.3643 0.1014 3.59 0.00033 ***
## Year2010 1.0972 0.1111 9.88 < 2e-16 ***
## Year2011 0.8195 0.1021 8.03 1.0e-15 ***
## chlo 0.5185 0.1621 3.20 0.00138 **
## sst -0.0460 0.0109 -4.20 2.6e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Number of observations: total=473, Vessel=2
## Random effect variance(s):
## Group=Vessel
## Variance StdDev
## (Intercept) 4.983e-09 7.059e-05
## Gamma shape parameter: 3.11 (std. err.: 0.1923)
##
## Log-likelihood: -1693
#compare the GLM with GLMM
AIC(fit18, fit21)
## df AIC
## fit18 12 3406
## fit21 12 3409
BIC(fit18, fit21)
## df BIC
## fit18 12 3456
## fit21 12 3459