CCMAR Open-Source Courses - Modelling with R

Needed libraries

library(car)       # scatterplotMatrix(), Anova(), vif()
library(MASS)      # glm.nb(), glmmPQL()
library(faraway)   # halfnorm()
library(nortest)   # lillie.test()
library(mfp)       # fractional polynomials
library(ggplot2)   # qplot(), ggplot()
library(mgcv)      # gam()
# library(glmmADMB)  # loaded later, for GLMMs fitted via AD Model Builder

- A Linear Model example

Load an example data set

data(iris)

Exploratory data analysis

Look at the data

head(iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
str(iris)
## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
summary(iris)
##   Sepal.Length   Sepal.Width    Petal.Length   Petal.Width 
##  Min.   :4.30   Min.   :2.00   Min.   :1.00   Min.   :0.1  
##  1st Qu.:5.10   1st Qu.:2.80   1st Qu.:1.60   1st Qu.:0.3  
##  Median :5.80   Median :3.00   Median :4.35   Median :1.3  
##  Mean   :5.84   Mean   :3.06   Mean   :3.76   Mean   :1.2  
##  3rd Qu.:6.40   3rd Qu.:3.30   3rd Qu.:5.10   3rd Qu.:1.8  
##  Max.   :7.90   Max.   :4.40   Max.   :6.90   Max.   :2.5  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
## 

Scatterplots to explore the data and the relationships between variables

scatterplotMatrix(~Sepal.Length + Sepal.Width + Petal.Length + Petal.Width | 
    Species, data = iris)

(Figure: scatterplot matrix of the four iris measurements, grouped by Species)

Exploratory plots with ggplot2

ggplot(iris, aes(Sepal.Length, Sepal.Width)) + geom_point() + facet_grid(Species ~ 
    .) + stat_smooth(method = "lm")

(Figure: Sepal.Width against Sepal.Length with linear fits, faceted by Species)

Examples of linear models

The null model

fit1 <- lm(Petal.Width ~ 1, data = iris)
summary(fit1)
## 
## Call:
## lm(formula = Petal.Width ~ 1, data = iris)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -1.099 -0.899  0.101  0.601  1.301 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   1.1993     0.0622    19.3   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 0.762 on 149 degrees of freedom
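
The intercept of the null model is simply the sample mean of the response; a quick check:

mean(iris$Petal.Width)  # about 1.199, matching the intercept estimate above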

A univariate linear model

fit2 <- lm(Petal.Width ~ Sepal.Length, data = iris)
summary(fit2)
## 
## Call:
## lm(formula = Petal.Width ~ Sepal.Length, data = iris)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.9667 -0.3594 -0.0179  0.2839  1.2333 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -3.2002     0.2569   -12.5   <2e-16 ***
## Sepal.Length   0.7529     0.0435    17.3   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 0.44 on 148 degrees of freedom
## Multiple R-squared: 0.669,   Adjusted R-squared: 0.667 
## F-statistic:  299 on 1 and 148 DF,  p-value: <2e-16
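
A quick way to visualise this univariate fit (a minimal base-graphics sketch):

plot(Petal.Width ~ Sepal.Length, data = iris)
abline(fit2, col = "red")  # add the fitted regression line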

The complete multivariate model with main effects only

fit3 <- lm(Petal.Width ~ Sepal.Length + Petal.Length + Sepal.Width, data = iris)
summary(fit3)
## 
## Call:
## lm(formula = Petal.Width ~ Sepal.Length + Petal.Length + Sepal.Width, 
##     data = iris)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.6096 -0.1013 -0.0109  0.0983  0.6069 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -0.2403     0.1784   -1.35     0.18    
## Sepal.Length  -0.2073     0.0475   -4.36  2.4e-05 ***
## Petal.Length   0.5241     0.0245   21.40  < 2e-16 ***
## Sepal.Width    0.2228     0.0489    4.55  1.1e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 0.192 on 146 degrees of freedom
## Multiple R-squared: 0.938,   Adjusted R-squared: 0.937 
## F-statistic:  734 on 3 and 146 DF,  p-value: <2e-16

Categorical variables are treated as dummy variables

contrasts(iris$Species)
##            versicolor virginica
## setosa              0         0
## versicolor          1         0
## virginica           0         1
fit4 <- lm(Petal.Width ~ Sepal.Length + Petal.Length + Sepal.Width + Species, 
    data = iris)
summary(fit4)
## 
## Call:
## lm(formula = Petal.Width ~ Sepal.Length + Petal.Length + Sepal.Width + 
##     Species, data = iris)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.5924 -0.0829 -0.0135  0.0877  0.4524 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -0.4731     0.1766   -2.68   0.0082 ** 
## Sepal.Length       -0.0929     0.0446   -2.08   0.0389 *  
## Petal.Length        0.2422     0.0488    4.96  2.0e-06 ***
## Sepal.Width         0.2422     0.0478    5.07  1.2e-06 ***
## Speciesversicolor   0.6481     0.1231    5.26  5.0e-07 ***
## Speciesvirginica    1.0464     0.1655    6.32  3.0e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 0.167 on 144 degrees of freedom
## Multiple R-squared: 0.954,   Adjusted R-squared: 0.952 
## F-statistic:  595 on 5 and 144 DF,  p-value: <2e-16
anova(fit4)
## Analysis of Variance Table
## 
## Response: Petal.Width
##               Df Sum Sq Mean Sq F value  Pr(>F)    
## Sepal.Length   1   57.9    57.9  2086.3 < 2e-16 ***
## Petal.Length   1   22.5    22.5   810.8 < 2e-16 ***
## Sepal.Width    1    0.8     0.8    27.5 5.4e-07 ***
## Species        2    1.4     0.7    24.9 5.1e-10 ***
## Residuals    144    4.0     0.0                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
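
Note that anova() on a single model uses sequential (Type I) sums of squares, so the results depend on the order of the terms. As a sketch, the car package offers order-independent Type II tests:

Anova(fit4)  # Type II ANOVA table from the car package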

Variable selection

Manually comparing nested models:

anova(fit3, fit4, test = "LRT")
## Analysis of Variance Table
## 
## Model 1: Petal.Width ~ Sepal.Length + Petal.Length + Sepal.Width
## Model 2: Petal.Width ~ Sepal.Length + Petal.Length + Sepal.Width + Species
##   Res.Df  RSS Df Sum of Sq Pr(>Chi)    
## 1    146 5.38                          
## 2    144 4.00  2      1.38  1.5e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Automatic variable selection using AIC…

step(fit4)
## Start:  AIC=-531.7
## Petal.Width ~ Sepal.Length + Petal.Length + Sepal.Width + Species
## 
##                Df Sum of Sq  RSS  AIC
## <none>                      4.00 -532
## - Sepal.Length  1     0.121 4.12 -529
## - Petal.Length  1     0.683 4.68 -510
## - Sepal.Width   1     0.714 4.71 -509
## - Species       2     1.383 5.38 -491
## 
## Call:
## lm(formula = Petal.Width ~ Sepal.Length + Petal.Length + Sepal.Width + 
##     Species, data = iris)
## 
## Coefficients:
##       (Intercept)       Sepal.Length       Petal.Length  
##           -0.4731            -0.0929             0.2422  
##       Sepal.Width  Speciesversicolor   Speciesvirginica  
##            0.2422             0.6481             1.0464

A highly complex model with multi-way interactions, but not necessarily the best approach

fit5 <- lm(Petal.Width ~ Sepal.Length * Petal.Length * Sepal.Width * Species, 
    data = iris)
summary(fit5)

Automatic AIC methods can easily result in complex and over-fitted models

fit6 <- step(fit5)
summary(fit6)
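
To see how much extra complexity step() retained here, we can compare the number of estimated coefficients and the AIC of the two candidate models (a quick sketch):

length(coef(fit4))  # coefficients in the main-effects + Species model
length(coef(fit6))  # coefficients in the stepwise-selected interaction model
AIC(fit4, fit6)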

- An example comparing a GLM with a GAM

Load the data

saw <- read.table("Saw_FL.csv", header = TRUE, sep = ",", dec = ".")
head(saw, 10)
##    Region Depthm Sizecm
## 1       5    0.9   30.5
## 2       3    3.0   30.5
## 3       3    0.3   38.1
## 4       4    0.3   45.0
## 5       2    0.9   45.0
## 6       5    0.2   45.7
## 7       5    0.2   45.7
## 8       5    0.2   45.7
## 9       4    0.2   45.7
## 10      4    0.3   45.7
str(saw)
## 'data.frame':    2841 obs. of  3 variables:
##  $ Region: int  5 3 3 4 2 5 5 5 4 4 ...
##  $ Depthm: num  0.9 3 0.3 0.3 0.9 0.2 0.2 0.2 0.2 0.3 ...
##  $ Sizecm: num  30.5 30.5 38.1 45 45 45.7 45.7 45.7 45.7 45.7 ...

Define Region as a factor

saw$Region <- factor(saw$Region)
str(saw)
## 'data.frame':    2841 obs. of  3 variables:
##  $ Region: Factor w/ 8 levels "1","2","3","4",..: 5 3 3 4 2 5 5 5 4 4 ...
##  $ Depthm: num  0.9 3 0.3 0.3 0.9 0.2 0.2 0.2 0.2 0.3 ...
##  $ Sizecm: num  30.5 30.5 38.1 45 45 45.7 45.7 45.7 45.7 45.7 ...
contrasts(saw$Region)
##   2 3 4 5 6 7 8
## 1 0 0 0 0 0 0 0
## 2 1 0 0 0 0 0 0
## 3 0 1 0 0 0 0 0
## 4 0 0 1 0 0 0 0
## 5 0 0 0 1 0 0 0
## 6 0 0 0 0 1 0 0
## 7 0 0 0 0 0 1 0
## 8 0 0 0 0 0 0 1

A different set of contrasts, with another region as the baseline

saw$Region <- factor(saw$Region, levels = c("8", "1", "2", "3", "4", "5", "6", 
    "7"))
contrasts(saw$Region)
##   1 2 3 4 5 6 7
## 8 0 0 0 0 0 0 0
## 1 1 0 0 0 0 0 0
## 2 0 1 0 0 0 0 0
## 3 0 0 1 0 0 0 0
## 4 0 0 0 1 0 0 0
## 5 0 0 0 0 1 0 0
## 6 0 0 0 0 0 1 0
## 7 0 0 0 0 0 0 1

Reset the region levels to their original order

saw$Region <- factor(saw$Region, levels = c("1", "2", "3", "4", "5", "6", "7", 
    "8"))

Let's look at the effects of Region and Depth on specimen sizes

qplot(Region, Sizecm, geom = c("boxplot"), data = saw)

(Figure: boxplots of Sizecm by Region)

qplot(Sizecm, Depthm, data = saw) + geom_smooth()

(Figure: Depthm against Sizecm with a smoother)

qplot(Sizecm, Depthm, data = saw, colour = Region) + facet_grid(. ~ Region)

(Figure: Depthm against Sizecm, coloured and faceted by Region)

Using an LM with these data

Let's check if the response is Normally distributed

qplot(Sizecm, data = saw)
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust
## this.

(Figure: histogram of Sizecm)

lillie.test(saw$Sizecm)
## 
##  Lilliefors (Kolmogorov-Smirnov) normality test
## 
## data:  saw$Sizecm 
## D = 0.1516, p-value < 2.2e-16
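
A normal QQ-plot gives a complementary visual check of the same assumption (a minimal sketch):

qqnorm(saw$Sizecm)
qqline(saw$Sizecm)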

Compare an LM with a GLM with Normal errors and identity link function

fit7 <- lm(Sizecm ~ Depthm + Region, data = saw)
fit8 <- glm(Sizecm ~ Depthm + Region, family = gaussian, data = saw)
cbind(fit7$coef, fit8$coef)
##                   [,1]       [,2]
## (Intercept) 152.882525 152.882525
## Depthm        3.869187   3.869187
## Region2      29.617951  29.617951
## Region3       3.871427   3.871427
## Region4      -0.008396  -0.008396
## Region5      78.619636  78.619636
## Region6     122.880689 122.880689
## Region7     140.456897 140.456897
## Region8     106.097652 106.097652

What about the residuals of the linear model…

plot(fitted(fit8), rstandard(fit8))

(Figure: standardized residuals against fitted values for fit8)

Using a Gamma GLM with log link

fit9 <- glm(Sizecm ~ Depthm + Region, family = Gamma(link = log), data = saw)
summary(fit9)
## 
## Call:
## glm(formula = Sizecm ~ Depthm + Region, family = Gamma(link = log), 
##     data = saw)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.716  -0.494  -0.144   0.245   1.657  
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.03955    0.12460   40.44  < 2e-16 ***
## Depthm       0.01935    0.00133   14.57  < 2e-16 ***
## Region2      0.12470    0.15261    0.82  0.41393    
## Region3      0.01403    0.12551    0.11  0.91101    
## Region4     -0.01120    0.12651   -0.09  0.92947    
## Region5      0.37984    0.12622    3.01  0.00264 ** 
## Region6      0.49194    0.13974    3.52  0.00044 ***
## Region7      0.56939    0.13850    4.11  4.1e-05 ***
## Region8      0.43265    0.24040    1.80  0.07202 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## (Dispersion parameter for Gamma family taken to be 0.2948)
## 
##     Null deviance: 994.99  on 2840  degrees of freedom
## Residual deviance: 773.37  on 2832  degrees of freedom
## AIC: 33523
## 
## Number of Fisher Scoring iterations: 8

Pseudo-R2 (Proportion of Deviance explained)

1 - (773.37/2832)/(994.99/2840)  #R2 = 22.05%
## [1] 0.2205
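
The same quantity can be computed directly from the fitted object instead of hard-coding the deviances (the approach used for fit11 further below):

1 - (fit9$deviance/fit9$df.residual)/(fit9$null.deviance/fit9$df.null)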

GLM assumption: continuous explanatory variables enter the linear predictor linearly

Let's check this assumption visually with a GAM

plot(gam(Sizecm ~ s(Depthm), family = Gamma(link = log), data = saw))

(Figure: estimated smooth of Depthm from the Gamma GAM)

Scale and functional form transformations with fractional polynomials

fit10 <- mfp(Sizecm ~ fp(Depthm) + Region, family = Gamma(log), data = saw, 
    maxit = 1000)
summary(fit10)
## 
## Call:
## glm(formula = Sizecm ~ Region + I(((Depthm + 0.1)/10)^-1) + I(((Depthm + 
##     0.1)/10)^-1 * log(((Depthm + 0.1)/10))), family = Gamma(log), 
##     data = saw)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.4674  -0.3689  -0.0905   0.2070   2.5594  
## 
## Coefficients:
##                                                      Estimate Std. Error
## (Intercept)                                           5.63643    0.11010
## Region2                                               0.04519    0.13378
## Region3                                               0.03389    0.11030
## Region4                                               0.21484    0.11145
## Region5                                               0.41198    0.11064
## Region6                                               0.43642    0.12184
## Region7                                               0.53688    0.12113
## Region8                                               0.37538    0.21014
## I(((Depthm + 0.1)/10)^-1)                            -0.12998    0.00668
## I(((Depthm + 0.1)/10)^-1 * log(((Depthm + 0.1)/10))) -0.02604    0.00177
##                                                      t value Pr(>|t|)    
## (Intercept)                                            51.19  < 2e-16 ***
## Region2                                                 0.34  0.73553    
## Region3                                                 0.31  0.75866    
## Region4                                                 1.93  0.05400 .  
## Region5                                                 3.72  0.00020 ***
## Region6                                                 3.58  0.00035 ***
## Region7                                                 4.43  9.7e-06 ***
## Region8                                                 1.79  0.07415 .  
## I(((Depthm + 0.1)/10)^-1)                             -19.45  < 2e-16 ***
## I(((Depthm + 0.1)/10)^-1 * log(((Depthm + 0.1)/10)))  -14.68  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## (Dispersion parameter for Gamma family taken to be 0.2255)
## 
##     Null deviance: 994.99  on 2840  degrees of freedom
## Residual deviance: 556.82  on 2831  degrees of freedom
## AIC: 32556
## 
## Number of Fisher Scoring iterations: 5

Create the transformed Depth variables selected by the fractional polynomials

saw$Depthm1 <- I(((saw$Depthm + 0.1)/10)^-1)
saw$Depthm2 <- I(((saw$Depthm + 0.1)/10)^-1 * log(((saw$Depthm + 0.1)/10)))

Refit the GLM using the transformed Depth variables

fit11 <- glm(Sizecm ~ Depthm1 + Depthm2 + Region, family = Gamma(link = log), 
    data = saw)
summary(fit11)
## 
## Call:
## glm(formula = Sizecm ~ Depthm1 + Depthm2 + Region, family = Gamma(link = log), 
##     data = saw)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.4674  -0.3689  -0.0905   0.2070   2.5594  
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.63643    0.11010   51.19  < 2e-16 ***
## Depthm1     -0.12998    0.00668  -19.45  < 2e-16 ***
## Depthm2     -0.02604    0.00177  -14.68  < 2e-16 ***
## Region2      0.04519    0.13378    0.34  0.73553    
## Region3      0.03389    0.11030    0.31  0.75866    
## Region4      0.21484    0.11145    1.93  0.05400 .  
## Region5      0.41198    0.11064    3.72  0.00020 ***
## Region6      0.43642    0.12184    3.58  0.00035 ***
## Region7      0.53688    0.12113    4.43  9.7e-06 ***
## Region8      0.37538    0.21014    1.79  0.07415 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## (Dispersion parameter for Gamma family taken to be 0.2255)
## 
##     Null deviance: 994.99  on 2840  degrees of freedom
## Residual deviance: 556.82  on 2831  degrees of freedom
## AIC: 32556
## 
## Number of Fisher Scoring iterations: 5
1 - (fit11$deviance/fit11$df.residual)/(fit11$null.deviance/fit11$df.null)  #R2 = 43.9%
## [1] 0.4386

Goodness-of-fit comparisons

AIC(fit9, fit11)
##       df   AIC
## fit9  10 33523
## fit11 11 32556
BIC(fit9, fit11)
##       df   BIC
## fit9  10 33583
## fit11 11 32621
anova(fit9, fit11, test = "LRT")
## Analysis of Deviance Table
## 
## Model 1: Sizecm ~ Depthm + Region
## Model 2: Sizecm ~ Depthm1 + Depthm2 + Region
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)    
## 1      2832        773                         
## 2      2831        557  1      217   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual analysis: still some outliers and trends, but much better than an LM

par(mfrow = c(2, 2))
plot(fit11)

(Figure: diagnostic plots for fit11)

We could have used a GAM instead of a GLM with transformed variables

fit12 <- gam(Sizecm ~ s(Depthm) + Region, family = Gamma(link = log), data = saw)
summary(fit12)
## 
## Family: Gamma 
## Link function: log 
## 
## Formula:
## Sizecm ~ s(Depthm) + Region
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   5.0337     0.1053   47.81  < 2e-16 ***
## Region2       0.0265     0.1289    0.21  0.83717    
## Region3       0.0217     0.1060    0.21  0.83756    
## Region4       0.1446     0.1071    1.35  0.17710    
## Region5       0.4026     0.1067    3.77  0.00016 ***
## Region6       0.4456     0.1203    3.71  0.00022 ***
## Region7       0.5360     0.1175    4.56  5.3e-06 ***
## Region8       0.3433     0.2045    1.68  0.09327 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Approximate significance of smooth terms:
##            edf Ref.df   F p-value    
## s(Depthm) 8.82   8.99 118  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## R-sq.(adj) =  0.381   Deviance explained = 40.3%
## GCV score = 0.21148  Scale est. = 0.21023   n = 2841
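
mgcv also provides gam.check() for residual and basis-dimension diagnostics of the smooth terms (a sketch):

gam.check(fit12)  # residual plots plus k-index diagnostics for s(Depthm)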

Prediction with the GLM vs. the GAM: predict the size of one specimen captured in Region 7 at 1 m depth

# GLM
Depthm1.pred <- I(((1 + 0.1)/10)^-1)
Depthm2.pred <- I(((1 + 0.1)/10)^-1 * log(((1 + 0.1)/10)))
Depthm1.pred  # 1st transformed depth parameter
## [1] 9.091
Depthm2.pred  # 2nd transformed depth parameter
## [1] -20.07
x.glm <- data.frame(Depthm1 = Depthm1.pred, Depthm2 = Depthm2.pred, Region = "7")
pred.glm <- predict(fit11, newdata = x.glm, se.fit = TRUE, type = "response")
pred.glm$fit  # predicted size is 248.2 cm
##     1 
## 248.2
pred.glm$se.fit  # SE=13.3
##     1 
## 13.29
# GAM
x.gam <- data.frame(Depthm = 1, Region = "7")
pred.gam <- predict.gam(fit12, newdata = x.gam, se.fit = T, type = "response")
pred.gam$fit  # Predicted size is 227.7 cm
##     1 
## 227.7
pred.gam$se.fit  # SE=12.0
##     1 
## 12.04
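
Approximate 95% confidence intervals for these expected sizes can be sketched from the standard errors (a rough normal approximation on the response scale):

pred.glm$fit + c(-1.96, 1.96) * pred.glm$se.fit  # GLM: lower and upper bounds
pred.gam$fit + c(-1.96, 1.96) * pred.gam$se.fit  # GAM: lower and upper bounds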

- An example with Mixed models

Load the dataset

cpue <- read.table("cpue.csv", header = T, sep = ",", dec = ".", na.strings = "-9999")
# factors
cpue$Vessel <- factor(cpue$Vessel)
cpue$Gear <- factor(cpue$Gear)
cpue$Season <- factor(cpue$Season)
cpue$Year <- factor(cpue$Year)
str(cpue)
## 'data.frame':    473 obs. of  9 variables:
##  $ Vessel: Factor w/ 2 levels "1","2": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Hookn : int  1360 1360 1360 1360 1360 1360 1360 1360 1360 1360 ...
##  $ Gear  : Factor w/ 2 levels "1","2": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Season: Factor w/ 4 levels "1","2","3","4": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Year  : Factor w/ 4 levels "2008","2009",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ n     : int  7 19 13 22 25 13 17 15 7 14 ...
##  $ cpue  : num  5.15 13.97 9.56 16.18 18.38 ...
##  $ chlo  : num  0.0921 0.1056 0.0794 0.097 0.1107 ...
##  $ sst   : num  27.5 27.7 27.7 28 28.5 ...

How many sets have zero catches?

cpue$PositiveSet <- ifelse(cpue$n >= 1, 1, 0)
table(cpue$PositiveSet)
## 
##   0   1 
##   8 465
8/465 * 100  # 8 zero-catch sets relative to 465 positive sets (about 1.7%)
## [1] 1.72
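
Alternatively, the percentage of zero-catch sets over all 473 sets can be computed directly:

mean(cpue$n == 0) * 100  # percentage of sets with zero catch (8 out of 473)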

Is the response variable Normally distributed?

qplot(cpue, data = cpue)  # and don't forget that we also have some zeros
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust
## this.

(Figure: histogram of cpue)

1) Let's try a Gamma GLM

fit13 <- glm(cpue ~ Gear + Season + Year + chlo + sst, family = Gamma(link = log), 
    data = cpue)
## Error: non-positive values not allowed for the gamma family

Why do we have this error? What is the log of zero?

log(0)
## [1] -Inf

We can add a “small” constant because the zero observations are not that many

fit14 <- glm(cpue + 1 ~ Gear + Season + Year + chlo + sst, family = Gamma(link = log), 
    data = cpue)
summary(fit14)
## 
## Call:
## glm(formula = cpue + 1 ~ Gear + Season + Year + chlo + sst, family = Gamma(link = log), 
##     data = cpue)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -2.281  -0.442  -0.147   0.229   1.743  
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   2.5652     0.3414    7.51  3.0e-13 ***
## Gear2         0.8002     0.0829    9.65  < 2e-16 ***
## Season2      -0.3293     0.1366   -2.41  0.01634 *  
## Season3       0.5076     0.1224    4.15  4.0e-05 ***
## Season4       0.2134     0.1018    2.10  0.03653 *  
## Year2009      0.3643     0.1068    3.41  0.00071 ***
## Year2010      1.0972     0.1210    9.07  < 2e-16 ***
## Year2011      0.8195     0.1062    7.71  7.6e-14 ***
## chlo          0.5185     0.1689    3.07  0.00227 ** 
## sst          -0.0460     0.0119   -3.87  0.00012 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## (Dispersion parameter for Gamma family taken to be 0.3682)
## 
##     Null deviance: 362.58  on 472  degrees of freedom
## Residual deviance: 160.46  on 463  degrees of freedom
## AIC: 3408
## 
## Number of Fisher Scoring iterations: 7

The residuals are not that bad

par(mfrow = c(2, 2))
plot(fit14)

(Figure: diagnostic plots for fit14)

Evaluate possible collinearity

vif(fit14)  # variance inflation factors
##    Gear2  Season2  Season3  Season4 Year2009 Year2010 Year2011     chlo 
##    1.831    1.469    2.223    3.324    3.272    2.510    3.339    1.467 
##      sst 
##    2.041

Here we should also test for linearity, but let's move on as if everything were linear on the predictor scale

Let's try adding one interaction

fit15 <- glm(cpue + 1 ~ Gear + Season + Year + chlo + sst + Gear:sst, family = Gamma(link = log), 
    data = cpue)
summary(fit15)
## 
## Call:
## glm(formula = cpue + 1 ~ Gear + Season + Year + chlo + sst + 
##     Gear:sst, family = Gamma(link = log), data = cpue)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -2.285  -0.458  -0.154   0.220   1.740  
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   1.9367     0.6999    2.77   0.0059 ** 
## Gear2         1.5711     0.7372    2.13   0.0336 *  
## Season2      -0.3217     0.1368   -2.35   0.0191 *  
## Season3       0.5284     0.1243    4.25  2.6e-05 ***
## Season4       0.2220     0.1025    2.17   0.0307 *  
## Year2009      0.3096     0.1221    2.54   0.0115 *  
## Year2010      1.0292     0.1384    7.44  5.1e-13 ***
## Year2011      0.7545     0.1250    6.04  3.2e-09 ***
## chlo          0.5117     0.1692    3.02   0.0026 ** 
## sst          -0.0203     0.0278   -0.73   0.4670    
## Gear2:sst    -0.0298     0.0283   -1.05   0.2935    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## (Dispersion parameter for Gamma family taken to be 0.3692)
## 
##     Null deviance: 362.58  on 472  degrees of freedom
## Residual deviance: 160.09  on 462  degrees of freedom
## AIC: 3409
## 
## Number of Fisher Scoring iterations: 8

How do we know whether the more complex model is better? (In this case it is not…)

anova(fit14, fit15, test = "LRT")
## Analysis of Deviance Table
## 
## Model 1: cpue + 1 ~ Gear + Season + Year + chlo + sst
## Model 2: cpue + 1 ~ Gear + Season + Year + chlo + sst + Gear:sst
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1       463        160                     
## 2       462        160  1    0.371     0.32
AIC(fit14, fit15)
##       df  AIC
## fit14 11 3408
## fit15 12 3409
BIC(fit14, fit15)
##       df  BIC
## fit14 11 3454
## fit15 12 3459
# we should also test for other interactions, but let's keep things simple

Some residual plots that I like to produce and inspect

res.dev.fit15 <- residuals(fit15, type = "deviance")
par(mfcol = c(2, 3))
hist(res.dev.fit15, main = "1) Histogram", col = "gray88", xlab = "Deviance residuals")
plot(fit15, which = 2, main = "2) QQ Plot")
plot(fit15, which = 1, main = "3) Residuals")
halfnorm(res.dev.fit15, main = "4) Residuals halfnormal")
plot(cooks.distance(fit15), ylab = "Cook distances", main = "5) Cook distances")
halfnorm(cooks.distance(fit15), main = "6) Cook distances halfnormal")

(Figure: the six residual diagnostic plots for fit15)

2) Now let's try a Poisson GLM: it is used for discrete (count) data, and we can add an offset variable (here log(Hookn))

fit16 <- glm(n ~ Gear + Season + Year + chlo + sst + offset(log(Hookn)), family = poisson(link = log), 
    data = cpue)
summary(fit16)
## 
## Call:
## glm(formula = n ~ Gear + Season + Year + chlo + sst + offset(log(Hookn)), 
##     family = poisson(link = log), data = cpue)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -9.858  -2.019  -0.505   1.225  13.806  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -4.34123    0.10695  -40.59  < 2e-16 ***
## Gear2        0.86826    0.03725   23.31  < 2e-16 ***
## Season2      0.04566    0.06087    0.75     0.45    
## Season3      0.65423    0.03738   17.50  < 2e-16 ***
## Season4      0.23571    0.03189    7.39  1.4e-13 ***
## Year2009     0.42694    0.04615    9.25  < 2e-16 ***
## Year2010     1.33073    0.04529   29.38  < 2e-16 ***
## Year2011     0.90496    0.04358   20.77  < 2e-16 ***
## chlo         0.33672    0.04482    7.51  5.8e-14 ***
## sst         -0.05718    0.00343  -16.66  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 9414.5  on 472  degrees of freedom
## Residual deviance: 4102.2  on 463  degrees of freedom
## AIC: 6337
## 
## Number of Fisher Scoring iterations: 5

But the residuals don't look so good

par(mfcol = c(2, 2))
plot(fit16)

(Figure: diagnostic plots for fit16)

The variance increases with the predicted values faster than a Poisson model allows: in a Poisson GLM the variance equals the mean, i.e. the dispersion parameter should be 1

sum(residuals(fit16, type = "pearson")^2/fit16$df.residual)  # dispersion parameter is about 9.7, so the data are over-dispersed
## [1] 9.67
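
An alternative way to estimate the over-dispersion is to refit the model as quasi-Poisson, whose summary reports the dispersion parameter directly (a sketch, using the illustrative name fit16q):

fit16q <- glm(n ~ Gear + Season + Year + chlo + sst + offset(log(Hookn)),
    family = quasipoisson(link = log), data = cpue)
summary(fit16q)$dispersion  # estimated dispersion parameter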

3) We can use a negative binomial GLM for over-dispersed count data

fit17 <- glm.nb(n ~ Gear + Season + Year + chlo + sst + offset(log(Hookn)), 
    link = log, data = cpue)
summary(fit17)
## 
## Call:
## glm.nb(formula = n ~ Gear + Season + Year + chlo + sst + offset(log(Hookn)), 
##     data = cpue, link = log, init.theta = 3.221256186)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -4.238  -0.786  -0.246   0.421   3.023  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -4.4956     0.3346  -13.43  < 2e-16 ***
## Gear2         0.8951     0.0855   10.47  < 2e-16 ***
## Season2      -0.3318     0.1479   -2.24  0.02488 *  
## Season3       0.5340     0.1201    4.45  8.7e-06 ***
## Season4       0.2166     0.0999    2.17  0.03016 *  
## Year2009      0.4028     0.1091    3.69  0.00022 ***
## Year2010      1.1887     0.1208    9.84  < 2e-16 ***
## Year2011      0.8867     0.1072    8.27  < 2e-16 ***
## chlo          0.5329     0.1627    3.27  0.00106 ** 
## sst          -0.0482     0.0115   -4.17  3.0e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## (Dispersion parameter for Negative Binomial(3.221) family taken to be 1)
## 
##     Null deviance: 1157.91  on 472  degrees of freedom
## Residual deviance:  518.98  on 463  degrees of freedom
## AIC: 3694
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  3.221 
##           Std. Err.:  0.248 
## 
##  2 x log-likelihood:  -3671.951

And the residuals look much better

par(mfcol = c(2, 2))
plot(fit17)

(Figure: diagnostic plots for fit17)

Let's compare the models so far

AIC(fit15, fit16, fit17)
##       df  AIC
## fit15 12 3409
## fit16 10 6337
## fit17 11 3694

4) Now let's add the VESSEL Effect

4.1) We could include it as a fixed effect in a GLM

fit18 <- glm(cpue + 1 ~ Gear + Season + Year + chlo + sst + Vessel, family = Gamma(link = log), 
    data = cpue)
summary(fit18)
## 
## Call:
## glm(formula = cpue + 1 ~ Gear + Season + Year + chlo + sst + 
##     Vessel, family = Gamma(link = log), data = cpue)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -2.271  -0.450  -0.131   0.198   1.764  
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   2.8948     0.3983    7.27  1.6e-12 ***
## Gear2         0.8136     0.0830    9.80  < 2e-16 ***
## Season2      -0.3732     0.1365   -2.73  0.00652 ** 
## Season3       0.4440     0.1298    3.42  0.00068 ***
## Season4       0.1768     0.1041    1.70  0.09023 .  
## Year2009      0.3663     0.1068    3.43  0.00066 ***
## Year2010      1.0778     0.1209    8.91  < 2e-16 ***
## Year2011      0.7300     0.1158    6.31  6.7e-10 ***
## chlo          0.4152     0.1747    2.38  0.01785 *  
## sst          -0.0587     0.0144   -4.08  5.4e-05 ***
## Vessel2       0.1800     0.1054    1.71  0.08840 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## (Dispersion parameter for Gamma family taken to be 0.3675)
## 
##     Null deviance: 362.58  on 472  degrees of freedom
## Residual deviance: 159.31  on 462  degrees of freedom
## AIC: 3406
## 
## Number of Fisher Scoring iterations: 11

The vessel effect could be important. In this example there are only 2 vessels, but what if we had sampled many more? We would need n_Vessels - 1 degrees of freedom to estimate the extra parameters, and the model could become very hard to interpret

4.2) Use a GLMM: we may know that Vessel is important, but we want to account for the vessel effect through a random intercept rather than estimate individual vessel parameters

fit19 <- glmmPQL(cpue + 1 ~ Gear + Season + Year + chlo + sst, random = ~1 | 
    Vessel, family = Gamma(link = log), data = cpue)
## Loading required package: nlme
## iteration 1
summary(fit19)
## Linear mixed-effects model fit by maximum likelihood
##  Data: cpue 
##   AIC BIC logLik
##    NA  NA     NA
## 
## Random effects:
##  Formula: ~1 | Vessel
##         (Intercept) Residual
## StdDev:   2.144e-05   0.6003
## 
## Variance function:
##  Structure: fixed weights
##  Formula: ~invwt 
## Fixed effects: cpue + 1 ~ Gear + Season + Year + chlo + sst 
##               Value Std.Error  DF t-value p-value
## (Intercept)  2.5652    0.3414 462   7.513  0.0000
## Gear2        0.8002    0.0829 462   9.654  0.0000
## Season2     -0.3292    0.1366 462  -2.410  0.0164
## Season3      0.5076    0.1224 462   4.147  0.0000
## Season4      0.2134    0.1018 462   2.097  0.0365
## Year2009     0.3643    0.1068 462   3.409  0.0007
## Year2010     1.0972    0.1210 462   9.067  0.0000
## Year2011     0.8195    0.1062 462   7.713  0.0000
## chlo         0.5186    0.1689 462   3.070  0.0023
## sst         -0.0460    0.0119 462  -3.871  0.0001
##  Correlation: 
##          (Intr) Gear2  Seasn2 Seasn3 Seasn4 Yr2009 Yr2010 Yr2011 chlo  
## Gear2    -0.131                                                        
## Season2  -0.051 -0.130                                                 
## Season3  -0.121 -0.414  0.404                                          
## Season4  -0.666 -0.391  0.285  0.474                                   
## Year2009 -0.208 -0.377 -0.134  0.041  0.343                            
## Year2010 -0.214 -0.432  0.162  0.292  0.442  0.653                     
## Year2011 -0.243 -0.477  0.131  0.223  0.452  0.748  0.726              
## chlo     -0.169 -0.023 -0.154 -0.351  0.125  0.000  0.057  0.085       
## sst      -0.952  0.194 -0.019  0.044  0.519 -0.004 -0.009  0.007  0.122
## 
## Standardized Within-Group Residuals:
##     Min      Q1     Med      Q3     Max 
## -1.6190 -0.6317 -0.2337  0.4110  4.7863 
## 
## Number of Observations: 473
## Number of Groups: 2
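
The estimated vessel-level random intercepts can be extracted with ranef() from nlme (here they should be essentially zero, consistent with the near-zero random-effect standard deviation above):

ranef(fit19)  # random intercepts for the 2 vessels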

And the residuals still look acceptable

par(mfcol = c(2, 2))
plot(fit19)

(Figure: residual plots for fit19)

PQL is a fast way to fit mixed models, but it can be inaccurate and does not provide a true likelihood. A more accurate approach uses the Laplace approximation, which here requires extra software (ADMB, via the glmmADMB package). For very complex GLMMs we may need MCMC and Bayesian statistics…

library(glmmADMB)
## Loading required package: R2admb
## Attaching package: 'glmmADMB'
## The following object(s) are masked from 'package:MASS':
## 
## stepAIC
## The following object(s) are masked from 'package:stats':
## 
## step
fit21 <- glmmadmb(cpue + 1 ~ Gear + Season + Year + chlo + sst, random = ~1 | 
    Vessel, family = "gamma", link = "log", data = cpue)
summary(fit21)
## 
## Call:
## glmmadmb(formula = cpue + 1 ~ Gear + Season + Year + chlo + sst, 
##     data = cpue, family = "gamma", link = "log", random = ~1 | 
##         Vessel)
## 
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   2.5652     0.3150    8.14  3.9e-16 ***
## Gear2         0.8002     0.0804    9.96  < 2e-16 ***
## Season2      -0.3292     0.1367   -2.41  0.01603 *  
## Season3       0.5076     0.1124    4.51  6.3e-06 ***
## Season4       0.2134     0.0926    2.30  0.02125 *  
## Year2009      0.3643     0.1014    3.59  0.00033 ***
## Year2010      1.0972     0.1111    9.88  < 2e-16 ***
## Year2011      0.8195     0.1021    8.03  1.0e-15 ***
## chlo          0.5185     0.1621    3.20  0.00138 ** 
## sst          -0.0460     0.0109   -4.20  2.6e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Number of observations: total=473, Vessel=2 
## Random effect variance(s):
## Group=Vessel
##              Variance    StdDev
## (Intercept) 4.983e-09 7.059e-05
## Gamma shape parameter: 3.11 (std. err.: 0.1923)
## 
## Log-likelihood: -1693

Compare the GLM with the GLMM

AIC(fit18, fit21)
##       df  AIC
## fit18 12 3406
## fit21 12 3409
BIC(fit18, fit21)
##       df  BIC
## fit18 12 3456
## fit21 12 3459