Load libraries.

library(ggplot2)
library(dplyr)
library(tidyverse)
library(ggfortify)
library(readxl)      # read .xls files

# Packages for raincloud plots
library(ggdist)      # for stat_halfeye()
library(ggbeeswarm)  # for nicer jitter
library(grid)        # for textGrob()
library(patchwork)   # to combine ggplots

# Packages for modeling
library(lme4)
library(car)         # for vif function
library(lmerTest)
library(MuMIn)       # for IT approach
library(ggeffects)   # for ggpredict()
library(arm)         # for binary model diagnostics
library(DHARMa)      # for GLM model diagnostics
library(vegan)       # for multivariate analysis

Question 1: Model selection (IT approach)

Load roadkill dataset and check the data.

# Read the "road.csv" sheet
roadkill <- read.csv("/Users/joylynngoh/Files/Uni/Y4/ES3307/Assignment 4/Assignment 4 data/subsistence.csv")


# Quick check of data structure
head(roadkill)

##      name kill     habitat water tree shrub human fruit cable transectID
## 1 control    0 subsistence    40   20    20    60    10    NA          0
## 2 control    0 subsistence    30   10    15    30    20    NA          0
## 3 control    0 subsistence    30   10     5   140    20    NA          0
## 4 control    0 subsistence    25   15    10   190    10    NA          0
## 5 control    0 subsistence    20   20    15   180    10    NA          0
## 6 control    0 subsistence    30   20    15   120    10    NA          0

str(roadkill)

## 'data.frame':    87 obs. of  10 variables:
##  $ name      : chr  "control" "control" "control" "control" ...
##  $ kill      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ habitat   : chr  "subsistence" "subsistence" "subsistence" "subsistence" ...
##  $ water     : int  40 30 30 25 20 30 10 20 5 10 ...
##  $ tree      : int  20 10 10 15 20 20 25 10 5 10 ...
##  $ shrub     : int  20 15 5 10 15 15 20 25 30 15 ...
##  $ human     : int  60 30 140 190 180 120 120 20 310 50 ...
##  $ fruit     : int  10 20 20 10 10 10 10 5 20 30 ...
##  $ cable     : logi  NA NA NA NA NA NA ...
##  $ transectID: int  0 0 0 0 0 0 0 0 0 0 ...

# code to check for any NA data.
sum(rowSums(is.na(roadkill)) == ncol(roadkill))   # 0

## [1] 0

# Factor name column
roadkill <- roadkill %>% 
  mutate(name = factor(name))

1A. Which model(s) have you included in the final candidate set?

# Fit full model with all interactions
mod1.1 <- glm(kill ~ water + shrub + human + fruit + tree,
            family = binomial, data = roadkill)

# Check VIF values
vif(mod1.1)

##    water    shrub    human    fruit     tree 
## 1.053694 1.044163 1.179552 1.165295 1.039607

All the predictors have a relatively low VIF near 1. Even at a low VIF threshold of 2, all 5 variables are still reasonable to keep in the model.

Next, determine candidate models using dredge.

mod1.1 <- glm(kill ~ water + shrub + human + fruit + tree,
            family = binomial, data = roadkill, na.action = "na.fail")

candidates <- dredge(mod1.1)
print(candidates)

## Global model call: glm(formula = kill ~ water + shrub + human + fruit + tree, family = binomial, 
##     data = roadkill, na.action = "na.fail")
## ---
## Model selection table 
##    (Intrc)   fruit    human    shrub    tree    water df  logLik  AICc delta
## 16  3.9640 -0.2340 0.007149 -0.09202 -0.1346           5 -40.864  92.5  0.00
## 32  4.2160 -0.2255 0.007210 -0.09276 -0.1361 -0.02289  6 -40.679  94.4  1.94
## 12  2.9620 -0.2309 0.006524          -0.1379           4 -43.052  94.6  2.12
## 28  3.1790 -0.2237 0.006506          -0.1379 -0.02035  5 -42.879  96.5  4.03
## 8   2.6430 -0.2444 0.006595 -0.09817                   4 -45.053  98.6  6.13
## 14  4.0530 -0.1931          -0.07290 -0.1099           4 -45.656  99.8  7.33
## 24  2.7710 -0.2371 0.006649 -0.09732         -0.01528  5 -44.957 100.7  8.19
## 10  3.1990 -0.1914                   -0.1123           3 -47.190 100.7  8.20
## 4   1.4880 -0.2299 0.005798                            3 -47.624 101.5  9.07
## 30  4.2360 -0.1838          -0.07157 -0.1114 -0.01930  5 -45.504 101.7  9.28
## 26  3.4190 -0.1821                   -0.1132 -0.02113  4 -46.981 102.5  9.98
## 20  1.6710 -0.2229 0.005847                  -0.01834  4 -47.459 103.4 10.94
## 6   2.9090 -0.2040          -0.07782                   3 -49.071 104.4 11.96
## 2   1.9410 -0.1965                                     2 -50.851 105.8 13.38
## 22  2.9980 -0.1970          -0.07652         -0.01206  4 -49.002 106.5 14.02
## 31  2.7880         0.005995 -0.09172 -0.1324 -0.05648  5 -48.006 106.8 14.28
## 18  2.0950 -0.1881                           -0.01660  3 -50.706 107.7 15.23
## 15  1.9900         0.005832 -0.08785 -0.1351           4 -49.907 108.3 15.83
## 27  1.7490         0.005282          -0.1293 -0.05454  4 -50.502 109.5 17.02
## 11  1.0190         0.005143          -0.1328           3 -52.421 111.1 18.66
## 29  3.0870                  -0.07137 -0.1148 -0.05406  4 -51.790 112.1 19.60
## 25  2.1870                           -0.1116 -0.05259  3 -53.565 113.4 20.95
## 13  2.3100                  -0.06960 -0.1160           3 -53.763 113.8 21.35
## 23  1.2870         0.005258 -0.08570         -0.05509  4 -53.108 114.7 22.24
## 9   1.4660                           -0.1145           2 -55.541 115.2 22.76
## 7   0.4986         0.004955 -0.08192                   3 -55.332 117.0 24.49
## 19  0.3807         0.004417                  -0.05371  3 -55.506 117.3 24.83
## 21  1.6860                  -0.06750         -0.05185  3 -56.103 118.5 26.03
## 3  -0.3582         0.004182                            2 -57.765 119.7 27.20
## 17  0.8962                                   -0.05168  2 -57.775 119.7 27.23
## 5   0.9397                  -0.06663                   2 -58.280 120.7 28.23
## 1   0.1613                                             1 -60.022 122.1 29.62
##    weight
## 16  0.500
## 32  0.189
## 12  0.173
## 28  0.067
## 8   0.023
## 14  0.013
## 24  0.008
## 10  0.008
## 4   0.005
## 30  0.005
## 26  0.003
## 20  0.002
## 6   0.001
## 2   0.001
## 22  0.000
## 31  0.000
## 18  0.000
## 15  0.000
## 27  0.000
## 11  0.000
## 29  0.000
## 25  0.000
## 13  0.000
## 23  0.000
## 9   0.000
## 7   0.000
## 19  0.000
## 21  0.000
## 3   0.000
## 17  0.000
## 5   0.000
## 1   0.000
## Models ranked by AICc(x)

plot(candidates, main = "Fig. 1A: Cumulative weight of all possible candidate models")

I decided to set my ∆AICc threshold at 2, which is accepted in literature as models with substantial support (Burnham & Anderson (2002, 2004, 2010), Model Selection and Multimodel Inference: A Practical Information-Theoretic Approach). This narrows down the candidate selection down to model 16 and 32. Together, they acocunt for \(0.500 + 0.189 = 0.689\), or 68.9% of cumulative weight (also seen in Fig. 1A). In other words, models 16 and 32 together hold 68.9% of total model support.

1B. Model predicted lines and confidence intervals with the best model. Plot with your original data.

The best model is 16. Note that weights are not needed because each observation of roadkill contributes equally to the likelihood of roadkill occurring.

mod1.2 <- glm(kill ~ fruit + human + shrub + tree,
            family = binomial, data = roadkill)

# predict and plot for FRUIT
predFruit <- ggpredict(mod1.2, terms = "fruit")
predHuman <- ggpredict(mod1.2, terms = "human")
predShrub <- ggpredict(mod1.2, terms = "shrub")
predTree  <- ggpredict(mod1.2, terms = "tree")


fruit <- ggplot() +
  geom_point(aes(x = fruit, y = kill),
             data = roadkill, alpha = 0.3, size = 2) +
  geom_ribbon(aes(x = x, ymin = conf.low, ymax = conf.high),
              data = predFruit, alpha = 0.3, fill = "steelblue") +
  geom_line(aes(x = x, y = predicted),
            data = predFruit, color = "steelblue", size = 1) +
  labs(
    x = "Fruit distance",
    y = "Pr(roadkill)",
    title = "Fig. 1B: Predicted means and CIs of predictors on probability of roadkill"
  ) +
  theme_minimal()

human <- ggplot() +
  geom_point(aes(x = human, y = kill),
             data = roadkill, alpha = 0.3) +
  geom_ribbon(aes(x = x, ymin = conf.low, ymax = conf.high),
              data = predHuman, fill = "darkorange", alpha = 0.3) +
  geom_line(aes(x = x, y = predicted),
            data = predHuman, color = "darkorange", size = 1) +
  labs(x = "Human distance", y = "Pr(roadkill)") +
  theme_minimal()

shrub <- ggplot() +
  geom_point(aes(x = shrub, y = kill),
             data = roadkill, alpha = 0.3) +
  geom_ribbon(aes(x = x, ymin = conf.low, ymax = conf.high),
              data = predShrub, fill = "seagreen", alpha = 0.3) +
  geom_line(aes(x = x, y = predicted),
            data = predShrub, color = "seagreen", size = 1) +
  labs(x = "Shrub distance", y = "Pr(roadkill)") +
  theme_minimal()

tree <- ggplot() +
  geom_point(aes(x = tree, y = kill),
             data = roadkill, alpha = 0.3) +
  geom_ribbon(aes(x = x, ymin = conf.low, ymax = conf.high),
              data = predTree, fill = "firebrick", alpha = 0.3) +
  geom_line(aes(x = x, y = predicted),
            data = predTree, color = "firebrick", size = 1) +
  labs(x = "Tree distance", y = "Pr(roadkill)") +
  theme_minimal()

fruit + human + shrub + tree + plot_layout(ncol = 2)

Fig. 1B shows me that distance to the nearest human dwelling is the only plot where predicted mean probability of roadkill increases with increasing distance. There is a negative relationship between probability of roadkill occurrences and distance to fruit/shrub/non-fruiting tree for the rest of the predictors.

1C. Using the best model from your candidate set, report and interpret your results.

First, check the summary of the model to see if there is overdispersion.

summary(mod1.2)

## 
## Call:
## glm(formula = kill ~ fruit + human + shrub + tree, family = binomial, 
##     data = roadkill)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)
## (Intercept)  3.964419   1.030820   3.846 0.000120
## fruit       -0.234031   0.069852  -3.350 0.000807
## human        0.007149   0.002613   2.736 0.006217
## shrub       -0.092020   0.046696  -1.971 0.048771
## tree        -0.134638   0.051944  -2.592 0.009542
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 120.044  on 86  degrees of freedom
## Residual deviance:  81.727  on 82  degrees of freedom
## AIC: 91.727
## 
## Number of Fisher Scoring iterations: 5

The ratio of residual deviance to degrees of freedom is \(81.727/82 = 0.997\), which is less than 1. Additionally, the p-values of the predictors look reasonable, thus there should be no overdispersion.

Anova(mod1.2, type = 2)

## Analysis of Deviance Table (Type II tests)
## 
## Response: kill
##       LR Chisq Df Pr(>Chisq)
## fruit  18.0877  1   2.11e-05
## human   9.5854  1   0.001961
## shrub   4.3765  1   0.036437
## tree    8.3797  1   0.003794

df.residual(mod1.2)

## [1] 82

# plot model diagnostics
binnedplot(x = fitted(mod1.2), y = residuals(mod1.2, type = "response"),
           main = "Fig. 1C: Binned residual plot")

From Fig. 1C, the residuals lie inside the ±0.2 SE bands, indicating that the model is a good fit. All predictors exhibit moderate to strong evidence of having an effect on kill (I placed the results in bullet-point form since there’s 4 of them, for clarity). There is very strong evidence that distance to nearest fruit tree, fruit, has a negative effect of -0.234 on the occurrences of roadkill (\(ꭓ2_{1, 82} = 18.09, P < 0.001\)). There is strong evidence that distance to nearest human dwelling, human, has a positive effect of 0.007 on occurrences of roadkill (\(ꭓ2_{1, 82} = 9.59, P = 0.002\)). There is moderate evidence that distance to the nearest shrub, shrub, has a negative effect of -0.092 on occurrences of roadkill (\(ꭓ2_{1, 82} = 4.38, P = 0.036\)). Finally, there is strong evidence that the distance to the nearest non-fruiting tree, tree, has a negative effect of -0.135 on occurrences of roadkill (\(ꭓ2_{1, 82} = 8.38, P = 0.004\)).

Question 2: Interactions and non-linear data

Load trees dataset and check the data.

# Read the "road.csv" sheet
trees <- read_excel("/Users/joylynngoh/Files/Uni/Y4/ES3307/Assignment 4/Assignment 4 data/Assignment 1 data.xls", sheet = "Dioecious trees")


# Quick check of data structure
head(trees)

## # A tibble: 6 × 3
##     SEX   DBH FLOWERS
##   <dbl> <dbl>   <dbl>
## 1     2   174     234
## 2     1   171     110
## 3     2   212     512
## 4     1   187     133
## 5     2   227    1107
## 6     1   274     624

str(trees)

## tibble [50 × 3] (S3: tbl_df/tbl/data.frame)
##  $ SEX    : num [1:50] 2 1 2 1 2 1 2 2 1 1 ...
##  $ DBH    : num [1:50] 174 171 212 187 227 274 86 161 285 182 ...
##  $ FLOWERS: num [1:50] 234 110 512 133 1107 ...

# code to check for any NA data.
sum(rowSums(is.na(trees)) == ncol(trees))   # 0

## [1] 0

# Factor name column
trees <- trees %>% 
  mutate(SEX = factor(SEX))

str(trees)

## tibble [50 × 3] (S3: tbl_df/tbl/data.frame)
##  $ SEX    : Factor w/ 2 levels "1","2": 2 1 2 1 2 1 2 2 1 1 ...
##  $ DBH    : num [1:50] 174 171 212 187 227 274 86 161 285 182 ...
##  $ FLOWERS: num [1:50] 234 110 512 133 1107 ...

2A. Plot the raw data

ggplot(trees, aes(x = DBH, y = FLOWERS, group = SEX)) +
  # Points: light colors
  geom_point(aes(color = SEX),
             size = 2.2, alpha = 0.7) +
  # Lines: darker colors (same SEX grouping)
  geom_smooth(aes(color = SEX),
              method = "lm", se = T, linewidth = 1.1) +
  # Manually define matching color pairs
  scale_color_manual(
    name = "Sex",
    values = c("1" = "#08306B", "2" = "#99000D"),   # deep blue & dark red
    labels = c("Male", "Female")
  ) +
  labs(
    x = "DBH (mm)",
    y = "Number of flowers",
    title = "Fig. 2A: Relationship between DBH and number of flowers in dioecious trees"
  ) +
  theme_minimal(base_size = 12)

From Fig. 2A, the model doesn’t seem to be a good fit for the data: there is a cluster of ‘0’ flowers at low DBH, while the spread of number of flowers increases sharply with increasing DBH. The data points also seem to curve upwards with increasing DBH, especially for female trees, which the linear model predicted lines do not capture. Female trees seem to have a larger slope than male trees. Finally, the fitted lines seem to cross at a certain point, indicating that DBH and SEX may interact with one another.

2B. Fit linear model

Bsaed on the Fig. 2A, I will first try to fit an interaction between DBH and SEX, but will also test it against a non-interaction model to confirm if the interaction is truly significant.

mod2.1 <- lm(FLOWERS~DBH+SEX, data = trees)
mod2.2 <- lm(FLOWERS~DBH*SEX, data = trees)
anova(mod2.1, mod2.2)

## Analysis of Variance Table
## 
## Model 1: FLOWERS ~ DBH + SEX
## Model 2: FLOWERS ~ DBH * SEX
##   Res.Df     RSS Df Sum of Sq      F   Pr(>F)
## 1     47 1148328                             
## 2     46  973657  1    174671 8.2523 0.006138

There is strong evidence that the interaction between DBH and SEX is statistically significant (\(F_{1,46} = 8.25, P = 0.006\)), therefore I will keep the interaction in the model.

anova(mod2.2)

## Analysis of Variance Table
## 
## Response: FLOWERS
##           Df  Sum Sq Mean Sq  F value    Pr(>F)
## DBH        1 5060723 5060723 239.0916 < 2.2e-16
## SEX        1  980045  980045  46.3018 1.791e-08
## DBH:SEX    1  174671  174671   8.2523  0.006138
## Residuals 46  973657   21166

Again, the model results show that there is strong evidence that the interaction between DBH and SEX influences the number of flowers (\(F_{1,46} = 8.25, P = 0.006\)).

Next, run model diagnostics to check model fit.

autoplot(lm(trees$FLOWERS ~ trees$DBH*trees$SEX),
         which = c(1:3, 5), ncol = 2, label.size = 3, label.repel = TRUE) + 
  theme_minimal()

The linear model doesn’t appear to fit the data well. Curvature and widening spread of residuals rightward in the top left plot indicates nonlinearity. Residuals are also not quite normal, as the QQ plot shows that the right tail deviates upwards. Curvature and a similar increasing spread of residuals is evidence of heteroscedasticity. Finally, a few points (50, 28, 27) seem to have high-leverage on the model results. Overall, the model is not a good fit.

2C. Transform the data

Since FLOWERS is count data, I will log FLOWERS to reduce skewness and variance with the mean.

# apply log transformation to FLOWERS
trees$log_flowers <- log10(trees$FLOWERS+1)

# retest for significance of interaction
mod2.3 <- lm(log_flowers~DBH*SEX, data = trees)
mod2.4 <- lm(log_flowers~DBH+SEX, data = trees)
anova(mod2.3, mod2.4)

## Analysis of Variance Table
## 
## Model 1: log_flowers ~ DBH * SEX
## Model 2: log_flowers ~ DBH + SEX
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1     46 4.7764                           
## 2     47 4.8588 -1 -0.082395 0.7935 0.3777

With logged data, there is no evidence that including interaction between DBH and SEX improves model fit (\(P = 0.378\)). Thus, I will not fit any interaction in the model.

## PLOT MODEL PREDICTED LINES AND CONFIDENCE INTERVALS
# create prediction grid
pred <- expand.grid(
  DBH = seq(min(trees$DBH, na.rm = TRUE),
            max(trees$DBH, na.rm = TRUE),
            length.out = 200),
  SEX = levels(trees$SEX)
)

# predict on logged data
pr <- predict(mod2.4, newdata = pred, se.fit = TRUE)

# obtain t-multiplier for 95% CI
crit <- qt(0.975, df = mod2.4$df.residual)

# subset fit, SE, upper and lower CI limits from predicted results
pred$fit_log <- pr$fit
pred$se_log  <- pr$se.fit
pred$lwr_log <- pred$fit_log - crit * pred$se_log
pred$upr_log <- pred$fit_log + crit * pred$se_log


# back-transform means and CI to original count scale: 10^log_y - 1
bt <- function(z) 10^z - 1
pred$fit <- bt(pred$fit_log)
pred$lwr <- pmax(0, bt(pred$lwr_log))
pred$upr <- pmax(0, bt(pred$upr_log))

# plot raw data + model lines + CI ribbon on original scale
ggplot() +
  geom_point(aes(x = DBH, y = FLOWERS, group = SEX, colour = SEX),
    data = trees, alpha = 0.6) +
  geom_ribbon(aes(x = DBH, ymin = lwr, ymax = upr, fill = SEX, group = SEX),
    data = pred, alpha = 0.3, inherit.aes = FALSE) +
  geom_line(aes(x = DBH, y = fit, colour = SEX, group = SEX),
    data = pred, linewidth = 1.1, inherit.aes = FALSE) +
  scale_colour_manual(values = c("1" = "#08306B", "2" = "#99000D"),
                      labels = c("1" = "Male", "2" = "Female")) +
  scale_fill_manual(values = c("1" = "#08306B", "2" = "#99000D"),
                    labels = c("1" = "Male", "2" = "Female")) +
  labs(x = "DBH (mm)", y = "Number of flowers", colour = "Sex",
    fill = "Sex", title = "Fig. 2B: Model predicted means and 95% CI for log(flowers) against DBH by sex") +
  theme_minimal(base_size = 12)

# run diagnostics
autoplot(lm(trees$log_flowers ~ trees$DBH + trees$SEX),
         which = c(1:3, 5), ncol = 2, label.size = 3, label.repel = TRUE) + 
  theme_minimal()

# model results
Anova(mod2.4, type = 2)

## Anova Table (Type II tests)
## 
## Response: log_flowers
##            Sum Sq Df  F value Pr(>F)
## DBH       23.8102  1 230.3192 <2e-16
## SEX        0.4455  1   4.3094 0.0434
## Residuals  4.8588 47

Type II ANOVA shows that there is very strong evidence that DBH has an effect on log_flowers (\(F_{1, 47} = 230.32, P < 0.001\)). However, there is only moderate evidence that SEX has an effect on log_flowers (\(F_{1, 47} = 4.31, P = 0.043\)). The model diagnostics also show minimal improvement from the model with untransformed data. Curvature in the residuals vs fitted and scale-location plots have worsened, while the QQplot still shows the residuals do not follow the normal distribution well.

Fig. 2B shows a slightly better fit to the original data than Fig. 2A. Both predicted lines seem to run parallel to each other (ie. SEX and DBH do not interact). The right-skew is seems to be better accounted for by the model-predicted means, but the confidence intervals still don’t seem to account for increasing variance. Finally, the confidence intervals of the male and female model predicted means overlap even at higher DBH, so we cannot confidently say that there is a statistically significant difference in the means.

2D. Fit a GLM (Poisson)

# since now using a GLM, test again for statistical significance of interaction
mod2.5 <- glm(FLOWERS~DBH*SEX, family = poisson, data = trees)
mod2.6 <- glm(FLOWERS~DBH+SEX, family = poisson, data = trees)
anova(mod2.5, mod2.6, test = "Chisq")

## Analysis of Deviance Table
## 
## Model 1: FLOWERS ~ DBH * SEX
## Model 2: FLOWERS ~ DBH + SEX
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)
## 1        46     2458.7                      
## 2        47     2486.8 -1  -28.074 1.168e-07

There is extremely strong evidence that including an interaction between DBH and SEX improves model fit (\(ꭓ2{1,47} = 28.07, P < 0.001\)). Thus, I will keep the interaction term for this part.

Next, run diagnostics.

# run diagnostics
resD <- simulateResiduals(mod2.5, plot = T)

From the QQplot, there is strong deviation from the diagonal. The non-normality is also further confirmed by all the tests returning significant results for dispersion, outliers, and uniformity (KS) tests. The residual vs predicted plot shows that residuals are not evenly distributed along predicted values—there is either non-linearity or overdispersion. The quantile curves show significant deviation from their predicted uniform lines.

Next, plot model-predicted lines. I try out the manual method for backtransforming here.

## PLOT MODEL PREDICTED LINES AND CONFIDENCE INTERVALS
# trying manual method of finding predicted means and backtransforming
# create prediction grid
pred2 <- expand.grid(
  DBH = seq(min(trees$DBH, na.rm = TRUE),
            max(trees$DBH, na.rm = TRUE),
            length.out = 200),
  SEX = levels(trees$SEX)
)

# predict on logged data
pr2 <- predict(mod2.5, newdata = pred2, se.fit = TRUE)

# Back-transform to the count scale
pred2$fit <- exp(pr2$fit)
pred2$lwr <- exp(pr2$fit - 1.96 * pr2$se.fit)
pred2$upr <- exp(pr2$fit + 1.96 * pr2$se.fit)

# plot raw data + model lines + CI ribbon on original scale
ggplot() +
  geom_point(aes(x = DBH, y = FLOWERS, group = SEX, colour = SEX),
    data = trees, alpha = 0.6) +
  geom_ribbon(aes(x = DBH, ymin = lwr, ymax = upr, fill = SEX, group = SEX),
    data = pred2, alpha = 0.3, inherit.aes = FALSE) +
  geom_line(aes(x = DBH, y = fit, colour = SEX, group = SEX),
    data = pred2, linewidth = 1.1, inherit.aes = FALSE) +
  scale_colour_manual(values = c("1" = "#08306B", "2" = "#99000D"),
                      labels = c("1" = "Male", "2" = "Female")) +
  scale_fill_manual(values = c("1" = "#08306B", "2" = "#99000D"),
                    labels = c("1" = "Male", "2" = "Female")) +
  labs(x = "DBH (mm)", y = "Number of flowers", colour = "Sex",
    fill = "Sex", title = "Fig. 2C: Model predicted means and 95% CI for GLM (Poisson) counts") +
  theme_minimal(base_size = 12)

# model results
print(anova(mod2.5, test = "Chisq"))

## Analysis of Deviance Table
## 
## Model: poisson, link: log
## 
## Response: FLOWERS
## 
## Terms added sequentially (first to last)
## 
## 
##         Df Deviance Resid. Df Resid. Dev  Pr(>Chi)
## NULL                       49    17921.6          
## DBH      1  14343.9        48     3577.6 < 2.2e-16
## SEX      1   1090.9        47     2486.7 < 2.2e-16
## DBH:SEX  1     28.1        46     2458.7 1.168e-07

From the model results, there is very strong evidence that the interaction between DBH and SEX has an effect on the number of FLOWERS when modeled with a Poisson distribution (\(ꭓ2_{1,46} = 28.1, P < 0.001\)).

Fig. 2C shows an even better fit (in terms of the mean) to the original data than Fig. 2B, however the confidence intervals are so narrow that they are nearly invisible. This shows that the Poisson distribution does not account well for the large dispersion in data.

Next, use summary() to check for overdispersion and calculate proportion of null deviance explained.

summary(mod2.5)

## 
## Call:
## glm(formula = FLOWERS ~ DBH * SEX, family = poisson, data = trees)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)
## (Intercept)  2.2567217  0.0798972  28.245  < 2e-16
## DBH          0.0150589  0.0003198  47.087  < 2e-16
## SEX2         0.9453136  0.0869851  10.868  < 2e-16
## DBH:SEX2    -0.0018137  0.0003460  -5.242 1.59e-07
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 17921.6  on 49  degrees of freedom
## Residual deviance:  2458.7  on 46  degrees of freedom
## AIC: 2811.6
## 
## Number of Fisher Scoring iterations: 5

paste0("Proportion of null deviance explained = ", (17921.6-2458.7)/17921.6)

## [1] "Proportion of null deviance explained = 0.862808008213552"

paste0("Ratio of residual deviance to residual degrees of freedom = ", 2458.7/46)

## [1] "Ratio of residual deviance to residual degrees of freedom = 53.45"

The model explains roughly 86.3% of null deviance. However, the dispersion parameter assumes a 1:1 variance-mean relationship, whereas the true ratio of residual deviance to degrees of freedom, 53.45, is much higher than assumed dispersion. This might explain why all the independent variables now appear to have statistically significant effects on FLOWERS. This overdispersion may explain why the confidence intervals in Fig. 2C are so narrow, and do not accurately capture the wide spread of points at high DBH values. The model has severely underestimated the uncertainty in the data.

We can confirm the overdispersion with DHARMa.

testDispersion(resD)

## 
##  DHARMa nonparametric dispersion test via sd of residuals fitted vs.
##  simulated
## 
## data:  simulationOutput
## dispersion = 69.578, p-value < 2.2e-16
## alternative hypothesis: two.sided

The red line is the observed dispersion ratio, while the black distribution at the far left is the simulated expected distribution of dispersion ratios of the model—since the red line is quite literally off the scale, it’s clear that extreme overdispersion in the data is not accounted for.

2E. Fit a GLM (Quasipoisson)

mod2.7 <- glm(FLOWERS~DBH*SEX, family = quasipoisson, data = trees)
mod2.8 <- glm(FLOWERS~DBH + SEX, family = quasipoisson, data = trees)
anova(mod2.7, mod2.8, test = "Chisq")

## Analysis of Deviance Table
## 
## Model 1: FLOWERS ~ DBH * SEX
## Model 2: FLOWERS ~ DBH + SEX
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1        46     2458.7                     
## 2        47     2486.8 -1  -28.074   0.4571

Comparing the interaction and no-interaction models with the quasipoisson distribution, I can confirm that there is no evidence that including the interaction between DBH and SEX improves model fit (\(ꭓ2_{1,47} = 28.1, P = 0.457\)). Thus, I will not include the interaction for this part.

summary(mod2.8)

## 
## Call:
## glm(formula = FLOWERS ~ DBH + SEX, family = quasipoisson, data = trees)
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.6343031  0.2301817  11.444 3.45e-15
## DBH         0.0135204  0.0008678  15.579  < 2e-16
## SEX2        0.4982782  0.1097203   4.541 3.89e-05
## 
## (Dispersion parameter for quasipoisson family taken to be 50.4053)
## 
##     Null deviance: 17921.6  on 49  degrees of freedom
## Residual deviance:  2486.7  on 47  degrees of freedom
## AIC: NA
## 
## Number of Fisher Scoring iterations: 5

paste0("Proportion of null deviance explained = ", round((17921.6-2486.7)/17921.6, 3))

## [1] "Proportion of null deviance explained = 0.861"

paste0("Ratio of residual deviance to residual degrees of freedom = ", round(2486.7/47, 2))

## [1] "Ratio of residual deviance to residual degrees of freedom = 52.91"

With a quasipoisson distribution, the true dispersion now seems to be much more accurately accounted for—the dispersion parameter in this new model is assumed to be 50.78, which is much closer to the true dispersion of 52.91 This is reflected in more realistic p-values. The proportion of null deviance explained by the model remains at 86%.

# print model results
Anova(mod2.8, type = 2)

## Analysis of Deviance Table (Type II tests)
## 
## Response: FLOWERS
##     LR Chisq Df Pr(>Chisq)
## DBH  297.096  1  < 2.2e-16
## SEX   21.642  1  3.285e-06

df.residual(mod2.8)

## [1] 47

Without the interaction term, there is very strong evidence that both DBH and SEX have separate effects on FLOWERS (for DBH, effect estimate = 0.014, \(ꭓ2_{1,47} = 297.10, P < 0.001\); for SEX, effect estimate = 0.50, \(ꭓ2_{1,47} = 21.64, P < 0.001\)).

Next, run diagnostics. DHARMa cannot be used with a quasipoisson distribution so I manually plot the following diagnostic plots.

# check residuals
binnedplot(fitted(mod2.8), residuals(mod2.8, type = "pearson"),
           xlab = "Predicted values",
           ylab = "Pearson residuals",
           main = "Binned residual plot for GLM (Quasipoisson)")

# check normality of residuals
stdres <- rstandard(mod2.8)
qqnorm(stdres,
       main = "QQ Plot of Standardized Residuals")
qqline(stdres, col = "red", lwd = 2)

The binned plot shows a mild wave-like curve in the residuals, which suggests that the mean-variance relationship might still not be modelled too well by the quasipoisson distribution. Negative residuals at low and high fitted values suggest that the model overpredicts slightly on those ends (observed counts are lower than expected), and strongly positive residuals around the 300-500 range of predicted values shows that there is underprediction (observed counts are higher than expected). However, the residuals still lie within the 95% confidence intervals for the most part, meaning that the model handles the dispersion much better.

The QQplot shows that the tails deviate slightly from the diagonal, meaning there might be some outliers and slight skew. Regardless, the normality of residuals has improved a lot from the Poisson GLM, so overall, the model fits reasonably well. To further improve the residuals, I might try a negative binomial distribution as well.

Next, plot model-predicted lines. Here, I use ggpredict() to plot for me.

## PLOT MODEL PREDICTED LINES AND CONFIDENCE INTERVALS
# use ggpredict() which automatically back-transforms 
pred3 <- ggpredict(mod2.8, terms = c("DBH", "SEX")) %>% 
  as.data.frame() %>%
  # 2) rename to match your plotting code
  rename(
    DBH = x,
    fit = predicted,
    lwr = conf.low,
    upr = conf.high,
    SEX = group
  )

pred3$SEX <- factor(pred3$SEX, levels = levels(trees$SEX))

ggplot() +
  geom_point(aes(x = DBH, y = FLOWERS, group = SEX, colour = SEX),
             data = trees, alpha = 0.6) +
  geom_ribbon(aes(x = DBH, ymin = lwr, ymax = upr, fill = SEX, group = SEX),
              data = pred3, alpha = 0.3, inherit.aes = FALSE) +
  geom_line(aes(x = DBH, y = fit, colour = SEX, group = SEX),
            data = pred3, linewidth = 1.1, inherit.aes = FALSE) +
  scale_colour_manual(values = c("1" = "#08306B", "2" = "#99000D"),
                      labels = c("1" = "Male", "2" = "Female")) +
  scale_fill_manual(values = c("1" = "#08306B", "2" = "#99000D"),
                    labels = c("1" = "Male", "2" = "Female")) +
  labs(x = "DBH (mm)",
       y = "Number of flowers",
       colour = "Sex",
       fill   = "Sex",
       title  = "Fig. 2D: Model predicted means and 95% CI for GLM (quasipoisson) counts") +
  theme_minimal(base_size = 12)

Fig. 2D fits the original data the best. It accounts for both the right skew and heteroscedasticity. The 95% CIs also fit the wide spread of high means more accurately than Fig. 2C; this visually confirms that the model is no longer underestimating the uncertainty in the data.

2F.

From the above models, I would select the quasipoisson model because it fits the original data the closest, and extreme overdispersion has now been accounted for.

What are the advantages and disadvantages associated with running a linear model or a generalised linear model? Linear models are advantageous because of their simplicity. They are much simpler to fit and interpret, and there is a wide variety of well-established diagnostics to check the validity of the model. However, the simplicity of linear models also means that they are only suitable for data with simpler mean-variance structures, and data needs to be normal, linear, homoscedastic, and independent. Most ecological data collected might not meet these assumptions.

Against the drawbacks of using LMs, GLMs shine because of their flexibility to handle violated assumptions such as large variance/heteroscedasticity, non-normal data, and outliers. Their primary advantage is their ability to handle non-continuous response variables—count, binary, and proportional data. Means are also modelled on transformed scales inherently, so that predictions are ensured to stay within valid ranges (such as 0 and 1 for binary or proportional data).

However, because GLMs are built to handle complex data structure, they are also more complex to interpret and back-transform is often needed to plot model-predicted means. Avaiabile diagnostics are also limited in application—for example, the main diagnostic package used for GLMs cannot be applied to quasipoisson distributions.

For both LMs and GLMs, choosing the wrong transformation or family leads to misinterpretations of the data. For example, if I did not properly check through the model diagnostics above, I might have concluded that DBH and sex interact in affecting the number of flowers. Thus, being thorough in checking model diagnostics, understanding model fit and the biological implications of the proposed model is crucial to ensure the LM or GLM is the appropriate one to fit the data, regardless of the advantages or disadvantages of each.

Question 3

Load dungbeetletraptype.csv dataset and check the data.

# Read the "road.csv" sheet
beetle <- read.csv("/Users/joylynngoh/Files/Uni/Y4/ES3307/Assignment 4/Assignment 4 data/dungbeetletraptype.csv")


# Quick check of data structure
head(beetle)

##              trapID year trap           habitat Catharsius.dayacus
## 1 2005_FITs-BRL.2.1 2005  FIT Old-growth Forest                  0
## 2 2005_FITs-BRL.2.2 2005  FIT Old-growth Forest                  0
## 3 2005_FITs-BRL.2.3 2005  FIT Old-growth Forest                  1
## 4 2005_FITs-BRL.2.4 2005  FIT Old-growth Forest                  0
## 5 2005_FITs-BRL.2.5 2005  FIT Old-growth Forest                  0
## 6 2005_FITs-BRL.3.1 2005  FIT Old-growth Forest                  0
##   Catharsius.renaudpauliani Copris.agnus Copris.sinicus Microcopris.doriae
## 1                         0            0              0                  0
## 2                         0            0              0                  0
## 3                         0            0              0                  0
## 4                         0            0              0                  0
## 5                         0            0              0                  0
## 6                         0            0              0                  0
##   Microcopris.hidakai Ochicanthon.worroae Oniticellus.tessellatus
## 1                   0                   0                       2
## 2                   0                   0                       3
## 3                   0                   0                       2
## 4                   0                   0                       5
## 5                   1                   0                       4
## 6                   0                   0                       6
##   Onthophagus..Indachorius..aff.aereopictus Onthophagus..Indachorius..aff.cheyi
## 1                                         3                                   0
## 2                                         6                                   0
## 3                                         2                                   0
## 4                                         2                                   0
## 5                                         2                                   0
## 6                                         0                                   0
##   Onthophagus..Indachorius..aff.danumensis Onthophagus..Indachorius..aff.woroae
## 1                                        0                                    0
## 2                                        0                                    0
## 3                                        0                                    0
## 4                                        0                                    0
## 5                                        0                                    0
## 6                                        0                                    0
##   Onthophagus.aff.pacificus Onthophagus.aff.phanaeides Onthophagus.aff.rutilans
## 1                         0                          0                        0
## 2                         0                          0                        0
## 3                         0                          0                        0
## 4                         0                          0                        0
## 5                         0                          0                        1
## 6                         0                          0                        0
##   Onthophagus.aphodiodes Onthophagus.borneensis Onthophagus.cervicapra
## 1                      0                      0                      0
## 2                      0                      0                      3
## 3                      0                      0                      2
## 4                      1                      0                      2
## 5                      0                      0                      1
## 6                      0                      0                      2
##   Onthophagus.deflexicollis Onthophagus.hirsutulus Onthophagus.incisus
## 1                         1                      1                   0
## 2                         1                      2                   0
## 3                         2                      0                   0
## 4                         0                      0                   1
## 5                         0                      0                   0
## 6                         0                      0                   0
##   Onthophagus.johkii Onthophagus.mulleri Onthophagus.obscurior
## 1                  0                   0                     0
## 2                  0                   0                     0
## 3                  0                   0                     0
## 4                  0                   0                     0
## 5                  0                   0                     0
## 6                  0                   0                     0
##   Onthophagus.pacificus Onthophagus.parviobscurior Onthophagus.pavidus
## 1                     0                          0                   0
## 2                     0                          1                   0
## 3                     3                          0                   0
## 4                     2                          0                   0
## 5                     0                          0                   0
## 6                     1                          0                   0
##   Onthophagus.rorarius Onthophagus.rudis Onthophagus.rugicollis
## 1                    0                 5                      5
## 2                    0                 7                      3
## 3                    0                 9                      3
## 4                    0                 6                      2
## 5                    0                 8                      2
## 6                    0                 4                      8
##   Onthophagus.sarawacus Onthophagus.semiaureus Onthophagus.semicupreus
## 1                     0                      2                       1
## 2                     0                      4                       3
## 3                     0                      5                       2
## 4                     0                      2                       1
## 5                     0                      3                       2
## 6                     0                      6                       2
##   Onthophagus.sp.08 Onthophagus.sp.20 Onthophagus.vethi Onthophagus.vulpes
## 1                 3                 0                 0                  0
## 2                 0                 0                 0                  2
## 3                 4                 0                 0                  2
## 4                 1                 0                 1                  1
## 5                 0                 0                 0                  3
## 6                 2                 0                 0                  2
##   Paragymnopleurus.maurus Paragymnopleurus.sparsus Paragymnopleurus.striatus
## 1                       0                        2                         0
## 2                       0                        3                         0
## 3                       0                        1                         0
## 4                       0                        2                         0
## 5                       0                        1                         1
## 6                       1                        0                         0
##   Proagoderus.watanabei Sisyphus.thoracicus
## 1                     0                   4
## 2                     0                   6
## 3                     0                   5
## 4                     0                   3
## 5                     0                   6
## 6                     0                   7

str(beetle[,1:10])

## 'data.frame':    304 obs. of  10 variables:
##  $ trapID                   : chr  "2005_FITs-BRL.2.1" "2005_FITs-BRL.2.2" "2005_FITs-BRL.2.3" "2005_FITs-BRL.2.4" ...
##  $ year                     : int  2005 2005 2005 2005 2005 2005 2005 2005 2005 2005 ...
##  $ trap                     : chr  "FIT" "FIT" "FIT" "FIT" ...
##  $ habitat                  : chr  "Old-growth Forest" "Old-growth Forest" "Old-growth Forest" "Old-growth Forest" ...
##  $ Catharsius.dayacus       : int  0 0 1 0 0 0 0 0 0 0 ...
##  $ Catharsius.renaudpauliani: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Copris.agnus             : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Copris.sinicus           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Microcopris.doriae       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Microcopris.hidakai      : int  0 0 0 0 1 0 1 0 0 0 ...

# code to check for any NA data.
sum(rowSums(is.na(beetle)) == ncol(beetle))   # 0

## [1] 0

# Factor name column
beetle <- beetle %>% 
  mutate(year    = factor(year),
         trap    = factor(trap),
         habitat = factor(habitat))

str(beetle[, 1:4])

## 'data.frame':    304 obs. of  4 variables:
##  $ trapID : chr  "2005_FITs-BRL.2.1" "2005_FITs-BRL.2.2" "2005_FITs-BRL.2.3" "2005_FITs-BRL.2.4" ...
##  $ year   : Factor w/ 2 levels "2005","2012": 1 1 1 1 1 1 1 1 1 1 ...
##  $ trap   : Factor w/ 2 levels "BPT","FIT": 2 2 2 2 2 2 2 2 2 2 ...
##  $ habitat: Factor w/ 3 levels "Logged Forest",..: 2 2 2 2 2 2 2 2 2 2 ...

I would use NMDS to visualise the differences in composition. This is because the data contains many zeros, species counts are likely non-normal, and the relationship between explanatory variables (trap type and habitat) and dependent variable (composition) are likely non-linear. Thus, ordination methods that assume linearity/unimodality, normality, and continuous data such as PCA, RDA, and CCA are not appropriate for this data. Since NMDS uses rank-based dissimilarities, this allows it to handle non-normal, non-linear count species data and categorical environmental variables.

Additionally, I use Bray-Curtis as the distance matrix since the data provided is abundance.

set.seed(123)
sp_transform <- decostand(beetle[, 5:46], method = "hellinger")

# Store results
stress_vals <- data.frame(k = 1:6, stress = NA)
nmds_list <- vector("list", length(1:6))

# Run NMDS for each dimension and save results
for (i in 1:6) {
  invisible(capture.output(
    nmds_fit <- metaMDS(sp_transform, distance = "bray",
                        k = i, trymax = 100, autotransform = FALSE)
  ))
  stress_vals$stress[i] <- nmds_fit$stress
  nmds_list[[i]] <- nmds_fit
}

# Screeplot: Stress vs. number of dimensions
plot(stress_vals$k, stress_vals$stress, type = "b", pch = 19,
     xlab = "Number of Dimensions (k)",
     ylab = "Stress",
     main = "Fig. 3A: NMDS Screeplot")
abline(h = 0.2, col = "red", lty = 2)
abline(h = 0.1, col = "blue", lty = 2)

# Stressplots for each k (Shepard plots)
par(mfrow = c(2, 3))
for (i in 1:6) {
  stressplot(nmds_list[[i]],
             main = paste("Shepard plot (k =", i, ")"))
}

par(mfrow = c(1, 1))  # reset layout

In Fig. 3A, the blue line shows the threshold where the NMDS is considered a good fit to the data (stress = 0.1), and the red line is the threshold where NMDS is considered an acceptable fit to the data (stress = 0.2). The NMDS needs to have 3 dimensions minimally for the stress to be within an acceptable range (ie. < 0.2). The series of Shepard plots also show that the linear fit is quite poor when k = 1 and k = 2, but the non-metric and linear \(R^2\) fit are acceptable when k = 3 (non-metric \(R^2\) = 0.97, linear \(R^2\) = 0.88). Thus, I will plot my ordination plot with 3 dimensions.

3A. Plot and interpret the ordination plot of the selected multivariate analysis.

nmds <- metaMDS(sp_transform, distance = "bray", k = 3, trymax = 100)

## Run 0 stress 0.1527174 
## Run 1 stress 0.1537081 
## Run 2 stress 0.1540843 
## Run 3 stress 0.15573 
## Run 4 stress 0.1569863 
## Run 5 stress 0.1524812 
## ... New best solution
## ... Procrustes: rmse 0.0139162  max resid 0.1368845 
## Run 6 stress 0.1536212 
## Run 7 stress 0.1552547 
## Run 8 stress 0.1542492 
## Run 9 stress 0.1556658 
## Run 10 stress 0.1535696 
## Run 11 stress 0.1531501 
## Run 12 stress 0.1511556 
## ... New best solution
## ... Procrustes: rmse 0.01222029  max resid 0.136235 
## Run 13 stress 0.1539856 
## Run 14 stress 0.1556293 
## Run 15 stress 0.1529907 
## Run 16 stress 0.1544035 
## Run 17 stress 0.1524752 
## Run 18 stress 0.1551964 
## Run 19 stress 0.1561474 
## Run 20 stress 0.154356 
## Run 21 stress 0.1558287 
## Run 22 stress 0.155541 
## Run 23 stress 0.1513194 
## ... Procrustes: rmse 0.008929023  max resid 0.1103343 
## Run 24 stress 0.1541663 
## Run 25 stress 0.1526696 
## Run 26 stress 0.1604286 
## Run 27 stress 0.1558357 
## Run 28 stress 0.1512504 
## ... Procrustes: rmse 0.005235563  max resid 0.07738059 
## Run 29 stress 0.153586 
## Run 30 stress 0.1515637 
## ... Procrustes: rmse 0.009393888  max resid 0.1122967 
## Run 31 stress 0.1543649 
## Run 32 stress 0.1521039 
## Run 33 stress 0.1541479 
## Run 34 stress 0.1553501 
## Run 35 stress 0.1530897 
## Run 36 stress 0.1521769 
## Run 37 stress 0.1512812 
## ... Procrustes: rmse 0.009436916  max resid 0.1064164 
## Run 38 stress 0.1535219 
## Run 39 stress 0.1549554 
## Run 40 stress 0.1540258 
## Run 41 stress 0.155363 
## Run 42 stress 0.1561154 
## Run 43 stress 0.1553417 
## Run 44 stress 0.1534732 
## Run 45 stress 0.1512948 
## ... Procrustes: rmse 0.007637316  max resid 0.09382083 
## Run 46 stress 0.153538 
## Run 47 stress 0.1554311 
## Run 48 stress 0.1525911 
## Run 49 stress 0.1510487 
## ... New best solution
## ... Procrustes: rmse 0.00313771  max resid 0.04434288 
## Run 50 stress 0.1556383 
## Run 51 stress 0.1546409 
## Run 52 stress 0.1552792 
## Run 53 stress 0.151306 
## ... Procrustes: rmse 0.008267294  max resid 0.1032006 
## Run 54 stress 0.1525788 
## Run 55 stress 0.1554732 
## Run 56 stress 0.1523341 
## Run 57 stress 0.1575604 
## Run 58 stress 0.1544746 
## Run 59 stress 0.1527646 
## Run 60 stress 0.1540346 
## Run 61 stress 0.1565822 
## Run 62 stress 0.1535672 
## Run 63 stress 0.1556993 
## Run 64 stress 0.1531263 
## Run 65 stress 0.1511625 
## ... Procrustes: rmse 0.008170931  max resid 0.09324797 
## Run 66 stress 0.15274 
## Run 67 stress 0.1560297 
## Run 68 stress 0.1535676 
## Run 69 stress 0.1539259 
## Run 70 stress 0.1553788 
## Run 71 stress 0.1534982 
## Run 72 stress 0.1555724 
## Run 73 stress 0.1555718 
## Run 74 stress 0.1540956 
## Run 75 stress 0.1586043 
## Run 76 stress 0.154804 
## Run 77 stress 0.1550488 
## Run 78 stress 0.1560716 
## Run 79 stress 0.1556992 
## Run 80 stress 0.1530368 
## Run 81 stress 0.1546032 
## Run 82 stress 0.1543198 
## Run 83 stress 0.153211 
## Run 84 stress 0.154698 
## Run 85 stress 0.1512799 
## ... Procrustes: rmse 0.008903265  max resid 0.1062317 
## Run 86 stress 0.1571561 
## Run 87 stress 0.1531263 
## Run 88 stress 0.1556657 
## Run 89 stress 0.1518979 
## Run 90 stress 0.1533643 
## Run 91 stress 0.1601035 
## Run 92 stress 0.1542321 
## Run 93 stress 0.1511152 
## ... Procrustes: rmse 0.008157752  max resid 0.09341982 
## Run 94 stress 0.152718 
## Run 95 stress 0.1571403 
## Run 96 stress 0.1574568 
## Run 97 stress 0.1553585 
## Run 98 stress 0.1567396 
## Run 99 stress 0.1558058 
## Run 100 stress 0.1513413 
## ... Procrustes: rmse 0.00559654  max resid 0.07703664 
## *** Best solution was not repeated -- monoMDS stopping criteria:
##     26: no. of iterations >= maxit
##     70: stress ratio > sratmax
##      4: scale factor of the gradient < sfgrmin

# 1) get site scores
nmdsScores <- as.data.frame(scores(nmds, display = "sites"))
nmdsScores$trap <- beetle[, 3]   # your grouping variable

plot_nmds_panel <- function(df, xvar, yvar, stress_val) {
  ggplot(df, aes_string(x = xvar, y = yvar, colour = "trap")) +
    geom_point(size = 1.5, alpha = 0.7) +
    stat_ellipse(aes(group = trap), level = 0.95,
                 linetype = 1, linewidth = 0.25) +
    labs(
      title = paste(xvar, "vs", yvar),
      x = xvar,
      y = yvar,
      colour = "Trap type"
    ) +
    theme_minimal() +
    theme(
      legend.position = "top",
      axis.text.x = element_blank(),
      axis.text.y = element_blank(),
      axis.ticks  = element_blank(),
      plot.title = element_text(size = 10, face = "bold"),
      plot.subtitle = element_text(size = 9)
    )
}

p12 <- plot_nmds_panel(nmdsScores, "NMDS1", "NMDS2", nmds$stress)
p13 <- plot_nmds_panel(nmdsScores, "NMDS1", "NMDS3", nmds$stress)
p23 <- plot_nmds_panel(nmdsScores, "NMDS2", "NMDS3", nmds$stress)

gridExtra::grid.arrange(p12, p13, p23, ncol = 3,
                        top = textGrob(
    "Fig. 3B: Pairwise presentation of 3-D NMDS ordination"),
    bottom = textGrob("Stress = 0.151 | R² = 0.977")
    )

For simplicity, i decided to compress a 3-D NMDS ordination into 2-D by splitting the comparisons into pairwise ones. Along NMDS1, the species compositional separation is quite clear, since the 95% confidence ellipsoids do not overlap much. Species that were captured with the FIT trap type are quite different from those that were captured with the BPT trap type; only a handful of species were captured by both types. From left to right, the 95% confidence ellipsoids also overlap more, indicating that the separation becomes less and less clear for higher NMDS dimensions.

3B. Using appropriate statistical tests, report and interpret your results.

I chose to use PERMANOVA test to test for significant differences in the community compositions between two trap types. It does so by computing the dissimilarity matrix between sites based on the species abundance data, then partitions the sums of squared distances among groups (or explanatory factors) and residuals, calculates a pseudo-F value, then assesses significance via permutations of the data.

It is appropriate for this data because it doesn’t assume multivariate normality of raw species data, so it is not limited by possible non-normality in the data.

# Prepare the distance matrix
bray_dist <- vegdist(sp_transform, method = "bray")

# perform PERMANOVA
permanova <- adonis2(bray_dist ~ trap, data = beetle, permutations = 999)
permanova

## Permutation test for adonis under reduced model
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = bray_dist ~ trap, data = beetle, permutations = 999)
##           Df SumOfSqs      R2      F Pr(>F)
## Model      1   14.245 0.21941 84.888  0.001
## Residual 302   50.678 0.78059              
## Total    303   64.923 1.00000

# Check for non-equal dispersions using betadisper() and permutest()
dispersion_model <- betadisper(bray_dist, beetle$trap)
permutest(dispersion_model, pairwise = TRUE) # include pairwise comparisons

## 
## Permutation test for homogeneity of multivariate dispersions
## Permutation: free
## Number of permutations: 999
## 
## Response: Distances
##            Df Sum Sq  Mean Sq      F N.Perm Pr(>F)
## Groups      1 0.0294 0.029432 2.7506    999  0.085
## Residuals 302 3.2314 0.010700                     
## 
## Pairwise comparisons:
## (Observed p-value below diagonal, permuted p-value above diagonal)
##          BPT   FIT
## BPT          0.082
## FIT 0.098256

# Visualise differences in dispersions between groups
plot(dispersion_model)

There is strong evidence that there there is a compositional difference between trap types (\(F_{1, 302} = 84.89, P = 0.001\)).

Furthermore, the permutation test and dispersion_model plot shows that there is no statistically significant difference in group dispersions among trap types (\(F_{1, 302} = 2.75, P = 0.094\)). The plot shows substantial overlap between the FIT group and BPT group. This indicates that there is heterogeneity of multivariate dispersions between the trap types—spatial distance of each trap type from the centroids are quite similar. Thus, I can interpret that the significant differences observed in the PERMANOVA are due to differences in the group centroids, rather than the group dispersions.

ES3307 Assignment 4

Joylynn Goh

2025-10-28