# data
# load result dataset from step2
result <- read_excel("result_after_step2.xlsx")
# ensure date is a date
result$date_ym <- as.Date(result$date_ym)
## seasonality: Serfling Model (sine cosine)
# Serfling Model - https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1915276/
t <- result$t
result <- result %>%
ungroup() %>%
mutate(
sin12 = sin(2 * pi * t / 12),
cos12 = cos(2 * pi * t / 12)) # can also test different lengths (only tried 6 instead of 12, did not improve model)
# added variables for intervention and trends
result$int <- ifelse(result$year >= 2024, 1, 0) # int = intervention (jan 2024)
result$trend <- pmax(0, result$t - 16 + 1 ) # trend - slope post change
# removes ID with >5% missing and old LHU (removing PPP Cascais too)
clean_result <- result %>%
filter(!id_number %in% c(11, 19, 14, 24, 33, 34, 35, 36, 37, 38, 39, 40))
# this means keeping 12, 13 and 26, which also have some NAs (3.33%)
models - monthly non-urgent - white blue green (wbg)
# model 1: simple OLS just w/ seasonality adjustment
m_ols <- lm(
wbg ~ t + int + trend + sin12 + cos12 + offset(log(catch_area)),
data = clean_result)
# model 1.1: institutions and random intercepts (linear fixed effects models)
m_ols1 <- lmer(wbg ~ t + int + trend + sin12 + cos12 + (1 | id_number) + offset(log(catch_area)),
data = clean_result)
# model 2: negative binomial (supposedly better for counts)
m_nb <- glmmTMB(wbg ~ t + int + trend + sin12 + cos12 + (1 | id_number) + offset(log(catch_area)),
data = clean_result,
family = nbinom2)
# model 3: negative binomial and lag terms for autocorrelation
clean_result <- clean_result %>%
arrange(id_number, t) %>%
group_by(id_number) %>%
mutate(
lag1 = lag(wbg, 1),
lag2 = lag(wbg, 2),
lag3 = lag(wbg, 3),
lag4 = lag(wbg, 4),
lag5 = lag(wbg, 5)) %>%
ungroup()
m_nb2 <- glmmTMB(wbg ~ t + int + trend + sin12 + cos12 + (1 | id_number)
+ lag1 + lag2 + lag3 + lag4 + lag5 + offset(log(catch_area)),
data = clean_result,
family = nbinom2)
# model 4: neg bin, lag terms and random slopes (instead of intercepts)
m_nb_rs <- glmmTMB(wbg ~ t + int + trend + sin12 + cos12
+ (1 + t | id_number) # random slopes
+ lag1 + lag2 + lag3 + lag4 + lag5 + offset(log(catch_area)),
data = clean_result,
family = nbinom2)
# model 5: neg bin, lag terms and random slopes (instead of intercepts), leads and lags for ant/delayed effects
clean_result <- clean_result %>%
mutate(
# Leads (anticipatory effects)
int_lead2 = ifelse(t >= 14, 1, 0), # t=14,15 before intervention
int_lead1 = ifelse(t >= 15, 1, 0), # t=15 just before intervention
# Lags (delayed effects)
int_lag1 = ifelse(t >= 17, 1, 0), # t=17 onwards (1 month delay)
int_lag2 = ifelse(t >= 18, 1, 0), # t=18 onwards (2 month delay)
int_lag3 = ifelse(t >= 19, 1, 0)) # t=19 onwards (3 month delay)
m_nb_ll <- glmmTMB(wbg ~ t + int + trend + sin12 + cos12
+ (1 + t | id_number) # random slopes
+ int_lead2 + int_lead1 + int_lag1 + int_lag2 + int_lag3
+ lag1 + lag2 + lag3 + lag4 + lag5 + offset(log(catch_area)),
data = clean_result,
family = nbinom2)
# model 6: same as nb ll, but removing ns lag terms
m_nb_ll2 <- glmmTMB(wbg ~ t + int + trend + sin12 + cos12
+ (1 + t | id_number) # random slopes
+ int_lead1 + int_lag1 + int_lag2
+ lag1 + lag2 + lag3 + lag5 + offset(log(catch_area)),
data = clean_result,
family = nbinom2)
summaries
summary(m_ols)
##
## Call:
## lm(formula = wbg ~ t + int + trend + sin12 + cos12 + offset(log(catch_area)),
## data = clean_result)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4317.7 -1799.5 -347.4 939.6 10151.2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6311.843 278.992 22.624 <2e-16 ***
## t -76.706 30.769 -2.493 0.0129 *
## int 593.933 383.590 1.548 0.1219
## trend 4.534 43.528 0.104 0.9171
## sin12 -91.385 135.400 -0.675 0.4999
## cos12 241.229 135.432 1.781 0.0752 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2644 on 831 degrees of freedom
## (3 observations deleted due to missingness)
## Multiple R-squared: 0.0274, Adjusted R-squared: 0.02155
## F-statistic: 4.682 on 5 and 831 DF, p-value: 0.000319
summary(m_ols1)
## Linear mixed model fit by REML ['lmerMod']
## Formula:
## wbg ~ t + int + trend + sin12 + cos12 + (1 | id_number) + offset(log(catch_area))
## Data: clean_result
##
## REML criterion at convergence: 13495.1
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -4.1962 -0.6051 -0.0461 0.5732 3.8337
##
## Random effects:
## Groups Name Variance Std.Dev.
## id_number (Intercept) 6640100 2576.8
## Residual 518126 719.8
## Number of obs: 837, groups: id_number, 28
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 6312.098 492.866 12.807
## t -76.645 8.378 -9.148
## int 593.887 104.446 5.686
## trend 4.027 11.852 0.340
## sin12 -94.375 36.869 -2.560
## cos12 238.214 36.878 6.460
##
## Correlation of Fixed Effects:
## (Intr) t int trend sin12
## t -0.136
## int 0.052 -0.577
## trend 0.098 -0.707 -0.057
## sin12 -0.028 0.120 0.089 -0.231
## cos12 0.022 -0.209 0.283 0.064 0.002
summary(m_nb)
## Family: nbinom2 ( log )
## Formula:
## wbg ~ t + int + trend + sin12 + cos12 + (1 | id_number) + offset(log(catch_area))
## Data: clean_result
##
## AIC BIC logLik -2*log(L) df.resid
## 13530.5 13568.3 -6757.2 13514.5 829
##
## Random effects:
##
## Conditional model:
## Groups Name Variance Std.Dev.
## id_number (Intercept) 0.07965 0.2822
## Number of obs: 837, groups: id_number, 28
##
## Dispersion parameter for nbinom2 family (): 47
##
## Conditional model:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.8648309 0.0554952 -69.64 < 2e-16 ***
## t -0.0140348 0.0016846 -8.33 < 2e-16 ***
## int 0.1225341 0.0213878 5.73 1.01e-08 ***
## trend -0.0006062 0.0024237 -0.25 0.8025
## sin12 -0.0133298 0.0075350 -1.77 0.0769 .
## cos12 0.0426833 0.0075363 5.66 1.48e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(m_nb2)
## Family: nbinom2 ( log )
## Formula:
## wbg ~ t + int + trend + sin12 + cos12 + (1 | id_number) + lag1 +
## lag2 + lag3 + lag4 + lag5 + offset(log(catch_area))
## Data: clean_result
##
## AIC BIC logLik -2*log(L) df.resid
## 10740.8 10799.6 -5357.4 10714.8 670
##
## Random effects:
##
## Conditional model:
## Groups Name Variance Std.Dev.
## id_number (Intercept) 0.1405 0.3748
## Number of obs: 683, groups: id_number, 28
##
## Dispersion parameter for nbinom2 family (): 76.6
##
## Conditional model:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.876e+00 1.027e-01 -47.50 < 2e-16 ***
## t -1.246e-03 3.422e-03 -0.36 0.71580
## int 5.241e-02 2.273e-02 2.31 0.02114 *
## trend -7.029e-03 3.394e-03 -2.07 0.03839 *
## sin12 -1.155e-02 8.351e-03 -1.38 0.16653
## cos12 1.645e-03 7.544e-03 0.22 0.82738
## lag1 9.213e-05 8.610e-06 10.70 < 2e-16 ***
## lag2 3.956e-05 9.368e-06 4.22 2.41e-05 ***
## lag3 -1.849e-05 9.227e-06 -2.00 0.04514 *
## lag4 1.862e-05 8.819e-06 2.11 0.03469 *
## lag5 2.491e-05 8.710e-06 2.86 0.00424 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(m_nb_rs)
## Family: nbinom2 ( log )
## Formula:
## wbg ~ t + int + trend + sin12 + cos12 + (1 + t | id_number) +
## lag1 + lag2 + lag3 + lag4 + lag5 + offset(log(catch_area))
## Data: clean_result
##
## AIC BIC logLik -2*log(L) df.resid
## 10646.6 10714.5 -5308.3 10616.6 668
##
## Random effects:
##
## Conditional model:
## Groups Name Variance Std.Dev. Corr
## id_number (Intercept) 8.762e-02 0.296007
## t 7.872e-05 0.008873 -0.42
## Number of obs: 683, groups: id_number, 28
##
## Dispersion parameter for nbinom2 family (): 95.7
##
## Conditional model:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.434e+00 1.027e-01 -43.16 < 2e-16 ***
## t -4.478e-03 3.546e-03 -1.26 0.206734
## int 6.765e-02 2.051e-02 3.30 0.000973 ***
## trend -6.592e-03 3.073e-03 -2.15 0.031947 *
## sin12 -1.365e-02 7.533e-03 -1.81 0.069961 .
## cos12 1.197e-02 6.851e-03 1.75 0.080674 .
## lag1 6.499e-05 7.974e-06 8.15 3.63e-16 ***
## lag2 2.398e-05 8.405e-06 2.85 0.004330 **
## lag3 -2.871e-05 8.383e-06 -3.42 0.000615 ***
## lag4 6.807e-06 8.172e-06 0.83 0.404909
## lag5 1.673e-05 8.274e-06 2.02 0.043191 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(m_nb_ll)
## Family: nbinom2 ( log )
## Formula:
## wbg ~ t + int + trend + sin12 + cos12 + (1 + t | id_number) +
## int_lead2 + int_lead1 + int_lag1 + int_lag2 + int_lag3 +
## lag1 + lag2 + lag3 + lag4 + lag5 + offset(log(catch_area))
## Data: clean_result
##
## AIC BIC logLik -2*log(L) df.resid
## 10560.6 10651.2 -5260.3 10520.6 663
##
## Random effects:
##
## Conditional model:
## Groups Name Variance Std.Dev. Corr
## id_number (Intercept) 0.0867118 0.294469
## t 0.0000872 0.009338 -0.47
## Number of obs: 683, groups: id_number, 28
##
## Dispersion parameter for nbinom2 family (): 112
##
## Conditional model:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.264e+00 1.078e-01 -39.56 < 2e-16 ***
## t -1.411e-02 4.461e-03 -3.16 0.001565 **
## int -7.151e-02 2.671e-02 -2.68 0.007414 **
## trend -8.533e-03 4.534e-03 -1.88 0.059840 .
## sin12 3.341e-02 9.594e-03 3.48 0.000498 ***
## cos12 1.592e-02 7.779e-03 2.05 0.040694 *
## int_lead2 -3.035e-02 2.648e-02 -1.15 0.251734
## int_lead1 4.631e-02 2.735e-02 1.69 0.090413 .
## int_lag1 1.489e-01 2.696e-02 5.52 3.35e-08 ***
## int_lag2 1.270e-01 2.731e-02 4.65 3.29e-06 ***
## int_lag3 -1.004e-02 2.658e-02 -0.38 0.705627
## lag1 5.655e-05 7.836e-06 7.22 5.31e-13 ***
## lag2 2.500e-05 8.153e-06 3.07 0.002164 **
## lag3 -1.482e-05 8.170e-06 -1.81 0.069599 .
## lag4 -2.614e-06 7.806e-06 -0.33 0.737749
## lag5 9.139e-06 7.867e-06 1.16 0.245355
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(m_nb_ll2)
## Family: nbinom2 ( log )
## Formula:
## wbg ~ t + int + trend + sin12 + cos12 + (1 + t | id_number) +
## int_lead1 + int_lag1 + int_lag2 + lag1 + lag2 + lag3 + lag5 +
## offset(log(catch_area))
## Data: clean_result
##
## AIC BIC logLik -2*log(L) df.resid
## 10605.0 10682.0 -5285.5 10571.0 669
##
## Random effects:
##
## Conditional model:
## Groups Name Variance Std.Dev. Corr
## id_number (Intercept) 8.653e-02 0.294166
## t 8.749e-05 0.009353 -0.47
## Number of obs: 686, groups: id_number, 28
##
## Dispersion parameter for nbinom2 family (): 111
##
## Conditional model:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.233e+00 1.012e-01 -41.84 < 2e-16 ***
## t -1.644e-02 3.936e-03 -4.18 2.97e-05 ***
## int -6.739e-02 2.655e-02 -2.54 0.011140 *
## trend -6.822e-03 3.402e-03 -2.01 0.044909 *
## sin12 3.275e-02 8.495e-03 3.86 0.000115 ***
## cos12 1.769e-02 6.821e-03 2.59 0.009491 **
## int_lead1 3.047e-02 2.506e-02 1.22 0.224036
## int_lag1 1.464e-01 2.660e-02 5.50 3.73e-08 ***
## int_lag2 1.214e-01 2.518e-02 4.82 1.43e-06 ***
## lag1 5.483e-05 7.569e-06 7.24 4.34e-13 ***
## lag2 2.577e-05 7.901e-06 3.26 0.001110 **
## lag3 -1.706e-05 7.753e-06 -2.20 0.027753 *
## lag5 7.605e-06 7.734e-06 0.98 0.325450
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
residuals plots
acf(residuals(m_ols), main = "OLS Model - ACF of Residuals")

acf(residuals(m_ols1), main = "OLS Model 1 - ACF of Residuals")

acf(residuals(m_nb, type = "pearson"), main = "Negative Binomial - ACF of Residuals")

acf(residuals(m_nb2, type = "pearson"), main = "NB with Lags - ACF of Residuals")

acf(residuals(m_nb_rs, type = "pearson"), main = "NB with Lags and RS - ACF of Residuals")

acf(residuals(m_nb_ll, type = "pearson"), main = "NB with Lags, RS, A/D effects - ACF of Residuals")

acf(residuals(m_nb_ll2, type = "pearson"), main = "NB with Lags, RS, A/D effects - ACF of Residuals")
