# data 
# load result dataset from step2
result <- read_excel("result_after_step2.xlsx")

# ensure date is a date
result$date_ym <- as.Date(result$date_ym)

## seasonality: Serfling Model (sine cosine) 
# Serfling Model - https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1915276/
t <- result$t
result <- result %>%
  ungroup() %>%  
  mutate(
    sin12 = sin(2 * pi * t / 12),
    cos12 = cos(2 * pi * t / 12)) # can also test different lengths (only tried 6 instead of 12, did not improve model)

# added variables for intervention and trends
result$int <- ifelse(result$year >= 2024, 1, 0)  # int = intervention (jan 2024)
result$trend <- pmax(0, result$t - 16 + 1 )      # trend - slope post change

# removes ID with >5% missing and old LHU (removing PPP Cascais too)
clean_result <- result %>% 
  filter(!id_number %in% c(11, 19, 14, 24, 33, 34, 35, 36, 37, 38, 39, 40))

# this means keeping 12, 13 and 26, which also have some NAs (3.33%)

models - monthly non-urgent - white blue green (wbg)

# model 1: simple OLS just w/ seasonality adjustment
m_ols <- lm(
  wbg ~ t + int + trend + sin12 + cos12 + offset(log(catch_area)),
  data = clean_result)

# model 1.1: institutions and random intercepts (linear fixed effects models)
m_ols1 <- lmer(wbg ~ t + int + trend + sin12 + cos12 + (1 | id_number) + offset(log(catch_area)),
                 data = clean_result)

# model 2: negative binomial (supposedly better for counts)
m_nb <- glmmTMB(wbg ~ t + int + trend + sin12 + cos12 + (1 | id_number) + offset(log(catch_area)),
                 data = clean_result,
                 family = nbinom2)

# model 3: negative binomial and lag terms for autocorrelation
clean_result <- clean_result %>%
  arrange(id_number, t) %>%
  group_by(id_number) %>%
  mutate(
    lag1 = lag(wbg, 1),
    lag2 = lag(wbg, 2),
    lag3 = lag(wbg, 3),
    lag4 = lag(wbg, 4),
    lag5 = lag(wbg, 5)) %>%
  ungroup()

m_nb2 <- glmmTMB(wbg ~ t + int + trend + sin12 + cos12 + (1 | id_number) 
                + lag1 + lag2 + lag3 + lag4 + lag5 + offset(log(catch_area)),
                data = clean_result,
                family = nbinom2)

# model 4: neg bin, lag terms and random slopes (instead of intercepts)
m_nb_rs <- glmmTMB(wbg ~ t + int + trend + sin12 + cos12 
                + (1 + t | id_number) # random slopes
                + lag1 + lag2 + lag3 + lag4 + lag5 + offset(log(catch_area)),
                data = clean_result,
                family = nbinom2)

# model 5: neg bin, lag terms and random slopes (instead of intercepts), leads and lags for ant/delayed effects
clean_result <- clean_result %>%
  mutate(
    # Leads (anticipatory effects)
    int_lead2 = ifelse(t >= 14, 1, 0),  # t=14,15 before intervention
    int_lead1 = ifelse(t >= 15, 1, 0),  # t=15 just before intervention
    # Lags (delayed effects)
    int_lag1 = ifelse(t >= 17, 1, 0),   # t=17 onwards (1 month delay)
    int_lag2 = ifelse(t >= 18, 1, 0),   # t=18 onwards (2 month delay)
    int_lag3 = ifelse(t >= 19, 1, 0))   # t=19 onwards (3 month delay)

m_nb_ll <- glmmTMB(wbg ~ t + int + trend + sin12 + cos12 
                + (1 + t | id_number) # random slopes
                + int_lead2 + int_lead1 + int_lag1 + int_lag2 + int_lag3 
                + lag1 + lag2 + lag3 + lag4 + lag5 + offset(log(catch_area)),
                data = clean_result,
                family = nbinom2)

# model 6: same as nb ll, but removing ns lag terms
m_nb_ll2 <- glmmTMB(wbg ~ t + int + trend + sin12 + cos12 
                + (1 + t | id_number) # random slopes
                + int_lead1 + int_lag1 + int_lag2 
                + lag1 + lag2 + lag3 + lag5 + offset(log(catch_area)),
                data = clean_result,
                family = nbinom2)

summaries

summary(m_ols)
## 
## Call:
## lm(formula = wbg ~ t + int + trend + sin12 + cos12 + offset(log(catch_area)), 
##     data = clean_result)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4317.7 -1799.5  -347.4   939.6 10151.2 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 6311.843    278.992  22.624   <2e-16 ***
## t            -76.706     30.769  -2.493   0.0129 *  
## int          593.933    383.590   1.548   0.1219    
## trend          4.534     43.528   0.104   0.9171    
## sin12        -91.385    135.400  -0.675   0.4999    
## cos12        241.229    135.432   1.781   0.0752 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2644 on 831 degrees of freedom
##   (3 observations deleted due to missingness)
## Multiple R-squared:  0.0274, Adjusted R-squared:  0.02155 
## F-statistic: 4.682 on 5 and 831 DF,  p-value: 0.000319
summary(m_ols1)
## Linear mixed model fit by REML ['lmerMod']
## Formula: 
## wbg ~ t + int + trend + sin12 + cos12 + (1 | id_number) + offset(log(catch_area))
##    Data: clean_result
## 
## REML criterion at convergence: 13495.1
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.1962 -0.6051 -0.0461  0.5732  3.8337 
## 
## Random effects:
##  Groups    Name        Variance Std.Dev.
##  id_number (Intercept) 6640100  2576.8  
##  Residual               518126   719.8  
## Number of obs: 837, groups:  id_number, 28
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept) 6312.098    492.866  12.807
## t            -76.645      8.378  -9.148
## int          593.887    104.446   5.686
## trend          4.027     11.852   0.340
## sin12        -94.375     36.869  -2.560
## cos12        238.214     36.878   6.460
## 
## Correlation of Fixed Effects:
##       (Intr) t      int    trend  sin12 
## t     -0.136                            
## int    0.052 -0.577                     
## trend  0.098 -0.707 -0.057              
## sin12 -0.028  0.120  0.089 -0.231       
## cos12  0.022 -0.209  0.283  0.064  0.002
summary(m_nb)
##  Family: nbinom2  ( log )
## Formula:          
## wbg ~ t + int + trend + sin12 + cos12 + (1 | id_number) + offset(log(catch_area))
## Data: clean_result
## 
##       AIC       BIC    logLik -2*log(L)  df.resid 
##   13530.5   13568.3   -6757.2   13514.5       829 
## 
## Random effects:
## 
## Conditional model:
##  Groups    Name        Variance Std.Dev.
##  id_number (Intercept) 0.07965  0.2822  
## Number of obs: 837, groups:  id_number, 28
## 
## Dispersion parameter for nbinom2 family ():   47 
## 
## Conditional model:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -3.8648309  0.0554952  -69.64  < 2e-16 ***
## t           -0.0140348  0.0016846   -8.33  < 2e-16 ***
## int          0.1225341  0.0213878    5.73 1.01e-08 ***
## trend       -0.0006062  0.0024237   -0.25   0.8025    
## sin12       -0.0133298  0.0075350   -1.77   0.0769 .  
## cos12        0.0426833  0.0075363    5.66 1.48e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(m_nb2)
##  Family: nbinom2  ( log )
## Formula:          
## wbg ~ t + int + trend + sin12 + cos12 + (1 | id_number) + lag1 +  
##     lag2 + lag3 + lag4 + lag5 + offset(log(catch_area))
## Data: clean_result
## 
##       AIC       BIC    logLik -2*log(L)  df.resid 
##   10740.8   10799.6   -5357.4   10714.8       670 
## 
## Random effects:
## 
## Conditional model:
##  Groups    Name        Variance Std.Dev.
##  id_number (Intercept) 0.1405   0.3748  
## Number of obs: 683, groups:  id_number, 28
## 
## Dispersion parameter for nbinom2 family (): 76.6 
## 
## Conditional model:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -4.876e+00  1.027e-01  -47.50  < 2e-16 ***
## t           -1.246e-03  3.422e-03   -0.36  0.71580    
## int          5.241e-02  2.273e-02    2.31  0.02114 *  
## trend       -7.029e-03  3.394e-03   -2.07  0.03839 *  
## sin12       -1.155e-02  8.351e-03   -1.38  0.16653    
## cos12        1.645e-03  7.544e-03    0.22  0.82738    
## lag1         9.213e-05  8.610e-06   10.70  < 2e-16 ***
## lag2         3.956e-05  9.368e-06    4.22 2.41e-05 ***
## lag3        -1.849e-05  9.227e-06   -2.00  0.04514 *  
## lag4         1.862e-05  8.819e-06    2.11  0.03469 *  
## lag5         2.491e-05  8.710e-06    2.86  0.00424 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(m_nb_rs)
##  Family: nbinom2  ( log )
## Formula:          
## wbg ~ t + int + trend + sin12 + cos12 + (1 + t | id_number) +  
##     lag1 + lag2 + lag3 + lag4 + lag5 + offset(log(catch_area))
## Data: clean_result
## 
##       AIC       BIC    logLik -2*log(L)  df.resid 
##   10646.6   10714.5   -5308.3   10616.6       668 
## 
## Random effects:
## 
## Conditional model:
##  Groups    Name        Variance  Std.Dev. Corr  
##  id_number (Intercept) 8.762e-02 0.296007       
##            t           7.872e-05 0.008873 -0.42 
## Number of obs: 683, groups:  id_number, 28
## 
## Dispersion parameter for nbinom2 family (): 95.7 
## 
## Conditional model:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -4.434e+00  1.027e-01  -43.16  < 2e-16 ***
## t           -4.478e-03  3.546e-03   -1.26 0.206734    
## int          6.765e-02  2.051e-02    3.30 0.000973 ***
## trend       -6.592e-03  3.073e-03   -2.15 0.031947 *  
## sin12       -1.365e-02  7.533e-03   -1.81 0.069961 .  
## cos12        1.197e-02  6.851e-03    1.75 0.080674 .  
## lag1         6.499e-05  7.974e-06    8.15 3.63e-16 ***
## lag2         2.398e-05  8.405e-06    2.85 0.004330 ** 
## lag3        -2.871e-05  8.383e-06   -3.42 0.000615 ***
## lag4         6.807e-06  8.172e-06    0.83 0.404909    
## lag5         1.673e-05  8.274e-06    2.02 0.043191 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(m_nb_ll)
##  Family: nbinom2  ( log )
## Formula:          
## wbg ~ t + int + trend + sin12 + cos12 + (1 + t | id_number) +  
##     int_lead2 + int_lead1 + int_lag1 + int_lag2 + int_lag3 +  
##     lag1 + lag2 + lag3 + lag4 + lag5 + offset(log(catch_area))
## Data: clean_result
## 
##       AIC       BIC    logLik -2*log(L)  df.resid 
##   10560.6   10651.2   -5260.3   10520.6       663 
## 
## Random effects:
## 
## Conditional model:
##  Groups    Name        Variance  Std.Dev. Corr  
##  id_number (Intercept) 0.0867118 0.294469       
##            t           0.0000872 0.009338 -0.47 
## Number of obs: 683, groups:  id_number, 28
## 
## Dispersion parameter for nbinom2 family ():  112 
## 
## Conditional model:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -4.264e+00  1.078e-01  -39.56  < 2e-16 ***
## t           -1.411e-02  4.461e-03   -3.16 0.001565 ** 
## int         -7.151e-02  2.671e-02   -2.68 0.007414 ** 
## trend       -8.533e-03  4.534e-03   -1.88 0.059840 .  
## sin12        3.341e-02  9.594e-03    3.48 0.000498 ***
## cos12        1.592e-02  7.779e-03    2.05 0.040694 *  
## int_lead2   -3.035e-02  2.648e-02   -1.15 0.251734    
## int_lead1    4.631e-02  2.735e-02    1.69 0.090413 .  
## int_lag1     1.489e-01  2.696e-02    5.52 3.35e-08 ***
## int_lag2     1.270e-01  2.731e-02    4.65 3.29e-06 ***
## int_lag3    -1.004e-02  2.658e-02   -0.38 0.705627    
## lag1         5.655e-05  7.836e-06    7.22 5.31e-13 ***
## lag2         2.500e-05  8.153e-06    3.07 0.002164 ** 
## lag3        -1.482e-05  8.170e-06   -1.81 0.069599 .  
## lag4        -2.614e-06  7.806e-06   -0.33 0.737749    
## lag5         9.139e-06  7.867e-06    1.16 0.245355    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(m_nb_ll2)
##  Family: nbinom2  ( log )
## Formula:          
## wbg ~ t + int + trend + sin12 + cos12 + (1 + t | id_number) +  
##     int_lead1 + int_lag1 + int_lag2 + lag1 + lag2 + lag3 + lag5 +  
##     offset(log(catch_area))
## Data: clean_result
## 
##       AIC       BIC    logLik -2*log(L)  df.resid 
##   10605.0   10682.0   -5285.5   10571.0       669 
## 
## Random effects:
## 
## Conditional model:
##  Groups    Name        Variance  Std.Dev. Corr  
##  id_number (Intercept) 8.653e-02 0.294166       
##            t           8.749e-05 0.009353 -0.47 
## Number of obs: 686, groups:  id_number, 28
## 
## Dispersion parameter for nbinom2 family ():  111 
## 
## Conditional model:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -4.233e+00  1.012e-01  -41.84  < 2e-16 ***
## t           -1.644e-02  3.936e-03   -4.18 2.97e-05 ***
## int         -6.739e-02  2.655e-02   -2.54 0.011140 *  
## trend       -6.822e-03  3.402e-03   -2.01 0.044909 *  
## sin12        3.275e-02  8.495e-03    3.86 0.000115 ***
## cos12        1.769e-02  6.821e-03    2.59 0.009491 ** 
## int_lead1    3.047e-02  2.506e-02    1.22 0.224036    
## int_lag1     1.464e-01  2.660e-02    5.50 3.73e-08 ***
## int_lag2     1.214e-01  2.518e-02    4.82 1.43e-06 ***
## lag1         5.483e-05  7.569e-06    7.24 4.34e-13 ***
## lag2         2.577e-05  7.901e-06    3.26 0.001110 ** 
## lag3        -1.706e-05  7.753e-06   -2.20 0.027753 *  
## lag5         7.605e-06  7.734e-06    0.98 0.325450    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

residuals plots

acf(residuals(m_ols), main = "OLS Model - ACF of Residuals")

acf(residuals(m_ols1), main = "OLS Model 1 - ACF of Residuals")

acf(residuals(m_nb, type = "pearson"), main = "Negative Binomial - ACF of Residuals")

acf(residuals(m_nb2, type = "pearson"), main = "NB with Lags - ACF of Residuals")

acf(residuals(m_nb_rs, type = "pearson"), main = "NB with Lags and RS - ACF of Residuals")

acf(residuals(m_nb_ll, type = "pearson"), main = "NB with Lags, RS, A/D effects - ACF of Residuals")

acf(residuals(m_nb_ll2, type = "pearson"), main = "NB with Lags, RS, A/D effects - ACF of Residuals")