# had to remove this from knitting, it was not working when I tried to knit, couldn't fix it. keeping it here just for reference.

clean_subset <- read_excel("clean_result_29LHU.xlsx")
# this file has 28 LHU - old LHU removed, as well as 24 (PPP) 11, 14, 19 ( > 5% missings))

demo <- read_excel("~/Library/CloudStorage/OneDrive-ensp.unl.pt/ENSP 2024/ULS_paper/LHU_demograph_SES.xlsx")
# this file has confounders - telephone triage, 65+, mpp_weighted.
demo$telephone <- ym(demo$telephone)  # "YYYY-MM" recognised

clean_result <- inner_join( clean_subset, demo,
                            by = c("id_number"))

clean_result$impl <- ifelse(
  clean_result$tel != 0 & clean_result$t >= clean_result$tel,
  1,0)

# additional gp coverage
# summarise the totals (check script4 for full code)
df_sums <- gp_cov %>%
  group_by(id_number, date_ym) %>%
  summarise(
    utentes_insc = sum(utentes_inscritos, na.rm = TRUE),
    utentes_gp = sum(utentes_mdf, na.rm = TRUE),
    .groups = "drop")

clean_result <- clean_result %>%
  left_join( # left join is better bc inner_join will drop unmatched vars 
    df_sums,
    by = c("id_number", "date_ym")
  )

# variables
clean_result <- clean_result %>%
  arrange(id_number, t) %>%
  group_by(id_number) %>%
  mutate(
    p_gp_cov = utentes_gp / utentes_insc,
    p_gv_cov_ca = utentes_gp / catch_area
  ) %>%
  ungroup()

# variables
clean_result <- clean_result %>%
  arrange(id_number, t) %>%
  group_by(id_number) %>%
  mutate(
    lag1 = lag(wbg, 1),
    lag2 = lag(wbg, 2),
    lag3 = lag(wbg, 3),
    lag4 = lag(wbg, 4),
    lag5 = lag(wbg, 5)) %>%
  ungroup()

# creating leads and lags
clean_result <- clean_result %>%
  mutate(
    # Leads (anticipatory effects)
    int_lead2 = ifelse(t >= 14, 1, 0),  # t=14,15 before intervention
    int_lead1 = ifelse(t >= 15, 1, 0),  # t=15 just before intervention
    # Lags (delayed effects)
    int_lag1 = ifelse(t >= 17, 1, 0),   # t=17 onwards (1 month delay)
    int_lag2 = ifelse(t >= 18, 1, 0),   # t=18 onwards (2 month delay)
    int_lag3 = ifelse(t >= 19, 1, 0))   # t=19 onwards (3 month delay)

# lags for autocorrelation
clean_result$month <- factor((clean_result$t - 1) %% 12 + 1) # different approach to seasonality
clean_result$lag1 <- dplyr::lag(clean_result$wbg, 1)
clean_result$lag2 <- dplyr::lag(clean_result$wbg, 2)
clean_result$lag3 <- dplyr::lag(clean_result$wbg, 3)
clean_result$lag4 <- dplyr::lag(clean_result$wbg, 4)
clean_result$lag5 <- dplyr::lag(clean_result$wbg, 5)

# I was getting convergence errors with the latest model, so Claude suggested rescaling time variables (t and lags)
clean_result$tz <- scale(clean_result$t)
clean_result$lag1_z <- scale(clean_result$lag1)
clean_result$lag2_z <- scale(clean_result$lag2)
clean_result$lag3_z <- scale(clean_result$lag3)
clean_result$lag4_z <- scale(clean_result$lag4)
clean_result$lag5_z <- scale(clean_result$lag5)

write_xlsx(clean_result, "clean_result_full.xlsx")

models

clean_result<- read_excel("clean_result_full.xlsx")

m1 <- glmmTMB(
  wbg ~ tz +
    int_lead2 + int_lead1 + # anticipatory effects
    int +                    # reform
    int_lag1 + int_lag2 + int_lag3 +  # delayed effects
    trend +                  # slope
    sin12 + cos12 + 
    mpp_weighted + sixfive + impl + p_gp_cov + # confounders       
    lag1_z + lag3_z + lag5_z +lag2_z + lag4_z + # lags for autocorrelation
    (1 + tz | id_number) +       # random slopes
    offset(log(catch_area)),    #normalisation for catchment area
  family = nbinom2,   
  data = clean_result)
summary(m1)
##  Family: nbinom2  ( log )
## Formula:          
## wbg ~ tz + int_lead2 + int_lead1 + int + int_lag1 + int_lag2 +  
##     int_lag3 + trend + sin12 + cos12 + mpp_weighted + sixfive +  
##     impl + p_gp_cov + lag1_z + lag3_z + lag5_z + lag2_z + lag4_z +  
##     (1 + tz | id_number) + offset(log(catch_area))
## Data: clean_result
## 
##       AIC       BIC    logLik -2*log(L)  df.resid 
##   12755.8   12868.7   -6353.9   12707.8       793 
## 
## Random effects:
## 
## Conditional model:
##  Groups    Name        Variance Std.Dev. Corr  
##  id_number (Intercept) 0.052629 0.22941        
##            tz          0.004496 0.06705  -0.03 
## Number of obs: 817, groups:  id_number, 28
## 
## Dispersion parameter for nbinom2 family (): 94.2 
## 
## Conditional model:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -3.854e+00  5.150e-01  -7.483 7.24e-14 ***
## tz           -1.199e-01  2.057e-02  -5.830 5.55e-09 ***
## int_lead2    -4.947e-03  2.646e-02  -0.187 0.851657    
## int_lead1     5.648e-02  2.861e-02   1.974 0.048364 *  
## int          -8.454e-02  2.856e-02  -2.960 0.003073 ** 
## int_lag1      1.230e-01  2.877e-02   4.276 1.90e-05 ***
## int_lag2      1.128e-01  2.877e-02   3.921 8.80e-05 ***
## int_lag3     -3.936e-02  2.716e-02  -1.449 0.147272    
## trend         1.025e-03  3.527e-03   0.291 0.771386    
## sin12         8.972e-03  8.237e-03   1.089 0.276027    
## cos12         2.702e-02  6.412e-03   4.213 2.52e-05 ***
## mpp_weighted -4.550e-03  2.615e-03  -1.740 0.081917 .  
## sixfive       1.449e+00  1.454e+00   0.996 0.319216    
## impl         -1.543e-01  2.092e-02  -7.375 1.64e-13 ***
## p_gp_cov     -1.763e-01  1.855e-01  -0.951 0.341747    
## lag1_z        7.316e-02  1.299e-02   5.634 1.76e-08 ***
## lag3_z       -3.079e-02  1.340e-02  -2.298 0.021565 *  
## lag5_z        3.731e-02  1.068e-02   3.494 0.000475 ***
## lag2_z       -1.396e-03  1.370e-02  -0.102 0.918812    
## lag4_z       -5.817e-05  1.308e-02  -0.004 0.996452    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
diagnose(m1)
## Unusually large Z-statistics (|x|>5):
## 
##            (Intercept)                     tz                   impl 
##              -7.483399              -5.829646              -7.375385 
##                 lag1_z       disp~(Intercept) theta_1+tz|id_number.1 
##               5.634261              86.444265             -10.415132 
## theta_1+tz|id_number.2 
##             -18.483314 
## 
## Large Z-statistics (estimate/std err) suggest a *possible* failure of
## the Wald approximation - often also associated with parameters that are
## at or near the edge of their range (e.g. random-effects standard
## deviations approaching 0).  (Alternately, they may simply represent
## very well-estimated parameters; intercepts of non-centered models may
## fall in this category.) While the Wald p-values and standard errors
## listed in summary() may be unreliable, profile confidence intervals
## (see ?confint.glmmTMB) and likelihood ratio test p-values derived by
## comparing models (e.g. ?drop1) are probably still OK.  (Note that the
## LRT is conservative when the null value is on the boundary, e.g. a
## variance or zero-inflation value of 0 (Self and Liang 1987; Stram and
## Lee 1994; Goldman and Whelan 2000); in simple cases these p-values are
## approximately twice as large as they should be.)

plots

# predicted values from model m1 (latest, above)
clean_result$predicted <- predict(m1, newdata = clean_result, type = "response")

# averages and SEs for predictions
avg_predictions <- aggregate(predicted ~ t, data = clean_result, mean, na.rm = TRUE)
se_predictions <- aggregate(predicted ~ t, data = clean_result, 
                            function(x) sd(x, na.rm = TRUE)/sqrt(length(x)))
avg_with_se <- merge(avg_predictions, se_predictions, by = "t", suffixes = c("_mean", "_se"))

# averages and SEs for observed data
avg_observed <- aggregate(wbg ~ t, data = clean_result, mean, na.rm = TRUE)
se_observed <- aggregate(wbg ~ t, data = clean_result, 
                         function(x) sd(x, na.rm = TRUE)/sqrt(length(x)))
obs_with_se <- merge(avg_observed, se_observed, by = "t", suffixes = c("_mean", "_se"))

# CI with SE
avg_with_se$upper_ci <- avg_with_se$predicted_mean + 1.96 * avg_with_se$predicted_se
avg_with_se$lower_ci <- avg_with_se$predicted_mean - 1.96 * avg_with_se$predicted_se
obs_with_se$upper_ci <- obs_with_se$wbg_mean + 1.96 * obs_with_se$wbg_se
obs_with_se$lower_ci <- obs_with_se$wbg_mean - 1.96 * obs_with_se$wbg_se

# pre / post int
int <- 16 
pre_pred <- avg_with_se[avg_with_se$t <= int, ]
post_pred <- avg_with_se[avg_with_se$t > int, ]
pre_obs <- obs_with_se[obs_with_se$t <= int, ]
post_obs <- obs_with_se[obs_with_se$t > int, ]

# ggplot
plot_df <- avg_with_se %>%
  rename(pred_mean = predicted_mean,
         pred_se = predicted_se,
         pred_lower = lower_ci,
         pred_upper = upper_ci) %>%
  left_join(obs_with_se %>%
              rename(obs_mean = wbg_mean,
                     obs_se = wbg_se,
                     obs_lower = lower_ci,
                     obs_upper = upper_ci), by = "t")

ggplot(plot_df, aes(x = t)) +
  geom_ribbon(aes(ymin = pred_lower, ymax = pred_upper), fill = "lightgrey", alpha = 0.4) + # predicted CI
  geom_ribbon(aes(ymin = obs_lower, ymax = obs_upper), fill = "lightblue", alpha = 0.4) + # obsv CI
  geom_line(aes(y = pred_mean), color = "darkgrey", linetype = "dashed", linewidth = 1) + # predicted line
  geom_line(aes(y = obs_mean), color = "steelblue", size = 1) + # observed line
  geom_vline(xintercept = int, linetype = "dotted", color = "grey", size = 1) +
  scale_x_continuous(breaks = seq(min(plot_df$t), max(plot_df$t), by = 2)) +
  labs(
    title = "Observed vs Predicted Non-Urgent ED Visits",
    subtitle = "95% CI",
    x = "Month",
    y = "ED visits per 100 000 population") +
  theme_minimal(base_size = 14) +
  theme(
    panel.grid.minor = element_blank(),
    plot.title = element_text(face = "bold"),
    legend.position = "none")

# descriptive tables
summary_table <- clean_result %>%
  group_by(int) %>%
  summarise(
    mean_nonurgent = mean(wbg, na.rm = TRUE),  
    sd_nonurgent = sd(wbg, na.rm = TRUE),
    n = n())
summary_table
## # A tibble: 2 × 4
##     int mean_nonurgent sd_nonurgent     n
##   <dbl>          <dbl>        <dbl> <int>
## 1     0          5718.        2733.   420
## 2     1          5144.        2582.   420
summary_table_by_id <- clean_result %>%
  group_by(int, id_number) %>%
  summarise(
    mean_nonurgent = mean(wbg, na.rm = TRUE),  
    sd_nonurgent = sd(wbg, na.rm = TRUE),
    n = n())
print(as.data.frame(summary_table_by_id), row.names = FALSE)
##  int id_number mean_nonurgent sd_nonurgent  n
##    0         1       5901.133     584.0652 15
##    0         2       6154.200     562.1087 15
##    0         3       6041.067     591.0172 15
##    0         4       3340.200     538.0506 15
##    0         5       5417.733     495.1931 15
##    0         6       4282.267     356.2003 15
##    0         7       7083.267     684.1884 15
##    0         8       7322.067     737.1867 15
##    0         9       8007.667     863.6231 15
##    0        10       5563.533     568.0884 15
##    0        12       5970.333     841.5255 15
##    0        13       5134.733     363.9279 15
##    0        15       2008.467     260.8097 15
##    0        16       3368.800     367.8768 15
##    0        17       4007.000     412.3345 15
##    0        18      13780.467    1022.4401 15
##    0        20       6769.067     575.8930 15
##    0        21       5941.467     607.9660 15
##    0        22       6432.133     924.7451 15
##    0        23       7899.333     389.9102 15
##    0        25       4102.400    1090.6784 15
##    0        26       4606.133     672.0864 15
##    0        27       2299.333     310.8632 15
##    0        28       4523.800     640.8488 15
##    0        29       2023.933     368.3223 15
##    0        30       6225.533     674.7899 15
##    0        31      12697.467    1110.6897 15
##    0        32       3208.200     246.3993 15
##    1         1       5404.933     397.8506 15
##    1         2       4682.933     782.4967 15
##    1         3       5435.667     555.9902 15
##    1         4       3281.600     470.7083 15
##    1         5       4410.000     890.8028 15
##    1         6       3652.267     912.9976 15
##    1         7       6263.267    1303.9808 15
##    1         8       6315.533     650.3263 15
##    1         9       4624.467    1393.9224 15
##    1        10       5463.800     574.9205 15
##    1        12       5874.643    1130.6567 15
##    1        13       4715.429     442.0031 15
##    1        15       1901.400     191.1154 15
##    1        16       3218.067     376.4307 15
##    1        17       4038.400     525.3116 15
##    1        18      12890.400    1077.6298 15
##    1        20       6809.933     949.3547 15
##    1        21       4845.400     714.2521 15
##    1        22       6257.333     603.8131 15
##    1        23       7470.600     676.4018 15
##    1        25       2987.400     351.7556 15
##    1        26       4449.143     556.9231 15
##    1        27       1985.200     225.6607 15
##    1        28       3384.267     499.9343 15
##    1        29       3382.733     605.3661 15
##    1        30       5378.400     980.2645 15
##    1        31      12060.200    1132.7360 15
##    1        32       2810.933     452.1715 15
id_LHU<- read_excel("id_number_2024.xlsx")
print(as.data.frame(id_LHU), row.names = FALSE)
##  id_number                   institution
##          1           ULS Arco Ribeirinho
##          2                    ULS Leiria
##          3          ULS Lisboa Ocidental
##          4                  ULS Arrábida
##          5                    ULS Aveiro
##          6                 ULS Médio Ave
##          7                     ULS Oeste
##          8                   ULS Coimbra
##          9       ULS Entre Douro e Vouga
##         10                ULS Médio Tejo
##         11  ULS Póvoa Varzim/ Vila Conde
##         12            ULS Tâmega e Sousa
##         13                ULS Dão-Lafões
##         14 ULS Trás-os-Montes Alto Douro
##         15                ULS Cova Beira
##         16             ULS Santo António
##         17                  ULS São João
##         18                   ULS Algarve
##         19                  ULS São José
##         20               ULS Santa Maria
##         21   ULS Vila Nova Gaia/ Espinho
##         22                  ULS Alto Ave
##         23                     ULS Braga
##         24      Hospital de Cascais, PPP
##         25           ULS Loures-Odivelas
##         26             ULS Estuário Tejo
##         27             ULS Baixo Mondego
##         28                   ULS Lezíria
##         29          ULS Alentejo Central
##         30             ULS Almada-Seixal
##         31           ULS Amadora/ Sintra
##         32       ULS Barcelos/ Esposende
##         33             ULS Alto Alentejo
##         34                    ULS Guarda
##         35                ULS Matosinhos
##         36                ULS Alto Minho
##         37            ULS Baixo Alentejo
##         38        ULS Litoral Alentejano
##         39                  ULS Nordeste
##         40            ULS Castelo Branco