# had to remove this from knitting, it was not working when I tried to knit, couldn't fix it. keeping it here just for reference.
clean_subset <- read_excel("clean_result_29LHU.xlsx")
# this file has 28 LHU - old LHU removed, as well as 24 (PPP) 11, 14, 19 ( > 5% missings))
demo <- read_excel("~/Library/CloudStorage/OneDrive-ensp.unl.pt/ENSP 2024/ULS_paper/LHU_demograph_SES.xlsx")
# this file has confounders - telephone triage, 65+, mpp_weighted.
demo$telephone <- ym(demo$telephone) # "YYYY-MM" recognised
clean_result <- inner_join( clean_subset, demo,
by = c("id_number"))
clean_result$impl <- ifelse(
clean_result$tel != 0 & clean_result$t >= clean_result$tel,
1,0)
# additional gp coverage
# summarise the totals (check script4 for full code)
df_sums <- gp_cov %>%
group_by(id_number, date_ym) %>%
summarise(
utentes_insc = sum(utentes_inscritos, na.rm = TRUE),
utentes_gp = sum(utentes_mdf, na.rm = TRUE),
.groups = "drop")
clean_result <- clean_result %>%
left_join( # left join is better bc inner_join will drop unmatched vars
df_sums,
by = c("id_number", "date_ym")
)
# variables
clean_result <- clean_result %>%
arrange(id_number, t) %>%
group_by(id_number) %>%
mutate(
p_gp_cov = utentes_gp / utentes_insc,
p_gv_cov_ca = utentes_gp / catch_area
) %>%
ungroup()
# variables
clean_result <- clean_result %>%
arrange(id_number, t) %>%
group_by(id_number) %>%
mutate(
lag1 = lag(wbg, 1),
lag2 = lag(wbg, 2),
lag3 = lag(wbg, 3),
lag4 = lag(wbg, 4),
lag5 = lag(wbg, 5)) %>%
ungroup()
# creating leads and lags
clean_result <- clean_result %>%
mutate(
# Leads (anticipatory effects)
int_lead2 = ifelse(t >= 14, 1, 0), # t=14,15 before intervention
int_lead1 = ifelse(t >= 15, 1, 0), # t=15 just before intervention
# Lags (delayed effects)
int_lag1 = ifelse(t >= 17, 1, 0), # t=17 onwards (1 month delay)
int_lag2 = ifelse(t >= 18, 1, 0), # t=18 onwards (2 month delay)
int_lag3 = ifelse(t >= 19, 1, 0)) # t=19 onwards (3 month delay)
# lags for autocorrelation
clean_result$month <- factor((clean_result$t - 1) %% 12 + 1) # different approach to seasonality
clean_result$lag1 <- dplyr::lag(clean_result$wbg, 1)
clean_result$lag2 <- dplyr::lag(clean_result$wbg, 2)
clean_result$lag3 <- dplyr::lag(clean_result$wbg, 3)
clean_result$lag4 <- dplyr::lag(clean_result$wbg, 4)
clean_result$lag5 <- dplyr::lag(clean_result$wbg, 5)
# I was getting convergence errors with the latest model, so Claude suggested rescaling time variables (t and lags)
clean_result$tz <- scale(clean_result$t)
clean_result$lag1_z <- scale(clean_result$lag1)
clean_result$lag2_z <- scale(clean_result$lag2)
clean_result$lag3_z <- scale(clean_result$lag3)
clean_result$lag4_z <- scale(clean_result$lag4)
clean_result$lag5_z <- scale(clean_result$lag5)
write_xlsx(clean_result, "clean_result_full.xlsx")
models
clean_result<- read_excel("clean_result_full.xlsx")
m1 <- glmmTMB(
wbg ~ tz +
int_lead2 + int_lead1 + # anticipatory effects
int + # reform
int_lag1 + int_lag2 + int_lag3 + # delayed effects
trend + # slope
sin12 + cos12 +
mpp_weighted + sixfive + impl + p_gp_cov + # confounders
lag1_z + lag3_z + lag5_z +lag2_z + lag4_z + # lags for autocorrelation
(1 + tz | id_number) + # random slopes
offset(log(catch_area)), #normalisation for catchment area
family = nbinom2,
data = clean_result)
summary(m1)
## Family: nbinom2 ( log )
## Formula:
## wbg ~ tz + int_lead2 + int_lead1 + int + int_lag1 + int_lag2 +
## int_lag3 + trend + sin12 + cos12 + mpp_weighted + sixfive +
## impl + p_gp_cov + lag1_z + lag3_z + lag5_z + lag2_z + lag4_z +
## (1 + tz | id_number) + offset(log(catch_area))
## Data: clean_result
##
## AIC BIC logLik -2*log(L) df.resid
## 12755.8 12868.7 -6353.9 12707.8 793
##
## Random effects:
##
## Conditional model:
## Groups Name Variance Std.Dev. Corr
## id_number (Intercept) 0.052629 0.22941
## tz 0.004496 0.06705 -0.03
## Number of obs: 817, groups: id_number, 28
##
## Dispersion parameter for nbinom2 family (): 94.2
##
## Conditional model:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.854e+00 5.150e-01 -7.483 7.24e-14 ***
## tz -1.199e-01 2.057e-02 -5.830 5.55e-09 ***
## int_lead2 -4.947e-03 2.646e-02 -0.187 0.851657
## int_lead1 5.648e-02 2.861e-02 1.974 0.048364 *
## int -8.454e-02 2.856e-02 -2.960 0.003073 **
## int_lag1 1.230e-01 2.877e-02 4.276 1.90e-05 ***
## int_lag2 1.128e-01 2.877e-02 3.921 8.80e-05 ***
## int_lag3 -3.936e-02 2.716e-02 -1.449 0.147272
## trend 1.025e-03 3.527e-03 0.291 0.771386
## sin12 8.972e-03 8.237e-03 1.089 0.276027
## cos12 2.702e-02 6.412e-03 4.213 2.52e-05 ***
## mpp_weighted -4.550e-03 2.615e-03 -1.740 0.081917 .
## sixfive 1.449e+00 1.454e+00 0.996 0.319216
## impl -1.543e-01 2.092e-02 -7.375 1.64e-13 ***
## p_gp_cov -1.763e-01 1.855e-01 -0.951 0.341747
## lag1_z 7.316e-02 1.299e-02 5.634 1.76e-08 ***
## lag3_z -3.079e-02 1.340e-02 -2.298 0.021565 *
## lag5_z 3.731e-02 1.068e-02 3.494 0.000475 ***
## lag2_z -1.396e-03 1.370e-02 -0.102 0.918812
## lag4_z -5.817e-05 1.308e-02 -0.004 0.996452
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
diagnose(m1)
## Unusually large Z-statistics (|x|>5):
##
## (Intercept) tz impl
## -7.483399 -5.829646 -7.375385
## lag1_z disp~(Intercept) theta_1+tz|id_number.1
## 5.634261 86.444265 -10.415132
## theta_1+tz|id_number.2
## -18.483314
##
## Large Z-statistics (estimate/std err) suggest a *possible* failure of
## the Wald approximation - often also associated with parameters that are
## at or near the edge of their range (e.g. random-effects standard
## deviations approaching 0). (Alternately, they may simply represent
## very well-estimated parameters; intercepts of non-centered models may
## fall in this category.) While the Wald p-values and standard errors
## listed in summary() may be unreliable, profile confidence intervals
## (see ?confint.glmmTMB) and likelihood ratio test p-values derived by
## comparing models (e.g. ?drop1) are probably still OK. (Note that the
## LRT is conservative when the null value is on the boundary, e.g. a
## variance or zero-inflation value of 0 (Self and Liang 1987; Stram and
## Lee 1994; Goldman and Whelan 2000); in simple cases these p-values are
## approximately twice as large as they should be.)
plots
# predicted values from model m1 (latest, above)
clean_result$predicted <- predict(m1, newdata = clean_result, type = "response")
# averages and SEs for predictions
avg_predictions <- aggregate(predicted ~ t, data = clean_result, mean, na.rm = TRUE)
se_predictions <- aggregate(predicted ~ t, data = clean_result,
function(x) sd(x, na.rm = TRUE)/sqrt(length(x)))
avg_with_se <- merge(avg_predictions, se_predictions, by = "t", suffixes = c("_mean", "_se"))
# averages and SEs for observed data
avg_observed <- aggregate(wbg ~ t, data = clean_result, mean, na.rm = TRUE)
se_observed <- aggregate(wbg ~ t, data = clean_result,
function(x) sd(x, na.rm = TRUE)/sqrt(length(x)))
obs_with_se <- merge(avg_observed, se_observed, by = "t", suffixes = c("_mean", "_se"))
# CI with SE
avg_with_se$upper_ci <- avg_with_se$predicted_mean + 1.96 * avg_with_se$predicted_se
avg_with_se$lower_ci <- avg_with_se$predicted_mean - 1.96 * avg_with_se$predicted_se
obs_with_se$upper_ci <- obs_with_se$wbg_mean + 1.96 * obs_with_se$wbg_se
obs_with_se$lower_ci <- obs_with_se$wbg_mean - 1.96 * obs_with_se$wbg_se
# pre / post int
int <- 16
pre_pred <- avg_with_se[avg_with_se$t <= int, ]
post_pred <- avg_with_se[avg_with_se$t > int, ]
pre_obs <- obs_with_se[obs_with_se$t <= int, ]
post_obs <- obs_with_se[obs_with_se$t > int, ]
# ggplot
plot_df <- avg_with_se %>%
rename(pred_mean = predicted_mean,
pred_se = predicted_se,
pred_lower = lower_ci,
pred_upper = upper_ci) %>%
left_join(obs_with_se %>%
rename(obs_mean = wbg_mean,
obs_se = wbg_se,
obs_lower = lower_ci,
obs_upper = upper_ci), by = "t")
ggplot(plot_df, aes(x = t)) +
geom_ribbon(aes(ymin = pred_lower, ymax = pred_upper), fill = "lightgrey", alpha = 0.4) + # predicted CI
geom_ribbon(aes(ymin = obs_lower, ymax = obs_upper), fill = "lightblue", alpha = 0.4) + # obsv CI
geom_line(aes(y = pred_mean), color = "darkgrey", linetype = "dashed", linewidth = 1) + # predicted line
geom_line(aes(y = obs_mean), color = "steelblue", size = 1) + # observed line
geom_vline(xintercept = int, linetype = "dotted", color = "grey", size = 1) +
scale_x_continuous(breaks = seq(min(plot_df$t), max(plot_df$t), by = 2)) +
labs(
title = "Observed vs Predicted Non-Urgent ED Visits",
subtitle = "95% CI",
x = "Month",
y = "ED visits per 100 000 population") +
theme_minimal(base_size = 14) +
theme(
panel.grid.minor = element_blank(),
plot.title = element_text(face = "bold"),
legend.position = "none")
# descriptive tables
summary_table <- clean_result %>%
group_by(int) %>%
summarise(
mean_nonurgent = mean(wbg, na.rm = TRUE),
sd_nonurgent = sd(wbg, na.rm = TRUE),
n = n())
summary_table
## # A tibble: 2 × 4
## int mean_nonurgent sd_nonurgent n
## <dbl> <dbl> <dbl> <int>
## 1 0 5718. 2733. 420
## 2 1 5144. 2582. 420
summary_table_by_id <- clean_result %>%
group_by(int, id_number) %>%
summarise(
mean_nonurgent = mean(wbg, na.rm = TRUE),
sd_nonurgent = sd(wbg, na.rm = TRUE),
n = n())
print(as.data.frame(summary_table_by_id), row.names = FALSE)
## int id_number mean_nonurgent sd_nonurgent n
## 0 1 5901.133 584.0652 15
## 0 2 6154.200 562.1087 15
## 0 3 6041.067 591.0172 15
## 0 4 3340.200 538.0506 15
## 0 5 5417.733 495.1931 15
## 0 6 4282.267 356.2003 15
## 0 7 7083.267 684.1884 15
## 0 8 7322.067 737.1867 15
## 0 9 8007.667 863.6231 15
## 0 10 5563.533 568.0884 15
## 0 12 5970.333 841.5255 15
## 0 13 5134.733 363.9279 15
## 0 15 2008.467 260.8097 15
## 0 16 3368.800 367.8768 15
## 0 17 4007.000 412.3345 15
## 0 18 13780.467 1022.4401 15
## 0 20 6769.067 575.8930 15
## 0 21 5941.467 607.9660 15
## 0 22 6432.133 924.7451 15
## 0 23 7899.333 389.9102 15
## 0 25 4102.400 1090.6784 15
## 0 26 4606.133 672.0864 15
## 0 27 2299.333 310.8632 15
## 0 28 4523.800 640.8488 15
## 0 29 2023.933 368.3223 15
## 0 30 6225.533 674.7899 15
## 0 31 12697.467 1110.6897 15
## 0 32 3208.200 246.3993 15
## 1 1 5404.933 397.8506 15
## 1 2 4682.933 782.4967 15
## 1 3 5435.667 555.9902 15
## 1 4 3281.600 470.7083 15
## 1 5 4410.000 890.8028 15
## 1 6 3652.267 912.9976 15
## 1 7 6263.267 1303.9808 15
## 1 8 6315.533 650.3263 15
## 1 9 4624.467 1393.9224 15
## 1 10 5463.800 574.9205 15
## 1 12 5874.643 1130.6567 15
## 1 13 4715.429 442.0031 15
## 1 15 1901.400 191.1154 15
## 1 16 3218.067 376.4307 15
## 1 17 4038.400 525.3116 15
## 1 18 12890.400 1077.6298 15
## 1 20 6809.933 949.3547 15
## 1 21 4845.400 714.2521 15
## 1 22 6257.333 603.8131 15
## 1 23 7470.600 676.4018 15
## 1 25 2987.400 351.7556 15
## 1 26 4449.143 556.9231 15
## 1 27 1985.200 225.6607 15
## 1 28 3384.267 499.9343 15
## 1 29 3382.733 605.3661 15
## 1 30 5378.400 980.2645 15
## 1 31 12060.200 1132.7360 15
## 1 32 2810.933 452.1715 15
id_LHU<- read_excel("id_number_2024.xlsx")
print(as.data.frame(id_LHU), row.names = FALSE)
## id_number institution
## 1 ULS Arco Ribeirinho
## 2 ULS Leiria
## 3 ULS Lisboa Ocidental
## 4 ULS Arrábida
## 5 ULS Aveiro
## 6 ULS Médio Ave
## 7 ULS Oeste
## 8 ULS Coimbra
## 9 ULS Entre Douro e Vouga
## 10 ULS Médio Tejo
## 11 ULS Póvoa Varzim/ Vila Conde
## 12 ULS Tâmega e Sousa
## 13 ULS Dão-Lafões
## 14 ULS Trás-os-Montes Alto Douro
## 15 ULS Cova Beira
## 16 ULS Santo António
## 17 ULS São João
## 18 ULS Algarve
## 19 ULS São José
## 20 ULS Santa Maria
## 21 ULS Vila Nova Gaia/ Espinho
## 22 ULS Alto Ave
## 23 ULS Braga
## 24 Hospital de Cascais, PPP
## 25 ULS Loures-Odivelas
## 26 ULS Estuário Tejo
## 27 ULS Baixo Mondego
## 28 ULS Lezíria
## 29 ULS Alentejo Central
## 30 ULS Almada-Seixal
## 31 ULS Amadora/ Sintra
## 32 ULS Barcelos/ Esposende
## 33 ULS Alto Alentejo
## 34 ULS Guarda
## 35 ULS Matosinhos
## 36 ULS Alto Minho
## 37 ULS Baixo Alentejo
## 38 ULS Litoral Alentejano
## 39 ULS Nordeste
## 40 ULS Castelo Branco