DATA - setting up database from step 2 script (result)

result <- read_excel("result_after_step2.xlsx")
result$date_ym <- as.Date(result$date_ym)

# filter out september (NA because of delta)
result <- result[!(format(result$date_ym, "%Y-%m-%d") == "2022-09-01"),]

#new data frame with national sums
monthly_sums_national <- result %>%
  group_by(date_ym) %>%
  summarise(
    monthly_total_blue = sum(m_n_blue, na.rm = TRUE),
    monthly_total_green = sum(m_n_green, na.rm = TRUE),
    monthly_total_yellow = sum(m_n_yellow, na.rm = TRUE),
    monthly_total_white = sum(m_n_white, na.rm = TRUE),
    monthly_total_red = sum(m_n_red, na.rm = TRUE),
    monthly_total_orange = sum(m_n_orange, na.rm = TRUE),
    monthly_total_notriage = sum(m_n_nt, na.rm = TRUE),
    monthly_national = sum(m_n_blue + m_n_green + m_n_yellow + 
                             m_n_white + m_n_red + m_n_orange + 
                             m_n_nt, na.rm = TRUE),
    MNU_gb = sum(m_n_blue + m_n_green, na.rm = TRUE),
    MNU_gbw = sum(m_n_blue + m_n_green + m_n_white, na.rm = TRUE),
    MNU_bw = sum(m_n_blue + m_n_white, na.rm = TRUE)
  )

monthly_sums_national$national_population <- 10639726 # resident population 2023 from INE

# normalise per inhabitant (for plots)
monthly_sums_national <- monthly_sums_national %>%
  group_by(date_ym) %>%
  mutate(
    ALL_national = monthly_national/national_population*10^5,
    MNUGr_nat = MNU_gbw/national_population*10^5,
    MNU_nat = MNU_bw/national_population*10^5,
    MNU_gb_nat = MNU_gb/national_population*10^5)

# variable time t = 1,n
monthly_sums_national <- monthly_sums_national %>%
  ungroup() %>%                       # important step!!
  arrange(date_ym) %>%                     # sort by date
  mutate(t = row_number()) %>%              # assign t starting from 1
  ungroup()

monthly_sums_national <- monthly_sums_national %>%
  mutate(
    year = as.numeric(format(date_ym, "%Y")),
    month = as.numeric(format(date_ym, "%m")))

## Seasonality: Serfling Model (sine cosine) 
#Serfling Model - https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1915276/
monthly_sums_national <- monthly_sums_national %>%
  ungroup() %>%  
  mutate(
    sin12 = sin(2 * pi * t / 12),
    cos12 = cos(2 * pi * t / 12)) 

# intervention and trend 
monthly_sums_national$int <- ifelse(monthly_sums_national$year >= 2024, 1, 0)
monthly_sums_national$trend <- pmax(0, monthly_sums_national$t - 16 + 1)

models - green blue white

# model 1: simple LM 
m_nat_ols <- lm(MNU_gbw~offset(log(national_population))+t+int+trend+sin12+cos12, monthly_sums_national)
summary(m_nat_ols)
## 
## Call:
## lm(formula = MNU_gbw ~ offset(log(national_population)) + t + 
##     int + trend + sin12 + cos12, data = monthly_sums_national)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -15156  -5959  -1066   7641  15392 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 225788.828   5363.050  42.101  < 2e-16 ***
## t            -2339.260    591.464  -3.955  0.00059 ***
## int          14713.913   7369.259   1.997  0.05732 .  
## trend            3.574    836.456   0.004  0.99663    
## sin12        -3223.142   2598.888  -1.240  0.22689    
## cos12         8526.998   2598.888   3.281  0.00315 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9604 on 24 degrees of freedom
## Multiple R-squared:  0.7655, Adjusted R-squared:  0.7166 
## F-statistic: 15.67 on 5 and 24 DF,  p-value: 6.967e-07
monthly_sums_national$pred.cases_gbw1 <- predict(m_nat_ols, type="response") #predict cases according to the model 
monthly_sums_national$pred_gbw1 <- predict(m_nat_ols, type="response")/(monthly_sums_national$national_population)*10^5

# model 2: different way of doing neg bionomial (minimal AIC diff but v different results)
m_nat_nb2 <- glmmTMB(MNU_gbw ~ t + int + trend + sin12 + cos12
                      + offset(log(national_population)),
                 data = monthly_sums_national,
                 family = nbinom2)

monthly_sums_national$pred.cases_gbw21 <- predict(m_nat_nb2, type="response") #predict cases according to the model 
monthly_sums_national$pred_gbw21 <- predict(m_nat_nb2, type="response")/(monthly_sums_national$national_population)*10^5

# no need for lags' corrections for autocorrelation - ACF plots seem okay (I think)
# model 3: negative binomial w/ lead and lags (ll models)
monthly_sums_national <- monthly_sums_national %>%
  mutate(
    # Leads (anticipatory effects)
    int_lead2 = ifelse(t >= 14, 1, 0),  # t=14,15 before intervention
    int_lead1 = ifelse(t >= 15, 1, 0),  # t=15 just before intervention
    # Main intervention
    int = ifelse(t >= 16, 1, 0),        # t=16 onwards
    # Lags (delayed effects)
    int_lag1 = ifelse(t >= 17, 1, 0),   # t=17 onwards (1 month delay)
    int_lag2 = ifelse(t >= 18, 1, 0),   # t=18 onwards (2 month delay)
    int_lag3 = ifelse(t >= 19, 1, 0),   # t=19 onwards (3 month delay)
    # Post-intervention trend
    trend = ifelse(t >= 16, t - 16 + 1, 0))  # Counts months since intervention
 
m_nat_nbll <- glmmTMB(MNU_gbw ~ t + int + trend + sin12 + cos12 
                      + int_lead2 + int_lead1 + int_lag1 + int_lag2 + int_lag3
                      + offset(log(national_population)),
                 data = monthly_sums_national,
                 family = nbinom2)

monthly_sums_national$pred.cases_gbw3 <- predict(m_nat_nbll, type="response") #predict cases according to the model 
monthly_sums_national$pred_gbw3 <- predict(m_nat_nbll, type="response")/(monthly_sums_national$national_population)*10^5

summaries and ACF plots

summary(m_nat_ols)
## 
## Call:
## lm(formula = MNU_gbw ~ offset(log(national_population)) + t + 
##     int + trend + sin12 + cos12, data = monthly_sums_national)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -15156  -5959  -1066   7641  15392 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 225788.828   5363.050  42.101  < 2e-16 ***
## t            -2339.260    591.464  -3.955  0.00059 ***
## int          14713.913   7369.259   1.997  0.05732 .  
## trend            3.574    836.456   0.004  0.99663    
## sin12        -3223.142   2598.888  -1.240  0.22689    
## cos12         8526.998   2598.888   3.281  0.00315 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9604 on 24 degrees of freedom
## Multiple R-squared:  0.7655, Adjusted R-squared:  0.7166 
## F-statistic: 15.67 on 5 and 24 DF,  p-value: 6.967e-07
acf(residuals(m_nat_ols), main = "OLS Model - ACF of Residuals")

summary(m_nat_nb2)
##  Family: nbinom2  ( log )
## Formula:          
## MNU_gbw ~ t + int + trend + sin12 + cos12 + offset(log(national_population))
## Data: monthly_sums_national
## 
##       AIC       BIC    logLik -2*log(L)  df.resid 
##     642.9     652.7    -314.4     628.9        23 
## 
## 
## Dispersion parameter for nbinom2 family ():  515 
## 
## Conditional model:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -3.850694   0.024485 -157.27  < 2e-16 ***
## t           -0.011207   0.002694   -4.16 3.19e-05 ***
## int          0.079708   0.033975    2.35 0.018972 *  
## trend       -0.002027   0.003853   -0.53 0.598755    
## sin12       -0.016989   0.011939   -1.42 0.154732    
## cos12        0.044919   0.011966    3.75 0.000174 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
acf(residuals(m_nat_nb2, type = "pearson"), main = "Negative Binomial - ACF of Residuals")

summary(m_nat_nbll)
##  Family: nbinom2  ( log )
## Formula:          
## MNU_gbw ~ t + int + trend + sin12 + cos12 + int_lead2 + int_lead1 +  
##     int_lag1 + int_lag2 + int_lag3 + offset(log(national_population))
## Data: monthly_sums_national
## 
##       AIC       BIC    logLik -2*log(L)  df.resid 
##     641.5     658.3    -308.7     617.5        18 
## 
## 
## Dispersion parameter for nbinom2 family ():  754 
## 
## Conditional model:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -3.850843   0.025158 -153.07  < 2e-16 ***
## t           -0.011800   0.003290   -3.59 0.000335 ***
## int         -0.057226   0.051999   -1.10 0.271102    
## trend       -0.008204   0.005960   -1.38 0.168661    
## sin12       -0.002726   0.014745   -0.18 0.853315    
## cos12        0.040632   0.011135    3.65 0.000263 ***
## int_lead2   -0.009236   0.048290   -0.19 0.848318    
## int_lead1    0.062488   0.052288    1.20 0.232057    
## int_lag1     0.068794   0.052237    1.32 0.187858    
## int_lag2     0.079803   0.052546    1.52 0.128828    
## int_lag3     0.013770   0.048775    0.28 0.777706    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
acf(residuals(m_nat_nbll, type = "pearson"), main = "NB with A/D effects - ACF of Residuals")

# MODEL 1 OLS
plot(monthly_sums_national$t, monthly_sums_national$MNUGr_nat, # green blue white normalised per population
     ylab = "ED visits per 100000 residents",
     ylim = c(500, 3000),
     xlab = "months",
     col = "steelblue",
     xaxt = "n",
     cex.lab = 0.8, cex.axis = 0.8, cex.main = 0.9)
ticks <- seq(1, max(monthly_sums_national$t), by = 12)
axis(1, at = ticks, labels = ticks, cex.axis = 0.7)
abline(v = 16, lty = 2)
lines(monthly_sums_national$t[1:16],monthly_sums_national$pred_gbw21[1:16],col="steelblue")
lines(monthly_sums_national$t[16:30],monthly_sums_national$pred_gbw21[16:30],col="steelblue")

# trend line if no intervention had occurred
newdata <- monthly_sums_national
newdata$int <- newdata$trend <- 0  #assumes no intervention
newdata$pred.cases_gbw21 <- predict(m_nat_ols,newdata, type="response")/(newdata$national_population)*10^5
lines(newdata$t[16:30],newdata$pred.cases_gbw21[16:30],col="grey",lty=2)

# MODEL 2 NB
plot(monthly_sums_national$t, monthly_sums_national$MNUGr_nat, # green blue white normalised per population
     ylab = "ED visits per 100000 residents",
     ylim = c(0, 3000),
     xlab = "months",
     col = "steelblue",
     xaxt = "n",
     cex.lab = 0.8, cex.axis = 0.8, cex.main = 0.9)
ticks <- seq(1, max(monthly_sums_national$t), by = 5)
axis(1, at = ticks, labels = ticks, cex.axis = 0.7)
abline(v = 16, lty = 2)
lines(monthly_sums_national$t[1:16],monthly_sums_national$pred_gbw21[1:16],col="steelblue")
lines(monthly_sums_national$t[16:30],monthly_sums_national$pred_gbw21[16:30],col="steelblue")

# trend line if no intervention had occurred
newdata <- monthly_sums_national
newdata$int <- newdata$trend <- 0  #assumes no intervention
newdata$pred.cases_gbw21 <- predict(m_nat_nb2,newdata, type="response")/(newdata$national_population)*10^5
lines(newdata$t[16:30],newdata$pred.cases_gbw21[16:30],col="grey",lty=2)

# MODEL 3 NB LagsLeads
plot(monthly_sums_national$t, monthly_sums_national$MNUGr_nat, # green blue white normalised per population
     ylab = "ED visits per 100000 residents",
     ylim = c(0, 3000),
     xlab = "months",
     col = "steelblue",
     xaxt = "n",
     cex.lab = 0.8, cex.axis = 0.8, cex.main = 0.9)
ticks <- seq(1, max(monthly_sums_national$t), by = 3)
axis(1, at = ticks, labels = ticks, cex.axis = 0.7)
abline(v = 16, lty = 2)
lines(monthly_sums_national$t[1:16],monthly_sums_national$pred_gbw3[1:16],col="steelblue")
lines(monthly_sums_national$t[16:30],monthly_sums_national$pred_gbw3[16:30],col="steelblue")

# trend line if no intervention had occurred
newdata <- monthly_sums_national
newdata$int <- newdata$trend <- 0  #assumes no intervention
newdata$pred.cases_gbw3 <- predict(m_nat_nbll,newdata, type="response")/(newdata$national_population)*10^5
lines(newdata$t[16:30],newdata$pred.cases_gbw3[16:30],col="grey",lty=2)