DATA - setting up database from step 2 script (result)

result <- read_excel("result_after_step2.xlsx")

# this includes ALL LHU and PPP (no missings excluded)

#new data frame with national sums
monthly_sums_national <- result %>%
  group_by(date_ym) %>%
  summarise(
    nat_wbg = sum(wbg, na.rm = TRUE), # na.rm true ensures NA are ignored 
    nat_wb = sum(wb, na.rm = TRUE))

monthly_sums_national$national_population <- 10639726 # resident population 2023 from INE

# normalise per inhabitant (for plots)
monthly_sums_national <- monthly_sums_national %>%
  group_by(date_ym) %>%
  mutate(
    nat_wbg_rate = nat_wbg / national_population * 10^5,
    nat_wb_rate = nat_wb / national_population * 10^5)
    
# variable time t = 1,n
monthly_sums_national <- monthly_sums_national %>%
  ungroup() %>%                       # important step!!
  arrange(date_ym) %>%                     # sort by date
  mutate(t = row_number()) %>%              # assign t starting from 1
  ungroup()

monthly_sums_national <- monthly_sums_national %>%
  mutate(
    year = as.numeric(format(date_ym, "%Y")),
    month = as.numeric(format(date_ym, "%m")))

## Seasonality: Serfling Model (sine cosine) 
#Serfling Model - https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1915276/
monthly_sums_national <- monthly_sums_national %>%
  ungroup() %>%  
  mutate(
    sin12 = sin(2 * pi * t / 12),
    cos12 = cos(2 * pi * t / 12)) 

# intervention and trend 
monthly_sums_national$int <- ifelse(monthly_sums_national$year >= 2024, 1, 0)
monthly_sums_national$trend <- pmax(0, monthly_sums_national$t - 16 + 1)

models - green blue white

# model 1: simple LM 
m_nat_ols <- lm(nat_wbg~offset(log(national_population))+t+int+trend+sin12+cos12, monthly_sums_national)
summary(m_nat_ols)
## 
## Call:
## lm(formula = nat_wbg ~ offset(log(national_population)) + t + 
##     int + trend + sin12 + cos12, data = monthly_sums_national)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -15156  -5959  -1066   7641  15392 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 225788.828   5363.050  42.101  < 2e-16 ***
## t            -2339.260    591.464  -3.955  0.00059 ***
## int          14713.913   7369.259   1.997  0.05732 .  
## trend            3.574    836.456   0.004  0.99663    
## sin12        -3223.142   2598.888  -1.240  0.22689    
## cos12         8526.998   2598.888   3.281  0.00315 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9604 on 24 degrees of freedom
## Multiple R-squared:  0.7655, Adjusted R-squared:  0.7166 
## F-statistic: 15.67 on 5 and 24 DF,  p-value: 6.967e-07
monthly_sums_national$pred.cases_gbw1 <- predict(m_nat_ols, type="response") #predict cases according to the model 
monthly_sums_national$pred_gbw1 <- predict(m_nat_ols, type="response")/(monthly_sums_national$national_population)*10^5

# model 2: different way of doing neg bionomial (minimal AIC diff but v different results)
m_nat_nb2 <- glmmTMB(nat_wbg ~ t + int + trend + sin12 + cos12
                      + offset(log(national_population)),
                 data = monthly_sums_national,
                 family = nbinom2)

monthly_sums_national$pred.cases_gbw21 <- predict(m_nat_nb2, type="response") #predict cases according to the model 
monthly_sums_national$pred_gbw21 <- predict(m_nat_nb2, type="response")/(monthly_sums_national$national_population)*10^5

# no need for lags' corrections for autocorrelation - ACF plots seem okay (I think)
# model 3: negative binomial w/ lead and lags (ll models)
monthly_sums_national <- monthly_sums_national %>%
  mutate(
    int_lead2 = ifelse(t >= 14, 1, 0),  # t=14,15 before intervention
    int_lead1 = ifelse(t >= 15, 1, 0),  # t=15 just before intervention
    int_lag1 = ifelse(t >= 17, 1, 0),   # t=17 onwards (1 month delay)
    int_lag2 = ifelse(t >= 18, 1, 0),   # t=18 onwards (2 month delay)
    int_lag3 = ifelse(t >= 19, 1, 0))   # t=19 onwards (3 month delay)
 
m_nat_nbll <- glmmTMB(nat_wbg ~ t + int + trend + sin12 + cos12 
                      + int_lead2 + int_lead1 + int_lag1 + int_lag2 + int_lag3
                      + offset(log(national_population)),
                 data = monthly_sums_national,
                 family = nbinom2)

monthly_sums_national$pred.cases_gbw3 <- predict(m_nat_nbll, type="response") #predict cases according to the model 
monthly_sums_national$pred_gbw3 <- predict(m_nat_nbll, type="response")/(monthly_sums_national$national_population)*10^5

summaries and ACF plots

summary(m_nat_ols)
## 
## Call:
## lm(formula = nat_wbg ~ offset(log(national_population)) + t + 
##     int + trend + sin12 + cos12, data = monthly_sums_national)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -15156  -5959  -1066   7641  15392 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 225788.828   5363.050  42.101  < 2e-16 ***
## t            -2339.260    591.464  -3.955  0.00059 ***
## int          14713.913   7369.259   1.997  0.05732 .  
## trend            3.574    836.456   0.004  0.99663    
## sin12        -3223.142   2598.888  -1.240  0.22689    
## cos12         8526.998   2598.888   3.281  0.00315 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9604 on 24 degrees of freedom
## Multiple R-squared:  0.7655, Adjusted R-squared:  0.7166 
## F-statistic: 15.67 on 5 and 24 DF,  p-value: 6.967e-07
acf(residuals(m_nat_ols), main = "OLS Model - ACF of Residuals")

summary(m_nat_nb2)
##  Family: nbinom2  ( log )
## Formula:          
## nat_wbg ~ t + int + trend + sin12 + cos12 + offset(log(national_population))
## Data: monthly_sums_national
## 
##       AIC       BIC    logLik -2*log(L)  df.resid 
##     642.9     652.7    -314.4     628.9        23 
## 
## 
## Dispersion parameter for nbinom2 family ():  515 
## 
## Conditional model:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -3.850694   0.024485 -157.27  < 2e-16 ***
## t           -0.011207   0.002694   -4.16 3.19e-05 ***
## int          0.079708   0.033975    2.35 0.018972 *  
## trend       -0.002027   0.003853   -0.53 0.598755    
## sin12       -0.016989   0.011939   -1.42 0.154732    
## cos12        0.044919   0.011966    3.75 0.000174 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
acf(residuals(m_nat_nb2, type = "pearson"), main = "Negative Binomial - ACF of Residuals")

summary(m_nat_nbll)
##  Family: nbinom2  ( log )
## Formula:          
## nat_wbg ~ t + int + trend + sin12 + cos12 + int_lead2 + int_lead1 +  
##     int_lag1 + int_lag2 + int_lag3 + offset(log(national_population))
## Data: monthly_sums_national
## 
##       AIC       BIC    logLik -2*log(L)  df.resid 
##     641.5     658.3    -308.7     617.5        18 
## 
## 
## Dispersion parameter for nbinom2 family ():  754 
## 
## Conditional model:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -3.850843   0.025158 -153.07  < 2e-16 ***
## t           -0.011800   0.003290   -3.59 0.000335 ***
## int         -0.057226   0.051999   -1.10 0.271102    
## trend       -0.008204   0.005960   -1.38 0.168661    
## sin12       -0.002726   0.014745   -0.18 0.853315    
## cos12        0.040632   0.011135    3.65 0.000263 ***
## int_lead2   -0.009236   0.048290   -0.19 0.848318    
## int_lead1    0.062488   0.052288    1.20 0.232057    
## int_lag1     0.068794   0.052237    1.32 0.187858    
## int_lag2     0.079803   0.052546    1.52 0.128828    
## int_lag3     0.013770   0.048775    0.28 0.777706    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
acf(residuals(m_nat_nbll, type = "pearson"), main = "NB with A/D effects - ACF of Residuals")

# MODEL 1 OLS
plot(monthly_sums_national$t, monthly_sums_national$nat_wbg_rate, # green blue white normalised per population
     ylab = "ED visits per 100000 residents",
     ylim = c(500, 3000),
     xlab = "months",
     col = "steelblue",
     xaxt = "n",
     cex.lab = 0.8, cex.axis = 0.8, cex.main = 0.9)
ticks <- seq(2, max(monthly_sums_national$t), by = 1)
axis(1, at = ticks, labels = ticks, cex.axis = 0.7)
abline(v = 16, lty = 2)
lines(monthly_sums_national$t[1:16],monthly_sums_national$pred_gbw21[1:16],col="steelblue")
lines(monthly_sums_national$t[16:30],monthly_sums_national$pred_gbw21[16:30],col="steelblue")

# trend line if no intervention had occurred
newdata <- monthly_sums_national
newdata$int <- newdata$trend <- 0  #assumes no intervention
newdata$pred.cases_gbw21 <- predict(m_nat_ols,newdata, type="response")/(newdata$national_population)*10^5
lines(newdata$t[16:30],newdata$pred.cases_gbw21[16:30],col="grey",lty=2)

# MODEL 2 NB
plot(monthly_sums_national$t, monthly_sums_national$nat_wbg_rate, # green blue white normalised per population
     ylab = "ED visits per 100000 residents",
     ylim = c(0, 3000),
     xlab = "months",
     col = "steelblue",
     xaxt = "n",
     cex.lab = 0.8, cex.axis = 0.8, cex.main = 0.9)
ticks <- seq(2, max(monthly_sums_national$t), by = 1)
axis(1, at = ticks, labels = ticks, cex.axis = 0.7)
abline(v = 16, lty = 2)
lines(monthly_sums_national$t[1:16],monthly_sums_national$pred_gbw21[1:16],col="steelblue")
lines(monthly_sums_national$t[16:30],monthly_sums_national$pred_gbw21[16:30],col="steelblue")

# trend line if no intervention had occurred
newdata <- monthly_sums_national
newdata$int <- newdata$trend <- 0  #assumes no intervention
newdata$pred.cases_gbw21 <- predict(m_nat_nb2,newdata, type="response")/(newdata$national_population)*10^5
lines(newdata$t[16:30],newdata$pred.cases_gbw21[16:30],col="grey",lty=2)

# MODEL 3 NB LagsLeads
plot(monthly_sums_national$t, monthly_sums_national$nat_wbg_rate, # green blue white normalised per population
     ylab = "ED visits per 100000 residents",
     ylim = c(0, 3000),
     xlab = "months",
     col = "steelblue",
     xaxt = "n",
     cex.lab = 0.8, cex.axis = 0.8, cex.main = 0.9)
ticks <- seq(2, max(monthly_sums_national$t), by = 1)
axis(1, at = ticks, labels = ticks, cex.axis = 0.7)
abline(v = 16, lty = 2)
lines(monthly_sums_national$t[1:16],monthly_sums_national$pred_gbw3[1:16],col="steelblue")
lines(monthly_sums_national$t[16:30],monthly_sums_national$pred_gbw3[16:30],col="steelblue")

# trend line if no intervention had occurred
newdata <- monthly_sums_national
newdata$int <- newdata$trend <- 0  #assumes no intervention
newdata$pred.cases_gbw3 <- predict(m_nat_nbll,newdata, type="response")/(newdata$national_population)*10^5
lines(newdata$t[16:30],newdata$pred.cases_gbw3[16:30],col="grey",lty=2)