library(tidyverse)
## Warning: package 'tibble' was built under R version 3.6.1
## Warning: package 'dplyr' was built under R version 3.6.1
library(here)
library(odbc)
library(DBI)
library(lubridate)
library(ggbeeswarm)
library(DT)
library(broom)
library(caret)
library(kableExtra)
library(scales)
knitr::opts_chunk$set(dev = "png",
                      cache = TRUE)
# 1) set up database connections and import functions: -----------
source(here::here("src", 
                  "setup-denodo_function.R"))
source(here::here("src", 
                  "ed-visits-denodo_function.R"))


setup_denodo()

Data

# 2) pull ed data: -----------
df1.ed_visits <- extract_ed_visits("20170101",  # todo: earlier start? 
                                   "20190617")

df2.ed_visits_cleaned <- 
  df1.ed_visits %>% 
  mutate(date = ymd(date_id), 
         weekday = weekdays(date), 
         month = month(date), 
         year = year(date), 
         years_from_2017 = year - 2017) %>% 
  
  # fiddle with factors: 
  mutate(weekday = fct_relevel(weekday, 
                               levels = c("Monday", 
                                          "Tuesday", 
                                          "Wednesday", 
                                          "Thursday", 
                                          "Friday",
                                          "Saturday", 
                                          "Sunday")), 
         # years_from_2017 = as.factor(years_from_2017), 
         year = as.factor(year), 
         month = as.factor(month)) %>% 
  
  rename(ed_visits = value) %>% 
  
  mutate(lag_ed_visits = lag(ed_visits)) %>% 
  
  select(date, 
         years_from_2017, 
         month, 
         year, 
         weekday, 
         ed_visits, 
         lag_ed_visits)
## Warning: Outer names are only allowed for unnamed scalar atomic inputs
str(df2.ed_visits_cleaned)
## Classes 'tbl_df', 'tbl' and 'data.frame':    898 obs. of  7 variables:
##  $ date           : Date, format: "2017-01-01" "2017-01-02" ...
##  $ years_from_2017: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ month          : Factor w/ 12 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ year           : Factor w/ 3 levels "2017","2018",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ weekday        : Factor w/ 7 levels "Monday","Tuesday",..: 7 1 2 3 4 5 6 7 1 2 ...
##  $ ed_visits      : int  188 194 192 189 164 187 181 166 175 186 ...
##  $ lag_ed_visits  : int  NA 188 194 192 189 164 187 181 166 175 ...
df2.ed_visits_cleaned %>% datatable()
# mean and sd: 
df3.mean_and_sd <- 
  df2.ed_visits_cleaned %>% 
  group_by(year, 
           weekday) %>% 
  summarise(mean_visits = mean(ed_visits), 
            sd_visits = sd(ed_visits))

df3.mean_and_sd %>% 
  datatable() %>% 
  formatRound(2:4, 2)

     

Exploratory plots

# 3) plots: ------------
# time series 
df2.ed_visits_cleaned %>% 
  ggplot(aes(x = date, 
             y = ed_visits)) + 
  geom_line() + 
  geom_smooth() + 
  theme_light() +
  theme(panel.grid.minor = element_line(colour = "grey95"), 
      panel.grid.major = element_line(colour = "grey95"))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# facet by year
df2.ed_visits_cleaned %>% 
  ggplot(aes(x = weekday, 
             y = ed_visits)) + 
  geom_beeswarm(alpha = .4) + 
  facet_wrap(~year) + 
  theme_light() +
  theme(panel.grid.minor = element_line(colour = "grey95"), 
        panel.grid.major = element_line(colour = "grey95"), 
        axis.text.x = element_text(angle = 45, 
                                   hjust = 1))

df2.ed_visits_cleaned %>% 
  ggplot(aes(x = weekday, 
             y = ed_visits)) + 
  geom_boxplot() + 
  facet_wrap(~year) + 
  theme_light() +
  theme(panel.grid.minor = element_line(colour = "grey95"), 
        panel.grid.major = element_line(colour = "grey95"),
        axis.text.x = element_text(angle = 45, 
                                   hjust = 1))

# facet by weekday
df2.ed_visits_cleaned %>% 
  ggplot(aes(x = year, 
             y = ed_visits)) + 
  geom_beeswarm() + 
  facet_wrap(~weekday) + 
  theme_light() +
  theme(panel.grid.minor = element_line(colour = "grey95"), 
        panel.grid.major = element_line(colour = "grey95"),
        axis.text.x = element_text(angle = 45, 
                                   hjust = 1))

df2.ed_visits_cleaned %>% 
  ggplot(aes(x = year, 
             y = ed_visits)) + 
  geom_boxplot() + 
  facet_wrap(~weekday) + 
  theme_light() +
  theme(panel.grid.minor = element_line(colour = "grey95"), 
        panel.grid.major = element_line(colour = "grey95"),
        axis.text.x = element_text(angle = 45, 
                                   hjust = 1))

# density by year: 
df2.ed_visits_cleaned %>% 
  ggplot(aes(x = ed_visits)) + 
  geom_density() + 
  facet_wrap(~year) + 
  theme_light() +
  theme(panel.grid.minor = element_line(colour = "grey95"), 
        panel.grid.major = element_line(colour = "grey95"),
        axis.text.x = element_text(angle = 45, 
                                   hjust = 1))

# checking normality: 
for (i in 0:2) {
x <- df2.ed_visits_cleaned %>% 
  filter(years_from_2017 == i) %>% 
  pull(ed_visits) 

qqnorm(x, main = paste("years from 2017 = ", i))
qqline(x, col = "red")
}

# 3.1) fitting normal distributions --------

Fitting normal dist to 2017 data

fit2017 <- 
  df2.ed_visits_cleaned %>% 
  filter(year == "2017") %>% 
  pull(ed_visits) %>% 
  fitdistrplus::fitdist("norm")
  
# str(fit2017)
summary(fit2017)
## Fitting of the distribution ' norm ' by maximum likelihood 
## Parameters : 
##       estimate Std. Error
## mean 175.67397  0.8113670
## sd    15.50114  0.5737231
## Loglikelihood:  -1518.346   AIC:  3040.692   BIC:  3048.492 
## Correlation matrix:
##               mean            sd
## mean  1.000000e+00 -2.646061e-08
## sd   -2.646061e-08  1.000000e+00
plot(fit2017)

Fitting normal dist to 2018 data

fit2018 <- 
  df2.ed_visits_cleaned %>% 
  filter(year == "2018") %>% 
  pull(ed_visits) %>% 
  fitdistrplus::fitdist("norm")

# str(fit2018)
summary(fit2018)
## Fitting of the distribution ' norm ' by maximum likelihood 
## Parameters : 
##       estimate Std. Error
## mean 178.55068  0.8151148
## sd    15.57275  0.5763732
## Loglikelihood:  -1520.028   AIC:  3044.056   BIC:  3051.856 
## Correlation matrix:
##      mean sd
## mean    1  0
## sd      0  1
plot(fit2018)

# variation by month (4 data points per cell): 
df2.ed_visits_cleaned %>% 
  filter(weekday == "Monday", 
         year %in% c("2018")) %>% 
  ggplot(aes(x = weekday, 
             y = ed_visits)) + 
  geom_beeswarm() + 
  facet_wrap(~month) + 
  labs(title = "2018") + 
  theme_light() +
  theme(panel.grid.minor = element_line(colour = "grey95"), 
        panel.grid.major = element_line(colour = "grey95"),
        axis.text.x = element_text(angle = 45, 
                                   hjust = 1))

# simple average by day of week
df2.ed_visits_cleaned %>% 
  filter(year == "2018") %>% 
  group_by(weekday) %>% 
  summarise(mean_visits = mean(ed_visits)) %>% 
  
  ggplot(aes(x = weekday, 
             y = mean_visits ,
             group = weekday)) + 
  geom_point(size = 5, 
             col = "dodgerblue4") + 
  labs(title = "2018") + 
  theme_light() +
  theme(panel.grid.minor = element_line(colour = "grey95"), 
      panel.grid.major = element_line(colour = "grey95"))

# simple average by month
df2.ed_visits_cleaned %>% 
  filter(year == "2018") %>% 
  group_by(month) %>% 
  summarise(mean_visits = mean(ed_visits)) %>% 
  
  ggplot(aes(x = month, 
             y = mean_visits ,
             group = month)) + 
  geom_point(size = 5, 
             col = "dodgerblue4") + 
  labs(title = "2018") + 
  theme_light() +
  theme(panel.grid.minor = element_line(colour = "grey95"), 
        panel.grid.major = element_line(colour = "grey95"))

# "Seasonality" plot 

# set 
x <- seq(0, 1, length.out = 7)
cols <- seq_gradient_pal(low = "blue", 
                         high = "red")(x)

# show_col(cols)

p <- df2.ed_visits_cleaned %>% 
  filter(year == "2018") %>% 
  group_by(month, 
           weekday) %>% 
  summarise(mean_visits = mean(ed_visits)) %>% 
  ggplot(aes(x = month, 
             y = mean_visits, 
             group = weekday)) +
  geom_line(aes(col = weekday)) + 
  labs(title = "2018") + 
  scale_y_continuous(limits = c(0, 250), 
                     expand = c(0, 0)) + 
  
  scale_color_manual(values = cols) + 
  
  theme_light() +
  labs(title = "2018") + 
  theme(panel.grid.minor = element_line(colour = "grey95"), 
      panel.grid.major = element_line(colour = "grey95")); p 

# ggplotly(p)


# avg ED visits by weekday AND month
# this is the type of plot that I am arguing against - it's all noise

df2.ed_visits_cleaned %>% 
  filter(year == "2018") %>% 
  group_by(month, 
           weekday) %>% 
  summarise(mean_visits = mean(ed_visits)) %>% 
  
  ggplot(aes(x = weekday, 
             y = mean_visits)) + 
  geom_col(fill = "dodgerblue4") + 
  facet_wrap(~month) + 
  
  labs(title = "Is there really any point in looking at graphs like this?") + 
  
  theme_light() +
  theme(panel.grid.minor = element_line(colour = "grey95"), 
      panel.grid.major = element_line(colour = "grey95"), 
      axis.text.x = element_text(angle = 45, 
                                 hjust = 1))

# 4) regression model 1: ----

Regression models

With interaction between month and weekday

set.seed(121)
v1_train_index <- createDataPartition(df2.ed_visits_cleaned$ed_visits, 
                                      p = 0.8, 
                                      list = FALSE)

m1 <- lm(ed_visits ~ years_from_2017 + weekday + month + lag_ed_visits + weekday:month, 
         data = df2.ed_visits_cleaned[v1_train_index, ])

summary(m1)
## 
## Call:
## lm(formula = ed_visits ~ years_from_2017 + weekday + month + 
##     lag_ed_visits + weekday:month, data = df2.ed_visits_cleaned[v1_train_index, 
##     ])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -34.813  -8.957  -0.669   8.713  45.814 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              154.29831    8.42202  18.321  < 2e-16 ***
## years_from_2017            3.86729    0.77749   4.974 8.45e-07 ***
## weekdayTuesday           -13.91675    5.55343  -2.506   0.0125 *  
## weekdayWednesday          -8.97043    6.06841  -1.478   0.1398    
## weekdayThursday          -14.08910    5.77753  -2.439   0.0150 *  
## weekdayFriday             -6.49409    6.04687  -1.074   0.2832    
## weekdaySaturday            0.29425    6.42933   0.046   0.9635    
## weekdaySunday              4.35935    6.02763   0.723   0.4698    
## month2                     6.10651    6.40260   0.954   0.3406    
## month3                    -6.20713    6.19185  -1.002   0.3165    
## month4                     0.49503    6.02592   0.082   0.9346    
## month5                    -3.54823    6.20717  -0.572   0.5678    
## month6                     9.21882    6.19283   1.489   0.1371    
## month7                    -0.91198    6.41855  -0.142   0.8871    
## month8                    -8.94113    7.01207  -1.275   0.2027    
## month9                    -4.87975    6.41281  -0.761   0.4470    
## month10                   -0.73576    6.68543  -0.110   0.9124    
## month11                   -7.36432    7.44091  -0.990   0.3227    
## month12                   -2.12975    6.41281  -0.332   0.7399    
## lag_ed_visits              0.16145    0.03898   4.142 3.91e-05 ***
## weekdayTuesday:month2     -8.45395    8.48089  -0.997   0.3192    
## weekdayWednesday:month2  -11.88124    9.00409  -1.320   0.1875    
## weekdayThursday:month2    -7.46883    8.70939  -0.858   0.3915    
## weekdayFriday:month2       1.39762    8.88457   0.157   0.8751    
## weekdaySaturday:month2    -8.28994    9.82495  -0.844   0.3991    
## weekdaySunday:month2      -7.84377    9.00265  -0.871   0.3839    
## weekdayTuesday:month3      4.10907    8.89019   0.462   0.6441    
## weekdayWednesday:month3    4.94635    8.63566   0.573   0.5670    
## weekdayThursday:month3     9.49706    8.36926   1.135   0.2569    
## weekdayFriday:month3       9.47538    8.55196   1.108   0.2683    
## weekdaySaturday:month3     0.73155    9.00371   0.081   0.9353    
## weekdaySunday:month3       3.24521    8.63654   0.376   0.7072    
## weekdayTuesday:month4     -0.02075    8.19964  -0.003   0.9980    
## weekdayWednesday:month4    0.39170    8.61876   0.045   0.9638    
## weekdayThursday:month4    -3.75115    8.55926  -0.438   0.6613    
## weekdayFriday:month4     -11.08138    8.51455  -1.301   0.1936    
## weekdaySaturday:month4   -12.08204    8.78996  -1.375   0.1698    
## weekdaySunday:month4      -5.34330    8.51584  -0.627   0.5306    
## weekdayTuesday:month5      9.15091    8.24166   1.110   0.2673    
## weekdayWednesday:month5   -2.55882    8.63942  -0.296   0.7672    
## weekdayThursday:month5    11.01220    8.57626   1.284   0.1996    
## weekdayFriday:month5      -1.66005    8.55726  -0.194   0.8462    
## weekdaySaturday:month5    -3.86797    9.48196  -0.408   0.6835    
## weekdaySunday:month5      10.08381    8.87612   1.136   0.2564    
## weekdayTuesday:month6     -1.93767    8.55250  -0.227   0.8208    
## weekdayWednesday:month6   -9.37933    9.43759  -0.994   0.3207    
## weekdayThursday:month6    -5.01131    8.55625  -0.586   0.5583    
## weekdayFriday:month6      -4.84159    8.55450  -0.566   0.5716    
## weekdaySaturday:month6   -15.53082    9.00417  -1.725   0.0850 .  
## weekdaySunday:month6      -5.27018    9.00619  -0.585   0.5586    
## weekdayTuesday:month7      5.85680    9.03788   0.648   0.5172    
## weekdayWednesday:month7    5.26234    9.33891   0.563   0.5733    
## weekdayThursday:month7     7.88056    9.40324   0.838   0.4023    
## weekdayFriday:month7       6.61324    9.56984   0.691   0.4898    
## weekdaySaturday:month7    -3.53939    9.41553  -0.376   0.7071    
## weekdaySunday:month7      -3.78904    8.88587  -0.426   0.6700    
## weekdayTuesday:month8     14.75793    9.46722   1.559   0.1195    
## weekdayWednesday:month8    5.59955    9.31913   0.601   0.5481    
## weekdayThursday:month8     8.71024    9.40954   0.926   0.3550    
## weekdayFriday:month8       8.97914    9.44174   0.951   0.3420    
## weekdaySaturday:month8    -2.70443   10.51477  -0.257   0.7971    
## weekdaySunday:month8      11.06397    9.76366   1.133   0.2576    
## weekdayTuesday:month9      7.68724    9.03920   0.850   0.3954    
## weekdayWednesday:month9    5.67244    9.33385   0.608   0.5436    
## weekdayThursday:month9    18.03731    9.73479   1.853   0.0644 .  
## weekdayFriday:month9       6.24955    9.14893   0.683   0.4948    
## weekdaySaturday:month9   -13.05150    9.81526  -1.330   0.1841    
## weekdaySunday:month9      11.54577    9.33681   1.237   0.2167    
## weekdayTuesday:month10    -1.54519    9.47577  -0.163   0.8705    
## weekdayWednesday:month10  -9.31467    9.33728  -0.998   0.3189    
## weekdayThursday:month10    9.08713    9.34294   0.973   0.3311    
## weekdayFriday:month10    -15.89211    9.51745  -1.670   0.0955 .  
## weekdaySaturday:month10  -14.01015    9.75814  -1.436   0.1516    
## weekdaySunday:month10     -5.29747    9.33707  -0.567   0.5707    
## weekdayTuesday:month11     0.73784    9.61805   0.077   0.9389    
## weekdayWednesday:month11  -7.17345    9.89593  -0.725   0.4688    
## weekdayThursday:month11   14.68065    9.92134   1.480   0.1394    
## weekdayFriday:month11     -8.99053   10.06545  -0.893   0.3721    
## weekdaySaturday:month11  -12.06443   10.51771  -1.147   0.2518    
## weekdaySunday:month11     -1.19333   10.07296  -0.118   0.9057    
## weekdayTuesday:month12     6.81710    9.03566   0.754   0.4508    
## weekdayWednesday:month12   5.75093    9.15022   0.629   0.5299    
## weekdayThursday:month12   -0.90732    9.40556  -0.096   0.9232    
## weekdayFriday:month12      4.71921    9.15022   0.516   0.6062    
## weekdaySaturday:month12   -6.89133    9.26351  -0.744   0.4572    
## weekdaySunday:month12      2.50005    9.33173   0.268   0.7889    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.78 on 634 degrees of freedom
## Multiple R-squared:  0.3295, Adjusted R-squared:  0.2396 
## F-statistic: 3.665 on 85 and 634 DF,  p-value: < 2.2e-16
par(mfrow = c(2,2))
plot(m1)

par(mfrow = c(1,1))


# glance(m1) 
# tidy(m1)
# augment(m1) # %>% names
# predict(m1, interval = "prediction")

m1.train_rmse <- sqrt(mean(resid(m1)^2))



# test set performance: 
df4.predictions <- 
  data.frame(ed_visits = df2.ed_visits_cleaned[-v1_train_index, 6], 
             predicted = predict(m1, 
                                 newdata = df2.ed_visits_cleaned[-v1_train_index, ])) 

m1.test_rmse <- sqrt(mean((df4.predictions$predicted - df4.predictions$ed_visits)^2, 
                          na.rm = TRUE))



# 5) regression model 2: ----

Without interaction between month and weekday

set.seed(121)
v1_train_index <- createDataPartition(df2.ed_visits_cleaned$ed_visits, 
                                      p = 0.8, 
                                      list = FALSE)

m2 <- lm(ed_visits ~ years_from_2017 + weekday + month + lag_ed_visits, 
         data = df2.ed_visits_cleaned[v1_train_index, ])

summary(m2)
## 
## Call:
## lm(formula = ed_visits ~ years_from_2017 + weekday + month + 
##     lag_ed_visits, data = df2.ed_visits_cleaned[v1_train_index, 
##     ])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -38.229  -9.538  -0.286   9.233  46.245 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      158.72362    7.35109  21.592  < 2e-16 ***
## years_from_2017    3.99472    0.76845   5.198 2.64e-07 ***
## weekdayTuesday   -11.15955    1.94051  -5.751 1.33e-08 ***
## weekdayWednesday -10.53659    2.02594  -5.201 2.61e-07 ***
## weekdayThursday  -10.19050    2.06858  -4.926 1.05e-06 ***
## weekdayFriday     -7.28938    2.01238  -3.622 0.000313 ***
## weekdaySaturday   -7.54671    2.07650  -3.634 0.000299 ***
## weekdaySunday      4.45044    2.00022   2.225 0.026401 *  
## month2            -0.01046    2.36364  -0.004 0.996469    
## month3            -0.83117    2.28412  -0.364 0.716049    
## month4            -3.71075    2.27845  -1.629 0.103841    
## month5            -0.02794    2.29764  -0.012 0.990301    
## month6             3.87182    2.36417   1.638 0.101931    
## month7             1.74936    2.53443   0.690 0.490274    
## month8            -1.59659    2.53108  -0.631 0.528380    
## month9             0.60023    2.59564   0.231 0.817192    
## month10           -5.80302    2.57302  -2.255 0.024420 *  
## month11           -9.25944    2.61501  -3.541 0.000425 ***
## month12            0.32243    2.51470   0.128 0.898014    
## lag_ed_visits      0.13632    0.03746   3.639 0.000294 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.81 on 700 degrees of freedom
## Multiple R-squared:  0.2557, Adjusted R-squared:  0.2355 
## F-statistic: 12.66 on 19 and 700 DF,  p-value: < 2.2e-16
par(mfrow = c(2,2))
plot(m2)

par(mfrow = c(1,1))


# glance(m2) 
# tidy(m2)
# augment(m2) # %>% names
# predict(m2, interval = "prediction")

m2.train_rmse <- sqrt(mean(resid(m2)^2))



# test set performance: 
df4.predictions <- 
  data.frame(ed_visits = df2.ed_visits_cleaned[-v1_train_index, 6], 
             predicted = predict(m2, 
                                 newdata = df2.ed_visits_cleaned[-v1_train_index, ])) 

m2.test_rmse <- sqrt(mean((df4.predictions$predicted - df4.predictions$ed_visits)^2, 
                          na.rm = TRUE))

Summary of models

df5.model.performance <- 
  data.frame(model = c("year + month + weekday + month:weekday", 
                       "year + month + weekday + month:weekday", 
                       "year + month + weekday", 
                       "year + month + weekday"), 
             metric = rep(c("Train RMSE", 
                            "Test RMSE"), 2), 
             value = c(m1.train_rmse, 
                       m1.test_rmse, 
                       m2.train_rmse, 
                       m2.test_rmse)) 


df5.model.performance %>% 
  kable() %>% 
  kable_styling(bootstrap_options = c("striped",
              "condensed", 
              "responsive"))
model metric value
year + month + weekday + month:weekday Train RMSE 12.92700
year + month + weekday + month:weekday Test RMSE 14.38572
year + month + weekday Train RMSE 13.61936
year + month + weekday Test RMSE 13.98180

     

Model selection notes

Including month, weekday and year is very likely to overfit - there’s just 4 data points per cell!!

The general strategy to prevent overfitting is, of course, cross-validation or a train/test split      

# 6) train model 2 on full dataset: -----------

Train selected model on full dataset

m3.full_dataset <- lm(ed_visits ~ years_from_2017 + weekday + month + lag_ed_visits, 
                      data = df2.ed_visits_cleaned)

summary(m3.full_dataset)
## 
## Call:
## lm(formula = ed_visits ~ years_from_2017 + weekday + month + 
##     lag_ed_visits, data = df2.ed_visits_cleaned)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -38.570  -9.486  -0.253   9.466  43.926 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      153.39456    6.54392  23.441  < 2e-16 ***
## years_from_2017    4.13042    0.67963   6.077 1.82e-09 ***
## weekdayTuesday   -10.80266    1.72528  -6.261 5.97e-10 ***
## weekdayWednesday -11.19875    1.78843  -6.262 5.95e-10 ***
## weekdayThursday  -10.49927    1.80956  -5.802 9.14e-09 ***
## weekdayFriday     -6.88718    1.80750  -3.810 0.000148 ***
## weekdaySaturday   -7.16601    1.77412  -4.039 5.83e-05 ***
## weekdaySunday      5.12227    1.76991   2.894 0.003897 ** 
## month2            -0.63584    2.08307  -0.305 0.760253    
## month3            -0.22125    2.03080  -0.109 0.913269    
## month4            -4.27939    2.05232  -2.085 0.037344 *  
## month5             2.09335    2.02958   1.031 0.302628    
## month6             2.67357    2.13742   1.251 0.211326    
## month7             2.19038    2.29445   0.955 0.340022    
## month8            -1.18906    2.29405  -0.518 0.604364    
## month9             0.17245    2.31562   0.074 0.940651    
## month10           -5.29449    2.30121  -2.301 0.021640 *  
## month11           -8.29155    2.34607  -3.534 0.000430 ***
## month12            1.99130    2.29397   0.868 0.385599    
## lag_ed_visits      0.16218    0.03336   4.861 1.38e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.8 on 877 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.2759, Adjusted R-squared:  0.2602 
## F-statistic: 17.59 on 19 and 877 DF,  p-value: < 2.2e-16
glance(m3.full_dataset) %>% 
  kable() %>% 
  kable_styling(bootstrap_options = c("striped",
              "condensed", 
              "responsive"))
r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC deviance df.residual
0.275883 0.2601952 13.79842 17.5858 0 20 -3616.899 7275.799 7376.579 166977.5 877
# actual vs fitted values: 
augment(m3.full_dataset) %>% 
  ggplot(aes(x = .fitted, 
             y = ed_visits)) + 
  geom_point() + 
  
  scale_x_continuous(limits = c(150, 210)) + 
  scale_y_continuous(limits = c(150, 210)) + 
  
  geom_smooth() + 
  geom_abline(slope = 1, 
              intercept = 0) + 
  
  labs(x = "predicted values", 
       y = "actual values",
       title = "LGH ED visits - Prediction using day of week, month, year, and previous day's ED visits") + 
  
  theme_light() +
  theme(panel.grid.minor = element_line(colour = "grey95"), 
        panel.grid.major = element_line(colour = "grey95"))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 53 rows containing non-finite values (stat_smooth).
## Warning: Removed 53 rows containing missing values (geom_point).

df6.coeffs <- 
  tidy(m3.full_dataset) %>% 
  mutate(lower_ci = estimate - 1.96 * std.error, 
         upper_ci = estimate + 1.96 * std.error) %>% 
  
  dplyr::select(term, 
         lower_ci, 
         estimate, 
         upper_ci, 
         everything()) 

df6.coeffs %>% 
  datatable() %>% 
  formatRound(2:7, 2)

     

Visuals of day of week effects

# 7) visuals of day of week effects ----------

df6.coeffs %>% 
  filter(grepl("weekday", term)) %>% 
  
  mutate(term = substring(term, 8)) %>% 
  mutate(term = factor(term, 
                       levels = c("Monday", 
                                  "Tuesday", 
                                  "Wednesday", 
                                  "Thursday", 
                                  "Friday",
                                  "Saturday", 
                                  "Sunday"))) %>% 
  
  ggplot()  +
  geom_pointrange(aes(x = term, 
                      ymin = lower_ci, 
                      ymax = upper_ci, 
                      y = estimate)) + 
  geom_hline(yintercept = 0) + 
  
  scale_y_continuous(limits = c(-20, 20), 
                     breaks = seq(-20, 20, 4)) + 
  
  labs(x = "Day of week", 
       y = "Difference in average daily ED visits" ,
       title = "LGH ED \nImpact of Day of Week on average daily ED visits", 
       subtitle = "These estimates control for year and month, allowing us to isolate weekday effects \nfrom other factors and from statistical noise \n\nBaseline - Monday", 
       caption = "\n\nNote: There is no significant interaction between day-of-week effects and month effects") + 
  
  theme_light(base_size = 12) +
  theme(panel.grid.minor = element_line(colour = "grey95"), 
      panel.grid.major = element_line(colour = "grey95"))

     

Visuals of month effects

# 8) visuals of month effects ----------

df6.coeffs %>% 
  filter(grepl("month", term)) %>% 
  
  mutate(term = factor(term,
                       levels = c(
                         "month2", 
                         "month3", 
                         "month4", 
                         "month5", 
                         "month6", 
                         "month7", 
                         "month8", 
                         "month9", 
                         "month10", 
                         "month11", 
                         "month12"
                       ))) %>% 

  
  ggplot()  +
  geom_pointrange(aes(x = term, 
                      ymin = lower_ci, 
                      ymax = upper_ci, 
                      y = estimate)) + 
  geom_hline(yintercept = 0) + 
  
  scale_y_continuous(limits = c(-20, 20), 
                     breaks = seq(-20, 20, 4)) + 
  
  
  labs(x = "Month", 
       y = "Difference in average daily ED visits" ,
       title = "LGH ED \nImpact of Month on average daily ED visits", 
       subtitle = "These estimates control for year and day-of-week, allowing us to isolate month effects \nfrom other factors and from statistical noise \n\nBaseline - January", 
       caption = "\n\nNote: There is no significant interaction between day-of-week effects and month effects") + 
  
  theme_light(base_size = 12) +
  theme(panel.grid.minor = element_line(colour = "grey95"), 
        panel.grid.major = element_line(colour = "grey95"), 
        axis.text.x = element_text(angle = 45, 
                                   hjust = 1)) 

   

Prediction intervals for illustrative data

Import illustrative data to predict on. Note that all lagged ed_visits values are set to the overall mean for the corresponding day of week in 2019 (see df3.mean_and_sd)

# 9) Prediction intervals for illustrative data ---------

df7.predict_intervals <- 
  read_csv(here::here("data", 
                      "2019-06-30_ed-daily_illustrative-data-for-prediction-intervals.csv")) %>% 
  
  
  mutate(weekday = fct_relevel(weekday, 
                               levels = c("Monday", 
                                          "Tuesday", 
                                          "Wednesday", 
                                          "Thursday", 
                                          "Friday",
                                          "Saturday", 
                                          "Sunday")), 
         date = mdy(date), 
         # years_from_2017 = as.factor(years_from_2017), 
         year = as.factor(year), 
         month = as.factor(month))
## Parsed with column specification:
## cols(
##   date = col_character(),
##   years_from_2017 = col_double(),
##   month = col_double(),
##   year = col_double(),
##   weekday = col_character(),
##   ed_visits = col_double(),
##   lag_ed_visits = col_double()
## )
## Warning: Outer names are only allowed for unnamed scalar atomic inputs
df7.predict_intervals <- 
  predict(m2, 
        newdata = df7.predict_intervals, 
        interval = "prediction") %>% 
  as.data.frame() %>% 
  
  bind_cols(df7.predict_intervals)  
  
df7.predict_intervals %>% 
  select(-fit,
         -lwr, 
         -upr, 
         everything()) %>% 
  datatable() %>% 
  formatRound(8:10, 2)
df7.predict_intervals %>%   
  ggplot(aes(x = weekday,
             ymin = lwr, 
             ymax = upr, 
             y = fit)) + 
  geom_pointrange() + 
  
  labs(title = "2019 weekday effects", 
       subtitle = "Predicted LGH ED visits by day of week \n\nBased on our model, only 5% of actual data points should fall outside these ranges. \nObserved value is about 3% as of 2019-06-17", 
       y = "number of ED visits") + 
  
  theme_light() +
  theme(panel.grid.minor = element_line(colour = "grey95"), 
        panel.grid.major = element_line(colour = "grey95"))

Validating prediction intervals

Is it true that in 2019, 5% of data points fall outsite these prediction intervals?

df8.validate_predict_int <- 
  df7.predict_intervals %>% 
  left_join(df2.ed_visits_cleaned %>% 
              filter(years_from_2017 == 2),
            by = c("weekday" = "weekday")) %>% 
  select(date.y, 
         month.x, 
         weekday, 
         ed_visits.y, 
         lwr, 
         upr) %>% 
  
  mutate(outsite_predict_int = ifelse(ed_visits.y < lwr | ed_visits.y > upr, 
                                      1, 0)) %>% 
  arrange(date.y)


df8.validate_predict_int %>% 
  datatable(extensions = 'Buttons',
          options = list(dom = 'Bfrtip', 
                         buttons = c('excel', "csv")))
df8.validate_predict_int %>% 
  summarise(n = n(), 
            num_outsite_interval = sum(outsite_predict_int), 
            prop = num_outsite_interval/n)
##     n num_outsite_interval      prop
## 1 168                    5 0.0297619

Conclusion: the prediction intervals may in fact be a bit conservative (too wide). That’s why a smaller proportion of the real data is falling outside the interval (3% versus 5%).

Of course, it could just be sampling variability due to a small dataset.

# 10) write outputs: ---------
# write_csv(df6.coeffs,
#           here::here("results", 
#                      "dst", 
#                      "2019-06-21_lgh_ed-visits-regression-coeffs.csv"))
              

# write_csv(df6.coeffs,
#           here::here("results", 
#                      "dst", 
#                      "2019-06-24_lgh_ed-visits-regression-coeffs.csv"))

Appendix: Null models

# only month 
m4.month <- lm(ed_visits ~ month,
               data = df2.ed_visits_cleaned)

summary(m4.month)
## 
## Call:
## lm(formula = ed_visits ~ month, data = df2.ed_visits_cleaned)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -43.022 -10.959  -0.928  10.955  59.823 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 181.0215     1.6174 111.921  < 2e-16 ***
## month2       -0.8072     2.3478  -0.344 0.731066    
## month3       -0.6022     2.2874  -0.263 0.792418    
## month4       -4.6771     2.3063  -2.028 0.042867 *  
## month5        1.9677     2.2874   0.860 0.389873    
## month6        2.3551     2.4032   0.980 0.327366    
## month7        0.6559     2.5573   0.256 0.797638    
## month8       -4.4892     2.5573  -1.755 0.079530 .  
## month9       -2.1548     2.5828  -0.834 0.404333    
## month10      -8.5215     2.5573  -3.332 0.000897 ***
## month11     -12.9048     2.5828  -4.996 7.03e-07 ***
## month12       0.1559     2.5573   0.061 0.951399    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.6 on 886 degrees of freedom
## Multiple R-squared:  0.06556,    Adjusted R-squared:  0.05396 
## F-statistic: 5.651 on 11 and 886 DF,  p-value: 7.873e-09
glance(m4.month) %>% 
  kable() %>% 
  kable_styling(bootstrap_options = c("striped",
                                      "condensed", 
                                      "responsive"))
r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC deviance df.residual
0.0655617 0.0539603 15.59768 5.651195 0 12 -3735.082 7496.164 7558.566 215552.8 886
augment(m4.month) %>% 
  ggplot(aes(x = .fitted, 
             y = ed_visits, 
             col = month)) + 
  geom_point() + 
  
  scale_x_continuous(limits = c(150, 210)) + 
  scale_y_continuous(limits = c(150, 210)) + 
  
  geom_smooth() + 
  geom_abline(slope = 1, 
              intercept = 0) + 
  
  labs(x = "predicted values", 
       y = "actual values") + 
  
  theme_light() +
  theme(panel.grid.minor = element_line(colour = "grey95"), 
        panel.grid.major = element_line(colour = "grey95"))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 53 rows containing non-finite values (stat_smooth).
## Warning: Removed 53 rows containing missing values (geom_point).

# only day of week 
m5.weekday <- lm(ed_visits ~ weekday,
               data = df2.ed_visits_cleaned)

summary(m5.weekday)
## 
## Call:
## lm(formula = ed_visits ~ weekday, data = df2.ed_visits_cleaned)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -39.117 -10.359  -0.359   9.850  51.233 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       186.620      1.309 142.522  < 2e-16 ***
## weekdayTuesday    -11.370      1.855  -6.128 1.33e-09 ***
## weekdayWednesday  -13.581      1.855  -7.320 5.55e-13 ***
## weekdayThursday   -13.261      1.855  -7.147 1.84e-12 ***
## weekdayFriday      -9.503      1.855  -5.122 3.71e-07 ***
## weekdaySaturday    -9.112      1.855  -4.911 1.08e-06 ***
## weekdaySunday       3.147      1.852   1.700   0.0896 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.87 on 891 degrees of freedom
## Multiple R-squared:  0.1457, Adjusted R-squared:  0.1399 
## F-statistic: 25.32 on 6 and 891 DF,  p-value: < 2.2e-16
glance(m5.weekday) %>% 
  kable() %>% 
  kable_styling(bootstrap_options = c("striped",
                                      "condensed", 
                                      "responsive"))
r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC deviance df.residual
0.145682 0.139929 14.8721 25.32286 0 7 -3694.833 7405.665 7444.067 197070.9 891
augment(m5.weekday) %>% 
  ggplot(aes(x = .fitted, 
             y = ed_visits ,
             col = weekday)) + 
  geom_point() + 
  
  scale_x_continuous(limits = c(150, 210)) + 
  scale_y_continuous(limits = c(150, 210)) + 
  
  geom_abline(slope = 1, 
              intercept = 0) + 
  
  geom_smooth(col = "red") + 
  
  labs(x = "predicted values", 
       y = "actual values", 
       title = "LGH daily ED visits, by day of week") + 
  
  theme_light() +
  theme(panel.grid.minor = element_line(colour = "grey95"), 
        panel.grid.major = element_line(colour = "grey95"))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 53 rows containing non-finite values (stat_smooth).

## Warning: Removed 53 rows containing missing values (geom_point).