library(tidyverse)
library(here)
library(odbc)
library(DBI)
library(lubridate)
library(ggbeeswarm)
library(DT)
library(broom)
library(caret)
library(kableExtra)
library(scales)
library(plotly)
knitr::opts_chunk$set(dev = "png",
                      cache = TRUE)
# 1) set up database connections and import functions: -----------
source(here::here("src", 
                  "setup-denodo_function.R"))
source(here::here("src", 
                  "ed-visits-by-hour-denodo_function.R"))


setup_denodo()

Data

# 2) pull ed data: -----------
df1.ed_visits <- extract_ed_visits_by_hour("20170101",  # todo: earlier start?
                                           "20190617")
## Warning: Column `interval_1_hour_at_start_date`/`hours` joining character
## vector and factor, coercing into character vector
df2.ed_visits_cleaned <- 
  df1.ed_visits %>% 
  mutate(weekday = weekdays(date), 
         month = month(date), 
         year = year(date), 
         years_from_2017 = year - 2017) %>% 
  
  # fiddle with factors: 
  mutate(weekday = fct_relevel(weekday, 
                               levels = c("Monday", 
                                          "Tuesday", 
                                          "Wednesday", 
                                          "Thursday", 
                                          "Friday",
                                          "Saturday", 
                                          "Sunday")), 
         # years_from_2017 = as.factor(years_from_2017), 
         year = as.factor(year), 
         month = as.factor(month), 
         interval_1_hour_at_start_date = as.factor(interval_1_hour_at_start_date)) %>% 
  
  rename(ed_visits = value, 
         hour = interval_1_hour_at_start_date) %>% 
  
  mutate(lag_ed_visits = lag(ed_visits), 
         lag2_ed_visits = lag(ed_visits, 2), 
         lag3_ed_visits = lag(ed_visits, 3), 
         lag4_ed_visits = lag(ed_visits, 4), 
         lag5_ed_visits = lag(ed_visits, 5),
         lag6_ed_visits = lag(ed_visits, 6)) %>% 
  
  replace(is.na(.), 0) %>% 
  
  select(date, 
         hour, 
         years_from_2017, 
         month, 
         year, 
         weekday, 
         ed_visits, 
         lag_ed_visits, 
         lag2_ed_visits, 
         lag3_ed_visits, 
         lag4_ed_visits, 
         lag5_ed_visits, 
         lag6_ed_visits)
## Warning: Outer names are only allowed for unnamed scalar atomic inputs
# add hour_int column: 
days <- df2.ed_visits_cleaned$date %>% unique() %>% length()

df2.ed_visits_cleaned <- 
  df2.ed_visits_cleaned %>% 
  mutate(hour_int = rep(1:24, 
                        days))


# result: 
str(df2.ed_visits_cleaned)
## Classes 'tbl_df', 'tbl' and 'data.frame':    21552 obs. of  14 variables:
##  $ date           : Date, format: "2017-01-01" "2017-01-01" ...
##  $ hour           : Factor w/ 24 levels "0000 - 0059",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ years_from_2017: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ month          : Factor w/ 12 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ year           : Factor w/ 3 levels "2017","2018",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ weekday        : Factor w/ 7 levels "Monday","Tuesday",..: 7 7 7 7 7 7 7 7 7 7 ...
##  $ ed_visits      : num  6 6 4 4 2 0 4 8 10 8 ...
##  $ lag_ed_visits  : num  0 6 6 4 4 2 0 4 8 10 ...
##  $ lag2_ed_visits : num  0 0 6 6 4 4 2 0 4 8 ...
##  $ lag3_ed_visits : num  0 0 0 6 6 4 4 2 0 4 ...
##  $ lag4_ed_visits : num  0 0 0 0 6 6 4 4 2 0 ...
##  $ lag5_ed_visits : num  0 0 0 0 0 6 6 4 4 2 ...
##  $ lag6_ed_visits : num  0 0 0 0 0 0 6 6 4 4 ...
##  $ hour_int       : int  1 2 3 4 5 6 7 8 9 10 ...
df2.ed_visits_cleaned %>%
  head(100) %>% 
  datatable()
# mean and sd: 
df3.mean_and_sd <- 
  df2.ed_visits_cleaned %>% 
  group_by(hour) %>% 
  summarise(mean_visits = mean(ed_visits, 
                               na.rm = TRUE), 
            sd_visits = sd(ed_visits, 
                           na.rm = TRUE))

df3.mean_and_sd %>% 
  datatable() %>% 
  formatRound(2:3, 1)

Notes - Poisson?

Mean and variance aren’t really close for most hours. Does this mean that arrival rate isn’t really Poisson-distributed?      

Exploratory plots

# 3) plots: ------------
# facet by year
df2.ed_visits_cleaned %>% 
  ggplot(aes(x = hour, 
             y = ed_visits)) + 
  geom_boxplot() + 
  facet_wrap(~year) + 
  theme_light() +
  theme(panel.grid.minor = element_line(colour = "grey95"), 
        panel.grid.major = element_line(colour = "grey95"),
        axis.text.x = element_text(angle = 45, 
                                   hjust = 1))

# facet by weekday
df2.ed_visits_cleaned %>% 
  ggplot(aes(x = hour, 
             y = ed_visits)) + 
  geom_boxplot() + 
  facet_wrap(~weekday) + 
  theme_light() +
  theme(panel.grid.minor = element_line(colour = "grey95"), 
        panel.grid.major = element_line(colour = "grey95"),
        axis.text.x = element_text(angle = 45, 
                                   hjust = 1))

df2.ed_visits_cleaned %>% 
  ggplot(aes(x = year, 
             y = ed_visits)) + 
  geom_boxplot() + 
  facet_wrap(~weekday) + 
  labs(title = "hourly arrival rate by year") + 
  theme_light() +
  theme(panel.grid.minor = element_line(colour = "grey95"), 
        panel.grid.major = element_line(colour = "grey95"),
        axis.text.x = element_text(angle = 45, 
                                   hjust = 1))

# density by year: 
df2.ed_visits_cleaned %>% 
  ggplot(aes(x = ed_visits)) + 
  geom_density() + 
  facet_wrap(~year) + 
  theme_light() +
  theme(panel.grid.minor = element_line(colour = "grey95"), 
        panel.grid.major = element_line(colour = "grey95"),
        axis.text.x = element_text(angle = 45, 
                                   hjust = 1))

Notes - distribution of response variable

Unfortunately, this is very non-normal.

Hypothesis: this is a mixture of two normal distributions: one for “late-night” rates, one for rest of the day.

Let’s start by analyzing visits over the period 7 AM to midnight. This seems to give a reasonable distribution.

# density for "busy" hours: 
df2.ed_visits_cleaned %>% 
  filter(hour_int >= 8, 
         hour_int <= 24) %>% 
  ggplot(aes(x = ed_visits)) + 
  geom_density() + 
  facet_wrap(~year) + 
  theme_light() +
  theme(panel.grid.minor = element_line(colour = "grey95"), 
        panel.grid.major = element_line(colour = "grey95"),
        axis.text.x = element_text(angle = 45, 
                                   hjust = 1))

# compare with poisson curves: 
# mean_visits_from_5am <- 
#   df2.ed_visits_cleaned %>% 
#   filter(year == "2018", 
#          hour_int >= 5, 
#          hour_int <= 24) %>% 
#   pull(ed_visits) %>% 
#   mean(na.rm = TRUE)
# 
# x <- rpois(1000, mean_visits_from_5am)
# x %>% density() %>% plot(main = paste("Poisson with mean", round(mean_visits_from_5am, 2)))

Plots for each hour of day in 2018

df4.nest_by_hour <- 
  df2.ed_visits_cleaned %>% 
  filter(year == "2018") %>% 
  group_by(hour_int) %>% 
  nest %>% 
  
  mutate(density = map2(data, 
                        hour_int, 
                        function(x, y){
                         ggplot(x, 
                                aes(x = ed_visits)) + 
                           geom_density() + 
                           labs(title = y %>% as.character())
                       }))

df4.nest_by_hour$density
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## 
## [[9]]

## 
## [[10]]

## 
## [[11]]

## 
## [[12]]

## 
## [[13]]

## 
## [[14]]

## 
## [[15]]

## 
## [[16]]

## 
## [[17]]

## 
## [[18]]

## 
## [[19]]

## 
## [[20]]

## 
## [[21]]

## 
## [[22]]

## 
## [[23]]

## 
## [[24]]

Data - subset for 7 AM to midnight

df5.ed_visits_busy_hours <- 
  df2.ed_visits_cleaned %>% 
  filter(hour_int >= 8)

df5.ed_visits_busy_hours %>% 
  head(50) %>% 
  datatable()

Variation by day of week

# set colours for days of week 
x <- seq(0, 1, length.out = 7)
cols <- seq_gradient_pal(low = "blue", 
                         high = "red")(x)

# hour of day, split by day of week 
p <- 
  df2.ed_visits_cleaned %>% 
  group_by(hour, 
           weekday) %>% 
  filter(hour_int >= 8, 
         hour_int <= 24) %>% 
  summarize(mean_visits = mean(ed_visits, na.rm = T)) %>% 
  ggplot(aes(x = hour, 
             y = mean_visits, 
             group = weekday, 
             col = weekday)) + 
  geom_line() + 
  scale_color_manual(values = cols) +
  theme_light() +
  theme(panel.grid.minor = element_line(colour = "grey95"), 
        panel.grid.major = element_line(colour = "grey95"), 
        axis.text.x = element_text(angle = 45, 
                                   hjust = 1)); p

# ggplotly(p)

df2.ed_visits_cleaned %>% 
  filter(year %in% c("2018")) %>% 
  ggplot(aes(x = hour, 
             y = ed_visits)) + 
  geom_boxplot() + 
  facet_wrap(~weekday) + 
  labs(title = "2018") + 
  theme_light() +
  theme(panel.grid.minor = element_line(colour = "grey95"), 
        panel.grid.major = element_line(colour = "grey95"),
        axis.text.x = element_text(angle = 45, 
                                   hjust = 1))

# simple average by hour of day 
df2.ed_visits_cleaned %>% 
  group_by(hour) %>% 
  summarise(mean_visits = mean(ed_visits, 
                               na.rm = TRUE)) %>% 
  
  ggplot(aes(x = hour, 
             y = mean_visits)) + 
  geom_point(size = 5, 
             col = "dodgerblue4") + 
  labs(title = "2018") + 
  theme_light() +
  theme(panel.grid.minor = element_line(colour = "grey95"), 
        panel.grid.major = element_line(colour = "grey95"), 
        axis.text.x = element_text(angle = 45, 
                                   hjust = 1)
        )

# avg ED visits by weekday AND hour
# this is the type of plot that I am arguing against - it's all noise

df2.ed_visits_cleaned %>% 
  filter(year == "2018") %>% 
  group_by(hour, 
           weekday) %>% 
  summarise(mean_visits = mean(ed_visits, 
                               na.rm = TRUE)) %>% 
  
  ggplot(aes(x = hour, 
             y = mean_visits)) + 
  geom_col(fill = "dodgerblue4") + 
  facet_wrap(~weekday) + 
  
  labs(title = "Is there really any point in looking at graphs like this?") + 
  
  theme_light() +
  theme(panel.grid.minor = element_line(colour = "grey95"), 
      panel.grid.major = element_line(colour = "grey95"), 
      axis.text.x = element_text(angle = 45, 
                                 hjust = 1))

# 4) regression model 1: ----

Regression models

m1: With interaction between hour and weekday

set.seed(11)
v1_train_index <- createDataPartition(df5.ed_visits_busy_hours$ed_visits, 
                                      p = 0.8, 
                                      list = FALSE)

# fit model: 
m1 <- lm(ed_visits ~ hour + weekday + years_from_2017 + 
           lag_ed_visits + lag2_ed_visits + hour:weekday, 
         data = df5.ed_visits_busy_hours[v1_train_index, ])

summary(m1)
## 
## Call:
## lm(formula = ed_visits ~ hour + weekday + years_from_2017 + lag_ed_visits + 
##     lag2_ed_visits + hour:weekday, data = df5.ed_visits_busy_hours[v1_train_index, 
##     ])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.6194 -2.0872 -0.1588  1.9166 12.7806 
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       4.805911   0.307834  15.612  < 2e-16 ***
## hour0800 - 0859                   2.255010   0.431560   5.225 1.77e-07 ***
## hour0900 - 0959                   5.252166   0.427673  12.281  < 2e-16 ***
## hour1000 - 1059                   6.196535   0.443487  13.972  < 2e-16 ***
## hour1100 - 1159                   7.126052   0.438067  16.267  < 2e-16 ***
## hour1200 - 1259                   6.497693   0.450907  14.410  < 2e-16 ***
## hour1300 - 1359                   5.660915   0.443696  12.759  < 2e-16 ***
## hour1400 - 1459                   5.962379   0.444887  13.402  < 2e-16 ***
## hour1500 - 1559                   4.834981   0.440112  10.986  < 2e-16 ***
## hour1600 - 1659                   4.547910   0.442771  10.271  < 2e-16 ***
## hour1700 - 1759                   4.306518   0.449075   9.590  < 2e-16 ***
## hour1800 - 1859                   3.375277   0.438238   7.702 1.44e-14 ***
## hour1900 - 1959                   4.002728   0.443050   9.034  < 2e-16 ***
## hour2000 - 2059                   4.252633   0.425078  10.004  < 2e-16 ***
## hour2100 - 2159                   2.773498   0.433704   6.395 1.67e-10 ***
## hour2200 - 2259                   0.690483   0.449060   1.538  0.12417    
## hour2300 - 2359                  -0.964185   0.426158  -2.263  0.02368 *  
## weekdayTuesday                   -0.301565   0.431123  -0.699  0.48426    
## weekdayWednesday                 -0.773018   0.431116  -1.793  0.07299 .  
## weekdayThursday                  -0.324062   0.423061  -0.766  0.44370    
## weekdayFriday                     0.002037   0.431117   0.005  0.99623    
## weekdaySaturday                  -0.153665   0.432214  -0.356  0.72220    
## weekdaySunday                     0.262850   0.426982   0.616  0.53817    
## years_from_2017                   0.248527   0.037571   6.615 3.87e-11 ***
## lag_ed_visits                     0.059935   0.009151   6.550 6.00e-11 ***
## lag2_ed_visits                    0.030412   0.009326   3.261  0.00111 ** 
## hour0800 - 0859:weekdayTuesday   -0.554164   0.604708  -0.916  0.35947    
## hour0900 - 0959:weekdayTuesday   -0.289981   0.606192  -0.478  0.63240    
## hour1000 - 1059:weekdayTuesday   -1.071022   0.610004  -1.756  0.07915 .  
## hour1100 - 1159:weekdayTuesday   -0.615698   0.600667  -1.025  0.30537    
## hour1200 - 1259:weekdayTuesday   -1.675616   0.609188  -2.751  0.00596 ** 
## hour1300 - 1359:weekdayTuesday   -0.319355   0.602012  -0.530  0.59579    
## hour1400 - 1459:weekdayTuesday   -0.539365   0.613374  -0.879  0.37923    
## hour1500 - 1559:weekdayTuesday    0.431069   0.607128   0.710  0.47771    
## hour1600 - 1659:weekdayTuesday   -0.055270   0.609812  -0.091  0.92779    
## hour1700 - 1759:weekdayTuesday   -0.282050   0.609467  -0.463  0.64353    
## hour1800 - 1859:weekdayTuesday    0.509488   0.603131   0.845  0.39827    
## hour1900 - 1959:weekdayTuesday   -0.557356   0.611309  -0.912  0.36192    
## hour2000 - 2059:weekdayTuesday   -0.349433   0.595273  -0.587  0.55721    
## hour2100 - 2159:weekdayTuesday    0.175640   0.603292   0.291  0.77095    
## hour2200 - 2259:weekdayTuesday    0.259565   0.613413   0.423  0.67219    
## hour2300 - 2359:weekdayTuesday    0.948124   0.601992   1.575  0.11529    
## hour0800 - 0859:weekdayWednesday  0.043334   0.602109   0.072  0.94263    
## hour0900 - 0959:weekdayWednesday  0.413774   0.602455   0.687  0.49221    
## hour1000 - 1059:weekdayWednesday -0.640091   0.614410  -1.042  0.29753    
## hour1100 - 1159:weekdayWednesday -0.908977   0.604918  -1.503  0.13296    
## hour1200 - 1259:weekdayWednesday -1.057942   0.613784  -1.724  0.08480 .  
## hour1300 - 1359:weekdayWednesday -0.216078   0.602043  -0.359  0.71967    
## hour1400 - 1459:weekdayWednesday -0.140617   0.604074  -0.233  0.81593    
## hour1500 - 1559:weekdayWednesday  0.117360   0.606212   0.194  0.84650    
## hour1600 - 1659:weekdayWednesday  0.727571   0.603906   1.205  0.22831    
## hour1700 - 1759:weekdayWednesday -0.002663   0.612249  -0.004  0.99653    
## hour1800 - 1859:weekdayWednesday  0.792702   0.607467   1.305  0.19194    
## hour1900 - 1959:weekdayWednesday  0.371652   0.612822   0.606  0.54422    
## hour2000 - 2059:weekdayWednesday  0.063948   0.596685   0.107  0.91465    
## hour2100 - 2159:weekdayWednesday  0.125080   0.599680   0.209  0.83478    
## hour2200 - 2259:weekdayWednesday  1.048328   0.617875   1.697  0.08979 .  
## hour2300 - 2359:weekdayWednesday  0.931876   0.599787   1.554  0.12029    
## hour0800 - 0859:weekdayThursday  -0.414179   0.603258  -0.687  0.49237    
## hour0900 - 0959:weekdayThursday  -0.288866   0.597400  -0.484  0.62872    
## hour1000 - 1059:weekdayThursday  -1.130530   0.601577  -1.879  0.06023 .  
## hour1100 - 1159:weekdayThursday  -1.575271   0.599317  -2.628  0.00859 ** 
## hour1200 - 1259:weekdayThursday  -1.566167   0.604437  -2.591  0.00958 ** 
## hour1300 - 1359:weekdayThursday  -0.613064   0.602419  -1.018  0.30885    
## hour1400 - 1459:weekdayThursday  -1.328793   0.601965  -2.207  0.02730 *  
## hour1500 - 1559:weekdayThursday  -0.224441   0.604818  -0.371  0.71058    
## hour1600 - 1659:weekdayThursday  -0.196888   0.601899  -0.327  0.74359    
## hour1700 - 1759:weekdayThursday  -0.272886   0.607373  -0.449  0.65323    
## hour1800 - 1859:weekdayThursday   0.497301   0.596025   0.834  0.40409    
## hour1900 - 1959:weekdayThursday  -0.483426   0.604914  -0.799  0.42421    
## hour2000 - 2059:weekdayThursday  -0.182456   0.588754  -0.310  0.75664    
## hour2100 - 2159:weekdayThursday   0.594252   0.595289   0.998  0.31817    
## hour2200 - 2259:weekdayThursday   1.127335   0.607062   1.857  0.06333 .  
## hour2300 - 2359:weekdayThursday   0.885577   0.598647   1.479  0.13909    
## hour0800 - 0859:weekdayFriday    -0.209296   0.607471  -0.345  0.73045    
## hour0900 - 0959:weekdayFriday    -0.871836   0.605375  -1.440  0.14985    
## hour1000 - 1059:weekdayFriday    -0.098513   0.614430  -0.160  0.87262    
## hour1100 - 1159:weekdayFriday    -1.019797   0.603332  -1.690  0.09100 .  
## hour1200 - 1259:weekdayFriday    -1.677521   0.609072  -2.754  0.00589 ** 
## hour1300 - 1359:weekdayFriday    -0.850217   0.613032  -1.387  0.16550    
## hour1400 - 1459:weekdayFriday    -1.426260   0.609926  -2.338  0.01938 *  
## hour1500 - 1559:weekdayFriday     0.030666   0.602543   0.051  0.95941    
## hour1600 - 1659:weekdayFriday    -0.080191   0.606860  -0.132  0.89488    
## hour1700 - 1759:weekdayFriday    -0.714537   0.614487  -1.163  0.24493    
## hour1800 - 1859:weekdayFriday    -0.193181   0.599836  -0.322  0.74741    
## hour1900 - 1959:weekdayFriday    -0.708031   0.610588  -1.160  0.24624    
## hour2000 - 2059:weekdayFriday    -0.834777   0.597402  -1.397  0.16234    
## hour2100 - 2159:weekdayFriday    -0.187389   0.603296  -0.311  0.75610    
## hour2200 - 2259:weekdayFriday     0.620618   0.612116   1.014  0.31066    
## hour2300 - 2359:weekdayFriday     0.943946   0.597690   1.579  0.11429    
## hour0800 - 0859:weekdaySaturday  -0.052465   0.607534  -0.086  0.93118    
## hour0900 - 0959:weekdaySaturday  -0.901767   0.607712  -1.484  0.13787    
## hour1000 - 1059:weekdaySaturday  -0.758609   0.612291  -1.239  0.21538    
## hour1100 - 1159:weekdaySaturday  -1.468632   0.601404  -2.442  0.01462 *  
## hour1200 - 1259:weekdaySaturday  -0.660572   0.610069  -1.083  0.27893    
## hour1300 - 1359:weekdaySaturday  -0.371911   0.601438  -0.618  0.53634    
## hour1400 - 1459:weekdaySaturday  -1.129492   0.604057  -1.870  0.06153 .  
## hour1500 - 1559:weekdaySaturday  -0.657950   0.603415  -1.090  0.27557    
## hour1600 - 1659:weekdaySaturday  -0.262677   0.608391  -0.432  0.66593    
## hour1700 - 1759:weekdaySaturday  -0.381246   0.611010  -0.624  0.53267    
## hour1800 - 1859:weekdaySaturday  -0.139971   0.602561  -0.232  0.81631    
## hour1900 - 1959:weekdaySaturday  -0.409307   0.610605  -0.670  0.50266    
## hour2000 - 2059:weekdaySaturday  -0.389122   0.598937  -0.650  0.51591    
## hour2100 - 2159:weekdaySaturday  -0.302403   0.602587  -0.502  0.61579    
## hour2200 - 2259:weekdaySaturday   0.655608   0.615006   1.066  0.28644    
## hour2300 - 2359:weekdaySaturday   1.195874   0.605114   1.976  0.04815 *  
## hour0800 - 0859:weekdaySunday     0.037441   0.606746   0.062  0.95080    
## hour0900 - 0959:weekdaySunday    -0.833425   0.599444  -1.390  0.16445    
## hour1000 - 1059:weekdaySunday     0.017108   0.602432   0.028  0.97735    
## hour1100 - 1159:weekdaySunday    -0.638865   0.596093  -1.072  0.28385    
## hour1200 - 1259:weekdaySunday    -0.703858   0.601346  -1.170  0.24183    
## hour1300 - 1359:weekdaySunday    -0.361447   0.602437  -0.600  0.54853    
## hour1400 - 1459:weekdaySunday    -1.387462   0.603780  -2.298  0.02158 *  
## hour1500 - 1559:weekdaySunday    -0.959727   0.598902  -1.602  0.10908    
## hour1600 - 1659:weekdaySunday    -0.658843   0.603872  -1.091  0.27528    
## hour1700 - 1759:weekdaySunday    -0.694827   0.610966  -1.137  0.25545    
## hour1800 - 1859:weekdaySunday     0.968872   0.604573   1.603  0.10905    
## hour1900 - 1959:weekdaySunday    -0.376120   0.609155  -0.617  0.53695    
## hour2000 - 2059:weekdaySunday    -0.141726   0.589598  -0.240  0.81004    
## hour2100 - 2159:weekdaySunday    -0.107582   0.595998  -0.181  0.85676    
## hour2200 - 2259:weekdaySunday     0.237607   0.612640   0.388  0.69814    
## hour2300 - 2359:weekdaySunday     0.685388   0.595341   1.151  0.24965    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.048 on 12092 degrees of freedom
## Multiple R-squared:  0.3221, Adjusted R-squared:  0.3153 
## F-statistic: 47.48 on 121 and 12092 DF,  p-value: < 2.2e-16
# diagnostics
resid(m1) %>% hist

par(mfrow = c(2,2))
plot(m1)

par(mfrow = c(1,1))


glance(m1) %>% 
  kable() %>% 
  kable_styling(bootstrap_options = c("striped",
              "condensed", 
              "responsive"))
r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC deviance df.residual
0.3220953 0.3153118 3.048405 47.48194 0 122 -30883.56 62013.12 62924.59 112368.2 12092
# tidy(m1)

augment(m1) %>% 
  ggplot(aes(x = .fitted, 
             y = ed_visits)) + 
  geom_point() + 
  scale_x_continuous(limits = c(0, 25)) + 
  scale_y_continuous(limits = c(0, 25)) + 
  
  geom_smooth() + 
  geom_abline(slope = 1, 
              intercept = 0) + 
  
  theme_light() +
  theme(panel.grid.minor = element_line(colour = "grey95"), 
      panel.grid.major = element_line(colour = "grey95"))
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# predict(m1, interval = "prediction")

m1.train_rmse <- sqrt(mean(resid(m1)^2))


# test set performance: 
df6.predictions_m1 <- 
  data.frame(ed_visits = df5.ed_visits_busy_hours[-v1_train_index, 7], 
             predicted = predict(m1, 
                                 newdata = df5.ed_visits_busy_hours[-v1_train_index, ])) 

m1.test_rmse <- sqrt(mean((df6.predictions_m1$predicted - df6.predictions_m1$ed_visits)^2, 
                          na.rm = TRUE))

m2: Without interaction between hour and weekday

set.seed(11)
v1_train_index <- createDataPartition(df5.ed_visits_busy_hours$ed_visits, 
                                      p = 0.8, 
                                      list = FALSE)

# fit model: 
m2 <- lm(ed_visits ~ hour + weekday + years_from_2017 + 
           lag_ed_visits + lag2_ed_visits,  
         data = df5.ed_visits_busy_hours[v1_train_index, ])

summary(m2)
## 
## Call:
## lm(formula = ed_visits ~ hour + weekday + years_from_2017 + lag_ed_visits + 
##     lag2_ed_visits, data = df5.ed_visits_busy_hours[v1_train_index, 
##     ])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.2011 -2.1184 -0.1705  1.9225 12.5830 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       5.010342   0.141715  35.355  < 2e-16 ***
## hour0800 - 0859   2.081330   0.162303  12.824  < 2e-16 ***
## hour0900 - 0959   4.846651   0.167737  28.894  < 2e-16 ***
## hour1000 - 1059   5.646275   0.178864  31.567  < 2e-16 ***
## hour1100 - 1159   6.219017   0.190368  32.668  < 2e-16 ***
## hour1200 - 1259   5.419069   0.197054  27.500  < 2e-16 ***
## hour1300 - 1359   5.246597   0.197434  26.574  < 2e-16 ***
## hour1400 - 1459   5.083131   0.193885  26.217  < 2e-16 ***
## hour1500 - 1559   4.619737   0.191645  24.106  < 2e-16 ***
## hour1600 - 1659   4.448185   0.190135  23.395  < 2e-16 ***
## hour1700 - 1759   3.945781   0.188295  20.955  < 2e-16 ***
## hour1800 - 1859   3.691394   0.184915  19.963  < 2e-16 ***
## hour1900 - 1959   3.668444   0.183497  19.992  < 2e-16 ***
## hour2000 - 2059   3.970795   0.180473  22.002  < 2e-16 ***
## hour2100 - 2159   2.790615   0.181540  15.372  < 2e-16 ***
## hour2200 - 2259   1.247362   0.180998   6.892 5.79e-12 ***
## hour2300 - 2359  -0.195626   0.173514  -1.127 0.259579    
## weekdayTuesday   -0.537365   0.103896  -5.172 2.35e-07 ***
## weekdayWednesday -0.668682   0.103947  -6.433 1.30e-10 ***
## weekdayThursday  -0.621549   0.103993  -5.977 2.34e-09 ***
## weekdayFriday    -0.421446   0.104005  -4.052 5.11e-05 ***
## weekdaySaturday  -0.507863   0.103628  -4.901 9.67e-07 ***
## weekdaySunday    -0.020989   0.103149  -0.203 0.838758    
## years_from_2017   0.248449   0.037638   6.601 4.25e-11 ***
## lag_ed_visits     0.062136   0.009117   6.815 9.86e-12 ***
## lag2_ed_visits    0.031586   0.009296   3.398 0.000682 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.056 on 12188 degrees of freedom
## Multiple R-squared:  0.3131, Adjusted R-squared:  0.3117 
## F-statistic: 222.2 on 25 and 12188 DF,  p-value: < 2.2e-16
# diagnostics
resid(m2) %>% hist

par(mfrow = c(2,2))
plot(m2)

par(mfrow = c(1,1))


glance(m2) %>% 
  kable() %>% 
  kable_styling(bootstrap_options = c("striped",
                                      "condensed", 
                                      "responsive"))
r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC deviance df.residual
0.3131248 0.3117159 3.056399 222.2451 0 26 -30963.84 61981.68 62181.76 113855.1 12188
# tidy(m2)

# actual vs predicted values: 
augment(m2) %>% 
  ggplot(aes(x = .fitted, 
             y = ed_visits)) + 
  geom_point() + 
  scale_x_continuous(limits = c(0, 25)) + 
  scale_y_continuous(limits = c(0, 25)) + 
  
  geom_smooth() + 
  geom_abline(slope = 1, 
              intercept = 0) + 
  
  theme_light() +
  theme(panel.grid.minor = element_line(colour = "grey95"), 
        panel.grid.major = element_line(colour = "grey95"))
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# predict(m2, interval = "prediction")

m2.train_rmse <- sqrt(mean(resid(m2)^2))


# test set performance: 
df7.predictions_m2 <- 
  data.frame(ed_visits = df5.ed_visits_busy_hours[-v1_train_index, 7], 
             predicted = predict(m2, 
                                 newdata = df5.ed_visits_busy_hours[-v1_train_index, ])) 

m2.test_rmse <- sqrt(mean((df7.predictions_m2$predicted - df7.predictions_m2$ed_visits)^2, 
                          na.rm = TRUE))

m3: Without interaction between hour and weekday: POISSON model

set.seed(11)
v1_train_index <- createDataPartition(df5.ed_visits_busy_hours$ed_visits, 
                                      p = 0.8, 
                                      list = FALSE)

m3 <- glm(ed_visits ~ hour + weekday + years_from_2017 + 
           lag_ed_visits + lag2_ed_visits, 
         data = df5.ed_visits_busy_hours[v1_train_index, ], 
         family = "poisson")

summary(m3)
## 
## Call:
## glm(formula = ed_visits ~ hour + weekday + years_from_2017 + 
##     lag_ed_visits + lag2_ed_visits, family = "poisson", data = df5.ed_visits_busy_hours[v1_train_index, 
##     ])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.7011  -0.7357  -0.0591   0.6166   3.5011  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       1.6179029  0.0188134  85.997  < 2e-16 ***
## hour0800 - 0859   0.3482760  0.0216671  16.074  < 2e-16 ***
## hour0900 - 0959   0.6690587  0.0208461  32.095  < 2e-16 ***
## hour1000 - 1059   0.7410579  0.0215607  34.371  < 2e-16 ***
## hour1100 - 1159   0.7872640  0.0224738  35.030  < 2e-16 ***
## hour1200 - 1259   0.7177184  0.0232659  30.848  < 2e-16 ***
## hour1300 - 1359   0.7027531  0.0233537  30.092  < 2e-16 ***
## hour1400 - 1459   0.6883853  0.0230669  29.843  < 2e-16 ***
## hour1500 - 1559   0.6451589  0.0229803  28.074  < 2e-16 ***
## hour1600 - 1659   0.6288517  0.0228981  27.463  < 2e-16 ***
## hour1700 - 1759   0.5785349  0.0228909  25.274  < 2e-16 ***
## hour1800 - 1859   0.5518446  0.0226751  24.337  < 2e-16 ***
## hour1900 - 1959   0.5491589  0.0225756  24.325  < 2e-16 ***
## hour2000 - 2059   0.5809296  0.0221974  26.171  < 2e-16 ***
## hour2100 - 2159   0.4507247  0.0227291  19.830  < 2e-16 ***
## hour2200 - 2259   0.2473912  0.0235359  10.511  < 2e-16 ***
## hour2300 - 2359   0.0017426  0.0241234   0.072  0.94241    
## weekdayTuesday   -0.0560865  0.0110444  -5.078 3.81e-07 ***
## weekdayWednesday -0.0707288  0.0110963  -6.374 1.84e-10 ***
## weekdayThursday  -0.0651128  0.0110950  -5.869 4.39e-09 ***
## weekdayFriday    -0.0437700  0.0110278  -3.969 7.22e-05 ***
## weekdaySaturday  -0.0529417  0.0109939  -4.816 1.47e-06 ***
## weekdaySunday    -0.0016563  0.0107871  -0.154  0.87797    
## years_from_2017   0.0265133  0.0040157   6.602 4.04e-11 ***
## lag_ed_visits     0.0062641  0.0009522   6.578 4.75e-11 ***
## lag2_ed_visits    0.0031998  0.0009760   3.278  0.00104 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 18472  on 12213  degrees of freedom
## Residual deviance: 12455  on 12188  degrees of freedom
## AIC: 61386
## 
## Number of Fisher Scoring iterations: 4
# diagnostics
resid(m3) %>% hist

par(mfrow = c(2,2))
plot(m3)

par(mfrow = c(1,1))



glance(m3) %>% 
  kable() %>% 
  kable_styling(bootstrap_options = c("striped",
                                      "condensed", 
                                      "responsive"))
null.deviance df.null logLik AIC BIC deviance df.residual
18471.63 12213 -30666.77 61385.55 61578.21 12455.22 12188
# Actual vs predicted vals 
augment(m3) %>% 
  ggplot(aes(x = .fitted, 
             y = ed_visits)) + 
  geom_point() + 
  scale_x_continuous(limits = c(0, 5)) + 
  scale_y_continuous(limits = c(0, 25)) + 
  
  geom_smooth() + 
  geom_abline(slope = 1, 
              intercept = 0) + 
  
  theme_light() +
  theme(panel.grid.minor = element_line(colour = "grey95"), 
        panel.grid.major = element_line(colour = "grey95"))
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# 5) summary of models --------------------------------------

Summary of models

  • Poisson model is clearly wildly inappropriate

  • Between m1 and m2, m2 is simpler with similar adj-Rsqrd, and similar test RMSE

  • However, m1 includes several significant interactions, and studying interactions is one of the key goals in this analysis. To look at it another way, m1 is more complex, but is probably not overfitting much.

  • Let’s go with m1 here.  

df8.model.performance <- 
  data.frame(model = c("year + weekday + hour + hour:weekday + lag + lag2", 
                       "year + weekday + hour + hour:weekday + lag + lag2", 
                       "year + weekday + hour + lag + lag2", 
                       "year + weekday + hour + lag + lag2"), 
             metric = rep(c("Train RMSE", 
                            "Test RMSE"), 2), 
             value = c(m1.train_rmse, 
                       m1.test_rmse, 
                       m2.train_rmse, 
                       m2.test_rmse)) 


df8.model.performance %>% 
  kable() %>% 
  kable_styling(bootstrap_options = c("striped",
                                      "condensed", 
                                      "responsive"))
model metric value
year + weekday + hour + hour:weekday + lag + lag2 Train RMSE 3.033142
year + weekday + hour + hour:weekday + lag + lag2 Test RMSE 3.089874
year + weekday + hour + lag + lag2 Train RMSE 3.053144
year + weekday + hour + lag + lag2 Test RMSE 3.083873

Train selected model on full dataset

m4.full_dataset <- lm(ed_visits ~ hour + weekday + years_from_2017 + 
                        lag_ed_visits + lag2_ed_visits + hour:weekday, 
                      data = df5.ed_visits_busy_hours)


glance(m4.full_dataset) %>% 
  kable() %>% 
  kable_styling(bootstrap_options = c("striped",
                                      "condensed", 
                                      "responsive"))
r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC deviance df.residual
0.3182033 0.3127558 3.054163 58.41239 0 122 -38644.85 77535.69 78474.6 141261.9 15144
df9.coeffs <- 
  tidy(m4.full_dataset) %>% 
  mutate(lower_ci = estimate - 1.96 * std.error, 
         upper_ci = estimate + 1.96 * std.error, 
         is_signif_0.10 = ifelse(p.value < 0.10, 
                                 1, 0)) %>% 
  
  dplyr::select(term, 
                lower_ci, 
                estimate, 
                upper_ci, 
                everything()) 

df9.coeffs %>% 
  datatable() %>% 
  formatRound(2:7, 2)

 

# 6) pre-processing before visualization: -----------

Pre-processing before visualization

df10.sig_interactions <- 
  df9.coeffs %>% 
  filter(grepl(":", term), 
         is_signif_0.10 == 1) %>% 
  mutate(hour = substr(term, 1, 15)) %>% 
  select(term, 
         hour, 
         estimate)


df11.hour_effects <- 
  df9.coeffs %>% 
  filter(!grepl(":", term), 
         grepl("hour", term)) %>% 
  
  left_join(df10.sig_interactions, 
            by = c("term" = "hour")) %>% 
  
  mutate(is_interaction = ifelse(is.na(estimate.y), 
                                       "No", "Yes") %>% 
           as.factor) %>% 
  
  select(term, 
         lower_ci, 
         estimate.x, 
         upper_ci, 
         is_interaction)

Visual of hour effects

# 7) visuals of hour and day of week effects ----------

df11.hour_effects %>% 
  
  mutate(term = substring(term, 5) %>% as.factor()) %>% 
  
  ggplot()  +
  geom_pointrange(aes(x = term, 
                      ymin = lower_ci, 
                      ymax = upper_ci, 
                      y = estimate.x, 
                      col = is_interaction)) + 
  geom_hline(yintercept = 0) + 
  
  scale_y_continuous(limits = c(-5, 10), 
                     breaks = seq(-5, 10, 3)) + 
  
  scale_color_manual(values = c("black", 
                               "red")) + 
  
  labs(x = "Hour of day", 
       y = "Difference in average hourly ED visits" ,
       title = "LGH ED \nImpact of Hour of Day on average daily ED visits", 
       subtitle = "These estimates control for year and day-of-week, allowing us to \nisolate hourly effects from other factors and from statistical noise \n\nBaseline - 0700 to 0759 on Monday", 
       caption = "\n\nNote: our model accounts for 30% of the variation\nin hourly ED visits between 7 AM and midnight", 
       col = "Varies by weekday?") + 
  
  theme_light(base_size = 12) +
  theme(panel.grid.minor = element_line(colour = "grey95"), 
        panel.grid.major = element_line(colour = "grey95"), 
        axis.text.x = element_text(angle = 45, 
                                   hjust = 1)
        )

     

Prediction intervals for illustrative data

Import illustrative data to predict on. Note that all lagged ed_visits values are set to the overall mean for the corresponding hour of day (see df3.mean_and_sd)

df12.predict_intervals <- 
  read_csv(here::here("data", 
                      "2019-06-30_illustrative-data-for-prediction-intervals.csv")) %>% 
  mutate(weekday = fct_relevel(weekday, 
                               levels = c("Monday", 
                                          "Tuesday", 
                                          "Wednesday", 
                                          "Thursday", 
                                          "Friday",
                                          "Saturday", 
                                          "Sunday")), 
         # years_from_2017 = as.factor(years_from_2017), 
         year = as.factor(year), 
         month = as.factor(month), 
         hour = as.factor(hour)) %>% 
  
  # only MOnday: 
  filter(weekday == "Monday") %>% 
  
  # identify hours with significant weekday effects: 
  left_join(df10.sig_interactions %>% 
              mutate(hour = substr(hour, 5, 15)) %>% 
              select(hour) %>% 
              unique() %>% 
              mutate(is_interaction = 1)) %>% 
  
  replace(is.na(.), 0) %>% 
  mutate(is_interaction = as.factor(is_interaction))
## Parsed with column specification:
## cols(
##   date = col_character(),
##   hour = col_character(),
##   years_from_2017 = col_double(),
##   month = col_double(),
##   year = col_double(),
##   weekday = col_character(),
##   ed_visits = col_double(),
##   lag_ed_visits = col_double(),
##   lag2_ed_visits = col_double(),
##   lag3_ed_visits = col_double(),
##   lag4_ed_visits = col_double(),
##   lag5_ed_visits = col_double(),
##   lag6_ed_visits = col_double(),
##   hour_int = col_double()
## )
## Warning: Outer names are only allowed for unnamed scalar atomic inputs
## Joining, by = "hour"
## Warning: Column `hour` joining factor and character vector, coercing into
## character vector
# add predictions from model m1: 
df12.predict_intervals <- 
  df12.predict_intervals %>% 
  bind_cols(predict(m1, 
                    newdata = df12.predict_intervals,
                    interval = "predict") %>% 
              as.data.frame()) %>% 
  
  
  
# group and nest: 
  group_by(weekday) %>% 
  nest() %>% 

# graphs of time of day ed_visits, by weekday: 
  mutate(hourly_visits = map2(data, 
                              weekday, 
                              function(data, weekday){
                               ggplot(data) + 
                                 geom_pointrange(aes(x = hour, 
                                                     y = fit, 
                                                     ymin = lwr, 
                                                     ymax = upr, 
                                                     col = is_interaction)) + 
                                  labs(title = paste0(as.character(weekday), " - 2019"), 
                                       subtitle = "Predicted ED visits by hour of day, after accounting for year and weekday effects", 
                                       x = "Hour", 
                                       y = "ED visits", 
                                       col = "Varies by weekday?") + 
                                  
                                  scale_y_continuous(limits = c(-2, 20), 
                                                     breaks = seq(-2, 20, 2)) + 
                                  
                                  scale_color_manual(values = c("black", 
                                                                "red")) + 
                                  
                                  theme_light() +
                                  theme(panel.grid.minor = element_line(colour = "grey95"), 
                                      panel.grid.major = element_line(colour = "grey95"), 
                                      axis.text.x = element_text(angle = 45, 
                                                                 hjust = 1))
                                      
                                  
                                 
                             }
                             ))
  
  
df12.predict_intervals$hourly_visits    
## [[1]]

# 8) write outputs: -----------------------------
# write_csv(df9.coeffs,
#           here::here("results", 
#                      "dst", 
#                     "2019-06-28_lgh_ed-visits-by-hour_regression-coeffs.csv"))