library(tidyverse)
library(here)
library(odbc)
library(DBI)
library(lubridate)
library(ggbeeswarm)
library(DT)
library(broom)
library(caret)
library(kableExtra)
library(scales)
library(plotly)
knitr::opts_chunk$set(dev = "png",
cache = TRUE)
# 1) set up database connections and import functions: -----------
source(here::here("src",
"setup-denodo_function.R"))
source(here::here("src",
"ed-visits-by-hour-denodo_function.R"))
setup_denodo()
# 2) pull ed data: -----------
df1.ed_visits <- extract_ed_visits_by_hour("20170101", # todo: earlier start?
"20190617")
## Warning: Column `interval_1_hour_at_start_date`/`hours` joining character
## vector and factor, coercing into character vector
df2.ed_visits_cleaned <-
df1.ed_visits %>%
mutate(weekday = weekdays(date),
month = month(date),
year = year(date),
years_from_2017 = year - 2017) %>%
# fiddle with factors:
mutate(weekday = fct_relevel(weekday,
levels = c("Monday",
"Tuesday",
"Wednesday",
"Thursday",
"Friday",
"Saturday",
"Sunday")),
# years_from_2017 = as.factor(years_from_2017),
year = as.factor(year),
month = as.factor(month),
interval_1_hour_at_start_date = as.factor(interval_1_hour_at_start_date)) %>%
rename(ed_visits = value,
hour = interval_1_hour_at_start_date) %>%
mutate(lag_ed_visits = lag(ed_visits),
lag2_ed_visits = lag(ed_visits, 2),
lag3_ed_visits = lag(ed_visits, 3),
lag4_ed_visits = lag(ed_visits, 4),
lag5_ed_visits = lag(ed_visits, 5),
lag6_ed_visits = lag(ed_visits, 6)) %>%
replace(is.na(.), 0) %>%
select(date,
hour,
years_from_2017,
month,
year,
weekday,
ed_visits,
lag_ed_visits,
lag2_ed_visits,
lag3_ed_visits,
lag4_ed_visits,
lag5_ed_visits,
lag6_ed_visits)
## Warning: Outer names are only allowed for unnamed scalar atomic inputs
# add hour_int column:
days <- df2.ed_visits_cleaned$date %>% unique() %>% length()
df2.ed_visits_cleaned <-
df2.ed_visits_cleaned %>%
mutate(hour_int = rep(1:24,
days))
# result:
str(df2.ed_visits_cleaned)
## Classes 'tbl_df', 'tbl' and 'data.frame': 21552 obs. of 14 variables:
## $ date : Date, format: "2017-01-01" "2017-01-01" ...
## $ hour : Factor w/ 24 levels "0000 - 0059",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ years_from_2017: num 0 0 0 0 0 0 0 0 0 0 ...
## $ month : Factor w/ 12 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ year : Factor w/ 3 levels "2017","2018",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ weekday : Factor w/ 7 levels "Monday","Tuesday",..: 7 7 7 7 7 7 7 7 7 7 ...
## $ ed_visits : num 6 6 4 4 2 0 4 8 10 8 ...
## $ lag_ed_visits : num 0 6 6 4 4 2 0 4 8 10 ...
## $ lag2_ed_visits : num 0 0 6 6 4 4 2 0 4 8 ...
## $ lag3_ed_visits : num 0 0 0 6 6 4 4 2 0 4 ...
## $ lag4_ed_visits : num 0 0 0 0 6 6 4 4 2 0 ...
## $ lag5_ed_visits : num 0 0 0 0 0 6 6 4 4 2 ...
## $ lag6_ed_visits : num 0 0 0 0 0 0 6 6 4 4 ...
## $ hour_int : int 1 2 3 4 5 6 7 8 9 10 ...
df2.ed_visits_cleaned %>%
head(100) %>%
datatable()
# mean and sd:
df3.mean_and_sd <-
df2.ed_visits_cleaned %>%
group_by(hour) %>%
summarise(mean_visits = mean(ed_visits,
na.rm = TRUE),
sd_visits = sd(ed_visits,
na.rm = TRUE))
df3.mean_and_sd %>%
datatable() %>%
formatRound(2:3, 1)
Mean and variance aren’t really close for most hours. Does this mean that arrival rate isn’t really Poisson-distributed?
# 3) plots: ------------
# facet by year
df2.ed_visits_cleaned %>%
ggplot(aes(x = hour,
y = ed_visits)) +
geom_boxplot() +
facet_wrap(~year) +
theme_light() +
theme(panel.grid.minor = element_line(colour = "grey95"),
panel.grid.major = element_line(colour = "grey95"),
axis.text.x = element_text(angle = 45,
hjust = 1))
# facet by weekday
df2.ed_visits_cleaned %>%
ggplot(aes(x = hour,
y = ed_visits)) +
geom_boxplot() +
facet_wrap(~weekday) +
theme_light() +
theme(panel.grid.minor = element_line(colour = "grey95"),
panel.grid.major = element_line(colour = "grey95"),
axis.text.x = element_text(angle = 45,
hjust = 1))
df2.ed_visits_cleaned %>%
ggplot(aes(x = year,
y = ed_visits)) +
geom_boxplot() +
facet_wrap(~weekday) +
labs(title = "hourly arrival rate by year") +
theme_light() +
theme(panel.grid.minor = element_line(colour = "grey95"),
panel.grid.major = element_line(colour = "grey95"),
axis.text.x = element_text(angle = 45,
hjust = 1))
# density by year:
df2.ed_visits_cleaned %>%
ggplot(aes(x = ed_visits)) +
geom_density() +
facet_wrap(~year) +
theme_light() +
theme(panel.grid.minor = element_line(colour = "grey95"),
panel.grid.major = element_line(colour = "grey95"),
axis.text.x = element_text(angle = 45,
hjust = 1))
Unfortunately, this is very non-normal.
Hypothesis: this is a mixture of two normal distributions: one for “late-night” rates, one for rest of the day.
Let’s start by analyzing visits over the period 7 AM to midnight. This seems to give a reasonable distribution.
# density for "busy" hours:
df2.ed_visits_cleaned %>%
filter(hour_int >= 8,
hour_int <= 24) %>%
ggplot(aes(x = ed_visits)) +
geom_density() +
facet_wrap(~year) +
theme_light() +
theme(panel.grid.minor = element_line(colour = "grey95"),
panel.grid.major = element_line(colour = "grey95"),
axis.text.x = element_text(angle = 45,
hjust = 1))
# compare with poisson curves:
# mean_visits_from_5am <-
# df2.ed_visits_cleaned %>%
# filter(year == "2018",
# hour_int >= 5,
# hour_int <= 24) %>%
# pull(ed_visits) %>%
# mean(na.rm = TRUE)
#
# x <- rpois(1000, mean_visits_from_5am)
# x %>% density() %>% plot(main = paste("Poisson with mean", round(mean_visits_from_5am, 2)))
df4.nest_by_hour <-
df2.ed_visits_cleaned %>%
filter(year == "2018") %>%
group_by(hour_int) %>%
nest %>%
mutate(density = map2(data,
hour_int,
function(x, y){
ggplot(x,
aes(x = ed_visits)) +
geom_density() +
labs(title = y %>% as.character())
}))
df4.nest_by_hour$density
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
##
## [[6]]
##
## [[7]]
##
## [[8]]
##
## [[9]]
##
## [[10]]
##
## [[11]]
##
## [[12]]
##
## [[13]]
##
## [[14]]
##
## [[15]]
##
## [[16]]
##
## [[17]]
##
## [[18]]
##
## [[19]]
##
## [[20]]
##
## [[21]]
##
## [[22]]
##
## [[23]]
##
## [[24]]
df5.ed_visits_busy_hours <-
df2.ed_visits_cleaned %>%
filter(hour_int >= 8)
df5.ed_visits_busy_hours %>%
head(50) %>%
datatable()
# set colours for days of week
x <- seq(0, 1, length.out = 7)
cols <- seq_gradient_pal(low = "blue",
high = "red")(x)
# hour of day, split by day of week
p <-
df2.ed_visits_cleaned %>%
group_by(hour,
weekday) %>%
filter(hour_int >= 8,
hour_int <= 24) %>%
summarize(mean_visits = mean(ed_visits, na.rm = T)) %>%
ggplot(aes(x = hour,
y = mean_visits,
group = weekday,
col = weekday)) +
geom_line() +
scale_color_manual(values = cols) +
theme_light() +
theme(panel.grid.minor = element_line(colour = "grey95"),
panel.grid.major = element_line(colour = "grey95"),
axis.text.x = element_text(angle = 45,
hjust = 1)); p
# ggplotly(p)
df2.ed_visits_cleaned %>%
filter(year %in% c("2018")) %>%
ggplot(aes(x = hour,
y = ed_visits)) +
geom_boxplot() +
facet_wrap(~weekday) +
labs(title = "2018") +
theme_light() +
theme(panel.grid.minor = element_line(colour = "grey95"),
panel.grid.major = element_line(colour = "grey95"),
axis.text.x = element_text(angle = 45,
hjust = 1))
# simple average by hour of day
df2.ed_visits_cleaned %>%
group_by(hour) %>%
summarise(mean_visits = mean(ed_visits,
na.rm = TRUE)) %>%
ggplot(aes(x = hour,
y = mean_visits)) +
geom_point(size = 5,
col = "dodgerblue4") +
labs(title = "2018") +
theme_light() +
theme(panel.grid.minor = element_line(colour = "grey95"),
panel.grid.major = element_line(colour = "grey95"),
axis.text.x = element_text(angle = 45,
hjust = 1)
)
# avg ED visits by weekday AND hour
# this is the type of plot that I am arguing against - it's all noise
df2.ed_visits_cleaned %>%
filter(year == "2018") %>%
group_by(hour,
weekday) %>%
summarise(mean_visits = mean(ed_visits,
na.rm = TRUE)) %>%
ggplot(aes(x = hour,
y = mean_visits)) +
geom_col(fill = "dodgerblue4") +
facet_wrap(~weekday) +
labs(title = "Is there really any point in looking at graphs like this?") +
theme_light() +
theme(panel.grid.minor = element_line(colour = "grey95"),
panel.grid.major = element_line(colour = "grey95"),
axis.text.x = element_text(angle = 45,
hjust = 1))
# 4) regression model 1: ----
set.seed(11)
v1_train_index <- createDataPartition(df5.ed_visits_busy_hours$ed_visits,
p = 0.8,
list = FALSE)
# fit model:
m1 <- lm(ed_visits ~ hour + weekday + years_from_2017 +
lag_ed_visits + lag2_ed_visits + hour:weekday,
data = df5.ed_visits_busy_hours[v1_train_index, ])
summary(m1)
##
## Call:
## lm(formula = ed_visits ~ hour + weekday + years_from_2017 + lag_ed_visits +
## lag2_ed_visits + hour:weekday, data = df5.ed_visits_busy_hours[v1_train_index,
## ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.6194 -2.0872 -0.1588 1.9166 12.7806
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.805911 0.307834 15.612 < 2e-16 ***
## hour0800 - 0859 2.255010 0.431560 5.225 1.77e-07 ***
## hour0900 - 0959 5.252166 0.427673 12.281 < 2e-16 ***
## hour1000 - 1059 6.196535 0.443487 13.972 < 2e-16 ***
## hour1100 - 1159 7.126052 0.438067 16.267 < 2e-16 ***
## hour1200 - 1259 6.497693 0.450907 14.410 < 2e-16 ***
## hour1300 - 1359 5.660915 0.443696 12.759 < 2e-16 ***
## hour1400 - 1459 5.962379 0.444887 13.402 < 2e-16 ***
## hour1500 - 1559 4.834981 0.440112 10.986 < 2e-16 ***
## hour1600 - 1659 4.547910 0.442771 10.271 < 2e-16 ***
## hour1700 - 1759 4.306518 0.449075 9.590 < 2e-16 ***
## hour1800 - 1859 3.375277 0.438238 7.702 1.44e-14 ***
## hour1900 - 1959 4.002728 0.443050 9.034 < 2e-16 ***
## hour2000 - 2059 4.252633 0.425078 10.004 < 2e-16 ***
## hour2100 - 2159 2.773498 0.433704 6.395 1.67e-10 ***
## hour2200 - 2259 0.690483 0.449060 1.538 0.12417
## hour2300 - 2359 -0.964185 0.426158 -2.263 0.02368 *
## weekdayTuesday -0.301565 0.431123 -0.699 0.48426
## weekdayWednesday -0.773018 0.431116 -1.793 0.07299 .
## weekdayThursday -0.324062 0.423061 -0.766 0.44370
## weekdayFriday 0.002037 0.431117 0.005 0.99623
## weekdaySaturday -0.153665 0.432214 -0.356 0.72220
## weekdaySunday 0.262850 0.426982 0.616 0.53817
## years_from_2017 0.248527 0.037571 6.615 3.87e-11 ***
## lag_ed_visits 0.059935 0.009151 6.550 6.00e-11 ***
## lag2_ed_visits 0.030412 0.009326 3.261 0.00111 **
## hour0800 - 0859:weekdayTuesday -0.554164 0.604708 -0.916 0.35947
## hour0900 - 0959:weekdayTuesday -0.289981 0.606192 -0.478 0.63240
## hour1000 - 1059:weekdayTuesday -1.071022 0.610004 -1.756 0.07915 .
## hour1100 - 1159:weekdayTuesday -0.615698 0.600667 -1.025 0.30537
## hour1200 - 1259:weekdayTuesday -1.675616 0.609188 -2.751 0.00596 **
## hour1300 - 1359:weekdayTuesday -0.319355 0.602012 -0.530 0.59579
## hour1400 - 1459:weekdayTuesday -0.539365 0.613374 -0.879 0.37923
## hour1500 - 1559:weekdayTuesday 0.431069 0.607128 0.710 0.47771
## hour1600 - 1659:weekdayTuesday -0.055270 0.609812 -0.091 0.92779
## hour1700 - 1759:weekdayTuesday -0.282050 0.609467 -0.463 0.64353
## hour1800 - 1859:weekdayTuesday 0.509488 0.603131 0.845 0.39827
## hour1900 - 1959:weekdayTuesday -0.557356 0.611309 -0.912 0.36192
## hour2000 - 2059:weekdayTuesday -0.349433 0.595273 -0.587 0.55721
## hour2100 - 2159:weekdayTuesday 0.175640 0.603292 0.291 0.77095
## hour2200 - 2259:weekdayTuesday 0.259565 0.613413 0.423 0.67219
## hour2300 - 2359:weekdayTuesday 0.948124 0.601992 1.575 0.11529
## hour0800 - 0859:weekdayWednesday 0.043334 0.602109 0.072 0.94263
## hour0900 - 0959:weekdayWednesday 0.413774 0.602455 0.687 0.49221
## hour1000 - 1059:weekdayWednesday -0.640091 0.614410 -1.042 0.29753
## hour1100 - 1159:weekdayWednesday -0.908977 0.604918 -1.503 0.13296
## hour1200 - 1259:weekdayWednesday -1.057942 0.613784 -1.724 0.08480 .
## hour1300 - 1359:weekdayWednesday -0.216078 0.602043 -0.359 0.71967
## hour1400 - 1459:weekdayWednesday -0.140617 0.604074 -0.233 0.81593
## hour1500 - 1559:weekdayWednesday 0.117360 0.606212 0.194 0.84650
## hour1600 - 1659:weekdayWednesday 0.727571 0.603906 1.205 0.22831
## hour1700 - 1759:weekdayWednesday -0.002663 0.612249 -0.004 0.99653
## hour1800 - 1859:weekdayWednesday 0.792702 0.607467 1.305 0.19194
## hour1900 - 1959:weekdayWednesday 0.371652 0.612822 0.606 0.54422
## hour2000 - 2059:weekdayWednesday 0.063948 0.596685 0.107 0.91465
## hour2100 - 2159:weekdayWednesday 0.125080 0.599680 0.209 0.83478
## hour2200 - 2259:weekdayWednesday 1.048328 0.617875 1.697 0.08979 .
## hour2300 - 2359:weekdayWednesday 0.931876 0.599787 1.554 0.12029
## hour0800 - 0859:weekdayThursday -0.414179 0.603258 -0.687 0.49237
## hour0900 - 0959:weekdayThursday -0.288866 0.597400 -0.484 0.62872
## hour1000 - 1059:weekdayThursday -1.130530 0.601577 -1.879 0.06023 .
## hour1100 - 1159:weekdayThursday -1.575271 0.599317 -2.628 0.00859 **
## hour1200 - 1259:weekdayThursday -1.566167 0.604437 -2.591 0.00958 **
## hour1300 - 1359:weekdayThursday -0.613064 0.602419 -1.018 0.30885
## hour1400 - 1459:weekdayThursday -1.328793 0.601965 -2.207 0.02730 *
## hour1500 - 1559:weekdayThursday -0.224441 0.604818 -0.371 0.71058
## hour1600 - 1659:weekdayThursday -0.196888 0.601899 -0.327 0.74359
## hour1700 - 1759:weekdayThursday -0.272886 0.607373 -0.449 0.65323
## hour1800 - 1859:weekdayThursday 0.497301 0.596025 0.834 0.40409
## hour1900 - 1959:weekdayThursday -0.483426 0.604914 -0.799 0.42421
## hour2000 - 2059:weekdayThursday -0.182456 0.588754 -0.310 0.75664
## hour2100 - 2159:weekdayThursday 0.594252 0.595289 0.998 0.31817
## hour2200 - 2259:weekdayThursday 1.127335 0.607062 1.857 0.06333 .
## hour2300 - 2359:weekdayThursday 0.885577 0.598647 1.479 0.13909
## hour0800 - 0859:weekdayFriday -0.209296 0.607471 -0.345 0.73045
## hour0900 - 0959:weekdayFriday -0.871836 0.605375 -1.440 0.14985
## hour1000 - 1059:weekdayFriday -0.098513 0.614430 -0.160 0.87262
## hour1100 - 1159:weekdayFriday -1.019797 0.603332 -1.690 0.09100 .
## hour1200 - 1259:weekdayFriday -1.677521 0.609072 -2.754 0.00589 **
## hour1300 - 1359:weekdayFriday -0.850217 0.613032 -1.387 0.16550
## hour1400 - 1459:weekdayFriday -1.426260 0.609926 -2.338 0.01938 *
## hour1500 - 1559:weekdayFriday 0.030666 0.602543 0.051 0.95941
## hour1600 - 1659:weekdayFriday -0.080191 0.606860 -0.132 0.89488
## hour1700 - 1759:weekdayFriday -0.714537 0.614487 -1.163 0.24493
## hour1800 - 1859:weekdayFriday -0.193181 0.599836 -0.322 0.74741
## hour1900 - 1959:weekdayFriday -0.708031 0.610588 -1.160 0.24624
## hour2000 - 2059:weekdayFriday -0.834777 0.597402 -1.397 0.16234
## hour2100 - 2159:weekdayFriday -0.187389 0.603296 -0.311 0.75610
## hour2200 - 2259:weekdayFriday 0.620618 0.612116 1.014 0.31066
## hour2300 - 2359:weekdayFriday 0.943946 0.597690 1.579 0.11429
## hour0800 - 0859:weekdaySaturday -0.052465 0.607534 -0.086 0.93118
## hour0900 - 0959:weekdaySaturday -0.901767 0.607712 -1.484 0.13787
## hour1000 - 1059:weekdaySaturday -0.758609 0.612291 -1.239 0.21538
## hour1100 - 1159:weekdaySaturday -1.468632 0.601404 -2.442 0.01462 *
## hour1200 - 1259:weekdaySaturday -0.660572 0.610069 -1.083 0.27893
## hour1300 - 1359:weekdaySaturday -0.371911 0.601438 -0.618 0.53634
## hour1400 - 1459:weekdaySaturday -1.129492 0.604057 -1.870 0.06153 .
## hour1500 - 1559:weekdaySaturday -0.657950 0.603415 -1.090 0.27557
## hour1600 - 1659:weekdaySaturday -0.262677 0.608391 -0.432 0.66593
## hour1700 - 1759:weekdaySaturday -0.381246 0.611010 -0.624 0.53267
## hour1800 - 1859:weekdaySaturday -0.139971 0.602561 -0.232 0.81631
## hour1900 - 1959:weekdaySaturday -0.409307 0.610605 -0.670 0.50266
## hour2000 - 2059:weekdaySaturday -0.389122 0.598937 -0.650 0.51591
## hour2100 - 2159:weekdaySaturday -0.302403 0.602587 -0.502 0.61579
## hour2200 - 2259:weekdaySaturday 0.655608 0.615006 1.066 0.28644
## hour2300 - 2359:weekdaySaturday 1.195874 0.605114 1.976 0.04815 *
## hour0800 - 0859:weekdaySunday 0.037441 0.606746 0.062 0.95080
## hour0900 - 0959:weekdaySunday -0.833425 0.599444 -1.390 0.16445
## hour1000 - 1059:weekdaySunday 0.017108 0.602432 0.028 0.97735
## hour1100 - 1159:weekdaySunday -0.638865 0.596093 -1.072 0.28385
## hour1200 - 1259:weekdaySunday -0.703858 0.601346 -1.170 0.24183
## hour1300 - 1359:weekdaySunday -0.361447 0.602437 -0.600 0.54853
## hour1400 - 1459:weekdaySunday -1.387462 0.603780 -2.298 0.02158 *
## hour1500 - 1559:weekdaySunday -0.959727 0.598902 -1.602 0.10908
## hour1600 - 1659:weekdaySunday -0.658843 0.603872 -1.091 0.27528
## hour1700 - 1759:weekdaySunday -0.694827 0.610966 -1.137 0.25545
## hour1800 - 1859:weekdaySunday 0.968872 0.604573 1.603 0.10905
## hour1900 - 1959:weekdaySunday -0.376120 0.609155 -0.617 0.53695
## hour2000 - 2059:weekdaySunday -0.141726 0.589598 -0.240 0.81004
## hour2100 - 2159:weekdaySunday -0.107582 0.595998 -0.181 0.85676
## hour2200 - 2259:weekdaySunday 0.237607 0.612640 0.388 0.69814
## hour2300 - 2359:weekdaySunday 0.685388 0.595341 1.151 0.24965
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.048 on 12092 degrees of freedom
## Multiple R-squared: 0.3221, Adjusted R-squared: 0.3153
## F-statistic: 47.48 on 121 and 12092 DF, p-value: < 2.2e-16
# diagnostics
resid(m1) %>% hist
par(mfrow = c(2,2))
plot(m1)
par(mfrow = c(1,1))
glance(m1) %>%
kable() %>%
kable_styling(bootstrap_options = c("striped",
"condensed",
"responsive"))
| r.squared | adj.r.squared | sigma | statistic | p.value | df | logLik | AIC | BIC | deviance | df.residual |
|---|---|---|---|---|---|---|---|---|---|---|
| 0.3220953 | 0.3153118 | 3.048405 | 47.48194 | 0 | 122 | -30883.56 | 62013.12 | 62924.59 | 112368.2 | 12092 |
# tidy(m1)
augment(m1) %>%
ggplot(aes(x = .fitted,
y = ed_visits)) +
geom_point() +
scale_x_continuous(limits = c(0, 25)) +
scale_y_continuous(limits = c(0, 25)) +
geom_smooth() +
geom_abline(slope = 1,
intercept = 0) +
theme_light() +
theme(panel.grid.minor = element_line(colour = "grey95"),
panel.grid.major = element_line(colour = "grey95"))
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# predict(m1, interval = "prediction")
m1.train_rmse <- sqrt(mean(resid(m1)^2))
# test set performance:
df6.predictions_m1 <-
data.frame(ed_visits = df5.ed_visits_busy_hours[-v1_train_index, 7],
predicted = predict(m1,
newdata = df5.ed_visits_busy_hours[-v1_train_index, ]))
m1.test_rmse <- sqrt(mean((df6.predictions_m1$predicted - df6.predictions_m1$ed_visits)^2,
na.rm = TRUE))
set.seed(11)
v1_train_index <- createDataPartition(df5.ed_visits_busy_hours$ed_visits,
p = 0.8,
list = FALSE)
# fit model:
m2 <- lm(ed_visits ~ hour + weekday + years_from_2017 +
lag_ed_visits + lag2_ed_visits,
data = df5.ed_visits_busy_hours[v1_train_index, ])
summary(m2)
##
## Call:
## lm(formula = ed_visits ~ hour + weekday + years_from_2017 + lag_ed_visits +
## lag2_ed_visits, data = df5.ed_visits_busy_hours[v1_train_index,
## ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.2011 -2.1184 -0.1705 1.9225 12.5830
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.010342 0.141715 35.355 < 2e-16 ***
## hour0800 - 0859 2.081330 0.162303 12.824 < 2e-16 ***
## hour0900 - 0959 4.846651 0.167737 28.894 < 2e-16 ***
## hour1000 - 1059 5.646275 0.178864 31.567 < 2e-16 ***
## hour1100 - 1159 6.219017 0.190368 32.668 < 2e-16 ***
## hour1200 - 1259 5.419069 0.197054 27.500 < 2e-16 ***
## hour1300 - 1359 5.246597 0.197434 26.574 < 2e-16 ***
## hour1400 - 1459 5.083131 0.193885 26.217 < 2e-16 ***
## hour1500 - 1559 4.619737 0.191645 24.106 < 2e-16 ***
## hour1600 - 1659 4.448185 0.190135 23.395 < 2e-16 ***
## hour1700 - 1759 3.945781 0.188295 20.955 < 2e-16 ***
## hour1800 - 1859 3.691394 0.184915 19.963 < 2e-16 ***
## hour1900 - 1959 3.668444 0.183497 19.992 < 2e-16 ***
## hour2000 - 2059 3.970795 0.180473 22.002 < 2e-16 ***
## hour2100 - 2159 2.790615 0.181540 15.372 < 2e-16 ***
## hour2200 - 2259 1.247362 0.180998 6.892 5.79e-12 ***
## hour2300 - 2359 -0.195626 0.173514 -1.127 0.259579
## weekdayTuesday -0.537365 0.103896 -5.172 2.35e-07 ***
## weekdayWednesday -0.668682 0.103947 -6.433 1.30e-10 ***
## weekdayThursday -0.621549 0.103993 -5.977 2.34e-09 ***
## weekdayFriday -0.421446 0.104005 -4.052 5.11e-05 ***
## weekdaySaturday -0.507863 0.103628 -4.901 9.67e-07 ***
## weekdaySunday -0.020989 0.103149 -0.203 0.838758
## years_from_2017 0.248449 0.037638 6.601 4.25e-11 ***
## lag_ed_visits 0.062136 0.009117 6.815 9.86e-12 ***
## lag2_ed_visits 0.031586 0.009296 3.398 0.000682 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.056 on 12188 degrees of freedom
## Multiple R-squared: 0.3131, Adjusted R-squared: 0.3117
## F-statistic: 222.2 on 25 and 12188 DF, p-value: < 2.2e-16
# diagnostics
resid(m2) %>% hist
par(mfrow = c(2,2))
plot(m2)
par(mfrow = c(1,1))
glance(m2) %>%
kable() %>%
kable_styling(bootstrap_options = c("striped",
"condensed",
"responsive"))
| r.squared | adj.r.squared | sigma | statistic | p.value | df | logLik | AIC | BIC | deviance | df.residual |
|---|---|---|---|---|---|---|---|---|---|---|
| 0.3131248 | 0.3117159 | 3.056399 | 222.2451 | 0 | 26 | -30963.84 | 61981.68 | 62181.76 | 113855.1 | 12188 |
# tidy(m2)
# actual vs predicted values:
augment(m2) %>%
ggplot(aes(x = .fitted,
y = ed_visits)) +
geom_point() +
scale_x_continuous(limits = c(0, 25)) +
scale_y_continuous(limits = c(0, 25)) +
geom_smooth() +
geom_abline(slope = 1,
intercept = 0) +
theme_light() +
theme(panel.grid.minor = element_line(colour = "grey95"),
panel.grid.major = element_line(colour = "grey95"))
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# predict(m2, interval = "prediction")
m2.train_rmse <- sqrt(mean(resid(m2)^2))
# test set performance:
df7.predictions_m2 <-
data.frame(ed_visits = df5.ed_visits_busy_hours[-v1_train_index, 7],
predicted = predict(m2,
newdata = df5.ed_visits_busy_hours[-v1_train_index, ]))
m2.test_rmse <- sqrt(mean((df7.predictions_m2$predicted - df7.predictions_m2$ed_visits)^2,
na.rm = TRUE))
set.seed(11)
v1_train_index <- createDataPartition(df5.ed_visits_busy_hours$ed_visits,
p = 0.8,
list = FALSE)
m3 <- glm(ed_visits ~ hour + weekday + years_from_2017 +
lag_ed_visits + lag2_ed_visits,
data = df5.ed_visits_busy_hours[v1_train_index, ],
family = "poisson")
summary(m3)
##
## Call:
## glm(formula = ed_visits ~ hour + weekday + years_from_2017 +
## lag_ed_visits + lag2_ed_visits, family = "poisson", data = df5.ed_visits_busy_hours[v1_train_index,
## ])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.7011 -0.7357 -0.0591 0.6166 3.5011
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.6179029 0.0188134 85.997 < 2e-16 ***
## hour0800 - 0859 0.3482760 0.0216671 16.074 < 2e-16 ***
## hour0900 - 0959 0.6690587 0.0208461 32.095 < 2e-16 ***
## hour1000 - 1059 0.7410579 0.0215607 34.371 < 2e-16 ***
## hour1100 - 1159 0.7872640 0.0224738 35.030 < 2e-16 ***
## hour1200 - 1259 0.7177184 0.0232659 30.848 < 2e-16 ***
## hour1300 - 1359 0.7027531 0.0233537 30.092 < 2e-16 ***
## hour1400 - 1459 0.6883853 0.0230669 29.843 < 2e-16 ***
## hour1500 - 1559 0.6451589 0.0229803 28.074 < 2e-16 ***
## hour1600 - 1659 0.6288517 0.0228981 27.463 < 2e-16 ***
## hour1700 - 1759 0.5785349 0.0228909 25.274 < 2e-16 ***
## hour1800 - 1859 0.5518446 0.0226751 24.337 < 2e-16 ***
## hour1900 - 1959 0.5491589 0.0225756 24.325 < 2e-16 ***
## hour2000 - 2059 0.5809296 0.0221974 26.171 < 2e-16 ***
## hour2100 - 2159 0.4507247 0.0227291 19.830 < 2e-16 ***
## hour2200 - 2259 0.2473912 0.0235359 10.511 < 2e-16 ***
## hour2300 - 2359 0.0017426 0.0241234 0.072 0.94241
## weekdayTuesday -0.0560865 0.0110444 -5.078 3.81e-07 ***
## weekdayWednesday -0.0707288 0.0110963 -6.374 1.84e-10 ***
## weekdayThursday -0.0651128 0.0110950 -5.869 4.39e-09 ***
## weekdayFriday -0.0437700 0.0110278 -3.969 7.22e-05 ***
## weekdaySaturday -0.0529417 0.0109939 -4.816 1.47e-06 ***
## weekdaySunday -0.0016563 0.0107871 -0.154 0.87797
## years_from_2017 0.0265133 0.0040157 6.602 4.04e-11 ***
## lag_ed_visits 0.0062641 0.0009522 6.578 4.75e-11 ***
## lag2_ed_visits 0.0031998 0.0009760 3.278 0.00104 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 18472 on 12213 degrees of freedom
## Residual deviance: 12455 on 12188 degrees of freedom
## AIC: 61386
##
## Number of Fisher Scoring iterations: 4
# diagnostics
resid(m3) %>% hist
par(mfrow = c(2,2))
plot(m3)
par(mfrow = c(1,1))
glance(m3) %>%
kable() %>%
kable_styling(bootstrap_options = c("striped",
"condensed",
"responsive"))
| null.deviance | df.null | logLik | AIC | BIC | deviance | df.residual |
|---|---|---|---|---|---|---|
| 18471.63 | 12213 | -30666.77 | 61385.55 | 61578.21 | 12455.22 | 12188 |
# Actual vs predicted vals
augment(m3) %>%
ggplot(aes(x = .fitted,
y = ed_visits)) +
geom_point() +
scale_x_continuous(limits = c(0, 5)) +
scale_y_continuous(limits = c(0, 25)) +
geom_smooth() +
geom_abline(slope = 1,
intercept = 0) +
theme_light() +
theme(panel.grid.minor = element_line(colour = "grey95"),
panel.grid.major = element_line(colour = "grey95"))
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# 5) summary of models --------------------------------------
Poisson model is clearly wildly inappropriate
Between m1 and m2, m2 is simpler with similar adj-Rsqrd, and similar test RMSE
However, m1 includes several significant interactions, and studying interactions is one of the key goals in this analysis. To look at it another way, m1 is more complex, but is probably not overfitting much.
Let’s go with m1 here.
df8.model.performance <-
data.frame(model = c("year + weekday + hour + hour:weekday + lag + lag2",
"year + weekday + hour + hour:weekday + lag + lag2",
"year + weekday + hour + lag + lag2",
"year + weekday + hour + lag + lag2"),
metric = rep(c("Train RMSE",
"Test RMSE"), 2),
value = c(m1.train_rmse,
m1.test_rmse,
m2.train_rmse,
m2.test_rmse))
df8.model.performance %>%
kable() %>%
kable_styling(bootstrap_options = c("striped",
"condensed",
"responsive"))
| model | metric | value |
|---|---|---|
| year + weekday + hour + hour:weekday + lag + lag2 | Train RMSE | 3.033142 |
| year + weekday + hour + hour:weekday + lag + lag2 | Test RMSE | 3.089874 |
| year + weekday + hour + lag + lag2 | Train RMSE | 3.053144 |
| year + weekday + hour + lag + lag2 | Test RMSE | 3.083873 |
m4.full_dataset <- lm(ed_visits ~ hour + weekday + years_from_2017 +
lag_ed_visits + lag2_ed_visits + hour:weekday,
data = df5.ed_visits_busy_hours)
glance(m4.full_dataset) %>%
kable() %>%
kable_styling(bootstrap_options = c("striped",
"condensed",
"responsive"))
| r.squared | adj.r.squared | sigma | statistic | p.value | df | logLik | AIC | BIC | deviance | df.residual |
|---|---|---|---|---|---|---|---|---|---|---|
| 0.3182033 | 0.3127558 | 3.054163 | 58.41239 | 0 | 122 | -38644.85 | 77535.69 | 78474.6 | 141261.9 | 15144 |
df9.coeffs <-
tidy(m4.full_dataset) %>%
mutate(lower_ci = estimate - 1.96 * std.error,
upper_ci = estimate + 1.96 * std.error,
is_signif_0.10 = ifelse(p.value < 0.10,
1, 0)) %>%
dplyr::select(term,
lower_ci,
estimate,
upper_ci,
everything())
df9.coeffs %>%
datatable() %>%
formatRound(2:7, 2)
# 6) pre-processing before visualization: -----------
df10.sig_interactions <-
df9.coeffs %>%
filter(grepl(":", term),
is_signif_0.10 == 1) %>%
mutate(hour = substr(term, 1, 15)) %>%
select(term,
hour,
estimate)
df11.hour_effects <-
df9.coeffs %>%
filter(!grepl(":", term),
grepl("hour", term)) %>%
left_join(df10.sig_interactions,
by = c("term" = "hour")) %>%
mutate(is_interaction = ifelse(is.na(estimate.y),
"No", "Yes") %>%
as.factor) %>%
select(term,
lower_ci,
estimate.x,
upper_ci,
is_interaction)
# 7) visuals of hour and day of week effects ----------
df11.hour_effects %>%
mutate(term = substring(term, 5) %>% as.factor()) %>%
ggplot() +
geom_pointrange(aes(x = term,
ymin = lower_ci,
ymax = upper_ci,
y = estimate.x,
col = is_interaction)) +
geom_hline(yintercept = 0) +
scale_y_continuous(limits = c(-5, 10),
breaks = seq(-5, 10, 3)) +
scale_color_manual(values = c("black",
"red")) +
labs(x = "Hour of day",
y = "Difference in average hourly ED visits" ,
title = "LGH ED \nImpact of Hour of Day on average daily ED visits",
subtitle = "These estimates control for year and day-of-week, allowing us to \nisolate hourly effects from other factors and from statistical noise \n\nBaseline - 0700 to 0759 on Monday",
caption = "\n\nNote: our model accounts for 30% of the variation\nin hourly ED visits between 7 AM and midnight",
col = "Varies by weekday?") +
theme_light(base_size = 12) +
theme(panel.grid.minor = element_line(colour = "grey95"),
panel.grid.major = element_line(colour = "grey95"),
axis.text.x = element_text(angle = 45,
hjust = 1)
)
Import illustrative data to predict on. Note that all lagged ed_visits values are set to the overall mean for the corresponding hour of day (see df3.mean_and_sd)
df12.predict_intervals <-
read_csv(here::here("data",
"2019-06-30_illustrative-data-for-prediction-intervals.csv")) %>%
mutate(weekday = fct_relevel(weekday,
levels = c("Monday",
"Tuesday",
"Wednesday",
"Thursday",
"Friday",
"Saturday",
"Sunday")),
# years_from_2017 = as.factor(years_from_2017),
year = as.factor(year),
month = as.factor(month),
hour = as.factor(hour)) %>%
# only MOnday:
filter(weekday == "Monday") %>%
# identify hours with significant weekday effects:
left_join(df10.sig_interactions %>%
mutate(hour = substr(hour, 5, 15)) %>%
select(hour) %>%
unique() %>%
mutate(is_interaction = 1)) %>%
replace(is.na(.), 0) %>%
mutate(is_interaction = as.factor(is_interaction))
## Parsed with column specification:
## cols(
## date = col_character(),
## hour = col_character(),
## years_from_2017 = col_double(),
## month = col_double(),
## year = col_double(),
## weekday = col_character(),
## ed_visits = col_double(),
## lag_ed_visits = col_double(),
## lag2_ed_visits = col_double(),
## lag3_ed_visits = col_double(),
## lag4_ed_visits = col_double(),
## lag5_ed_visits = col_double(),
## lag6_ed_visits = col_double(),
## hour_int = col_double()
## )
## Warning: Outer names are only allowed for unnamed scalar atomic inputs
## Joining, by = "hour"
## Warning: Column `hour` joining factor and character vector, coercing into
## character vector
# add predictions from model m1:
df12.predict_intervals <-
df12.predict_intervals %>%
bind_cols(predict(m1,
newdata = df12.predict_intervals,
interval = "predict") %>%
as.data.frame()) %>%
# group and nest:
group_by(weekday) %>%
nest() %>%
# graphs of time of day ed_visits, by weekday:
mutate(hourly_visits = map2(data,
weekday,
function(data, weekday){
ggplot(data) +
geom_pointrange(aes(x = hour,
y = fit,
ymin = lwr,
ymax = upr,
col = is_interaction)) +
labs(title = paste0(as.character(weekday), " - 2019"),
subtitle = "Predicted ED visits by hour of day, after accounting for year and weekday effects",
x = "Hour",
y = "ED visits",
col = "Varies by weekday?") +
scale_y_continuous(limits = c(-2, 20),
breaks = seq(-2, 20, 2)) +
scale_color_manual(values = c("black",
"red")) +
theme_light() +
theme(panel.grid.minor = element_line(colour = "grey95"),
panel.grid.major = element_line(colour = "grey95"),
axis.text.x = element_text(angle = 45,
hjust = 1))
}
))
df12.predict_intervals$hourly_visits
## [[1]]
# 8) write outputs: -----------------------------
# write_csv(df9.coeffs,
# here::here("results",
# "dst",
# "2019-06-28_lgh_ed-visits-by-hour_regression-coeffs.csv"))