library(tidyverse)
## Warning: package 'tibble' was built under R version 3.6.1
## Warning: package 'dplyr' was built under R version 3.6.1
library(here)
library(odbc)
library(DBI)
library(lubridate)
library(ggbeeswarm)
library(DT)
library(broom)
library(caret)
library(kableExtra)
library(scales)
knitr::opts_chunk$set(dev = "png",
cache = TRUE)
# 1) set up database connections and import functions: -----------
source(here::here("src",
"setup-denodo_function.R"))
source(here::here("src",
"ed-visits-denodo_function.R"))
setup_denodo()
# 2) pull ed data: -----------
df1.ed_visits <- extract_ed_visits("20170101", # todo: earlier start?
"20190617")
df2.ed_visits_cleaned <-
df1.ed_visits %>%
mutate(date = ymd(date_id),
weekday = weekdays(date),
month = month(date),
year = year(date),
years_from_2017 = year - 2017) %>%
# fiddle with factors:
mutate(weekday = fct_relevel(weekday,
"Monday",
"Tuesday",
"Wednesday",
"Thursday",
"Friday",
"Saturday",
"Sunday"),
# years_from_2017 = as.factor(years_from_2017),
year = as.factor(year),
month = as.factor(month)) %>%
rename(ed_visits = value) %>%
mutate(lag_ed_visits = lag(ed_visits)) %>%
select(date,
years_from_2017,
month,
year,
weekday,
ed_visits,
lag_ed_visits)
str(df2.ed_visits_cleaned)
## Classes 'tbl_df', 'tbl' and 'data.frame': 898 obs. of 7 variables:
## $ date : Date, format: "2017-01-01" "2017-01-02" ...
## $ years_from_2017: num 0 0 0 0 0 0 0 0 0 0 ...
## $ month : Factor w/ 12 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ year : Factor w/ 3 levels "2017","2018",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ weekday : Factor w/ 7 levels "Monday","Tuesday",..: 7 1 2 3 4 5 6 7 1 2 ...
## $ ed_visits : int 188 194 192 189 164 187 181 166 175 186 ...
## $ lag_ed_visits : int NA 188 194 192 189 164 187 181 166 175 ...
df2.ed_visits_cleaned %>% datatable()
# mean and sd:
df3.mean_and_sd <-
df2.ed_visits_cleaned %>%
group_by(year,
weekday) %>%
summarise(mean_visits = mean(ed_visits),
sd_visits = sd(ed_visits))
df3.mean_and_sd %>%
datatable() %>%
formatRound(2:4, 2)
# 3) plots: ------------
# time series
df2.ed_visits_cleaned %>%
ggplot(aes(x = date,
y = ed_visits)) +
geom_line() +
geom_smooth() +
theme_light() +
theme(panel.grid.minor = element_line(colour = "grey95"),
panel.grid.major = element_line(colour = "grey95"))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# facet by year
df2.ed_visits_cleaned %>%
ggplot(aes(x = weekday,
y = ed_visits)) +
geom_beeswarm(alpha = .4) +
facet_wrap(~year) +
theme_light() +
theme(panel.grid.minor = element_line(colour = "grey95"),
panel.grid.major = element_line(colour = "grey95"),
axis.text.x = element_text(angle = 45,
hjust = 1))
df2.ed_visits_cleaned %>%
ggplot(aes(x = weekday,
y = ed_visits)) +
geom_boxplot() +
facet_wrap(~year) +
theme_light() +
theme(panel.grid.minor = element_line(colour = "grey95"),
panel.grid.major = element_line(colour = "grey95"),
axis.text.x = element_text(angle = 45,
hjust = 1))
# facet by weekday
df2.ed_visits_cleaned %>%
ggplot(aes(x = year,
y = ed_visits)) +
geom_beeswarm() +
facet_wrap(~weekday) +
theme_light() +
theme(panel.grid.minor = element_line(colour = "grey95"),
panel.grid.major = element_line(colour = "grey95"),
axis.text.x = element_text(angle = 45,
hjust = 1))
df2.ed_visits_cleaned %>%
ggplot(aes(x = year,
y = ed_visits)) +
geom_boxplot() +
facet_wrap(~weekday) +
theme_light() +
theme(panel.grid.minor = element_line(colour = "grey95"),
panel.grid.major = element_line(colour = "grey95"),
axis.text.x = element_text(angle = 45,
hjust = 1))
# density by year:
df2.ed_visits_cleaned %>%
ggplot(aes(x = ed_visits)) +
geom_density() +
facet_wrap(~year) +
theme_light() +
theme(panel.grid.minor = element_line(colour = "grey95"),
panel.grid.major = element_line(colour = "grey95"),
axis.text.x = element_text(angle = 45,
hjust = 1))
# checking normality:
for (i in 0:2) {
x <- df2.ed_visits_cleaned %>%
filter(years_from_2017 == i) %>%
pull(ed_visits)
qqnorm(x, main = paste("years from 2017 = ", i))
qqline(x, col = "red")
}
# 3.1) fitting normal distributions --------
fit2017 <-
df2.ed_visits_cleaned %>%
filter(year == "2017") %>%
pull(ed_visits) %>%
fitdistrplus::fitdist("norm")
# str(fit2017)
summary(fit2017)
## Fitting of the distribution ' norm ' by maximum likelihood
## Parameters :
## estimate Std. Error
## mean 175.67397 0.8113670
## sd 15.50114 0.5737231
## Loglikelihood: -1518.346 AIC: 3040.692 BIC: 3048.492
## Correlation matrix:
## mean sd
## mean 1.000000e+00 -2.646061e-08
## sd -2.646061e-08 1.000000e+00
plot(fit2017)
fit2018 <-
df2.ed_visits_cleaned %>%
filter(year == "2018") %>%
pull(ed_visits) %>%
fitdistrplus::fitdist("norm")
# str(fit2018)
summary(fit2018)
## Fitting of the distribution ' norm ' by maximum likelihood
## Parameters :
## estimate Std. Error
## mean 178.55068 0.8151148
## sd 15.57275 0.5763732
## Loglikelihood: -1520.028 AIC: 3044.056 BIC: 3051.856
## Correlation matrix:
## mean sd
## mean 1 0
## sd 0 1
plot(fit2018)
# variation by month (4 data points per cell):
df2.ed_visits_cleaned %>%
filter(weekday == "Monday",
year %in% c("2018")) %>%
ggplot(aes(x = weekday,
y = ed_visits)) +
geom_beeswarm() +
facet_wrap(~month) +
labs(title = "2018") +
theme_light() +
theme(panel.grid.minor = element_line(colour = "grey95"),
panel.grid.major = element_line(colour = "grey95"),
axis.text.x = element_text(angle = 45,
hjust = 1))
# simple average by day of week
df2.ed_visits_cleaned %>%
filter(year == "2018") %>%
group_by(weekday) %>%
summarise(mean_visits = mean(ed_visits)) %>%
ggplot(aes(x = weekday,
y = mean_visits ,
group = weekday)) +
geom_point(size = 5,
col = "dodgerblue4") +
labs(title = "2018") +
theme_light() +
theme(panel.grid.minor = element_line(colour = "grey95"),
panel.grid.major = element_line(colour = "grey95"))
# simple average by month
df2.ed_visits_cleaned %>%
filter(year == "2018") %>%
group_by(month) %>%
summarise(mean_visits = mean(ed_visits)) %>%
ggplot(aes(x = month,
y = mean_visits ,
group = month)) +
geom_point(size = 5,
col = "dodgerblue4") +
labs(title = "2018") +
theme_light() +
theme(panel.grid.minor = element_line(colour = "grey95"),
panel.grid.major = element_line(colour = "grey95"))
# "Seasonality" plot
# set
x <- seq(0, 1, length.out = 7)
cols <- seq_gradient_pal(low = "blue",
high = "red")(x)
# show_col(cols)
p <- df2.ed_visits_cleaned %>%
filter(year == "2018") %>%
group_by(month,
weekday) %>%
summarise(mean_visits = mean(ed_visits)) %>%
ggplot(aes(x = month,
y = mean_visits,
group = weekday)) +
geom_line(aes(col = weekday)) +
labs(title = "2018") +
scale_y_continuous(limits = c(0, 250),
expand = c(0, 0)) +
scale_color_manual(values = cols) +
theme_light() +
labs(title = "2018") +
theme(panel.grid.minor = element_line(colour = "grey95"),
panel.grid.major = element_line(colour = "grey95")); p
# ggplotly(p)
# avg ED visits by weekday AND month
# this is the type of plot that I am arguing against - it's all noise
df2.ed_visits_cleaned %>%
filter(year == "2018") %>%
group_by(month,
weekday) %>%
summarise(mean_visits = mean(ed_visits)) %>%
ggplot(aes(x = weekday,
y = mean_visits)) +
geom_col(fill = "dodgerblue4") +
facet_wrap(~month) +
labs(title = "Is there really any point in looking at graphs like this?") +
theme_light() +
theme(panel.grid.minor = element_line(colour = "grey95"),
panel.grid.major = element_line(colour = "grey95"),
axis.text.x = element_text(angle = 45,
hjust = 1))
# 4) regression model 1: ----
set.seed(121)
v1_train_index <- createDataPartition(df2.ed_visits_cleaned$ed_visits,
p = 0.8,
list = FALSE)
m1 <- lm(ed_visits ~ years_from_2017 + weekday + month + lag_ed_visits + weekday:month,
data = df2.ed_visits_cleaned[v1_train_index, ])
summary(m1)
##
## Call:
## lm(formula = ed_visits ~ years_from_2017 + weekday + month +
## lag_ed_visits + weekday:month, data = df2.ed_visits_cleaned[v1_train_index,
## ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -34.813 -8.957 -0.669 8.713 45.814
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 154.29831 8.42202 18.321 < 2e-16 ***
## years_from_2017 3.86729 0.77749 4.974 8.45e-07 ***
## weekdayTuesday -13.91675 5.55343 -2.506 0.0125 *
## weekdayWednesday -8.97043 6.06841 -1.478 0.1398
## weekdayThursday -14.08910 5.77753 -2.439 0.0150 *
## weekdayFriday -6.49409 6.04687 -1.074 0.2832
## weekdaySaturday 0.29425 6.42933 0.046 0.9635
## weekdaySunday 4.35935 6.02763 0.723 0.4698
## month2 6.10651 6.40260 0.954 0.3406
## month3 -6.20713 6.19185 -1.002 0.3165
## month4 0.49503 6.02592 0.082 0.9346
## month5 -3.54823 6.20717 -0.572 0.5678
## month6 9.21882 6.19283 1.489 0.1371
## month7 -0.91198 6.41855 -0.142 0.8871
## month8 -8.94113 7.01207 -1.275 0.2027
## month9 -4.87975 6.41281 -0.761 0.4470
## month10 -0.73576 6.68543 -0.110 0.9124
## month11 -7.36432 7.44091 -0.990 0.3227
## month12 -2.12975 6.41281 -0.332 0.7399
## lag_ed_visits 0.16145 0.03898 4.142 3.91e-05 ***
## weekdayTuesday:month2 -8.45395 8.48089 -0.997 0.3192
## weekdayWednesday:month2 -11.88124 9.00409 -1.320 0.1875
## weekdayThursday:month2 -7.46883 8.70939 -0.858 0.3915
## weekdayFriday:month2 1.39762 8.88457 0.157 0.8751
## weekdaySaturday:month2 -8.28994 9.82495 -0.844 0.3991
## weekdaySunday:month2 -7.84377 9.00265 -0.871 0.3839
## weekdayTuesday:month3 4.10907 8.89019 0.462 0.6441
## weekdayWednesday:month3 4.94635 8.63566 0.573 0.5670
## weekdayThursday:month3 9.49706 8.36926 1.135 0.2569
## weekdayFriday:month3 9.47538 8.55196 1.108 0.2683
## weekdaySaturday:month3 0.73155 9.00371 0.081 0.9353
## weekdaySunday:month3 3.24521 8.63654 0.376 0.7072
## weekdayTuesday:month4 -0.02075 8.19964 -0.003 0.9980
## weekdayWednesday:month4 0.39170 8.61876 0.045 0.9638
## weekdayThursday:month4 -3.75115 8.55926 -0.438 0.6613
## weekdayFriday:month4 -11.08138 8.51455 -1.301 0.1936
## weekdaySaturday:month4 -12.08204 8.78996 -1.375 0.1698
## weekdaySunday:month4 -5.34330 8.51584 -0.627 0.5306
## weekdayTuesday:month5 9.15091 8.24166 1.110 0.2673
## weekdayWednesday:month5 -2.55882 8.63942 -0.296 0.7672
## weekdayThursday:month5 11.01220 8.57626 1.284 0.1996
## weekdayFriday:month5 -1.66005 8.55726 -0.194 0.8462
## weekdaySaturday:month5 -3.86797 9.48196 -0.408 0.6835
## weekdaySunday:month5 10.08381 8.87612 1.136 0.2564
## weekdayTuesday:month6 -1.93767 8.55250 -0.227 0.8208
## weekdayWednesday:month6 -9.37933 9.43759 -0.994 0.3207
## weekdayThursday:month6 -5.01131 8.55625 -0.586 0.5583
## weekdayFriday:month6 -4.84159 8.55450 -0.566 0.5716
## weekdaySaturday:month6 -15.53082 9.00417 -1.725 0.0850 .
## weekdaySunday:month6 -5.27018 9.00619 -0.585 0.5586
## weekdayTuesday:month7 5.85680 9.03788 0.648 0.5172
## weekdayWednesday:month7 5.26234 9.33891 0.563 0.5733
## weekdayThursday:month7 7.88056 9.40324 0.838 0.4023
## weekdayFriday:month7 6.61324 9.56984 0.691 0.4898
## weekdaySaturday:month7 -3.53939 9.41553 -0.376 0.7071
## weekdaySunday:month7 -3.78904 8.88587 -0.426 0.6700
## weekdayTuesday:month8 14.75793 9.46722 1.559 0.1195
## weekdayWednesday:month8 5.59955 9.31913 0.601 0.5481
## weekdayThursday:month8 8.71024 9.40954 0.926 0.3550
## weekdayFriday:month8 8.97914 9.44174 0.951 0.3420
## weekdaySaturday:month8 -2.70443 10.51477 -0.257 0.7971
## weekdaySunday:month8 11.06397 9.76366 1.133 0.2576
## weekdayTuesday:month9 7.68724 9.03920 0.850 0.3954
## weekdayWednesday:month9 5.67244 9.33385 0.608 0.5436
## weekdayThursday:month9 18.03731 9.73479 1.853 0.0644 .
## weekdayFriday:month9 6.24955 9.14893 0.683 0.4948
## weekdaySaturday:month9 -13.05150 9.81526 -1.330 0.1841
## weekdaySunday:month9 11.54577 9.33681 1.237 0.2167
## weekdayTuesday:month10 -1.54519 9.47577 -0.163 0.8705
## weekdayWednesday:month10 -9.31467 9.33728 -0.998 0.3189
## weekdayThursday:month10 9.08713 9.34294 0.973 0.3311
## weekdayFriday:month10 -15.89211 9.51745 -1.670 0.0955 .
## weekdaySaturday:month10 -14.01015 9.75814 -1.436 0.1516
## weekdaySunday:month10 -5.29747 9.33707 -0.567 0.5707
## weekdayTuesday:month11 0.73784 9.61805 0.077 0.9389
## weekdayWednesday:month11 -7.17345 9.89593 -0.725 0.4688
## weekdayThursday:month11 14.68065 9.92134 1.480 0.1394
## weekdayFriday:month11 -8.99053 10.06545 -0.893 0.3721
## weekdaySaturday:month11 -12.06443 10.51771 -1.147 0.2518
## weekdaySunday:month11 -1.19333 10.07296 -0.118 0.9057
## weekdayTuesday:month12 6.81710 9.03566 0.754 0.4508
## weekdayWednesday:month12 5.75093 9.15022 0.629 0.5299
## weekdayThursday:month12 -0.90732 9.40556 -0.096 0.9232
## weekdayFriday:month12 4.71921 9.15022 0.516 0.6062
## weekdaySaturday:month12 -6.89133 9.26351 -0.744 0.4572
## weekdaySunday:month12 2.50005 9.33173 0.268 0.7889
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.78 on 634 degrees of freedom
## Multiple R-squared: 0.3295, Adjusted R-squared: 0.2396
## F-statistic: 3.665 on 85 and 634 DF, p-value: < 2.2e-16
par(mfrow = c(2,2))
plot(m1)
par(mfrow = c(1,1))
# glance(m1)
# tidy(m1)
# augment(m1) # %>% names
# predict(m1, interval = "prediction")
m1.train_rmse <- sqrt(mean(resid(m1)^2))
# test set performance:
df4.predictions <-
data.frame(ed_visits = df2.ed_visits_cleaned[-v1_train_index, 6],
predicted = predict(m1,
newdata = df2.ed_visits_cleaned[-v1_train_index, ]))
m1.test_rmse <- sqrt(mean((df4.predictions$predicted - df4.predictions$ed_visits)^2,
na.rm = TRUE))
# 5) regression model 2: ----
set.seed(121)
v1_train_index <- createDataPartition(df2.ed_visits_cleaned$ed_visits,
p = 0.8,
list = FALSE)
m2 <- lm(ed_visits ~ years_from_2017 + weekday + month + lag_ed_visits,
data = df2.ed_visits_cleaned[v1_train_index, ])
summary(m2)
##
## Call:
## lm(formula = ed_visits ~ years_from_2017 + weekday + month +
## lag_ed_visits, data = df2.ed_visits_cleaned[v1_train_index,
## ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -38.229 -9.538 -0.286 9.233 46.245
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 158.72362 7.35109 21.592 < 2e-16 ***
## years_from_2017 3.99472 0.76845 5.198 2.64e-07 ***
## weekdayTuesday -11.15955 1.94051 -5.751 1.33e-08 ***
## weekdayWednesday -10.53659 2.02594 -5.201 2.61e-07 ***
## weekdayThursday -10.19050 2.06858 -4.926 1.05e-06 ***
## weekdayFriday -7.28938 2.01238 -3.622 0.000313 ***
## weekdaySaturday -7.54671 2.07650 -3.634 0.000299 ***
## weekdaySunday 4.45044 2.00022 2.225 0.026401 *
## month2 -0.01046 2.36364 -0.004 0.996469
## month3 -0.83117 2.28412 -0.364 0.716049
## month4 -3.71075 2.27845 -1.629 0.103841
## month5 -0.02794 2.29764 -0.012 0.990301
## month6 3.87182 2.36417 1.638 0.101931
## month7 1.74936 2.53443 0.690 0.490274
## month8 -1.59659 2.53108 -0.631 0.528380
## month9 0.60023 2.59564 0.231 0.817192
## month10 -5.80302 2.57302 -2.255 0.024420 *
## month11 -9.25944 2.61501 -3.541 0.000425 ***
## month12 0.32243 2.51470 0.128 0.898014
## lag_ed_visits 0.13632 0.03746 3.639 0.000294 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.81 on 700 degrees of freedom
## Multiple R-squared: 0.2557, Adjusted R-squared: 0.2355
## F-statistic: 12.66 on 19 and 700 DF, p-value: < 2.2e-16
par(mfrow = c(2,2))
plot(m2)
par(mfrow = c(1,1))
# glance(m2)
# tidy(m2)
# augment(m2) # %>% names
# predict(m2, interval = "prediction")
m2.train_rmse <- sqrt(mean(resid(m2)^2))
# test set performance:
df4.predictions <-
data.frame(ed_visits = df2.ed_visits_cleaned[-v1_train_index, 6],
predicted = predict(m2,
newdata = df2.ed_visits_cleaned[-v1_train_index, ]))
m2.test_rmse <- sqrt(mean((df4.predictions$predicted - df4.predictions$ed_visits)^2,
na.rm = TRUE))
df5.model.performance <-
data.frame(model = c("year + month + weekday + month:weekday",
"year + month + weekday + month:weekday",
"year + month + weekday",
"year + month + weekday"),
metric = rep(c("Train RMSE",
"Test RMSE"), 2),
value = c(m1.train_rmse,
m1.test_rmse,
m2.train_rmse,
m2.test_rmse))
df5.model.performance %>%
kable() %>%
kable_styling(bootstrap_options = c("striped",
"condensed",
"responsive"))
| model | metric | value |
|---|---|---|
| year + month + weekday + month:weekday | Train RMSE | 12.92700 |
| year + month + weekday + month:weekday | Test RMSE | 14.38572 |
| year + month + weekday | Train RMSE | 13.61936 |
| year + month + weekday | Test RMSE | 13.98180 |
Including month, weekday and year is very likely to overfit: there are just 4 data points per cell!
The general strategy for preventing overfitting is, of course, cross-validation or a train/test split; a cross-validated comparison of the two specifications is sketched below.
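As a minimal sketch (not part of the original analysis), the same two specifications could be compared with 10-fold cross-validation via caret::train() instead of a single split. This assumes df2.ed_visits_cleaned is in memory; the row with a missing lag_ed_visits is dropped because lm() cannot use it.
# hedged sketch: cross-validated RMSE for both model specifications
set.seed(121)
cv_data <- df2.ed_visits_cleaned %>% drop_na(lag_ed_visits)
cv_control <- trainControl(method = "cv", number = 10)
cv_m1 <- train(ed_visits ~ years_from_2017 + weekday + month + lag_ed_visits + weekday:month,
data = cv_data,
method = "lm",
trControl = cv_control)
cv_m2 <- train(ed_visits ~ years_from_2017 + weekday + month + lag_ed_visits,
data = cv_data,
method = "lm",
trControl = cv_control)
cv_m1$results$RMSE  # expected to be worse than the simpler model if the interaction overfits
cv_m2$results$RMSE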
# 6) train model 2 on full dataset: -----------
m3.full_dataset <- lm(ed_visits ~ years_from_2017 + weekday + month + lag_ed_visits,
data = df2.ed_visits_cleaned)
summary(m3.full_dataset)
##
## Call:
## lm(formula = ed_visits ~ years_from_2017 + weekday + month +
## lag_ed_visits, data = df2.ed_visits_cleaned)
##
## Residuals:
## Min 1Q Median 3Q Max
## -38.570 -9.486 -0.253 9.466 43.926
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 153.39456 6.54392 23.441 < 2e-16 ***
## years_from_2017 4.13042 0.67963 6.077 1.82e-09 ***
## weekdayTuesday -10.80266 1.72528 -6.261 5.97e-10 ***
## weekdayWednesday -11.19875 1.78843 -6.262 5.95e-10 ***
## weekdayThursday -10.49927 1.80956 -5.802 9.14e-09 ***
## weekdayFriday -6.88718 1.80750 -3.810 0.000148 ***
## weekdaySaturday -7.16601 1.77412 -4.039 5.83e-05 ***
## weekdaySunday 5.12227 1.76991 2.894 0.003897 **
## month2 -0.63584 2.08307 -0.305 0.760253
## month3 -0.22125 2.03080 -0.109 0.913269
## month4 -4.27939 2.05232 -2.085 0.037344 *
## month5 2.09335 2.02958 1.031 0.302628
## month6 2.67357 2.13742 1.251 0.211326
## month7 2.19038 2.29445 0.955 0.340022
## month8 -1.18906 2.29405 -0.518 0.604364
## month9 0.17245 2.31562 0.074 0.940651
## month10 -5.29449 2.30121 -2.301 0.021640 *
## month11 -8.29155 2.34607 -3.534 0.000430 ***
## month12 1.99130 2.29397 0.868 0.385599
## lag_ed_visits 0.16218 0.03336 4.861 1.38e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.8 on 877 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.2759, Adjusted R-squared: 0.2602
## F-statistic: 17.59 on 19 and 877 DF, p-value: < 2.2e-16
glance(m3.full_dataset) %>%
kable() %>%
kable_styling(bootstrap_options = c("striped",
"condensed",
"responsive"))
| r.squared | adj.r.squared | sigma | statistic | p.value | df | logLik | AIC | BIC | deviance | df.residual |
|---|---|---|---|---|---|---|---|---|---|---|
| 0.275883 | 0.2601952 | 13.79842 | 17.5858 | 0 | 20 | -3616.899 | 7275.799 | 7376.579 | 166977.5 | 877 |
# actual vs fitted values:
augment(m3.full_dataset) %>%
ggplot(aes(x = .fitted,
y = ed_visits)) +
geom_point() +
scale_x_continuous(limits = c(150, 210)) +
scale_y_continuous(limits = c(150, 210)) +
geom_smooth() +
geom_abline(slope = 1,
intercept = 0) +
labs(x = "predicted values",
y = "actual values",
title = "LGH ED visits - Prediction using day of week, month, year, and previous day's ED visits") +
theme_light() +
theme(panel.grid.minor = element_line(colour = "grey95"),
panel.grid.major = element_line(colour = "grey95"))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 53 rows containing non-finite values (stat_smooth).
## Warning: Removed 53 rows containing missing values (geom_point).
df6.coeffs <-
tidy(m3.full_dataset) %>%
mutate(lower_ci = estimate - 1.96 * std.error,
upper_ci = estimate + 1.96 * std.error) %>%
dplyr::select(term,
lower_ci,
estimate,
upper_ci,
everything())
df6.coeffs %>%
datatable() %>%
formatRound(2:7, 2)
# 7) visuals of day of week effects ----------
df6.coeffs %>%
filter(grepl("weekday", term)) %>%
mutate(term = substring(term, 8)) %>%
mutate(term = factor(term,
levels = c("Monday",
"Tuesday",
"Wednesday",
"Thursday",
"Friday",
"Saturday",
"Sunday"))) %>%
ggplot() +
geom_pointrange(aes(x = term,
ymin = lower_ci,
ymax = upper_ci,
y = estimate)) +
geom_hline(yintercept = 0) +
scale_y_continuous(limits = c(-20, 20),
breaks = seq(-20, 20, 4)) +
labs(x = "Day of week",
y = "Difference in average daily ED visits" ,
title = "LGH ED \nImpact of Day of Week on average daily ED visits",
subtitle = "These estimates control for year and month, allowing us to isolate weekday effects \nfrom other factors and from statistical noise \n\nBaseline - Monday",
caption = "\n\nNote: There is no significant interaction between day-of-week effects and month effects") +
theme_light(base_size = 12) +
theme(panel.grid.minor = element_line(colour = "grey95"),
panel.grid.major = element_line(colour = "grey95"))
  Â
# 8) visuals of month effects ----------
df6.coeffs %>%
filter(grepl("month", term)) %>%
mutate(term = factor(term,
levels = c(
"month2",
"month3",
"month4",
"month5",
"month6",
"month7",
"month8",
"month9",
"month10",
"month11",
"month12"
))) %>%
ggplot() +
geom_pointrange(aes(x = term,
ymin = lower_ci,
ymax = upper_ci,
y = estimate)) +
geom_hline(yintercept = 0) +
scale_y_continuous(limits = c(-20, 20),
breaks = seq(-20, 20, 4)) +
labs(x = "Month",
y = "Difference in average daily ED visits" ,
title = "LGH ED \nImpact of Month on average daily ED visits",
subtitle = "These estimates control for year and day-of-week, allowing us to isolate month effects \nfrom other factors and from statistical noise \n\nBaseline - January",
caption = "\n\nNote: There is no significant interaction between day-of-week effects and month effects") +
theme_light(base_size = 12) +
theme(panel.grid.minor = element_line(colour = "grey95"),
panel.grid.major = element_line(colour = "grey95"),
axis.text.x = element_text(angle = 45,
hjust = 1))
Import illustrative data to predict on. Note that all lagged ed_visits values are set to the overall mean for the corresponding day of week in 2019 (see df3.mean_and_sd); a quick way to pull those means is sketched below.
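As a rough illustration only (the CSV on disk remains the source of record for the file's exact contents), the 2019 weekday means that feed those lagged values can be pulled from df3.mean_and_sd:
# hedged sketch: 2019 mean ED visits by day of week, used as the lag values
df3.mean_and_sd %>%
ungroup() %>%                # df3.mean_and_sd is grouped by year
filter(year == "2019") %>%   # keep 2019 rows only
select(weekday, mean_visits)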
# 9) Prediction intervals for illustrative data ---------
df7.predict_intervals <-
read_csv(here::here("data",
"2019-06-30_ed-daily_illustrative-data-for-prediction-intervals.csv")) %>%
mutate(weekday = fct_relevel(weekday,
"Monday",
"Tuesday",
"Wednesday",
"Thursday",
"Friday",
"Saturday",
"Sunday"),
date = mdy(date),
# years_from_2017 = as.factor(years_from_2017),
year = as.factor(year),
month = as.factor(month))
## Parsed with column specification:
## cols(
## date = col_character(),
## years_from_2017 = col_double(),
## month = col_double(),
## year = col_double(),
## weekday = col_character(),
## ed_visits = col_double(),
## lag_ed_visits = col_double()
## )
df7.predict_intervals <-
predict(m2,
newdata = df7.predict_intervals,
interval = "prediction") %>%
as.data.frame() %>%
bind_cols(df7.predict_intervals)
df7.predict_intervals %>%
select(-fit,
-lwr,
-upr,
everything()) %>%
datatable() %>%
formatRound(8:10, 2)
df7.predict_intervals %>%
ggplot(aes(x = weekday,
ymin = lwr,
ymax = upr,
y = fit)) +
geom_pointrange() +
labs(title = "2019 weekday effects",
subtitle = "Predicted LGH ED visits by day of week \n\nBased on our model, only 5% of actual data points should fall outside these ranges. \nObserved value is about 3% as of 2019-06-17",
y = "number of ED visits") +
theme_light() +
theme(panel.grid.minor = element_line(colour = "grey95"),
panel.grid.major = element_line(colour = "grey95"))
Is it true that in 2019, 5% of data points fall outside these prediction intervals?
df8.validate_predict_int <-
df7.predict_intervals %>%
left_join(df2.ed_visits_cleaned %>%
filter(years_from_2017 == 2),
by = c("weekday" = "weekday")) %>%
select(date.y,
month.x,
weekday,
ed_visits.y,
lwr,
upr) %>%
mutate(outside_predict_int = ifelse(ed_visits.y < lwr | ed_visits.y > upr,
1, 0)) %>%
arrange(date.y)
df8.validate_predict_int %>%
datatable(extensions = 'Buttons',
options = list(dom = 'Bfrtip',
buttons = c('excel', "csv")))
df8.validate_predict_int %>%
summarise(n = n(),
num_outside_interval = sum(outside_predict_int),
prop = num_outside_interval / n)
## n num_outside_interval prop
## 1 168 5 0.0297619
Conclusion: the prediction intervals may in fact be a bit conservative (too wide), which would explain why a smaller proportion of the real data falls outside them than expected (about 3% versus the nominal 5%).
Of course, it could just be sampling variability in a fairly small dataset; a quick binomial check is sketched below.
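As a quick check of that sampling-variability point (not in the original analysis), an exact binomial test compares the 5 observed exceedances out of 168 days against the nominal 5% rate; consecutive days are autocorrelated, so treat this only as a rough sanity check.
# hedged sketch: are 5 exceedances out of 168 days consistent with 5% nominal coverage?
binom.test(x = 5, n = 168, p = 0.05)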
# 10) write outputs: ---------
# write_csv(df6.coeffs,
# here::here("results",
# "dst",
# "2019-06-21_lgh_ed-visits-regression-coeffs.csv"))
# write_csv(df6.coeffs,
# here::here("results",
# "dst",
# "2019-06-24_lgh_ed-visits-regression-coeffs.csv"))
# only month
m4.month <- lm(ed_visits ~ month,
data = df2.ed_visits_cleaned)
summary(m4.month)
##
## Call:
## lm(formula = ed_visits ~ month, data = df2.ed_visits_cleaned)
##
## Residuals:
## Min 1Q Median 3Q Max
## -43.022 -10.959 -0.928 10.955 59.823
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 181.0215 1.6174 111.921 < 2e-16 ***
## month2 -0.8072 2.3478 -0.344 0.731066
## month3 -0.6022 2.2874 -0.263 0.792418
## month4 -4.6771 2.3063 -2.028 0.042867 *
## month5 1.9677 2.2874 0.860 0.389873
## month6 2.3551 2.4032 0.980 0.327366
## month7 0.6559 2.5573 0.256 0.797638
## month8 -4.4892 2.5573 -1.755 0.079530 .
## month9 -2.1548 2.5828 -0.834 0.404333
## month10 -8.5215 2.5573 -3.332 0.000897 ***
## month11 -12.9048 2.5828 -4.996 7.03e-07 ***
## month12 0.1559 2.5573 0.061 0.951399
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.6 on 886 degrees of freedom
## Multiple R-squared: 0.06556, Adjusted R-squared: 0.05396
## F-statistic: 5.651 on 11 and 886 DF, p-value: 7.873e-09
glance(m4.month) %>%
kable() %>%
kable_styling(bootstrap_options = c("striped",
"condensed",
"responsive"))
| r.squared | adj.r.squared | sigma | statistic | p.value | df | logLik | AIC | BIC | deviance | df.residual |
|---|---|---|---|---|---|---|---|---|---|---|
| 0.0655617 | 0.0539603 | 15.59768 | 5.651195 | 0 | 12 | -3735.082 | 7496.164 | 7558.566 | 215552.8 | 886 |
augment(m4.month) %>%
ggplot(aes(x = .fitted,
y = ed_visits,
col = month)) +
geom_point() +
scale_x_continuous(limits = c(150, 210)) +
scale_y_continuous(limits = c(150, 210)) +
geom_smooth() +
geom_abline(slope = 1,
intercept = 0) +
labs(x = "predicted values",
y = "actual values") +
theme_light() +
theme(panel.grid.minor = element_line(colour = "grey95"),
panel.grid.major = element_line(colour = "grey95"))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 53 rows containing non-finite values (stat_smooth).
## Warning: Removed 53 rows containing missing values (geom_point).
# only day of week
m5.weekday <- lm(ed_visits ~ weekday,
data = df2.ed_visits_cleaned)
summary(m5.weekday)
##
## Call:
## lm(formula = ed_visits ~ weekday, data = df2.ed_visits_cleaned)
##
## Residuals:
## Min 1Q Median 3Q Max
## -39.117 -10.359 -0.359 9.850 51.233
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 186.620 1.309 142.522 < 2e-16 ***
## weekdayTuesday -11.370 1.855 -6.128 1.33e-09 ***
## weekdayWednesday -13.581 1.855 -7.320 5.55e-13 ***
## weekdayThursday -13.261 1.855 -7.147 1.84e-12 ***
## weekdayFriday -9.503 1.855 -5.122 3.71e-07 ***
## weekdaySaturday -9.112 1.855 -4.911 1.08e-06 ***
## weekdaySunday 3.147 1.852 1.700 0.0896 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.87 on 891 degrees of freedom
## Multiple R-squared: 0.1457, Adjusted R-squared: 0.1399
## F-statistic: 25.32 on 6 and 891 DF, p-value: < 2.2e-16
glance(m5.weekday) %>%
kable() %>%
kable_styling(bootstrap_options = c("striped",
"condensed",
"responsive"))
| r.squared | adj.r.squared | sigma | statistic | p.value | df | logLik | AIC | BIC | deviance | df.residual |
|---|---|---|---|---|---|---|---|---|---|---|
| 0.145682 | 0.139929 | 14.8721 | 25.32286 | 0 | 7 | -3694.833 | 7405.665 | 7444.067 | 197070.9 | 891 |
augment(m5.weekday) %>%
ggplot(aes(x = .fitted,
y = ed_visits ,
col = weekday)) +
geom_point() +
scale_x_continuous(limits = c(150, 210)) +
scale_y_continuous(limits = c(150, 210)) +
geom_abline(slope = 1,
intercept = 0) +
geom_smooth(col = "red") +
labs(x = "predicted values",
y = "actual values",
title = "LGH daily ED visits, by day of week") +
theme_light() +
theme(panel.grid.minor = element_line(colour = "grey95"),
panel.grid.major = element_line(colour = "grey95"))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 53 rows containing non-finite values (stat_smooth).
## Warning: Removed 53 rows containing missing values (geom_point).