library( dplyr )
library( ggplot2 )
## Warning: package 'ggplot2' was built under R version 4.0.5
library( gridExtra )
library( rvest )
library( tidyverse )
library( multcompView )
## Warning: package 'multcompView' was built under R version 4.0.5
library( cdcfluview )
## Warning: package 'cdcfluview' was built under R version 4.0.5
library( purrr )
library( inspectdf )
## Warning: package 'inspectdf' was built under R version 4.0.5
library( naniar )
## Warning: package 'naniar' was built under R version 4.0.5
library( imputeTS )
library( usmap )
## Warning: package 'usmap' was built under R version 4.0.5
library( scales )
library( ggpubr )
## Warning: package 'ggpubr' was built under R version 4.0.5
# PIC National
url <- "https://raw.githubusercontent.com/SmilodonCub/DATA621/master/NCHSData12.csv"
PIC_national_df <- read.csv( url ) %>%
rowid_to_column()
colnames( PIC_national_df ) <- c( 'rowid', 'year', 'week', 'perc_death_PI',
'perc_death_PIC', 'expected', 'threshold',
'all_deaths', 'P_deaths', 'I_deaths',
'C_deaths', 'PIC_deaths' )
# PIC State-level
url <- "https://raw.githubusercontent.com/SmilodonCub/DATA621/master/State_Custom_Data.csv"
PIC_by_state_df <- read.csv( url )
glimpse( PIC_by_state_df )
## Rows: 20,176
## Columns: 13
## $ AREA <chr> "State", "State", "State", "State", "State", "...
## $ SUB.AREA <chr> "Alabama", "Alabama", "Alabama", "Alabama", "A...
## $ AGE.GROUP <chr> "All", "All", "All", "All", "All", "All", "All...
## $ SEASON <chr> "2020-21", "2020-21", "2020-21", "2020-21", "2...
## $ WEEK <int> 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51...
## $ PERCENT.P.I <dbl> 8.6, 8.8, 7.8, 9.5, 9.4, 8.3, 9.5, 8.7, 8.8, 1...
## $ PERCENT.PIC <dbl> 15.4, 15.7, 15.2, 17.2, 16.4, 15.6, 18.4, 18.0...
## $ NUM.INFLUENZA.DEATHS <chr> "0", "1", "0", "1", "0", "0", "0", "2", "2", "...
## $ NUM.PNEUMONIA.DEATHS <chr> "97", "107", "94", "109", "112", "108", "124",...
## $ NUM.COVID.19.DEATHS <chr> "116", "141", "132", "131", "135", "151", "186...
## $ TOTAL.PIC <chr> "173", "193", "183", "199", "195", "202", "240...
## $ TOTAL.DEATHS <chr> "1,122", "1,233", "1,205", "1,160", "1,190", "...
## $ PERCENT.COMPLETE <chr> "> 100%", "> 100%", "> 100%", "> 100%", "> 100...
## Statewide Restriction
url <- "http://en.wikipedia.org/wiki/U.S._state_and_local_government_responses_to_the_COVID-19_pandemic"
state_level_regulation_df <- url %>%
read_html() %>%
html_node(xpath='//*[@id="mw-content-text"]/div[1]/table[2]') %>%
html_table(fill=T)
state_level_regulation_df <- state_level_regulation_df[-1,]
state_level_regulation_df <- state_level_regulation_df[,-1]
state_level_regulation_df <- state_level_regulation_df[,-11]
colnames( state_level_regulation_df ) <- c( "State_territory", "State_of_emergency_declared", "Stay_at_home_ordered",
"Face_coverings_required_in_public", "Gatherings_banned",
"Out_of_state_travel_restrictions", "Closures_School", "Closures_Daycare",
"Closures_Bars", "Closures_Retail")
PIC National
ggplot( data = PIC_national_df, aes( x = rowid, y = perc_death_PIC, color = '%PIC' ) ) +
geom_line( ) +
geom_line( aes( x = rowid, y = threshold, color = 'threshold' ) ) +
geom_line( aes( x = rowid, y = expected, color = 'expected' ), linetype = "dashed" ) +
geom_line( aes( x = rowid, y = perc_death_PI, color = '%PI') ) +
theme_classic() +
ggtitle( 'Pneumonia, Influenza, and COVID-19 Mortality', subtitle = " from the NCHS Mortality Surveillance System" ) +
ylab( '% of All Deaths' ) +
xlab( 'MMWR Week' ) +
scale_x_continuous(breaks=c(seq( 25,392,49)),
labels=c("2013-14", "2014-15", "2015-16", "2016-17", "2017-18", "2018-19", "2019-20", "2020-21")) +
scale_color_manual(name="PIC data", values=c("%PIC"="red", "%PI"="blue", 'threshold'='black', 'expected'='black'))
Mean Annual Influenza Death Predictions (Expected & Threshold)
expectedDeaths <- PIC_national_df %>%
dplyr::select( c( 'week', 'expected', 'threshold' ) ) %>%
dplyr::group_by(week) %>%
dplyr::summarise( 'expected' = mean( expected ), 'threshold' = mean( threshold ) ) %>%
dplyr::rename( WEEK = week )
## `summarise()` ungrouping output (override with `.groups` argument)
head( expectedDeaths )
## # A tibble: 6 x 3
## WEEK expected threshold
## <int> <dbl> <dbl>
## 1 1 7.17 7.52
## 2 2 7.25 7.59
## 3 3 7.31 7.65
## 4 4 7.36 7.70
## 5 5 7.40 7.74
## 6 6 7.42 7.76
PIC State-Level
# remove instances of this string to make later conversion easier
PIC_by_state_df[PIC_by_state_df == 'Insufficient Data'] <- NA
PIC_by_state_df <- PIC_by_state_df %>%
dplyr::select( -c( 'AREA', 'AGE.GROUP' ) ) %>% #remove AREA & AGE.GROUP -> Uninformative, but could add later if people think it'd be an interesting feature
mutate( NUM.INFLUENZA.DEATHS = as.numeric(NUM.INFLUENZA.DEATHS),
NUM.PNEUMONIA.DEATHS = as.numeric(NUM.PNEUMONIA.DEATHS),
NUM.COVID.19.DEATHS = as.numeric(NUM.COVID.19.DEATHS),
TOTAL.PIC = as.numeric(TOTAL.PIC),
TOTAL.DEATHS = as.numeric(TOTAL.DEATHS)) #recast counts as numeric; values containing commas (e.g. TOTAL.DEATHS "1,122") coerce to NA, hence the warnings below
## Warning: Problem with `mutate()` input `NUM.PNEUMONIA.DEATHS`.
## i NAs introduced by coercion
## i Input `NUM.PNEUMONIA.DEATHS` is `as.numeric(NUM.PNEUMONIA.DEATHS)`.
## Warning in mask$eval_all_mutate(dots[[i]]): NAs introduced by coercion
## Warning: Problem with `mutate()` input `NUM.COVID.19.DEATHS`.
## i NAs introduced by coercion
## i Input `NUM.COVID.19.DEATHS` is `as.numeric(NUM.COVID.19.DEATHS)`.
## Warning in mask$eval_all_mutate(dots[[i]]): NAs introduced by coercion
## Warning: Problem with `mutate()` input `TOTAL.PIC`.
## i NAs introduced by coercion
## i Input `TOTAL.PIC` is `as.numeric(TOTAL.PIC)`.
## Warning in mask$eval_all_mutate(dots[[i]]): NAs introduced by coercion
## Warning: Problem with `mutate()` input `TOTAL.DEATHS`.
## i NAs introduced by coercion
## i Input `TOTAL.DEATHS` is `as.numeric(TOTAL.DEATHS)`.
## Warning in mask$eval_all_mutate(dots[[i]]): NAs introduced by coercion
#Join the mean expected deaths by week to the df
PIC_by_state_df <- left_join( PIC_by_state_df, expectedDeaths, by = "WEEK", copy = FALSE ) %>%
mutate( PI_expected_diff = PERCENT.P.I - expected,
PI_threshold_diff = PERCENT.P.I - threshold )
# visualize deviation from expected PI deaths for each flu season:
eachSeason_diff <- PIC_by_state_df %>%
dplyr::rename( area = SUB.AREA, season = SEASON ) %>%
dplyr::group_by( season, area ) %>%
dplyr::summarise( meanPI_expected_diff = mean( PI_expected_diff, na.rm = TRUE ),
meanPI_threshold_diff = mean( PI_threshold_diff, na.rm = TRUE )) %>%
dplyr::mutate( season = factor( season ) )
## `summarise()` regrouping output by 'season' (override with `.groups` argument)
p1 <- ggplot(data = eachSeason_diff, aes( x = season, y = meanPI_expected_diff ) ) +
geom_boxplot() +
geom_jitter( color="black", size = 0.4, alpha = 0.6 ) +
theme_classic() +
ggtitle( 'Deviance from Expected PI Mortality', subtitle = 'distribution of deviation for each US state per flu season') +
ylab( '% Deviation' ) +
xlab( 'Flu Season' )
#ANOVA + TUKEY Test for PIC state-level
eachSeason_diff <- eachSeason_diff %>%
mutate( season2 = gsub("-","", season ) )
#ANOVA + Tukey test
linmod <- lm( data = eachSeason_diff, meanPI_expected_diff ~ season2 )
ANOVA <- aov( linmod )
TUKEY <- TukeyHSD( x = ANOVA, 'season2', conf.level = 0.99)
TUKEY_df <- data.frame( TUKEY$season2 )
TUKEY_df <- tibble::rownames_to_column( TUKEY_df, 'Season' )
p3 <- ggplot() +
geom_linerange( data = TUKEY_df, mapping = aes( y = Season, xmin = lwr, xmax = upr ), size = 1, color = 'red' ) +
geom_point( data = TUKEY_df, aes( x = diff, y = Season ), size = 3, shape = 21, color = 'red' ) +
geom_vline( xintercept = 0, linetype = 'dashed', color = 'gray' ) +
xlab( 'Difference in mean level' ) +
ggtitle( "99% Family-wise Confidence Level" ) +
theme_classic()
grid.arrange( p1, p3, ncol =2 )
EDA: State-Level Regulation
glimpse( state_level_regulation_df )
## Rows: 56
## Columns: 10
## $ State_territory <chr> "Alabama", "Alaska", "American Sa...
## $ State_of_emergency_declared <chr> "March 13", "March 11", "January ...
## $ Stay_at_home_ordered <chr> "April 4", "March 28", "No", "Mar...
## $ Face_coverings_required_in_public <chr> "Yes", "No", "No", "No", "Yes", "...
## $ Gatherings_banned <chr> "10 or more", "10 or more", "10 o...
## $ Out_of_state_travel_restrictions <chr> "No", "Mandatory quarantine", "Tr...
## $ Closures_School <chr> "Yes (remainder of term)", "Yes (...
## $ Closures_Daycare <chr> "Yes", "Yes", "Yes", "Yes", "Yes"...
## $ Closures_Bars <chr> "Yes", "Yes", "No", "Yes", "Yes",...
## $ Closures_Retail <chr> "Yes", "Yes", "No", "Yes", "Regio...
state_level_regulation_df <- state_level_regulation_df %>%
modify_if( is.character, as.factor )
p5 <- inspectdf::inspect_imb( state_level_regulation_df ) %>% show_plot()
p5 + theme_classic()
Missingness in state PIC data
miss_var_summary( PIC_by_state_df )
## # A tibble: 15 x 3
## variable n_miss pct_miss
## <chr> <int> <dbl>
## 1 TOTAL.DEATHS 7844 38.9
## 2 TOTAL.PIC 146 0.724
## 3 NUM.COVID.19.DEATHS 113 0.560
## 4 NUM.PNEUMONIA.DEATHS 68 0.337
## 5 PERCENT.P.I 32 0.159
## 6 PERCENT.PIC 32 0.159
## 7 NUM.INFLUENZA.DEATHS 32 0.159
## 8 PI_expected_diff 32 0.159
## 9 PI_threshold_diff 32 0.159
## 10 SUB.AREA 0 0
## 11 SEASON 0 0
## 12 WEEK 0 0
## 13 PERCENT.COMPLETE 0 0
## 14 expected 0 0
## 15 threshold 0 0
miss_var_table( PIC_by_state_df )
## # A tibble: 6 x 3
## n_miss_in_var n_vars pct_vars
## <int> <int> <dbl>
## 1 0 6 40
## 2 32 5 33.3
## 3 68 1 6.67
## 4 113 1 6.67
## 5 146 1 6.67
## 6 7844 1 6.67
# Visualize the missingness by season
gg_miss_fct( x = PIC_by_state_df, fct = SEASON)
The figure above shows a heatmap of missingness. We see that TOTAL.DEATHS has a lot of missing values. That's okay, because we will be using PERCENT.P.I, PERCENT.PIC, and the features derived from them (the _diff columns). What is troubling, however, is that there are quite a few missing values in the 2020-21 flu season, and to a lesser extent in 2019-20, for the variable we would like to use as a predictor, PI_expected_diff.
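To put numbers on what the heatmap suggests, the per-season missingness of PI_expected_diff can be tabulated directly; this is a minimal sketch that uses only the PIC_by_state_df columns created above.
PIC_by_state_df %>%
  dplyr::group_by( SEASON ) %>%
  dplyr::summarise( n_miss = sum( is.na( PI_expected_diff ) ),
                    pct_miss = 100 * mean( is.na( PI_expected_diff ) ),
                    .groups = 'drop' )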
Missingness for PI_expected_diff in the 2020-21 season by state:
states_missingness <- PIC_by_state_df %>%
dplyr::filter( SEASON == '2020-21' ) %>%
dplyr::select( SUB.AREA, PI_expected_diff ) %>%
dplyr::group_by( SUB.AREA ) %>%
miss_var_summary() %>%
filter( n_miss != 0 ) %>%
arrange(desc( n_miss ))
states_missingness
## # A tibble: 13 x 4
## # Groups: SUB.AREA [13]
## SUB.AREA variable n_miss pct_miss
## <chr> <chr> <int> <dbl>
## 1 North Carolina PI_expected_diff 13 56.5
## 2 West Virginia PI_expected_diff 4 17.4
## 3 Connecticut PI_expected_diff 2 8.70
## 4 Indiana PI_expected_diff 2 8.70
## 5 Rhode Island PI_expected_diff 2 8.70
## 6 Utah PI_expected_diff 2 8.70
## 7 Alaska PI_expected_diff 1 4.35
## 8 Delaware PI_expected_diff 1 4.35
## 9 District of Columbia PI_expected_diff 1 4.35
## 10 Montana PI_expected_diff 1 4.35
## 11 North Dakota PI_expected_diff 1 4.35
## 12 Ohio PI_expected_diff 1 4.35
## 13 South Dakota PI_expected_diff 1 4.35
We should exclude North Carolina, consider excluding West Virginia, and look for a sensible way to impute the remaining missing values. First, we will examine how the missingness is distributed with respect to the target variable, PI_expected_diff.
Let's see if there is a pattern of missingness by WEEK:
week_missingness <- PIC_by_state_df %>%
dplyr::filter( SEASON == '2020-21', !SUB.AREA %in% c( 'North Carolina', 'West Virginia' ) ) %>%
dplyr::select( WEEK, PI_expected_diff ) %>%
dplyr::group_by( WEEK ) %>%
miss_var_summary() %>%
filter( n_miss != 0 ) %>%
arrange(desc( n_miss ))
week_missingness
## # A tibble: 2 x 4
## # Groups: WEEK [2]
## WEEK variable n_miss pct_miss
## <int> <chr> <int> <dbl>
## 1 9 PI_expected_diff 11 22
## 2 8 PI_expected_diff 4 8
After filtering out West Virginia and North Carolina, we can see that 22% of the remaining states are missing data for week 9. Week 9 is the last week of the 2020-21 flu season included in the data set, so the missingness is probably due to reporting lags. We will drop week 9 from further analysis and impute the remaining values using Kalman smoothing methods from the imputeTS library.
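For reference, the stated plan (drop week 9, then impute per state) could be sketched as below; the chunk that follows keeps week 9 and all states so the imputation can be inspected visually, and the week filter is applied later when the modeling data set is assembled.
#sketch only: exclude week 9 of 2020-21, then impute the remaining gaps per state
PIC_by_state_df %>%
  dplyr::filter( SEASON == '2020-21', WEEK != 9,
                 !SUB.AREA %in% c( 'North Carolina', 'West Virginia' ) ) %>%
  dplyr::group_by( SUB.AREA ) %>%
  dplyr::mutate( PI_expected_diff_imp = na_kalman( PI_expected_diff ) )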
PI_missingness_2 <- PIC_by_state_df %>%
dplyr::filter( SEASON == '2020-21') %>%
dplyr::select( SUB.AREA, WEEK, PI_expected_diff ) %>%
dplyr::group_by( SUB.AREA ) %>%
dplyr::mutate( PI_expected_diff_imp = na_kalman( PI_expected_diff ),
num = row_number() )
PI_missingness_2
## # A tibble: 1,196 x 5
## # Groups: SUB.AREA [52]
## SUB.AREA WEEK PI_expected_diff PI_expected_diff_imp num
## <chr> <int> <dbl> <dbl> <int>
## 1 Alabama 40 2.79 2.79 1
## 2 Alabama 41 2.91 2.91 2
## 3 Alabama 42 1.82 1.82 3
## 4 Alabama 43 3.41 3.41 4
## 5 Alabama 44 3.21 3.21 5
## 6 Alabama 45 1.99 1.99 6
## 7 Alabama 46 3.08 3.08 7
## 8 Alabama 47 2.16 2.16 8
## 9 Alabama 48 2.15 2.15 9
## 10 Alabama 49 3.44 3.44 10
## # ... with 1,186 more rows
Visualize the imputation to make sure it behaves:
ggplot( data = PI_missingness_2, aes( x = num, y = PI_expected_diff_imp ) ) +
geom_line( color = 'red' ) +
geom_line( data = PI_missingness_2, aes( x = num, y = PI_expected_diff ), color = 'blue' ) +
facet_wrap( ~ SUB.AREA )
The imputed series (red) shows no obvious disagreement with the observed data (blue). This plot is also important because it reinforces our earlier decision to remove North Carolina and West Virginia due to missingness.
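As an additional check, recent versions of imputeTS (>= 3.0) provide a purpose-built plot for inspecting individual imputations; a quick sketch for a single state (North Dakota, which has one imputed week, chosen arbitrarily):
nd <- dplyr::filter( PI_missingness_2, SUB.AREA == 'North Dakota' )
ggplot_na_imputations( nd$PI_expected_diff, nd$PI_expected_diff_imp )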
Moving forward, we will perform the following steps to prepare the data:
PIC_by_state_imp <- PIC_by_state_df %>%
dplyr::filter( !SUB.AREA %in% c( 'North Carolina', 'West Virginia' ) ) %>% #remove the states that are missing a lot of 2020-21 data
#filter only the WEEKs that have adequate data for 2020-21
#WEEK %in% c( 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 1, 2, 3, 4, 5, 6, 7, 8 ) ) %>%
dplyr::select( -c(TOTAL.DEATHS, PERCENT.COMPLETE) ) #drop TOTAL.DEATHS (heavy missingness) and PERCENT.COMPLETE
#combine the New York Data
PIC_NY <- PIC_by_state_imp %>%
dplyr::filter( SUB.AREA %in% c( 'New York', 'New York City' ) ) %>%
mutate( est_Total_Deaths = round(((NUM.INFLUENZA.DEATHS + NUM.PNEUMONIA.DEATHS + NUM.COVID.19.DEATHS) * 100)/PERCENT.PIC, 0 ) ) %>%
dplyr::group_by( SEASON, WEEK ) %>%
dplyr::summarise( NUM.INFLUENZA.DEATHS = sum( NUM.INFLUENZA.DEATHS ),
NUM.PNEUMONIA.DEATHS = sum( NUM.PNEUMONIA.DEATHS ),
NUM.COVID.19.DEATHS = sum( NUM.COVID.19.DEATHS ),
TOTAL.PIC = sum( TOTAL.PIC ),
est_Total_Deaths = sum( est_Total_Deaths ),
expected = expected,
threshold = threshold) %>%
mutate( PERCENT.P.I = 100*(NUM.INFLUENZA.DEATHS + NUM.PNEUMONIA.DEATHS)/est_Total_Deaths,
PERCENT.PIC = 100*(TOTAL.PIC)/est_Total_Deaths,
PI_expected_diff = PERCENT.P.I - expected,
PI_threshold_diff = PERCENT.P.I - threshold,
SUB.AREA = 'New York' ) %>%
distinct() %>%
dplyr::select( -est_Total_Deaths )
## `summarise()` regrouping output by 'SEASON', 'WEEK' (override with `.groups` argument)
Concatenate the combined New York + New York City data with the data from the rest of the states (minus West Virginia and North Carolina):
#sort( colnames( PIC_by_state_imp ) ) == sort( colnames( PIC_NY ) )
PIC_notNY <- PIC_by_state_imp %>%
dplyr::filter( !SUB.AREA %in% c( 'New York', 'New York City' ) )
PIC_by_state_clean <- rbind( PIC_NY, PIC_notNY )
Visualization on U.S. state maps:
diff_by_state_plot <- PIC_by_state_clean %>%
filter( SEASON == '2020-21' ) %>%
group_by( SUB.AREA) %>%
summarise( max_PI_expected_diff = max( PI_expected_diff, na.rm = T ) ) %>%
mutate( fips = fips( SUB.AREA ) )
## `summarise()` ungrouping output (override with `.groups` argument)
pdiff <- plot_usmap( data = diff_by_state_plot, values = 'max_PI_expected_diff', labels = F ) +
scale_fill_continuous( low = 'white', high = 'red',
name = 'Diff in PI', label = scales::comma,
limits = c( 0,25 ) ) +
theme( legend.position = 'right' ) +
theme( panel.background = element_rect( colour = "red"))
PIC_by_state_plot <- PIC_by_state_clean %>%
filter( SEASON == '2020-21' ) %>%
group_by( SUB.AREA) %>%
summarise( max_percent_PIC = max( PERCENT.PIC, na.rm = T ) ) %>%
mutate( fips = fips( SUB.AREA ) )
## `summarise()` ungrouping output (override with `.groups` argument)
pperPIC <- plot_usmap( data = PIC_by_state_plot, values = 'max_percent_PIC', labels = F ) +
scale_fill_continuous( low = 'white', high = 'red',
name = 'Percent PIC', label = scales::comma,
limits = c( 0,60 ) ) +
theme( legend.position = 'right' ) +
theme( panel.background = element_rect( colour = "red"))
grid.arrange( pdiff, pperPIC, ncol = 2 )
Visualize the relationship between excess PI mortality and PIC mortality:
relation_plot <- PIC_by_state_plot
relation_plot$max_PI_expected_diff <- diff_by_state_plot$max_PI_expected_diff
relation_plot <- relation_plot %>%
dplyr::arrange( max_percent_PIC )
#this dataset has been uploaded to B Cooper's git for plotting in the manuscript
# https://raw.githubusercontent.com/SmilodonCub/DATA621/master/PIC_PIexcess_relation.csv
#write.csv( relation_plot, 'PIC_PIexcess_relation.csv' )
prel <- ggplot( data = relation_plot, aes( x = max_percent_PIC, y = max_PI_expected_diff, label = SUB.AREA ) ) +
geom_point() +
geom_smooth( method = 'lm', se = F, color = 'black' ) +
stat_cor( r.digits = 3, p.accuracy = 0.001 ) +
geom_label( nudge_y = 0.75 ) +
theme_classic() +
ggtitle('Excess PI mortality vs PIC Mortality for the 2020-2021 flu season') +
ylab( 'Max. Weekly Excess PI Mortality' ) +
xlab( 'Max. Weekly PIC Mortality' )
prel
## `geom_smooth()` using formula 'y ~ x'
We see a linear relationship between excess PI mortality and PIC mortality. The goal of our modeling and further analysis is to see whether variables that describe state-level COVID-19 regulatory measures and/or demographic features (e.g., state population) add to the explanatory power of this relationship.
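As a baseline for later comparison, this relationship can be summarized with a simple linear fit on the per-state maxima; a minimal sketch using the relation_plot data frame assembled above (benchmark_fit is an illustrative name, not used elsewhere):
benchmark_fit <- lm( max_PI_expected_diff ~ max_percent_PIC, data = relation_plot )
summary( benchmark_fit )$r.squared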
Now we prepare to merge the state-level PIC data with the state-level regulation data.
Checking Unique States in Each Source:
# check to see how joining on state looks
d_pbs2 <- data.frame( SUB.AREA = unique( PIC_by_state_clean$SUB.AREA ) )
d_pbs2$state = d_pbs2$SUB.AREA
d_pbs <- distinct(PIC_by_state_df, SUB.AREA)
d_pbs$state = d_pbs$SUB.AREA
d_reg <- distinct(state_level_regulation_df, State_territory)
d_reg$state <- d_reg$State_territory
key_check <- full_join(d_pbs2, d_reg, by = c('state' = 'state')) %>%
filter(is.na(SUB.AREA) | is.na(State_territory))
key_check
## SUB.AREA state State_territory
## 1 California California <NA>
## 2 Texas Texas <NA>
## 3 <NA> American Samoa American Samoa
## 4 <NA> California (government response) California (government response)
## 5 <NA> Guam Guam
## 6 <NA> North Carolina North Carolina
## 7 <NA> N. Mariana Islands N. Mariana Islands
## 8 <NA> Puerto Rico Puerto Rico
## 9 <NA> Texas (government response) Texas (government response)
## 10 <NA> U.S. Virgin Islands U.S. Virgin Islands
## 11 <NA> West Virginia West Virginia
This output shows the mismatches in state names between the state-level PIC data and the COVID-19 regulation data. We will rename 'California (government response)' to 'California' and drop the territories in the regulation data that are not present in the state-level PIC data. Note that 'Texas (government response)' is left unmatched, so Texas is also excluded by the inner join below.
state_level_regulation_df$State_territory <- as.character( state_level_regulation_df$State_territory )
state_level_regulation_df$State_territory[ state_level_regulation_df$State_territory == 'California (government response)' ] <- 'California'
state_level_regulation_df$State_territory <- factor( state_level_regulation_df$State_territory )
state_level_regulation_df$State_territory
## [1] Alabama Alaska
## [3] American Samoa Arizona
## [5] Arkansas California
## [7] Colorado Connecticut
## [9] Delaware District of Columbia
## [11] Florida Georgia
## [13] Guam Hawaii
## [15] Idaho Illinois
## [17] Indiana Iowa
## [19] Kansas Kentucky
## [21] Louisiana Maine
## [23] Maryland Massachusetts
## [25] Michigan Minnesota
## [27] Mississippi Missouri
## [29] Montana Nebraska
## [31] Nevada New Hampshire
## [33] New Jersey New Mexico
## [35] New York North Carolina
## [37] North Dakota N. Mariana Islands
## [39] Ohio Oklahoma
## [41] Oregon Pennsylvania
## [43] Puerto Rico Rhode Island
## [45] South Carolina South Dakota
## [47] Tennessee Texas (government response)
## [49] Utah U.S. Virgin Islands
## [51] Vermont Virginia
## [53] Washington West Virginia
## [55] Wisconsin Wyoming
## 56 Levels: Alabama Alaska American Samoa Arizona Arkansas ... Wyoming
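Before joining, it is worth confirming exactly which regulation rows will fail to match and therefore be dropped by the inner join; a minimal sketch using anti_join on the objects defined above:
#regulation rows with no counterpart in the cleaned PIC data (these are dropped by the join below)
dplyr::anti_join( state_level_regulation_df,
                  dplyr::distinct( PIC_by_state_clean, SUB.AREA ),
                  by = c( 'State_territory' = 'SUB.AREA' ) ) %>%
  dplyr::pull( State_territory )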
df <- inner_join(data.frame(PIC_by_state_clean), state_level_regulation_df, by = c('SUB.AREA' = 'State_territory'))
names(df) <- tolower(names(df))
str(df)
## 'data.frame': 18624 obs. of 22 variables:
## $ season : chr "2013-14" "2013-14" "2013-14" "2013-14" ...
## $ week : int 1 2 3 4 5 6 7 8 9 10 ...
## $ num.influenza.deaths : num 5 7 12 16 19 9 8 10 9 6 ...
## $ num.pneumonia.deaths : num 279 301 309 289 306 274 299 269 262 273 ...
## $ num.covid.19.deaths : num 0 0 0 0 0 0 0 0 0 0 ...
## $ total.pic : num 284 308 321 305 325 283 307 279 271 279 ...
## $ expected : num 7.17 7.25 7.31 7.36 7.4 ...
## $ threshold : num 7.52 7.59 7.65 7.7 7.74 ...
## $ percent.p.i : num 9.42 9.56 10.53 10.04 10.08 ...
## $ percent.pic : num 9.42 9.56 10.53 10.04 10.08 ...
## $ pi_expected_diff : num 2.25 2.31 3.22 2.68 2.69 ...
## $ pi_threshold_diff : num 1.9 1.97 2.87 2.33 2.34 ...
## $ sub.area : chr "New York" "New York" "New York" "New York" ...
## $ state_of_emergency_declared : Factor w/ 16 levels "February 29",..: 14 14 14 14 14 14 14 14 14 14 ...
## $ stay_at_home_ordered : Factor w/ 24 levels "April 1","April 2",..: 11 11 11 11 11 11 11 11 11 11 ...
## $ face_coverings_required_in_public: Factor w/ 3 levels "No","Varies by county",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ gatherings_banned : Factor w/ 9 levels "10 or more","10 or more (recommended)",..: 8 8 8 8 8 8 8 8 8 8 ...
## $ out_of_state_travel_restrictions : Factor w/ 8 levels "Limited quarantine",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ closures_school : Factor w/ 2 levels "Yes (districts' choice)",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ closures_daycare : Factor w/ 3 levels "No","Restricted",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ closures_bars : Factor w/ 3 levels "No","Restricted",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ closures_retail : Factor w/ 4 levels "No","Regional",..: 4 4 4 4 4 4 4 4 4 4 ...
Some visual explorations of the differences in observed and expected PI mortality in relation to state-level regulations:
# state of emergency
df$soe_week <- as.Date(paste0(df$state_of_emergency_declared, ' 2020'), '%B %d %Y') %>%
strftime(format = '%V') %>%
as.numeric()
# stay at home
df$sah_week <- as.Date(paste0(df$stay_at_home_ordered, ' 2020'), '%B %d %Y') %>%
strftime(format = '%V') %>%
as.numeric()
# sample new york view
ny <- filter(df, sub.area == 'New York' & season %in% c('2019-20', '2020-21'))
ggplot(ny, aes(x = week, y = pi_expected_diff, color = season)) +
geom_point() +
geom_vline(xintercept = ny$soe_week, color = 'blue') +
geom_vline(xintercept = ny$sah_week, color = 'purple')
## Warning: Removed 8 rows containing missing values (geom_point).
This visualization depicts the weekly excess PI mortality for the 2020-21 (blue) and 2019-20 (red) flu seasons, with vertical lines marking when New York State declared a state of emergency (blue) and issued its stay-at-home order (purple).
df2 <- filter(df, season %in% c('2019-20', '2020-21'))
# add weeks after 52
df2$week_v2 <- with(df2, ifelse(season == '2020-21' & week >= 40, (week - 40 + 1) + 52, ifelse(season == '2020-21' & week < 40, week + 52 + 14, week)))
# check against ny, with references lines for state of emergency and stay at home orders
pNY <- filter(df2, sub.area == 'New York') %>%
ggplot(aes(x = week_v2, y = pi_expected_diff, color = season)) +
geom_point() +
geom_vline(aes(xintercept = soe_week), color = 'blue') +
geom_vline(aes(xintercept = sah_week), color = 'purple') +
ggtitle( 'New York State' )
# facet by state
pStates <- ggplot(df2, aes(x = week_v2, y = pi_expected_diff, color = season)) +
geom_point() +
geom_vline(aes(xintercept = soe_week), color = 'blue') +
geom_vline(aes(xintercept = sah_week), color = 'purple') +
facet_wrap(~sub.area, scales = 'free_y') +
ggtitle( 'U.S. States' )
#distinct(df2, sub.area, soe_week, sah_week, face_coverings_required_in_public)
#grid.arrange( pNY, pStates, ncol = 2, widths = c( 6,10 ) )
pNY
## Warning: Removed 8 rows containing missing values (geom_point).
pStates
## Warning: Removed 23 rows containing missing values (geom_point).
## Warning: Removed 825 rows containing missing values (geom_vline).
These visualizations plot the weekly data contiguously across the two seasons (using week_v2). There is a detailed plot for New York as well as an overview plot faceted by state.
Here we incorporate state-level population data from the U.S. Census and look at some additional visualizations of our target variable for excess PI mortality, PI_expected_diff, in relation to the state-level COVID-19 restrictions. These visualizations inform the subsequent data-processing steps that ready the data set for modeling, as the comments adjacent to the plotting code explain.
url <- 'https://raw.githubusercontent.com/SmilodonCub/DATA621/master/nst-est2019-alldata.csv'
pop <- read.csv(url)
names(pop) <- tolower(names(pop))
str(pop)
## 'data.frame': 57 obs. of 151 variables:
## $ sumlev : int 10 20 20 20 20 40 40 40 40 40 ...
## $ region : chr "0" "1" "2" "3" ...
## $ division : chr "0" "0" "0" "0" ...
## $ state : int 0 0 0 0 0 1 2 4 5 6 ...
## $ name : chr "United States" "Northeast Region" "Midwest Region" "South Region" ...
## $ census2010pop : int 308745538 55317240 66927001 114555744 71945553 4779736 710231 6392017 2915918 37253956 ...
## $ estimatesbase2010 : int 308758105 55318443 66929725 114563030 71946907 4780125 710249 6392288 2916031 37254519 ...
## $ popestimate2010 : int 309321666 55380134 66974416 114866680 72100436 4785437 713910 6407172 2921964 37319502 ...
## $ popestimate2011 : int 311556874 55604223 67157800 116006522 72788329 4799069 722128 6472643 2940667 37638369 ...
## $ popestimate2012 : int 313830990 55775216 67336743 117241208 73477823 4815588 730443 6554978 2952164 37948800 ...
## $ popestimate2013 : int 315993715 55901806 67560379 118364400 74167130 4830081 737068 6632764 2959400 38260787 ...
## $ popestimate2014 : int 318301008 56006011 67745167 119624037 74925793 4841799 736283 6730413 2967392 38596972 ...
## $ popestimate2015 : int 320635163 56034684 67860583 120997341 75742555 4852347 737498 6829676 2978048 38918045 ...
## $ popestimate2016 : int 322941311 56042330 67987540 122351760 76559681 4863525 741456 6941072 2989918 39167117 ...
## $ popestimate2017 : int 324985539 56059240 68126781 123542189 77257329 4874486 739700 7044008 3001345 39358497 ...
## $ popestimate2018 : int 326687501 56046620 68236628 124569433 77834820 4887681 735139 7158024 3009733 39461588 ...
## $ popestimate2019 : int 328239523 55982803 68329004 125580448 78347268 4903185 731545 7278717 3017804 39512223 ...
## $ npopchg_2010 : int 563561 61691 44691 303650 153529 5312 3661 14884 5933 64983 ...
## $ npopchg_2011 : int 2235208 224089 183384 1139842 687893 13632 8218 65471 18703 318867 ...
## $ npopchg_2012 : int 2274116 170993 178943 1234686 689494 16519 8315 82335 11497 310431 ...
## $ npopchg_2013 : int 2162725 126590 223636 1123192 689307 14493 6625 77786 7236 311987 ...
## $ npopchg_2014 : int 2307293 104205 184788 1259637 758663 11718 -785 97649 7992 336185 ...
## $ npopchg_2015 : int 2334155 28673 115416 1373304 816762 10548 1215 99263 10656 321073 ...
## $ npopchg_2016 : int 2306148 7646 126957 1354419 817126 11178 3958 111396 11870 249072 ...
## $ npopchg_2017 : int 2044228 16910 139241 1190429 697648 10961 -1756 102936 11427 191380 ...
## $ npopchg_2018 : int 1701962 -12620 109847 1027244 577491 13195 -4561 114016 8388 103091 ...
## $ npopchg_2019 : int 1552022 -63817 92376 1011015 512448 15504 -3594 120693 8071 50635 ...
## $ births2010 : int 987836 163466 212570 368759 243041 14226 2892 20934 9438 123327 ...
## $ births2011 : int 3973485 646249 834866 1509634 982736 59690 11716 86097 38453 509771 ...
## $ births2012 : int 3936976 637860 830701 1504955 963460 59067 11124 85577 38594 497451 ...
## $ births2013 : int 3940576 635751 830890 1504774 969161 57929 11350 86093 37978 499629 ...
## $ births2014 : int 3963195 632433 836538 1525313 968911 58903 11443 86129 38225 498914 ...
## $ births2015 : int 3992376 634504 838012 1545679 974181 59647 11330 86764 38796 500380 ...
## $ births2016 : int 3962654 628030 831641 1541412 961571 59389 11246 84957 38612 490358 ...
## $ births2017 : int 3901982 618490 818785 1519344 945363 58961 10803 83021 37737 481943 ...
## $ births2018 : int 3824521 610713 801587 1494950 917271 58271 10284 81767 37239 465017 ...
## $ births2019 : int 3791712 602740 792343 1481244 915385 57313 10031 81942 36640 462617 ...
## $ deaths2010 : int 598691 110878 140862 228435 118516 11075 920 11505 6979 57319 ...
## $ deaths2011 : int 2512442 470811 586660 963051 491920 48833 3937 48174 29455 238388 ...
## $ deaths2012 : int 2501531 460915 584671 960791 495154 48366 3940 48754 29772 239535 ...
## $ deaths2013 : int 2608019 479963 605165 1011227 511664 50851 4039 50612 30994 247704 ...
## $ deaths2014 : int 2582448 470268 596898 1006109 509173 49712 4024 50236 29967 244055 ...
## $ deaths2015 : int 2699826 489209 626084 1052226 532307 51876 4350 52846 31318 253798 ...
## $ deaths2016 : int 2703215 480502 619187 1058081 545445 51710 4447 56032 31134 260121 ...
## $ deaths2017 : int 2788163 497919 640407 1092315 557522 53195 4463 56333 32340 265241 ...
## $ deaths2018 : int 2824382 503076 627911 1116929 576466 53665 4664 60127 31743 277578 ...
## $ deaths2019 : int 2835038 505588 622854 1122130 584466 53879 4819 60523 31322 282520 ...
## $ naturalinc2010 : int 389145 52588 71708 140324 124525 3151 1972 9429 2459 66008 ...
## $ naturalinc2011 : int 1461043 175438 248206 546583 490816 10857 7779 37923 8998 271383 ...
## $ naturalinc2012 : int 1435445 176945 246030 544164 468306 10701 7184 36823 8822 257916 ...
## $ naturalinc2013 : int 1332557 155788 225725 493547 457497 7078 7311 35481 6984 251925 ...
## $ naturalinc2014 : int 1380747 162165 239640 519204 459738 9191 7419 35893 8258 254859 ...
## $ naturalinc2015 : int 1292550 145295 211928 493453 441874 7771 6980 33918 7478 246582 ...
## $ naturalinc2016 : int 1259439 147528 212454 483331 416126 7679 6799 28925 7478 230237 ...
## $ naturalinc2017 : int 1113819 120571 178378 427029 387841 5766 6340 26688 5397 216702 ...
## $ naturalinc2018 : int 1000139 107637 173676 378021 340805 4606 5620 21640 5496 187439 ...
## $ naturalinc2019 : int 956674 97152 169489 359114 330919 3434 5212 21419 5318 180097 ...
## $ internationalmig2010 : int 174416 45032 24720 67464 37200 924 430 2550 801 19635 ...
## $ internationalmig2011 : int 774165 204242 114945 279639 175339 4665 1314 16489 3662 99179 ...
## $ internationalmig2012 : int 838671 205018 118864 338025 176764 5817 2348 15100 3732 98297 ...
## $ internationalmig2013 : int 830168 191468 124489 323413 190798 5046 2863 17997 2321 116528 ...
## $ internationalmig2014 : int 926546 219965 132052 358590 215939 3684 1198 21312 3100 131001 ...
## $ internationalmig2015 : int 1041605 224592 140410 422001 254602 4580 2273 19511 3303 156870 ...
## $ internationalmig2016 : int 1046709 234098 143098 428463 241050 5777 2303 20062 3366 141892 ...
## $ internationalmig2017 : int 930409 213507 124250 395903 196749 3011 2612 13385 2039 112814 ...
## $ internationalmig2018 : int 701823 179097 94152 299648 128926 3379 537 7937 588 71300 ...
## $ internationalmig2019 : int 595348 134145 85675 242942 132586 2772 659 7782 268 74028 ...
## $ domesticmig2010 : int 0 -32807 -50673 91267 -7787 1244 1168 3122 2469 -20356 ...
## $ domesticmig2011 : int 0 -154970 -179592 312610 21952 -1893 -892 11062 5996 -51326 ...
## $ domesticmig2012 : int 0 -207154 -184359 344278 47235 -114 -1319 29844 -1293 -41795 ...
## $ domesticmig2013 : int 0 -217700 -124956 300008 42648 2297 -3564 24050 -2150 -53692 ...
## $ domesticmig2014 : int 0 -275141 -183669 374593 84217 -959 -9641 39948 -3371 -46321 ...
## $ domesticmig2015 : int 0 -340104 -234550 453543 121111 -1544 -8166 45561 -106 -79938 ...
## $ domesticmig2016 : int 0 -373661 -227445 441519 159587 -2157 -5185 62320 1069 -122369 ...
## $ domesticmig2017 : int 0 -316475 -162183 366143 112515 2298 -10753 62686 3981 -137546 ...
## $ domesticmig2018 : int 0 -298739 -157067 348608 107198 5279 -10757 84113 2294 -155281 ...
## $ domesticmig2019 : int 0 -294331 -161549 407913 47967 9387 -9482 91017 2515 -203414 ...
## $ netmig2010 : int 174416 12225 -25953 158731 29413 2168 1598 5672 3270 -721 ...
## $ netmig2011 : int 774165 49272 -64647 592249 197291 2772 422 27551 9658 47853 ...
## $ netmig2012 : int 838671 -2136 -65495 682303 223999 5703 1029 44944 2439 56502 ...
## $ netmig2013 : int 830168 -26232 -467 623421 233446 7343 -701 42047 171 62836 ...
## $ netmig2014 : int 926546 -55176 -51617 733183 300156 2725 -8443 61260 -271 84680 ...
## $ netmig2015 : int 1041605 -115512 -94140 875544 375713 3036 -5893 65072 3197 76932 ...
## $ netmig2016 : int 1046709 -139563 -84347 869982 400637 3620 -2882 82382 4435 19523 ...
## $ netmig2017 : int 930409 -102968 -37933 762046 309264 5309 -8141 76071 6020 -24732 ...
## $ netmig2018 : int 701823 -119642 -62915 648256 236124 8658 -10220 92050 2882 -83981 ...
## $ netmig2019 : int 595348 -160186 -75874 650855 180553 12159 -8823 98799 2783 -129386 ...
## $ residual2010 : int 0 -3122 -1064 4595 -409 -7 91 -217 204 -304 ...
## $ residual2011 : int 0 -621 -175 1010 -214 3 17 -3 47 -369 ...
## $ residual2012 : int 0 -3816 -1592 8219 -2811 115 102 568 236 -3987 ...
## $ residual2013 : int 0 -2966 -1622 6224 -1636 72 15 258 81 -2774 ...
## $ residual2014 : int 0 -2784 -3235 7250 -1231 -198 239 496 5 -3354 ...
## $ residual2015 : int 0 -1110 -2372 4307 -825 -259 128 273 -19 -2441 ...
## $ residual2016 : int 0 -319 -1150 1106 363 -121 41 89 -43 -688 ...
## $ residual2017 : int 0 -693 -1204 1354 543 -114 45 177 10 -590 ...
## $ residual2018 : int 0 -615 -914 967 562 -69 39 326 10 -367 ...
## $ residual2019 : int 0 -783 -1239 1046 976 -89 17 475 -30 -76 ...
## $ rbirth2011 : num 12.8 11.6 12.4 13.1 13.6 ...
## $ rbirth2012 : num 12.6 11.5 12.4 12.9 13.2 ...
## [list output truncated]
table(pop$name)
##
## Alabama Alaska Arizona
## 1 1 1
## Arkansas California Colorado
## 1 1 1
## Connecticut Delaware District of Columbia
## 1 1 1
## Florida Georgia Hawaii
## 1 1 1
## Idaho Illinois Indiana
## 1 1 1
## Iowa Kansas Kentucky
## 1 1 1
## Louisiana Maine Maryland
## 1 1 1
## Massachusetts Michigan Midwest Region
## 1 1 1
## Minnesota Mississippi Missouri
## 1 1 1
## Montana Nebraska Nevada
## 1 1 1
## New Hampshire New Jersey New Mexico
## 1 1 1
## New York North Carolina North Dakota
## 1 1 1
## Northeast Region Ohio Oklahoma
## 1 1 1
## Oregon Pennsylvania Puerto Rico
## 1 1 1
## Rhode Island South Carolina South Dakota
## 1 1 1
## South Region Tennessee Texas
## 1 1 1
## United States Utah Vermont
## 1 1 1
## Virginia Washington West Region
## 1 1 1
## West Virginia Wisconsin Wyoming
## 1 1 1
pop_est <- select(pop, name, popestimate2019)
df2 <- left_join(df2, pop_est, by = c('sub.area' = 'name'))
str(df2)
## 'data.frame': 3600 obs. of 26 variables:
## $ season : chr "2019-20" "2019-20" "2019-20" "2019-20" ...
## $ week : int 1 2 3 4 5 6 7 8 9 10 ...
## $ num.influenza.deaths : num 30 21 38 22 36 40 30 37 24 32 ...
## $ num.pneumonia.deaths : num 290 274 286 285 241 256 257 240 256 260 ...
## $ num.covid.19.deaths : num 0 0 0 0 0 1 0 1 0 0 ...
## $ total.pic : num 320 295 324 307 277 296 287 278 280 292 ...
## $ expected : num 7.17 7.25 7.31 7.36 7.4 ...
## $ threshold : num 7.52 7.59 7.65 7.7 7.74 ...
## $ percent.p.i : num 9.63 9.06 9.77 9.51 8.58 ...
## $ percent.pic : num 9.63 9.06 9.77 9.51 8.58 ...
## $ pi_expected_diff : num 2.46 1.82 2.46 2.15 1.19 ...
## $ pi_threshold_diff : num 2.11 1.47 2.119 1.81 0.844 ...
## $ sub.area : chr "New York" "New York" "New York" "New York" ...
## $ state_of_emergency_declared : Factor w/ 16 levels "February 29",..: 14 14 14 14 14 14 14 14 14 14 ...
## $ stay_at_home_ordered : Factor w/ 24 levels "April 1","April 2",..: 11 11 11 11 11 11 11 11 11 11 ...
## $ face_coverings_required_in_public: Factor w/ 3 levels "No","Varies by county",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ gatherings_banned : Factor w/ 9 levels "10 or more","10 or more (recommended)",..: 8 8 8 8 8 8 8 8 8 8 ...
## $ out_of_state_travel_restrictions : Factor w/ 8 levels "Limited quarantine",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ closures_school : Factor w/ 2 levels "Yes (districts' choice)",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ closures_daycare : Factor w/ 3 levels "No","Restricted",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ closures_bars : Factor w/ 3 levels "No","Restricted",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ closures_retail : Factor w/ 4 levels "No","Regional",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ soe_week : num 10 10 10 10 10 10 10 10 10 10 ...
## $ sah_week : num 12 12 12 12 12 12 12 12 12 12 ...
## $ week_v2 : num 1 2 3 4 5 6 7 8 9 10 ...
## $ popestimate2019 : int 19453561 19453561 19453561 19453561 19453561 19453561 19453561 19453561 19453561 19453561 ...
df2$pct_covid_deaths <- df2$num.covid.19.deaths / df2$popestimate2019
## weekly COVID-19 deaths as a share of state population
ggplot(df2, aes(x = week_v2, y = pct_covid_deaths)) +
geom_line() +
facet_wrap(~sub.area)
## percent pi vs pic
ggplot(df2, aes(x = week_v2, y = percent.p.i / 100)) +
geom_line() +
geom_line(aes(y = percent.pic / 100), color = 'orange') +
facet_wrap(~sub.area)
## showing different facets
ggplot(df2, aes(x = week_v2, y = pi_expected_diff, color = sub.area)) +
geom_line() +
facet_wrap(~face_coverings_required_in_public, nrow = n_distinct(df$face_coverings_required_in_public))
## Warning: Removed 15 row(s) containing missing values (geom_path).
### maybe look at averages instead
## face coverings v2
group_by(df2, week_v2, face_coverings_required_in_public) %>%
summarize(avg_excess_pi = mean(pi_expected_diff, na.rm = T),
.groups = 'drop') %>%
ggplot(aes(x = week_v2, y = avg_excess_pi, color = face_coverings_required_in_public)) +
geom_line() +
theme(legend.position = 'bottom')
## gatherings banned
group_by(df2, week_v2, gatherings_banned) %>%
summarize(avg_excess_pi = mean(pi_expected_diff, na.rm = T),
.groups = 'drop') %>%
ggplot(aes(x = week_v2, y = avg_excess_pi, color = gatherings_banned)) +
geom_line()
## Warning: Removed 4 row(s) containing missing values (geom_path).
## travel restrictions
group_by(df2, week_v2, out_of_state_travel_restrictions) %>%
summarize(avg_excess_pi = mean(pi_expected_diff, na.rm = T),
.groups = 'drop') %>%
ggplot(aes(x = week_v2, y = avg_excess_pi, color = out_of_state_travel_restrictions)) +
geom_line()
## Warning: Removed 2 row(s) containing missing values (geom_path).
## closures - school
group_by(df2, week_v2, closures_school) %>%
summarize(avg_excess_pi = mean(pi_expected_diff, na.rm = T),
.groups = 'drop') %>%
ggplot(aes(x = week_v2, y = avg_excess_pi, color = closures_school)) +
geom_line()
## Warning: Removed 1 row(s) containing missing values (geom_path).
## closures - daycare
group_by(df2, week_v2, closures_daycare) %>%
summarize(avg_excess_pi = mean(pi_expected_diff, na.rm = T),
.groups = 'drop') %>%
ggplot(aes(x = week_v2, y = avg_excess_pi, color = closures_daycare)) +
geom_line()
## closures - bars
group_by(df2, week_v2, closures_bars) %>%
summarize(avg_excess_pi = mean(pi_expected_diff, na.rm = T),
.groups = 'drop') %>%
ggplot(aes(x = week_v2, y = avg_excess_pi, color = closures_bars)) +
geom_line()
## Warning: Removed 1 row(s) containing missing values (geom_path).
## closures - retail
group_by(df2, week_v2, closures_retail) %>%
summarize(avg_excess_pi = mean(pi_expected_diff, na.rm = T),
.groups = 'drop') %>%
ggplot(aes(x = week_v2, y = avg_excess_pi, color = closures_retail)) +
geom_line()
## state of emergency timing
group_by(df2, week_v2, soe_week) %>%
summarize(avg_excess_pi = mean(pi_expected_diff, na.rm = T),
.groups = 'drop') %>%
ggplot(aes(x = week_v2, y = avg_excess_pi, color = factor(soe_week))) +
geom_line()
## stay at home timing
group_by(df2, week_v2, sah_week) %>%
summarize(avg_excess_pi = mean(pi_expected_diff, na.rm = T),
.groups = 'drop') %>%
ggplot(aes(x = week_v2, y = avg_excess_pi, color = factor(sah_week))) +
geom_line()
model_dat <- df %>%
filter( week %in% c( 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 1, 2, 3, 4, 5, 6, 7, 8 )) %>%
dplyr::group_by( sub.area ) %>%
dplyr::mutate( PI_expected_diff_imp = na_kalman( pi_expected_diff ),
num = row_number() )
#join population estimate
model_dat <- left_join(model_dat, pop_est, by = c('sub.area' = 'name'))
# engineer a feature from the pre-COVID flu seasons (before 2019-20): the mean difference between observed and expected PI mortality
preCOVID_diff <- model_dat %>%
filter( !season %in% c('2019-20', '2020-21') ) %>%
rowwise() %>%
mutate( season2 = list(strsplit(season, split = "-")[[1]][1])) %>%
select( c( season2, week, sub.area, pi_expected_diff )) %>%
pivot_wider( names_from = season2, names_prefix = 'S', values_from = pi_expected_diff) %>%
mutate( mean_preCOVID_diff = rowMeans(dplyr::select( ., S2013:S2018), na.rm = TRUE) ) %>%
select( c( week, sub.area, mean_preCOVID_diff ) )
train_dat <- model_dat %>%
filter( season == '2020-21' )
#merge the preCOVID_diff feature
train_dat <- merge( train_dat, preCOVID_diff, by.x = c('week', 'sub.area'), by.y = c('week', 'sub.area' ))
#remove features that won't contribute to the model
train_dat <- train_dat %>%
select( -c( season, num.influenza.deaths, num.pneumonia.deaths, num.covid.19.deaths, closures_school,
total.pic, state_of_emergency_declared, stay_at_home_ordered, pi_threshold_diff, threshold ) ) %>%
mutate_if( is.character, as.factor )
glimpse( train_dat )
## Rows: 1,056
## Columns: 18
## $ week <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ sub.area <fct> Alabama, Alaska, Arizona, Arkansa...
## $ expected <dbl> 7.174014, 7.174014, 7.174014, 7.1...
## $ percent.p.i <dbl> 14.4, 8.3, 26.6, 19.7, 29.5, 12.4...
## $ percent.pic <dbl> 34.2, 14.6, 41.6, 32.3, 46.9, 22....
## $ pi_expected_diff <dbl> 7.225986, 1.125986, 19.425986, 12...
## $ face_coverings_required_in_public <fct> Yes, No, No, Yes, Yes, Yes, Yes, ...
## $ gatherings_banned <fct> "10 or more", "10 or more", "50 o...
## $ out_of_state_travel_restrictions <fct> No, Mandatory quarantine, Limited...
## $ closures_daycare <fct> Yes, Yes, Yes, Yes, Yes, Restrict...
## $ closures_bars <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes...
## $ closures_retail <fct> Yes, Yes, Yes, Regional, Yes, Yes...
## $ soe_week <dbl> 11, 11, 11, 11, 10, 11, 11, 11, 1...
## $ sah_week <dbl> 14, 13, 14, NA, 12, 13, 13, 13, 1...
## $ PI_expected_diff_imp <dbl> 7.225986, 1.125986, 19.425986, 12...
## $ num <int> 15, 15, 15, 15, 15, 15, 15, 15, 1...
## $ popestimate2019 <int> 4903185, 731545, 7278717, 3017804...
## $ mean_preCOVID_diff <dbl> 0.8093196, -1.5406804, 0.5259862,...
summary( train_dat )
## week sub.area expected percent.p.i
## Min. : 1.00 Alabama : 22 Min. :5.805 Min. : 0.00
## 1st Qu.: 6.00 Alaska : 22 1st Qu.:6.306 1st Qu.: 8.60
## Median :42.50 Arizona : 22 Median :6.921 Median :12.05
## Mean :31.23 Arkansas : 22 Mean :6.795 Mean :12.70
## 3rd Qu.:48.00 California: 22 3rd Qu.:7.309 3rd Qu.:16.20
## Max. :53.00 Colorado : 22 Max. :7.427 Max. :31.80
## (Other) :924 NA's :6
## percent.pic pi_expected_diff face_coverings_required_in_public
## Min. : 0.00 Min. :-7.422 No :396
## 1st Qu.:13.90 1st Qu.: 2.011 Varies by county: 22
## Median :20.95 Median : 5.265 Yes :638
## Mean :22.06 Mean : 5.908
## 3rd Qu.:29.30 3rd Qu.: 9.098
## Max. :54.40 Max. :24.553
## NA's :8 NA's :6
## gatherings_banned
## 10 or more :462
## All :286
## 50 or more :110
## 11 or more : 88
## 10 or more (recommended) : 22
## 11 or more, and public gathering in public places: 22
## (Other) : 66
## out_of_state_travel_restrictions closures_daycare
## No :462 No : 88
## Mandatory quarantine :396 Restricted: 88
## Limited quarantine :132 Yes :880
## Limited quarantine / Screened: 22
## Recommended quarantine : 22
## Regional : 22
## (Other) : 0
## closures_bars closures_retail soe_week sah_week
## No : 22 No : 66 Min. : 9.00 Min. :12.00
## Restricted: 22 Regional : 44 1st Qu.:10.75 1st Qu.:13.00
## Yes :1012 Restricted: 88 Median :11.00 Median :13.00
## Yes :858 Mean :10.73 Mean :13.35
## 3rd Qu.:11.00 3rd Qu.:14.00
## Max. :12.00 Max. :15.00
## NA's :242
## PI_expected_diff_imp num popestimate2019 mean_preCOVID_diff
## Min. :-7.422 Min. : 1.00 Min. : 578759 Min. :-4.8947
## 1st Qu.: 2.006 1st Qu.: 6.00 1st Qu.: 1694267 1st Qu.:-0.4009
## Median : 5.228 Median : 12.00 Median : 4342705 Median : 0.4690
## Mean : 5.887 Mean : 14.58 Mean : 5978404 Mean : 0.6844
## 3rd Qu.: 9.057 3rd Qu.: 17.00 3rd Qu.: 6989056 3rd Qu.: 1.5053
## Max. :24.553 Max. :170.00 Max. :39512223 Max. : 8.6053
##
98% of states closed schools for the remainder of the term, so we will not consider this feature. Here we create flags for several of the regulation variables to simplify the modeling approach:
train_dat <- train_dat %>%
mutate( yes_closures_bars = closures_bars == 'Yes', # Bar closures == Yes
yes_closures_daycare = closures_daycare == 'Yes', # Daycare closures == Yes
yes_closures_retail = closures_retail == 'Yes', # retail closures == Yes
yes_face_coverings = face_coverings_required_in_public == 'Yes', # Face coverings required in public == Yes
yes_travel_restrictions = out_of_state_travel_restrictions != 'No',
# Out of State travel restrictions != No (some form or restriction enacted)
all_gatherings_banned = gatherings_banned == 'All' ) %>% # All Gathering Banned
mutate_if( is.logical, as.factor )
Train, validation, and test sets
#split train_dat into train, validation, and test sets
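#note: no random seed is set here, so the exact split (and the model summaries below) will vary between runs;
#calling set.seed() before sampling would make the split reproducible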
splitSample <- sample(1:3, size=nrow(train_dat), prob=c(0.7,0.15,0.15), replace = TRUE)
train_PI <- train_dat[splitSample==1,]
valid_PI <- train_dat[splitSample==2,]
test_PI <- train_dat[splitSample==3,]
#str( train_PI )
We will use the simple linear model Excess PI Mortality ~ PIC Mortality as a benchmark.
Model 1: Simple Linear Regression
model1 <- lm( data = train_PI, PI_expected_diff_imp ~ percent.pic )
summary( model1 )
##
## Call:
## lm(formula = PI_expected_diff_imp ~ percent.pic, data = train_PI)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.5737 -1.3490 -0.0694 1.2506 11.2639
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.22529 0.20751 -20.36 <2e-16 ***
## percent.pic 0.45729 0.00841 54.38 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.304 on 718 degrees of freedom
## (6 observations deleted due to missingness)
## Multiple R-squared: 0.8046, Adjusted R-squared: 0.8043
## F-statistic: 2957 on 1 and 718 DF, p-value: < 2.2e-16
ggplot( data = train_PI, aes( x = percent.pic, y = PI_expected_diff_imp ) ) +
geom_point() +
geom_smooth( method = 'lm', se = F, color = 'black' ) +
theme_classic()
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 6 rows containing non-finite values (stat_smooth).
## Warning: Removed 6 rows containing missing values (geom_point).
Diagnostic plots
plot( model1 )
The simple linear model gives a statistically significant fit to the data and has a relatively high \(R^2\) value. However, the diagnostic plots suggest that the residual variance is not constant across the range of the data. We will attempt to explain more of the variance by adding features to the model, in the hope of increasing its predictive power.
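One way to check the non-constant residual variance more formally is a Breusch-Pagan test; a minimal sketch, assuming the lmtest package (an extra dependency not loaded above) is installed:
library( lmtest )
bptest( model1 ) #a small p-value is consistent with the heteroscedasticity suggested by the residual plots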
Model 2: Multiple Linear Regression
Adding feature variables for government regulations:
mlm <- PI_expected_diff_imp ~ percent.pic + soe_week + sah_week + yes_closures_daycare + yes_closures_retail +
yes_face_coverings + yes_travel_restrictions + all_gatherings_banned
model2 <- lm( formula = mlm, data = train_PI )
summary( model2 )
##
## Call:
## lm(formula = mlm, data = train_PI)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.7475 -1.1220 -0.0027 1.1443 10.6079
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.8862 3.3489 -1.459 0.145142
## percent.pic 0.4790 0.0103 46.506 < 2e-16 ***
## soe_week -0.5948 0.1642 -3.623 0.000319 ***
## sah_week 0.3978 0.1743 2.282 0.022889 *
## yes_closures_daycareTRUE 0.5817 0.2815 2.067 0.039229 *
## yes_closures_retailTRUE 1.6495 0.3436 4.801 2.05e-06 ***
## yes_face_coveringsTRUE -0.2041 0.2342 -0.871 0.383902
## yes_travel_restrictionsTRUE -0.5680 0.2219 -2.560 0.010733 *
## all_gatherings_bannedTRUE -0.3411 0.2746 -1.242 0.214742
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.242 on 539 degrees of freedom
## (178 observations deleted due to missingness)
## Multiple R-squared: 0.8159, Adjusted R-squared: 0.8131
## F-statistic: 298.5 on 8 and 539 DF, p-value: < 2.2e-16
step_m2 <- step( model2 )
## Start: AIC=893.71
## PI_expected_diff_imp ~ percent.pic + soe_week + sah_week + yes_closures_daycare +
## yes_closures_retail + yes_face_coverings + yes_travel_restrictions +
## all_gatherings_banned
##
## Df Sum of Sq RSS AIC
## - yes_face_coverings 1 3.8 2712.7 892.48
## - all_gatherings_banned 1 7.8 2716.6 893.28
## <none> 2708.9 893.71
## - yes_closures_daycare 1 21.5 2730.3 896.04
## - sah_week 1 26.2 2735.0 896.98
## - yes_travel_restrictions 1 32.9 2741.8 898.34
## - soe_week 1 66.0 2774.8 904.90
## - yes_closures_retail 1 115.9 2824.7 914.66
## - percent.pic 1 10869.9 13578.8 1775.07
##
## Step: AIC=892.48
## PI_expected_diff_imp ~ percent.pic + soe_week + sah_week + yes_closures_daycare +
## yes_closures_retail + yes_travel_restrictions + all_gatherings_banned
##
## Df Sum of Sq RSS AIC
## - all_gatherings_banned 1 7.2 2719.9 891.94
## <none> 2712.7 892.48
## - yes_closures_daycare 1 21.1 2733.8 894.73
## - yes_travel_restrictions 1 29.4 2742.1 896.39
## - sah_week 1 37.9 2750.6 898.09
## - soe_week 1 64.3 2777.0 903.32
## - yes_closures_retail 1 122.7 2835.4 914.73
## - percent.pic 1 10935.7 13648.4 1775.88
##
## Step: AIC=891.94
## PI_expected_diff_imp ~ percent.pic + soe_week + sah_week + yes_closures_daycare +
## yes_closures_retail + yes_travel_restrictions
##
## Df Sum of Sq RSS AIC
## <none> 2719.9 891.94
## - yes_closures_daycare 1 22.6 2742.5 894.48
## - yes_travel_restrictions 1 23.8 2743.7 894.72
## - soe_week 1 57.9 2777.9 901.49
## - sah_week 1 76.9 2796.8 905.21
## - yes_closures_retail 1 141.1 2861.1 917.67
## - percent.pic 1 10934.5 13654.4 1774.12
summary( step_m2 )
##
## Call:
## lm(formula = PI_expected_diff_imp ~ percent.pic + soe_week +
## sah_week + yes_closures_daycare + yes_closures_retail + yes_travel_restrictions,
## data = train_PI)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.5314 -1.1072 -0.0158 1.1423 10.4794
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.88802 2.66609 -2.959 0.003225 **
## percent.pic 0.47807 0.01025 46.636 < 2e-16 ***
## soe_week -0.54266 0.15985 -3.395 0.000737 ***
## sah_week 0.55038 0.14077 3.910 0.000104 ***
## yes_closures_daycareTRUE 0.59589 0.28099 2.121 0.034402 *
## yes_closures_retailTRUE 1.76926 0.33391 5.299 1.7e-07 ***
## yes_travel_restrictionsTRUE -0.44996 0.20678 -2.176 0.029986 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.242 on 541 degrees of freedom
## (178 observations deleted due to missingness)
## Multiple R-squared: 0.8151, Adjusted R-squared: 0.8131
## F-statistic: 397.5 on 6 and 541 DF, p-value: < 2.2e-16
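Although the train/validation/test split was created earlier, it has not yet been used; below is a minimal sketch of how the benchmark and the stepwise model could be compared on the validation set (rmse is an illustrative helper defined here, not part of the workflow above):
rmse <- function( fit, newdata ) {
  pred <- predict( fit, newdata = newdata )
  sqrt( mean( ( newdata$PI_expected_diff_imp - pred )^2, na.rm = TRUE ) )
}
c( model1 = rmse( model1, valid_PI ), step_m2 = rmse( step_m2, valid_PI ) )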
Adding further feature variables: the pre-COVID mean excess PI mortality (mean_preCOVID_diff), the 2019 population estimate (popestimate2019), and the full multi-level regulation factors in place of the binary flags:
#diagnose why only one level remains for the closures_bars variable once incomplete cases are dropped
#(likely because the few states without full bar closures also lack a stay-at-home order, so sah_week is NA and na.omit() removes all of their rows)
debug_contr_error <- function (dat, subset_vec = NULL) {
if (!is.null(subset_vec)) {
## step 0
if (mode(subset_vec) == "logical") {
if (length(subset_vec) != nrow(dat)) {
stop("'logical' `subset_vec` provided but length does not match `nrow(dat)`")
}
subset_log_vec <- subset_vec
} else if (mode(subset_vec) == "numeric") {
## check range
ran <- range(subset_vec)
if (ran[1] < 1 || ran[2] > nrow(dat)) {
stop("'numeric' `subset_vec` provided but values are out of bound")
} else {
subset_log_vec <- logical(nrow(dat))
subset_log_vec[as.integer(subset_vec)] <- TRUE
}
} else {
stop("`subset_vec` must be either 'logical' or 'numeric'")
}
dat <- base::subset(dat, subset = subset_log_vec)
} else {
## step 1
dat <- stats::na.omit(dat)
}
if (nrow(dat) == 0L) warning("no complete cases")
## step 2
var_mode <- sapply(dat, mode)
if (any(var_mode %in% c("complex", "raw"))) stop("complex or raw not allowed!")
var_class <- sapply(dat, class)
if (any(var_mode[var_class == "AsIs"] %in% c("logical", "character"))) {
stop("matrix variables with 'AsIs' class must be 'numeric'")
}
ind1 <- which(var_mode %in% c("logical", "character"))
dat[ind1] <- lapply(dat[ind1], as.factor)
## step 3
fctr <- which(sapply(dat, is.factor))
if (length(fctr) == 0L) warning("no factor variables to summary")
ind2 <- if (length(ind1) > 0L) fctr[-ind1] else fctr
dat[ind2] <- lapply(dat[ind2], base::droplevels.factor)
## step 4
lev <- lapply(dat[fctr], base::levels.default)
nl <- lengths(lev)
## return
list(nlevels = nl, levels = lev)
}
debug_contr_error( train_dat )
## $nlevels
## sub.area face_coverings_required_in_public
## 37 3
## gatherings_banned out_of_state_travel_restrictions
## 9 6
## closures_daycare closures_bars
## 3 1
## closures_retail yes_closures_bars
## 2 1
## yes_closures_daycare yes_closures_retail
## 2 2
## yes_face_coverings yes_travel_restrictions
## 2 2
## all_gatherings_banned
## 2
##
## $levels
## $levels$sub.area
## [1] "Alabama" "Alaska" "Arizona"
## [4] "California" "Colorado" "Connecticut"
## [7] "Delaware" "District of Columbia" "Florida"
## [10] "Georgia" "Hawaii" "Idaho"
## [13] "Illinois" "Indiana" "Kansas"
## [16] "Louisiana" "Maine" "Maryland"
## [19] "Michigan" "Minnesota" "Mississippi"
## [22] "Missouri" "Montana" "Nevada"
## [25] "New Hampshire" "New Jersey" "New Mexico"
## [28] "New York" "Ohio" "Oregon"
## [31] "Pennsylvania" "Rhode Island" "South Carolina"
## [34] "Tennessee" "Vermont" "Virginia"
## [37] "Washington"
##
## $levels$face_coverings_required_in_public
## [1] "No" "Varies by county" "Yes"
##
## $levels$gatherings_banned
## [1] "10 or more"
## [2] "10 or more (recommended)"
## [3] "11 or more"
## [4] "11 or more, and public gathering in public places"
## [5] "25 or more"
## [6] "50 or more"
## [7] "6 or more"
## [8] "All"
## [9] "All outside, and 11 or more inside a household"
##
## $levels$out_of_state_travel_restrictions
## [1] "Limited quarantine" "Limited quarantine / Screened"
## [3] "Mandatory quarantine" "No"
## [5] "Recommended quarantine" "Regional"
##
## $levels$closures_daycare
## [1] "No" "Restricted" "Yes"
##
## $levels$closures_bars
## [1] "Yes"
##
## $levels$closures_retail
## [1] "Restricted" "Yes"
##
## $levels$yes_closures_bars
## [1] "TRUE"
##
## $levels$yes_closures_daycare
## [1] "FALSE" "TRUE"
##
## $levels$yes_closures_retail
## [1] "FALSE" "TRUE"
##
## $levels$yes_face_coverings
## [1] "FALSE" "TRUE"
##
## $levels$yes_travel_restrictions
## [1] "FALSE" "TRUE"
##
## $levels$all_gatherings_banned
## [1] "FALSE" "TRUE"
str( train_PI )
## 'data.frame': 726 obs. of 24 variables:
## $ week : int 1 1 1 1 1 1 1 1 1 1 ...
## $ sub.area : Factor w/ 48 levels "Alabama","Alaska",..: 1 2 3 4 5 7 8 9 10 11 ...
## $ expected : num 7.17 7.17 7.17 7.17 7.17 ...
## $ percent.p.i : num 14.4 8.3 26.6 19.7 29.5 14.8 16.8 15.4 17.9 21.5 ...
## $ percent.pic : num 34.2 14.6 41.6 32.3 46.9 32.1 29.6 24.5 23.9 33.5 ...
## $ pi_expected_diff : num 7.23 1.13 19.43 12.53 22.33 ...
## $ face_coverings_required_in_public: Factor w/ 3 levels "No","Varies by county",..: 3 1 1 3 3 3 3 3 1 1 ...
## $ gatherings_banned : Factor w/ 9 levels "10 or more","10 or more (recommended)",..: 1 1 6 1 8 8 8 1 1 1 ...
## $ out_of_state_travel_restrictions : Factor w/ 8 levels "Limited quarantine",..: 4 3 1 4 4 5 3 4 2 4 ...
## $ closures_daycare : Factor w/ 3 levels "No","Restricted",..: 3 3 3 3 3 3 3 3 3 1 ...
## $ closures_bars : Factor w/ 3 levels "No","Restricted",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ closures_retail : Factor w/ 4 levels "No","Regional",..: 4 4 4 2 4 4 4 4 4 4 ...
## $ soe_week : num 11 11 11 11 10 11 11 11 9 11 ...
## $ sah_week : num 14 13 14 NA 12 13 13 14 14 14 ...
## $ PI_expected_diff_imp : num 7.23 1.13 19.43 12.53 22.33 ...
## $ num : int 15 15 15 15 15 15 15 15 15 15 ...
## $ popestimate2019 : int 4903185 731545 7278717 3017804 39512223 3565287 973764 705749 21477737 10617423 ...
## $ mean_preCOVID_diff : num 0.809 -1.541 0.526 2.593 3.143 ...
## $ yes_closures_bars : Factor w/ 2 levels "FALSE","TRUE": 2 2 2 2 2 2 2 2 2 2 ...
## $ yes_closures_daycare : Factor w/ 2 levels "FALSE","TRUE": 2 2 2 2 2 2 2 2 2 1 ...
## $ yes_closures_retail : Factor w/ 2 levels "FALSE","TRUE": 2 2 2 1 2 2 2 2 2 2 ...
## $ yes_face_coverings : Factor w/ 2 levels "FALSE","TRUE": 2 1 1 2 2 2 2 2 1 1 ...
## $ yes_travel_restrictions : Factor w/ 2 levels "FALSE","TRUE": 1 2 2 1 1 2 2 1 2 1 ...
## $ all_gatherings_banned : Factor w/ 2 levels "FALSE","TRUE": 1 1 1 1 2 2 2 1 1 1 ...
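The training set pairs the weekly P&I mortality measures with the state policy factors and their collapsed binary versions (`yes_*`, `all_gatherings_banned`). As an illustration only, one plausible way those indicators could be derived from the factor columns is sketched below; this is an assumed reconstruction, not necessarily the exact recoding used upstream.

# Illustrative sketch (assumption): one way to collapse the policy factors
# into the TRUE/FALSE indicators seen in train_PI above
policy_flags <- train_PI %>%
  transmute(
    yes_closures_bars       = factor( closures_bars == "Yes" ),
    yes_closures_daycare    = factor( closures_daycare == "Yes" ),
    yes_closures_retail     = factor( closures_retail == "Yes" ),
    yes_face_coverings      = factor( face_coverings_required_in_public == "Yes" ),
    yes_travel_restrictions = factor( out_of_state_travel_restrictions != "No" ),
    all_gatherings_banned   = factor( gatherings_banned == "All" )
  )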
# candidate predictor set for the state-level multiple regression
mlm2 <- PI_expected_diff_imp ~ mean_preCOVID_diff + percent.pic + popestimate2019 +
  soe_week + sah_week + closures_daycare + closures_retail +
  face_coverings_required_in_public + out_of_state_travel_restrictions
# fit the full model on the training set
model3 <- lm( formula = mlm2, data = train_PI )
summary( model3 )
##
## Call:
## lm(formula = mlm2, data = train_PI)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.457 -1.058 -0.008 1.034 10.095
##
## Coefficients:
## Estimate
## (Intercept) -1.625e+01
## mean_preCOVID_diff -7.429e-03
## percent.pic 4.740e-01
## popestimate2019 6.005e-08
## soe_week -3.665e-02
## sah_week 7.420e-01
## closures_daycareRestricted -1.195e+00
## closures_daycareYes -1.343e-01
## closures_retailYes 1.695e+00
## face_coverings_required_in_publicVaries by county 5.133e-01
## face_coverings_required_in_publicYes 2.417e-01
## out_of_state_travel_restrictionsLimited quarantine / Screened 1.563e+00
## out_of_state_travel_restrictionsMandatory quarantine 5.999e-01
## out_of_state_travel_restrictionsNo 5.376e-01
## out_of_state_travel_restrictionsRecommended quarantine -1.782e+00
## out_of_state_travel_restrictionsRegional -1.420e+00
## Std. Error
## (Intercept) 3.877e+00
## mean_preCOVID_diff 7.919e-02
## percent.pic 1.023e-02
## popestimate2019 1.792e-08
## soe_week 2.003e-01
## sah_week 2.003e-01
## closures_daycareRestricted 6.157e-01
## closures_daycareYes 4.203e-01
## closures_retailYes 3.370e-01
## face_coverings_required_in_publicVaries by county 6.814e-01
## face_coverings_required_in_publicYes 2.791e-01
## out_of_state_travel_restrictionsLimited quarantine / Screened 7.463e-01
## out_of_state_travel_restrictionsMandatory quarantine 4.166e-01
## out_of_state_travel_restrictionsNo 4.232e-01
## out_of_state_travel_restrictionsRecommended quarantine 7.290e-01
## out_of_state_travel_restrictionsRegional 6.463e-01
## t value Pr(>|t|)
## (Intercept) -4.192 3.23e-05
## mean_preCOVID_diff -0.094 0.925297
## percent.pic 46.333 < 2e-16
## popestimate2019 3.351 0.000863
## soe_week -0.183 0.854884
## sah_week 3.704 0.000234
## closures_daycareRestricted -1.940 0.052850
## closures_daycareYes -0.319 0.749522
## closures_retailYes 5.029 6.76e-07
## face_coverings_required_in_publicVaries by county 0.753 0.451614
## face_coverings_required_in_publicYes 0.866 0.386862
## out_of_state_travel_restrictionsLimited quarantine / Screened 2.094 0.036703
## out_of_state_travel_restrictionsMandatory quarantine 1.440 0.150390
## out_of_state_travel_restrictionsNo 1.270 0.204485
## out_of_state_travel_restrictionsRecommended quarantine -2.444 0.014847
## out_of_state_travel_restrictionsRegional -2.198 0.028403
##
## (Intercept) ***
## mean_preCOVID_diff
## percent.pic ***
## popestimate2019 ***
## soe_week
## sah_week ***
## closures_daycareRestricted .
## closures_daycareYes
## closures_retailYes ***
## face_coverings_required_in_publicVaries by county
## face_coverings_required_in_publicYes
## out_of_state_travel_restrictionsLimited quarantine / Screened *
## out_of_state_travel_restrictionsMandatory quarantine
## out_of_state_travel_restrictionsNo
## out_of_state_travel_restrictionsRecommended quarantine *
## out_of_state_travel_restrictionsRegional *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.185 on 532 degrees of freedom
## (178 observations deleted due to missingness)
## Multiple R-squared: 0.8274, Adjusted R-squared: 0.8225
## F-statistic: 170 on 15 and 532 DF, p-value: < 2.2e-16
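`step()` searches over nested models by AIC, repeatedly removing (and, by default, reconsidering) terms until no change lowers the criterion. The AIC it prints comes from `extractAIC()`, which for a Gaussian `lm` is n * log(RSS/n) + 2k up to an additive constant, so it need not equal `AIC()` but ranks models the same way. A quick check (not part of the original output) against the starting value in the trace below:

# should agree with the "Start:  AIC=872.28" line printed by step()
extractAIC( model3 )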
# AIC-guided stepwise selection, starting from the full model
step_m3 <- step( model3 )
## Start: AIC=872.28
## PI_expected_diff_imp ~ mean_preCOVID_diff + percent.pic + popestimate2019 +
## soe_week + sah_week + closures_daycare + closures_retail +
## face_coverings_required_in_public + out_of_state_travel_restrictions
##
## Df Sum of Sq RSS AIC
## - face_coverings_required_in_public 2 4.5 2543.8 869.25
## - mean_preCOVID_diff 1 0.0 2539.3 870.29
## - soe_week 1 0.2 2539.4 870.32
## <none> 2539.3 872.28
## - closures_daycare 2 28.2 2567.5 874.33
## - popestimate2019 1 53.6 2592.9 881.73
## - sah_week 1 65.5 2604.8 884.24
## - out_of_state_travel_restrictions 5 129.7 2669.0 889.58
## - closures_retail 1 120.7 2660.0 895.73
## - percent.pic 1 10246.8 12786.1 1756.11
##
## Step: AIC=869.25
## PI_expected_diff_imp ~ mean_preCOVID_diff + percent.pic + popestimate2019 +
## soe_week + sah_week + closures_daycare + closures_retail +
## out_of_state_travel_restrictions
##
## Df Sum of Sq RSS AIC
## - soe_week 1 0.2 2544.0 867.30
## - mean_preCOVID_diff 1 0.3 2544.1 867.32
## <none> 2543.8 869.25
## - closures_daycare 2 25.4 2569.2 870.70
## - popestimate2019 1 53.5 2597.2 878.65
## - sah_week 1 77.8 2621.6 883.76
## - out_of_state_travel_restrictions 5 134.1 2677.9 887.40
## - closures_retail 1 123.0 2666.7 893.12
## - percent.pic 1 10362.2 12905.9 1757.22
##
## Step: AIC=867.3
## PI_expected_diff_imp ~ mean_preCOVID_diff + percent.pic + popestimate2019 +
## sah_week + closures_daycare + closures_retail + out_of_state_travel_restrictions
##
## Df Sum of Sq RSS AIC
## - mean_preCOVID_diff 1 0.5 2544.5 865.40
## <none> 2544.0 867.30
## - closures_daycare 2 26.3 2570.3 868.94
## - popestimate2019 1 59.3 2603.3 877.94
## - sah_week 1 78.0 2622.0 881.84
## - out_of_state_travel_restrictions 5 146.4 2690.4 887.97
## - closures_retail 1 122.9 2666.9 891.15
## - percent.pic 1 10483.4 13027.4 1760.36
##
## Step: AIC=865.4
## PI_expected_diff_imp ~ percent.pic + popestimate2019 + sah_week +
## closures_daycare + closures_retail + out_of_state_travel_restrictions
##
## Df Sum of Sq RSS AIC
## <none> 2544.5 865.40
## - closures_daycare 2 28.0 2572.4 867.39
## - popestimate2019 1 63.7 2608.2 876.95
## - sah_week 1 78.2 2622.7 880.00
## - out_of_state_travel_restrictions 5 146.0 2690.5 885.98
## - closures_retail 1 126.4 2670.9 889.98
## - percent.pic 1 10629.1 13173.5 1764.47
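The search settles on six predictors, having dropped the face-covering requirement, the state-of-emergency week (`soe_week`), and the pre-COVID baseline difference (`mean_preCOVID_diff`). Because both the full and the reduced fits use the same 548 complete cases, the dropped terms can also be tested jointly with a partial F-test; this is a suggested check, not part of the original analysis.

# joint test of the terms removed by step(); both models share the same rows
anova( step_m3, model3 )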
# summary of the reduced model retained by the stepwise search
summary( step_m3 )
##
## Call:
## lm(formula = PI_expected_diff_imp ~ percent.pic + popestimate2019 +
## sah_week + closures_daycare + closures_retail + out_of_state_travel_restrictions,
## data = train_PI)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.5928 -1.0270 -0.0051 1.1067 10.1345
##
## Coefficients:
## Estimate
## (Intercept) -1.714e+01
## percent.pic 4.749e-01
## popestimate2019 6.177e-08
## sah_week 7.702e-01
## closures_daycareRestricted -1.037e+00
## closures_daycareYes -1.847e-02
## closures_retailYes 1.705e+00
## out_of_state_travel_restrictionsLimited quarantine / Screened 1.548e+00
## out_of_state_travel_restrictionsMandatory quarantine 7.088e-01
## out_of_state_travel_restrictionsNo 7.463e-01
## out_of_state_travel_restrictionsRecommended quarantine -1.581e+00
## out_of_state_travel_restrictionsRegional -1.407e+00
## Std. Error
## (Intercept) 2.908e+00
## percent.pic 1.004e-02
## popestimate2019 1.687e-08
## sah_week 1.897e-01
## closures_daycareRestricted 5.665e-01
## closures_daycareYes 3.963e-01
## closures_retailYes 3.304e-01
## out_of_state_travel_restrictionsLimited quarantine / Screened 6.686e-01
## out_of_state_travel_restrictionsMandatory quarantine 4.008e-01
## out_of_state_travel_restrictionsNo 3.669e-01
## out_of_state_travel_restrictionsRecommended quarantine 6.967e-01
## out_of_state_travel_restrictionsRegional 6.408e-01
## t value Pr(>|t|)
## (Intercept) -5.892 6.74e-09
## percent.pic 47.318 < 2e-16
## popestimate2019 3.663 0.000274
## sah_week 4.060 5.65e-05
## closures_daycareRestricted -1.831 0.067590
## closures_daycareYes -0.047 0.962836
## closures_retailYes 5.161 3.47e-07
## out_of_state_travel_restrictionsLimited quarantine / Screened 2.315 0.020971
## out_of_state_travel_restrictionsMandatory quarantine 1.769 0.077522
## out_of_state_travel_restrictionsNo 2.034 0.042452
## out_of_state_travel_restrictionsRecommended quarantine -2.269 0.023646
## out_of_state_travel_restrictionsRegional -2.196 0.028502
##
## (Intercept) ***
## percent.pic ***
## popestimate2019 ***
## sah_week ***
## closures_daycareRestricted .
## closures_daycareYes
## closures_retailYes ***
## out_of_state_travel_restrictionsLimited quarantine / Screened *
## out_of_state_travel_restrictionsMandatory quarantine .
## out_of_state_travel_restrictionsNo *
## out_of_state_travel_restrictionsRecommended quarantine *
## out_of_state_travel_restrictionsRegional *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.179 on 536 degrees of freedom
## (178 observations deleted due to missingness)
## Multiple R-squared: 0.827, Adjusted R-squared: 0.8235
## F-statistic: 233 on 11 and 536 DF, p-value: < 2.2e-16
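The reduced model keeps essentially all of the full model's fit (adjusted R-squared 0.8235 vs. 0.8225) with four fewer coefficients. A natural next step is to score it on held-out data; the sketch below assumes a hold-out split named `test_PI` with the same columns as `train_PI` (the name is hypothetical).

# hypothetical hold-out evaluation; `test_PI` is an assumed name for the test split
pred_PI <- predict( step_m3, newdata = test_PI )
rmse_PI <- sqrt( mean( ( test_PI$PI_expected_diff_imp - pred_PI )^2, na.rm = TRUE ) )
rmse_PI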