Methods

Dependancies

library( dplyr )
library( ggplot2 )
## Warning: package 'ggplot2' was built under R version 4.0.5
library( gridExtra )
library( rvest )
library( tidyverse )
library( multcompView )
## Warning: package 'multcompView' was built under R version 4.0.5
library( cdcfluview )
## Warning: package 'cdcfluview' was built under R version 4.0.5
library( purrr )
library( inspectdf )
## Warning: package 'inspectdf' was built under R version 4.0.5
library( naniar )
## Warning: package 'naniar' was built under R version 4.0.5
library( imputeTS )
library( usmap )
## Warning: package 'usmap' was built under R version 4.0.5
library( scales )
library( ggpubr )
## Warning: package 'ggpubr' was built under R version 4.0.5

Data Acquisition

# PIC National
url <- "https://raw.githubusercontent.com/SmilodonCub/DATA621/master/NCHSData12.csv"
PIC_national_df <- read.csv( url ) %>%
    rowid_to_column()
colnames( PIC_national_df ) <- c( 'rowid', 'year', 'week', 'perc_death_PI', 
                                  'perc_death_PIC', 'expected', 'threshold', 
                                  'all_deaths', 'P_deaths', 'I_deaths', 
                                  'C_deaths', 'PIC_deaths' )

# PIC State-level
url <- "https://raw.githubusercontent.com/SmilodonCub/DATA621/master/State_Custom_Data.csv"
PIC_by_state_df <- read.csv( url )
glimpse( PIC_by_state_df )
## Rows: 20,176
## Columns: 13
## $ AREA                 <chr> "State", "State", "State", "State", "State", "...
## $ SUB.AREA             <chr> "Alabama", "Alabama", "Alabama", "Alabama", "A...
## $ AGE.GROUP            <chr> "All", "All", "All", "All", "All", "All", "All...
## $ SEASON               <chr> "2020-21", "2020-21", "2020-21", "2020-21", "2...
## $ WEEK                 <int> 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51...
## $ PERCENT.P.I          <dbl> 8.6, 8.8, 7.8, 9.5, 9.4, 8.3, 9.5, 8.7, 8.8, 1...
## $ PERCENT.PIC          <dbl> 15.4, 15.7, 15.2, 17.2, 16.4, 15.6, 18.4, 18.0...
## $ NUM.INFLUENZA.DEATHS <chr> "0", "1", "0", "1", "0", "0", "0", "2", "2", "...
## $ NUM.PNEUMONIA.DEATHS <chr> "97", "107", "94", "109", "112", "108", "124",...
## $ NUM.COVID.19.DEATHS  <chr> "116", "141", "132", "131", "135", "151", "186...
## $ TOTAL.PIC            <chr> "173", "193", "183", "199", "195", "202", "240...
## $ TOTAL.DEATHS         <chr> "1,122", "1,233", "1,205", "1,160", "1,190", "...
## $ PERCENT.COMPLETE     <chr> "> 100%", "> 100%", "> 100%", "> 100%", "> 100...
## Statewide Restriction
url <- "http://en.wikipedia.org/wiki/U.S._state_and_local_government_responses_to_the_COVID-19_pandemic"
state_level_regulation_df <- url %>%
  read_html() %>%
  html_node(xpath='//*[@id="mw-content-text"]/div[1]/table[2]') %>%
  html_table(fill=T)
state_level_regulation_df <- state_level_regulation_df[-1,]
state_level_regulation_df <- state_level_regulation_df[,-1]
state_level_regulation_df <- state_level_regulation_df[,-11]

colnames( state_level_regulation_df ) <- c( "State_territory", "State_of_emergency_declared", "Stay_at_home_ordered",
                                            "Face_coverings_required_in_public", "Gatherings_banned", 
                                            "Out_of_state_travel_restrictions", "Closures_School", "Closures_Daycare",
                                            "Closures_Bars", "Closures_Retail")

Data Exploration

PIC National

ggplot( data = PIC_national_df, aes( x = rowid, y = perc_death_PIC, color = '%PIC' ) ) + 
    geom_line( ) +
    geom_line( aes( x = rowid, y = threshold, color = 'threshold' ) ) +
    geom_line( aes( x = rowid, y = expected, color = 'expected' ), linetype = "dashed" ) +
    geom_line( aes( x = rowid, y = perc_death_PI, color = '%PI') ) +
    theme_classic() +
    ggtitle( 'Pneumonia, Influenza, and COVID-19 Mortality', subtitle = " from the NCHS Mortality Surveillance System" ) +
    ylab( '% of All Deaths' ) +
    xlab( 'MMWR Week' ) +
    scale_x_continuous(breaks=c(seq( 25,392,49)),
                       labels=c("2013-14", "2014-15", "2015-16", "2016-17", "2017-18", "2018-19", "2019-20", "2020-21")) +
    scale_color_manual(name="PIC data", values=c("%PIC"="red", "%PI"="blue", 'threshold'='black', 'expected'='black'))

Mean Annual Influenza Death Predictions (Expected & Threshold)

expectedDeaths <- PIC_national_df %>%
    dplyr::select( c( 'week', 'expected', 'threshold' ) ) %>%
    dplyr::group_by(week) %>%
    dplyr::summarise( 'expected' = mean( expected ), 'threshold' = mean( threshold ) ) %>%
    dplyr::rename( WEEK = week )
## `summarise()` ungrouping output (override with `.groups` argument)
head( expectedDeaths )
## # A tibble: 6 x 3
##    WEEK expected threshold
##   <int>    <dbl>     <dbl>
## 1     1     7.17      7.52
## 2     2     7.25      7.59
## 3     3     7.31      7.65
## 4     4     7.36      7.70
## 5     5     7.40      7.74
## 6     6     7.42      7.76

PIC State-Level

# remove instances of this string to make later conversion easier
PIC_by_state_df[PIC_by_state_df == 'Insufficient Data'] <- NA
PIC_by_state_df <- PIC_by_state_df %>%
  dplyr::select( -c( 'AREA', 'AGE.GROUP' ) ) %>% #remove AREA & AGE.GROUP -> Uninformative, but could add later if people think it'd be an interesting feature
  mutate( NUM.INFLUENZA.DEATHS = as.numeric(NUM.INFLUENZA.DEATHS),
          NUM.PNEUMONIA.DEATHS = as.numeric(NUM.PNEUMONIA.DEATHS),
          NUM.COVID.19.DEATHS = as.numeric(NUM.COVID.19.DEATHS),
          TOTAL.PIC = as.numeric(TOTAL.PIC),
          TOTAL.DEATHS = as.numeric(TOTAL.DEATHS)) #remove commas and recast numeric
## Warning: Problem with `mutate()` input `NUM.PNEUMONIA.DEATHS`.
## i NAs introduced by coercion
## i Input `NUM.PNEUMONIA.DEATHS` is `as.numeric(NUM.PNEUMONIA.DEATHS)`.
## Warning in mask$eval_all_mutate(dots[[i]]): NAs introduced by coercion
## Warning: Problem with `mutate()` input `NUM.COVID.19.DEATHS`.
## i NAs introduced by coercion
## i Input `NUM.COVID.19.DEATHS` is `as.numeric(NUM.COVID.19.DEATHS)`.
## Warning in mask$eval_all_mutate(dots[[i]]): NAs introduced by coercion
## Warning: Problem with `mutate()` input `TOTAL.PIC`.
## i NAs introduced by coercion
## i Input `TOTAL.PIC` is `as.numeric(TOTAL.PIC)`.
## Warning in mask$eval_all_mutate(dots[[i]]): NAs introduced by coercion
## Warning: Problem with `mutate()` input `TOTAL.DEATHS`.
## i NAs introduced by coercion
## i Input `TOTAL.DEATHS` is `as.numeric(TOTAL.DEATHS)`.
## Warning in mask$eval_all_mutate(dots[[i]]): NAs introduced by coercion
#Join the mean expected deaths by week to the df
PIC_by_state_df <- left_join( PIC_by_state_df, expectedDeaths, by = "WEEK", copy = FALSE ) %>%
    mutate( PI_expected_diff = PERCENT.P.I - expected,
            PI_threshold_diff = PERCENT.P.I - threshold )
# visualize deviation from expected PI deaths for each flu season:
eachSeason_diff <- PIC_by_state_df %>%
    dplyr::rename( area = SUB.AREA, season = SEASON ) %>%
    dplyr::group_by( season, area ) %>%
    dplyr::summarise( meanPI_expected_diff = mean( PI_expected_diff, na.rm = TRUE ),
               meanPI_threshold_diff = mean( PI_threshold_diff, na.rm = TRUE )) %>%
    dplyr::mutate( season = factor( season ) )
## `summarise()` regrouping output by 'season' (override with `.groups` argument)
p1 <- ggplot(data = eachSeason_diff, aes( x = season, y = meanPI_expected_diff ) ) +
    geom_boxplot() +
    geom_jitter( color="black", size = 0.4, alpha = 0.6 ) +
    theme_classic() +
    ggtitle( 'Deviance from Expected PI Mortality', subtitle = 'distribution of deviation for each US state per flu season') +
    ylab( '% Deviation' ) +
    xlab( 'Flu Season' )
#ANOVA + TUKEY Test for PIC state-level
eachSeason_diff <- eachSeason_diff %>%
    mutate( season2 = gsub("-","", season ) )
#ANOVA + Tukey test
linmod <- lm( data = eachSeason_diff, meanPI_expected_diff ~ season2 )
ANOVA <- aov( linmod )
TUKEY <- TukeyHSD( x = ANOVA, 'season2', conf.level = 0.99)
TUKEY_df <- data.frame( TUKEY$season2 )
TUKEY_df <- tibble::rownames_to_column( TUKEY_df, 'Season' )
p3 <- ggplot() +
    geom_linerange( data = TUKEY_df, mapping = aes( y = Season, xmin = lwr, xmax = upr ), width = 0.2, size = 1, color = 'red' ) +
    geom_point( data = TUKEY_df, aes( x = diff, y = Season ), size = 3, shape = 21, color = 'red' ) +
    geom_vline( xintercept = 0, linetype = 'dashed', color = 'gray' ) +
    xlab( 'Difference in mean level' ) +
    ggtitle( "99% Family-wise Confidence Level" ) +
    theme_classic()
## Warning: Ignoring unknown parameters: width
grid.arrange( p1, p3, ncol =2 )

EDA state level regulation

glimpse( state_level_regulation_df )
## Rows: 56
## Columns: 10
## $ State_territory                   <chr> "Alabama", "Alaska", "American Sa...
## $ State_of_emergency_declared       <chr> "March 13", "March 11", "January ...
## $ Stay_at_home_ordered              <chr> "April 4", "March 28", "No", "Mar...
## $ Face_coverings_required_in_public <chr> "Yes", "No", "No", "No", "Yes", "...
## $ Gatherings_banned                 <chr> "10 or more", "10 or more", "10 o...
## $ Out_of_state_travel_restrictions  <chr> "No", "Mandatory quarantine", "Tr...
## $ Closures_School                   <chr> "Yes (remainder of term)", "Yes (...
## $ Closures_Daycare                  <chr> "Yes", "Yes", "Yes", "Yes", "Yes"...
## $ Closures_Bars                     <chr> "Yes", "Yes", "No", "Yes", "Yes",...
## $ Closures_Retail                   <chr> "Yes", "Yes", "No", "Yes", "Regio...
state_level_regulation_df <- state_level_regulation_df %>%
  modify_if( is.character, as.factor )
p5 <- inspectdf::inspect_imb( state_level_regulation_df ) %>% show_plot()
p5 + theme_classic()

Data Preparation

Missingness in state PIC data

miss_var_summary( PIC_by_state_df )
## # A tibble: 15 x 3
##    variable             n_miss pct_miss
##    <chr>                 <int>    <dbl>
##  1 TOTAL.DEATHS           7844   38.9  
##  2 TOTAL.PIC               146    0.724
##  3 NUM.COVID.19.DEATHS     113    0.560
##  4 NUM.PNEUMONIA.DEATHS     68    0.337
##  5 PERCENT.P.I              32    0.159
##  6 PERCENT.PIC              32    0.159
##  7 NUM.INFLUENZA.DEATHS     32    0.159
##  8 PI_expected_diff         32    0.159
##  9 PI_threshold_diff        32    0.159
## 10 SUB.AREA                  0    0    
## 11 SEASON                    0    0    
## 12 WEEK                      0    0    
## 13 PERCENT.COMPLETE          0    0    
## 14 expected                  0    0    
## 15 threshold                 0    0
miss_var_table( PIC_by_state_df )
## # A tibble: 6 x 3
##   n_miss_in_var n_vars pct_vars
##           <int>  <int>    <dbl>
## 1             0      6    40   
## 2            32      5    33.3 
## 3            68      1     6.67
## 4           113      1     6.67
## 5           146      1     6.67
## 6          7844      1     6.67
# Visualize the missingness by season
gg_miss_fct( x = PIC_by_state_df, fct = SEASON)

The figure above shows a heatmap of missingness. We see that TOTAL.DEATHS has a lot of missing values. That’s okay, because we will be using the PERCENT.P.I, PERCENT.PIC & features derived from the (_diff). However, what is troubling is that there are quite a few missing values for the 2020-21 flu season and to a lesser extend 2019-20 for what we would like to use as a predictor variable PI_expected_diff.

missingness for PI_expect_diff 2020-2021 by State:

states_missingness <- PIC_by_state_df %>%
  dplyr::filter( SEASON == '2020-21' ) %>%
  dplyr::select( SUB.AREA, PI_expected_diff ) %>%
  dplyr::group_by( SUB.AREA ) %>%
  miss_var_summary() %>%
  filter( n_miss != 0 ) %>%
  arrange(desc( n_miss ))

states_missingness
## # A tibble: 13 x 4
## # Groups:   SUB.AREA [13]
##    SUB.AREA             variable         n_miss pct_miss
##    <chr>                <chr>             <int>    <dbl>
##  1 North Carolina       PI_expected_diff     13    56.5 
##  2 West Virginia        PI_expected_diff      4    17.4 
##  3 Connecticut          PI_expected_diff      2     8.70
##  4 Indiana              PI_expected_diff      2     8.70
##  5 Rhode Island         PI_expected_diff      2     8.70
##  6 Utah                 PI_expected_diff      2     8.70
##  7 Alaska               PI_expected_diff      1     4.35
##  8 Delaware             PI_expected_diff      1     4.35
##  9 District of Columbia PI_expected_diff      1     4.35
## 10 Montana              PI_expected_diff      1     4.35
## 11 North Dakota         PI_expected_diff      1     4.35
## 12 Ohio                 PI_expected_diff      1     4.35
## 13 South Dakota         PI_expected_diff      1     4.35

We should exclude North Carolina and consider excluding West Virginia, but otherwise perhaps find a clever way to impute the other missing values. First, I will visualize the impact of the missingness on the target variable, PI_expected_diff

Let’s see if there is a pattern for missingness by WEEK:

week_missingness <- PIC_by_state_df %>%
  dplyr::filter( SEASON == '2020-21', !SUB.AREA %in% c( 'North Carolina', 'West Virginia' ) ) %>%
  dplyr::select( WEEK, PI_expected_diff ) %>%
  dplyr::group_by( WEEK ) %>%
  miss_var_summary() %>%
  filter( n_miss != 0 ) %>%
  arrange(desc( n_miss ))

week_missingness
## # A tibble: 2 x 4
## # Groups:   WEEK [2]
##    WEEK variable         n_miss pct_miss
##   <int> <chr>             <int>    <dbl>
## 1     9 PI_expected_diff     11       22
## 2     8 PI_expected_diff      4        8

After filtering West Virginia and North Carolina, we can see that 22% of the remaining states are missing data for week 9. Week 9 is the last week of the 2020-2021 flu season to be included in the data set, so the missingness is probably due to lags in reporting. We will drop week 9 from further analysis and impute the remainder of values using kalman smoothing methods from the imputeTS library.

PI_missingness_2 <- PIC_by_state_df %>%
  dplyr::filter( SEASON == '2020-21') %>%
  dplyr::select( SUB.AREA, WEEK, PI_expected_diff ) %>%
  dplyr::group_by( SUB.AREA ) %>%
  dplyr::mutate( PI_expected_diff_imp = na_kalman( PI_expected_diff ),
          num = row_number() )
PI_missingness_2  
## # A tibble: 1,196 x 5
## # Groups:   SUB.AREA [52]
##    SUB.AREA  WEEK PI_expected_diff PI_expected_diff_imp   num
##    <chr>    <int>            <dbl>                <dbl> <int>
##  1 Alabama     40             2.79                 2.79     1
##  2 Alabama     41             2.91                 2.91     2
##  3 Alabama     42             1.82                 1.82     3
##  4 Alabama     43             3.41                 3.41     4
##  5 Alabama     44             3.21                 3.21     5
##  6 Alabama     45             1.99                 1.99     6
##  7 Alabama     46             3.08                 3.08     7
##  8 Alabama     47             2.16                 2.16     8
##  9 Alabama     48             2.15                 2.15     9
## 10 Alabama     49             3.44                 3.44    10
## # ... with 1,186 more rows

Visualize the imputation the make sure it behaves:

ggplot( data = PI_missingness_2, aes( x = num, y = PI_expected_diff_imp ) ) +
  geom_line( color = 'red' ) +
  geom_line( data = PI_missingness_2, aes( x = num, y = PI_expected_diff ), color = 'blue' ) +
  facet_wrap( ~ SUB.AREA )

We don’t observe any obvious disagreement with the imputation results (red). This plot is also important because it reinforces our earlier decision to remove North Carolina and West Virginia due to missingness.

Moving forward we will perform the following to prepare the data:

  • remove North Carolina & West Virginia
  • restrict the WEEKs to those included in the data for the 2020-2021 season - week 9.
  • combine the data for New York and New York City
PIC_by_state_imp <- PIC_by_state_df %>%
  dplyr::filter( !SUB.AREA %in% c( 'North Carolina', 'West Virginia' ) ) %>% #remove the states that are missing a lot of 2020-21 data
                 #filter only the WEEKs that have adequate data for 2020-21
                 #WEEK %in% c( 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 1, 2, 3, 4, 5, 6, 7, 8 ) ) %>%
  dplyr::select( -c(TOTAL.DEATHS, PERCENT.COMPLETE) ) #remove the total deaths columns due to missingness
#combine the New York Data
PIC_NY <- PIC_by_state_imp %>%
  dplyr::filter( SUB.AREA %in% c( 'New York', 'New York City' ) ) %>%
  mutate( est_Total_Deaths = round(((NUM.INFLUENZA.DEATHS + NUM.PNEUMONIA.DEATHS + NUM.COVID.19.DEATHS) * 100)/PERCENT.PIC, 0 ) ) %>%
  dplyr::group_by( SEASON, WEEK ) %>%
  dplyr::summarise( NUM.INFLUENZA.DEATHS = sum( NUM.INFLUENZA.DEATHS ), 
             NUM.PNEUMONIA.DEATHS = sum( NUM.PNEUMONIA.DEATHS ), 
             NUM.COVID.19.DEATHS = sum( NUM.COVID.19.DEATHS ),
             TOTAL.PIC = sum( TOTAL.PIC ),
             est_Total_Deaths = sum( est_Total_Deaths ),
             expected = expected,
             threshold = threshold) %>%
  mutate( PERCENT.P.I = 100*(NUM.INFLUENZA.DEATHS + NUM.PNEUMONIA.DEATHS)/est_Total_Deaths,
          PERCENT.PIC = 100*(TOTAL.PIC)/est_Total_Deaths,
          PI_expected_diff = PERCENT.P.I - expected,
          PI_threshold_diff = PERCENT.P.I - threshold,
          SUB.AREA = 'New York' ) %>%
  distinct() %>%
  dplyr::select( -est_Total_Deaths )
## `summarise()` regrouping output by 'SEASON', 'WEEK' (override with `.groups` argument)

Concatenate the combined New York + New York City data with the data from the rest of the state (minus W Virginia & N Carolina)

#sort( colnames( PIC_by_state_imp ) ) == sort( colnames( PIC_NY ) )

PIC_notNY <- PIC_by_state_imp %>%
  dplyr::filter( !SUB.AREA %in% c( 'New York', 'New York City' ) )

PIC_by_state_clean <- rbind( PIC_NY, PIC_notNY )

Visualization of States map:

diff_by_state_plot <- PIC_by_state_clean %>%
  filter( SEASON == '2020-21' ) %>%
  group_by( SUB.AREA) %>%
  summarise( max_PI_expected_diff = max( PI_expected_diff, na.rm = T ) ) %>%
  mutate( fips = fips( SUB.AREA ) )
## `summarise()` ungrouping output (override with `.groups` argument)
pdiff <- plot_usmap( data = diff_by_state_plot, values = 'max_PI_expected_diff', labels = F ) +
  scale_fill_continuous( low = 'white', high = 'red',
                         name = 'Diff in PI', label = scales::comma,
                         limits = c( 0,25 ) ) +
  theme( legend.position = 'right' ) +
  theme( panel.background = element_rect( colour = "red"))

PIC_by_state_plot <- PIC_by_state_clean %>%
  filter( SEASON == '2020-21' ) %>%
  group_by( SUB.AREA) %>%
  summarise( max_percent_PIC = max( PERCENT.PIC, na.rm = T ) ) %>%
  mutate( fips = fips( SUB.AREA ) )
## `summarise()` ungrouping output (override with `.groups` argument)
pperPIC <- plot_usmap( data = PIC_by_state_plot, values = 'max_percent_PIC', labels = F ) +
  scale_fill_continuous( low = 'white', high = 'red',
                         name = 'Percent PIC', label = scales::comma,
                         limits = c( 0,60 ) ) +
  theme( legend.position = 'right' ) +
  theme( panel.background = element_rect( colour = "red"))

grid.arrange( pdiff, pperPIC, ncol = 2 )

Visualize the relationship between excess PI mortality and PIC mortality:

relation_plot <- PIC_by_state_plot
relation_plot$max_PI_expected_diff <- diff_by_state_plot$max_PI_expected_diff
relation_plot <- relation_plot %>%
  dplyr::arrange( max_percent_PIC ) 

#this dataset has been uploaded to B Cooper's git for plotting in the manuscript
# https://raw.githubusercontent.com/SmilodonCub/DATA621/master/PIC_PIexcess_relation.csv
#write.csv( relation_plot, 'PIC_PIexcess_relation.csv' )
prel <- ggplot( data = relation_plot, aes( x = max_percent_PIC, y = max_PI_expected_diff, label = SUB.AREA ) ) +
  geom_point() +
  geom_smooth( method = 'lm', se = F, color = 'black' ) +
  stat_cor( r.digits = 3, p.accuracy = 0.001 ) +
  geom_label(check_overlap = TRUE, nudge_y = 0.75) +
  theme_classic() +
  ggtitle('Excess PI mortality vs PIC Mortality for the 2020-2021 flu season') +
  ylab( 'Max. Weekly Excess PI Mortality' ) +
  xlab( 'Max. Weekly PIC Mortality' )
## Warning: Ignoring unknown parameters: check_overlap
prel
## `geom_smooth()` using formula 'y ~ x'

We see a linear relationship between the excess PI mortality and PIC mortality. The goal of our modeling and further analysis will be to see if variables that describe state-level COVID-19 regulatory measure and/or demographic features (e.g. state population ) can add to the explanatory power of this relationship.

Now we prepare to merge the state-level PIC data with the state-level regulation data.

Checking Unique States in Each Source:

# check to see how joining on state looks
d_pbs2 <- data.frame( SUB.AREA = unique( PIC_by_state_clean$SUB.AREA ) )
d_pbs2$state = d_pbs2$SUB.AREA

d_pbs <- distinct(PIC_by_state_df, SUB.AREA)
d_pbs$state = d_pbs$SUB.AREA

d_reg <- distinct(state_level_regulation_df, State_territory)
d_reg$state <- d_reg$State_territory

key_check <- full_join(d_pbs2, d_reg, by = c('state' = 'state')) %>%
  filter(is.na(SUB.AREA) | is.na(State_territory))
key_check
##      SUB.AREA                            state                  State_territory
## 1  California                       California                             <NA>
## 2       Texas                            Texas                             <NA>
## 3        <NA>                   American Samoa                   American Samoa
## 4        <NA> California (government response) California (government response)
## 5        <NA>                             Guam                             Guam
## 6        <NA>                   North Carolina                   North Carolina
## 7        <NA>               N. Mariana Islands               N. Mariana Islands
## 8        <NA>                      Puerto Rico                      Puerto Rico
## 9        <NA>      Texas (government response)      Texas (government response)
## 10       <NA>              U.S. Virgin Islands              U.S. Virgin Islands
## 11       <NA>                    West Virginia                    West Virginia

This output shows the mismatches in state names between the PIC and COVID regulation state-level data. We will change the regulation California (government response) to California and drop all territories from the regulation data that are not present in the state-level PIC.

state_level_regulation_df$State_territory <- as.character( state_level_regulation_df$State_territory )
state_level_regulation_df$State_territory[ state_level_regulation_df$State_territory == 'California (government response)' ] <- 'California'
state_level_regulation_df$State_territory <- factor( state_level_regulation_df$State_territory )
state_level_regulation_df$State_territory
##  [1] Alabama                     Alaska                     
##  [3] American Samoa              Arizona                    
##  [5] Arkansas                    California                 
##  [7] Colorado                    Connecticut                
##  [9] Delaware                    District of Columbia       
## [11] Florida                     Georgia                    
## [13] Guam                        Hawaii                     
## [15] Idaho                       Illinois                   
## [17] Indiana                     Iowa                       
## [19] Kansas                      Kentucky                   
## [21] Louisiana                   Maine                      
## [23] Maryland                    Massachusetts              
## [25] Michigan                    Minnesota                  
## [27] Mississippi                 Missouri                   
## [29] Montana                     Nebraska                   
## [31] Nevada                      New Hampshire              
## [33] New Jersey                  New Mexico                 
## [35] New York                    North Carolina             
## [37] North Dakota                N. Mariana Islands         
## [39] Ohio                        Oklahoma                   
## [41] Oregon                      Pennsylvania               
## [43] Puerto Rico                 Rhode Island               
## [45] South Carolina              South Dakota               
## [47] Tennessee                   Texas (government response)
## [49] Utah                        U.S. Virgin Islands        
## [51] Vermont                     Virginia                   
## [53] Washington                  West Virginia              
## [55] Wisconsin                   Wyoming                    
## 56 Levels: Alabama Alaska American Samoa Arizona Arkansas ... Wyoming
df <- inner_join(data.frame(PIC_by_state_clean), state_level_regulation_df, by = c('SUB.AREA' = 'State_territory'))
names(df) <- lapply(names(df), tolower)
str(df)
## 'data.frame':    18624 obs. of  22 variables:
##  $ season                           : chr  "2013-14" "2013-14" "2013-14" "2013-14" ...
##  $ week                             : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ num.influenza.deaths             : num  5 7 12 16 19 9 8 10 9 6 ...
##  $ num.pneumonia.deaths             : num  279 301 309 289 306 274 299 269 262 273 ...
##  $ num.covid.19.deaths              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ total.pic                        : num  284 308 321 305 325 283 307 279 271 279 ...
##  $ expected                         : num  7.17 7.25 7.31 7.36 7.4 ...
##  $ threshold                        : num  7.52 7.59 7.65 7.7 7.74 ...
##  $ percent.p.i                      : num  9.42 9.56 10.53 10.04 10.08 ...
##  $ percent.pic                      : num  9.42 9.56 10.53 10.04 10.08 ...
##  $ pi_expected_diff                 : num  2.25 2.31 3.22 2.68 2.69 ...
##  $ pi_threshold_diff                : num  1.9 1.97 2.87 2.33 2.34 ...
##  $ sub.area                         : chr  "New York" "New York" "New York" "New York" ...
##  $ state_of_emergency_declared      : Factor w/ 16 levels "February 29",..: 14 14 14 14 14 14 14 14 14 14 ...
##  $ stay_at_home_ordered             : Factor w/ 24 levels "April 1","April 2",..: 11 11 11 11 11 11 11 11 11 11 ...
##  $ face_coverings_required_in_public: Factor w/ 3 levels "No","Varies by county",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ gatherings_banned                : Factor w/ 9 levels "10 or more","10 or more (recommended)",..: 8 8 8 8 8 8 8 8 8 8 ...
##  $ out_of_state_travel_restrictions : Factor w/ 8 levels "Limited quarantine",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ closures_school                  : Factor w/ 2 levels "Yes (districts' choice)",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ closures_daycare                 : Factor w/ 3 levels "No","Restricted",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ closures_bars                    : Factor w/ 3 levels "No","Restricted",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ closures_retail                  : Factor w/ 4 levels "No","Regional",..: 4 4 4 4 4 4 4 4 4 4 ...

Some visual explorations of the differences in observed and expected PI mortality in relation to state-level regulations:

# state of emergency
df$soe_week <- as.Date(paste0(df$state_of_emergency_declared, ' 2020'), '%B %d %Y') %>%
  strftime(format = '%V') %>%
  as.numeric()

# stay at home
df$sah_week <- as.Date(paste0(df$stay_at_home_ordered, ' 2020'), '%B %d %Y') %>%
  strftime(format = '%V') %>%
  as.numeric()

# sample new york view
ny <- filter(df, sub.area == 'New York' & season %in% c('2019-20', '2020-21'))

ggplot(ny, aes(x = week, y = pi_expected_diff, color = season)) +
  geom_point() +
  geom_vline(xintercept = ny$soe_week, color = 'blue') +
  geom_vline(xintercept = ny$sah_week, color = 'purple')
## Warning: Removed 8 rows containing missing values (geom_point).

This visualization depicts the weekly excess PI mortality rates for the 2020-2021 (blue) and 2019-2020 (red) flu seasons with vertical lines to show when the New York State government restrictions for the declaration of a state of emergency (blue) and the stay at home order (purple) were enacted.

df2 <- filter(df, season %in% c('2019-20', '2020-21'))

# add weeks after 52
df2$week_v2 <- with(df2, ifelse(season == '2020-21' & week >= 40, (week - 40 + 1) + 52, ifelse(season == '2020-21' & week < 40, week + 52 + 14, week)))

# check against ny, with references lines for state of emergency and stay at home orders
pNY <- filter(df2, sub.area == 'New York') %>%
  ggplot(aes(x = week_v2, y = pi_expected_diff, color = season)) +
  geom_point() +
  geom_vline(aes(xintercept = soe_week), color = 'blue') +
  geom_vline(aes(xintercept = sah_week), color = 'purple') +
  ggtitle( 'New York State' )

# facet by state
pStates <- ggplot(df2, aes(x = week_v2, y = pi_expected_diff, color = season)) +
  geom_point() +
  geom_vline(aes(xintercept = soe_week), color = 'blue') +
  geom_vline(aes(xintercept = sah_week), color = 'purple') +
  facet_wrap(~sub.area, scales = 'free_y') +
  ggtitle( 'U.S. States' )

#distinct(df2, sub.area, soe_week, sah_week, face_coverings_required_in_public)
#grid.arrange( pNY, pStates, ncol = 2, widths = c( 6,10 ) )
pNY
## Warning: Removed 8 rows containing missing values (geom_point).

pStates
## Warning: Removed 23 rows containing missing values (geom_point).
## Warning: Removed 825 rows containing missing values (geom_vline).

These visualizations plot the weekly data contiguously. There is a detailed plot for New York as well as an overview plot for all states.

Here we incorporate state-level population data from the U.S. census and look at some other visualizations of our target variable for excess PI mortality PI_Expected_diff in relation to the state-level COVID-19 restrictions. These visualization are informative on the subsequent data processing steps to ready the data set for modeling as comments adjacent to the plots code explain.

url <- 'https://raw.githubusercontent.com/SmilodonCub/DATA621/master/nst-est2019-alldata.csv'
pop <- read.csv(url)

names(pop) <- lapply(names(pop), tolower)

str(pop)
## 'data.frame':    57 obs. of  151 variables:
##  $ sumlev               : int  10 20 20 20 20 40 40 40 40 40 ...
##  $ region               : chr  "0" "1" "2" "3" ...
##  $ division             : chr  "0" "0" "0" "0" ...
##  $ state                : int  0 0 0 0 0 1 2 4 5 6 ...
##  $ name                 : chr  "United States" "Northeast Region" "Midwest Region" "South Region" ...
##  $ census2010pop        : int  308745538 55317240 66927001 114555744 71945553 4779736 710231 6392017 2915918 37253956 ...
##  $ estimatesbase2010    : int  308758105 55318443 66929725 114563030 71946907 4780125 710249 6392288 2916031 37254519 ...
##  $ popestimate2010      : int  309321666 55380134 66974416 114866680 72100436 4785437 713910 6407172 2921964 37319502 ...
##  $ popestimate2011      : int  311556874 55604223 67157800 116006522 72788329 4799069 722128 6472643 2940667 37638369 ...
##  $ popestimate2012      : int  313830990 55775216 67336743 117241208 73477823 4815588 730443 6554978 2952164 37948800 ...
##  $ popestimate2013      : int  315993715 55901806 67560379 118364400 74167130 4830081 737068 6632764 2959400 38260787 ...
##  $ popestimate2014      : int  318301008 56006011 67745167 119624037 74925793 4841799 736283 6730413 2967392 38596972 ...
##  $ popestimate2015      : int  320635163 56034684 67860583 120997341 75742555 4852347 737498 6829676 2978048 38918045 ...
##  $ popestimate2016      : int  322941311 56042330 67987540 122351760 76559681 4863525 741456 6941072 2989918 39167117 ...
##  $ popestimate2017      : int  324985539 56059240 68126781 123542189 77257329 4874486 739700 7044008 3001345 39358497 ...
##  $ popestimate2018      : int  326687501 56046620 68236628 124569433 77834820 4887681 735139 7158024 3009733 39461588 ...
##  $ popestimate2019      : int  328239523 55982803 68329004 125580448 78347268 4903185 731545 7278717 3017804 39512223 ...
##  $ npopchg_2010         : int  563561 61691 44691 303650 153529 5312 3661 14884 5933 64983 ...
##  $ npopchg_2011         : int  2235208 224089 183384 1139842 687893 13632 8218 65471 18703 318867 ...
##  $ npopchg_2012         : int  2274116 170993 178943 1234686 689494 16519 8315 82335 11497 310431 ...
##  $ npopchg_2013         : int  2162725 126590 223636 1123192 689307 14493 6625 77786 7236 311987 ...
##  $ npopchg_2014         : int  2307293 104205 184788 1259637 758663 11718 -785 97649 7992 336185 ...
##  $ npopchg_2015         : int  2334155 28673 115416 1373304 816762 10548 1215 99263 10656 321073 ...
##  $ npopchg_2016         : int  2306148 7646 126957 1354419 817126 11178 3958 111396 11870 249072 ...
##  $ npopchg_2017         : int  2044228 16910 139241 1190429 697648 10961 -1756 102936 11427 191380 ...
##  $ npopchg_2018         : int  1701962 -12620 109847 1027244 577491 13195 -4561 114016 8388 103091 ...
##  $ npopchg_2019         : int  1552022 -63817 92376 1011015 512448 15504 -3594 120693 8071 50635 ...
##  $ births2010           : int  987836 163466 212570 368759 243041 14226 2892 20934 9438 123327 ...
##  $ births2011           : int  3973485 646249 834866 1509634 982736 59690 11716 86097 38453 509771 ...
##  $ births2012           : int  3936976 637860 830701 1504955 963460 59067 11124 85577 38594 497451 ...
##  $ births2013           : int  3940576 635751 830890 1504774 969161 57929 11350 86093 37978 499629 ...
##  $ births2014           : int  3963195 632433 836538 1525313 968911 58903 11443 86129 38225 498914 ...
##  $ births2015           : int  3992376 634504 838012 1545679 974181 59647 11330 86764 38796 500380 ...
##  $ births2016           : int  3962654 628030 831641 1541412 961571 59389 11246 84957 38612 490358 ...
##  $ births2017           : int  3901982 618490 818785 1519344 945363 58961 10803 83021 37737 481943 ...
##  $ births2018           : int  3824521 610713 801587 1494950 917271 58271 10284 81767 37239 465017 ...
##  $ births2019           : int  3791712 602740 792343 1481244 915385 57313 10031 81942 36640 462617 ...
##  $ deaths2010           : int  598691 110878 140862 228435 118516 11075 920 11505 6979 57319 ...
##  $ deaths2011           : int  2512442 470811 586660 963051 491920 48833 3937 48174 29455 238388 ...
##  $ deaths2012           : int  2501531 460915 584671 960791 495154 48366 3940 48754 29772 239535 ...
##  $ deaths2013           : int  2608019 479963 605165 1011227 511664 50851 4039 50612 30994 247704 ...
##  $ deaths2014           : int  2582448 470268 596898 1006109 509173 49712 4024 50236 29967 244055 ...
##  $ deaths2015           : int  2699826 489209 626084 1052226 532307 51876 4350 52846 31318 253798 ...
##  $ deaths2016           : int  2703215 480502 619187 1058081 545445 51710 4447 56032 31134 260121 ...
##  $ deaths2017           : int  2788163 497919 640407 1092315 557522 53195 4463 56333 32340 265241 ...
##  $ deaths2018           : int  2824382 503076 627911 1116929 576466 53665 4664 60127 31743 277578 ...
##  $ deaths2019           : int  2835038 505588 622854 1122130 584466 53879 4819 60523 31322 282520 ...
##  $ naturalinc2010       : int  389145 52588 71708 140324 124525 3151 1972 9429 2459 66008 ...
##  $ naturalinc2011       : int  1461043 175438 248206 546583 490816 10857 7779 37923 8998 271383 ...
##  $ naturalinc2012       : int  1435445 176945 246030 544164 468306 10701 7184 36823 8822 257916 ...
##  $ naturalinc2013       : int  1332557 155788 225725 493547 457497 7078 7311 35481 6984 251925 ...
##  $ naturalinc2014       : int  1380747 162165 239640 519204 459738 9191 7419 35893 8258 254859 ...
##  $ naturalinc2015       : int  1292550 145295 211928 493453 441874 7771 6980 33918 7478 246582 ...
##  $ naturalinc2016       : int  1259439 147528 212454 483331 416126 7679 6799 28925 7478 230237 ...
##  $ naturalinc2017       : int  1113819 120571 178378 427029 387841 5766 6340 26688 5397 216702 ...
##  $ naturalinc2018       : int  1000139 107637 173676 378021 340805 4606 5620 21640 5496 187439 ...
##  $ naturalinc2019       : int  956674 97152 169489 359114 330919 3434 5212 21419 5318 180097 ...
##  $ internationalmig2010 : int  174416 45032 24720 67464 37200 924 430 2550 801 19635 ...
##  $ internationalmig2011 : int  774165 204242 114945 279639 175339 4665 1314 16489 3662 99179 ...
##  $ internationalmig2012 : int  838671 205018 118864 338025 176764 5817 2348 15100 3732 98297 ...
##  $ internationalmig2013 : int  830168 191468 124489 323413 190798 5046 2863 17997 2321 116528 ...
##  $ internationalmig2014 : int  926546 219965 132052 358590 215939 3684 1198 21312 3100 131001 ...
##  $ internationalmig2015 : int  1041605 224592 140410 422001 254602 4580 2273 19511 3303 156870 ...
##  $ internationalmig2016 : int  1046709 234098 143098 428463 241050 5777 2303 20062 3366 141892 ...
##  $ internationalmig2017 : int  930409 213507 124250 395903 196749 3011 2612 13385 2039 112814 ...
##  $ internationalmig2018 : int  701823 179097 94152 299648 128926 3379 537 7937 588 71300 ...
##  $ internationalmig2019 : int  595348 134145 85675 242942 132586 2772 659 7782 268 74028 ...
##  $ domesticmig2010      : int  0 -32807 -50673 91267 -7787 1244 1168 3122 2469 -20356 ...
##  $ domesticmig2011      : int  0 -154970 -179592 312610 21952 -1893 -892 11062 5996 -51326 ...
##  $ domesticmig2012      : int  0 -207154 -184359 344278 47235 -114 -1319 29844 -1293 -41795 ...
##  $ domesticmig2013      : int  0 -217700 -124956 300008 42648 2297 -3564 24050 -2150 -53692 ...
##  $ domesticmig2014      : int  0 -275141 -183669 374593 84217 -959 -9641 39948 -3371 -46321 ...
##  $ domesticmig2015      : int  0 -340104 -234550 453543 121111 -1544 -8166 45561 -106 -79938 ...
##  $ domesticmig2016      : int  0 -373661 -227445 441519 159587 -2157 -5185 62320 1069 -122369 ...
##  $ domesticmig2017      : int  0 -316475 -162183 366143 112515 2298 -10753 62686 3981 -137546 ...
##  $ domesticmig2018      : int  0 -298739 -157067 348608 107198 5279 -10757 84113 2294 -155281 ...
##  $ domesticmig2019      : int  0 -294331 -161549 407913 47967 9387 -9482 91017 2515 -203414 ...
##  $ netmig2010           : int  174416 12225 -25953 158731 29413 2168 1598 5672 3270 -721 ...
##  $ netmig2011           : int  774165 49272 -64647 592249 197291 2772 422 27551 9658 47853 ...
##  $ netmig2012           : int  838671 -2136 -65495 682303 223999 5703 1029 44944 2439 56502 ...
##  $ netmig2013           : int  830168 -26232 -467 623421 233446 7343 -701 42047 171 62836 ...
##  $ netmig2014           : int  926546 -55176 -51617 733183 300156 2725 -8443 61260 -271 84680 ...
##  $ netmig2015           : int  1041605 -115512 -94140 875544 375713 3036 -5893 65072 3197 76932 ...
##  $ netmig2016           : int  1046709 -139563 -84347 869982 400637 3620 -2882 82382 4435 19523 ...
##  $ netmig2017           : int  930409 -102968 -37933 762046 309264 5309 -8141 76071 6020 -24732 ...
##  $ netmig2018           : int  701823 -119642 -62915 648256 236124 8658 -10220 92050 2882 -83981 ...
##  $ netmig2019           : int  595348 -160186 -75874 650855 180553 12159 -8823 98799 2783 -129386 ...
##  $ residual2010         : int  0 -3122 -1064 4595 -409 -7 91 -217 204 -304 ...
##  $ residual2011         : int  0 -621 -175 1010 -214 3 17 -3 47 -369 ...
##  $ residual2012         : int  0 -3816 -1592 8219 -2811 115 102 568 236 -3987 ...
##  $ residual2013         : int  0 -2966 -1622 6224 -1636 72 15 258 81 -2774 ...
##  $ residual2014         : int  0 -2784 -3235 7250 -1231 -198 239 496 5 -3354 ...
##  $ residual2015         : int  0 -1110 -2372 4307 -825 -259 128 273 -19 -2441 ...
##  $ residual2016         : int  0 -319 -1150 1106 363 -121 41 89 -43 -688 ...
##  $ residual2017         : int  0 -693 -1204 1354 543 -114 45 177 10 -590 ...
##  $ residual2018         : int  0 -615 -914 967 562 -69 39 326 10 -367 ...
##  $ residual2019         : int  0 -783 -1239 1046 976 -89 17 475 -30 -76 ...
##  $ rbirth2011           : num  12.8 11.6 12.4 13.1 13.6 ...
##  $ rbirth2012           : num  12.6 11.5 12.4 12.9 13.2 ...
##   [list output truncated]
table(pop$name)
## 
##              Alabama               Alaska              Arizona 
##                    1                    1                    1 
##             Arkansas           California             Colorado 
##                    1                    1                    1 
##          Connecticut             Delaware District of Columbia 
##                    1                    1                    1 
##              Florida              Georgia               Hawaii 
##                    1                    1                    1 
##                Idaho             Illinois              Indiana 
##                    1                    1                    1 
##                 Iowa               Kansas             Kentucky 
##                    1                    1                    1 
##            Louisiana                Maine             Maryland 
##                    1                    1                    1 
##        Massachusetts             Michigan       Midwest Region 
##                    1                    1                    1 
##            Minnesota          Mississippi             Missouri 
##                    1                    1                    1 
##              Montana             Nebraska               Nevada 
##                    1                    1                    1 
##        New Hampshire           New Jersey           New Mexico 
##                    1                    1                    1 
##             New York       North Carolina         North Dakota 
##                    1                    1                    1 
##     Northeast Region                 Ohio             Oklahoma 
##                    1                    1                    1 
##               Oregon         Pennsylvania          Puerto Rico 
##                    1                    1                    1 
##         Rhode Island       South Carolina         South Dakota 
##                    1                    1                    1 
##         South Region            Tennessee                Texas 
##                    1                    1                    1 
##        United States                 Utah              Vermont 
##                    1                    1                    1 
##             Virginia           Washington          West Region 
##                    1                    1                    1 
##        West Virginia            Wisconsin              Wyoming 
##                    1                    1                    1
pop_est <- select(pop, name, popestimate2019)

df2 <- left_join(df2, pop_est, by = c('sub.area' = 'name'))

str(df2)
## 'data.frame':    3600 obs. of  26 variables:
##  $ season                           : chr  "2019-20" "2019-20" "2019-20" "2019-20" ...
##  $ week                             : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ num.influenza.deaths             : num  30 21 38 22 36 40 30 37 24 32 ...
##  $ num.pneumonia.deaths             : num  290 274 286 285 241 256 257 240 256 260 ...
##  $ num.covid.19.deaths              : num  0 0 0 0 0 1 0 1 0 0 ...
##  $ total.pic                        : num  320 295 324 307 277 296 287 278 280 292 ...
##  $ expected                         : num  7.17 7.25 7.31 7.36 7.4 ...
##  $ threshold                        : num  7.52 7.59 7.65 7.7 7.74 ...
##  $ percent.p.i                      : num  9.63 9.06 9.77 9.51 8.58 ...
##  $ percent.pic                      : num  9.63 9.06 9.77 9.51 8.58 ...
##  $ pi_expected_diff                 : num  2.46 1.82 2.46 2.15 1.19 ...
##  $ pi_threshold_diff                : num  2.11 1.47 2.119 1.81 0.844 ...
##  $ sub.area                         : chr  "New York" "New York" "New York" "New York" ...
##  $ state_of_emergency_declared      : Factor w/ 16 levels "February 29",..: 14 14 14 14 14 14 14 14 14 14 ...
##  $ stay_at_home_ordered             : Factor w/ 24 levels "April 1","April 2",..: 11 11 11 11 11 11 11 11 11 11 ...
##  $ face_coverings_required_in_public: Factor w/ 3 levels "No","Varies by county",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ gatherings_banned                : Factor w/ 9 levels "10 or more","10 or more (recommended)",..: 8 8 8 8 8 8 8 8 8 8 ...
##  $ out_of_state_travel_restrictions : Factor w/ 8 levels "Limited quarantine",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ closures_school                  : Factor w/ 2 levels "Yes (districts' choice)",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ closures_daycare                 : Factor w/ 3 levels "No","Restricted",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ closures_bars                    : Factor w/ 3 levels "No","Restricted",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ closures_retail                  : Factor w/ 4 levels "No","Regional",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ soe_week                         : num  10 10 10 10 10 10 10 10 10 10 ...
##  $ sah_week                         : num  12 12 12 12 12 12 12 12 12 12 ...
##  $ week_v2                          : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ popestimate2019                  : int  19453561 19453561 19453561 19453561 19453561 19453561 19453561 19453561 19453561 19453561 ...
df2$pct_covid_deaths <- df2$num.covid.19.deaths / df2$popestimate2019 
## Percent of population dying from covid
ggplot(df2, aes(x = week_v2, y = pct_covid_deaths)) +
  geom_line() +
  facet_wrap(~sub.area)

## percent pi vs pic
ggplot(df2, aes(x = week_v2, y = percent.p.i / 100)) +
  geom_line() +
  geom_line(aes(y = percent.pic / 100), color = 'orange') +
  facet_wrap(~sub.area)

## showing different facets
ggplot(df2, aes(x = week_v2, y = pi_expected_diff, color = sub.area)) +
  geom_line() +
  facet_wrap(~face_coverings_required_in_public, nrow = n_distinct(df$face_coverings_required_in_public))
## Warning: Removed 15 row(s) containing missing values (geom_path).

### maybe look at averages instead
## face coverings v2
group_by(df2, week_v2, face_coverings_required_in_public) %>%
  summarize(avg_excess_pi = mean(pi_expected_diff, na.rm = T),
            .groups = 'drop') %>%
  ggplot(aes(x = week_v2, y = avg_excess_pi, color = face_coverings_required_in_public)) +
  geom_line() +
  theme(legend.position = 'bottom')

## gatherings banned
group_by(df2, week_v2, gatherings_banned) %>%
  summarize(avg_excess_pi = mean(pi_expected_diff, na.rm = T),
            .groups = 'drop') %>%
  ggplot(aes(x = week_v2, y = avg_excess_pi, color = gatherings_banned)) +
  geom_line()
## Warning: Removed 4 row(s) containing missing values (geom_path).

## travel restrictions
group_by(df2, week_v2, out_of_state_travel_restrictions) %>%
  summarize(avg_excess_pi = mean(pi_expected_diff, na.rm = T),
            .groups = 'drop') %>%
  ggplot(aes(x = week_v2, y = avg_excess_pi, color = out_of_state_travel_restrictions)) +
  geom_line()
## Warning: Removed 2 row(s) containing missing values (geom_path).

## closures - school
group_by(df2, week_v2, closures_school) %>%
  summarize(avg_excess_pi = mean(pi_expected_diff, na.rm = T),
            .groups = 'drop') %>%
  ggplot(aes(x = week_v2, y = avg_excess_pi, color = closures_school)) +
  geom_line()
## Warning: Removed 1 row(s) containing missing values (geom_path).

## closures - daycare
group_by(df2, week_v2, closures_daycare) %>%
  summarize(avg_excess_pi = mean(pi_expected_diff, na.rm = T),
            .groups = 'drop') %>%
  ggplot(aes(x = week_v2, y = avg_excess_pi, color = closures_daycare)) +
  geom_line()

## closures - bars
group_by(df2, week_v2, closures_bars) %>%
  summarize(avg_excess_pi = mean(pi_expected_diff, na.rm = T),
            .groups = 'drop') %>%
  ggplot(aes(x = week_v2, y = avg_excess_pi, color = closures_bars)) +
  geom_line()
## Warning: Removed 1 row(s) containing missing values (geom_path).

## closures - retail
group_by(df2, week_v2, closures_retail) %>%
  summarize(avg_excess_pi = mean(pi_expected_diff, na.rm = T),
            .groups = 'drop') %>%
  ggplot(aes(x = week_v2, y = avg_excess_pi, color = closures_retail)) +
  geom_line()

## state of emergency timing
group_by(df2, week_v2, soe_week) %>%
  summarize(avg_excess_pi = mean(pi_expected_diff, na.rm = T),
            .groups = 'drop') %>%
  ggplot(aes(x = week_v2, y = avg_excess_pi, color = factor(soe_week))) +
  geom_line()

## stay at home timing
group_by(df2, week_v2, sah_week) %>%
  summarize(avg_excess_pi = mean(pi_expected_diff, na.rm = T),
            .groups = 'drop') %>%
  ggplot(aes(x = week_v2, y = avg_excess_pi, color = factor(sah_week))) +
  geom_line()

model_dat <- df %>%
  filter( week %in% c( 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 1, 2, 3, 4, 5, 6, 7, 8 )) %>%
  dplyr::group_by( sub.area ) %>%
  dplyr::mutate( PI_expected_diff_imp = na_kalman( pi_expected_diff ),
                 num = row_number() )

#join population estimate
model_dat <- left_join(model_dat, pop_est, by = c('sub.area' = 'name'))

# engineer a feature from previous non-COVID-19 years (before 2019-2020) flu data to take the mean difference between observed and expected PI mortality
preCOVID_diff <- model_dat %>%
  filter( !season %in% c('2019-20', '2020-21') ) %>%
  rowwise() %>%
  mutate( season2 = list(strsplit(season, split = "-")[[1]][1])) %>%
  select( c( season2, week, sub.area, pi_expected_diff )) %>%
  pivot_wider( names_from = season2, names_prefix = 'S', values_from = pi_expected_diff) %>%
  mutate( mean_preCOVID_diff = rowMeans(dplyr::select( ., S2013:S2018), na.rm = TRUE) ) %>%
  select( c( week, sub.area, mean_preCOVID_diff ) )

train_dat <- model_dat %>%
  filter( season == '2020-21' )

#merge the preCOVID_diff feature
train_dat <- merge( train_dat, preCOVID_diff, by.x = c('week', 'sub.area'), by.y = c('week', 'sub.area' ))

#remove features that won't contribute to the model
train_dat <- train_dat %>%
  select( -c( season, num.influenza.deaths, num.pneumonia.deaths, num.covid.19.deaths, closures_school,
              total.pic, state_of_emergency_declared, stay_at_home_ordered, pi_threshold_diff, threshold, ) ) %>%
  mutate_if( is.character, as.factor )
glimpse( train_dat )
## Rows: 1,056
## Columns: 18
## $ week                              <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ sub.area                          <fct> Alabama, Alaska, Arizona, Arkansa...
## $ expected                          <dbl> 7.174014, 7.174014, 7.174014, 7.1...
## $ percent.p.i                       <dbl> 14.4, 8.3, 26.6, 19.7, 29.5, 12.4...
## $ percent.pic                       <dbl> 34.2, 14.6, 41.6, 32.3, 46.9, 22....
## $ pi_expected_diff                  <dbl> 7.225986, 1.125986, 19.425986, 12...
## $ face_coverings_required_in_public <fct> Yes, No, No, Yes, Yes, Yes, Yes, ...
## $ gatherings_banned                 <fct> "10 or more", "10 or more", "50 o...
## $ out_of_state_travel_restrictions  <fct> No, Mandatory quarantine, Limited...
## $ closures_daycare                  <fct> Yes, Yes, Yes, Yes, Yes, Restrict...
## $ closures_bars                     <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes...
## $ closures_retail                   <fct> Yes, Yes, Yes, Regional, Yes, Yes...
## $ soe_week                          <dbl> 11, 11, 11, 11, 10, 11, 11, 11, 1...
## $ sah_week                          <dbl> 14, 13, 14, NA, 12, 13, 13, 13, 1...
## $ PI_expected_diff_imp              <dbl> 7.225986, 1.125986, 19.425986, 12...
## $ num                               <int> 15, 15, 15, 15, 15, 15, 15, 15, 1...
## $ popestimate2019                   <int> 4903185, 731545, 7278717, 3017804...
## $ mean_preCOVID_diff                <dbl> 0.8093196, -1.5406804, 0.5259862,...
summary( train_dat )
##       week             sub.area      expected      percent.p.i   
##  Min.   : 1.00   Alabama   : 22   Min.   :5.805   Min.   : 0.00  
##  1st Qu.: 6.00   Alaska    : 22   1st Qu.:6.306   1st Qu.: 8.60  
##  Median :42.50   Arizona   : 22   Median :6.921   Median :12.05  
##  Mean   :31.23   Arkansas  : 22   Mean   :6.795   Mean   :12.70  
##  3rd Qu.:48.00   California: 22   3rd Qu.:7.309   3rd Qu.:16.20  
##  Max.   :53.00   Colorado  : 22   Max.   :7.427   Max.   :31.80  
##                  (Other)   :924                   NA's   :6      
##   percent.pic    pi_expected_diff face_coverings_required_in_public
##  Min.   : 0.00   Min.   :-7.422   No              :396             
##  1st Qu.:13.90   1st Qu.: 2.011   Varies by county: 22             
##  Median :20.95   Median : 5.265   Yes             :638             
##  Mean   :22.06   Mean   : 5.908                                    
##  3rd Qu.:29.30   3rd Qu.: 9.098                                    
##  Max.   :54.40   Max.   :24.553                                    
##  NA's   :8       NA's   :6                                         
##                                          gatherings_banned
##  10 or more                                       :462    
##  All                                              :286    
##  50 or more                                       :110    
##  11 or more                                       : 88    
##  10 or more (recommended)                         : 22    
##  11 or more, and public gathering in public places: 22    
##  (Other)                                          : 66    
##               out_of_state_travel_restrictions   closures_daycare
##  No                           :462             No        : 88    
##  Mandatory quarantine         :396             Restricted: 88    
##  Limited quarantine           :132             Yes       :880    
##  Limited quarantine / Screened: 22                               
##  Recommended quarantine       : 22                               
##  Regional                     : 22                               
##  (Other)                      :  0                               
##     closures_bars    closures_retail    soe_week        sah_week    
##  No        :  22   No        : 66    Min.   : 9.00   Min.   :12.00  
##  Restricted:  22   Regional  : 44    1st Qu.:10.75   1st Qu.:13.00  
##  Yes       :1012   Restricted: 88    Median :11.00   Median :13.00  
##                    Yes       :858    Mean   :10.73   Mean   :13.35  
##                                      3rd Qu.:11.00   3rd Qu.:14.00  
##                                      Max.   :12.00   Max.   :15.00  
##                                                      NA's   :242    
##  PI_expected_diff_imp      num         popestimate2019    mean_preCOVID_diff
##  Min.   :-7.422       Min.   :  1.00   Min.   :  578759   Min.   :-4.8947   
##  1st Qu.: 2.006       1st Qu.:  6.00   1st Qu.: 1694267   1st Qu.:-0.4009   
##  Median : 5.228       Median : 12.00   Median : 4342705   Median : 0.4690   
##  Mean   : 5.887       Mean   : 14.58   Mean   : 5978404   Mean   : 0.6844   
##  3rd Qu.: 9.057       3rd Qu.: 17.00   3rd Qu.: 6989056   3rd Qu.: 1.5053   
##  Max.   :24.553       Max.   :170.00   Max.   :39512223   Max.   : 8.6053   
## 

98% of states closed schools for the remainder of the term, so we will not consider this feature. Here we create flags for several of the regulations variables to simplify the modeling approach:

train_dat <- train_dat %>%
  mutate( yes_closures_bars = closures_bars == 'Yes', # Bar closures == Yes
          yes_closures_daycare = closures_daycare == 'Yes', # Daycare closures == Yes
          yes_closures_retail = closures_retail == 'Yes', # retail closures == Yes
          yes_face_coverings = face_coverings_required_in_public == 'Yes', # Face coverings required in public == Yes
          yes_travel_restrictions = out_of_state_travel_restrictions != 'No', 
          # Out of State travel restrictions != No (some form or restriction enacted)
          all_gatherings_banned = gatherings_banned == 'All' ) %>% # All Gathering Banned 
    
  mutate_if( is.logical, as.factor )

Model Building

train + validation set

#split train_dat into a train and validation set. 
splitSample <- sample(1:3, size=nrow(train_dat), prob=c(0.7,0.15,0.15), replace = TRUE)
train_PI <- train_dat[splitSample==1,]
valid_PI <- train_dat[splitSample==2,]
test_PI <- train_dat[splitSample==3,]

#str( train_PI )

We will be using the linear model of Excess PI Mortality ~ PIC Mortality as a benchmark

Model 1 Simple Linear Regression

model1 <- lm( data = train_PI, PI_expected_diff_imp ~ percent.pic )
summary( model1 )
## 
## Call:
## lm(formula = PI_expected_diff_imp ~ percent.pic, data = train_PI)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.5737 -1.3490 -0.0694  1.2506 11.2639 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -4.22529    0.20751  -20.36   <2e-16 ***
## percent.pic  0.45729    0.00841   54.38   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.304 on 718 degrees of freedom
##   (6 observations deleted due to missingness)
## Multiple R-squared:  0.8046, Adjusted R-squared:  0.8043 
## F-statistic:  2957 on 1 and 718 DF,  p-value: < 2.2e-16
ggplot( data = train_PI, aes( x = percent.pic, y = PI_expected_diff_imp ) ) +
  geom_point() +
  geom_smooth( method = 'lm', se = F, color = 'black' ) +
  theme_classic()
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 6 rows containing non-finite values (stat_smooth).
## Warning: Removed 6 rows containing missing values (geom_point).

Diagnostic plots

plot( model1 )

The simple linear model gives a statistically significant fit to the data and has a relatively high \(R^2\) value. However, there are some indications from the diagnostic plots that the variance of residuals is not constant across the data series. We will attempt to explain more of the variance by adding features to the model that will hopefully increase the predictive power.

Model2 Multiple Linear Regression

Adding feature variables for government regulations:

  • soe_week
  • sah_week
  • yes_closure_bars
  • yes_closures_daycare
  • yes_closures_retail
  • yes_face_coverings
  • yes_travel_restrictions
  • yes_travel_restrictions
mlm <- PI_expected_diff_imp ~ percent.pic + soe_week + sah_week  + yes_closures_daycare + yes_closures_retail +  
  yes_face_coverings + yes_travel_restrictions + all_gatherings_banned

model2 <- lm( formula = mlm, data = train_PI )
summary( model2 )
## 
## Call:
## lm(formula = mlm, data = train_PI)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.7475 -1.1220 -0.0027  1.1443 10.6079 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  -4.8862     3.3489  -1.459 0.145142    
## percent.pic                   0.4790     0.0103  46.506  < 2e-16 ***
## soe_week                     -0.5948     0.1642  -3.623 0.000319 ***
## sah_week                      0.3978     0.1743   2.282 0.022889 *  
## yes_closures_daycareTRUE      0.5817     0.2815   2.067 0.039229 *  
## yes_closures_retailTRUE       1.6495     0.3436   4.801 2.05e-06 ***
## yes_face_coveringsTRUE       -0.2041     0.2342  -0.871 0.383902    
## yes_travel_restrictionsTRUE  -0.5680     0.2219  -2.560 0.010733 *  
## all_gatherings_bannedTRUE    -0.3411     0.2746  -1.242 0.214742    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.242 on 539 degrees of freedom
##   (178 observations deleted due to missingness)
## Multiple R-squared:  0.8159, Adjusted R-squared:  0.8131 
## F-statistic: 298.5 on 8 and 539 DF,  p-value: < 2.2e-16
step_m2 <- step( model2 )
## Start:  AIC=893.71
## PI_expected_diff_imp ~ percent.pic + soe_week + sah_week + yes_closures_daycare + 
##     yes_closures_retail + yes_face_coverings + yes_travel_restrictions + 
##     all_gatherings_banned
## 
##                           Df Sum of Sq     RSS     AIC
## - yes_face_coverings       1       3.8  2712.7  892.48
## - all_gatherings_banned    1       7.8  2716.6  893.28
## <none>                                  2708.9  893.71
## - yes_closures_daycare     1      21.5  2730.3  896.04
## - sah_week                 1      26.2  2735.0  896.98
## - yes_travel_restrictions  1      32.9  2741.8  898.34
## - soe_week                 1      66.0  2774.8  904.90
## - yes_closures_retail      1     115.9  2824.7  914.66
## - percent.pic              1   10869.9 13578.8 1775.07
## 
## Step:  AIC=892.48
## PI_expected_diff_imp ~ percent.pic + soe_week + sah_week + yes_closures_daycare + 
##     yes_closures_retail + yes_travel_restrictions + all_gatherings_banned
## 
##                           Df Sum of Sq     RSS     AIC
## - all_gatherings_banned    1       7.2  2719.9  891.94
## <none>                                  2712.7  892.48
## - yes_closures_daycare     1      21.1  2733.8  894.73
## - yes_travel_restrictions  1      29.4  2742.1  896.39
## - sah_week                 1      37.9  2750.6  898.09
## - soe_week                 1      64.3  2777.0  903.32
## - yes_closures_retail      1     122.7  2835.4  914.73
## - percent.pic              1   10935.7 13648.4 1775.88
## 
## Step:  AIC=891.94
## PI_expected_diff_imp ~ percent.pic + soe_week + sah_week + yes_closures_daycare + 
##     yes_closures_retail + yes_travel_restrictions
## 
##                           Df Sum of Sq     RSS     AIC
## <none>                                  2719.9  891.94
## - yes_closures_daycare     1      22.6  2742.5  894.48
## - yes_travel_restrictions  1      23.8  2743.7  894.72
## - soe_week                 1      57.9  2777.9  901.49
## - sah_week                 1      76.9  2796.8  905.21
## - yes_closures_retail      1     141.1  2861.1  917.67
## - percent.pic              1   10934.5 13654.4 1774.12
summary( step_m2 )
## 
## Call:
## lm(formula = PI_expected_diff_imp ~ percent.pic + soe_week + 
##     sah_week + yes_closures_daycare + yes_closures_retail + yes_travel_restrictions, 
##     data = train_PI)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.5314 -1.1072 -0.0158  1.1423 10.4794 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 -7.88802    2.66609  -2.959 0.003225 ** 
## percent.pic                  0.47807    0.01025  46.636  < 2e-16 ***
## soe_week                    -0.54266    0.15985  -3.395 0.000737 ***
## sah_week                     0.55038    0.14077   3.910 0.000104 ***
## yes_closures_daycareTRUE     0.59589    0.28099   2.121 0.034402 *  
## yes_closures_retailTRUE      1.76926    0.33391   5.299  1.7e-07 ***
## yes_travel_restrictionsTRUE -0.44996    0.20678  -2.176 0.029986 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.242 on 541 degrees of freedom
##   (178 observations deleted due to missingness)
## Multiple R-squared:  0.8151, Adjusted R-squared:  0.8131 
## F-statistic: 397.5 on 6 and 541 DF,  p-value: < 2.2e-16

Adding the following feature variables:

  • mean_preCOVID_diff - what is the weekly mean difference in observed and predicted PI mortality for each state.
  • percent.pic - weekly percent of deaths due to PI
  • popestimate2019
  • soe_week
  • sah_week
  • yes_closure_bars
  • yes_closures_daycare
  • yes_closures_retail
  • yes_face_coverings
  • yes_travel_restrictions
  • yes_travel_restrictions
#trying to figure out why only 1 level is seen for the closures_bars variable
#cant figure this out? must be something really dumb
debug_contr_error <- function (dat, subset_vec = NULL) {
  if (!is.null(subset_vec)) {
    ## step 0
    if (mode(subset_vec) == "logical") {
      if (length(subset_vec) != nrow(dat)) {
        stop("'logical' `subset_vec` provided but length does not match `nrow(dat)`")
        }
      subset_log_vec <- subset_vec
      } else if (mode(subset_vec) == "numeric") {
      ## check range
      ran <- range(subset_vec)
      if (ran[1] < 1 || ran[2] > nrow(dat)) {
        stop("'numeric' `subset_vec` provided but values are out of bound")
        } else {
        subset_log_vec <- logical(nrow(dat))
        subset_log_vec[as.integer(subset_vec)] <- TRUE
        } 
      } else {
      stop("`subset_vec` must be either 'logical' or 'numeric'")
      }
    dat <- base::subset(dat, subset = subset_log_vec)
    } else {
    ## step 1
    dat <- stats::na.omit(dat)
    }
  if (nrow(dat) == 0L) warning("no complete cases")
  ## step 2
  var_mode <- sapply(dat, mode)
  if (any(var_mode %in% c("complex", "raw"))) stop("complex or raw not allowed!")
  var_class <- sapply(dat, class)
  if (any(var_mode[var_class == "AsIs"] %in% c("logical", "character"))) {
    stop("matrix variables with 'AsIs' class must be 'numeric'")
    }
  ind1 <- which(var_mode %in% c("logical", "character"))
  dat[ind1] <- lapply(dat[ind1], as.factor)
  ## step 3
  fctr <- which(sapply(dat, is.factor))
  if (length(fctr) == 0L) warning("no factor variables to summary")
  ind2 <- if (length(ind1) > 0L) fctr[-ind1] else fctr
  dat[ind2] <- lapply(dat[ind2], base::droplevels.factor)
  ## step 4
  lev <- lapply(dat[fctr], base::levels.default)
  nl <- lengths(lev)
  ## return
  list(nlevels = nl, levels = lev)
  }


debug_contr_error( train_dat )
## $nlevels
##                          sub.area face_coverings_required_in_public 
##                                37                                 3 
##                 gatherings_banned  out_of_state_travel_restrictions 
##                                 9                                 6 
##                  closures_daycare                     closures_bars 
##                                 3                                 1 
##                   closures_retail                 yes_closures_bars 
##                                 2                                 1 
##              yes_closures_daycare               yes_closures_retail 
##                                 2                                 2 
##                yes_face_coverings           yes_travel_restrictions 
##                                 2                                 2 
##             all_gatherings_banned 
##                                 2 
## 
## $levels
## $levels$sub.area
##  [1] "Alabama"              "Alaska"               "Arizona"             
##  [4] "California"           "Colorado"             "Connecticut"         
##  [7] "Delaware"             "District of Columbia" "Florida"             
## [10] "Georgia"              "Hawaii"               "Idaho"               
## [13] "Illinois"             "Indiana"              "Kansas"              
## [16] "Louisiana"            "Maine"                "Maryland"            
## [19] "Michigan"             "Minnesota"            "Mississippi"         
## [22] "Missouri"             "Montana"              "Nevada"              
## [25] "New Hampshire"        "New Jersey"           "New Mexico"          
## [28] "New York"             "Ohio"                 "Oregon"              
## [31] "Pennsylvania"         "Rhode Island"         "South Carolina"      
## [34] "Tennessee"            "Vermont"              "Virginia"            
## [37] "Washington"          
## 
## $levels$face_coverings_required_in_public
## [1] "No"               "Varies by county" "Yes"             
## 
## $levels$gatherings_banned
## [1] "10 or more"                                       
## [2] "10 or more (recommended)"                         
## [3] "11 or more"                                       
## [4] "11 or more, and public gathering in public places"
## [5] "25 or more"                                       
## [6] "50 or more"                                       
## [7] "6 or more"                                        
## [8] "All"                                              
## [9] "All outside, and 11 or more inside a household"   
## 
## $levels$out_of_state_travel_restrictions
## [1] "Limited quarantine"            "Limited quarantine / Screened"
## [3] "Mandatory quarantine"          "No"                           
## [5] "Recommended quarantine"        "Regional"                     
## 
## $levels$closures_daycare
## [1] "No"         "Restricted" "Yes"       
## 
## $levels$closures_bars
## [1] "Yes"
## 
## $levels$closures_retail
## [1] "Restricted" "Yes"       
## 
## $levels$yes_closures_bars
## [1] "TRUE"
## 
## $levels$yes_closures_daycare
## [1] "FALSE" "TRUE" 
## 
## $levels$yes_closures_retail
## [1] "FALSE" "TRUE" 
## 
## $levels$yes_face_coverings
## [1] "FALSE" "TRUE" 
## 
## $levels$yes_travel_restrictions
## [1] "FALSE" "TRUE" 
## 
## $levels$all_gatherings_banned
## [1] "FALSE" "TRUE"
str( train_PI )
## 'data.frame':    726 obs. of  24 variables:
##  $ week                             : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ sub.area                         : Factor w/ 48 levels "Alabama","Alaska",..: 1 2 3 4 5 7 8 9 10 11 ...
##  $ expected                         : num  7.17 7.17 7.17 7.17 7.17 ...
##  $ percent.p.i                      : num  14.4 8.3 26.6 19.7 29.5 14.8 16.8 15.4 17.9 21.5 ...
##  $ percent.pic                      : num  34.2 14.6 41.6 32.3 46.9 32.1 29.6 24.5 23.9 33.5 ...
##  $ pi_expected_diff                 : num  7.23 1.13 19.43 12.53 22.33 ...
##  $ face_coverings_required_in_public: Factor w/ 3 levels "No","Varies by county",..: 3 1 1 3 3 3 3 3 1 1 ...
##  $ gatherings_banned                : Factor w/ 9 levels "10 or more","10 or more (recommended)",..: 1 1 6 1 8 8 8 1 1 1 ...
##  $ out_of_state_travel_restrictions : Factor w/ 8 levels "Limited quarantine",..: 4 3 1 4 4 5 3 4 2 4 ...
##  $ closures_daycare                 : Factor w/ 3 levels "No","Restricted",..: 3 3 3 3 3 3 3 3 3 1 ...
##  $ closures_bars                    : Factor w/ 3 levels "No","Restricted",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ closures_retail                  : Factor w/ 4 levels "No","Regional",..: 4 4 4 2 4 4 4 4 4 4 ...
##  $ soe_week                         : num  11 11 11 11 10 11 11 11 9 11 ...
##  $ sah_week                         : num  14 13 14 NA 12 13 13 14 14 14 ...
##  $ PI_expected_diff_imp             : num  7.23 1.13 19.43 12.53 22.33 ...
##  $ num                              : int  15 15 15 15 15 15 15 15 15 15 ...
##  $ popestimate2019                  : int  4903185 731545 7278717 3017804 39512223 3565287 973764 705749 21477737 10617423 ...
##  $ mean_preCOVID_diff               : num  0.809 -1.541 0.526 2.593 3.143 ...
##  $ yes_closures_bars                : Factor w/ 2 levels "FALSE","TRUE": 2 2 2 2 2 2 2 2 2 2 ...
##  $ yes_closures_daycare             : Factor w/ 2 levels "FALSE","TRUE": 2 2 2 2 2 2 2 2 2 1 ...
##  $ yes_closures_retail              : Factor w/ 2 levels "FALSE","TRUE": 2 2 2 1 2 2 2 2 2 2 ...
##  $ yes_face_coverings               : Factor w/ 2 levels "FALSE","TRUE": 2 1 1 2 2 2 2 2 1 1 ...
##  $ yes_travel_restrictions          : Factor w/ 2 levels "FALSE","TRUE": 1 2 2 1 1 2 2 1 2 1 ...
##  $ all_gatherings_banned            : Factor w/ 2 levels "FALSE","TRUE": 1 1 1 1 2 2 2 1 1 1 ...
mlm2 <- PI_expected_diff_imp ~ mean_preCOVID_diff + percent.pic + popestimate2019 +
  soe_week + sah_week  + closures_daycare + closures_retail +
  face_coverings_required_in_public + out_of_state_travel_restrictions 

model3 <- lm( formula = mlm2, data = train_PI )
summary( model3 )
## 
## Call:
## lm(formula = mlm2, data = train_PI)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -8.457 -1.058 -0.008  1.034 10.095 
## 
## Coefficients:
##                                                                 Estimate
## (Intercept)                                                   -1.625e+01
## mean_preCOVID_diff                                            -7.429e-03
## percent.pic                                                    4.740e-01
## popestimate2019                                                6.005e-08
## soe_week                                                      -3.665e-02
## sah_week                                                       7.420e-01
## closures_daycareRestricted                                    -1.195e+00
## closures_daycareYes                                           -1.343e-01
## closures_retailYes                                             1.695e+00
## face_coverings_required_in_publicVaries by county              5.133e-01
## face_coverings_required_in_publicYes                           2.417e-01
## out_of_state_travel_restrictionsLimited quarantine / Screened  1.563e+00
## out_of_state_travel_restrictionsMandatory quarantine           5.999e-01
## out_of_state_travel_restrictionsNo                             5.376e-01
## out_of_state_travel_restrictionsRecommended quarantine        -1.782e+00
## out_of_state_travel_restrictionsRegional                      -1.420e+00
##                                                               Std. Error
## (Intercept)                                                    3.877e+00
## mean_preCOVID_diff                                             7.919e-02
## percent.pic                                                    1.023e-02
## popestimate2019                                                1.792e-08
## soe_week                                                       2.003e-01
## sah_week                                                       2.003e-01
## closures_daycareRestricted                                     6.157e-01
## closures_daycareYes                                            4.203e-01
## closures_retailYes                                             3.370e-01
## face_coverings_required_in_publicVaries by county              6.814e-01
## face_coverings_required_in_publicYes                           2.791e-01
## out_of_state_travel_restrictionsLimited quarantine / Screened  7.463e-01
## out_of_state_travel_restrictionsMandatory quarantine           4.166e-01
## out_of_state_travel_restrictionsNo                             4.232e-01
## out_of_state_travel_restrictionsRecommended quarantine         7.290e-01
## out_of_state_travel_restrictionsRegional                       6.463e-01
##                                                               t value Pr(>|t|)
## (Intercept)                                                    -4.192 3.23e-05
## mean_preCOVID_diff                                             -0.094 0.925297
## percent.pic                                                    46.333  < 2e-16
## popestimate2019                                                 3.351 0.000863
## soe_week                                                       -0.183 0.854884
## sah_week                                                        3.704 0.000234
## closures_daycareRestricted                                     -1.940 0.052850
## closures_daycareYes                                            -0.319 0.749522
## closures_retailYes                                              5.029 6.76e-07
## face_coverings_required_in_publicVaries by county               0.753 0.451614
## face_coverings_required_in_publicYes                            0.866 0.386862
## out_of_state_travel_restrictionsLimited quarantine / Screened   2.094 0.036703
## out_of_state_travel_restrictionsMandatory quarantine            1.440 0.150390
## out_of_state_travel_restrictionsNo                              1.270 0.204485
## out_of_state_travel_restrictionsRecommended quarantine         -2.444 0.014847
## out_of_state_travel_restrictionsRegional                       -2.198 0.028403
##                                                                  
## (Intercept)                                                   ***
## mean_preCOVID_diff                                               
## percent.pic                                                   ***
## popestimate2019                                               ***
## soe_week                                                         
## sah_week                                                      ***
## closures_daycareRestricted                                    .  
## closures_daycareYes                                              
## closures_retailYes                                            ***
## face_coverings_required_in_publicVaries by county                
## face_coverings_required_in_publicYes                             
## out_of_state_travel_restrictionsLimited quarantine / Screened *  
## out_of_state_travel_restrictionsMandatory quarantine             
## out_of_state_travel_restrictionsNo                               
## out_of_state_travel_restrictionsRecommended quarantine        *  
## out_of_state_travel_restrictionsRegional                      *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.185 on 532 degrees of freedom
##   (178 observations deleted due to missingness)
## Multiple R-squared:  0.8274, Adjusted R-squared:  0.8225 
## F-statistic:   170 on 15 and 532 DF,  p-value: < 2.2e-16
step_m3 <- step( model3 )
## Start:  AIC=872.28
## PI_expected_diff_imp ~ mean_preCOVID_diff + percent.pic + popestimate2019 + 
##     soe_week + sah_week + closures_daycare + closures_retail + 
##     face_coverings_required_in_public + out_of_state_travel_restrictions
## 
##                                     Df Sum of Sq     RSS     AIC
## - face_coverings_required_in_public  2       4.5  2543.8  869.25
## - mean_preCOVID_diff                 1       0.0  2539.3  870.29
## - soe_week                           1       0.2  2539.4  870.32
## <none>                                            2539.3  872.28
## - closures_daycare                   2      28.2  2567.5  874.33
## - popestimate2019                    1      53.6  2592.9  881.73
## - sah_week                           1      65.5  2604.8  884.24
## - out_of_state_travel_restrictions   5     129.7  2669.0  889.58
## - closures_retail                    1     120.7  2660.0  895.73
## - percent.pic                        1   10246.8 12786.1 1756.11
## 
## Step:  AIC=869.25
## PI_expected_diff_imp ~ mean_preCOVID_diff + percent.pic + popestimate2019 + 
##     soe_week + sah_week + closures_daycare + closures_retail + 
##     out_of_state_travel_restrictions
## 
##                                    Df Sum of Sq     RSS     AIC
## - soe_week                          1       0.2  2544.0  867.30
## - mean_preCOVID_diff                1       0.3  2544.1  867.32
## <none>                                           2543.8  869.25
## - closures_daycare                  2      25.4  2569.2  870.70
## - popestimate2019                   1      53.5  2597.2  878.65
## - sah_week                          1      77.8  2621.6  883.76
## - out_of_state_travel_restrictions  5     134.1  2677.9  887.40
## - closures_retail                   1     123.0  2666.7  893.12
## - percent.pic                       1   10362.2 12905.9 1757.22
## 
## Step:  AIC=867.3
## PI_expected_diff_imp ~ mean_preCOVID_diff + percent.pic + popestimate2019 + 
##     sah_week + closures_daycare + closures_retail + out_of_state_travel_restrictions
## 
##                                    Df Sum of Sq     RSS     AIC
## - mean_preCOVID_diff                1       0.5  2544.5  865.40
## <none>                                           2544.0  867.30
## - closures_daycare                  2      26.3  2570.3  868.94
## - popestimate2019                   1      59.3  2603.3  877.94
## - sah_week                          1      78.0  2622.0  881.84
## - out_of_state_travel_restrictions  5     146.4  2690.4  887.97
## - closures_retail                   1     122.9  2666.9  891.15
## - percent.pic                       1   10483.4 13027.4 1760.36
## 
## Step:  AIC=865.4
## PI_expected_diff_imp ~ percent.pic + popestimate2019 + sah_week + 
##     closures_daycare + closures_retail + out_of_state_travel_restrictions
## 
##                                    Df Sum of Sq     RSS     AIC
## <none>                                           2544.5  865.40
## - closures_daycare                  2      28.0  2572.4  867.39
## - popestimate2019                   1      63.7  2608.2  876.95
## - sah_week                          1      78.2  2622.7  880.00
## - out_of_state_travel_restrictions  5     146.0  2690.5  885.98
## - closures_retail                   1     126.4  2670.9  889.98
## - percent.pic                       1   10629.1 13173.5 1764.47
summary( step_m3 )
## 
## Call:
## lm(formula = PI_expected_diff_imp ~ percent.pic + popestimate2019 + 
##     sah_week + closures_daycare + closures_retail + out_of_state_travel_restrictions, 
##     data = train_PI)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.5928 -1.0270 -0.0051  1.1067 10.1345 
## 
## Coefficients:
##                                                                 Estimate
## (Intercept)                                                   -1.714e+01
## percent.pic                                                    4.749e-01
## popestimate2019                                                6.177e-08
## sah_week                                                       7.702e-01
## closures_daycareRestricted                                    -1.037e+00
## closures_daycareYes                                           -1.847e-02
## closures_retailYes                                             1.705e+00
## out_of_state_travel_restrictionsLimited quarantine / Screened  1.548e+00
## out_of_state_travel_restrictionsMandatory quarantine           7.088e-01
## out_of_state_travel_restrictionsNo                             7.463e-01
## out_of_state_travel_restrictionsRecommended quarantine        -1.581e+00
## out_of_state_travel_restrictionsRegional                      -1.407e+00
##                                                               Std. Error
## (Intercept)                                                    2.908e+00
## percent.pic                                                    1.004e-02
## popestimate2019                                                1.687e-08
## sah_week                                                       1.897e-01
## closures_daycareRestricted                                     5.665e-01
## closures_daycareYes                                            3.963e-01
## closures_retailYes                                             3.304e-01
## out_of_state_travel_restrictionsLimited quarantine / Screened  6.686e-01
## out_of_state_travel_restrictionsMandatory quarantine           4.008e-01
## out_of_state_travel_restrictionsNo                             3.669e-01
## out_of_state_travel_restrictionsRecommended quarantine         6.967e-01
## out_of_state_travel_restrictionsRegional                       6.408e-01
##                                                               t value Pr(>|t|)
## (Intercept)                                                    -5.892 6.74e-09
## percent.pic                                                    47.318  < 2e-16
## popestimate2019                                                 3.663 0.000274
## sah_week                                                        4.060 5.65e-05
## closures_daycareRestricted                                     -1.831 0.067590
## closures_daycareYes                                            -0.047 0.962836
## closures_retailYes                                              5.161 3.47e-07
## out_of_state_travel_restrictionsLimited quarantine / Screened   2.315 0.020971
## out_of_state_travel_restrictionsMandatory quarantine            1.769 0.077522
## out_of_state_travel_restrictionsNo                              2.034 0.042452
## out_of_state_travel_restrictionsRecommended quarantine         -2.269 0.023646
## out_of_state_travel_restrictionsRegional                       -2.196 0.028502
##                                                                  
## (Intercept)                                                   ***
## percent.pic                                                   ***
## popestimate2019                                               ***
## sah_week                                                      ***
## closures_daycareRestricted                                    .  
## closures_daycareYes                                              
## closures_retailYes                                            ***
## out_of_state_travel_restrictionsLimited quarantine / Screened *  
## out_of_state_travel_restrictionsMandatory quarantine          .  
## out_of_state_travel_restrictionsNo                            *  
## out_of_state_travel_restrictionsRecommended quarantine        *  
## out_of_state_travel_restrictionsRegional                      *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.179 on 536 degrees of freedom
##   (178 observations deleted due to missingness)
## Multiple R-squared:  0.827,  Adjusted R-squared:  0.8235 
## F-statistic:   233 on 11 and 536 DF,  p-value: < 2.2e-16