Prepare Data

Libraries

library(tidyverse)
library(zoo)
library(GGally)
library(skimr)
library(magrittr)
library(moderndive)
library(corrplot)
library(DT)

Recycle/Reduce/Reuse Homework 4 & 5 Code

# load("G:/My Drive/homework/Cooper P/MA_COVID19_21_03_09.RData")

load("G:/My Drive/homework/Cooper P/MA_county_variables_2020.RData")

counties <-
  read_csv(
    "https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv")
## 
## -- Column specification --------------------------------------------------------
## cols(
##   date = col_date(format = ""),
##   county = col_character(),
##   state = col_character(),
##   fips = col_character(),
##   cases = col_double(),
##   deaths = col_double()
## )
mas_cov <-
  counties %>%
  filter(state == "Massachusetts") %>%
  select(!c(state, fips)) %>%
  # mutate(date = as.Date(date)) %>%
  mutate(new_cases = cases - lag(cases, k = 1), .after = cases) %>%
  mutate(new_deaths = deaths - lag(deaths, k = 1), .after = deaths) %>%
  mutate(new_cases_07_day = rollmean(new_cases, k = 7, fill = NA),
         .after = new_cases) %>%
  mutate(new_deaths_07_day = rollmean(new_deaths, k = 7, fill = NA),
         .after = new_deaths) %>% 
  
  full_join(ma_extra) %>%
  
  mutate(cases_per_100000 = cases / X2019_pop_est * 100000,
         .after = new_cases_07_day) %>%
  
  mutate(new_cases_per_100000 = new_cases / X2019_pop_est * 100000,
         .after = cases_per_100000) %>%
  
  mutate(new_cases_per_100000_07_day = new_cases_07_day / X2019_pop_est * 100000,
         .after = new_cases_per_100000) %>%
  
  mutate(deaths_per_100000 = deaths / X2019_pop_est * 100000,
         .after = new_deaths_07_day) %>%
  
  mutate(new_deaths_per_100000 = new_deaths / X2019_pop_est * 100000,
         .after = deaths_per_100000) %>%
  
  mutate(new_deaths_per_100000_07_day = new_cases_07_day / X2019_pop_est * 100000,
          .after = new_deaths_per_100000) %>%

  # mutate(month = month(date, label = TRUE), .after = date) %>%
  # mutate(year = year(date), .after = month) %>%
  
  mutate(region = if_else(county == "Hampshire" |
                          county == "Franklin" |
                          county == "Hampden" |
                          county == "Berkshire" |
                          county == "Worcester",
                          "West", "East") %>% as_factor(),
         .after = county) %>%
  
  mutate(pct_over_85 = X2019_age_85_plus / X2019_pop_est * 100,
         .after = X2019_age_85_plus) %>%
  mutate(pct_NHWA_2019 = pct_NHWA_2019 * 100)
## Joining, by = "county"
sum(!complete.cases(mas_cov))
## [1] 386
mas_cov %<>% drop_na()

Question 2

Part (1)

Parts (a) & (b)

https://moderndive.com/6-multiple-regression.html#model4interactiontable

mas_cov %>%
  select(deaths_per_100000, 
         people_per_Housing,
         region) %>%
  skim_without_charts()
Data summary
Name Piped data
Number of rows 5393
Number of columns 3
_______________________
Column type frequency:
factor 1
numeric 2
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
region 0 1 FALSE 2 Eas: 3481, Wes: 1912

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100
deaths_per_100000 0 1 106.28 77.67 0.0 32.01 112.54 157.53 309.41
people_per_Housing 0 1 2.08 0.57 0.9 1.81 2.41 2.50 2.51
mas_cov %>%
  select(deaths_per_100000, 
         people_per_Housing,
         region) %>%
  ggpairs()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

model.21a <- 
  mas_cov %>%
  lm(deaths_per_100000 ~ people_per_Housing * region, data = .)

model.21a %>% get_regression_table() %>% datatable()
model.21a %>% summary()
## 
## Call:
## lm(formula = deaths_per_100000 ~ people_per_Housing * region, 
##     data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -143.693  -30.570   -3.343   33.371  185.568 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    -51.953      3.442 -15.095   <2e-16 ***
## people_per_Housing              78.001      1.635  47.707   <2e-16 ***
## regionWest                     -17.723     12.686  -1.397    0.162    
## people_per_Housing:regionWest    2.453      5.648   0.434    0.664    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 64.16 on 5389 degrees of freedom
## Multiple R-squared:  0.3179, Adjusted R-squared:  0.3175 
## F-statistic: 837.2 on 3 and 5389 DF,  p-value: < 2.2e-16
model.coef <- model.21a %>% coefficients()

mas_cov %>%
  ggplot(aes(people_per_Housing, deaths_per_100000, color = region)) +
  scale_color_manual(values = c("East" = "salmon", "West" = "cyan")) +
  geom_point() +
  # East
  geom_abline(intercept = model.coef[1],
              slope = model.coef[2],
              color = "salmon",
              size = 1.25, 
              linetype = "longdash") +
  # West
  geom_abline(intercept = model.coef[1] + model.coef[3],
              slope = model.coef[2] + model.coef[4],
              color = "cyan",
              size = 1.25,
              linetype = "longdash") +
  # Linear Model
  geom_smooth(method = "lm",
              se = FALSE,
              aes(people_per_Housing, deaths_per_100000),
              inherit.aes = FALSE,
              color = "grey25",
              size = 1.25,
              linetype = "longdash") +
  ggtitle("Interaction & Linear Models")
## `geom_smooth()` using formula 'y ~ x'

Part (d)

https://moderndive.com/6-multiple-regression.html#model4table

mas_cov %>%
  ggplot(aes(people_per_Housing, deaths_per_100000, color = region)) +
  scale_color_manual(values = c("East" = "salmon", "West" = "cyan")) +
  geom_point() +
  geom_parallel_slopes(se = FALSE,
                       size = 1.25,
                       linetype = "longdash") +
  ggtitle("Parallel Slopes Model")

Part (e)

model.21e <- 
  mas_cov %>%
  lm(deaths_per_100000 ~ people_per_Housing + region, data = .)

model.21e %>% get_regression_table() %>% datatable()
model.21e %>% summary()
## 
## Call:
## lm(formula = deaths_per_100000 ~ people_per_Housing + region, 
##     data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -143.798  -30.477   -3.259   33.481  185.934 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         -52.364      3.309 -15.824  < 2e-16 ***
## people_per_Housing   78.206      1.565  49.976  < 2e-16 ***
## regionWest          -12.274      1.866  -6.577 5.26e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 64.16 on 5390 degrees of freedom
## Multiple R-squared:  0.3179, Adjusted R-squared:  0.3176 
## F-statistic:  1256 on 2 and 5390 DF,  p-value: < 2.2e-16

Part (g)

https://www.statology.org/sst-ssr-sse-in-r/

# Homework 5
model.23c <- 
  mas_cov %>%
  lm(deaths_per_100000 ~ people_per_Housing, data = .)

# Homework 5
model.24c <-
  mas_cov %>%
  lm(deaths_per_100000 ~ region, data = .)

SSE.23c <- sum((fitted(model.23c) - mas_cov$deaths_per_100000)^2)

SSE.24c <- sum((fitted(model.24c) - mas_cov$deaths_per_100000)^2)

SSE.21a <- sum((fitted(model.21a) - mas_cov$deaths_per_100000)^2)

SSE.21e <- sum((fitted(model.21e) - mas_cov$deaths_per_100000)^2)

knitr::kable(c(SSE.23c, SSE.24c, SSE.21a, SSE.21e))
x
22365968
32469367
22187133
22187910

Part (2)

Part (a)

mas_cov %>%
  select(deaths_per_100000,
         pct_NHWA_2019,
         people_per_Housing,
         Median_Household_Income_2018,
         X2019_median_age) %>%
  skim_without_charts()
Data summary
Name Piped data
Number of rows 5393
Number of columns 5
_______________________
Column type frequency:
numeric 5
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100
deaths_per_100000 0 1 106.28 77.67 0.00 32.01 112.54 157.53 309.41
pct_NHWA_2019 0 1 76.04 12.22 45.18 71.09 75.69 86.91 90.38
people_per_Housing 0 1 2.08 0.57 0.90 1.81 2.41 2.50 2.51
Median_Household_Income_2018 0 1 75336.37 15176.56 52682.00 66005.00 70224.00 89678.00 100374.00
X2019_median_age 0 1 42.18 5.26 33.30 39.10 40.80 47.20 54.30
mas_cov %>%
  select(deaths_per_100000,
         pct_NHWA_2019,
         people_per_Housing,
         Median_Household_Income_2018,
         X2019_median_age) %>%
  ggpairs()

Part (b)

mas_cov %>%
  select(deaths_per_100000,
         pct_NHWA_2019,
         people_per_Housing,
         Median_Household_Income_2018,
         X2019_median_age) %>%
  cor() %>%
  corrplot.mixed(tl.pos = "lt")

Part (c)

mas_cov %>%
  select(deaths_per_100000,
         pct_NHWA_2019,
         people_per_Housing,
         Median_Household_Income_2018,
         X2019_median_age) %>%
  pivot_longer(cols = !deaths_per_100000) %>%
  ggplot(aes(value, deaths_per_100000)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  facet_wrap(vars(name), scales = "free_x")
## `geom_smooth()` using formula 'y ~ x'

Part (d)

model.22d <-
  mas_cov %>%
  lm(deaths_per_100000 ~ X2019_median_age, data = .)

model.22d %>% get_regression_table() %>% datatable()
model.22d %>% summary()
## 
## Call:
## lm(formula = deaths_per_100000 ~ X2019_median_age, data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -138.983  -63.011    6.244   47.242  191.784 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      261.6040     8.2745   31.62   <2e-16 ***
## X2019_median_age  -3.6823     0.1947  -18.92   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 75.22 on 5391 degrees of freedom
## Multiple R-squared:  0.06225,    Adjusted R-squared:  0.06208 
## F-statistic: 357.9 on 1 and 5391 DF,  p-value: < 2.2e-16

Part (e)

model.22e <-
  mas_cov %>%
  lm(deaths_per_100000 ~ X2019_median_age + people_per_Housing, data = .)

model.22e %>% get_regression_table() %>% datatable()
model.22e %>% summary()
## 
## Call:
## lm(formula = deaths_per_100000 ~ X2019_median_age + people_per_Housing, 
##     data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -142.287  -32.934   -0.307   30.775  181.127 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -150.1949    11.4642 -13.101   <2e-16 ***
## X2019_median_age      1.8284     0.2051   8.915   <2e-16 ***
## people_per_Housing   86.0533     1.8917  45.491   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 63.95 on 5390 degrees of freedom
## Multiple R-squared:  0.3224, Adjusted R-squared:  0.3221 
## F-statistic:  1282 on 2 and 5390 DF,  p-value: < 2.2e-16

Part (g)

model.22g <-
  mas_cov %>%
  select(deaths_per_100000,
         pct_NHWA_2019,
         people_per_Housing,
         X2019_median_age) %>%
  lm(deaths_per_100000 ~ ., data = .)

model.22g %>% get_regression_table() %>% datatable()
model.22g %>% summary()
## 
## Call:
## lm(formula = deaths_per_100000 ~ ., data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -154.02  -23.89    0.02   26.05  173.34 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -195.9742    11.3246  -17.30   <2e-16 ***
## pct_NHWA_2019        -2.2274     0.1140  -19.54   <2e-16 ***
## people_per_Housing   95.3823     1.8895   50.48   <2e-16 ***
## X2019_median_age      6.4681     0.3093   20.91   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 61.8 on 5389 degrees of freedom
## Multiple R-squared:  0.3672, Adjusted R-squared:  0.3669 
## F-statistic:  1042 on 3 and 5389 DF,  p-value: < 2.2e-16

Part (3)

mas_cov %>%
  select(people_per_Housing,
         deaths,
         deaths_per_100000) %>%
  cor() %>%
  corrplot.mixed(tl.pos = "lt")