Prepare Data

library(tidyverse)
library(zoo)
library(GGally)
library(skimr)
library(magrittr)

Recycle/Reduce/Reuse Homework 4 Code

load("G:/My Drive/homework/Cooper P/MA_county_variables_2020.RData") 

# load("G:/My Drive/homework/Cooper P/MA_COVID19_21_03_02.RData")

counties <-
  read_csv(
    "https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv")
## 
## -- Column specification --------------------------------------------------------
## cols(
##   date = col_date(format = ""),
##   county = col_character(),
##   state = col_character(),
##   fips = col_character(),
##   cases = col_double(),
##   deaths = col_double()
## )
mas_cov <-
  counties %>%
  filter(state == "Massachusetts") %>%
  select(!c(state, fips)) %>%
  mutate(date = as.Date(date))%>%
  full_join(ma_extra) %>%
  mutate(deaths_per_100000 = deaths / X2019_pop_est * 100000,
         .after = deaths) %>%
  drop_na()
## Joining, by = "county"

Question 2

Part (3)

Part (a)

mas_cov %>%
  select(deaths_per_100000, 
         pct_NHWA_2019,
         people_per_Housing,
         Median_Household_Income_2018,
         X2019_median_age) %>%
  ggpairs()

Part (b)

mas_cov %>%
  select(deaths_per_100000, 
         pct_NHWA_2019,
         people_per_Housing,
         Median_Household_Income_2018,
         X2019_median_age) %>%
  glimpse()
## Rows: 5,371
## Columns: 5
## $ deaths_per_100000            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ pct_NHWA_2019                <dbl> 0.4518321, 0.4518321, 0.4518321, 0.451832~
## $ people_per_Housing           <dbl> 2.327372, 2.327372, 2.327372, 2.327372, 2~
## $ Median_Household_Income_2018 <int> 68743, 68743, 68743, 68743, 68743, 68743,~
## $ X2019_median_age             <dbl> 33.3, 33.3, 33.3, 33.3, 33.3, 33.3, 33.3,~
mas_cov %>%
  select(deaths_per_100000, 
         pct_NHWA_2019,
         people_per_Housing,
         Median_Household_Income_2018,
         X2019_median_age) %>%
  skim_without_charts()
Data summary
Name Piped data
Number of rows 5371
Number of columns 5
_______________________
Column type frequency:
numeric 5
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100
deaths_per_100000 0 1 105.74 77.31 0.00 31.15 111.92 157.04 308.77
pct_NHWA_2019 0 1 0.76 0.12 0.45 0.71 0.76 0.87 0.90
people_per_Housing 0 1 2.08 0.57 0.90 1.81 2.39 2.50 2.51
Median_Household_Income_2018 0 1 75329.69 15169.53 52682.00 66005.00 70224.00 89678.00 100374.00
X2019_median_age 0 1 42.17 5.27 33.30 39.10 40.80 47.20 54.30
mas_cov %>%
  ggplot(aes(people_per_Housing, deaths_per_100000)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, col = "red")
## `geom_smooth()` using formula 'y ~ x'

Part (c)

model.fit <- 
  mas_cov %>%
  lm(deaths_per_100000 ~ people_per_Housing, data = .)

model.fit %>% summary()
## 
## Call:
## lm(formula = deaths_per_100000 ~ people_per_Housing, data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -137.846  -31.860   -1.992   33.541  178.718 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         -52.186      3.316  -15.74   <2e-16 ***
## people_per_Housing   75.762      1.534   49.38   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 64.12 on 5369 degrees of freedom
## Multiple R-squared:  0.3123, Adjusted R-squared:  0.3122 
## F-statistic:  2438 on 1 and 5369 DF,  p-value: < 2.2e-16
mas_cov %>%
  ggplot(aes(sample = deaths_per_100000)) +
  geom_qq() +
  geom_qq_line()

data.frame(Residuals = residuals(model.fit)) %>%
  ggplot(aes(sample = Residuals)) +
  geom_qq() +
  geom_qq_line()

mas_cov %>%
  select(deaths_per_100000, people_per_Housing) %>%
  bind_cols(data.frame(Residuals = residuals(model.fit))) %>%
  pivot_longer(cols = c(deaths_per_100000, people_per_Housing)) %>%
  ggplot(aes(Residuals, value)) +
  geom_point() +
  facet_wrap(vars(name), scales = "free")

Part (e)

mas_cov %>%
  bind_cols(data.frame(Residuals = residuals(model.fit))) %>%
  select(county, Residuals) %>% 
  group_by(county) %>%
  summarise(across(.cols = Residuals,
                   list(Min = min,
                        Mean = mean,
                        Median = median,
                        Max = max),
                   .names = "{.fn}"
                   )
            )
## # A tibble: 14 x 5
##    county        Min   Mean Median    Max
##    <chr>       <dbl>  <dbl>  <dbl>  <dbl>
##  1 Barnstable  -45.8  42.1   34.9  161.  
##  2 Berkshire   -84.6 -15.3  -45.4  136.  
##  3 Bristol    -129.    3.66  -7.55 159.  
##  4 Dukes       -20.2 -19.3  -20.2   -8.64
##  5 Essex      -138.   20.3   22.2  151.  
##  6 Franklin   -103.   -9.07  -7.78  49.2 
##  7 Hampden    -130.   37.7   35.5  179.  
##  8 Hampshire  -137.  -48.1  -49.4   38.3 
##  9 Middlesex  -137.  -11.0   -6.18  88.3 
## 10 Nantucket   -15.9  -6.42  -7.18   1.60
## 11 Norfolk    -138.    1.54   8.38 105.  
## 12 Plymouth   -136.    9.61   9.56 127.  
## 13 Suffolk    -124.   -3.24  13.4   97.7 
## 14 Worcester  -134.   -3.06  -4.04 124.
mas_cov %>%
  bind_cols(data.frame(Residuals = residuals(model.fit))) %>%
  select(county, Residuals) %>% 
  ggplot(aes(Residuals)) +
  geom_density() +
  facet_wrap(vars(county))

mas_cov %>%
  bind_cols(data.frame(Residuals = residuals(model.fit))) %>%
  select(county, Residuals) %>% 
  ggplot(aes(Residuals)) +
  geom_density() +
  facet_wrap(vars(county), scales = "free_y")

Part (4)

Part (a)

mas_cov %<>%
  mutate(region = if_else(county == "Hampshire" |
                          county == "Franklin" |
                          county == "Hampden" |
                          county == "Berkshire" |
                          county == "Worcester",
                          "West", "East") %>%
           as_factor(),
         .after = county)

mas_cov %>%
  select(county, region) %>%
  distinct(county, .keep_all = TRUE) %>%
  arrange(region, county)
## # A tibble: 14 x 2
##    county     region
##    <chr>      <fct> 
##  1 Barnstable East  
##  2 Bristol    East  
##  3 Dukes      East  
##  4 Essex      East  
##  5 Middlesex  East  
##  6 Nantucket  East  
##  7 Norfolk    East  
##  8 Plymouth   East  
##  9 Suffolk    East  
## 10 Berkshire  West  
## 11 Franklin   West  
## 12 Hampden    West  
## 13 Hampshire  West  
## 14 Worcester  West
mas_cov$region %>% levels()
## [1] "East" "West"

Part (b)

mas_cov %>%
  select(deaths_per_100000, region) %>%
  ggpairs()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Part (c)

model.fit <-
  mas_cov %>%
  lm(deaths_per_100000 ~ region, data = .)

model.fit %>% summary()
## 
## Call:
## lm(formula = deaths_per_100000 ~ region, data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -110.212  -77.422    5.702   51.489  198.555 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  103.283      1.312  78.737  < 2e-16 ***
## regionWest     6.929      2.204   3.144  0.00168 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 77.25 on 5369 degrees of freedom
## Multiple R-squared:  0.001838,   Adjusted R-squared:  0.001652 
## F-statistic: 9.885 on 1 and 5369 DF,  p-value: 0.001675
data.frame(Residuals = residuals(model.fit)) %>%
  ggplot(aes(sample = Residuals)) +
  geom_qq() +
  geom_qq_line()

mas_cov %>%
  ggplot(aes(residuals(model.fit), deaths_per_100000)) +
  geom_point()

mas_cov %>%
  ggplot(aes(residuals(model.fit), region)) +
  geom_point()

Part (d)

mas_cov %>%
  group_by(region) %>%
  summarise(Mean_deaths_per_100000 = mean(deaths_per_100000))
## # A tibble: 2 x 2
##   region Mean_deaths_per_100000
##   <fct>                   <dbl>
## 1 East                     103.
## 2 West                     110.
mas_cov %>%
  filter(region == "East") %>%
  group_by(county) %>%
  summarise(Mean_deaths_per_100000 = mean(deaths_per_100000))
## # A tibble: 9 x 2
##   county     Mean_deaths_per_100000
##   <chr>                       <dbl>
## 1 Barnstable                 87.9  
## 2 Bristol                   132.   
## 3 Dukes                       0.876
## 4 Essex                     158.   
## 5 Middlesex                 126.   
## 6 Nantucket                   9.53 
## 7 Norfolk                   139.   
## 8 Plymouth                  146.   
## 9 Suffolk                   121.
mas_cov %>%
  filter(region == "West") %>%
  group_by(county) %>%
  summarise(Mean_deaths_per_100000 = mean(deaths_per_100000))
## # A tibble: 5 x 2
##   county    Mean_deaths_per_100000
##   <chr>                      <dbl>
## 1 Berkshire                   69.3
## 2 Franklin                    94.2
## 3 Hampden                    168. 
## 4 Hampshire                   89.0
## 5 Worcester                  131.

Question 4