library(tidyverse)
library(GGally)
library(magrittr)
library(zoo) # rollmean
library(gridExtra) # grid.arrange
library(lubridate)

Part 2

Problem 1

counties <-
  read_csv("https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv")
## 
## -- Column specification --------------------------------------------------------
## cols(
##   date = col_date(format = ""),
##   county = col_character(),
##   state = col_character(),
##   fips = col_character(),
##   cases = col_double(),
##   deaths = col_double()
## )
mass2 <-
  counties %>%
  filter(state == "Massachusetts") %>%
  select(!c(state, fips)) %>%
  filter(county != "Unknown") %>%
  mutate(date = as.Date(date),
         # county = droplevels(county) # not necessary
         )

# write_csv(mass2, "G:/My Drive/homework/Cooper P/mass2.csv")

Part (a)

# mass2 <- read_csv("G:/My Drive/homework/Cooper P/mass2.csv")

mass2 %>%
  # filter(county != "Unknown") %>%
  mutate(county = as_factor(county)) %>%
  ggpairs(axisLabels = "none")

mass2 %>%
  mutate(county = as_factor(county)) %>%
  summary()
##       date                  county         cases            deaths      
##  Min.   :2020-02-01   Suffolk  : 466   Min.   :     1   Min.   :   0.0  
##  1st Qu.:2020-06-25   Norfolk  : 436   1st Qu.:   860   1st Qu.:  49.0  
##  Median :2020-10-10   Middlesex: 433   Median :  7752   Median : 448.0  
##  Mean   :2020-10-09   Berkshire: 431   Mean   : 18109   Mean   : 719.2  
##  3rd Qu.:2021-01-25   Worcester: 430   3rd Qu.: 23468   3rd Qu.:1188.0  
##  Max.   :2021-05-11   Essex    : 428   Max.   :133668   Max.   :3736.0  
##                       (Other)  :3363
mass2 %>% glimpse()
## Rows: 5,987
## Columns: 4
## $ date   <date> 2020-02-01, 2020-02-02, 2020-02-03, 2020-02-04, 2020-02-05, 20~
## $ county <chr> "Suffolk", "Suffolk", "Suffolk", "Suffolk", "Suffolk", "Suffolk~
## $ cases  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ~
## $ deaths <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~

Part (b)

# cases are cumulative
mass2 %<>%
  mutate(new_cases = cases - lag(cases, k = 1), .after = cases) %>%
  mutate(new_deaths = deaths - lag(deaths, k = 1), .after = deaths)

mass2 %>%
  # filter(county != "Unknown") %>%
  filter(deaths > 0) %>% # unnecessary
  ggplot(aes(date, new_deaths, col = county)) +
  geom_line()

mass2 %>%
  # filter(county != "Unknown") %>%
  filter(deaths > 0) %>%
  ggplot(aes(date, new_deaths)) +
  geom_line() +
  facet_wrap(vars(county)) +
  theme(axis.text.x=element_text(angle = 90, hjust = 0))

mass2 %>%
  # filter(county != "Unknown") %>%
  filter(deaths > 0) %>%
  ggplot(aes(date, new_deaths)) +
  geom_line() +
  facet_wrap(vars(county), scales = "free_y") +
  theme(axis.text.x=element_text(angle = 90, hjust = 0))

Part (c)

mass2 %<>%
  drop_na() %>%
  mutate(new_cases_07_day = rollmean(new_cases, k = 7, fill = NA),
         .after = new_cases) %>%
  mutate(new_deaths_07_day = rollmean(new_deaths, k = 7, fill = NA),
         .after = new_deaths)
  
mass2 %>%
  ggplot(aes(date, new_deaths_07_day)) +
  geom_line() +
  facet_wrap(vars(county)) +
  theme(axis.text.x=element_text(angle = 90, hjust = 0))

Part (d)

mass_totals2 <-
  mass2 %>%
  group_by(date) %>%
  select(!c(county)) %>%
  summarize(across(.cols = everything(), sum))
  
mass_totals2 %>% glimpse()
## Rows: 465
## Columns: 7
## $ date              <date> 2020-02-02, 2020-02-03, 2020-02-04, 2020-02-05, 202~
## $ cases             <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1~
## $ new_cases         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ new_cases_07_day  <dbl> NA, NA, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ deaths            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ new_deaths        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ new_deaths_07_day <dbl> NA, NA, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~

Problem 2

# not same file listed in homework
load("G:/My Drive/homework/Cooper P/MA_county_variables_2020.RData")
ma_extra %<>% as_tibble()

Part (a)

ma_extra %<>%
  mutate(pct_over_85 = X2019_age_85_plus / X2019_pop_est * 100,
         .after = X2019_age_85_plus)
  
ma_extra %>%
  ggplot(aes(pct_over_85, fill = county)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Part (b)

ma_extra %<>%
  mutate(pct_NHWA_2019 = pct_NHWA_2019 * 100)

ma_extra %>%
  ggplot(aes(pct_NHWA_2019, fill = county)) +
  geom_histogram()

Part (c)

No codebook

Part (d)

https://cran.r-project.org/web/packages/egg/vignettes/Ecosystem.html

mas_cov <- 
  mass2 %>%
  full_join(ma_extra) %>%
  
  mutate(cases_per_100000 = cases / X2019_pop_est * 100000,
         .after = new_cases_07_day) %>%
  
  mutate(new_cases_per_100000 = new_cases / X2019_pop_est * 100000,
         .after = cases_per_100000) %>%
  
  mutate(new_cases_per_100000_07_day = new_cases_07_day / X2019_pop_est * 100000,
         .after = new_cases_per_100000) %>%
  
  mutate(deaths_per_100000 = deaths / X2019_pop_est * 100000,
         .after = new_deaths_07_day) %>%
  
  mutate(new_deaths_per_100000 = new_deaths / X2019_pop_est * 100000,
         .after = deaths_per_100000) %>%
  
  mutate(new_deaths_per_100000_07_day = new_cases_07_day / X2019_pop_est * 100000,
          .after = new_deaths_per_100000)
## Joining, by = "county"
mas_cov %>% select(contains("100000")) %>% summary()
##  cases_per_100000    new_cases_per_100000 new_cases_per_100000_07_day
##  Min.   :    0.062   Min.   :-1159400     Min.   :-20432.1           
##  1st Qu.:  681.462   1st Qu.:   -9904     1st Qu.:  -473.9           
##  Median : 1771.117   Median :    -270     Median :     0.0           
##  Mean   : 3157.819   Mean   :  -46696     Mean   :  3594.9           
##  3rd Qu.: 4850.126   3rd Qu.:    1641     3rd Qu.:   865.6           
##  Max.   :13229.231   Max.   :   12061     Max.   :111872.0           
##                                           NA's   :6                  
##  deaths_per_100000 new_deaths_per_100000 new_deaths_per_100000_07_day
##  Min.   :  0.00    Min.   :-32774.80     Min.   :-20432.1            
##  1st Qu.: 35.62    1st Qu.:  -483.59     1st Qu.:  -473.9            
##  Median :121.73    Median :    -4.23     Median :     0.0            
##  Mean   :116.33    Mean   : -1888.43     Mean   :  3594.9            
##  3rd Qu.:171.73    3rd Qu.:   119.50     3rd Qu.:   865.6            
##  Max.   :321.63    Max.   :   299.35     Max.   :111872.0            
##                                          NA's   :6
p1 <-
  mas_cov %>%
  # filter(county != "Unknown") %>%
  ggplot(aes(date, cases_per_100000, col = county)) + 
  geom_line()
  
p2 <-
  mas_cov %>%
  # filter(county != "Unknown") %>%
  ggplot(aes(date, new_cases_per_100000, col = county)) + 
  geom_line()

p3 <-
  mas_cov %>%
  # filter(county != "Unknown") %>%
  ggplot(aes(date, new_cases_per_100000_07_day, col = county)) + 
  geom_line()

p4 <-
  mas_cov %>%
  # filter(county != "Unknown") %>%
  ggplot(aes(date, deaths_per_100000, col = county)) + 
  geom_line()

p5 <-
  mas_cov %>%
  # filter(county != "Unknown") %>%
  ggplot(aes(date, new_deaths_per_100000, col = county)) + 
  geom_line()

p6 <-
  mas_cov %>%
  # filter(county != "Unknown") %>%
  ggplot(aes(date, new_deaths_per_100000_07_day, col = county)) + 
  geom_line()

grid.arrange(p1, p2, p3, p4, p5, p6, nrow = 3)

to_string <- as_labeller(c(`cases_per_100000` = "Cases",
                           `new_cases_per_100000` = "New Cases",
                           `new_cases_per_100000_07_day` = "New Cases 7 Days",
                           `deaths_per_100000` = "Deaths",
                           `new_deaths_per_100000` = "New Deaths",
                           `new_deaths_per_100000_07_day` = "New Deaths 7 Days"))


mas_cov %>%
  select(contains("100000"), county, date) %>%
  # filter(county != "Unknown") %>%
  pivot_longer(names_to = "Variable", values_to = "value", !c(date, county)) %>%
  ggplot(aes(date, value, col = county)) +
  geom_line() +
  facet_wrap(vars(Variable), scales = "free_y", labeller = to_string) +
  theme(axis.text.x=element_text(angle = 90, hjust = 0)) +
  ggtitle("Per 100,000 each")

Part (e)

totals <- 
  mas_cov %>%
  select(county, cases_per_100000, deaths_per_100000) %>%
  # filter(county != "Unknown") %>%
  group_by(county) %>%
  summarize(total_cases_per_100000 = sum(cases_per_100000),
            total_deaths_per_100000 = sum(deaths_per_100000))

totals %>% arrange(desc(total_cases_per_100000))
## # A tibble: 14 x 3
##    county     total_cases_per_100000 total_deaths_per_100000
##    <chr>                       <dbl>                   <dbl>
##  1 Essex                    2101150.                  73684.
##  2 Suffolk                  2092557.                  60918.
##  3 Bristol                  1807737.                  63285.
##  4 Hampden                  1700176.                  77502.
##  5 Nantucket                1689648.                   3553.
##  6 Plymouth                 1528565.                  67092.
##  7 Worcester                1518618.                  62108.
##  8 Middlesex                1403567.                  59122.
##  9 Norfolk                  1254503.                  65592.
## 10 Barnstable                885027.                  42817.
## 11 Dukes                     848367.                    323.
## 12 Hampshire                 808431.                  41243.
## 13 Berkshire                 713328.                  36724.
## 14 Franklin                  551030.                  42368.
totals %>% arrange(desc(total_deaths_per_100000))
## # A tibble: 14 x 3
##    county     total_cases_per_100000 total_deaths_per_100000
##    <chr>                       <dbl>                   <dbl>
##  1 Hampden                  1700176.                  77502.
##  2 Essex                    2101150.                  73684.
##  3 Plymouth                 1528565.                  67092.
##  4 Norfolk                  1254503.                  65592.
##  5 Bristol                  1807737.                  63285.
##  6 Worcester                1518618.                  62108.
##  7 Suffolk                  2092557.                  60918.
##  8 Middlesex                1403567.                  59122.
##  9 Barnstable                885027.                  42817.
## 10 Franklin                  551030.                  42368.
## 11 Hampshire                 808431.                  41243.
## 12 Berkshire                 713328.                  36724.
## 13 Nantucket                1689648.                   3553.
## 14 Dukes                     848367.                    323.

Problem 3

Part (a)

mas_cov %<>%
  mutate(month = month(date, label = TRUE), .after = date) %>%
  mutate(year = year(date), .after = month)

table(mas_cov$month, mas_cov$year)
##      
##       2020 2021
##   Jan    0  434
##   Feb   28  392
##   Mar  274  434
##   Apr  420  420
##   May  434  154
##   Jun  420    0
##   Jul  434    0
##   Aug  434    0
##   Sep  420    0
##   Oct  434    0
##   Nov  420    0
##   Dec  434    0

Part (b)

Instructions unclear

mas_cov %>%
  # filter(county != "Unknown") %>%
  ggplot(aes(new_cases_per_100000, county)) +
  geom_boxplot() + 
  facet_grid(cols = vars(month), rows = vars(year)) +
  theme(axis.text.x=element_text(angle = 90, hjust = 0))

Part (c)

Instructions unclear

mas_cov %>%
  filter(county == "Hampshire" | county == "Franklin") %>%
  ggplot(aes(new_cases_per_100000, fill = county)) +
  geom_boxplot() + 
  facet_grid(rows = vars(month), cols = vars(year)) +
  theme(axis.text.x=element_text(angle = 90, hjust = 0))

Part (d)

Instructions unclear

mas_cov %>%
  # filter(county != "Unknown") %>%
  group_by(county, year, month) %>%
  summarize(mean_cases_per_100000 = mean(cases_per_100000)) %>% 
  ungroup() %>%
  ggplot(aes(month, mean_cases_per_100000, fill = county)) +
  geom_col() +
  facet_wrap(vars(year)) +
  theme(axis.text.x=element_text(angle = 90, hjust = 0))
## `summarise()` has grouped output by 'county', 'year'. You can override using the `.groups` argument.

Part (f)

mas_cov %>%
  # filter(county != "Unknown") %>%
  group_by(Urban_rural) %>%
  summarize(total_cases_per_100000 = sum(cases_per_100000))
## # A tibble: 2 x 2
##   Urban_rural total_cases_per_100000
##   <fct>                        <dbl>
## 1 rural                     3089046.
## 2 urban                    15813661.