library(tidyverse)
library(GGally)
library(magrittr)
library(zoo) # rollmean
library(gridExtra) # grid.arrange
library(lubridate)
counties <-
read_csv("https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv")
##
## -- Column specification --------------------------------------------------------
## cols(
## date = col_date(format = ""),
## county = col_character(),
## state = col_character(),
## fips = col_character(),
## cases = col_double(),
## deaths = col_double()
## )
mass2 <-
counties %>%
filter(state == "Massachusetts") %>%
select(!c(state, fips)) %>%
filter(county != "Unknown") %>%
mutate(date = as.Date(date),
# county = droplevels(county) # not necessary
)
# write_csv(mass2, "G:/My Drive/homework/Cooper P/mass2.csv")
# mass2 <- read_csv("G:/My Drive/homework/Cooper P/mass2.csv")
mass2 %>%
# filter(county != "Unknown") %>%
mutate(county = as_factor(county)) %>%
ggpairs(axisLabels = "none")
mass2 %>%
mutate(county = as_factor(county)) %>%
summary()
## date county cases deaths
## Min. :2020-02-01 Suffolk : 466 Min. : 1 Min. : 0.0
## 1st Qu.:2020-06-25 Norfolk : 436 1st Qu.: 860 1st Qu.: 49.0
## Median :2020-10-10 Middlesex: 433 Median : 7752 Median : 448.0
## Mean :2020-10-09 Berkshire: 431 Mean : 18109 Mean : 719.2
## 3rd Qu.:2021-01-25 Worcester: 430 3rd Qu.: 23468 3rd Qu.:1188.0
## Max. :2021-05-11 Essex : 428 Max. :133668 Max. :3736.0
## (Other) :3363
mass2 %>% glimpse()
## Rows: 5,987
## Columns: 4
## $ date <date> 2020-02-01, 2020-02-02, 2020-02-03, 2020-02-04, 2020-02-05, 20~
## $ county <chr> "Suffolk", "Suffolk", "Suffolk", "Suffolk", "Suffolk", "Suffolk~
## $ cases <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ~
## $ deaths <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
# cases are cumulative
mass2 %<>%
mutate(new_cases = cases - lag(cases, k = 1), .after = cases) %>%
mutate(new_deaths = deaths - lag(deaths, k = 1), .after = deaths)
mass2 %>%
# filter(county != "Unknown") %>%
filter(deaths > 0) %>% # unnecessary
ggplot(aes(date, new_deaths, col = county)) +
geom_line()
mass2 %>%
# filter(county != "Unknown") %>%
filter(deaths > 0) %>%
ggplot(aes(date, new_deaths)) +
geom_line() +
facet_wrap(vars(county)) +
theme(axis.text.x=element_text(angle = 90, hjust = 0))
mass2 %>%
# filter(county != "Unknown") %>%
filter(deaths > 0) %>%
ggplot(aes(date, new_deaths)) +
geom_line() +
facet_wrap(vars(county), scales = "free_y") +
theme(axis.text.x=element_text(angle = 90, hjust = 0))
mass2 %<>%
drop_na() %>%
mutate(new_cases_07_day = rollmean(new_cases, k = 7, fill = NA),
.after = new_cases) %>%
mutate(new_deaths_07_day = rollmean(new_deaths, k = 7, fill = NA),
.after = new_deaths)
mass2 %>%
ggplot(aes(date, new_deaths_07_day)) +
geom_line() +
facet_wrap(vars(county)) +
theme(axis.text.x=element_text(angle = 90, hjust = 0))
mass_totals2 <-
mass2 %>%
group_by(date) %>%
select(!c(county)) %>%
summarize(across(.cols = everything(), sum))
mass_totals2 %>% glimpse()
## Rows: 465
## Columns: 7
## $ date <date> 2020-02-02, 2020-02-03, 2020-02-04, 2020-02-05, 202~
## $ cases <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1~
## $ new_cases <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ new_cases_07_day <dbl> NA, NA, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ deaths <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ new_deaths <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ new_deaths_07_day <dbl> NA, NA, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
# not same file listed in homework
load("G:/My Drive/homework/Cooper P/MA_county_variables_2020.RData")
ma_extra %<>% as_tibble()
ma_extra %<>%
mutate(pct_over_85 = X2019_age_85_plus / X2019_pop_est * 100,
.after = X2019_age_85_plus)
ma_extra %>%
ggplot(aes(pct_over_85, fill = county)) +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ma_extra %<>%
mutate(pct_NHWA_2019 = pct_NHWA_2019 * 100)
ma_extra %>%
ggplot(aes(pct_NHWA_2019, fill = county)) +
geom_histogram()
No codebook
https://cran.r-project.org/web/packages/egg/vignettes/Ecosystem.html
mas_cov <-
mass2 %>%
full_join(ma_extra) %>%
mutate(cases_per_100000 = cases / X2019_pop_est * 100000,
.after = new_cases_07_day) %>%
mutate(new_cases_per_100000 = new_cases / X2019_pop_est * 100000,
.after = cases_per_100000) %>%
mutate(new_cases_per_100000_07_day = new_cases_07_day / X2019_pop_est * 100000,
.after = new_cases_per_100000) %>%
mutate(deaths_per_100000 = deaths / X2019_pop_est * 100000,
.after = new_deaths_07_day) %>%
mutate(new_deaths_per_100000 = new_deaths / X2019_pop_est * 100000,
.after = deaths_per_100000) %>%
mutate(new_deaths_per_100000_07_day = new_cases_07_day / X2019_pop_est * 100000,
.after = new_deaths_per_100000)
## Joining, by = "county"
mas_cov %>% select(contains("100000")) %>% summary()
## cases_per_100000 new_cases_per_100000 new_cases_per_100000_07_day
## Min. : 0.062 Min. :-1159400 Min. :-20432.1
## 1st Qu.: 681.462 1st Qu.: -9904 1st Qu.: -473.9
## Median : 1771.117 Median : -270 Median : 0.0
## Mean : 3157.819 Mean : -46696 Mean : 3594.9
## 3rd Qu.: 4850.126 3rd Qu.: 1641 3rd Qu.: 865.6
## Max. :13229.231 Max. : 12061 Max. :111872.0
## NA's :6
## deaths_per_100000 new_deaths_per_100000 new_deaths_per_100000_07_day
## Min. : 0.00 Min. :-32774.80 Min. :-20432.1
## 1st Qu.: 35.62 1st Qu.: -483.59 1st Qu.: -473.9
## Median :121.73 Median : -4.23 Median : 0.0
## Mean :116.33 Mean : -1888.43 Mean : 3594.9
## 3rd Qu.:171.73 3rd Qu.: 119.50 3rd Qu.: 865.6
## Max. :321.63 Max. : 299.35 Max. :111872.0
## NA's :6
p1 <-
mas_cov %>%
# filter(county != "Unknown") %>%
ggplot(aes(date, cases_per_100000, col = county)) +
geom_line()
p2 <-
mas_cov %>%
# filter(county != "Unknown") %>%
ggplot(aes(date, new_cases_per_100000, col = county)) +
geom_line()
p3 <-
mas_cov %>%
# filter(county != "Unknown") %>%
ggplot(aes(date, new_cases_per_100000_07_day, col = county)) +
geom_line()
p4 <-
mas_cov %>%
# filter(county != "Unknown") %>%
ggplot(aes(date, deaths_per_100000, col = county)) +
geom_line()
p5 <-
mas_cov %>%
# filter(county != "Unknown") %>%
ggplot(aes(date, new_deaths_per_100000, col = county)) +
geom_line()
p6 <-
mas_cov %>%
# filter(county != "Unknown") %>%
ggplot(aes(date, new_deaths_per_100000_07_day, col = county)) +
geom_line()
grid.arrange(p1, p2, p3, p4, p5, p6, nrow = 3)
to_string <- as_labeller(c(`cases_per_100000` = "Cases",
`new_cases_per_100000` = "New Cases",
`new_cases_per_100000_07_day` = "New Cases 7 Days",
`deaths_per_100000` = "Deaths",
`new_deaths_per_100000` = "New Deaths",
`new_deaths_per_100000_07_day` = "New Deaths 7 Days"))
mas_cov %>%
select(contains("100000"), county, date) %>%
# filter(county != "Unknown") %>%
pivot_longer(names_to = "Variable", values_to = "value", !c(date, county)) %>%
ggplot(aes(date, value, col = county)) +
geom_line() +
facet_wrap(vars(Variable), scales = "free_y", labeller = to_string) +
theme(axis.text.x=element_text(angle = 90, hjust = 0)) +
ggtitle("Per 100,000 each")
totals <-
mas_cov %>%
select(county, cases_per_100000, deaths_per_100000) %>%
# filter(county != "Unknown") %>%
group_by(county) %>%
summarize(total_cases_per_100000 = sum(cases_per_100000),
total_deaths_per_100000 = sum(deaths_per_100000))
totals %>% arrange(desc(total_cases_per_100000))
## # A tibble: 14 x 3
## county total_cases_per_100000 total_deaths_per_100000
## <chr> <dbl> <dbl>
## 1 Essex 2101150. 73684.
## 2 Suffolk 2092557. 60918.
## 3 Bristol 1807737. 63285.
## 4 Hampden 1700176. 77502.
## 5 Nantucket 1689648. 3553.
## 6 Plymouth 1528565. 67092.
## 7 Worcester 1518618. 62108.
## 8 Middlesex 1403567. 59122.
## 9 Norfolk 1254503. 65592.
## 10 Barnstable 885027. 42817.
## 11 Dukes 848367. 323.
## 12 Hampshire 808431. 41243.
## 13 Berkshire 713328. 36724.
## 14 Franklin 551030. 42368.
totals %>% arrange(desc(total_deaths_per_100000))
## # A tibble: 14 x 3
## county total_cases_per_100000 total_deaths_per_100000
## <chr> <dbl> <dbl>
## 1 Hampden 1700176. 77502.
## 2 Essex 2101150. 73684.
## 3 Plymouth 1528565. 67092.
## 4 Norfolk 1254503. 65592.
## 5 Bristol 1807737. 63285.
## 6 Worcester 1518618. 62108.
## 7 Suffolk 2092557. 60918.
## 8 Middlesex 1403567. 59122.
## 9 Barnstable 885027. 42817.
## 10 Franklin 551030. 42368.
## 11 Hampshire 808431. 41243.
## 12 Berkshire 713328. 36724.
## 13 Nantucket 1689648. 3553.
## 14 Dukes 848367. 323.
mas_cov %<>%
mutate(month = month(date, label = TRUE), .after = date) %>%
mutate(year = year(date), .after = month)
table(mas_cov$month, mas_cov$year)
##
## 2020 2021
## Jan 0 434
## Feb 28 392
## Mar 274 434
## Apr 420 420
## May 434 154
## Jun 420 0
## Jul 434 0
## Aug 434 0
## Sep 420 0
## Oct 434 0
## Nov 420 0
## Dec 434 0
Instructions unclear
mas_cov %>%
# filter(county != "Unknown") %>%
ggplot(aes(new_cases_per_100000, county)) +
geom_boxplot() +
facet_grid(cols = vars(month), rows = vars(year)) +
theme(axis.text.x=element_text(angle = 90, hjust = 0))
Instructions unclear
mas_cov %>%
filter(county == "Hampshire" | county == "Franklin") %>%
ggplot(aes(new_cases_per_100000, fill = county)) +
geom_boxplot() +
facet_grid(rows = vars(month), cols = vars(year)) +
theme(axis.text.x=element_text(angle = 90, hjust = 0))
Instructions unclear
mas_cov %>%
# filter(county != "Unknown") %>%
group_by(county, year, month) %>%
summarize(mean_cases_per_100000 = mean(cases_per_100000)) %>%
ungroup() %>%
ggplot(aes(month, mean_cases_per_100000, fill = county)) +
geom_col() +
facet_wrap(vars(year)) +
theme(axis.text.x=element_text(angle = 90, hjust = 0))
## `summarise()` has grouped output by 'county', 'year'. You can override using the `.groups` argument.
mas_cov %>%
# filter(county != "Unknown") %>%
group_by(Urban_rural) %>%
summarize(total_cases_per_100000 = sum(cases_per_100000))
## # A tibble: 2 x 2
## Urban_rural total_cases_per_100000
## <fct> <dbl>
## 1 rural 3089046.
## 2 urban 15813661.