# install.packages("tidyverse")
# install.packages(haven)
# install.packages(readxl)
# These packages do not successfully install because we are missing "" at the package names
# install.packages("haven")
# install.packages("readxl")
library("tidyverse")
## -- Attaching packages -------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2 v purrr 0.3.4
## v tibble 3.0.3 v dplyr 1.0.1
## v tidyr 1.1.1 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts ----------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# storms is a dataset that comes with the tidyverse
# use View(storms) to see the full dataset
big_storms <- storms %>% # Here we tell R to take the output of the code and assign it to the name big_storms
group_by(name, year) %>%
filter(max(category) == 5)
# install.packages("maps")
library("maps")
##
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
##
## map
ggplot(aes(x = long, y = lat, color = name), data = big_storms) +
geom_path() +
borders("world") +
coord_quickmap(xlim = c(-130, -60), ylim = c(20, 50))
covid_data <-
read_csv(paste0("https://data.cdc.gov/api/views/qfhf-uhaa/rows.csv?","accessType=DOWNLOAD&bom=true&format=true%20target="),
col_types = cols(Suppress = col_character())) %>%
mutate(week = `Week Ending Date`,
race_ethnicity = `Race/Ethnicity`,
n_deaths = `Number of Deaths`,
diff = `Difference from 2015-2019 to 2020`,
expected_deaths = n_deaths - diff,
perc_diff = `Percent Difference from 2015-2019 to 2020`,
year = MMWRYear,
week_no = MMWRWeek,
jurisdiction = Jurisdiction,
state = `State Abbreviation`
) %>%
filter(`Time Period` == "2020", Outcome == "All Cause", Type != "Unweighted") %>%
select(jurisdiction, state, week, year, week_no,
race_ethnicity, n_deaths, expected_deaths, diff, perc_diff)
View(covid_data)
We observe covid_data as tidy because each row represents one week of deaths for one race/ethnicity within one state
summary(covid_data)
## jurisdiction state week year
## Length:11010 Length:11010 Length:11010 Min. :2020
## Class :character Class :character Class :character 1st Qu.:2020
## Mode :character Mode :character Mode :character Median :2020
## Mean :2020
## 3rd Qu.:2020
## Max. :2020
##
## week_no race_ethnicity n_deaths expected_deaths
## Min. : 1.00 Length:11010 Min. : 10.0 Min. : 0.0
## 1st Qu.: 9.00 Class :character 1st Qu.: 27.0 1st Qu.: 21.0
## Median :17.00 Mode :character Median : 94.0 Median : 69.5
## Mean :17.49 Mean : 682.7 Mean : 587.9
## 3rd Qu.:26.00 3rd Qu.: 487.8 3rd Qu.: 421.0
## Max. :34.00 Max. :53723.0 Max. :47365.0
## NA's :4796 NA's :4796
## diff perc_diff
## Min. :-1359.00 Min. : -60.70
## 1st Qu.: 3.00 1st Qu.: 4.10
## Median : 12.00 Median : 18.80
## Mean : 94.84 Mean : 36.27
## 3rd Qu.: 51.00 3rd Qu.: 45.50
## Max. :11850.00 Max. :1064.70
## NA's :4796 NA's :4797
covid_data %>%
filter(n_deaths > 10000)
## # A tibble: 39 x 10
## jurisdiction state week year week_no race_ethnicity n_deaths
## <chr> <chr> <chr> <dbl> <dbl> <chr> <dbl>
## 1 United Stat~ US 04/0~ 2020 14 Non-Hispanic ~ 11870
## 2 United Stat~ US 04/1~ 2020 15 Non-Hispanic ~ 13523
## 3 United Stat~ US 04/1~ 2020 16 Non-Hispanic ~ 12526
## 4 United Stat~ US 04/2~ 2020 17 Non-Hispanic ~ 11447
## 5 United Stat~ US 05/0~ 2020 18 Non-Hispanic ~ 10303
## 6 United Stat~ US 01/0~ 2020 1 Non-Hispanic ~ 45569
## 7 United Stat~ US 01/1~ 2020 2 Non-Hispanic ~ 46272
## 8 United Stat~ US 01/1~ 2020 3 Non-Hispanic ~ 45251
## 9 United Stat~ US 01/2~ 2020 4 Non-Hispanic ~ 44963
## 10 United Stat~ US 02/0~ 2020 5 Non-Hispanic ~ 44707
## # ... with 29 more rows, and 3 more variables: expected_deaths <dbl>,
## # diff <dbl>, perc_diff <dbl>
Looking at n_deaths from summary() we get Min, 1st Quarter, Median, Mean, 3rd Quarter, Max deaths, and total number of NA values. What is a little surprising is the Max value is very high compared to the rest of the summary values for n_deaths. It is a bit of an outlier. We may have super large values here because the dataset includes US deaths by race per week as if it were also a state
us_deaths_by_race <- covid_data %>%
filter(state == "US") %>%
group_by(race_ethnicity) %>%
summarize(expected_deaths = sum(expected_deaths, na.rm = TRUE),
total_additional_deaths = sum(diff, na.rm = TRUE),
percent_diff = 100 * total_additional_deaths / expected_deaths
)
## `summarise()` ungrouping output (override with `.groups` argument)
us_deaths_by_race %>%
ggplot(aes(y = race_ethnicity, x = percent_diff)) +
geom_col()+
labs(x = "Percent above expected death count",
y = "",
title = "Racial disparities of Covid-19 (USA, 2020)" )
When removing ‘+ labs()’ we observe that the default x axis title is percent_diff, and conversely the default y axis title is race_ethnicity
il_deaths_by_race <- covid_data %>%
filter(state == "IL") %>%
group_by(race_ethnicity) %>%
summarize(expected_deaths = sum(expected_deaths, na.rm = TRUE),
total_additional_deaths = sum(diff, na.rm = TRUE),
percent_diff = 100 * total_additional_deaths / expected_deaths
)
## `summarise()` ungrouping output (override with `.groups` argument)
il_deaths_by_race %>%
ggplot(aes(y = race_ethnicity, x = percent_diff)) +
geom_col()+
labs(x = "Percent above expected death count",
y = "",
title = "Racial disparities of Covid-19 (Illinois, USA, 2020)" )
## Warning: Removed 1 rows containing missing values (position_stack).
covid_data %>%
filter(state == "US") %>%
# filter(race_ethnicity %in%
# c("Hispanic", "Non-Hispanic White", "Non-Hispanic Black", "Non-Hispanic Asian")) %>%
ggplot(
aes(x = week_no,
y = perc_diff,
color = race_ethnicity)) +
geom_line() +
labs(y = "Percent above expected death count",
x = "week since January 1, 2020",
title = "Racial disparities of Covid-19 (USA, 2020)",
color = ""
)
Notice that groups with smaller populations have noisier time-series. The unfortunately-named “Other” time series is saw toothed, while the “Non-Hispanic White” line is much smoother
covid_data %>%
filter(state == "US") %>%
filter(race_ethnicity %in%
c("Hispanic", "Non-Hispanic White", "Non-Hispanic Black", "Non-Hispanic Asian")) %>%
ggplot(
aes(x = week_no,
y = perc_diff,
color = race_ethnicity)) +
geom_line() +
labs(y = "Percent above expected death count",
x = "week since January 1, 2020",
title = "Racial disparities of Covid-19 (USA, 2020)",
color = ""
)
Finally, let’s investigate the suspicuous drop off in our “Percent above expected death count” after week 30. # Modifying code to examine death counts over time
covid_data %>%
filter(state == "US") %>%
filter(race_ethnicity %in%
c("Hispanic", "Non-Hispanic White", "Non-Hispanic Black", "Non-Hispanic Asian")) %>%
ggplot(
aes(x = week_no,
y = n_deaths,
color = race_ethnicity)) +
geom_line() +
labs(y = "Percent above expected death count",
x = "week since January 1, 2020",
title = "Racial disparities of Covid-19 (USA, 2020)",
color = ""
)
covid_data %>%
filter(state == "US") %>%
filter(week_no < 30) %>%
filter(race_ethnicity %in%
c("Hispanic", "Non-Hispanic White", "Non-Hispanic Black", "Non-Hispanic Asian")) %>%
ggplot(
aes(x = week_no,
y = n_deaths,
color = race_ethnicity)) +
geom_line() +
labs(y = "Percent above expected death count",
x = "week since January 1, 2020",
title = "Racial disparities of Covid-19 (USA, 2020)",
color = ""
)