# install.packages("tidyverse")
# install.packages(haven)
# install.packages(readxl)
# These packages do not successfully install because we are missing "" at the package names
# install.packages("haven")
# install.packages("readxl")

library("tidyverse")
## -- Attaching packages -------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.3     v dplyr   1.0.1
## v tidyr   1.1.1     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## -- Conflicts ----------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
# storms is a dataset that comes with the tidyverse
# use View(storms) to see the full dataset

big_storms <- storms %>% # Here we tell R to take the output of the code and assign it to the name big_storms
  group_by(name, year) %>%
  filter(max(category) == 5)

Plotting the dataset

# install.packages("maps")
library("maps")
## 
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
## 
##     map
ggplot(aes(x = long, y = lat, color = name), data = big_storms) +
  geom_path() +
  borders("world") +
  coord_quickmap(xlim = c(-130, -60), ylim = c(20, 50))

Processing and analyzing data, an example:

covid_data <-
  read_csv(paste0("https://data.cdc.gov/api/views/qfhf-uhaa/rows.csv?","accessType=DOWNLOAD&bom=true&format=true%20target="),
           col_types = cols(Suppress = col_character())) %>%
  mutate(week = `Week Ending Date`,
         race_ethnicity = `Race/Ethnicity`,
         n_deaths = `Number of Deaths`,
         diff = `Difference from 2015-2019 to 2020`,
         expected_deaths = n_deaths - diff,
         perc_diff = `Percent Difference from 2015-2019 to 2020`,
         year = MMWRYear,
         week_no = MMWRWeek,
         jurisdiction = Jurisdiction,
         state = `State Abbreviation`
         ) %>%
filter(`Time Period` == "2020", Outcome == "All Cause", Type != "Unweighted") %>%
select(jurisdiction, state, week, year, week_no,
       race_ethnicity, n_deaths, expected_deaths, diff, perc_diff)

View(covid_data)

We observe covid_data as tidy because each row represents one week of deaths for one race/ethnicity within one state

summary(covid_data)
##  jurisdiction          state               week                year     
##  Length:11010       Length:11010       Length:11010       Min.   :2020  
##  Class :character   Class :character   Class :character   1st Qu.:2020  
##  Mode  :character   Mode  :character   Mode  :character   Median :2020  
##                                                           Mean   :2020  
##                                                           3rd Qu.:2020  
##                                                           Max.   :2020  
##                                                                         
##     week_no      race_ethnicity        n_deaths       expected_deaths  
##  Min.   : 1.00   Length:11010       Min.   :   10.0   Min.   :    0.0  
##  1st Qu.: 9.00   Class :character   1st Qu.:   27.0   1st Qu.:   21.0  
##  Median :17.00   Mode  :character   Median :   94.0   Median :   69.5  
##  Mean   :17.49                      Mean   :  682.7   Mean   :  587.9  
##  3rd Qu.:26.00                      3rd Qu.:  487.8   3rd Qu.:  421.0  
##  Max.   :34.00                      Max.   :53723.0   Max.   :47365.0  
##                                     NA's   :4796      NA's   :4796     
##       diff            perc_diff      
##  Min.   :-1359.00   Min.   : -60.70  
##  1st Qu.:    3.00   1st Qu.:   4.10  
##  Median :   12.00   Median :  18.80  
##  Mean   :   94.84   Mean   :  36.27  
##  3rd Qu.:   51.00   3rd Qu.:  45.50  
##  Max.   :11850.00   Max.   :1064.70  
##  NA's   :4796       NA's   :4797
covid_data %>%
filter(n_deaths > 10000)
## # A tibble: 39 x 10
##    jurisdiction state week   year week_no race_ethnicity n_deaths
##    <chr>        <chr> <chr> <dbl>   <dbl> <chr>             <dbl>
##  1 United Stat~ US    04/0~  2020      14 Non-Hispanic ~    11870
##  2 United Stat~ US    04/1~  2020      15 Non-Hispanic ~    13523
##  3 United Stat~ US    04/1~  2020      16 Non-Hispanic ~    12526
##  4 United Stat~ US    04/2~  2020      17 Non-Hispanic ~    11447
##  5 United Stat~ US    05/0~  2020      18 Non-Hispanic ~    10303
##  6 United Stat~ US    01/0~  2020       1 Non-Hispanic ~    45569
##  7 United Stat~ US    01/1~  2020       2 Non-Hispanic ~    46272
##  8 United Stat~ US    01/1~  2020       3 Non-Hispanic ~    45251
##  9 United Stat~ US    01/2~  2020       4 Non-Hispanic ~    44963
## 10 United Stat~ US    02/0~  2020       5 Non-Hispanic ~    44707
## # ... with 29 more rows, and 3 more variables: expected_deaths <dbl>,
## #   diff <dbl>, perc_diff <dbl>

Looking at n_deaths from summary() we get Min, 1st Quarter, Median, Mean, 3rd Quarter, Max deaths, and total number of NA values. What is a little surprising is the Max value is very high compared to the rest of the summary values for n_deaths. It is a bit of an outlier. We may have super large values here because the dataset includes US deaths by race per week as if it were also a state

us_deaths_by_race <- covid_data %>%
filter(state == "US") %>%
group_by(race_ethnicity) %>%
summarize(expected_deaths = sum(expected_deaths, na.rm = TRUE),
total_additional_deaths = sum(diff, na.rm = TRUE),
percent_diff = 100 * total_additional_deaths / expected_deaths
)
## `summarise()` ungrouping output (override with `.groups` argument)

Plotting US deaths by race

us_deaths_by_race %>%
ggplot(aes(y = race_ethnicity, x = percent_diff)) +
geom_col()+
labs(x = "Percent above expected death count",
y = "",
title = "Racial disparities of Covid-19 (USA, 2020)" )

When removing ‘+ labs()’ we observe that the default x axis title is percent_diff, and conversely the default y axis title is race_ethnicity

Plotting deaths by race for the state of IL (Illinois)

il_deaths_by_race <- covid_data %>%
filter(state == "IL") %>% 
group_by(race_ethnicity) %>%
summarize(expected_deaths = sum(expected_deaths, na.rm = TRUE),
total_additional_deaths = sum(diff, na.rm = TRUE),
percent_diff = 100 * total_additional_deaths / expected_deaths
)
## `summarise()` ungrouping output (override with `.groups` argument)
il_deaths_by_race %>%
ggplot(aes(y = race_ethnicity, x = percent_diff)) +
geom_col()+
labs(x = "Percent above expected death count",
y = "",
title = "Racial disparities of Covid-19 (Illinois, USA, 2020)" )
## Warning: Removed 1 rows containing missing values (position_stack).

Including time series element into Plots

covid_data %>%
  filter(state == "US") %>%
  # filter(race_ethnicity %in%
  # c("Hispanic", "Non-Hispanic White", "Non-Hispanic Black", "Non-Hispanic Asian")) %>%
  ggplot(
    aes(x = week_no, 
        y = perc_diff, 
        color = race_ethnicity)) + 
  geom_line() + 
  labs(y = "Percent above expected death count", 
       x = "week since January 1, 2020", 
       title = "Racial disparities of Covid-19 (USA, 2020)", 
       color = "" 
       )

Notice that groups with smaller populations have noisier time-series. The unfortunately-named “Other” time series is saw toothed, while the “Non-Hispanic White” line is much smoother

Running above code again after removing ‘#’

covid_data %>%
  filter(state == "US") %>%
  filter(race_ethnicity %in%
  c("Hispanic", "Non-Hispanic White", "Non-Hispanic Black", "Non-Hispanic Asian")) %>%
  ggplot(
    aes(x = week_no, 
        y = perc_diff, 
        color = race_ethnicity)) + 
  geom_line() + 
  labs(y = "Percent above expected death count", 
       x = "week since January 1, 2020", 
       title = "Racial disparities of Covid-19 (USA, 2020)", 
       color = "" 
       )

Finally, let’s investigate the suspicuous drop off in our “Percent above expected death count” after week 30. # Modifying code to examine death counts over time

covid_data %>%
  filter(state == "US") %>%
  filter(race_ethnicity %in%
  c("Hispanic", "Non-Hispanic White", "Non-Hispanic Black", "Non-Hispanic Asian")) %>%
  ggplot(
    aes(x = week_no, 
        y = n_deaths, 
        color = race_ethnicity)) + 
  geom_line() + 
  labs(y = "Percent above expected death count", 
       x = "week since January 1, 2020", 
       title = "Racial disparities of Covid-19 (USA, 2020)", 
       color = "" 
       )

Cutoff the data sooner

covid_data %>%
  filter(state == "US") %>%
  filter(week_no < 30) %>%
  filter(race_ethnicity %in%
  c("Hispanic", "Non-Hispanic White", "Non-Hispanic Black", "Non-Hispanic Asian")) %>%
  ggplot(
    aes(x = week_no, 
        y = n_deaths, 
        color = race_ethnicity)) + 
  geom_line() + 
  labs(y = "Percent above expected death count", 
       x = "week since January 1, 2020", 
       title = "Racial disparities of Covid-19 (USA, 2020)", 
       color = "" 
       )