Initial Setup and Import Libraries

knitr::opts_chunk$set(echo = TRUE)
rm(list=ls())
library(tidyr)
library(dplyr)
library(stringr)
library(ggplot2)
library(readr)
library(zoo)
library(scales)
library(sf)
library(urbnmapr)
library(plotly)
library(knitr)

setwd("~/R_ScratchWork")

Data Input and Parsing

This markdown file leverages data from:

  1. United States Census Bureau (https://www2.census.gov/programs-surveys/popest/datasets/2010-2019/)
  2. COVID-19 Dashboard by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University

It’s important to point out that there are a few small issues with the CSSE data. In particular, in some cases the cumulative number of cases for a given US county actually decreases slightly. This is odd, since the cumulative number of cases should be a monotonically increasing function with respect to time. Hence, there may be instances where our code returns a negative new case rate. However, these events seem rare and much smaller in magnitude to the growing number of cases. For now, lets just take the data as it is:

#Tell me where to find the latest and greatest COVID DATA
jhu_url <- paste("https://raw.githubusercontent.com/CSSEGISandData/",
                 "COVID-19/master/csse_covid_19_data/", "csse_covid_19_time_series/",
                 "time_series_covid19_confirmed_US.csv", sep = "")

jhu_deaths <- paste("https://raw.githubusercontent.com/CSSEGISandData/",
                 "COVID-19/master/csse_covid_19_data/", "csse_covid_19_time_series/",
                 "time_series_covid19_deaths_US.csv", sep = "")

census_url <- paste("https://www2.census.gov/programs-surveys/popest/",
                    "datasets/2010-2019/counties/totals/co-est2019-alldata.csv",sep = "")

We begin by reading in the COVID data from JHU. Note that we do a bit of data cleaning. For example, we remove rows with FIPS codes of “NA”

#Read in the COVID data
covidData <-
  read_csv(jhu_url) %>%
  rename(province = "Province_State",
         country_region = "Country_Region",
         county="Admin2")  %>%
  select(-c(UID,iso2,iso3,code3,country_region,Lat,Long_,Combined_Key)) %>% 
  pivot_longer(-c(province,county,FIPS), names_to = "d", 
               values_to = "cumulative_cases") %>%
  separate(d,c("Month","Day","Year"),sep="/") %>%
  mutate(dstring=sprintf("%02i/%02i/%02i",   #some parsing to make dates work correctly
                         as.numeric(Month), 
                         as.numeric(Day), 
                         as.numeric(Year)),
         d=as.Date(dstring,"%m/%d/%y")) %>%
  select(d,county,province,FIPS,cumulative_cases) %>%
  arrange(d) %>%
  group_by(FIPS) %>%  #for each FIPS ID, calculate new cases per day
  mutate(new_cases = cumulative_cases-dplyr::lag(cumulative_cases,1)) %>%
  ungroup() %>%
  filter(!is.na(FIPS))

Next we’ll turn our attention to the COVID death data.

#Read in the COVID death data
covidDeath <-
  read_csv(jhu_deaths) %>%
  rename(province = "Province_State",
         country_region = "Country_Region",
         county="Admin2")  %>%
  select(-c(UID,iso2,iso3,code3,country_region,Lat,Long_,Combined_Key)) %>% 
  pivot_longer(-c(province,Population,county,FIPS), names_to = "d", 
               values_to = "cumulative_deaths") %>%
  separate(d,c("Month","Day","Year"),sep="/") %>%
  mutate(dstring=sprintf("%02i/%02i/%02i",   #some parsing to make dates work correctly
                         as.numeric(Month), 
                         as.numeric(Day), 
                         as.numeric(Year)),
         d=as.Date(dstring,"%m/%d/%y")) %>%
  select(d,county,province,FIPS,cumulative_deaths) %>%
  arrange(d) %>%
  group_by(FIPS) %>%  #for each FIPS ID, calculate new cases per day
  mutate(new_deaths = cumulative_deaths-dplyr::lag(cumulative_deaths,1)) %>%
  ungroup() %>%
  filter(!is.na(FIPS))

Merging the data sets together:

#Join the death data to the case data
covidData <- 
  covidData %>%
  left_join(select(covidDeath,d,FIPS,cumulative_deaths,new_deaths),
            by = c("FIPS","d"),
            na_matches="never")

rm(covidDeath)

Now, since we also care about per capita measures, we need population estimates for every county. The COVID death data does appear to have a population field, but we’ll just go ahead and pull the population estimate from the US Census Bureau. Reading in US Census population data and add the 2019 estimate for population into the COVID data:

#Read in Population Data
co_est2019_alldata =
  read_csv(census_url) %>%
  select(STATE,COUNTY,POPESTIMATE2019) %>%
  rename(population="POPESTIMATE2019") %>%
  mutate(FIPS=as.numeric(str_c(as.character(STATE),as.character(COUNTY)))) 



#Combine Covid and population Data
covidData =
  covidData %>%
  left_join(select(co_est2019_alldata,FIPS,population),by="FIPS") %>%
  filter(!is.na(population))

Calculate Rolling Averages and Plot Prep

Let’s calculate some rolling averages for both the number of cases and the number of deaths for each county using the rollmean function from the zoo library.

# calculate 7 day rolling averages
dCovid = covidData %>%
  mutate(ncap=new_cases/population*1e5) %>%
  mutate(dcap=new_deaths/population*1e5) %>%
  group_by(FIPS) %>%
  mutate(rnew=rollmean(new_cases,7,fill=NA,align="right")) %>%
  mutate(rdnew=rollmean(new_deaths,7,fill=NA,align="right")) %>%
  mutate(rncap=rollmean(ncap,7,fill=NA,align="right")) %>%
  mutate(rdcap=rollmean(dcap,7,fill=NA,align="right")) %>%
  ungroup()

Now, we extract the peak value and when the peak value occurred in the rolling average of new cases.

dCovid =
  dCovid %>%
  group_by(FIPS) %>%
  mutate(peak_rnew = max(rnew,na.rm=TRUE),
         peak_rdnew = max(rdnew,na.rm=TRUE),
         dateMax =d[which.max(rnew)],
         dateMax_rdnew =d[which.max(rdnew)],
         daySince = as.numeric(d - dateMax),
         daySince_rdnew = as.numeric(d - dateMax_rdnew))

Next, we need to do some more preliminaries before making our plot. We extract the latest COVID data we have from our dataset:

#take data corresponding to the latest date in the COIVD database.
dNow = max(dCovid$d)
ddCovid=dCovid %>%
  filter(d==dNow)

In order to make a map, we need to have physical geometries of counties to plot. We can grab these from the urbnmapr package:

#read in and parse the county shapefile.
counties_sf =
  get_urbn_map("counties", sf = TRUE) %>%
  mutate(FIPS=as.numeric(county_fips)) %>%
  left_join(ddCovid,by="FIPS")

#combine county shapefiles into state featuresfor plotting
states <-
  counties_sf %>%
  group_by(province) %>%
  summarise(do_union = TRUE) %>%
  st_cast()
## `summarise()` ungrouping output (override with `.groups` argument)

Generating Plots with ggplot

Now we finally create our plots using ggplot. Note that we choose to cap the color plot maximum at 90 days. First, time since maximum number of cases per day by county:

#create our plot! Time since M
pday_cases=counties_sf %>%
  ggplot() +
  geom_sf(aes(fill=daySince),size=0.1) +
  geom_sf(data=states,fill=NA,size=0.5) +
  scale_fill_gradient2(low="red",mid="yellow",high="green",midpoint=90/2,
                       breaks=c(0,30,60,90),
                       labels=c("0","30","60","90+"),
                       limits=c(0,90),
                       oob=squish)+
  ggtitle(str_c("Time Since Peak New Cases | ",dNow)) +
  coord_sf(crs=5070) +
  theme_bw() +
  labs(fill = "Days")

print(pday_cases) 

Now let’s repeat for deaths, keeping the timescale (colorbar) the same.

#create our plot!
pday_deaths=counties_sf %>%
  ggplot() +
  geom_sf(aes(fill=daySince_rdnew),size=0.1) +
  geom_sf(data=states,fill=NA,size=0.5) +
  scale_fill_gradient2(low="red",mid="yellow",high="green",midpoint=90/2,
                       breaks=c(0,30,60,90),
                       labels=c("0","30","60","90+"),
                       limits=c(0,90),
                       oob=squish)+
  ggtitle(str_c("Time Since Peak New Deaths | ",dNow)) +
  coord_sf(crs=5070) +
  theme_bw() +
  labs(fill = "Days")

print(pday_deaths)

Generating Interactive Plots with plotly

Let’s also generate an interactive plotly versions to allow us to zoom in and look at the data directly. Time since peak new cases. A copy of this interactive figure can also be seen at the following link: https://rpubs.com/kangen558/peakTime

plot_interactive_deaths <-
  counties_sf %>%
  mutate(info = str_c(county,', ', state_abbv, '\n',
                      sprintf("Day Since Peak: %2d",daySince), '\n',
                      sprintf("Peak New Cases/Day: %.2f",peak_rnew), '\n',
                      sprintf("Current New Cases/Day: %.2f",rnew))) %>%
  ggplot() +
  geom_sf(aes(fill=daySince,text=info),size=0.1) +
  geom_sf(data=states,fill=NA,size=0.5) +
  scale_fill_gradient2(low="red",mid="yellow",high="green",midpoint=90/2,
                       breaks=c(0,30,60,90),
                       labels=c("0","30","60","90+"),
                       limits=c(0,90),
                       oob=squish)+
  ggtitle(str_c("Time Since Peak New Cases | ",dNow)) +
  coord_sf(crs=5070) +
  theme_bw() +
  labs(fill = "Days")
## Warning: Ignoring unknown aesthetics: text
ggplotly(plot_interactive_deaths,tooltip="info")