This site is under construction. Visit it again for updates

Introduction

We use functions in the following libraries:

  • RCurl # Functions to compose general HTTP requests

  • tidyverse # Set of packages for data cleaning and reshaping data

  • data.table # Fast aggregation of large data

  • reshape2 # Restructures/aggregates data using functions melt and ‘dcast’/ ’acast).

Reads data from github

# Fetch the NYT county-level CSV as a single string (note: despite its name,
# `urlfile` holds the downloaded text, not the URL), then parse that text
# into a data frame.
urlfile <- RCurl::getURL(
  "https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv"
)
cases_UScounties_full <- read.csv(text = urlfile)

Removes data for territories and unknown fips

# Keep only the rows whose state is not one of the four listed territories.
# The explicit !is.na() guard mirrors what a chain of `!=` tests does inside
# subset(): a row with a missing state is dropped, not kept.
cases_UScounties <- subset(
  cases_UScounties_full,
  !is.na(state) &
    !(state %in% c(
      "Guam",
      "Virgin Islands",
      "Puerto Rico",
      "Northern Mariana Islands"
    ))
)

Removes missing county data

# Drop rows whose county is recorded as "Unknown".
# This filters the already territory-free `cases_UScounties`; filtering
# `cases_UScounties_full` again would overwrite the previous step and bring
# the territories back.
cases_UScounties <- subset(
  cases_UScounties,
  county != "Unknown"
)

# some cases are assigned county== "Kansas City"ord "New York City" with fips=='00NA' CONSIDER REMOVING

Reformats “fips” code to match NIST

# Zero-pad fips to a five-character string so that 4-digit codes gain a
# leading zero (e.g. 1001 -> "01001").
# NOTE(review): rows with a missing fips are not left as NA here; sprintf()
# turns them into a padded "NA" string (seen as '00NA' in this data) -
# confirm how those rows should be handled.
cases_UScounties[["fips"]] <- sprintf(
  "%05d",
  as.numeric(cases_UScounties[["fips"]])
)

Adds new fields for “week” and “month”

# 'month' field: month label (Jan, Feb, ...) derived from the date column.
# NOTE(review): the label carries no year, so the same month from different
# years would share one value.
cases_UScounties[["month"]] <- lubridate::month(
  as.Date(cases_UScounties[["date"]]),
  label = TRUE
)

# 'week' field with an ISO 8601 style label such as "2020-W18".
# %V is the ISO week number, so it is paired with %G (the ISO week-based
# year) rather than %Y (the calendar year). The two differ around New Year:
# 2021-01-01 belongs to ISO week 53 of 2020 and must read "2020-W53", whereas
# "%Y-W%V" would label it "2021-W53".
# The date column is parsed with as.Date(), as for the 'month' field.
cases_UScounties$week <- format(
  as.Date(cases_UScounties$date),
  format = "%G-W%V"
)

Creates new fields “new cases” and “new deaths”

(should be done by fips but it doesn’t work due to missing/misleading values–see comment above)

# Daily increments from the cumulative counts (using the lag function in
# dplyr), computed separately for every (state, county) pair.
#
# * default = 0: the first record of a county has no predecessor, so its
#   whole cumulative count is treated as new. Without a default that row
#   becomes NA and the county's initial cases/deaths drop out of every
#   later total.
# * order_by = date: the difference is taken in date order and does not
#   depend on how the rows happen to be ordered in the table.
# * ungroup(): the grouping is only needed for this step; later steps set
#   their own grouping.
cases_UScounties <- cases_UScounties %>%
  group_by(state, county) %>%
  mutate(
    new_cases = cases - dplyr::lag(cases, default = 0, order_by = date),
    new_deaths = deaths - dplyr::lag(deaths, default = 0, order_by = date)
  ) %>%
  ungroup()

# Replace every NA cell, in every column of the table, with 0.
# NOTE(review): the original author flagged this step as uncertain; it is a
# blanket replacement, not limited to the new_cases/new_deaths columns.
cases_UScounties[is.na(cases_UScounties)] <- 0

Summarize by month

# Long table: one row per (state, month) holding the total of new_cases.
# The summary column is deliberately named "sum(new_cases)" because the
# pivot below looks it up under exactly that name. summarise() leaves the
# result grouped by 'state' and prints a message saying so.
data_by_month <- cases_UScounties %>%
  group_by(state, month) %>%
  summarise(`sum(new_cases)` = sum(new_cases))

# Wide table: one row per state, one column per month.
new_cases_by_month <- dcast(
  data = data_by_month,
  formula = state ~ month,
  value.var = "sum(new_cases)"
)

Summarize by week

# Long table: one row per (state, week) holding the total of new_cases.
# The summary column is deliberately named "sum(new_cases)" because the
# pivot below looks it up under exactly that name. summarise() leaves the
# result grouped by 'state' and prints a message saying so.
data_by_week <- cases_UScounties %>%
  group_by(state, week) %>%
  summarise(`sum(new_cases)` = sum(new_cases))

# Wide table: one row per state, one column per week label.
new_cases_by_week <- dcast(
  data = data_by_week,
  formula = state ~ week,
  value.var = "sum(new_cases)"
)
Summary of New Cases by Month
state Jan Feb Mar Apr May Jun
Alabama NA NA 929 6056 10884 11597
Alaska NA NA 125 212 110 357
Arizona 0 0 1279 6350 12288 30262
Arkansas NA NA 492 2534 3951 7517
California 0 16 8505 41882 62640 61740
Colorado NA NA 2792 12258 11233 3970
Connecticut NA NA 3040 24187 14729 3472
Delaware NA NA 314 4390 4738 1215
Summary of New Cases by Week
state 2020-W18 2020-W19 2020-W20 2020-W21 2020-W22 2020-W23 2020-W24 2020-W25
Alabama 1467 2001 1882 2707 3474 2548 5115 3934
Alaska 27 11 18 11 57 116 140 101
Arizona 2114 2479 2818 2402 3597 6988 9029 14245
Arkansas 471 628 741 1145 1293 2097 2999 2421
California 11380 12978 12315 14374 18371 18883 20956 21901
Colorado 3125 3083 2374 2194 2315 1623 1128 1219
Connecticut 4228 4279 3955 3049 1709 1733 1118 621
Delaware 1160 1221 1212 1134 695 480 323 412