## load necessary libraries
library(tidyverse)
library(gt)
library(sf)
library(e1071)
library(tmap)
library(maptiles)
library(sfdep)

##cumulative covid cases and deaths on April 1, 2020
covid_deaths_04012020 <-
  st_read("https://drive.google.com/uc?export=download&id=1spBLqVxa25a7FDo7U7aNDJF2t2N0-uXv")
## Reading layer `covid_cases_deaths_04012020' from data source 
##   `https://drive.google.com/uc?export=download&id=1spBLqVxa25a7FDo7U7aNDJF2t2N0-uXv' 
##   using driver `GeoJSON'
## Simple feature collection with 2169 features and 4 fields
## Geometry type: POINT
## Dimension:     XY
## Bounding box:  xmin: -2303834 ymin: 343590.7 xmax: 2156703 ymax: 3126169
## Projected CRS: NAD83 / Conus Albers
##cumulative covid cases and deaths on July 1, 2020
covid_deaths_07012020 <-
  st_read("https://drive.google.com/uc?export=download&id=1ZNka0ffMXSVAqVDCj1R-oYh37BH9ziAZ")
## Reading layer `covid_cases_deaths_07012020' from data source 
##   `https://drive.google.com/uc?export=download&id=1ZNka0ffMXSVAqVDCj1R-oYh37BH9ziAZ' 
##   using driver `GeoJSON'
## Simple feature collection with 3008 features and 4 fields
## Geometry type: POINT
## Dimension:     XY
## Bounding box:  xmin: -2303834 ymin: 343590.7 xmax: 2200081 ymax: 3126169
## Projected CRS: NAD83 / Conus Albers
glimpse(covid_deaths_04012020)
## Rows: 2,169
## Columns: 5
## $ county   <chr> "Houston", "Choctaw", "Pickens", "Colbert", "La Paz", "Apache…
## $ state    <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Arizona", "Arizo…
## $ cases    <dbl> 9, 4, 4, 4, 2, 17, 3, 3518, 91, 380, 4, 14, 1, 23, 19, 101, 2…
## $ deaths   <dbl> 1, 0, 0, 0, 0, 0, 0, 65, 1, 8, 0, 0, 0, 0, 1, 3, 11, 10, 0, 9…
## $ geometry <POINT [m]> POINT (1014496 954669.3), POINT (726596.9 1023551), POI…
glimpse(covid_deaths_07012020)
## Rows: 3,008
## Columns: 5
## $ county   <chr> "Houston", "Choctaw", "Barbour", "Pickens", "Colbert", "La Pa…
## $ state    <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama", "Arizo…
## $ cases    <dbl> 449, 193, 326, 212, 377, 345, 2381, 38, 50, 105507, 1229, 617…
## $ deaths   <dbl> 6, 12, 1, 6, 5, 5, 90, 0, 1, 3402, 9, 136, 0, 0, 0, 3, 1, 4, …
## $ geometry <POINT [m]> POINT (1014496 954669.3), POINT (726596.9 1023551), POI…
st_crs(covid_deaths_04012020)
## Coordinate Reference System:
##   User input: NAD83 / Conus Albers 
##   wkt:
## PROJCRS["NAD83 / Conus Albers",
##     BASEGEOGCRS["NAD83",
##         DATUM["North American Datum 1983",
##             ELLIPSOID["GRS 1980",6378137,298.257222101,
##                 LENGTHUNIT["metre",1]]],
##         PRIMEM["Greenwich",0,
##             ANGLEUNIT["degree",0.0174532925199433]],
##         ID["EPSG",4269]],
##     CONVERSION["Conus Albers",
##         METHOD["Albers Equal Area",
##             ID["EPSG",9822]],
##         PARAMETER["Latitude of false origin",23,
##             ANGLEUNIT["degree",0.0174532925199433],
##             ID["EPSG",8821]],
##         PARAMETER["Longitude of false origin",-96,
##             ANGLEUNIT["degree",0.0174532925199433],
##             ID["EPSG",8822]],
##         PARAMETER["Latitude of 1st standard parallel",29.5,
##             ANGLEUNIT["degree",0.0174532925199433],
##             ID["EPSG",8823]],
##         PARAMETER["Latitude of 2nd standard parallel",45.5,
##             ANGLEUNIT["degree",0.0174532925199433],
##             ID["EPSG",8824]],
##         PARAMETER["Easting at false origin",0,
##             LENGTHUNIT["metre",1],
##             ID["EPSG",8826]],
##         PARAMETER["Northing at false origin",0,
##             LENGTHUNIT["metre",1],
##             ID["EPSG",8827]]],
##     CS[Cartesian,2],
##         AXIS["easting (X)",east,
##             ORDER[1],
##             LENGTHUNIT["metre",1]],
##         AXIS["northing (Y)",north,
##             ORDER[2],
##             LENGTHUNIT["metre",1]],
##     USAGE[
##         SCOPE["Data analysis and small scale data presentation for contiguous lower 48 states."],
##         AREA["United States (USA) - CONUS onshore - Alabama; Arizona; Arkansas; California; Colorado; Connecticut; Delaware; Florida; Georgia; Idaho; Illinois; Indiana; Iowa; Kansas; Kentucky; Louisiana; Maine; Maryland; Massachusetts; Michigan; Minnesota; Mississippi; Missouri; Montana; Nebraska; Nevada; New Hampshire; New Jersey; New Mexico; New York; North Carolina; North Dakota; Ohio; Oklahoma; Oregon; Pennsylvania; Rhode Island; South Carolina; South Dakota; Tennessee; Texas; Utah; Vermont; Virginia; Washington; West Virginia; Wisconsin; Wyoming."],
##         BBOX[24.41,-124.79,49.38,-66.91]],
##     ID["EPSG",5070]]
covid_deaths_04012020 <- st_transform(covid_deaths_04012020, 4326)
covid_deaths_07012020 <- st_transform(covid_deaths_07012020, 4326)

st_geometry_type(covid_deaths_04012020) %>% table()
## .
##           GEOMETRY              POINT         LINESTRING            POLYGON 
##                  0               2169                  0                  0 
##         MULTIPOINT    MULTILINESTRING       MULTIPOLYGON GEOMETRYCOLLECTION 
##                  0                  0                  0                  0 
##     CIRCULARSTRING      COMPOUNDCURVE       CURVEPOLYGON         MULTICURVE 
##                  0                  0                  0                  0 
##       MULTISURFACE              CURVE            SURFACE  POLYHEDRALSURFACE 
##                  0                  0                  0                  0 
##                TIN           TRIANGLE 
##                  0                  0
covid_deaths_04012020 <- st_make_valid(covid_deaths_04012020)
covid_deaths_07012020 <- st_make_valid(covid_deaths_07012020)

Introduction

This brief analyzes the geographic distribution and shifting intensity of the COVID-19 pandemic across U.S. counties between April 1, 2020, and July 1, 2020. By integrating descriptive statistics, log-scaled distributional visualizations, and spatial weighted mean center tracking, we demonstrate a massive transition from a highly localized outbreak centered in the Northeast to a broad, nationally dispersed crisis pushing heavily into the South and West.

Descriptive Statistics

# Pull non-spatial summary stats (drop geometry for speed)
stats_april <- covid_deaths_04012020 %>%
  st_drop_geometry() %>%
  summarise(
    Date = "April 1, 2020",
    Cases_Mean = mean(cases, na.rm = TRUE),
    Cases_Median = median(cases, na.rm = TRUE),
    Cases_SD = sd(cases, na.rm = TRUE),
    Cases_Max = max(cases, na.rm = TRUE),
    Cases_Skew = skewness(cases, na.rm = TRUE),
    Deaths_Mean = mean(deaths, na.rm = TRUE),
    Deaths_Median = median(deaths, na.rm = TRUE),
    Deaths_SD = sd(deaths, na.rm = TRUE),
    Deaths_Max = max(deaths, na.rm = TRUE),
    Deaths_Skew = skewness(deaths, na.rm = TRUE)
  )

stats_july <- covid_deaths_07012020 %>%
  st_drop_geometry() %>%
  summarise(
    Date = "July 1, 2020",
    Cases_Mean = mean(cases, na.rm = TRUE),
    Cases_Median = median(cases, na.rm = TRUE),
    Cases_SD = sd(cases, na.rm = TRUE),
    Cases_Max = max(cases, na.rm = TRUE),
    Cases_Skew = skewness(cases, na.rm = TRUE),
    Deaths_Mean = mean(deaths, na.rm = TRUE),
    Deaths_Median = median(deaths, na.rm = TRUE),
    Deaths_SD = sd(deaths, na.rm = TRUE),
    Deaths_Max = max(deaths, na.rm = TRUE),
    Deaths_Skew = skewness(deaths, na.rm = TRUE)
  )

desc_table <- bind_rows(stats_april, stats_july)

desc_table %>%
  gt() %>%
  tab_header(title = "Descriptive Statistics: Cumulative Cases and Deaths") %>%
  fmt_number(columns = -Date, decimals = 1) %>%
  cols_label(
    Cases_Mean = "Mean", Cases_Median = "Median", Cases_SD = "SD",
    Cases_Max = "Max", Cases_Skew = "Skewness",
    Deaths_Mean = "Mean", Deaths_Median = "Median", Deaths_SD = "SD",
    Deaths_Max = "Max", Deaths_Skew = "Skewness"
  ) %>%
  tab_spanner(label = "Cumulative Cases", columns = starts_with("Cases")) %>%
  tab_spanner(label = "Cumulative Deaths", columns = starts_with("Deaths"))
Descriptive Statistics: Cumulative Cases and Deaths
Date
Cumulative Cases
Cumulative Deaths
Mean Median SD Max Skewness Mean Median SD Max Skewness
April 1, 2020 96.3 5.0 1,123.7 47,914.0 36.5 2.2 0.0 40.4 1,848.0 44.1
July 1, 2020 876.2 86.0 5,463.6 220,143.0 26.4 40.9 2.0 448.7 22,574.0 43.0

Descriptive Statistics Breakdown: To understand the overall scale and structure of the outbreak, we computed cumulative metrics across all reporting counties. Based on the data, the average county-level cases jumped by 810% (from 96.3 to 876.2) and deaths by 1,759% (from 2.2 to 40.9) between April and July. The extreme right-skewness in April (36.5 for cases) shows that the early pandemic was driven entirely by a few hyper-dense metropolitan epicenters, while the growing medians by July indicate a transition toward widespread national community transmission.

Non-Map Visualizations

What these show (Cases): This log-scale histogram shows the county case distribution shifting from a sharp, zero-inflated spike in April to a broader, more normal-looking curve in July, visually demonstrating the geographic diffusion of the virus across the country.

What these show (Deaths): This chart reveals a severe mortality lag; while most counties had zero or one death in April, the distribution widens significantly by July as widespread community infections eventually translated into a widespread mortality burden.

combined <- bind_rows(
  covid_deaths_04012020 %>% st_drop_geometry() %>% mutate(Date = "April 1, 2020"),
  covid_deaths_07012020 %>% st_drop_geometry() %>% mutate(Date = "July 1, 2020")
)

ggplot(combined, aes(x = cases + 1)) +
  geom_histogram(bins = 40, fill = "#4B9CD3", color = "white") +
  scale_x_log10() +
  facet_wrap(~Date) +
  labs(
    title = "Distribution of Cumulative Cases by County",
    x = "Cumulative Cases (log scale)", y = "Number of Counties"
  ) +
  theme_minimal(base_size = 12)

ggplot(combined, aes(x = deaths + 1)) +
  geom_histogram(bins = 40, fill = "#13294B", color = "white") +
  scale_x_log10() +
  facet_wrap(~Date) +
  labs(
    title = "Distribution of Cumulative Deaths by County",
    x = "Cumulative Deaths (log scale)", y = "Number of Counties"
  ) +
  theme_minimal(base_size = 12)

Weighted Mean Centers

The weighted mean center for cases shifted drastically southwest from (-82.553, 39.374) in April to (-87.990, 37.658) in July, capturing the explosion of the “Sunbelt wave” in the South and West. Conversely, the death center shifted slightly northeast, proving that the lingering, lagging mortality toll from the initial Northeast wave was still dominating national cumulative death data by mid-summer.

# Get county centroids (make sure geometry is valid / projected first)
get_wmc <- function(data, weight_col) {
  cent <- st_centroid(st_make_valid(data))
  coords <- st_coordinates(cent)
  w <- data[[weight_col]]
  tibble(
    x = weighted.mean(coords[, 1], w, na.rm = TRUE),
    y = weighted.mean(coords[, 2], w, na.rm = TRUE)
  )
}

wmc_cases_april  <- get_wmc(covid_deaths_04012020, "cases")
wmc_deaths_april <- get_wmc(covid_deaths_04012020, "deaths")
wmc_cases_july   <- get_wmc(covid_deaths_07012020, "cases")
wmc_deaths_july  <- get_wmc(covid_deaths_07012020, "deaths")

wmc_table <- bind_rows(
  wmc_cases_april  %>% mutate(Date = "April 1", Metric = "Cases"),
  wmc_deaths_april %>% mutate(Date = "April 1", Metric = "Deaths"),
  wmc_cases_july   %>% mutate(Date = "July 1",  Metric = "Cases"),
  wmc_deaths_july  %>% mutate(Date = "July 1",  Metric = "Deaths")
) %>% relocate(Date, Metric)

wmc_table %>%
  gt() %>%
  tab_header(title = "Weighted Mean Centers (Longitude, Latitude)") %>%
  fmt_number(columns = c(x, y), decimals = 3)
Weighted Mean Centers (Longitude, Latitude)
Date Metric x y
April 1 Cases −82.553 39.374
April 1 Deaths −83.611 39.597
July 1 Cases −87.990 37.658
July 1 Deaths −82.656 39.266
# Convert to sf points for mapping
wmc_sf <- st_as_sf(wmc_table, coords = c("x", "y"), crs = st_crs(covid_deaths_04012020))

Dual-Panel Maps: April vs. July

tmap_mode("plot")
tmap_options(show.messages = FALSE)

wmc_april_sf <- wmc_sf %>% filter(Date == "April 1")
wmc_july_sf  <- wmc_sf %>% filter(Date == "July 1")

# Download background map tiles of the US
bg_tiles_april <- maptiles::get_tiles(covid_deaths_04012020, provider = "CartoDB.Positron", zoom = 4)
bg_tiles_july  <- maptiles::get_tiles(covid_deaths_07012020, provider = "CartoDB.Positron", zoom = 4)

tmap_ver <- as.integer(substr(as.character(packageVersion("tmap")), 1, 1))

# Build April Map with Color-Coded Case Dots
if (tmap_ver >= 4) {
  map_april <- tm_shape(bg_tiles_april) + 
    tm_rgb() + 
    tm_shape(covid_deaths_04012020) +
    tm_dots(
      fill = "cases", 
      fill.scale = tm_scale_intervals(style = "jenks", values = "YlOrRd"),
      fill.legend = tm_legend(title = "Cumulative Cases"),
      size = 0.06
    ) +
    tm_shape(wmc_april_sf) +
    tm_dots(fill = "black", size = 0.6, shape = 8) +
    tm_title("April 1, 2020") +
    tm_layout(legend.outside = TRUE, frame = TRUE)
} else {
  map_april <- tm_shape(bg_tiles_april) + 
    tm_rgb() + 
    tm_shape(covid_deaths_04012020) +
    tm_dots(
      col = "cases", 
      style = "jenks", 
      palette = "YlOrRd", 
      size = 0.06, 
      title = "Cumulative Cases"
    ) +
    tm_shape(wmc_april_sf) +
    tm_dots(col = "black", size = 0.25, shape = 8) +
    tm_layout(title = "April 1, 2020", legend.outside = TRUE, frame = TRUE)
}

# Build July Map with Color-Coded Case Dots
if (tmap_ver >= 4) {
  map_july <- tm_shape(bg_tiles_july) + 
    tm_rgb() + 
    tm_shape(covid_deaths_07012020) +
    tm_dots(
      fill = "cases", 
      fill.scale = tm_scale_intervals(style = "jenks", values = "YlOrRd"),
      fill.legend = tm_legend(title = "Cumulative Cases"),
      size = 0.06
    ) +
    tm_shape(wmc_july_sf) +
    tm_dots(fill = "black", size = 0.6, shape = 8) +
    tm_title("July 1, 2020") +
    tm_layout(legend.outside = TRUE, frame = TRUE)
} else {
  map_july <- tm_shape(bg_tiles_july) + 
    tm_rgb() + 
    tm_shape(covid_deaths_07012020) +
    tm_dots(
      col = "cases", 
      style = "jenks", 
      palette = "YlOrRd", 
      size = 0.06, 
      title = "Cumulative Cases"
    ) +
    tm_shape(wmc_july_sf) +
    tm_dots(col = "black", size = 0.25, shape = 8) +
    tm_layout(title = "July 1, 2020", legend.outside = TRUE, frame = TRUE)
}

# Arrange side-by-side
tmap_arrange(map_april, map_july, ncol = 2)

Interpretation: The April panel features tightly clustered, isolated orange dots concentrated in the Northeast and a few metropolitan areas, with both case and death weighted mean centers overlapping closely in eastern Ohio. By July, the map displays a widespread expansion of yellow and orange points across the Sunbelt, Midwest, and Pacific coast. This geographic diffusion drove a clear split in the weighted mean centers: the case asterisk pulled sharply west into Indiana to track the active community spread, while the lagging mortality center remained anchored further east.

Synthesis and Policy Recommendation

Summary of geographic change: In April 2020, COVID-19 cases and deaths were heavily localized in a few high-density metropolitan areas, primarily centered in the Northeast corridor. This extreme concentration is mathematically demonstrated by an enormous initial case skewness of 36.5, with the national weighted mean center resting firmly in the Ohio/Kentucky region.

By July 1, 2020, the pandemic underwent a massive geographic diffusion. The weighted mean center for cases took a dramatic leap southwest toward Illinois and Missouri, pulled by the explosive “Sunbelt wave” crashing across Texas, Arizona, Florida, and California. Concurrently, the nationwide case skewness dropped to 26.4 and median county cases jumped from 5.0 to 86.0, indicating that the virus shifted from an isolated, urban crisis into a decentralized, generalized national burden impacting suburban and rural counties alike.

Recommended resource allocation strategy: To optimize public health impact and prevent local healthcare systems from collapsing, we propose a Dynamic Priority Tiering Matrix for resource allocation (PPE, ventilators, personnel, and testing kits):

Priority 1: High Volume / High Trend (Tier 1 - Crimson Counties): Immediate deployment of emergency medical staff, temporary field hospital kits, and maximum shipments of ventilators. (Targeting emerging Southern/Western hotspots in July).

Priority 2: Low Volume / High Trend (Tier 3 - Orange Counties): Heavy allocation of testing kits, contact-tracing personnel, and public communication funding. Intercepting the virus in these emerging counties yields the highest return on investment to prevent them from scaling into Tier 1 epicenters.

Priority 3: High Volume / Low Trend (Tier 2 - Yellow Counties): Maintenance of baseline PPE supply chains, transition from emergency triage to long-term ICU capacity stabilization, and economic recovery support (Targeting stabilizing Northeast counties).

Limitation: Decision-makers must note that this analysis relies entirely on absolute cumulative counts aggregated at the county scale. This introduces two major structural limitations:

Population Density Bias: Raw counts inherently flag highly populated urban counties as major epicenters simply because more people live there, potentially masking severe, high-percentage per capita outbreaks in small, rural communities.

Testing Disparities: Early 2020 was plagued by severe diagnostic shortages. The data maps variations in testing availability just as much as it maps actual viral prevalence. A county showing zero cases in April may simply have lacked the kits to detect them.