knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.1     ✔ stringr   1.5.1
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(lubridate)

Load Data

url <- "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/refs/heads/master/csse_covid_19_data/csse_covid_19_time_series/"
file_names <- c("time_series_covid19_confirmed_US.csv",
                "time_series_covid19_confirmed_global.csv",
                "time_series_covid19_deaths_US.csv",
                "time_series_covid19_deaths_global.csv")
paths <- str_c(url, file_names)

US_cases <- read_csv(paths[1])

## Rows: 3342 Columns: 1154
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr    (6): iso2, iso3, Admin2, Province_State, Country_Region, Combined_Key
## dbl (1148): UID, code3, FIPS, Lat, Long_, 1/22/20, 1/23/20, 1/24/20, 1/25/20...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

global_cases<- read_csv(paths[2])

## Rows: 289 Columns: 1147
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr    (2): Province/State, Country/Region
## dbl (1145): Lat, Long, 1/22/20, 1/23/20, 1/24/20, 1/25/20, 1/26/20, 1/27/20,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

US_deaths <- read_csv(paths[3])

## Rows: 3342 Columns: 1155
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr    (6): iso2, iso3, Admin2, Province_State, Country_Region, Combined_Key
## dbl (1149): UID, code3, FIPS, Lat, Long_, Population, 1/22/20, 1/23/20, 1/24...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

global_deaths <- read_csv(paths[4])

## Rows: 289 Columns: 1147
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr    (2): Province/State, Country/Region
## dbl (1145): Lat, Long, 1/22/20, 1/23/20, 1/24/20, 1/25/20, 1/26/20, 1/27/20,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Tidy Global Data

global_cases <- global_cases %>%
  pivot_longer(cols = -c(`Province/State`, `Country/Region`, Lat, Long),
               names_to = "date", values_to = "cases") %>%
  select(-Lat, -Long)

global_deaths <- global_deaths %>%
  pivot_longer(cols = -c(`Province/State`, `Country/Region`, Lat, Long),
               names_to = "date", values_to = "deaths") %>%
  select(-Lat, -Long)

global <- global_cases %>%
  full_join(global_deaths,
            by = c("Province/State", "Country/Region", "date")) %>%
  rename(Country_Region = `Country/Region`,
         Province_State = `Province/State`) %>%
  mutate(date = mdy(date))
global <- global %>% filter(cases > 0)

Tidy US Data

US_cases <- US_cases %>%
  pivot_longer(cols = -(UID:Combined_Key), names_to = "date", values_to = "cases") %>%
  select(Admin2:cases) %>%
  mutate(date = mdy(date)) %>%
  select(-Lat, -Long_)
US_deaths <- US_deaths %>%
  pivot_longer(cols = -(UID:Population), names_to = "date", values_to = "deaths") %>%
  select(Admin2:deaths) %>%
  mutate(date = mdy(date)) %>%
  select(-Lat, -Long_)
US <- US_cases %>% full_join(US_deaths)

## Joining with `by = join_by(Admin2, Province_State, Country_Region,
## Combined_Key, date)`

Harmonize Datasets

global <- global %>%
  unite("Combined_Key", c(Province_State, Country_Region), sep = ", ", na.rm = TRUE, remove = FALSE)
lookup_url <- "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/refs/heads/master/csse_covid_19_data/UID_ISO_FIPS_LookUp_Table.csv"
lookup <- read_csv(lookup_url) %>% select(-iso2, -iso3, -code3, -Admin2, -Lat, -Long_, -Combined_Key)

## Rows: 4321 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): iso2, iso3, FIPS, Admin2, Province_State, Country_Region, Combined_Key
## dbl (5): UID, code3, Lat, Long_, Population
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

global <- global %>%
  left_join(lookup, by = c("Province_State", "Country_Region")) %>%
  select(-UID, -FIPS) %>%
  select(Province_State, Country_Region, date, cases, deaths, Population, Combined_Key)

US COVID-19 by State

US_by_state <- US %>%
  group_by(Province_State, Country_Region, date) %>%
  summarize(cases = sum(cases, na.rm=TRUE), deaths = sum(deaths, na.rm=TRUE), Population = sum(Population, na.rm=TRUE)) %>%
  mutate(deaths_per_mill = deaths * 1e6 / Population) %>%
  select(Province_State, Country_Region, date, cases, deaths, deaths_per_mill, Population) %>%
  ungroup()

## `summarise()` has grouped output by 'Province_State', 'Country_Region'. You can
## override using the `.groups` argument.

US_totals <- US_by_state

US COVID-19 Visualization (Total Cases & Deaths)

US_totals %>%
  filter(cases > 0) %>%
  ggplot(aes(x=date)) +
    geom_line(aes(y = cases, color = "cases")) +
    geom_point(aes(y = cases, color = "cases")) +
    geom_line(aes(y = deaths, color = "deaths")) +
    geom_point(aes(y = deaths, color = "deaths")) +
    scale_y_log10() +
    theme(legend.position = "bottom",
          axis.text.x = element_text(angle = 90)) +
    labs(title = "COVID-19 in US (Total Cases & Deaths)", y = NULL)

## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## log-10 transformation introduced infinite values.

## Daily New Cases and Deaths

US_by_state <- US_by_state %>%
    group_by(Province_State) %>%
    arrange(date) %>%
    mutate(new_cases = cases - lag(cases),
           new_deaths = deaths - lag(deaths)) %>%
    ungroup()
US_totals <- US_totals %>%
    arrange(date) %>%
    mutate(new_cases = cases - lag(cases),
           new_deaths = deaths - lag(deaths))

US New Cases & Deaths Plot

US_totals %>%
  filter(!is.na(new_cases) & cases > 0) %>%
  ggplot(aes(x=date)) +
    geom_line(aes(y = new_cases, color = "new_cases")) +
    geom_point(aes(y = new_cases, color = "new_cases")) +
    geom_line(aes(y = new_deaths, color = "new_deaths")) +
    geom_point(aes(y = new_deaths, color = "new_deaths")) +
    scale_y_log10() +
    theme(legend.position = "bottom",
          axis.text.x = element_text(angle = 90)) +
    labs(title = "COVID-19 in US (New Cases & Deaths)", y = NULL)

## Warning in transformation$transform(x): NaNs produced

## Warning in scale_y_log10(): log-10 transformation introduced infinite values.

## Warning in transformation$transform(x): NaNs produced

## Warning in scale_y_log10(): log-10 transformation introduced infinite values.

## Warning in transformation$transform(x): NaNs produced

## Warning in scale_y_log10(): log-10 transformation introduced infinite values.

## Warning in transformation$transform(x): NaNs produced

## Warning in scale_y_log10(): log-10 transformation introduced infinite values.

## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).

## Warning: Removed 34124 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).

## Warning: Removed 33173 rows containing missing values or values outside the scale range
## (`geom_point()`).

Focus on a Single State

state <- "Illinois"
US_by_state %>%
  filter(Province_State == state) %>%
  filter(!is.na(new_cases)) %>%
  ggplot(aes(x = date)) +
    geom_line(aes(y = new_cases, color = "new_cases")) +
    geom_point(aes(y = new_cases, color = "new_cases")) +
    geom_line(aes(y = new_deaths, color = "new_deaths")) +
    geom_point(aes(y = new_deaths, color = "new_deaths")) +
    scale_y_log10() +
    theme(legend.position = "bottom",
          axis.text.x = element_text(angle = 90)) +
    labs(title = str_c("COVID-19 in ", state), y = NULL)

## Warning in transformation$transform(x): NaNs produced

## Warning in scale_y_log10(): log-10 transformation introduced infinite values.

## Warning in transformation$transform(x): NaNs produced

## Warning in scale_y_log10(): log-10 transformation introduced infinite values.

## Warning in transformation$transform(x): NaNs produced

## Warning in scale_y_log10(): log-10 transformation introduced infinite values.

## Warning in transformation$transform(x): NaNs produced

## Warning in scale_y_log10(): log-10 transformation introduced infinite values.

## Warning: Removed 3 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Removed 7 rows containing missing values or values outside the scale range
## (`geom_point()`).

Compare States by Death Rate

US_state_totals <- US_by_state %>%
  group_by(Province_State) %>%
  summarize(deaths = max(deaths, na.rm=TRUE), cases = max(cases, na.rm=TRUE),
            population = max(Population, na.rm=TRUE),
            cases_per_thou = 1000 * cases / population,
            deaths_per_thou = 1000 * deaths / population) %>%
  filter(cases > 0, population > 0)
US_state_totals %>%
  slice_min(deaths_per_thou, n = 10)

## # A tibble: 10 × 6
##    Province_State        deaths  cases population cases_per_thou deaths_per_thou
##    <chr>                  <dbl>  <dbl>      <dbl>          <dbl>           <dbl>
##  1 American Samoa            34 8.32e3      55641           150.           0.611
##  2 Northern Mariana Isl…     41 1.37e4      55144           248.           0.744
##  3 Virgin Islands           130 2.48e4     107268           231.           1.21 
##  4 Hawaii                  1841 3.81e5    1415872           269.           1.30 
##  5 Vermont                  929 1.53e5     623989           245.           1.49 
##  6 Puerto Rico             5823 1.10e6    3754939           293.           1.55 
##  7 Utah                    5298 1.09e6    3205958           340.           1.65 
##  8 Alaska                  1486 3.08e5     740995           415.           2.01 
##  9 District of Columbia    1432 1.78e5     705749           252.           2.03 
## 10 Washington             15683 1.93e6    7614893           253.           2.06

US_state_totals %>%
  slice_max(deaths_per_thou, n = 10)

## # A tibble: 10 × 6
##    Province_State deaths   cases population cases_per_thou deaths_per_thou
##    <chr>           <dbl>   <dbl>      <dbl>          <dbl>           <dbl>
##  1 Arizona         33102 2443514    7278717           336.            4.55
##  2 Oklahoma        17972 1290929    3956971           326.            4.54
##  3 Mississippi     13370  990756    2976149           333.            4.49
##  4 West Virginia    7960  642760    1792147           359.            4.44
##  5 New Mexico       9061  670929    2096829           320.            4.32
##  6 Arkansas        13020 1006883    3017804           334.            4.31
##  7 Alabama         21032 1644533    4903185           335.            4.29
##  8 Tennessee       29263 2515130    6829174           368.            4.28
##  9 Michigan        42205 3064125    9986857           307.            4.23
## 10 Kentucky        18130 1718471    4467673           385.            4.06

Regression: Deaths vs. Cases

mod <- lm(deaths_per_thou ~ cases_per_thou, data = US_state_totals)
summary(mod)

## 
## Call:
## lm(formula = deaths_per_thou ~ cases_per_thou, data = US_state_totals)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.3352 -0.5978  0.1491  0.6535  1.2086 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    -0.36167    0.72480  -0.499     0.62    
## cases_per_thou  0.01133    0.00232   4.881 9.76e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8615 on 54 degrees of freedom
## Multiple R-squared:  0.3061, Adjusted R-squared:  0.2933 
## F-statistic: 23.82 on 1 and 54 DF,  p-value: 9.763e-06

US_tot_w_pred <- US_state_totals %>% mutate(pred = predict(mod))
US_tot_w_pred %>%
  ggplot() +
    geom_point(aes(x = cases_per_thou, y = deaths_per_thou), color = "blue") +
    geom_point(aes(x = cases_per_thou, y = pred), color = "red") +
    labs(title = "Deaths per Thousand vs. Cases per Thousand")

Linear Regression of New Cases

US_national <- US_totals %>%
  group_by(date) %>%
  summarize(new_cases = sum(new_cases, na.rm=TRUE)) %>%
  ungroup() %>%
  mutate(days_since_start = as.numeric(date - min(date)))

# Fit and plot the trend as before
lm_trend <- lm(new_cases ~ days_since_start, data = US_national)
summary(lm_trend)

## 
## Call:
## lm(formula = new_cases ~ days_since_start, data = US_national)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -782.8 -171.0 -130.9    5.8 3917.1 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      141.22182   19.50931   7.239  8.3e-13 ***
## days_since_start   0.03672    0.02958   1.241    0.215    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 330 on 1141 degrees of freedom
## Multiple R-squared:  0.001349,   Adjusted R-squared:  0.0004736 
## F-statistic: 1.541 on 1 and 1141 DF,  p-value: 0.2147

US_national %>%
  ggplot(aes(x = date, y = new_cases)) +
    geom_point(color = "steelblue", alpha = 0.5) +
    geom_smooth(method = "lm", se = TRUE, color = "red", linetype = "dashed") +
    labs(title = "US COVID-19: DAILY NEW CASES AND LINEAR TREND",
         y = "Daily New Cases",
         x = "Date") +
    theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

Conclusion

This report show clear correlations between case rates and mortality, and indicate significant variation among states in terms of deaths per thousand residents.

Visualization of the data supports the statistical model, illustrating how increases in case rates often correspond to higher death rates.

Possible Bias
Population Differences: Differences in population density, age distribution,
healthcare access, and testing rates can affect case and death rates
Selection Bias: Focusing on aggregated data by state may mask significant
within-state variation

Personal Bias
My background shape interpretation of the data—such as focusing on states
I am most familiar with and interpreting results using my own knowledge of
US healthcare systems

I have used R code and documenttaion to ensure the analysis is transparent and 
and can be reviews

Session Info

# Display R session information
sessionInfo()

## R version 4.5.1 (2025-06-13)
## Platform: aarch64-apple-darwin20
## Running under: macOS Tahoe 26.0.1
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/Chicago
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] lubridate_1.9.4 forcats_1.0.1   stringr_1.5.1   dplyr_1.1.4    
##  [5] purrr_1.1.0     readr_2.1.5     tidyr_1.3.1     tibble_3.3.0   
##  [9] ggplot2_4.0.0   tidyverse_2.0.0
## 
## loaded via a namespace (and not attached):
##  [1] sass_0.4.10        utf8_1.2.6         generics_0.1.4     lattice_0.22-7    
##  [5] stringi_1.8.7      hms_1.1.3          digest_0.6.37      magrittr_2.0.3    
##  [9] evaluate_1.0.4     grid_4.5.1         timechange_0.3.0   RColorBrewer_1.1-3
## [13] fastmap_1.2.0      Matrix_1.7-3       jsonlite_2.0.0     mgcv_1.9-3        
## [17] scales_1.4.0       jquerylib_0.1.4    cli_3.6.5          rlang_1.1.6       
## [21] crayon_1.5.3       bit64_4.6.0-1      splines_4.5.1      withr_3.0.2       
## [25] cachem_1.1.0       yaml_2.3.10        tools_4.5.1        parallel_4.5.1    
## [29] tzdb_0.5.0         curl_7.0.0         vctrs_0.6.5        R6_2.6.1          
## [33] lifecycle_1.0.4    bit_4.6.0          vroom_1.6.6        pkgconfig_2.0.3   
## [37] pillar_1.11.1      bslib_0.9.0        gtable_0.3.6       glue_1.8.0        
## [41] xfun_0.52          tidyselect_1.2.1   rstudioapi_0.17.1  knitr_1.50        
## [45] farver_2.1.2       nlme_3.1-168       htmltools_0.5.8.1  rmarkdown_2.29    
## [49] labeling_0.4.3     compiler_4.5.1     S7_0.2.0

COVID-19 US and Global Data Analysis

Igor Polidva

Load Data

Tidy Global Data

Tidy US Data

Harmonize Datasets

US COVID-19 by State

US COVID-19 Visualization (Total Cases & Deaths)

US New Cases & Deaths Plot

Focus on a Single State

Compare States by Death Rate

Regression: Deaths vs. Cases

Linear Regression of New Cases

Conclusion

Session Info