knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## âś” dplyr 1.1.4 âś” readr 2.1.5
## âś” forcats 1.0.1 âś” stringr 1.5.1
## âś” ggplot2 4.0.0 âś” tibble 3.3.0
## âś” lubridate 1.9.4 âś” tidyr 1.3.1
## âś” purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## âś– dplyr::filter() masks stats::filter()
## âś– dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
url <- "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/refs/heads/master/csse_covid_19_data/csse_covid_19_time_series/"
file_names <- c("time_series_covid19_confirmed_US.csv",
"time_series_covid19_confirmed_global.csv",
"time_series_covid19_deaths_US.csv",
"time_series_covid19_deaths_global.csv")
paths <- str_c(url, file_names)
US_cases <- read_csv(paths[1])
## Rows: 3342 Columns: 1154
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): iso2, iso3, Admin2, Province_State, Country_Region, Combined_Key
## dbl (1148): UID, code3, FIPS, Lat, Long_, 1/22/20, 1/23/20, 1/24/20, 1/25/20...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
global_cases<- read_csv(paths[2])
## Rows: 289 Columns: 1147
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Province/State, Country/Region
## dbl (1145): Lat, Long, 1/22/20, 1/23/20, 1/24/20, 1/25/20, 1/26/20, 1/27/20,...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
US_deaths <- read_csv(paths[3])
## Rows: 3342 Columns: 1155
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): iso2, iso3, Admin2, Province_State, Country_Region, Combined_Key
## dbl (1149): UID, code3, FIPS, Lat, Long_, Population, 1/22/20, 1/23/20, 1/24...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
global_deaths <- read_csv(paths[4])
## Rows: 289 Columns: 1147
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Province/State, Country/Region
## dbl (1145): Lat, Long, 1/22/20, 1/23/20, 1/24/20, 1/25/20, 1/26/20, 1/27/20,...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
global_cases <- global_cases %>%
pivot_longer(cols = -c(`Province/State`, `Country/Region`, Lat, Long),
names_to = "date", values_to = "cases") %>%
select(-Lat, -Long)
global_deaths <- global_deaths %>%
pivot_longer(cols = -c(`Province/State`, `Country/Region`, Lat, Long),
names_to = "date", values_to = "deaths") %>%
select(-Lat, -Long)
global <- global_cases %>%
full_join(global_deaths,
by = c("Province/State", "Country/Region", "date")) %>%
rename(Country_Region = `Country/Region`,
Province_State = `Province/State`) %>%
mutate(date = mdy(date))
global <- global %>% filter(cases > 0)
US_cases <- US_cases %>%
pivot_longer(cols = -(UID:Combined_Key), names_to = "date", values_to = "cases") %>%
select(Admin2:cases) %>%
mutate(date = mdy(date)) %>%
select(-Lat, -Long_)
US_deaths <- US_deaths %>%
pivot_longer(cols = -(UID:Population), names_to = "date", values_to = "deaths") %>%
select(Admin2:deaths) %>%
mutate(date = mdy(date)) %>%
select(-Lat, -Long_)
US <- US_cases %>% full_join(US_deaths)
## Joining with `by = join_by(Admin2, Province_State, Country_Region,
## Combined_Key, date)`
global <- global %>%
unite("Combined_Key", c(Province_State, Country_Region), sep = ", ", na.rm = TRUE, remove = FALSE)
lookup_url <- "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/refs/heads/master/csse_covid_19_data/UID_ISO_FIPS_LookUp_Table.csv"
lookup <- read_csv(lookup_url) %>% select(-iso2, -iso3, -code3, -Admin2, -Lat, -Long_, -Combined_Key)
## Rows: 4321 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): iso2, iso3, FIPS, Admin2, Province_State, Country_Region, Combined_Key
## dbl (5): UID, code3, Lat, Long_, Population
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
global <- global %>%
left_join(lookup, by = c("Province_State", "Country_Region")) %>%
select(-UID, -FIPS) %>%
select(Province_State, Country_Region, date, cases, deaths, Population, Combined_Key)
US_by_state <- US %>%
group_by(Province_State, Country_Region, date) %>%
summarize(cases = sum(cases, na.rm=TRUE), deaths = sum(deaths, na.rm=TRUE), Population = sum(Population, na.rm=TRUE)) %>%
mutate(deaths_per_mill = deaths * 1e6 / Population) %>%
select(Province_State, Country_Region, date, cases, deaths, deaths_per_mill, Population) %>%
ungroup()
## `summarise()` has grouped output by 'Province_State', 'Country_Region'. You can
## override using the `.groups` argument.
US_totals <- US_by_state
US_totals %>%
filter(cases > 0) %>%
ggplot(aes(x=date)) +
geom_line(aes(y = cases, color = "cases")) +
geom_point(aes(y = cases, color = "cases")) +
geom_line(aes(y = deaths, color = "deaths")) +
geom_point(aes(y = deaths, color = "deaths")) +
scale_y_log10() +
theme(legend.position = "bottom",
axis.text.x = element_text(angle = 90)) +
labs(title = "COVID-19 in US (Total Cases & Deaths)", y = NULL)
## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## log-10 transformation introduced infinite values.
## Daily New Cases and Deaths
US_by_state <- US_by_state %>%
group_by(Province_State) %>%
arrange(date) %>%
mutate(new_cases = cases - lag(cases),
new_deaths = deaths - lag(deaths)) %>%
ungroup()
US_totals <- US_totals %>%
arrange(date) %>%
mutate(new_cases = cases - lag(cases),
new_deaths = deaths - lag(deaths))
US_totals %>%
filter(!is.na(new_cases) & cases > 0) %>%
ggplot(aes(x=date)) +
geom_line(aes(y = new_cases, color = "new_cases")) +
geom_point(aes(y = new_cases, color = "new_cases")) +
geom_line(aes(y = new_deaths, color = "new_deaths")) +
geom_point(aes(y = new_deaths, color = "new_deaths")) +
scale_y_log10() +
theme(legend.position = "bottom",
axis.text.x = element_text(angle = 90)) +
labs(title = "COVID-19 in US (New Cases & Deaths)", y = NULL)
## Warning in transformation$transform(x): NaNs produced
## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## Warning in transformation$transform(x): NaNs produced
## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## Warning in transformation$transform(x): NaNs produced
## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## Warning in transformation$transform(x): NaNs produced
## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 34124 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 33173 rows containing missing values or values outside the scale range
## (`geom_point()`).
state <- "Illinois"
US_by_state %>%
filter(Province_State == state) %>%
filter(!is.na(new_cases)) %>%
ggplot(aes(x = date)) +
geom_line(aes(y = new_cases, color = "new_cases")) +
geom_point(aes(y = new_cases, color = "new_cases")) +
geom_line(aes(y = new_deaths, color = "new_deaths")) +
geom_point(aes(y = new_deaths, color = "new_deaths")) +
scale_y_log10() +
theme(legend.position = "bottom",
axis.text.x = element_text(angle = 90)) +
labs(title = str_c("COVID-19 in ", state), y = NULL)
## Warning in transformation$transform(x): NaNs produced
## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## Warning in transformation$transform(x): NaNs produced
## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## Warning in transformation$transform(x): NaNs produced
## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## Warning in transformation$transform(x): NaNs produced
## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## Warning: Removed 3 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 7 rows containing missing values or values outside the scale range
## (`geom_point()`).
US_state_totals <- US_by_state %>%
group_by(Province_State) %>%
summarize(deaths = max(deaths, na.rm=TRUE), cases = max(cases, na.rm=TRUE),
population = max(Population, na.rm=TRUE),
cases_per_thou = 1000 * cases / population,
deaths_per_thou = 1000 * deaths / population) %>%
filter(cases > 0, population > 0)
US_state_totals %>%
slice_min(deaths_per_thou, n = 10)
## # A tibble: 10 Ă— 6
## Province_State deaths cases population cases_per_thou deaths_per_thou
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 American Samoa 34 8.32e3 55641 150. 0.611
## 2 Northern Mariana Isl… 41 1.37e4 55144 248. 0.744
## 3 Virgin Islands 130 2.48e4 107268 231. 1.21
## 4 Hawaii 1841 3.81e5 1415872 269. 1.30
## 5 Vermont 929 1.53e5 623989 245. 1.49
## 6 Puerto Rico 5823 1.10e6 3754939 293. 1.55
## 7 Utah 5298 1.09e6 3205958 340. 1.65
## 8 Alaska 1486 3.08e5 740995 415. 2.01
## 9 District of Columbia 1432 1.78e5 705749 252. 2.03
## 10 Washington 15683 1.93e6 7614893 253. 2.06
US_state_totals %>%
slice_max(deaths_per_thou, n = 10)
## # A tibble: 10 Ă— 6
## Province_State deaths cases population cases_per_thou deaths_per_thou
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Arizona 33102 2443514 7278717 336. 4.55
## 2 Oklahoma 17972 1290929 3956971 326. 4.54
## 3 Mississippi 13370 990756 2976149 333. 4.49
## 4 West Virginia 7960 642760 1792147 359. 4.44
## 5 New Mexico 9061 670929 2096829 320. 4.32
## 6 Arkansas 13020 1006883 3017804 334. 4.31
## 7 Alabama 21032 1644533 4903185 335. 4.29
## 8 Tennessee 29263 2515130 6829174 368. 4.28
## 9 Michigan 42205 3064125 9986857 307. 4.23
## 10 Kentucky 18130 1718471 4467673 385. 4.06
mod <- lm(deaths_per_thou ~ cases_per_thou, data = US_state_totals)
summary(mod)
##
## Call:
## lm(formula = deaths_per_thou ~ cases_per_thou, data = US_state_totals)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.3352 -0.5978 0.1491 0.6535 1.2086
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.36167 0.72480 -0.499 0.62
## cases_per_thou 0.01133 0.00232 4.881 9.76e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8615 on 54 degrees of freedom
## Multiple R-squared: 0.3061, Adjusted R-squared: 0.2933
## F-statistic: 23.82 on 1 and 54 DF, p-value: 9.763e-06
US_tot_w_pred <- US_state_totals %>% mutate(pred = predict(mod))
US_tot_w_pred %>%
ggplot() +
geom_point(aes(x = cases_per_thou, y = deaths_per_thou), color = "blue") +
geom_point(aes(x = cases_per_thou, y = pred), color = "red") +
labs(title = "Deaths per Thousand vs. Cases per Thousand")
US_national <- US_totals %>%
group_by(date) %>%
summarize(new_cases = sum(new_cases, na.rm=TRUE)) %>%
ungroup() %>%
mutate(days_since_start = as.numeric(date - min(date)))
# Fit and plot the trend as before
lm_trend <- lm(new_cases ~ days_since_start, data = US_national)
summary(lm_trend)
##
## Call:
## lm(formula = new_cases ~ days_since_start, data = US_national)
##
## Residuals:
## Min 1Q Median 3Q Max
## -782.8 -171.0 -130.9 5.8 3917.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 141.22182 19.50931 7.239 8.3e-13 ***
## days_since_start 0.03672 0.02958 1.241 0.215
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 330 on 1141 degrees of freedom
## Multiple R-squared: 0.001349, Adjusted R-squared: 0.0004736
## F-statistic: 1.541 on 1 and 1141 DF, p-value: 0.2147
US_national %>%
ggplot(aes(x = date, y = new_cases)) +
geom_point(color = "steelblue", alpha = 0.5) +
geom_smooth(method = "lm", se = TRUE, color = "red", linetype = "dashed") +
labs(title = "US COVID-19: DAILY NEW CASES AND LINEAR TREND",
y = "Daily New Cases",
x = "Date") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
This report show clear correlations between case rates and mortality, and indicate significant variation among states in terms of deaths per thousand residents.
Visualization of the data supports the statistical model, illustrating how increases in case rates often correspond to higher death rates.
Possible Bias
Population Differences: Differences in population density, age distribution,
healthcare access, and testing rates can affect case and death rates
Selection Bias: Focusing on aggregated data by state may mask significant
within-state variation
Personal Bias
My background shape interpretation of the data—such as focusing on states
I am most familiar with and interpreting results using my own knowledge of
US healthcare systems
I have used R code and documenttaion to ensure the analysis is transparent and
and can be reviews
# Display R session information
sessionInfo()
## R version 4.5.1 (2025-06-13)
## Platform: aarch64-apple-darwin20
## Running under: macOS Tahoe 26.0.1
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.1
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: America/Chicago
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] lubridate_1.9.4 forcats_1.0.1 stringr_1.5.1 dplyr_1.1.4
## [5] purrr_1.1.0 readr_2.1.5 tidyr_1.3.1 tibble_3.3.0
## [9] ggplot2_4.0.0 tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] sass_0.4.10 utf8_1.2.6 generics_0.1.4 lattice_0.22-7
## [5] stringi_1.8.7 hms_1.1.3 digest_0.6.37 magrittr_2.0.3
## [9] evaluate_1.0.4 grid_4.5.1 timechange_0.3.0 RColorBrewer_1.1-3
## [13] fastmap_1.2.0 Matrix_1.7-3 jsonlite_2.0.0 mgcv_1.9-3
## [17] scales_1.4.0 jquerylib_0.1.4 cli_3.6.5 rlang_1.1.6
## [21] crayon_1.5.3 bit64_4.6.0-1 splines_4.5.1 withr_3.0.2
## [25] cachem_1.1.0 yaml_2.3.10 tools_4.5.1 parallel_4.5.1
## [29] tzdb_0.5.0 curl_7.0.0 vctrs_0.6.5 R6_2.6.1
## [33] lifecycle_1.0.4 bit_4.6.0 vroom_1.6.6 pkgconfig_2.0.3
## [37] pillar_1.11.1 bslib_0.9.0 gtable_0.3.6 glue_1.8.0
## [41] xfun_0.52 tidyselect_1.2.1 rstudioapi_0.17.1 knitr_1.50
## [45] farver_2.1.2 nlme_3.1-168 htmltools_0.5.8.1 rmarkdown_2.29
## [49] labeling_0.4.3 compiler_4.5.1 S7_0.2.0