Overview

For this analysis, we'll be looking at COVID-19 cases and deaths by state over time. We'll be using the CDC dataset. For our analysis, we will be doing the following:

Compare and comfirm/probable cases by season and state
Compare and comfirm/probable deaths by season and state
Compare the time gap between data creation and submission by state

Step 1: CSV

First, we'll read the dataset in

setwd('~/git/CUNY.MDS/DATA607/')
dataset.1 = as.data.table(read.csv(url('https://data.cdc.gov/resource/9mfq-cb36.csv')))
head(dataset.1) %>% kable() %>% kable_styling( full_width = F)

submission_date	state	tot_cases	conf_cases	prob_cases	new_case	pnew_case	tot_death	conf_death	prob_death	new_death	pnew_death	created_at	consent_cases	consent_deaths
2021-12-01T00:00:00.000	ND	163565	135705	27860	589	220	1907	NA	NA	9	0	2021-12-02T14:35:20.922	Agree	Not agree
2020-08-17T00:00:00.000	MD	100715	NA	NA	503	0	3765	3616	149	3	0	2020-08-19T00:00:00.000	N/A	Agree
2021-07-20T00:00:00.000	MD	464491	NA	NA	155	0	9822	9604	218	3	1	2021-07-22T00:00:00.000	N/A	Agree
2020-05-13T00:00:00.000	VT	855	NA	NA	2	0	52	NA	NA	0	0	2020-05-15T00:00:00.000	Not agree	Not agree
2021-01-21T00:00:00.000	NC	706315	632991	73324	7113	1040	10066	9048	1018	113	22	2021-01-21T00:00:00.000	Agree	Agree
2021-03-11T00:00:00.000	WA	346788	NA	NA	674	89	5107	NA	NA	7	0	2021-03-13T00:00:00.000	N/A	N/A

Right off the bat, we notice this dataset has a wide structure--which is great for the purposes of exporting the CSV file! We do need to manipulate the date stamps though and change a couple of these variables to booleans:

dataset.1[ , `:=` (submission_date = as_date(submission_date), created_at = as_date(created_at))]
dataset.1[consent_cases == 'Agree', consent_cases.boolean := T]
dataset.1[consent_cases == 'Not agree', consent_cases.boolean := F]
dataset.1[!(consent_cases %in%  c('Agree', 'Not agree')), consent_cases.boolean := NA]

dataset.1[consent_deaths == 'Agree', consent_deaths.boolean := T]
dataset.1[consent_deaths == 'Not agree', consent_deaths.boolean := F]
dataset.1[!(consent_deaths %in%  c('Agree', 'Not agree')), consent_deaths.boolean := NA]

head(dataset.1) %>% kable() %>% kable_styling( full_width = F)

submission_date	state	tot_cases	conf_cases	prob_cases	new_case	pnew_case	tot_death	conf_death	prob_death	new_death	pnew_death	created_at	consent_cases	consent_deaths	consent_cases.boolean	consent_deaths.boolean
2021-12-01	ND	163565	135705	27860	589	220	1907	NA	NA	9	0	2021-12-02	Agree	Not agree	TRUE	FALSE
2020-08-17	MD	100715	NA	NA	503	0	3765	3616	149	3	0	2020-08-19	N/A	Agree	NA	TRUE
2021-07-20	MD	464491	NA	NA	155	0	9822	9604	218	3	1	2021-07-22	N/A	Agree	NA	TRUE
2020-05-13	VT	855	NA	NA	2	0	52	NA	NA	0	0	2020-05-15	Not agree	Not agree	FALSE	FALSE
2021-01-21	NC	706315	632991	73324	7113	1040	10066	9048	1018	113	22	2021-01-21	Agree	Agree	TRUE	TRUE
2021-03-11	WA	346788	NA	NA	674	89	5107	NA	NA	7	0	2021-03-13	N/A	N/A	NA	NA

We also need to create the season variable. To do this, we'll create a function to do just that:

getSeason = function(x) {
 winter = list(lubridate::interval(ymd("19721221"),ymd("19721231")), lubridate::interval(ymd("19720101"),ymd("19720319")))
  #winter <- lubridate::interval(ymd("19720101"),ymd("19720319"))
  spring <- lubridate::interval(ymd("19720320"),ymd("19720619"))
  summer <- lubridate::interval(ymd("19720620"),ymd("19720921"))
  fall <- lubridate::interval(ymd("19720922"),ymd("19721220"))
  x = as_date(x)
  year(x) = 1972
  if (x %within% winter) {
    return("winter")
  } else if (x %within% spring) {
    return("spring")
  } else if (x %within% summer) {
    return("summer")
  } else if (x %within% fall) {
    return("fall")
  } else {
    return(NA)
  }
}

dataset.1[, season := sapply(submission_date, getSeason)]
dataset.1[, table(season)] %>% kable() %>% kable_styling( full_width = F)

season	Freq
fall	227
spring	263
summer	249
winter	261

From here we can export

write.csv(dataset.1, 'dataset1.csv', row.names = F)

Step 2: Analysis

Season and State Correlations

From our manufactured variables, we can produce the tables of interest:

dataset.1[order(state), list(prob_cases_total = sum(prob_cases, na.rm = T), prob_deaths_total = sum(prob_death, na.rm = T)), by = "state"]%>% kable() %>% kable_styling( full_width = F)

state	prob_cases_total	prob_deaths_total
AL	5472879	90395
CA	3964300	0
CT	1019360	53589
DE	293285	5187
GU	0	0
ID	1140157	9566
IL	20183	67281
IN	0	13303
MD	0	9614
ME	500567	1329
MI	2698417	30843
MO	0	0
MS	5567979	86322
MT	600962	0
NC	4528573	41055
ND	466505	0
NE	649330	3880
NH	0	0
NV	239879	0
VI	0	0
VT	0	0
WA	0	0
WI	2404120	23297

dataset.1[order(season), list(prob_cases_total = sum(prob_cases, na.rm = T), prob_deaths_total = sum(prob_death, na.rm = T)), by = "season"] %>% kable() %>% kable_styling( full_width = F)

season	prob_cases_total	prob_deaths_total
fall	7308254	100484
spring	3872362	91616
summer	5321176	75826
winter	13064704	167735

Time Gap

The time gap would involve creating a new variable. We can aggregate it by mean time gap per state:

dataset.1[, time.gap := difftime(created_at, submission_date, units = "days")]
time.lag = dataset.1[order(state), list(mean_time_gap = mean(time.gap, na.rm = T)), by = "state"]  

time.lag %>% kable() %>% kable_styling( full_width = F)

state	mean_time_gap
AL	2.000000 days
CA	16.717391 days
CT	2.957447 days
DE	3.023810 days
GU	4.609756 days
ID	3.500000 days
IL	4.891304 days
IN	17.775000 days
MD	13.909091 days
ME	21.314286 days
MI	1.029412 days
MO	1.647059 days
MS	16.148936 days
MT	3.026316 days
NC	1.225000 days
ND	3.745098 days
NE	17.000000 days
NH	4.619048 days
NV	3.036364 days
VI	2.675000 days
VT	2.000000 days
WA	2.000000 days
WI	4.538462 days

plt.time.lag = ggplot(time.lag, aes(x = state, y = mean_time_gap)) + geom_bar(fill = 'lightpink', stat = "identity") + theme_minimal()  + labs(x = "State", y = "Mean Time Gap", title = "Time Gap per State") + theme(axis.text.x =  element_text(angle = 90, vjust = 0.5, hjust=1))

ggplotly(plt.time.lag)

## Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.

The states with the highest time lag values:

time.lag[order(mean_time_gap, decreasing = T),][1:10] %>% kable() %>% kable_styling( full_width = F)

state	mean_time_gap
ME	21.314286 days
IN	17.775000 days
NE	17.000000 days
CA	16.717391 days
MS	16.148936 days
MD	13.909091 days
IL	4.891304 days
NH	4.619048 days
GU	4.609756 days
WI	4.538462 days

dataset.1[, year := year(submission_date)]
dataset.1[, season := factor(season, c("winter", "spring", "summer", "fall"))]

season.cases = dataset.1[,list(prob_cases_total = sum(prob_cases, na.rm = T), prob_deaths_total = sum(prob_death, na.rm = T)), by = c("year", "season")]
season.cases = season.cases[order(year, season)]

Season Plots

season.cases[, time.period := paste0(year,"_", season)]
plt.seasons.cases = ggplot(season.cases, aes(x = season, y = prob_cases_total)) + geom_bar(fill = 'darkblue', stat = "identity") + theme_minimal()  + labs(x = "Season/Year", y = "Probable Cases (Total)", title = "Probable Cases per Season/Year") + theme(axis.text.x =  element_text(angle = 90, vjust = 0.5, hjust=1)) + facet_wrap(~year)
plt.seasons.deaths = ggplot(season.cases, aes(x = season, y = prob_deaths_total)) + geom_bar(fill = 'seagreen', stat = "identity") + theme_minimal()  + labs(x = "Season/Year", y = "Probable Deaths (Total)", title = "Probable Deaths per Season/Year") + theme(axis.text.x =  element_text(angle = 90, vjust = 0.5, hjust=1)) + facet_wrap(~year)

ggplotly(plt.seasons.cases)

ggplotly(plt.seasons.deaths)

Key Takeaways

Based on our analysis, we can conclude the following:

Fall to winter indicated the higher rate of increase for probable cases
The trend was also observed in probable deaths BUT the trend oscillated a bit. This is most likely due to improved surveillance; as a result the death count is less likely to be probable and more likely to be confirmed.
ME, IN, NE, CA, MS, and MD had some of the states with the highest time gaps between probable cases and deaths.

Project 2, Dataset 1

Deepika Dilip