For this analysis, we'll be looking at COVID-19 cases and deaths by state over time. We'll be using the CDC dataset. For our analysis, we will be doing the following:
First, we'll read the dataset in
setwd('~/git/CUNY.MDS/DATA607/')
dataset.1 = as.data.table(read.csv(url('https://data.cdc.gov/resource/9mfq-cb36.csv')))
head(dataset.1) %>% kable() %>% kable_styling( full_width = F)
| submission_date | state | tot_cases | conf_cases | prob_cases | new_case | pnew_case | tot_death | conf_death | prob_death | new_death | pnew_death | created_at | consent_cases | consent_deaths |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2021-12-01T00:00:00.000 | ND | 163565 | 135705 | 27860 | 589 | 220 | 1907 | NA | NA | 9 | 0 | 2021-12-02T14:35:20.922 | Agree | Not agree |
| 2020-08-17T00:00:00.000 | MD | 100715 | NA | NA | 503 | 0 | 3765 | 3616 | 149 | 3 | 0 | 2020-08-19T00:00:00.000 | N/A | Agree |
| 2021-07-20T00:00:00.000 | MD | 464491 | NA | NA | 155 | 0 | 9822 | 9604 | 218 | 3 | 1 | 2021-07-22T00:00:00.000 | N/A | Agree |
| 2020-05-13T00:00:00.000 | VT | 855 | NA | NA | 2 | 0 | 52 | NA | NA | 0 | 0 | 2020-05-15T00:00:00.000 | Not agree | Not agree |
| 2021-01-21T00:00:00.000 | NC | 706315 | 632991 | 73324 | 7113 | 1040 | 10066 | 9048 | 1018 | 113 | 22 | 2021-01-21T00:00:00.000 | Agree | Agree |
| 2021-03-11T00:00:00.000 | WA | 346788 | NA | NA | 674 | 89 | 5107 | NA | NA | 7 | 0 | 2021-03-13T00:00:00.000 | N/A | N/A |
Right off the bat, we notice this dataset has a wide structure--which is great for the purposes of exporting the CSV file! We do need to manipulate the date stamps though and change a couple of these variables to booleans:
dataset.1[ , `:=` (submission_date = as_date(submission_date), created_at = as_date(created_at))]
dataset.1[consent_cases == 'Agree', consent_cases.boolean := T]
dataset.1[consent_cases == 'Not agree', consent_cases.boolean := F]
dataset.1[!(consent_cases %in% c('Agree', 'Not agree')), consent_cases.boolean := NA]
dataset.1[consent_deaths == 'Agree', consent_deaths.boolean := T]
dataset.1[consent_deaths == 'Not agree', consent_deaths.boolean := F]
dataset.1[!(consent_deaths %in% c('Agree', 'Not agree')), consent_deaths.boolean := NA]
head(dataset.1) %>% kable() %>% kable_styling( full_width = F)
| submission_date | state | tot_cases | conf_cases | prob_cases | new_case | pnew_case | tot_death | conf_death | prob_death | new_death | pnew_death | created_at | consent_cases | consent_deaths | consent_cases.boolean | consent_deaths.boolean |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2021-12-01 | ND | 163565 | 135705 | 27860 | 589 | 220 | 1907 | NA | NA | 9 | 0 | 2021-12-02 | Agree | Not agree | TRUE | FALSE |
| 2020-08-17 | MD | 100715 | NA | NA | 503 | 0 | 3765 | 3616 | 149 | 3 | 0 | 2020-08-19 | N/A | Agree | NA | TRUE |
| 2021-07-20 | MD | 464491 | NA | NA | 155 | 0 | 9822 | 9604 | 218 | 3 | 1 | 2021-07-22 | N/A | Agree | NA | TRUE |
| 2020-05-13 | VT | 855 | NA | NA | 2 | 0 | 52 | NA | NA | 0 | 0 | 2020-05-15 | Not agree | Not agree | FALSE | FALSE |
| 2021-01-21 | NC | 706315 | 632991 | 73324 | 7113 | 1040 | 10066 | 9048 | 1018 | 113 | 22 | 2021-01-21 | Agree | Agree | TRUE | TRUE |
| 2021-03-11 | WA | 346788 | NA | NA | 674 | 89 | 5107 | NA | NA | 7 | 0 | 2021-03-13 | N/A | N/A | NA | NA |
We also need to create the season variable. To do this, we'll create a function to do just that:
getSeason = function(x) {
winter = list(lubridate::interval(ymd("19721221"),ymd("19721231")), lubridate::interval(ymd("19720101"),ymd("19720319")))
#winter <- lubridate::interval(ymd("19720101"),ymd("19720319"))
spring <- lubridate::interval(ymd("19720320"),ymd("19720619"))
summer <- lubridate::interval(ymd("19720620"),ymd("19720921"))
fall <- lubridate::interval(ymd("19720922"),ymd("19721220"))
x = as_date(x)
year(x) = 1972
if (x %within% winter) {
return("winter")
} else if (x %within% spring) {
return("spring")
} else if (x %within% summer) {
return("summer")
} else if (x %within% fall) {
return("fall")
} else {
return(NA)
}
}
dataset.1[, season := sapply(submission_date, getSeason)]
dataset.1[, table(season)] %>% kable() %>% kable_styling( full_width = F)
| season | Freq |
|---|---|
| fall | 227 |
| spring | 263 |
| summer | 249 |
| winter | 261 |
From here we can export
write.csv(dataset.1, 'dataset1.csv', row.names = F)
Season and State Correlations
From our manufactured variables, we can produce the tables of interest:
dataset.1[order(state), list(prob_cases_total = sum(prob_cases, na.rm = T), prob_deaths_total = sum(prob_death, na.rm = T)), by = "state"]%>% kable() %>% kable_styling( full_width = F)
| state | prob_cases_total | prob_deaths_total |
|---|---|---|
| AL | 5472879 | 90395 |
| CA | 3964300 | 0 |
| CT | 1019360 | 53589 |
| DE | 293285 | 5187 |
| GU | 0 | 0 |
| ID | 1140157 | 9566 |
| IL | 20183 | 67281 |
| IN | 0 | 13303 |
| MD | 0 | 9614 |
| ME | 500567 | 1329 |
| MI | 2698417 | 30843 |
| MO | 0 | 0 |
| MS | 5567979 | 86322 |
| MT | 600962 | 0 |
| NC | 4528573 | 41055 |
| ND | 466505 | 0 |
| NE | 649330 | 3880 |
| NH | 0 | 0 |
| NV | 239879 | 0 |
| VI | 0 | 0 |
| VT | 0 | 0 |
| WA | 0 | 0 |
| WI | 2404120 | 23297 |
dataset.1[order(season), list(prob_cases_total = sum(prob_cases, na.rm = T), prob_deaths_total = sum(prob_death, na.rm = T)), by = "season"] %>% kable() %>% kable_styling( full_width = F)
| season | prob_cases_total | prob_deaths_total |
|---|---|---|
| fall | 7308254 | 100484 |
| spring | 3872362 | 91616 |
| summer | 5321176 | 75826 |
| winter | 13064704 | 167735 |
The time gap would involve creating a new variable. We can aggregate it by mean time gap per state:
dataset.1[, time.gap := difftime(created_at, submission_date, units = "days")]
time.lag = dataset.1[order(state), list(mean_time_gap = mean(time.gap, na.rm = T)), by = "state"]
time.lag %>% kable() %>% kable_styling( full_width = F)
| state | mean_time_gap |
|---|---|
| AL | 2.000000 days |
| CA | 16.717391 days |
| CT | 2.957447 days |
| DE | 3.023810 days |
| GU | 4.609756 days |
| ID | 3.500000 days |
| IL | 4.891304 days |
| IN | 17.775000 days |
| MD | 13.909091 days |
| ME | 21.314286 days |
| MI | 1.029412 days |
| MO | 1.647059 days |
| MS | 16.148936 days |
| MT | 3.026316 days |
| NC | 1.225000 days |
| ND | 3.745098 days |
| NE | 17.000000 days |
| NH | 4.619048 days |
| NV | 3.036364 days |
| VI | 2.675000 days |
| VT | 2.000000 days |
| WA | 2.000000 days |
| WI | 4.538462 days |
plt.time.lag = ggplot(time.lag, aes(x = state, y = mean_time_gap)) + geom_bar(fill = 'lightpink', stat = "identity") + theme_minimal() + labs(x = "State", y = "Mean Time Gap", title = "Time Gap per State") + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
ggplotly(plt.time.lag)
## Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.
The states with the highest time lag values:
time.lag[order(mean_time_gap, decreasing = T),][1:10] %>% kable() %>% kable_styling( full_width = F)
| state | mean_time_gap |
|---|---|
| ME | 21.314286 days |
| IN | 17.775000 days |
| NE | 17.000000 days |
| CA | 16.717391 days |
| MS | 16.148936 days |
| MD | 13.909091 days |
| IL | 4.891304 days |
| NH | 4.619048 days |
| GU | 4.609756 days |
| WI | 4.538462 days |
dataset.1[, year := year(submission_date)]
dataset.1[, season := factor(season, c("winter", "spring", "summer", "fall"))]
season.cases = dataset.1[,list(prob_cases_total = sum(prob_cases, na.rm = T), prob_deaths_total = sum(prob_death, na.rm = T)), by = c("year", "season")]
season.cases = season.cases[order(year, season)]
season.cases[, time.period := paste0(year,"_", season)]
plt.seasons.cases = ggplot(season.cases, aes(x = season, y = prob_cases_total)) + geom_bar(fill = 'darkblue', stat = "identity") + theme_minimal() + labs(x = "Season/Year", y = "Probable Cases (Total)", title = "Probable Cases per Season/Year") + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + facet_wrap(~year)
plt.seasons.deaths = ggplot(season.cases, aes(x = season, y = prob_deaths_total)) + geom_bar(fill = 'seagreen', stat = "identity") + theme_minimal() + labs(x = "Season/Year", y = "Probable Deaths (Total)", title = "Probable Deaths per Season/Year") + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + facet_wrap(~year)
ggplotly(plt.seasons.cases)
ggplotly(plt.seasons.deaths)
Based on our analysis, we can conclude the following: