library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.8
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.1.2 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(openxlsx)
library(dplyr)
library(zoo)
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
First things first read the data from the CDC.
url.data <- "https://data.cdc.gov/api/views/9mfq-cb36/rows.csv?accessType=DOWNLOAD"
raw <-read.csv(url.data)
raw
Firs things first lets grab the unique items from the state column.
sort(unique(raw$state))
## [1] "AK" "AL" "AR" "AS" "AZ" "CA" "CO" "CT" "DC" "DE" "FL" "FSM"
## [13] "GA" "GU" "HI" "IA" "ID" "IL" "IN" "KS" "KY" "LA" "MA" "MD"
## [25] "ME" "MI" "MN" "MO" "MP" "MS" "MT" "NC" "ND" "NE" "NH" "NJ"
## [37] "NM" "NV" "NY" "NYC" "OH" "OK" "OR" "PA" "PR" "PW" "RI" "RMI"
## [49] "SC" "SD" "TN" "TX" "UT" "VA" "VI" "VT" "WA" "WI" "WV" "WY"
So firstly FSM is a small island Nation, NYC is New York City, which should be combined with New York State. In addition not all states are included in this list, indicating that not all states provided data. Let’s see what happens when we look into the data some more. Just going off the back of my hand and a quick skim of the data list, “AS” “NYC” “PR” “PW” “RMI” “FSM” “MP” all appear to be not valid options.
First things first, I want to create a new data frame linking Created_at and submission date, mostly because I am interested in seeing it, and the transformations to effectively combine and mitigate the data will eliminate deliveries of data.
deltaDate <- subset(raw, select = c(submission_date,created_at) )
deltaDate$date_diff <- as.Date(as.character(deltaDate$submission_date), format="%m/%d/%Y")-
as.Date(as.character(deltaDate$created_at), format="%m/%d/%Y %H:%M:%S")
deltaDate <- deltaDate[order(deltaDate$date_diff),]
deltaDate
write.csv(deltaDate,"Covid_data_reporting_delta.csv", row.names = FALSE)
Firstly, there are more entries than there are actual states. At just a first glance over, NYC should be incorporated into NY as its a city in a state. IE:
“AK” “AL” “AR” “AZ” “CA” “CO” “CT” “DC” “DE” “FL” “GA” “HI” “IA” “ID” “IL” “IN” “KY” “LA” “MA” “MD” “ME” “MI” “MN” “MO” “MS” “MT” “NC” “ND” “NE” “NH” “NJ” “NM” “NV” “NY” “OH” “OK” “OR” “PA” “RI” “SC” “SD” “TN” “TX” “UT” “VA” “VT” “WA” “WI” “WV” “WY”
Ans while we are at it, let’s see the exact list of what is not a state.
acceptable_states <- c('AK','AL','AR','AZ','CA','CO','CT','DC','DE','FL','GA','HI','IA','ID','IL','IN','KY','KS','LA','MA','MD','ME','MI','MN','MO','MS','MT','NC','ND','NE','NH','NJ','NM','NV','NY','OH','OK','OR','PA','RI','SC','SD','TN','TX','UT','VA','VT','WA','WI','WV','WY')
all_states <- unique(raw$state)
all_states[!(all_states %in% acceptable_states)]
## [1] "GU" "VI" "FSM" "MP" "AS" "PR" "NYC" "PW" "RMI"
From there, let’s merge in the NYC data into the NY data.
data_new <- raw %>% # Replacing values
mutate(state = replace(state, state == 'NYC', 'NY'))
data_new
data_new <- subset(data_new, select = -c(created_at, consent_cases, consent_deaths) )
unique(data_new$state)
## [1] "ND" "MD" "ME" "NE" "IL" "NC" "VT" "MI" "NH" "NV" "DE" "GU"
## [13] "CT" "IN" "AL" "MO" "WI" "MS" "CA" "MT" "VI" "ID" "WA" "OR"
## [25] "FSM" "NJ" "DC" "MN" "AZ" "LA" "KY" "VA" "RI" "WY" "SC" "KS"
## [37] "UT" "AR" "MP" "AS" "HI" "AK" "PR" "OK" "NY" "PW" "GA" "TX"
## [49] "FL" "WV" "MA" "CO" "RMI" "IA" "TN" "OH" "NM" "PA" "SD"
data_new
And at this point we will proceede to summarise the data, based on the submission date and state.
data_new %>%
group_by(submission_date, state) %>%
summarise_all(sum)
Now we have the population counts teed up, lets move onto quarters.
I really didn’t like the approach of aggregating by season as it fails to take into account changes in the COVID situation due to time of year, so I opted to add year and quarter in as it standardizes definitions and makes it cleaner.
data_new$yearquarter = as.yearqtr(data_new$submission_date, format = "%m/%d/%Y")
data_new2 <- subset(data_new, select = -c(submission_date, state) )
data_new3 <- data_new2 %>%
group_by(yearquarter) %>%
summarise_all(sum)
data_new3
write.csv(data_new3,"Covid_data_year_quarter.csv", row.names = FALSE)
So we have our data saved and easily accessible. It is ready to be read into anything else, or picked up in R to analyze!