I will mainly be using tidyr and dplyr here to do the analysis.
Thanks to Moiya Josephs for the analysis questions:
1- Find the state with the highest and lowest deaths.
2- Compare the death rate both before and after the vaccine was released.
To answer the questions above I will be doing a lot of sorting and group_by + filtering.
my_git_url <- getURL("https://raw.githubusercontent.com/aelsaeyed/Data607/main/Project2/COVID/United_States_COVID-19_Cases_and_Deaths_by_State_over_Time.csv")
covid_cases_raw <- read.csv(text = my_git_url)
head(covid_cases_raw, 10)
cases_cleaned = covid_cases_raw %>%
separate(ï..submission_date, into = c("month", "day", "year"), sep="/") %>% #separate the date values
mutate(`new_death`= gsub(",", "", new_death) ) %>% #remove commas
mutate(`new_death`= as.integer(new_death) ) %>% #cast to int
mutate(`tot_death`= gsub(",", "", tot_death) ) %>% #remove commas
mutate(`tot_death`= as.integer(tot_death) ) %>% #cast to int
mutate(`year`= as.integer(year)) %>% #cast the date values as int as well
mutate(`month`= as.integer(month)) %>%
mutate(`day`= as.integer(day)) %>%
arrange(desc(year), desc(month), desc(day)) %>% #arrange by dates descending
select(3,1,2,4,10,13) #Select only the columns I need
head(cases_cleaned, 10)
sum_dth = cases_cleaned %>%
group_by(state) %>%
summarise(sum_tot_death= sum(tot_death))
state_max_deaths <- sum_dth[which.max(sum_dth$sum_tot_death),]
state_min_deaths <- sum_dth[which.min(sum_dth$sum_tot_death),]
state_min_deaths
state_max_deaths
From what I looked up, PW is Palau, a “freely associated state”, which recorded 150 deaths.
head(sum_dth %>% arrange(sum_tot_death), 10)
chrono_dth <- cases_cleaned %>%
filter(new_death>0) %>%
group_by(year, month, day) %>%
summarise(sum_new_dth = sum(new_death)) %>%
mutate(`date` = as.Date(paste(year,month,day,sep="-"))) %>%
filter(year > 2020)
## `summarise()` has grouped output by 'year', 'month'. You can override using the
## `.groups` argument.
head(chrono_dth, 10)
dates_vline <- as.Date(c("2021-08-23")) # Define positions of vline
dates_vline <- which(chrono_dth$date %in% dates_vline)
ggplot(data = chrono_dth, aes(x = date, y = sum_new_dth)) +
geom_line(color="blue") +
geom_point() +
geom_vline(xintercept = as.numeric(chrono_dth$date[dates_vline]), color="red", linetype="dotted", size = 1)