library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.6 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.4 ✓ stringr 1.4.0
## ✓ readr 2.1.0 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(dplyr)
library(stringr)
library(rvest)
##
## Attaching package: 'rvest'
## The following object is masked from 'package:readr':
##
## guess_encoding
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
require(stats)
This data set records COVID-19 cases and deaths in the US, by state, over time. We want to use it to find the states with the highest and lowest death rates and to compare the death rate before and after the vaccine was released.
# Download the state-level COVID-19 cases/deaths table straight from GitHub.
# The file is comma-separated and its first row holds the column names.
covid_url <- read.csv(
  file = "https://raw.githubusercontent.com/jayleecunysps/AssignmentforSPS/main/United_States_COVID-19_Cases_and_Deaths_by_State_over_Time.csv",
  header = TRUE,
  sep = ","
)
Next, we convert the column formats and select the variables we need from the data set into a new data frame.
# Clean the raw table in place, then build the per-state totals and a slim
# copy holding only the columns used later on.

# Every NA cell, in any column, is overwritten with 0.
covid_url[is.na(covid_url)] <- 0

# submission_date is parsed as month/day/four-digit-year.
covid_url$submission_date <- as.Date(
  covid_url$submission_date,
  format = "%m/%d/%Y"
)

# Keep the calendar year as text so it can be compared with "2020" / "2021".
covid_url$submission_Year <- as.character(year(covid_url$submission_date))

# Force the two count columns to numeric before summing them.
covid_url[["tot_death"]] <- as.numeric(covid_url[["tot_death"]])
covid_url[["tot_cases"]] <- as.numeric(covid_url[["tot_cases"]])

# Per-state sums over every row (i.e. every submission date) of the table.
# NOTE(review): tot_death / tot_cases look like running cumulative totals
# (judging by the column names and the size of the sums) -- if so, adding them
# up across dates over-counts; confirm whether the latest value per state
# (e.g. max) was intended.
statedeath <- aggregate(tot_death ~ state, data = covid_url, FUN = sum)
statecase <- aggregate(tot_cases ~ state, data = covid_url, FUN = sum)

# Slim table: year, state, and the two count columns, in that order.
covid_url2 <- select(covid_url, submission_Year, state, tot_cases, tot_death)
First, we split the data by year into two data frames, one for 2020 and one for 2021. We also merge the per-state death and case totals into a new data frame.
# Split the slim table by calendar year. which() keeps only the rows where the
# comparison is TRUE, so rows with a missing year are dropped.
newdata2020 <- covid_url2[
  which(covid_url2$submission_Year == "2020"),
  c("submission_Year", "state", "tot_cases", "tot_death")
]
newdata2021 <- covid_url2[
  which(covid_url2$submission_Year == "2021"),
  c("submission_Year", "state", "tot_cases", "tot_death")
]

# One row per state, with its summed deaths and summed cases side by side.
total <- merge(x = statedeath, y = statecase, by = "state")
Then we use the 2020 and 2021 data frames to calculate the total cases and total deaths for each year, and we compute the death rate by dividing total deaths by total cases.
# Per-state death rate in percent: summed deaths over summed cases.
total[["deathrate"]] <- with(total, tot_death / tot_cases * 100)

# Yearly totals across all states; missing values are ignored in the sums.
death2020 <- sum(newdata2020[["tot_death"]], na.rm = TRUE)
case2020 <- sum(newdata2020[["tot_cases"]], na.rm = TRUE)
death2021 <- sum(newdata2021[["tot_death"]], na.rm = TRUE)
case2021 <- sum(newdata2021[["tot_cases"]], na.rm = TRUE)

# Overall death rate for each year, in percent.
rate2020 <- death2020 / case2020 * 100
rate2021 <- death2021 / case2021 * 100
We sort the states by death rate to find those with the highest and lowest rates. NYC had the highest death rate among COVID cases.
Here, I assume the vaccine was released in 2021, and I compare the death rates of 2020 and 2021 to gauge the effectiveness of the vaccine. The death rate decreased from 2.75% to 1.71%; that is, the 2021 rate is about 62% of the 2020 rate, a relative reduction of roughly 38% after the vaccine was released to the public.
# States ranked from the highest death rate downwards; show the top six.
deathrate_state <- with(total, total[order(deathrate, decreasing = TRUE), ])
head(deathrate_state, n = 6L)
## state tot_death tot_cases deathrate
## 40 NYC 20499031 561724705 3.649302
## 36 NJ 15396408 572248244 2.690512
## 23 MA 10238244 407472441 2.512622
## 8 CT 4787009 190925550 2.507265
## 44 PA 14816813 683591905 2.167494
## 30 MS 4433196 208495430 2.126280
# States ranked from the lowest death rate upwards (order() sorts ascending
# by default); show the first six.
deathrate_state <- with(total, total[order(deathrate), ])
head(deathrate_state, n = 6L)
## state tot_death tot_cases deathrate
## 4 AS 0 5225 0.0000000
## 12 FSM 0 1898 0.0000000
## 48 RMI 0 2066 0.0000000
## 46 PW 150 141591 0.1059389
## 29 MP 3123 592583 0.5270148
## 53 UT 1310876 241175659 0.5435358
# Relative drop in the death rate from 2020 to 2021, in percent.
# The subtraction must happen before scaling to percent: without the outer
# parentheses, `*` binds tighter than `-`, giving 1 - 62.22 = -61.22 instead
# of the intended (1 - 0.6222) * 100 = 37.78.
deathrate_compare <- (1 - rate2021 / rate2020) * 100
rate2021
## [1] 1.708816
rate2020
## [1] 2.746389
deathrate_compare
## [1] 37.77951