Project 2-2

library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──

## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.4     ✓ stringr 1.4.0
## ✓ readr   2.1.0     ✓ forcats 0.5.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(dplyr)
library(stringr) 
library(rvest)

## 
## Attaching package: 'rvest'

## The following object is masked from 'package:readr':
## 
##     guess_encoding

library(lubridate)

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

require(stats)

COVID Cases and Deaths in the US

This data set records COVID cases and deaths in the US. We want to use it to find the states with the highest and lowest death rates, and to compare the death rate before and after the vaccine was released.

# Read the "United States COVID-19 Cases and Deaths by State over Time" CSV
# from GitHub into a data frame; the first row supplies the column names.
covid_url <- read.csv(
  file = "https://raw.githubusercontent.com/jayleecunysps/AssignmentforSPS/main/United_States_COVID-19_Cases_and_Deaths_by_State_over_Time.csv",
  header = TRUE,
  sep = ","
)

Data Transformation

Here we convert the column formats and select the variables we need from the dataset into new data frames.

# Clean the raw table: replace every missing value with 0, parse the report
# date (month/day/year), derive the report year as text, and force the
# case/death totals to numeric.
covid_url[is.na(covid_url)] <- 0
covid_url$submission_date <- as.Date(
  covid_url$submission_date,
  format = "%m/%d/%Y"
)
covid_url$submission_Year <- as.character(year(covid_url$submission_date))
covid_url$tot_death <- as.numeric(covid_url$tot_death)
covid_url$tot_cases <- as.numeric(covid_url$tot_cases)

# Per-state sums of the death and case columns over all report dates.
# NOTE(review): tot_death / tot_cases look like running totals, in which case
# summing them across dates inflates the figures -- confirm whether the
# latest value per state was intended instead.
statedeath <- aggregate(tot_death ~ state, data = covid_url, FUN = sum)
statecase <- aggregate(tot_cases ~ state, data = covid_url, FUN = sum)

# Keep only the four columns the rest of the analysis uses.
covid_url2 <- covid_url %>%
  select(submission_Year, state, tot_cases, tot_death)

Filtering Data

First, we split the data by year into two data frames. We also combine the deaths and cases grouped by state into a new data frame.

# One data frame per report year. which() discards rows whose year is NA,
# so the row selection matches what subset() would return.
newdata2020 <- covid_url2[
  which(covid_url2$submission_Year == "2020"),
  c("submission_Year", "state", "tot_cases", "tot_death"),
  drop = FALSE
]

newdata2021 <- covid_url2[
  which(covid_url2$submission_Year == "2021"),
  c("submission_Year", "state", "tot_cases", "tot_death"),
  drop = FALSE
]

# Join the per-state death and case sums into a single table keyed by state.
total <- merge(x = statedeath, y = statecase, by = "state")

Calculate the total

Then we use the 2020 and 2021 data frames to calculate the total cases and total deaths for each year, and compute the death rate by dividing total deaths by total cases.

# Death rate per state, in percent: summed deaths over summed cases.
total$deathrate <- with(total, tot_death / tot_cases * 100)

# Year 2020: summed deaths and cases, then the death rate in percent.
death2020 <- sum(newdata2020$tot_death, na.rm = TRUE)
case2020 <- sum(newdata2020$tot_cases, na.rm = TRUE)
rate2020 <- death2020 / case2020 * 100

# Year 2021: same computation.
death2021 <- sum(newdata2021$tot_death, na.rm = TRUE)
case2021 <- sum(newdata2021$tot_cases, na.rm = TRUE)
rate2021 <- death2021 / case2021 * 100

Summary

We sort the states by death rate to find those with the highest and lowest rates. We found that NYC had the highest death rate among COVID cases.

Here, I assume the vaccine was released in 2021, and I compare the death rates for 2020 and 2021 to gauge the effect of the vaccine. The death rate decreased from 2.75% to 1.71%, a relative drop of about 38% (the 2021 rate is roughly 62% of the 2020 rate) after the vaccine was released to the public.

# Rank the states from highest to lowest death rate and show the first six.
deathrate_state <- total[with(total, order(deathrate, decreasing = TRUE)), ]
head(deathrate_state, n = 6L)

##    state tot_death tot_cases deathrate
## 40   NYC  20499031 561724705  3.649302
## 36    NJ  15396408 572248244  2.690512
## 23    MA  10238244 407472441  2.512622
## 8     CT   4787009 190925550  2.507265
## 44    PA  14816813 683591905  2.167494
## 30    MS   4433196 208495430  2.126280

# Rank the states from lowest to highest death rate and show the first six.
deathrate_state <- total[with(total, order(deathrate)), ]
head(deathrate_state, n = 6L)

##    state tot_death tot_cases deathrate
## 4     AS         0      5225 0.0000000
## 12   FSM         0      1898 0.0000000
## 48   RMI         0      2066 0.0000000
## 46    PW       150    141591 0.1059389
## 29    MP      3123    592583 0.5270148
## 53    UT   1310876 241175659 0.5435358

# Relative drop in the death rate from 2020 to 2021, in percent.
# The ratio has to be subtracted from 1 *before* scaling by 100: without the
# outer parentheses `*` binds tighter than `-`, so the expression evaluated
# as 1 - 62.22 (= -61.22) instead of (1 - 0.6222) * 100 (= 37.78).
deathrate_compare <- (1 - rate2021 / rate2020) * 100
rate2021

## [1] 1.708816

rate2020

## [1] 2.746389

deathrate_compare

## [1] 37.77951