load the library

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.4     ✓ stringr 1.4.0
## ✓ readr   2.1.2     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

Upload the file to Github and read the file from Github

# Location of the raw CSV in the GitHub repository.
Vacclist_url <- "https://raw.githubusercontent.com/jayleecunysps/AssignmentforSPS/main/israeli_vaccination_data_analysis_start.csv"

# Read the file and keep only its first five rows: those hold the numbers,
# while the rows after them hold the assignment questions.
Vaccdata <- Vacclist_url %>%
  read.csv() %>%
  head(5)

Clean the data: select the columns, join the percentages back, and add an identifier.

# Fully vaccinated columns: dropping rows 1, 3 and 5 keeps only the rows
# that hold the numbers.
fullyvax <- Vaccdata %>%
  select("Age", "X", "X.1") %>%
  slice(c(-1, -3, -5))

# Not vaccinated columns: same rows, taken from the other pair of columns.
notvax <- Vaccdata %>%
  select("Age", "Population..", "Severe.Cases") %>%
  slice(c(-1, -3, -5))

# Percent-of-population values for both groups: dropping rows 1, 2 and 4
# keeps only the rows that hold the percentages.
percent <- Vaccdata %>%
  select("Population..", "X") %>%
  slice(c(-1, -2, -4))

# Attach the matching percentage column to each group, then give the raw
# CSV columns the shared names used by the rest of the script.
fullyvax <- fullyvax %>%
  add_column(percentofpopulation = percent[["X"]]) %>%
  rename(population = X, Severe.Cases = X.1)

notvax <- notvax %>%
  add_column(percentofpopulation = percent[["Population.."]]) %>%
  rename(population = Population..)

# Identifier for each row of the stacked table: the two fully vaccinated
# rows come first, followed by the two not vaccinated rows.
vaxstate <- rep(c("FullyVax", "NotVax"), each = 2)

# Stack both groups and tag every row with its vaccination state.
fulldata <- rbind(fullyvax, notvax) %>%
  add_column(vax_state = vaxstate)

fulldata
##   Age population Severe.Cases percentofpopulation vax_state
## 1 <50  3,501,118           11               73.0%  FullyVax
## 2 >50  2,133,516          290               90.4%  FullyVax
## 3 <50  1,116,834           43               23.3%    NotVax
## 4 >50    186,078          171                7.9%    NotVax

re-format the data

# Convert every column to its analysis type in a single pass:
# - Age and vax_state become factors;
# - population loses its thousands separators before becoming numeric;
# - percentofpopulation loses its "%" sign before becoming numeric;
# - Severe.Cases is converted to numeric as-is.
fulldata <- fulldata %>%
  mutate(
    Age = as.factor(Age),
    population = as.numeric(str_remove_all(population, ",")),
    Severe.Cases = as.numeric(Severe.Cases),
    percentofpopulation = as.numeric(str_remove_all(percentofpopulation, "%")),
    vax_state = as.factor(vax_state)
  )

fulldata
##   Age population Severe.Cases percentofpopulation vax_state
## 1 <50    3501118           11                73.0  FullyVax
## 2 >50    2133516          290                90.4  FullyVax
## 3 <50    1116834           43                23.3    NotVax
## 4 >50     186078          171                 7.9    NotVax

Q1

Do you have enough information to calculate the total population? What does this total population represent?

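Yes. 5,634,634 people are fully vaccinated and 1,302,912 are not vaccinated, for a total of 6,937,546. This total only covers people who are either fully vaccinated or not vaccinated: within each age group the two percentages add up to less than 100% (73.0% + 23.3% = 96.3% for <50 and 90.4% + 7.9% = 98.3% for >50), so anyone outside those two groups is not counted.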

with(fulldata, tapply(population, vax_state, sum))
## FullyVax   NotVax 
##  5634634  1302912
sum(fulldata[["population"]])
## [1] 6937546

Q2 and Q3

Calculate the Efficacy vs. Disease; Explain your results.

From your calculation of efficacy vs. disease, are you able to compare the rate of severe cases in unvaccinated individuals to that in vaccinated individuals?

The lowest severe case rate belongs to people under 50 who are fully vaccinated. However, vaccinated people over 50 have a higher severe case rate than unvaccinated people over 50.

I do not think it is fair to draw a conclusion from this result, because people over 50 have a wider variety of personal health issues and concerns.

For example, people tend to get vaccinated if they think their immune system is not strong enough to fight Covid. This could explain why the >50 group has the highest percentage of vaccinated people. The unvaccinated group may be the healthier group, so its severe case rate is lower.

We should compare groups of people who have similar health conditions.

I think CMS may be able to provide a better and fairer report because they have information on people's health conditions, and each person has a Risk Score based on their Hierarchical Condition Categories, which are sets of medical codes that are linked to specific clinical diagnoses.

For Risk Score info, please see the video:

https://www.youtube.com/watch?v=m78C-tVtQIA

# Severe case rate as a percentage. Severe.Cases is treated as a count per
# 100,000 people (hence the division by 100000) -- confirm against the
# source sheet.
fulldata <- fulldata %>%
  add_column(sevcaserate = fulldata$Severe.Cases / 100000 * 100)

# Efficacy vs. severe disease, in percent, computed within each age group:
#   (1 - rate among fully vaccinated / rate among not vaccinated) * 100
# The value describes the age group, so both rows of a group carry the same
# number. A negative value means the fully vaccinated rows show a higher
# severe case rate than the not vaccinated rows of that age group.
# The previous formula (100 - sevcaserate) only gave the share of people
# without a severe case and never compared the two groups.
fulldata <- fulldata %>%
  group_by(Age) %>%
  mutate(
    efficacyrate =
      (1 - Severe.Cases[vax_state == "FullyVax"] /
         Severe.Cases[vax_state == "NotVax"]) * 100
  ) %>%
  ungroup() %>%
  as.data.frame() # back to a plain data frame so it prints as before

fulldata
##   Age population Severe.Cases percentofpopulation vax_state sevcaserate
## 1 <50    3501118           11                73.0  FullyVax       0.011
## 2 >50    2133516          290                90.4  FullyVax       0.290
## 3 <50    1116834           43                23.3    NotVax       0.043
## 4 >50     186078          171                 7.9    NotVax       0.171
##   efficacyrate
## 1     74.41860
## 2    -69.59064
## 3     74.41860
## 4    -69.59064