Tidy data using dplyr and tidyr
Import readxl and tidyverse
library(readxl)
library(tidyverse) # ggplot2, dplyr, tidyr, readr, tibble, sringr and more
library(knitr)
Download the excel file. Note the mode is “wb” to preserve the binary elements of the excel file.
sourcefile<-"https://github.com/acatlin/data/raw/master/israeli_vaccination_data_analysis_start.xlsx"
curr_path<-str_trim(getwd())
destfile<-paste0(curr_path,"/vaccine_data.xlsx")
download.file(sourcefile, destfile, mode = "wb")
Read the excel file into a tibble.
efile <- read_excel(destfile)
Get rid of the rows we dont need.
efile <- efile %>% filter(Age == ">50" | Age == "<50")
Rename the columns.
efile <- efile %>% rename("pop_nv" = "Population %", "pop_v" = "...3", "sc_nv" = "Severe Cases", "sc_v" = "...5" )
Convert the fields to numeric.
efile<- efile %>% mutate(across(2:6, as.numeric))
Im isolating each individual field as opposed to performing vector operations.
It provides some clarity and its a good excercise for dplyr functionality.
# overall population
pop_lt_50_nv <- efile %>% filter(Age == "<50") %>% select("pop_nv") %>% first()
pop_gt_50_nv <- efile %>% filter(Age == ">50") %>% select("pop_nv") %>% first()
pop_lt_50_v <- efile %>% filter(Age == "<50") %>% select("pop_v") %>% first()
pop_gt_50_v <- efile %>% filter(Age == ">50") %>% select("pop_v") %>% first()
# severe cases
sc_lt_50_nv<-efile %>% filter(Age == "<50") %>% select("sc_nv") %>% first()
sc_gt_50_nv<-efile %>% filter(Age == ">50") %>% select("sc_nv") %>% first()
sc_lt_50_v<-efile %>% filter(Age == "<50") %>% select("sc_v") %>% first()
sc_gt_50_v<-efile %>% filter(Age == ">50") %>% select("sc_v") %>% first()
Calculate totals.
tot_pop_nv<-pop_lt_50_nv + pop_gt_50_nv
tot_pop_v<-pop_lt_50_v + pop_gt_50_v
tot_pop<-tot_pop_nv+tot_pop_v
tot_sc_nv<-sc_lt_50_nv + sc_gt_50_nv
tot_sc_v<-sc_lt_50_v + sc_gt_50_v
tot_sc<-tot_sc_nv+tot_sc_v
Calculate Attack Rates.
aru_lt_50<-sc_lt_50_nv / 1000
aru_gt_50<-sc_gt_50_nv / 1000
arv_lt_50<-sc_lt_50_v / 1000
arv_gt_50<-sc_gt_50_v / 1000
# relative risk is the severe cases of vaccinated over non vaccinated
rr_lt_50 <- arv_lt_50 / aru_lt_50
rr_gt_50 <- arv_gt_50 / aru_gt_50
# divide by 2000 since we are averaging the 2 ratios of 1000
aru_all_ages<-(sc_gt_50_nv + sc_lt_50_nv) / 2000
arv_all_ages<-(sc_gt_50_v + sc_lt_50_v) / 2000
rr_all_ages<-(rr_gt_50 + rr_lt_50) / 2
Calculate Vaccine Efficacy.
ve_lt_50<-(aru_lt_50-arv_lt_50)/aru_lt_50
ve_gt_50<-(aru_gt_50-arv_gt_50)/aru_gt_50
# ve is the difference / aru
ve<-(aru_all_ages-arv_all_ages)/aru_all_ages # .674
# same thing
ve<-(1- (arv_all_ages)/(aru_all_ages))
# print(sprintf("The attack rate on the vaccinated for those under 50 is %.3f", arv_lt_50))
# print(sprintf("The attack rate on the vaccinated for those over 50 is %.3f", arv_gt_50))
# print(sprintf("The attack rate for all vaccinated is %.3f", arv_all_ages))
#
# print(sprintf("The attack rate on the unvaccinated for those under 50 is %.3f", aru_lt_50))
# print(sprintf("The attack rate on the unvaccinated for those over 50 is %.3f", aru_gt_50))
# print(sprintf("The attack rate for all unvaccinated is %.3f", aru_all_ages))
Add the Efficacies and the Totals to our tibble.
# efile <- efile %>% add_column(rr = c(rr_lt_50, rr_gt_50, rr_all_ages), .after = "sc_v")
efile <- efile %>% add_column(rr = 0, .after = "sc_v")
efile <-efile %>% rows_insert(tibble("Age"= "Total", pop_nv=tot_pop_nv, pop_v=tot_pop_v, sc_nv=tot_sc_nv, sc_v=tot_sc_v, rr=rr_all_ages, Efficacy=ve))
efile <- efile %>% rows_update(tibble(Age= "<50", Efficacy = ve_lt_50))
efile <- efile %>% rows_update(tibble(Age= ">50", Efficacy = ve_gt_50))
efile <- efile %>% rows_update(tibble(Age= "<50", rr = rr_lt_50))
efile <- efile %>% rows_update(tibble(Age= ">50", rr = rr_gt_50))
Round the numerics for presentation purposes.
efile <- efile %>% mutate( Efficacy = round(Efficacy,3 ))
Show everyone our updated tibble.
kable(efile, caption="",row.names = FALSE, booktabs=TRUE,format.args = list(decimal.mark = '.', big.mark = ",", digits=3))| Age | pop_nv | pop_v | sc_nv | sc_v | rr | Efficacy |
|---|---|---|---|---|---|---|
| <50 | 1,116,834 | 3,501,118 | 43 | 11 | 0.256 | 0.744 |
| >50 | 186,078 | 2,133,516 | 171 | 290 | 1.696 | -0.696 |
| Total | 1,302,912 | 5,634,634 | 214 | 301 | 0.976 | -0.407 |
Save the tibble to a csv file.
curr_path<-str_trim(getwd())
destfile<-paste0(curr_path,"/vaccine_data_efficacy.csv")
write.csv(efile, destfile)1. What is the total population ?
The total population adds up to about 6.9MM. That was Israels total population in 2005
Its now about 9MM. The percentage values seem to indicate that about 96% of the less than 50 population is accounted for.
So perhaps this data was created around 2005. If I were to publish analysis on this data, I would need to validate the data.
2. Explain the efficacy result.
The results indicate that the vaccine adversely effects the older population, i.e. doesnt work.
\[Efficacy \ = \ \frac{ARU \ - \ ARV}{ARU} \ * 100%\]
where
\[ARU \ = \ Attack \ Rate \ of \ Unvaccinated \ People \ = \frac{Severe \ Cases (Unvaccinated) }{Population(Unvaccinated) }\]
\[ARV \ = \ Attack \ Rate \ of \ Vaccinated \ People \ = \frac{Severe \ Cases (Vaccinated) }{Population(Vaccinated) }\]
(3) Compare the attack rates.
I dont understand why the severe cases is higher for the over 50 vaccinated group. Is this supposed to represent an actual distribution to the Isreali population or just a small sample ? I would need to understand the context.