Tidy data using dplyr and tidyr
Import readxl and tidyverse
library(readxl)
library(tidyverse) # ggplot2, dplyr, tidyr, readr, tibble, sringr and more
library(knitr)
Download the excel file. Note the mode is “wb” to preserve the binary elements of the excel file.
sourcefile<-"https://github.com/acatlin/data/raw/master/israeli_vaccination_data_analysis_start.xlsx"
curr_path<-str_trim(getwd())
destfile<-paste0(curr_path,"/vaccine_data.xlsx")
download.file(sourcefile, destfile, mode = "wb")
Read the excel file into a tibble.
efile <- read_excel(destfile)
Get rid of the rows we dont need.
efile <- efile %>% filter(Age == ">50" | Age == "<50")
Rename the columns.
efile <- efile %>% rename("pop_nv" = "Population %", "pop_v" = "...3", "sc_nv" = "Severe Cases", "sc_v" = "...5" )
Convert the fields to numeric.
efile<- efile %>% mutate(across(2:6, as.numeric))
Im isolating each individual field as opposed to performing vector operations.
It provides some clarity and its a good excercise for dplyr functionality.
# overall population
pop_lt_50_nv <- efile %>% filter(Age == "<50") %>% select("pop_nv") %>% first()
pop_gt_50_nv <- efile %>% filter(Age == ">50") %>% select("pop_nv") %>% first()
pop_lt_50_v <- efile %>% filter(Age == "<50") %>% select("pop_v") %>% first()
pop_gt_50_v <- efile %>% filter(Age == ">50") %>% select("pop_v") %>% first()
# severe cases
sc_lt_50_nv<-efile %>% filter(Age == "<50") %>% select("sc_nv") %>% first()
sc_gt_50_nv<-efile %>% filter(Age == ">50") %>% select("sc_nv") %>% first()
sc_lt_50_v<-efile %>% filter(Age == "<50") %>% select("sc_v") %>% first()
sc_gt_50_v<-efile %>% filter(Age == ">50") %>% select("sc_v") %>% first()
Calculate totals.
tot_pop_nv<-pop_lt_50_nv + pop_gt_50_nv
tot_pop_v<-pop_lt_50_v + pop_gt_50_v
tot_pop<-tot_pop_nv+tot_pop_v
tot_sc_nv<-sc_lt_50_nv + sc_gt_50_nv
tot_sc_v<-sc_lt_50_v + sc_gt_50_v
tot_sc<-tot_sc_nv+tot_sc_v
Calculate Attack Rates. Note the initial numbers are the number of severe cases (not a percentage). They scaled to units of 100K.
So we mutliply by 100K then divide that by the total population for the category to get an attack rate.
aru_lt_50<-sc_lt_50_nv * 100000/ pop_lt_50_nv
aru_gt_50<-sc_gt_50_nv * 100000/ pop_gt_50_nv
arv_lt_50<-sc_lt_50_v * 100000/ pop_lt_50_v
arv_gt_50<-sc_gt_50_v * 100000/ pop_gt_50_v
aru_all_ages<-tot_sc_nv * 100000/tot_pop_nv
arv_all_ages<-tot_sc_v * 100000/tot_pop_v
rr_lt_50<-(sc_lt_50_v + sc_lt_50_nv) * 100000/(pop_lt_50_v + pop_lt_50_nv)
rr_gt_50<-(sc_gt_50_v + sc_gt_50_nv) * 100000/(pop_gt_50_v + pop_gt_50_nv)
rr_all_ages<-tot_sc * 100000/tot_pop
Calculate Vaccine Efficacy.
ve_lt_50<-(aru_lt_50-arv_lt_50)/aru_lt_50
ve_gt_50<-(aru_gt_50-arv_gt_50)/aru_gt_50
# ve is the difference / aru
ve<-(aru_all_ages-arv_all_ages)/aru_all_ages # .674
# same thing
ve<-(1- (arv_all_ages)/(aru_all_ages))
Add the Efficacies and the Totals to our tibble.
# efile <- efile %>% add_column(rr = c(rr_lt_50, rr_gt_50, rr_all_ages), .after = "sc_v")
efile <- efile %>% add_column(ar_v = 0, .after = "sc_v")
efile <- efile %>% add_column(ar_u = 0, .after = "sc_nv")
efile <-efile %>%
rows_insert(tibble("Age"= "Total", pop_nv=tot_pop_nv, pop_v=tot_pop_v, sc_nv=tot_sc_nv, ar_u=aru_all_ages, sc_v=tot_sc_v, ar_v=arv_all_ages,Efficacy=ve))
efile <- efile %>% rows_update(tibble(Age= "<50", Efficacy = ve_lt_50))
efile <- efile %>% rows_update(tibble(Age= ">50", Efficacy = ve_gt_50))
efile <- efile %>% rows_update(tibble(Age= "<50", ar_u = aru_lt_50))
efile <- efile %>% rows_update(tibble(Age= ">50", ar_u = aru_gt_50))
efile <- efile %>% rows_update(tibble(Age= "<50", ar_v = arv_lt_50))
efile <- efile %>% rows_update(tibble(Age= ">50", ar_v = arv_gt_50))
Round the numerics for presentation purposes.
efile <- efile %>% mutate( Efficacy = round(Efficacy,3 ))
Show everyone our updated tibble.
kable(efile, caption="",row.names = FALSE, booktabs=TRUE,format.args = list(decimal.mark = '.', big.mark = ",", digits=3))| Age | pop_nv | pop_v | sc_nv | ar_u | sc_v | ar_v | Efficacy |
|---|---|---|---|---|---|---|---|
| <50 | 1,116,834 | 3,501,118 | 43 | 3.85 | 11 | 0.314 | 0.918 |
| >50 | 186,078 | 2,133,516 | 171 | 91.90 | 290 | 13.593 | 0.852 |
| Total | 1,302,912 | 5,634,634 | 214 | 16.42 | 301 | 5.342 | 0.675 |
Save the tibble to a csv file.
curr_path<-str_trim(getwd())
destfile<-paste0(curr_path,"/vaccine_data_efficacy.csv")
write.csv(efile, destfile)1. What is the total population ?
The total population adds up to about 6.9MM.
2. Explain the efficacy result.
The efficacy is a ratio between the unvaccinated and the vaccinated.
The total population efficacy is lower than both the categories which is not inuitive. The two groups (over and under 50) diverge greatly both in vulnerability to the disease as well as their vaccination percentage.
One way to look at is the spread between vaccinated and unvaccinated is large when you divide up the age groups. But in total, that spead is reduced.
I understand that its a standard metric, so if you work with it every day, its useful.
The public, however, would need something more intuitive.
\[Efficacy \ = \ \frac{ARU \ - \ ARV}{ARU} \ * 100%\]
where
\[ARU \ = \ Attack \ Rate \ of \ Unvaccinated \ People \ = \frac{Severe \ Cases (Unvaccinated) }{Population(Unvaccinated) }\]
\[ARV \ = \ Attack \ Rate \ of \ Vaccinated \ People \ = \frac{Severe \ Cases (Vaccinated) }{Population(Vaccinated) }\]
(3) Compare the attack rates.
As mentioned above since the vaccinated vs unvaccinated diverges so much between the groups, its not properly scaled for comparison.