Tidy data using dplyr and tidyr





The R Code

Import readxl and tidyverse

library(readxl)
library(tidyverse)         # ggplot2, dplyr, tidyr, readr, tibble, sringr and more
library(knitr)


Download the excel file. Note the mode is “wb” to preserve the binary elements of the excel file.

sourcefile<-"https://github.com/acatlin/data/raw/master/israeli_vaccination_data_analysis_start.xlsx"
curr_path<-str_trim(getwd())

destfile<-paste0(curr_path,"/vaccine_data.xlsx")

download.file(sourcefile, destfile, mode = "wb")


Read the excel file into a tibble.

efile <- read_excel(destfile) 


Get rid of the rows we dont need.

efile <- efile %>% filter(Age == ">50" | Age == "<50") 


Rename the columns.

efile <- efile %>% rename("pop_nv" = "Population %", "pop_v" = "...3", "sc_nv" = "Severe Cases", "sc_v" = "...5" )


Convert the fields to numeric.

efile<- efile %>% mutate(across(2:6, as.numeric))


Im isolating each individual field as opposed to performing vector operations.
It provides some clarity and its a good excercise for dplyr functionality.

# overall population
pop_lt_50_nv <- efile %>% filter(Age == "<50") %>% select("pop_nv") %>% first()
pop_gt_50_nv <- efile %>% filter(Age == ">50") %>% select("pop_nv") %>% first()
pop_lt_50_v <- efile %>% filter(Age == "<50") %>% select("pop_v") %>% first()
pop_gt_50_v <- efile %>% filter(Age == ">50") %>% select("pop_v") %>% first()


# severe cases

sc_lt_50_nv<-efile %>% filter(Age == "<50") %>% select("sc_nv") %>% first()
sc_gt_50_nv<-efile %>% filter(Age == ">50") %>% select("sc_nv") %>% first()
sc_lt_50_v<-efile %>% filter(Age == "<50") %>% select("sc_v") %>% first()
sc_gt_50_v<-efile %>% filter(Age == ">50") %>% select("sc_v") %>% first()


Calculate totals.

tot_pop_nv<-pop_lt_50_nv + pop_gt_50_nv
tot_pop_v<-pop_lt_50_v + pop_gt_50_v
tot_pop<-tot_pop_nv+tot_pop_v


tot_sc_nv<-sc_lt_50_nv + sc_gt_50_nv
tot_sc_v<-sc_lt_50_v + sc_gt_50_v
tot_sc<-tot_sc_nv+tot_sc_v


Calculate Attack Rates. Note the initial numbers are the number of severe cases (not a percentage). They scaled to units of 100K.
So we mutliply by 100K then divide that by the total population for the category to get an attack rate.

aru_lt_50<-sc_lt_50_nv * 100000/ pop_lt_50_nv
aru_gt_50<-sc_gt_50_nv * 100000/ pop_gt_50_nv

arv_lt_50<-sc_lt_50_v * 100000/ pop_lt_50_v
arv_gt_50<-sc_gt_50_v * 100000/ pop_gt_50_v

aru_all_ages<-tot_sc_nv * 100000/tot_pop_nv
arv_all_ages<-tot_sc_v * 100000/tot_pop_v

rr_lt_50<-(sc_lt_50_v + sc_lt_50_nv) * 100000/(pop_lt_50_v + pop_lt_50_nv)
rr_gt_50<-(sc_gt_50_v + sc_gt_50_nv) * 100000/(pop_gt_50_v + pop_gt_50_nv)
rr_all_ages<-tot_sc * 100000/tot_pop


Calculate Vaccine Efficacy.

ve_lt_50<-(aru_lt_50-arv_lt_50)/aru_lt_50 
ve_gt_50<-(aru_gt_50-arv_gt_50)/aru_gt_50                      


# ve is the difference / aru
ve<-(aru_all_ages-arv_all_ages)/aru_all_ages                   # .674

# same thing
ve<-(1- (arv_all_ages)/(aru_all_ages))


Add the Efficacies and the Totals to our tibble.

# efile <- efile %>%  add_column(rr = c(rr_lt_50, rr_gt_50, rr_all_ages),  .after = "sc_v") 
efile <- efile %>%  add_column(ar_v = 0,  .after = "sc_v") 
efile <- efile %>%  add_column(ar_u = 0,  .after = "sc_nv") 

efile <-efile %>% 
  rows_insert(tibble("Age"= "Total", pop_nv=tot_pop_nv,  pop_v=tot_pop_v, sc_nv=tot_sc_nv, ar_u=aru_all_ages,  sc_v=tot_sc_v, ar_v=arv_all_ages,Efficacy=ve))




efile <- efile %>% rows_update(tibble(Age= "<50", Efficacy = ve_lt_50))
efile <- efile %>% rows_update(tibble(Age= ">50", Efficacy = ve_gt_50))
efile <- efile %>% rows_update(tibble(Age= "<50", ar_u = aru_lt_50))
efile <- efile %>% rows_update(tibble(Age= ">50", ar_u = aru_gt_50))
efile <- efile %>% rows_update(tibble(Age= "<50", ar_v = arv_lt_50))
efile <- efile %>% rows_update(tibble(Age= ">50", ar_v = arv_gt_50))


Round the numerics for presentation purposes.

efile <- efile %>% mutate( Efficacy = round(Efficacy,3 ))


Show everyone our updated tibble.

kable(efile, caption="",row.names = FALSE, booktabs=TRUE,format.args = list(decimal.mark = '.', big.mark = ",", digits=3))
Age pop_nv pop_v sc_nv ar_u sc_v ar_v Efficacy
<50 1,116,834 3,501,118 43 3.85 11 0.314 0.918
>50 186,078 2,133,516 171 91.90 290 13.593 0.852
Total 1,302,912 5,634,634 214 16.42 301 5.342 0.675


Save the tibble to a csv file.

curr_path<-str_trim(getwd())
destfile<-paste0(curr_path,"/vaccine_data_efficacy.csv")
write.csv(efile, destfile)







The Questions. The Answers




1. What is the total population ?

The total population adds up to about 6.9MM.




2. Explain the efficacy result.

The efficacy is a ratio between the unvaccinated and the vaccinated.

The total population efficacy is lower than both the categories which is not inuitive. The two groups (over and under 50) diverge greatly both in vulnerability to the disease as well as their vaccination percentage.
One way to look at is the spread between vaccinated and unvaccinated is large when you divide up the age groups. But in total, that spead is reduced.
I understand that its a standard metric, so if you work with it every day, its useful.
The public, however, would need something more intuitive.



\[Efficacy \ = \ \frac{ARU \ - \ ARV}{ARU} \ * 100%\]
where
\[ARU \ = \ Attack \ Rate \ of \ Unvaccinated \ People \ = \frac{Severe \ Cases (Unvaccinated) }{Population(Unvaccinated) }\]

\[ARV \ = \ Attack \ Rate \ of \ Vaccinated \ People \ = \frac{Severe \ Cases (Vaccinated) }{Population(Vaccinated) }\]


(3) Compare the attack rates.


As mentioned above since the vaccinated vs unvaccinated diverges so much between the groups, its not properly scaled for comparison.