Import Libraries

library(data.table)
library(ggplot2)
library(readxl)
library(janitor)
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ tibble  3.1.2     ✓ dplyr   1.0.7
## ✓ tidyr   1.2.0     ✓ stringr 1.4.0
## ✓ readr   2.0.0     ✓ forcats 0.5.1
## ✓ purrr   0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::between()   masks data.table::between()
## x dplyr::filter()    masks stats::filter()
## x dplyr::first()     masks data.table::first()
## x dplyr::lag()       masks stats::lag()
## x dplyr::last()      masks data.table::last()
## x purrr::transpose() masks data.table::transpose()
library(knitr)
dir.home = '~/git/CUNY.MDS/DATA607/'

Cleaning the Vaccination Data

This dataset is quite a mess since it's a cross tabulations. The first step will be using the readxl package to import everything and using data.table and janitor to clean it up.

dt.vaccine = read_xlsx(file.path(dir.home, 'israeli_vaccination_data_analysis_start.xlsx'))
## New names:
## * `` -> ...3
## * `` -> ...5
dt.vaccine = as.data.table(dt.vaccine[1:5, 1:5])
colnames(dt.vaccine) = unlist(dt.vaccine[1, ])
dt.vaccine = dt.vaccine[2:nrow(dt.vaccine), ]
dt.vaccine = dt.vaccine %>% clean_names()
writeLines(names(dt.vaccine))
## age
## not_vax_percent
## fully_vax_percent
## not_vax_per_100k_p
## fully_vax_per_100k

Shift from Cross Tabulation to Long Format

Since this is a 2x2 contingency table, the first step would be to change this to a long format:

dt.vaccine[, age.new := shift(age, 1L)]
dt.vaccine[!is.na(age), age := age]
dt.vaccine[!is.na(age.new), age := age.new][, age.new := NULL]

dt.vax.pct = dt.vaccine[, .(age, not_vax_percent, fully_vax_percent)]
dt.vax.pct = melt(dt.vax.pct, id.vars = "age", value.name = "total")
dt.vax.pct[, total := as.numeric(total)]
dt.vax.pct[as.numeric(total) <= 1, var2 := "percent"]
dt.vax.pct[total > 1, var2 := "N"]
dt.vax.pct[grepl("fully\\_vax", variable), vax := "FULL"]
dt.vax.pct[!grepl("fully\\_vax", variable), vax := "NOT_VAX"]

dt.vax.per.100k = dt.vaccine[, .(age, not_vax_per_100k_p, fully_vax_per_100k)][!is.na(not_vax_per_100k_p),]

kable(dt.vax.per.100k)
age not_vax_per_100k_p fully_vax_per_100k
<50 43 11
>50 171 290

Additional Variables

We'll need to create some additional variables to format the table as needed

mlt.vax.100k = melt(dt.vax.per.100k, id.vars = "age", value.name = "counts")

mlt.vax.100k[grepl("fully\\_vax", variable), vax := "FULL"]
mlt.vax.100k[!grepl("fully\\_vax", variable), vax := "NOT_VAX"]

Table Export

The format below allows for aggregation functions:

dt.vax.merged = rbind(dt.vax.pct[, .(age, value = total, var2, vax)], mlt.vax.100k[, .(age, value = counts, vax, var2 = "per100k")])
kable(dt.vax.merged)
age value var2 vax
<50 1116834 N NOT_VAX
<50 0.233 percent NOT_VAX
>50 186078 N NOT_VAX
>50 0.079 percent NOT_VAX
<50 3501118 N FULL
<50 0.73 percent FULL
>50 2133516 N FULL
>50 0.904 percent FULL
<50 43 per100k NOT_VAX
>50 171 per100k NOT_VAX
<50 11 per100k FULL
>50 290 per100k FULL

We need to switch this to a wide format:

dt.vax.merged.wide = dcast(dt.vax.merged, age + vax ~ var2, fun.aggregate = max)
setnames(dt.vax.merged.wide, "per100k", "severe_cases")
dt.vax.merged.wide[, `:=` (N = as.numeric(N), severe_cases = as.numeric(severe_cases))]
kable(dt.vax.merged.wide)
age vax N severe_cases percent
<50 FULL 3501118 11 0.73
<50 NOT_VAX 1116834 43 0.233
>50 FULL 2133516 290 0.904
>50 NOT_VAX 186078 171 0.079

From here we can export the CSV

write.csv(dt.vax.merged.wide, 'vaccine_table.csv', row.names = F)

Health Questions

Question 1: Total Vaccine Population

The total population would be as follows:

paste(sum(as.integer(dt.vax.merged.wide$N)), "total individuals eligible for vaccination") %>% writeLines()
## 6937546 total individuals eligible for vaccination

This is the total number of people eligible for vaccination: this, in turn, depends on when the dataset was published. Assuming it was recent, this would be everyone ages 5 and up (excluding those with medical exemptions). The definition of fully vaccinated as changed as well--we could be only looking at individuals with two shots or considering fully boosted as our standard.

Israel's total population is 9 million so there's a discrepancy between this number and those eligible.

Question 2: Efficacy vs Disease

Effectiveness vs Disease for Totals

The formula we'd need is 1 - (severe cases per 100K for vaccinated individuals/severe cases per 100k for unvaccinated individuals). We'll start by summing the data so that the ages aren't stratified:

dt.vax.totals = dt.vax.merged.wide[,.(vax, N, severe_cases)][, lapply(.SD, sum, na.rm=TRUE), by=vax ]
dt.vax.totals[, age := 'TOTAL']
dt.vax.totals[, severe_per100k := (severe_cases/N)*100000]
kable(dt.vax.totals)
vax N severe_cases age severe_per100k
FULL 5634634 301 TOTAL 5.341962
NOT_VAX 1302912 214 TOTAL 16.424747

From there we can calulate the effectiveness

efficacy = 1-(dt.vax.totals[vax == "FULL", severe_per100k]/dt.vax.totals[vax == "NOT_VAX", severe_per100k])

paste0(round(efficacy * 100, 2), '%', " Vaccine Effectiveness vs. Severe disease") %>% writeLines()
## 67.48% Vaccine Effectiveness vs. Severe disease

Effectiveness vs Disease per age strata

To make sure we're thorough, we can calculate this metric per age group as well.

dt.vax.merged.wide[, severe_per100k:= (severe_cases/N)*100000]
efficacy.young = 1-(dt.vax.merged.wide[vax == "FULL" & age == '<50', severe_per100k]/dt.vax.merged.wide[vax == "NOT_VAX"  & age == '<50', severe_per100k])

efficacy.old = 1-(dt.vax.merged.wide[vax == "FULL" & age == '>50', severe_per100k]/dt.vax.merged.wide[vax == "NOT_VAX"  & age == '>50', severe_per100k])

paste0(round(efficacy.young * 100, 2), '%', " Vaccine Effectiveness vs. Severe disease (<50)") %>% writeLines()
## 91.84% Vaccine Effectiveness vs. Severe disease (<50)
paste0(round(efficacy.old * 100, 2), '%', " Vaccine Effectiveness vs. Severe disease (>50)") %>% writeLines()
## 85.21% Vaccine Effectiveness vs. Severe disease (>50)

Question 3: Severe Disease Case Rate (Unvaccinated vs Vaccinated)

For this metric, we'll need the number of severe cases per 100K for 1) unvaccinated people and 2) vaccinated people:

dt.vax.totals[vax == 'NOT_VAX', severe_per100k]
## [1] 16.42475
dt.vax.totals[vax == 'FULL', severe_per100k]
## [1] 5.341962

From here, we can calculate the severe disease case rate:

rate.severe = dt.vax.totals[vax == 'NOT_VAX', severe_per100k]/dt.vax.totals[vax == 'FULL', severe_per100k]
paste("The rate of severe disease is", round(rate.severe, 2), "times higher in unvaccinated individuals compared to those who are vaccinated") %>% writeLines()
## The rate of severe disease is 3.07 times higher in unvaccinated individuals compared to those who are vaccinated