library(data.table)
library(ggplot2)
library(readxl)
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ tibble 3.1.2 ✓ dplyr 1.0.7
## ✓ tidyr 1.2.0 ✓ stringr 1.4.0
## ✓ readr 2.0.0 ✓ forcats 0.5.1
## ✓ purrr 0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::between() masks data.table::between()
## x dplyr::filter() masks stats::filter()
## x dplyr::first() masks data.table::first()
## x dplyr::lag() masks stats::lag()
## x dplyr::last() masks data.table::last()
## x purrr::transpose() masks data.table::transpose()
library(knitr)
dir.home = '~/git/CUNY.MDS/DATA607/'
This dataset is quite a mess since it's a cross tabulations. The first step will be using the readxl package to import everything and using data.table and janitor to clean it up.
dt.vaccine = read_xlsx(file.path(dir.home, 'israeli_vaccination_data_analysis_start.xlsx'))
## New names:
## * `` -> ...3
## * `` -> ...5
dt.vaccine = as.data.table(dt.vaccine[1:5, 1:5])
colnames(dt.vaccine) = unlist(dt.vaccine[1, ])
dt.vaccine = dt.vaccine[2:nrow(dt.vaccine), ]
dt.vaccine = dt.vaccine %>% clean_names()
writeLines(names(dt.vaccine))
## age
## not_vax_percent
## fully_vax_percent
## not_vax_per_100k_p
## fully_vax_per_100k
Since this is a 2x2 contingency table, the first step would be to change this to a long format:
dt.vaccine[, age.new := shift(age, 1L)]
dt.vaccine[!is.na(age), age := age]
dt.vaccine[!is.na(age.new), age := age.new][, age.new := NULL]
dt.vax.pct = dt.vaccine[, .(age, not_vax_percent, fully_vax_percent)]
dt.vax.pct = melt(dt.vax.pct, id.vars = "age", value.name = "total")
dt.vax.pct[, total := as.numeric(total)]
dt.vax.pct[as.numeric(total) <= 1, var2 := "percent"]
dt.vax.pct[total > 1, var2 := "N"]
dt.vax.pct[grepl("fully\\_vax", variable), vax := "FULL"]
dt.vax.pct[!grepl("fully\\_vax", variable), vax := "NOT_VAX"]
dt.vax.per.100k = dt.vaccine[, .(age, not_vax_per_100k_p, fully_vax_per_100k)][!is.na(not_vax_per_100k_p),]
kable(dt.vax.per.100k)
| age | not_vax_per_100k_p | fully_vax_per_100k |
|---|---|---|
| <50 | 43 | 11 |
| >50 | 171 | 290 |
We'll need to create some additional variables to format the table as needed
mlt.vax.100k = melt(dt.vax.per.100k, id.vars = "age", value.name = "counts")
mlt.vax.100k[grepl("fully\\_vax", variable), vax := "FULL"]
mlt.vax.100k[!grepl("fully\\_vax", variable), vax := "NOT_VAX"]
The format below allows for aggregation functions:
dt.vax.merged = rbind(dt.vax.pct[, .(age, value = total, var2, vax)], mlt.vax.100k[, .(age, value = counts, vax, var2 = "per100k")])
kable(dt.vax.merged)
| age | value | var2 | vax |
|---|---|---|---|
| <50 | 1116834 | N | NOT_VAX |
| <50 | 0.233 | percent | NOT_VAX |
| >50 | 186078 | N | NOT_VAX |
| >50 | 0.079 | percent | NOT_VAX |
| <50 | 3501118 | N | FULL |
| <50 | 0.73 | percent | FULL |
| >50 | 2133516 | N | FULL |
| >50 | 0.904 | percent | FULL |
| <50 | 43 | per100k | NOT_VAX |
| >50 | 171 | per100k | NOT_VAX |
| <50 | 11 | per100k | FULL |
| >50 | 290 | per100k | FULL |
We need to switch this to a wide format:
dt.vax.merged.wide = dcast(dt.vax.merged, age + vax ~ var2, fun.aggregate = max)
setnames(dt.vax.merged.wide, "per100k", "severe_cases")
dt.vax.merged.wide[, `:=` (N = as.numeric(N), severe_cases = as.numeric(severe_cases))]
kable(dt.vax.merged.wide)
| age | vax | N | severe_cases | percent |
|---|---|---|---|---|
| <50 | FULL | 3501118 | 11 | 0.73 |
| <50 | NOT_VAX | 1116834 | 43 | 0.233 |
| >50 | FULL | 2133516 | 290 | 0.904 |
| >50 | NOT_VAX | 186078 | 171 | 0.079 |
From here we can export the CSV
write.csv(dt.vax.merged.wide, 'vaccine_table.csv', row.names = F)
The total population would be as follows:
paste(sum(as.integer(dt.vax.merged.wide$N)), "total individuals eligible for vaccination") %>% writeLines()
## 6937546 total individuals eligible for vaccination
This is the total number of people eligible for vaccination: this, in turn, depends on when the dataset was published. Assuming it was recent, this would be everyone ages 5 and up (excluding those with medical exemptions). The definition of fully vaccinated as changed as well--we could be only looking at individuals with two shots or considering fully boosted as our standard.
Israel's total population is 9 million so there's a discrepancy between this number and those eligible.
The formula we'd need is 1 - (severe cases per 100K for vaccinated individuals/severe cases per 100k for unvaccinated individuals). We'll start by summing the data so that the ages aren't stratified:
dt.vax.totals = dt.vax.merged.wide[,.(vax, N, severe_cases)][, lapply(.SD, sum, na.rm=TRUE), by=vax ]
dt.vax.totals[, age := 'TOTAL']
dt.vax.totals[, severe_per100k := (severe_cases/N)*100000]
kable(dt.vax.totals)
| vax | N | severe_cases | age | severe_per100k |
|---|---|---|---|---|
| FULL | 5634634 | 301 | TOTAL | 5.341962 |
| NOT_VAX | 1302912 | 214 | TOTAL | 16.424747 |
From there we can calulate the effectiveness
efficacy = 1-(dt.vax.totals[vax == "FULL", severe_per100k]/dt.vax.totals[vax == "NOT_VAX", severe_per100k])
paste0(round(efficacy * 100, 2), '%', " Vaccine Effectiveness vs. Severe disease") %>% writeLines()
## 67.48% Vaccine Effectiveness vs. Severe disease
To make sure we're thorough, we can calculate this metric per age group as well.
dt.vax.merged.wide[, severe_per100k:= (severe_cases/N)*100000]
efficacy.young = 1-(dt.vax.merged.wide[vax == "FULL" & age == '<50', severe_per100k]/dt.vax.merged.wide[vax == "NOT_VAX" & age == '<50', severe_per100k])
efficacy.old = 1-(dt.vax.merged.wide[vax == "FULL" & age == '>50', severe_per100k]/dt.vax.merged.wide[vax == "NOT_VAX" & age == '>50', severe_per100k])
paste0(round(efficacy.young * 100, 2), '%', " Vaccine Effectiveness vs. Severe disease (<50)") %>% writeLines()
## 91.84% Vaccine Effectiveness vs. Severe disease (<50)
paste0(round(efficacy.old * 100, 2), '%', " Vaccine Effectiveness vs. Severe disease (>50)") %>% writeLines()
## 85.21% Vaccine Effectiveness vs. Severe disease (>50)
For this metric, we'll need the number of severe cases per 100K for 1) unvaccinated people and 2) vaccinated people:
dt.vax.totals[vax == 'NOT_VAX', severe_per100k]
## [1] 16.42475
dt.vax.totals[vax == 'FULL', severe_per100k]
## [1] 5.341962
From here, we can calculate the severe disease case rate:
rate.severe = dt.vax.totals[vax == 'NOT_VAX', severe_per100k]/dt.vax.totals[vax == 'FULL', severe_per100k]
paste("The rate of severe disease is", round(rate.severe, 2), "times higher in unvaccinated individuals compared to those who are vaccinated") %>% writeLines()
## The rate of severe disease is 3.07 times higher in unvaccinated individuals compared to those who are vaccinated