Import Libraries
Cleaning the Vaccination Data
- Shift from Cross Tabulation to Long Format
- Additional Variables
Table Export
Health Questions

Import Libraries

library(data.table)
library(ggplot2)
library(readxl)
library(janitor)

## 
## Attaching package: 'janitor'

## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test

library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──

## ✓ tibble  3.1.2     ✓ dplyr   1.0.7
## ✓ tidyr   1.2.0     ✓ stringr 1.4.0
## ✓ readr   2.0.0     ✓ forcats 0.5.1
## ✓ purrr   0.3.4

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::between()   masks data.table::between()
## x dplyr::filter()    masks stats::filter()
## x dplyr::first()     masks data.table::first()
## x dplyr::lag()       masks stats::lag()
## x dplyr::last()      masks data.table::last()
## x purrr::transpose() masks data.table::transpose()

library(knitr)
dir.home = '~/git/CUNY.MDS/DATA607/'

Cleaning the Vaccination Data

This dataset is quite a mess since it's a cross tabulations. The first step will be using the readxl package to import everything and using data.table and janitor to clean it up.

dt.vaccine = read_xlsx(file.path(dir.home, 'israeli_vaccination_data_analysis_start.xlsx'))

## New names:
## * `` -> ...3
## * `` -> ...5

dt.vaccine = as.data.table(dt.vaccine[1:5, 1:5])
colnames(dt.vaccine) = unlist(dt.vaccine[1, ])
dt.vaccine = dt.vaccine[2:nrow(dt.vaccine), ]
dt.vaccine = dt.vaccine %>% clean_names()
writeLines(names(dt.vaccine))

## age
## not_vax_percent
## fully_vax_percent
## not_vax_per_100k_p
## fully_vax_per_100k

Shift from Cross Tabulation to Long Format

Since this is a 2x2 contingency table, the first step would be to change this to a long format:

dt.vaccine[, age.new := shift(age, 1L)]
dt.vaccine[!is.na(age), age := age]
dt.vaccine[!is.na(age.new), age := age.new][, age.new := NULL]

dt.vax.pct = dt.vaccine[, .(age, not_vax_percent, fully_vax_percent)]
dt.vax.pct = melt(dt.vax.pct, id.vars = "age", value.name = "total")
dt.vax.pct[, total := as.numeric(total)]
dt.vax.pct[as.numeric(total) <= 1, var2 := "percent"]
dt.vax.pct[total > 1, var2 := "N"]
dt.vax.pct[grepl("fully\\_vax", variable), vax := "FULL"]
dt.vax.pct[!grepl("fully\\_vax", variable), vax := "NOT_VAX"]

dt.vax.per.100k = dt.vaccine[, .(age, not_vax_per_100k_p, fully_vax_per_100k)][!is.na(not_vax_per_100k_p),]

kable(dt.vax.per.100k)

age	not_vax_per_100k_p	fully_vax_per_100k
<50	43	11
>50	171	290

Additional Variables

We'll need to create some additional variables to format the table as needed

mlt.vax.100k = melt(dt.vax.per.100k, id.vars = "age", value.name = "counts")

mlt.vax.100k[grepl("fully\\_vax", variable), vax := "FULL"]
mlt.vax.100k[!grepl("fully\\_vax", variable), vax := "NOT_VAX"]

Table Export

The format below allows for aggregation functions:

dt.vax.merged = rbind(dt.vax.pct[, .(age, value = total, var2, vax)], mlt.vax.100k[, .(age, value = counts, vax, var2 = "per100k")])
kable(dt.vax.merged)

age	value	var2	vax
<50	1116834	N	NOT_VAX
<50	0.233	percent	NOT_VAX
>50	186078	N	NOT_VAX
>50	0.079	percent	NOT_VAX
<50	3501118	N	FULL
<50	0.73	percent	FULL
>50	2133516	N	FULL
>50	0.904	percent	FULL
<50	43	per100k	NOT_VAX
>50	171	per100k	NOT_VAX
<50	11	per100k	FULL
>50	290	per100k	FULL

We need to switch this to a wide format:

dt.vax.merged.wide = dcast(dt.vax.merged, age + vax ~ var2, fun.aggregate = max)
setnames(dt.vax.merged.wide, "per100k", "severe_cases")
dt.vax.merged.wide[, `:=` (N = as.numeric(N), severe_cases = as.numeric(severe_cases))]
kable(dt.vax.merged.wide)

age	vax	N	severe_cases	percent
<50	FULL	3501118	11	0.73
<50	NOT_VAX	1116834	43	0.233
>50	FULL	2133516	290	0.904
>50	NOT_VAX	186078	171	0.079

From here we can export the CSV

write.csv(dt.vax.merged.wide, 'vaccine_table.csv', row.names = F)

Health Questions

Question 1: Total Vaccine Population

The total population would be as follows:

paste(sum(as.integer(dt.vax.merged.wide$N)), "total individuals eligible for vaccination") %>% writeLines()

## 6937546 total individuals eligible for vaccination

This is the total number of people eligible for vaccination: this, in turn, depends on when the dataset was published. Assuming it was recent, this would be everyone ages 5 and up (excluding those with medical exemptions). The definition of fully vaccinated as changed as well--we could be only looking at individuals with two shots or considering fully boosted as our standard.

Israel's total population is 9 million so there's a discrepancy between this number and those eligible.

Question 2: Efficacy vs Disease

Effectiveness vs Disease for Totals

The formula we'd need is 1 - (severe cases per 100K for vaccinated individuals/severe cases per 100k for unvaccinated individuals). We'll start by summing the data so that the ages aren't stratified:

dt.vax.totals = dt.vax.merged.wide[,.(vax, N, severe_cases)][, lapply(.SD, sum, na.rm=TRUE), by=vax ]
dt.vax.totals[, age := 'TOTAL']
dt.vax.totals[, severe_per100k := (severe_cases/N)*100000]
kable(dt.vax.totals)

vax	N	severe_cases	age	severe_per100k
FULL	5634634	301	TOTAL	5.341962
NOT_VAX	1302912	214	TOTAL	16.424747

From there we can calulate the effectiveness

efficacy = 1-(dt.vax.totals[vax == "FULL", severe_per100k]/dt.vax.totals[vax == "NOT_VAX", severe_per100k])

paste0(round(efficacy * 100, 2), '%', " Vaccine Effectiveness vs. Severe disease") %>% writeLines()

## 67.48% Vaccine Effectiveness vs. Severe disease

Effectiveness vs Disease per age strata

To make sure we're thorough, we can calculate this metric per age group as well.

dt.vax.merged.wide[, severe_per100k:= (severe_cases/N)*100000]
efficacy.young = 1-(dt.vax.merged.wide[vax == "FULL" & age == '<50', severe_per100k]/dt.vax.merged.wide[vax == "NOT_VAX"  & age == '<50', severe_per100k])

efficacy.old = 1-(dt.vax.merged.wide[vax == "FULL" & age == '>50', severe_per100k]/dt.vax.merged.wide[vax == "NOT_VAX"  & age == '>50', severe_per100k])

paste0(round(efficacy.young * 100, 2), '%', " Vaccine Effectiveness vs. Severe disease (<50)") %>% writeLines()

## 91.84% Vaccine Effectiveness vs. Severe disease (<50)

paste0(round(efficacy.old * 100, 2), '%', " Vaccine Effectiveness vs. Severe disease (>50)") %>% writeLines()

## 85.21% Vaccine Effectiveness vs. Severe disease (>50)

Question 3: Severe Disease Case Rate (Unvaccinated vs Vaccinated)

For this metric, we'll need the number of severe cases per 100K for 1) unvaccinated people and 2) vaccinated people:

dt.vax.totals[vax == 'NOT_VAX', severe_per100k]

## [1] 16.42475

dt.vax.totals[vax == 'FULL', severe_per100k]

## [1] 5.341962

From here, we can calculate the severe disease case rate:

rate.severe = dt.vax.totals[vax == 'NOT_VAX', severe_per100k]/dt.vax.totals[vax == 'FULL', severe_per100k]
paste("The rate of severe disease is", round(rate.severe, 2), "times higher in unvaccinated individuals compared to those who are vaccinated") %>% writeLines()

## The rate of severe disease is 3.07 times higher in unvaccinated individuals compared to those who are vaccinated

Assignment 5

Deepika Dilip

2/26/2022