I will use the package “readxl” to read directly the .XLS file from my drive.
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v tibble 3.1.6 v purrr 0.3.4
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 2.1.1 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x mosaic::count() masks dplyr::count()
## x purrr::cross() masks mosaic::cross()
## x mosaic::do() masks dplyr::do()
## x tidyr::expand() masks Matrix::expand()
## x dplyr::filter() masks stats::filter()
## x ggstance::geom_errorbarh() masks ggplot2::geom_errorbarh()
## x dplyr::lag() masks stats::lag()
## x tidyr::pack() masks Matrix::pack()
## x mosaic::stat() masks ggplot2::stat()
## x mosaic::tally() masks dplyr::tally()
## x tidyr::unpack() masks Matrix::unpack()
library(readxl)
library(httr)
rm(list=ls())
I will use the GET function from httr package to read the file from Github and save it on a temporary folder on my windows folder. Then I will use the read_excel function from readxl package to read directly the .XLS file from the temporary folder
url1 <- "https://raw.githubusercontent.com/acatlin/data/master/israeli_vaccination_data_analysis_start.xlsx"
GET(url1, write_disk(tf <- tempfile(fileext = ".xlsx")))
## Response [https://raw.githubusercontent.com/acatlin/data/master/israeli_vaccination_data_analysis_start.xlsx]
## Date: 2022-03-10 21:00
## Status: 200
## Content-Type: application/octet-stream
## Size: 11.3 kB
## <ON DISK> C:\Users\jrfal\AppData\Local\Temp\RtmpaGuera\file36c04c335284.xlsx
my_excel <- read_excel(tf)
## New names:
## * `` -> ...3
## * `` -> ...5
my_excel
## # A tibble: 15 x 6
## Age `Population %` ...3 `Severe Cases` ...5 Efficacy
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 <NA> "Not Vax\r\n%" "Ful~ "Not Vax\r\np~ "Ful~ vs. sev~
## 2 <50 "1116834" "350~ "43" "11" <NA>
## 3 <NA> "0.2330000000~ "0.7~ <NA> <NA> <NA>
## 4 >50 "186078" "213~ "171" "290" <NA>
## 5 <NA> "7.9000000000~ "0.9~ <NA> <NA> <NA>
## 6 <NA> <NA> <NA> <NA> <NA> <NA>
## 7 <NA> <NA> <NA> <NA> <NA> <NA>
## 8 <NA> <NA> <NA> <NA> <NA> <NA>
## 9 Definitions <NA> <NA> <NA> <NA> <NA>
## 10 <NA> "Severe Cases~ <NA> <NA> <NA> <NA>
## 11 <NA> "Efficacy vs.~ <NA> <NA> <NA> <NA>
## 12 <NA> <NA> <NA> <NA> <NA> <NA>
## 13 (1) Do you have enough in~ <NA> <NA> <NA> <NA> <NA>
## 14 (2) Calculate the Efficac~ <NA> <NA> <NA> <NA> <NA>
## 15 (3) From your calculation~ <NA> <NA> <NA> <NA> <NA>
The tibble is absolutely messy. I will delete all rows and columns I dont need.
So I decided I only need rows 2,3,4,5 and the first 4 columns. Everyrhing else I will delete.
#Delete Rows I don't need.
my_excel2 <- my_excel %>% slice(-c(1,6:15))
my_excel2
## # A tibble: 4 x 6
## Age `Population %` ...3 `Severe Cases` ...5 Efficacy
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 <50 1116834 3501118 43 11 <NA>
## 2 <NA> 0.23300000000000001 0.73 <NA> <NA> <NA>
## 3 >50 186078 2133516 171 290 <NA>
## 4 <NA> 7.9000000000000001E-2 0.90400000000000003 <NA> <NA> <NA>
It looks much better now. We don’t need last column
my_excel2 <- my_excel2 %>%
select(-Efficacy)
my_excel2
## # A tibble: 4 x 5
## Age `Population %` ...3 `Severe Cases` ...5
## <chr> <chr> <chr> <chr> <chr>
## 1 <50 1116834 3501118 43 11
## 2 <NA> 0.23300000000000001 0.73 <NA> <NA>
## 3 >50 186078 2133516 171 290
## 4 <NA> 7.9000000000000001E-2 0.90400000000000003 <NA> <NA>
I don’t like the column names so I will change them to: age_group, pop_n_vax, pop_f_vax, severe_nvax, severe_vax
my_excel3 <- my_excel2 %>%
rename(
age_group = Age,
pop_n_vaxed = `Population %`,
pop_f_vaxed = ...3,
severe_nvax = `Severe Cases`,
severe_vax = ...5)
str(my_excel3)
## tibble [4 x 5] (S3: tbl_df/tbl/data.frame)
## $ age_group : chr [1:4] "<50" NA ">50" NA
## $ pop_n_vaxed: chr [1:4] "1116834" "0.23300000000000001" "186078" "7.9000000000000001E-2"
## $ pop_f_vaxed: chr [1:4] "3501118" "0.73" "2133516" "0.90400000000000003"
## $ severe_nvax: chr [1:4] "43" NA "171" NA
## $ severe_vax : chr [1:4] "11" NA "290" NA
Population numbers are now characters so I will convert them to numbers
cols.num <- c("pop_n_vaxed","pop_f_vaxed","severe_nvax","severe_vax")
my_excel3[,cols.num] <- sapply(my_excel3[,cols.num],as.numeric)
str(my_excel3)
## tibble [4 x 5] (S3: tbl_df/tbl/data.frame)
## $ age_group : chr [1:4] "<50" NA ">50" NA
## $ pop_n_vaxed: num [1:4] 1.12e+06 2.33e-01 1.86e+05 7.90e-02
## $ pop_f_vaxed: num [1:4] 3.50e+06 7.30e-01 2.13e+06 9.04e-01
## $ severe_nvax: num [1:4] 43 NA 171 NA
## $ severe_vax : num [1:4] 11 NA 290 NA
Let’s see how it looks
my_excel3
## # A tibble: 4 x 5
## age_group pop_n_vaxed pop_f_vaxed severe_nvax severe_vax
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 <50 1116834 3501118 43 11
## 2 <NA> 0.233 0.73 NA NA
## 3 >50 186078 2133516 171 290
## 4 <NA> 0.079 0.904 NA NA
we need to convert the absolute numbers of severe cases to cases / 100K
my_excel3[1,4]<- my_excel3[1,4] / (my_excel3[1,2]/100000)
my_excel3[1,5]<- my_excel3[1,5] / (my_excel3[1,3]/100000)
my_excel3[3,4]<- my_excel3[3,4] / (my_excel3[3,2]/100000)
my_excel3[3,5]<- my_excel3[3,5] / (my_excel3[3,3]/100000)
my_excel3
## # A tibble: 4 x 5
## age_group pop_n_vaxed pop_f_vaxed severe_nvax severe_vax
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 <50 1116834 3501118 3.85 0.314
## 2 <NA> 0.233 0.73 NA NA
## 3 >50 186078 2133516 91.9 13.6
## 4 <NA> 0.079 0.904 NA NA
Looks good!
One last step before I write to csv is to move the percentage in the two rows to the respective column
#First lets add two new empty columns
my_excel4 <- my_excel3 %>%
add_column(pct_f_vax = NA,
pct_n_vax = NA)
#now the pcts to the proper row and column
my_excel4[1,6]<- my_excel4[2,3]
my_excel4[1,7]<- my_excel4[2,2]
my_excel4[3,6]<- my_excel4[4,3]
my_excel4[3,7]<- my_excel4[4,2]
# Finally delete the two rows you don't need
my_excel4 <- my_excel4 %>% slice(-c(2,4))
my_excel4
## # A tibble: 2 x 7
## age_group pop_n_vaxed pop_f_vaxed severe_nvax severe_vax pct_f_vax pct_n_vax
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 <50 1116834 3501118 3.85 0.314 0.73 0.233
## 2 >50 186078 2133516 91.9 13.6 0.904 0.079
Looks good and ready to send to csv
write.csv(my_excel4,"covid.csv", row.names = FALSE)
Let’s read it back to see it all is ok
my_csv <- read_csv("covid.csv")
## Rows: 2 Columns: 7
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (1): age_group
## dbl (6): pop_n_vaxed, pop_f_vaxed, severe_nvax, severe_vax, pct_f_vax, pct_n...
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
my_csv
## # A tibble: 2 x 7
## age_group pop_n_vaxed pop_f_vaxed severe_nvax severe_vax pct_f_vax pct_n_vax
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 <50 1116834 3501118 3.85 0.314 0.73 0.233
## 2 >50 186078 2133516 91.9 13.6 0.904 0.079
PERFECT!
Do I have enough information to calculate the total population?
ANSWER
Yes. One important thing to note is that the total population in the table is not 100%. I am assuming there is a category “vaxed but not fully” which is not in the table. So in order to get full population we would need to do some basic algebra.
my_csv2 <- my_csv %>%
mutate(pct_s_vax = 1-(pct_f_vax+pct_n_vax),
pop_s_vax = (1-(pct_f_vax+pct_n_vax)) * pop_f_vaxed/pct_f_vax)
# Total population single vax by Age group
my_csv2[,c("age_group","pop_s_vax")]
## # A tibble: 2 x 2
## age_group pop_s_vax
## <chr> <dbl>
## 1 <50 177454.
## 2 >50 40121.
#Total pct of population single vax by age group
my_csv2[,c("age_group","pct_s_vax")]
## # A tibble: 2 x 2
## age_group pct_s_vax
## <chr> <dbl>
## 1 <50 0.0370
## 2 >50 0.0170
Now we have all we need to output full population
my_csv2 %>%
summarise(Age_group = age_group,
total = pop_n_vaxed+pop_f_vaxed+pop_s_vax,
pct = pct_f_vax+pct_n_vax+pct_s_vax
)
## # A tibble: 2 x 3
## Age_group total pct
## <chr> <dbl> <dbl>
## 1 <50 4795406. 1
## 2 >50 2359715. 1
Calculate the efficacy ratio
ANSWER
Now that we have our tibble lest add a column for efficacy. using the formula 1- RiskRatio where Risk Ratio is equal Risk_vaxed / Risk_unvaxed
my_csv3 <- my_csv2 %>%
mutate(efficacy = 1-(severe_vax/severe_nvax))
my_csv3$efficacy
## [1] 0.9183970 0.8520888
The results are as expected 0.92 efficacy rate for the population under 50 and 0.85 for the populations over 50.
my_csv3$severe_nvax / my_csv3$severe_vax
## [1] 12.254452 6.760814
We can see here that for population <50, the proportion is 12+ times greater to get a severe disease when not vaxinated. For population >50 the proportion is 6.7 times greater for a unvaccinated person to get severe disease vs a vaxinated person.
Can we compare total rates for vaccinated vs unvaccinated?
ANSWER
Yes, but we need to do some algebra again.
# we need to calculate first the total number of severe cases
my_csv3 %>%
mutate(total_severe_vax = pop_f_vaxed/100000*severe_vax,
total_severe_nvax = pop_n_vaxed/100000*severe_nvax) %>%
summarise(rate_vaxed = total_severe_vax / (sum(pop_f_vaxed) / 100000),
rate_nvaxed = total_severe_nvax / (sum(pop_n_vaxed) / 100000)) %>%
summarise_all(sum)
## # A tibble: 1 x 2
## rate_vaxed rate_nvaxed
## <dbl> <dbl>
## 1 5.34 16.4
We can see now that the blended rate for the population tells us that severe cases per 100K is 5.34 for vaccinated people vs 16.4 for unvaccinated.
Thanks!