Methods
- First we need to recreate the table of vaccination data
- create a data frame to populate with our vaccination data
- create column names based on the table provided
- create a row name called ‘Age’
# Empty, all-character data frame that mirrors the layout of the source
# vaccination table. Passing `row.names = "Age"` promotes the Age column to
# the row names, leaving five data columns and zero rows to be filled in.
covid_df <- data.frame(
  Age = character(),
  Population.Not.Vaxed = character(),
  Population.Fully.Vaxed = character(),
  Severe.Cases.Not.Vax.per.100K = character(),
  Severe.Cases.Fully.Vax.per.100K = character(),
  Efficacy.vs.severe.disease = character(),
  row.names = "Age"
)
- Populate the rows of our data frame, with a primary row based on age stratification
- Create a secondary row with the age percentage
- Blank values are represented with ‘NA’
# Fill the table one row at a time, addressing each row by its name.
# Every age band contributes two rows: the raw counts (e.g. "<50") and the
# matching population percentages (e.g. "<50%"). Cells that are blank in the
# source table are stored as NA; efficacy is left NA because it is computed
# later.
covid_df["<50", ] <- c("1,116,834", "3,501,118", "43", "11", NA)
covid_df["<50%", ] <- c("23.3%", "73.0%", NA, NA, NA)
covid_df[">50", ] <- c("186,078", "2,133,516", "171", "290", NA)
# The unvaccinated share carries exactly one percent sign, like the other
# percentage cells.
covid_df[">50%", ] <- c("7.9%", "90.4%", NA, NA, NA)

# Render the populated table.
covid_df %>%
  kbl() %>%
  kable_classic()
| | Population.Not.Vaxed | Population.Fully.Vaxed | Severe.Cases.Not.Vax.per.100K | Severe.Cases.Fully.Vax.per.100K | Efficacy.vs.severe.disease |
|---|---|---|---|---|---|
| <50 | 1,116,834 | 3,501,118 | 43 | 11 | NA |
| <50% | 23.3% | 73.0% | NA | NA | NA |
| >50 | 186,078 | 2,133,516 | 171 | 290 | NA |
| >50% | 7.9%% | 90.4% | NA | NA | NA |
- Save out our csv that we are going to commit to github
- We don’t want to run this again
#write.table(covid_df, "israeli_vaccination_data.csv", row.names=TRUE, sep = ",")
- Load csv from github with readr
# Raw GitHub URL of the CSV that was committed after the one-off export.
file_path <-
  "https://raw.githubusercontent.com/catfoodlover/Data607/main/israeli_vaccination_data.csv"

# Read the file with readr; the column-type message is silenced.
covid_df2 <- readr::read_csv(file = file_path, show_col_types = FALSE)
- readr saves the data as a tibble, which doesn’t support row names, so we need to fix our data structure
- convert to a data frame
- name the rows using the first column’s values
- save out a list of column names
- delete the first column
- split the last column which contains a comma separated list
- rename the columns with the saved list
- set all the strings ‘NA’ to NA
# Tibbles cannot carry row names, so switch to a plain data frame and use the
# values of the first column as the row names.
covid_df2 <- as.data.frame(covid_df2)
rownames(covid_df2) <- covid_df2[[1]]
## Warning: One or more parsing issues, see `problems()` for details

# Keep the header names before reshaping: dropping the first column and then
# splitting the last one in two leaves the same number of columns, so the
# saved names can be reapplied in order.
names_list <- colnames(covid_df2)
covid_df2[[1]] <- NULL
covid_df2 <- covid_df2 %>%
  separate(
    col = Efficacy.vs.severe.disease,
    into = c("temp1", "temp2"),
    sep = ","
  )
names(covid_df2) <- names_list

# Turn the literal text 'NA' into real missing values.
covid_df2[covid_df2 == 'NA'] <- NA
- We want to get the information in those secondary rows into columns
- use case_when to detect row names and put the age stratification into a new column called ‘age_group’
- group by age_group and move those population %s into a new column and use the ‘fill’ function to populate all the rows
- remove that no longer needed secondary row
# Collapse the two-rows-per-age-band layout into one row per age band.
#
# 1. Tag every row with its age band, derived from the row name ("<50" and
#    "<50%" both map to "<50", likewise for ">50").
# 2. Within each band, pull the percentage strings out of the population
#    columns. The pattern "[0-9.]+%" keeps a number followed by a single
#    percent sign, so a malformed cell such as "7.9%%" yields "7.9%"; cells
#    without a percentage (the raw counts) become NA.
# 3. The percentage row sits below the count row, so fill upwards to copy the
#    percentages onto the count row.
# 4. Drop the now-redundant percentage rows, identified by their missing
#    severe-case counts.
covid_df2 <- covid_df2 %>%
  mutate(
    age_group = case_when(
      str_detect(row.names(.), "<50") ~ "<50",
      str_detect(row.names(.), ">50") ~ ">50"
    )
  ) %>%
  group_by(age_group) %>%
  mutate(
    percent.not.vaxed = str_extract(Population.Not.Vaxed, "[0-9.]+%"),
    percent.vaxed = str_extract(Population.Fully.Vaxed, "[0-9.]+%")
  ) %>%
  fill(percent.not.vaxed, percent.vaxed, .direction = "up") %>%
  ungroup() %>%
  filter(!is.na(Severe.Cases.Not.Vax.per.100K))

# Render the reshaped table.
covid_df2 %>%
  kbl() %>%
  kable_classic()
| Population.Not.Vaxed | Population.Fully.Vaxed | Severe.Cases.Not.Vax.per.100K | Severe.Cases.Fully.Vax.per.100K | Efficacy.vs.severe.disease | age_group | percent.not.vaxed | percent.vaxed |
|---|---|---|---|---|---|---|---|
| 1,116,834 | 3,501,118 | 43 | 11 | NA | <50 | 23.3% | 73.0% |
| 186,078 | 2,133,516 | 171 | 290 | NA | >50 | 7.9%% | 90.4% |
- Now we can calculate our Efficacy vs severe disease
- we cast our severe cases per 100K to numeric and divide by 100K to get proportions (per-person rates)
- we calculate Efficacy as 1 - (% Severe cases vaccinated per 100K)/(% Severe cases not vaccinated per 100K)
# Efficacy against severe disease for each age band:
#   1 - (severe-case rate, fully vaccinated) / (severe-case rate, unvaccinated)
# The per-100K counts are stored as text, so each is cast to numeric and
# scaled by 100000 before the ratio is taken.
covid_df2 <- covid_df2 %>%
  mutate(
    Efficacy.vs.severe.disease = 1 - (
      (as.numeric(Severe.Cases.Fully.Vax.per.100K) / 100000) /
        (as.numeric(Severe.Cases.Not.Vax.per.100K) / 100000)
    )
  )