library(tidyr)
library(dplyr)##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(RCurl)##
## Attaching package: 'RCurl'
## The following object is masked from 'package:tidyr':
##
## complete
library(kableExtra)##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
Homework 4 - Tidying and Transforming Vaccination Data
The csv contained in my GitHub repository was generated using this website https://cloudconvert.com/xlsx-to-csv that converts .xlsx files into .csv files.
file_url <- 'https://raw.githubusercontent.com/nolivercuny/data607/master/homework4/israeli_vaccination_data_analysis_start.csv'
raw_data <- getURL(file_url)
data <- readr::read_csv(raw_data)## Rows: 6 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Age, Population %, Severe Cases, Efficacy
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Tidying the data
The Efficacy column was loaded incorrectly and contains a number of distinct values. Separating out using the separate function. The column names don’t matter at this point so setting them to (A,B,C)
data <- data %>% separate(Efficacy, sep = ",", into = c('A','B','C'))## Warning: One or more parsing issues, see `problems()` for details
Renaming the columns is very manual. The way the data was read in from the CSV it put the column names that spanned multiple subcolumns as the column names into my datafame. Then the sub column names were inserted as the first row.
My goal here was to combine the multi-column names (Population %, Severe Cases) with the descriptive and important sub-column names (Not Vax, Fully Vax). I set the dNames variable to the original names so when I overwrite the column name I can still use the original names.
dNames <- names(data)
#Pop % + Not Vax
names(data)[2] <- paste(dNames[2], data[1,2])
#Pop % + Fully Vax
names(data)[3] <- paste(dNames[2], data[1,3])
# Sever Cases + Not Vax per 100k
names(data)[4] <- paste(dNames[3],data[1,4], data[2,4])
# Sever Cases + Fully Vax per 100k
names(data)[5] <- paste(dNames[3],data[1,5], data[2,5])
# vs. severe disease
names(data)[6] <- paste('Efficacy',data[1,6])Remove the first two rows because they are no longer needed
data <- data %>% slice(-c(1,2))Now I need to take rows 2 and 4 and add them as the values of two new columns representing the population %
First fill the Age column downward
data <- data %>% fill(Age, .direction='down')Next split the data into two dataframes so they can be merged back together
pop_value_data <- data %>% slice(1,3)
names(pop_value_data)[2] = "Population Not Vax"
names(pop_value_data)[3] = "Population Fully Vax"
pop_percent_data <- data %>% slice(2,4)Last merge them together but only select the first three columns of the second dataframe to prevent duplicate columns
final_data <- full_join(pop_value_data, select(pop_percent_data,c(1,2,3)),by="Age")(1) Do you have enough information to calculate the total population? What does this total population represent?
Yes, the populations include both vaccinated and unvaccinated people both over and at or under the age of fifty.
total_population <- sum(as.numeric(gsub(",","",final_data$`Population Not Vax`))) +
sum(as.numeric(gsub(",","",final_data$`Population Fully Vax`)))
total_population## [1] 6937546
6,937,546
(2) Calculate the Efficacy vs. Disease; Explain your results.
#convert the columns to numeric
final_data$`Population Fully Vax`<- as.numeric(gsub(",","",final_data$`Population Fully Vax`))
final_data$`Population Not Vax`<- as.numeric(gsub(",","",final_data$`Population Not Vax`))
final_data$`Severe Cases Not Vax per 100K`<- as.numeric(final_data$`Severe Cases Not Vax per 100K`)
final_data$`Severe Cases Fully Vax per 100K`<- as.numeric(final_data$`Severe Cases Fully Vax per 100K`)final_data$`Efficacy vs. severe disease` <- 1 - (
(final_data$`Population Fully Vax` / 100000 ) * final_data$`Severe Cases Fully Vax per 100K` / final_data$`Population Fully Vax`) /
(
(final_data$`Population Not Vax` / 100000 ) * final_data$`Severe Cases Not Vax per 100K` / final_data$`Population Not Vax`) My results are wrong but I don’t have time to figure out why. I believe based on the data that this shows that the efficacy is high and higher for > 50 year olds than < 50 year olds.
kable(final_data,caption="israeli_vaccination_data_analysis",digits = 2, format = "html", row.names = TRUE)| Age | Population Not Vax | Population Fully Vax | Severe Cases Not Vax per 100K | Severe Cases Fully Vax per 100K | Efficacy vs. severe disease | Population % Not Vax | Population % Fully Vax | |
|---|---|---|---|---|---|---|---|---|
| 1 | <50 | 1116834 | 3501118 | 43 | 11 | 0.74 | 23.3% | 73.0% |
| 2 | >50 | 186078 | 2133516 | 171 | 290 | -0.70 | 7.9% | 90.4% |