library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## I cleaned up the CSV file a little bit before I read it into R, now I will attempt to tidy up the data
## Load the pre-cleaned vaccination table straight from GitHub. read.csv()
## already returns a data.frame, so no as.data.frame() conversion is needed.
Israeli_data <- read.csv(
  "https://raw.githubusercontent.com/AldataSci/Data607Assignment5/main/Israeli_Vaccination_Data_Analysis.csv",
  stringsAsFactors = FALSE
)
## Preview the first rows to see the raw layout before tidying.
head(Israeli_data)
## Age Population.. X Severe.Cases X.1
## 1 Not Vax\n% Fully Vax\n% Not Vax\nper 100K\n\n\np Fully Vax\nper 100K
## 2 <50 1,116,834 3,501,118 43 11
## 3 23.3% 73.0%
## 4 >50 186,078 2,133,516 171 290
## 5 7.9% 90.4%
## Efficacy
## 1 vs. severe disease
## 2
## 3
## 4
## 5
## Step 1: Use tidyr and dplyr to clean up the data, since it looks messy and unorganized.
## Unvaccinated half of the table: age group, population and severe cases.
Unvax <- Israeli_data %>%
  select(Age, Population.., Severe.Cases)
## Raw rows 2 and 4 hold the counts for the two age groups. Percent is the
## unvaccinated share shown in raw rows 3 and 5 (23.3% and 7.9%), entered as
## one numeric vector instead of filling one value and patching a cell by
## position afterwards.
Unvaxx <- Unvax %>%
  slice(c(2, 4)) %>%
  mutate(Percent = c(23.3, 7.9))
NotvaxLabel <- c("Not Vaccinated", "Not Vaccinated")
## Tag both rows with their vaccination status so they stay identifiable
## once combined with other rows.
Clean_Unvaxx <- Unvaxx %>%
  mutate(Vaccine_Status = NotvaxLabel)
Clean_Unvaxx
## Age Population.. Severe.Cases Percent Vaccine_Status
## 1 <50 1,116,834 43 23.3 Not Vaccinated
## 2 >50 186,078 171 7.9 Not Vaccinated
## I use tidyr and dplyr to clean up the vaccinated data in the same way; afterwards I join the two tables neatly.
## Vaccinated half of the table: X and X.1 are the vaccinated population and
## severe-case columns of the raw data.
Vaxx <- Israeli_data %>%
  select(Age, X, X.1)
## Keep raw rows 2 and 4 (the count rows for each age group).
Vax <- Vaxx %>%
  slice(c(2, 4))
## Percent is numeric here so it has the same type as in Clean_Unvaxx;
## combining a character Percent with a numeric one would turn the whole
## column into text.
Clean_Vaccinated <- Vax %>%
  mutate(
    Percent = c(73.0, 90.4),
    Vaccine_Status = c("Vaccinated", "Vaccinated")
  ) %>%
  rename(Population.. = X, Severe.Cases = X.1)
Clean_Vaccinated
## Age Population.. Severe.Cases Percent Vaccine_Status
## 1 <50 3,501,118 11 73.0 Vaccinated
## 2 >50 2,133,516 290 90.4 Vaccinated
## I used rbind() to combine the two data tables, then arranged the rows by age to make the result more presentable.
## Stack the unvaccinated and vaccinated rows into one table and sort it by
## age group so both statuses for an age band sit next to each other.
Cleaned_Data <- arrange(rbind(Clean_Unvaxx, Clean_Vaccinated), Age)
Cleaned_Data
## Age Population.. Severe.Cases Percent Vaccine_Status
## 1 <50 1,116,834 43 23.3 Not Vaccinated
## 2 <50 3,501,118 11 73.0 Vaccinated
## 3 >50 186,078 171 7.9 Not Vaccinated
## 4 >50 2,133,516 290 90.4 Vaccinated
The chart does not give the total population directly. The vaccination percentages in the table account for 96.3 percent of people younger than 50 (23.3% + 73.0%) and 98.3 percent of people older than 50 (7.9% + 90.4%). To estimate the true total population, I scale each age group's recorded total up to 100 percent and add the two results together. Below I calculate both the totals recorded in the chart and the scaled-up totals. The estimated true total population is 7155090, while the chart records only 6937546.
## To calculate the total population, I first keep only the rows where age is less than 50, then remove the commas from the population strings so they can be converted to integers and summed
## Rows for the under-50 age group.
Young <- filter(Cleaned_Data, Age == "<50")
## Population values are strings such as "1,116,834"; drop the thousands
## separators so they can be parsed as integers.
young_num <- gsub(",", "", Young[["Population.."]])
Sum <- sum(as.integer(young_num))
sprintf(
  "The total population for people younger than 50 is approximately %s. which is 96.3 percent of the population",
  Sum
)
## [1] "The total population for people younger than 50 is approximately 4617952. which is 96.3 percent of the population"
## Here I estimate the true total with a simple proportion: scale the recorded total so that it represents 100% of the population
## Scale the recorded under-50 total (treated as 96.3% of the group) to 100%.
True_ysum <- Sum * 100 / 96.3
sprintf(
  "The true total population for people younger than 50 is %s which is 100 percent of the population",
  round(True_ysum)
)
## [1] "The true total population for people younger than 50 is 4795381 which is 100 percent of the population"
## Same calculation, this time keeping only the rows for people older than 50
## Rows for the over-50 age group.
Old <- filter(Cleaned_Data, Age == ">50")
## Strip the thousands separators before parsing the population strings.
old_num <- gsub(",", "", Old[["Population.."]])
old_sum <- sum(as.integer(old_num))
sprintf(
  "The total population for people older than 50 is approximately %s which is 98.3 percent of the population",
  old_sum
)
## [1] "The total population for people older than 50 is approximately 2319594 which is 98.3 percent of the population"
## Scale the recorded over-50 total (treated as 98.3% of the group) to 100%.
True_odsum <- old_sum * 100 / 98.3
sprintf(
  "The true total population for people older than 50 is %s which is 100 percent of the population",
  round(True_odsum)
)
## [1] "The true total population for people older than 50 is 2359709 which is 100 percent of the population"
## I calculated the total population regardless of age and vaccination status
## Recorded population across both age groups (sum of the two table totals).
Total <- old_sum + Sum
sprintf(
  "The total population that was recorded is: %s",
  Total
)
## [1] "The total population that was recorded is: 6937546"
## Here is the true total population
## Estimated full population: the two scaled-up age-group totals combined.
True_total <- True_ysum + True_odsum
sprintf(
  "The true total population is: %s",
  round(True_total)
)
## [1] "The true total population is: 7155090"
I calculated the difference between the population recorded in the chart and the estimated total population of Israel and got a difference of 217544. I believe these people were left out of the table for a few reasons. One possibility is that the table does not capture the entire population under 50, since the vaccines were not authorized for children younger than 12. There may also have been people who received a first dose but not the second, so they were not counted as fully “vaccinated” in the data.
## Gap between the estimated full population and the population the table
## actually records.
Difference <- True_total - Total
sprintf(
  "The difference in population numbers is %s",
  round(Difference)
)
## [1] "The difference in population numbers is 217544"
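I got -0.4065. I wonder why the value is negative: does this mean that COVID vaccines are not effective at protecting patients from severe cases? This is surprising, since vaccines are generally expected to reduce cases. Note that this figure compares raw counts of severe cases and does not adjust for the very different sizes of the vaccinated and unvaccinated populations.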
## Total severe cases per vaccination status. Rows are selected by
## Vaccine_Status rather than by matching the literal case counts, so the
## selection still works if the counts in the data change.
Fully_vax_Severe <- Cleaned_Data %>%
  filter(Vaccine_Status == "Vaccinated") %>%
  summarise(sum(as.integer(Severe.Cases)))
Un_Vax_Severe <- Cleaned_Data %>%
  filter(Vaccine_Status == "Not Vaccinated") %>%
  summarise(sum(as.integer(Severe.Cases)))
## NOTE(review): this compares raw severe-case counts; it does not divide by
## the size of each group's population. Confirm whether a per-capita rate
## was intended for the efficacy figure.
formula <- 1 - (Fully_vax_Severe / Un_Vax_Severe)
sprintf("The efficacy vs disease is: %s", round(formula, digits = 4))
## [1] "The efficacy vs disease is: -0.4065"
After computing each group's share of all severe cases as a percentage, I found that vaccinated individuals account for a higher percentage of severe cases than unvaccinated individuals.
## The share of severe cases among unvaccinated individuals is the number of severe cases in unvaccinated individuals divided by the total number of severe cases
## Total_pop is the combined number of severe cases in both groups (not a
## head count of people); it is the denominator for each group's share.
Total_pop <- Un_Vax_Severe + Fully_vax_Severe
Unvac_Percentage <- 100 * (Un_Vax_Severe / Total_pop)
sprintf(
  "The rate of severe cases in unvaccinated individuals is approximately %s percent",
  round(Unvac_Percentage, digits = 1)
)
## [1] "The rate of severe cases in unvaccinated individuals is approximately 41.6 percent"
## The share of severe cases among vaccinated individuals is the number of severe cases in vaccinated individuals divided by the total number of severe cases
## Vaccinated group's share of all severe cases, as a percentage.
Vac_Percentage <- 100 * (Fully_vax_Severe / Total_pop)
sprintf(
  "The rate of severe cases in vaccinated individuals is approximately %s percent",
  round(Vac_Percentage, digits = 1)
)
## [1] "The rate of severe cases in vaccinated individuals is approximately 58.4 percent"
This data set was really interesting, and I found it surprising that the data was incomplete, since the chart didn’t capture 100 percent of the people in Israel. It was also surprising that I got a negative value when calculating the efficacy vs. severe disease; it seems older folks who got the vaccine had a higher percentage of severe cases than their unvaccinated counterparts. Even when calculating the rates, vaccinated individuals had a higher rate than unvaccinated individuals.