WHO keeps records of various Vaccinations administered throughout the world and categorizes them on basis of countries, development status, type od vaccination, year, number of doses, total coverage percentage etc.The data set contains year-wise data of vaccination coverage worldwide.
Data Source:The Data has been taken from the WHO Immunization Dataset
Characteristics: The original data consists of 347243 observations with 11 variables and 9 distict categories in GROUP column
Variables: There are 4 numerical and 7 categorical variables
In order to reduce the data size to less than 5 MB, some rows has been ommitted using the below code
data_df<-data_df %>% drop_na()
unique(data_df["GROUP"])
data_df<-data_df[1:47286,] %>%
filter(YEAR!=1984,YEAR!=1986, YEAR!=1991, YEAR!=2003, YEAR!=2007, YEAR!=2013, YEAR!=2018)
Data Load
library(tidyverse)
library(readxl)
library(ggplot2)
library(knitr)
library(kableExtra)
# read data from size reduced file
data <- read_csv("Assignment1-Ankit-Vaccination Coverage Data Wrangling and Visualization.csv",
col_names = TRUE)
Data Summary
str(data)
## spec_tbl_df [38,000 x 11] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ GROUP : chr [1:38000] "Countries" "Countries" "Countries" "Countries" ...
## $ CODE : chr [1:38000] "ABW" "ABW" "ABW" "ABW" ...
## $ NAME : chr [1:38000] "Aruba" "Aruba" "Aruba" "Aruba" ...
## $ YEAR : num [1:38000] 2019 2019 2019 2019 2019 ...
## $ ANTIGEN : chr [1:38000] "DIPHCV4" "DIPHCV5" "DIPHCV6" "DTPCV1" ...
## $ ANTIGEN_DESCRIPTION : chr [1:38000] "Diphtheria-containing vaccine, 4th dose (1st booster)" "Diphtheria-containing vaccine, 5th dose (2nd booster)" "Diphtheria-containing vaccine, 6th dose (3rd booster)" "DTP-containing vaccine, 1st dose" ...
## $ COVERAGE_CATEGORY : chr [1:38000] "ADMIN" "ADMIN" "ADMIN" "ADMIN" ...
## $ COVERAGE_CATEGORY_DESCRIPTION: chr [1:38000] "Administrative coverage" "Administrative coverage" "Administrative coverage" "Administrative coverage" ...
## $ TARGET_NUMBER : num [1:38000] 1242 1448 1590 1181 1181 ...
## $ DOSES : num [1:38000] 1140 1234 1462 1165 1132 ...
## $ COVERAGE : num [1:38000] 92 85 92 99 96 94 96 63 99 95 ...
## - attr(*, "spec")=
## .. cols(
## .. GROUP = col_character(),
## .. CODE = col_character(),
## .. NAME = col_character(),
## .. YEAR = col_double(),
## .. ANTIGEN = col_character(),
## .. ANTIGEN_DESCRIPTION = col_character(),
## .. COVERAGE_CATEGORY = col_character(),
## .. COVERAGE_CATEGORY_DESCRIPTION = col_character(),
## .. TARGET_NUMBER = col_double(),
## .. DOSES = col_double(),
## .. COVERAGE = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
Tidy rule violation
As we can see in the table below, there are 2 distinct categories of data. This is in violation of tidy rule
Each variable must have its own column.
data_df <- as.data.frame(data)
kbl(cbind(data_df[1:10,], data_df[1:10,])) %>%
kable_paper() %>%
scroll_box(width = "100%", height = "200px")
| GROUP | CODE | NAME | YEAR | ANTIGEN | ANTIGEN_DESCRIPTION | COVERAGE_CATEGORY | COVERAGE_CATEGORY_DESCRIPTION | TARGET_NUMBER | DOSES | COVERAGE | GROUP | CODE | NAME | YEAR | ANTIGEN | ANTIGEN_DESCRIPTION | COVERAGE_CATEGORY | COVERAGE_CATEGORY_DESCRIPTION | TARGET_NUMBER | DOSES | COVERAGE |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Countries | ABW | Aruba | 2019 | DIPHCV4 | Diphtheria-containing vaccine, 4th dose (1st booster) | ADMIN | Administrative coverage | 1242 | 1140 | 92 | Countries | ABW | Aruba | 2019 | DIPHCV4 | Diphtheria-containing vaccine, 4th dose (1st booster) | ADMIN | Administrative coverage | 1242 | 1140 | 92 |
| Countries | ABW | Aruba | 2019 | DIPHCV5 | Diphtheria-containing vaccine, 5th dose (2nd booster) | ADMIN | Administrative coverage | 1448 | 1234 | 85 | Countries | ABW | Aruba | 2019 | DIPHCV5 | Diphtheria-containing vaccine, 5th dose (2nd booster) | ADMIN | Administrative coverage | 1448 | 1234 | 85 |
| Countries | ABW | Aruba | 2019 | DIPHCV6 | Diphtheria-containing vaccine, 6th dose (3rd booster) | ADMIN | Administrative coverage | 1590 | 1462 | 92 | Countries | ABW | Aruba | 2019 | DIPHCV6 | Diphtheria-containing vaccine, 6th dose (3rd booster) | ADMIN | Administrative coverage | 1590 | 1462 | 92 |
| Countries | ABW | Aruba | 2019 | DTPCV1 | DTP-containing vaccine, 1st dose | ADMIN | Administrative coverage | 1181 | 1165 | 99 | Countries | ABW | Aruba | 2019 | DTPCV1 | DTP-containing vaccine, 1st dose | ADMIN | Administrative coverage | 1181 | 1165 | 99 |
| Countries | ABW | Aruba | 2019 | DTPCV3 | DTP-containing vaccine, 3rd dose | ADMIN | Administrative coverage | 1181 | 1132 | 96 | Countries | ABW | Aruba | 2019 | DTPCV3 | DTP-containing vaccine, 3rd dose | ADMIN | Administrative coverage | 1181 | 1132 | 96 |
| Countries | ABW | Aruba | 2019 | HEPB3 | HepB3 | ADMIN | Administrative coverage | 1181 | 1108 | 94 | Countries | ABW | Aruba | 2019 | HEPB3 | HepB3 | ADMIN | Administrative coverage | 1181 | 1108 | 94 |
| Countries | ABW | Aruba | 2019 | HIB3 | Hib3 | ADMIN | Administrative coverage | 1181 | 1132 | 96 | Countries | ABW | Aruba | 2019 | HIB3 | Hib3 | ADMIN | Administrative coverage | 1181 | 1132 | 96 |
| Countries | ABW | Aruba | 2019 | HPV_FEM | HPV Female, final dose | ADMIN | Administrative coverage | 815 | 517 | 63 | Countries | ABW | Aruba | 2019 | HPV_FEM | HPV Female, final dose | ADMIN | Administrative coverage | 815 | 517 | 63 |
| Countries | ABW | Aruba | 2019 | IPV1 | Inactivated polio-containing vaccine, 1st dose | ADMIN | Administrative coverage | 1181 | 1165 | 99 | Countries | ABW | Aruba | 2019 | IPV1 | Inactivated polio-containing vaccine, 1st dose | ADMIN | Administrative coverage | 1181 | 1165 | 99 |
| Countries | ABW | Aruba | 2019 | MCV1 | Measles-containing vaccine, 1st dose | ADMIN | Administrative coverage | 1242 | 1182 | 95 | Countries | ABW | Aruba | 2019 | MCV1 | Measles-containing vaccine, 1st dose | ADMIN | Administrative coverage | 1242 | 1182 | 95 |
kbl(unique(data_df["GROUP"])) %>%
kable_styling(bootstrap_options = "striped", full_width = F, position = "left")
| GROUP | |
|---|---|
| 1 | Countries |
| 36610 | Development Status |
As we can see from the table below, the table has seperate columns for each categories, which makes the data tidy
csv_data <- data_df %>%
select(-c(COVERAGE_CATEGORY_DESCRIPTION, ANTIGEN_DESCRIPTION))
csv_data<- csv_data %>%
mutate(GROUPING = str_to_upper(GROUP)) %>%
select(-c(GROUP )) %>%
pivot_wider(names_from = GROUPING, values_from = NAME)
kbl(cbind(csv_data[1:10,], csv_data[1:10,])) %>%
kable_paper() %>%
scroll_box(width = "100%", height = "200px")
| CODE | YEAR | ANTIGEN | COVERAGE_CATEGORY | TARGET_NUMBER | DOSES | COVERAGE | COUNTRIES | DEVELOPMENT STATUS | CODE | YEAR | ANTIGEN | COVERAGE_CATEGORY | TARGET_NUMBER | DOSES | COVERAGE | COUNTRIES | DEVELOPMENT STATUS |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ABW | 2019 | DIPHCV4 | ADMIN | 1242 | 1140 | 92 | Aruba | NA | ABW | 2019 | DIPHCV4 | ADMIN | 1242 | 1140 | 92 | Aruba | NA |
| ABW | 2019 | DIPHCV5 | ADMIN | 1448 | 1234 | 85 | Aruba | NA | ABW | 2019 | DIPHCV5 | ADMIN | 1448 | 1234 | 85 | Aruba | NA |
| ABW | 2019 | DIPHCV6 | ADMIN | 1590 | 1462 | 92 | Aruba | NA | ABW | 2019 | DIPHCV6 | ADMIN | 1590 | 1462 | 92 | Aruba | NA |
| ABW | 2019 | DTPCV1 | ADMIN | 1181 | 1165 | 99 | Aruba | NA | ABW | 2019 | DTPCV1 | ADMIN | 1181 | 1165 | 99 | Aruba | NA |
| ABW | 2019 | DTPCV3 | ADMIN | 1181 | 1132 | 96 | Aruba | NA | ABW | 2019 | DTPCV3 | ADMIN | 1181 | 1132 | 96 | Aruba | NA |
| ABW | 2019 | HEPB3 | ADMIN | 1181 | 1108 | 94 | Aruba | NA | ABW | 2019 | HEPB3 | ADMIN | 1181 | 1108 | 94 | Aruba | NA |
| ABW | 2019 | HIB3 | ADMIN | 1181 | 1132 | 96 | Aruba | NA | ABW | 2019 | HIB3 | ADMIN | 1181 | 1132 | 96 | Aruba | NA |
| ABW | 2019 | HPV_FEM | ADMIN | 815 | 517 | 63 | Aruba | NA | ABW | 2019 | HPV_FEM | ADMIN | 815 | 517 | 63 | Aruba | NA |
| ABW | 2019 | IPV1 | ADMIN | 1181 | 1165 | 99 | Aruba | NA | ABW | 2019 | IPV1 | ADMIN | 1181 | 1165 | 99 | Aruba | NA |
| ABW | 2019 | MCV1 | ADMIN | 1242 | 1182 | 95 | Aruba | NA | ABW | 2019 | MCV1 | ADMIN | 1242 | 1182 | 95 | Aruba | NA |
csv_data <- csv_data %>%
filter(TARGET_NUMBER != 0, DOSES != 0, COVERAGE_CATEGORY == "WUENIC") %>%
mutate(COVERAGE_PERCENTAGE =round((DOSES/TARGET_NUMBER)*100, digits=1)) %>%
select(-c(COVERAGE))
Vaccination Coverage Percentage for each year
We can observe from the below chart that there has been continous increase in the overall vaccination coverage world-wide. In the year 1980 where it was close to 30%, now in the year 2020 it came close 75%.
yearly_coverage_data<- csv_data %>%
group_by(`YEAR`) %>%
summarise(COVERAGE_PERCENTAGE= mean(COVERAGE_PERCENTAGE))
ggplot(yearly_coverage_data, aes(x=YEAR,
y=COVERAGE_PERCENTAGE,
color=YEAR,
size=COVERAGE_PERCENTAGE))+
geom_point(size=2)+
geom_smooth() +
ylab("Overall Vaccination Coverage Percentage") +
xlab("Years") +
ggtitle("Vaccination Coverage Percentage for each year")
Vaccination Coverage Percentage with respect to Development Status
Although the overall vaccination coverage world-wide has been in continous uptrend, we can observe the below graph that the developing and leas developed economies of the world have lesser vaccination coverage in comparision to the developed and economies in transition.
development_status_coverage_data<- csv_data %>%
group_by(`DEVELOPMENT STATUS`) %>%
summarise(COVERAGE_PERCENTAGE= mean(COVERAGE_PERCENTAGE)) %>%
filter(`DEVELOPMENT STATUS`!="")
ggplot(development_status_coverage_data, aes( x=`DEVELOPMENT STATUS`,
y=COVERAGE_PERCENTAGE,
fill=`DEVELOPMENT STATUS`)) +
geom_bar(stat='identity', width=.5) +
scale_fill_hue(c = 40) +
ylab("Overall Vaccination Coverage Percentage") +
xlab("Development Status") +
ggtitle("Vaccination Coverage Percentage with respect to Development Status")