Background:

WHO keeps records of various Vaccinations administered throughout the world and categorizes them on basis of countries, development status, type od vaccination, year, number of doses, total coverage percentage etc.The data set contains year-wise data of vaccination coverage worldwide.
Data Source:The Data has been taken from the WHO Immunization Dataset
Characteristics: The original data consists of 347243 observations with 11 variables and 9 distict categories in GROUP column
Variables: There are 4 numerical and 7 categorical variables

In order to reduce the data size to less than 5 MB, some rows has been ommitted using the below code

data_df<-data_df %>% drop_na()
unique(data_df["GROUP"])
data_df<-data_df[1:47286,] %>%
  filter(YEAR!=1984,YEAR!=1986, YEAR!=1991, YEAR!=2003, YEAR!=2007, YEAR!=2013, YEAR!=2018)

Data Analysis

Data Load

library(tidyverse)
library(readxl)
library(ggplot2)
library(knitr)
library(kableExtra)
# read data from size reduced file
data <- read_csv("Assignment1-Ankit-Vaccination Coverage Data Wrangling and Visualization.csv", 
                   col_names = TRUE)

Data Summary

str(data)
## spec_tbl_df [38,000 x 11] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ GROUP                        : chr [1:38000] "Countries" "Countries" "Countries" "Countries" ...
##  $ CODE                         : chr [1:38000] "ABW" "ABW" "ABW" "ABW" ...
##  $ NAME                         : chr [1:38000] "Aruba" "Aruba" "Aruba" "Aruba" ...
##  $ YEAR                         : num [1:38000] 2019 2019 2019 2019 2019 ...
##  $ ANTIGEN                      : chr [1:38000] "DIPHCV4" "DIPHCV5" "DIPHCV6" "DTPCV1" ...
##  $ ANTIGEN_DESCRIPTION          : chr [1:38000] "Diphtheria-containing vaccine, 4th dose (1st booster)" "Diphtheria-containing vaccine, 5th dose (2nd booster)" "Diphtheria-containing vaccine, 6th dose (3rd booster)" "DTP-containing vaccine, 1st dose" ...
##  $ COVERAGE_CATEGORY            : chr [1:38000] "ADMIN" "ADMIN" "ADMIN" "ADMIN" ...
##  $ COVERAGE_CATEGORY_DESCRIPTION: chr [1:38000] "Administrative coverage" "Administrative coverage" "Administrative coverage" "Administrative coverage" ...
##  $ TARGET_NUMBER                : num [1:38000] 1242 1448 1590 1181 1181 ...
##  $ DOSES                        : num [1:38000] 1140 1234 1462 1165 1132 ...
##  $ COVERAGE                     : num [1:38000] 92 85 92 99 96 94 96 63 99 95 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   GROUP = col_character(),
##   ..   CODE = col_character(),
##   ..   NAME = col_character(),
##   ..   YEAR = col_double(),
##   ..   ANTIGEN = col_character(),
##   ..   ANTIGEN_DESCRIPTION = col_character(),
##   ..   COVERAGE_CATEGORY = col_character(),
##   ..   COVERAGE_CATEGORY_DESCRIPTION = col_character(),
##   ..   TARGET_NUMBER = col_double(),
##   ..   DOSES = col_double(),
##   ..   COVERAGE = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

Tidy rule violation
As we can see in the table below, there are 2 distinct categories of data. This is in violation of tidy rule

Each variable must have its own column.

data_df <- as.data.frame(data)
kbl(cbind(data_df[1:10,], data_df[1:10,])) %>%
  kable_paper() %>%
  scroll_box(width = "100%", height = "200px")
GROUP CODE NAME YEAR ANTIGEN ANTIGEN_DESCRIPTION COVERAGE_CATEGORY COVERAGE_CATEGORY_DESCRIPTION TARGET_NUMBER DOSES COVERAGE GROUP CODE NAME YEAR ANTIGEN ANTIGEN_DESCRIPTION COVERAGE_CATEGORY COVERAGE_CATEGORY_DESCRIPTION TARGET_NUMBER DOSES COVERAGE
Countries ABW Aruba 2019 DIPHCV4 Diphtheria-containing vaccine, 4th dose (1st booster) ADMIN Administrative coverage 1242 1140 92 Countries ABW Aruba 2019 DIPHCV4 Diphtheria-containing vaccine, 4th dose (1st booster) ADMIN Administrative coverage 1242 1140 92
Countries ABW Aruba 2019 DIPHCV5 Diphtheria-containing vaccine, 5th dose (2nd booster) ADMIN Administrative coverage 1448 1234 85 Countries ABW Aruba 2019 DIPHCV5 Diphtheria-containing vaccine, 5th dose (2nd booster) ADMIN Administrative coverage 1448 1234 85
Countries ABW Aruba 2019 DIPHCV6 Diphtheria-containing vaccine, 6th dose (3rd booster) ADMIN Administrative coverage 1590 1462 92 Countries ABW Aruba 2019 DIPHCV6 Diphtheria-containing vaccine, 6th dose (3rd booster) ADMIN Administrative coverage 1590 1462 92
Countries ABW Aruba 2019 DTPCV1 DTP-containing vaccine, 1st dose ADMIN Administrative coverage 1181 1165 99 Countries ABW Aruba 2019 DTPCV1 DTP-containing vaccine, 1st dose ADMIN Administrative coverage 1181 1165 99
Countries ABW Aruba 2019 DTPCV3 DTP-containing vaccine, 3rd dose ADMIN Administrative coverage 1181 1132 96 Countries ABW Aruba 2019 DTPCV3 DTP-containing vaccine, 3rd dose ADMIN Administrative coverage 1181 1132 96
Countries ABW Aruba 2019 HEPB3 HepB3 ADMIN Administrative coverage 1181 1108 94 Countries ABW Aruba 2019 HEPB3 HepB3 ADMIN Administrative coverage 1181 1108 94
Countries ABW Aruba 2019 HIB3 Hib3 ADMIN Administrative coverage 1181 1132 96 Countries ABW Aruba 2019 HIB3 Hib3 ADMIN Administrative coverage 1181 1132 96
Countries ABW Aruba 2019 HPV_FEM HPV Female, final dose ADMIN Administrative coverage 815 517 63 Countries ABW Aruba 2019 HPV_FEM HPV Female, final dose ADMIN Administrative coverage 815 517 63
Countries ABW Aruba 2019 IPV1 Inactivated polio-containing vaccine, 1st dose ADMIN Administrative coverage 1181 1165 99 Countries ABW Aruba 2019 IPV1 Inactivated polio-containing vaccine, 1st dose ADMIN Administrative coverage 1181 1165 99
Countries ABW Aruba 2019 MCV1 Measles-containing vaccine, 1st dose ADMIN Administrative coverage 1242 1182 95 Countries ABW Aruba 2019 MCV1 Measles-containing vaccine, 1st dose ADMIN Administrative coverage 1242 1182 95
kbl(unique(data_df["GROUP"])) %>% 
  kable_styling(bootstrap_options = "striped", full_width = F, position = "left")
GROUP
1 Countries
36610 Development Status

Data Wrangling

As we can see from the table below, the table has seperate columns for each categories, which makes the data tidy

csv_data <- data_df %>% 
  select(-c(COVERAGE_CATEGORY_DESCRIPTION, ANTIGEN_DESCRIPTION)) 

csv_data<- csv_data %>% 
  mutate(GROUPING = str_to_upper(GROUP)) %>%
  select(-c(GROUP )) %>%
  pivot_wider(names_from = GROUPING, values_from = NAME)

kbl(cbind(csv_data[1:10,], csv_data[1:10,])) %>%
  kable_paper() %>%
  scroll_box(width = "100%", height = "200px")
CODE YEAR ANTIGEN COVERAGE_CATEGORY TARGET_NUMBER DOSES COVERAGE COUNTRIES DEVELOPMENT STATUS CODE YEAR ANTIGEN COVERAGE_CATEGORY TARGET_NUMBER DOSES COVERAGE COUNTRIES DEVELOPMENT STATUS
ABW 2019 DIPHCV4 ADMIN 1242 1140 92 Aruba NA ABW 2019 DIPHCV4 ADMIN 1242 1140 92 Aruba NA
ABW 2019 DIPHCV5 ADMIN 1448 1234 85 Aruba NA ABW 2019 DIPHCV5 ADMIN 1448 1234 85 Aruba NA
ABW 2019 DIPHCV6 ADMIN 1590 1462 92 Aruba NA ABW 2019 DIPHCV6 ADMIN 1590 1462 92 Aruba NA
ABW 2019 DTPCV1 ADMIN 1181 1165 99 Aruba NA ABW 2019 DTPCV1 ADMIN 1181 1165 99 Aruba NA
ABW 2019 DTPCV3 ADMIN 1181 1132 96 Aruba NA ABW 2019 DTPCV3 ADMIN 1181 1132 96 Aruba NA
ABW 2019 HEPB3 ADMIN 1181 1108 94 Aruba NA ABW 2019 HEPB3 ADMIN 1181 1108 94 Aruba NA
ABW 2019 HIB3 ADMIN 1181 1132 96 Aruba NA ABW 2019 HIB3 ADMIN 1181 1132 96 Aruba NA
ABW 2019 HPV_FEM ADMIN 815 517 63 Aruba NA ABW 2019 HPV_FEM ADMIN 815 517 63 Aruba NA
ABW 2019 IPV1 ADMIN 1181 1165 99 Aruba NA ABW 2019 IPV1 ADMIN 1181 1165 99 Aruba NA
ABW 2019 MCV1 ADMIN 1242 1182 95 Aruba NA ABW 2019 MCV1 ADMIN 1242 1182 95 Aruba NA
csv_data <- csv_data %>% 
  filter(TARGET_NUMBER != 0, DOSES != 0, COVERAGE_CATEGORY == "WUENIC") %>% 
  mutate(COVERAGE_PERCENTAGE =round((DOSES/TARGET_NUMBER)*100, digits=1)) %>%
  select(-c(COVERAGE))

Data Visualization

Vaccination Coverage Percentage for each year
We can observe from the below chart that there has been continous increase in the overall vaccination coverage world-wide. In the year 1980 where it was close to 30%, now in the year 2020 it came close 75%.

yearly_coverage_data<- csv_data %>%
  group_by(`YEAR`) %>% 
  summarise(COVERAGE_PERCENTAGE= mean(COVERAGE_PERCENTAGE))

ggplot(yearly_coverage_data, aes(x=YEAR, 
                                 y=COVERAGE_PERCENTAGE, 
                                 color=YEAR, 
                                 size=COVERAGE_PERCENTAGE))+
  geom_point(size=2)+
  geom_smooth() + 
  ylab("Overall Vaccination Coverage Percentage") + 
  xlab("Years") + 
  ggtitle("Vaccination Coverage Percentage for each year")

Vaccination Coverage Percentage with respect to Development Status
Although the overall vaccination coverage world-wide has been in continous uptrend, we can observe the below graph that the developing and leas developed economies of the world have lesser vaccination coverage in comparision to the developed and economies in transition.

development_status_coverage_data<- csv_data %>%
  group_by(`DEVELOPMENT STATUS`) %>% 
  summarise(COVERAGE_PERCENTAGE= mean(COVERAGE_PERCENTAGE)) %>% 
  filter(`DEVELOPMENT STATUS`!="")

ggplot(development_status_coverage_data, aes( x=`DEVELOPMENT STATUS`,
                                              y=COVERAGE_PERCENTAGE, 
                                              fill=`DEVELOPMENT STATUS`)) +
  geom_bar(stat='identity', width=.5) +
  scale_fill_hue(c = 40) + 
  ylab("Overall Vaccination Coverage Percentage") + 
  xlab("Development Status") + 
  ggtitle("Vaccination Coverage Percentage with respect to Development Status")