Load packages

pacman::p_load(
  tidyverse,      # includes ggplot2 and other
  stringr,        # working with characters   
  scales,         # transform numbers
  ggrepel,        # smartly-placed labels
  gghighlight,    # highlight one part of plot
  RColorBrewer    # color scales
)

Import data

library(readr)
child_nutri_dat <- read_csv('C:/Users/thuyb/OneDrive - Universiteit Antwerpen/University of Antwerp/Semester 1/Data Management/Practice/Practical session/Practical session 5/data_visualisation_2023/2_ggplot_visualization_DMSS/Child nutrition dataset.csv')
head(child_nutri_dat)  # The first 6 rows of the linelist are displayed below
## # A tibble: 6 × 12
##      id   sex   age  year grade indigent province     cct weight height   bmi
##   <dbl> <dbl> <dbl> <dbl> <dbl>    <dbl> <chr>      <dbl>  <dbl>  <dbl> <dbl>
## 1     1     1    13  2017     8        2 Province 4     1     28   1.35  15.4
## 2     2     1    13  2018     7        1 Province 4     1     24   1.34  13.4
## 3     3     1    12  2019     7        1 Province 4     2     39   1.46  18.3
## 4     4     2    13  2018     7        1 Province 4     1     53   1.6   20.7
## 5     5     1    15  2016     9        2 Province 4     1     48   1.57  19.5
## 6     6     1    13  2018     7        2 Province 4     1     33   1.54  13.9
## # ℹ 1 more variable: HAZ <dbl>

Changing variable characteristics

##demographic and socio-cultural profiles
child_nutri_dat$id <- as.factor(child_nutri_dat$id)
child_nutri_dat$sex <- factor(child_nutri_dat$sex, levels = c(1,2), labels = c("Male", "Female"))
child_nutri_dat$age <- as.numeric(child_nutri_dat$age)
child_nutri_dat$year <- as.factor(child_nutri_dat$year)
child_nutri_dat$grade <- as.factor(child_nutri_dat$grade)

child_nutri_dat$indigent <- factor(child_nutri_dat$indigent, levels = c(1,2), labels = c("Non-indigenous", "Indigenous"))
child_nutri_dat$province <- as.factor(child_nutri_dat$province)

##Exposure variable
child_nutri_dat$cct <- factor(child_nutri_dat$cct, levels = c(1,2), labels= c("cct", "non-cct"))

#outcomes measured
class(child_nutri_dat$weight)
child_nutri_dat$weight <- as.numeric(child_nutri_dat$weight)
child_nutri_dat$height <- as.numeric(child_nutri_dat$height)

##calculated outcome measures or child nutritional status
child_nutri_dat$bmi <- as.numeric(child_nutri_dat$bmi)

### Height-for-age Z-scores and categorization
child_nutri_dat$HAZ <- as.numeric(child_nutri_dat$HAZ)

### Checking the variables after conditioning them.
summary(child_nutri_dat)
## [1] "numeric"
##        id           sex            age           year          grade     
##  5540   :   2   Male  :2537   Min.   : 3.900   2016:  69   5      : 701  
##  5541   :   2   Female:2526   1st Qu.: 7.020   2017: 354   6      : 699  
##  1      :   1   NA's  :  44   Median : 9.070   2018: 778   1      : 634  
##  2      :   1                 Mean   : 9.503   2019:3387   3      : 590  
##  3      :   1                 3rd Qu.:11.060   2020: 231   4      : 504  
##  4      :   1                 Max.   :19.000   2021: 276   (Other):1942  
##  (Other):5099                 NA's   :24       NA's:  12   NA's   :  37  
##            indigent          province         cct           weight     
##  Non-indigenous:4401   Province 1:3655   cct    :1812   Min.   :11.00  
##  Indigenous    : 701   Province 2: 425   non-cct:3098   1st Qu.:21.00  
##  NA's          :   5   Province 3: 456   NA's   : 197   Median :28.00  
##                        Province 4: 563                  Mean   :30.22  
##                        NA's      :   8                  3rd Qu.:38.00  
##                                                         Max.   :80.80  
##                                                         NA's   :163    
##      height           bmi             HAZ         
##  Min.   :0.970   Min.   : 7.03   Min.   :-8.8500  
##  1st Qu.:1.200   1st Qu.:14.37   1st Qu.:-1.3800  
##  Median :1.320   Median :15.91   Median :-0.4350  
##  Mean   :1.321   Mean   :16.71   Mean   :-0.3632  
##  3rd Qu.:1.440   3rd Qu.:18.67   3rd Qu.: 0.5625  
##  Max.   :1.900   Max.   :35.24   Max.   :48.7700  
##  NA's   :162     NA's   :167     NA's   :235

Making new categories for age and HAZ

Group children by age. Creating three age groups: children and adolescents

child_nutri_dat$age_category <- NULL

child_nutri_dat$age_category[child_nutri_dat$age >= 3 & child_nutri_dat$age < 5]            <-'aged below 5'
child_nutri_dat$age_category[child_nutri_dat$age >= 5 & child_nutri_dat$age < 11]           <- 'aged between 5 and 10'
child_nutri_dat$age_category[child_nutri_dat$age >= 11 & child_nutri_dat$age <= 19]         <- 'aged between 11 and 19'
child_nutri_dat$age_category[child_nutri_dat$age < 3]                                       <- 'exclude'
child_nutri_dat$age_category[child_nutri_dat$age > 19]                                      <- 'exclude'

table(child_nutri_dat$age_category, useNA = 'always')

Categorize the HAZ in children!

## Height-for-age categorization
child_nutri_dat$new_haz_cat <- NULL

child_nutri_dat$new_haz_cat[child_nutri_dat$HAZ < -2 & child_nutri_dat$HAZ >= -3]      <-'moderate stunting'
child_nutri_dat$new_haz_cat[child_nutri_dat$HAZ >= -2 & child_nutri_dat$HAZ < 6]       <-'normal'
child_nutri_dat$new_haz_cat[child_nutri_dat$HAZ < -3 & child_nutri_dat$HAZ >= -6]      <-'severe stunting'
child_nutri_dat$new_haz_cat[child_nutri_dat$HAZ < -6 & child_nutri_dat$HAZ > 6]        <-'outliers'

table(child_nutri_dat$new_haz_cat, useNA = 'always')
## 
## moderate stunting            normal   severe stunting              <NA> 
##               445              4248               165               249

1. Histograms and Bar plots”

Histograms 1

Visualize the number of children by age.

 + Since the number of children is a count or discrete and age is continuous, one way that I can plot them is by using a histogram! 
#Plot number of children by province
#Use histogram to plot
ggplot(data= child_nutri_dat, mapping= aes(x=age)) +
  geom_histogram(binwidth = 1,                # width of bins
    color = "dark red",                       # bin line color
    fill = "cornflowerblue")+
  labs(y= "number of children", x="Age")      # NEWLY ADDED: changing labels for the x and y-axis

### Bar chart 1 Creating a plot to visualize the proportion of children by age group

library(ggplot2)
ggplot(data= child_nutri_dat, mapping= aes(fill=child_nutri_dat$sex,x=child_nutri_dat$age_category)) +
    geom_bar(aes(x=age_category, y= after_stat(count/sum(count)*100)), alpha=1, width = 1, position="dodge") +
    labs(y= "proportion of children", x="age group") +
    scale_fill_manual(values = c("rosybrown", "purple"))+  # NEWLY ADDED: changing color fills based on your preference
    theme_classic() +                                      # NEWLY ADDED: adding a background theme
    guides(fill=guide_legend(title="Gender"))              # NEWLY ADDED: Changing legend's label/title

## NEWLY ADDED: Saving the graph in png format
ggsave("Proportion of children by age group.png", width = 11, height = 12)    

Bar chart 2

  • Remove NAs
## Remove NAs in a specific variable
library(tidyverse)
df0 <- child_nutri_dat %>% 
  filter(!is.na(age_category)) %>%                                         
  filter(!is.na(sex))

## Creating a barplot
ggplot(data= df0, mapping= aes(fill= sex,x=age_category)) +  
  geom_bar(aes(x=age_category, y= after_stat(count/sum(count)*100)), alpha=1, width = 1, position="dodge", color= "white") + # position: bar next to
  labs(y= "proportion of children", x="age group") +
  guides(fill=guide_legend(title="Gender")) +   
  theme_classic() +
  theme(axis.text.x = element_text(angle = 40, margin=margin(40)))     # NEWLY ADDED: adjust angles and margins of x-axis texts

### Bar chart 3 * Add Face wrap

## Remove NAs in a specific variable
df0 <- child_nutri_dat %>% 
  filter(!is.na(age_category)) %>%                                         
  filter(!is.na(sex)) 

## Creating a barplot
ggplot(data= df0, mapping= aes(fill= sex,x=age_category)) +  
  geom_bar(aes(x=age_category, y= after_stat(count/sum(count)*100)), alpha=1, width = 1, position="dodge", color= "white") +
  labs(y= "proportion of children", x="age group") +
  guides(fill=guide_legend(title="Gender")) +   
  theme_classic() +
  theme(axis.text.x = element_text(angle = 40, margin=margin(40))) +    # NEWLY ADDED: adjust angles and margins of x-axis texts
  facet_grid(~sex)                                                      #NEWLY ADDED: Splitting two categories into different plots

Histograms 2

  • Visualize the weight distribution of children and color it by sex using a histogram.
ggplot(data=df0, mapping = aes(fill=sex, x=weight)) + 
  geom_histogram(color="white", width = 0)+
  labs(x="Weight of children", y="number of children")+
  guides(fill=guide_legend(title="Gender"))+
  facet_grid(~sex)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 146 rows containing non-finite values (`stat_bin()`).

Bar chart 4

  • Plot (using a barplot) the proportion of children with moderate stunting and severe stunting distributed by grade levels.
# Write your answers here
df1 <-  child_nutri_dat %>% 
  filter(!is.na(grade))%>% 
  filter(!is.na(new_haz_cat)) %>% 
  filter(new_haz_cat=="moderate stunting"|new_haz_cat=="severe stunting")

ggplot(data=df1, mapping = aes(x=grade))+
  geom_bar(aes(fill=new_haz_cat, y=after_stat(count/sum(count)*100)))+
  labs(y="proportion of children", x="grade group")+
  theme_classic() +
  scale_fill_manual(values = c("blue", "orange")) +
  guides(fill=guide_legend(title="Adverse nutritional status"))

### Bar chart 5 * Plot the proportion of children by nutritional status, intervention and by age group (present the proportion of children with “aged between 11 and 19” and “aged between 5 and 10” only).

df2 <-  child_nutri_dat %>% 
  filter(!is.na(cct)) %>% 
  filter(!is.na(age_category))%>% 
  filter(!is.na(new_haz_cat)) %>%
  filter(!new_haz_cat=="normal") %>% 
  filter(!age_category=="aged below 5")

ggplot(data= df2, mapping= aes(x=new_haz_cat)) +
  geom_bar(aes(fill=cct, y= after_stat(count/sum(count)*100)), alpha=1, width = 1, position="dodge", color= "white") +
  labs(y= "proportion of children", x="Child nutritional status") +
  theme_classic() +
  scale_fill_manual(values = c("rosybrown", "purple", "coral", "orange")) +
  facet_wrap(~age_category) +
  guides(fill=guide_legend(title="Intervention"))

2. Scatterplots

Plot the height and weight of children

#Removing NAs
df0 <- child_nutri_dat %>% 
  filter(!is.na(weight)) %>%  
  filter(!is.na(sex)) %>%
  filter(!is.na(height))

#Statistic color for points and for line
ggplot(data = df0, mapping = aes(x = weight, y = height, ))+     
  geom_point(mapping= aes(color= sex)) +                                 ## NEW PLOT using GEOM_POINT function
  geom_vline(xintercept = 50, color = "dark grey") +
  labs(title = "Static color for points and line", y= "Height (meters)", x="Weight (kg)")+
    theme_classic() 

#Removing NAs
df0 <- child_nutri_dat %>% 
  filter(!is.na(weight)) %>%  
  filter(!is.na(sex)) %>%
  filter(!is.na(height))

#Statistic color for points and for line
ggplot(data = df0, mapping = aes(x = weight, y = height, color=weight, size= weight, ))+     
  geom_point(shape= "diamond", alpha =0.3, size= 2)+  #alpha transparency  
  geom_vline(xintercept = 50, color = "dark grey") +
  labs(title = "Static color for points and line", y= "Height (meters)", x="Weight (kg)") +
  theme_classic() +
viridis::scale_color_viridis()+ #change color
geom_smooth(method = "lm")

Scatterplot 1

  • Plot the HAZ and height of children using scatterplot and make a legend that shades them by HAZ of children
df0 <- child_nutri_dat %>% 
  filter(!is.na(age_category)) %>% 
  filter(!is.na(sex)) %>% 
  filter(!new_haz_cat=="outliers")

ggplot(data = df0, mapping = aes(x=HAZ, y=height, color=HAZ))+
  geom_point(shape="triangle", alpha=0.5, size=2)+
  labs(title="Plotting HAZ against height", x="HAZ of children", y="Heigh of children")+
  geom_smooth(method = "lm", size=0.5)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: The following aesthetics were dropped during statistical transformation: colour
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?

Scatterplot 2

  • Plot the height-for-age Zscores (HAZ) and the BMI of children, differentiate them by the child’s age category
df1 <- child_nutri_dat %>% 
  filter(!is.na(HAZ)) %>%
  filter(!is.na(bmi)) %>% 
  filter(!is.na(age_category))

ggplot(data = df1, mapping = aes(x=HAZ, y=bmi, color=age_category))+
  geom_point(alpha=0.5, size=1)+  
  labs(title="Plotting HAZ against BMI", x="HAZ of children", y="BMI of children")+
  geom_vline(xintercept = 0, color="black")+
  theme_classic()

### Scatterplot 3 * Plot the height-for-age Zscores (HAZ) and the weight of children, differentiate them by the new haz cat

df1 <- child_nutri_dat %>% 
  filter(!is.na(age_category)) %>%  
  filter(!is.na(sex)) %>% 
  filter(!is.na(new_haz_cat))

## Creating a scatterplot
ggplot(data = df1, mapping = aes(x = HAZ, y = weight, color=new_haz_cat))+     
  geom_point(size= 1, alpha= 0.5) +
  geom_vline(xintercept = -2, color = "dark grey") +
  labs(title = "Static color for points and line", y= "weight", x="Height-for-age Z-scores") +
  scale_fill_manual(values = c("rosybrown", "purple", "orange")) +
  theme_classic() 
## Warning: Removed 4 rows containing missing values (`geom_point()`).

3. Boxplots

Boxplots 1

  • Visualize the age distribution of male and female children by province.
df0 <- child_nutri_dat %>% 
  filter(!is.na(province)) %>%  
  filter(!is.na(sex))

ggplot(data = df0, aes(x = province, y = age, colour = province)) + 
  geom_boxplot()+ 
  facet_wrap(~sex, ncol = 4) +
  theme_gray () +                                                   # NEW THEME
  theme(axis.text.x = element_text(angle = 45, margin=margin(20)))       

Boxplot 2

  • Vidualize boxplot, the weight of the children by age category/group divide them by inclusion to CCT.
df1 <- child_nutri_dat %>% 
  filter(!is.na(age_category)) %>% 
  filter(!is.na(weight)) %>% 
  filter(!is.na(cct))

ggplot(df1, mapping = aes(x=weight, y=age_category, color=age_category))+
  geom_boxplot()+
  labs(title = "Weight distribution by age categories", y= "Age categories", x="Weight (kg)") +
  facet_wrap(~cct)+
  theme(axis.text.y = element_text(angle = 45, hjust = 1))

4. Density plot

#install these packages and run this rmarkdown.

library(ggridges)
library(viridis)
library(hrbrthemes)
library(forcats)
library(ggplot2)

Density plot 1

The figure below presents the age distribution of children by nutritional status and by sex.

df0 <- child_nutri_dat %>% 
  filter(!is.na(new_haz_cat)) %>%  
  filter(!is.na(cct))

ggplot(data=df0, mapping= aes(x=age, y=new_haz_cat, fill= sex)) + 
  geom_density_ridges(alpha=0.2)+ 
  labs(y= "", x="Age distribution of children") + 
  theme(legend.position= "right", panel.spacing = unit(0.5, "lines"), strip.text.x= element_text(size=12)) +
  guides(fill=guide_legend(title="Gender")) +
  theme_minimal() 
## Picking joint bandwidth of 0.94

Density plot 2

  • Plot the HAZ of children distributed by inclusion to CCT program and stratify them by province.
df1 <- child_nutri_dat %>% 
  filter(!is.na(HAZ)) %>% 
  filter(!is.na(cct)) %>% 
  filter(!is.na(province)) %>% 
  filter(!is.na(new_haz_cat))

ggplot(df1, aes(x=HAZ, y=province, fill=cct))+
  geom_density_ridges(alpha=0.2)+
  theme(legend.position= "right", # legend.position: position of the legend on the plot
        panel.spacing = unit(0.5, "lines"), # spacing between panels
        strip.text.x= element_text(size=12))+  # size of x-axis text
  guides(fill=guide_legend(title = "Intervention"))
## Picking joint bandwidth of 0.401