pacman::p_load(
tidyverse, # includes ggplot2 and other
stringr, # working with characters
scales, # transform numbers
ggrepel, # smartly-placed labels
gghighlight, # highlight one part of plot
RColorBrewer # color scales
)
library(readr)
child_nutri_dat <- read_csv('C:/Users/thuyb/OneDrive - Universiteit Antwerpen/University of Antwerp/Semester 1/Data Management/Practice/Practical session/Practical session 5/data_visualisation_2023/2_ggplot_visualization_DMSS/Child nutrition dataset.csv')
head(child_nutri_dat) # The first 6 rows of the linelist are displayed below
## # A tibble: 6 × 12
## id sex age year grade indigent province cct weight height bmi
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 1 1 13 2017 8 2 Province 4 1 28 1.35 15.4
## 2 2 1 13 2018 7 1 Province 4 1 24 1.34 13.4
## 3 3 1 12 2019 7 1 Province 4 2 39 1.46 18.3
## 4 4 2 13 2018 7 1 Province 4 1 53 1.6 20.7
## 5 5 1 15 2016 9 2 Province 4 1 48 1.57 19.5
## 6 6 1 13 2018 7 2 Province 4 1 33 1.54 13.9
## # ℹ 1 more variable: HAZ <dbl>
##demographic and socio-cultural profiles
child_nutri_dat$id <- as.factor(child_nutri_dat$id)
child_nutri_dat$sex <- factor(child_nutri_dat$sex, levels = c(1,2), labels = c("Male", "Female"))
child_nutri_dat$age <- as.numeric(child_nutri_dat$age)
child_nutri_dat$year <- as.factor(child_nutri_dat$year)
child_nutri_dat$grade <- as.factor(child_nutri_dat$grade)
child_nutri_dat$indigent <- factor(child_nutri_dat$indigent, levels = c(1,2), labels = c("Non-indigenous", "Indigenous"))
child_nutri_dat$province <- as.factor(child_nutri_dat$province)
##Exposure variable
child_nutri_dat$cct <- factor(child_nutri_dat$cct, levels = c(1,2), labels= c("cct", "non-cct"))
#outcomes measured
class(child_nutri_dat$weight)
child_nutri_dat$weight <- as.numeric(child_nutri_dat$weight)
child_nutri_dat$height <- as.numeric(child_nutri_dat$height)
##calculated outcome measures or child nutritional status
child_nutri_dat$bmi <- as.numeric(child_nutri_dat$bmi)
### Height-for-age Z-scores and categorization
child_nutri_dat$HAZ <- as.numeric(child_nutri_dat$HAZ)
### Checking the variables after conditioning them.
summary(child_nutri_dat)
## [1] "numeric"
## id sex age year grade
## 5540 : 2 Male :2537 Min. : 3.900 2016: 69 5 : 701
## 5541 : 2 Female:2526 1st Qu.: 7.020 2017: 354 6 : 699
## 1 : 1 NA's : 44 Median : 9.070 2018: 778 1 : 634
## 2 : 1 Mean : 9.503 2019:3387 3 : 590
## 3 : 1 3rd Qu.:11.060 2020: 231 4 : 504
## 4 : 1 Max. :19.000 2021: 276 (Other):1942
## (Other):5099 NA's :24 NA's: 12 NA's : 37
## indigent province cct weight
## Non-indigenous:4401 Province 1:3655 cct :1812 Min. :11.00
## Indigenous : 701 Province 2: 425 non-cct:3098 1st Qu.:21.00
## NA's : 5 Province 3: 456 NA's : 197 Median :28.00
## Province 4: 563 Mean :30.22
## NA's : 8 3rd Qu.:38.00
## Max. :80.80
## NA's :163
## height bmi HAZ
## Min. :0.970 Min. : 7.03 Min. :-8.8500
## 1st Qu.:1.200 1st Qu.:14.37 1st Qu.:-1.3800
## Median :1.320 Median :15.91 Median :-0.4350
## Mean :1.321 Mean :16.71 Mean :-0.3632
## 3rd Qu.:1.440 3rd Qu.:18.67 3rd Qu.: 0.5625
## Max. :1.900 Max. :35.24 Max. :48.7700
## NA's :162 NA's :167 NA's :235
Making new categories for age and HAZ
Group children by age. Creating three age groups: children and adolescents
child_nutri_dat$age_category <- NULL
child_nutri_dat$age_category[child_nutri_dat$age >= 3 & child_nutri_dat$age < 5] <-'aged below 5'
child_nutri_dat$age_category[child_nutri_dat$age >= 5 & child_nutri_dat$age < 11] <- 'aged between 5 and 10'
child_nutri_dat$age_category[child_nutri_dat$age >= 11 & child_nutri_dat$age <= 19] <- 'aged between 11 and 19'
child_nutri_dat$age_category[child_nutri_dat$age < 3] <- 'exclude'
child_nutri_dat$age_category[child_nutri_dat$age > 19] <- 'exclude'
table(child_nutri_dat$age_category, useNA = 'always')
Categorize the HAZ in children!
## Height-for-age categorization
child_nutri_dat$new_haz_cat <- NULL
child_nutri_dat$new_haz_cat[child_nutri_dat$HAZ < -2 & child_nutri_dat$HAZ >= -3] <-'moderate stunting'
child_nutri_dat$new_haz_cat[child_nutri_dat$HAZ >= -2 & child_nutri_dat$HAZ < 6] <-'normal'
child_nutri_dat$new_haz_cat[child_nutri_dat$HAZ < -3 & child_nutri_dat$HAZ >= -6] <-'severe stunting'
child_nutri_dat$new_haz_cat[child_nutri_dat$HAZ < -6 & child_nutri_dat$HAZ > 6] <-'outliers'
table(child_nutri_dat$new_haz_cat, useNA = 'always')
##
## moderate stunting normal severe stunting <NA>
## 445 4248 165 249
Visualize the number of children by age.
+ Since the number of children is a count or discrete and age is continuous, one way that I can plot them is by using a histogram!
#Plot number of children by province
#Use histogram to plot
ggplot(data= child_nutri_dat, mapping= aes(x=age)) +
geom_histogram(binwidth = 1, # width of bins
color = "dark red", # bin line color
fill = "cornflowerblue")+
labs(y= "number of children", x="Age") # NEWLY ADDED: changing labels for the x and y-axis
### Bar chart 1 Creating a plot to visualize the proportion of children
by age group
library(ggplot2)
ggplot(data= child_nutri_dat, mapping= aes(fill=child_nutri_dat$sex,x=child_nutri_dat$age_category)) +
geom_bar(aes(x=age_category, y= after_stat(count/sum(count)*100)), alpha=1, width = 1, position="dodge") +
labs(y= "proportion of children", x="age group") +
scale_fill_manual(values = c("rosybrown", "purple"))+ # NEWLY ADDED: changing color fills based on your preference
theme_classic() + # NEWLY ADDED: adding a background theme
guides(fill=guide_legend(title="Gender")) # NEWLY ADDED: Changing legend's label/title
## NEWLY ADDED: Saving the graph in png format
ggsave("Proportion of children by age group.png", width = 11, height = 12)
## Remove NAs in a specific variable
library(tidyverse)
df0 <- child_nutri_dat %>%
filter(!is.na(age_category)) %>%
filter(!is.na(sex))
## Creating a barplot
ggplot(data= df0, mapping= aes(fill= sex,x=age_category)) +
geom_bar(aes(x=age_category, y= after_stat(count/sum(count)*100)), alpha=1, width = 1, position="dodge", color= "white") + # position: bar next to
labs(y= "proportion of children", x="age group") +
guides(fill=guide_legend(title="Gender")) +
theme_classic() +
theme(axis.text.x = element_text(angle = 40, margin=margin(40))) # NEWLY ADDED: adjust angles and margins of x-axis texts
### Bar chart 3 * Add Face wrap
## Remove NAs in a specific variable
df0 <- child_nutri_dat %>%
filter(!is.na(age_category)) %>%
filter(!is.na(sex))
## Creating a barplot
ggplot(data= df0, mapping= aes(fill= sex,x=age_category)) +
geom_bar(aes(x=age_category, y= after_stat(count/sum(count)*100)), alpha=1, width = 1, position="dodge", color= "white") +
labs(y= "proportion of children", x="age group") +
guides(fill=guide_legend(title="Gender")) +
theme_classic() +
theme(axis.text.x = element_text(angle = 40, margin=margin(40))) + # NEWLY ADDED: adjust angles and margins of x-axis texts
facet_grid(~sex) #NEWLY ADDED: Splitting two categories into different plots
ggplot(data=df0, mapping = aes(fill=sex, x=weight)) +
geom_histogram(color="white", width = 0)+
labs(x="Weight of children", y="number of children")+
guides(fill=guide_legend(title="Gender"))+
facet_grid(~sex)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 146 rows containing non-finite values (`stat_bin()`).
# Write your answers here
df1 <- child_nutri_dat %>%
filter(!is.na(grade))%>%
filter(!is.na(new_haz_cat)) %>%
filter(new_haz_cat=="moderate stunting"|new_haz_cat=="severe stunting")
ggplot(data=df1, mapping = aes(x=grade))+
geom_bar(aes(fill=new_haz_cat, y=after_stat(count/sum(count)*100)))+
labs(y="proportion of children", x="grade group")+
theme_classic() +
scale_fill_manual(values = c("blue", "orange")) +
guides(fill=guide_legend(title="Adverse nutritional status"))
### Bar chart 5 * Plot the proportion of children by nutritional status,
intervention and by age group (present the proportion of children with
“aged between 11 and 19” and “aged between 5 and 10” only).
df2 <- child_nutri_dat %>%
filter(!is.na(cct)) %>%
filter(!is.na(age_category))%>%
filter(!is.na(new_haz_cat)) %>%
filter(!new_haz_cat=="normal") %>%
filter(!age_category=="aged below 5")
ggplot(data= df2, mapping= aes(x=new_haz_cat)) +
geom_bar(aes(fill=cct, y= after_stat(count/sum(count)*100)), alpha=1, width = 1, position="dodge", color= "white") +
labs(y= "proportion of children", x="Child nutritional status") +
theme_classic() +
scale_fill_manual(values = c("rosybrown", "purple", "coral", "orange")) +
facet_wrap(~age_category) +
guides(fill=guide_legend(title="Intervention"))
Plot the height and weight of children
#Removing NAs
df0 <- child_nutri_dat %>%
filter(!is.na(weight)) %>%
filter(!is.na(sex)) %>%
filter(!is.na(height))
#Statistic color for points and for line
ggplot(data = df0, mapping = aes(x = weight, y = height, ))+
geom_point(mapping= aes(color= sex)) + ## NEW PLOT using GEOM_POINT function
geom_vline(xintercept = 50, color = "dark grey") +
labs(title = "Static color for points and line", y= "Height (meters)", x="Weight (kg)")+
theme_classic()
#Removing NAs
df0 <- child_nutri_dat %>%
filter(!is.na(weight)) %>%
filter(!is.na(sex)) %>%
filter(!is.na(height))
#Statistic color for points and for line
ggplot(data = df0, mapping = aes(x = weight, y = height, color=weight, size= weight, ))+
geom_point(shape= "diamond", alpha =0.3, size= 2)+ #alpha transparency
geom_vline(xintercept = 50, color = "dark grey") +
labs(title = "Static color for points and line", y= "Height (meters)", x="Weight (kg)") +
theme_classic() +
viridis::scale_color_viridis()+ #change color
geom_smooth(method = "lm")
df0 <- child_nutri_dat %>%
filter(!is.na(age_category)) %>%
filter(!is.na(sex)) %>%
filter(!new_haz_cat=="outliers")
ggplot(data = df0, mapping = aes(x=HAZ, y=height, color=HAZ))+
geom_point(shape="triangle", alpha=0.5, size=2)+
labs(title="Plotting HAZ against height", x="HAZ of children", y="Heigh of children")+
geom_smooth(method = "lm", size=0.5)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: The following aesthetics were dropped during statistical transformation: colour
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
df1 <- child_nutri_dat %>%
filter(!is.na(HAZ)) %>%
filter(!is.na(bmi)) %>%
filter(!is.na(age_category))
ggplot(data = df1, mapping = aes(x=HAZ, y=bmi, color=age_category))+
geom_point(alpha=0.5, size=1)+
labs(title="Plotting HAZ against BMI", x="HAZ of children", y="BMI of children")+
geom_vline(xintercept = 0, color="black")+
theme_classic()
### Scatterplot 3 * Plot the height-for-age Zscores (HAZ) and the weight
of children, differentiate them by the new haz cat
df1 <- child_nutri_dat %>%
filter(!is.na(age_category)) %>%
filter(!is.na(sex)) %>%
filter(!is.na(new_haz_cat))
## Creating a scatterplot
ggplot(data = df1, mapping = aes(x = HAZ, y = weight, color=new_haz_cat))+
geom_point(size= 1, alpha= 0.5) +
geom_vline(xintercept = -2, color = "dark grey") +
labs(title = "Static color for points and line", y= "weight", x="Height-for-age Z-scores") +
scale_fill_manual(values = c("rosybrown", "purple", "orange")) +
theme_classic()
## Warning: Removed 4 rows containing missing values (`geom_point()`).
df0 <- child_nutri_dat %>%
filter(!is.na(province)) %>%
filter(!is.na(sex))
ggplot(data = df0, aes(x = province, y = age, colour = province)) +
geom_boxplot()+
facet_wrap(~sex, ncol = 4) +
theme_gray () + # NEW THEME
theme(axis.text.x = element_text(angle = 45, margin=margin(20)))
df1 <- child_nutri_dat %>%
filter(!is.na(age_category)) %>%
filter(!is.na(weight)) %>%
filter(!is.na(cct))
ggplot(df1, mapping = aes(x=weight, y=age_category, color=age_category))+
geom_boxplot()+
labs(title = "Weight distribution by age categories", y= "Age categories", x="Weight (kg)") +
facet_wrap(~cct)+
theme(axis.text.y = element_text(angle = 45, hjust = 1))
#install these packages and run this rmarkdown.
library(ggridges)
library(viridis)
library(hrbrthemes)
library(forcats)
library(ggplot2)
The figure below presents the age distribution of children by nutritional status and by sex.
df0 <- child_nutri_dat %>%
filter(!is.na(new_haz_cat)) %>%
filter(!is.na(cct))
ggplot(data=df0, mapping= aes(x=age, y=new_haz_cat, fill= sex)) +
geom_density_ridges(alpha=0.2)+
labs(y= "", x="Age distribution of children") +
theme(legend.position= "right", panel.spacing = unit(0.5, "lines"), strip.text.x= element_text(size=12)) +
guides(fill=guide_legend(title="Gender")) +
theme_minimal()
## Picking joint bandwidth of 0.94
df1 <- child_nutri_dat %>%
filter(!is.na(HAZ)) %>%
filter(!is.na(cct)) %>%
filter(!is.na(province)) %>%
filter(!is.na(new_haz_cat))
ggplot(df1, aes(x=HAZ, y=province, fill=cct))+
geom_density_ridges(alpha=0.2)+
theme(legend.position= "right", # legend.position: position of the legend on the plot
panel.spacing = unit(0.5, "lines"), # spacing between panels
strip.text.x= element_text(size=12))+ # size of x-axis text
guides(fill=guide_legend(title = "Intervention"))
## Picking joint bandwidth of 0.401