1. Defining malnutrition

1.1. Prerequisites

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.1     ✔ readr     2.2.0
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.3     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(zscorer)
library(readr)
library(dplyr)
library (lubridate)
library (ggplot2)
library (rmarkdown)
library(openxlsx)
library(writexl)
library(rstatix)
## 
## Attaching package: 'rstatix'
## 
## The following object is masked from 'package:stats':
## 
##     filter
library(eeptools)
## Registered S3 method overwritten by 'lme4':
##   method           from
##   na.action.merMod car
library(ggpubr)
PSFI_df_malnutrition <- read_csv("Ben_cut_4_27.csv")
## Rows: 858 Columns: 959
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (241): poct_emd_initials, poct_hiv_result1, poct_hiv_result2, poct_hiv_r...
## dbl (717): record_id, hiv_pos, poct_malaria_pos, poct_istat___1, poct_istat_...
## lgl   (1): lab_hco3
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# CREATE A NEW CATEGORY THAT CALCULATES AGE IN DAYS BASED ON DATE OF BIRTH AND DATE OF PRESENTATION
  PSFI_df_malnutrition <- PSFI_df_malnutrition %>%
  mutate(
    date_present = as.POSIXct(date_present, format = "%d%b%Y:%H:%M:%S", tz = "UTC"),
    dob = mdy(dob),
    age_days_exact = as.numeric(difftime(date_present, dob, units = "days"))
  )
  
summary(PSFI_df_malnutrition$age_days_exact)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   32.03  289.19  604.61 1124.44 1461.64 5452.40

1.2. Anthroprometric analysis

# CREATE NEW CATEGORY THAT CALCULATES AGE IN YEARS BASED ON AGE IN DAYS
PSFI_df_malnutrition <- PSFI_df_malnutrition %>%
  mutate(age_years = age_days_exact / 365.25)

summary(PSFI_df_malnutrition$age_years)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##  0.08768  0.79176  1.65535  3.07854  4.00176 14.92786
# CREATES A SUMMARY OF CASES (ht=height, wt=weight, age_years= age in years, muac)
PSFI_df_malnutrition %>%
  filter(case_control == 1) %>%
  select(ht, wt, age_years, muac) %>%
  summary()
##        ht               wt          age_years             muac      
##  Min.   : 20.00   Min.   : 3.20   Min.   : 0.08768   Min.   : 9.00  
##  1st Qu.: 67.00   1st Qu.: 7.00   1st Qu.: 0.73436   1st Qu.:14.00  
##  Median : 78.00   Median : 9.50   Median : 1.40141   Median :15.00  
##  Mean   : 85.23   Mean   :12.17   Mean   : 2.83539   Mean   :15.38  
##  3rd Qu.: 98.00   3rd Qu.:14.00   3rd Qu.: 3.49267   3rd Qu.:17.00  
##  Max.   :192.00   Max.   :72.00   Max.   :14.92786   Max.   :27.00  
##  NAs    :1                                           NAs    :2
# CREATES A HISTOGRAM AND BOXPLOT OF HEIGHT (CASES)
layout(mat = matrix(c(1,2),2,1, byrow=TRUE),  height = c(1,8))
par(mar=c(0, 3.1, 1.1, 2.1))
boxplot(PSFI_df_malnutrition$ht[PSFI_df_malnutrition$case_control == 1] , horizontal=TRUE , ylim=c(0,200), xaxt="n" , col=rgb(0.8,0.8,0,0.5) , frame=F)
par(mar=c(4, 3.1, 1.1, 2.1))
hist(PSFI_df_malnutrition$ht[PSFI_df_malnutrition$case_control == 1]
     , breaks=40 , col=rgb(1,0.8,0.8,1) , border=F , main="" , xlab="Height (cm)", xlim=c(0,200))

# CREATES A HISTOGRAM AND BOXPLOT OF WEIGHT (CASES)
layout(mat = matrix(c(1,2),2,1, byrow=TRUE),  height = c(1,8))
 par(mar=c(0, 3.1, 1.1, 2.1))
boxplot(PSFI_df_malnutrition$wt[PSFI_df_malnutrition$case_control == 1] , horizontal=TRUE , ylim=c(0,75), xaxt="n" , col=rgb(0.8,0.8,0,0.5) , frame=F)
par(mar=c(4, 3.1, 1.1, 2.1))
hist(PSFI_df_malnutrition$wt[PSFI_df_malnutrition$case_control == 1]
     , breaks=40 , col=rgb(1,0.8,0.8,1) , border=F , main="" , xlab="Weight (kg)", xlim=c(0,75))

# CREATES A HISTOGRAM AND BOXPLOT OF MUAC (CASES)
## BLUE LINE OUTLINES MODERATE MALNUTRITION (12.5cm)
## RED LINE OUTLINES SEVERE MALNUTRITION (11.5cm)
layout(mat = matrix(c(1,2),2,1, byrow=TRUE),  height = c(1,8))
par(mar=c(0, 3.1, 1.1, 2.1))
boxplot(PSFI_df_malnutrition$muac[PSFI_df_malnutrition$case_control == 1] , horizontal=TRUE , ylim=c(5,30), xaxt="n" , col=rgb(0.8,0.8,0,0.5) , frame=F)
par(mar=c(4, 3.1, 1.1, 2.1))
hist(PSFI_df_malnutrition$muac[PSFI_df_malnutrition$case_control == 1]
     , breaks=20 , col=rgb(1,0.8,0.8,1) , border=F , main="" , xlab="Mid-upper arm circumference (cm)", xlim=c(5,30))
abline(v = 11.5, col = "red", lwd = 2, lty = 2)   # severe
abline(v = 12.5, col = "blue", lwd = 2, lty = 2)  # moderate

# HISTOGRAM AND BOXPLOT OF AGE (CASES)
layout(mat = matrix(c(1,2),2,1, byrow=TRUE),  height = c(1,8))
 par(mar=c(0, 3.1, 1.1, 2.1))
boxplot(PSFI_df_malnutrition$age_years[PSFI_df_malnutrition$case_control == 1] , horizontal=TRUE , ylim=c(0,15), xaxt="n" , col=rgb(0.8,0.8,0,0.5) , frame=F)
par(mar=c(4, 3.1, 1.1, 2.1))
hist(PSFI_df_malnutrition$age_years[PSFI_df_malnutrition$case_control == 1]
     , breaks=40 , col=rgb(1,0.8,0.8,1) , border=F , main="" , xlab="Age (years)", xlim=c(0,15))

1.3. Z-scorer

# CREATES A NEW CATEGORY DEFINING SEX AS 1/2, INSTEAD OF 0/1 (NECESSARY FOR ZSCORER PACKAGE)
PSFI_df_malnutrition <- PSFI_df_malnutrition %>%
  mutate(sex_who = if_else(sex == 1, 1, 2))

summary(PSFI_df_malnutrition$sex_who)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   1.000   1.000   1.427   2.000   2.000
# CREATES A NEW CATEGORY CALCULATING AGE IN MONTHS BASED ON AGE IN DAYS
PSFI_df_malnutrition <- PSFI_df_malnutrition %>%
  mutate(age_months = age_days_exact / 30.4375)

summary(PSFI_df_malnutrition$age_months)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.052   9.501  19.864  36.942  48.021 179.134
# CREATES A NEW CATEGORY (AGE<6M = 0, AGE 6M - 5J = 1, AGE > 5J = 2)
PSFI_df_malnutrition <- PSFI_df_malnutrition %>%
  mutate(
    age_group = case_when(
      age_months < 6 ~ 0L,
      age_months >= 6 & age_years < 5 ~ 1L,
      age_years >= 5 ~ 2L,
      TRUE ~ NA_integer_
    )
  )
# ZSCORER PACKAGE CALCULATES ZSCORE OF WEIGHT FOR LENGTH (wflz), WEIGHT FOR AGE (wfaz), HEIGHT FOR AGE (hfaz), WEIGHT FOR HEIGHT (wfhz), BMI FOR AGE (baz)
PSFI_df_malnutrition <- PSFI_df_malnutrition %>%
  mutate(
    wflz = addWGSR(
      data = .,
      sex = "sex_who",
      firstPart = "wt",
      secondPart = "ht",
      index = "wfl"
    )$wflz,
    
    wfaz = addWGSR(
      data = .,
      sex = "sex_who",
      firstPart = "wt",
      secondPart = "age_days_exact",
      index = "wfa"
    )$wfaz,
    
    hfaz = addWGSR(
      data = .,
      sex = "sex_who",
      firstPart = "ht",
      secondPart = "age_days_exact",
      index = "hfa"
    )$hfaz,

    wfhz = addWGSR(
      data = .,
      sex = "sex_who",
      firstPart = "wt",
      secondPart = "ht",
      index = "wfh"
    )$wfhz,
    
    baz = addWGSR(
      data = .,
      sex = "sex_who",
      firstPart = "wt",
      secondPart = "ht",
      thirdPart = "age_days_exact",
      index = "bfa"
    )$bfaz
  )
## ================================================================================
## ================================================================================
## ================================================================================
## ================================================================================
## ================================================================================
# ASSIGNS Z-SCORE TO MUAC BASED ON THE 11.5 & 12.5CM LIMITS, THIS IS A PROXY SINCE LATER A Z-SCORE OF 0 WILL EQUAL NO MALNUTRITION, -2.5 WILL EQUAL MODERATE MALNUTRITION AND -4 WILL EQUAL SEVERE MALNUTRITION
PSFI_df_malnutrition <- PSFI_df_malnutrition %>%
  mutate(
    muacz = case_when(
      muac >= 12.5 ~ 0,
      muac >= 11.5 & muac < 12.5 ~ -2.5,
      muac < 11.5 ~ -4,
      TRUE ~ NA_real_
    )
  )
# CREATE MALNUTRITION ZSCORE BASED ON WFHL, BAZ, MUACZ (to be defined later)
PSFI_df_malnutrition <- PSFI_df_malnutrition %>%
  mutate(
    zscore_unified = case_when(
      age_group == 1 & ht >= 45 & ht < 65 ~ wflz,
      age_group == 1 & ht >= 65 & ht < 120 ~ wfhz,
      age_group == 1 & (ht < 45 | ht >= 120 | is.na(ht)) ~ muacz,
      age_group == 2 ~ baz,
      TRUE ~ NA_real_
    )
  )
# CREATE MALNUTRITION CATEGORY BASED ON PREVIOUS Z SCORE
PSFI_df_malnutrition <- PSFI_df_malnutrition %>%
  mutate(
    malnutrition = case_when(
  is.na(zscore_unified) ~ NA_integer_,
  zscore_unified < -3 ~ 2L,
  zscore_unified >= -3 & zscore_unified < -2 ~ 1L,
  TRUE ~ 0L
    )
  )
# ADD A MALNUTRITION SOURCE SO WE KNOW WHICH ANTHROPOMETRIC MEASURE IS BEING USED TO DEFINE MALNUTRITION
PSFI_df_malnutrition <- PSFI_df_malnutrition %>%
  mutate(
    malnutrition_source = case_when(
      age_group == 1 & ht >= 45 & ht < 65  ~ "WFL",
      age_group == 1 & ht >= 65 & ht < 120 ~ "WFH",
      age_group == 1 & (ht < 45 | ht >= 120 | is.na(ht)) ~ "MUAC",
      age_group == 2 ~ "BFA",
      TRUE ~ NA_character_
    )
  )

1.4 Z score check

subset_wfaz <- PSFI_df_malnutrition %>%
  filter(case_control == 1) %>%
  pull(wfaz)

# SUMMARY OF WEIGHT FOR AGE Z-SCORE (CASES)
summary(subset_wfaz)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.     NAs 
## -7.5400 -2.3050 -0.9700 -1.0971  0.0475 10.2500      49
# NUMBER OF CASES IN WEIGHT FOR AGE Z SCORE
length(subset_wfaz)
## [1] 755
subset_wfaz1 <- PSFI_df_malnutrition %>%
  filter(case_control == 1) %>%
  identify_outliers(wfaz) %>%
  pull(wfaz)

# NUMBER OF OUTLIERS IN WEIGHT FOR AGE Z SCORE
length(subset_wfaz1)
## [1] 10
subset_hfaz <- PSFI_df_malnutrition %>%
  filter(case_control == 1) %>%
  pull(hfaz)

# SUMMARY OF HEIGHT FOR AGE Z SCORE (CASES)
summary(subset_hfaz)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.      NAs 
## -18.2600  -2.8700  -1.1800  -0.9338   0.6350  44.5600        1
# NUMBER OF CASES IN HEIGHT FOR AGE Z SCORE
length(subset_hfaz)
## [1] 755
subset_hfaz1 <- PSFI_df_malnutrition %>%
  filter(case_control == 1) %>%
  identify_outliers(hfaz) %>%
  pull(hfaz)

# NUMBER OF OUTLIERS FOR HEIGHT FOR AGE Z SCORE
length(subset_hfaz1)
## [1] 29
subset_wfhz <- PSFI_df_malnutrition %>%
  filter(case_control == 1) %>%
  pull(wfhz)

# SUMMARY OF WEIGHT FOR HEIGHT Z SCORE (CASES)
summary(subset_wfhz)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.      NAs 
## -10.5900  -2.2300  -0.5700  -0.4132   1.1875  42.7700       93
# NUMBER OF CASES IN WEIGHT FOR HEIGHT Z SCORE
length(subset_wfhz)
## [1] 755
subset_wfhz1 <- PSFI_df_malnutrition %>%
  filter(case_control == 1) %>%
  identify_outliers(wfhz) %>%
  pull(wfhz)

# NUMBER OF OUTLIERS IN WEIGHT FOR HEIGHT Z SCORE
length(subset_wfhz1)
## [1] 20
subset_baz <- PSFI_df_malnutrition %>%
  filter(case_control == 1) %>%
  pull(baz)

# SUMMARY OF BMI FOR AGE Z SCORE (CASES)
summary(subset_baz)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.     NAs 
## -12.560  -2.250  -0.695  -0.583   1.060  50.500       1
# NUMBER OF CASES IN BMI FOR AGE Z SCORE
length(subset_baz)
## [1] 755
subset_baz1 <- PSFI_df_malnutrition %>%
  filter(case_control == 1) %>%
  identify_outliers(baz) %>%
  pull(baz)

# NUMBER OF OUTLIERS IN BMI FOR AGE Z SCORE
length(subset_baz1)
## [1] 29
subset_muac <- PSFI_df_malnutrition %>%
  filter(age_group == 1 & case_control == 1) %>%
  pull(muac)

# SUMMARY OF MUAC (CASES, 6-59 MONTHS)
summary(subset_muac)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.     NAs 
##    9.00   14.00   15.00   15.02   16.00   23.00       2
# NUMBER OF CASES IN MUAC
length(subset_muac)
## [1] 535
subset_muac1 <- PSFI_df_malnutrition %>%
  filter(age_group == 1 & case_control == 1) %>%
  identify_outliers(muac) %>%
  pull(muac)

# NUMBER OF OUTLIERS IN MUAC
length(subset_muac1)
## [1] 18
# CREATES BOXPLOT AND HISTOGRAM FOR WEIGHT FOR HEIGHT Z SCORE (CASES)
layout(mat = matrix(c(1,2),2,1, byrow=TRUE),  height = c(1,8))
 par(mar=c(0, 3.1, 1.1, 2.1))
boxplot(PSFI_df_malnutrition$wfhz[PSFI_df_malnutrition$case_control == 1] , horizontal=TRUE , ylim=c(-11,43), xaxt="n" , col=rgb(0.8,0.8,0,0.5) , frame=F)
par(mar=c(4, 3.1, 1.1, 2.1))
hist(PSFI_df_malnutrition$wfhz[PSFI_df_malnutrition$case_control == 1]
     , breaks=50 , col=rgb(1,0.8,0.8,1) , border=F , main="" , xlab="Weight-for-height (z-score)", xlim=c(-11,43))

# CREATES BOXPLOT AND HISTOGRAM OF WEIGHT FOR AGE (CASES)
layout(mat = matrix(c(1,2),2,1, byrow=TRUE),  height = c(1,8))
 par(mar=c(0, 3.1, 1.1, 2.1))
boxplot(PSFI_df_malnutrition$wfaz[PSFI_df_malnutrition$case_control == 1] , horizontal=TRUE , ylim=c(-8,11), xaxt="n" , col=rgb(0.8,0.8,0,0.5) , frame=F)
par(mar=c(4, 3.1, 1.1, 2.1))
hist(PSFI_df_malnutrition$wfaz[PSFI_df_malnutrition$case_control == 1]
     , breaks=50 , col=rgb(1,0.8,0.8,1) , border=F , main="" , xlab="Weight-for-age (z-score)", xlim=c(-8,11))

# CREATES BOXPLOT AND HISTOGRAM OF BMI FOR AGE (CASES)
layout(mat = matrix(c(1,2),2,1, byrow=TRUE),  height = c(1,8))
 par(mar=c(0, 3.1, 1.1, 2.1))
boxplot(PSFI_df_malnutrition$baz[PSFI_df_malnutrition$case_control == 1] , horizontal=TRUE , ylim=c(-13,51), xaxt="n" , col=rgb(0.8,0.8,0,0.5) , frame=F)
par(mar=c(4, 3.1, 1.1, 2.1))
hist(PSFI_df_malnutrition$baz[PSFI_df_malnutrition$case_control == 1]
     , breaks=50 , col=rgb(1,0.8,0.8,1) , border=F , main="" , xlab="BMI-for-age (z-score)", xlim=c(-13,51))

# CREATE BOXPLOT AND HISTOGRAM OF HEIGHT FOR AGE 
layout(mat = matrix(c(1,2),2,1, byrow=TRUE),  height = c(1,8))
 par(mar=c(0, 3.1, 1.1, 2.1))
boxplot(PSFI_df_malnutrition$hfaz[PSFI_df_malnutrition$case_control == 1] , horizontal=TRUE , ylim=c(-19,45), xaxt="n" , col=rgb(0.8,0.8,0,0.5) , frame=F)
par(mar=c(4, 3.1, 1.1, 2.1))
hist(PSFI_df_malnutrition$hfaz[PSFI_df_malnutrition$case_control == 1]
     , breaks=50 , col=rgb(1,0.8,0.8,1) , border=F , main="" , xlab="Height-for-age (z-score)", xlim=c(-19,45))

# CREATES HISTOGRAM AND BOXPLOT OF MUAC (CASES, 6-59 MONTHS)
layout(mat = matrix(c(1,2),2,1, byrow=TRUE),  height = c(1,8))
par(mar=c(0, 3.1, 1.1, 2.1))
boxplot(PSFI_df_malnutrition$muac[PSFI_df_malnutrition$case_control == 1] , horizontal=TRUE , ylim=c(5,30), xaxt="n" , col=rgb(0.8,0.8,0,0.5) , frame=F)
par(mar=c(4, 3.1, 1.1, 2.1))
hist(PSFI_df_malnutrition$muac[PSFI_df_malnutrition$case_control == 1]
     , breaks=20 , col=rgb(1,0.8,0.8,1) , border=F , main="" , xlab="Mid-upper arm circumference (cm)", xlim=c(5,30))
abline(v = 11.5, col = "red", lwd = 2, lty = 2)   # severe
abline(v = 12.5, col = "blue", lwd = 2, lty = 2)  # moderate

# MAKES A TABLE OF OUTLIERS FOR WEIGHT FOR HEIGHT
wfhz_outliers <- PSFI_df_malnutrition %>%
  filter(case_control == 1) %>%
  identify_outliers(wfhz) %>%
  select(record_id, wt, ht, wfhz, age_months, is.outlier, is.extreme)

wfhz_outliers
## # A tibble: 20 × 7
##    record_id    wt    ht   wfhz age_months is.outlier is.extreme
##        <dbl> <dbl> <dbl>  <dbl>      <dbl> <lgl>      <lgl>     
##  1        26 45      115   9        145.   TRUE       FALSE     
##  2        51 20       68  12.9       35.7  TRUE       TRUE      
##  3        56  4       70  -7.99      14.9  TRUE       FALSE     
##  4       140  8.45    55   7.71       6.26 TRUE       FALSE     
##  5       151 30       57  42.8      121.   TRUE       TRUE      
##  6       154  7      100  -7.63      13.5  TRUE       FALSE     
##  7       203 39      115   7.61      86.9  TRUE       FALSE     
##  8       277  4       88 -10.6       26.4  TRUE       FALSE     
##  9       318  4.10    70  -7.8       12.7  TRUE       FALSE     
## 10       412  8.5     56   6.39       7.82 TRUE       FALSE     
## 11       430  6       86  -7.48       6.16 TRUE       FALSE     
## 12       617 14       62   9.76      54.5  TRUE       FALSE     
## 13       653 11       54  13.8       13.0  TRUE       TRUE      
## 14       655 10       56   8.84       7.54 TRUE       FALSE     
## 15       657 10       58   7.78      13.4  TRUE       FALSE     
## 16       663 21       75  10.7       93.2  TRUE       FALSE     
## 17       665 12.5     60  10.0       33.3  TRUE       FALSE     
## 18       672 14       69   6.77      23.1  TRUE       FALSE     
## 19       734 10       56   9.6       12.4  TRUE       FALSE     
## 20       885 16.5     71   8.09      21.0  TRUE       FALSE
# MAKES A TABLE OF OUTLIERS FOR WEIGHT FOR AGE
wfaz_outliers <- PSFI_df_malnutrition %>%
  filter(case_control == 1) %>%
  identify_outliers(wfaz) %>%
  select(record_id,ht, wt, age_months, wfaz, is.outlier, is.extreme)

wfaz_outliers
## # A tibble: 10 × 7
##    record_id    ht    wt age_months  wfaz is.outlier is.extreme
##        <dbl> <dbl> <dbl>      <dbl> <dbl> <lgl>      <lgl>     
##  1        56    70  4         14.9  -6.89 TRUE       FALSE     
##  2       113    94 12        121.   -6.02 TRUE       FALSE     
##  3       277    88  4         26.4  -7.54 TRUE       FALSE     
##  4       318    70  4.10      12.7  -6.55 TRUE       FALSE     
##  5       369   118 36         74.8   3.75 TRUE       FALSE     
##  6       486    74 12          7.15  3.63 TRUE       FALSE     
##  7       487   156 18          4.59 10.2  TRUE       TRUE      
##  8       552    89 17         19.4   3.8  TRUE       FALSE     
##  9       603    54  6         31.2  -5.97 TRUE       FALSE     
## 10       907    68  4.30      10.4  -6    TRUE       FALSE
# MAKES A TABLE OF OUTLIERS FOR HEIGHT FOR AGE
hfaz_outliers <- PSFI_df_malnutrition %>%
  filter(case_control == 1) %>%
  identify_outliers(hfaz) %>%
  select(record_id, ht, wt, age_months, hfaz, is.outlier, is.extreme)

hfaz_outliers
## # A tibble: 29 × 7
##    record_id    ht    wt age_months   hfaz is.outlier is.extreme
##        <dbl> <dbl> <dbl>      <dbl>  <dbl> <lgl>      <lgl>     
##  1        39    20  3.90       1.28 -18.3  TRUE       TRUE      
##  2        94   166 16         50.6   14.0  TRUE       TRUE      
##  3        96   110  9.5       22.3    8.02 TRUE       FALSE     
##  4       135   178 15         52.5   16.2  TRUE       TRUE      
##  5       136    44 15         62.2  -14.4  TRUE       TRUE      
##  6       138    53  6         15.2  -10.4  TRUE       FALSE     
##  7       151    57 30        121.   -12.7  TRUE       FALSE     
##  8       154   100  7         13.5    9.08 TRUE       FALSE     
##  9       174   165  8.10       5.54  44.6  TRUE       TRUE      
## 10       214    40  7          7.88 -13.8  TRUE       TRUE      
## # ℹ 19 more rows
# MAKES A TABLE OF OUTLIERS FOR BMI FOR AGE
baz_outliers <- PSFI_df_malnutrition %>%
  filter(case_control == 1) %>%
  identify_outliers(baz) %>%
  select(record_id, wt, ht, baz, age_months, is.outlier, is.extreme)

baz_outliers
## # A tibble: 29 × 7
##    record_id    wt    ht   baz age_months is.outlier is.extreme
##        <dbl> <dbl> <dbl> <dbl>      <dbl> <lgl>      <lgl>     
##  1        39  3.90    20 50.5        1.28 TRUE       TRUE      
##  2        51 20       68 15.2       35.7  TRUE       TRUE      
##  3        56  4       70 -8.04      14.9  TRUE       FALSE     
##  4        94 16      166 -8.9       50.6  TRUE       FALSE     
##  5        96  9.5    110 -7.47      22.3  TRUE       FALSE     
##  6       135 15      178 -9.87      52.5  TRUE       FALSE     
##  7       136 15       44 32.7       62.2  TRUE       TRUE      
##  8       140  8.45    55  6.05       6.26 TRUE       FALSE     
##  9       151 30       57 16.9      121.   TRUE       TRUE      
## 10       154  7      100 -8.31      13.5  TRUE       FALSE     
## # ℹ 19 more rows
# MAKES A TABLE OF OUTLIERS FOR MUAC

muac_outliers <- PSFI_df_malnutrition %>%
  filter(age_group == 1 & case_control == 1) %>%
  identify_outliers(muac) %>%
  select(record_id, ht, wt, age_months, muac, is.outlier, is.extreme)

muac_outliers
## # A tibble: 18 × 7
##    record_id    ht    wt age_months  muac is.outlier is.extreme
##        <dbl> <dbl> <dbl>      <dbl> <dbl> <lgl>      <lgl>     
##  1        51    68 20         35.7     23 TRUE       TRUE      
##  2       106    86 10         16.9     20 TRUE       FALSE     
##  3       140    55  8.45       6.26    20 TRUE       FALSE     
##  4       160    94 14         35.8     20 TRUE       FALSE     
##  5       167    97 19         33.4     20 TRUE       FALSE     
##  6       277    88  4         26.4     10 TRUE       FALSE     
##  7       348    91 16.4       19.1     20 TRUE       FALSE     
##  8       390    70  9.40      12.6     20 TRUE       FALSE     
##  9       412    56  8.5        7.82    20 TRUE       FALSE     
## 10       552    89 17         19.4     20 TRUE       FALSE     
## 11       563    74 11         12.0     21 TRUE       FALSE     
## 12       611    75  8.40      48.0     10 TRUE       FALSE     
## 13       619    63  5         14.5      9 TRUE       FALSE     
## 14       674   100 24         56.6     20 TRUE       FALSE     
## 15       760    66  6.20      16.4     10 TRUE       FALSE     
## 16       784   132 16         53.6     22 TRUE       FALSE     
## 17       882    99 18         41.5     20 TRUE       FALSE     
## 18       884   112 25         52.6     20 TRUE       FALSE
# CREATES AN EXCEL FILE OF ALL OUTLIER TABLES
write.xlsx(
  list(
    WFHZ = wfhz_outliers,
    WFAZ = wfaz_outliers,
    HAZ = hfaz_outliers,
    MUAC = muac_outliers, 
    BAZ = baz_outliers
  ),
  file = "anthropometric_outliers.xlsx"
)
# CREATE A TABLE OF ALL NA'S
na_table <- PSFI_df_malnutrition %>%
  filter(case_control == 1) %>%
  mutate(
    hfaz_missing = is.na(hfaz),
    wfaz_missing  = is.na(wfaz),
    wfhz_missing = is.na(wfhz), 
    muac_missing = is.na (muac), 
    baz_missing = is.na (baz)
  ) %>%
  filter(hfaz_missing | wfaz_missing | wfhz_missing | baz_missing | muac_missing) %>%
  select(
    record_id,
    ht,
    wt,
    age_months,
    hfaz,
    wfaz,
    wfhz,
    baz,
    muac, 
    hfaz_missing,
    wfaz_missing,
    wfhz_missing, 
    baz_missing,
    muac_missing
  )

na_table
## # A tibble: 98 × 14
##    record_id    ht    wt age_months   hfaz  wfaz  wfhz   baz  muac hfaz_missing
##        <dbl> <dbl> <dbl>      <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <lgl>       
##  1         2    68  6          7.57  -0.9  -3.11 -3.58 -3.61    NA FALSE       
##  2         4   142 35        144.    -1.37 NA    NA    -0.29    23 FALSE       
##  3        26   115 45        145.    -5.36 NA     9     3.28    18 FALSE       
##  4        32   125 24        150.    -4.22 NA    NA    -1.55    18 FALSE       
##  5        39    20  3.90       1.28 -18.3  -1.58 NA    50.5     11 FALSE       
##  6        40   139 35        157.    -2.56 NA    NA    -0.31    19 FALSE       
##  7        52   133 51        133.    -1.59 NA    NA     3.1     20 FALSE       
##  8        54   127 21.3       98.0   -0.2  -1.41 NA    -2.13    15 FALSE       
##  9        62   127 25         92.1    0.41  0.23 NA    -0.04    18 FALSE       
## 10        82   141 30        112.     1.08  0.15 NA    -0.65    20 FALSE       
## # ℹ 88 more rows
## # ℹ 4 more variables: wfaz_missing <lgl>, wfhz_missing <lgl>,
## #   baz_missing <lgl>, muac_missing <lgl>
anthro_vars <- c("hfaz", "wfaz", "wfhz", "baz", "muac")

cases_df <- PSFI_df_malnutrition %>%
  filter(case_control == 1)

anthro_summary <- lapply(anthro_vars, function(var) {

  x <- cases_df[[var]]

  outlier_info <- cases_df %>%
    select(all_of(var)) %>%
    identify_outliers(!!sym(var))

  tibble(
    measure = var,
    n = sum(!is.na(x)),
    n_missing = sum(is.na(x)),
    pct_missing = round(mean(is.na(x)) * 100, 2),
    n_outliers = sum(outlier_info$is.outlier, na.rm = TRUE),
    n_extreme_outliers = sum(outlier_info$is.extreme, na.rm = TRUE)
  )

}) %>%
  bind_rows()

anthro_summary
## # A tibble: 5 × 6
##   measure     n n_missing pct_missing n_outliers n_extreme_outliers
##   <chr>   <int>     <int>       <dbl>      <int>              <int>
## 1 hfaz      754         1        0.13         29                 10
## 2 wfaz      706        49        6.49         10                  1
## 3 wfhz      662        93       12.3          20                  3
## 4 baz       754         1        0.13         29                  8
## 5 muac      753         2        0.26         17                  1
write.xlsx(
  list(
    Missing = na_table,
    Summary = anthro_summary
  ),
  file = "NA.xlsx")

1.5 Data check

PSFI_df_malnutrition %>%
  group_by(malnutrition, case_control) %>%
  summarize(
    malnutrition_missing = sum(is.na(malnutrition))
  )
## `summarise()` has regrouped the output.
## ℹ Summaries were computed grouped by malnutrition and case_control.
## ℹ Output is grouped by malnutrition.
## ℹ Use `summarise(.groups = "drop_last")` to silence this message.
## ℹ Use `summarise(.by = c(malnutrition, case_control))` for per-operation
##   grouping (`?dplyr::dplyr_by`) instead.
## # A tibble: 8 × 3
## # Groups:   malnutrition [4]
##   malnutrition case_control malnutrition_missing
##          <int>        <dbl>                <int>
## 1            0            0                    0
## 2            0            1                    0
## 3            1            0                    0
## 4            1            1                    0
## 5            2            0                    0
## 6            2            1                    0
## 7           NA            0                    1
## 8           NA            1                   93
 #103 mort_inhosp_missing  = controls
PSFI_df_malnutrition %>%
  filter (case_control == 1) %>%
  group_by(malnutrition_source) %>%
  summarize(
    count_malnut = n()
  )
## # A tibble: 5 × 2
##   malnutrition_source count_malnut
##   <chr>                      <int>
## 1 BFA                          128
## 2 MUAC                          10
## 3 WFH                          458
## 4 WFL                           67
## 5 <NA>                          92
# NA due to child with height of 20 cm (under WHO curve for wfh/wfl) and 38 days old (too young for MUAC)
PSFI_df_malnutrition %>%
  filter (case_control == 1) %>%
  group_by(malnutrition) %>%
  summarize(
    count_malnut = n()
  )
## # A tibble: 4 × 2
##   malnutrition count_malnut
##          <int>        <int>
## 1            0          470
## 2            1           76
## 3            2          116
## 4           NA           93
PSFI_df_malnutrition %>%
  drop_na(malnutrition, mort_inhosp) %>%
  ggplot(aes(x = factor(malnutrition))) +
  geom_bar() +
  labs(
    x = "Malnutrition",
    y = "N"
  )

PSFI_df_malnutrition%>%
  filter (case_control == 1) %>%
  count(malnutrition, mort_inhosp) %>%
  group_by(malnutrition) %>%
  mutate(prop = n / sum(n))
## # A tibble: 8 × 4
## # Groups:   malnutrition [4]
##   malnutrition mort_inhosp     n  prop
##          <int>       <dbl> <int> <dbl>
## 1            0           0   405 0.862
## 2            0           1    65 0.138
## 3            1           0    56 0.737
## 4            1           1    20 0.263
## 5            2           0    82 0.707
## 6            2           1    34 0.293
## 7           NA           0    64 0.688
## 8           NA           1    29 0.312
PSFI_df_malnutrition %>%
  drop_na(mort_inhosp, malnutrition) %>% #only cases & minus the one previously mentioned NA
  ggplot +
  (aes(x = factor(malnutrition), fill = factor(mort_inhosp))) +
  geom_bar(stat = "count", position = "fill") +
  labs(
    x = "Malnutrition",
    y = "Proportion",
    fill = "Mortality"
  )

PSFI_df_malnutrition %>%
  count(malnutrition, case_control) %>%
  group_by(malnutrition) %>%
  mutate(prop = n / sum(n))
## # A tibble: 8 × 4
## # Groups:   malnutrition [4]
##   malnutrition case_control     n   prop
##          <int>        <dbl> <int>  <dbl>
## 1            0            0    78 0.142 
## 2            0            1   470 0.858 
## 3            1            0     9 0.106 
## 4            1            1    76 0.894 
## 5            2            0    15 0.115 
## 6            2            1   116 0.885 
## 7           NA            0     1 0.0106
## 8           NA            1    93 0.989
PSFI_df_malnutrition %>%
  drop_na(malnutrition) %>%
  ggplot +
  (aes(x = factor(malnutrition), fill = factor(case_control))) +
  geom_bar(stat = "count", position = "fill") +
  labs(
    x = "Malnutrition",
    y = "Proportion",
    fill = "Case/Control"
  )

PSFI_df_malnutrition %>%

ggplot +
  (aes(x = whz, y = wfhz)) +
  geom_point(alpha = 0.5) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
    stat_cor(method = "pearson") +
  theme_minimal()
## Warning: Removed 157 rows containing non-finite outside the scale range
## (`stat_cor()`).
## Warning: Removed 157 rows containing missing values or values outside the scale range
## (`geom_point()`).

PSFI_df_malnutrition %>%

ggplot +
  (aes(x = waz, y = wfaz)) +
  geom_point(alpha = 0.5) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
    stat_cor(method = "pearson") +
  theme_minimal()
## Warning: Removed 65 rows containing non-finite outside the scale range
## (`stat_cor()`).
## Warning: Removed 65 rows containing missing values or values outside the scale range
## (`geom_point()`).

write.xlsx(PSFI_df_malnutrition, file = "PSFI_final_malnutrition.xlsx")