Overview

This research project focused on the effects of HIV/AIDS on pregnant women and the treatment modalities to prevent mother-neonate transmission during parturition.

Load packages

The following R packages are used for this analysis:

  1. readxl
  2. tidyverse
  3. skimr
  4. forcats
  5. ggplot2
  6. gganimate
  7. gifski
  8. plotly

Load data files

# Import sheet "Sheet1" of hiv.xlsx; the first row supplies the column names
# and the literal string "NA" is read as a missing value.
hiv <- read_xlsx(
  path = "hiv.xlsx",
  sheet = "Sheet1",
  col_names = TRUE,
  na = "NA"
)
New names:
* `` -> `...25`
str(hiv)
tibble [181 x 25] (S3: tbl_df/tbl/data.frame)
 $ CONSENT              : chr [1:181] "YES" "YES" "YES" "YES" ...
 $ PID                  : num [1:181] 101739 101762 101769 101778 101779 ...
 $ ENTERED DATE         : chr [1:181] "Mar.23.2021" "Mar.03.2021" "Mar.03.2021" "Mar.03.2021" ...
 $ AGE AT ENROLLMENT    : num [1:181] 30 41 25 26 37 32 23 26 34 40 ...
 $ CCC NUMBER           : num [1:181] 1.35e+09 1.35e+09 1.35e+09 1.35e+09 1.42e+09 ...
 $ ART REGIMEN          : chr [1:181] "TDF/3TC/DTG" "TDF/3TC/DTG" "TDF/3TC/DTG" "TDF/3TC/DTG" ...
 $ DISCLOSURE STATUS    : chr [1:181] "NOT DISCLOSED" "PARTNER" "FAMILY" "NOT DISCLOSED" ...
 $ MARITAL STATUS       : chr [1:181] "MARRIED" "MARRIED" "MARRIED" "MARRIED" ...
 $ PLANNED PREGNANCY    : chr [1:181] "NO" "NO" "NO" "YES" ...
 $ FAMILY PLANNING      : chr [1:181] "DEPO-PROVERA" "DEPO-PROVERA" "DEPO-PROVERA" "NONE" ...
 $ LMP                  : chr [1:181] "23/2/2020" "43865" "18/2/2020" "20/2/2020" ...
 $ EDD                  : chr [1:181] "Nov.30.2020" "Jan.09.2021" "Nov.30.2020" "Dec.08.2020" ...
 $ DELIVERY DATE        : chr [1:181] "44141" "44197" "18/11/2020" "44147" ...
 $ DELIVERY OUTCOME     : chr [1:181] "LIVE BIRTH" "LIVE BIRTH" "LIVE BIRTH" "LIVE BIRTH" ...
 $ BASELINE VL RESULTS  : chr [1:181] NA "LDL" "LDL" "LDL" ...
 $ POSTPARTUM VL RESULTS: chr [1:181] "72" "LDL" "LDL" "LDL" ...
 $ INFANT HEI NO        : chr [1:181] "13476/2020/00099" "13476/2021/00015" "13476/2020/00107" "13476/2021/0014" ...
 $ BIRTH WEIGHT         : num [1:181] 2 4 3 3 4 NA NA 3 4 3 ...
 $ INFANT GENDER        : chr [1:181] "FEMALE" "MALE" "FEMALE" "MALE" ...
 $ 6WEEKS PCR           : chr [1:181] "NEGATIVE" "NEGATIVE" "NEGATIVE" "NEGATIVE" ...
 $ 6 MONTHS PCR         : chr [1:181] "NEGATIVE" "NEGATIVE" "NEGATIVE" NA ...
 $ INFANT ART           : chr [1:181] "NONE" "NONE" "NONE" "NONE" ...
 $ ADVERSE EVENT        : chr [1:181] "NONE" "NONE" "NONE" "NONE" ...
 $ STUDY STATUS         : chr [1:181] "PARTICIPATORY" "PARTICIPATORY" "PARTICIPATORY" "PARTICIPATORY" ...
 $ ...25                : chr [1:181] NA NA NA NA ...
names(hiv)
 [1] "CONSENT"               "PID"                   "ENTERED DATE"         
 [4] "AGE AT ENROLLMENT"     "CCC NUMBER"            "ART REGIMEN"          
 [7] "DISCLOSURE STATUS"     "MARITAL STATUS"        "PLANNED PREGNANCY"    
[10] "FAMILY PLANNING"       "LMP"                   "EDD"                  
[13] "DELIVERY DATE"         "DELIVERY OUTCOME"      "BASELINE VL RESULTS"  
[16] "POSTPARTUM VL RESULTS" "INFANT HEI NO"         "BIRTH WEIGHT"         
[19] "INFANT GENDER"         "6WEEKS PCR"            "6 MONTHS PCR"         
[22] "INFANT ART"            "ADVERSE EVENT"         "STUDY STATUS"         
[25] "...25"                
dim(hiv)
[1] 181  25

Data wrangling

The column names should be changed for easier analysis. Most of the columns are not stored with an appropriate class. The last column is also removed because it is not needed.

Format the names of the columns

# Format the column names: replace the spreadsheet headers with short
# snake_case names. The names are applied by position, so their order must
# match the column order of the imported sheet; the 25th entry ("last")
# labels the trailing unnamed column.
new_names <- c(
  "consent", "pid", "entered_date", "age", "ccc_num", "art_regimen",
  "disclosure_status", "marital_status", "planned_pregnancy",
  "family_planning", "lmp", "edd", "delivery_date", "delivery_outcome",
  "baseline_vl", "postpartum_vl", "infant_num", "birth_weight", "infant_sex",
  "six_week_pcr", "six_month_pcr", "infant_art", "adverse_event",
  "study_status", "last"
)

hiv <- hiv %>% set_names(new_names)

# Alternatively, derive syntactically valid names from the existing headers:
# names(hiv) <- make.names(names(hiv))

Drop the last column

# Discard the trailing column that was named `last`
hiv <- select(hiv, -last)

# Check the dimension of the data
dim(hiv)
[1] 181  24
# View details
skimr::skim(hiv)
Data summary
Name hiv
Number of rows 181
Number of columns 24
_______________________
Column type frequency:
character 20
numeric 4
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
consent 0 1.00 2 3 0 2 0
entered_date 0 1.00 11 11 0 103 0
art_regimen 0 1.00 5 56 0 6 0
disclosure_status 0 1.00 6 13 0 5 0
marital_status 0 1.00 6 9 0 4 0
planned_pregnancy 0 1.00 2 28 0 3 0
family_planning 0 1.00 3 12 0 7 0
lmp 0 1.00 5 10 0 164 0
edd 0 1.00 11 11 0 162 0
delivery_date 36 0.80 5 29 0 130 0
delivery_outcome 26 0.86 5 11 0 5 0
baseline_vl 80 0.56 2 6 0 31 0
postpartum_vl 63 0.65 2 5 0 21 0
infant_num 42 0.77 10 35 0 137 0
infant_sex 39 0.78 4 21 0 4 0
six_week_pcr 66 0.64 8 27 0 3 0
six_month_pcr 159 0.12 8 8 0 2 0
infant_art 41 0.77 4 29 0 2 0
adverse_event 8 0.96 4 30 0 7 0
study_status 0 1.00 10 13 0 2 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
pid 0 1.00 102495.41 573.05 101739.0 101981 102338 102968 103592 ▇▅▃▃▃
age 0 1.00 30.77 5.81 15.0 26 32 35 45 ▁▆▇▇▂
ccc_num 0 1.00 701010146.69 704883159.04 65.0 6415 1291204357 1347607191 2200010906 ▇▁▁▇▁
birth_weight 93 0.49 3.29 0.59 1.7 3 3 4 5 ▁▇▁▅▁

Missing values

Observe the level of missing values. Remove all missing values if the level is low.

# Compute missing values
sum(is.na(hiv))
[1] 653
# Available values
sum(!is.na(hiv))
[1] 3691
# Total values
dim(hiv)
[1] 181  24
# Number of cells = rows x columns
d1 <- nrow(hiv)
d2 <- ncol(hiv)
d1 * d2
[1] 4344
# Total values (another approach)
sum(sum(is.na(hiv)), sum(!is.na(hiv)))
[1] 4344
# Proportion of missing values
mean(is.na(hiv))
[1] 0.1503223

Change the class of columns

Some of the columns need type casting.

# Change the following columns from character to factor.
# Each converted vector is written back to the column it came from, so every
# name listed here must match an existing column of `hiv`; a misspelled name
# fails instead of silently adding a new column.
factor_cols <- c(
  "art_regimen", "disclosure_status", "marital_status",
  "planned_pregnancy", "family_planning", "delivery_outcome"
)
hiv[factor_cols] <- lapply(hiv[factor_cols], factor)

Explore

# Observe unique values for delivery outcome
unique(hiv$delivery_outcome)
[1] LIVE BIRTH  <NA>        LIVE BIRTHS MISCARRIAGE STILL BIRTH TWINS      
Levels: LIVE BIRTH LIVE BIRTHS MISCARRIAGE STILL BIRTH TWINS
# Count values
hiv %>% select(delivery_outcome) %>% count(delivery_outcome)
# A tibble: 6 x 2
  delivery_outcome     n
  <fct>            <int>
1 LIVE BIRTH         142
2 LIVE BIRTHS          1
3 MISCARRIAGE         10
4 STILL BIRTH          1
5 TWINS                1
6 <NA>                26
# Change the necessary values: fold the "LIVE BIRTHS" and "TWINS" entries
# into "LIVE BIRTH" and store the result in a new column.
# The column is referenced by bare name inside mutate() so the recode works
# on the data flowing through the pipe rather than on the outer `hiv` object.
hiv <- hiv %>%
  mutate(
    deliveryOutcome2 = recode(
      delivery_outcome,
      "LIVE BIRTHS" = "LIVE BIRTH",
      "TWINS" = "LIVE BIRTH"
    )
  )

# Verify 
hiv %>% select(deliveryOutcome2) %>% count(deliveryOutcome2)
# A tibble: 4 x 2
  deliveryOutcome2     n
  <fct>            <int>
1 LIVE BIRTH         144
2 MISCARRIAGE         10
3 STILL BIRTH          1
4 <NA>                26
# Unique values for infant sex
unique(hiv$infant_sex)
[1] "FEMALE"                "MALE"                  NA                     
[4] "FEMALE        MALE"    "MALE             MALE"
# Count
hiv %>% select(infant_sex) %>% count(infant_sex)
# A tibble: 5 x 2
  infant_sex                n
  <chr>                 <int>
1 FEMALE                   67
2 FEMALE        MALE        1
3 MALE                     73
4 MALE             MALE     1
5 <NA>                     39
# Replace values: entries that list two sexes are mapped to the first one
# listed, and the result is stored in a new column.
# NOTE(review): this keeps a single sex for what look like twin records;
# confirm that this is the intended treatment.
# The column is referenced by bare name inside mutate() so the recode works
# on the data flowing through the pipe rather than on the outer `hiv` object.
hiv <- hiv %>%
  mutate(
    infantSex2 = recode(
      infant_sex,
      "FEMALE        MALE" = "FEMALE",
      "MALE             MALE" = "MALE"
    )
  )

# Verify
hiv %>% select(infantSex2) %>% count(infantSex2)
# A tibble: 3 x 2
  infantSex2     n
  <chr>      <int>
1 FEMALE        68
2 MALE          74
3 <NA>          39
# Pregnancy
unique(hiv$planned_pregnancy)
[1] NO                           YES                         
[3] YES                      YES
Levels: NO YES YES                      YES
# Replace values: collapse the duplicated "YES ... YES" entry into "YES" and
# store the result in a new column.
# The column is referenced by bare name inside mutate() so the recode works
# on the data flowing through the pipe rather than on the outer `hiv` object.
hiv <- hiv %>%
  mutate(
    planPreg = recode(
      planned_pregnancy,
      "YES                      YES" = "YES"
    )
  )

Visualization

# HIV status disclosure with marital status
# Bar chart of disclosure status filled by marital status, animated so that
# each marital status is shown as its own state.
bars <- ggplot(hiv, aes(x = disclosure_status, fill = marital_status)) +
  geom_bar() +
  transition_states(marital_status)

# Fade bars in and out between states, then render the animation
bars2 <- bars +
  enter_fade() +
  exit_fade()
bars2

# Check the range of the age to determine bins
range(hiv$age)
mean(hiv$age)
[1] 15 45
[1] 30.76796
# Distribution of age with marital status
# Ages span 15-45, so 5-year bins starting at 15 are set explicitly instead
# of relying on the stat_bin() default of 30 bins (which also emits a
# "Pick better value with `binwidth`" message).
age_hist <- ggplot(hiv, aes(age, fill = marital_status)) +
  geom_histogram(binwidth = 5, boundary = 15)

# Convert the static histogram into an interactive plotly widget
ageh <- ggplotly(age_hist)
ageh
# Disclosure, age and marital status
# Box plots of age per disclosure status, filled by marital status and
# animated with one state per marital status; shadow_wake() leaves a trail
# of previous frames.
bp <- ggplot(
  hiv,
  aes(x = disclosure_status, y = age, fill = marital_status)
) +
  geom_boxplot() +
  transition_states(marital_status) +
  shadow_wake(wake_length = 1, alpha = FALSE, wrap = FALSE)
bp

 # Subset the column for contraception: one row per family planning method
 # with its frequency in column `n` (count() alone keeps only the counted
 # column, so a preceding select() is unnecessary).
 fp2 <- hiv %>% count(family_planning)

 # Pictorial count of family planning
 # Bar chart of the counts, with each count printed just above its bar.
 contraceptives <- ggplot(fp2, aes(family_planning, n, fill = family_planning)) +
   geom_col() +
   # vjust is a fixed offset, not a data mapping, so it is set outside aes()
   geom_text(aes(label = n), vjust = -0.5) +
   theme_minimal() +
   theme(legend.position = "none", axis.title.y = element_text(angle = 0)) +
   # The plot shows counts of a categorical variable, i.e. a bar chart
   labs(title = "Bar chart of types of contraceptive")
 contraceptives

 # Family planning and pregnancy
 # Bars of the cleaned planned-pregnancy status (planPreg), coloured by
 # family planning method and split into one facet per method.
 pp <- ggplot(data = hiv, aes(x = planPreg, fill = family_planning)) +
   geom_bar(position = "dodge") +
   facet_wrap(~ family_planning)

 # Render as an interactive plotly widget
 ggplotly(pp)

The mean age is about 30.8 years and the median is 32. From the data, by the average age of about 30 years all participants were already married.