Clinical research on HIV positive expectant mothers

Overview

This research project focused on the effects of HIV/AIDS on pregnant women and the treatment modalities to prevent mother-neonate transmission during parturition.

Load packages

The following R packages are used for this analysis:

readxl
tidyverse
skimr
forcats
ggplot2
gganimate
gifski
plotly

Load data files

# Import the "Sheet1" worksheet of the study workbook; the first row supplies
# the column names and cells containing the text "NA" are read as missing.
hiv <- read_xlsx(
  path = "hiv.xlsx",
  sheet = "Sheet1",
  col_names = TRUE,
  na = "NA"
)

New names:
* `` -> `...25`

# Inspect the class and the first few values of every imported column
str(hiv)

tibble [181 x 25] (S3: tbl_df/tbl/data.frame)
 $ CONSENT              : chr [1:181] "YES" "YES" "YES" "YES" ...
 $ PID                  : num [1:181] 101739 101762 101769 101778 101779 ...
 $ ENTERED DATE         : chr [1:181] "Mar.23.2021" "Mar.03.2021" "Mar.03.2021" "Mar.03.2021" ...
 $ AGE AT ENROLLMENT    : num [1:181] 30 41 25 26 37 32 23 26 34 40 ...
 $ CCC NUMBER           : num [1:181] 1.35e+09 1.35e+09 1.35e+09 1.35e+09 1.42e+09 ...
 $ ART REGIMEN          : chr [1:181] "TDF/3TC/DTG" "TDF/3TC/DTG" "TDF/3TC/DTG" "TDF/3TC/DTG" ...
 $ DISCLOSURE STATUS    : chr [1:181] "NOT DISCLOSED" "PARTNER" "FAMILY" "NOT DISCLOSED" ...
 $ MARITAL STATUS       : chr [1:181] "MARRIED" "MARRIED" "MARRIED" "MARRIED" ...
 $ PLANNED PREGNANCY    : chr [1:181] "NO" "NO" "NO" "YES" ...
 $ FAMILY PLANNING      : chr [1:181] "DEPO-PROVERA" "DEPO-PROVERA" "DEPO-PROVERA" "NONE" ...
 $ LMP                  : chr [1:181] "23/2/2020" "43865" "18/2/2020" "20/2/2020" ...
 $ EDD                  : chr [1:181] "Nov.30.2020" "Jan.09.2021" "Nov.30.2020" "Dec.08.2020" ...
 $ DELIVERY DATE        : chr [1:181] "44141" "44197" "18/11/2020" "44147" ...
 $ DELIVERY OUTCOME     : chr [1:181] "LIVE BIRTH" "LIVE BIRTH" "LIVE BIRTH" "LIVE BIRTH" ...
 $ BASELINE VL RESULTS  : chr [1:181] NA "LDL" "LDL" "LDL" ...
 $ POSTPARTUM VL RESULTS: chr [1:181] "72" "LDL" "LDL" "LDL" ...
 $ INFANT HEI NO        : chr [1:181] "13476/2020/00099" "13476/2021/00015" "13476/2020/00107" "13476/2021/0014" ...
 $ BIRTH WEIGHT         : num [1:181] 2 4 3 3 4 NA NA 3 4 3 ...
 $ INFANT GENDER        : chr [1:181] "FEMALE" "MALE" "FEMALE" "MALE" ...
 $ 6WEEKS PCR           : chr [1:181] "NEGATIVE" "NEGATIVE" "NEGATIVE" "NEGATIVE" ...
 $ 6 MONTHS PCR         : chr [1:181] "NEGATIVE" "NEGATIVE" "NEGATIVE" NA ...
 $ INFANT ART           : chr [1:181] "NONE" "NONE" "NONE" "NONE" ...
 $ ADVERSE EVENT        : chr [1:181] "NONE" "NONE" "NONE" "NONE" ...
 $ STUDY STATUS         : chr [1:181] "PARTICIPATORY" "PARTICIPATORY" "PARTICIPATORY" "PARTICIPATORY" ...
 $ ...25                : chr [1:181] NA NA NA NA ...

# List the column names exactly as they were imported
names(hiv)

 [1] "CONSENT"               "PID"                   "ENTERED DATE"         
 [4] "AGE AT ENROLLMENT"     "CCC NUMBER"            "ART REGIMEN"          
 [7] "DISCLOSURE STATUS"     "MARITAL STATUS"        "PLANNED PREGNANCY"    
[10] "FAMILY PLANNING"       "LMP"                   "EDD"                  
[13] "DELIVERY DATE"         "DELIVERY OUTCOME"      "BASELINE VL RESULTS"  
[16] "POSTPARTUM VL RESULTS" "INFANT HEI NO"         "BIRTH WEIGHT"         
[19] "INFANT GENDER"         "6WEEKS PCR"            "6 MONTHS PCR"         
[22] "INFANT ART"            "ADVERSE EVENT"         "STUDY STATUS"         
[25] "...25"

# Number of rows and columns in the imported data
dim(hiv)

[1] 181  25

Data wrangling

The column names should be changed to make the analysis easier. Most of the columns are not stored with an appropriate class. The last column is also removed because it is not needed.

Format the names of the columns

# Replacement column names (lower snake_case), listed in the same order as
# the columns of the imported data
new_names <- c(
  "consent", "pid", "entered_date", "age", "ccc_num", "art_regimen",
  "disclosure_status", "marital_status", "planned_pregnancy",
  "family_planning", "lmp", "edd", "delivery_date", "delivery_outcome",
  "baseline_vl", "postpartum_vl", "infant_num", "birth_weight", "infant_sex",
  "six_week_pcr", "six_month_pcr", "infant_art", "adverse_event",
  "study_status", "last"
)

# Apply the new names by position
hiv <- set_names(hiv, new_names)

# Alternative kept for reference: only make the existing names syntactic
#names(hiv) <- make.names(names(hiv))

Drop the last column

# Discard the unused trailing column, which was renamed to "last"
hiv <- select(hiv, -last)

# Confirm the number of rows and columns that remain
dim(hiv)

[1] 181  24

# Summarise every column (missingness, uniqueness and distribution) with skimr
skimr::skim(hiv)

Data summary
Name	hiv
Number of rows	181
Number of columns	24
_______________________
Column type frequency:
character	20
numeric	4
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
consent	0	1.00	2	3	2
entered_date	0	1.00	11	11	103
art_regimen	0	1.00	5	56	6
disclosure_status	0	1.00	6	13	5
marital_status	0	1.00	6	9	4
planned_pregnancy	0	1.00	2	28	3
family_planning	0	1.00	3	12	7
lmp	0	1.00	5	10	164
edd	0	1.00	11	11	162
delivery_date	36	0.80	5	29	130
delivery_outcome	26	0.86	5	11	5
baseline_vl	80	0.56	2	6	31
postpartum_vl	63	0.65	2	5	21
infant_num	42	0.77	10	35	137
infant_sex	39	0.78	4	21	4
six_week_pcr	66	0.64	8	27	3
six_month_pcr	159	0.12	8	8	2
infant_art	41	0.77	4	29	2
adverse_event	8	0.96	4	30	7
study_status	0	1.00	10	13	2

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
pid	0	1.00	102495.41	573.05	101739.0	101981	102338	102968	103592	▇▅▃▃▃
age	0	1.00	30.77	5.81	15.0	26	32	35	45	▁▆▇▇▂
ccc_num	0	1.00	701010146.69	704883159.04	65.0	6415	1291204357	1347607191	2200010906	▇▁▁▇▁
birth_weight	93	0.49	3.29	0.59	1.7	3	3	4	5	▁▇▁▅▁

Missing values

Observe the level of missing values. Remove all missing values if the level is low.

# Count the missing (NA) cells across the whole data frame
sum(is.na(hiv))

[1] 653

# Count the cells that hold a value (not NA)
sum(!is.na(hiv))

[1] 3691

# Total values: start from the dimensions (rows, columns) of the data
dim(hiv)

[1] 181  24

# Rows multiplied by columns gives the total number of cells
d1 <- nrow(hiv)
d2 <- ncol(hiv)
d1 * d2

[1] 4344

# Total values (cross-check): missing cells plus non-missing cells
sum(is.na(hiv)) + sum(!is.na(hiv))

[1] 4344

# Proportion of missing values: the mean of the logical NA indicator over all cells
mean(is.na(hiv))

[1] 0.1503223

Change the class of columns

Some of the columns need type casting.

# Convert the categorical character columns to factors.
# The columns are listed once and converted in a single step, so a misspelled
# assignment target (e.g. `hiv$marital_statu`) can no longer add a stray
# column while leaving `marital_status` as character.
factor_cols <- c(
  "art_regimen", "disclosure_status", "marital_status",
  "planned_pregnancy", "family_planning", "delivery_outcome"
)
hiv[factor_cols] <- lapply(hiv[factor_cols], factor)

Explore

# List the distinct delivery outcome labels to spot inconsistent entries
unique(hiv$delivery_outcome)

[1] LIVE BIRTH  <NA>        LIVE BIRTHS MISCARRIAGE STILL BIRTH TWINS      
Levels: LIVE BIRTH LIVE BIRTHS MISCARRIAGE STILL BIRTH TWINS

# Frequency of each delivery outcome label (including missing)
hiv %>% count(delivery_outcome)

# A tibble: 6 x 2
  delivery_outcome     n
  <fct>            <int>
1 LIVE BIRTH         142
2 LIVE BIRTHS          1
3 MISCARRIAGE         10
4 STILL BIRTH          1
5 TWINS                1
6 <NA>                26

# Fold the variant labels "LIVE BIRTHS" and "TWINS" into "LIVE BIRTH" and
# store the cleaned values in a new column.
# The column is referenced by its bare name so that mutate() reads it from
# the data being piped in rather than from the global `hiv` object.
hiv <- hiv %>%
  mutate(
    deliveryOutcome2 = recode(
      delivery_outcome,
      "LIVE BIRTHS" = "LIVE BIRTH",
      "TWINS" = "LIVE BIRTH"
    )
  )

# Verify: frequency of each cleaned delivery outcome
hiv %>% count(deliveryOutcome2)

# A tibble: 4 x 2
  deliveryOutcome2     n
  <fct>            <int>
1 LIVE BIRTH         144
2 MISCARRIAGE         10
3 STILL BIRTH          1
4 <NA>                26

# List the distinct infant sex entries to spot cells holding more than one value
unique(hiv$infant_sex)

[1] "FEMALE"                "MALE"                  NA                     
[4] "FEMALE        MALE"    "MALE             MALE"

# Frequency of each infant sex entry (including missing)
hiv %>% count(infant_sex)

# A tibble: 5 x 2
  infant_sex                n
  <chr>                 <int>
1 FEMALE                   67
2 FEMALE        MALE        1
3 MALE                     73
4 MALE             MALE     1
5 <NA>                     39

# Cells that list two values are reduced to the first value listed, and the
# cleaned values are stored in a new column.
# The column is referenced by its bare name so that mutate() reads it from
# the data being piped in rather than from the global `hiv` object.
hiv <- hiv %>%
  mutate(
    infantSex2 = recode(
      infant_sex,
      "FEMALE        MALE" = "FEMALE",
      "MALE             MALE" = "MALE"
    )
  )

# Verify: frequency of each cleaned infant sex value
hiv %>% count(infantSex2)

# A tibble: 3 x 2
  infantSex2     n
  <chr>      <int>
1 FEMALE        68
2 MALE          74
3 <NA>          39

# Planned pregnancy: list the distinct entries to spot duplicated labels
unique(hiv$planned_pregnancy)

[1] NO                           YES                         
[3] YES                      YES
Levels: NO YES YES                      YES

# Replace the doubled "YES ... YES" entry with a single "YES" and store the
# cleaned values in a new column.
# The column is referenced by its bare name so that mutate() reads it from
# the data being piped in rather than from the global `hiv` object.
hiv <- hiv %>%
  mutate(
    planPreg = recode(
      planned_pregnancy,
      "YES                      YES" = "YES"
    )
  )

Visualization

# HIV status disclosure with marital status:
# stacked bar chart animated across the marital status states
bars <- ggplot(hiv, aes(x = disclosure_status, fill = marital_status)) +
  geom_bar() +
  transition_states(marital_status)

# Fade the bars in and out between states, then render the animation
bars2 <- bars +
  enter_fade() +
  exit_fade()
bars2

# Inspect the minimum, maximum and mean age before choosing histogram bins
range(hiv$age)
mean(hiv$age)

[1] 15 45

[1] 30.76796

# Distribution of age with marital status
# Ages run from 15 to 45, so explicit 5-year bins anchored at 15 are used
# instead of the default 30 bins (which triggers a `stat_bin()` message).
age_hist <- ggplot(hiv, aes(age, fill = marital_status)) +
  geom_histogram(binwidth = 5, boundary = 15)

# Convert the static histogram to an interactive plotly widget
ageh <- ggplotly(age_hist)

ageh

# Disclosure, age and marital status:
# box plots of age per disclosure status, animated across marital status
bp <- ggplot(
  hiv,
  aes(x = disclosure_status, y = age, fill = marital_status)
) +
  geom_boxplot() +
  transition_states(marital_status) +
  shadow_wake(wake_length = 1, alpha = FALSE, wrap = FALSE)
bp

 # Count how many records fall under each family planning method
 fp2 <- hiv %>% count(family_planning)

 # Bar chart of the counts, each bar labelled with its count.
 # The plot is drawn with geom_col() on a categorical axis, so the title
 # describes it as a bar chart rather than a histogram; the constant label
 # offset (vjust) is set outside aes() because it is not mapped to data.
 contraceptives <- ggplot(fp2, aes(family_planning, n, fill = family_planning)) +
   geom_col() +
   geom_text(aes(label = n), vjust = -0.5) +
   theme_minimal() +
   theme(legend.position = "none", axis.title.y = element_text(angle = 0)) +
   labs(title = "Bar chart of types of contraceptive")
 contraceptives

 # Family planning and pregnancy:
 # counts of planned vs unplanned pregnancy, one panel per family planning method
 pp <- ggplot(data = hiv, aes(x = planPreg, fill = family_planning)) +
   geom_bar(position = "dodge") +
   facet_wrap(~ family_planning)

 # Render the faceted chart as an interactive plotly widget
 ggplotly(pp)

The average age is about 31 years (30.8) and the median is 32. From the data, at the average age of about 31 years all participants were already married.