This research project focused on the effects of HIV/AIDS on pregnant women and the treatment modalities to prevent mother-neonate transmission during parturition.
The following R packages are used for this analysis:
readxl
tidyverse
skimr
forcats
ggplot2
gganimate
gifski
plotly
# Import the first worksheet; cells holding the text "NA" are read as missing
hiv <- read_xlsx(
  path = "hiv.xlsx",
  sheet = "Sheet1",
  col_names = TRUE,
  na = "NA"
)
New names:
* `` -> `...25`
# Inspect the structure: each column's name, type and first few values
str(hiv)
tibble [181 x 25] (S3: tbl_df/tbl/data.frame)
$ CONSENT : chr [1:181] "YES" "YES" "YES" "YES" ...
$ PID : num [1:181] 101739 101762 101769 101778 101779 ...
$ ENTERED DATE : chr [1:181] "Mar.23.2021" "Mar.03.2021" "Mar.03.2021" "Mar.03.2021" ...
$ AGE AT ENROLLMENT : num [1:181] 30 41 25 26 37 32 23 26 34 40 ...
$ CCC NUMBER : num [1:181] 1.35e+09 1.35e+09 1.35e+09 1.35e+09 1.42e+09 ...
$ ART REGIMEN : chr [1:181] "TDF/3TC/DTG" "TDF/3TC/DTG" "TDF/3TC/DTG" "TDF/3TC/DTG" ...
$ DISCLOSURE STATUS : chr [1:181] "NOT DISCLOSED" "PARTNER" "FAMILY" "NOT DISCLOSED" ...
$ MARITAL STATUS : chr [1:181] "MARRIED" "MARRIED" "MARRIED" "MARRIED" ...
$ PLANNED PREGNANCY : chr [1:181] "NO" "NO" "NO" "YES" ...
$ FAMILY PLANNING : chr [1:181] "DEPO-PROVERA" "DEPO-PROVERA" "DEPO-PROVERA" "NONE" ...
$ LMP : chr [1:181] "23/2/2020" "43865" "18/2/2020" "20/2/2020" ...
$ EDD : chr [1:181] "Nov.30.2020" "Jan.09.2021" "Nov.30.2020" "Dec.08.2020" ...
$ DELIVERY DATE : chr [1:181] "44141" "44197" "18/11/2020" "44147" ...
$ DELIVERY OUTCOME : chr [1:181] "LIVE BIRTH" "LIVE BIRTH" "LIVE BIRTH" "LIVE BIRTH" ...
$ BASELINE VL RESULTS : chr [1:181] NA "LDL" "LDL" "LDL" ...
$ POSTPARTUM VL RESULTS: chr [1:181] "72" "LDL" "LDL" "LDL" ...
$ INFANT HEI NO : chr [1:181] "13476/2020/00099" "13476/2021/00015" "13476/2020/00107" "13476/2021/0014" ...
$ BIRTH WEIGHT : num [1:181] 2 4 3 3 4 NA NA 3 4 3 ...
$ INFANT GENDER : chr [1:181] "FEMALE" "MALE" "FEMALE" "MALE" ...
$ 6WEEKS PCR : chr [1:181] "NEGATIVE" "NEGATIVE" "NEGATIVE" "NEGATIVE" ...
$ 6 MONTHS PCR : chr [1:181] "NEGATIVE" "NEGATIVE" "NEGATIVE" NA ...
$ INFANT ART : chr [1:181] "NONE" "NONE" "NONE" "NONE" ...
$ ADVERSE EVENT : chr [1:181] "NONE" "NONE" "NONE" "NONE" ...
$ STUDY STATUS : chr [1:181] "PARTICIPATORY" "PARTICIPATORY" "PARTICIPATORY" "PARTICIPATORY" ...
$ ...25 : chr [1:181] NA NA NA NA ...
# List the column names exactly as they were imported
names(hiv)
[1] "CONSENT" "PID" "ENTERED DATE"
[4] "AGE AT ENROLLMENT" "CCC NUMBER" "ART REGIMEN"
[7] "DISCLOSURE STATUS" "MARITAL STATUS" "PLANNED PREGNANCY"
[10] "FAMILY PLANNING" "LMP" "EDD"
[13] "DELIVERY DATE" "DELIVERY OUTCOME" "BASELINE VL RESULTS"
[16] "POSTPARTUM VL RESULTS" "INFANT HEI NO" "BIRTH WEIGHT"
[19] "INFANT GENDER" "6WEEKS PCR" "6 MONTHS PCR"
[22] "INFANT ART" "ADVERSE EVENT" "STUDY STATUS"
[25] "...25"
# Number of rows and columns in the imported data
dim(hiv)
[1] 181 25
The column names should be changed for easier analysis. Most of the columns are not stored with an appropriate class. The last column is also removed because it is of no use.
# Format the column names: one short snake_case name per imported column,
# listed in the same order as the columns appear in the sheet.
new_names <- c(
  "consent", "pid", "entered_date", "age", "ccc_num", "art_regimen",
  "disclosure_status", "marital_status", "planned_pregnancy",
  "family_planning", "lmp", "edd", "delivery_date", "delivery_outcome",
  "baseline_vl", "postpartum_vl", "infant_num", "birth_weight", "infant_sex",
  "six_week_pcr", "six_month_pcr", "infant_art", "adverse_event",
  "study_status", "last"
)
hiv <- hiv %>% set_names(new_names)
# Alternative: make.names() turns the imported names into syntactic ones
# automatically, but the resulting names differ from the ones chosen above.
#names(hiv) <- make.names(names(hiv))
# Remove the last column (the unnamed 25th column renamed to `last` above)
hiv <- hiv %>% select(-last)
# Check the dimension of the data
dim(hiv)
[1] 181 24
# View details: per-column summary grouped by type, including missing counts,
# completeness rate and basic statistics
skimr::skim(hiv)
| Name | hiv |
| Number of rows | 181 |
| Number of columns | 24 |
| _______________________ | |
| Column type frequency: | |
| character | 20 |
| numeric | 4 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| consent | 0 | 1.00 | 2 | 3 | 0 | 2 | 0 |
| entered_date | 0 | 1.00 | 11 | 11 | 0 | 103 | 0 |
| art_regimen | 0 | 1.00 | 5 | 56 | 0 | 6 | 0 |
| disclosure_status | 0 | 1.00 | 6 | 13 | 0 | 5 | 0 |
| marital_status | 0 | 1.00 | 6 | 9 | 0 | 4 | 0 |
| planned_pregnancy | 0 | 1.00 | 2 | 28 | 0 | 3 | 0 |
| family_planning | 0 | 1.00 | 3 | 12 | 0 | 7 | 0 |
| lmp | 0 | 1.00 | 5 | 10 | 0 | 164 | 0 |
| edd | 0 | 1.00 | 11 | 11 | 0 | 162 | 0 |
| delivery_date | 36 | 0.80 | 5 | 29 | 0 | 130 | 0 |
| delivery_outcome | 26 | 0.86 | 5 | 11 | 0 | 5 | 0 |
| baseline_vl | 80 | 0.56 | 2 | 6 | 0 | 31 | 0 |
| postpartum_vl | 63 | 0.65 | 2 | 5 | 0 | 21 | 0 |
| infant_num | 42 | 0.77 | 10 | 35 | 0 | 137 | 0 |
| infant_sex | 39 | 0.78 | 4 | 21 | 0 | 4 | 0 |
| six_week_pcr | 66 | 0.64 | 8 | 27 | 0 | 3 | 0 |
| six_month_pcr | 159 | 0.12 | 8 | 8 | 0 | 2 | 0 |
| infant_art | 41 | 0.77 | 4 | 29 | 0 | 2 | 0 |
| adverse_event | 8 | 0.96 | 4 | 30 | 0 | 7 | 0 |
| study_status | 0 | 1.00 | 10 | 13 | 0 | 2 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| pid | 0 | 1.00 | 102495.41 | 573.05 | 101739.0 | 101981 | 102338 | 102968 | 103592 | ▇▅▃▃▃ |
| age | 0 | 1.00 | 30.77 | 5.81 | 15.0 | 26 | 32 | 35 | 45 | ▁▆▇▇▂ |
| ccc_num | 0 | 1.00 | 701010146.69 | 704883159.04 | 65.0 | 6415 | 1291204357 | 1347607191 | 2200010906 | ▇▁▁▇▁ |
| birth_weight | 93 | 0.49 | 3.29 | 0.59 | 1.7 | 3 | 3 | 4 | 5 | ▁▇▁▅▁ |
Observe the level of missing values. Remove all missing values if the level is low.
# Total number of missing cells across the whole data frame
hiv %>%
  is.na() %>%
  sum()
[1] 653
# Number of cells that hold a value (i.e. are not NA)
(!is.na(hiv)) %>% sum()
[1] 3691
# Total values: start from the table dimensions (rows, then columns)
c(nrow(hiv), ncol(hiv))
[1] 181 24
# Rows times columns gives the total number of cells
d1 <- nrow(hiv)
d2 <- ncol(hiv)
d1 * d2
[1] 4344
# Total values (another approach): missing cells plus non-missing cells
sum(is.na(hiv)) + sum(!is.na(hiv))
[1] 4344
# Share of all cells that are missing
hiv %>%
  is.na() %>%
  mean()
[1] 0.1503223
Some of the columns need type casting.
# Change the following columns from character to factor.
# Every column is converted in place under its own name; assigning to a
# misspelled name (e.g. `marital_statu`) would add a stray column and leave
# the real one as character.
hiv <- hiv %>%
  mutate(
    across(
      c(
        art_regimen, disclosure_status, marital_status,
        planned_pregnancy, family_planning, delivery_outcome
      ),
      factor
    )
  )
# Observe unique values for delivery outcome
unique(hiv$delivery_outcome)
[1] LIVE BIRTH <NA> LIVE BIRTHS MISCARRIAGE STILL BIRTH TWINS
Levels: LIVE BIRTH LIVE BIRTHS MISCARRIAGE STILL BIRTH TWINS
# Frequency of each delivery outcome (missing values get their own row)
hiv %>%
  count(delivery_outcome)
# A tibble: 6 x 2
delivery_outcome n
<fct> <int>
1 LIVE BIRTH 142
2 LIVE BIRTHS 1
3 MISCARRIAGE 10
4 STILL BIRTH 1
5 TWINS 1
6 <NA> 26
# Change the necessary values: fold the "LIVE BIRTHS" and "TWINS" entries into
# "LIVE BIRTH". The column is referenced by its bare name inside mutate()
# rather than through hiv$, so the expression always uses the data being piped.
hiv <- hiv %>%
  mutate(
    deliveryOutcome2 = recode(
      delivery_outcome,
      "LIVE BIRTHS" = "LIVE BIRTH",
      "TWINS" = "LIVE BIRTH"
    )
  )
# Verify
hiv %>% count(deliveryOutcome2)
# A tibble: 4 x 2
deliveryOutcome2 n
<fct> <int>
1 LIVE BIRTH 144
2 MISCARRIAGE 10
3 STILL BIRTH 1
4 <NA> 26
# Distinct values recorded for infant sex
hiv %>%
  pull(infant_sex) %>%
  unique()
[1] "FEMALE" "MALE" NA
[4] "FEMALE MALE" "MALE MALE"
# Frequency of each recorded infant sex value
hiv %>%
  count(infant_sex)
# A tibble: 5 x 2
infant_sex n
<chr> <int>
1 FEMALE 67
2 FEMALE MALE 1
3 MALE 73
4 MALE MALE 1
5 <NA> 39
# Replace values: entries listing two infants are reduced to a single sex.
# Whitespace is squished first (repeated, leading and trailing blanks
# collapsed) so the two-infant entries match the single-spaced keys however
# they were typed; the skim summary reports values up to 21 characters long
# in this column. The column is referenced by bare name, not through hiv$.
# NOTE(review): "FEMALE MALE" is counted as FEMALE only - confirm intended.
hiv <- hiv %>%
  mutate(
    infantSex2 = recode(
      stringr::str_squish(infant_sex),
      "FEMALE MALE" = "FEMALE",
      "MALE MALE" = "MALE"
    )
  )
# Verify
hiv %>% count(infantSex2)
# A tibble: 3 x 2
infantSex2 n
<chr> <int>
1 FEMALE 68
2 MALE 74
3 <NA> 39
# Pregnancy: distinct values recorded for planned pregnancy
hiv %>%
  pull(planned_pregnancy) %>%
  unique()
[1] NO YES
[3] YES YES
Levels: NO YES YES YES
# Replace values: collapse the duplicated "YES YES" entry into "YES".
# The skim summary reports a value up to 28 characters long in this column,
# so the duplicated entry evidently carries extra whitespace; squishing it
# first makes the recode match regardless of how many blanks were typed.
# The result is turned back into a factor (levels in alphabetical order).
hiv <- hiv %>%
  mutate(
    planPreg = factor(
      recode(
        stringr::str_squish(as.character(planned_pregnancy)),
        "YES YES" = "YES"
      )
    )
  )
# HIV status disclosure with marital status: bar counts per disclosure
# category, animated through the marital status groups one state at a time
bars <- hiv %>%
  ggplot(aes(x = disclosure_status, fill = marital_status)) +
  geom_bar() +
  transition_states(marital_status)
# Fade bars in and out as the animation moves between states
bars2 <- bars +
  enter_fade() +
  exit_fade()
bars2
# Check the range of the age to determine bins, then its mean
range(hiv$age)
mean(hiv$age)
[1] 15 45
[1] 30.76796
# Distribution of age with marital status.
# Ages span 15 to 45, so 5-year bins starting at 15 are set explicitly
# instead of falling back on the 30-bin default of stat_bin().
age_hist <- ggplot(hiv, aes(age, fill = marital_status)) +
  geom_histogram(binwidth = 5, boundary = 15)
# Convert the static plot to an interactive plotly widget
ageh <- ggplotly(age_hist)
ageh
# Disclosure, age and marital status: age boxplots per disclosure category,
# animated across marital status with a trailing wake behind each state
bp <- hiv %>%
  ggplot(aes(x = disclosure_status, y = age, fill = marital_status)) +
  geom_boxplot() +
  transition_states(marital_status) +
  shadow_wake(wake_length = 1, alpha = FALSE, wrap = FALSE)
bp
# Subset the column for contraception: one row per method with its count `n`
fp2 <- hiv %>% count(family_planning)
# Pictorial count of family planning: one bar per method, labelled with n
contraceptives <- ggplot(fp2, aes(family_planning, n, fill = family_planning)) +
  geom_col() +
  # vjust is a fixed offset rather than a data mapping, so it sits outside aes()
  geom_text(aes(label = n), vjust = -0.5) +
  theme_minimal() +
  theme(legend.position = "none", axis.title.y = element_text(angle = 0)) +
  # The plot shows counts per category, i.e. a bar chart rather than a histogram
  labs(title = "Bar chart of types of contraceptive")
contraceptives
# Family planning and pregnancy: counts of planned vs. unplanned pregnancies,
# drawn in a separate panel for every family planning method
pp <- hiv %>%
  ggplot(aes(x = planPreg, fill = family_planning)) +
  geom_bar(position = "dodge") +
  facet_wrap(~ family_planning)
ggplotly(pp)
The mean age is about 31 years (30.77) and the median is 32. From the data, participants at the average age of about 31 years were all already married.