R Markdown

Load paCKages

rm(list = ls())
library(here)
## here() starts at D:/NIH_Floor study/R-training
library(readr)
library(readxl)
library(haven)
library(labelled)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(broom)
library(ggplot2)

Read in data

stat_6mo <- read_dta(here("data", "data_6mo.dta"))
stat_6mo_factored <- as.data.frame(lapply(stat_6mo, to_factor))

excel_6mo <- read_excel(here("data", "data_6mo.xlsx"))
colnames_6mo <- c("dataid", "round", "breastfeed", "how_breastfeed", "formula", "solid_food",
                  "sit_up")


csv_6mo <- read_csv(here("data", "data_6mo.csv"))
## Rows: 203 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (7): dataid, q11_010, q11_020, q11_030, q11_60, q11_140, q11_240
## dbl  (8): round, child_num, q11_011, q11_021, q11_031, q11_61, q11_141, q11_241
## date (1): q1_2
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
stat_birth_survey <- read_dta(here("data", "birth_survey.dta"))
stat_birth_factored <- as.data.frame(lapply(stat_birth_survey, to_factor))

stat_anthro <- read_dta(here("data", "anthro_data.dta"))
stat_anthro_factored <- as.data.frame(lapply(stat_anthro, to_factor))

Data merging

?left_join
## starting httpd help server ... done
data_full_join <- full_join(stat_6mo_factored, stat_anthro)
## Joining with `by = join_by(dataid)`
data_full_join2 <- full_join(stat_birth_survey, stat_anthro)
## Joining with `by = join_by(dataid, child_id)`
birth_survey <- rename(stat_birth_survey, "birth_survey_status"=brith_survey_status)

merge_birth_anthr <- left_join(stat_anthro, stat_birth_survey, 
                               by = c("dataid", "child_id"))

stat_birth_survey %>% head(6)
## # A tibble: 6 × 8
##   dataid block child_id brith_survey_status reason_birth_survey_not…¹ DOB       
##   <chr>  <chr> <chr>                  <dbl> <dbl+lbl>                 <date>    
## 1 01016  01    t1                         1 NA                        2024-01-16
## 2 01017  01    t1                         1 NA                        2023-12-27
## 3 01018  01    t1                         1 NA                        2024-01-25
## 4 01020  01    t1                         1 NA                        2023-10-10
## 5 01021  01    t1                         1 NA                        2024-03-09
## 6 01022  01    t1                         1 NA                        2023-11-15
## # ℹ abbreviated name: ¹​reason_birth_survey_not_done
## # ℹ 2 more variables: q1_3 <dbl+lbl>, q1_4 <dbl+lbl>
stat_birth_survey %>% tail(6)
## # A tibble: 6 × 8
##   dataid block child_id brith_survey_status reason_birth_survey_not…¹ DOB       
##   <chr>  <chr> <chr>                  <dbl> <dbl+lbl>                 <date>    
## 1 81775  81    t1                         1 NA                        2024-09-22
## 2 81785  81    t1                         1 NA                        2024-09-15
## 3 81789  81    t1                         1 NA                        2024-08-22
## 4 81790  81    t1                         0  1 [The mother and child… 2024-08-20
## 5 81792  81    t1                         1 NA                        2024-11-16
## 6 81795  81    t1                         1 NA                        2024-09-05
## # ℹ abbreviated name: ¹​reason_birth_survey_not_done
## # ℹ 2 more variables: q1_3 <dbl+lbl>, q1_4 <dbl+lbl>
stat_birth_survey %>% dim()
## [1] 709   8
stat_birth_survey %>% nrow()
## [1] 709
dob_block_1 <- stat_birth_survey %>% 
  filter(block == "01") %>% 
  arrange(DOB) %>% 
  pull(DOB)
dob_block_1
##  [1] "2023-10-10" "2023-10-23" "2023-11-15" "2023-12-27" "2024-01-14"
##  [6] "2024-01-16" "2024-01-25" "2024-03-02" "2024-03-09" "2024-03-19"

Breastfeeding

breastfeed_6mo <- stat_6mo_factored %>% 
  filter(q11_010 == "Yes") %>% 
nrow()

bf_6mo <- stat_6mo %>% 
  filter(q11_010 == 1) %>% 
nrow()
bf_6mo
## [1] 203
bf_formula <- stat_6mo_factored %>% 
  filter(q11_010 =="Yes") %>% 
  filter(q11_010 =="Breastfeeding with formula milk") %>% 
nrow()
bf_formula
## [1] 0
mean_head_circ <- merge_birth_anthr %>% 
  group_by(q1_3) %>% 
  summarise(count = n(),
            mean_x = mean(head_circumference),
            median_x = median(head_circumference),
            sd_x = sd(head_circumference))
mean_head_circ
## # A tibble: 2 × 5
##   q1_3       count mean_x median_x  sd_x
##   <dbl+lbl>  <int>  <dbl>    <dbl> <dbl>
## 1 1 [Male]      82   42.3       42 1.12 
## 2 2 [Female]    59   41.6       41 0.985

Data Manupulation

weight_child <- stat_anthro %>% 
  mutate(Child_weight_kg = baby_weight/1000)
weight_child
## # A tibble: 141 × 7
##    dataid child_id date               baby_weight head_circumference baby_length
##    <chr>  <chr>    <chr>                    <dbl>              <dbl>       <dbl>
##  1 01016  t1       2024-09-07 12:37:…        6700                 41          68
##  2 01017  t1       2024-08-19 09:47:…        6712                 40          68
##  3 01018  t1       2024-09-17 09:10:…        6780                 41          69
##  4 01020  t1       2024-06-03 16:28:…        7033                 42          69
##  5 01022  t1       2024-09-18 09:10:…        6082                 41          67
##  6 01035  t1       2024-06-02 16:21:…        9933                 45          73
##  7 01036  t1       2024-09-05 14:33:…        7410                 41          67
##  8 02013  t1       2024-06-04 16:30:…        7213                 44          68
##  9 02015  t1       2024-09-08 13:07:…        6838                 41          66
## 10 02019  t1       2024-06-03 04:48:…        7900                 44          68
## # ℹ 131 more rows
## # ℹ 1 more variable: Child_weight_kg <dbl>

Mutate with Condition

stat_6mo <- stat_6mo %>% 
  mutate(twin_hh = if_else(child_num == 1, "single",
                           "tiwn"))
stat_6mo <- stat_6mo %>% 
  mutate(child_num = case_when(
    child_num == 1 ~"single",
    child_num == 2 ~"twins",
    child_num == 3 ~"triplets"))

Reshape

wide_data_by_sex <- mean_head_circ %>% 
  pivot_wider(names_from = q1_3,
              values_from = mean_x)
wide_data_by_sex
## # A tibble: 2 × 5
##   count median_x  sd_x   `1`   `2`
##   <int>    <dbl> <dbl> <dbl> <dbl>
## 1    82       42 1.12   42.3  NA  
## 2    59       41 0.985  NA    41.6

long to wide

q11_01_long <- stat_6mo %>% 
  select(dataid, child_num, q11_010, q11_011) %>%
  pivot_longer(cols = c("q11_010", "q11_011"),
               names_prefix = "q11_01",
               names_to = "child_id",
               values_to = "q11_01")


q11_01_long <- q11_01_long %>% 
  mutate(child_id = ifelse(child_id == 0, "t1", "t2"))

Chisq test

x <- c(25, 75)
y <- c(14, 50)
df <- data.frame(x, y)
matrix <- as.matrix(df)
martix <- matrix(x, y)
chisq.test(matrix)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  matrix
## X-squared = 0.07319, df = 1, p-value = 0.7867

t-test

bs_hc_male <- stat_birth_survey %>% 
  filter(q1_3 == 1) %>%  pull(q1_4)
bs_hc_female <- stat_birth_survey %>% 
  filter(q1_3 == 2) %>%  pull(q1_4)
t.test(bs_hc_male, bs_hc_female)
## 
##  Welch Two Sample t-test
## 
## data:  bs_hc_male and bs_hc_female
## t = -0.11905, df = 701.53, p-value = 0.9053
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.03740469  0.03312792
## sample estimates:
## mean of x mean of y 
## 0.9383378 0.9404762
logit_mod <- glm(q1_4 ~ q1_3,
                family = binomial(link = "logit"),
                data = stat_birth_survey)

summary(logit_mod) 
## 
## Call:
## glm(formula = q1_4 ~ q1_3, family = binomial(link = "logit"), 
##     data = stat_birth_survey)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  2.68487    0.48837   5.498 3.85e-08 ***
## q1_3         0.03757    0.31543   0.119    0.905    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 324.37  on 708  degrees of freedom
## Residual deviance: 324.35  on 707  degrees of freedom
## AIC: 328.35
## 
## Number of Fisher Scoring iterations: 5
csvwashb <- read_csv(here("data", "washb_subset.csv"))
## Rows: 12438 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): tr, season
## dbl  (1): diar7d
## date (1): date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
wash_subset <- csvwashb %>%
mutate(tr= factor(tr, levels = c ("Control", #reference 
                                  "Water",
                                  "Sanitation",
                                  "Handwashing",
                                  "Combined WSH")))

tr_mod <- glm(diar7d ~ tr, family = binomial(link="log"), data =wash_subset )
summary(tr_mod) 
## 
## Call:
## glm(formula = diar7d ~ tr, family = binomial(link = "log"), data = wash_subset)
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -2.86755    0.06919 -41.444  < 2e-16 ***
## trWater        -0.12261    0.12385  -0.990 0.322186    
## trSanitation   -0.52672    0.14644  -3.597 0.000322 ***
## trHandwashing  -0.49691    0.14356  -3.461 0.000537 ***
## trCombined WSH -0.44799    0.10944  -4.094 4.25e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4444.1  on 12437  degrees of freedom
## Residual deviance: 4415.8  on 12433  degrees of freedom
## AIC: 4425.8
## 
## Number of Fisher Scoring iterations: 6
hist(stat_anthro$baby_weight, 
     main="Histtogram of baby", 
     xlab = "Baby weight")

# ggpolt

ggplot(stat_anthro,
        aes(x= baby_weight, y= baby_length))+
  geom_point() +
  xlab("Baby weight in gm") +
  ylab("Baby lenght in cm") +
  ggtitle("Six month Baby weight & Length") +
  theme_minimal()

ggplot(merge_birth_anthr,
        aes(x= baby_weight, y= baby_length,
            group = factor(q1_3), color = factor(q1_3))) +
  geom_point() +
  xlab("Baby weight in gm") +
  ylab("Baby lenght in cm") +
  ggtitle("Six month Baby weight & Length")+
  theme_minimal()

ggplot(merge_birth_anthr,
        aes(x= baby_weight, y= baby_length,
            group = factor(q1_3, levels = c(1,2)), color = factor(q1_3))) +
  geom_point(size = 3, shape = 1) +
  scale_color_manual(values = c("blue", "red"),
                     labels = c("Male", "Female")) +
  xlab("Baby weight in gm") +
  ylab("Baby lenght in cm") +
  ggtitle("Six month Baby weight & Length")+
  theme_minimal()

stat_birth_factored <- stat_birth_survey %>% 
  mutate(sex = factor(q1_3,
                      levels = c(1,2),
                      labels = c("Male", "Female")))
merge_birth_anthr <- left_join(stat_anthro, stat_birth_factored, by = c("dataid", "child_id"))


ggplot(merge_birth_anthr,
        aes(x= sex, y= baby_weight,
            group = sex,
            colour = sex)) +
  geom_boxplot() +
  scale_color_manual(values = c("blue", "red"),
                     labels = c("Male", "Female")) +
  xlab("Sex") +
  ylab("Baby weight in gm") +
  ggtitle("Six month Baby weight by sex")+
  theme_minimal()

library(ggplot2)
ggplot2::ggplot(merge_birth_anthr,
       aes(x=baby_weight, 
           y=baby_length, 
           colour = sex)) +
  geom_point(size =2) +
  facet_wrap(~ sex) +
  xlab("Baby weight in gm") +
  ylab("Sex") +
  ggtitle("Six month Baby weight by sex") +
  theme_minimal()

model3 <- tidy(tr_mod)
model4 <- model3  %>% 
  filter(term != "(Intercept)") %>% 
    mutate(prev_ratio = exp(estimate),
           lower_ci = exp(estimate - 1.96*std.error),
           upper_ci = exp(estimate + 1.96*std.error))


ggplot(model4, aes(x = term,
                   y = prev_ratio,
                   group = term,
                   colour = term)) +
geom_point(stat = "identity", color ="blue", fill = "#999999") +
geom_errorbar(aes(ymin = lower_ci, ymax = upper_ci,
                  width = 0.2),
              position = position_dodge(0.5)
              ) +
  
  xlab("Diarrhea Prevalence Ratio (95% CI)") +
  ylab("Treatment")

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

summary(cars)
##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.