Import sheet 2 of excel data file

library(rio)
fdir1 <- "C:/Users/trant/Downloads/STUDY/COURSERA/R Intro/metamicrobiomeR-master/inst/extdata/QIIME_outputs/Bangladesh/"
natural2 <-import(paste(fdir1,"nature13421-s2.xlsx",sep=""),sheet=2,skip=1)

Rename column names to sensible names, remove redundant rows

library(dplyr)
newnames <- c("cohort", 
              "family.id", 
              "child.id", 
              "fecal.sample.id",
              "age.days.id", 
              "age.months.id", 
              "whz", 
              "haz", 
              "waz", 
              "diet", 
              "...11", 
              "...12", 
              "diarrhoea", 
              "antibiotics.7days", 
              "medications", 
              "number.V4-16SrRNA", 
              "16srrna", 
              "sample.barcode")
natural2 <- natural2 %>% 
  rename_at(vars(1:18), ~newnames)

Make a table showing number of fecal sample per each child ID

fecal.child <- select(natural2, child.id, fecal.sample.id)
fecal.child <- as.data.frame(count(fecal.child, child.id)) %>% 
  rename_at(vars(2), ~"fecal.per.child") 

Summarize number of fecal samples in all child ID and by cohort of singletons vs twins/triplets

library(plyr)
subnatural2 <- filter(select(natural2, "cohort", "child.id", "fecal.sample.id"), !is.na(fecal.sample.id)) %>%
  group_by(cohort)
subnatural2 <- ddply(subnatural2,.(child.id),transform,count.fecal=length(cohort)) 
summarise(subnatural2,
          mean = mean(count.fecal, na.rm = TRUE),
          sd = sd(count.fecal, na.rm = TRUE),
          median = median(count.fecal, na.rm = TRUE),
          IQR = IQR(count.fecal, na.rm = TRUE))
##       mean       sd median IQR
## 1 20.90562 3.898605     22   5

##Plot number of fecal samples by cohort of singletons vs twins/triplets

library(ggplot2)
ggplot(subnatural2,aes(x=cohort, y=count.fecal, color=cohort)) +
  geom_boxplot()