Import sheet 2 of excel data file

library(rio)
dat <- import('C:/Users/Dell/Desktop/Biomedical data analysis with R Intro series/Dataset/nature13421-s2.xlsx', sheet = 2,skip = 1)
str(dat)
## 'data.frame':    1003 obs. of  18 variables:
##  $ Cohort                                              : chr  NA "Healthy Singleton Birth Cohort" "Healthy Singleton Birth Cohort" "Healthy Singleton Birth Cohort" ...
##  $ Family ID                                           : chr  NA "Bgsng7035" "Bgsng7035" "Bgsng7035" ...
##  $ Child ID                                            : chr  NA "Bgsng7035" "Bgsng7035" "Bgsng7035" ...
##  $ Fecal Sample ID                                     : chr  NA "Bgsng7035.m1" "Bgsng7035.m2" "Bgsng7035.m3" ...
##  $ Age, days                                           : num  NA 5 37 59 87 110 157 200 219 261 ...
##  $ Age, months                                         : num  NA 0.164 1.216 1.938 2.858 ...
##  $ WHZ                                                 : num  NA -0.37 NA NA -1.13 NA NA -0.7 NA NA ...
##  $ HAZ                                                 : num  NA -1.4 NA NA -1.6 NA NA -1.49 NA NA ...
##  $ WAZ                                                 : num  NA -1.24 NA NA -2.05 NA NA -1.48 NA NA ...
##  $ Diet at time of fecal sample collection             : chr  "Breast Milk" "Yes" "Yes" "Yes" ...
##  $ ...11                                               : chr  "Formula1" "Yes" "Yes" "Yes" ...
##  $ ...12                                               : chr  "Solid Foods2" "No" "Yes" "Yes" ...
##  $ diarrhoea at the time of sample collection3         : chr  NA "No" "No" "No" ...
##  $ Antibiotics within 7 days prior to sample collection: chr  NA "No" "Yes" "No" ...
##  $ Medications (Antibiotics and other) 4               : chr  NA NA "Flucloxacillin sodium, Paracetamol, Chlorpheniramine maleate" "Levosulbutamol sulphate" ...
##  $ Number of high quality V4-16S rRNA sequences        : num  NA 25936 26663 25243 27483 ...
##  $ 16S rRNA Sequencing Run ID                          : chr  NA "9" "9" "9" ...
##  $ Sample specific barcode sequence                    : chr  NA "GGATAGCCACTTC" "TTGGGCGTGAACT" "TAACAAGGAACGC" ...

Rename column names to sensible names, remove redundant rows

library(tidyverse)
names(dat) = c("cohort", "family.id", "Child.id", "fecalSample.id", "agedays", "agemonths", "whz", "haz", "daz", "die.breastmilk", "die.fomula", "die.solidfoods", "diarrhea.sample3", "antibioticsbefore", "medications", "number.v4.rNA" , "16s.rna.id", "sample.sequences" )
dat = dat %>% filter(!is.na(whz) & !is.na(medications))
str(dat)
## 'data.frame':    126 obs. of  18 variables:
##  $ cohort           : chr  "Healthy Singleton Birth Cohort" "Healthy Singleton Birth Cohort" "Healthy Singleton Birth Cohort" "Healthy Singleton Birth Cohort" ...
##  $ family.id        : chr  "Bgsng7035" "Bgsng7035" "Bgsng7035" "Bgsng7106" ...
##  $ Child.id         : chr  "Bgsng7035" "Bgsng7035" "Bgsng7035" "Bgsng7106" ...
##  $ fecalSample.id   : chr  "Bgsng7035.m7" "Bgsng7035.m16" "Bgsng7035.m22" "Bgsng7106.m18" ...
##  $ agedays          : num  200 463 647 556 106 99 198 299 366 635 ...
##  $ agemonths        : num  6.57 15.21 21.26 18.27 3.48 ...
##  $ whz              : num  -0.7 -0.66 0.05 -1.33 -1.98 0.23 0.5 0.8 0.25 0.44 ...
##  $ haz              : num  -1.49 -2.03 -1.96 -1.71 0.47 0.32 -0.75 0.6 0.09 -0.25 ...
##  $ daz              : num  -1.48 -1.42 -0.93 -1.77 -1.03 0.39 -0.12 0.88 0.24 0.18 ...
##  $ die.breastmilk   : chr  "Yes" "Yes" "Yes" "Yes" ...
##  $ die.fomula       : chr  "Yes" "No" "No" "No" ...
##  $ die.solidfoods   : chr  "Yes" "Yes" "Yes" "Yes" ...
##  $ diarrhea.sample3 : chr  "No" "Yes" "No" "No" ...
##  $ antibioticsbefore: chr  "Yes" "Yes" "Yes" "Yes" ...
##  $ medications      : chr  "Flucloxacillin sodium, Chlorpheniramine maleate" "Erythromycin stearate, Paracetamol, Oral rehydration saline, Folic acid" "Erythromycin stearate, Folic acid, Oral rehydration saline" "Amoxycillin trihydrate, Paracetamol, Sulbutamol" ...
##  $ number.v4.rNA    : num  24733 22722 22252 19806 27659 ...
##  $ 16s.rna.id       : chr  "9" "9" "9" "9" ...
##  $ sample.sequences : chr  "ACTACAGCCTATG" "AAGGAGCGCCTTA" "GGATGGTGTTGCT" "TGAGTCACTGGTG" ...

Save my cleaned data to local disk

write.csv(dat, 'C:/Users/Dell/Desktop/Biomedical data analysis with R Intro series/Dataset/datsession1b.csv', row.names = FALSE)

#I know that using 'dir'object stored path and paste/paste0 funtion is cleaner and more readable, but now i just want to test 'manual way' and know how hard it is

Open my cleaned data

save(dat, file ='C:/Users/Dell/Desktop/Biomedical data analysis with R Intro series/Dataset/session.rda')
load('C:/Users/Dell/Desktop/Biomedical data analysis with R Intro series/Dataset/session.rda')

Make a table showing number of fecal sample per each child ID

a = dat$fecalSample.id
a[!is.na(a)] <- 1
a <- as.numeric(a)
tapply(a, dat$Child.id, sum)
## Bgsng7018 Bgsng7035 Bgsng7052 Bgsng7071 Bgsng7082 Bgsng7090 Bgsng7106 Bgsng7114 
##         4         3         1         1         3         6         1         1 
## Bgsng7115 Bgsng7131 Bgsng7142 Bgsng7149 Bgsng7150 Bgsng7155 Bgsng7173 Bgsng7177 
##         1         5         4         5         5         3         4         4 
## Bgsng7178 Bgsng7192 Bgsng7202 Bgsng7204 Bgsng8064 Bgsng8169  Bgtw1.T1  Bgtw1.T2 
##         4         5         2         4         1         2        10         7 
## Bgtw10.T1 Bgtw10.T2 Bgtw11.T1 Bgtw12.T1 Bgtw12.T2  Bgtw2.T1  Bgtw2.T2  Bgtw3.T1 
##         1         1         1         4         1         3         2         1 
##  Bgtw3.T2  Bgtw4.T1  Bgtw4.T2  Bgtw4.T3  Bgtw5.T2  Bgtw6.T2  Bgtw7.T1  Bgtw7.T2 
##         2         1         1         1         1         1         3         3 
##  Bgtw8.T1  Bgtw8.T2  Bgtw9.T1  Bgtw9.T2 
##         3         6         3         1
#Other way for data with cleaned fecalSample.id object
dat %>% group_by(Child.id) %>% summarise(fecal.sample.number = n())
## # A tibble: 44 x 2
##    Child.id  fecal.sample.number
##    <chr>                   <int>
##  1 Bgsng7018                   4
##  2 Bgsng7035                   3
##  3 Bgsng7052                   1
##  4 Bgsng7071                   1
##  5 Bgsng7082                   3
##  6 Bgsng7090                   6
##  7 Bgsng7106                   1
##  8 Bgsng7114                   1
##  9 Bgsng7115                   1
## 10 Bgsng7131                   5
## # … with 34 more rows
#table(dat$Child.id) also results in the same desired output as the way above

Summarize number of fecal samples in all child ID and by cohort of singletons vs twins/triplets

b <-dat %>% group_by(cohort) %>% summarise(fecal.sample.number = n())
b
## # A tibble: 2 x 2
##   cohort                         fecal.sample.number
##   <chr>                                        <int>
## 1 Healthy Singleton Birth Cohort                  69
## 2 Healthy Twins & Triplets                        57

Plot number of fecal samples by cohort of singletons vs twins/triplets

barplot(b$fecal.sample.number, names.arg = b$cohort, ylab = "Number of fecal samples", ylim = c(0,80), col = c("blue","green"), border = "red ",main = "The number of fecal samples by cohort ")