Import sheet 2 of excel data file
library(rio)
dat <- import('C:/Users/Dell/Desktop/Biomedical data analysis with R Intro series/Dataset/nature13421-s2.xlsx', sheet = 2,skip = 1)
str(dat)
## 'data.frame': 1003 obs. of 18 variables:
## $ Cohort : chr NA "Healthy Singleton Birth Cohort" "Healthy Singleton Birth Cohort" "Healthy Singleton Birth Cohort" ...
## $ Family ID : chr NA "Bgsng7035" "Bgsng7035" "Bgsng7035" ...
## $ Child ID : chr NA "Bgsng7035" "Bgsng7035" "Bgsng7035" ...
## $ Fecal Sample ID : chr NA "Bgsng7035.m1" "Bgsng7035.m2" "Bgsng7035.m3" ...
## $ Age, days : num NA 5 37 59 87 110 157 200 219 261 ...
## $ Age, months : num NA 0.164 1.216 1.938 2.858 ...
## $ WHZ : num NA -0.37 NA NA -1.13 NA NA -0.7 NA NA ...
## $ HAZ : num NA -1.4 NA NA -1.6 NA NA -1.49 NA NA ...
## $ WAZ : num NA -1.24 NA NA -2.05 NA NA -1.48 NA NA ...
## $ Diet at time of fecal sample collection : chr "Breast Milk" "Yes" "Yes" "Yes" ...
## $ ...11 : chr "Formula1" "Yes" "Yes" "Yes" ...
## $ ...12 : chr "Solid Foods2" "No" "Yes" "Yes" ...
## $ diarrhoea at the time of sample collection3 : chr NA "No" "No" "No" ...
## $ Antibiotics within 7 days prior to sample collection: chr NA "No" "Yes" "No" ...
## $ Medications (Antibiotics and other) 4 : chr NA NA "Flucloxacillin sodium, Paracetamol, Chlorpheniramine maleate" "Levosulbutamol sulphate" ...
## $ Number of high quality V4-16S rRNA sequences : num NA 25936 26663 25243 27483 ...
## $ 16S rRNA Sequencing Run ID : chr NA "9" "9" "9" ...
## $ Sample specific barcode sequence : chr NA "GGATAGCCACTTC" "TTGGGCGTGAACT" "TAACAAGGAACGC" ...
Rename column names to sensible names, remove redundant rows
library(tidyverse)
names(dat) = c("cohort", "family.id", "Child.id", "fecalSample.id", "agedays", "agemonths", "whz", "haz", "daz", "die.breastmilk", "die.fomula", "die.solidfoods", "diarrhea.sample3", "antibioticsbefore", "medications", "number.v4.rNA" , "16s.rna.id", "sample.sequences" )
dat = dat %>% filter(!is.na(whz) & !is.na(medications))
str(dat)
## 'data.frame': 126 obs. of 18 variables:
## $ cohort : chr "Healthy Singleton Birth Cohort" "Healthy Singleton Birth Cohort" "Healthy Singleton Birth Cohort" "Healthy Singleton Birth Cohort" ...
## $ family.id : chr "Bgsng7035" "Bgsng7035" "Bgsng7035" "Bgsng7106" ...
## $ Child.id : chr "Bgsng7035" "Bgsng7035" "Bgsng7035" "Bgsng7106" ...
## $ fecalSample.id : chr "Bgsng7035.m7" "Bgsng7035.m16" "Bgsng7035.m22" "Bgsng7106.m18" ...
## $ agedays : num 200 463 647 556 106 99 198 299 366 635 ...
## $ agemonths : num 6.57 15.21 21.26 18.27 3.48 ...
## $ whz : num -0.7 -0.66 0.05 -1.33 -1.98 0.23 0.5 0.8 0.25 0.44 ...
## $ haz : num -1.49 -2.03 -1.96 -1.71 0.47 0.32 -0.75 0.6 0.09 -0.25 ...
## $ daz : num -1.48 -1.42 -0.93 -1.77 -1.03 0.39 -0.12 0.88 0.24 0.18 ...
## $ die.breastmilk : chr "Yes" "Yes" "Yes" "Yes" ...
## $ die.fomula : chr "Yes" "No" "No" "No" ...
## $ die.solidfoods : chr "Yes" "Yes" "Yes" "Yes" ...
## $ diarrhea.sample3 : chr "No" "Yes" "No" "No" ...
## $ antibioticsbefore: chr "Yes" "Yes" "Yes" "Yes" ...
## $ medications : chr "Flucloxacillin sodium, Chlorpheniramine maleate" "Erythromycin stearate, Paracetamol, Oral rehydration saline, Folic acid" "Erythromycin stearate, Folic acid, Oral rehydration saline" "Amoxycillin trihydrate, Paracetamol, Sulbutamol" ...
## $ number.v4.rNA : num 24733 22722 22252 19806 27659 ...
## $ 16s.rna.id : chr "9" "9" "9" "9" ...
## $ sample.sequences : chr "ACTACAGCCTATG" "AAGGAGCGCCTTA" "GGATGGTGTTGCT" "TGAGTCACTGGTG" ...
Save my cleaned data to local disk
write.csv(dat, 'C:/Users/Dell/Desktop/Biomedical data analysis with R Intro series/Dataset/datsession1b.csv', row.names = FALSE)
#I know that using 'dir'object stored path and paste/paste0 funtion is cleaner and more readable, but now i just want to test 'manual way' and know how hard it is
Open my cleaned data
save(dat, file ='C:/Users/Dell/Desktop/Biomedical data analysis with R Intro series/Dataset/session.rda')
load('C:/Users/Dell/Desktop/Biomedical data analysis with R Intro series/Dataset/session.rda')
Make a table showing number of fecal sample per each child ID
a = dat$fecalSample.id
a[!is.na(a)] <- 1
a <- as.numeric(a)
tapply(a, dat$Child.id, sum)
## Bgsng7018 Bgsng7035 Bgsng7052 Bgsng7071 Bgsng7082 Bgsng7090 Bgsng7106 Bgsng7114
## 4 3 1 1 3 6 1 1
## Bgsng7115 Bgsng7131 Bgsng7142 Bgsng7149 Bgsng7150 Bgsng7155 Bgsng7173 Bgsng7177
## 1 5 4 5 5 3 4 4
## Bgsng7178 Bgsng7192 Bgsng7202 Bgsng7204 Bgsng8064 Bgsng8169 Bgtw1.T1 Bgtw1.T2
## 4 5 2 4 1 2 10 7
## Bgtw10.T1 Bgtw10.T2 Bgtw11.T1 Bgtw12.T1 Bgtw12.T2 Bgtw2.T1 Bgtw2.T2 Bgtw3.T1
## 1 1 1 4 1 3 2 1
## Bgtw3.T2 Bgtw4.T1 Bgtw4.T2 Bgtw4.T3 Bgtw5.T2 Bgtw6.T2 Bgtw7.T1 Bgtw7.T2
## 2 1 1 1 1 1 3 3
## Bgtw8.T1 Bgtw8.T2 Bgtw9.T1 Bgtw9.T2
## 3 6 3 1
#Other way for data with cleaned fecalSample.id object
dat %>% group_by(Child.id) %>% summarise(fecal.sample.number = n())
## # A tibble: 44 x 2
## Child.id fecal.sample.number
## <chr> <int>
## 1 Bgsng7018 4
## 2 Bgsng7035 3
## 3 Bgsng7052 1
## 4 Bgsng7071 1
## 5 Bgsng7082 3
## 6 Bgsng7090 6
## 7 Bgsng7106 1
## 8 Bgsng7114 1
## 9 Bgsng7115 1
## 10 Bgsng7131 5
## # … with 34 more rows
#table(dat$Child.id) also results in the same desired output as the way above
Summarize number of fecal samples in all child ID and by cohort of singletons vs twins/triplets
b <-dat %>% group_by(cohort) %>% summarise(fecal.sample.number = n())
b
## # A tibble: 2 x 2
## cohort fecal.sample.number
## <chr> <int>
## 1 Healthy Singleton Birth Cohort 69
## 2 Healthy Twins & Triplets 57
Plot number of fecal samples by cohort of singletons vs twins/triplets
barplot(b$fecal.sample.number, names.arg = b$cohort, ylab = "Number of fecal samples", ylim = c(0,80), col = c("blue","green"), border = "red ",main = "The number of fecal samples by cohort ")
