library(tidyverse)
library(xlsx)
library(rotemfuns)
library(vegan)
library(funrar)
library(subSeq)
dbmap = read.csv("/pita/pub/data/16S_DBs/maps/DB1-12_premap_v2.csv")
dbmap$reads_number = dbmap$reads_number %>% as.character() %>% as.numeric()
NAs introduced by coercion
done =
dbmap %>%
filter(Cohort == "eyes") %>%
# select(-Family) %>%
filter(Organism == "Human") %>%
arrange(desc(reads_number)) %>%
distinct(sample_ID, .keep_all = T)
prev_families = read.csv("DB3_control_families.csv")
done =
done %>%
full_join(dbmap %>% filter(X.SampleID %in% prev_families$X.SampleID))
Joining, by = c("X.SampleID", "BarcodeSequence", "LinkerPrimerSequence", "Barcode", "Database", "Cohort", "sample_ID", "Country", "Organism", "Dx", "FFPE_fresh", "Source", "Enzyme", "remark", "remarks2", "DUPLICATE_code", "Date", "Swab", "pn_ID", "Time", "Time2", "amount", "Age", "Gender", "Family", "position", "box", "Description", "reads_number")
done = filter(done, reads_number > 1000)
done$sample_ID = gsub("-", "_", done$sample_ID %>% as.character())
done = distinct(done, sample_ID, .keep_all = T)
names(done)[1] = "sampleid"
# write.table(done, "human/16s_map.tsv", row.names = F, sep = "\t", quote = F)
# Old
# map = read.xlsx("human/Table for rotem- Families 22.6.20.xlsx", 1)
map = read.xlsx("Table for rotem- Families RH_6Aug20.xlsx",1)
map$Sample.ID = str_replace_all(map$Sample.ID, "-", replacement = "_")
# map$Sample.ID = paste0("E_", map$Sample.ID)
done$sample_ID = gsub("E_", "", done$sample_ID)
map$pnid = paste0(map$Family,"-", map$Subject_.)
# Verify that all samples that already run fount Hadas table
setdiff(done$sample_ID , map$Sample.ID)
[1] "HY_fresh" "HR_fresh" "HP" "HD" "DA_fresh" "CU_fresh" "CR"
[8] "Cya" "Cyo" "FS" "FO" "CK" "CO" "CS"
[15] "BM" "BS" "NE" "NN" "NS"
intersect(done$sample_ID , map$Sample.ID) %>% length() == length(done$sample_ID)
[1] FALSE
names(map)[1] = "sample_ID"
# I made some manual cahnges for this file, including: unify age, family pnID
# done %>%
# select(-Dx, -FFPE_fresh, -Source, -Enzyme, -remark, remarks2) %>%
# left_join(map %>% select(-Family), by = 'sample_ID') %>%
# write.table("16s_map.tsv", row.names = F, sep = "\t", quote = F)
Samples cross QC (Reads above 2000)
data = read.csv("16s_map.tsv",sep = "\t")
sum(data$reads_number > 2000)
[1] 183
Total number of families that passed QC
data %>%
filter(reads_number > 2000) %>%
select(Family) %>%
unique() %>%
nrow()
[1] 26
PCoA not included the families from old DB3
pca = read_core_pcoa("res/core-metrics-results/unweighted_unifrac_pcoa_results.qza")
names(pca) = paste0("PCoA_", 1:length(names(pca)))
data =
data %>%
left_join(pca %>% rownames_to_column("sampleid"))
Joining, by = "sampleid"Column `sampleid` joining factor and character vector, coercing into character vector
pl =
ggplot(data, aes(x = PCoA_1, y = PCoA_2, color = Age, name = sample_ID)) +
geom_point() +
theme_rh
plotly::ggplotly(pl)
pl =
ggplot(data, aes(x = PCoA_1, y = PCoA_2, color = Family, name = sample_ID)) +
geom_point() +
theme_rh
plotly::ggplotly(pl)
pl =
ggplot(data, aes(x = PCoA_1, y = PCoA_2, color = Gender, name = sample_ID)) +
geom_point() +
theme_rh
plotly::ggplotly(pl)
biom_path = "/pita/pub/data/16S_DBs/processed/16S_DB1-12_merged/feature-table.biom"
biom = read_biom(biom_path)
samples_to_filter = done$sampleid
colnums = match(as.character(samples_to_filter), names(biom)) %>% na.omit()
biom =
biom %>%
select(1, colnums) %>%
column_to_rownames(names(biom)[1])
biom =
biom %>%
distinct(.keep_all = T)
biom %>% dim()
[1] 2660 185
LS0tCnRpdGxlOiAiZmlsdGVyIHRoZSBmaWxlcyBmb3IgYW5hbHlzaXMiCm91dHB1dDogaHRtbF9ub3RlYm9vawotLS0KCmBgYHtyfQpsaWJyYXJ5KHRpZHl2ZXJzZSkKbGlicmFyeSh4bHN4KQpsaWJyYXJ5KHJvdGVtZnVucykKbGlicmFyeSh2ZWdhbikKbGlicmFyeShmdW5yYXIpCmxpYnJhcnkoc3ViU2VxKQpgYGAKCmBgYHtyfQpkYm1hcCA9IHJlYWQuY3N2KCIvcGl0YS9wdWIvZGF0YS8xNlNfREJzL21hcHMvREIxLTEyX3ByZW1hcF92Mi5jc3YiKQpkYm1hcCRyZWFkc19udW1iZXIgPSBkYm1hcCRyZWFkc19udW1iZXIgJT4lIGFzLmNoYXJhY3RlcigpICU+JSBhcy5udW1lcmljKCkKZG9uZSA9IApkYm1hcCAlPiUgCiAgZmlsdGVyKENvaG9ydCA9PSAiZXllcyIpICU+JSAKICAjIHNlbGVjdCgtRmFtaWx5KSAlPiUgCiAgZmlsdGVyKE9yZ2FuaXNtID09ICJIdW1hbiIpICU+JSAKICBhcnJhbmdlKGRlc2MocmVhZHNfbnVtYmVyKSkgJT4lIAogIGRpc3RpbmN0KHNhbXBsZV9JRCwgLmtlZXBfYWxsID0gVCkKCnByZXZfZmFtaWxpZXMgPSByZWFkLmNzdigiREIzX2NvbnRyb2xfZmFtaWxpZXMuY3N2IikgICAgCgpkb25lID0gCmRvbmUgJT4lIAogIGZ1bGxfam9pbihkYm1hcCAlPiUgZmlsdGVyKFguU2FtcGxlSUQgJWluJSBwcmV2X2ZhbWlsaWVzJFguU2FtcGxlSUQpKQoKZG9uZSA9IGZpbHRlcihkb25lLCByZWFkc19udW1iZXIgPiAxMDAwKQpkb25lJHNhbXBsZV9JRCA9IGdzdWIoIi0iLCAiXyIsIGRvbmUkc2FtcGxlX0lEICU+JSBhcy5jaGFyYWN0ZXIoKSkKZG9uZSA9IGRpc3RpbmN0KGRvbmUsIHNhbXBsZV9JRCwgLmtlZXBfYWxsID0gVCkKbmFtZXMoZG9uZSlbMV0gPSAic2FtcGxlaWQiCiMgd3JpdGUudGFibGUoZG9uZSwgImh1bWFuLzE2c19tYXAudHN2Iiwgcm93Lm5hbWVzID0gRiwgc2VwID0gIlx0IiwgcXVvdGUgPSBGKQpgYGAKCmBgYHtyfQojIE9sZAojIG1hcCA9IHJlYWQueGxzeCgiaHVtYW4vVGFibGUgZm9yIHJvdGVtLSBGYW1pbGllcyAyMi42LjIwLnhsc3giLCAxKQptYXAgPSByZWFkLnhsc3goIlRhYmxlIGZvciByb3RlbS0gRmFtaWxpZXMgUkhfNkF1ZzIwLnhsc3giLDEpCm1hcCRTYW1wbGUuSUQgPSBzdHJfcmVwbGFjZV9hbGwobWFwJFNhbXBsZS5JRCwgIi0iLCByZXBsYWNlbWVudCA9ICJfIikKIyBtYXAkU2FtcGxlLklEID0gcGFzdGUwKCJFXyIsIG1hcCRTYW1wbGUuSUQpCmRvbmUkc2FtcGxlX0lEID0gZ3N1YigiRV8iLCAiIiwgZG9uZSRzYW1wbGVfSUQpCm1hcCRwbmlkID0gcGFzdGUwKG1hcCRGYW1pbHksIi0iLCBtYXAkU3ViamVjdF8uKQojIFZlcmlmeSB0aGF0IGFsbCBzYW1wbGVzIHRoYXQgYWxyZWFkeSBydW4gZm91bnQgSGFkYXMgdGFibGUKc2V0ZGlmZihkb25lJHNhbXBsZV9JRCAsIG1hcCRTYW1wbGUuSUQpCmludGVyc2VjdChkb25lJHNhbXBsZV9JRCAsIG1hcCRTYW1wbGUuSUQpICU+JSBsZW5ndGgoKSA9PSBsZW5ndGgoZG9uZSRzYW1wbGVfSUQpCgpuYW1lcyhtYXApWzFdID0gInNhbXBsZV9JRCIKCiMgIEkgbWFkZSBzb21lIG1hbnVhbCBjYWhuZ2VzIGZvciB0aGlzIGZpbGUsIGluY2x1ZGluZzogdW5pZnkgYWdlLCBmYW1pbHkgcG5JRAojIGRvbmUgJT4lCiMgICBzZWxlY3QoLUR4LCAtRkZQRV9mcmVzaCwgLVNvdXJjZSwgLUVuenltZSwgLXJlbWFyaywgcmVtYXJrczIpICU+JSAKIyAgIGxlZnRfam9pbihtYXAgJT4lIHNlbGVjdCgtRmFtaWx5KSwgYnkgPSAnc2FtcGxlX0lEJykgJT4lIAojICAgd3JpdGUudGFibGUoIjE2c19tYXAudHN2Iiwgcm93Lm5hbWVzID0gRiwgc2VwID0gIlx0IiwgcXVvdGUgPSBGKQpgYGAKCgoKU2FtcGxlcyBjcm9zcyBRQyAoUmVhZHMgYWJvdmUgMjAwMCkKYGBge3J9CmRhdGEgPSByZWFkLmNzdigiMTZzX21hcC50c3YiLHNlcCA9ICJcdCIpCnN1bShkYXRhJHJlYWRzX251bWJlciA+IDIwMDApCmBgYApUb3RhbCBudW1iZXIgb2YgZmFtaWxpZXMgdGhhdCBwYXNzZWQgUUMKYGBge3J9CmRhdGEgJT4lIAogIGZpbHRlcihyZWFkc19udW1iZXIgPiAyMDAwKSAlPiUgCiAgc2VsZWN0KEZhbWlseSkgJT4lIAogIHVuaXF1ZSgpICU+JSAKICBucm93KCkKYGBgCgpQQ29BIG5vdCBpbmNsdWRlZCB0aGUgZmFtaWxpZXMgZnJvbSBvbGQgREIzCmBgYHtyfQpwY2EgPSByZWFkX2NvcmVfcGNvYSgicmVzL2NvcmUtbWV0cmljcy1yZXN1bHRzL3Vud2VpZ2h0ZWRfdW5pZnJhY19wY29hX3Jlc3VsdHMucXphIikKbmFtZXMocGNhKSA9IHBhc3RlMCgiUENvQV8iLCAxOmxlbmd0aChuYW1lcyhwY2EpKSkKYGBgCgpgYGB7cn0KZGF0YSA9IApkYXRhICU+JSAKICBsZWZ0X2pvaW4ocGNhICU+JSByb3duYW1lc190b19jb2x1bW4oInNhbXBsZWlkIikpIAoKcGwgPSAKZ2dwbG90KGRhdGEsIGFlcyh4ID0gUENvQV8xLCB5ID0gUENvQV8yLCBjb2xvciA9IEFnZSwgbmFtZSA9IHNhbXBsZV9JRCkpICsgCiAgZ2VvbV9wb2ludCgpICsgCiAgdGhlbWVfcmgKcGxvdGx5OjpnZ3Bsb3RseShwbCkKYGBgCgpgYGB7cn0KcGwgPSAKZ2dwbG90KGRhdGEsIGFlcyh4ID0gUENvQV8xLCB5ID0gUENvQV8yLCBjb2xvciA9IEZhbWlseSwgbmFtZSA9IHNhbXBsZV9JRCkpICsgCiAgZ2VvbV9wb2ludCgpICsgCiAgdGhlbWVfcmgKcGxvdGx5OjpnZ3Bsb3RseShwbCkKYGBgCgpgYGB7cn0KcGwgPSAKZ2dwbG90KGRhdGEsIGFlcyh4ID0gUENvQV8xLCB5ID0gUENvQV8yLCBjb2xvciA9IEdlbmRlciwgbmFtZSA9IHNhbXBsZV9JRCkpICsgCiAgZ2VvbV9wb2ludCgpICsgCiAgdGhlbWVfcmgKcGxvdGx5OjpnZ3Bsb3RseShwbCkKYGBgCgoKYGBge3J9CmJpb21fcGF0aCA9ICIvcGl0YS9wdWIvZGF0YS8xNlNfREJzL3Byb2Nlc3NlZC8xNlNfREIxLTEyX21lcmdlZC9mZWF0dXJlLXRhYmxlLmJpb20iCmJpb20gPSByZWFkX2Jpb20oYmlvbV9wYXRoKQpzYW1wbGVzX3RvX2ZpbHRlciA9IGRvbmUkc2FtcGxlaWQKCmNvbG51bXMgPSBtYXRjaChhcy5jaGFyYWN0ZXIoc2FtcGxlc190b19maWx0ZXIpLCBuYW1lcyhiaW9tKSkgJT4lIG5hLm9taXQoKQpiaW9tID0gCiAgYmlvbSAlPiUgCiAgICBzZWxlY3QoMSwgY29sbnVtcykgJT4lIAogICAgY29sdW1uX3RvX3Jvd25hbWVzKG5hbWVzKGJpb20pWzFdKQoKYmlvbSA9CiAgYmlvbSAlPiUgCiAgZGlzdGluY3QoLmtlZXBfYWxsID0gVCkKCmJpb20gJT4lIGRpbSgpCmBgYAo=