library(tidyverse)
library(xlsx)
# library(rotemfuns)
library(vegan)
library(funrar)
library(subSeq)
library(qiime2R)
library(ggpubr)
library(magrittr)
library(forcats)
library(rmarkdown)
library(kableExtra)
library(gghighlight)
library(microbiomeutilities)
source("/pita/users/rotem/scripts/qiime2_funcs.R")
if(getwd() == "/pita/users/rotem/eyes/") { setwd("human") }
hadasmap = read.xlsx("Table for Rotem2- Families 12.11.20.xlsx", 1)
hadasmap %<>% pivot_longer(cols = starts_with("DNA_Extracted_from")
, names_to = "Sample.type", values_to = "ID", values_drop_na = T)
hadasmap$Sample.ID = hadasmap$ID
hadasmap$Sample.ID = str_replace_all(hadasmap$Sample.ID, "E_", replacement = "")
hadasmap$Sample.ID = str_replace_all(hadasmap$Sample.ID, "-", replacement = "_")
hadasmap$Sample.ID = str_replace_all(hadasmap$Sample.ID, "_(?=[FSM])", "")
hadasmap$Age %<>% as.character() %>% as.numeric()
# map$Sample.ID = paste0("E_", map$Sample.ID)
hadasmap$pnid = paste0(hadasmap$Family,"-", hadasmap$Subject_.)
# Verify that all samples that already run found at Hadas table
# setdiff(done$sample_ID , map$sample_ID)
# intersect(done$sample_ID , hadasmap$sample_ID) %>% length() == length(done$sample_ID)
names(hadasmap)[96] = "sample_ID"
hadasmap %<>% na_if("-")
col_to_dbl = c(4, 8:37, 39:43, 48,49, 51, 52, 54, 56:59)
hadasmap %<>% mutate_at(col_to_dbl, funs(as.numeric))
hadasmap$Subject_. <- as.factor(hadasmap$Subject_.)
dbmap = read.csv("/pita/pub/data/16S_DBs/maps/DB1-14_premap_v1.csv")
dbmap$reads_number = dbmap$reads_number %>% as.character() %>% as.numeric()
done =
dbmap %>%
filter(Cohort == "eyes") %>%
# select(-Family) %>%
filter(Organism == "Human") %>%
arrange(desc(reads_number)) %>%
distinct(sample_ID, .keep_all = T)
done$Family = str_extract(gsub("E_", "", done$sample_ID), "F[RH][0-9]{1,2}")
prev_families = read.csv("DB3_control_families.csv")
done =
done %>%
full_join(dbmap %>% filter(X.SampleID %in% prev_families$X.SampleID))
# done = filter(done, reads_number > 1000)
done$sample_ID = gsub("E_", "", done$sample_ID %>% as.character())
done$sample_ID = gsub("-", "_", done$sample_ID %>% as.character())
done = distinct(done, sample_ID, .keep_all = T)
done$Age %<>% as.character() %>% as.numeric()
names(done)[1] = "sampleid"
# write.table(done, "16s_map.tsv", row.names = F, sep = "\t", quote = F)
names(prev_families)[1] = "sampleid"
cutoff = 2000
number_pass_reads_co = intersect(hadasmap$sample_ID,
done %>% filter(reads_number > cutoff) %>% pull(sample_ID)) %>%
length()
At Hadas table (for humans) there are 398 samples. From all that samples 241 samples sequenced. 223 samples have more than 2000 reads per sample.
allmap = done %>% select(sample_ID, sampleid, reads_number, Database, Cohort, Family) %>%
left_join(hadasmap %>% select(-ID, -Family), by = "sample_ID") %>%
filter(reads_number > cutoff)
allmap %<>% left_join( prev_families %>% select(sampleid, Family, Age, Gender)
, by = "sampleid")
# allmap = full_join(tt
# , prev_families %>% select(X.SampleID, sample_ID, ID, Age, Gender, Family)
# , by = "sample_ID")
allmap = unite(allmap, "Family", contains("Family."), na.rm = T)
allmap = unite(allmap, "Age", contains("Age."), na.rm = T)
allmap = unite(allmap, "Gender", contains("Gender."), na.rm = T)
allmap$Gender = gsub("Male", "M", allmap$Gender)
allmap$Gender = gsub("Female", "F", allmap$Gender)
allmap$Age = allmap$Age %>% as.numeric()
allmap$Disease.group %<>% replace_na(0)
map = sample_data(allmap)
sample_names(map) = map$sampleid
# write.xlsx(hadasmap, "hadasmap_for_rotem_12Nov20.xlsx")
# hadasmap %>% filter(Family == "FR5") %>% select(ID)
# map %>% data.frame() %>% filter(is.na(Sample.type))
allmap$Sample.type[is.na(allmap$Sample.type)] = "DNA_Extracted_from_feaces...F."
allmap %>%
group_by(Family, Sample.type) %>% summarise(number_of_samples = n()) %>%
group_by(Sample.type) %>%
summarise(number_of_samples = sum(number_of_samples), number_of_families = n()) %>%
ggplot(aes(x = Sample.type, y = number_of_samples)) +
scale_x_discrete(labels = c("Feces", "Mouth", "Saliva")) +
scale_y_continuous(expand = expansion(mult = c(0, .1))) +
geom_segment(color = "red", aes(xend = Sample.type, y=0, yend = number_of_samples)) +
geom_point(aes(size = number_of_families),
color="red", fill="orange", alpha=0.7, shape=21, stroke=2) +
geom_text(aes(label = number_of_families)) +
theme_q2r()+
theme(text = element_text(size = 14))
allmap %>% group_by(Family) %>%
summarise(patients = sum(Disease.group == 1 | Disease.group == 2)) %>%
filter(patients > 0) %>%
ggplot(aes(y = patients, x = Family, color = Family)) +
geom_point() + theme_q2r() + ggtitle("Number of patients in each family")
table =
allmap %>% filter(grepl("FR.*",Family)) %>% #filter(!is.na(Age)) %>%
group_by(pnid) %>%
summarise( Family = first(Family), Age = first(Age)
, Age_on_diagnosis = first(Age_on_diagnosis)
, Retinal_degeneration_type = first(Retinal_degeneration_type)) %>%
mutate(pnid = fct_reorder(pnid, Age)) %>%
mutate(pnid = fct_reorder(pnid, Family)) %>%
mutate(pnid = fct_reorder(pnid, Retinal_degeneration_type)) %>%
mutate(Family = gsub("FR", "", Family))
table %<>%
left_join(table %>% group_by(Family) %>%
summarise(Family_Disease = first(Retinal_degeneration_type)))
# mutate(Family = fct_reorder(pnid, Family_Disease))
# table$Family = with(table, fct_reorder(Family, Family_Disease))
pl = table %>%
ggplot(aes(x = pnid, color = Retinal_degeneration_type)) +
geom_point(aes(y = Age)) +
geom_linerange(aes( ymin = Age_on_diagnosis, ymax = Age)) +
facet_grid(.~Family, scales = "free_x", space = "free_x") +
labs(subtitle = "Family number") +
theme( axis.text.x = element_blank()
, axis.ticks.x = element_blank()
, axis.title.x = element_blank()
, strip.background = element_blank()
, panel.grid = element_blank()
, panel.background = element_rect(color="grey25", linetype = "blank")
, legend.position = "bottom"
)
pl
pnid Family
1 FR7-4 FR7
2 FR10-6 FR10
3 <NA> FR15
4 FR12-6 FR12
5 FR9-2 FR9
6 FR6-2 FR6
7 <NA> FR15
8 <NA> FR15
9 FR7-3 FR7
10 <NA> FR15
11 FR3-5 FR3
12 <NA> FR15
13 FR5-1 FR5
14 FR4-4 FR4
15 <NA> FR15
16 FR5-4 FR5
17 FR28-5 FR28
18 FR5-2 FR5
19 FR28-4 FR28
20 FR12-3 FR12
21 <NA> FR15
22 FR8-1 FR8
23 FR11-1 FR11
24 FR10-5 FR10
25 FR9-3 FR9
26 FR13-1 FR13
27 <NA> FR15
28 FR3-3 FR3
29 <NA> FR15
30 FR12-8 FR12
31 <NA> FR15
32 FR7-1 FR7
33 FR8-10 FR8
34 <NA> FR15
35 FR26-2 FR26
36 FR10-7 FR10
37 <NA> FR15
38 FR10-2 FR10
39 FR7-1 FR7
40 FR2-7 FR2
41 FR12-4 FR12
42 FR8-9 FR8
43 FR2-1 FR2
44 FR2-2 FR2
45 FR10-8 FR10
46 <NA> FR15
47 FR12-2 FR12
48 FR28-6 FR28
49 FR10-4 FR10
50 FR21-1 FR21
51 FR13-5 FR13
52 FR5-3 FR5
53 FR21-2 FR21
54 FR8-8 FR8
55 FR8-11 FR8
56 FR21-3 FR21
57 <NA> FR15
58 FR8-3 FR8
59 FR3-1 FR3
60 FR7-4 FR7
61 FR28-1 FR28
62 FR8-12 FR8
63 FR2-6 FR2
64 FR28-2 FR28
65 <NA> FR25
66 FR5-8 FR5
67 FR26-1 FR26
68 FR12-5 FR12
69 FR7-4 FR7
70 <NA> FR15
71 FR14-2 FR14
72 <NA> FR18
73 FR21-4 FR21
74 FR13-4 FR13
75 FR21-8 FR21
76 FR5-7 FR5
77 FR3-1 FR3
78 <NA> FR15
79 <NA> FR15
80 <NA> FR15
81 FR28-9 FR28
82 FR7-2 FR7
83 FR28-10 FR28
84 FR7-3 FR7
85 FR13-3 FR13
86 FR28-4 FR28
87 <NA> FR15
88 FR12-7 FR12
89 FR24-1 FR24
90 FR8-7 FR8
91 FR26-5 FR26
92 FR10-3 FR10
93 FR8-5 FR8
94 <NA> FR15
95 <NA> FR17
96 FR25-1 FR25
97 <NA> FR15
98 FR8-14 FR8
99 FR9-1 FR9
100 FR4-4 FR4
101 FR21-7 FR21
102 FR5-1 FR5
103 FR27-2 FR27
104 FR4-2 FR4
105 FR5-4 FR5
106 <NA> FR15
107 FR28-2 FR28
108 FR23-1 FR23
109 <NA> FR15
110 <NA> FR15
111 <NA> FR15
112 <NA> FR18
113 FR28-11 FR28
114 <NA> FR15
115 <NA> FR15
116 FR5-1 FR5
117 FR7-3 FR7
118 FR21-7 FR21
119 <NA> FR15
120 <NA> FR15
121 <NA> FR15
122 FR23-1 FR23
123 <NA> FR15
124 FR28-12 FR28
125 <NA> FR16
126 FR29-3 FR29
127 FR26-3 FR26
128 FR13-2 FR13
129 FR5-7 FR5
130 FR14-1 FR14
131 FR5-2 FR5
132 <NA> FR18
133 FR24-1 FR24
134 FR5-6 FR5
135 FR5-3 FR5
136 FR5-3 FR5
137 <NA> FR15
138 <NA> FR15
139 <NA> FR15
140 <NA> FR19
141 <NA> FR16
142 FR26-3 FR26
143 <NA> FR18
144 FR8-4 FR8
145 <NA> FR16
146 FR28-8 FR28
147 <NA> FR15
148 FR21-6 FR21
149 <NA> FR16
150 <NA> FR15
151 <NA> FR15
152 FR5-4 FR5
153 <NA> FR17
154 FR5-3 FR5
155 <NA> FR20
156 <NA> FR17
157 FR8-7 FR8
158 FR28-7 FR28
159 <NA> FR15
160 FR8-2 FR8
161 FR23-6 FR23
162 <NA> FR3
163 <NA> FR15
164 <NA> FR19
165 <NA> FR20
166 FR29-1 FR29
167 FR23-3 FR23
168 <NA> FR16
169 FR8-14 FR8
170 <NA> FR18
171 <NA> FR16
172 FR14-4 FR14
173 FR29-5 FR29
174 <NA> FR15
175 FR8-13 FR8
176 <NA> FR20
177 FR23-4 FR23
178 FR23-5 FR23
179 <NA> FR16
180 FR8-10 FR8
181 FR8-13 FR8
182 <NA> FR16
183 <NA> FR20
184 <NA> FR16
185 <NA> FR22
186 FR23-5 FR23
187 FR8-9 FR8
188 <NA> FR20
189 FR2-8 FR2
190 <NA> FR20
191 <NA> FR22
192 <NA> FR22
193 FR28-9 FR28
194 <NA> FR19
195 FR29-2 FR29
196 <NA> FR20
197 <NA> FR15
198 <NA> FR22
199 <NA> FR18
200 FR8-1 FR8
201 FR29-4 FR29
202 FR23-3 FR23
203 FR12-1 FR12
204 FR3-2 FR3
205 <NA> FR3
206 FR5-4 FR5
207 FR21-5 FR21
208 <NA> FR15
209 FR8-5 FR8
210 FR9-2 FR9
211 FR8-6 FR8
212 <NA> FR17
213 FR2-8 FR2
214 FR25-1 FR25
215 <NA> FR19
216 <NA> FR16
217 <NA> FR22
218 <NA> FR16
219 FR28-11 FR28
220 <NA> FR19
221 FR2-4 FR2
222 <NA> FR19
223 <NA> FR19
224 <NA> FR18
225 <NA> FR16
226 FR5-4 FR5
227 FR28-10 FR28
228 <NA> FR22
229 FR8-8 FR8
230 <NA> FR3
231 <NA> FR20
232 FR8-2 FR8
233 <NA> FR19
234 FR5-8 FR5
235 <NA> FR20
236 <NA> FR22
237 <NA> FR20
238 <NA> FR19
239 FR28-3 FR28
240 <NA> FR20
241 <NA> FR20
242 <NA> FR20
243 FR23-6 FR23
244 FR29-2 FR29
245 <NA> FR20
246 <NA> FR18
247 FR7-1 FR7
248 <NA> FR15
249 <NA> FR22
250 FR21-4 FR21
251 <NA> FR19
252 <NA> FR20
253 FR26-1 FR26
254 FR9-1 FR9
255 <NA> FR3
256 FR8-11 FR8
257 FR3-4 FR3
258 <NA> FR19
259 <NA> FR20
260 <NA> FR22
261 FR2-5 FR2
262 <NA> FR22
263 <NA> FR15
264 FR28-8 FR28
265 <NA> FR15
266 <NA> FR19
267 FR21-1 FR21
268 FR27-2 FR27
269 FR21-2 FR21
270 FR25-2 FR25
271 <NA> FR3
272 FR4-3 FR4
273 <NA> FR20
274 <NA> FR22
275 FR23-2 FR23
276 <NA> FR16
277 FR21-3 FR21
278 <NA> FR19
279 FR2-1 FR2
280 FR27-5 FR27
281 <NA> FR20
282 FR3-3 FR3
283 <NA> FR18
284 FR26-4 FR26
285 FR8-4 FR8
286 FR27-1 FR27
287 FR4-2 FR4
288 FR4-3 FR4
289 <NA> FR22
290 <NA> FR15
291 <NA> FR3
292 FR5-2 FR5
293 <NA> FR16
294 FR7-2 FR7
295 FR3-4 FR3
296 FR5-1 FR5
297 FR29-5 FR29
298 FR5-1 FR5
299 <NA> FR18
300 <NA> FR3
301 FR3-5 FR3
302 FR28-6 FR28
303 FR2-5 FR2
304 FR4-1 FR4
305 FR23-2 FR23
306 FR5-3 FR5
307 FR28-1 FR28
308 <NA> FR6
309 FR5-6 FR5
310 <NA> FR19
311 FR27-6 FR27
312 <NA> FR22
313 FR7-2 FR7
314 FR8-12 FR8
315 FR2-2 FR2
316 <NA> FR20
317 FR27-4 FR27
318 <NA> FR20
319 <NA> FR3
320 <NA> FR20
321 FR27-3 FR27
322 FR8-3 FR8
323 FR2-7 FR2
324 FR2-6 FR2
325 FR29-3 FR29
326 FR28-7 FR28
327 <NA> FR3
328 <NA> FR22
329 FR4-1 FR4
330 <NA> FR3
331 <NA> FR15
# more_than_one = paste0("FR", c(3,4,5,8,9,13,15,16,18,19,20,21,23))
# allmap %>% filter(Family %in% more_than_one) %>% select(Family)
allmap %>% group_by(pnid) %>%
summarise(Family = first(Family), Disease = first(Disease.code)) %>% ungroup() %>%
group_by(Family) %>% summarise(pn_in_family = sum(!is.na(Disease))) %>%
filter(pn_in_family > 1) %>% kable()
| Family | pn_in_family |
|---|---|
| FR13 | 2 |
| FR2 | 4 |
| FR21 | 4 |
| FR23 | 3 |
| FR26 | 2 |
| FR3 | 2 |
| FR4 | 2 |
| FR5 | 2 |
| FR8 | 2 |
| FR9 | 2 |
| pnid | Sample.type | n |
|---|---|---|
| FR5-1 | DNA_Extracted_from_feaces…F. | 2 |
| FR5-1 | DNA_Extracted_from_saliva…S. | 2 |
| FR5-2 | DNA_Extracted_from_feaces…F. | 2 |
| FR5-3 | DNA_Extracted_from_feaces…F. | 2 |
| FR5-3 | DNA_Extracted_from_saliva…S. | 2 |
| FR5-4 | DNA_Extracted_from_feaces…F. | 2 |
| FR5-4 | DNA_Extracted_from_saliva…S. | 2 |
| FR7-1 | DNA_Extracted_from_feaces…F. | 2 |
| FR7-2 | DNA_Extracted_from_feaces…F. | 2 |
| FR7-3 | DNA_Extracted_from_feaces…F. | 2 |
| FR7-4 | DNA_Extracted_from_feaces…F. | 2 |
| NA | DNA_Extracted_from_feaces…F. | 162 |
| sampleid | Database | sample_ID |
|---|---|---|
| DB9.094 | DB9 | FR8_6S |
| DB12.192 | DB12 | FR15_27_F |
| DB14.339 | DB14 | FR28_3S |
| DB14.216 | DB14 | FR5_2S V2 |
| DB13.031 | DB13 | FR22_8_F |
| DB14.317 | DB14 | FR21_6F |
| DB14.277 | DB14 | FR25_2F |
| DB14.349 | DB14 | FR29_1S |
| DB13.116 | DB13 | FR16_6_S |
| DB14.226 | DB14 | FR21_5S |
| DB13.006 | DB13 | FR18_6_F |
| DB10.010 | DB10 | FR3_2M |
| DB14.239 | DB14 | FR26_2S |
| DB13.106 | DB13 | FR15_27_S |
| DB13.108 | DB13 | FR15_29_S |
| DB13.115 | DB13 | FR16_5_S |
| DB9.262 | DB9 | FR9_4F |
| DB14.281 | DB14 | FR26_4F |
| DB13.103 | DB13 | FR15_24_S |
| DB13.099 | DB13 | FR15_20_S |
| DB13.082 | DB13 | FR15_3_S |
| DB14.341 | DB14 | FR28_5S |
| DB12.213 | DB12 | FR6_3 |
| DB13.090 | DB13 | FR15_11_S |
| DB9.066 | DB9 | FR2_3M |
| DB9.254 | DB9 | FR6_1 |
| DB13.104 | DB13 | FR15_25_S |
| DB13.007 | DB13 | FR19_1_F |
| DB13.111 | DB13 | FR15_32_S |
| DB13.119 | DB13 | FR16_9_S |
| DB9.148 | DB9 | FR10_1F |
| DB14.320 | DB14 | FR28_12F |
# allmap$Family %>% unique() %>% length()
# grep("FR", allmap$Family %>% unique()) %>% length()
biompath = "/pita/pub/data/16S_DBs/processed/16S_DB1-14_merged/table.qza"
taxpath = "/pita/pub/data/16S_DBs/processed/16S_DB1-14_merged/taxonomy.qza"
biom = read_qza(biompath)$data
biom = biom %>% data.frame()
biom = biom %>% select(sample_names(map))
min_reads_cutoff = cutoff
biom = biom[apply(biom, 2, sum) > min_reads_cutoff]
rare_biom = vegan::rrarefy(biom %>% t(), 10000) %>% t()
otu = phyloseq::otu_table(rare_biom, taxa_are_rows = T)
otu = phyloseq::prune_taxa(taxa_sums(otu) > 10, otu)
relative_otu = otu_table(funrar::make_relative(otu@.Data), taxa_are_rows = T)
relative_otu = filter_taxa(relative_otu, function(x) mean(x) > 1e-5, TRUE)
pretax = read_qza(taxpath)$data
tax = pretax %>% phyloseq::tax_table()
phyloseq::taxa_names(tax) = pretax$Feature.ID
tree = ape::rtree(ntaxa(tax), rooted = T, tip.label = phyloseq::taxa_names(tax))
abs_exp = phyloseq(otu, map, tax, tree)
exp = phyloseq(relative_otu, map, tax, tree)
# write_phyloseq(abs_exp, path = "/pita/users/rotem/eyes/human/eyes123.csv",type = "all")
samples_to_filter = "/pita/users/rotem/eyes/human/samples_to_filter.csv"
abs_exp %>% sample_data() %>% data.frame() %>% rownames() %>%
write.table(samples_to_filter, quote = F, sep = "\t", row.names = F, col.names = "sample-id")
cmd = paste0(qiime_path, "feature-table filter-samples",
" --i-table ", biompath,
" --m-metadata-file ", samples_to_filter,
" --o-filtered-table ", "/pita/users/rotem/eyes/human/eyes-filtered.qza")
system(cmd)
abs_exp %>% sample_data() %>% data.frame() %>%
write.table("/pita/users/rotem/eyes/human/eyes-filtered.tsv"
, quote = F, sep = "\t", row.names = F)
# read_qza("/pita/users/rotem/eyes/human/eyes-filtered.qza")$data %>% dim()
file.remove(samples_to_filter)