# Load the aggregated preliminary results (gzip-compressed, tab-separated).
# comment.char = "" disables comment stripping so the header line is kept.
# NOTE(review): the header presumably starts with "#sample", which read.table's
# name sanitising turns into `X.sample`; it is renamed back to `sample` here.
prelim_data <- read.table(
  gzfile("data/prelim_results.agg.tsv.gz"),
  header = TRUE,
  sep = "\t",
  comment.char = "",
  stringsAsFactors = FALSE
) %>%
  rename("sample" = "X.sample")
head(prelim_data)
## sample
## 1 MESO-MQ-A6BL-TP
## 2 MESO-MQ-A6BL-TP
## 3 MESO-MQ-A6BL-TP
## 4 MESO-MQ-A6BL-TP
## 5 MESO-MQ-A6BL-TP
## 6 MESO-MQ-A6BL-TP
## entry
## 1 chr2~32916372~+~refseq|NC_047813|18546nt|Staphylococcus~18542~-
## 2 chr2~88857462~+~refseq|NC_047813|18546nt|Staphylococcus~18542~-
## 3 chr1~232725307~+~refseq|NC_047813|18546nt|Staphylococcus~18542~-
## 4 refseq|NC_047813|18546nt|Staphylococcus~18547~+~chr2~88857544~+
## 5 chr12~119397852~+~refseq|NC_047813|18546nt|Staphylococcus~18542~-
## 6 refseq|NC_047813|18546nt|Staphylococcus~18542~+~chr14~77269782~+
## chrA coordA orientA
## 1 chr2 32916372 +
## 2 chr2 88857462 +
## 3 chr1 232725307 +
## 4 refseq|NC_047813|18546nt|Staphylococcus 18547 +
## 5 chr12 119397852 +
## 6 refseq|NC_047813|18546nt|Staphylococcus 18542 +
## chrB coordB orientB primary_brkpt_type
## 1 refseq|NC_047813|18546nt|Staphylococcus 18542 - Span
## 2 refseq|NC_047813|18546nt|Staphylococcus 18542 - Span
## 3 refseq|NC_047813|18546nt|Staphylococcus 18542 - Span
## 4 chr2 88857544 + Span
## 5 refseq|NC_047813|18546nt|Staphylococcus 18542 - Span
## 6 chr14 77269782 + Span
## num_primary_reads num_supp_reads total hits min_per_id max_end_clipping
## 1 7 226 233 1.391 90.4 5.459
## 2 2 68 70 1.000 92.1 4.100
## 3 4 39 43 1.140 86.8 4.930
## 4 1 37 38 1.000 90.5 2.211
## 5 8 30 38 1.158 88.6 5.947
## 6 2 22 24 1.500 92.7 9.708
## min_anchor_len
## 1 42.343
## 2 43.600
## 3 42.744
## 4 45.579
## 5 41.395
## 6 35.667
# Drop every record in which either breakpoint side sits on chrM.
prelim_data <- prelim_data %>%
  filter(chrA != "chrM", chrB != "chrM")
# Stack the A-side and B-side contig names into one long table with columns
# (sample, chr, total), then keep only the rows whose contig name does not
# begin with "chr".
prelim_hits <- bind_rows(
  prelim_data %>% select(sample, chr = chrA, total),
  prelim_data %>% select(sample, chr = chrB, total)
) %>%
  filter(!grepl("^chr", chr))
`
# Tile plot of samples (y) against the contigs kept in `prelim_hits` (x),
# restricted to rows with total >= 10.
# NOTE(review): `z` is not among geom_tile's documented aesthetics, so the
# constant `true` column looks like a placeholder only -- confirm before
# removing it.
prelim_hits %>%
  filter(total >= 10) %>%
  mutate(true = TRUE) %>%
  ggplot(aes(x = chr, y = sample, z = true)) +
  geom_tile() +
  # Rotate the x-axis labels by 90 degrees.
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
# Load the aggregated refined results (gzip-compressed TSV) and keep only
# rows with total >= 10.
# sep = "\t" is given explicitly, matching the preliminary-results loader, so
# that fields are split on tabs only rather than on any run of whitespace.
# comment.char = "" disables comment stripping so the header line is kept.
# NOTE(review): the header presumably starts with "#sample", which read.table's
# name sanitising turns into `X.sample`; it is renamed back to `sample` here.
refined_data <- read.table(
  gzfile("data/refined_results.agg.tsv.gz"),
  header = TRUE,
  sep = "\t",
  comment.char = "",
  stringsAsFactors = FALSE
) %>%
  rename("sample" = "X.sample") %>%
  filter(total >= 10)
head(refined_data)
## sample
## 1 LUSC-43-3394-NT.vif.refined.tsv
## 2 LUSC-43-3394-NT.vif.refined.tsv
## 3 LUSC-43-3394-NT.vif.refined.tsv
## 4 LUSC-43-3394-NT.vif.refined.tsv
## 5 LUSC-43-3394-NT.vif.refined.tsv
## 6 LUSC-43-3394-NT.vif.refined.tsv
## contig
## 1 refseq|NC_047813|18546nt|Staphylococcus~18547~+~chrM~11091~+
## 2 refseq|NC_047813|18546nt|Staphylococcus~18547~+~chrM~6368~+
## 3 refseq|NC_047813|18546nt|Staphylococcus~18547~+~chrM~4941~+
## 4 refseq|NC_047813|18546nt|Staphylococcus~18547~+~chrM~9820~+
## 5 refseq|NC_047813|18546nt|Staphylococcus~18547~+~chr8~22164349~+
## 6 refseq|NC_047813|18546nt|Staphylococcus~18547~+~chrM~13097~+
## chrA coordA orientA chrB coordB orientB
## 1 refseq|NC_047813|18546nt|Staphylococcus 18547 + chrM 11091 +
## 2 refseq|NC_047813|18546nt|Staphylococcus 18547 + chrM 6368 +
## 3 refseq|NC_047813|18546nt|Staphylococcus 18547 + chrM 4941 +
## 4 refseq|NC_047813|18546nt|Staphylococcus 18547 + chrM 9820 +
## 5 refseq|NC_047813|18546nt|Staphylococcus 18547 + chr8 22164349 +
## 6 refseq|NC_047813|18546nt|Staphylococcus 18547 + chrM 13097 +
## prelim.primary_brkpt_type prelim.mean_hits prelim.mean_min_per_id
## 1 Span 1 96.6
## 2 Span 1 97.0
## 3 Span 1 96.5
## 4 Span 1 98.1
## 5 Span 1 98.1
## 6 Span 1 97.4
## prelim.mean_max_end_clipping prelim.mean_min_anchor_len prelim.total split
## 1 9.448 40.379 29 31
## 2 7.643 42.214 14 21
## 3 9.250 40.667 12 19
## 4 10.438 39.375 16 0
## 5 9.923 40.077 13 9
## 6 9.273 40.636 11 10
## span total
## 1 1 32
## 2 11 32
## 3 4 23
## 4 14 14
## 5 5 14
## 6 3 13
# Drop every record in which either breakpoint side sits on chrM.
refined_data <- refined_data %>%
  filter(chrA != "chrM", chrB != "chrM")
# Stack the A-side and B-side contig names into one long (sample, chr) table,
# keep only the rows whose contig name does not begin with "chr", and collapse
# duplicate (sample, chr) pairs.
refined_virus_hits <- bind_rows(
  refined_data %>% select(sample, chr = chrA),
  refined_data %>% select(sample, chr = chrB)
) %>%
  filter(!grepl("^chr", chr)) %>%
  unique()
# Tile plot of samples (y) against the contigs kept in `refined_virus_hits`
# (x).
# NOTE(review): `z` is not among geom_tile's documented aesthetics, so the
# constant `true` column looks like a placeholder only -- confirm before
# removing it.
refined_virus_hits %>%
  mutate(true = TRUE) %>%
  ggplot(aes(x = chr, y = sample, z = true)) +
  geom_tile() +
  # Rotate the x-axis labels by 90 degrees.
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
# Scatter of `total` (x) against `prelim.total` (y) for each refined record,
# with a purple y = x reference line.
refined_data %>%
  ggplot(aes(x = total, y = prelim.total)) +
  geom_point() +
  geom_abline(intercept = 0, slope = 1, colour = "purple")
```