Prelim results

# Load the aggregated preliminary results (gzip-compressed, tab-separated).
# Comment handling is switched off (comment.char = "") so that no line is
# discarded as a comment; the first column then arrives named 'X.sample'
# (presumably a '#sample' header mangled by read.table's name checking --
# confirm against the file) and is renamed to 'sample'.
# Arguments are spelled out in full: the original relied on partial matching
# ('com' for 'comment.char') and on the reassignable T/F shorthands.
prelim_data <- read.table(
    gzfile("data/prelim_results.agg.tsv.gz"),
    header = TRUE,
    sep = "\t",
    comment.char = "",
    stringsAsFactors = FALSE
) %>%
    rename(sample = X.sample)

# Quick look at the first rows to sanity-check the parsed columns.
head(prelim_data)
##            sample
## 1 MESO-MQ-A6BL-TP
## 2 MESO-MQ-A6BL-TP
## 3 MESO-MQ-A6BL-TP
## 4 MESO-MQ-A6BL-TP
## 5 MESO-MQ-A6BL-TP
## 6 MESO-MQ-A6BL-TP
##                                                               entry
## 1   chr2~32916372~+~refseq|NC_047813|18546nt|Staphylococcus~18542~-
## 2   chr2~88857462~+~refseq|NC_047813|18546nt|Staphylococcus~18542~-
## 3  chr1~232725307~+~refseq|NC_047813|18546nt|Staphylococcus~18542~-
## 4   refseq|NC_047813|18546nt|Staphylococcus~18547~+~chr2~88857544~+
## 5 chr12~119397852~+~refseq|NC_047813|18546nt|Staphylococcus~18542~-
## 6  refseq|NC_047813|18546nt|Staphylococcus~18542~+~chr14~77269782~+
##                                      chrA    coordA orientA
## 1                                    chr2  32916372       +
## 2                                    chr2  88857462       +
## 3                                    chr1 232725307       +
## 4 refseq|NC_047813|18546nt|Staphylococcus     18547       +
## 5                                   chr12 119397852       +
## 6 refseq|NC_047813|18546nt|Staphylococcus     18542       +
##                                      chrB   coordB orientB primary_brkpt_type
## 1 refseq|NC_047813|18546nt|Staphylococcus    18542       -               Span
## 2 refseq|NC_047813|18546nt|Staphylococcus    18542       -               Span
## 3 refseq|NC_047813|18546nt|Staphylococcus    18542       -               Span
## 4                                    chr2 88857544       +               Span
## 5 refseq|NC_047813|18546nt|Staphylococcus    18542       -               Span
## 6                                   chr14 77269782       +               Span
##   num_primary_reads num_supp_reads total  hits min_per_id max_end_clipping
## 1                 7            226   233 1.391       90.4            5.459
## 2                 2             68    70 1.000       92.1            4.100
## 3                 4             39    43 1.140       86.8            4.930
## 4                 1             37    38 1.000       90.5            2.211
## 5                 8             30    38 1.158       88.6            5.947
## 6                 2             22    24 1.500       92.7            9.708
##   min_anchor_len
## 1         42.343
## 2         43.600
## 3         42.744
## 4         45.579
## 5         41.395
## 6         35.667
# Drop every entry that has chrM on either side of the breakpoint.
prelim_data <- prelim_data %>%
    filter(chrA != "chrM", chrB != "chrM")

# Stack the A-side and B-side contig names into one 'chr' column (carrying the
# entry's 'total' along), then keep only the rows whose contig name does not
# start with "chr".
prelim_hits <- bind_rows(
    prelim_data %>% select(sample, chr = chrA, total),
    prelim_data %>% select(sample, chr = chrB, total)
) %>%
    filter(!grepl("^chr", chr))

`

# Tile plot of sample (y) against contig (x) for preliminary hits whose
# 'total' is at least 10. The x-axis labels are rotated so the long contig
# names stay readable.
prelim_hits %>%
    filter(total >= 10) %>%
    mutate(true = TRUE) %>%
    ggplot(aes(x = chr, y = sample, z = true)) +
    geom_tile() +
    theme(axis.text.x = element_text(angle = 90, hjust = 1))

Refined results

# Load the aggregated refined results (gzip-compressed, tab-separated) and keep
# only the entries with a 'total' of at least 10.
# sep = "\t" is given explicitly, matching how the preliminary table is read:
# with read.table's default (any whitespace) a field containing a space would
# be split into extra columns.
# Comment handling is switched off (comment.char = "") so that no line is
# discarded as a comment; the first column then arrives named 'X.sample'
# (presumably a '#sample' header mangled by read.table's name checking --
# confirm against the file) and is renamed to 'sample'.
refined_data <- read.table(
    gzfile("data/refined_results.agg.tsv.gz"),
    header = TRUE,
    sep = "\t",
    comment.char = "",
    stringsAsFactors = FALSE
) %>%
    rename(sample = X.sample) %>%
    filter(total >= 10)

# Quick look at the first rows to sanity-check the parsed columns.
head(refined_data)
##                            sample
## 1 LUSC-43-3394-NT.vif.refined.tsv
## 2 LUSC-43-3394-NT.vif.refined.tsv
## 3 LUSC-43-3394-NT.vif.refined.tsv
## 4 LUSC-43-3394-NT.vif.refined.tsv
## 5 LUSC-43-3394-NT.vif.refined.tsv
## 6 LUSC-43-3394-NT.vif.refined.tsv
##                                                            contig
## 1    refseq|NC_047813|18546nt|Staphylococcus~18547~+~chrM~11091~+
## 2     refseq|NC_047813|18546nt|Staphylococcus~18547~+~chrM~6368~+
## 3     refseq|NC_047813|18546nt|Staphylococcus~18547~+~chrM~4941~+
## 4     refseq|NC_047813|18546nt|Staphylococcus~18547~+~chrM~9820~+
## 5 refseq|NC_047813|18546nt|Staphylococcus~18547~+~chr8~22164349~+
## 6    refseq|NC_047813|18546nt|Staphylococcus~18547~+~chrM~13097~+
##                                      chrA coordA orientA chrB   coordB orientB
## 1 refseq|NC_047813|18546nt|Staphylococcus  18547       + chrM    11091       +
## 2 refseq|NC_047813|18546nt|Staphylococcus  18547       + chrM     6368       +
## 3 refseq|NC_047813|18546nt|Staphylococcus  18547       + chrM     4941       +
## 4 refseq|NC_047813|18546nt|Staphylococcus  18547       + chrM     9820       +
## 5 refseq|NC_047813|18546nt|Staphylococcus  18547       + chr8 22164349       +
## 6 refseq|NC_047813|18546nt|Staphylococcus  18547       + chrM    13097       +
##   prelim.primary_brkpt_type prelim.mean_hits prelim.mean_min_per_id
## 1                      Span                1                   96.6
## 2                      Span                1                   97.0
## 3                      Span                1                   96.5
## 4                      Span                1                   98.1
## 5                      Span                1                   98.1
## 6                      Span                1                   97.4
##   prelim.mean_max_end_clipping prelim.mean_min_anchor_len prelim.total split
## 1                        9.448                     40.379           29    31
## 2                        7.643                     42.214           14    21
## 3                        9.250                     40.667           12    19
## 4                       10.438                     39.375           16     0
## 5                        9.923                     40.077           13     9
## 6                        9.273                     40.636           11    10
##   span total
## 1    1    32
## 2   11    32
## 3    4    23
## 4   14    14
## 5    5    14
## 6    3    13
# Drop every entry that has chrM on either side of the breakpoint.
refined_data <- refined_data %>%
    filter(chrA != "chrM", chrB != "chrM")

# Stack the A-side and B-side contig names into one 'chr' column, keep only
# the rows whose contig name does not start with "chr", and collapse repeated
# (sample, chr) pairs into a single row.
refined_virus_hits <- bind_rows(
    refined_data %>% select(sample, chr = chrA),
    refined_data %>% select(sample, chr = chrB)
) %>%
    filter(!grepl("^chr", chr)) %>%
    unique()
# Tile plot of sample (y) against contig (x) for the refined hits. The x-axis
# labels are rotated so the long contig names stay readable.
refined_virus_hits %>%
    mutate(true = TRUE) %>%
    ggplot(aes(x = chr, y = sample, z = true)) +
    geom_tile() +
    theme(axis.text.x = element_text(angle = 90, hjust = 1))

Compare prelim to refined

# Scatter of the refined 'total' (x) against 'prelim.total' (y) for each
# refined entry; the purple y = x line marks where the two counts agree.
refined_data %>%
    ggplot(aes(x = total, y = prelim.total)) +
    geom_point() +
    geom_abline(slope = 1, intercept = 0, colour = "purple")

```