Prelim results

# Load the aggregated preliminary results (gzipped, tab-separated).
# comment.char = "" disables comment parsing so that no line is skipped or
# truncated at a '#'. The first column arrives named "X.sample" (presumably
# a "#sample" header mangled by read.table's name checking -- confirm against
# the file) and is renamed to "sample".
# Arguments are spelled out in full (comment.char, TRUE/FALSE) rather than
# relying on partial matching and the reassignable T/F shorthands.
prelim_data = read.table(
    gzfile("data/prelim_results.agg.tsv.gz"),
    header = TRUE,
    comment.char = "",
    sep = "\t",
    stringsAsFactors = FALSE
) %>%
    rename(sample = X.sample)

# Preview the first rows.
head(prelim_data)
##            sample
## 1 MESO-MQ-A6BL-TP
## 2 MESO-MQ-A6BL-TP
## 3 MESO-MQ-A6BL-TP
## 4 MESO-MQ-A6BL-TP
## 5 MESO-MQ-A6BL-TP
## 6 MESO-MQ-A6BL-TP
##                                                                                        entry
## 1                refseq|NC_004102|9646nt|Hepatitis_C_virus_genotype_1~9516~+~chr10~3776992~+
## 2                               chr14~105742026~+~refseq|NC_009026|10815nt|Aroa_virus~6598~-
## 3                       refseq|NC_004161|18891nt|Reston_ebolavirus~15385~+~chr14~105668252~+
## 4 chr14~105742214~+~refseq|NC_001731|190289nt|Molluscum_contagiosum_virus_subtype_1~159282~+
## 5      refseq|NC_012783|226205nt|Cercopithecine_betaherpesvirus_5~121937~-~chr14~105668257~+
## 6               chr14~105742114~+~refseq|NC_043445|13072nt|Guenon_simian_foamy_virus~11353~-
##                                                         chrA    coordA orientA
## 1       refseq|NC_004102|9646nt|Hepatitis_C_virus_genotype_1      9516       +
## 2                                                      chr14 105742026       +
## 3                 refseq|NC_004161|18891nt|Reston_ebolavirus     15385       +
## 4                                                      chr14 105742214       +
## 5 refseq|NC_012783|226205nt|Cercopithecine_betaherpesvirus_5    121937       -
## 6                                                      chr14 105742114       +
##                                                              chrB    coordB
## 1                                                           chr10   3776992
## 2                             refseq|NC_009026|10815nt|Aroa_virus      6598
## 3                                                           chr14 105668252
## 4 refseq|NC_001731|190289nt|Molluscum_contagiosum_virus_subtype_1    159282
## 5                                                           chr14 105668257
## 6              refseq|NC_043445|13072nt|Guenon_simian_foamy_virus     11353
##   orientB primary_brkpt_type num_primary_reads num_supp_reads total   hits
## 1       +               Span                 1              2     3  1.000
## 2       -              Split                 2              1     3  6.333
## 3       +              Split                 2              0     2  7.000
## 4       +              Split                 2              0     2 14.500
## 5       +              Split                 1              1     2  5.000
## 6       -              Split                 1              0     1  7.000
##   min_per_id max_end_clipping min_anchor_len
## 1       95.1            6.667         41.000
## 2      100.0           14.333         13.667
## 3      100.0           13.500         13.500
## 4       91.5           17.000         13.000
## 5       97.1           16.500         14.500
## 6       85.7           27.000         13.000
# Discard every row in which either side of the junction is chrM.
prelim_data = prelim_data %>%
    filter(chrA != "chrM", chrB != "chrM")

# Build a long table with one row per junction side (sample, chr, total):
# the A-side rows are stacked on top of the B-side rows, and only rows whose
# contig name does not start with "chr" are kept (in the preview above these
# are the refseq virus entries).
prelim_hits = bind_rows(
    select(prelim_data, sample, chr = chrA, total),
    select(prelim_data, sample, chr = chrB, total)
) %>%
    filter(!grepl("^chr", chr))

`

# Presence map of preliminary hits: one tile for every (contig, sample) pair
# that has at least one row with total >= 10.
# The pairs are de-duplicated before plotting so that repeated rows do not
# stack identical tiles on top of each other. No dummy column is mapped to
# `z`, because geom_tile() only positions tiles by x and y.
prelim_hits %>%
    filter(total >= 10) %>%
    distinct(sample, chr) %>%
    ggplot(aes(x = chr, y = sample)) +
    geom_tile() +
    # Slant the long contig names so they remain legible.
    theme(axis.text.x = element_text(angle = 30, hjust = 1))

Refined results

# Load the aggregated refined results (gzipped) and keep only rows with
# total >= 10.
# comment.char = "" disables comment parsing so that no line is skipped or
# truncated at a '#'. The first column arrives named "X.sample" (presumably
# a "#sample" header mangled by read.table's name checking -- confirm against
# the file) and is renamed to "sample".
# Arguments are spelled out in full (comment.char, TRUE/FALSE) rather than
# relying on partial matching and the reassignable T/F shorthands.
# NOTE(review): unlike the preliminary read, no `sep` is given here, so
# read.table splits on any run of whitespace. Confirm that no field is empty
# or contains spaces; otherwise pass sep = "\t" as in the preliminary read.
refined_data = read.table(
    gzfile("data/refined_results.agg.tsv.gz"),
    header = TRUE,
    stringsAsFactors = FALSE,
    comment.char = ""
) %>%
    rename(sample = X.sample) %>%
    filter(total >= 10)

# Preview the first rows.
head(refined_data)
##                            sample                       contig  chrA   coordA
## 1 CESC-C5-A1BJ-TP.vif.refined.tsv  chr4~73777731~+~HPV16~881~-  chr4 73777731
## 2 CESC-C5-A1BJ-TP.vif.refined.tsv  chr4~73777731~+~HPV16~227~-  chr4 73777731
## 3 CESC-C5-A1BJ-TP.vif.refined.tsv  chr4~73802675~+~HPV16~881~-  chr4 73802675
## 4 CESC-DS-A1O9-TP.vif.refined.tsv HPV73~863~+~chr14~24657524~+ HPV73      863
## 5 CESC-DS-A1O9-TP.vif.refined.tsv HPV73~183~+~chr14~24657551~+ HPV73      183
## 6 CESC-DS-A1O9-TP.vif.refined.tsv HPV73~863~+~chr14~24747972~+ HPV73      863
##   orientA  chrB   coordB orientB prelim.primary_brkpt_type prelim.mean_hits
## 1       + HPV16      881       -                     Split            1.007
## 2       + HPV16      227       -                     Split            1.000
## 3       + HPV16      881       -                     Split            1.000
## 4       + chr14 24657524       +                     Split            1.000
## 5       + chr14 24657551       +                      Span            1.000
## 6       + chr14 24747972       +                     Split            1.067
##   prelim.mean_min_per_id prelim.mean_max_end_clipping
## 1                   98.6                        5.015
## 2                   98.5                        4.850
## 3                   98.9                        4.963
## 4                   97.8                        1.789
## 5                   98.2                        1.138
## 6                   98.3                        2.833
##   prelim.mean_min_anchor_len prelim.total split span total
## 1                     42.567         3576   740 2304  3044
## 2                     42.818         1580   147 1120  1267
## 3                     42.769          108    15   67    82
## 4                     46.126         4346   338 3633  3971
## 5                     46.793           29   132    0   132
## 6                     44.900           30     9   48    57
# Discard every row in which either side of the junction is chrM.
refined_data = refined_data %>%
    filter(chrA != "chrM", chrB != "chrM")

# Collect the distinct (sample, chr) pairs over both junction sides, keeping
# only contigs whose name does not start with "chr" (in the preview above
# these are the HPV entries).
refined_virus_hits = bind_rows(
    select(refined_data, sample, chr = chrA),
    select(refined_data, sample, chr = chrB)
) %>%
    filter(!grepl("^chr", chr)) %>%
    unique()
# Presence map of refined hits: one tile for every (contig, sample) pair in
# refined_virus_hits. No dummy column is mapped to `z`, because geom_tile()
# only positions tiles by x and y.
refined_virus_hits %>%
    ggplot(aes(x = chr, y = sample)) +
    geom_tile() +
    # Rotate the contig names to vertical so they do not overlap.
    theme(axis.text.x = element_text(angle = 90, hjust = 1))

Compare prelim to refined

# Scatter each refined row's `total` (x) against its `prelim.total` (y).
# The purple y = x reference line marks where the two counts agree.
ggplot(refined_data, aes(x = total, y = prelim.total)) +
    geom_point() +
    geom_abline(intercept = 0, slope = 1, colour = "purple")

```