# Load the aggregated preliminary results from a gzipped, tab-separated file.
# comment.char = "" turns off comment handling so a line starting with "#" is
# not skipped (presumably the header's first field is "#sample", which
# read.table's name checking turns into "X.sample" -- confirm against the file).
prelim_data <- read.table(
  gzfile("data/prelim_results.agg.tsv.gz"),
  header = TRUE,
  comment.char = "",
  sep = "\t",
  stringsAsFactors = FALSE
) %>%
  rename(sample = X.sample)
head(prelim_data)
## sample
## 1 MESO-MQ-A6BL-TP
## 2 MESO-MQ-A6BL-TP
## 3 MESO-MQ-A6BL-TP
## 4 MESO-MQ-A6BL-TP
## 5 MESO-MQ-A6BL-TP
## 6 MESO-MQ-A6BL-TP
## entry
## 1 refseq|NC_004102|9646nt|Hepatitis_C_virus_genotype_1~9516~+~chr10~3776992~+
## 2 chr14~105742026~+~refseq|NC_009026|10815nt|Aroa_virus~6598~-
## 3 refseq|NC_004161|18891nt|Reston_ebolavirus~15385~+~chr14~105668252~+
## 4 chr14~105742214~+~refseq|NC_001731|190289nt|Molluscum_contagiosum_virus_subtype_1~159282~+
## 5 refseq|NC_012783|226205nt|Cercopithecine_betaherpesvirus_5~121937~-~chr14~105668257~+
## 6 chr14~105742114~+~refseq|NC_043445|13072nt|Guenon_simian_foamy_virus~11353~-
## chrA coordA orientA
## 1 refseq|NC_004102|9646nt|Hepatitis_C_virus_genotype_1 9516 +
## 2 chr14 105742026 +
## 3 refseq|NC_004161|18891nt|Reston_ebolavirus 15385 +
## 4 chr14 105742214 +
## 5 refseq|NC_012783|226205nt|Cercopithecine_betaherpesvirus_5 121937 -
## 6 chr14 105742114 +
## chrB coordB
## 1 chr10 3776992
## 2 refseq|NC_009026|10815nt|Aroa_virus 6598
## 3 chr14 105668252
## 4 refseq|NC_001731|190289nt|Molluscum_contagiosum_virus_subtype_1 159282
## 5 chr14 105668257
## 6 refseq|NC_043445|13072nt|Guenon_simian_foamy_virus 11353
## orientB primary_brkpt_type num_primary_reads num_supp_reads total hits
## 1 + Span 1 2 3 1.000
## 2 - Split 2 1 3 6.333
## 3 + Split 2 0 2 7.000
## 4 + Split 2 0 2 14.500
## 5 + Split 1 1 2 5.000
## 6 - Split 1 0 1 7.000
## min_per_id max_end_clipping min_anchor_len
## 1 95.1 6.667 41.000
## 2 100.0 14.333 13.667
## 3 100.0 13.500 13.500
## 4 91.5 17.000 13.000
## 5 97.1 16.500 14.500
## 6 85.7 27.000 13.000
# Remove any row where either breakpoint side is on chrM.
prelim_data <- prelim_data %>%
  filter(chrA != "chrM", chrB != "chrM")

# Stack the A and B sides into one long table (sample, chr, total), then keep
# only the contigs whose name does not start with "chr" (in the rows printed
# above these are the refseq virus entries).
prelim_hits <- bind_rows(
  prelim_data %>% select(sample, chr = chrA, total),
  prelim_data %>% select(sample, chr = chrB, total)
) %>%
  filter(!grepl("^chr", chr))
# Tile plot of contig (x) against sample (y) for the hits with total >= 10.
# NOTE(review): `true` is mapped to the z aesthetic, which geom_tile() does
# not appear to use -- confirm whether the column and mapping can be dropped.
prelim_hits %>%
  filter(total >= 10) %>%
  mutate(true = TRUE) %>%
  ggplot(aes(x = chr, y = sample, z = true)) +
  geom_tile() +
  # Slant the x-axis labels so the long contig names stay readable.
  theme(axis.text.x = element_text(angle = 30, hjust = 1))
# Load the aggregated refined results from a gzipped file. No `sep` is given,
# so read.table splits on whitespace (its default). comment.char = "" turns
# off comment handling so a line starting with "#" is not skipped (presumably
# the header's first field is "#sample", read in as "X.sample" -- confirm).
# Only rows with total >= 10 are kept.
refined_data <- read.table(
  gzfile("data/refined_results.agg.tsv.gz"),
  header = TRUE,
  stringsAsFactors = FALSE,
  comment.char = ""
) %>%
  rename(sample = X.sample) %>%
  filter(total >= 10)
head(refined_data)
## sample contig chrA coordA
## 1 CESC-C5-A1BJ-TP.vif.refined.tsv chr4~73777731~+~HPV16~881~- chr4 73777731
## 2 CESC-C5-A1BJ-TP.vif.refined.tsv chr4~73777731~+~HPV16~227~- chr4 73777731
## 3 CESC-C5-A1BJ-TP.vif.refined.tsv chr4~73802675~+~HPV16~881~- chr4 73802675
## 4 CESC-DS-A1O9-TP.vif.refined.tsv HPV73~863~+~chr14~24657524~+ HPV73 863
## 5 CESC-DS-A1O9-TP.vif.refined.tsv HPV73~183~+~chr14~24657551~+ HPV73 183
## 6 CESC-DS-A1O9-TP.vif.refined.tsv HPV73~863~+~chr14~24747972~+ HPV73 863
## orientA chrB coordB orientB prelim.primary_brkpt_type prelim.mean_hits
## 1 + HPV16 881 - Split 1.007
## 2 + HPV16 227 - Split 1.000
## 3 + HPV16 881 - Split 1.000
## 4 + chr14 24657524 + Split 1.000
## 5 + chr14 24657551 + Span 1.000
## 6 + chr14 24747972 + Split 1.067
## prelim.mean_min_per_id prelim.mean_max_end_clipping
## 1 98.6 5.015
## 2 98.5 4.850
## 3 98.9 4.963
## 4 97.8 1.789
## 5 98.2 1.138
## 6 98.3 2.833
## prelim.mean_min_anchor_len prelim.total split span total
## 1 42.567 3576 740 2304 3044
## 2 42.818 1580 147 1120 1267
## 3 42.769 108 15 67 82
## 4 46.126 4346 338 3633 3971
## 5 46.793 29 132 0 132
## 6 44.900 30 9 48 57
# Discard rows in which either breakpoint side lies on chrM.
refined_data <- refined_data %>%
  filter(chrA != "chrM", chrB != "chrM")

# One row per distinct (sample, contig) pair: stack the A and B sides, keep
# the contigs whose name does not begin with "chr", and de-duplicate.
refined_virus_hits <- bind_rows(
  refined_data %>% select(sample, chr = chrA),
  refined_data %>% select(sample, chr = chrB)
) %>%
  filter(!grepl("^chr", chr)) %>%
  unique()
# Tile plot of contig (x) against sample (y) for the refined virus hits.
# NOTE(review): `true` is mapped to the z aesthetic, which geom_tile() does
# not appear to use -- confirm whether the column and mapping can be dropped.
refined_virus_hits %>%
  mutate(true = TRUE) %>%
  ggplot(aes(x = chr, y = sample, z = true)) +
  geom_tile() +
  # Rotate the x-axis labels to vertical so contig names do not overlap.
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
# Scatter of the refined `total` (x) against `prelim.total` (y) for each
# refined row; the purple line has slope 1 and intercept 0, i.e. the points
# where the two counts are equal.
ggplot(refined_data, aes(x = total, y = prelim.total)) +
  geom_point() +
  geom_abline(slope = 1, intercept = 0, col = "purple")
```