Load the data and make the two datasets comparable

library(data.table)
library(dplyr)
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:data.table':
## 
##     between, last
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# The relative CSV paths used by this script are resolved against this directory.
# NOTE(review): hard-coded, machine-specific working directory; the script
# only runs unchanged where this checkout exists at the same location.
setwd('~/code/ljcohen/MMETSP/')
# Two result tables, one per comparison direction; each is expected to carry
# an `assembly` column plus the metrics plotted later (e.g. `n_seqs`).
dib_to_ncgr <- fread('MMETSP_transrate_data.csv')
ncgr_to_dib <- fread('MMETSP_reverse_transrate_data.csv')

# normalise the metadata so we can link samples between sets

# Metadata table used to label assemblies; the helpers below read its `Run`,
# `SampleName` and `ScientificName` columns.
# NOTE(review): fread reports integer64 columns here; load bit64 if those
# columns ever need to be printed or used.
meta <- fread('clean_sraruninfo.csv')
## Warning in fread("clean_sraruninfo.csv"): Some columns have been read as
## type 'integer64' but package bit64 isn't loaded. Those columns will display
## as strange looking floating point data. There is no need to reload the
## data. Just require(bit64) to obtain the integer64 print method and print
## the data again.
# Build a "<ScientificName> <SampleName>" label for each DIB assembly.
#
# The SRR run accession embedded in each assembly name is looked up in the
# `Run` column of the metadata table.
#
# Args:
#   assembly: character vector of assembly names containing an SRR accession.
#   metadata: table with `Run`, `ScientificName` and `SampleName` columns;
#     defaults to the script-level `meta`.
#
# Returns:
#   Character vector with exactly one label per element of `assembly`, in the
#   same order. Assemblies whose accession is not found in `metadata` get NA
#   instead of being dropped, so the result can be assigned as a column
#   without recycling.
sample_from_dib <- function(assembly, metadata = meta) {
  srr <- gsub(".*(SRR[0-9]+).*", "\\1", assembly)
  # Look each assembly up in the metadata (not the other way round) so that
  # element i of the result describes assembly[i].
  idx <- match(srr, metadata$Run)
  labels <- paste(metadata$ScientificName[idx], metadata$SampleName[idx])
  # paste() would otherwise turn a failed lookup into the string "NA NA".
  labels[is.na(idx)] <- NA_character_
  labels
}

# Build a "<ScientificName> <SampleName>" label for each NCGR assembly.
#
# The MMETSP sample id embedded in each assembly name is looked up in the
# `SampleName` column of the metadata table. If a sample id occurs more than
# once in the metadata, the first matching row is used.
#
# Args:
#   assembly: character vector of assembly names containing an MMETSP id.
#   metadata: table with `ScientificName` and `SampleName` columns; defaults
#     to the script-level `meta`.
#
# Returns:
#   Character vector with exactly one label per element of `assembly`, in the
#   same order. Assemblies whose sample id is not found in `metadata` get NA
#   instead of being dropped, so the result can be assigned as a column
#   without recycling.
sample_from_ncgr <- function(assembly, metadata = meta) {
  sample_id <- gsub('.*(MMETSP[0-9]+).*', "\\1", assembly)
  # Look each assembly up in the metadata (not the other way round) so that
  # element i of the result describes assembly[i].
  idx <- match(sample_id, metadata$SampleName)
  labels <- paste(metadata$ScientificName[idx], metadata$SampleName[idx])
  # paste() would otherwise turn a failed lookup into the string "NA NA".
  labels[is.na(idx)] <- NA_character_
  labels
}

# Give both tables a shared `sample` label derived from the assembly name so
# rows from the two directions can be paired up in the plots.
# NOTE(review): each helper is expected to return one label per row; check
# for length-mismatch/recycling warnings when this runs.
dib_to_ncgr$sample <- sample_from_dib(dib_to_ncgr$assembly)
ncgr_to_dib$sample <- sample_from_ncgr(ncgr_to_dib$assembly)
## Warning in `[<-.data.table`(x, j = name, value = value): Supplied 54 items
## to be assigned to 55 items of column 'sample' (recycled leaving remainder
## of 1 items).
# Tag each table with its comparison direction ("ass" = assembly being
# scored, "ref" = reference it is compared against, per the label text).
dib_to_ncgr$analysis <- 'ass:DIB -> ref:NCGR'
ncgr_to_dib$analysis <- 'ass:NCGR -> ref:DIB'

# Stack both directions into one long table for plotting.
all_results <- rbind(dib_to_ncgr, ncgr_to_dib)

Plots, yo!

NCGR assemblies tend to have many fewer contigs

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.2.4
# Contig count for every sample, coloured by comparison direction.
all_results %>%
  arrange(sample) %>%
  ggplot(aes(x = sample, y = n_seqs, colour = analysis)) +
  geom_point() +
  labs(x = "Sample", y = "Number of contigs") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Distribution of contig counts per direction; lines join rows that share a
# sample label.
all_results %>%
  arrange(sample) %>%
  ggplot(aes(x = analysis, y = n_seqs)) +
  geom_violin() +
  geom_point(aes(colour = analysis)) +
  geom_line(aes(group = sample)) +
  labs(x = "Analysis", y = "Number of contigs")

The DIB assemblies generally contain most of the information in the NCGR assemblies, plus a lot of unique information

library(ggplot2)
# `p_contigs_with_CRBB` for every sample, coloured by comparison direction.
all_results %>%
  arrange(sample) %>%
  ggplot(aes(x = sample, y = p_contigs_with_CRBB, colour = analysis)) +
  geom_point() +
  labs(x = "Sample", y = "Proportion of contigs with a reference hit") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Distribution of the same proportion per direction; lines join rows that
# share a sample label.
all_results %>%
  arrange(sample) %>%
  ggplot(aes(x = analysis, y = p_contigs_with_CRBB)) +
  geom_violin() +
  geom_point(aes(colour = analysis)) +
  geom_line(aes(group = sample)) +
  labs(x = "Analysis", y = "Proportion of contigs with a reference hit")