Classifying putative mitochondrial contigs

Shaun Jackman 2013-12-12

Load libraries

## Loading required package: grid
## Loading required package: lattice
## Loading required package: ggplot2
## 
## Attaching package: 'useful'
## 
## The following object is masked from 'package:stats':
## 
##     plot.ts

Read the data

comp <- read.table('pg29organelle-scaffolds.fa.comp.tab', header=TRUE)
idxstats <- subset(read.table('pg29organelle-scaffolds_D08CWACXX_6_merged.sort.idxstats.tab',
    header=TRUE), tid != '*')
fatab <- read.table('pg29organelle-scaffolds.fa.tab', header=TRUE)
data <- cbind(fatab, comp, idxstats)

Perform calculations

data$numACGT <- with(data, numA + numC + numG + numT)
data$GCcontent <- with(data, (numC + numG) / numACGT)
data$read.density <- with(data, numMapped / numACGT)

# mean.coverage seems to perform better than read.density
data$read.density <- data$mean.coverage

Visualize the data in one dimension

size.threshold <- 2000

densityplot(~numACGT, data, subset = numACGT >= size.threshold,
    scales = list(x = list(log = TRUE)))

plot of chunk Visualize the data in one dimension

densityplot(~GCcontent, data, subset = numACGT >= size.threshold)

plot of chunk Visualize the data in one dimension

densityplot(~read.density, data, subset = numACGT >= size.threshold,
    scales = list(x = list(log = TRUE)))

plot of chunk Visualize the data in one dimension

Visualize the data in two dimensions

splom(~ cbind(read.density, GCcontent, numACGT), data,
    subset = numACGT >= size.threshold)

plot of chunk Visualize the data in two dimensions


#xyplot(mean.coverage ~ read.density, data, subset = numACGT >= size.threshold)
xyplot(read.density ~ GCcontent, data, subset = numACGT >= size.threshold)

plot of chunk Visualize the data in two dimensions

xyplot(read.density ~ numACGT, data, subset = numACGT >= size.threshold)

plot of chunk Visualize the data in two dimensions

xyplot(GCcontent ~ numACGT, data, subset = numACGT >= size.threshold)

plot of chunk Visualize the data in two dimensions


hexbinplot(read.density ~ GCcontent, data, subset = numACGT >= size.threshold)

plot of chunk Visualize the data in two dimensions

hexbinplot(read.density ~ numACGT, data, subset = numACGT >= size.threshold)

plot of chunk Visualize the data in two dimensions

hexbinplot(GCcontent ~ numACGT, data, subset = numACGT >= size.threshold)

plot of chunk Visualize the data in two dimensions


xyplot(read.density ~ numACGT, data, subset = numACGT >= size.threshold,
    main = 'Read density vs. scaffold size of scaffolds larger than 5 kbp',
    xlab = 'Scaffold size (bp)',
    ylab = 'Read density (reads/bp)')

plot of chunk Visualize the data in two dimensions


xyplot(500 * read.density ~ numACGT, data, subset = numACGT >= size.threshold,
    main = 'Coverage vs. scaffold size\nfor scaffolds larger than 5 kbp',
    xlab = 'Scaffold size (bp)',
    ylab = 'Read coverage')

plot of chunk Visualize the data in two dimensions

Visualize the data in three dimensions

xyplot(read.density ~ GCcontent, data,
    subset = numACGT >= size.threshold,
    cex = log(subset(data, numACGT >= size.threshold)$numACGT / size.threshold, base=10))

plot of chunk Visualize the data in three dimensions

Clustering with k-means

data.subset <- subset(data, numACGT >= size.threshold, c('name', 'read.density', 'GCcontent', 'numACGT'))
data.subset.scaled <- with(data.subset, data.frame(
    read.density = scale(log(read.density)),
    GCcontent = scale(GCcontent),
    numACGT = scale(log(numACGT))))
clusters <- kmeans(
    subset(data.subset.scaled, select = c('read.density', 'GCcontent', 'numACGT')),
    centers = 2)

plot(clusters, data = data.subset.scaled)

plot of chunk Clustering with k-means

xyplot(read.density ~ GCcontent, data.subset.scaled, group = clusters$cluster,
    #cex = data.subset.scaled$numACGT - min(data.subset.scaled$numACGT),
    auto.key=T)

plot of chunk Clustering with k-means


data.clustered <- cbind(data.subset, clusters['cluster'])

xyplot(read.density ~ GCcontent, data.clustered, group = cluster,
    #cex = log(data.clustered$numACGT / size.threshold, base=10),
    auto.key=T)

plot of chunk Clustering with k-means

xyplot(read.density ~ numACGT, data.clustered, group = cluster,
    #cex = log(data.clustered$numACGT / size.threshold, base=10),
    auto.key=T)

plot of chunk Clustering with k-means

xyplot(GCcontent ~ numACGT, data.clustered, group = cluster,
    #cex = log(data.clustered$numACGT / size.threshold, base=10),
    auto.key=T)

plot of chunk Clustering with k-means

Statistics of mitochondrial sequences

which.cluster <- which.min(table(data.clustered$cluster))
data.mt <- subset(data.clustered, cluster == which.cluster, -cluster)
summary(data.mt)

##       name         read.density     GCcontent        numACGT      
##  Min.   : 28457   Min.   : 11.1   Min.   :0.380   Min.   :  2160  
##  1st Qu.:202480   1st Qu.: 24.7   1st Qu.:0.441   1st Qu.:  6998  
##  Median :205008   Median : 27.6   Median :0.448   Median : 19098  
##  Mean   :202409   Mean   : 28.6   Mean   :0.447   Mean   : 26778  
##  3rd Qu.:206727   3rd Qu.: 30.8   3rd Qu.:0.454   3rd Qu.: 34612  
##  Max.   :208054   Max.   :108.7   Max.   :0.495   Max.   :216585

nrow(data.mt)

## [1] 223

sum(data.mt$numACGT)

## [1] 5971387

write.table(data.clustered, 'pg29organelle-scaffolds.fa.clustered.tab',
    quote = FALSE, sep = '\t', row.names = FALSE)

n	n:500	n:N50	min	N80	N50	N20	E-size	max	sum	name
294	294	60	4216	11870	28072	53632	32893	102327	5484122	Pabies1.0-mt.fa
223	223	42	2160	21935	39683	92648	59258	216585	5971387	pg29organelle-scaffolds.fa.clustered.mt.fa

n	nm	identity	file
4442830	131769	0.970	Pabies1.0-mt_pg29organelle-scaffolds.sort.bam
3657831	83390	0.977	pg29organelle-scaffolds_Pabies1.0-mt.sort.bam