Shaun Jackman 2013-12-12
## Loading required package: grid
## Loading required package: lattice
## Loading required package: ggplot2
##
## Attaching package: 'useful'
##
## The following object is masked from 'package:stats':
##
## plot.ts
comp <- read.table('pg29organelle-scaffolds.fa.comp.tab', header=TRUE)
idxstats <- subset(read.table('pg29organelle-scaffolds_D08CWACXX_6_merged.sort.idxstats.tab',
header=TRUE), tid != '*')
fatab <- read.table('pg29organelle-scaffolds.fa.tab', header=TRUE)
data <- cbind(fatab, comp, idxstats)
data$numACGT <- with(data, numA + numC + numG + numT)
data$GCcontent <- with(data, (numC + numG) / numACGT)
data$read.density <- with(data, numMapped / numACGT)
# mean.coverage seems to perform better than read.density
data$read.density <- data$mean.coverage
size.threshold <- 2000
densityplot(~numACGT, data, subset = numACGT >= size.threshold,
scales = list(x = list(log = TRUE)))
densityplot(~GCcontent, data, subset = numACGT >= size.threshold)
densityplot(~read.density, data, subset = numACGT >= size.threshold,
scales = list(x = list(log = TRUE)))
splom(~ cbind(read.density, GCcontent, numACGT), data,
subset = numACGT >= size.threshold)
#xyplot(mean.coverage ~ read.density, data, subset = numACGT >= size.threshold)
xyplot(read.density ~ GCcontent, data, subset = numACGT >= size.threshold)
xyplot(read.density ~ numACGT, data, subset = numACGT >= size.threshold)
xyplot(GCcontent ~ numACGT, data, subset = numACGT >= size.threshold)
hexbinplot(read.density ~ GCcontent, data, subset = numACGT >= size.threshold)
hexbinplot(read.density ~ numACGT, data, subset = numACGT >= size.threshold)
hexbinplot(GCcontent ~ numACGT, data, subset = numACGT >= size.threshold)
xyplot(read.density ~ numACGT, data, subset = numACGT >= size.threshold,
main = 'Read density vs. scaffold size of scaffolds larger than 5 kbp',
xlab = 'Scaffold size (bp)',
ylab = 'Read density (reads/bp)')
xyplot(500 * read.density ~ numACGT, data, subset = numACGT >= size.threshold,
main = 'Coverage vs. scaffold size\nfor scaffolds larger than 5 kbp',
xlab = 'Scaffold size (bp)',
ylab = 'Read coverage')
xyplot(read.density ~ GCcontent, data,
subset = numACGT >= size.threshold,
cex = log(subset(data, numACGT >= size.threshold)$numACGT / size.threshold, base=10))
data.subset <- subset(data, numACGT >= size.threshold, c('name', 'read.density', 'GCcontent', 'numACGT'))
data.subset.scaled <- with(data.subset, data.frame(
read.density = scale(log(read.density)),
GCcontent = scale(GCcontent),
numACGT = scale(log(numACGT))))
clusters <- kmeans(
subset(data.subset.scaled, select = c('read.density', 'GCcontent', 'numACGT')),
centers = 2)
plot(clusters, data = data.subset.scaled)
xyplot(read.density ~ GCcontent, data.subset.scaled, group = clusters$cluster,
#cex = data.subset.scaled$numACGT - min(data.subset.scaled$numACGT),
auto.key=T)
data.clustered <- cbind(data.subset, clusters['cluster'])
xyplot(read.density ~ GCcontent, data.clustered, group = cluster,
#cex = log(data.clustered$numACGT / size.threshold, base=10),
auto.key=T)
xyplot(read.density ~ numACGT, data.clustered, group = cluster,
#cex = log(data.clustered$numACGT / size.threshold, base=10),
auto.key=T)
xyplot(GCcontent ~ numACGT, data.clustered, group = cluster,
#cex = log(data.clustered$numACGT / size.threshold, base=10),
auto.key=T)
which.cluster <- which.min(table(data.clustered$cluster))
data.mt <- subset(data.clustered, cluster == which.cluster, -cluster)
summary(data.mt)
## name read.density GCcontent numACGT
## Min. : 28457 Min. : 11.1 Min. :0.380 Min. : 2160
## 1st Qu.:202480 1st Qu.: 24.7 1st Qu.:0.441 1st Qu.: 6998
## Median :205008 Median : 27.6 Median :0.448 Median : 19098
## Mean :202409 Mean : 28.6 Mean :0.447 Mean : 26778
## 3rd Qu.:206727 3rd Qu.: 30.8 3rd Qu.:0.454 3rd Qu.: 34612
## Max. :208054 Max. :108.7 Max. :0.495 Max. :216585
nrow(data.mt)
## [1] 223
sum(data.mt$numACGT)
## [1] 5971387
write.table(data.clustered, 'pg29organelle-scaffolds.fa.clustered.tab',
quote = FALSE, sep = '\t', row.names = FALSE)
| n | n:500 | n:N50 | min | N80 | N50 | N20 | E-size | max | sum | name |
|---|---|---|---|---|---|---|---|---|---|---|
| 294 | 294 | 60 | 4216 | 11870 | 28072 | 53632 | 32893 | 102327 | 5484122 | Pabies1.0-mt.fa |
| 223 | 223 | 42 | 2160 | 21935 | 39683 | 92648 | 59258 | 216585 | 5971387 | pg29organelle-scaffolds.fa.clustered.mt.fa |
| n | nm | identity | file |
|---|---|---|---|
| 4442830 | 131769 | 0.970 | Pabies1.0-mt_pg29organelle-scaffolds.sort.bam |
| 3657831 | 83390 | 0.977 | pg29organelle-scaffolds_Pabies1.0-mt.sort.bam |