Flaveria bidentis eXpress data exploration

Load the data

setwd("/data2/rnaseq/flaveria/assemblies/fb_ex/")
data_tpm <- read.csv("tpm_columns.txt", sep = "\t", head = T)

TPM

Correlation matrix

tpms <- data_tpm[, 3:20]
library("ggplot2")
library("reshape2")
names(tpms) <- gsub(names(tpms), pattern = "X([0-9]).([0-9])", replacement = "Fb\\2.\\1")
c <- melt(cor(tpms))
p <- ggplot(data = c, aes_string(x = names(c)[1], y = names(c)[2], fill = "value")) + 
    geom_tile() + xlab("") + ylab("")
p

plot of chunk unnamed-chunk-2

tpms_melt <- melt(tpms)

## Using  as id variables

tpms_melt$value[tpms_melt$value < 0.01] <- 0
tpms_melt$stage <- gsub(tpms_melt$variable, pattern = "\\.[0-9]", replacement = "")
ggplot(tpms_melt, aes(x = log(value), group = stage, colour = stage)) + geom_density()

## Warning: Removed 408468 rows containing non-finite values (stat_density).
## Warning: Removed 382944 rows containing non-finite values (stat_density).
## Warning: Removed 374038 rows containing non-finite values (stat_density).
## Warning: Removed 353398 rows containing non-finite values (stat_density).
## Warning: Removed 371628 rows containing non-finite values (stat_density).
## Warning: Removed 320856 rows containing non-finite values (stat_density).

plot of chunk unnamed-chunk-2

plotheatmap <- function(counts) {
    dists <- dist(t(as.matrix(tpms)))
    mat <- as.matrix(dists)
    print(heatmap.2(mat, trace = "none"))
}
plotheatmap()

## Error: could not find function "heatmap.2"

Effective counts

data_counts <- read.csv("express_output_eff_counts.txt", sep = "\t", head = T)
library(EBSeq)

## Loading required package: blockmodeling
## Loading required package: gplots
## KernSmooth 2.23 loaded
## Copyright M. P. Wand 1997-2009
## 
## Attaching package: 'gplots'
## 
## The following object is masked from 'package:stats':
## 
##     lowess

library(DESeq)

## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## 
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## 
## The following object is masked from 'package:stats':
## 
##     xtabs
## 
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, as.vector, cbind,
##     colnames, duplicated, eval, evalq, Filter, Find, get,
##     intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rep.int, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unlist
## 
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## Loading required package: locfit
## locfit 1.5-9.1    2013-03-22
## Loading required package: lattice
##     Welcome to 'DESeq'. For improved performance, usability and
##     functionality, please consider migrating to 'DESeq2'.

# ` Remove any rows containing all-zeros
remove_zero_rows <- function(df) {
    df[apply(df, 1, function(x) !all(x == 0)), ]
}

normalise_counts <- function(counts, normfactors) {
    round(t(t(counts)/normfactors))
}

counts <- data_counts[, 3:20]
rownames(counts) <- data_counts$contig
counts <- remove_zero_rows(round(counts))
cds <- newCountDataSet(counts, condition = rep("a", 18))
cds <- estimateSizeFactors(cds)
cds <- counts(cds, normalize = TRUE)
rownames(cds) <- rownames(counts)
library("ggplot2")
library("reshape2")
colnames(counts) <- gsub(colnames(counts), pattern = "X([0-9]).([0-9])", replacement = "Fb_\\2.\\1")
plotcountcor <- function(counts) {
    c <- melt(cor(counts))
    p <- ggplot(data = c, aes_string(x = names(c)[1], y = names(c)[2], fill = "value")) + 
        geom_tile() + geom_text(aes(label = round(value, 2))) + xlab("") + ylab("")
    print(p)
}
plotcountcor(counts)

plot of chunk unnamed-chunk-4

log count distributions

cf <- as.data.frame(counts)
cf$contig <- rownames(counts)
counts_melt <- melt(cf, id = "contig")
ggplot(counts_melt, aes(x = log(value), colour = variable)) + geom_density()

## Warning: Removed 127361 rows containing non-finite values (stat_density).
## Warning: Removed 116277 rows containing non-finite values (stat_density).
## Warning: Removed 116073 rows containing non-finite values (stat_density).
## Warning: Removed 117926 rows containing non-finite values (stat_density).
## Warning: Removed 121018 rows containing non-finite values (stat_density).
## Warning: Removed 106893 rows containing non-finite values (stat_density).
## Warning: Removed 123945 rows containing non-finite values (stat_density).
## Warning: Removed 120890 rows containing non-finite values (stat_density).
## Warning: Removed 115994 rows containing non-finite values (stat_density).
## Warning: Removed 108688 rows containing non-finite values (stat_density).
## Warning: Removed 122543 rows containing non-finite values (stat_density).
## Warning: Removed 90288 rows containing non-finite values (stat_density).
## Warning: Removed 149746 rows containing non-finite values (stat_density).
## Warning: Removed 137272 rows containing non-finite values (stat_density).
## Warning: Removed 133943 rows containing non-finite values (stat_density).
## Warning: Removed 118488 rows containing non-finite values (stat_density).
## Warning: Removed 120348 rows containing non-finite values (stat_density).
## Warning: Removed 115559 rows containing non-finite values (stat_density).

plot of chunk unnamed-chunk-5

ggplot(counts_melt, aes(x = variable, y = value, colour = variable)) + geom_boxplot()

plot of chunk unnamed-chunk-5

plot distribution of counts over 50,000

counts_50k <- counts_melt[counts_melt$value > 50000, ]
ggplot(counts_50k, aes(x = variable, y = value, colour = variable)) + geom_boxplot()

plot of chunk unnamed-chunk-6

there are some high counts distorting the distributions - let's remove those rows for the purposes of checking how good replication is for the majority of genes

high_contigs <- unique(counts_50k[which(counts_50k$value > 2e+05), ]$contig)
fixed_tpm <- data_tpm[-which(data_tpm$contig %in% high_contigs), ]
names(fixed_tpm) <- gsub(names(fixed_tpm), pattern = "X([0-9]).([0-9])", replacement = "Fb\\2.\\1")
tpms <- fixed_tpm[, 3:20]
rownames(tpms) <- fixed_tpm$contig
plotcountcor(tpms)

plot of chunk unnamed-chunk-7

we can also use a correlation metric less sensitive to outliers


plotcountspearman <- function(counts) {
    c <- cor(counts, method = "spearman")
    print(c)
    c <- melt(c)
    c <- c[-which(c$value == 1), ]
    p <- ggplot(data = c, aes_string(x = names(c)[1], y = names(c)[2], fill = "value")) + 
        geom_tile() + geom_text(aes(label = round(value, 2))) + xlab("") + ylab("")
    print(p)
}
plotcountspearman(tpms)

##        Fb1.1  Fb2.1  Fb3.1  Fb4.1  Fb5.1  Fb6.1  Fb1.2  Fb2.2  Fb3.2
## Fb1.1 1.0000 0.7164 0.6835 0.6138 0.6147 0.5672 0.7240 0.7196 0.6842
## Fb2.1 0.7164 1.0000 0.7179 0.6492 0.6545 0.6127 0.7043 0.7294 0.7149
## Fb3.1 0.6835 0.7179 1.0000 0.6866 0.6958 0.6494 0.6737 0.7115 0.7290
## Fb4.1 0.6138 0.6492 0.6866 1.0000 0.6920 0.6437 0.6041 0.6424 0.6734
## Fb5.1 0.6147 0.6545 0.6958 0.6920 1.0000 0.6941 0.6097 0.6437 0.6795
## Fb6.1 0.5672 0.6127 0.6494 0.6437 0.6941 1.0000 0.5642 0.6030 0.6414
## Fb1.2 0.7240 0.7043 0.6737 0.6041 0.6097 0.5642 1.0000 0.7108 0.6757
## Fb2.2 0.7196 0.7294 0.7115 0.6424 0.6437 0.6030 0.7108 1.0000 0.7176
## Fb3.2 0.6842 0.7149 0.7290 0.6734 0.6795 0.6414 0.6757 0.7176 1.0000
## Fb4.2 0.6393 0.6749 0.7149 0.6830 0.7036 0.6753 0.6307 0.6794 0.7155
## Fb5.2 0.5939 0.6266 0.6728 0.6603 0.6991 0.6734 0.5903 0.6297 0.6668
## Fb6.2 0.4388 0.4654 0.5143 0.5040 0.5699 0.5629 0.4524 0.4717 0.5064
## Fb1.3 0.7068 0.6899 0.6709 0.6279 0.6397 0.5787 0.6991 0.6920 0.6681
## Fb2.3 0.7128 0.7127 0.6951 0.6426 0.6544 0.5994 0.7060 0.7145 0.6913
## Fb3.3 0.7049 0.7216 0.7185 0.6655 0.6764 0.6192 0.6986 0.7216 0.7133
## Fb4.3 0.6574 0.6913 0.7116 0.7003 0.6800 0.6374 0.6505 0.6907 0.7101
## Fb5.3 0.6309 0.6643 0.7047 0.6850 0.7179 0.6727 0.6279 0.6623 0.6942
## Fb6.3 0.5881 0.6266 0.6635 0.6596 0.7144 0.6977 0.5862 0.6163 0.6523
##        Fb4.2  Fb5.2  Fb6.2  Fb1.3  Fb2.3  Fb3.3  Fb4.3  Fb5.3  Fb6.3
## Fb1.1 0.6393 0.5939 0.4388 0.7068 0.7128 0.7049 0.6574 0.6309 0.5881
## Fb2.1 0.6749 0.6266 0.4654 0.6899 0.7127 0.7216 0.6913 0.6643 0.6266
## Fb3.1 0.7149 0.6728 0.5143 0.6709 0.6951 0.7185 0.7116 0.7047 0.6635
## Fb4.1 0.6830 0.6603 0.5040 0.6279 0.6426 0.6655 0.7003 0.6850 0.6596
## Fb5.1 0.7036 0.6991 0.5699 0.6397 0.6544 0.6764 0.6800 0.7179 0.7144
## Fb6.1 0.6753 0.6734 0.5629 0.5787 0.5994 0.6192 0.6374 0.6727 0.6977
## Fb1.2 0.6307 0.5903 0.4524 0.6991 0.7060 0.6986 0.6505 0.6279 0.5862
## Fb2.2 0.6794 0.6297 0.4717 0.6920 0.7145 0.7216 0.6907 0.6623 0.6163
## Fb3.2 0.7155 0.6668 0.5064 0.6681 0.6913 0.7133 0.7101 0.6942 0.6523
## Fb4.2 1.0000 0.6948 0.5395 0.6306 0.6540 0.6818 0.6983 0.7081 0.6756
## Fb5.2 0.6948 1.0000 0.5523 0.6052 0.6200 0.6452 0.6594 0.6867 0.6745
## Fb6.2 0.5395 0.5523 1.0000 0.4695 0.4729 0.4979 0.5007 0.5693 0.5698
## Fb1.3 0.6306 0.6052 0.4695 1.0000 0.7407 0.7241 0.6668 0.6500 0.6193
## Fb2.3 0.6540 0.6200 0.4729 0.7407 1.0000 0.7402 0.6893 0.6648 0.6322
## Fb3.3 0.6818 0.6452 0.4979 0.7241 0.7402 1.0000 0.7107 0.6921 0.6516
## Fb4.3 0.6983 0.6594 0.5007 0.6668 0.6893 0.7107 1.0000 0.6956 0.6564
## Fb5.3 0.7081 0.6867 0.5693 0.6500 0.6648 0.6921 0.6956 1.0000 0.6976
## Fb6.3 0.6756 0.6745 0.5698 0.6193 0.6322 0.6516 0.6564 0.6976 1.0000

plot of chunk unnamed-chunk-8

looking better - Fb5.2 and Fb6.2 are no longer completely different in the pearson correlation, but Fb6.2 is still looking wrong in the spearman. Also, Fb4.3 and Fb5.3 look like they are closer to the earlier stage in the other two replicates.

Just re-checking the distributions aggregated by sample…

tpms_melt <- melt(tpms)

## Using  as id variables

tpms_melt$value[tpms_melt$value < 0.01] <- 0
tpms_melt$stage <- gsub(tpms_melt$variable, pattern = "\\.[0-9]", replacement = "")
ggplot(tpms_melt, aes(x = log(value), group = stage, colour = stage)) + geom_density()

## Warning: Removed 408468 rows containing non-finite values (stat_density).
## Warning: Removed 382944 rows containing non-finite values (stat_density).
## Warning: Removed 374038 rows containing non-finite values (stat_density).
## Warning: Removed 353398 rows containing non-finite values (stat_density).
## Warning: Removed 371628 rows containing non-finite values (stat_density).
## Warning: Removed 320856 rows containing non-finite values (stat_density).

plot of chunk unnamed-chunk-9

and not aggregated…

ggplot(tpms_melt, aes(x = log(value), group = variable, colour = stage)) + geom_density() + 
    scale_colour_brewer(type = "qual", palette = 6)

## Warning: Removed 129682 rows containing non-finite values (stat_density).
## Warning: Removed 119372 rows containing non-finite values (stat_density).
## Warning: Removed 118826 rows containing non-finite values (stat_density).
## Warning: Removed 120616 rows containing non-finite values (stat_density).
## Warning: Removed 124051 rows containing non-finite values (stat_density).
## Warning: Removed 109565 rows containing non-finite values (stat_density).
## Warning: Removed 126504 rows containing non-finite values (stat_density).
## Warning: Removed 123633 rows containing non-finite values (stat_density).
## Warning: Removed 118654 rows containing non-finite values (stat_density).
## Warning: Removed 111558 rows containing non-finite values (stat_density).
## Warning: Removed 124729 rows containing non-finite values (stat_density).
## Warning: Removed 92896 rows containing non-finite values (stat_density).
## Warning: Removed 152282 rows containing non-finite values (stat_density).
## Warning: Removed 139939 rows containing non-finite values (stat_density).
## Warning: Removed 136558 rows containing non-finite values (stat_density).
## Warning: Removed 121224 rows containing non-finite values (stat_density).
## Warning: Removed 122848 rows containing non-finite values (stat_density).
## Warning: Removed 118395 rows containing non-finite values (stat_density).

plot of chunk unnamed-chunk-10

we probably should re-normalise TPMs within each column since we removed the highest rows

print(apply(tpms, 2, sum))

##  Fb1.1  Fb2.1  Fb3.1  Fb4.1  Fb5.1  Fb6.1  Fb1.2  Fb2.2  Fb3.2  Fb4.2 
## 999676 999702 999652 999642 999647 999467 999651 999598 999670 999647 
##  Fb5.2  Fb6.2  Fb1.3  Fb2.3  Fb3.3  Fb4.3  Fb5.3  Fb6.3 
## 954305 984396 999419 999521 999650 999588 999466 999480

renorm <- function(x) {
    x = (x/sum(x)) * 1e+06
}
tpms_rn <- as.data.frame(apply(tpms, 2, renorm))
print(apply(tpms_rn, 2, sum))

## Fb1.1 Fb2.1 Fb3.1 Fb4.1 Fb5.1 Fb6.1 Fb1.2 Fb2.2 Fb3.2 Fb4.2 Fb5.2 Fb6.2 
## 1e+06 1e+06 1e+06 1e+06 1e+06 1e+06 1e+06 1e+06 1e+06 1e+06 1e+06 1e+06 
## Fb1.3 Fb2.3 Fb3.3 Fb4.3 Fb5.3 Fb6.3 
## 1e+06 1e+06 1e+06 1e+06 1e+06 1e+06

and one more correlation plot check…

plotcountcor(tpms_rn)

plot of chunk unnamed-chunk-12

plotcountspearman(tpms_rn)

##        Fb1.1  Fb2.1  Fb3.1  Fb4.1  Fb5.1  Fb6.1  Fb1.2  Fb2.2  Fb3.2
## Fb1.1 1.0000 0.7164 0.6835 0.6138 0.6147 0.5672 0.7240 0.7196 0.6842
## Fb2.1 0.7164 1.0000 0.7179 0.6492 0.6545 0.6127 0.7043 0.7294 0.7149
## Fb3.1 0.6835 0.7179 1.0000 0.6866 0.6958 0.6494 0.6737 0.7115 0.7290
## Fb4.1 0.6138 0.6492 0.6866 1.0000 0.6920 0.6437 0.6041 0.6424 0.6734
## Fb5.1 0.6147 0.6545 0.6958 0.6920 1.0000 0.6941 0.6097 0.6437 0.6795
## Fb6.1 0.5672 0.6127 0.6494 0.6437 0.6941 1.0000 0.5642 0.6030 0.6414
## Fb1.2 0.7240 0.7043 0.6737 0.6041 0.6097 0.5642 1.0000 0.7108 0.6757
## Fb2.2 0.7196 0.7294 0.7115 0.6424 0.6437 0.6030 0.7108 1.0000 0.7176
## Fb3.2 0.6842 0.7149 0.7290 0.6734 0.6795 0.6414 0.6757 0.7176 1.0000
## Fb4.2 0.6393 0.6749 0.7149 0.6830 0.7036 0.6753 0.6307 0.6794 0.7155
## Fb5.2 0.5939 0.6266 0.6728 0.6603 0.6991 0.6734 0.5903 0.6297 0.6668
## Fb6.2 0.4388 0.4654 0.5143 0.5040 0.5699 0.5629 0.4524 0.4717 0.5064
## Fb1.3 0.7068 0.6899 0.6709 0.6279 0.6397 0.5787 0.6991 0.6920 0.6681
## Fb2.3 0.7128 0.7127 0.6951 0.6426 0.6544 0.5994 0.7060 0.7145 0.6913
## Fb3.3 0.7049 0.7216 0.7185 0.6655 0.6764 0.6192 0.6986 0.7216 0.7133
## Fb4.3 0.6574 0.6913 0.7116 0.7003 0.6800 0.6374 0.6505 0.6907 0.7101
## Fb5.3 0.6309 0.6643 0.7047 0.6850 0.7179 0.6727 0.6279 0.6623 0.6942
## Fb6.3 0.5881 0.6266 0.6635 0.6596 0.7144 0.6977 0.5862 0.6163 0.6523
##        Fb4.2  Fb5.2  Fb6.2  Fb1.3  Fb2.3  Fb3.3  Fb4.3  Fb5.3  Fb6.3
## Fb1.1 0.6393 0.5939 0.4388 0.7068 0.7128 0.7049 0.6574 0.6309 0.5881
## Fb2.1 0.6749 0.6266 0.4654 0.6899 0.7127 0.7216 0.6913 0.6643 0.6266
## Fb3.1 0.7149 0.6728 0.5143 0.6709 0.6951 0.7185 0.7116 0.7047 0.6635
## Fb4.1 0.6830 0.6603 0.5040 0.6279 0.6426 0.6655 0.7003 0.6850 0.6596
## Fb5.1 0.7036 0.6991 0.5699 0.6397 0.6544 0.6764 0.6800 0.7179 0.7144
## Fb6.1 0.6753 0.6734 0.5629 0.5787 0.5994 0.6192 0.6374 0.6727 0.6977
## Fb1.2 0.6307 0.5903 0.4524 0.6991 0.7060 0.6986 0.6505 0.6279 0.5862
## Fb2.2 0.6794 0.6297 0.4717 0.6920 0.7145 0.7216 0.6907 0.6623 0.6163
## Fb3.2 0.7155 0.6668 0.5064 0.6681 0.6913 0.7133 0.7101 0.6942 0.6523
## Fb4.2 1.0000 0.6948 0.5395 0.6306 0.6540 0.6818 0.6983 0.7081 0.6756
## Fb5.2 0.6948 1.0000 0.5523 0.6052 0.6200 0.6452 0.6594 0.6867 0.6745
## Fb6.2 0.5395 0.5523 1.0000 0.4695 0.4729 0.4979 0.5007 0.5693 0.5698
## Fb1.3 0.6306 0.6052 0.4695 1.0000 0.7407 0.7241 0.6668 0.6500 0.6193
## Fb2.3 0.6540 0.6200 0.4729 0.7407 1.0000 0.7402 0.6893 0.6648 0.6322
## Fb3.3 0.6818 0.6452 0.4979 0.7241 0.7402 1.0000 0.7107 0.6921 0.6516
## Fb4.3 0.6983 0.6594 0.5007 0.6668 0.6893 0.7107 1.0000 0.6956 0.6564
## Fb5.3 0.7081 0.6867 0.5693 0.6500 0.6648 0.6921 0.6956 1.0000 0.6976
## Fb6.3 0.6756 0.6745 0.5698 0.6193 0.6322 0.6516 0.6564 0.6976 1.0000

plot of chunk unnamed-chunk-12

maybe the chloroplast genes are causing the problem - if chloroplasts were highly expressing or were captured at higher rates in one replicate's extractions, that could lead to broken correlations

chloro <- read.csv("../chloroplast_contigs.txt", head = F)
names(chloro)[1] <- "contig"
nc_tpms <- fixed_tpm[-which(fixed_tpm$contig %in% chloro$contig), ]
tpms <- nc_tpms[, 3:20]
rownames(tpms) <- nc_tpms$contig
tpms_rn <- as.data.frame(apply(tpms, 2, renorm))
plotcountcor(tpms_rn)

plot of chunk unnamed-chunk-13