Load the data
setwd("/data2/rnaseq/flaveria/assemblies/fb_ex/")
data_tpm <- read.csv("tpm_columns.txt", sep = "\t", head = T)
Correlation matrix
tpms <- data_tpm[, 3:20]
library("ggplot2")
library("reshape2")
names(tpms) <- gsub(names(tpms), pattern = "X([0-9]).([0-9])", replacement = "Fb\\2.\\1")
c <- melt(cor(tpms))
p <- ggplot(data = c, aes_string(x = names(c)[1], y = names(c)[2], fill = "value")) +
geom_tile() + xlab("") + ylab("")
p
tpms_melt <- melt(tpms)
## Using as id variables
tpms_melt$value[tpms_melt$value < 0.01] <- 0
tpms_melt$stage <- gsub(tpms_melt$variable, pattern = "\\.[0-9]", replacement = "")
ggplot(tpms_melt, aes(x = log(value), group = stage, colour = stage)) + geom_density()
## Warning: Removed 408468 rows containing non-finite values (stat_density).
## Warning: Removed 382944 rows containing non-finite values (stat_density).
## Warning: Removed 374038 rows containing non-finite values (stat_density).
## Warning: Removed 353398 rows containing non-finite values (stat_density).
## Warning: Removed 371628 rows containing non-finite values (stat_density).
## Warning: Removed 320856 rows containing non-finite values (stat_density).
plotheatmap <- function(counts) {
dists <- dist(t(as.matrix(tpms)))
mat <- as.matrix(dists)
print(heatmap.2(mat, trace = "none"))
}
plotheatmap()
## Error: could not find function "heatmap.2"
data_counts <- read.csv("express_output_eff_counts.txt", sep = "\t", head = T)
library(EBSeq)
## Loading required package: blockmodeling
## Loading required package: gplots
## KernSmooth 2.23 loaded
## Copyright M. P. Wand 1997-2009
##
## Attaching package: 'gplots'
##
## The following object is masked from 'package:stats':
##
## lowess
library(DESeq)
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
##
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
##
## The following object is masked from 'package:stats':
##
## xtabs
##
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, as.vector, cbind,
## colnames, duplicated, eval, evalq, Filter, Find, get,
## intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rep.int, rownames, sapply, setdiff, sort,
## table, tapply, union, unique, unlist
##
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
## Loading required package: locfit
## locfit 1.5-9.1 2013-03-22
## Loading required package: lattice
## Welcome to 'DESeq'. For improved performance, usability and
## functionality, please consider migrating to 'DESeq2'.
# ` Remove any rows containing all-zeros
remove_zero_rows <- function(df) {
df[apply(df, 1, function(x) !all(x == 0)), ]
}
normalise_counts <- function(counts, normfactors) {
round(t(t(counts)/normfactors))
}
counts <- data_counts[, 3:20]
rownames(counts) <- data_counts$contig
counts <- remove_zero_rows(round(counts))
cds <- newCountDataSet(counts, condition = rep("a", 18))
cds <- estimateSizeFactors(cds)
cds <- counts(cds, normalize = TRUE)
rownames(cds) <- rownames(counts)
library("ggplot2")
library("reshape2")
colnames(counts) <- gsub(colnames(counts), pattern = "X([0-9]).([0-9])", replacement = "Fb_\\2.\\1")
plotcountcor <- function(counts) {
c <- melt(cor(counts))
p <- ggplot(data = c, aes_string(x = names(c)[1], y = names(c)[2], fill = "value")) +
geom_tile() + geom_text(aes(label = round(value, 2))) + xlab("") + ylab("")
print(p)
}
plotcountcor(counts)
log count distributions
cf <- as.data.frame(counts)
cf$contig <- rownames(counts)
counts_melt <- melt(cf, id = "contig")
ggplot(counts_melt, aes(x = log(value), colour = variable)) + geom_density()
## Warning: Removed 127361 rows containing non-finite values (stat_density).
## Warning: Removed 116277 rows containing non-finite values (stat_density).
## Warning: Removed 116073 rows containing non-finite values (stat_density).
## Warning: Removed 117926 rows containing non-finite values (stat_density).
## Warning: Removed 121018 rows containing non-finite values (stat_density).
## Warning: Removed 106893 rows containing non-finite values (stat_density).
## Warning: Removed 123945 rows containing non-finite values (stat_density).
## Warning: Removed 120890 rows containing non-finite values (stat_density).
## Warning: Removed 115994 rows containing non-finite values (stat_density).
## Warning: Removed 108688 rows containing non-finite values (stat_density).
## Warning: Removed 122543 rows containing non-finite values (stat_density).
## Warning: Removed 90288 rows containing non-finite values (stat_density).
## Warning: Removed 149746 rows containing non-finite values (stat_density).
## Warning: Removed 137272 rows containing non-finite values (stat_density).
## Warning: Removed 133943 rows containing non-finite values (stat_density).
## Warning: Removed 118488 rows containing non-finite values (stat_density).
## Warning: Removed 120348 rows containing non-finite values (stat_density).
## Warning: Removed 115559 rows containing non-finite values (stat_density).
ggplot(counts_melt, aes(x = variable, y = value, colour = variable)) + geom_boxplot()
plot distribution of counts over 50,000
counts_50k <- counts_melt[counts_melt$value > 50000, ]
ggplot(counts_50k, aes(x = variable, y = value, colour = variable)) + geom_boxplot()
there are some high counts distorting the distributions - let's remove those rows for the purposes of checking how good replication is for the majority of genes
high_contigs <- unique(counts_50k[which(counts_50k$value > 2e+05), ]$contig)
fixed_tpm <- data_tpm[-which(data_tpm$contig %in% high_contigs), ]
names(fixed_tpm) <- gsub(names(fixed_tpm), pattern = "X([0-9]).([0-9])", replacement = "Fb\\2.\\1")
tpms <- fixed_tpm[, 3:20]
rownames(tpms) <- fixed_tpm$contig
plotcountcor(tpms)
we can also use a correlation metric less sensitive to outliers
plotcountspearman <- function(counts) {
c <- cor(counts, method = "spearman")
print(c)
c <- melt(c)
c <- c[-which(c$value == 1), ]
p <- ggplot(data = c, aes_string(x = names(c)[1], y = names(c)[2], fill = "value")) +
geom_tile() + geom_text(aes(label = round(value, 2))) + xlab("") + ylab("")
print(p)
}
plotcountspearman(tpms)
## Fb1.1 Fb2.1 Fb3.1 Fb4.1 Fb5.1 Fb6.1 Fb1.2 Fb2.2 Fb3.2
## Fb1.1 1.0000 0.7164 0.6835 0.6138 0.6147 0.5672 0.7240 0.7196 0.6842
## Fb2.1 0.7164 1.0000 0.7179 0.6492 0.6545 0.6127 0.7043 0.7294 0.7149
## Fb3.1 0.6835 0.7179 1.0000 0.6866 0.6958 0.6494 0.6737 0.7115 0.7290
## Fb4.1 0.6138 0.6492 0.6866 1.0000 0.6920 0.6437 0.6041 0.6424 0.6734
## Fb5.1 0.6147 0.6545 0.6958 0.6920 1.0000 0.6941 0.6097 0.6437 0.6795
## Fb6.1 0.5672 0.6127 0.6494 0.6437 0.6941 1.0000 0.5642 0.6030 0.6414
## Fb1.2 0.7240 0.7043 0.6737 0.6041 0.6097 0.5642 1.0000 0.7108 0.6757
## Fb2.2 0.7196 0.7294 0.7115 0.6424 0.6437 0.6030 0.7108 1.0000 0.7176
## Fb3.2 0.6842 0.7149 0.7290 0.6734 0.6795 0.6414 0.6757 0.7176 1.0000
## Fb4.2 0.6393 0.6749 0.7149 0.6830 0.7036 0.6753 0.6307 0.6794 0.7155
## Fb5.2 0.5939 0.6266 0.6728 0.6603 0.6991 0.6734 0.5903 0.6297 0.6668
## Fb6.2 0.4388 0.4654 0.5143 0.5040 0.5699 0.5629 0.4524 0.4717 0.5064
## Fb1.3 0.7068 0.6899 0.6709 0.6279 0.6397 0.5787 0.6991 0.6920 0.6681
## Fb2.3 0.7128 0.7127 0.6951 0.6426 0.6544 0.5994 0.7060 0.7145 0.6913
## Fb3.3 0.7049 0.7216 0.7185 0.6655 0.6764 0.6192 0.6986 0.7216 0.7133
## Fb4.3 0.6574 0.6913 0.7116 0.7003 0.6800 0.6374 0.6505 0.6907 0.7101
## Fb5.3 0.6309 0.6643 0.7047 0.6850 0.7179 0.6727 0.6279 0.6623 0.6942
## Fb6.3 0.5881 0.6266 0.6635 0.6596 0.7144 0.6977 0.5862 0.6163 0.6523
## Fb4.2 Fb5.2 Fb6.2 Fb1.3 Fb2.3 Fb3.3 Fb4.3 Fb5.3 Fb6.3
## Fb1.1 0.6393 0.5939 0.4388 0.7068 0.7128 0.7049 0.6574 0.6309 0.5881
## Fb2.1 0.6749 0.6266 0.4654 0.6899 0.7127 0.7216 0.6913 0.6643 0.6266
## Fb3.1 0.7149 0.6728 0.5143 0.6709 0.6951 0.7185 0.7116 0.7047 0.6635
## Fb4.1 0.6830 0.6603 0.5040 0.6279 0.6426 0.6655 0.7003 0.6850 0.6596
## Fb5.1 0.7036 0.6991 0.5699 0.6397 0.6544 0.6764 0.6800 0.7179 0.7144
## Fb6.1 0.6753 0.6734 0.5629 0.5787 0.5994 0.6192 0.6374 0.6727 0.6977
## Fb1.2 0.6307 0.5903 0.4524 0.6991 0.7060 0.6986 0.6505 0.6279 0.5862
## Fb2.2 0.6794 0.6297 0.4717 0.6920 0.7145 0.7216 0.6907 0.6623 0.6163
## Fb3.2 0.7155 0.6668 0.5064 0.6681 0.6913 0.7133 0.7101 0.6942 0.6523
## Fb4.2 1.0000 0.6948 0.5395 0.6306 0.6540 0.6818 0.6983 0.7081 0.6756
## Fb5.2 0.6948 1.0000 0.5523 0.6052 0.6200 0.6452 0.6594 0.6867 0.6745
## Fb6.2 0.5395 0.5523 1.0000 0.4695 0.4729 0.4979 0.5007 0.5693 0.5698
## Fb1.3 0.6306 0.6052 0.4695 1.0000 0.7407 0.7241 0.6668 0.6500 0.6193
## Fb2.3 0.6540 0.6200 0.4729 0.7407 1.0000 0.7402 0.6893 0.6648 0.6322
## Fb3.3 0.6818 0.6452 0.4979 0.7241 0.7402 1.0000 0.7107 0.6921 0.6516
## Fb4.3 0.6983 0.6594 0.5007 0.6668 0.6893 0.7107 1.0000 0.6956 0.6564
## Fb5.3 0.7081 0.6867 0.5693 0.6500 0.6648 0.6921 0.6956 1.0000 0.6976
## Fb6.3 0.6756 0.6745 0.5698 0.6193 0.6322 0.6516 0.6564 0.6976 1.0000
looking better - Fb5.2 and Fb6.2 are no longer completely different in the pearson correlation, but Fb6.2 is still looking wrong in the spearman. Also, Fb4.3 and Fb5.3 look like they are closer to the earlier stage in the other two replicates.
Just re-checking the distributions aggregated by sample…
tpms_melt <- melt(tpms)
## Using as id variables
tpms_melt$value[tpms_melt$value < 0.01] <- 0
tpms_melt$stage <- gsub(tpms_melt$variable, pattern = "\\.[0-9]", replacement = "")
ggplot(tpms_melt, aes(x = log(value), group = stage, colour = stage)) + geom_density()
## Warning: Removed 408468 rows containing non-finite values (stat_density).
## Warning: Removed 382944 rows containing non-finite values (stat_density).
## Warning: Removed 374038 rows containing non-finite values (stat_density).
## Warning: Removed 353398 rows containing non-finite values (stat_density).
## Warning: Removed 371628 rows containing non-finite values (stat_density).
## Warning: Removed 320856 rows containing non-finite values (stat_density).
and not aggregated…
ggplot(tpms_melt, aes(x = log(value), group = variable, colour = stage)) + geom_density() +
scale_colour_brewer(type = "qual", palette = 6)
## Warning: Removed 129682 rows containing non-finite values (stat_density).
## Warning: Removed 119372 rows containing non-finite values (stat_density).
## Warning: Removed 118826 rows containing non-finite values (stat_density).
## Warning: Removed 120616 rows containing non-finite values (stat_density).
## Warning: Removed 124051 rows containing non-finite values (stat_density).
## Warning: Removed 109565 rows containing non-finite values (stat_density).
## Warning: Removed 126504 rows containing non-finite values (stat_density).
## Warning: Removed 123633 rows containing non-finite values (stat_density).
## Warning: Removed 118654 rows containing non-finite values (stat_density).
## Warning: Removed 111558 rows containing non-finite values (stat_density).
## Warning: Removed 124729 rows containing non-finite values (stat_density).
## Warning: Removed 92896 rows containing non-finite values (stat_density).
## Warning: Removed 152282 rows containing non-finite values (stat_density).
## Warning: Removed 139939 rows containing non-finite values (stat_density).
## Warning: Removed 136558 rows containing non-finite values (stat_density).
## Warning: Removed 121224 rows containing non-finite values (stat_density).
## Warning: Removed 122848 rows containing non-finite values (stat_density).
## Warning: Removed 118395 rows containing non-finite values (stat_density).
we probably should re-normalise TPMs within each column since we removed the highest rows
print(apply(tpms, 2, sum))
## Fb1.1 Fb2.1 Fb3.1 Fb4.1 Fb5.1 Fb6.1 Fb1.2 Fb2.2 Fb3.2 Fb4.2
## 999676 999702 999652 999642 999647 999467 999651 999598 999670 999647
## Fb5.2 Fb6.2 Fb1.3 Fb2.3 Fb3.3 Fb4.3 Fb5.3 Fb6.3
## 954305 984396 999419 999521 999650 999588 999466 999480
renorm <- function(x) {
x = (x/sum(x)) * 1e+06
}
tpms_rn <- as.data.frame(apply(tpms, 2, renorm))
print(apply(tpms_rn, 2, sum))
## Fb1.1 Fb2.1 Fb3.1 Fb4.1 Fb5.1 Fb6.1 Fb1.2 Fb2.2 Fb3.2 Fb4.2 Fb5.2 Fb6.2
## 1e+06 1e+06 1e+06 1e+06 1e+06 1e+06 1e+06 1e+06 1e+06 1e+06 1e+06 1e+06
## Fb1.3 Fb2.3 Fb3.3 Fb4.3 Fb5.3 Fb6.3
## 1e+06 1e+06 1e+06 1e+06 1e+06 1e+06
and one more correlation plot check…
plotcountcor(tpms_rn)
plotcountspearman(tpms_rn)
## Fb1.1 Fb2.1 Fb3.1 Fb4.1 Fb5.1 Fb6.1 Fb1.2 Fb2.2 Fb3.2
## Fb1.1 1.0000 0.7164 0.6835 0.6138 0.6147 0.5672 0.7240 0.7196 0.6842
## Fb2.1 0.7164 1.0000 0.7179 0.6492 0.6545 0.6127 0.7043 0.7294 0.7149
## Fb3.1 0.6835 0.7179 1.0000 0.6866 0.6958 0.6494 0.6737 0.7115 0.7290
## Fb4.1 0.6138 0.6492 0.6866 1.0000 0.6920 0.6437 0.6041 0.6424 0.6734
## Fb5.1 0.6147 0.6545 0.6958 0.6920 1.0000 0.6941 0.6097 0.6437 0.6795
## Fb6.1 0.5672 0.6127 0.6494 0.6437 0.6941 1.0000 0.5642 0.6030 0.6414
## Fb1.2 0.7240 0.7043 0.6737 0.6041 0.6097 0.5642 1.0000 0.7108 0.6757
## Fb2.2 0.7196 0.7294 0.7115 0.6424 0.6437 0.6030 0.7108 1.0000 0.7176
## Fb3.2 0.6842 0.7149 0.7290 0.6734 0.6795 0.6414 0.6757 0.7176 1.0000
## Fb4.2 0.6393 0.6749 0.7149 0.6830 0.7036 0.6753 0.6307 0.6794 0.7155
## Fb5.2 0.5939 0.6266 0.6728 0.6603 0.6991 0.6734 0.5903 0.6297 0.6668
## Fb6.2 0.4388 0.4654 0.5143 0.5040 0.5699 0.5629 0.4524 0.4717 0.5064
## Fb1.3 0.7068 0.6899 0.6709 0.6279 0.6397 0.5787 0.6991 0.6920 0.6681
## Fb2.3 0.7128 0.7127 0.6951 0.6426 0.6544 0.5994 0.7060 0.7145 0.6913
## Fb3.3 0.7049 0.7216 0.7185 0.6655 0.6764 0.6192 0.6986 0.7216 0.7133
## Fb4.3 0.6574 0.6913 0.7116 0.7003 0.6800 0.6374 0.6505 0.6907 0.7101
## Fb5.3 0.6309 0.6643 0.7047 0.6850 0.7179 0.6727 0.6279 0.6623 0.6942
## Fb6.3 0.5881 0.6266 0.6635 0.6596 0.7144 0.6977 0.5862 0.6163 0.6523
## Fb4.2 Fb5.2 Fb6.2 Fb1.3 Fb2.3 Fb3.3 Fb4.3 Fb5.3 Fb6.3
## Fb1.1 0.6393 0.5939 0.4388 0.7068 0.7128 0.7049 0.6574 0.6309 0.5881
## Fb2.1 0.6749 0.6266 0.4654 0.6899 0.7127 0.7216 0.6913 0.6643 0.6266
## Fb3.1 0.7149 0.6728 0.5143 0.6709 0.6951 0.7185 0.7116 0.7047 0.6635
## Fb4.1 0.6830 0.6603 0.5040 0.6279 0.6426 0.6655 0.7003 0.6850 0.6596
## Fb5.1 0.7036 0.6991 0.5699 0.6397 0.6544 0.6764 0.6800 0.7179 0.7144
## Fb6.1 0.6753 0.6734 0.5629 0.5787 0.5994 0.6192 0.6374 0.6727 0.6977
## Fb1.2 0.6307 0.5903 0.4524 0.6991 0.7060 0.6986 0.6505 0.6279 0.5862
## Fb2.2 0.6794 0.6297 0.4717 0.6920 0.7145 0.7216 0.6907 0.6623 0.6163
## Fb3.2 0.7155 0.6668 0.5064 0.6681 0.6913 0.7133 0.7101 0.6942 0.6523
## Fb4.2 1.0000 0.6948 0.5395 0.6306 0.6540 0.6818 0.6983 0.7081 0.6756
## Fb5.2 0.6948 1.0000 0.5523 0.6052 0.6200 0.6452 0.6594 0.6867 0.6745
## Fb6.2 0.5395 0.5523 1.0000 0.4695 0.4729 0.4979 0.5007 0.5693 0.5698
## Fb1.3 0.6306 0.6052 0.4695 1.0000 0.7407 0.7241 0.6668 0.6500 0.6193
## Fb2.3 0.6540 0.6200 0.4729 0.7407 1.0000 0.7402 0.6893 0.6648 0.6322
## Fb3.3 0.6818 0.6452 0.4979 0.7241 0.7402 1.0000 0.7107 0.6921 0.6516
## Fb4.3 0.6983 0.6594 0.5007 0.6668 0.6893 0.7107 1.0000 0.6956 0.6564
## Fb5.3 0.7081 0.6867 0.5693 0.6500 0.6648 0.6921 0.6956 1.0000 0.6976
## Fb6.3 0.6756 0.6745 0.5698 0.6193 0.6322 0.6516 0.6564 0.6976 1.0000
maybe the chloroplast genes are causing the problem - if chloroplasts were highly expressing or were captured at higher rates in one replicate's extractions, that could lead to broken correlations
chloro <- read.csv("../chloroplast_contigs.txt", head = F)
names(chloro)[1] <- "contig"
nc_tpms <- fixed_tpm[-which(fixed_tpm$contig %in% chloro$contig), ]
tpms <- nc_tpms[, 3:20]
rownames(tpms) <- nc_tpms$contig
tpms_rn <- as.data.frame(apply(tpms, 2, renorm))
plotcountcor(tpms_rn)