Flaveria robusta eXpress data exploration

Load the data

library(ggplot2)
library(gplots)
## KernSmooth 2.23 loaded
## Copyright M. P. Wand 1997-2009
## 
## Attaching package: 'gplots'
## 
## The following object is masked from 'package:stats':
## 
##     lowess
library(reshape2)
setwd('~/hydrogen/flaveria/Fr/Fr_2/')
#setwd('~/flaveria/Fr/Fr_2/')
data_tpm <- read.csv('Fr_express.tpm', sep="\t", head=T)
data_counts <- read.csv('Fr_express.eff_count', sep="\t", head=T)

TPM

Pearson Correlation matrix

library(BBmisc)
tpms <- data_tpm[,3:20]
names(tpms) <- gsub(names(tpms), pattern="Fr_([0-9])\\.([0-9])", replacement="\\2.\\1")

tmp <- lapply(names(tpms), function(x) explode(x,sep="\\."))
names(tpms) <- lapply(tmp, function(x) paste(7-as.numeric(x[1]), x[2], sep="."))

rownames(tpms) <- data_tpm$contig
correlation <- cor(tpms)
cm <- melt(correlation)
cm$Var1 <- as.factor(cm$Var1)
cm$Var2 <- as.factor(cm$Var2)
mid<-(max(cm$value, na.rm=TRUE)+min(cm$value, na.rm=TRUE))/2
ggplot(data=cm, aes(x=Var1, y=Var2)) + 
  geom_tile(aes(fill=value)) +
  ggtitle("TPM Pearson Correlation Matrix") +
  geom_text(aes(label = round(value, 2))) +
  xlab('') +
  ylab('') +
  scale_fill_gradient2(low="cyan", mid="yellow", high="red", midpoint=mid, space="Lab", na.value = "grey50", guide="colorbar")

plot of chunk tpm_correlation

Expression Density Plot on TPMs

tpms_melt <- melt(tpms)
## No id variables; using all as measure variables
tpms_melt$value[tpms_melt$value < 0.01] <- 0
tpms_melt$stage <- gsub(tpms_melt$variable, pattern="\\.[0-9]", replacement="")

ggplot(tpms_melt, aes(x=log(value), group=stage, colour=stage)) +
  geom_density()
## Warning: Removed 355023 rows containing non-finite values (stat_density).
## Warning: Removed 376407 rows containing non-finite values (stat_density).
## Warning: Removed 394947 rows containing non-finite values (stat_density).
## Warning: Removed 402205 rows containing non-finite values (stat_density).
## Warning: Removed 407993 rows containing non-finite values (stat_density).
## Warning: Removed 502191 rows containing non-finite values (stat_density).

plot of chunk tpm_density

# ggsave("Fr_density.tpm.pdf")

Heatmap on TPMs

plotheatmap <- function(data) {
  dists <- dist(t(as.matrix(data)))
  mat <- as.matrix(dists)
  heatmap.2(mat, trace="none")
}
#pdf("Fr_heatmap.pdf")
plotheatmap(tpms)

plot of chunk heatmap

#dev.off()

Effective counts

#library(EBSeq)
#library(DESeq) # doesn't install
# Remove any rows containing all-zeros
remove_zero_rows <- function(df) {
  df[apply(df, 1, function(x) !all(x==0)),]
}

normalise_counts <- function(counts, normfactors) {
  round(t(t(counts) / normfactors))
}

counts <- data_counts[,3:20]
colnames(counts) <- gsub(colnames(counts), pattern="Fr_([0-9]).([0-9])", replacement="\\2.\\1")

tmp <- lapply(names(counts), function(x) explode(x,sep="\\."))
names(counts) <- lapply(tmp, function(x) paste(7-as.numeric(x[1]), x[2], sep="."))

rownames(counts) <- data_counts$contig
counts <- remove_zero_rows(round(counts))

cm <- melt(cor(counts))
cm$Var1 <- as.factor(cm$Var1)
cm$Var2 <- as.factor(cm$Var2)
mid<-(max(cm$value, na.rm=TRUE)+min(cm$value, na.rm=TRUE))/2

ggplot(data=cm, aes(x=Var1, y=Var2)) +
  geom_tile(aes(fill=value)) +
  ggtitle("Effective Count Correlation matrix") +
  geom_text(aes(label = round(value, 2))) +
  xlab('') +
  ylab('') +
  scale_fill_gradient2(low="cyan", mid="yellow", high="red", midpoint=mid, space="Lab", na.value = "grey50", guide="colorbar")

plot of chunk eff_corr

# ggsave("Fr_correlation.eff.pdf")

Log distributions of Effective Counts

cf <- as.data.frame(counts)
cf <- cf[c(6,5,4,3,2,1,12,11,10,9,8,7,18,17,16,15,14,13)]
cf$contig <- rownames(counts)
counts_melt <- melt(cf, id='contig')
ggplot(counts_melt, aes(x=log(value), colour=variable)) +
  geom_density() +
  ggtitle("Effective Count Log Density")
## Warning: Removed 112528 rows containing non-finite values (stat_density).
## Warning: Removed 122677 rows containing non-finite values (stat_density).
## Warning: Removed 129828 rows containing non-finite values (stat_density).
## Warning: Removed 131359 rows containing non-finite values (stat_density).
## Warning: Removed 127418 rows containing non-finite values (stat_density).
## Warning: Removed 221758 rows containing non-finite values (stat_density).
## Warning: Removed 115542 rows containing non-finite values (stat_density).
## Warning: Removed 131613 rows containing non-finite values (stat_density).
## Warning: Removed 131927 rows containing non-finite values (stat_density).
## Warning: Removed 131764 rows containing non-finite values (stat_density).
## Warning: Removed 136712 rows containing non-finite values (stat_density).
## Warning: Removed 139107 rows containing non-finite values (stat_density).
## Warning: Removed 115333 rows containing non-finite values (stat_density).
## Warning: Removed 109883 rows containing non-finite values (stat_density).
## Warning: Removed 121226 rows containing non-finite values (stat_density).
## Warning: Removed 126440 rows containing non-finite values (stat_density).
## Warning: Removed 131301 rows containing non-finite values (stat_density).
## Warning: Removed 130164 rows containing non-finite values (stat_density).

plot of chunk log_count

# ggsave("Fr_density.eff.pdf")

Box plot with outliers

ggplot(counts_melt, aes(x=variable, y=value, colour=variable)) + 
  geom_boxplot() +
  ggtitle("Boxplots")

plot of chunk boxplot1

# ggsave("Fr_boxplot.eff.pdf")

Plot distribution of counts over 50,000

counts_50k <- counts_melt[counts_melt$value > 50000,]
ggplot(counts_50k, aes(x=variable, y=value, colour=variable)) + 
  geom_boxplot() +
  ggtitle("Boxplot of Outliers")

plot of chunk boxplot2

# ggsave("Fr_boxplot50k.eff.pdf")

There are some high counts (>2e+05) distorting the distributions - let's remove those rows for the purposes of checking how good replication is for the majority of genes

high_contigs <- unique(counts_50k[which(counts_50k$value > 2e+05),]$contig)
fixed_tpm <- data_tpm[-which(data_tpm$contig %in% high_contigs),] # remove highly expressed contigs
fixed_tpm <- fixed_tpm[,-c(21)]
names(fixed_tpm) <- gsub(names(fixed_tpm), pattern="Fr_([0-9]).([0-9])", replacement="\\2.\\1")

tmp <- lapply(names(fixed_tpm), function(x) { 
    if (nchar(x)==3) {
      explode(x,sep="\\.")
    } else {
      x
    }
  })
names(fixed_tpm) <- lapply(tmp, function(x) {
  if (length(x)==2) {
    paste(7-as.numeric(x[1]), x[2], sep=".")
  } else {
    x
  }
})

tpms <- fixed_tpm[,3:20]
rownames(tpms) <- fixed_tpm$contig

cm <- melt(cor(tpms))
cm$Var1 <- as.factor(cm$Var1)
cm$Var2 <- as.factor(cm$Var2)
mid<-(max(cm$value, na.rm=TRUE)+min(cm$value, na.rm=TRUE))/2

ggplot(data=cm, aes(x=Var1, y=Var2)) +
  geom_tile(aes(fill=value)) +
  ggtitle("TPM Pearson Correlation Matrix with high expression removed") +
  geom_text(aes(label = round(value, 2))) +
  xlab('') +
  ylab('') +
  scale_fill_gradient2(low="cyan", mid="yellow", high="red", midpoint=mid, space="Lab", na.value = "grey50", guide="colorbar")

plot of chunk unnamed-chunk-2

# ggsave("Fr_correlation_no_outliers.tpm.pdf")

We can also use a correlation metric less sensitive to outliers

cm <- melt(cor(counts, method = "spearman"))
cm <- cm[-which(cm$value == 1), ]
cm$Var1 <- as.factor(cm$Var1)
cm$Var2 <- as.factor(cm$Var2)
mid<-(max(cm$value, na.rm=TRUE)+min(cm$value, na.rm=TRUE))/2

ggplot(data=cm, aes(x=Var1, y=Var2)) +
  geom_tile(aes(fill=value)) +
  ggtitle("Effective Count Spearman Correlation matrix") +
  geom_text(aes(label = round(value, 2))) +
  xlab('') +
  ylab('') +
  scale_fill_gradient2(low="cyan", mid="yellow", high="red", midpoint=mid, space="Lab", na.value = "grey50", guide="colorbar")

plot of chunk unnamed-chunk-3

Just re-checking the distributions with outliers removed and aggregated by sample

tpms_melt <- melt(tpms)
## No id variables; using all as measure variables
tpms_melt$value[tpms_melt$value < 0.01] <- 0
tpms_melt$stage <- gsub(tpms_melt$variable, pattern="\\.[0-9]", replacement="")
ggplot(tpms_melt, aes(x=log(value), group=stage, colour=stage)) +
  geom_density() +
  ggtitle("TPM Density plot by section")
## Warning: Removed 355023 rows containing non-finite values (stat_density).
## Warning: Removed 376407 rows containing non-finite values (stat_density).
## Warning: Removed 394947 rows containing non-finite values (stat_density).
## Warning: Removed 402205 rows containing non-finite values (stat_density).
## Warning: Removed 407993 rows containing non-finite values (stat_density).
## Warning: Removed 502191 rows containing non-finite values (stat_density).

plot of chunk unnamed-chunk-4

ggsave("Fr_density_no_outliers.tpm.pdf")
## Saving 12 x 12 in image
## Warning: Removed 355023 rows containing non-finite values (stat_density).
## Warning: Removed 376407 rows containing non-finite values (stat_density).
## Warning: Removed 394947 rows containing non-finite values (stat_density).
## Warning: Removed 402205 rows containing non-finite values (stat_density).
## Warning: Removed 407993 rows containing non-finite values (stat_density).
## Warning: Removed 502191 rows containing non-finite values (stat_density).

and not aggregated…

ggplot(tpms_melt, aes(x=log(value), colour=variable)) +
  geom_density() +
  ggtitle("TPM Density plot all samples")
## Warning: Removed 223831 rows containing non-finite values (stat_density).
## Warning: Removed 131581 rows containing non-finite values (stat_density).
## Warning: Removed 135416 rows containing non-finite values (stat_density).
## Warning: Removed 133478 rows containing non-finite values (stat_density).
## Warning: Removed 126524 rows containing non-finite values (stat_density).
## Warning: Removed 116630 rows containing non-finite values (stat_density).
## Warning: Removed 143341 rows containing non-finite values (stat_density).
## Warning: Removed 140364 rows containing non-finite values (stat_density).
## Warning: Removed 135802 rows containing non-finite values (stat_density).
## Warning: Removed 135671 rows containing non-finite values (stat_density).
## Warning: Removed 135369 rows containing non-finite values (stat_density).
## Warning: Removed 119025 rows containing non-finite values (stat_density).
## Warning: Removed 135019 rows containing non-finite values (stat_density).
## Warning: Removed 136048 rows containing non-finite values (stat_density).
## Warning: Removed 130987 rows containing non-finite values (stat_density).
## Warning: Removed 125798 rows containing non-finite values (stat_density).
## Warning: Removed 114514 rows containing non-finite values (stat_density).
## Warning: Removed 119368 rows containing non-finite values (stat_density).

plot of chunk unnamed-chunk-5

we probably should re-normalise TPMs within each column since we removed the highest rows

renorm <- function(x)  {
  x = (x / sum(x)) * 1e6
}
tpms_rn <- as.data.frame(apply(tpms, 2, renorm))

tpms_rn_melt <- melt(tpms_rn)
## No id variables; using all as measure variables
tpms_rn_melt$value[tpms_rn_melt$value < 0.01] <- 0
tpms_rn_melt$stage <- gsub(tpms_rn_melt$variable, pattern="\\.[0-9]", replacement="")
ggplot(tpms_rn_melt, aes(x=log(value), group=stage, colour=stage)) +
  geom_density() +
  ggtitle("Renormalised TPM Density plot")
## Warning: Removed 355022 rows containing non-finite values (stat_density).
## Warning: Removed 376403 rows containing non-finite values (stat_density).
## Warning: Removed 394943 rows containing non-finite values (stat_density).
## Warning: Removed 402201 rows containing non-finite values (stat_density).
## Warning: Removed 407941 rows containing non-finite values (stat_density).
## Warning: Removed 502179 rows containing non-finite values (stat_density).

plot of chunk unnamed-chunk-6

Plot the TPM correlation matrices again with outliers removed and then renormalised

cm <- melt(cor(tpms_rn))
cm$Var1 <- as.factor(cm$Var1)
cm$Var2 <- as.factor(cm$Var2)
mid<-(max(cm$value, na.rm=TRUE)+min(cm$value, na.rm=TRUE))/2
ggplot(data=cm, aes(x=Var1, y=Var2)) +
  geom_tile(aes(fill=value)) +
  ggtitle("TPM Pearson Correlation matrix") +
  geom_text(aes(label = round(value, 2))) +
  xlab('') +
  ylab('') +
  scale_fill_gradient2(low="cyan", mid="yellow", high="red", midpoint=mid, space="Lab", na.value = "grey50", guide="colorbar")

plot of chunk unnamed-chunk-7

cm <- cor(tpms_rn, method="spearman")
cm <- melt(cm)
cm$Var1 <- as.factor(cm$Var1)
cm$Var2 <- as.factor(cm$Var2)
cm <- cm[-which(cm$value==1),] # remove 1s so the gradient is better
mid<-(max(cm$value, na.rm=TRUE)+min(cm$value, na.rm=TRUE))/2
ggplot(data=cm, aes(x=Var1, y=Var2)) +
  geom_tile(aes(fill=value)) +
  geom_text(aes(label = round(value, 2))) +
  xlab('') +
  ylab('') +
  ggtitle("TPM Spearman (ranked) correlation matrix") +
  scale_fill_gradient2(low="cyan", mid="yellow", high="red", midpoint=mid, space="Lab", na.value = "grey50", guide="colorbar")

plot of chunk unnamed-chunk-8

Chloroplast genes could be causing problems - if chloroplasts were highly expressing or were captured at higher rates in one replicate's extractions, that could lead to broken correlations

setwd('~/hydrogen/flaveria/Fr/Fr_2/')
chloro <- read.csv('Fr_chloro_contigs.txt', head=F)
names(chloro)[1] <- 'contig'
nc_tpms <- fixed_tpm[-which(fixed_tpm$contig %in% chloro$contig),] # remove chloroplast contigs

tpms <- nc_tpms[,3:20]
rownames(tpms) <- nc_tpms$contig
tpms_rn <- as.data.frame(apply(tpms, 2, renorm))

cm <- melt(cor(tpms_rn))

cm$Var1 <- as.factor(cm$Var1)
cm$Var2 <- as.factor(cm$Var2)
mid <- (max(cm$value, na.rm=TRUE)+min(cm$value, na.rm=TRUE))/2
ggplot(data=cm, aes(x=Var1, y=Var2)) +
  geom_tile(aes(fill=value)) +
  ggtitle("TPM Correlation Matrix no outliers, no chloroplast") +
  geom_text(aes(label = round(value, 2))) +
  xlab('') +
  ylab('') +
  scale_fill_gradient2(low="cyan", mid="yellow", high="red", midpoint=mid, space="Lab", na.value="grey50", guide="colorbar")

plot of chunk unnamed-chunk-9

tpms_rn_melt <- melt(tpms_rn)
## No id variables; using all as measure variables
tpms_rn_melt$value[tpms_rn_melt$value < 0.01] <- 0
tpms_rn_melt$stage <- gsub(tpms_rn_melt$variable, pattern="\\.[0-9]", replacement="")
ggplot(tpms_rn_melt, aes(x=log(value), group=stage, colour=stage)) +
  geom_density() +
  ggtitle("TPM Density plot by section")
## Warning: Removed 354129 rows containing non-finite values (stat_density).
## Warning: Removed 375485 rows containing non-finite values (stat_density).
## Warning: Removed 394019 rows containing non-finite values (stat_density).
## Warning: Removed 401159 rows containing non-finite values (stat_density).
## Warning: Removed 406942 rows containing non-finite values (stat_density).
## Warning: Removed 501035 rows containing non-finite values (stat_density).

plot of chunk unnamed-chunk-10

Annotation

#setwd('~/hydrogen/flaveria/Fr/Fr_2/')
#anno <- read.csv('Fr_annotation.txt', head=F, sep='\t')            # load annotation (species, contig, agi)
#key <- read.csv('key_agi.txt', head=F, sep='\t')

#names(anno) <- c("species", "contig", "annotation")                # rename column headers
#names(key) <- c("agi", "desc")
#anno_tpm <- merge(data_tpm, anno, by="contig", all=T)              # merge count data and annotation

#library(plyr)
#anno_tpm <- anno_tpm[,-c(1, 2, 21, 22)]                            # remove extraneous columns
#anno_tpm <- ddply(anno_tpm, .(annotation), numcolwise(sum))        # sum columns, grouping by annotation
#key_tpm <- anno_tpm[which(anno_tpm$annotation %in% key$agi),]      
#desc_tpm <- merge(key_tpm, key, by.x="annotation", by.y="agi", all=F) # add gene description

#write.table(desc_tpm, file ="Fr_2ey_agi.tpm.csv", row.names=F, quote=F,sep = "\t")