Flaveria bidentis eXpress data exploration

Load the data

library(ggplot2)
library(gplots)

## KernSmooth 2.23 loaded
## Copyright M. P. Wand 1997-2009
## 
## Attaching package: 'gplots'
## 
## The following object is masked from 'package:stats':
## 
##     lowess

library(reshape2)
setwd('~/hydrogen/flaveria/Fb/Fb_k/')
#setwd('~/flaveria/Fb/Fb_k/')
data_tpm <- read.csv('Fb_express.tpm', sep="\t", head=T)
data_counts <- read.csv('Fb_express.eff_count', sep="\t", head=T)

TPM

Pearson Correlation matrix

library(BBmisc)
tpms <- data_tpm[,3:20]
names(tpms) <- gsub(names(tpms), pattern="Fb_k_([0-9])\\.([0-9])", replacement="\\2.\\1")

tmp <- lapply(names(tpms), function(x) explode(x,sep="\\."))
names(tpms) <- lapply(tmp, function(x) paste(7-as.numeric(x[1]), x[2], sep="."))

rownames(tpms) <- data_tpm$contig
correlation <- cor(tpms)
cm <- melt(correlation)
cm$Var1 <- as.factor(cm$Var1)
cm$Var2 <- as.factor(cm$Var2)
mid<-(max(cm$value, na.rm=TRUE)+min(cm$value, na.rm=TRUE))/2
ggplot(data=cm, aes(x=Var1, y=Var2)) + 
  geom_tile(aes(fill=value)) +
  ggtitle("TPM Pearson Correlation Matrix") +
  geom_text(aes(label = round(value, 2))) +
  xlab('') +
  ylab('') +
  scale_fill_gradient2(low="cyan", mid="yellow", high="red", midpoint=mid, space="Lab", na.value = "grey50", guide="colorbar")

plot of chunk tpm_correlation

Expression Density Plot on TPMs

tpms_melt <- melt(tpms)

## No id variables; using all as measure variables

tpms_melt$value[tpms_melt$value < 0.01] <- 0
tpms_melt$stage <- gsub(tpms_melt$variable, pattern="\\.[0-9]", replacement="")

ggplot(tpms_melt, aes(x=log(value), group=stage, colour=stage)) +
  geom_density()

## Warning: Removed 320892 rows containing non-finite values (stat_density).
## Warning: Removed 371573 rows containing non-finite values (stat_density).
## Warning: Removed 353356 rows containing non-finite values (stat_density).
## Warning: Removed 373993 rows containing non-finite values (stat_density).
## Warning: Removed 383017 rows containing non-finite values (stat_density).
## Warning: Removed 408394 rows containing non-finite values (stat_density).

plot of chunk tpm_density

# ggsave("Fb_density.tpm.pdf")

Heatmap on TPMs

plotheatmap <- function(data) {
  dists <- dist(t(as.matrix(data)))
  mat <- as.matrix(dists)
  heatmap.2(mat, trace="none")
}
#pdf("Fb_heatmap.pdf")
plotheatmap(tpms)

plot of chunk heatmap

#dev.off()

Effective counts

#library(EBSeq)
#library(DESeq) # doesn't install
# Remove any rows containing all-zeros
remove_zero_rows <- function(df) {
  df[apply(df, 1, function(x) !all(x==0)),]
}

normalise_counts <- function(counts, normfactors) {
  round(t(t(counts) / normfactors))
}

counts <- data_counts[,3:20]
colnames(counts) <- gsub(colnames(counts), pattern="Fb_k_([0-9]).([0-9])", replacement="\\2.\\1")

tmp <- lapply(names(counts), function(x) explode(x,sep="\\."))
names(counts) <- lapply(tmp, function(x) paste(7-as.numeric(x[1]), x[2], sep="."))

rownames(counts) <- data_counts$contig
counts <- remove_zero_rows(round(counts))

cm <- melt(cor(counts))
cm$Var1 <- as.factor(cm$Var1)
cm$Var2 <- as.factor(cm$Var2)
mid<-(max(cm$value, na.rm=TRUE)+min(cm$value, na.rm=TRUE))/2

ggplot(data=cm, aes(x=Var1, y=Var2)) +
  geom_tile(aes(fill=value)) +
  ggtitle("Effective Count Correlation matrix") +
  geom_text(aes(label = round(value, 2))) +
  xlab('') +
  ylab('') +
  scale_fill_gradient2(low="cyan", mid="yellow", high="red", midpoint=mid, space="Lab", na.value = "grey50", guide="colorbar")

plot of chunk eff_corr

# ggsave("Fb_correlation.eff.pdf")

Log distributions of Effective Counts

cf <- as.data.frame(counts)
cf <- cf[c(6,5,4,3,2,1,12,11,10,9,8,7,18,17,16,15,14,13)]
cf$contig <- rownames(counts)
counts_melt <- melt(cf, id='contig')
ggplot(counts_melt, aes(x=log(value), colour=variable)) +
  geom_density() +
  ggtitle("Effective Count Log Density")

## Warning: Removed 106918 rows containing non-finite values (stat_density).
## Warning: Removed 120979 rows containing non-finite values (stat_density).
## Warning: Removed 117910 rows containing non-finite values (stat_density).
## Warning: Removed 116053 rows containing non-finite values (stat_density).
## Warning: Removed 116277 rows containing non-finite values (stat_density).
## Warning: Removed 127314 rows containing non-finite values (stat_density).
## Warning: Removed 90140 rows containing non-finite values (stat_density).
## Warning: Removed 122505 rows containing non-finite values (stat_density).
## Warning: Removed 108684 rows containing non-finite values (stat_density).
## Warning: Removed 115966 rows containing non-finite values (stat_density).
## Warning: Removed 120907 rows containing non-finite values (stat_density).
## Warning: Removed 123931 rows containing non-finite values (stat_density).
## Warning: Removed 115580 rows containing non-finite values (stat_density).
## Warning: Removed 120356 rows containing non-finite values (stat_density).
## Warning: Removed 118623 rows containing non-finite values (stat_density).
## Warning: Removed 133871 rows containing non-finite values (stat_density).
## Warning: Removed 136919 rows containing non-finite values (stat_density).
## Warning: Removed 149722 rows containing non-finite values (stat_density).

plot of chunk log_count

# ggsave("Fb_density.eff.pdf")

Box plot with outliers

ggplot(counts_melt, aes(x=variable, y=value, colour=variable)) + 
  geom_boxplot() +
  ggtitle("Boxplots")

plot of chunk boxplot1

# ggsave("Fb_boxplot.eff.pdf")

Plot distribution of counts over 50,000

counts_50k <- counts_melt[counts_melt$value > 50000,]
ggplot(counts_50k, aes(x=variable, y=value, colour=variable)) + 
  geom_boxplot() +
  ggtitle("Boxplot of Outliers")

plot of chunk boxplot2

# ggsave("Fb_boxplot50k.eff.pdf")

There are some high counts (>2e+05) distorting the distributions - let's remove those rows for the purposes of checking how good replication is for the majority of genes

high_contigs <- unique(counts_50k[which(counts_50k$value > 2e+05),]$contig)
fixed_tpm <- data_tpm[-which(data_tpm$contig %in% high_contigs),] # remove highly expressed contigs
fixed_tpm <- fixed_tpm[,-c(21)]
names(fixed_tpm) <- gsub(names(fixed_tpm), pattern="Fb_k_([0-9]).([0-9])", replacement="\\2.\\1")

tmp <- lapply(names(fixed_tpm), function(x) { 
    if (nchar(x)==3) {
      explode(x,sep="\\.")
    } else {
      x
    }
  })
names(fixed_tpm) <- lapply(tmp, function(x) {
  if (length(x)==2) {
    paste(7-as.numeric(x[1]), x[2], sep=".")
  } else {
    x
  }
})

tpms <- fixed_tpm[,3:20]
rownames(tpms) <- fixed_tpm$contig

cm <- melt(cor(tpms))
cm$Var1 <- as.factor(cm$Var1)
cm$Var2 <- as.factor(cm$Var2)
mid<-(max(cm$value, na.rm=TRUE)+min(cm$value, na.rm=TRUE))/2

ggplot(data=cm, aes(x=Var1, y=Var2)) +
  geom_tile(aes(fill=value)) +
  ggtitle("TPM Pearson Correlation Matrix with high expression removed") +
  geom_text(aes(label = round(value, 2))) +
  xlab('') +
  ylab('') +
  scale_fill_gradient2(low="cyan", mid="yellow", high="red", midpoint=mid, space="Lab", na.value = "grey50", guide="colorbar")

plot of chunk unnamed-chunk-2

# ggsave("Fb_correlation_no_outliers.tpm.pdf")

We can also use a correlation metric less sensitive to outliers

cm <- melt(cor(counts, method = "spearman"))
cm <- cm[-which(cm$value == 1), ]
cm$Var1 <- as.factor(cm$Var1)
cm$Var2 <- as.factor(cm$Var2)
mid<-(max(cm$value, na.rm=TRUE)+min(cm$value, na.rm=TRUE))/2

ggplot(data=cm, aes(x=Var1, y=Var2)) +
  geom_tile(aes(fill=value)) +
  ggtitle("Effective Count Spearman Correlation matrix") +
  geom_text(aes(label = round(value, 2))) +
  xlab('') +
  ylab('') +
  scale_fill_gradient2(low="cyan", mid="yellow", high="red", midpoint=mid, space="Lab", na.value = "grey50", guide="colorbar")

Just re-checking the distributions with outliers removed and aggregated by sample

tpms_melt <- melt(tpms)

## No id variables; using all as measure variables

tpms_melt$value[tpms_melt$value < 0.01] <- 0
tpms_melt$stage <- gsub(tpms_melt$variable, pattern="\\.[0-9]", replacement="")
ggplot(tpms_melt, aes(x=log(value), group=stage, colour=stage)) +
  geom_density() +
  ggtitle("TPM Density plot by section")

## Warning: Removed 320892 rows containing non-finite values (stat_density).
## Warning: Removed 371573 rows containing non-finite values (stat_density).
## Warning: Removed 353356 rows containing non-finite values (stat_density).
## Warning: Removed 373993 rows containing non-finite values (stat_density).
## Warning: Removed 383017 rows containing non-finite values (stat_density).
## Warning: Removed 408394 rows containing non-finite values (stat_density).

plot of chunk unnamed-chunk-4

ggsave("Fb_density_no_outliers.tpm.pdf")

## Saving 12 x 12 in image

## Warning: Removed 320892 rows containing non-finite values (stat_density).
## Warning: Removed 371573 rows containing non-finite values (stat_density).
## Warning: Removed 353356 rows containing non-finite values (stat_density).
## Warning: Removed 373993 rows containing non-finite values (stat_density).
## Warning: Removed 383017 rows containing non-finite values (stat_density).
## Warning: Removed 408394 rows containing non-finite values (stat_density).

and not aggregated…

ggplot(tpms_melt, aes(x=log(value), colour=variable)) +
  geom_density() +
  ggtitle("TPM Density plot all samples")

## Warning: Removed 129669 rows containing non-finite values (stat_density).
## Warning: Removed 119392 rows containing non-finite values (stat_density).
## Warning: Removed 118842 rows containing non-finite values (stat_density).
## Warning: Removed 120587 rows containing non-finite values (stat_density).
## Warning: Removed 124017 rows containing non-finite values (stat_density).
## Warning: Removed 109569 rows containing non-finite values (stat_density).
## Warning: Removed 126482 rows containing non-finite values (stat_density).
## Warning: Removed 123651 rows containing non-finite values (stat_density).
## Warning: Removed 118615 rows containing non-finite values (stat_density).
## Warning: Removed 111554 rows containing non-finite values (stat_density).
## Warning: Removed 124724 rows containing non-finite values (stat_density).
## Warning: Removed 92911 rows containing non-finite values (stat_density).
## Warning: Removed 152243 rows containing non-finite values (stat_density).
## Warning: Removed 139974 rows containing non-finite values (stat_density).
## Warning: Removed 136536 rows containing non-finite values (stat_density).
## Warning: Removed 121215 rows containing non-finite values (stat_density).
## Warning: Removed 122832 rows containing non-finite values (stat_density).
## Warning: Removed 118412 rows containing non-finite values (stat_density).

plot of chunk unnamed-chunk-5

we probably should re-normalise TPMs within each column since we removed the highest rows

renorm <- function(x)  {
  x = (x / sum(x)) * 1e6
}
tpms_rn <- as.data.frame(apply(tpms, 2, renorm))

tpms_rn_melt <- melt(tpms_rn)

## No id variables; using all as measure variables

tpms_rn_melt$value[tpms_rn_melt$value < 0.01] <- 0
tpms_rn_melt$stage <- gsub(tpms_rn_melt$variable, pattern="\\.[0-9]", replacement="")
ggplot(tpms_rn_melt, aes(x=log(value), group=stage, colour=stage)) +
  geom_density() +
  ggtitle("Renormalised TPM Density plot")

## Warning: Removed 320860 rows containing non-finite values (stat_density).
## Warning: Removed 371522 rows containing non-finite values (stat_density).
## Warning: Removed 353346 rows containing non-finite values (stat_density).
## Warning: Removed 373964 rows containing non-finite values (stat_density).
## Warning: Removed 382976 rows containing non-finite values (stat_density).
## Warning: Removed 408355 rows containing non-finite values (stat_density).

plot of chunk unnamed-chunk-6

Plot the TPM correlation matrices again with outliers removed and then renormalised

cm <- melt(cor(tpms_rn))
cm$Var1 <- as.factor(cm$Var1)
cm$Var2 <- as.factor(cm$Var2)
mid<-(max(cm$value, na.rm=TRUE)+min(cm$value, na.rm=TRUE))/2
ggplot(data=cm, aes(x=Var1, y=Var2)) +
  geom_tile(aes(fill=value)) +
  ggtitle("TPM Pearson Correlation matrix") +
  geom_text(aes(label = round(value, 2))) +
  xlab('') +
  ylab('') +
  scale_fill_gradient2(low="cyan", mid="yellow", high="red", midpoint=mid, space="Lab", na.value = "grey50", guide="colorbar")

plot of chunk unnamed-chunk-7

cm <- cor(tpms_rn, method="spearman")
cm <- melt(cm)
cm$Var1 <- as.factor(cm$Var1)
cm$Var2 <- as.factor(cm$Var2)
cm <- cm[-which(cm$value==1),] # remove 1s so the gradient is better
mid<-(max(cm$value, na.rm=TRUE)+min(cm$value, na.rm=TRUE))/2
ggplot(data=cm, aes(x=Var1, y=Var2)) +
  geom_tile(aes(fill=value)) +
  geom_text(aes(label = round(value, 2))) +
  xlab('') +
  ylab('') +
  ggtitle("TPM Spearman (ranked) correlation matrix") +
  scale_fill_gradient2(low="cyan", mid="yellow", high="red", midpoint=mid, space="Lab", na.value = "grey50", guide="colorbar")

plot of chunk unnamed-chunk-8

Chloroplast genes could be causing problems - if chloroplasts were highly expressing or were captured at higher rates in one replicate's extractions, that could lead to broken correlations

setwd('~/hydrogen/flaveria/Fb/Fb_k/')
chloro <- read.csv('Fb_chloro_contigs.txt', head=F)
names(chloro)[1] <- 'contig'
nc_tpms <- fixed_tpm[-which(fixed_tpm$contig %in% chloro$contig),] # remove chloroplast contigs
head(fixed_tpm)

##          contig length    6.1     5.1     4.1     3.1    2.1    1.1    6.2
## 1        _60173    522 0.0000  0.1802  0.0000  0.4036  0.000  0.000 0.0000
## 2        _84551    402 0.0000  0.0000  0.0000  0.0000  0.000  0.000 0.1214
## 3       _135740    269 0.0000  0.0000  0.0000  0.0000  0.000  0.000 0.0000
## 4       _122250    294 0.9457  0.0000  0.7192  0.0000  0.000  0.000 0.0000
## 5  scaffold8719   2695 0.9160  1.1395  1.7053  2.4615  1.995  2.163 0.8020
## 6 contig-230949   1544 9.1787 12.2532 15.6112 20.9178 14.705 17.456 8.0028
##      5.2    4.2     3.2   2.2     1.2    6.3    5.3     4.3     3.3
## 1  0.000  0.000  0.2149 0.000  0.0000 0.0000 0.0000  0.2808  0.3692
## 2  0.000  0.000  0.0000 0.000  0.4355 0.0000 0.0000  0.0000  0.0000
## 3  0.000  0.000  0.0000 0.000  1.4669 0.0000 0.0000  0.0000  0.0000
## 4  0.000  0.000  0.0000 0.000  0.0000 0.0000 0.0000  0.0000  0.0000
## 5  1.093  1.172  2.2383 2.039  2.3963 0.8466 0.8437  1.0425  1.6588
## 6 11.223 16.627 20.6507 9.358 17.6514 6.7944 8.8251 11.5766 17.7655
##       2.3     1.3
## 1  0.1402  0.1224
## 2  0.0000  0.0000
## 3  0.0000  0.0000
## 4  0.0000  0.0000
## 5  2.5151  2.8498
## 6 21.8288 20.2192

tpms <- nc_tpms[,3:20]
rownames(tpms) <- nc_tpms$contig
tpms_rn <- as.data.frame(apply(tpms, 2, renorm))

cm <- melt(cor(tpms_rn))

cm$Var1 <- as.factor(cm$Var1)
cm$Var2 <- as.factor(cm$Var2)
mid <- (max(cm$value, na.rm=TRUE)+min(cm$value, na.rm=TRUE))/2
ggplot(data=cm, aes(x=Var1, y=Var2)) +
  geom_tile(aes(fill=value)) +
  ggtitle("TPM Correlation Matrix no outliers, no chloroplast") +
  geom_text(aes(label = round(value, 2))) +
  xlab('') +
  ylab('') +
  scale_fill_gradient2(low="cyan", mid="yellow", high="red", midpoint=mid, space="Lab", na.value="grey50", guide="colorbar")

plot of chunk unnamed-chunk-9

tpms_rn_melt <- melt(tpms_rn)

## No id variables; using all as measure variables

tpms_rn_melt$value[tpms_rn_melt$value < 0.01] <- 0
tpms_rn_melt$stage <- gsub(tpms_rn_melt$variable, pattern="\\.[0-9]", replacement="")
ggplot(tpms_rn_melt, aes(x=log(value), group=stage, colour=stage)) +
  geom_density() +
  ggtitle("TPM Density plot by section")

## Warning: Removed 319763 rows containing non-finite values (stat_density).
## Warning: Removed 370314 rows containing non-finite values (stat_density).
## Warning: Removed 352176 rows containing non-finite values (stat_density).
## Warning: Removed 372647 rows containing non-finite values (stat_density).
## Warning: Removed 381662 rows containing non-finite values (stat_density).
## Warning: Removed 406955 rows containing non-finite values (stat_density).

plot of chunk unnamed-chunk-10

Annotation

#setwd('~/hydrogen/flaveria/Fb/Fb_k/')
#anno <- read.csv('Fb_annotation.txt', head=F, sep='\t')            # load annotation (species, contig, agi)
#key <- read.csv('key_agi.txt', head=F, sep='\t')

#names(anno) <- c("species", "contig", "annotation")                # rename column headers
#names(key) <- c("agi", "desc")
#anno_tpm <- merge(data_tpm, anno, by="contig", all=T)              # merge count data and annotation

#library(plyr)
#anno_tpm <- anno_tpm[,-c(1, 2, 21, 22)]                            # remove extraneous columns
#anno_tpm <- ddply(anno_tpm, .(annotation), numcolwise(sum))        # sum columns, grouping by annotation
#key_tpm <- anno_tpm[which(anno_tpm$annotation %in% key$agi),]      
#desc_tpm <- merge(key_tpm, key, by.x="annotation", by.y="agi", all=F) # add gene description

#write.table(desc_tpm, file ="Fb_key_agi.tpm.csv", row.names=F, quote=F,sep = "\t")