Load the data
library(ggplot2)
library(gplots)
## KernSmooth 2.23 loaded
## Copyright M. P. Wand 1997-2009
##
## Attaching package: 'gplots'
##
## The following object is masked from 'package:stats':
##
## lowess
library(reshape2)
setwd('~/hydrogen/flaveria/Fr/Fr_2/')
#setwd('~/flaveria/Fr/Fr_2/')
data_tpm <- read.csv('Fr_express.tpm', sep="\t", head=T)
data_counts <- read.csv('Fr_express.eff_count', sep="\t", head=T)
Pearson Correlation matrix
library(BBmisc)
tpms <- data_tpm[,3:20]
names(tpms) <- gsub(names(tpms), pattern="Fr_([0-9])\\.([0-9])", replacement="\\2.\\1")
tmp <- lapply(names(tpms), function(x) explode(x,sep="\\."))
names(tpms) <- lapply(tmp, function(x) paste(7-as.numeric(x[1]), x[2], sep="."))
rownames(tpms) <- data_tpm$contig
correlation <- cor(tpms)
cm <- melt(correlation)
cm$Var1 <- as.factor(cm$Var1)
cm$Var2 <- as.factor(cm$Var2)
mid<-(max(cm$value, na.rm=TRUE)+min(cm$value, na.rm=TRUE))/2
ggplot(data=cm, aes(x=Var1, y=Var2)) +
geom_tile(aes(fill=value)) +
ggtitle("TPM Pearson Correlation Matrix") +
geom_text(aes(label = round(value, 2))) +
xlab('') +
ylab('') +
scale_fill_gradient2(low="cyan", mid="yellow", high="red", midpoint=mid, space="Lab", na.value = "grey50", guide="colorbar")
Expression Density Plot on TPMs
tpms_melt <- melt(tpms)
## No id variables; using all as measure variables
tpms_melt$value[tpms_melt$value < 0.01] <- 0
tpms_melt$stage <- gsub(tpms_melt$variable, pattern="\\.[0-9]", replacement="")
ggplot(tpms_melt, aes(x=log(value), group=stage, colour=stage)) +
geom_density()
## Warning: Removed 355023 rows containing non-finite values (stat_density).
## Warning: Removed 376407 rows containing non-finite values (stat_density).
## Warning: Removed 394947 rows containing non-finite values (stat_density).
## Warning: Removed 402205 rows containing non-finite values (stat_density).
## Warning: Removed 407993 rows containing non-finite values (stat_density).
## Warning: Removed 502191 rows containing non-finite values (stat_density).
# ggsave("Fr_density.tpm.pdf")
Heatmap on TPMs
plotheatmap <- function(data) {
dists <- dist(t(as.matrix(data)))
mat <- as.matrix(dists)
heatmap.2(mat, trace="none")
}
#pdf("Fr_heatmap.pdf")
plotheatmap(tpms)
#dev.off()
#library(EBSeq)
#library(DESeq) # doesn't install
# Remove any rows containing all-zeros
remove_zero_rows <- function(df) {
df[apply(df, 1, function(x) !all(x==0)),]
}
normalise_counts <- function(counts, normfactors) {
round(t(t(counts) / normfactors))
}
counts <- data_counts[,3:20]
colnames(counts) <- gsub(colnames(counts), pattern="Fr_([0-9]).([0-9])", replacement="\\2.\\1")
tmp <- lapply(names(counts), function(x) explode(x,sep="\\."))
names(counts) <- lapply(tmp, function(x) paste(7-as.numeric(x[1]), x[2], sep="."))
rownames(counts) <- data_counts$contig
counts <- remove_zero_rows(round(counts))
cm <- melt(cor(counts))
cm$Var1 <- as.factor(cm$Var1)
cm$Var2 <- as.factor(cm$Var2)
mid<-(max(cm$value, na.rm=TRUE)+min(cm$value, na.rm=TRUE))/2
ggplot(data=cm, aes(x=Var1, y=Var2)) +
geom_tile(aes(fill=value)) +
ggtitle("Effective Count Correlation matrix") +
geom_text(aes(label = round(value, 2))) +
xlab('') +
ylab('') +
scale_fill_gradient2(low="cyan", mid="yellow", high="red", midpoint=mid, space="Lab", na.value = "grey50", guide="colorbar")
# ggsave("Fr_correlation.eff.pdf")
Log distributions of Effective Counts
cf <- as.data.frame(counts)
cf <- cf[c(6,5,4,3,2,1,12,11,10,9,8,7,18,17,16,15,14,13)]
cf$contig <- rownames(counts)
counts_melt <- melt(cf, id='contig')
ggplot(counts_melt, aes(x=log(value), colour=variable)) +
geom_density() +
ggtitle("Effective Count Log Density")
## Warning: Removed 112528 rows containing non-finite values (stat_density).
## Warning: Removed 122677 rows containing non-finite values (stat_density).
## Warning: Removed 129828 rows containing non-finite values (stat_density).
## Warning: Removed 131359 rows containing non-finite values (stat_density).
## Warning: Removed 127418 rows containing non-finite values (stat_density).
## Warning: Removed 221758 rows containing non-finite values (stat_density).
## Warning: Removed 115542 rows containing non-finite values (stat_density).
## Warning: Removed 131613 rows containing non-finite values (stat_density).
## Warning: Removed 131927 rows containing non-finite values (stat_density).
## Warning: Removed 131764 rows containing non-finite values (stat_density).
## Warning: Removed 136712 rows containing non-finite values (stat_density).
## Warning: Removed 139107 rows containing non-finite values (stat_density).
## Warning: Removed 115333 rows containing non-finite values (stat_density).
## Warning: Removed 109883 rows containing non-finite values (stat_density).
## Warning: Removed 121226 rows containing non-finite values (stat_density).
## Warning: Removed 126440 rows containing non-finite values (stat_density).
## Warning: Removed 131301 rows containing non-finite values (stat_density).
## Warning: Removed 130164 rows containing non-finite values (stat_density).
# ggsave("Fr_density.eff.pdf")
Box plot with outliers
ggplot(counts_melt, aes(x=variable, y=value, colour=variable)) +
geom_boxplot() +
ggtitle("Boxplots")
# ggsave("Fr_boxplot.eff.pdf")
Plot distribution of counts over 50,000
counts_50k <- counts_melt[counts_melt$value > 50000,]
ggplot(counts_50k, aes(x=variable, y=value, colour=variable)) +
geom_boxplot() +
ggtitle("Boxplot of Outliers")
# ggsave("Fr_boxplot50k.eff.pdf")
There are some high counts (>2e+05) distorting the distributions - let's remove those rows for the purposes of checking how good replication is for the majority of genes
high_contigs <- unique(counts_50k[which(counts_50k$value > 2e+05),]$contig)
fixed_tpm <- data_tpm[-which(data_tpm$contig %in% high_contigs),] # remove highly expressed contigs
fixed_tpm <- fixed_tpm[,-c(21)]
names(fixed_tpm) <- gsub(names(fixed_tpm), pattern="Fr_([0-9]).([0-9])", replacement="\\2.\\1")
tmp <- lapply(names(fixed_tpm), function(x) {
if (nchar(x)==3) {
explode(x,sep="\\.")
} else {
x
}
})
names(fixed_tpm) <- lapply(tmp, function(x) {
if (length(x)==2) {
paste(7-as.numeric(x[1]), x[2], sep=".")
} else {
x
}
})
tpms <- fixed_tpm[,3:20]
rownames(tpms) <- fixed_tpm$contig
cm <- melt(cor(tpms))
cm$Var1 <- as.factor(cm$Var1)
cm$Var2 <- as.factor(cm$Var2)
mid<-(max(cm$value, na.rm=TRUE)+min(cm$value, na.rm=TRUE))/2
ggplot(data=cm, aes(x=Var1, y=Var2)) +
geom_tile(aes(fill=value)) +
ggtitle("TPM Pearson Correlation Matrix with high expression removed") +
geom_text(aes(label = round(value, 2))) +
xlab('') +
ylab('') +
scale_fill_gradient2(low="cyan", mid="yellow", high="red", midpoint=mid, space="Lab", na.value = "grey50", guide="colorbar")
# ggsave("Fr_correlation_no_outliers.tpm.pdf")
We can also use a correlation metric less sensitive to outliers
cm <- melt(cor(counts, method = "spearman"))
cm <- cm[-which(cm$value == 1), ]
cm$Var1 <- as.factor(cm$Var1)
cm$Var2 <- as.factor(cm$Var2)
mid<-(max(cm$value, na.rm=TRUE)+min(cm$value, na.rm=TRUE))/2
ggplot(data=cm, aes(x=Var1, y=Var2)) +
geom_tile(aes(fill=value)) +
ggtitle("Effective Count Spearman Correlation matrix") +
geom_text(aes(label = round(value, 2))) +
xlab('') +
ylab('') +
scale_fill_gradient2(low="cyan", mid="yellow", high="red", midpoint=mid, space="Lab", na.value = "grey50", guide="colorbar")
Just re-checking the distributions with outliers removed and aggregated by sample
tpms_melt <- melt(tpms)
## No id variables; using all as measure variables
tpms_melt$value[tpms_melt$value < 0.01] <- 0
tpms_melt$stage <- gsub(tpms_melt$variable, pattern="\\.[0-9]", replacement="")
ggplot(tpms_melt, aes(x=log(value), group=stage, colour=stage)) +
geom_density() +
ggtitle("TPM Density plot by section")
## Warning: Removed 355023 rows containing non-finite values (stat_density).
## Warning: Removed 376407 rows containing non-finite values (stat_density).
## Warning: Removed 394947 rows containing non-finite values (stat_density).
## Warning: Removed 402205 rows containing non-finite values (stat_density).
## Warning: Removed 407993 rows containing non-finite values (stat_density).
## Warning: Removed 502191 rows containing non-finite values (stat_density).
ggsave("Fr_density_no_outliers.tpm.pdf")
## Saving 12 x 12 in image
## Warning: Removed 355023 rows containing non-finite values (stat_density).
## Warning: Removed 376407 rows containing non-finite values (stat_density).
## Warning: Removed 394947 rows containing non-finite values (stat_density).
## Warning: Removed 402205 rows containing non-finite values (stat_density).
## Warning: Removed 407993 rows containing non-finite values (stat_density).
## Warning: Removed 502191 rows containing non-finite values (stat_density).
and not aggregated…
ggplot(tpms_melt, aes(x=log(value), colour=variable)) +
geom_density() +
ggtitle("TPM Density plot all samples")
## Warning: Removed 223831 rows containing non-finite values (stat_density).
## Warning: Removed 131581 rows containing non-finite values (stat_density).
## Warning: Removed 135416 rows containing non-finite values (stat_density).
## Warning: Removed 133478 rows containing non-finite values (stat_density).
## Warning: Removed 126524 rows containing non-finite values (stat_density).
## Warning: Removed 116630 rows containing non-finite values (stat_density).
## Warning: Removed 143341 rows containing non-finite values (stat_density).
## Warning: Removed 140364 rows containing non-finite values (stat_density).
## Warning: Removed 135802 rows containing non-finite values (stat_density).
## Warning: Removed 135671 rows containing non-finite values (stat_density).
## Warning: Removed 135369 rows containing non-finite values (stat_density).
## Warning: Removed 119025 rows containing non-finite values (stat_density).
## Warning: Removed 135019 rows containing non-finite values (stat_density).
## Warning: Removed 136048 rows containing non-finite values (stat_density).
## Warning: Removed 130987 rows containing non-finite values (stat_density).
## Warning: Removed 125798 rows containing non-finite values (stat_density).
## Warning: Removed 114514 rows containing non-finite values (stat_density).
## Warning: Removed 119368 rows containing non-finite values (stat_density).
we probably should re-normalise TPMs within each column since we removed the highest rows
renorm <- function(x) {
x = (x / sum(x)) * 1e6
}
tpms_rn <- as.data.frame(apply(tpms, 2, renorm))
tpms_rn_melt <- melt(tpms_rn)
## No id variables; using all as measure variables
tpms_rn_melt$value[tpms_rn_melt$value < 0.01] <- 0
tpms_rn_melt$stage <- gsub(tpms_rn_melt$variable, pattern="\\.[0-9]", replacement="")
ggplot(tpms_rn_melt, aes(x=log(value), group=stage, colour=stage)) +
geom_density() +
ggtitle("Renormalised TPM Density plot")
## Warning: Removed 355022 rows containing non-finite values (stat_density).
## Warning: Removed 376403 rows containing non-finite values (stat_density).
## Warning: Removed 394943 rows containing non-finite values (stat_density).
## Warning: Removed 402201 rows containing non-finite values (stat_density).
## Warning: Removed 407941 rows containing non-finite values (stat_density).
## Warning: Removed 502179 rows containing non-finite values (stat_density).
Plot the TPM correlation matrices again with outliers removed and then renormalised
cm <- melt(cor(tpms_rn))
cm$Var1 <- as.factor(cm$Var1)
cm$Var2 <- as.factor(cm$Var2)
mid<-(max(cm$value, na.rm=TRUE)+min(cm$value, na.rm=TRUE))/2
ggplot(data=cm, aes(x=Var1, y=Var2)) +
geom_tile(aes(fill=value)) +
ggtitle("TPM Pearson Correlation matrix") +
geom_text(aes(label = round(value, 2))) +
xlab('') +
ylab('') +
scale_fill_gradient2(low="cyan", mid="yellow", high="red", midpoint=mid, space="Lab", na.value = "grey50", guide="colorbar")
cm <- cor(tpms_rn, method="spearman")
cm <- melt(cm)
cm$Var1 <- as.factor(cm$Var1)
cm$Var2 <- as.factor(cm$Var2)
cm <- cm[-which(cm$value==1),] # remove 1s so the gradient is better
mid<-(max(cm$value, na.rm=TRUE)+min(cm$value, na.rm=TRUE))/2
ggplot(data=cm, aes(x=Var1, y=Var2)) +
geom_tile(aes(fill=value)) +
geom_text(aes(label = round(value, 2))) +
xlab('') +
ylab('') +
ggtitle("TPM Spearman (ranked) correlation matrix") +
scale_fill_gradient2(low="cyan", mid="yellow", high="red", midpoint=mid, space="Lab", na.value = "grey50", guide="colorbar")
Chloroplast genes could be causing problems - if chloroplasts were highly expressing or were captured at higher rates in one replicate's extractions, that could lead to broken correlations
setwd('~/hydrogen/flaveria/Fr/Fr_2/')
chloro <- read.csv('Fr_chloro_contigs.txt', head=F)
names(chloro)[1] <- 'contig'
nc_tpms <- fixed_tpm[-which(fixed_tpm$contig %in% chloro$contig),] # remove chloroplast contigs
tpms <- nc_tpms[,3:20]
rownames(tpms) <- nc_tpms$contig
tpms_rn <- as.data.frame(apply(tpms, 2, renorm))
cm <- melt(cor(tpms_rn))
cm$Var1 <- as.factor(cm$Var1)
cm$Var2 <- as.factor(cm$Var2)
mid <- (max(cm$value, na.rm=TRUE)+min(cm$value, na.rm=TRUE))/2
ggplot(data=cm, aes(x=Var1, y=Var2)) +
geom_tile(aes(fill=value)) +
ggtitle("TPM Correlation Matrix no outliers, no chloroplast") +
geom_text(aes(label = round(value, 2))) +
xlab('') +
ylab('') +
scale_fill_gradient2(low="cyan", mid="yellow", high="red", midpoint=mid, space="Lab", na.value="grey50", guide="colorbar")
tpms_rn_melt <- melt(tpms_rn)
## No id variables; using all as measure variables
tpms_rn_melt$value[tpms_rn_melt$value < 0.01] <- 0
tpms_rn_melt$stage <- gsub(tpms_rn_melt$variable, pattern="\\.[0-9]", replacement="")
ggplot(tpms_rn_melt, aes(x=log(value), group=stage, colour=stage)) +
geom_density() +
ggtitle("TPM Density plot by section")
## Warning: Removed 354129 rows containing non-finite values (stat_density).
## Warning: Removed 375485 rows containing non-finite values (stat_density).
## Warning: Removed 394019 rows containing non-finite values (stat_density).
## Warning: Removed 401159 rows containing non-finite values (stat_density).
## Warning: Removed 406942 rows containing non-finite values (stat_density).
## Warning: Removed 501035 rows containing non-finite values (stat_density).
Annotation
#setwd('~/hydrogen/flaveria/Fr/Fr_2/')
#anno <- read.csv('Fr_annotation.txt', head=F, sep='\t') # load annotation (species, contig, agi)
#key <- read.csv('key_agi.txt', head=F, sep='\t')
#names(anno) <- c("species", "contig", "annotation") # rename column headers
#names(key) <- c("agi", "desc")
#anno_tpm <- merge(data_tpm, anno, by="contig", all=T) # merge count data and annotation
#library(plyr)
#anno_tpm <- anno_tpm[,-c(1, 2, 21, 22)] # remove extraneous columns
#anno_tpm <- ddply(anno_tpm, .(annotation), numcolwise(sum)) # sum columns, grouping by annotation
#key_tpm <- anno_tpm[which(anno_tpm$annotation %in% key$agi),]
#desc_tpm <- merge(key_tpm, key, by.x="annotation", by.y="agi", all=F) # add gene description
#write.table(desc_tpm, file ="Fr_2ey_agi.tpm.csv", row.names=F, quote=F,sep = "\t")