Load the data
library(ggplot2)
library(gplots)
## KernSmooth 2.23 loaded
## Copyright M. P. Wand 1997-2009
##
## Attaching package: 'gplots'
##
## The following object is masked from 'package:stats':
##
## lowess
library(reshape2)
setwd('~/hydrogen/flaveria/Ft/Ft_2/')
#setwd('~/flaveria/Ft/Ft_2/')
data_tpm <- read.csv('Ft_express.tpm', sep="\t", head=T)
data_counts <- read.csv('Ft_express.eff_count', sep="\t", head=T)
Pearson Correlation matrix
library(BBmisc)
tpms <- data_tpm[,3:20]
names(tpms) <- gsub(names(tpms), pattern="Ft_([0-9])\\.([0-9])", replacement="\\2.\\1")
tmp <- lapply(names(tpms), function(x) explode(x,sep="\\."))
names(tpms) <- lapply(tmp, function(x) paste(7-as.numeric(x[1]), x[2], sep="."))
rownames(tpms) <- data_tpm$contig
correlation <- cor(tpms)
cm <- melt(correlation)
cm$Var1 <- as.factor(cm$Var1)
cm$Var2 <- as.factor(cm$Var2)
mid<-(max(cm$value, na.rm=TRUE)+min(cm$value, na.rm=TRUE))/2
ggplot(data=cm, aes(x=Var1, y=Var2)) +
geom_tile(aes(fill=value)) +
ggtitle("TPM Pearson Correlation Matrix") +
geom_text(aes(label = round(value, 2))) +
xlab('') +
ylab('') +
scale_fill_gradient2(low="cyan", mid="yellow", high="red", midpoint=mid, space="Lab", na.value = "grey50", guide="colorbar")
Expression Density Plot on TPMs
tpms_melt <- melt(tpms)
## No id variables; using all as measure variables
tpms_melt$value[tpms_melt$value < 0.01] <- 0
tpms_melt$stage <- gsub(tpms_melt$variable, pattern="\\.[0-9]", replacement="")
ggplot(tpms_melt, aes(x=log(value), group=stage, colour=stage)) +
geom_density()
## Warning: Removed 331292 rows containing non-finite values (stat_density).
## Warning: Removed 446142 rows containing non-finite values (stat_density).
## Warning: Removed 497262 rows containing non-finite values (stat_density).
## Warning: Removed 437989 rows containing non-finite values (stat_density).
## Warning: Removed 453320 rows containing non-finite values (stat_density).
## Warning: Removed 450578 rows containing non-finite values (stat_density).
# ggsave("Ft_density.tpm.pdf")
Heatmap on TPMs
plotheatmap <- function(data) {
dists <- dist(t(as.matrix(data)))
mat <- as.matrix(dists)
heatmap.2(mat, trace="none")
}
#pdf("Ft_heatmap.pdf")
plotheatmap(tpms)
#dev.off()
#library(EBSeq)
#library(DESeq) # doesn't install
# Remove any rows containing all-zeros
remove_zero_rows <- function(df) {
df[apply(df, 1, function(x) !all(x==0)),]
}
normalise_counts <- function(counts, normfactors) {
round(t(t(counts) / normfactors))
}
counts <- data_counts[,3:20]
colnames(counts) <- gsub(colnames(counts), pattern="Ft_([0-9]).([0-9])", replacement="\\2.\\1")
tmp <- lapply(names(counts), function(x) explode(x,sep="\\."))
names(counts) <- lapply(tmp, function(x) paste(7-as.numeric(x[1]), x[2], sep="."))
rownames(counts) <- data_counts$contig
counts <- remove_zero_rows(round(counts))
cm <- melt(cor(counts))
cm$Var1 <- as.factor(cm$Var1)
cm$Var2 <- as.factor(cm$Var2)
mid<-(max(cm$value, na.rm=TRUE)+min(cm$value, na.rm=TRUE))/2
ggplot(data=cm, aes(x=Var1, y=Var2)) +
geom_tile(aes(fill=value)) +
ggtitle("Effective Count Correlation matrix") +
geom_text(aes(label = round(value, 2))) +
xlab('') +
ylab('') +
scale_fill_gradient2(low="cyan", mid="yellow", high="red", midpoint=mid, space="Lab", na.value = "grey50", guide="colorbar")
# ggsave("Ft_correlation.eff.pdf")
Log distributions of Effective Counts
cf <- as.data.frame(counts)
cf <- cf[c(6,5,4,3,2,1,12,11,10,9,8,7,18,17,16,15,14,13)]
cf$contig <- rownames(counts)
counts_melt <- melt(cf, id='contig')
ggplot(counts_melt, aes(x=log(value), colour=variable)) +
geom_density() +
ggtitle("Effective Count Log Density")
## Warning: Removed 106588 rows containing non-finite values (stat_density).
## Warning: Removed 142225 rows containing non-finite values (stat_density).
## Warning: Removed 145041 rows containing non-finite values (stat_density).
## Warning: Removed 136878 rows containing non-finite values (stat_density).
## Warning: Removed 148940 rows containing non-finite values (stat_density).
## Warning: Removed 148236 rows containing non-finite values (stat_density).
## Warning: Removed 124201 rows containing non-finite values (stat_density).
## Warning: Removed 131955 rows containing non-finite values (stat_density).
## Warning: Removed 153031 rows containing non-finite values (stat_density).
## Warning: Removed 146090 rows containing non-finite values (stat_density).
## Warning: Removed 146897 rows containing non-finite values (stat_density).
## Warning: Removed 148555 rows containing non-finite values (stat_density).
## Warning: Removed 86345 rows containing non-finite values (stat_density).
## Warning: Removed 158390 rows containing non-finite values (stat_density).
## Warning: Removed 185867 rows containing non-finite values (stat_density).
## Warning: Removed 140241 rows containing non-finite values (stat_density).
## Warning: Removed 142688 rows containing non-finite values (stat_density).
## Warning: Removed 139276 rows containing non-finite values (stat_density).
# ggsave("Ft_density.eff.pdf")
Box plot with outliers
ggplot(counts_melt, aes(x=variable, y=value, colour=variable)) +
geom_boxplot() +
ggtitle("Boxplots")
# ggsave("Ft_boxplot.eff.pdf")
Plot distribution of counts over 50,000
counts_50k <- counts_melt[counts_melt$value > 50000,]
ggplot(counts_50k, aes(x=variable, y=value, colour=variable)) +
geom_boxplot() +
ggtitle("Boxplot of Outliers")
# ggsave("Ft_boxplot50k.eff.pdf")
There are some high counts (>2e+05) distorting the distributions - let's remove those rows for the purposes of checking how good replication is for the majority of genes
high_contigs <- unique(counts_50k[which(counts_50k$value > 2e+05),]$contig)
fixed_tpm <- data_tpm[-which(data_tpm$contig %in% high_contigs),] # remove highly expressed contigs
fixed_tpm <- fixed_tpm[,-c(21)]
names(fixed_tpm) <- gsub(names(fixed_tpm), pattern="Ft_([0-9]).([0-9])", replacement="\\2.\\1")
tmp <- lapply(names(fixed_tpm), function(x) {
if (nchar(x)==3) {
explode(x,sep="\\.")
} else {
x
}
})
names(fixed_tpm) <- lapply(tmp, function(x) {
if (length(x)==2) {
paste(7-as.numeric(x[1]), x[2], sep=".")
} else {
x
}
})
tpms <- fixed_tpm[,3:20]
rownames(tpms) <- fixed_tpm$contig
cm <- melt(cor(tpms))
cm$Var1 <- as.factor(cm$Var1)
cm$Var2 <- as.factor(cm$Var2)
mid<-(max(cm$value, na.rm=TRUE)+min(cm$value, na.rm=TRUE))/2
ggplot(data=cm, aes(x=Var1, y=Var2)) +
geom_tile(aes(fill=value)) +
ggtitle("TPM Pearson Correlation Matrix with high expression removed") +
geom_text(aes(label = round(value, 2))) +
xlab('') +
ylab('') +
scale_fill_gradient2(low="cyan", mid="yellow", high="red", midpoint=mid, space="Lab", na.value = "grey50", guide="colorbar")
# ggsave("Ft_correlation_no_outliers.tpm.pdf")
We can also use a correlation metric less sensitive to outliers
cm <- melt(cor(counts, method = "spearman"))
cm <- cm[-which(cm$value == 1), ]
cm$Var1 <- as.factor(cm$Var1)
cm$Var2 <- as.factor(cm$Var2)
mid<-(max(cm$value, na.rm=TRUE)+min(cm$value, na.rm=TRUE))/2
ggplot(data=cm, aes(x=Var1, y=Var2)) +
geom_tile(aes(fill=value)) +
ggtitle("Effective Count Spearman Correlation matrix") +
geom_text(aes(label = round(value, 2))) +
xlab('') +
ylab('') +
scale_fill_gradient2(low="cyan", mid="yellow", high="red", midpoint=mid, space="Lab", na.value = "grey50", guide="colorbar")
Just re-checking the distributions with outliers removed and aggregated by sample
tpms_melt <- melt(tpms)
## No id variables; using all as measure variables
tpms_melt$value[tpms_melt$value < 0.01] <- 0
tpms_melt$stage <- gsub(tpms_melt$variable, pattern="\\.[0-9]", replacement="")
ggplot(tpms_melt, aes(x=log(value), group=stage, colour=stage)) +
geom_density() +
ggtitle("TPM Density plot by section")
## Warning: Removed 331292 rows containing non-finite values (stat_density).
## Warning: Removed 446142 rows containing non-finite values (stat_density).
## Warning: Removed 497262 rows containing non-finite values (stat_density).
## Warning: Removed 437989 rows containing non-finite values (stat_density).
## Warning: Removed 453320 rows containing non-finite values (stat_density).
## Warning: Removed 450578 rows containing non-finite values (stat_density).
ggsave("Ft_density_no_outliers.tpm.pdf")
## Saving 12 x 12 in image
## Warning: Removed 331292 rows containing non-finite values (stat_density).
## Warning: Removed 446142 rows containing non-finite values (stat_density).
## Warning: Removed 497262 rows containing non-finite values (stat_density).
## Warning: Removed 437989 rows containing non-finite values (stat_density).
## Warning: Removed 453320 rows containing non-finite values (stat_density).
## Warning: Removed 450578 rows containing non-finite values (stat_density).
and not aggregated…
ggplot(tpms_melt, aes(x=log(value), colour=variable)) +
geom_density() +
ggtitle("TPM Density plot all samples")
## Warning: Removed 152721 rows containing non-finite values (stat_density).
## Warning: Removed 153764 rows containing non-finite values (stat_density).
## Warning: Removed 141853 rows containing non-finite values (stat_density).
## Warning: Removed 149903 rows containing non-finite values (stat_density).
## Warning: Removed 146999 rows containing non-finite values (stat_density).
## Warning: Removed 111092 rows containing non-finite values (stat_density).
## Warning: Removed 153347 rows containing non-finite values (stat_density).
## Warning: Removed 151635 rows containing non-finite values (stat_density).
## Warning: Removed 150690 rows containing non-finite values (stat_density).
## Warning: Removed 157156 rows containing non-finite values (stat_density).
## Warning: Removed 136160 rows containing non-finite values (stat_density).
## Warning: Removed 129087 rows containing non-finite values (stat_density).
## Warning: Removed 144510 rows containing non-finite values (stat_density).
## Warning: Removed 147921 rows containing non-finite values (stat_density).
## Warning: Removed 145446 rows containing non-finite values (stat_density).
## Warning: Removed 190203 rows containing non-finite values (stat_density).
## Warning: Removed 162983 rows containing non-finite values (stat_density).
## Warning: Removed 91113 rows containing non-finite values (stat_density).
we probably should re-normalise TPMs within each column since we removed the highest rows
renorm <- function(x) {
x = (x / sum(x)) * 1e6
}
tpms_rn <- as.data.frame(apply(tpms, 2, renorm))
tpms_rn_melt <- melt(tpms_rn)
## No id variables; using all as measure variables
tpms_rn_melt$value[tpms_rn_melt$value < 0.01] <- 0
tpms_rn_melt$stage <- gsub(tpms_rn_melt$variable, pattern="\\.[0-9]", replacement="")
ggplot(tpms_rn_melt, aes(x=log(value), group=stage, colour=stage)) +
geom_density() +
ggtitle("Renormalised TPM Density plot")
## Warning: Removed 331291 rows containing non-finite values (stat_density).
## Warning: Removed 446129 rows containing non-finite values (stat_density).
## Warning: Removed 497247 rows containing non-finite values (stat_density).
## Warning: Removed 437957 rows containing non-finite values (stat_density).
## Warning: Removed 453285 rows containing non-finite values (stat_density).
## Warning: Removed 450534 rows containing non-finite values (stat_density).
Plot the TPM correlation matrices again with outliers removed and then renormalised
cm <- melt(cor(tpms_rn))
cm$Var1 <- as.factor(cm$Var1)
cm$Var2 <- as.factor(cm$Var2)
mid<-(max(cm$value, na.rm=TRUE)+min(cm$value, na.rm=TRUE))/2
ggplot(data=cm, aes(x=Var1, y=Var2)) +
geom_tile(aes(fill=value)) +
ggtitle("TPM Pearson Correlation matrix") +
geom_text(aes(label = round(value, 2))) +
xlab('') +
ylab('') +
scale_fill_gradient2(low="cyan", mid="yellow", high="red", midpoint=mid, space="Lab", na.value = "grey50", guide="colorbar")
cm <- cor(tpms_rn, method="spearman")
cm <- melt(cm)
cm$Var1 <- as.factor(cm$Var1)
cm$Var2 <- as.factor(cm$Var2)
cm <- cm[-which(cm$value==1),] # remove 1s so the gradient is better
mid<-(max(cm$value, na.rm=TRUE)+min(cm$value, na.rm=TRUE))/2
ggplot(data=cm, aes(x=Var1, y=Var2)) +
geom_tile(aes(fill=value)) +
geom_text(aes(label = round(value, 2))) +
xlab('') +
ylab('') +
ggtitle("TPM Spearman (ranked) correlation matrix") +
scale_fill_gradient2(low="cyan", mid="yellow", high="red", midpoint=mid, space="Lab", na.value = "grey50", guide="colorbar")
Chloroplast genes could be causing problems - if chloroplasts were highly expressing or were captured at higher rates in one replicate's extractions, that could lead to broken correlations
setwd('~/hydrogen/flaveria/Ft/Ft_2/')
chloro <- read.csv('Ft_chloro_contigs.txt', head=F)
names(chloro)[1] <- 'contig'
nc_tpms <- fixed_tpm[-which(fixed_tpm$contig %in% chloro$contig),] # remove chloroplast contigs
tpms <- nc_tpms[,3:20]
rownames(tpms) <- nc_tpms$contig
tpms_rn <- as.data.frame(apply(tpms, 2, renorm))
cm <- melt(cor(tpms_rn))
cm$Var1 <- as.factor(cm$Var1)
cm$Var2 <- as.factor(cm$Var2)
mid <- (max(cm$value, na.rm=TRUE)+min(cm$value, na.rm=TRUE))/2
ggplot(data=cm, aes(x=Var1, y=Var2)) +
geom_tile(aes(fill=value)) +
ggtitle("TPM Correlation Matrix no outliers, no chloroplast") +
geom_text(aes(label = round(value, 2))) +
xlab('') +
ylab('') +
scale_fill_gradient2(low="cyan", mid="yellow", high="red", midpoint=mid, space="Lab", na.value="grey50", guide="colorbar")
tpms_rn_melt <- melt(tpms_rn)
## No id variables; using all as measure variables
tpms_rn_melt$value[tpms_rn_melt$value < 0.01] <- 0
tpms_rn_melt$stage <- gsub(tpms_rn_melt$variable, pattern="\\.[0-9]", replacement="")
ggplot(tpms_rn_melt, aes(x=log(value), group=stage, colour=stage)) +
geom_density() +
ggtitle("TPM Density plot by section")
## Warning: Removed 329595 rows containing non-finite values (stat_density).
## Warning: Removed 444505 rows containing non-finite values (stat_density).
## Warning: Removed 495493 rows containing non-finite values (stat_density).
## Warning: Removed 436595 rows containing non-finite values (stat_density).
## Warning: Removed 451607 rows containing non-finite values (stat_density).
## Warning: Removed 448890 rows containing non-finite values (stat_density).
Annotation
#setwd('~/hydrogen/flaveria/Ft/Ft_2/')
#anno <- read.csv('Ft_annotation.txt', head=F, sep='\t') # load annotation (species, contig, agi)
#key <- read.csv('key_agi.txt', head=F, sep='\t')
#names(anno) <- c("species", "contig", "annotation") # rename column headers
#names(key) <- c("agi", "desc")
#anno_tpm <- merge(data_tpm, anno, by="contig", all=T) # merge count data and annotation
#library(plyr)
#anno_tpm <- anno_tpm[,-c(1, 2, 21, 22)] # remove extraneous columns
#anno_tpm <- ddply(anno_tpm, .(annotation), numcolwise(sum)) # sum columns, grouping by annotation
#key_tpm <- anno_tpm[which(anno_tpm$annotation %in% key$agi),]
#desc_tpm <- merge(key_tpm, key, by.x="annotation", by.y="agi", all=F) # add gene description
#write.table(desc_tpm, file ="Ft_2ey_agi.tpm.csv", row.names=F, quote=F,sep = "\t")