Load the data
library(ggplot2)
library(gplots)
## KernSmooth 2.23 loaded
## Copyright M. P. Wand 1997-2009
##
## Attaching package: 'gplots'
##
## The following object is masked from 'package:stats':
##
## lowess
library(reshape2)
setwd('~/hydrogen/flaveria/Fb/Fb_k/')
#setwd('~/flaveria/Fb/Fb_k/')
data_tpm <- read.csv('Fb_express.tpm', sep="\t", head=T)
data_counts <- read.csv('Fb_express.eff_count', sep="\t", head=T)
Pearson Correlation matrix
library(BBmisc)
tpms <- data_tpm[,3:20]
names(tpms) <- gsub(names(tpms), pattern="Fb_k_([0-9])\\.([0-9])", replacement="\\2.\\1")
tmp <- lapply(names(tpms), function(x) explode(x,sep="\\."))
names(tpms) <- lapply(tmp, function(x) paste(7-as.numeric(x[1]), x[2], sep="."))
rownames(tpms) <- data_tpm$contig
correlation <- cor(tpms)
cm <- melt(correlation)
cm$Var1 <- as.factor(cm$Var1)
cm$Var2 <- as.factor(cm$Var2)
mid<-(max(cm$value, na.rm=TRUE)+min(cm$value, na.rm=TRUE))/2
ggplot(data=cm, aes(x=Var1, y=Var2)) +
geom_tile(aes(fill=value)) +
ggtitle("TPM Pearson Correlation Matrix") +
geom_text(aes(label = round(value, 2))) +
xlab('') +
ylab('') +
scale_fill_gradient2(low="cyan", mid="yellow", high="red", midpoint=mid, space="Lab", na.value = "grey50", guide="colorbar")
Expression Density Plot on TPMs
tpms_melt <- melt(tpms)
## No id variables; using all as measure variables
tpms_melt$value[tpms_melt$value < 0.01] <- 0
tpms_melt$stage <- gsub(tpms_melt$variable, pattern="\\.[0-9]", replacement="")
ggplot(tpms_melt, aes(x=log(value), group=stage, colour=stage)) +
geom_density()
## Warning: Removed 320892 rows containing non-finite values (stat_density).
## Warning: Removed 371573 rows containing non-finite values (stat_density).
## Warning: Removed 353356 rows containing non-finite values (stat_density).
## Warning: Removed 373993 rows containing non-finite values (stat_density).
## Warning: Removed 383017 rows containing non-finite values (stat_density).
## Warning: Removed 408394 rows containing non-finite values (stat_density).
# ggsave("Fb_density.tpm.pdf")
Heatmap on TPMs
plotheatmap <- function(data) {
dists <- dist(t(as.matrix(data)))
mat <- as.matrix(dists)
heatmap.2(mat, trace="none")
}
#pdf("Fb_heatmap.pdf")
plotheatmap(tpms)
#dev.off()
#library(EBSeq)
#library(DESeq) # doesn't install
# Remove any rows containing all-zeros
remove_zero_rows <- function(df) {
df[apply(df, 1, function(x) !all(x==0)),]
}
normalise_counts <- function(counts, normfactors) {
round(t(t(counts) / normfactors))
}
counts <- data_counts[,3:20]
colnames(counts) <- gsub(colnames(counts), pattern="Fb_k_([0-9]).([0-9])", replacement="\\2.\\1")
tmp <- lapply(names(counts), function(x) explode(x,sep="\\."))
names(counts) <- lapply(tmp, function(x) paste(7-as.numeric(x[1]), x[2], sep="."))
rownames(counts) <- data_counts$contig
counts <- remove_zero_rows(round(counts))
cm <- melt(cor(counts))
cm$Var1 <- as.factor(cm$Var1)
cm$Var2 <- as.factor(cm$Var2)
mid<-(max(cm$value, na.rm=TRUE)+min(cm$value, na.rm=TRUE))/2
ggplot(data=cm, aes(x=Var1, y=Var2)) +
geom_tile(aes(fill=value)) +
ggtitle("Effective Count Correlation matrix") +
geom_text(aes(label = round(value, 2))) +
xlab('') +
ylab('') +
scale_fill_gradient2(low="cyan", mid="yellow", high="red", midpoint=mid, space="Lab", na.value = "grey50", guide="colorbar")
# ggsave("Fb_correlation.eff.pdf")
Log distributions of Effective Counts
cf <- as.data.frame(counts)
cf <- cf[c(6,5,4,3,2,1,12,11,10,9,8,7,18,17,16,15,14,13)]
cf$contig <- rownames(counts)
counts_melt <- melt(cf, id='contig')
ggplot(counts_melt, aes(x=log(value), colour=variable)) +
geom_density() +
ggtitle("Effective Count Log Density")
## Warning: Removed 106918 rows containing non-finite values (stat_density).
## Warning: Removed 120979 rows containing non-finite values (stat_density).
## Warning: Removed 117910 rows containing non-finite values (stat_density).
## Warning: Removed 116053 rows containing non-finite values (stat_density).
## Warning: Removed 116277 rows containing non-finite values (stat_density).
## Warning: Removed 127314 rows containing non-finite values (stat_density).
## Warning: Removed 90140 rows containing non-finite values (stat_density).
## Warning: Removed 122505 rows containing non-finite values (stat_density).
## Warning: Removed 108684 rows containing non-finite values (stat_density).
## Warning: Removed 115966 rows containing non-finite values (stat_density).
## Warning: Removed 120907 rows containing non-finite values (stat_density).
## Warning: Removed 123931 rows containing non-finite values (stat_density).
## Warning: Removed 115580 rows containing non-finite values (stat_density).
## Warning: Removed 120356 rows containing non-finite values (stat_density).
## Warning: Removed 118623 rows containing non-finite values (stat_density).
## Warning: Removed 133871 rows containing non-finite values (stat_density).
## Warning: Removed 136919 rows containing non-finite values (stat_density).
## Warning: Removed 149722 rows containing non-finite values (stat_density).
# ggsave("Fb_density.eff.pdf")
Box plot with outliers
ggplot(counts_melt, aes(x=variable, y=value, colour=variable)) +
geom_boxplot() +
ggtitle("Boxplots")
# ggsave("Fb_boxplot.eff.pdf")
Plot distribution of counts over 50,000
counts_50k <- counts_melt[counts_melt$value > 50000,]
ggplot(counts_50k, aes(x=variable, y=value, colour=variable)) +
geom_boxplot() +
ggtitle("Boxplot of Outliers")
# ggsave("Fb_boxplot50k.eff.pdf")
There are some high counts (>2e+05) distorting the distributions - let's remove those rows for the purposes of checking how good replication is for the majority of genes
high_contigs <- unique(counts_50k[which(counts_50k$value > 2e+05),]$contig)
fixed_tpm <- data_tpm[-which(data_tpm$contig %in% high_contigs),] # remove highly expressed contigs
fixed_tpm <- fixed_tpm[,-c(21)]
names(fixed_tpm) <- gsub(names(fixed_tpm), pattern="Fb_k_([0-9]).([0-9])", replacement="\\2.\\1")
tmp <- lapply(names(fixed_tpm), function(x) {
if (nchar(x)==3) {
explode(x,sep="\\.")
} else {
x
}
})
names(fixed_tpm) <- lapply(tmp, function(x) {
if (length(x)==2) {
paste(7-as.numeric(x[1]), x[2], sep=".")
} else {
x
}
})
tpms <- fixed_tpm[,3:20]
rownames(tpms) <- fixed_tpm$contig
cm <- melt(cor(tpms))
cm$Var1 <- as.factor(cm$Var1)
cm$Var2 <- as.factor(cm$Var2)
mid<-(max(cm$value, na.rm=TRUE)+min(cm$value, na.rm=TRUE))/2
ggplot(data=cm, aes(x=Var1, y=Var2)) +
geom_tile(aes(fill=value)) +
ggtitle("TPM Pearson Correlation Matrix with high expression removed") +
geom_text(aes(label = round(value, 2))) +
xlab('') +
ylab('') +
scale_fill_gradient2(low="cyan", mid="yellow", high="red", midpoint=mid, space="Lab", na.value = "grey50", guide="colorbar")
# ggsave("Fb_correlation_no_outliers.tpm.pdf")
We can also use a correlation metric less sensitive to outliers
cm <- melt(cor(counts, method = "spearman"))
cm <- cm[-which(cm$value == 1), ]
cm$Var1 <- as.factor(cm$Var1)
cm$Var2 <- as.factor(cm$Var2)
mid<-(max(cm$value, na.rm=TRUE)+min(cm$value, na.rm=TRUE))/2
ggplot(data=cm, aes(x=Var1, y=Var2)) +
geom_tile(aes(fill=value)) +
ggtitle("Effective Count Spearman Correlation matrix") +
geom_text(aes(label = round(value, 2))) +
xlab('') +
ylab('') +
scale_fill_gradient2(low="cyan", mid="yellow", high="red", midpoint=mid, space="Lab", na.value = "grey50", guide="colorbar")
Just re-checking the distributions with outliers removed and aggregated by sample
tpms_melt <- melt(tpms)
## No id variables; using all as measure variables
tpms_melt$value[tpms_melt$value < 0.01] <- 0
tpms_melt$stage <- gsub(tpms_melt$variable, pattern="\\.[0-9]", replacement="")
ggplot(tpms_melt, aes(x=log(value), group=stage, colour=stage)) +
geom_density() +
ggtitle("TPM Density plot by section")
## Warning: Removed 320892 rows containing non-finite values (stat_density).
## Warning: Removed 371573 rows containing non-finite values (stat_density).
## Warning: Removed 353356 rows containing non-finite values (stat_density).
## Warning: Removed 373993 rows containing non-finite values (stat_density).
## Warning: Removed 383017 rows containing non-finite values (stat_density).
## Warning: Removed 408394 rows containing non-finite values (stat_density).
ggsave("Fb_density_no_outliers.tpm.pdf")
## Saving 12 x 12 in image
## Warning: Removed 320892 rows containing non-finite values (stat_density).
## Warning: Removed 371573 rows containing non-finite values (stat_density).
## Warning: Removed 353356 rows containing non-finite values (stat_density).
## Warning: Removed 373993 rows containing non-finite values (stat_density).
## Warning: Removed 383017 rows containing non-finite values (stat_density).
## Warning: Removed 408394 rows containing non-finite values (stat_density).
and not aggregated…
ggplot(tpms_melt, aes(x=log(value), colour=variable)) +
geom_density() +
ggtitle("TPM Density plot all samples")
## Warning: Removed 129669 rows containing non-finite values (stat_density).
## Warning: Removed 119392 rows containing non-finite values (stat_density).
## Warning: Removed 118842 rows containing non-finite values (stat_density).
## Warning: Removed 120587 rows containing non-finite values (stat_density).
## Warning: Removed 124017 rows containing non-finite values (stat_density).
## Warning: Removed 109569 rows containing non-finite values (stat_density).
## Warning: Removed 126482 rows containing non-finite values (stat_density).
## Warning: Removed 123651 rows containing non-finite values (stat_density).
## Warning: Removed 118615 rows containing non-finite values (stat_density).
## Warning: Removed 111554 rows containing non-finite values (stat_density).
## Warning: Removed 124724 rows containing non-finite values (stat_density).
## Warning: Removed 92911 rows containing non-finite values (stat_density).
## Warning: Removed 152243 rows containing non-finite values (stat_density).
## Warning: Removed 139974 rows containing non-finite values (stat_density).
## Warning: Removed 136536 rows containing non-finite values (stat_density).
## Warning: Removed 121215 rows containing non-finite values (stat_density).
## Warning: Removed 122832 rows containing non-finite values (stat_density).
## Warning: Removed 118412 rows containing non-finite values (stat_density).
we probably should re-normalise TPMs within each column since we removed the highest rows
renorm <- function(x) {
x = (x / sum(x)) * 1e6
}
tpms_rn <- as.data.frame(apply(tpms, 2, renorm))
tpms_rn_melt <- melt(tpms_rn)
## No id variables; using all as measure variables
tpms_rn_melt$value[tpms_rn_melt$value < 0.01] <- 0
tpms_rn_melt$stage <- gsub(tpms_rn_melt$variable, pattern="\\.[0-9]", replacement="")
ggplot(tpms_rn_melt, aes(x=log(value), group=stage, colour=stage)) +
geom_density() +
ggtitle("Renormalised TPM Density plot")
## Warning: Removed 320860 rows containing non-finite values (stat_density).
## Warning: Removed 371522 rows containing non-finite values (stat_density).
## Warning: Removed 353346 rows containing non-finite values (stat_density).
## Warning: Removed 373964 rows containing non-finite values (stat_density).
## Warning: Removed 382976 rows containing non-finite values (stat_density).
## Warning: Removed 408355 rows containing non-finite values (stat_density).
Plot the TPM correlation matrices again with outliers removed and then renormalised
cm <- melt(cor(tpms_rn))
cm$Var1 <- as.factor(cm$Var1)
cm$Var2 <- as.factor(cm$Var2)
mid<-(max(cm$value, na.rm=TRUE)+min(cm$value, na.rm=TRUE))/2
ggplot(data=cm, aes(x=Var1, y=Var2)) +
geom_tile(aes(fill=value)) +
ggtitle("TPM Pearson Correlation matrix") +
geom_text(aes(label = round(value, 2))) +
xlab('') +
ylab('') +
scale_fill_gradient2(low="cyan", mid="yellow", high="red", midpoint=mid, space="Lab", na.value = "grey50", guide="colorbar")
cm <- cor(tpms_rn, method="spearman")
cm <- melt(cm)
cm$Var1 <- as.factor(cm$Var1)
cm$Var2 <- as.factor(cm$Var2)
cm <- cm[-which(cm$value==1),] # remove 1s so the gradient is better
mid<-(max(cm$value, na.rm=TRUE)+min(cm$value, na.rm=TRUE))/2
ggplot(data=cm, aes(x=Var1, y=Var2)) +
geom_tile(aes(fill=value)) +
geom_text(aes(label = round(value, 2))) +
xlab('') +
ylab('') +
ggtitle("TPM Spearman (ranked) correlation matrix") +
scale_fill_gradient2(low="cyan", mid="yellow", high="red", midpoint=mid, space="Lab", na.value = "grey50", guide="colorbar")
Chloroplast genes could be causing problems - if chloroplasts were highly expressing or were captured at higher rates in one replicate's extractions, that could lead to broken correlations
setwd('~/hydrogen/flaveria/Fb/Fb_k/')
chloro <- read.csv('Fb_chloro_contigs.txt', head=F)
names(chloro)[1] <- 'contig'
nc_tpms <- fixed_tpm[-which(fixed_tpm$contig %in% chloro$contig),] # remove chloroplast contigs
head(fixed_tpm)
## contig length 6.1 5.1 4.1 3.1 2.1 1.1 6.2
## 1 _60173 522 0.0000 0.1802 0.0000 0.4036 0.000 0.000 0.0000
## 2 _84551 402 0.0000 0.0000 0.0000 0.0000 0.000 0.000 0.1214
## 3 _135740 269 0.0000 0.0000 0.0000 0.0000 0.000 0.000 0.0000
## 4 _122250 294 0.9457 0.0000 0.7192 0.0000 0.000 0.000 0.0000
## 5 scaffold8719 2695 0.9160 1.1395 1.7053 2.4615 1.995 2.163 0.8020
## 6 contig-230949 1544 9.1787 12.2532 15.6112 20.9178 14.705 17.456 8.0028
## 5.2 4.2 3.2 2.2 1.2 6.3 5.3 4.3 3.3
## 1 0.000 0.000 0.2149 0.000 0.0000 0.0000 0.0000 0.2808 0.3692
## 2 0.000 0.000 0.0000 0.000 0.4355 0.0000 0.0000 0.0000 0.0000
## 3 0.000 0.000 0.0000 0.000 1.4669 0.0000 0.0000 0.0000 0.0000
## 4 0.000 0.000 0.0000 0.000 0.0000 0.0000 0.0000 0.0000 0.0000
## 5 1.093 1.172 2.2383 2.039 2.3963 0.8466 0.8437 1.0425 1.6588
## 6 11.223 16.627 20.6507 9.358 17.6514 6.7944 8.8251 11.5766 17.7655
## 2.3 1.3
## 1 0.1402 0.1224
## 2 0.0000 0.0000
## 3 0.0000 0.0000
## 4 0.0000 0.0000
## 5 2.5151 2.8498
## 6 21.8288 20.2192
tpms <- nc_tpms[,3:20]
rownames(tpms) <- nc_tpms$contig
tpms_rn <- as.data.frame(apply(tpms, 2, renorm))
cm <- melt(cor(tpms_rn))
cm$Var1 <- as.factor(cm$Var1)
cm$Var2 <- as.factor(cm$Var2)
mid <- (max(cm$value, na.rm=TRUE)+min(cm$value, na.rm=TRUE))/2
ggplot(data=cm, aes(x=Var1, y=Var2)) +
geom_tile(aes(fill=value)) +
ggtitle("TPM Correlation Matrix no outliers, no chloroplast") +
geom_text(aes(label = round(value, 2))) +
xlab('') +
ylab('') +
scale_fill_gradient2(low="cyan", mid="yellow", high="red", midpoint=mid, space="Lab", na.value="grey50", guide="colorbar")
tpms_rn_melt <- melt(tpms_rn)
## No id variables; using all as measure variables
tpms_rn_melt$value[tpms_rn_melt$value < 0.01] <- 0
tpms_rn_melt$stage <- gsub(tpms_rn_melt$variable, pattern="\\.[0-9]", replacement="")
ggplot(tpms_rn_melt, aes(x=log(value), group=stage, colour=stage)) +
geom_density() +
ggtitle("TPM Density plot by section")
## Warning: Removed 319763 rows containing non-finite values (stat_density).
## Warning: Removed 370314 rows containing non-finite values (stat_density).
## Warning: Removed 352176 rows containing non-finite values (stat_density).
## Warning: Removed 372647 rows containing non-finite values (stat_density).
## Warning: Removed 381662 rows containing non-finite values (stat_density).
## Warning: Removed 406955 rows containing non-finite values (stat_density).
Annotation
#setwd('~/hydrogen/flaveria/Fb/Fb_k/')
#anno <- read.csv('Fb_annotation.txt', head=F, sep='\t') # load annotation (species, contig, agi)
#key <- read.csv('key_agi.txt', head=F, sep='\t')
#names(anno) <- c("species", "contig", "annotation") # rename column headers
#names(key) <- c("agi", "desc")
#anno_tpm <- merge(data_tpm, anno, by="contig", all=T) # merge count data and annotation
#library(plyr)
#anno_tpm <- anno_tpm[,-c(1, 2, 21, 22)] # remove extraneous columns
#anno_tpm <- ddply(anno_tpm, .(annotation), numcolwise(sum)) # sum columns, grouping by annotation
#key_tpm <- anno_tpm[which(anno_tpm$annotation %in% key$agi),]
#desc_tpm <- merge(key_tpm, key, by.x="annotation", by.y="agi", all=F) # add gene description
#write.table(desc_tpm, file ="Fb_key_agi.tpm.csv", row.names=F, quote=F,sep = "\t")