We used all the RNA-seq data available in the lab and the genome to map and generate an improved annotation. We used Tophat-Cufflinks and RSEM to obtain normalized counts and TPMs for each sample
library(ggplot2)
library(gplots)
## KernSmooth 2.23 loaded
## Copyright M. P. Wand 1997-2009
##
## Attaching package: 'gplots'
##
## The following object is masked from 'package:stats':
##
## lowess
library(reshape2)
tpms <- read.delim("genes_TPMs.tsv", header=TRUE, sep=",")
rownames(tpms) <-tpms$gene
tpms <-tpms[,c(3:5,12:23)]
colnames(tpms) <- c("WL1","WL2", "WL3", "rol_M1",
"rol_M2", "rol_M3", "rol_BS1",
"rol_BS2", "rol_BS3", "lcm_M1",
"lcm_M2", "lcm_M3", "lcm_BS1",
"lcm_BS2","lcm_BS3")
head(tpms)
## WL1 WL2 WL3 rol_M1 rol_M2 rol_M3 rol_BS1 rol_BS2 rol_BS3
## XLOC_000001 49.82 44.42 46.14 49.56 69.14 56.42 51.80 68.43 60.27
## XLOC_000002 8.45 8.04 7.11 6.96 4.98 12.16 15.10 19.45 15.09
## XLOC_000003 50.73 41.98 47.44 53.62 72.96 62.30 60.98 90.50 66.05
## XLOC_000004 5.70 3.26 4.36 0.79 0.37 0.20 12.62 2.70 7.37
## XLOC_000005 33.56 29.65 29.32 26.58 29.41 27.19 26.43 28.03 30.24
## XLOC_000006 48.38 48.26 46.64 27.62 14.90 26.77 28.59 40.48 33.02
## lcm_M1 lcm_M2 lcm_M3 lcm_BS1 lcm_BS2 lcm_BS3
## XLOC_000001 374.83 377.67 395.01 476.60 386.23 393.73
## XLOC_000002 0.78 1.32 1.10 1.14 1.31 0.94
## XLOC_000003 94.83 97.73 111.94 80.74 75.71 82.52
## XLOC_000004 0.95 0.93 0.88 1.42 1.26 1.82
## XLOC_000005 31.08 28.10 26.28 26.74 25.00 28.24
## XLOC_000006 8.34 11.25 10.06 16.13 15.56 17.82
correlation <- cor(tpms)
cm <- melt(correlation)
colnames(cm)<- c("Var1","Var2","value")
cm$Var1 <- as.factor(cm$Var1)
cm$Var2 <- as.factor(cm$Var2)
mid<-(max(cm$value, na.rm=TRUE)+min(cm$value, na.rm=TRUE))/2
Pearson Correlation
ggplot(data=cm, aes(x=Var1, y=Var2)) +
geom_tile(aes(fill=value)) +
ggtitle("Pearson Correlation Matrix on TPMs") +
geom_text(aes(label = round(value, 2))) +
theme_bw() +
xlab('') +
ylab('') +
scale_fill_gradient2(low="cyan",
mid="yellow",
high="red",
midpoint=mid,
space="Lab",
na.value = "grey50",
guide="colorbar")
Expression Density Plot on TPMs
tpms_melt <- melt(tpms)
## No id variables; using all as measure variables
tpms_melt$value[tpms_melt$value < 0.01] <- 0
ggplot(tpms_melt, aes(x=log(value), group=variable, colour=variable)) +
ggtitle("log density TPMs") +
theme_bw()+
geom_density()+
theme(axis.title.y = element_text(size = rel(1.5), angle = 90)) +
theme(axis.title.x = element_text(size = rel(1.5), angle = 0)) +
labs(y = "Density")
## Warning: Removed 1219 rows containing non-finite values (stat_density).
## Warning: Removed 1716 rows containing non-finite values (stat_density).
## Warning: Removed 1548 rows containing non-finite values (stat_density).
## Warning: Removed 2917 rows containing non-finite values (stat_density).
## Warning: Removed 2838 rows containing non-finite values (stat_density).
## Warning: Removed 2544 rows containing non-finite values (stat_density).
## Warning: Removed 1164 rows containing non-finite values (stat_density).
## Warning: Removed 1587 rows containing non-finite values (stat_density).
## Warning: Removed 1330 rows containing non-finite values (stat_density).
## Warning: Removed 3608 rows containing non-finite values (stat_density).
## Warning: Removed 3834 rows containing non-finite values (stat_density).
## Warning: Removed 4048 rows containing non-finite values (stat_density).
## Warning: Removed 3004 rows containing non-finite values (stat_density).
## Warning: Removed 3517 rows containing non-finite values (stat_density).
## Warning: Removed 3191 rows containing non-finite values (stat_density).
BS/M samples from rolling are less homogeneous compared with LCM and WL samples
ALLreads <- read.delim("all_genes_effcounts.tsv", header=TRUE, sep ="\t")
LCMreads <- read.delim("LCM_genes_effcounts.tsv", header=TRUE, sep ="\t")
data_counts <- merge(x =ALLreads, y= LCMreads, by.x= "gene", by.y= "gene", all=TRUE)
data_counts <- data_counts[complete.cases(data_counts),]
head(data_counts)
## gene CgTOT1 CgTOT2 CgTOT3 Cg35S1 Cg35S2 Cg35S3 CgGDC1 CgGDC2
## 1 XLOC_000001 705 515 620 1891 1564 1687 1311 1318
## 2 XLOC_000002 650 490 487 145 144 180 184 197
## 3 XLOC_000003 609 416 530 1553 1182 1247 721 781
## 4 XLOC_000004 169 80 120 133 103 141 47 99
## 5 XLOC_000005 803 584 649 1280 1044 1216 694 598
## 6 XLOC_000006 1696 1391 1516 1118 1024 1490 1773 1555
## CgGDC3 BS1 BS2 BS3 M1 M2 M3 LCM_M1 LCM_M2 LCM_M3 LCM_BS1 LCM_BS2
## 1 1083 630 814 845 410 973 796 7600 7129 5821 12411 7507
## 2 181 956 1354 935 301 320 885 48 74 49 91 78
## 3 869 674 930 728 416 895 748 1338 1288 1150 1465 1028
## 4 61 342 69 200 16 12 7 34 31 23 64 43
## 5 704 580 574 663 408 718 665 873 735 537 965 673
## 6 1727 917 1215 1061 620 532 968 342 429 301 848 610
## LCM_BS3
## 1 8374
## 2 61
## 3 1222
## 4 67
## 5 833
## 6 765
data_counts <-data_counts[,-5:-10]
counts <- data_counts[,2:16]
rownames(counts) <- data_counts$gene
head(counts)
## CgTOT1 CgTOT2 CgTOT3 BS1 BS2 BS3 M1 M2 M3 LCM_M1 LCM_M2
## XLOC_000001 705 515 620 630 814 845 410 973 796 7600 7129
## XLOC_000002 650 490 487 956 1354 935 301 320 885 48 74
## XLOC_000003 609 416 530 674 930 728 416 895 748 1338 1288
## XLOC_000004 169 80 120 342 69 200 16 12 7 34 31
## XLOC_000005 803 584 649 580 574 663 408 718 665 873 735
## XLOC_000006 1696 1391 1516 917 1215 1061 620 532 968 342 429
## LCM_M3 LCM_BS1 LCM_BS2 LCM_BS3
## XLOC_000001 5821 12411 7507 8374
## XLOC_000002 49 91 78 61
## XLOC_000003 1150 1465 1028 1222
## XLOC_000004 23 64 43 67
## XLOC_000005 537 965 673 833
## XLOC_000006 301 848 610 765
colnames(counts) <- c("WL1","WL2", "WL3", "rol_BS1",
"rol_BS2", "rol_BS3", "rol_M1",
"rol_M2", "rol_M3","lcm_M1",
"lcm_M2", "lcm_M3", "lcm_BS1",
"lcm_BS2","lcm_BS3")
cm <- melt(cor(counts))
colnames(cm) <- c("Var1", "Var2","value")
cm$Var1 <- as.factor(cm$Var1)
cm$Var2 <- as.factor(cm$Var2)
mid<-(max(cm$value, na.rm=TRUE)+min(cm$value, na.rm=TRUE))/2
Pearson Correlation
ggplot(data=cm, aes(x=Var1, y=Var2)) +
geom_tile(aes(fill=value)) +
ggtitle("Pearson Correlation matrix (Effective Counts)") +
geom_text(aes(label = round(value, 2))) +
xlab('') +
ylab('') +
theme_bw() +
scale_fill_gradient2(low="cyan", mid="yellow",
high="red", midpoint=mid,
space="Lab", na.value = "grey50", guide="colorbar")
Log Distributions on Effective counts
cf <- as.data.frame(counts)
cf$contig <- rownames(counts)
counts_melt <- melt(cf, id='contig')
counts_melt["sample"] <- c(rep("WL", 76479),rep("Rolling", 152958),rep("LCM", 152958))
head(counts_melt)
## contig variable value sample
## 1 XLOC_000001 WL1 705 WL
## 2 XLOC_000002 WL1 650 WL
## 3 XLOC_000003 WL1 609 WL
## 4 XLOC_000004 WL1 169 WL
## 5 XLOC_000005 WL1 803 WL
## 6 XLOC_000006 WL1 1696 WL
ggplot(counts_melt, aes(x=log(value), colour=variable)) +
geom_density() +
facet_wrap(~ sample , scales="free" ) +
theme_bw()+
theme(axis.title.y = element_text(size = rel(1.5), angle = 90)) +
theme(axis.title.x = element_text(size = rel(1.5), angle = 0)) +
labs(y = "Density")+
ylim(0,0.25) +
ggtitle("Effective Count Log Density")
ggplot(counts_melt, aes(x=log(value), colour=variable)) +
geom_density() +
theme_bw()+
theme(axis.title.y = element_text(size = rel(1.5), angle = 90)) +
theme(axis.title.x = element_text(size = rel(1.5), angle = 0)) +
labs(y = "Density")+
ggtitle("Effective Count Log Density")
Again the Rolling samples look less homogeneous. Next is to check the outliers
ggplot(counts_melt, aes(x=variable, y=value, colour=variable)) +
geom_boxplot() +
ggtitle("Boxplot with outliers")+
theme_bw()+
labs(y = "Value", x="Sample")
And the behaviour of the samples when removing outliers > 50k
counts_50k <- counts_melt[counts_melt$value > 50000,]
ggplot(counts_50k, aes(x=variable, y=value, colour=variable)) +
geom_boxplot() +
ggtitle("Boxplot removing outliers (value < 50k)")+
theme_bw()+
labs(y = "Value", x="Sample")
The outliers list is attached as “outliers50k.csv”, A lot of them are photosynthetic genes