Cleome gynandra LCM and Rolling data exploration

We used all the RNA-seq data available in the lab and the genome to map and generate an improved annotation. We used Tophat-Cufflinks and RSEM to obtain normalized counts and TPMs for each sample

Data exploration on TPMs

library(ggplot2)
library(gplots)

## KernSmooth 2.23 loaded
## Copyright M. P. Wand 1997-2009
## 
## Attaching package: 'gplots'
## 
## The following object is masked from 'package:stats':
## 
##     lowess

library(reshape2)
tpms <- read.delim("genes_TPMs.tsv", header=TRUE, sep=",")
rownames(tpms) <-tpms$gene
tpms <-tpms[,c(3:5,12:23)]
colnames(tpms) <- c("WL1","WL2", "WL3", "rol_M1",
                    "rol_M2", "rol_M3", "rol_BS1", 
                    "rol_BS2", "rol_BS3", "lcm_M1",
                     "lcm_M2", "lcm_M3", "lcm_BS1",
                    "lcm_BS2","lcm_BS3")
head(tpms)

##               WL1   WL2   WL3 rol_M1 rol_M2 rol_M3 rol_BS1 rol_BS2 rol_BS3
## XLOC_000001 49.82 44.42 46.14  49.56  69.14  56.42   51.80   68.43   60.27
## XLOC_000002  8.45  8.04  7.11   6.96   4.98  12.16   15.10   19.45   15.09
## XLOC_000003 50.73 41.98 47.44  53.62  72.96  62.30   60.98   90.50   66.05
## XLOC_000004  5.70  3.26  4.36   0.79   0.37   0.20   12.62    2.70    7.37
## XLOC_000005 33.56 29.65 29.32  26.58  29.41  27.19   26.43   28.03   30.24
## XLOC_000006 48.38 48.26 46.64  27.62  14.90  26.77   28.59   40.48   33.02
##             lcm_M1 lcm_M2 lcm_M3 lcm_BS1 lcm_BS2 lcm_BS3
## XLOC_000001 374.83 377.67 395.01  476.60  386.23  393.73
## XLOC_000002   0.78   1.32   1.10    1.14    1.31    0.94
## XLOC_000003  94.83  97.73 111.94   80.74   75.71   82.52
## XLOC_000004   0.95   0.93   0.88    1.42    1.26    1.82
## XLOC_000005  31.08  28.10  26.28   26.74   25.00   28.24
## XLOC_000006   8.34  11.25  10.06   16.13   15.56   17.82

correlation <- cor(tpms)
cm <- melt(correlation)
colnames(cm)<- c("Var1","Var2","value")
cm$Var1 <- as.factor(cm$Var1)
cm$Var2 <- as.factor(cm$Var2)
mid<-(max(cm$value, na.rm=TRUE)+min(cm$value, na.rm=TRUE))/2

Pearson Correlation

ggplot(data=cm, aes(x=Var1, y=Var2)) + 
  geom_tile(aes(fill=value)) +
  ggtitle("Pearson Correlation Matrix on TPMs") +
  geom_text(aes(label = round(value, 2))) +
  theme_bw() +
  xlab('') +
  ylab('') +
  scale_fill_gradient2(low="cyan",
                       mid="yellow",
                       high="red",
                       midpoint=mid, 
                       space="Lab",
                       na.value = "grey50",
                       guide="colorbar")

plot of chunk unnamed-chunk-2

Expression Density Plot on TPMs

tpms_melt <- melt(tpms)

## No id variables; using all as measure variables

tpms_melt$value[tpms_melt$value < 0.01] <- 0

ggplot(tpms_melt, aes(x=log(value), group=variable, colour=variable)) +
  ggtitle("log density TPMs") +
  theme_bw()+
  geom_density()+
  theme(axis.title.y = element_text(size = rel(1.5), angle = 90)) +
  theme(axis.title.x = element_text(size = rel(1.5), angle = 0)) +
  labs(y = "Density")

## Warning: Removed 1219 rows containing non-finite values (stat_density).
## Warning: Removed 1716 rows containing non-finite values (stat_density).
## Warning: Removed 1548 rows containing non-finite values (stat_density).
## Warning: Removed 2917 rows containing non-finite values (stat_density).
## Warning: Removed 2838 rows containing non-finite values (stat_density).
## Warning: Removed 2544 rows containing non-finite values (stat_density).
## Warning: Removed 1164 rows containing non-finite values (stat_density).
## Warning: Removed 1587 rows containing non-finite values (stat_density).
## Warning: Removed 1330 rows containing non-finite values (stat_density).
## Warning: Removed 3608 rows containing non-finite values (stat_density).
## Warning: Removed 3834 rows containing non-finite values (stat_density).
## Warning: Removed 4048 rows containing non-finite values (stat_density).
## Warning: Removed 3004 rows containing non-finite values (stat_density).
## Warning: Removed 3517 rows containing non-finite values (stat_density).
## Warning: Removed 3191 rows containing non-finite values (stat_density).

plot of chunk unnamed-chunk-4 BS/M samples from rolling are less homogeneous compared with LCM and WL samples

Data Exploration on counts

ALLreads <- read.delim("all_genes_effcounts.tsv", header=TRUE, sep ="\t")
LCMreads <- read.delim("LCM_genes_effcounts.tsv", header=TRUE, sep ="\t")
data_counts <- merge(x =ALLreads, y= LCMreads, by.x= "gene", by.y= "gene", all=TRUE)
data_counts <- data_counts[complete.cases(data_counts),]
head(data_counts)

##          gene CgTOT1 CgTOT2 CgTOT3 Cg35S1 Cg35S2 Cg35S3 CgGDC1 CgGDC2
## 1 XLOC_000001    705    515    620   1891   1564   1687   1311   1318
## 2 XLOC_000002    650    490    487    145    144    180    184    197
## 3 XLOC_000003    609    416    530   1553   1182   1247    721    781
## 4 XLOC_000004    169     80    120    133    103    141     47     99
## 5 XLOC_000005    803    584    649   1280   1044   1216    694    598
## 6 XLOC_000006   1696   1391   1516   1118   1024   1490   1773   1555
##   CgGDC3 BS1  BS2  BS3  M1  M2  M3 LCM_M1 LCM_M2 LCM_M3 LCM_BS1 LCM_BS2
## 1   1083 630  814  845 410 973 796   7600   7129   5821   12411    7507
## 2    181 956 1354  935 301 320 885     48     74     49      91      78
## 3    869 674  930  728 416 895 748   1338   1288   1150    1465    1028
## 4     61 342   69  200  16  12   7     34     31     23      64      43
## 5    704 580  574  663 408 718 665    873    735    537     965     673
## 6   1727 917 1215 1061 620 532 968    342    429    301     848     610
##   LCM_BS3
## 1    8374
## 2      61
## 3    1222
## 4      67
## 5     833
## 6     765

data_counts <-data_counts[,-5:-10]



counts <- data_counts[,2:16]
rownames(counts) <- data_counts$gene
head(counts)

##             CgTOT1 CgTOT2 CgTOT3 BS1  BS2  BS3  M1  M2  M3 LCM_M1 LCM_M2
## XLOC_000001    705    515    620 630  814  845 410 973 796   7600   7129
## XLOC_000002    650    490    487 956 1354  935 301 320 885     48     74
## XLOC_000003    609    416    530 674  930  728 416 895 748   1338   1288
## XLOC_000004    169     80    120 342   69  200  16  12   7     34     31
## XLOC_000005    803    584    649 580  574  663 408 718 665    873    735
## XLOC_000006   1696   1391   1516 917 1215 1061 620 532 968    342    429
##             LCM_M3 LCM_BS1 LCM_BS2 LCM_BS3
## XLOC_000001   5821   12411    7507    8374
## XLOC_000002     49      91      78      61
## XLOC_000003   1150    1465    1028    1222
## XLOC_000004     23      64      43      67
## XLOC_000005    537     965     673     833
## XLOC_000006    301     848     610     765

colnames(counts) <- c("WL1","WL2", "WL3", "rol_BS1", 
                      "rol_BS2", "rol_BS3", "rol_M1",
                      "rol_M2", "rol_M3","lcm_M1",
                      "lcm_M2", "lcm_M3", "lcm_BS1",
                      "lcm_BS2","lcm_BS3")

cm <- melt(cor(counts))
colnames(cm) <- c("Var1", "Var2","value")
cm$Var1 <- as.factor(cm$Var1)
cm$Var2 <- as.factor(cm$Var2)
mid<-(max(cm$value, na.rm=TRUE)+min(cm$value, na.rm=TRUE))/2

Pearson Correlation

ggplot(data=cm, aes(x=Var1, y=Var2)) +
  geom_tile(aes(fill=value)) +
  ggtitle("Pearson Correlation matrix (Effective Counts)") +
  geom_text(aes(label = round(value, 2))) +
  xlab('') +
  ylab('') +
  theme_bw() +
  scale_fill_gradient2(low="cyan", mid="yellow", 
                       high="red", midpoint=mid,
                       space="Lab", na.value = "grey50", guide="colorbar")

plot of chunk unnamed-chunk-6

Log Distributions on Effective counts

cf <- as.data.frame(counts)
cf$contig <- rownames(counts)
counts_melt <- melt(cf, id='contig')
counts_melt["sample"] <- c(rep("WL", 76479),rep("Rolling", 152958),rep("LCM", 152958))
head(counts_melt)

##        contig variable value sample
## 1 XLOC_000001      WL1   705     WL
## 2 XLOC_000002      WL1   650     WL
## 3 XLOC_000003      WL1   609     WL
## 4 XLOC_000004      WL1   169     WL
## 5 XLOC_000005      WL1   803     WL
## 6 XLOC_000006      WL1  1696     WL

ggplot(counts_melt, aes(x=log(value), colour=variable)) +
  geom_density() +
  facet_wrap(~ sample , scales="free" ) +
  theme_bw()+
  theme(axis.title.y = element_text(size = rel(1.5), angle = 90)) +
  theme(axis.title.x = element_text(size = rel(1.5), angle = 0)) +
  labs(y = "Density")+
  ylim(0,0.25) +
  ggtitle("Effective Count Log Density")

ggplot(counts_melt, aes(x=log(value), colour=variable)) +
  geom_density() +
  theme_bw()+
  theme(axis.title.y = element_text(size = rel(1.5), angle = 90)) +
  theme(axis.title.x = element_text(size = rel(1.5), angle = 0)) +
  labs(y = "Density")+
  ggtitle("Effective Count Log Density")

plot of chunk unnamed-chunk-9

Again the Rolling samples look less homogeneous. Next is to check the outliers

ggplot(counts_melt, aes(x=variable, y=value, colour=variable)) + 
  geom_boxplot() +
  ggtitle("Boxplot with outliers")+
  theme_bw()+
  labs(y = "Value", x="Sample")

plot of chunk unnamed-chunk-10

And the behaviour of the samples when removing outliers > 50k

counts_50k <- counts_melt[counts_melt$value > 50000,]
ggplot(counts_50k, aes(x=variable, y=value, colour=variable)) + 
  geom_boxplot() +
  ggtitle("Boxplot removing outliers (value < 50k)")+
  theme_bw()+
  labs(y = "Value", x="Sample")

plot of chunk unnamed-chunk-11

The outliers list is attached as “outliers50k.csv”, A lot of them are photosynthetic genes