Obtain data

The data has been downloaded from GEO at the accession GSE134765. This provides the features counts output that have been transformed into a counts matrix with doRawMatrix.sh script, available in AW gitHub. The column names have been simplified with:

# Remove the ".//" from the beggining of the column names.
sed -i -e "1s/\.\/\///g" GSM3967160_OPC_counts.tsv

Introduction

The differential analysis between young and old have been done mostly following Analyzing RNAseq with DESeq2 and RNA-seq workflow: gene-level exploratory analysis and differential expression

Import into R

#Libraries
library("DESeq2")
library("knitr")
library("pheatmap")
library("RColorBrewer")
library("AnnotationDbi")
library("org.Rn.eg.db")
library("here")
# Save the counts
counts <- read.table(here("Data", "Rat_bulkRNAseq", "GSM3967160_OPC_counts.tsv"), header = TRUE, row.names = 1)
 
# Create metadata
condition <- factor( c("young", "young", "young", "old", "old", "old"))
colData <- data.frame( condition = condition, row.names = colnames(counts))

# Check that the colData row names and the  are in the correct order
all(rownames(colData) == colnames(counts))
## [1] TRUE
# Create DESeqDataSet
rat_dds <- DESeqDataSetFromMatrix(countData = counts,
                                  colData = colData,
                                  design = ~ condition)

# Relevel: We indicate that the reference level is the young dataset
rat_dds$condition <- relevel(rat_dds$condition, ref = "young")

Pre-filtering

We remove very low count genes. Low counts are define as genes that do not present more than 10 counts in at least 3 samples

# Initial gene number
nrow(rat_dds)
## [1] 32623
# genes to keep
keep <- rowSums(counts(rat_dds) >= 10) >= 3
# subset the dds
rat_dds <- rat_dds[keep,]
# Final genes
nrow(rat_dds)
## [1] 18660

This leaves us with a dataset of a similar size of the mouse one (12589 genes)

Normalisation for plotting

Bellow we normalize the variance, with regularized-logarithm transformation (Love, Huber, and Anders 2014); as RNA-seq data tend to have more variance for greater count means. This is just for visualising the pre-processing steps, as DESeq() function needs raw gene-counts, and preforms transformations internally.

rat_rld <- rlog(rat_dds, blind = FALSE)

Samle distances

Calculate the Eclidean distance between samples, with the normalized data to ensure a similar distribution from all genes.

# Calculate distance with tranposed matrix: distances are calculated with samples on rows
(dists <- dist(t(assay(rat_rld))))
##             OPC.young.1 OPC.young.2 OPC.young.3 OPC.aged.1 OPC.aged.2
## OPC.young.2    38.24874                                              
## OPC.young.3    55.32624    36.13101                                  
## OPC.aged.1     76.07883    55.30246    52.14414                      
## OPC.aged.2     73.66555    53.39572    50.91715   24.33327           
## OPC.aged.3     93.99422    71.76637    60.03740   47.14445   46.95658
# Plot in a heatmap
dists_mat <- as.matrix( dists )
rownames(dists_mat) <- colnames(rat_dds)
colnames(dists_mat) <- colnames(rat_dds)
#colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(dists_mat,
         #color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdYlBu")))(100)
         color = colorRampPalette(rev(brewer.pal(n = 5, name = "RdYlBu")))(100),
         clustering_distance_rows = dists,
         clustering_distance_cols = dists)

# Plot PCA
plotPCA(rat_rld)

The Heatmap and PCA show how the sample cluster according to the experimental design, with greater simalirty between samples from the same group than between groups.

Differential expression

# Run the differential expression analysis
rat_dds <- DESeq(rat_dds)

# Extract the results
rat_de <- results(rat_dds)

I tested different filters to see the impact on the results: Changing the null hyposthesis to a logFC threshold other than 0 and adjusting the FDR to a pct other than 10%. I still kept the default filters as it is the ones used in the provided mouse dataset this data will be compared against.

# Have a look at proportions with different filters filtered version
# This is how it is
table(results(rat_dds)$padj < 0.05)
## 
## FALSE  TRUE 
## 12542  4647
# this would test the hypothesis there is more than double difference between genes
table(results(rat_dds, lfcThreshold=1)$padj < 0.05)
## 
## FALSE  TRUE 
## 18482   155
# and this more than 10% (calculations in rat_mouse_de)
 table(results(rat_dds, lfcThreshold=0.14)$padj < 0.05)
## 
## FALSE  TRUE 
## 14960  2591
# This would lower the FDR to 5%
table(results(rat_dds, alpha = 0.05)$padj < 0.05)
## 
## FALSE  TRUE 
## 12908  4643

Add the baseMean for different samples, code from here

# Add the BaseMean for different levels
base_mean_per_lvl <- sapply( levels(rat_dds$condition), function(lvl) rowMeans( counts(rat_dds,normalized=TRUE)[,rat_dds$condition == lvl, drop=F] ) )
# Change colnames
colnames(base_mean_per_lvl) <- paste0(levels(rat_dds$condition),"_BaseMean")
  
# Append to df format, easier to work with
rat_de <- cbind(base_mean_per_lvl, as.data.frame(rat_de))

Annotation

# Add the full gene names
rat_de$description <- mapIds(org.Rn.eg.db,
keys=row.names(rat_de),
column="GENENAME",
keytype="ENSEMBL",
multiVals="first")

# Add the Symbol ID
rat_de$gene_name <- mapIds(org.Rn.eg.db,
keys=row.names(rat_de),
column="SYMBOL",
keytype="ENSEMBL",
multiVals="first")

Export

res_ordered <- rat_de[order(rat_de$pvalue),]
saveRDS(res_ordered, file = here("Processed", "rat_de.Rds"))

Session information

Click to expand
sessionInfo()
## R version 4.0.4 (2021-02-15)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19041)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_United Kingdom.1252 
## [2] LC_CTYPE=English_United Kingdom.1252   
## [3] LC_MONETARY=English_United Kingdom.1252
## [4] LC_NUMERIC=C                           
## [5] LC_TIME=English_United Kingdom.1252    
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] here_1.0.1                  org.Rn.eg.db_3.12.0        
##  [3] AnnotationDbi_1.52.0        RColorBrewer_1.1-2         
##  [5] pheatmap_1.0.12             knitr_1.31                 
##  [7] DESeq2_1.30.1               SummarizedExperiment_1.20.0
##  [9] Biobase_2.50.0              MatrixGenerics_1.2.1       
## [11] matrixStats_0.58.0          GenomicRanges_1.42.0       
## [13] GenomeInfoDb_1.26.4         IRanges_2.24.1             
## [15] S4Vectors_0.28.1            BiocGenerics_0.36.0        
## 
## loaded via a namespace (and not attached):
##  [1] httr_1.4.2             sass_0.3.1             bit64_4.0.5           
##  [4] jsonlite_1.7.2         splines_4.0.4          bslib_0.2.4           
##  [7] assertthat_0.2.1       highr_0.8              blob_1.2.1            
## [10] GenomeInfoDbData_1.2.4 yaml_2.2.1             pillar_1.5.1          
## [13] RSQLite_2.2.4          lattice_0.20-41        glue_1.4.2            
## [16] digest_0.6.27          XVector_0.30.0         colorspace_2.0-0      
## [19] htmltools_0.5.1.1      Matrix_1.3-2           XML_3.99-0.5          
## [22] pkgconfig_2.0.3        genefilter_1.72.1      zlibbioc_1.36.0       
## [25] purrr_0.3.4            xtable_1.8-4           scales_1.1.1          
## [28] BiocParallel_1.24.1    tibble_3.1.0           annotate_1.68.0       
## [31] farver_2.1.0           generics_0.1.0         ggplot2_3.3.3         
## [34] ellipsis_0.3.1         cachem_1.0.4           survival_3.2-10       
## [37] magrittr_2.0.1         crayon_1.4.1           memoise_2.0.0         
## [40] evaluate_0.14          fansi_0.4.2            tools_4.0.4           
## [43] lifecycle_1.0.0        stringr_1.4.0          munsell_0.5.0         
## [46] locfit_1.5-9.4         DelayedArray_0.16.2    compiler_4.0.4        
## [49] jquerylib_0.1.3        rlang_0.4.10           grid_4.0.4            
## [52] RCurl_1.98-1.2         labeling_0.4.2         bitops_1.0-6          
## [55] rmarkdown_2.7          gtable_0.3.0           DBI_1.1.1             
## [58] R6_2.5.0               dplyr_1.0.5            fastmap_1.1.0         
## [61] bit_4.0.4              utf8_1.2.1             rprojroot_2.0.2       
## [64] stringi_1.5.3          Rcpp_1.0.6             vctrs_0.3.6           
## [67] geneplotter_1.68.0     tidyselect_1.1.0       xfun_0.21