Required libraries

library(DESeq2)

## Loading required package: S4Vectors

## Loading required package: stats4

## Loading required package: BiocGenerics

## 
## Attaching package: 'BiocGenerics'

## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs

## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min

## 
## Attaching package: 'S4Vectors'

## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname

## Loading required package: IRanges

## 
## Attaching package: 'IRanges'

## The following object is masked from 'package:grDevices':
## 
##     windows

## Loading required package: GenomicRanges

## Loading required package: GenomeInfoDb

## Loading required package: SummarizedExperiment

## Loading required package: MatrixGenerics

## Loading required package: matrixStats

## 
## Attaching package: 'MatrixGenerics'

## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars

## Loading required package: Biobase

## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.

## 
## Attaching package: 'Biobase'

## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians

## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.2.1

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──

## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.7     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1

## Warning: package 'tibble' was built under R version 4.2.1

## Warning: package 'dplyr' was built under R version 4.2.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::collapse()   masks IRanges::collapse()
## ✖ dplyr::combine()    masks Biobase::combine(), BiocGenerics::combine()
## ✖ dplyr::count()      masks matrixStats::count()
## ✖ dplyr::desc()       masks IRanges::desc()
## ✖ tidyr::expand()     masks S4Vectors::expand()
## ✖ dplyr::filter()     masks stats::filter()
## ✖ dplyr::first()      masks S4Vectors::first()
## ✖ dplyr::lag()        masks stats::lag()
## ✖ ggplot2::Position() masks BiocGenerics::Position(), base::Position()
## ✖ purrr::reduce()     masks GenomicRanges::reduce(), IRanges::reduce()
## ✖ dplyr::rename()     masks S4Vectors::rename()
## ✖ dplyr::slice()      masks IRanges::slice()

library(dplyr)
library(apeglm)
library(ggplot2)
library(RColorBrewer)
library(vsn)
library(pheatmap)

Read the Counts matrix as a CSV File

data <- read.csv("D:/ChIP seq and RNA seq analysis/E15 to E17 Foxg1 RNA seq/Foxg1_countsMatrix.csv")
data.cortex <- data[,1:8]
data.hc <- data[, c(1,9:15)]
data.Ctx_vs_HC <- data[,c(1:4,9:11)]

Load all the count values

x1 <- data.Ctx_vs_HC[,-1]

Provide info of rownames by pointing it to the first column of the sheet

rownames(x1) <- data.Ctx_vs_HC[,1]
head(x1)

##               Ctrl_Ctx1 Ctrl_Ctx2 Ctrl_Ctx3 Ctrl_HC1 Ctrl_HC2 Ctrl_HC3
## Zc3h14             2631      2258      2946     4200     3261     2990
## Troap               502       537       522      799      499      540
## Clca2                 0         0         1        0        5        2
## 1810013L24Rik      1732      1525      1969     2979     2313     1933
## Srsf7              7693      6986      8574    10672     8991     9577
## Fignl2               78        66        71       74       58       25

Create info on the different samples present in the data

Region <- factor(c(rep("Ctx",3),rep("HC", 3)))

Create a dataframe coldata with sample info types of conditions

coldata <- data.frame(row.names=colnames(x1),Region)
coldata

##           Region
## Ctrl_Ctx1    Ctx
## Ctrl_Ctx2    Ctx
## Ctrl_Ctx3    Ctx
## Ctrl_HC1      HC
## Ctrl_HC2      HC
## Ctrl_HC3      HC

Create a DESeq2 object from coldata and counts data files

dds <- DESeqDataSetFromMatrix(countData = x1, colData = coldata, design= ~ Region)

Provide info on the baseline variable to use

relevel(dds$Region, ref = 'Ctx')

## [1] Ctx Ctx Ctx HC  HC  HC 
## Levels: Ctx HC

Apply DESeq2 algorithm on the dataset

dds <- DESeq(dds)

## estimating size factors

## estimating dispersions

## gene-wise dispersion estimates

## mean-dispersion relationship

## final dispersion estimates

## fitting model and testing

Get results

res <- results(dds)
res

## log2 fold change (MLE): Region HC vs Ctx 
## Wald test p-value: Region HC vs Ctx 
## DataFrame with 25229 rows and 6 columns
##                 baseMean log2FoldChange     lfcSE       stat    pvalue
##                <numeric>      <numeric> <numeric>  <numeric> <numeric>
## Zc3h14        2987.67348     -0.0843253 0.0809422  -1.041797 0.2975057
## Troap          563.20413     -0.2830905 0.1525825  -1.855328 0.0635494
## Clca2            1.21973      2.2841755 2.3378065   0.977059 0.3285398
## 1810013L24Rik 2023.68286     -0.0394208 0.0879985  -0.447971 0.6541741
## Srsf7         8681.96743     -0.1538219 0.1054412  -1.458840 0.1446092
## ...                  ...            ...       ...        ...       ...
## Gm42517         90.69342      0.8527168  0.277170  3.0765094 0.0020944
## Gm5565           1.52847     -0.0724341  1.717404 -0.0421765 0.9663580
## Paqr9          240.38149      0.2078848  0.215576  0.9643235 0.3348838
## Btbd35f26        0.00000             NA        NA         NA        NA
## Btbd35f6         0.00000             NA        NA         NA        NA
##                    padj
##               <numeric>
## Zc3h14         0.608144
## Troap          0.243988
## Clca2                NA
## 1810013L24Rik  0.866612
## Srsf7          0.407589
## ...                 ...
## Gm42517       0.0186469
## Gm5565               NA
## Paqr9         0.6472733
## Btbd35f26            NA
## Btbd35f6             NA

Write into a .csv file and create a dataframe of the DESeq object

write.csv(res, file = "Foxg1_HC_vs_Ctx.csv")
resDF <- as.data.frame(res)

Put cutoffs into the results data padj=0.05 and LFC>0.58

###Create new column “diffexpressed”

resDF$diffexpressed <- "NO"

if log2Foldchange > 0.58 and pvalue < 0.05, set as “UP”

resDF$diffexpressed[resDF$log2FoldChange > 0.58 & resDF$padj < 0.05] <- "HC_UP"

if log2Foldchange < -0.6 and pvalue < 0.05, set as “DOWN”

resDF$diffexpressed[resDF$log2FoldChange < -0.58 & resDF$padj < 0.05] <- "CTX_UP"

Select rows with significant values and drop NA columns

res_cutoff <- resDF %>% filter(diffexpressed=="HC_UP" | diffexpressed =="CTX_UP") %>% drop_na()
head(res_cutoff, 20)

##                  baseMean log2FoldChange     lfcSE      stat       pvalue
## Fignl2          63.537993     -0.9955802 0.3286456 -3.029343 2.450864e-03
## Vsx1             3.874003      4.1049206 1.5134410  2.712310 6.681616e-03
## Hpcal1         390.513372      1.1747488 0.1759250  6.677554 2.429633e-11
## Akap7          635.486176     -0.7064122 0.1450917 -4.868729 1.123185e-06
## Slit3          314.831292      2.4566426 0.3146453  7.807658 5.826049e-15
## Plekhb1        173.347529     -1.0592592 0.1913806 -5.534830 3.115306e-08
## Rara           265.477861     -1.1328709 0.2753401 -4.114443 3.881158e-05
## Calca           11.278642     -2.2879732 0.6991095 -3.272697 1.065267e-03
## Nlgn1         1202.504160      0.7097112 0.1301872  5.451468 4.995563e-08
## Shroom2        691.602194     -0.6084318 0.1427849 -4.261176 2.033539e-05
## Arpp21         946.639485     -0.6168235 0.1571837 -3.924220 8.701132e-05
## Cdk6          4404.239815     -0.7223648 0.1786920 -4.042513 5.288140e-05
## Ackr3          348.609631     -1.9518716 0.6337081 -3.080080 2.069451e-03
## Nkain3         458.338436      0.9197504 0.2159493  4.259104 2.052480e-05
## Trpc4          444.494311     -1.6078437 0.2020987 -7.955734 1.780727e-15
## Bend5          669.414056     -0.7434878 0.1321228 -5.627246 1.831090e-08
## Scg2           112.300712      1.5151910 0.2788652  5.433417 5.528506e-08
## Tectb            7.284991     -3.0346716 0.9913627 -3.061111 2.205170e-03
## Mgam            47.714301      0.9887121 0.3590636  2.753585 5.894644e-03
## D030068K23Rik  135.594596     -1.3225058 0.2661195 -4.969594 6.709315e-07
##                       padj diffexpressed
## Fignl2        2.124299e-02        CTX_UP
## Vsx1          4.764726e-02         HC_UP
## Hpcal1        1.291571e-09         HC_UP
## Akap7         2.608779e-05        CTX_UP
## Slit3         4.630578e-13         HC_UP
## Plekhb1       1.012042e-06        CTX_UP
## Rara          6.323004e-04        CTX_UP
## Calca         1.081328e-02        CTX_UP
## Nlgn1         1.575960e-06         HC_UP
## Shroom2       3.534516e-04        CTX_UP
## Arpp21        1.273133e-03        CTX_UP
## Cdk6          8.206893e-04        CTX_UP
## Ackr3         1.847498e-02        CTX_UP
## Nkain3        3.563654e-04         HC_UP
## Trpc4         1.487543e-13        CTX_UP
## Bend5         6.181533e-07        CTX_UP
## Scg2          1.733309e-06         HC_UP
## Tectb         1.942874e-02        CTX_UP
## Mgam          4.293283e-02         HC_UP
## D030068K23Rik 1.639576e-05        CTX_UP

Save it in a .csv file

write.csv(res_cutoff, file = "Foxg1_HC_vs_CTX_results_cutoff.csv")

Volcano Plot

p <- ggplot(resDF, aes(x=log2FoldChange, y=-log10(padj), col=diffexpressed)) + geom_point() + 
  theme_minimal()
mycolors <- c("blue", "red", "black")
names(mycolors) <- c("CTX_UP", "HC_UP", "NO")
p3 <- p + scale_colour_manual(values = mycolors)
p3

## Warning: Removed 8856 rows containing missing values (geom_point).

MA Plot

plotMA(res,ylim=c(-2,2))

resultsNames(dds)

## [1] "Intercept"        "Region_HC_vs_Ctx"

resNorm <- lfcShrink(dds, coef=2, type="normal")

## using 'normal' for LFC shrinkage, the Normal prior from Love et al (2014).
## 
## Note that type='apeglm' and type='ashr' have shown to have less bias than type='normal'.
## See ?lfcShrink for more details on shrinkage type, and the DESeq2 vignette.
## Reference: https://doi.org/10.1093/bioinformatics/bty895

resAsh <- lfcShrink(dds, coef=2, type="ashr")

## using 'ashr' for LFC shrinkage. If used in published research, please cite:
##     Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
##     https://doi.org/10.1093/biostatistics/kxw041

par(mfrow=c(1,3), mar=c(4,4,2,1))
xlim <- c(1,1e5); ylim <- c(-3,3)
plotMA(res,xlim=xlim,ylim=ylim, main="apeglm")
plotMA(resNorm,xlim=xlim,ylim=ylim, main="normal")
plotMA(resAsh,xlim=xlim,ylim=ylim, main="ashr")

Create Normalized counts for each sample

d <- plotCounts(dds, gene=which.min(res$padj), intgroup="Region", returnData=TRUE)
vsd <- vst(dds, blind=FALSE)
rld <- rlog(dds, blind=FALSE)
head(assay(vsd),10)

##               Ctrl_Ctx1 Ctrl_Ctx2 Ctrl_Ctx3  Ctrl_HC1  Ctrl_HC2  Ctrl_HC3
## Zc3h14        11.713046 11.650163 11.741272 11.656634 11.572415 11.642745
## Troap          9.702057  9.887516  9.646850  9.654718  9.372604  9.590326
## Clca2          7.033334  7.033334  7.163976  7.033334  7.293650  7.210020
## 1810013L24Rik 11.164577 11.136802 11.211381 11.206145 11.124474 11.073668
## Srsf7         13.186961 13.199971 13.210078 12.931289 12.956665 13.240792
## Fignl2         8.209022  8.175258  8.109399  7.930910  7.907698  7.653615
## Sema6a        10.256994 10.072016 10.206432 10.590084 10.544197 10.378334
## Anapc15        9.674578  9.714428  9.731620  9.632423  9.605059  9.578905
## Misp3          7.170118  7.033334  7.033334  7.033334  7.198106  7.033334
## Nmb            8.706304  8.532080  8.655471  8.410858  8.631606  8.831155

ntd <- normTransform(dds)
head(ntd)

## class: DESeqTransform 
## dim: 6 6 
## metadata(1): version
## assays(1): ''
## rownames(6): Zc3h14 Troap ... Srsf7 Fignl2
## rowData names(22): baseMean baseVar ... deviance maxCooks
## colnames(6): Ctrl_Ctx1 Ctrl_Ctx2 ... Ctrl_HC2 Ctrl_HC3
## colData names(2): Region sizeFactor

Plot standard deviation plot of normalized counts

meanSdPlot(assay(ntd))

meanSdPlot(assay(vsd))

meanSdPlot(assay(rld))

Plot heatmap

select <- order(rowMeans(counts(dds,normalized=TRUE)), decreasing=TRUE)[1:20]
colData(dds)

## DataFrame with 6 rows and 2 columns
##             Region sizeFactor
##           <factor>  <numeric>
## Ctrl_Ctx1      Ctx   0.848618
## Ctrl_Ctx2      Ctx   0.763515
## Ctrl_Ctx3      Ctx   0.930346
## Ctrl_HC1       HC    1.413285
## Ctrl_HC2       HC    1.169216
## Ctrl_HC3       HC    1.016689

df <- as.data.frame(colData(dds)[,c("Region","sizeFactor")])
pheatmap(assay(ntd)[select,], cluster_rows=FALSE, show_rownames=FALSE, cluster_cols=FALSE, annotation_col=df)

pheatmap(assay(vsd)[select,], cluster_rows=FALSE, show_rownames=FALSE, cluster_cols=FALSE, annotation_col=df)

pheatmap(assay(rld)[select,], cluster_rows=FALSE, show_rownames=FALSE, cluster_cols=FALSE, annotation_col=df)

Distance between every sample

sampleDists <- dist(t(assay(vsd)))
sampleDistMatrix <- as.matrix(sampleDists)
rownames(sampleDistMatrix) <- paste(vsd$Region, vsd$sizeFactor, sep="-")
colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")))(255)
pheatmap(sampleDistMatrix, clustering_distance_rows=sampleDists, 
         clustering_distance_cols=sampleDists, col=colors)

PCA Plot

plotPCA(vsd, intgroup=c("Region"))

pcaData <- plotPCA(vsd, intgroup=c("Region"), returnData=TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color=Region)) +
  geom_point(size=3) +
  xlab(paste0("PC1: ",percentVar[1],"% variance")) +
  ylab(paste0("PC2: ",percentVar[2],"% variance")) +
  coord_fixed()

RNA seq analysis - DESeq2

Mahima Bose

2022-07-05

Required libraries

Read the Counts matrix as a CSV File

Load all the count values

Provide info of rownames by pointing it to the first column of the sheet

Create info on the different samples present in the data

Create a dataframe coldata with sample info types of conditions

Create a DESeq2 object from coldata and counts data files

Provide info on the baseline variable to use

Apply DESeq2 algorithm on the dataset

Get results

Write into a .csv file and create a dataframe of the DESeq object

Put cutoffs into the results data padj=0.05 and LFC>0.58

if log2Foldchange > 0.58 and pvalue < 0.05, set as “UP”

if log2Foldchange < -0.6 and pvalue < 0.05, set as “DOWN”

Select rows with significant values and drop NA columns

Save it in a .csv file

Volcano Plot

MA Plot

Create Normalized counts for each sample

Plot standard deviation plot of normalized counts

Plot heatmap

Distance between every sample

PCA Plot