Required libraries

library(DESeq2)
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## 
## Attaching package: 'IRanges'
## The following object is masked from 'package:grDevices':
## 
##     windows
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
## 
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians
## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.2.1
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.7     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1
## Warning: package 'tibble' was built under R version 4.2.1
## Warning: package 'dplyr' was built under R version 4.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::collapse()   masks IRanges::collapse()
## ✖ dplyr::combine()    masks Biobase::combine(), BiocGenerics::combine()
## ✖ dplyr::count()      masks matrixStats::count()
## ✖ dplyr::desc()       masks IRanges::desc()
## ✖ tidyr::expand()     masks S4Vectors::expand()
## ✖ dplyr::filter()     masks stats::filter()
## ✖ dplyr::first()      masks S4Vectors::first()
## ✖ dplyr::lag()        masks stats::lag()
## ✖ ggplot2::Position() masks BiocGenerics::Position(), base::Position()
## ✖ purrr::reduce()     masks GenomicRanges::reduce(), IRanges::reduce()
## ✖ dplyr::rename()     masks S4Vectors::rename()
## ✖ dplyr::slice()      masks IRanges::slice()
library(dplyr)
library(apeglm)
library(ggplot2)
library(RColorBrewer)
library(vsn)
library(pheatmap)

Read the Counts matrix as a CSV File

data <- read.csv("D:/ChIP seq and RNA seq analysis/E15 to E17 Foxg1 RNA seq/Foxg1_countsMatrix.csv")
data.cortex <- data[,1:8]
data.hc <- data[, c(1,9:15)]
data.Ctx_vs_HC <- data[,c(1:4,9:11)]

Load all the count values

x1 <- data.Ctx_vs_HC[,-1]

Provide info of rownames by pointing it to the first column of the sheet

rownames(x1) <- data.Ctx_vs_HC[,1]
head(x1)
##               Ctrl_Ctx1 Ctrl_Ctx2 Ctrl_Ctx3 Ctrl_HC1 Ctrl_HC2 Ctrl_HC3
## Zc3h14             2631      2258      2946     4200     3261     2990
## Troap               502       537       522      799      499      540
## Clca2                 0         0         1        0        5        2
## 1810013L24Rik      1732      1525      1969     2979     2313     1933
## Srsf7              7693      6986      8574    10672     8991     9577
## Fignl2               78        66        71       74       58       25

Create info on the different samples present in the data

Region <- factor(c(rep("Ctx",3),rep("HC", 3)))

Create a dataframe coldata with sample info types of conditions

coldata <- data.frame(row.names=colnames(x1),Region)
coldata
##           Region
## Ctrl_Ctx1    Ctx
## Ctrl_Ctx2    Ctx
## Ctrl_Ctx3    Ctx
## Ctrl_HC1      HC
## Ctrl_HC2      HC
## Ctrl_HC3      HC

Create a DESeq2 object from coldata and counts data files

dds <- DESeqDataSetFromMatrix(countData = x1, colData = coldata, design= ~ Region)

Provide info on the baseline variable to use

relevel(dds$Region, ref = 'Ctx')
## [1] Ctx Ctx Ctx HC  HC  HC 
## Levels: Ctx HC

Apply DESeq2 algorithm on the dataset

dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing

Get results

res <- results(dds)
res
## log2 fold change (MLE): Region HC vs Ctx 
## Wald test p-value: Region HC vs Ctx 
## DataFrame with 25229 rows and 6 columns
##                 baseMean log2FoldChange     lfcSE       stat    pvalue
##                <numeric>      <numeric> <numeric>  <numeric> <numeric>
## Zc3h14        2987.67348     -0.0843253 0.0809422  -1.041797 0.2975057
## Troap          563.20413     -0.2830905 0.1525825  -1.855328 0.0635494
## Clca2            1.21973      2.2841755 2.3378065   0.977059 0.3285398
## 1810013L24Rik 2023.68286     -0.0394208 0.0879985  -0.447971 0.6541741
## Srsf7         8681.96743     -0.1538219 0.1054412  -1.458840 0.1446092
## ...                  ...            ...       ...        ...       ...
## Gm42517         90.69342      0.8527168  0.277170  3.0765094 0.0020944
## Gm5565           1.52847     -0.0724341  1.717404 -0.0421765 0.9663580
## Paqr9          240.38149      0.2078848  0.215576  0.9643235 0.3348838
## Btbd35f26        0.00000             NA        NA         NA        NA
## Btbd35f6         0.00000             NA        NA         NA        NA
##                    padj
##               <numeric>
## Zc3h14         0.608144
## Troap          0.243988
## Clca2                NA
## 1810013L24Rik  0.866612
## Srsf7          0.407589
## ...                 ...
## Gm42517       0.0186469
## Gm5565               NA
## Paqr9         0.6472733
## Btbd35f26            NA
## Btbd35f6             NA

Write into a .csv file and create a dataframe of the DESeq object

write.csv(res, file = "Foxg1_HC_vs_Ctx.csv")
resDF <- as.data.frame(res)

Put cutoffs into the results data padj=0.05 and LFC>0.58

###Create new column “diffexpressed”

resDF$diffexpressed <- "NO"

if log2Foldchange > 0.58 and pvalue < 0.05, set as “UP”

resDF$diffexpressed[resDF$log2FoldChange > 0.58 & resDF$padj < 0.05] <- "HC_UP"

if log2Foldchange < -0.6 and pvalue < 0.05, set as “DOWN”

resDF$diffexpressed[resDF$log2FoldChange < -0.58 & resDF$padj < 0.05] <- "CTX_UP"

Select rows with significant values and drop NA columns

res_cutoff <- resDF %>% filter(diffexpressed=="HC_UP" | diffexpressed =="CTX_UP") %>% drop_na()
head(res_cutoff, 20)
##                  baseMean log2FoldChange     lfcSE      stat       pvalue
## Fignl2          63.537993     -0.9955802 0.3286456 -3.029343 2.450864e-03
## Vsx1             3.874003      4.1049206 1.5134410  2.712310 6.681616e-03
## Hpcal1         390.513372      1.1747488 0.1759250  6.677554 2.429633e-11
## Akap7          635.486176     -0.7064122 0.1450917 -4.868729 1.123185e-06
## Slit3          314.831292      2.4566426 0.3146453  7.807658 5.826049e-15
## Plekhb1        173.347529     -1.0592592 0.1913806 -5.534830 3.115306e-08
## Rara           265.477861     -1.1328709 0.2753401 -4.114443 3.881158e-05
## Calca           11.278642     -2.2879732 0.6991095 -3.272697 1.065267e-03
## Nlgn1         1202.504160      0.7097112 0.1301872  5.451468 4.995563e-08
## Shroom2        691.602194     -0.6084318 0.1427849 -4.261176 2.033539e-05
## Arpp21         946.639485     -0.6168235 0.1571837 -3.924220 8.701132e-05
## Cdk6          4404.239815     -0.7223648 0.1786920 -4.042513 5.288140e-05
## Ackr3          348.609631     -1.9518716 0.6337081 -3.080080 2.069451e-03
## Nkain3         458.338436      0.9197504 0.2159493  4.259104 2.052480e-05
## Trpc4          444.494311     -1.6078437 0.2020987 -7.955734 1.780727e-15
## Bend5          669.414056     -0.7434878 0.1321228 -5.627246 1.831090e-08
## Scg2           112.300712      1.5151910 0.2788652  5.433417 5.528506e-08
## Tectb            7.284991     -3.0346716 0.9913627 -3.061111 2.205170e-03
## Mgam            47.714301      0.9887121 0.3590636  2.753585 5.894644e-03
## D030068K23Rik  135.594596     -1.3225058 0.2661195 -4.969594 6.709315e-07
##                       padj diffexpressed
## Fignl2        2.124299e-02        CTX_UP
## Vsx1          4.764726e-02         HC_UP
## Hpcal1        1.291571e-09         HC_UP
## Akap7         2.608779e-05        CTX_UP
## Slit3         4.630578e-13         HC_UP
## Plekhb1       1.012042e-06        CTX_UP
## Rara          6.323004e-04        CTX_UP
## Calca         1.081328e-02        CTX_UP
## Nlgn1         1.575960e-06         HC_UP
## Shroom2       3.534516e-04        CTX_UP
## Arpp21        1.273133e-03        CTX_UP
## Cdk6          8.206893e-04        CTX_UP
## Ackr3         1.847498e-02        CTX_UP
## Nkain3        3.563654e-04         HC_UP
## Trpc4         1.487543e-13        CTX_UP
## Bend5         6.181533e-07        CTX_UP
## Scg2          1.733309e-06         HC_UP
## Tectb         1.942874e-02        CTX_UP
## Mgam          4.293283e-02         HC_UP
## D030068K23Rik 1.639576e-05        CTX_UP

Save it in a .csv file

write.csv(res_cutoff, file = "Foxg1_HC_vs_CTX_results_cutoff.csv")

Volcano Plot

p <- ggplot(resDF, aes(x=log2FoldChange, y=-log10(padj), col=diffexpressed)) + geom_point() + 
  theme_minimal()
mycolors <- c("blue", "red", "black")
names(mycolors) <- c("CTX_UP", "HC_UP", "NO")
p3 <- p + scale_colour_manual(values = mycolors)
p3
## Warning: Removed 8856 rows containing missing values (geom_point).

MA Plot

plotMA(res,ylim=c(-2,2))

resultsNames(dds)
## [1] "Intercept"        "Region_HC_vs_Ctx"
resNorm <- lfcShrink(dds, coef=2, type="normal")
## using 'normal' for LFC shrinkage, the Normal prior from Love et al (2014).
## 
## Note that type='apeglm' and type='ashr' have shown to have less bias than type='normal'.
## See ?lfcShrink for more details on shrinkage type, and the DESeq2 vignette.
## Reference: https://doi.org/10.1093/bioinformatics/bty895
resAsh <- lfcShrink(dds, coef=2, type="ashr")
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
##     Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
##     https://doi.org/10.1093/biostatistics/kxw041
par(mfrow=c(1,3), mar=c(4,4,2,1))
xlim <- c(1,1e5); ylim <- c(-3,3)
plotMA(res,xlim=xlim,ylim=ylim, main="apeglm")
plotMA(resNorm,xlim=xlim,ylim=ylim, main="normal")
plotMA(resAsh,xlim=xlim,ylim=ylim, main="ashr")

Create Normalized counts for each sample

d <- plotCounts(dds, gene=which.min(res$padj), intgroup="Region", returnData=TRUE)
vsd <- vst(dds, blind=FALSE)
rld <- rlog(dds, blind=FALSE)
head(assay(vsd),10)
##               Ctrl_Ctx1 Ctrl_Ctx2 Ctrl_Ctx3  Ctrl_HC1  Ctrl_HC2  Ctrl_HC3
## Zc3h14        11.713046 11.650163 11.741272 11.656634 11.572415 11.642745
## Troap          9.702057  9.887516  9.646850  9.654718  9.372604  9.590326
## Clca2          7.033334  7.033334  7.163976  7.033334  7.293650  7.210020
## 1810013L24Rik 11.164577 11.136802 11.211381 11.206145 11.124474 11.073668
## Srsf7         13.186961 13.199971 13.210078 12.931289 12.956665 13.240792
## Fignl2         8.209022  8.175258  8.109399  7.930910  7.907698  7.653615
## Sema6a        10.256994 10.072016 10.206432 10.590084 10.544197 10.378334
## Anapc15        9.674578  9.714428  9.731620  9.632423  9.605059  9.578905
## Misp3          7.170118  7.033334  7.033334  7.033334  7.198106  7.033334
## Nmb            8.706304  8.532080  8.655471  8.410858  8.631606  8.831155
ntd <- normTransform(dds)
head(ntd)
## class: DESeqTransform 
## dim: 6 6 
## metadata(1): version
## assays(1): ''
## rownames(6): Zc3h14 Troap ... Srsf7 Fignl2
## rowData names(22): baseMean baseVar ... deviance maxCooks
## colnames(6): Ctrl_Ctx1 Ctrl_Ctx2 ... Ctrl_HC2 Ctrl_HC3
## colData names(2): Region sizeFactor

Plot standard deviation plot of normalized counts

meanSdPlot(assay(ntd))

meanSdPlot(assay(vsd))

meanSdPlot(assay(rld))

Plot heatmap

select <- order(rowMeans(counts(dds,normalized=TRUE)), decreasing=TRUE)[1:20]
colData(dds)
## DataFrame with 6 rows and 2 columns
##             Region sizeFactor
##           <factor>  <numeric>
## Ctrl_Ctx1      Ctx   0.848618
## Ctrl_Ctx2      Ctx   0.763515
## Ctrl_Ctx3      Ctx   0.930346
## Ctrl_HC1       HC    1.413285
## Ctrl_HC2       HC    1.169216
## Ctrl_HC3       HC    1.016689
df <- as.data.frame(colData(dds)[,c("Region","sizeFactor")])
pheatmap(assay(ntd)[select,], cluster_rows=FALSE, show_rownames=FALSE, cluster_cols=FALSE, annotation_col=df)

pheatmap(assay(vsd)[select,], cluster_rows=FALSE, show_rownames=FALSE, cluster_cols=FALSE, annotation_col=df)

pheatmap(assay(rld)[select,], cluster_rows=FALSE, show_rownames=FALSE, cluster_cols=FALSE, annotation_col=df)

Distance between every sample

sampleDists <- dist(t(assay(vsd)))
sampleDistMatrix <- as.matrix(sampleDists)
rownames(sampleDistMatrix) <- paste(vsd$Region, vsd$sizeFactor, sep="-")
colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")))(255)
pheatmap(sampleDistMatrix, clustering_distance_rows=sampleDists, 
         clustering_distance_cols=sampleDists, col=colors)

PCA Plot

plotPCA(vsd, intgroup=c("Region"))

pcaData <- plotPCA(vsd, intgroup=c("Region"), returnData=TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color=Region)) +
  geom_point(size=3) +
  xlab(paste0("PC1: ",percentVar[1],"% variance")) +
  ylab(paste0("PC2: ",percentVar[2],"% variance")) +
  coord_fixed()