EdgeR

if (!require("BiocManager", quietly = TRUE))
    install.packages("BiocManager")

## Warning: package 'BiocManager' was built under R version 4.2.1

BiocManager::install("edgeR")

## Bioconductor version 3.15 (BiocManager 1.30.18), R 4.2.0 (2022-04-22 ucrt)

## Warning: package(s) not installed when version(s) same as current; use `force = TRUE` to
##   re-install: 'edgeR'

## Installation paths not writeable, unable to update packages
##   path: C:/Program Files/R/R-4.2.0/library
##   packages:
##     MASS, nlme

## Old packages: 'htmltools', 'quantreg', 'rlang'

BiocManager::install("GO.db")

## Bioconductor version 3.15 (BiocManager 1.30.18), R 4.2.0 (2022-04-22 ucrt)

## Warning: package(s) not installed when version(s) same as current; use `force = TRUE` to
##   re-install: 'GO.db'

## Installation paths not writeable, unable to update packages
##   path: C:/Program Files/R/R-4.2.0/library
##   packages:
##     MASS, nlme
## Old packages: 'htmltools', 'quantreg', 'rlang'

BiocManager::install("org.Hs.eg.db")

## Bioconductor version 3.15 (BiocManager 1.30.18), R 4.2.0 (2022-04-22 ucrt)

## Warning: package(s) not installed when version(s) same as current; use `force = TRUE` to
##   re-install: 'org.Hs.eg.db'

## Installation paths not writeable, unable to update packages
##   path: C:/Program Files/R/R-4.2.0/library
##   packages:
##     MASS, nlme
## Old packages: 'htmltools', 'quantreg', 'rlang'

library(edgeR)

## Loading required package: limma

library(ggplot2)
library(pheatmap)
library(GO.db)

## Loading required package: AnnotationDbi

## Loading required package: stats4

## Loading required package: BiocGenerics

## 
## Attaching package: 'BiocGenerics'

## The following object is masked from 'package:limma':
## 
##     plotMA

## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs

## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min

## Loading required package: Biobase

## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.

## Loading required package: IRanges

## Loading required package: S4Vectors

## 
## Attaching package: 'S4Vectors'

## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname

## 
## Attaching package: 'IRanges'

## The following object is masked from 'package:grDevices':
## 
##     windows

##

library(org.Hs.eg.db)

##

Load data

x <- read.csv("D:/ChIP seq and RNA seq analysis/E15 to E17 Foxg1 RNA seq/Foxg1_countsMatrix.csv")

Parse out the counts and gene names

x1 <- x[,-1]
rownames(x1) <- x[,1]
head(x1)

##               Ctrl_Ctx1 Ctrl_Ctx2 Ctrl_Ctx3 Mut_Ctx1 Mut_Ctx2 Mut_Ctx3 Mut_Ctx4
## Zc3h14             2631      2258      2946     2521     3300     1996     2643
## Troap               502       537       522      313      555      269      446
## Clca2                 0         0         1        0        1        0        0
## 1810013L24Rik      1732      1525      1969     2007     2481     2037     1943
## Srsf7              7693      6986      8574     9098    11508     6650     9552
## Fignl2               78        66        71       55       49       35       33
##               Ctrl_HC1 Ctrl_HC2 Ctrl_HC3 Mut_HC1 Mut_HC2 Mut_HC3 Mut_HC4
## Zc3h14            4200     3261     2990    3198    4076    2371    2747
## Troap              799      499      540     469     639     294     438
## Clca2                0        5        2       0       0       0       0
## 1810013L24Rik     2979     2313     1933    2292    2935    1585    2160
## Srsf7            10672     8991     9577   11042   12883    7658    9929
## Fignl2              74       58       25      25      71      15      29

Mention the different kinds of variables in the data in their specific order on the data frame

group <- factor(c(rep("Ctrl_Ctx",3),rep("Mut_Ctx",4), rep("Ctrl_HC", 3), rep("Mut_HC", 4)))

Load the info of counts and variable types as an EdgeR object “DGEList”

y <- DGEList(counts=x1, group=group)

Normalize the counts

y <- calcNormFactors(y)

Mention the variable within which comparison should be done

design <- model.matrix(~group)

Estimating dispersions for quantile-adjusted conditional maximum likelihood

(This analysis is only applicable on datasets with a single-factor design)

y <- estimateDisp(y,design)
y$samples

##              group lib.size norm.factors
## Ctrl_Ctx1 Ctrl_Ctx 25409337    1.0012619
## Ctrl_Ctx2 Ctrl_Ctx 22512259    1.0086813
## Ctrl_Ctx3 Ctrl_Ctx 28159040    0.9874403
## Mut_Ctx1   Mut_Ctx 25341282    0.9888326
## Mut_Ctx2   Mut_Ctx 33507884    1.0093675
## Mut_Ctx3   Mut_Ctx 22649157    0.9646663
## Mut_Ctx4   Mut_Ctx 26313877    0.9750238
## Ctrl_HC1   Ctrl_HC 43825848    0.9671781
## Ctrl_HC2   Ctrl_HC 34783186    1.0089196
## Ctrl_HC3   Ctrl_HC 28933516    1.0368129
## Mut_HC1     Mut_HC 33292670    1.0154158
## Mut_HC2     Mut_HC 40736924    1.0052011
## Mut_HC3     Mut_HC 23466510    1.0015300
## Mut_HC4     Mut_HC 31090524    1.0327587

Plot the biological dispersion estimates

plotBCV(y)

To set a level as a baseline value

y$samples$group <- relevel(y$samples$group, ref = "Ctrl_HC")

##Testing for Differentially Expressed genes

(To be used only for single-factor design datasets)

et <- exactTest(y)
out <- topTags(et, p.value=0.05, n=20000)
head(out, 10)

## Comparison of groups:  Ctrl_Ctx-Ctrl_HC 
##             logFC   logCPM       PValue          FDR
## Nhlh2   -4.079698 6.118688 5.036721e-55 1.270714e-50
## Nr4a3   -3.639890 6.204284 1.263256e-50 1.593534e-46
## Rmst    -3.801514 4.598996 4.868075e-38 4.093889e-34
## Lhx9    -6.147132 3.929907 2.893449e-37 1.824971e-33
## Col2a1  -2.384743 7.299961 2.136243e-36 1.077906e-32
## Slc16a2 -1.516636 6.953510 1.415101e-33 5.950263e-30
## Reln    -2.203911 8.653360 1.653303e-32 5.958740e-29
## Nhlh1   -1.625738 7.294928 1.529751e-31 4.824261e-28
## Klhl35  -1.245108 5.899472 4.462839e-31 1.251033e-27
## Pdlim3  -5.030091 4.917858 7.544504e-30 1.903403e-26

Generalized Linear Models for non-normally distributed data - used for >1 design factor comaparison

Using the quasi-likelihood F-test to get DE genes

1) glmQLFit() - Fits a negative binomial GLM and creates a DGELM object

2) glmQLFTest() - Does the QL F-test

Coefficient 2 means comparison of set 2 with set 1(selected as baseline)

fit <- glmQLFit(y,design)
#tr <- glmTreat(fit, coef=2, lfc=1)
qlf <- glmQLFTest(fit,coef=2)

## To compare 3 vs 1
#qlf.3vs1 <- glmQLFTest(fit, coef=3)

## To compare 3 vs 2
#qlf.3vs2 <- glmQLFTest(fit, contrast=c(0,-1,1,0))

## To compare all 4 elements
#qlf <- glmQLFTest(fit, coef=2:4)

To find top DE Genes between samples (n = Number of top DE genes you want to see)

sup <- topTags(qlf, n=30000)
write.csv(as.data.frame(sup), file="abc_qlf.csv")

If sample has more than single-factor comparisons

Model matrix without setting any sample type as baseline

Create contrasts for pair-wise comparisons

design <- model.matrix(~0+group, data = y$samples)
colnames(design) <- levels(y$samples$group)
my.contrasts <- makeContrasts(CtxMutvsWT=Ctrl_Ctx-Mut_Ctx,
                              HCMutvsWT=Ctrl_HC-Mut_HC,
                              CtxvsHC=Ctrl_Ctx-Ctrl_HC, levels = design)

Perform Quasi-likelihood model test on the samples according to the mentioned pairs

qlf.CtxMutvsWT <- glmQLFTest(fit, contrast = my.contrasts[,'CtxMutvsWT'], )
qlf.HCMutvsWT <- glmQLFTest(fit, contrast = my.contrasts[,'HCMutvsWT'])
qlf.CtxvsHC <- glmQLFTest(fit, contrast = my.contrasts[,'CtxvsHC'])

## Print top DE Genes
sup.Ctx <- topTags(qlf.CtxMutvsWT, n=30000)
sup.HC <- topTags(qlf.HCMutvsWT, n=30000)
sup.CtxvsHC <- topTags(qlf.CtxvsHC, n=30000)

# Write as a .csv file
write.csv(as.data.frame(sup.Ctx), file="Ctx_Ctrl_vs_Mut_qlf.csv")

Alternative

Likelihood Ratio Test

fit <- glmFit(y,design)
colnames(fit)

## [1] "Ctrl_HC"  "Ctrl_Ctx" "Mut_Ctx"  "Mut_HC"

lrt <- glmLRT(fit,coef=2)
sup1 <- topTags(lrt, n=30000)
write.csv(as.data.frame(sup1), file="abc_lrt.csv")

To get Normalised counts for a heatmap

logcpm <- cpm(y, log=TRUE)
# write.csv(logcpm, file = "logCPM_HC_vs_CTX_EdgeR.csv" )

EdgeR

Mahima Bose

2022-07-20

Load data

Parse out the counts and gene names

Mention the different kinds of variables in the data in their specific order on the data frame

Load the info of counts and variable types as an EdgeR object “DGEList”

Normalize the counts

Mention the variable within which comparison should be done

Estimating dispersions for quantile-adjusted conditional maximum likelihood

(This analysis is only applicable on datasets with a single-factor design)

Plot the biological dispersion estimates

To set a level as a baseline value

(To be used only for single-factor design datasets)

Generalized Linear Models for non-normally distributed data - used for >1 design factor comaparison

Using the quasi-likelihood F-test to get DE genes

1) glmQLFit() - Fits a negative binomial GLM and creates a DGELM object

2) glmQLFTest() - Does the QL F-test

Coefficient 2 means comparison of set 2 with set 1(selected as baseline)

To find top DE Genes between samples (n = Number of top DE genes you want to see)

If sample has more than single-factor comparisons

Model matrix without setting any sample type as baseline

Create contrasts for pair-wise comparisons

Perform Quasi-likelihood model test on the samples according to the mentioned pairs

Alternative

Likelihood Ratio Test

To get Normalised counts for a heatmap