- Read in “RNAseq_DGEdata_BI777.txt” to R as a dataframe. Set the
first column of this table to the row names. What are the dimensions of
this dataframe? What are the names of each column? What are the count
values in the first row?
#Read in the file and set the first column as row names
df <- read.table("RNAseq_DGEdata_BI777.txt",
header = TRUE,
sep = "\t",
row.names = 1)
#Determine the dimensions of df
dim(df)
## [1] 28395 12
#Get column names
colnames(df)
## [1] "KO1" "KO2" "KO3" "KO4" "KO5" "KO6" "WT1" "WT2" "WT3" "WT4" "WT5" "WT6"
#Count values in the first row
df[1, ]
## KO1 KO2 KO3 KO4 KO5 KO6 WT1 WT2 WT3 WT4 WT5 WT6
## 100287102 3 0 4 0 8 12 0 0 8 0 2 4
- Use EdgeR to perform DGE on the two experimental conditions. Use
edgeR to filter genes by expression prior to DGE analysis (hint: user
manual). Perform quasi-likelihood F-tests to make DGE predictions. Make
a MDplot from this analysis and include it here as your answer. (2
points)
#Install packages
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("edgeR")
## Bioconductor version 3.20 (BiocManager 1.30.26), R 4.4.3 (2025-02-28)
## Warning: package(s) not installed when version(s) same as or greater than current; use
## `force = TRUE` to re-install: 'edgeR'
## Installation paths not writeable, unable to update packages
## path: /Library/Frameworks/R.framework/Versions/4.4-x86_64/Resources/library
## packages:
## boot, cli, cluster, colorspace, foreign, generics, ggplot2, lattice,
## magrittr, MASS, Matrix, mgcv, nlme, pillar, rlang, scales, stringi,
## stringr, tibble, utf8
library(edgeR)
## Loading required package: limma
#Group by KO vs. WT
group <- factor(c("KO", "KO", "KO","KO", "KO", "KO",
"WT", "WT", "WT", "WT", "WT", "WT"))
#Create a DGEList
y <- DGEList(counts = df, group = group)
#Filter lowly expressed genes
keep <- filterByExpr(y)
y <- y[keep, , keep.lib.sizes = FALSE]
#Normalize for library size differences
y <- calcNormFactors(y)
#Create design matrix for the two groups
design <- model.matrix(~ group)
#Estimate dispersion
y <- estimateDisp(y, design)
#Fit quasi-likelihood model and perform F-test
fit <- glmQLFit(y, design)
qlf <- glmQLFTest(fit)
# 9. Get top differentially expressed genes
topTags(qlf)
## Coefficient: groupWT
## logFC logCPM F PValue FDR
## 5099 9.087690 4.4852459 632.6136 5.533359e-17 8.313873e-13
## 1008 7.226762 2.3360938 384.2306 3.248357e-14 2.440328e-10
## 10869 4.372842 4.6195166 824.0977 6.273449e-14 3.141952e-10
## 8344 6.429611 3.6652389 361.2175 5.302052e-13 1.991583e-09
## 5787 6.074886 5.2080159 516.5234 1.675926e-12 5.036157e-09
## 7216 6.669355 2.0452501 216.4417 2.061027e-12 5.161154e-09
## 126129 7.421371 2.4947218 238.3306 2.478196e-12 5.319271e-09
## 23705 -5.146252 3.9966870 260.6951 3.382936e-11 6.353576e-08
## 79993 4.557351 0.1891408 137.3058 9.999903e-11 1.503523e-07
## 203447 9.091808 3.0090575 365.4763 1.004212e-10 1.503523e-07
# 10. Make an MD plot (log-fold-change vs average expression)
plotMD(qlf, main = "MD Plot: Quasi-likelihood F-test, KO vs. WT", ylim = c(-10, 10))
abline(h = c(-1, 1), col = "blue")

- From your edgeR analysis include the top 10 differentially expressed
genes between these two samples. Include: gene name, logFC, logCPM, and
FDR (2 points).
#Extracting top 10 DEGs from qlf object
top10 <- topTags(qlf, n = 10)$table[, c("logFC", "logCPM", "FDR")]
# Add gene names as a column (from rownames)
top10 <- cbind(Gene = rownames(top10), top10)
#View the table
print(top10)
## Gene logFC logCPM FDR
## 5099 5099 9.087690 4.4852459 8.313873e-13
## 1008 1008 7.226762 2.3360938 2.440328e-10
## 10869 10869 4.372842 4.6195166 3.141952e-10
## 8344 8344 6.429611 3.6652389 1.991583e-09
## 5787 5787 6.074886 5.2080159 5.036157e-09
## 7216 7216 6.669355 2.0452501 5.161154e-09
## 126129 126129 7.421371 2.4947218 5.319271e-09
## 23705 23705 -5.146252 3.9966870 6.353576e-08
## 79993 79993 4.557351 0.1891408 1.503523e-07
## 203447 203447 9.091808 3.0090575 1.503523e-07