knitr::opts_knit$set(root.dir = ".")
These functions with help simultaneously save plots as a png, pdf, and tiff file.
# read in metadata
metadata <-
read.delim(
paste0("../../mouse_comparison/mouse_metadata.txt"),
header = TRUE,
sep = "\t"
)
# subset to only look at WT mice
metadata <- metadata[metadata$genotype == "WT", ]
# read in counts data
counts <- read.delim(
paste0("../../mouse_comparison/mouse_rpkm.txt"),
header = TRUE,
sep = "\t"
)
geneid <- read.delim("../../mouse_comparison/mouse_gene_info.txt",
header = TRUE,
sep = "\t")
rownames(counts) <- geneid$GeneID
gene_biotype <- read.delim(
"../../mouse_comparison/mouse_gene_biotype.txt",
header = TRUE,
sep = "\t"
)
# subset to only look at protein coding genes
genes <-
gene_biotype[gene_biotype$Gene_Biotype == "protein_coding", ]
genes <- gene_biotype
# remove rows in counts file that are not protein coding
counts.genes <- subset(counts, row.names(counts) %in% genes$GeneID)
# remove samples that are not WT
WT_counts <-
subset(
counts.genes,
select = c(
s_SK1_RPKM,
s_SK2_RPKM,
s_SK3_RPKM,
s_SK4_RPKM,
s_SK9_RPKM,
s_SK10_RPKM,
s_SK11_RPKM,
s_SK12_RPKM
)
)
# reorder counts columns to match metadata
WT_counts <- WT_counts[, (metadata$sampleID)]
WT_counts_ordered <-
WT_counts[match(genes$GeneID, rownames(WT_counts)),]
# check columns and rows match up between files
all.equal(rownames(WT_counts_ordered), genes$GeneID)
## [1] TRUE
all.equal(colnames(WT_counts_ordered), (metadata$sampleID))
## [1] TRUE
# create object
dge <- DGEList(counts = WT_counts_ordered,
samples = metadata,
genes = genes)
# set metadata information as factors for downstream use in labeling plots
dge$samples$condition <- as.factor(dge$samples$condition)
saveRDS(dge, file = paste0("../../mouse_comparison/LPS_WT_mouse_gene_raw.rds"))
dim(dge)
## [1] 43346 8
removeMT <- dge$genes$Chr != "chrMT" # true when NOT MT
dge <- dge[removeMT,,keep.lib.sizes = FALSE]
dim(dge)
## [1] 43309 8
dim(dge)
## [1] 43309 8
keepProtein <- dge$genes$Gene_Biotype == "protein_coding" # true when NOT MT
dge <- dge[keepProtein,,keep.lib.sizes = FALSE]
dim(dge)
## [1] 22019 8
This portion won’t display in the R Markdown pdf; the margins are too large. The pdf and png file can only be saved one at a time.
# set colors and get data
group_colors <- c(treatment_color, control_color)[dge$samples$condition]
data <- edgeR::cpm(dge$counts, log = TRUE)
par(bg = 'white')
# plot MDS
plotMDS(
data,
top = 100,
labels = dge$samples$sample,
cex = 0.8,
dim.plot = c(1,2),
plot = TRUE,
col = group_colors
)
title(expression('Top 100 Genes - Raw (Log'[2]~'CPM)'))
legend(
"bottom",
legend = c(control, treatment),
pch = 16,
col = c(control_color, treatment_color),
cex = 0.8
)
# save
path <- paste0("../../mouse_comparison/results/LPS_WT_gene_MDS_raw")
saveToPDF(paste0(path, ".pdf"), width = 4, height = 4)
## png
## 2
The filterByExpr() function in the edgeR package determines which genes have a great enough count value to keep. We will filter by group. This means at least 4 samples (4 is the smallest group sample size) must express a minimum count of 5 (in cpm, this is less than the default value of 10).
dim(dge)
## [1] 22019 8
dge$counts <- edgeR::cpm(dge$counts)
keep.expr <- filterByExpr(dge, group = dge$samples$condition, min.count = 5)
dge.filtered <- dge[keep.expr, , keep.lib.sizes = FALSE]
dim(dge.filtered)
## [1] 12479 8
# Now, normalization by the method of trimmed mean of M-values (TMM)
dge.filtered.norm <- calcNormFactors(dge.filtered, method = "TMM")
# norm factor summary
summary(dge.filtered.norm$samples$norm.factors)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.9907 0.9925 0.9943 1.0001 1.0043 1.0190
Density plots of log-intensity distribution of each library can be superposed on a single graph for a better comparison between libraries and for identification of libraries with weird distribution.
# set graphical parameter
par(mfrow = c(1,3))
# Normalize data for library size and expression intesntiy
log2cpm.tech <- edgeR::cpm(dge, log = TRUE)
log2cpm.filtered <- edgeR::cpm(dge.filtered, log = TRUE)
log2cpm.norm <- edgeR::cpm(dge.filtered.norm, log = TRUE)
# set colors
colors <- c("red","orange","green","yellow","blue","purple",
"lightgray","brown","pink","cyan")
nsamples <- ncol(dge)
# First, plot the first column of the log2cpm.tech density
plot(density(log2cpm.tech[,1]), col = colors[1], lwd = 2, ylim = c(0,0.5),
las = 2, main = "A. Raw", xlab = expression('Log'[2]~CPM))
# For each sample plot the lcpm density
for (i in 2:nsamples){
den <- density(log2cpm.tech[,i]) #subset each column
lines(den$x, den$y, col = colors[i], lwd = 2)
}
# Second, plot log2cpm.filtered
plot(density(log2cpm.filtered[,1]), col = colors[1], lwd = 2, ylim = c(0,0.5),
las = 2, main = "B. Filtered", xlab = expression('Log'[2]~CPM))
abline(v = edgeR::cpm(3, log = TRUE), lty = 3)
for (i in 2:nsamples) {
den <- density(log2cpm.filtered[,i])
lines(den$x, den$y, col = colors[i], lwd = 2)
}
# Third, plot log2cpm.norm
plot(density(log2cpm.norm[,1]), col = colors[1], lwd = 2, ylim = c(0,0.5),
las = 2, main = "C. Normalized", xlab = expression('Log'[2]~CPM))
abline(v = edgeR::cpm(3, log = TRUE), lty = 3)
for (i in 2:nsamples) {
den <- density(log2cpm.norm[,i])
lines(den$x, den$y, col = colors[i], lwd = 2)
}
# save
path <- paste0("../../mouse_comparison/results/LPS_WT_gene_density")
saveToPDF(paste0(path, ".pdf"), width = 6, height = 4)
## png
## 2
# set parameters
par(mfrow = c(1,3))
# First look at dge.tech
boxplot(log2cpm.tech,
main="A. Raw",
xlab="",
ylab=expression('Counts per gene (Log'[2]~'CPM)'),
axes=FALSE,
col = colors
)
axis(2) # 2 = left
axis(1, # 1 = below
at = 1:nsamples, # points at which tick-marks should be drawn
labels = colnames(log2cpm.tech),
las = 2,
cex.axis = 0.8 # size of axis
)
# Second, look at dge.filtered
boxplot(log2cpm.filtered,
main="B. Filtered",
xlab="",
ylab=expression('Counts per gene (Log'[2]~'CPM)'),
axes=FALSE,
col = colors
)
axis(2)
axis(1, at=1:nsamples,labels=colnames(log2cpm.filtered),las=2,cex.axis=0.8)
# Third, look at dge.norm
boxplot(log2cpm.norm,
main="C. Normalized",
xlab="",
ylab=expression('Counts per gene (Log'[2]~'CPM)'),
axes=FALSE,
col = colors)
axis(2)
axis(1,at=1:nsamples,labels=colnames(log2cpm.norm),las=2,cex.axis=0.8)
# save
path <- paste0("../../mouse_comparison/results/LPS_WT_gene_boxplot")
saveToPDF(paste0(path, ".pdf"), width = 6, height = 4)
## png
## 2
group <- interaction(dge.filtered.norm$samples$condition)
design <- model.matrix(~ 0 + group)
colnames(design) <- c(treatment,control)
design
## LPS Saline
## 1 0 1
## 2 0 1
## 3 0 1
## 4 0 1
## 5 1 0
## 6 1 0
## 7 1 0
## 8 1 0
## attr(,"assign")
## [1] 1 1
## attr(,"contrasts")
## attr(,"contrasts")$group
## [1] "contr.treatment"
# voom transform counts
v <- voomWithQualityWeights(dge.filtered.norm,
design,
plot = TRUE)
# fits linear model for each gene given a series of arrays
fit <- lmFit(v, design)
# contrast design for differential expression
contrasts <- makeContrasts(
title = myContrasts, # myContrasts was user input from beginning
levels = colnames(design))
head(contrasts)
## Contrasts
## Levels LPS - Saline
## LPS 1
## Saline -1
# save contrast names
allComparisons <- colnames(contrasts)
allComparisons # check
## [1] "LPS - Saline"
# run contrast analysis
vfit <- contrasts.fit(fit, contrasts = contrasts)
# Compute differential expression based on the empirical Bayes moderation of the
# standard errors towards a common value.
veBayesFit <- eBayes(vfit)
plotSA(veBayesFit, main = "Final Model: Mean-variance Trend")
group_colors <- c(treatment_color, control_color)[v$targets$condition]
names <- v$targets$sample
plotMDS(
v,
top = 100,
labels = names,
cex = 1,
dim.plot = c(1,2),
plot = TRUE,
col = group_colors
)
title(expression('Top 100 Genes - Voom (Log'[2]~'CPM)'))
legend(
"bottom",
legend = c(control, treatment),
pch = 16,
col = c(control_color, treatment_color),
cex = 1
)
path <- paste0("../../mouse_comparison/results/LPS_WT_gene_MDS_normalized")
saveRDS(v, file = paste0("../../mouse_comparison/LPS_WT_gene_voom.rds"))
Identify number of differentially expressed genes.
pval <- 0.05
sumTable <-
summary(decideTests(
vfit, # object
adjust.method = "BH", # by default the method = "separate"
p.value = pval,
lfc = 0 # numeric, minimum absolute log2-fold change required
))
print(paste0(tissue, " FDRq < ", pval))
## [1] "Brain FDRq < 0.05"
head(sumTable)
## LPS - Saline
## Down 2335
## NotSig 7766
## Up 2378
Read and save table with all genes (FDRq = 1).
Assign colors values based on FDRq cutoff of 0.05.
Subset the top 10 up and down-regulated genes
hadjpval <- (-log10(max(
treatment_vs_control$P.Value[treatment_vs_control$adj.P.Val < 0.05],
na.rm=TRUE)))
p_vol <-
ggplot(data = treatment_vs_control,
aes(x = logFC, # x-axis is logFC
y = -log10(P.Value), # y-axis will be -log10 of P.Value
color = color_p0.05)) + # color is based on factored color column
geom_point(alpha = 0.8, size = 1.7) + # create scatterplot, alpha makes points transparent
theme_bw() + # set color theme
theme(legend.position = "none") + # no legend
scale_color_manual(values = c("red", "blue","grey")) + # set factor colors
labs(
title = "", # no main title
x = expression(log[2](FC)), # x-axis title
y = expression(-log[10] ~ "(" ~ italic("p") ~ "-value)") # y-axis title
) +
theme(axis.title.x = element_text(size = 10),
axis.text.x = element_text(size = 10)) +
theme(axis.title.y = element_text(size = 10),
axis.text.y = element_text(size = 10)) +
geom_hline(yintercept = hadjpval, # horizontal line
colour = "#000000",
linetype = "dashed") +
ggtitle(paste0("WT LPS vs WT Saline\nFDRq < 0.05")) +
theme(plot.title = element_text(size = 10)) +
geom_text_repel(data = up10,
aes(x = logFC, y= -log10(P.Value), label = GeneName),
color = "maroon",
fontface="italic",
size = 3,
max.overlaps = getOption("ggrepel.max.overlaps", default = 30)
) +
geom_text_repel(data = down10,
aes(x = logFC, y= -log10(P.Value), label = GeneName),
color = "navyblue",
fontface="italic",
size = 3,
max.overlaps = getOption("ggrepel.max.overlaps", default = 15)
)
p_vol
# save
path <- paste0("../../results/", tool, "/volcano/", treatment, "_",tolower(tissue),"_gene_volcano_FDRq0.05")
saveToPDF(paste0(path, ".pdf"), width = 8, height = 6)
## png
## 2
# Kang results
Kang <-
read.delim("../../mouse_comparison/mouse_WT-LPS_over_WT.txt")
mouse_limma <-
read.delim("../../mouse_comparison/results/LPS_WT_gene_DEGs_FDRq0.05.txt")
# Venn diagram to see what is gained and lost between the reanalysis
x = list(Kang = Kang$GeneName, rerun_mouse = mouse_limma$GeneName)
ggvenn(
x,
fill_color = c("#0073C2FF", "#EFC000FF", "pink"),
stroke_size = 2,
set_name_size = 2
)
path <- paste0("../../mouse_comparison/results/Venn")
saveToPDF(paste0(path, ".pdf"), width = 8, height = 8)
## png
## 2
# what is unique to Kang
GeneName <- setdiff(Kang$GeneName, mouse_limma$GeneName)
Kang_unique_df <- as.data.frame(GeneName)
# get the biotype information for the genes unique to Kang
Kang_unique_df_biotype <-
subset(gene_biotype, GeneName %in% Kang_unique_df$GeneName)
table(Kang_unique_df_biotype$Gene_Biotype)
##
## 3prime_overlapping_ncrna antisense
## 1 85
## IG_C_gene lincRNA
## 1 92
## miRNA misc_RNA
## 25 4
## polymorphic_pseudogene processed_pseudogene
## 4 25
## processed_transcript protein_coding
## 52 715
## pseudogene sense_intronic
## 6 1
## snoRNA snRNA
## 7 3
## TEC transcribed_processed_pseudogene
## 22 6
## transcribed_unprocessed_pseudogene unitary_pseudogene
## 5 1
## unprocessed_pseudogene
## 11
remove(GeneName)
# what is unique to the re-analysis
GeneName <- setdiff(mouse_limma$GeneName, Kang$GeneName)
mouse_limma_unique_df <- as.data.frame(GeneName)
# get the biotype information for the genes unique to Kang
mouse_limma_unique_df_biotype <-
subset(gene_biotype, GeneName %in% mouse_limma_unique_df$GeneName)
table(mouse_limma_unique_df_biotype$Gene_Biotype)
##
## 3prime_overlapping_ncrna protein_coding
## 1 1947
```r
sessionInfo()
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Red Hat Enterprise Linux Server 7.9 (Maipo)
##
## Matrix products: default
## BLAS: /usr/lib64/libblas.so.3.4.2
## LAPACK: /usr/lib64/liblapack.so.3.4.2
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] grid stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] gtools_3.9.2.2 ggvenn_0.1.9 reshape_0.8.9
## [4] variancePartition_1.24.1 stringr_1.4.0 rtracklayer_1.54.0
## [7] GenomicRanges_1.46.1 GenomeInfoDb_1.30.1 IRanges_2.28.0
## [10] S4Vectors_0.32.4 BiocGenerics_0.40.0 philentropy_0.6.0
## [13] gplots_3.1.3 ggrepel_0.9.1 ggplot2_3.3.6
## [16] edgeR_3.36.0 limma_3.50.3 dplyr_1.0.9
## [19] BiocParallel_1.28.3
##
## loaded via a namespace (and not attached):
## [1] nlme_3.1-153 pbkrtest_0.5.1
## [3] bitops_1.0-7 matrixStats_0.62.0
## [5] progress_1.2.2 doParallel_1.0.17
## [7] backports_1.4.1 tools_4.1.2
## [9] bslib_0.3.1 utf8_1.2.2
## [11] R6_2.5.1 KernSmooth_2.23-20
## [13] DBI_1.1.3 colorspace_2.0-3
## [15] withr_2.5.0 prettyunits_1.1.1
## [17] tidyselect_1.1.2 compiler_4.1.2
## [19] cli_3.3.0 Biobase_2.54.0
## [21] DelayedArray_0.20.0 labeling_0.4.2
## [23] sass_0.4.1 caTools_1.18.2
## [25] scales_1.2.0 digest_0.6.29
## [27] Rsamtools_2.10.0 minqa_1.2.4
## [29] rmarkdown_2.14 aod_1.3.2
## [31] RhpcBLASctl_0.21-247.1 XVector_0.34.0
## [33] pkgconfig_2.0.3 htmltools_0.5.2
## [35] lme4_1.1-29 MatrixGenerics_1.6.0
## [37] highr_0.9 fastmap_1.1.0
## [39] rlang_1.0.4 rstudioapi_0.13
## [41] farver_2.1.1 jquerylib_0.1.4
## [43] BiocIO_1.4.0 generics_0.1.3
## [45] jsonlite_1.8.0 RCurl_1.98-1.7
## [47] magrittr_2.0.3 GenomeInfoDbData_1.2.7
## [49] Matrix_1.3-4 Rcpp_1.0.9
## [51] munsell_0.5.0 fansi_1.0.3
## [53] lifecycle_1.0.1 stringi_1.7.8
## [55] yaml_2.3.5 MASS_7.3-54
## [57] SummarizedExperiment_1.24.0 zlibbioc_1.40.0
## [59] plyr_1.8.7 parallel_4.1.2
## [61] crayon_1.5.1 lattice_0.20-45
## [63] Biostrings_2.62.0 splines_4.1.2
## [65] hms_1.1.1 locfit_1.5-9.5
## [67] knitr_1.39 pillar_1.7.0
## [69] boot_1.3-28 rjson_0.2.21
## [71] reshape2_1.4.4 codetools_0.2-18
## [73] XML_3.99-0.10 glue_1.6.2
## [75] evaluate_0.15 nloptr_2.0.3
## [77] vctrs_0.4.1 Rdpack_2.3.1
## [79] foreach_1.5.2 tidyr_1.2.0
## [81] gtable_0.3.0 purrr_0.3.4
## [83] assertthat_0.2.1 xfun_0.31
## [85] rbibutils_2.2.8 broom_0.8.0
## [87] restfulr_0.0.15 tibble_3.1.7
## [89] iterators_1.0.14 GenomicAlignments_1.30.0
## [91] ellipsis_0.3.2