Setup

knitr::opts_knit$set(root.dir = ".")

User defined variables

Save functions

These functions with help simultaneously save plots as a png, pdf, and tiff file.

Read data

# read in metadata
metadata <-
  read.delim(
    paste0("../../mouse_comparison/mouse_metadata.txt"),
    header = TRUE,
    sep = "\t"
  )
# subset to only look at WT mice
metadata <- metadata[metadata$genotype == "WT", ]

# read in counts data
counts <- read.delim(
  paste0("../../mouse_comparison/mouse_rpkm.txt"),
  header = TRUE,
  sep = "\t"
)
geneid <- read.delim("../../mouse_comparison/mouse_gene_info.txt",
                     header = TRUE,
                     sep = "\t")
rownames(counts) <- geneid$GeneID

gene_biotype <- read.delim(
  "../../mouse_comparison/mouse_gene_biotype.txt",
  header = TRUE,
  sep = "\t"
)

# subset to only look at protein coding genes
genes <-
  gene_biotype[gene_biotype$Gene_Biotype == "protein_coding", ]
genes <- gene_biotype
# remove rows in counts file that are not protein coding
counts.genes <- subset(counts, row.names(counts) %in% genes$GeneID)
# remove samples that are not WT
WT_counts <-
  subset(
    counts.genes,
    select = c(
      s_SK1_RPKM,
      s_SK2_RPKM,
      s_SK3_RPKM,
      s_SK4_RPKM,
      s_SK9_RPKM,
      s_SK10_RPKM,
      s_SK11_RPKM,
      s_SK12_RPKM
    )
  )

# reorder counts columns to match metadata
WT_counts <- WT_counts[, (metadata$sampleID)]
WT_counts_ordered <-
  WT_counts[match(genes$GeneID, rownames(WT_counts)),]

# check columns and rows match up between files
all.equal(rownames(WT_counts_ordered), genes$GeneID)

## [1] TRUE

all.equal(colnames(WT_counts_ordered), (metadata$sampleID))

## [1] TRUE

Create DGE object

# create object
dge <- DGEList(counts = WT_counts_ordered,
               samples = metadata,
               genes = genes)

# set metadata information as factors for downstream use in labeling plots 
dge$samples$condition <- as.factor(dge$samples$condition)

saveRDS(dge, file = paste0("../../mouse_comparison/LPS_WT_mouse_gene_raw.rds"))

Remove mitochondrial genes

dim(dge)

## [1] 43346     8

removeMT <- dge$genes$Chr != "chrMT"  # true when NOT MT
dge <- dge[removeMT,,keep.lib.sizes = FALSE]
dim(dge)

## [1] 43309     8

Keep only protein coding genes

dim(dge)

## [1] 43309     8

keepProtein <- dge$genes$Gene_Biotype == "protein_coding"  # true when NOT MT
dge <- dge[keepProtein,,keep.lib.sizes = FALSE]
dim(dge)

## [1] 22019     8

JSD heatmap

This portion won’t display in the R Markdown pdf; the margins are too large. The pdf and png file can only be saved one at a time.

Raw MDS with technical replicates

# set colors and get data
group_colors <- c(treatment_color, control_color)[dge$samples$condition]
data <- edgeR::cpm(dge$counts, log = TRUE)
par(bg = 'white')

# plot MDS
plotMDS(
  data, 
  top = 100, 
  labels = dge$samples$sample,
  cex = 0.8, 
  dim.plot = c(1,2), 
  plot = TRUE, 
  col = group_colors
)

title(expression('Top 100 Genes - Raw (Log'[2]~'CPM)'))

legend(
  "bottom",
  legend = c(control, treatment),
  pch = 16,
  col = c(control_color, treatment_color),
  cex = 0.8
)

# save
path <- paste0("../../mouse_comparison/results/LPS_WT_gene_MDS_raw")
saveToPDF(paste0(path, ".pdf"), width = 4, height = 4)

## png 
##   2

Filter lowly expressed genes

The filterByExpr() function in the edgeR package determines which genes have a great enough count value to keep. We will filter by group. This means at least 4 samples (4 is the smallest group sample size) must express a minimum count of 5 (in cpm, this is less than the default value of 10).

dim(dge)

## [1] 22019     8

dge$counts <- edgeR::cpm(dge$counts)
keep.expr <- filterByExpr(dge, group = dge$samples$condition, min.count = 5)

dge.filtered <- dge[keep.expr, , keep.lib.sizes = FALSE]
dim(dge.filtered)

## [1] 12479     8

TMM normalization

# Now, normalization by the method of trimmed mean of M-values (TMM)
dge.filtered.norm <- calcNormFactors(dge.filtered, method = "TMM")

# norm factor summary
summary(dge.filtered.norm$samples$norm.factors)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.9907  0.9925  0.9943  1.0001  1.0043  1.0190

Density plot

Density plots of log-intensity distribution of each library can be superposed on a single graph for a better comparison between libraries and for identification of libraries with weird distribution.

# set graphical parameter
par(mfrow = c(1,3))

# Normalize data for library size and expression intesntiy
log2cpm.tech <- edgeR::cpm(dge, log = TRUE)
log2cpm.filtered <- edgeR::cpm(dge.filtered, log = TRUE)
log2cpm.norm <- edgeR::cpm(dge.filtered.norm, log = TRUE)

# set colors
colors <- c("red","orange","green","yellow","blue","purple", 
            "lightgray","brown","pink","cyan")
nsamples <- ncol(dge)

# First, plot the first column of the log2cpm.tech density
plot(density(log2cpm.tech[,1]), col = colors[1], lwd = 2, ylim = c(0,0.5), 
     las = 2, main = "A. Raw", xlab = expression('Log'[2]~CPM))

# For each sample plot the lcpm density
for (i in 2:nsamples){
  den <- density(log2cpm.tech[,i]) #subset each column
  lines(den$x, den$y, col = colors[i], lwd = 2) 
}


# Second, plot log2cpm.filtered
plot(density(log2cpm.filtered[,1]), col = colors[1], lwd = 2, ylim = c(0,0.5), 
     las = 2, main = "B. Filtered", xlab = expression('Log'[2]~CPM))
abline(v = edgeR::cpm(3, log = TRUE), lty = 3)
for (i in 2:nsamples) {
  den <- density(log2cpm.filtered[,i])
  lines(den$x, den$y, col = colors[i], lwd = 2)
}

# Third, plot log2cpm.norm
plot(density(log2cpm.norm[,1]), col = colors[1], lwd = 2, ylim = c(0,0.5), 
     las = 2, main = "C. Normalized", xlab = expression('Log'[2]~CPM))
abline(v = edgeR::cpm(3, log = TRUE), lty = 3)
for (i in 2:nsamples) {
  den <- density(log2cpm.norm[,i])
  lines(den$x, den$y, col = colors[i], lwd = 2)
}

# save
path <- paste0("../../mouse_comparison/results/LPS_WT_gene_density")
saveToPDF(paste0(path, ".pdf"), width = 6, height = 4)

## png 
##   2

Boxplots

# set parameters
par(mfrow = c(1,3))

# First look at dge.tech
boxplot(log2cpm.tech, 
        main="A. Raw", 
        xlab="", 
        ylab=expression('Counts per gene (Log'[2]~'CPM)'),
        axes=FALSE,
        col = colors
        )
axis(2) # 2 = left 
axis(1, # 1 = below 
     at = 1:nsamples, # points at which tick-marks should be drawn
     labels = colnames(log2cpm.tech),
     las = 2,
     cex.axis = 0.8 # size of axis
     )

# Second, look at dge.filtered
boxplot(log2cpm.filtered, 
        main="B. Filtered", 
        xlab="", 
        ylab=expression('Counts per gene (Log'[2]~'CPM)'),
        axes=FALSE,
        col = colors
        )
axis(2)
axis(1, at=1:nsamples,labels=colnames(log2cpm.filtered),las=2,cex.axis=0.8)

# Third, look at dge.norm
boxplot(log2cpm.norm, 
        main="C. Normalized", 
        xlab="", 
        ylab=expression('Counts per gene (Log'[2]~'CPM)'),
        axes=FALSE,
        col = colors)
axis(2)
axis(1,at=1:nsamples,labels=colnames(log2cpm.norm),las=2,cex.axis=0.8)

# save
path <- paste0("../../mouse_comparison/results/LPS_WT_gene_boxplot")
saveToPDF(paste0(path, ".pdf"), width = 6, height = 4)

## png 
##   2

Design matrix

group <- interaction(dge.filtered.norm$samples$condition)

design <- model.matrix(~ 0 + group)
colnames(design) <- c(treatment,control)

design

##   LPS Saline
## 1   0      1
## 2   0      1
## 3   0      1
## 4   0      1
## 5   1      0
## 6   1      0
## 7   1      0
## 8   1      0
## attr(,"assign")
## [1] 1 1
## attr(,"contrasts")
## attr(,"contrasts")$group
## [1] "contr.treatment"

Voom

# voom transform counts
v <- voomWithQualityWeights(dge.filtered.norm,
                            design,
                            plot = TRUE)

# fits linear model for each gene given a series of arrays
fit <- lmFit(v, design)

# contrast design for differential expression
contrasts <- makeContrasts(
  title = myContrasts,  # myContrasts was user input from beginning
  levels = colnames(design))
head(contrasts)

##         Contrasts
## Levels   LPS - Saline
##   LPS               1
##   Saline           -1

# save contrast names
allComparisons <- colnames(contrasts)
allComparisons # check

## [1] "LPS - Saline"

# run contrast analysis
vfit <- contrasts.fit(fit, contrasts = contrasts)

# Compute differential expression based on the empirical Bayes moderation of the
# standard errors towards a common value.
veBayesFit <- eBayes(vfit)
plotSA(veBayesFit, main = "Final Model: Mean-variance Trend")

Voom MDS Plot

group_colors <- c(treatment_color, control_color)[v$targets$condition]
names <- v$targets$sample

plotMDS(
  v, 
  top = 100, 
  labels = names,
  cex = 1, 
  dim.plot = c(1,2), 
  plot = TRUE, 
  col = group_colors
)

title(expression('Top 100 Genes - Voom (Log'[2]~'CPM)'))

legend(
  "bottom",
  legend = c(control, treatment),
  pch = 16,
  col = c(control_color, treatment_color),
  cex = 1
)

path <- paste0("../../mouse_comparison/results/LPS_WT_gene_MDS_normalized")

saveRDS(v, file = paste0("../../mouse_comparison/LPS_WT_gene_voom.rds"))

Number of DEGs

Identify number of differentially expressed genes.

pval <- 0.05

sumTable <- 
  summary(decideTests(
    vfit,  # object
    adjust.method = "BH", # by default the method = "separate"
    p.value = pval,
    lfc = 0  # numeric, minimum absolute log2-fold change required
  ))

print(paste0(tissue, " FDRq < ", pval))

## [1] "Brain FDRq < 0.05"

head(sumTable)

##        LPS - Saline
## Down           2335
## NotSig         7766
## Up             2378

Output DEG tables

Read and save table with all genes (FDRq = 1).

Assign colors

Assign colors values based on FDRq cutoff of 0.05.

Subset genes to label

Subset the top 10 up and down-regulated genes

Volcano plot

hadjpval <- (-log10(max(
  treatment_vs_control$P.Value[treatment_vs_control$adj.P.Val < 0.05], 
  na.rm=TRUE)))

p_vol <-
  ggplot(data = treatment_vs_control, 
         aes(x = logFC,  # x-axis is logFC
             y = -log10(P.Value),  # y-axis will be -log10 of P.Value
             color = color_p0.05)) +  # color is based on factored color column
  geom_point(alpha = 0.8, size = 1.7) +  # create scatterplot, alpha makes points transparent
  theme_bw() +  # set color theme
  theme(legend.position = "none") +  # no legend
  scale_color_manual(values = c("red", "blue","grey")) +  # set factor colors
  labs(
    title = "", # no main title
    x = expression(log[2](FC)), # x-axis title
    y = expression(-log[10] ~ "(" ~ italic("p") ~ "-value)") # y-axis title
  ) +
  theme(axis.title.x = element_text(size = 10),
        axis.text.x = element_text(size = 10)) +
  theme(axis.title.y = element_text(size = 10),
        axis.text.y = element_text(size = 10)) +
  geom_hline(yintercept = hadjpval,  #  horizontal line
                     colour = "#000000",
                     linetype = "dashed") +
  ggtitle(paste0("WT LPS vs WT Saline\nFDRq < 0.05")) +
  theme(plot.title = element_text(size = 10)) +
  geom_text_repel(data = up10,
                  aes(x = logFC, y= -log10(P.Value), label = GeneName), 
                  color = "maroon", 
                  fontface="italic",
                  size = 3, 
                  max.overlaps = getOption("ggrepel.max.overlaps", default = 30)
                  ) +
  geom_text_repel(data = down10,
                  aes(x = logFC, y= -log10(P.Value), label = GeneName), 
                  color = "navyblue", 
                  fontface="italic",
                  size = 3, 
                  max.overlaps = getOption("ggrepel.max.overlaps", default = 15)
  )

p_vol

# save
path <- paste0("../../results/", tool, "/volcano/", treatment, "_",tolower(tissue),"_gene_volcano_FDRq0.05")
saveToPDF(paste0(path, ".pdf"), width = 8, height = 6)

## png 
##   2

Compare between Kang et al. and the re-analysis

# Kang results
Kang <-
  read.delim("../../mouse_comparison/mouse_WT-LPS_over_WT.txt")
mouse_limma <-
  read.delim("../../mouse_comparison/results/LPS_WT_gene_DEGs_FDRq0.05.txt")

# Venn diagram to see what is gained and lost between the reanalysis
x = list(Kang = Kang$GeneName, rerun_mouse = mouse_limma$GeneName)
ggvenn(
  x,
  fill_color = c("#0073C2FF", "#EFC000FF", "pink"),
  stroke_size = 2,
  set_name_size = 2
)

path <- paste0("../../mouse_comparison/results/Venn")
saveToPDF(paste0(path, ".pdf"), width = 8, height = 8)

## png 
##   2

# what is unique to Kang
GeneName <- setdiff(Kang$GeneName, mouse_limma$GeneName)
Kang_unique_df <- as.data.frame(GeneName)
# get the biotype information for the genes unique to Kang
Kang_unique_df_biotype <-
  subset(gene_biotype, GeneName %in% Kang_unique_df$GeneName)
table(Kang_unique_df_biotype$Gene_Biotype)

## 
##           3prime_overlapping_ncrna                          antisense 
##                                  1                                 85 
##                          IG_C_gene                            lincRNA 
##                                  1                                 92 
##                              miRNA                           misc_RNA 
##                                 25                                  4 
##             polymorphic_pseudogene               processed_pseudogene 
##                                  4                                 25 
##               processed_transcript                     protein_coding 
##                                 52                                715 
##                         pseudogene                     sense_intronic 
##                                  6                                  1 
##                             snoRNA                              snRNA 
##                                  7                                  3 
##                                TEC   transcribed_processed_pseudogene 
##                                 22                                  6 
## transcribed_unprocessed_pseudogene                 unitary_pseudogene 
##                                  5                                  1 
##             unprocessed_pseudogene 
##                                 11

remove(GeneName)
# what is unique to the re-analysis
GeneName <- setdiff(mouse_limma$GeneName, Kang$GeneName)
mouse_limma_unique_df <- as.data.frame(GeneName)
# get the biotype information for the genes unique to Kang
mouse_limma_unique_df_biotype <-
  subset(gene_biotype, GeneName %in% mouse_limma_unique_df$GeneName)
table(mouse_limma_unique_df_biotype$Gene_Biotype)

## 
## 3prime_overlapping_ncrna           protein_coding 
##                        1                     1947

Interactive Glimma



```r
sessionInfo()

## R version 4.1.2 (2021-11-01)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Red Hat Enterprise Linux Server 7.9 (Maipo)
## 
## Matrix products: default
## BLAS:   /usr/lib64/libblas.so.3.4.2
## LAPACK: /usr/lib64/liblapack.so.3.4.2
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] grid      stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] gtools_3.9.2.2           ggvenn_0.1.9             reshape_0.8.9           
##  [4] variancePartition_1.24.1 stringr_1.4.0            rtracklayer_1.54.0      
##  [7] GenomicRanges_1.46.1     GenomeInfoDb_1.30.1      IRanges_2.28.0          
## [10] S4Vectors_0.32.4         BiocGenerics_0.40.0      philentropy_0.6.0       
## [13] gplots_3.1.3             ggrepel_0.9.1            ggplot2_3.3.6           
## [16] edgeR_3.36.0             limma_3.50.3             dplyr_1.0.9             
## [19] BiocParallel_1.28.3     
## 
## loaded via a namespace (and not attached):
##  [1] nlme_3.1-153                pbkrtest_0.5.1             
##  [3] bitops_1.0-7                matrixStats_0.62.0         
##  [5] progress_1.2.2              doParallel_1.0.17          
##  [7] backports_1.4.1             tools_4.1.2                
##  [9] bslib_0.3.1                 utf8_1.2.2                 
## [11] R6_2.5.1                    KernSmooth_2.23-20         
## [13] DBI_1.1.3                   colorspace_2.0-3           
## [15] withr_2.5.0                 prettyunits_1.1.1          
## [17] tidyselect_1.1.2            compiler_4.1.2             
## [19] cli_3.3.0                   Biobase_2.54.0             
## [21] DelayedArray_0.20.0         labeling_0.4.2             
## [23] sass_0.4.1                  caTools_1.18.2             
## [25] scales_1.2.0                digest_0.6.29              
## [27] Rsamtools_2.10.0            minqa_1.2.4                
## [29] rmarkdown_2.14              aod_1.3.2                  
## [31] RhpcBLASctl_0.21-247.1      XVector_0.34.0             
## [33] pkgconfig_2.0.3             htmltools_0.5.2            
## [35] lme4_1.1-29                 MatrixGenerics_1.6.0       
## [37] highr_0.9                   fastmap_1.1.0              
## [39] rlang_1.0.4                 rstudioapi_0.13            
## [41] farver_2.1.1                jquerylib_0.1.4            
## [43] BiocIO_1.4.0                generics_0.1.3             
## [45] jsonlite_1.8.0              RCurl_1.98-1.7             
## [47] magrittr_2.0.3              GenomeInfoDbData_1.2.7     
## [49] Matrix_1.3-4                Rcpp_1.0.9                 
## [51] munsell_0.5.0               fansi_1.0.3                
## [53] lifecycle_1.0.1             stringi_1.7.8              
## [55] yaml_2.3.5                  MASS_7.3-54                
## [57] SummarizedExperiment_1.24.0 zlibbioc_1.40.0            
## [59] plyr_1.8.7                  parallel_4.1.2             
## [61] crayon_1.5.1                lattice_0.20-45            
## [63] Biostrings_2.62.0           splines_4.1.2              
## [65] hms_1.1.1                   locfit_1.5-9.5             
## [67] knitr_1.39                  pillar_1.7.0               
## [69] boot_1.3-28                 rjson_0.2.21               
## [71] reshape2_1.4.4              codetools_0.2-18           
## [73] XML_3.99-0.10               glue_1.6.2                 
## [75] evaluate_0.15               nloptr_2.0.3               
## [77] vctrs_0.4.1                 Rdpack_2.3.1               
## [79] foreach_1.5.2               tidyr_1.2.0                
## [81] gtable_0.3.0                purrr_0.3.4                
## [83] assertthat_0.2.1            xfun_0.31                  
## [85] rbibutils_2.2.8             broom_0.8.0                
## [87] restfulr_0.0.15             tibble_3.1.7               
## [89] iterators_1.0.14            GenomicAlignments_1.30.0   
## [91] ellipsis_0.3.2

Differential expression in LPS mice from RPKM data

Kimberly Olney

07/11/2022