Setup

# Also do Session > Set Working Directory > Choose Directory
knitr::opts_knit$set(root.dir = ".")

Sys.setenv(RSTUDIO_PANDOC="/usr/lib/rstudio-server/bin/pandoc")
library(rmarkdown)
library(BiocParallel)  # SnowParam()
library(dplyr)  # left_join()
library(edgeR)  # DGEList()
library(limma)  # plotMDS()
library(ggrepel) # geom_text_repel()
library(ggplot2)  # ggplot()
library(gplots)  # heatmap.2()
library(grDevices)  # colorRampPalette()
require(philentropy)  # JSD()
library(rtracklayer)  # import()
library(stringr)  # str_match()
require(variancePartition)  # fitExtractVarPartModel()
library(reshape)  # melt()
library(Glimma)
library(plyr)
library(corrplot)
library("ggpubr")
library(tidyverse)
library(caret)
library(glmnet)

User defined variables

control <- "CONTROL"
condition <- "LBD"
control_color <- "gray29"
condition_color <- "red"
myContrasts <- c("LBD - CONTROL")
tool = c("star")
pathToRef = c("/research/labs/neurology/fryer/projects/references/human/")
pathToRawData = c("/research/labs/neurology/fryer/projects/LBD_CWOW/")

Read data

# read in metadata
metadata <- read.delim(paste0(pathToRawData, "metadata.tsv"), 
                       header = TRUE,
                       sep = "\t")

RIN <- read.delim(paste0(pathToRawData, "RIN.tsv"), 
                       header = TRUE,
                       sep = "\t")
RIN$NPID <- RIN$Sample
metadata <- merge(metadata, RIN, by = "NPID")
# read in counts data
counts <- read.delim(
  paste0("../../featureCounts/LBD.counts"),
  header = TRUE,
  sep = "\t", check.names=FALSE)
# get the sampleID from the counts file to match with metadata
sampleIDs <- colnames(counts)
samples <- as.data.frame(sampleIDs)
# split the sampleID by _ 
sample_count_id <- as.data.frame(str_split_fixed(samples$sampleID, "_", 2))
sample_count_id$counts_id <- samples$sampleIDs
# rename columns
names(sample_count_id)[names(sample_count_id) == 'V1'] <- 'NPID'
names(sample_count_id)[names(sample_count_id) == 'V2'] <- 'flow_lane'
# merge sample_count_id and metadata files
# this way metadata contains the sampleID to match the counts table 
counts_metadata <- merge(metadata, sample_count_id, by = "NPID")

# gene information
counts.geneid <- read.delim(
  "../../featureCounts/gene_info.txt",
  header = TRUE,
  sep = "\t"
)
counts.geneid <- as.data.frame(counts.geneid$Geneid)
colnames(counts.geneid) <- "gene_id"
# add gene_name as row names to counts file
rownames(counts) <- counts.geneid$gene_id

# read in annotation file
gtf.file <- paste0(pathToRef, "gencode.v38.annotation.gtf")
genes.gtf <- rtracklayer::import(gtf.file)
genes.gtf <- as.data.frame(genes.gtf)
genes.gtf <- genes.gtf[genes.gtf$type == "gene",]
table(genes.gtf$gene_type)

## 
##                          IG_C_gene                    IG_C_pseudogene 
##                                 14                                  9 
##                          IG_D_gene                          IG_J_gene 
##                                 37                                 18 
##                    IG_J_pseudogene                      IG_pseudogene 
##                                  3                                  1 
##                          IG_V_gene                    IG_V_pseudogene 
##                                145                                187 
##                             lncRNA                              miRNA 
##                              16888                               1879 
##                           misc_RNA                            Mt_rRNA 
##                               2212                                  2 
##                            Mt_tRNA             polymorphic_pseudogene 
##                                 22                                 49 
##               processed_pseudogene                     protein_coding 
##                              10163                              19955 
##                         pseudogene                           ribozyme 
##                                 15                                  8 
##                               rRNA                    rRNA_pseudogene 
##                                 47                                497 
##                             scaRNA                              scRNA 
##                                 49                                  1 
##                             snoRNA                              snRNA 
##                                943                               1901 
##                               sRNA                                TEC 
##                                  5                               1056 
##                          TR_C_gene                          TR_D_gene 
##                                  6                                  4 
##                          TR_J_gene                    TR_J_pseudogene 
##                                 79                                  4 
##                          TR_V_gene                    TR_V_pseudogene 
##                                106                                 33 
##   transcribed_processed_pseudogene     transcribed_unitary_pseudogene 
##                                502                                143 
## transcribed_unprocessed_pseudogene    translated_processed_pseudogene 
##                                950                                  2 
##  translated_unprocessed_pseudogene                 unitary_pseudogene 
##                                  1                                 98 
##             unprocessed_pseudogene                          vault_RNA 
##                               2614                                  1

#counts.geneid
#names(counts.geneid)[1] <- 'gene_name'
joined.df <- join(counts.geneid, genes.gtf, type = "inner")

## Joining by: gene_id

# check columns and rows match up between files
all.equal(rownames(counts), counts.geneid$gene_id)

## [1] TRUE

all.equal(rownames(counts), genes.gtf$gene_id)

## [1] TRUE

# reorder counts to be in the same order as metadata table
counts <- counts[, counts_metadata$counts_id]
all.equal(colnames(counts), (counts_metadata$counts_id))

## [1] TRUE

Create DGE object

# create object
dge <- DGEList(counts = counts,
               samples = counts_metadata,
               genes = genes.gtf)

table(dge$samples$TYPE)

## 
##      CONTROL CONTROL - AD CONTROL - PA          LBD 
##           88           56           48          450

# remove AD and PA controls
remove <- vector()
for (i in 1:nrow(dge$samples)) {
  if (dge$samples$TYPE[i] == "CONTROL - AD" | dge$samples$TYPE[i] == "CONTROL - PA") {
    remove <- c(remove, i)
  }
}
dge <- dge[,-remove, keep.lib.sizes = FALSE]
table(dge$samples$TYPE)

## 
## CONTROL     LBD 
##      88     450

Remove mitochondrial genes

dim(dge)

## [1] 60649   538

removeMT <- dge$genes$seqnames != "chrM"  # true when NOT MT
dge <- dge[removeMT,,keep.lib.sizes = FALSE]
dim(dge)

## [1] 60612   538

Save functions

These functions with help simultaneously save plots as a png, pdf, and tiff file.

saveToPDF <- function(...) {
    d = dev.copy(pdf,...)
    dev.off(d)
}

saveToPNG <- function(...) {
    d = dev.copy(png,...)
    dev.off(d)
}

Raw MDS with technical replicates

# set colors and get data
table(dge$samples$TYPE)

## 
## CONTROL     LBD 
##      88     450

dge$samples$TYPE <- as.factor(dge$samples$TYPE)
group_colors <- c("grey","red")[dge$samples$TYPE]
lcpm <- edgeR::cpm(dge$counts, log = TRUE)
cpm <- edgeR::cpm(dge$counts, log = FALSE)

par(bg = 'white')

# plot MDS
plotMDS(
  lcpm,
  top = 100, 
  labels = dge$samples$Sex,
  cex = 2, 
  dim.plot = c(1,2), 
  plot = TRUE, 
  col = group_colors, gene.selection = "common"
)
title(expression('Top 100 Genes (Log'[2]~'CPM)'))

path <- paste0("../../results/", tool, "/MDS/", condition,"_gene_MDS_techreps_label_sex")
saveToPDF(paste0(path, ".pdf"), width = 5.2, height = 5.2)

## png 
##   2

saveToPNG(paste0(path, ".png"), width = 4, height = 4, unit = "in", res = 300)

## png 
##   2

Sex check

genes_and_counts <- cbind(dge$genes$gene_name, dge$counts)
genes_and_counts <- as.data.frame(genes_and_counts)
names(genes_and_counts)[names(genes_and_counts) == "V1"] <- "Geneid"
rownames(genes_and_counts) <- NULL

sex_genes <- c("XIST", "EIF1AY", "KDM5D", "UTY", "DDX3Y", "RPS4Y1")
sex_genes_counts <- subset(genes_and_counts, Geneid %in% sex_genes)
rownames(sex_genes_counts) <- sex_genes_counts$Geneid
sex_genes_counts$Geneid <- NULL
sex_gene_df <- as.data.frame(t(sex_genes_counts))
sex_gene_df$counts_id <- rownames(sex_gene_df)
rownames(sex_gene_df) <- NULL

meta_short <- counts_metadata[,c("counts_id","Sex")] 
sex_gene_meta <- merge(sex_gene_df, meta_short, by = "counts_id")
cols.num <- c("XIST","EIF1AY", "KDM5D", "UTY", "DDX3Y", "RPS4Y1")
sex_gene_meta[cols.num] <- sapply(sex_gene_meta[cols.num], as.integer)
sex_gene_meta <- sex_gene_meta[,c(2,3,4,5,6,7,8,1)]
# replace M with male and F with female 
sex_gene_meta <- sex_gene_meta %>% 
  mutate(
    Sex = ifelse(Sex %in% c("M"), "male", "female") 
  )

Sex inference model

training_model_data <- "for_rna_sex_check.tsv"

# Build the model using the GTEx data
# Load the data and remove NAs
train_data <- read.csv(training_model_data, sep="\t")

# Split the data into training and test set
set.seed(123)
training.samples <- train_data$sex %>% 
  createDataPartition(p = 0.8, list = FALSE)
train.data  <- train_data[training.samples, ]
test.data <- train_data[-training.samples, ]

# Dummy code categorical predictor variables
x <- model.matrix(sex~., train.data)[,-1]
# Convert the outcome (class) to a numerical variable
y <- ifelse(train.data$sex == "female", 1, 0)

cv.lasso <- cv.glmnet(x, y, alpha = 1, family = "binomial")
plot(cv.lasso)

cv.lasso$lambda.min

## [1] 3.452375e-05

coef(cv.lasso, cv.lasso$lambda.min)

## 7 x 1 sparse Matrix of class "dgCMatrix"
##                        s1
## (Intercept)  4.0259095684
## XIST         0.0008731318
## EIF1AY      -0.0064198554
## KDM5D        .           
## UTY         -0.0063930288
## DDX3Y        .           
## RPS4Y1      -0.0028940802

coef(cv.lasso, cv.lasso$lambda.1se)

## 7 x 1 sparse Matrix of class "dgCMatrix"
##                       s1
## (Intercept)  3.944517679
## XIST         0.000758968
## EIF1AY      -0.006134509
## KDM5D        .          
## UTY         -0.005928303
## DDX3Y        .          
## RPS4Y1      -0.002692325

# Final model with lambda.min
lasso.model <- glmnet(x, y, alpha = 1, family = "binomial",
                      lambda = cv.lasso$lambda.min)
# Make prediction on test data
x.test <- model.matrix(sex ~., test.data)[,-1]
probabilities <- lasso.model %>% predict(newx = x.test)
predicted.classes <- ifelse(probabilities > 0.5, "female", "male")
# Model accuracy
observed.classes <- test.data$sex
mean(predicted.classes == observed.classes)

## [1] 1

# ----------------------
# Run on experiment data
# ----------------------
counts_ids <- sex_gene_meta$counts_id 
sex_gene_meta$counts_id <- NULL
test.experiment <- model.matrix(Sex ~., sex_gene_meta)[,-1]

probabilities <- lasso.model %>% predict(newx = test.experiment)
predicted.classes <- ifelse(probabilities > 4, "female", "male")
# Model accuracy
observed.classes <- sex_gene_meta$Sex
mean(predicted.classes == observed.classes)

## [1] 0.9962825

df <- cbind(as.data.frame(predicted.classes)$s0, sex_gene_meta$Sex)
df <- as.data.frame(df)
# Rename column where names is "Sepal.Length"
names(df)[names(df) == "V1"] <- "reported"
names(df)[names(df) == "V2"] <- "observed"

# add back in sample IDs
df$counts_id <- counts_ids

# what doesn't match between reported and observed? 
which(df[,1]!=df[,2])

## [1] 37 38

# output results 
write.table(df, "sex_check_RNA.txt", sep = "\t", row.names = FALSE, quote = FALSE)

Plot sex check

genes_and_counts <- cbind(dge$genes$gene_name, lcpm)
genes_and_counts <- as.data.frame(genes_and_counts)
names(genes_and_counts)[names(genes_and_counts) == "V1"] <- "Geneid"
rownames(genes_and_counts)<-NULL
genes_counts <- melt(genes_and_counts, id=c("Geneid"))
names(genes_counts)[names(genes_counts) == "variable"] <- "counts_id"

df <- cbind(counts_metadata$counts_id, counts_metadata$Sex)
df <- as.data.frame(df)
names(df)[names(df) == "V1"] <- "counts_id"
names(df)[names(df) == "V2"] <- "Sex"

data <- merge(genes_counts, df, by = "counts_id")

sexGenes <- c("DDX3X, DDX3Y")
SelectGenes_counts <-
  subset(
    data,
    Geneid %in% c(
      "DDX3X",
      "DDX3Y",
      "ZFX",
      "ZFY",
      "USP9X",
      "USP9Y",
      "KDM6A",
      "UTY",
      "PCDH11X",
      "PCDH11Y",
      "XIST",
      "SRY"
    )
  )
SelectGenes_counts[, "geneComb"] <- NA
SelectGenes_counts[, "group"] <- NA


SelectGenes_counts$geneComb <-
  ifelse(
    SelectGenes_counts$Geneid == "DDX3X",
    "DDX3X:DDX3Y",
    ifelse(
      SelectGenes_counts$Geneid == "DDX3Y",
      "DDX3X:DDX3Y",
      ifelse(
        SelectGenes_counts$Geneid == "ZFX",
        "ZFX:ZFY",
        ifelse(
          SelectGenes_counts$Geneid == "ZFY",
          "ZFX:ZFY",
          ifelse(
            SelectGenes_counts$Geneid == "USP9X",
            "USP9X:USP9Y",
            ifelse(
              SelectGenes_counts$Geneid == "USP9Y",
              "USP9X:USP9Y",
              ifelse(
                SelectGenes_counts$Geneid == "KDM6A",
                "UTX:UTY",
                ifelse(
                  SelectGenes_counts$Geneid == "UTY",
                  "UTX:UTY",
                  ifelse(
                    SelectGenes_counts$Geneid == "PCDH11X",
                    "PCDH11X:PCDH11Y",
                    ifelse(
                      SelectGenes_counts$Geneid == "PCDH11Y",
                      "PCDH11X:PCDH11Y",
                      ifelse(
                        SelectGenes_counts$Geneid == "XIST",
                        "XIST",
                        ifelse(SelectGenes_counts$Geneid == "SRY", "SRY", "NA")
                      )
                    )
                  )
                )
              )
            )
          )
        )
      )
    )
  )

SelectGenes_counts$group <-
  ifelse(
    SelectGenes_counts$geneComb == "DDX3X:DDX3Y",
    1,
    ifelse(
      SelectGenes_counts$geneComb == "ZFX:ZFY",
      4,
      ifelse(
        SelectGenes_counts$geneComb == "USP9X:USP9Y",
        3,
        ifelse(
          SelectGenes_counts$geneComb == "UTX:UTY",
          5,
          ifelse(
            SelectGenes_counts$geneComb == "PCDH11X:PCDH11Y",
            2,
            ifelse(
              SelectGenes_counts$geneComb == "XIST",
              6,
              ifelse(SelectGenes_counts$geneComb == "SRY", 7, "NA")
            )
          )
        )
      )
    )
  )
# Plot
data <- SelectGenes_counts
# identify outliers
data_XIST <- subset(data, Geneid == "XIST")
lower_bound <- quantile(as.numeric(data_XIST$value), 0.025)

not_male <- subset(data, counts_id == "5510_FCH5MYFDMXY_L1" | counts_id ==  "5510_FCH5MYFDMXY_L2" )
not_male$Sex <- c()
leg_lab <- "reported sex"
cbPaletteJITTER = c("darkorange", "blue")
geneticSEXgenes_plot <- ggplot(data, aes(x = Geneid, y = value)) +
  geom_jitter(aes(color = Sex, shape = Sex),
              width = 0.25,
              size = 2.0) +
  scale_color_manual(leg_lab, values = cbPaletteJITTER) + # Jitter color palette
  scale_shape_manual(leg_lab, values = c(19, 15)) +
  labs(x = "", y = "", title = "") +
  facet_grid(
    . ~ group + geneComb,
    switch = "x",
    # Moves the labels from the top to the bottom
    #labeller = label_both, # Adds the labels to the year and X variables
    scales = "free_x",
    space = "free_x"
  ) +
  theme(
    strip.background = element_blank(),
    strip.text.x = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks = element_blank(),
    panel.border = element_rect(
      colour = "black",
      fill = NA,
      size = 1
    )
  ) +
  xlab("") + ylab("lcpm") 

geneticSEXgenes_plot + theme(
  panel.background = element_rect(
    fill = "white",
    colour = "white",
    size = 0.5,
    linetype = "solid"
  ),
  panel.grid.major = element_line(
    size = 0.5,
    linetype = 'solid',
    colour = "white"
  ),
  panel.grid.minor = element_line(
    size = 0.25,
    linetype = 'solid',
    colour = "grey"
  ),
  panel.border = element_rect(
    colour = "black",
    fill = NA,
    size = 1
  ),
  axis.text.x = element_text(face = "italic")
)

path <- paste0("../../results/", tool, "/sex_check/LBD_gene_raw_sex_check")
saveToPDF(paste0(path, ".pdf"), width = 10.5, height = 6)

## png 
##   2

saveToPNG(paste0(path, ".png"), width = 10.5, height = 4, unit = "in", res = 300)

## png 
##   2

RIN check with replicates

# first filter by expression and normalize the data
keep.expr <- filterByExpr(dge, group = dge$samples$TYPE)
dim(dge)

## [1] 60612   538

dge.filtered.reps <- dge[keep.expr, , keep.lib.sizes = FALSE]

dim(dge.filtered.reps)

## [1] 17982   538

table(dge.filtered.reps$genes$gene_type)

## 
##                          IG_C_gene                             lncRNA 
##                                  1                               2682 
##                              miRNA                           misc_RNA 
##                                  5                                 12 
##             polymorphic_pseudogene               processed_pseudogene 
##                                  5                                196 
##                     protein_coding                             scaRNA 
##                              14487                                  1 
##                              scRNA                             snoRNA 
##                                  1                                  8 
##                              snRNA                                TEC 
##                                  8                                142 
##                          TR_C_gene   transcribed_processed_pseudogene 
##                                  4                                 79 
##     transcribed_unitary_pseudogene transcribed_unprocessed_pseudogene 
##                                 41                                255 
##                 unitary_pseudogene             unprocessed_pseudogene 
##                                  2                                 53

# Now, normalization by the method of trimmed mean of M-values (TMM)
dge.filtered.reps.norm <- calcNormFactors(dge.filtered.reps, method = "TMM")

# norm factor summary
summary(dge.filtered.reps.norm$samples$norm.factors)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.6138  0.9424  1.0317  1.0060  1.0799  1.2114

log2cpm.norm.reps <- edgeR::cpm(dge.filtered.reps.norm, log = T)
nsamples <- ncol(dge.filtered.reps.norm)
boxplot(log2cpm.norm.reps, 
        main="Filtered normalized lcpm data with replicates", 
        xlab="RIN", 
        ylab=expression('Counts per gene (Log'[2]~'CPM)'),
        axes=FALSE)
axis(2)
axis(1,at=1:nsamples,labels=(dge.filtered.reps.norm$samples$RIN),las=2,cex.axis=0.8)

path <- paste0("../../results/", tool, "/boxplot/LBD_gene_boxplot_with_reps_with_RIN")
saveToPDF(paste0(path, ".pdf"), width = 35, height = 6)

## png 
##   2

saveToPNG(paste0(path, ".png"), width = 35, height = 6, unit = "in", res = 300)

## png 
##   2

Sum technical replicates

# sum technical replicates
dim(dge)

## [1] 60612   538

dge.tech <- sumTechReps(dge, dge$samples$NPID)
dim(dge.tech$counts)

## [1] 60612   269

colnames(dge.tech$counts) <- dge.tech$samples$NPID

Raw MDS

# set colors and get data
group_colors <- c("black", "red")[dge.tech$samples$TYPE]
lcpm <- edgeR::cpm(dge.tech$counts, log = TRUE)

par(bg = 'white')

# plot MDS
plotMDS(
  lcpm, 
  top = 100, 
  labels = dge.tech$samples$NPID,
  cex = .8, 
  dim.plot = c(4,5), 
  plot = TRUE, 
  col = group_colors,
  gene.selection = "common"
)
title(expression('Top 100 Genes - Raw (Log'[2]~'CPM)'))
legend(
  "top",
  legend = c(control, condition),
  pch = 16,
  col = c(control_color, condition_color),
  cex = 1
)

# save
path <- paste0("../../results/", tool, "/MDS/", condition, "_gene_MDS_raw")
saveToPDF(paste0(path, ".pdf"), width = 4, height = 4)

## png 
##   2

saveToPNG(paste0(path, ".png"), width = 4, height = 4, unit = "in", res = 300)

## png 
##   2

Filter lowly expressed genes

The filterByExpr() function in the edgeR package determines which genes have a great enough count value to keep. We will filter by group. This means at least 6 samples (6 is the smallest group sample size) must express a minimum count of 10 (in cpm, default value).

keep.expr <- filterByExpr(dge.tech, group = dge.tech$samples$TYPE)
dim(dge.tech)

## [1] 60612   269

dge.filtered <- dge.tech[keep.expr, , keep.lib.sizes = FALSE]

dim(dge.filtered)

## [1] 20067   269

table(dge.filtered$genes$gene_type)

## 
##                          IG_C_gene                             lncRNA 
##                                  4                               3752 
##                              miRNA                           misc_RNA 
##                                 15                                 26 
##             polymorphic_pseudogene               processed_pseudogene 
##                                  8                                333 
##                     protein_coding                    rRNA_pseudogene 
##                              15119                                  3 
##                             scaRNA                              scRNA 
##                                  1                                  1 
##                             snoRNA                              snRNA 
##                                 16                                 22 
##                                TEC                          TR_C_gene 
##                                195                                  4 
##                          TR_J_gene   transcribed_processed_pseudogene 
##                                  1                                 99 
##     transcribed_unitary_pseudogene transcribed_unprocessed_pseudogene 
##                                 54                                314 
##                 unitary_pseudogene             unprocessed_pseudogene 
##                                  4                                 96

TMM normalization

# Now, normalization by the method of trimmed mean of M-values (TMM)
dge.filtered.norm <- calcNormFactors(dge.filtered, method = "TMM")

# norm factor summary
summary(dge.filtered.norm$samples$norm.factors)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.5944  0.9455  1.0313  1.0063  1.0793  1.2139

Density plot

Density plots of log-intensity distribution of each library can be superposed on a single graph for a better comparison between libraries and for identification of libraries with weird distribution.

# set graphical parameter
par(mfrow = c(1,3))

# Normalize data for library size and expression intesntiy
log2cpm.tech <- edgeR::cpm(dge.tech, log = TRUE)
log2cpm.filtered <- edgeR::cpm(dge.filtered, log = TRUE)
log2cpm.norm <- edgeR::cpm(dge.filtered.norm, log = TRUE)

# set colors
colors <- c("red","orange","green","yellow","blue","purple", 
            "lightgray","brown","pink","cyan")
nsamples <- ncol(dge.tech)

# First, plot the first column of the log2cpm.tech density
plot(density(log2cpm.tech[,1]), col = colors[1], lwd = 2, ylim = c(0,0.25), 
     las = 2, main = "A. Raw", xlab = expression('Log'[2]~CPM))

# For each sample plot the lcpm density
for (i in 2:nsamples){
  den <- density(log2cpm.tech[,i]) #subset each column
  lines(den$x, den$y, col = colors[i], lwd = 2) 
}

# Second, plot log2cpm.filtered
plot(density(log2cpm.filtered[,1]), col = colors[1], lwd = 2, ylim = c(0,0.25), 
     las = 2, main = "B. Filtered", xlab = expression('Log'[2]~CPM))
abline(v = edgeR::cpm(3, log = TRUE), lty = 3)
for (i in 2:nsamples) {
  den <- density(log2cpm.filtered[,i])
  lines(den$x, den$y, col = colors[i], lwd = 2)
}

# Third, plot log2cpm.norm
plot(density(log2cpm.norm[,1]), col = colors[1], lwd = 2, ylim = c(0,0.25), 
     las = 2, main = "C. Normalized", xlab = expression('Log'[2]~CPM))
abline(v = edgeR::cpm(3, log = TRUE), lty = 3)
for (i in 2:nsamples) {
  den <- density(log2cpm.norm[,i])
  lines(den$x, den$y, col = colors[i], lwd = 2)
}

# save
path <- paste0("../../results/", tool, "/density/LBD_gene_density")
saveToPDF(paste0(path, ".pdf"), width = 6, height = 4)

## png 
##   2

saveToPNG(paste0(path, ".png"), width = 6, height = 4, unit = "in", res = 300)

## png 
##   2

Boxplots

# set parameters
par(mfrow = c(1,3))

# First look at dge.tech
boxplot(log2cpm.tech, 
        main="A. Raw", 
        xlab="", 
        ylab=expression('Counts per gene (Log'[2]~'CPM)'),
        axes=FALSE,
        col = colors
        )
axis(2) # 2 = left 
axis(1, # 1 = below 
     at = 1:nsamples, # points at which tick-marks should be drawn
     labels = colnames(log2cpm.tech),
     las = 2,
     cex.axis = 0.8 # size of axis
     )

# Second, look at dge.filtered
boxplot(log2cpm.filtered, 
        main="B. Filtered", 
        xlab="", 
        ylab=expression('Counts per gene (Log'[2]~'CPM)'),
        axes=FALSE,
        col = colors
        )
axis(2)
axis(1, at=1:nsamples,labels=colnames(log2cpm.filtered),las=2,cex.axis=0.8)

# Third, look at dge.norm
boxplot(log2cpm.norm, 
        main="C. Normalized", 
        xlab="", 
        ylab=expression('Counts per gene (Log'[2]~'CPM)'),
        axes=FALSE,
        col = colors)
axis(2)
axis(1,at=1:nsamples,labels=colnames(log2cpm.norm),las=2,cex.axis=0.8)

# save
path <- paste0("../../results/", tool, "/boxplot/LBD_gene_boxplot")
saveToPDF(paste0(path, ".pdf"), width = 10, height = 6)

## png 
##   2

saveToPNG(paste0(path, ".png"), width = 10, height = 6, unit = "in", res = 300)

## png 
##   2

RIN check with replicates summed

boxplot(log2cpm.norm, 
        main="Filtered normalized lcpm data", 
        xlab="RIN", 
        ylab=expression('Counts per gene (Log'[2]~'CPM)'),
        axes=FALSE,
        col = colors)
axis(2)
axis(1,at=1:nsamples,labels=(dge.filtered.norm$samples$RIN),las=2,cex.axis=0.8)

path <- paste0("../../results/", tool, "/boxplot/LBD_gene_boxplot_sum_reps_with_RIN")
saveToPDF(paste0(path, ".pdf"), width = 12, height = 6)

## png 
##   2

saveToPNG(paste0(path, ".png"), width = 12, height = 6, unit = "in", res = 300)

## png 
##   2

# check if there is correlation between RIN and library size
box <- dge.filtered.norm$samples
plot(box$RIN, box$lib.size)

cor(box$RIN, box$lib.size, method = c("pearson", "kendall", "spearman"))

## [1] 0.3483433

cor.test(box$RIN, box$lib.size, method=c("pearson", "kendall", "spearman"))

## 
##  Pearson's product-moment correlation
## 
## data:  box$RIN and box$lib.size
## t = 6.0723, df = 267, p-value = 4.317e-09
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.2386895 0.4492260
## sample estimates:
##       cor 
## 0.3483433

# is the data normally distrubited 
ggqqplot(box$lib.size, ylab = "library size")

# wt
ggqqplot(box$RIN, ylab = "RIN")

res <- cor.test(box$lib.size, box$RIN, 
                method = "pearson")
res

## 
##  Pearson's product-moment correlation
## 
## data:  box$lib.size and box$RIN
## t = 6.0723, df = 267, p-value = 4.317e-09
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.2386895 0.4492260
## sample estimates:
##       cor 
## 0.3483433

ggscatter(box, x = "RIN", y = "lib.size", 
          add = "reg.line", conf.int = TRUE, 
          cor.coef = TRUE, cor.method = "pearson",
          ylab = "library size", xlab = "RIN value")

## `geom_smooth()` using formula 'y ~ x'

path <- paste0("../../results/", tool, "/library/LBD_corr_RIN_lib_size")
saveToPDF(paste0(path, ".pdf"), width = 6, height = 6)

## png 
##   2

saveToPNG(paste0(path, ".png"), width = 6, height = 6, unit = "in", res = 300)

## png 
##   2

Sex check with summed replicates

lcpm <- edgeR::cpm(dge.filtered.norm, log = TRUE)
genes_and_counts <- cbind(dge.filtered.norm$genes$gene_name, lcpm)
genes_and_counts <- as.data.frame(genes_and_counts)
names(genes_and_counts)[names(genes_and_counts) == "V1"] <- "Geneid"
rownames(genes_and_counts)<-NULL
genes_counts <- melt(genes_and_counts, id=c("Geneid"))
names(genes_counts)[names(genes_counts) == "variable"] <- "NPID"

df <- cbind(counts_metadata$NPID, counts_metadata$Sex)
df <- as.data.frame(df)
names(df)[names(df) == "V1"] <- "NPID"
names(df)[names(df) == "V2"] <- "Sex"

data <- merge(genes_counts, df, by = "NPID")

sexGenes <- c("DDX3X, DDX3Y")
SelectGenes_counts <-
  subset(
    data,
    Geneid %in% c(
      "DDX3X",
      "DDX3Y",
      "ZFX",
      "ZFY",
      "USP9X",
      "USP9Y",
      "KDM6A",
      "UTY",
      "PCDH11X",
      "PCDH11Y",
      "XIST",
      "SRY"
    )
  )
SelectGenes_counts[, "geneComb"] <- NA
SelectGenes_counts[, "group"] <- NA


SelectGenes_counts$geneComb <-
  ifelse(
    SelectGenes_counts$Geneid == "DDX3X",
    "DDX3X:DDX3Y",
    ifelse(
      SelectGenes_counts$Geneid == "DDX3Y",
      "DDX3X:DDX3Y",
      ifelse(
        SelectGenes_counts$Geneid == "ZFX",
        "ZFX:ZFY",
        ifelse(
          SelectGenes_counts$Geneid == "ZFY",
          "ZFX:ZFY",
          ifelse(
            SelectGenes_counts$Geneid == "USP9X",
            "USP9X:USP9Y",
            ifelse(
              SelectGenes_counts$Geneid == "USP9Y",
              "USP9X:USP9Y",
              ifelse(
                SelectGenes_counts$Geneid == "KDM6A",
                "UTX:UTY",
                ifelse(
                  SelectGenes_counts$Geneid == "UTY",
                  "UTX:UTY",
                  ifelse(
                    SelectGenes_counts$Geneid == "PCDH11X",
                    "PCDH11X:PCDH11Y",
                    ifelse(
                      SelectGenes_counts$Geneid == "PCDH11Y",
                      "PCDH11X:PCDH11Y",
                      ifelse(
                        SelectGenes_counts$Geneid == "XIST",
                        "XIST",
                        ifelse(SelectGenes_counts$Geneid == "SRY", "SRY", "NA")
                      )
                    )
                  )
                )
              )
            )
          )
        )
      )
    )
  )

SelectGenes_counts$group <-
  ifelse(
    SelectGenes_counts$geneComb == "DDX3X:DDX3Y",
    1,
    ifelse(
      SelectGenes_counts$geneComb == "ZFX:ZFY",
      4,
      ifelse(
        SelectGenes_counts$geneComb == "USP9X:USP9Y",
        3,
        ifelse(
          SelectGenes_counts$geneComb == "UTX:UTY",
          5,
          ifelse(
            SelectGenes_counts$geneComb == "PCDH11X:PCDH11Y",
            2,
            ifelse(
              SelectGenes_counts$geneComb == "XIST",
              6,
              ifelse(SelectGenes_counts$geneComb == "SRY", 7, "NA")
            )
          )
        )
      )
    )
  )
# Plot
data <- SelectGenes_counts

leg_lab <- "reported sex"
cbPaletteJITTER = c("darkorange", "blue")
geneticSEXgenes_plot <- ggplot(data, aes(x = Geneid, y = value)) +
  geom_jitter(aes(color = Sex, shape = Sex),
              width = 0.25,
              size = 3.0) +
  scale_color_manual(leg_lab, values = cbPaletteJITTER) + # Jitter color palette
  scale_shape_manual(leg_lab, values = c(19, 15)) +
  labs(x = "", y = "", title = "") +
  facet_grid(
    . ~ group + geneComb,
    switch = "x",
    # Moves the labels from the top to the bottom
    #labeller = label_both, # Adds the labels to the year and X variables
    scales = "free_x",
    space = "free_x"
  ) +
  theme(
    strip.background = element_blank(),
    strip.text.x = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks = element_blank(),
    panel.border = element_rect(
      colour = "black",
      fill = NA,
      size = 1
    )
  ) +
  xlab("") + ylab("lcpm") # Removes the month legend

geneticSEXgenes_plot + theme(
  panel.background = element_rect(
    fill = "white",
    colour = "white",
    size = 0.5,
    linetype = "solid"
  ),
  panel.grid.major = element_line(
    size = 0.5,
    linetype = 'solid',
    colour = "white"
  ),
  panel.grid.minor = element_line(
    size = 0.25,
    linetype = 'solid',
    colour = "grey"
  ),
  panel.border = element_rect(
    colour = "black",
    fill = NA,
    size = 1
  ),
  axis.text.x = element_text(face = "italic")
)

path <- paste0("../../results/", tool, "/sex_check/LBD_gene_raw_sex_check_sumreps")
saveToPDF(paste0(path, ".pdf"), width = 10.5, height = 6)

## png 
##   2

saveToPNG(paste0(path, ".png"), width = 10.5, height = 4, unit = "in", res = 300)

## png 
##   2

Variance partition

CCA Heatmap

#form <- ~ TYPE + Age + Sex + ClinicalDx +lib.size + LBD.type +CDLB
form <- ~ TYPE + PathDx + AD.subtype + LBD.type + CDLB + Braak.NFT + Thal.amyloid + MF.SP + MF.NFT + MF.LB + Cing.LB + MF.Amyloid + MF.Tau + Cing.Synuclein + CWOW.Category + VaD  + TDP.type + Brain.wt + ClinicalDx + FHx + Duration + Sex + Age + Race + PMI + APOE + MAPT + GRN + TMEM106b + RIN + Total.RNA.ng 
# Compute Canonical Correlation Analysis (CCA)
# between all pairs of variables
# returns absolute correlation value
info <- as.data.frame(dge.filtered.norm$samples)
C = canCorPairs( form, info)

## Warning in canCorPairs(form, info): Regression model may be problematic.
## High colinearity between variables:
##   TYPE and PathDx
##   TYPE and LBD.type
##   TYPE and CDLB
##   MF.Amyloid and CWOW.Category
##   MF.Tau and CWOW.Category
##   Cing.Synuclein and CWOW.Category

# replace NA with zero
C[is.na(C)] = 0
cor.mtest <- function(C, ...) {
    C <- as.matrix(C)
    n <- ncol(C)
    p.mat<- matrix(NA, n, n)
    diag(p.mat) <- 0
    for (i in 1:(n - 1)) {
        for (j in (i + 1):n) {
            tmp <- cor.test(C[, i], C[, j], ...)
            p.mat[i, j] <- p.mat[j, i] <- tmp$p.value
        }
    }
  colnames(p.mat) <- rownames(p.mat) <- colnames(C)
  p.mat
}
# matrix of the p-value of the correlation
p.mat <- cor.mtest(C)
col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))
corrplot(C, method="color", col=col(200),  
         type="upper", order="hclust", 
         addCoef.col = "black", # Add coefficient of correlation
         tl.col="black", tl.srt=45, #Text label color and rotation
         diag=FALSE 
         )

path <- paste0("../../results/", tool ,"/varpart/LBD_gene_CCA")
saveToPDF(paste0(path, ".pdf"), width = 25, height = 25)

## png 
##   2

saveToPNG(paste0(path, ".png"), width = 23, height = 25, unit = "in", res = 300)

## png 
##   2

Save R object

saveRDS(dge.filtered.norm, file = paste0("../../rObjects/LBD_dge.filtered.norm.rds"))
dge.filtered.norm <- readRDS(paste0("../../rObjects/LBD_dge.filtered.norm.rds"))

Top variable genes

Design matrix

age <- as.numeric(dge.filtered.norm$samples$Age)
RIN <- as.numeric(dge.filtered.norm$samples$RIN)

sex <- as.factor(dge.filtered.norm$samples$Sex)
race <- as.factor(dge.filtered.norm$samples$Race)

group <- interaction(dge.filtered.norm$samples$TYPE)

design <- model.matrix(~ 0 + group + sex + (1|RIN))
colnames(design) <- c(control,condition, "sex", "RIN")

design

##     CONTROL LBD sex RIN
## 1         1   0   1   1
## 2         1   0   1   1
## 3         1   0   1   1
## 4         0   1   0   1
## 5         1   0   0   1
## 6         0   1   1   1
## 7         0   1   0   1
## 8         1   0   0   1
## 9         0   1   1   1
## 10        0   1   1   1
## 11        0   1   0   1
## 12        0   1   0   1
## 13        0   1   0   1
## 14        0   1   1   1
## 15        1   0   0   1
## 16        0   1   1   1
## 17        0   1   1   1
## 18        0   1   1   1
## 19        0   1   1   1
## 20        1   0   1   1
## 21        0   1   1   1
## 22        0   1   1   1
## 23        1   0   1   1
## 24        1   0   1   1
## 25        1   0   0   1
## 26        0   1   0   1
## 27        0   1   1   1
## 28        0   1   1   1
## 29        0   1   1   1
## 30        0   1   0   1
## 31        0   1   1   1
## 32        0   1   0   1
## 33        0   1   1   1
## 34        0   1   0   1
## 35        0   1   0   1
## 36        0   1   0   1
## 37        0   1   1   1
## 38        0   1   0   1
## 39        0   1   1   1
## 40        0   1   1   1
## 41        0   1   0   1
## 42        0   1   0   1
## 43        1   0   1   1
## 44        0   1   1   1
## 45        1   0   1   1
## 46        0   1   1   1
## 47        0   1   1   1
## 48        1   0   0   1
## 49        0   1   1   1
## 50        0   1   1   1
## 51        0   1   1   1
## 52        0   1   1   1
## 53        1   0   1   1
## 54        1   0   1   1
## 55        0   1   1   1
## 56        0   1   1   1
## 57        0   1   1   1
## 58        0   1   1   1
## 59        1   0   0   1
## 60        0   1   1   1
## 61        0   1   0   1
## 62        0   1   1   1
## 63        0   1   0   1
## 64        0   1   1   1
## 65        0   1   1   1
## 66        0   1   1   1
## 67        1   0   1   1
## 68        0   1   1   1
## 69        0   1   0   1
## 70        0   1   1   1
## 71        1   0   1   1
## 72        0   1   0   1
## 73        0   1   1   1
## 74        0   1   1   1
## 75        1   0   1   1
## 76        0   1   1   1
## 77        0   1   1   1
## 78        0   1   1   1
## 79        0   1   1   1
## 80        0   1   1   1
## 81        1   0   1   1
## 82        0   1   1   1
## 83        0   1   0   1
## 84        0   1   1   1
## 85        0   1   0   1
## 86        0   1   0   1
## 87        0   1   0   1
## 88        1   0   1   1
## 89        0   1   1   1
## 90        0   1   1   1
## 91        0   1   1   1
## 92        0   1   1   1
## 93        0   1   0   1
## 94        0   1   1   1
## 95        1   0   1   1
## 96        0   1   0   1
## 97        0   1   1   1
## 98        0   1   1   1
## 99        0   1   0   1
## 100       0   1   1   1
## 101       0   1   0   1
## 102       1   0   1   1
## 103       0   1   0   1
## 104       0   1   1   1
## 105       0   1   1   1
## 106       0   1   1   1
## 107       0   1   1   1
## 108       0   1   0   1
## 109       0   1   0   1
## 110       0   1   1   1
## 111       0   1   1   1
## 112       1   0   1   1
## 113       0   1   1   1
## 114       0   1   0   1
## 115       0   1   1   1
## 116       1   0   0   1
## 117       1   0   1   1
## 118       0   1   0   1
## 119       1   0   0   1
## 120       0   1   0   1
## 121       0   1   0   1
## 122       0   1   1   1
## 123       0   1   0   1
## 124       0   1   1   1
## 125       1   0   0   1
## 126       0   1   0   1
## 127       0   1   1   1
## 128       0   1   1   1
## 129       0   1   0   1
## 130       0   1   1   1
## 131       0   1   1   1
## 132       0   1   0   1
## 133       0   1   0   1
## 134       1   0   1   1
## 135       1   0   0   1
## 136       0   1   0   1
## 137       0   1   1   1
## 138       0   1   1   1
## 139       0   1   1   1
## 140       0   1   0   1
## 141       0   1   0   1
## 142       1   0   0   1
## 143       0   1   1   1
## 144       0   1   0   1
## 145       0   1   1   1
## 146       0   1   1   1
## 147       1   0   0   1
## 148       0   1   1   1
## 149       1   0   0   1
## 150       0   1   1   1
## 151       0   1   1   1
## 152       0   1   1   1
## 153       0   1   1   1
## 154       0   1   1   1
## 155       0   1   1   1
## 156       0   1   0   1
## 157       0   1   1   1
## 158       1   0   1   1
## 159       0   1   1   1
## 160       0   1   1   1
## 161       0   1   1   1
## 162       0   1   1   1
## 163       0   1   1   1
## 164       0   1   0   1
## 165       0   1   0   1
## 166       0   1   1   1
## 167       0   1   1   1
## 168       0   1   0   1
## 169       0   1   1   1
## 170       0   1   0   1
## 171       0   1   1   1
## 172       0   1   1   1
## 173       0   1   0   1
## 174       0   1   1   1
## 175       1   0   1   1
## 176       0   1   0   1
## 177       0   1   1   1
## 178       0   1   1   1
## 179       0   1   1   1
## 180       0   1   1   1
## 181       0   1   1   1
## 182       0   1   1   1
## 183       0   1   1   1
## 184       0   1   1   1
## 185       1   0   1   1
## 186       0   1   1   1
## 187       0   1   0   1
## 188       0   1   1   1
## 189       0   1   1   1
## 190       0   1   1   1
## 191       0   1   1   1
## 192       0   1   1   1
## 193       0   1   1   1
## 194       0   1   1   1
## 195       0   1   1   1
## 196       0   1   0   1
## 197       0   1   0   1
## 198       1   0   1   1
## 199       1   0   0   1
## 200       0   1   1   1
## 201       0   1   1   1
## 202       1   0   0   1
## 203       0   1   0   1
## 204       0   1   1   1
## 205       0   1   1   1
## 206       0   1   0   1
## 207       0   1   1   1
## 208       0   1   0   1
## 209       0   1   0   1
## 210       0   1   1   1
## 211       0   1   0   1
## 212       0   1   1   1
## 213       0   1   1   1
## 214       0   1   0   1
## 215       0   1   0   1
## 216       0   1   0   1
## 217       0   1   0   1
## 218       0   1   0   1
## 219       0   1   1   1
## 220       0   1   1   1
## 221       0   1   0   1
## 222       0   1   1   1
## 223       0   1   1   1
## 224       0   1   1   1
## 225       0   1   0   1
## 226       0   1   1   1
## 227       0   1   0   1
## 228       0   1   0   1
## 229       0   1   1   1
## 230       0   1   0   1
## 231       0   1   1   1
## 232       0   1   1   1
## 233       0   1   1   1
## 234       0   1   0   1
## 235       0   1   1   1
## 236       0   1   1   1
## 237       1   0   1   1
## 238       1   0   0   1
## 239       0   1   0   1
## 240       0   1   0   1
## 241       1   0   1   1
## 242       0   1   0   1
## 243       0   1   1   1
## 244       0   1   1   1
## 245       0   1   1   1
## 246       0   1   1   1
## 247       0   1   1   1
## 248       0   1   1   1
## 249       0   1   0   1
## 250       0   1   1   1
## 251       0   1   1   1
## 252       0   1   1   1
## 253       0   1   0   1
## 254       0   1   0   1
## 255       0   1   0   1
## 256       0   1   1   1
## 257       0   1   0   1
## 258       0   1   0   1
## 259       0   1   1   1
## 260       0   1   1   1
## 261       1   0   1   1
## 262       0   1   0   1
## 263       1   0   0   1
## 264       0   1   1   1
## 265       0   1   1   1
## 266       0   1   1   1
## 267       0   1   0   1
## 268       0   1   1   1
## 269       0   1   1   1
## attr(,"assign")
## [1] 1 1 2 3
## attr(,"contrasts")
## attr(,"contrasts")$group
## [1] "contr.treatment"
## 
## attr(,"contrasts")$sex
## [1] "contr.treatment"
## 
## attr(,"contrasts")$`1 | RIN`
## [1] "contr.treatment"

Voom

# voom transform counts
v <- voomWithQualityWeights(dge.filtered.norm,
                            design,
                            plot = TRUE)

## Coefficients not estimable: RIN

## Warning: Partial NA coefficients for 20067 probe(s)

## Coefficients not estimable: RIN

## Warning: Partial NA coefficients for 20067 probe(s)

# save
path <- paste0("../../results/", tool, "/voom/LBD_gene_mean_var_weights")
saveToPDF(paste0(path, ".pdf"), width = 6, height = 4)

## png 
##   2

saveToPNG(paste0(path, ".png"), width = 6, height = 4, unit = "in", res = 300)

## png 
##   2

# fits linear model for each gene given a series of arrays
fit <- lmFit(v, design)

## Coefficients not estimable: RIN

## Warning: Partial NA coefficients for 20067 probe(s)

# contrast design for differential expression
contrasts <- makeContrasts(
  title = myContrasts,  # myContrasts was user input from beginning
  levels = colnames(design))
head(contrasts)

##          Contrasts
## Levels    LBD - CONTROL
##   CONTROL            -1
##   LBD                 1
##   sex                 0
##   RIN                 0

# save contrast names
allComparisons <- colnames(contrasts)
allComparisons # check

## [1] "LBD - CONTROL"

# run contrast analysis
vfit <- contrasts.fit(fit, contrasts = contrasts)

# Compute differential expression based on the empirical Bayes moderation of the
# standard errors towards a common value.
veBayesFit <- eBayes(vfit)
plotSA(veBayesFit, main = "Final Model: Mean-variance Trend")

# save
path <- paste0("../../results/", tool, "/voom/LBD_gene_final_mean_var")
saveToPDF(paste0(path, ".pdf"), width = 6, height = 4)

## png 
##   2

saveToPNG(paste0(path, ".png"), width = 6, height = 4, unit = "in", res = 300)

## png 
##   2

Voom MDS Plot

group_colors <- c("black", "red")[v$targets$TYPE]
names <- v$targets$NPID

plotMDS(
  v, 
  top = 100, 
  labels = names,
  cex = 1, 
  dim.plot = c(1,2), 
  plot = TRUE, 
  col = group_colors
)

title(expression('Top 100 Genes - Voom (Log'[2]~'CPM)'))

# save
# save
path <- paste0("../../results/", tool, "/MDS/LBD_gene_MDS_normalized")
saveToPDF(paste0(path, ".pdf"), width = 5, height = 5)

## png 
##   2

saveToPNG(paste0(path, ".png"), width = 4, height = 4, unit = "in", res = 300)

## png 
##   2

saveRDS(v, file = paste0("../../rObjects/LBD_gene_voom.rds"))
v <- readRDS(paste0("../../rObjects/LBD_gene_voom.rds"))

Number of DEGs

Identify number of differential expressed genes.

pval <- 0.05

sumTable <- 
  summary(decideTests(
    vfit,  # object
    adjust.method = "BH", # by default the method = "separate"
    p.value = pval,
    lfc = 0  # numeric, minimum absolute log2-fold change required
  ))

print(paste0(" FDRq < ", pval))

## [1] " FDRq < 0.05"

sumTable

##        LBD - CONTROL
## Down            2590
## NotSig         13984
## Up              3493

Output DEG tables

coef <- 1

for (i in allComparisons) {
  # p < 1, log2fc > 0 
  vTopTableAll <-
    topTable(
      veBayesFit, 
      coef = coef,  
      n = Inf, 
      p.value = 1,
      lfc = 0 
    )
  path <- paste("../../results/", tool, "/DEGs/LBD_gene_DEGs_FDRq1.00.txt", sep = "") 
  write.table(
    vTopTableAll,
    path,
    sep = "\t",
    row.names = FALSE,
    quote = FALSE
  )
  
  # p < 0.05, log2fc > 0
  vTopTable1 <-
    topTable( 
      veBayesFit,  
      coef = coef,  
      n = Inf, 
      p.value = 0.05,
      lfc = 0
    )
  path <- paste("../../results/", tool, "/DEGs/LBD_gene_DEGs_FDRq0.05.txt", sep = "") 
  write.table(
    vTopTable1,
    path,
    sep = "\t",
    row.names = FALSE,
    quote = FALSE
  )
  # increment -----------------------------------------------------------------
  coef <- coef + 1
}

Read and save table with all genes (FDRq = 1).

condition_vs_control <- read.table(
  paste0("../../results/", tool, "/DEGs/LBD_gene_DEGs_FDRq1.00.txt", sep = ""),
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE)

saveRDS(condition_vs_control, file = paste0("../../rObjects/LBD_gene_table.rds"))

Assign colors

Assign colors values based on FDRq cutoff of 0.05.

color_values <- vector()
max <- nrow(condition_vs_control)

for(i in 1:max){
  if (condition_vs_control$adj.P.Val[i] < 0.05){
    if (condition_vs_control$logFC[i] > 0){
      color_values <- c(color_values, 1) # 1 when logFC > 0 and FDRq < 0.05
    }
    else if (condition_vs_control$logFC[i] < 0){
      color_values <- c(color_values, 2) # 2 when logFC < 0 and FDRq < 0.05
    }
  }
  else{
    color_values <- c(color_values, 3) # 3 when FDRq >= 0.05
  }
}

condition_vs_control$color_p0.05 <- factor(color_values)

Subset genes to label

Subset the top 10 up and down-regulated genes

up <- condition_vs_control[condition_vs_control$color_p0.05 == 1,]
up10 <- up[1:10,]

down <- condition_vs_control[condition_vs_control$color_p0.05 == 2,]
down <- subset(down, down$logFC < -1.5)
down10 <- down[1:7,]

Volcano plot

hadjpval <- (-log10(max(
  condition_vs_control$P.Value[condition_vs_control$adj.P.Val < 0.05], 
  na.rm=TRUE)))

p_vol <-
  ggplot(data = condition_vs_control, 
         aes(x = logFC,  # x-axis is logFC
             y = -log10(P.Value),  # y-axis will be -log10 of P.Value
             color = color_p0.05)) +  # color is based on factored color column
  geom_point(alpha = 1.5, size = 1.7) +  # create scatterplot, alpha makes points transparent
  theme_bw() +  # set color theme
  theme(legend.position = "none") +  # no legend
  scale_color_manual(values = c("red", "blue","grey")) +  # set factor colors
  labs(
    title = "", # no main title
    x = expression(log[2](FC)), # x-axis title
    y = expression(-log[10] ~ "(" ~ italic("p") ~ "-value)") # y-axis title
  ) +
  theme(axis.title.x = element_text(size = 10),
        axis.text.x = element_text(size = 10)) +
  theme(axis.title.y = element_text(size = 10),
        axis.text.y = element_text(size = 10)) +
  geom_hline(yintercept = hadjpval,  #  horizontal line
                     colour = "#000000",
                     linetype = "dashed") +
  ggtitle("LBD vs Control\nFDRq < 0.05") +
  theme(plot.title = element_text(size = 10)) +
  geom_text_repel(data = up10,
                  aes(x = logFC, y= -log10(P.Value), label = gene_name), 
                  color = "maroon", 
                  fontface="italic",
                  size = 4, 
                  max.overlaps = getOption("ggrepel.max.overlaps", default = 30)
                  ) +
  geom_text_repel(data = down10,
                  aes(x = logFC, y= -log10(P.Value), label = gene_name), 
                  color = "navyblue", 
                  fontface="italic",
                  size = 4, 
                  max.overlaps = getOption("ggrepel.max.overlaps", default = 15)
                  )  #+
 # scale_y_continuous(breaks = seq(0,8,by=1), limits = c(0,8)) +
 # scale_x_continuous(breaks = seq(-3,3,by=1), limits = c(-3,3))
p_vol

# save
path <- paste0("../../results/", tool, "/volcano/LBD_gene_volcano_FDRq0.05")
saveToPDF(paste0(path, ".pdf"), width = 5.2, height = 5.2)

## png 
##   2

saveToPNG(paste0(path, ".png"), width = 8, height = 6, unit = "in", res = 300)

## png 
##   2

sessionInfo()

## R version 4.1.2 (2021-11-01)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Red Hat Enterprise Linux Server 7.9 (Maipo)
## 
## Matrix products: default
## BLAS:   /usr/lib64/libblas.so.3.4.2
## LAPACK: /usr/lib64/liblapack.so.3.4.2
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] glmnet_4.1-4             Matrix_1.3-4             caret_6.0-92            
##  [4] lattice_0.20-45          forcats_0.5.1            purrr_0.3.4             
##  [7] readr_2.1.2              tidyr_1.2.0              tibble_3.1.7            
## [10] tidyverse_1.3.1          ggpubr_0.4.0             corrplot_0.92           
## [13] plyr_1.8.7               Glimma_2.4.0             reshape_0.8.9           
## [16] variancePartition_1.24.1 stringr_1.4.0            rtracklayer_1.54.0      
## [19] GenomicRanges_1.46.1     GenomeInfoDb_1.30.1      IRanges_2.28.0          
## [22] S4Vectors_0.32.4         BiocGenerics_0.40.0      philentropy_0.6.0       
## [25] gplots_3.1.3             ggrepel_0.9.1            ggplot2_3.3.6           
## [28] edgeR_3.36.0             limma_3.50.3             dplyr_1.0.9             
## [31] BiocParallel_1.28.3      rmarkdown_2.14          
## 
## loaded via a namespace (and not attached):
##   [1] utf8_1.2.2                  tidyselect_1.1.2           
##   [3] lme4_1.1-30                 RSQLite_2.2.14             
##   [5] AnnotationDbi_1.56.2        htmlwidgets_1.5.4          
##   [7] grid_4.1.2                  pROC_1.18.0                
##   [9] munsell_0.5.0               codetools_0.2-18           
##  [11] future_1.26.1               withr_2.5.0                
##  [13] colorspace_2.0-3            Biobase_2.54.0             
##  [15] highr_0.9                   knitr_1.39                 
##  [17] rstudioapi_0.13             ggsignif_0.6.3             
##  [19] listenv_0.8.0               labeling_0.4.2             
##  [21] MatrixGenerics_1.6.0        Rdpack_2.3.1               
##  [23] GenomeInfoDbData_1.2.7      farver_2.1.1               
##  [25] bit64_4.0.5                 parallelly_1.32.0          
##  [27] vctrs_0.4.1                 generics_0.1.3             
##  [29] ipred_0.9-13                xfun_0.31                  
##  [31] R6_2.5.1                    doParallel_1.0.17          
##  [33] locfit_1.5-9.5              bitops_1.0-7               
##  [35] cachem_1.0.6                DelayedArray_0.20.0        
##  [37] assertthat_0.2.1            BiocIO_1.4.0               
##  [39] scales_1.2.0                nnet_7.3-16                
##  [41] gtable_0.3.0                globals_0.15.1             
##  [43] timeDate_3043.102           rlang_1.0.4                
##  [45] genefilter_1.76.0           splines_4.1.2              
##  [47] rstatix_0.7.0               ModelMetrics_1.2.2.2       
##  [49] broom_1.0.0                 yaml_2.3.5                 
##  [51] reshape2_1.4.4              abind_1.4-5                
##  [53] modelr_0.1.8                backports_1.4.1            
##  [55] tools_4.1.2                 lava_1.6.10                
##  [57] ellipsis_0.3.2              jquerylib_0.1.4            
##  [59] RColorBrewer_1.1-3          Rcpp_1.0.9                 
##  [61] progress_1.2.2              zlibbioc_1.40.0            
##  [63] RCurl_1.98-1.7              prettyunits_1.1.1          
##  [65] rpart_4.1-15                SummarizedExperiment_1.24.0
##  [67] haven_2.5.0                 fs_1.5.2                   
##  [69] magrittr_2.0.3              data.table_1.14.2          
##  [71] reprex_2.0.1                matrixStats_0.62.0         
##  [73] hms_1.1.1                   evaluate_0.15              
##  [75] xtable_1.8-4                pbkrtest_0.5.1             
##  [77] RhpcBLASctl_0.21-247.1      XML_3.99-0.10              
##  [79] readxl_1.4.0                shape_1.4.6                
##  [81] compiler_4.1.2              KernSmooth_2.23-20         
##  [83] crayon_1.5.1                minqa_1.2.4                
##  [85] htmltools_0.5.2             mgcv_1.8-38                
##  [87] tzdb_0.3.0                  geneplotter_1.72.0         
##  [89] lubridate_1.8.0             DBI_1.1.3                  
##  [91] dbplyr_2.2.0                MASS_7.3-54                
##  [93] boot_1.3-28                 car_3.1-0                  
##  [95] cli_3.3.0                   rbibutils_2.2.8            
##  [97] parallel_4.1.2              gower_1.0.0                
##  [99] pkgconfig_2.0.3             GenomicAlignments_1.30.0   
## [101] recipes_1.0.1               xml2_1.3.3                 
## [103] foreach_1.5.2               annotate_1.72.0            
## [105] bslib_0.3.1                 hardhat_1.2.0              
## [107] XVector_0.34.0              prodlim_2019.11.13         
## [109] rvest_1.0.2                 digest_0.6.29              
## [111] Biostrings_2.62.0           cellranger_1.1.0           
## [113] restfulr_0.0.15             Rsamtools_2.10.0           
## [115] gtools_3.9.3                rjson_0.2.21               
## [117] nloptr_2.0.3                lifecycle_1.0.1            
## [119] nlme_3.1-153                jsonlite_1.8.0             
## [121] aod_1.3.2                   carData_3.0-5              
## [123] fansi_1.0.3                 pillar_1.7.0               
## [125] KEGGREST_1.34.0             fastmap_1.1.0              
## [127] httr_1.4.3                  survival_3.2-13            
## [129] glue_1.6.2                  png_0.1-7                  
## [131] iterators_1.0.14            bit_4.0.4                  
## [133] class_7.3-19                stringi_1.7.8              
## [135] sass_0.4.1                  blob_1.2.3                 
## [137] DESeq2_1.34.0               caTools_1.18.2             
## [139] memoise_2.0.1               future.apply_1.9.0

Differential expression in LBD samples

Kimberly Olney

01/20/2022