Differential Expression Analysis

# Install or update BiocManager
if (!requireNamespace("BiocManager", quietly = TRUE))
  install.packages("BiocManager")

# Install or update Bioconductor to version '3.16'
BiocManager::install

## function (pkgs = character(), ..., site_repository = character(), 
##     update = TRUE, ask = TRUE, checkBuilt = FALSE, force = FALSE, 
##     version = BiocManager::version()) 
## {
##     stopifnot(is.character(pkgs), !anyNA(pkgs), .install_validate_dots(...), 
##         length(site_repository) <= 1L, is.character(site_repository), 
##         !any(is.na(site_repository)), is.logical(update), length(update) == 
##             1L, !is.na(update), is.logical(ask), length(ask) == 
##             1L, !is.na(ask), is.logical(checkBuilt), length(checkBuilt) == 
##             1L, !is.na(checkBuilt), length(version) == 1L || 
##             inherits(version, "version_sentinel"))
##     version <- .version_validate(version)
##     inst <- installed.packages()
##     if (!"BiocVersion" %in% rownames(inst)) {
##         pkgs <- unique(c("BiocVersion", pkgs))
##     }
##     cmp <- .version_compare(version, version())
##     action <- if (cmp < 0) 
##         "Downgrade"
##     else "Upgrade"
##     repos <- .repositories(site_repository, version = version, 
##         ...)
##     vout <- .valid_out_of_date_pkgs(pkgs = inst, repos = repos, 
##         ..., checkBuilt = checkBuilt, site_repository = site_repository)
##     if (cmp != 0L) {
##         pkgs <- unique(c("BiocVersion", pkgs))
##         valist <- .valid_result(vout, pkgs = inst)
##         npkgs <- .install_n_invalid_pkgs(valist) + length(pkgs)
##         if (!length(pkgs) - 1L) {
##             .install_ask_up_or_down_grade(version, npkgs, cmp, 
##                 ask) || .stop(paste0("Bioconductor version not changed by 'install()'", 
##                 if (!interactive() && isTRUE(ask)) 
##                   "; in non-interactive sessions use 'ask = FALSE'"))
##         }
##         else {
##             fmt <- paste(c("To use Bioconductor version '%s', first %s %d packages with", 
##                 "\n    BiocManager::install(version = '%s')"), 
##                 collapse = "")
##             action <- tolower(action)
##             .stop(fmt, version, action, npkgs, version, wrap. = FALSE)
##         }
##     }
##     .message(.version_string(version))
##     pkgs <- .install(pkgs, vout[["out_of_date"]], instPkgs = inst, 
##         repos = repos, update = update, ask = ask, force = force, 
##         ...)
##     if (update && cmp == 0L) {
##         .install_update(repos, ask, checkBuilt = checkBuilt, 
##             ...)
##     }
##     else if (cmp != 0L) {
##         .install_updated_version(valist, update, vout[["out_of_date"]], 
##             inst, repos, ask = ask, force = force, ...)
##     }
##     invisible(pkgs)
## }
## <bytecode: 0x000001de19e27470>
## <environment: namespace:BiocManager>

# Install DESeq2
BiocManager::install("DESeq2")

## Bioconductor version 3.17 (BiocManager 1.30.20), R 4.3.0 (2023-04-21 ucrt)

## Warning: package(s) not installed when version(s) same as or greater than current; use
##   `force = TRUE` to re-install: 'DESeq2'

## Old packages: 'Matrix'

# Install ggplot2
BiocManager::install("ggplot2")

## Bioconductor version 3.17 (BiocManager 1.30.20), R 4.3.0 (2023-04-21 ucrt)

## Warning: package(s) not installed when version(s) same as or greater than current; use
##   `force = TRUE` to re-install: 'ggplot2'

## Old packages: 'Matrix'

# Install ggplot2
BiocManager::install("tidyr")

## Bioconductor version 3.17 (BiocManager 1.30.20), R 4.3.0 (2023-04-21 ucrt)

## Warning: package(s) not installed when version(s) same as or greater than current; use
##   `force = TRUE` to re-install: 'tidyr'

## Old packages: 'Matrix'

# Load necessary libraries

library(DESeq2)

## Loading required package: S4Vectors

## Loading required package: stats4

## Loading required package: BiocGenerics

## 
## Attaching package: 'BiocGenerics'

## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs

## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unsplit, which.max, which.min

## 
## Attaching package: 'S4Vectors'

## The following object is masked from 'package:utils':
## 
##     findMatches

## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname

## Loading required package: IRanges

## 
## Attaching package: 'IRanges'

## The following object is masked from 'package:grDevices':
## 
##     windows

## Loading required package: GenomicRanges

## Loading required package: GenomeInfoDb

## Loading required package: SummarizedExperiment

## Loading required package: MatrixGenerics

## Loading required package: matrixStats

## 
## Attaching package: 'MatrixGenerics'

## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars

## Loading required package: Biobase

## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.

## 
## Attaching package: 'Biobase'

## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians

## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians

## Warning: replacing previous import 'S4Arrays::read_block' by
## 'DelayedArray::read_block' when loading 'SummarizedExperiment'

library(ggplot2)
library(tidyr)

## 
## Attaching package: 'tidyr'

## The following object is masked from 'package:S4Vectors':
## 
##     expand

i) Differential gene expression analysis of given file. Diagnosis is the only factor

# Set the path to the input directory
st_path_input <- "C:/Users/smath/Desktop/omics_cw/input_part2"

# Read the count data from the file
countData <- read.table(file.path(st_path_input, "GSE64018_countlevel_12asd_12ctl_edited.txt"), header = TRUE, row.names = 1)

# Display the first 10 rows of countData
head(countData, 10)

##                 AN02987 AN04682 UMB5278 AN01570 AN00493 AN12457 AN08166 AN08792
## ENSG00000000003     155     163     138      65      84      93      54      37
## ENSG00000000005       0      13       2       3       4       0       1       2
## ENSG00000000419     432     447     222     238     211     140     209      75
## ENSG00000000457     321     186     198     243     220     177     210      58
## ENSG00000000460     118      83      92      94      85     104      76      38
## ENSG00000000938     301      93      30      31     105     112      21      14
## ENSG00000000971     984     264      76      87     113     158     128     146
## ENSG00000001036     387     176     118     138     226     142      99      60
## ENSG00000001084    1336     705     540     743    1472     968     651     244
## ENSG00000001167     776     865     521     687     898     562     454     257
##                 AN01971 AN03632 AN08043 AN09714 AN17425 AN07444 UMB4590 AN10833
## ENSG00000000003       8     247      47     116     154     105      87      33
## ENSG00000000005       1       5       2       1       2       0       4       4
## ENSG00000000419      21     555     192     150     395     355     358      86
## ENSG00000000457      52     590     162     177     388     244     266     134
## ENSG00000000460      20     277      57     107     134      90     113      44
## ENSG00000000938      25     141      21     110      33      44      56      31
## ENSG00000000971      51     506      62     571     235     180     198      86
## ENSG00000001036      35     353      83     159     206     141     161      94
## ENSG00000001084     222    2202     492    1065     999     786     939     426
## ENSG00000001167     243    1167     378     806     784     579     840     331
##                 AN14757 AN19760 AN12137 UMB5079 AN08161 AN08677 UMB4842 AN13295
## ENSG00000000003     144      93     140      72      41     147     107     130
## ENSG00000000005       2       4       5       3       0       0       3       1
## ENSG00000000419     610     209     454     280     137     381     268     327
## ENSG00000000457     375     229     450     247     158     253     199     322
## ENSG00000000460     115      85     247      97      63      82      70     163
## ENSG00000000938      26      34      80      52      56      36      23      41
## ENSG00000000971     282     141     394     176     231     155     154     221
## ENSG00000001036     205     172     307     138     104     173     136     235
## ENSG00000001084    1133     848    1163     830     630    1106     795    1196
## ENSG00000001167     759     581     979     659     415     679     644     918

# Read metadata table 
colData <- read.table(file.path(st_path_input, "metadata.txt") , header = TRUE)
head(colData, 10)

##           regionid diagnosis age Sex rin brainbank   pmi totalreads.picard
## AN02987 ba41-42-22       ASD  15   M 8.6       ATP 30.80          45506260
## AN04682 ba41-42-22       ASD  15   M 8.2       ATP 23.23          54642554
## UMB5278 ba41-42-22       ASD  16   F 7.8     NICHD 13.00          39374242
## AN01570 ba41-42-22       ASD  18   F 7.1       ATP  6.75          45359064
## AN00493 ba41-42-22       ASD  27   M 7.3       ATP  8.30          39989410
## AN12457 ba41-42-22       ASD  29   F 6.0       ATP 17.83          43809852
## AN08166 ba41-42-22       ASD  29   M 6.4       ATP 43.25          27996353
## AN08792 ba41-42-22       ASD  30   M 5.1       ATP 20.30          21602464
## AN01971 ba41-42-22       ASD  39   M 2.9       ATP 31.50          14009235
## AN03632 ba41-42-22       ASD  49   F 8.1       ATP 21.08          84218275
##         aligned.reads.picard hq.aligned.reads.picard pf.all.bases.picard
## AN02987             45506260                40502727          4550389520
## AN04682             54642554                50421547          5463948661
## UMB5278             39374242                35878279          3937184301
## AN01570             45359064                42050951          4535639256
## AN00493             39989410                35730549          3998687269
## AN12457             43809852                40938207          4380656349
## AN08166             27996353                25895921          2799474872
## AN08792             21602464                20148125          2160114475
## AN01971             14009235                12298443          1400830353
## AN03632             84218275                77408176          8421328282
##         coding.bases.picard utr.bases.picard intronic.bases.picard
## AN02987          1388493945       1884703079            1026654593
## AN04682          1163882243       1632881234            1959276248
## UMB5278           875666592       1292786624            1472794507
## AN01570          1318635297       1495560716            1455524346
## AN00493           988975069       1285458462            1462609241
## AN12457           978687230       1151872810            1922767554
## AN08166           702818287        954701435             975637636
## AN08792           407890700        574331115             859937444
## AN01971           369250054        402098037             474899394
## AN03632          1827033959       2557873886            3403006316
##         intergenic.bases.picard median.cv.coverage.picard
## AN02987               250537903                  0.514579
## AN04682               707908936                  0.544495
## UMB5278               295936578                  0.515967
## AN01570               265918897                  0.498845
## AN00493               261644497                  0.572530
## AN12457               327328755                  0.585500
## AN08166               166317514                  0.525250
## AN08792               317955216                  0.536986
## AN01971               154582868                  0.672936
## AN03632               633414121                  0.540069
##         median.5prime.bias.picard median.3prime.bias.picard
## AN02987                  0.458133                  0.758689
## AN04682                  0.456012                  0.728795
## UMB5278                  0.453488                  0.740362
## AN01570                  0.490352                  0.735198
## AN00493                  0.449714                  0.692505
## AN12457                  0.450107                  0.645537
## AN08166                  0.456507                  0.742371
## AN08792                  0.454813                  0.735531
## AN01971                  0.412661                  0.624652
## AN03632                  0.469228                  0.698454
##         median.5to3prime.bias.picard at.dropout.picard gc.dropout.picard
## AN02987                     0.615196          30.92868          0.883464
## AN04682                     0.639601          25.87652          0.437608
## UMB5278                     0.585323          21.98184          1.209633
## AN01570                     0.654083          27.42288          0.138467
## AN00493                     0.651819          33.23593          0.343162
## AN12457                     0.669651          26.50754          0.000411
## AN08166                     0.594834          18.73335          1.704897
## AN08792                     0.614396          25.25016          0.247464
## AN01971                     0.645283          33.18674          0.068047
## AN03632                     0.655312          19.23817          1.481120
##         propexonicreads.htsc
## AN02987            0.3263685
## AN04682            0.2915453
## UMB5278            0.2479446
## AN01570            0.2362229
## AN00493            0.2690841
## AN12457            0.2053809
## AN08166            0.2320067
## AN08792            0.2981747
## AN01971            0.3160508
## AN03632            0.2471129

# Assuming 'diagnosis' is the only factor 

# Convert 'diagnosis' to a factor
colData$diagnosis <- factor(colData$diagnosis)

# Create DESeq2 dataset
dds <- DESeqDataSetFromMatrix(countData = countData, colData = colData, design = ~ diagnosis)

# Check the levels of 'diagnosis'
factor_levels <- levels(dds$diagnosis)

# Print the levels
print(factor_levels)

## [1] "ASD" "CTL"

# Relevel the 'diagnosis' factor variable
dds$diagnosis <- relevel(dds$diagnosis, ref = "CTL")

# Check the levels after releveling
factor_levels <- levels(dds$diagnosis)
print(factor_levels)

## [1] "CTL" "ASD"

# Perform differential expression analysis
dds <- DESeq(dds)

## estimating size factors

## estimating dispersions

## gene-wise dispersion estimates

## mean-dispersion relationship

## final dispersion estimates

## fitting model and testing

## -- replacing outliers and refitting for 558 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)

## estimating dispersions

## fitting model and testing

#get the results and store 
res <- results(dds)

#visualize
head(res, 10)

## log2 fold change (MLE): diagnosis ASD vs CTL 
## Wald test p-value: diagnosis ASD vs CTL 
## DataFrame with 10 rows and 6 columns
##                  baseMean log2FoldChange     lfcSE       stat     pvalue
##                 <numeric>      <numeric> <numeric>  <numeric>  <numeric>
## ENSG00000000003  90.51022      0.0289053  0.223654  0.1292411 0.89716684
## ENSG00000000005   2.39821      0.4441329  0.579575  0.7663085 0.44349279
## ENSG00000000419 248.03262     -0.3006463  0.234590 -1.2815836 0.19998876
## ENSG00000000457 218.69538     -0.1873677  0.145848 -1.2846771 0.19890516
## ENSG00000000460  90.22595     -0.0078534  0.154794 -0.0507344 0.95953720
## ENSG00000000938  57.60287      0.9864855  0.329591  2.9930592 0.00276196
## ENSG00000000971 215.34037      0.4745321  0.340551  1.3934251 0.16349127
## ENSG00000001036 151.49749      0.0596814  0.166137  0.3592301 0.71942299
## ENSG00000001084 812.72561      0.0689150  0.161438  0.4268830 0.66946455
## ENSG00000001167 611.40011      0.1014584  0.107023  0.9480022 0.34312834
##                      padj
##                 <numeric>
## ENSG00000000003 0.9568729
## ENSG00000000005 0.6837449
## ENSG00000000419 0.4556007
## ENSG00000000457 0.4545321
## ENSG00000000460 0.9845328
## ENSG00000000938 0.0623818
## ENSG00000000971 0.4093214
## ENSG00000001036 0.8656915
## ENSG00000001084 0.8368799
## ENSG00000001167 0.5983593

ii) Report how many genes are differential expressed (up and down-regulated) at an adjusted p-value cut-off of 0.05.

# Extract up-regulated genes
upregulated_genes <- subset(res, padj <= 0.05 & log2FoldChange > 0)

# Extract down-regulated genes
downregulated_genes <- subset(res, padj <= 0.05 & log2FoldChange < 0)

# Count the number of up-regulated and down-regulated genes
num_upregulated <- nrow(upregulated_genes)
num_downregulated <- nrow(downregulated_genes)

# Print the results
cat("Number of up-regulated genes:", num_upregulated, "\n")

## Number of up-regulated genes: 1145

cat("Number of down-regulated genes:", num_downregulated, "\n")

## Number of down-regulated genes: 69

iii) Top 10 genes with highest log2foldchange

# Convert DESeqResults object to a data frame
res_df <- as.data.frame(res)

# Subset the top 10 significant genes
top_genes <- subset(res_df, padj < 0.05)
top_genes <- top_genes[order(-abs(top_genes$log2FoldChange)), ]
top_genes <- head(top_genes, 10)

# Add gene names as a column in the data frame
top_genes$gene_name <- rownames(top_genes)

# Filter out rows with missing values
res_df_filtered <- na.omit(res_df)

# Create the volcano plot
volcano_plot <- ggplot(res_df_filtered, aes(x = log2FoldChange, y = -log10(padj))) +
  geom_point(aes(color = ifelse(padj < 0.05, "significant", "nonsignificant")), alpha = 0.7) +
  geom_text(data = top_genes, aes(label = gene_name), hjust = 0, vjust = 0, size = 2, color = "black") +
  labs(x = "Log2 Fold Change", y = "-log10(Adjusted p-value)",
       title = "Volcano Plot of Differential Expression") +
  theme_minimal() +
  theme(plot.margin = margin(0.5, 0.5, 0.5, 0.5, "cm"))

# Display the volcano plot
print(volcano_plot)

## iV) Counts for gene SRRM4 represented by box plot

# Extract the counts for the gene ENSG00000139767
gene_counts <- countData["ENSG00000139767", ]

# Convert the gene counts to a numeric vector
gene_counts <- as.numeric(gene_counts)

# Create a data frame with the diagnosis and gene counts
data <- data.frame(Diagnosis = colData$diagnosis, Gene_Count = gene_counts)

# Create a box plot with jittered points for counts and summary statistic
box_plot <- ggplot(data, aes(x = Diagnosis, y = Gene_Count)) +
  geom_boxplot() +
  geom_jitter(width = 0.2, height = 0, alpha = 0.5) +
  stat_summary(fun = "mean", geom = "point", shape = 20, size = 3, color = "red") +
  labs(x = "Diagnosis", y = "Gene Count", title = "Gene Counts for ENSG00000139767 by Diagnosis") +
  theme_minimal()

# Display the box plot with scatter plot and summary statistic
print(box_plot)

# I have used to use the raw counts instead of adjusted FPKM as to show the expression levels between the samples, raw counts can be used. To identify pattern or relative expression we need to use the adjusted FPKM counts. However we could see from the differential analysis the particular gene ‘SRRM4’ is not differentially expressed between CTL and ASD factors and only slight decrease in gene expression levels. It should be noted that we are assuming that the diagnosis is the only factor here. DESeq2 performs normalization as part of its analysis pipeline. DESeq2 uses a method called “median of ratios” normalization, which accounts for differences in library sizes and composition across samples.

# Retrieve the DESeq results
deseq_results <- results(dds)

# Subset the results for the gene 'ENSG00000139767'
srrm4_results <- deseq_results[rownames(deseq_results) == 'ENSG00000139767', ]

# Retrieve the padj value for 'SRRM4'
padj_value <- srrm4_results$padj
log2fold_change <- srrm4_results$log2FoldChange

# Print the padj value
print(padj_value)

## [1] 0.5355527

print(log2fold_change)

## [1] -0.2797976

V) Data processing, analysis pipelines and other sample handling ccan be the reason for the discrepencies between both samples. It should be noteed that the paper p value is significannt only of the stastical analysis performed using median. There are outliers. It is not clear that the factors like age and gender is considered.

VI) As mentioned above other biological factors has not been considered. The design can be improved using other factors like age and gender. Also, envioronmental factors can play a role too. We can add all those factors to improve the analysis.

##VII) Creating the shiny app

library(shiny)
library(ggplot2)
ui <- fluidPage(
  sidebarLayout(
    sidebarPanel(
      textInput("gene", "Enter Gene Name/ID"),
      actionButton("plotButton", "Plot")
    ),
    mainPanel(
      plotOutput("genePlot")
    )
  )
)

server <- function(input, output) {
  observeEvent(input$plotButton, {
    # Get the entered gene name or ID
    gene <- input$gene
    
    # Extract the counts for the gene
    gene_counts <- countData[gene, ]
    
    # Convert the gene counts to a numeric vector
    gene_counts <- as.numeric(gene_counts)
    
    # Create a data frame with the diagnosis and gene counts
    data <- data.frame(Diagnosis = colData$diagnosis, Gene_Count = gene_counts)
    
    # Create the plot
    output$genePlot <- renderPlot({
      ggplot(data, aes(x = Diagnosis, y = Gene_Count)) +
        geom_boxplot() +
        geom_jitter(width = 0.2, height = 0, alpha = 0.5) +
        stat_summary(fun = "mean", geom = "point", shape = 20, size = 3, color = "red") +
        labs(x = "Diagnosis", y = "Gene Count", title = paste("Gene Counts for", gene, "by Diagnosis")) +
        theme_minimal()
    })
  })
}

shinyApp(ui, server)

Shiny applications not supported in static R Markdown documents

Differential Expression Analysis

Mathulah Sriskandarajah

2023-05-10

i) Differential gene expression analysis of given file. Diagnosis is the only factor

ii) Report how many genes are differential expressed (up and down-regulated) at an adjusted p-value cut-off of 0.05.

iii) Top 10 genes with highest log2foldchange

VI) As mentioned above other biological factors has not been considered. The design can be improved using other factors like age and gender. Also, envioronmental factors can play a role too. We can add all those factors to improve the analysis.