# Install or update BiocManager
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
# Install or update Bioconductor to version '3.16'
BiocManager::install
## function (pkgs = character(), ..., site_repository = character(),
## update = TRUE, ask = TRUE, checkBuilt = FALSE, force = FALSE,
## version = BiocManager::version())
## {
## stopifnot(is.character(pkgs), !anyNA(pkgs), .install_validate_dots(...),
## length(site_repository) <= 1L, is.character(site_repository),
## !any(is.na(site_repository)), is.logical(update), length(update) ==
## 1L, !is.na(update), is.logical(ask), length(ask) ==
## 1L, !is.na(ask), is.logical(checkBuilt), length(checkBuilt) ==
## 1L, !is.na(checkBuilt), length(version) == 1L ||
## inherits(version, "version_sentinel"))
## version <- .version_validate(version)
## inst <- installed.packages()
## if (!"BiocVersion" %in% rownames(inst)) {
## pkgs <- unique(c("BiocVersion", pkgs))
## }
## cmp <- .version_compare(version, version())
## action <- if (cmp < 0)
## "Downgrade"
## else "Upgrade"
## repos <- .repositories(site_repository, version = version,
## ...)
## vout <- .valid_out_of_date_pkgs(pkgs = inst, repos = repos,
## ..., checkBuilt = checkBuilt, site_repository = site_repository)
## if (cmp != 0L) {
## pkgs <- unique(c("BiocVersion", pkgs))
## valist <- .valid_result(vout, pkgs = inst)
## npkgs <- .install_n_invalid_pkgs(valist) + length(pkgs)
## if (!length(pkgs) - 1L) {
## .install_ask_up_or_down_grade(version, npkgs, cmp,
## ask) || .stop(paste0("Bioconductor version not changed by 'install()'",
## if (!interactive() && isTRUE(ask))
## "; in non-interactive sessions use 'ask = FALSE'"))
## }
## else {
## fmt <- paste(c("To use Bioconductor version '%s', first %s %d packages with",
## "\n BiocManager::install(version = '%s')"),
## collapse = "")
## action <- tolower(action)
## .stop(fmt, version, action, npkgs, version, wrap. = FALSE)
## }
## }
## .message(.version_string(version))
## pkgs <- .install(pkgs, vout[["out_of_date"]], instPkgs = inst,
## repos = repos, update = update, ask = ask, force = force,
## ...)
## if (update && cmp == 0L) {
## .install_update(repos, ask, checkBuilt = checkBuilt,
## ...)
## }
## else if (cmp != 0L) {
## .install_updated_version(valist, update, vout[["out_of_date"]],
## inst, repos, ask = ask, force = force, ...)
## }
## invisible(pkgs)
## }
## <bytecode: 0x000001de19e27470>
## <environment: namespace:BiocManager>
# Install DESeq2
BiocManager::install("DESeq2")
## Bioconductor version 3.17 (BiocManager 1.30.20), R 4.3.0 (2023-04-21 ucrt)
## Warning: package(s) not installed when version(s) same as or greater than current; use
## `force = TRUE` to re-install: 'DESeq2'
## Old packages: 'Matrix'
# Install ggplot2
BiocManager::install("ggplot2")
## Bioconductor version 3.17 (BiocManager 1.30.20), R 4.3.0 (2023-04-21 ucrt)
## Warning: package(s) not installed when version(s) same as or greater than current; use
## `force = TRUE` to re-install: 'ggplot2'
## Old packages: 'Matrix'
# Install ggplot2
BiocManager::install("tidyr")
## Bioconductor version 3.17 (BiocManager 1.30.20), R 4.3.0 (2023-04-21 ucrt)
## Warning: package(s) not installed when version(s) same as or greater than current; use
## `force = TRUE` to re-install: 'tidyr'
## Old packages: 'Matrix'
# Load necessary libraries
library(DESeq2)
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, aperm, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
## get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
## table, tapply, union, unique, unsplit, which.max, which.min
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:utils':
##
## findMatches
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following object is masked from 'package:grDevices':
##
## windows
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
##
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
##
## colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
## colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
## colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
## colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
## colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
## colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
## colWeightedMeans, colWeightedMedians, colWeightedSds,
## colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
## rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
## rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
## rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
## rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
## rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
## rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
## rowWeightedSds, rowWeightedVars
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
##
## rowMedians
## The following objects are masked from 'package:matrixStats':
##
## anyMissing, rowMedians
## Warning: replacing previous import 'S4Arrays::read_block' by
## 'DelayedArray::read_block' when loading 'SummarizedExperiment'
library(ggplot2)
library(tidyr)
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:S4Vectors':
##
## expand
# Set the path to the input directory
st_path_input <- "C:/Users/smath/Desktop/omics_cw/input_part2"
# Read the count data from the file
countData <- read.table(file.path(st_path_input, "GSE64018_countlevel_12asd_12ctl_edited.txt"), header = TRUE, row.names = 1)
# Display the first 10 rows of countData
head(countData, 10)
## AN02987 AN04682 UMB5278 AN01570 AN00493 AN12457 AN08166 AN08792
## ENSG00000000003 155 163 138 65 84 93 54 37
## ENSG00000000005 0 13 2 3 4 0 1 2
## ENSG00000000419 432 447 222 238 211 140 209 75
## ENSG00000000457 321 186 198 243 220 177 210 58
## ENSG00000000460 118 83 92 94 85 104 76 38
## ENSG00000000938 301 93 30 31 105 112 21 14
## ENSG00000000971 984 264 76 87 113 158 128 146
## ENSG00000001036 387 176 118 138 226 142 99 60
## ENSG00000001084 1336 705 540 743 1472 968 651 244
## ENSG00000001167 776 865 521 687 898 562 454 257
## AN01971 AN03632 AN08043 AN09714 AN17425 AN07444 UMB4590 AN10833
## ENSG00000000003 8 247 47 116 154 105 87 33
## ENSG00000000005 1 5 2 1 2 0 4 4
## ENSG00000000419 21 555 192 150 395 355 358 86
## ENSG00000000457 52 590 162 177 388 244 266 134
## ENSG00000000460 20 277 57 107 134 90 113 44
## ENSG00000000938 25 141 21 110 33 44 56 31
## ENSG00000000971 51 506 62 571 235 180 198 86
## ENSG00000001036 35 353 83 159 206 141 161 94
## ENSG00000001084 222 2202 492 1065 999 786 939 426
## ENSG00000001167 243 1167 378 806 784 579 840 331
## AN14757 AN19760 AN12137 UMB5079 AN08161 AN08677 UMB4842 AN13295
## ENSG00000000003 144 93 140 72 41 147 107 130
## ENSG00000000005 2 4 5 3 0 0 3 1
## ENSG00000000419 610 209 454 280 137 381 268 327
## ENSG00000000457 375 229 450 247 158 253 199 322
## ENSG00000000460 115 85 247 97 63 82 70 163
## ENSG00000000938 26 34 80 52 56 36 23 41
## ENSG00000000971 282 141 394 176 231 155 154 221
## ENSG00000001036 205 172 307 138 104 173 136 235
## ENSG00000001084 1133 848 1163 830 630 1106 795 1196
## ENSG00000001167 759 581 979 659 415 679 644 918
# Read metadata table
colData <- read.table(file.path(st_path_input, "metadata.txt") , header = TRUE)
head(colData, 10)
## regionid diagnosis age Sex rin brainbank pmi totalreads.picard
## AN02987 ba41-42-22 ASD 15 M 8.6 ATP 30.80 45506260
## AN04682 ba41-42-22 ASD 15 M 8.2 ATP 23.23 54642554
## UMB5278 ba41-42-22 ASD 16 F 7.8 NICHD 13.00 39374242
## AN01570 ba41-42-22 ASD 18 F 7.1 ATP 6.75 45359064
## AN00493 ba41-42-22 ASD 27 M 7.3 ATP 8.30 39989410
## AN12457 ba41-42-22 ASD 29 F 6.0 ATP 17.83 43809852
## AN08166 ba41-42-22 ASD 29 M 6.4 ATP 43.25 27996353
## AN08792 ba41-42-22 ASD 30 M 5.1 ATP 20.30 21602464
## AN01971 ba41-42-22 ASD 39 M 2.9 ATP 31.50 14009235
## AN03632 ba41-42-22 ASD 49 F 8.1 ATP 21.08 84218275
## aligned.reads.picard hq.aligned.reads.picard pf.all.bases.picard
## AN02987 45506260 40502727 4550389520
## AN04682 54642554 50421547 5463948661
## UMB5278 39374242 35878279 3937184301
## AN01570 45359064 42050951 4535639256
## AN00493 39989410 35730549 3998687269
## AN12457 43809852 40938207 4380656349
## AN08166 27996353 25895921 2799474872
## AN08792 21602464 20148125 2160114475
## AN01971 14009235 12298443 1400830353
## AN03632 84218275 77408176 8421328282
## coding.bases.picard utr.bases.picard intronic.bases.picard
## AN02987 1388493945 1884703079 1026654593
## AN04682 1163882243 1632881234 1959276248
## UMB5278 875666592 1292786624 1472794507
## AN01570 1318635297 1495560716 1455524346
## AN00493 988975069 1285458462 1462609241
## AN12457 978687230 1151872810 1922767554
## AN08166 702818287 954701435 975637636
## AN08792 407890700 574331115 859937444
## AN01971 369250054 402098037 474899394
## AN03632 1827033959 2557873886 3403006316
## intergenic.bases.picard median.cv.coverage.picard
## AN02987 250537903 0.514579
## AN04682 707908936 0.544495
## UMB5278 295936578 0.515967
## AN01570 265918897 0.498845
## AN00493 261644497 0.572530
## AN12457 327328755 0.585500
## AN08166 166317514 0.525250
## AN08792 317955216 0.536986
## AN01971 154582868 0.672936
## AN03632 633414121 0.540069
## median.5prime.bias.picard median.3prime.bias.picard
## AN02987 0.458133 0.758689
## AN04682 0.456012 0.728795
## UMB5278 0.453488 0.740362
## AN01570 0.490352 0.735198
## AN00493 0.449714 0.692505
## AN12457 0.450107 0.645537
## AN08166 0.456507 0.742371
## AN08792 0.454813 0.735531
## AN01971 0.412661 0.624652
## AN03632 0.469228 0.698454
## median.5to3prime.bias.picard at.dropout.picard gc.dropout.picard
## AN02987 0.615196 30.92868 0.883464
## AN04682 0.639601 25.87652 0.437608
## UMB5278 0.585323 21.98184 1.209633
## AN01570 0.654083 27.42288 0.138467
## AN00493 0.651819 33.23593 0.343162
## AN12457 0.669651 26.50754 0.000411
## AN08166 0.594834 18.73335 1.704897
## AN08792 0.614396 25.25016 0.247464
## AN01971 0.645283 33.18674 0.068047
## AN03632 0.655312 19.23817 1.481120
## propexonicreads.htsc
## AN02987 0.3263685
## AN04682 0.2915453
## UMB5278 0.2479446
## AN01570 0.2362229
## AN00493 0.2690841
## AN12457 0.2053809
## AN08166 0.2320067
## AN08792 0.2981747
## AN01971 0.3160508
## AN03632 0.2471129
# Assuming 'diagnosis' is the only factor
# Convert 'diagnosis' to a factor
colData$diagnosis <- factor(colData$diagnosis)
# Create DESeq2 dataset
dds <- DESeqDataSetFromMatrix(countData = countData, colData = colData, design = ~ diagnosis)
# Check the levels of 'diagnosis'
factor_levels <- levels(dds$diagnosis)
# Print the levels
print(factor_levels)
## [1] "ASD" "CTL"
# Relevel the 'diagnosis' factor variable
dds$diagnosis <- relevel(dds$diagnosis, ref = "CTL")
# Check the levels after releveling
factor_levels <- levels(dds$diagnosis)
print(factor_levels)
## [1] "CTL" "ASD"
# Perform differential expression analysis
dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 558 genes
## -- DESeq argument 'minReplicatesForReplace' = 7
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
#get the results and store
res <- results(dds)
#visualize
head(res, 10)
## log2 fold change (MLE): diagnosis ASD vs CTL
## Wald test p-value: diagnosis ASD vs CTL
## DataFrame with 10 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSG00000000003 90.51022 0.0289053 0.223654 0.1292411 0.89716684
## ENSG00000000005 2.39821 0.4441329 0.579575 0.7663085 0.44349279
## ENSG00000000419 248.03262 -0.3006463 0.234590 -1.2815836 0.19998876
## ENSG00000000457 218.69538 -0.1873677 0.145848 -1.2846771 0.19890516
## ENSG00000000460 90.22595 -0.0078534 0.154794 -0.0507344 0.95953720
## ENSG00000000938 57.60287 0.9864855 0.329591 2.9930592 0.00276196
## ENSG00000000971 215.34037 0.4745321 0.340551 1.3934251 0.16349127
## ENSG00000001036 151.49749 0.0596814 0.166137 0.3592301 0.71942299
## ENSG00000001084 812.72561 0.0689150 0.161438 0.4268830 0.66946455
## ENSG00000001167 611.40011 0.1014584 0.107023 0.9480022 0.34312834
## padj
## <numeric>
## ENSG00000000003 0.9568729
## ENSG00000000005 0.6837449
## ENSG00000000419 0.4556007
## ENSG00000000457 0.4545321
## ENSG00000000460 0.9845328
## ENSG00000000938 0.0623818
## ENSG00000000971 0.4093214
## ENSG00000001036 0.8656915
## ENSG00000001084 0.8368799
## ENSG00000001167 0.5983593
# Extract up-regulated genes
upregulated_genes <- subset(res, padj <= 0.05 & log2FoldChange > 0)
# Extract down-regulated genes
downregulated_genes <- subset(res, padj <= 0.05 & log2FoldChange < 0)
# Count the number of up-regulated and down-regulated genes
num_upregulated <- nrow(upregulated_genes)
num_downregulated <- nrow(downregulated_genes)
# Print the results
cat("Number of up-regulated genes:", num_upregulated, "\n")
## Number of up-regulated genes: 1145
cat("Number of down-regulated genes:", num_downregulated, "\n")
## Number of down-regulated genes: 69
# Convert DESeqResults object to a data frame
res_df <- as.data.frame(res)
# Subset the top 10 significant genes
top_genes <- subset(res_df, padj < 0.05)
top_genes <- top_genes[order(-abs(top_genes$log2FoldChange)), ]
top_genes <- head(top_genes, 10)
# Add gene names as a column in the data frame
top_genes$gene_name <- rownames(top_genes)
# Filter out rows with missing values
res_df_filtered <- na.omit(res_df)
# Create the volcano plot
volcano_plot <- ggplot(res_df_filtered, aes(x = log2FoldChange, y = -log10(padj))) +
geom_point(aes(color = ifelse(padj < 0.05, "significant", "nonsignificant")), alpha = 0.7) +
geom_text(data = top_genes, aes(label = gene_name), hjust = 0, vjust = 0, size = 2, color = "black") +
labs(x = "Log2 Fold Change", y = "-log10(Adjusted p-value)",
title = "Volcano Plot of Differential Expression") +
theme_minimal() +
theme(plot.margin = margin(0.5, 0.5, 0.5, 0.5, "cm"))
# Display the volcano plot
print(volcano_plot)
## iV) Counts for gene SRRM4 represented by box plot
# Extract the counts for the gene ENSG00000139767
gene_counts <- countData["ENSG00000139767", ]
# Convert the gene counts to a numeric vector
gene_counts <- as.numeric(gene_counts)
# Create a data frame with the diagnosis and gene counts
data <- data.frame(Diagnosis = colData$diagnosis, Gene_Count = gene_counts)
# Create a box plot with jittered points for counts and summary statistic
box_plot <- ggplot(data, aes(x = Diagnosis, y = Gene_Count)) +
geom_boxplot() +
geom_jitter(width = 0.2, height = 0, alpha = 0.5) +
stat_summary(fun = "mean", geom = "point", shape = 20, size = 3, color = "red") +
labs(x = "Diagnosis", y = "Gene Count", title = "Gene Counts for ENSG00000139767 by Diagnosis") +
theme_minimal()
# Display the box plot with scatter plot and summary statistic
print(box_plot)
# I have used to use the raw counts instead of adjusted FPKM as to show
the expression levels between the samples, raw counts can be used. To
identify pattern or relative expression we need to use the adjusted FPKM
counts. However we could see from the differential analysis the
particular gene ‘SRRM4’ is not differentially expressed between CTL and
ASD factors and only slight decrease in gene expression levels. It
should be noted that we are assuming that the diagnosis is the only
factor here. DESeq2 performs normalization as part of its analysis
pipeline. DESeq2 uses a method called “median of ratios” normalization,
which accounts for differences in library sizes and composition across
samples.
# Retrieve the DESeq results
deseq_results <- results(dds)
# Subset the results for the gene 'ENSG00000139767'
srrm4_results <- deseq_results[rownames(deseq_results) == 'ENSG00000139767', ]
# Retrieve the padj value for 'SRRM4'
padj_value <- srrm4_results$padj
log2fold_change <- srrm4_results$log2FoldChange
# Print the padj value
print(padj_value)
## [1] 0.5355527
print(log2fold_change)
## [1] -0.2797976
##VII) Creating the shiny app
library(shiny)
library(ggplot2)
ui <- fluidPage(
sidebarLayout(
sidebarPanel(
textInput("gene", "Enter Gene Name/ID"),
actionButton("plotButton", "Plot")
),
mainPanel(
plotOutput("genePlot")
)
)
)
server <- function(input, output) {
observeEvent(input$plotButton, {
# Get the entered gene name or ID
gene <- input$gene
# Extract the counts for the gene
gene_counts <- countData[gene, ]
# Convert the gene counts to a numeric vector
gene_counts <- as.numeric(gene_counts)
# Create a data frame with the diagnosis and gene counts
data <- data.frame(Diagnosis = colData$diagnosis, Gene_Count = gene_counts)
# Create the plot
output$genePlot <- renderPlot({
ggplot(data, aes(x = Diagnosis, y = Gene_Count)) +
geom_boxplot() +
geom_jitter(width = 0.2, height = 0, alpha = 0.5) +
stat_summary(fun = "mean", geom = "point", shape = 20, size = 3, color = "red") +
labs(x = "Diagnosis", y = "Gene Count", title = paste("Gene Counts for", gene, "by Diagnosis")) +
theme_minimal()
})
})
}
shinyApp(ui, server)