This labwork starts with reading the data and data munipulation, then plot a pca graph for dimention reduction for further data analysis; afterward, the author conduct a differential expression analysis and perform a pathway analysis to identify what are the most significant pathways. Finally, the author conclude the gdc_cases.diagnoses.tumor_stage has 10 DNA expression at 95% confident leval, among of them, the Authod present the most sigificant of 4.
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("msigdbr")
## Bioconductor version 3.16 (BiocManager 1.30.20), R 4.2.3 (2023-03-15 ucrt)
## Installing package(s) 'msigdbr'
## package 'msigdbr' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\u6977\AppData\Local\Temp\RtmpKwfmNf\downloaded_packages
## Installation paths not writeable, unable to update packages
## path: C:/Program Files/R/R-4.2.3/library
## packages:
## class, KernSmooth, lattice, MASS, Matrix, nnet, survival
## Old packages: 'cachem', 'checkmate', 'clock', 'dplyr', 'graphlayouts', 'Hmisc',
## 'rlang', 'tzdb', 'vctrs', 'xfun', 'xml2'
library(Homo.sapiens)
## Loading required package: AnnotationDbi
## Loading required package: stats4
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, aperm, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
## get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
## table, tapply, union, unique, unsplit, which.max, which.min
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: IRanges
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
##
## Attaching package: 'IRanges'
## The following object is masked from 'package:grDevices':
##
## windows
## Loading required package: OrganismDbi
## Loading required package: GenomicFeatures
## Loading required package: GenomeInfoDb
## Loading required package: GenomicRanges
## Loading required package: GO.db
##
## Loading required package: org.Hs.eg.db
##
## Loading required package: TxDb.Hsapiens.UCSC.hg19.knownGene
library(org.Hs.eg.db)
library(SummarizedExperiment)
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
##
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:Biobase':
##
## anyMissing, rowMedians
##
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
##
## colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
## colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
## colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
## colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
## colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
## colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
## colWeightedMeans, colWeightedMedians, colWeightedSds,
## colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
## rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
## rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
## rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
## rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
## rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
## rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
## rowWeightedSds, rowWeightedVars
## The following object is masked from 'package:Biobase':
##
## rowMedians
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.1 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ lubridate::%within%() masks IRanges::%within%()
## ✖ dplyr::collapse() masks IRanges::collapse()
## ✖ dplyr::combine() masks Biobase::combine(), BiocGenerics::combine()
## ✖ dplyr::count() masks matrixStats::count()
## ✖ dplyr::desc() masks IRanges::desc()
## ✖ tidyr::expand() masks S4Vectors::expand()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::first() masks S4Vectors::first()
## ✖ dplyr::lag() masks stats::lag()
## ✖ ggplot2::Position() masks BiocGenerics::Position(), base::Position()
## ✖ purrr::reduce() masks GenomicRanges::reduce(), IRanges::reduce()
## ✖ dplyr::rename() masks S4Vectors::rename()
## ✖ lubridate::second() masks S4Vectors::second()
## ✖ lubridate::second<-() masks S4Vectors::second<-()
## ✖ dplyr::select() masks OrganismDbi::select(), AnnotationDbi::select()
## ✖ dplyr::slice() masks IRanges::slice()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggfortify)
library(RColorBrewer)
options(repos = c(CRAN = "https://cran.r-project.org/"))
options(timeout = 300) # Sets the timeout to 300 seconds (5 minutes)
url <-'http://duffel.rail.bio/recount/v2/TCGA/rse_gene_skin.Rdata'
destfile <- "rse_gene_skin.Rdata"
download.file(url = url, destfile = 'rse_gene_skin.Rdata', method = 'auto',mode = 'wb')
load("rse_gene_skin.Rdata")
#gene_names <- rowData(rse_gene)$Gene.Name
library(GEOquery)
## Setting options('download.file.method.GEOquery'='auto')
## Setting options('GEOquery.inmemory.gpl'=FALSE)
library(org.Hs.eg.db)
library(limma)
##
## Attaching package: 'limma'
##
## The following object is masked from 'package:BiocGenerics':
##
## plotMA
library(edgeR)
library(Glimma)
library(Homo.sapiens)
library(gplots)
##
## Attaching package: 'gplots'
##
## The following object is masked from 'package:IRanges':
##
## space
##
## The following object is masked from 'package:S4Vectors':
##
## space
##
## The following object is masked from 'package:stats':
##
## lowess
library(dplyr)
### Gene counts
counts <- assay(rse_gene)
gene_names <- rowData(rse_gene)$Gene.Name
### Gene identifiers
genes <- rowData(rse_gene)
### Phenotype data for the samples
samples <- as.data.frame(colData(rse_gene))
### There are 864 variables in samples, 593 are completely missing
#install.packages("magrittr")
library(magrittr)
##
## Attaching package: 'magrittr'
##
## The following object is masked from 'package:purrr':
##
## set_names
##
## The following object is masked from 'package:tidyr':
##
## extract
##
## The following object is masked from 'package:GenomicRanges':
##
## subtract
samples %>%
is.na() %>%
colSums() %>%
table()
## .
## 0 1 2 3 8 9 10 11 12 13 14 15 22 28 29 33 35 39 45 50
## 186 2 1 3 7 1 2 3 1 2 1 1 2 1 2 1 1 2 1 1
## 112 150 158 160 161 162 163 168 190 195 200 214 218 222 251 258 300 319 320 329
## 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 3
## 349 360 364 369 402 409 430 436 466 472 473
## 1 2 1 1 1 1 1 1 1 16 593
library(msigdbr)
# Bioconductor package for MSigDB gene sets
Hs.c2 <- msigdbr(species = "Homo sapiens", category = "C2")
knitr::opts_chunk$set(echo = TRUE)
counts <- assay(rse_gene)
genes <- rowData(rse_gene)
samples <- as.data.frame(colData(rse_gene))
TCGA <- DGEList(counts = counts, samples = samples, genes = genes)
TCGA <- calcNormFactors(TCGA, method = "TMM")
keep <- filterByExpr(TCGA)
## Warning in filterByExpr.DGEList(TCGA): All samples appear to belong to the same
## group.
TCGA <- TCGA[keep,]
cpm <- cpm(TCGA)
lcpm <- cpm(TCGA, log=TRUE)
pca <- prcomp(t(lcpm), scale = TRUE)
autoplot(pca, data = samples, colour = "cgc_case_gender")
design <- model.matrix(~ cgc_case_gender, samples)
v <- voom(TCGA, design)
fit <- lmFit(v, design)
efit <- eBayes(fit)
dt_fdr <- decideTests(efit, adjust.method = "fdr", p.value = 0.05)
summary(dt_fdr)
## (Intercept) cgc_case_genderMALE
## Down 15186 59
## NotSig 819 33355
## Up 17438 29
volcanoplot(efit, coef = "cgc_case_genderMALE", highlight = 10, names = efit$genes$symbol)
design <- model.matrix(~ cgc_case_gender, samples)
v <- voom(TCGA, design)
fit <- lmFit(v, design)
efit <- eBayes(fit)
dt_fdr <- decideTests(efit, adjust.method = "fdr", p.value = 0.05)
summary(dt_fdr)
## (Intercept) cgc_case_genderMALE
## Down 15186 59
## NotSig 819 33355
## Up 17438 29
volcanoplot(efit, coef = "cgc_case_genderMALE", highlight = 10, names = efit$genes$symbol)
symbol <- unlist(lapply(v$genes$symbol,function(x)x[1]))
ENTREZID <- mapIds(Homo.sapiens, keys=symbol, column=c("ENTREZID"),keytype="SYMBOL", multiVals = "first")
## 'select()' returned 1:many mapping between keys and columns
v$genes$ENTREZID <-ENTREZID
idx <- ids2indices(Hs.c2,id=v$genes$ENTREZID)
cam<- camera(v,idx,design,contrast="cgc_case_genderMALE",trend.var = TRUE)
head(cam,4)
## NGenes Direction PValue FDR
## entrez_gene 17069 Up 0.8051229 0.8051229
## human_entrez_gene 17069 Up 0.8051229 0.8051229
The author concludes there are 10 genes sinificant relate to cgc_case_genderMALE at 95% significant level presented, the the top four results of the pathway analysis are RUNNE_GENDER_EFFECT_UP, PYEON_CANCER_HEAD_AND_NECK_VS_CERVICAL_DN,DISTECHE_ESCAPED_FROM_X_INACTIVATION, and ZHANG_TLX_TARGETS_36HR_DN.