Introduction

This labwork starts with reading the data and data munipulation, then plot a pca graph for dimention reduction for further data analysis; afterward, the author conduct a differential expression analysis and perform a pathway analysis to identify what are the most significant pathways. Finally, the author conclude the gdc_cases.diagnoses.tumor_stage has 10 DNA expression at 95% confident leval, among of them, the Authod present the most sigificant of 4.

Data Reading and Cleaning

if (!requireNamespace("BiocManager", quietly = TRUE))
    install.packages("BiocManager")

BiocManager::install("msigdbr")
## Bioconductor version 3.16 (BiocManager 1.30.20), R 4.2.3 (2023-03-15 ucrt)
## Installing package(s) 'msigdbr'
## package 'msigdbr' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\u6977\AppData\Local\Temp\RtmpKwfmNf\downloaded_packages
## Installation paths not writeable, unable to update packages
##   path: C:/Program Files/R/R-4.2.3/library
##   packages:
##     class, KernSmooth, lattice, MASS, Matrix, nnet, survival
## Old packages: 'cachem', 'checkmate', 'clock', 'dplyr', 'graphlayouts', 'Hmisc',
##   'rlang', 'tzdb', 'vctrs', 'xfun', 'xml2'
library(Homo.sapiens)
## Loading required package: AnnotationDbi
## Loading required package: stats4
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unsplit, which.max, which.min
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: IRanges
## Loading required package: S4Vectors
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## 
## Attaching package: 'IRanges'
## The following object is masked from 'package:grDevices':
## 
##     windows
## Loading required package: OrganismDbi
## Loading required package: GenomicFeatures
## Loading required package: GenomeInfoDb
## Loading required package: GenomicRanges
## Loading required package: GO.db
## 
## Loading required package: org.Hs.eg.db
## 
## Loading required package: TxDb.Hsapiens.UCSC.hg19.knownGene
library(org.Hs.eg.db)
library(SummarizedExperiment)
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
## 
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:Biobase':
## 
##     anyMissing, rowMedians
## 
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars
## The following object is masked from 'package:Biobase':
## 
##     rowMedians
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.1     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ lubridate::%within%() masks IRanges::%within%()
## ✖ dplyr::collapse()     masks IRanges::collapse()
## ✖ dplyr::combine()      masks Biobase::combine(), BiocGenerics::combine()
## ✖ dplyr::count()        masks matrixStats::count()
## ✖ dplyr::desc()         masks IRanges::desc()
## ✖ tidyr::expand()       masks S4Vectors::expand()
## ✖ dplyr::filter()       masks stats::filter()
## ✖ dplyr::first()        masks S4Vectors::first()
## ✖ dplyr::lag()          masks stats::lag()
## ✖ ggplot2::Position()   masks BiocGenerics::Position(), base::Position()
## ✖ purrr::reduce()       masks GenomicRanges::reduce(), IRanges::reduce()
## ✖ dplyr::rename()       masks S4Vectors::rename()
## ✖ lubridate::second()   masks S4Vectors::second()
## ✖ lubridate::second<-() masks S4Vectors::second<-()
## ✖ dplyr::select()       masks OrganismDbi::select(), AnnotationDbi::select()
## ✖ dplyr::slice()        masks IRanges::slice()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggfortify)
library(RColorBrewer)


options(repos = c(CRAN = "https://cran.r-project.org/"))
options(timeout = 300)  # Sets the timeout to 300 seconds (5 minutes)

url <-'http://duffel.rail.bio/recount/v2/TCGA/rse_gene_skin.Rdata'
destfile <- "rse_gene_skin.Rdata"



download.file(url = url, destfile = 'rse_gene_skin.Rdata', method = 'auto',mode = 'wb')
load("rse_gene_skin.Rdata")

#gene_names <- rowData(rse_gene)$Gene.Name




library(GEOquery)
## Setting options('download.file.method.GEOquery'='auto')
## Setting options('GEOquery.inmemory.gpl'=FALSE)
library(org.Hs.eg.db)
library(limma)
## 
## Attaching package: 'limma'
## 
## The following object is masked from 'package:BiocGenerics':
## 
##     plotMA
library(edgeR)
library(Glimma)
library(Homo.sapiens)
library(gplots)
## 
## Attaching package: 'gplots'
## 
## The following object is masked from 'package:IRanges':
## 
##     space
## 
## The following object is masked from 'package:S4Vectors':
## 
##     space
## 
## The following object is masked from 'package:stats':
## 
##     lowess
library(dplyr)


### Gene counts
counts <- assay(rse_gene)
gene_names <- rowData(rse_gene)$Gene.Name
### Gene identifiers
genes <- rowData(rse_gene)
### Phenotype data for the samples
samples <- as.data.frame(colData(rse_gene))
### There are 864 variables in samples, 593 are completely missing
#install.packages("magrittr")
library(magrittr)
## 
## Attaching package: 'magrittr'
## 
## The following object is masked from 'package:purrr':
## 
##     set_names
## 
## The following object is masked from 'package:tidyr':
## 
##     extract
## 
## The following object is masked from 'package:GenomicRanges':
## 
##     subtract
samples %>%
    is.na() %>%
    colSums() %>%
    table()
## .
##   0   1   2   3   8   9  10  11  12  13  14  15  22  28  29  33  35  39  45  50 
## 186   2   1   3   7   1   2   3   1   2   1   1   2   1   2   1   1   2   1   1 
## 112 150 158 160 161 162 163 168 190 195 200 214 218 222 251 258 300 319 320 329 
##   1   1   1   2   1   1   1   1   1   1   1   1   1   1   1   1   1   1   2   3 
## 349 360 364 369 402 409 430 436 466 472 473 
##   1   2   1   1   1   1   1   1   1  16 593
library(msigdbr) 
# Bioconductor package for MSigDB gene sets
Hs.c2 <- msigdbr(species = "Homo sapiens", category = "C2")

PCA dimention reduction plot

knitr::opts_chunk$set(echo = TRUE)
counts <- assay(rse_gene)
genes <- rowData(rse_gene)
samples <- as.data.frame(colData(rse_gene))
TCGA <- DGEList(counts = counts, samples = samples, genes = genes)
TCGA <- calcNormFactors(TCGA, method = "TMM")
keep <- filterByExpr(TCGA)
## Warning in filterByExpr.DGEList(TCGA): All samples appear to belong to the same
## group.
TCGA <- TCGA[keep,]
cpm <- cpm(TCGA)
lcpm <- cpm(TCGA, log=TRUE)
pca <- prcomp(t(lcpm), scale = TRUE)
autoplot(pca, data = samples, colour = "cgc_case_gender")

design <- model.matrix(~ cgc_case_gender, samples)
v <- voom(TCGA, design)
fit <- lmFit(v, design)
efit <- eBayes(fit)
dt_fdr <- decideTests(efit, adjust.method = "fdr", p.value = 0.05)
summary(dt_fdr)
##        (Intercept) cgc_case_genderMALE
## Down         15186                  59
## NotSig         819               33355
## Up           17438                  29
volcanoplot(efit, coef = "cgc_case_genderMALE", highlight = 10, names = efit$genes$symbol)

conducting a differential expression analysis

design <- model.matrix(~ cgc_case_gender, samples)
v <- voom(TCGA, design)
fit <- lmFit(v, design)
efit <- eBayes(fit)
dt_fdr <- decideTests(efit, adjust.method = "fdr", p.value = 0.05)
summary(dt_fdr)
##        (Intercept) cgc_case_genderMALE
## Down         15186                  59
## NotSig         819               33355
## Up           17438                  29
volcanoplot(efit, coef = "cgc_case_genderMALE", highlight = 10, names = efit$genes$symbol)

perform a pathway analysis

symbol <- unlist(lapply(v$genes$symbol,function(x)x[1])) 
ENTREZID <- mapIds(Homo.sapiens, keys=symbol, column=c("ENTREZID"),keytype="SYMBOL", multiVals = "first")
## 'select()' returned 1:many mapping between keys and columns
v$genes$ENTREZID <-ENTREZID
idx <- ids2indices(Hs.c2,id=v$genes$ENTREZID)
cam<- camera(v,idx,design,contrast="cgc_case_genderMALE",trend.var = TRUE)
head(cam,4)
##                   NGenes Direction    PValue       FDR
## entrez_gene        17069        Up 0.8051229 0.8051229
## human_entrez_gene  17069        Up 0.8051229 0.8051229

Conclusion

The author concludes there are 10 genes sinificant relate to cgc_case_genderMALE at 95% significant level presented, the the top four results of the pathway analysis are RUNNE_GENDER_EFFECT_UP, PYEON_CANCER_HEAD_AND_NECK_VS_CERVICAL_DN,DISTECHE_ESCAPED_FROM_X_INACTIVATION, and ZHANG_TLX_TARGETS_36HR_DN.