LabReport3Final

Introduction

This labwork starts with reading the data and data munipulation, then plot a pca graph for dimention reduction for further data analysis; afterward, the author conduct a differential expression analysis and perform a pathway analysis to identify what are the most significant pathways. Finally, the author conclude the gdc_cases.diagnoses.tumor_stage has 10 DNA expression at 95% confident leval, among of them, the Authod present the most sigificant of 4.

Data Reading and Cleaning

if (!requireNamespace("BiocManager", quietly = TRUE))
    install.packages("BiocManager")

BiocManager::install("msigdbr")

## Bioconductor version 3.16 (BiocManager 1.30.20), R 4.2.3 (2023-03-15 ucrt)

## Installing package(s) 'msigdbr'

## package 'msigdbr' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\u6977\AppData\Local\Temp\RtmpKwfmNf\downloaded_packages

## Installation paths not writeable, unable to update packages
##   path: C:/Program Files/R/R-4.2.3/library
##   packages:
##     class, KernSmooth, lattice, MASS, Matrix, nnet, survival

## Old packages: 'cachem', 'checkmate', 'clock', 'dplyr', 'graphlayouts', 'Hmisc',
##   'rlang', 'tzdb', 'vctrs', 'xfun', 'xml2'

library(Homo.sapiens)

## Loading required package: AnnotationDbi

## Loading required package: stats4

## Loading required package: BiocGenerics

## 
## Attaching package: 'BiocGenerics'

## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs

## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unsplit, which.max, which.min

## Loading required package: Biobase

## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.

## Loading required package: IRanges

## Loading required package: S4Vectors

## 
## Attaching package: 'S4Vectors'

## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname

## 
## Attaching package: 'IRanges'

## The following object is masked from 'package:grDevices':
## 
##     windows

## Loading required package: OrganismDbi

## Loading required package: GenomicFeatures

## Loading required package: GenomeInfoDb

## Loading required package: GenomicRanges

## Loading required package: GO.db

##

## Loading required package: org.Hs.eg.db

##

## Loading required package: TxDb.Hsapiens.UCSC.hg19.knownGene

library(org.Hs.eg.db)
library(SummarizedExperiment)

## Loading required package: MatrixGenerics

## Loading required package: matrixStats

## 
## Attaching package: 'matrixStats'

## The following objects are masked from 'package:Biobase':
## 
##     anyMissing, rowMedians

## 
## Attaching package: 'MatrixGenerics'

## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars

## The following object is masked from 'package:Biobase':
## 
##     rowMedians

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.1     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ lubridate::%within%() masks IRanges::%within%()
## ✖ dplyr::collapse()     masks IRanges::collapse()
## ✖ dplyr::combine()      masks Biobase::combine(), BiocGenerics::combine()
## ✖ dplyr::count()        masks matrixStats::count()
## ✖ dplyr::desc()         masks IRanges::desc()
## ✖ tidyr::expand()       masks S4Vectors::expand()
## ✖ dplyr::filter()       masks stats::filter()
## ✖ dplyr::first()        masks S4Vectors::first()
## ✖ dplyr::lag()          masks stats::lag()
## ✖ ggplot2::Position()   masks BiocGenerics::Position(), base::Position()
## ✖ purrr::reduce()       masks GenomicRanges::reduce(), IRanges::reduce()
## ✖ dplyr::rename()       masks S4Vectors::rename()
## ✖ lubridate::second()   masks S4Vectors::second()
## ✖ lubridate::second<-() masks S4Vectors::second<-()
## ✖ dplyr::select()       masks OrganismDbi::select(), AnnotationDbi::select()
## ✖ dplyr::slice()        masks IRanges::slice()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(ggfortify)
library(RColorBrewer)


options(repos = c(CRAN = "https://cran.r-project.org/"))
options(timeout = 300)  # Sets the timeout to 300 seconds (5 minutes)

url <-'http://duffel.rail.bio/recount/v2/TCGA/rse_gene_skin.Rdata'
destfile <- "rse_gene_skin.Rdata"



download.file(url = url, destfile = 'rse_gene_skin.Rdata', method = 'auto',mode = 'wb')
load("rse_gene_skin.Rdata")

#gene_names <- rowData(rse_gene)$Gene.Name




library(GEOquery)

## Setting options('download.file.method.GEOquery'='auto')
## Setting options('GEOquery.inmemory.gpl'=FALSE)

library(org.Hs.eg.db)
library(limma)

## 
## Attaching package: 'limma'
## 
## The following object is masked from 'package:BiocGenerics':
## 
##     plotMA

library(edgeR)
library(Glimma)
library(Homo.sapiens)
library(gplots)

## 
## Attaching package: 'gplots'
## 
## The following object is masked from 'package:IRanges':
## 
##     space
## 
## The following object is masked from 'package:S4Vectors':
## 
##     space
## 
## The following object is masked from 'package:stats':
## 
##     lowess

library(dplyr)


### Gene counts
counts <- assay(rse_gene)
gene_names <- rowData(rse_gene)$Gene.Name
### Gene identifiers
genes <- rowData(rse_gene)
### Phenotype data for the samples
samples <- as.data.frame(colData(rse_gene))
### There are 864 variables in samples, 593 are completely missing
#install.packages("magrittr")
library(magrittr)

## 
## Attaching package: 'magrittr'
## 
## The following object is masked from 'package:purrr':
## 
##     set_names
## 
## The following object is masked from 'package:tidyr':
## 
##     extract
## 
## The following object is masked from 'package:GenomicRanges':
## 
##     subtract

samples %>%
    is.na() %>%
    colSums() %>%
    table()

## .
##   0   1   2   3   8   9  10  11  12  13  14  15  22  28  29  33  35  39  45  50 
## 186   2   1   3   7   1   2   3   1   2   1   1   2   1   2   1   1   2   1   1 
## 112 150 158 160 161 162 163 168 190 195 200 214 218 222 251 258 300 319 320 329 
##   1   1   1   2   1   1   1   1   1   1   1   1   1   1   1   1   1   1   2   3 
## 349 360 364 369 402 409 430 436 466 472 473 
##   1   2   1   1   1   1   1   1   1  16 593

library(msigdbr) 
# Bioconductor package for MSigDB gene sets
Hs.c2 <- msigdbr(species = "Homo sapiens", category = "C2")

PCA dimention reduction plot

knitr::opts_chunk$set(echo = TRUE)
counts <- assay(rse_gene)
genes <- rowData(rse_gene)
samples <- as.data.frame(colData(rse_gene))
TCGA <- DGEList(counts = counts, samples = samples, genes = genes)
TCGA <- calcNormFactors(TCGA, method = "TMM")
keep <- filterByExpr(TCGA)

## Warning in filterByExpr.DGEList(TCGA): All samples appear to belong to the same
## group.

TCGA <- TCGA[keep,]
cpm <- cpm(TCGA)
lcpm <- cpm(TCGA, log=TRUE)
pca <- prcomp(t(lcpm), scale = TRUE)
autoplot(pca, data = samples, colour = "cgc_case_gender")

design <- model.matrix(~ cgc_case_gender, samples)
v <- voom(TCGA, design)
fit <- lmFit(v, design)
efit <- eBayes(fit)
dt_fdr <- decideTests(efit, adjust.method = "fdr", p.value = 0.05)
summary(dt_fdr)

##        (Intercept) cgc_case_genderMALE
## Down         15186                  59
## NotSig         819               33355
## Up           17438                  29

volcanoplot(efit, coef = "cgc_case_genderMALE", highlight = 10, names = efit$genes$symbol)

conducting a differential expression analysis

design <- model.matrix(~ cgc_case_gender, samples)
v <- voom(TCGA, design)
fit <- lmFit(v, design)
efit <- eBayes(fit)
dt_fdr <- decideTests(efit, adjust.method = "fdr", p.value = 0.05)
summary(dt_fdr)

##        (Intercept) cgc_case_genderMALE
## Down         15186                  59
## NotSig         819               33355
## Up           17438                  29

volcanoplot(efit, coef = "cgc_case_genderMALE", highlight = 10, names = efit$genes$symbol)

perform a pathway analysis

symbol <- unlist(lapply(v$genes$symbol,function(x)x[1])) 
ENTREZID <- mapIds(Homo.sapiens, keys=symbol, column=c("ENTREZID"),keytype="SYMBOL", multiVals = "first")

## 'select()' returned 1:many mapping between keys and columns

v$genes$ENTREZID <-ENTREZID
idx <- ids2indices(Hs.c2,id=v$genes$ENTREZID)
cam<- camera(v,idx,design,contrast="cgc_case_genderMALE",trend.var = TRUE)
head(cam,4)

##                   NGenes Direction    PValue       FDR
## entrez_gene        17069        Up 0.8051229 0.8051229
## human_entrez_gene  17069        Up 0.8051229 0.8051229

Conclusion

The author concludes there are 10 genes sinificant relate to cgc_case_genderMALE at 95% significant level presented, the the top four results of the pathway analysis are RUNNE_GENDER_EFFECT_UP, PYEON_CANCER_HEAD_AND_NECK_VS_CERVICAL_DN,DISTECHE_ESCAPED_FROM_X_INACTIVATION, and ZHANG_TLX_TARGETS_36HR_DN.