Cluster analysis shows that gender is not an important factor (it may also be because the females’ sample is too small). So I decided to look only at differences in gene expression between white men with bladder cancer and Asian men, which could provide meaningful information.
From the UniProt data, I think SEMA4D has the most significant association with bladder cancer because it is a cell surface receptor that plays a role in cell-cell signaling. Endothelial cell migration is induced by activation of the PTK2B/PYK2 pathway and other kinase-Akt pathways.MAN1 protein is also essential because it is a specific inhibitor of TGF, highly associated with cancer.
Using heat maps, we found that most Asian men expressed the same genes, and most white men expressed the same genes. And most of the genes that are up-regulated in Asian men are down-regulated in white men, vice Versa.
Finally, I used a camera to perform a competitive test to assess whether genes in a given set rank higher in expression differences than genes, not in that set.
#if (!requireNamespace("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
#BiocManager::install("Glimma")
#BiocManager::install("limma")
#BiocManager::install("edgeR")
#BiocManager::install("Homo.sapiens")
#BiocManager::install("stringr")
library(limma)
library(Glimma)
library(edgeR)
library(Homo.sapiens)#for human genes
## Loading required package: AnnotationDbi
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following object is masked from 'package:limma':
##
## plotMA
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: IRanges
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:base':
##
## expand.grid
## Loading required package: OrganismDbi
## Loading required package: GenomicFeatures
## Loading required package: GenomeInfoDb
## Loading required package: GenomicRanges
## Loading required package: GO.db
##
## Loading required package: org.Hs.eg.db
##
## Loading required package: TxDb.Hsapiens.UCSC.hg19.knownGene
library(stringr)
setwd('/Users/dongzeyuan/Desktop/zeyuan-510final')
# unzip whitemale data
setwd('/Users/dongzeyuan/Desktop/zeyuan-510final/WhiteMale')
dir.create("SampleFiles")
## Warning in dir.create("SampleFiles"): 'SampleFiles'已存在
filepath <- dir(path ="./gdc_download_20201111_071754.200344",full.names = TRUE)
for(wd in filepath){
files <-dir(path = wd,pattern="gz$") #
fromfilepath <- paste(wd,"/",files,sep ="")
tofilepath <- paste("./SampleFiles/",files,sep ="")
file.copy(fromfilepath,tofilepath)
}
# unzip whitefemale data
setwd('/Users/dongzeyuan/Desktop/zeyuan-510final/WhiteFemale')
dir.create("SampleFiles2")
## Warning in dir.create("SampleFiles2"): 'SampleFiles2'已存在
filepath <- dir(path ="./gdc_download_20201111_071940.367741",full.names = TRUE)
for(wd in filepath){
files <-dir(path = wd,pattern="gz$")
fromfilepath <- paste(wd,"/",files,sep ="")
tofilepath <- paste("./SampleFiles2/",files,sep ="")
file.copy(fromfilepath,tofilepath)
}
# unzip asianfemale data
setwd('/Users/dongzeyuan/Desktop/zeyuan-510final/AsianFemale')
dir.create("SampleFiles3")
## Warning in dir.create("SampleFiles3"): 'SampleFiles3'已存在
filepath <- dir(path ="./gdc_download_20201111_071444.004912",full.names = TRUE)
for(wd in filepath){
files <-dir(path = wd,pattern="gz$")
fromfilepath <- paste(wd,"/",files,sep ="")
tofilepath <- paste("./SampleFiles3/",files,sep ="")
file.copy(fromfilepath,tofilepath)
}
# unzip asianmale data
setwd('/Users/dongzeyuan/Desktop/zeyuan-510final/AsianMale')
dir.create("SampleFiles4")
## Warning in dir.create("SampleFiles4"): 'SampleFiles4'已存在
filepath <- dir(path ="./gdc_download_20201111_071307.660905",full.names = TRUE)
for(wd in filepath){
files <-dir(path = wd,pattern="gz$")
fromfilepath <- paste(wd,"/",files,sep ="")
tofilepath <- paste("./SampleFiles4/",files,sep ="")
file.copy(fromfilepath,tofilepath)
}
# Check Files and manuanlly unzip desired data
#tarDir <- "/Users/dongzeyuan/Desktop/zeyuan-510final/TwoGroups"
#dir.create(tarDir)
#setwd("/Users/dongzeyuan/Desktop/zeyuan-510final/TwoGroups")
#old_files1 <- list.files("/Users/dongzeyuan/Desktop/zeyuan-510final/WhiteMale/SampleFiles", pattern = ".htseq.counts", full.names = TRUE)
#new_files1 <- c("WM01.htseq.counts",
#"WM02.htseq.counts",
#"WM03.htseq.counts",
#"WM04.htseq.counts",
#"WM05.htseq.counts",
#"WM06.htseq.counts",
#"WM07.htseq.counts",
#"WM08.htseq.counts",
#"WM09.htseq.counts",
#"WM10.htseq.counts",
#"WM11.htseq.counts",
#"WM12.htseq.counts",
#"WM13.htseq.counts",
#"WM14.htseq.counts",
#"WM15.htseq.counts",
#"WM16.htseq.counts",
#"WM17.htseq.counts",
#"WM18.htseq.counts",
#"WM19.htseq.counts",
#"WM20.htseq.counts",
#"WM21.htseq.counts",
#"WM22.htseq.counts",
#"WM23.htseq.counts",
#"WM24.htseq.counts",
#"WM25.htseq.counts",
#"WM26.htseq.counts",
#"WM27.htseq.counts",
#"WM28.htseq.counts",
#"WM29.htseq.counts",
#"WM30.htseq.counts")
#file.copy(from = old_files1, to = new_files1)
#old_files2 <- list.files("/Users/dongzeyuan/Desktop/zeyuan-510final/WhiteFemale/SampleFiles2", pattern = ".htseq.counts", full.names = TRUE)
#new_files2 <- c("WF01.htseq.counts",
#"WF02.htseq.counts",
#"WF03.htseq.counts",
#"WF04.htseq.counts")
#file.copy(from = old_files2, to = new_files2)
#old_files3 <- list.files("/Users/dongzeyuan/Desktop/zeyuan-510final/AsianFemale/SampleFiles3", pattern = ".htseq.counts", full.names = TRUE)
#new_files3 <- c("AF01.htseq.counts",
#"AF02.htseq.counts",
#"AF03.htseq.counts",
#"AF04.htseq.counts")
#file.copy(from = old_files3, to = new_files3)
#old_files4 <- list.files("/Users/dongzeyuan/Desktop/zeyuan-510final/AsianMale/SampleFiles4", pattern = ".htseq.counts", full.names = TRUE)
#new_files4 <- c("AM01.htseq.counts",
#"AM02.htseq.counts",
#"AM03.htseq.counts",
#"AM04.htseq.counts",
#"AM05.htseq.counts",
#"AM06.htseq.counts",
#"AM07.htseq.counts",
#"AM08.htseq.counts",
#"AM09.htseq.counts",
#"AM10.htseq.counts",
#"AM11.htseq.counts",
#"AM12.htseq.counts",
#"AM13.htseq.counts",
#"AM14.htseq.counts",
#"AM15.htseq.counts",
#"AM16.htseq.counts",
#"AM17.htseq.counts",
#"AM18.htseq.counts",
#"AM19.htseq.counts",
#"AM20.htseq.counts",
#"AM21.htseq.counts",
#"AM22.htseq.counts",
#"AM23.htseq.counts",
#"AM24.htseq.counts",
#"AM25.htseq.counts",
#"AM26.htseq.counts",
#"AM27.htseq.counts",
#"AM28.htseq.counts",
#"AM29.htseq.counts",
#"AM30.htseq.counts")
#file.copy(from = old_files4, to = new_files4)
setwd('/Users/dongzeyuan/Desktop/zeyuan-510final/TwoGroups')
files <- c("WM01.htseq.counts","WM02.htseq.counts","WM03.htseq.counts","WM04.htseq.counts","WM05.htseq.counts","WM06.htseq.counts","WM07.htseq.counts","WM08.htseq.counts","WM09.htseq.counts","WM10.htseq.counts","WM11.htseq.counts","WM12.htseq.counts","WM13.htseq.counts","WM14.htseq.counts","WM15.htseq.counts","WM16.htseq.counts","WM17.htseq.counts","WM18.htseq.counts","WM19.htseq.counts","WM20.htseq.counts","WM21.htseq.counts","WM22.htseq.counts","WM23.htseq.counts","WM24.htseq.counts","WM25.htseq.counts","WM26.htseq.counts","WM27.htseq.counts","WM28.htseq.counts","WM29.htseq.counts","WM30.htseq.counts","WF01.htseq.counts","WF02.htseq.counts","WF03.htseq.counts","WF04.htseq.counts","AF01.htseq.counts","AF02.htseq.counts","AF03.htseq.counts","AF04.htseq.counts","AM01.htseq.counts","AM02.htseq.counts","AM03.htseq.counts","AM04.htseq.counts","AM05.htseq.counts","AM06.htseq.counts","AM07.htseq.counts","AM08.htseq.counts","AM09.htseq.counts","AM10.htseq.counts","AM11.htseq.counts","AM12.htseq.counts","AM13.htseq.counts","AM14.htseq.counts","AM15.htseq.counts","AM16.htseq.counts","AM17.htseq.counts","AM18.htseq.counts","AM19.htseq.counts","AM20.htseq.counts","AM21.htseq.counts","AM22.htseq.counts","AM23.htseq.counts","AM24.htseq.counts","AM25.htseq.counts","AM26.htseq.counts","AM27.htseq.counts","AM28.htseq.counts","AM29.htseq.counts","AM30.htseq.counts")
#for(i in paste(files, ".gz", sep=""))
# R.utils::gunzip(i, overwrite=TRUE)
x <- readDGE(files, columns=c(1,2))
## Meta tags detected: __no_feature, __ambiguous, __too_low_aQual, __not_aligned, __alignment_not_unique
males only
setwd('/Users/dongzeyuan/Desktop/zeyuan-510final/TwoGroups')
files_male_only <- c("WM01.htseq.counts","WM02.htseq.counts","WM03.htseq.counts","WM04.htseq.counts","WM05.htseq.counts","WM06.htseq.counts","WM07.htseq.counts","WM08.htseq.counts","WM09.htseq.counts","WM10.htseq.counts","WM11.htseq.counts","WM12.htseq.counts","WM13.htseq.counts","WM14.htseq.counts","WM15.htseq.counts","WM16.htseq.counts","WM17.htseq.counts","WM18.htseq.counts","WM19.htseq.counts","WM20.htseq.counts","WM21.htseq.counts","WM22.htseq.counts","WM23.htseq.counts","WM24.htseq.counts","WM25.htseq.counts","WM26.htseq.counts","WM27.htseq.counts","WM28.htseq.counts","WM29.htseq.counts","WM30.htseq.counts","AM01.htseq.counts","AM02.htseq.counts","AM03.htseq.counts","AM04.htseq.counts","AM05.htseq.counts","AM06.htseq.counts","AM07.htseq.counts","AM08.htseq.counts","AM09.htseq.counts","AM10.htseq.counts","AM11.htseq.counts","AM12.htseq.counts","AM13.htseq.counts","AM14.htseq.counts","AM15.htseq.counts","AM16.htseq.counts","AM17.htseq.counts","AM18.htseq.counts","AM19.htseq.counts","AM20.htseq.counts","AM21.htseq.counts","AM22.htseq.counts","AM23.htseq.counts","AM24.htseq.counts","AM25.htseq.counts","AM26.htseq.counts","AM27.htseq.counts","AM28.htseq.counts","AM29.htseq.counts","AM30.htseq.counts")
x_male_only <-readDGE(files_male_only, columns=c(1,2))
## Meta tags detected: __no_feature, __ambiguous, __too_low_aQual, __not_aligned, __alignment_not_unique
# strip the suffix
samplenames <- substring(colnames(x), 1, 4)
samplenames
## [1] "WM01" "WM02" "WM03" "WM04" "WM05" "WM06" "WM07" "WM08" "WM09" "WM10"
## [11] "WM11" "WM12" "WM13" "WM14" "WM15" "WM16" "WM17" "WM18" "WM19" "WM20"
## [21] "WM21" "WM22" "WM23" "WM24" "WM25" "WM26" "WM27" "WM28" "WM29" "WM30"
## [31] "WF01" "WF02" "WF03" "WF04" "AF01" "AF02" "AF03" "AF04" "AM01" "AM02"
## [41] "AM03" "AM04" "AM05" "AM06" "AM07" "AM08" "AM09" "AM10" "AM11" "AM12"
## [51] "AM13" "AM14" "AM15" "AM16" "AM17" "AM18" "AM19" "AM20" "AM21" "AM22"
## [61] "AM23" "AM24" "AM25" "AM26" "AM27" "AM28" "AM29" "AM30"
colnames(x) <- samplenames
samplenames_males_only <- substring(colnames(x_male_only), 1, 4)
#samplenames
colnames(x_male_only) <- samplenames_males_only
# Group by Race
group <- as.factor(rep(c("White","Asian"),c(34,34)))
x$samples$group <- group
# Group by gender
gender <- as.factor(rep(c("Male","Female","Male"),c(30,8,30)))
x$samples$gender<-gender
group_male_only <- as.factor(rep(c("White","Asian"),c(30,30)))
x_male_only$samples$group <- group_male_only
gender_male_only <- as.factor(rep(c("Male","Male"),c(30,30)))
x_male_only$samples$gender<-gender_male_only
# Look for the geneid corresponding gene-symbols and locations of chromosome
geneid <- rownames(x)
#geneid
# Remove the decimal point at ENSEMBL
gene <- str_match(geneid, "(\\w*).*")
geneid <- gene[,2]
# go through Homo pacakge and get symbol and chromosome location
genes <- select(Homo.sapiens,
keys=geneid,
columns=c("SYMBOL","TXCHROM"),
keytype="ENSEMBL")
## 'select()' returned 1:many mapping between keys and columns
head(genes)
## ENSEMBL SYMBOL TXCHROM
## 1 ENSG00000000005 TNMD chrX
## 2 ENSG00000000419 DPM1 chr20
## 3 ENSG00000000457 SCYL3 chr1
## 4 ENSG00000000460 C1orf112 chr1
## 5 ENSG00000000938 FGR chr1
## 6 ENSG00000000971 CFH chr1
# remove the overlapped genes
genes <- genes[!duplicated(genes$ENSEMBL),]
# import genes to DGEList
x$genes <- genes
#x
# Look for the geneid corresponding gene-symbols and locations of chromosome
geneid_male_only <- rownames(x_male_only)
#geneid
# Remove the decimal point at ENSEMBL
gene_male_only <- str_match(geneid_male_only, "(\\w*).*")
geneid_male_only <- gene_male_only[,2]
# go through Homo pacakge and get symbol and chromosome location
genes_male_only <- select(Homo.sapiens,
keys=geneid,
columns=c("SYMBOL","TXCHROM"),
keytype="ENSEMBL")
## 'select()' returned 1:many mapping between keys and columns
#head(genes_male_only)
# remove the overlapped genes
genes_male_only <- genes_male_only[!duplicated(genes$ENSEMBL),]
# import genes to DGEList
x_male_only$genes <- genes_male_only
#x
cpm <- cpm(x)
lcpm <- cpm(x, log=TRUE)
cpm_male_only <- cpm(x_male_only)
lcpm_male_only <- cpm(x_male_only, log=TRUE)
# Get the average and median of the sample library size
L <- mean(x$samples$lib.size) * 1e-6
M <- median(x$samples$lib.size) * 1e-6
c(L, M)
## [1] 80.27711 75.63755
# Get the average and median of the sample library size
L_male_only <- mean(x_male_only$samples$lib.size) * 1e-6
M_male_only <- median(x_male_only$samples$lib.size) * 1e-6
#c(L, M)
# see the summary
summary(lcpm)
## WM01 WM02 WM03 WM04
## Min. :-5.3269 Min. :-5.3269 Min. :-5.3269 Min. :-5.3269
## 1st Qu.:-5.3269 1st Qu.:-5.3269 1st Qu.:-5.3269 1st Qu.:-5.3269
## Median :-5.3269 Median :-4.6226 Median :-4.5340 Median :-4.5704
## Mean :-2.6314 Mean :-2.5396 Mean :-2.3890 Mean :-2.4082
## 3rd Qu.:-0.4149 3rd Qu.:-0.4529 3rd Qu.:-0.2724 3rd Qu.:-0.1645
## Max. :17.7454 Max. :17.9583 Max. :17.4635 Max. :17.8388
## WM05 WM06 WM07 WM08
## Min. :-5.32692 Min. :-5.3269 Min. :-5.3269 Min. :-5.3269
## 1st Qu.:-5.32692 1st Qu.:-5.3269 1st Qu.:-5.3269 1st Qu.:-5.3269
## Median :-4.80685 Median :-4.7634 Median :-4.6601 Median :-4.8768
## Mean :-2.41955 Mean :-2.6985 Mean :-2.4729 Mean :-2.6704
## 3rd Qu.:-0.06668 3rd Qu.:-0.9969 3rd Qu.:-0.2688 3rd Qu.:-0.9503
## Max. :18.52531 Max. :17.9985 Max. :17.8434 Max. :18.1149
## WM09 WM10 WM11 WM12
## Min. :-5.3269 Min. :-5.3269 Min. :-5.3269 Min. :-5.3269
## 1st Qu.:-5.3269 1st Qu.:-5.3269 1st Qu.:-5.3269 1st Qu.:-5.3269
## Median :-5.3269 Median :-4.7118 Median :-5.3269 Median :-5.3269
## Mean :-2.5955 Mean :-2.5468 Mean :-2.6742 Mean :-2.6826
## 3rd Qu.:-0.6431 3rd Qu.:-0.4083 3rd Qu.:-0.7364 3rd Qu.:-0.9285
## Max. :17.5749 Max. :17.5749 Max. :17.9109 Max. :17.8394
## WM13 WM14 WM15 WM16
## Min. :-5.327 Min. :-5.32692 Min. :-5.3269 Min. :-5.327
## 1st Qu.:-5.327 1st Qu.:-5.32692 1st Qu.:-5.3269 1st Qu.:-5.327
## Median :-4.655 Median :-4.63508 Median :-5.3269 Median :-5.327
## Mean :-2.560 Mean :-2.43548 Mean :-2.5495 Mean :-2.655
## 3rd Qu.:-0.446 3rd Qu.:-0.01343 3rd Qu.:-0.4172 3rd Qu.:-0.967
## Max. :17.770 Max. :17.94493 Max. :17.5603 Max. :17.344
## WM17 WM18 WM19 WM20
## Min. :-5.3269 Min. :-5.3269 Min. :-5.3269 Min. :-5.3269
## 1st Qu.:-5.3269 1st Qu.:-5.3269 1st Qu.:-5.3269 1st Qu.:-5.3269
## Median :-4.6507 Median :-4.7548 Median :-4.8064 Median :-4.6185
## Mean :-2.5070 Mean :-2.4360 Mean :-2.5228 Mean :-2.2647
## 3rd Qu.:-0.5244 3rd Qu.:-0.1169 3rd Qu.:-0.5684 3rd Qu.: 0.3118
## Max. :17.8456 Max. :18.1089 Max. :17.7854 Max. :17.7193
## WM21 WM22 WM23 WM24
## Min. :-5.3269 Min. :-5.3269 Min. :-5.327 Min. :-5.327
## 1st Qu.:-5.3269 1st Qu.:-5.3269 1st Qu.:-5.327 1st Qu.:-5.327
## Median :-4.6693 Median :-4.9014 Median :-5.327 Median :-4.714
## Mean :-2.4430 Mean :-2.5493 Mean :-2.802 Mean :-2.456
## 3rd Qu.:-0.1947 3rd Qu.:-0.6824 3rd Qu.:-1.163 3rd Qu.:-0.202
## Max. :17.8367 Max. :17.6007 Max. :18.095 Max. :17.655
## WM25 WM26 WM27 WM28
## Min. :-5.3269 Min. :-5.327 Min. :-5.3269 Min. :-5.327
## 1st Qu.:-5.3269 1st Qu.:-5.327 1st Qu.:-5.3269 1st Qu.:-5.327
## Median :-5.3269 Median :-4.818 Median :-4.5844 Median :-4.591
## Mean :-2.4899 Mean :-2.672 Mean :-2.5616 Mean :-2.942
## 3rd Qu.:-0.1856 3rd Qu.:-1.003 3rd Qu.:-0.6317 3rd Qu.:-1.467
## Max. :17.3813 Max. :17.897 Max. :18.2407 Max. :19.398
## WM29 WM30 WF01 WF02
## Min. :-5.326917 Min. :-5.327 Min. :-5.3269 Min. :-5.3269
## 1st Qu.:-5.326917 1st Qu.:-5.327 1st Qu.:-5.3269 1st Qu.:-5.3269
## Median :-4.805374 Median :-4.803 Median :-4.8442 Median :-5.3269
## Mean :-2.351516 Mean :-2.692 Mean :-2.5755 Mean :-2.5922
## 3rd Qu.: 0.001993 3rd Qu.:-1.048 3rd Qu.:-0.4983 3rd Qu.:-0.5136
## Max. :17.551316 Max. :17.732 Max. :17.8535 Max. :17.7878
## WF03 WF04 AF01 AF02
## Min. :-5.3269 Min. :-5.3269 Min. :-5.3269 Min. :-5.3269
## 1st Qu.:-5.3269 1st Qu.:-5.3269 1st Qu.:-5.3269 1st Qu.:-5.3269
## Median :-5.3269 Median :-5.3269 Median :-5.3269 Median :-4.6169
## Mean :-2.5155 Mean :-2.6229 Mean :-2.7296 Mean :-2.3594
## 3rd Qu.:-0.3728 3rd Qu.:-0.5169 3rd Qu.:-0.8329 3rd Qu.: 0.2209
## Max. :17.5632 Max. :17.7754 Max. :18.2142 Max. :17.6560
## AF03 AF04 AM01 AM02
## Min. :-5.3269 Min. :-5.3269 Min. :-5.3269 Min. :-5.3269
## 1st Qu.:-5.3269 1st Qu.:-5.3269 1st Qu.:-5.3269 1st Qu.:-5.3269
## Median :-5.3269 Median :-4.6721 Median :-4.8093 Median :-4.8828
## Mean :-2.5815 Mean :-2.5082 Mean :-2.5406 Mean :-2.6353
## 3rd Qu.:-0.6028 3rd Qu.:-0.4616 3rd Qu.:-0.6487 3rd Qu.:-0.8039
## Max. :17.5836 Max. :17.7331 Max. :17.8091 Max. :18.0378
## AM03 AM04 AM05 AM06
## Min. :-5.3269 Min. :-5.3269 Min. :-5.3269 Min. :-5.3269
## 1st Qu.:-5.3269 1st Qu.:-5.3269 1st Qu.:-5.3269 1st Qu.:-5.3269
## Median :-5.3269 Median :-5.3269 Median :-4.9834 Median :-5.3269
## Mean :-2.6791 Mean :-2.5671 Mean :-2.6703 Mean :-2.5643
## 3rd Qu.:-0.6953 3rd Qu.:-0.6244 3rd Qu.:-0.8173 3rd Qu.:-0.5037
## Max. :18.1162 Max. :17.6917 Max. :17.9144 Max. :17.6848
## AM07 AM08 AM09 AM10
## Min. :-5.32692 Min. :-5.3269 Min. :-5.327 Min. :-5.3269
## 1st Qu.:-5.32692 1st Qu.:-5.3269 1st Qu.:-5.327 1st Qu.:-5.3269
## Median :-4.88125 Median :-4.6396 Median :-5.327 Median :-5.3269
## Mean :-2.36136 Mean :-2.4553 Mean :-2.737 Mean :-2.6980
## 3rd Qu.:-0.00175 3rd Qu.:-0.2675 3rd Qu.:-1.244 3rd Qu.:-0.9101
## Max. :17.56140 Max. :17.6274 Max. :17.758 Max. :18.0273
## AM11 AM12 AM13 AM14
## Min. :-5.3269 Min. :-5.3269 Min. :-5.3269 Min. :-5.3269
## 1st Qu.:-5.3269 1st Qu.:-5.3269 1st Qu.:-5.3269 1st Qu.:-5.3269
## Median :-5.3269 Median :-5.3269 Median :-4.8546 Median :-5.3269
## Mean :-2.7170 Mean :-2.7597 Mean :-2.7981 Mean :-2.6732
## 3rd Qu.:-0.7161 3rd Qu.:-0.9733 3rd Qu.:-0.9787 3rd Qu.:-0.9632
## Max. :17.8248 Max. :17.9879 Max. :18.4898 Max. :17.7811
## AM15 AM16 AM17 AM18
## Min. :-5.3269 Min. :-5.3269 Min. :-5.3269 Min. :-5.327
## 1st Qu.:-5.3269 1st Qu.:-5.3269 1st Qu.:-5.3269 1st Qu.:-5.327
## Median :-4.7736 Median :-5.3269 Median :-4.7915 Median :-5.327
## Mean :-2.5800 Mean :-2.5495 Mean :-2.5335 Mean :-2.879
## 3rd Qu.:-0.4899 3rd Qu.:-0.5002 3rd Qu.:-0.3886 3rd Qu.:-1.225
## Max. :17.9565 Max. :17.6605 Max. :18.1492 Max. :18.435
## AM19 AM20 AM21 AM22
## Min. :-5.3269 Min. :-5.3269 Min. :-5.3269 Min. :-5.327
## 1st Qu.:-5.3269 1st Qu.:-5.3269 1st Qu.:-5.3269 1st Qu.:-5.327
## Median :-4.7207 Median :-4.6545 Median :-4.0121 Median :-5.327
## Mean :-2.5716 Mean :-2.3618 Mean :-2.3625 Mean :-3.130
## 3rd Qu.:-0.6817 3rd Qu.: 0.1628 3rd Qu.:-0.5834 3rd Qu.:-1.757
## Max. :17.7992 Max. :18.3595 Max. :18.6399 Max. :18.478
## AM23 AM24 AM25 AM26
## Min. :-5.327 Min. :-5.3269 Min. :-5.327 Min. :-5.3269
## 1st Qu.:-5.327 1st Qu.:-5.3269 1st Qu.:-5.327 1st Qu.:-5.3269
## Median :-5.327 Median :-5.3269 Median :-4.436 Median :-4.6663
## Mean :-2.964 Mean :-2.6495 Mean :-2.386 Mean :-2.5526
## 3rd Qu.:-1.422 3rd Qu.:-0.5852 3rd Qu.:-0.226 3rd Qu.:-0.3635
## Max. :18.269 Max. :17.9668 Max. :17.729 Max. :18.0451
## AM27 AM28 AM29 AM30
## Min. :-5.327 Min. :-5.3269 Min. :-5.327 Min. :-5.3269
## 1st Qu.:-5.327 1st Qu.:-5.3269 1st Qu.:-5.327 1st Qu.:-5.3269
## Median :-5.327 Median :-5.3269 Median :-5.327 Median :-4.7689
## Mean :-2.800 Mean :-2.7496 Mean :-2.820 Mean :-2.4558
## 3rd Qu.:-1.028 3rd Qu.:-0.9491 3rd Qu.:-1.140 3rd Qu.:-0.2777
## Max. :17.949 Max. :18.0113 Max. :18.695 Max. :17.5229
#summary(lcpm_male_only)
# to filter genes, while keeping as many genes as possible with worthwhile counts.
keep.exprs <- filterByExpr(x, group=group)
x <- x[keep.exprs,, keep.lib.sizes=FALSE]
dim(x)
## [1] 20138 68
# to filter genes, while keeping as many genes as possible with worthwhile counts.
keep.exprs <- filterByExpr(x_male_only, group=group)
x_male_only <- x_male_only[keep.exprs,, keep.lib.sizes=FALSE]
#dim(x_male_only)
# Compare the data before and after filtering
lcpm.cutoff <- log2(5/M + 2/L)
library(RColorBrewer)
nsamples <- ncol(x)
col <- brewer.pal(nsamples, "Paired")
## Warning in brewer.pal(nsamples, "Paired"): n too large, allowed maximum for palette Paired is 12
## Returning the palette you asked for with that many colors
par(mfrow=c(1,2))
plot(density(lcpm[,1]), col=col[1], lwd=2, ylim=c(0,0.26), las=2, main="", xlab="")
title(main="A. Raw data", xlab="Log-cpm")
abline(v=lcpm.cutoff, lty=3)
for (i in 2:nsamples){
den <- density(lcpm[,i])
lines(den$x, den$y, col=col[i], lwd=2)
}
legend("topright", samplenames, text.col=col, bty="n")
lcpm <- cpm(x, log=TRUE)
plot(density(lcpm[,1]), col=col[1], lwd=2, ylim=c(0,0.26), las=2, main="", xlab="")
title(main="B. Filtered data", xlab="Log-cpm")
abline(v=lcpm.cutoff, lty=3)
for (i in 2:nsamples){
den <- density(lcpm[,i])
lines(den$x, den$y, col=col[i], lwd=2)
}
legend("topright", samplenames, text.col=col, bty="n")
lcpm_male_only.cutoff <- log2(5/M + 2/L)
lcpm_male_only <- cpm(x_male_only, log=TRUE)
# normalization
x <- calcNormFactors(x, method = "TMM")
x$samples$norm.factors
## [1] 0.9906711 1.1128612 1.2862404 1.1637129 0.9685468 0.8655872 1.1624764
## [8] 0.9960807 1.1034987 1.1138525 0.9253751 1.0598805 0.9132135 1.1127265
## [15] 0.9570849 1.1136734 1.1194157 1.1416051 1.1210267 1.3611276 1.1907137
## [22] 1.0514294 0.7971595 1.1937833 1.1827919 1.0175912 1.0532701 0.3703736
## [29] 1.3687501 0.9834462 1.0066632 1.0079499 1.2806465 1.0120290 0.8116220
## [36] 1.2169187 1.1680961 1.2434374 1.0576294 0.9613168 0.8563864 1.1025868
## [43] 0.9498651 1.1407080 1.2936701 1.2823856 1.0368950 0.9188302 1.0105678
## [50] 0.8765149 0.6811246 0.9871650 0.9549061 1.2145905 1.0973692 0.7117738
## [57] 1.1288755 1.2203519 0.7962916 0.3952453 0.5630903 0.8561179 1.2654744
## [64] 0.9873156 0.8244647 0.9445294 0.7698379 1.2427766
# normalization
x_male_only<- calcNormFactors(x_male_only, method = "TMM")
x_male_only$samples$norm.factors
## [1] 1.0544366 1.1822699 1.3137869 1.1391922 1.0687850 0.8821556 1.2500826
## [8] 0.9759027 1.0843602 1.1360980 0.9637187 1.0153952 0.9539888 1.2010249
## [15] 1.0505980 1.1013236 1.1095586 1.1299325 1.1235116 1.3849932 1.2458779
## [22] 1.0582072 0.7575622 1.2592716 1.2203626 1.0373030 1.0684191 0.3735070
## [29] 1.4049891 0.9967419 1.0516851 0.9154784 0.9122300 1.1127232 0.9251500
## [36] 1.1603052 1.3335247 1.3149238 1.0320941 0.9061980 0.9581156 0.8460191
## [43] 0.6855343 1.0368601 0.9706537 1.2266938 1.0733621 0.6817997 1.1573692
## [50] 1.2433194 0.7774294 0.3731171 0.5495819 0.8988362 1.2533254 1.0315463
## [57] 0.7788595 0.9374891 0.7741824 1.2904361
# To give a better visual representation of the effects of normalisation, the data was duplicated then adjusted so that the counts of the first sample are reduced to 5% of their original values, and in the second sample they are inflated to be 5-times larger.
x2 <- x
x2$samples$norm.factors <- 1
x2$counts[,1] <- ceiling(x2$counts[,1]*0.05)
x2$counts[,2] <- x2$counts[,2]*5
par(mfrow=c(1,2))
lcpm <- cpm(x2, log=TRUE)
boxplot(lcpm, las=2, col=col, main="")
title(main="A. Example: Unnormalised data",ylab="Log-cpm")
x2 <- calcNormFactors(x2)
x2$samples$norm.factors
## [1] 0.06240712 5.50619855 1.31601164 1.16928053 1.02167803 0.86378941
## [7] 1.19567682 1.01974874 1.10916962 1.13109466 0.90931476 1.04943861
## [13] 0.95473805 1.15803559 1.04964202 1.11633889 1.13669152 1.17265012
## [19] 1.13926357 1.41574855 1.20762870 1.08110764 0.81483561 1.21823370
## [25] 1.19402358 1.04324607 1.06618421 0.38185077 1.41023715 1.01427805
## [31] 1.00653393 1.02269599 1.29978858 0.99161931 0.81675714 1.24670532
## [37] 1.17386503 1.25571264 1.04720965 0.98072175 0.84671839 1.09670615
## [43] 0.96640510 1.13301635 1.32622946 1.35960194 1.01892413 0.92254800
## [49] 1.00068939 0.89716104 0.70388504 1.00144044 0.98062456 1.26415319
## [55] 1.11233774 0.71553871 1.13762498 1.23488358 0.80335275 0.39924271
## [61] 0.57635838 0.88887026 1.29163654 1.03118686 0.82952801 0.96506159
## [67] 0.78340100 1.32116348
lcpm <- cpm(x2, log=TRUE)
boxplot(lcpm, las=2, col=col, main="")
title(main="B. Example: Normalised data",ylab="Log-cpm")
After cluster analysis, it can be seen from Figure A that the gene expression of most white people and Asian people is very different. As shown in Figure B, gender is not an important factor (it may also be because women’s sample is too small). So I decided to look only at differences in gene expression between white men with bladder cancer and Asian men, which could provide meaningful information.
# By Race
lcpm <- cpm(x, log=TRUE)
par(mfrow=c(1,2))
col.group <- group
levels(col.group) <- brewer.pal(nlevels(col.group), "Set1")
## Warning in brewer.pal(nlevels(col.group), "Set1"): minimal value for n is 3, returning requested palette with 3 different levels
col.group <- as.character(col.group)
plotMDS(lcpm, labels=group, col=col.group)
title(main="A. Sample groups")
# By gender
lcpm <- cpm(x, log=TRUE)
par(mfrow=c(1,2))
col.gender <- gender
levels(col.gender) <- brewer.pal(nlevels(col.gender), "Set2")
## Warning in brewer.pal(nlevels(col.gender), "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
col.gender <- as.character(col.gender)
plotMDS(lcpm, labels=gender, col=col.gender)
title(main="B. Sample genders")
# The glMDSPlot function generates an html page (that is opened in a browser if launch=TRUE) with an MDS plot in the left panel and a barplot showing the proportion of variation explained by each dimension in the right panel.
glMDSPlot(lcpm, groups=group,labels=paste(gender,sep="-"))
From now, only males data for Differential expression analysis
# Creating a design matrix and contrasts
design <- model.matrix(~0+group_male_only)
colnames(design) <- gsub("group_male_only", "", colnames(design))
design
## Asian White
## 1 0 1
## 2 0 1
## 3 0 1
## 4 0 1
## 5 0 1
## 6 0 1
## 7 0 1
## 8 0 1
## 9 0 1
## 10 0 1
## 11 0 1
## 12 0 1
## 13 0 1
## 14 0 1
## 15 0 1
## 16 0 1
## 17 0 1
## 18 0 1
## 19 0 1
## 20 0 1
## 21 0 1
## 22 0 1
## 23 0 1
## 24 0 1
## 25 0 1
## 26 0 1
## 27 0 1
## 28 0 1
## 29 0 1
## 30 0 1
## 31 1 0
## 32 1 0
## 33 1 0
## 34 1 0
## 35 1 0
## 36 1 0
## 37 1 0
## 38 1 0
## 39 1 0
## 40 1 0
## 41 1 0
## 42 1 0
## 43 1 0
## 44 1 0
## 45 1 0
## 46 1 0
## 47 1 0
## 48 1 0
## 49 1 0
## 50 1 0
## 51 1 0
## 52 1 0
## 53 1 0
## 54 1 0
## 55 1 0
## 56 1 0
## 57 1 0
## 58 1 0
## 59 1 0
## 60 1 0
## attr(,"assign")
## [1] 1 1
## attr(,"contrasts")
## attr(,"contrasts")$group_male_only
## [1] "contr.treatment"
contr.matrix <- makeContrasts(
RaceVS= White - Asian,
levels = colnames(design))
contr.matrix
## Contrasts
## Levels RaceVS
## Asian -1
## White 1
# Removing heteroscedascity from count data
par(mfrow=c(1,2))
v <- voom(x_male_only, design, plot=TRUE)
v
## An object of class "EList"
## $genes
## ENSEMBL SYMBOL TXCHROM
## 2 ENSG00000000419 DPM1 chr20
## 3 ENSG00000000457 SCYL3 chr1
## 4 ENSG00000000460 C1orf112 chr1
## 5 ENSG00000000938 FGR chr1
## 6 ENSG00000000971 CFH chr1
## 21047 more rows ...
##
## $targets
## files group lib.size norm.factors gender
## WM01 WM01.htseq.counts White 40645743 1.054437 Male
## WM02 WM02.htseq.counts White 75282280 1.182270 Male
## WM03 WM03.htseq.counts White 143588616 1.313787 Male
## WM04 WM04.htseq.counts White 132364285 1.139192 Male
## WM05 WM05.htseq.counts White 98625108 1.068785 Male
## 55 more rows ...
##
## $E
## Samples
## Tags WM01 WM02 WM03 WM04 WM05 WM06
## ENSG00000000419.11 5.022928 4.815950 5.672323 3.5510779 4.299073 5.490311
## ENSG00000000457.12 1.758255 2.692058 2.877913 4.4705289 3.322292 3.105406
## ENSG00000000460.15 1.579780 3.100035 2.661545 5.3134366 2.671886 3.358395
## ENSG00000000938.11 3.275187 3.644579 2.618019 -0.1354808 3.098071 4.254403
## ENSG00000000971.14 6.034617 5.118081 6.441128 1.4090108 5.292929 3.290239
## Samples
## Tags WM07 WM08 WM09 WM10 WM11 WM12
## ENSG00000000419.11 4.999887 4.615341 4.746224 4.812657 4.677334 5.593602
## ENSG00000000457.12 3.773436 4.004128 3.940233 2.721965 2.785133 5.231865
## ENSG00000000460.15 3.019051 1.769489 2.511780 3.121264 2.549808 4.351631
## ENSG00000000938.11 2.220277 3.448921 2.080104 2.820734 3.781998 1.948287
## ENSG00000000971.14 6.211973 6.439701 2.894367 3.799699 5.907419 3.362123
## Samples
## Tags WM13 WM14 WM15 WM16 WM17 WM18
## ENSG00000000419.11 4.323134 4.565077 4.667833 4.711618 4.194247 3.720298
## ENSG00000000457.12 4.169448 3.277604 2.599264 3.414864 4.025844 3.081103
## ENSG00000000460.15 4.104382 2.901658 1.243503 3.206667 4.101243 1.405411
## ENSG00000000938.11 1.968313 2.143140 3.166926 3.237314 1.868074 2.008789
## ENSG00000000971.14 3.719265 4.646132 7.182554 5.995179 2.712524 2.831743
## Samples
## Tags WM19 WM20 WM21 WM22 WM23 WM24
## ENSG00000000419.11 5.113363 3.701457 4.847453 5.088981 4.658075 4.284656
## ENSG00000000457.12 3.953956 2.587144 3.363243 2.996317 3.346191 3.601059
## ENSG00000000460.15 5.920054 3.492659 3.129589 3.426101 1.885662 3.696389
## ENSG00000000938.11 1.575895 1.141060 2.438298 1.529364 2.784313 1.903268
## ENSG00000000971.14 4.764466 3.737840 5.874589 4.415466 6.057867 3.731002
## Samples
## Tags WM25 WM26 WM27 WM28 WM29 WM30
## ENSG00000000419.11 4.853312 4.765459 5.525745 4.2517542 5.133110 5.415829
## ENSG00000000457.12 3.686591 2.432154 2.964463 3.8952732 3.844010 3.007145
## ENSG00000000460.15 2.306333 2.549618 2.323963 3.6115115 3.388319 3.888669
## ENSG00000000938.11 2.971568 2.759240 2.510651 0.7006737 3.486324 1.048881
## ENSG00000000971.14 5.264554 5.316663 2.201106 5.5422058 5.453412 3.919344
## Samples
## Tags AM01 AM02 AM03 AM04 AM05 AM06
## ENSG00000000419.11 4.291686 4.597177 4.563899 4.349383 4.529645 4.649084
## ENSG00000000457.12 4.006696 3.745311 2.740528 3.647169 3.876209 3.538691
## ENSG00000000460.15 1.722844 1.153465 2.296333 2.132464 1.127449 2.403549
## ENSG00000000938.11 3.374819 2.262313 4.652529 2.969282 3.533035 3.131837
## ENSG00000000971.14 5.885169 1.477303 5.869204 6.833005 7.426498 3.831775
## Samples
## Tags AM07 AM08 AM09 AM10 AM11 AM12
## ENSG00000000419.11 4.250915 3.964314 5.054879 4.136379 4.611519 4.214645
## ENSG00000000457.12 3.387321 4.104977 3.691064 3.428772 2.675478 2.973496
## ENSG00000000460.15 1.966508 1.039920 2.716835 1.859689 1.440716 1.574279
## ENSG00000000938.11 2.049179 1.668907 3.920994 2.375920 1.981532 2.419404
## ENSG00000000971.14 6.806777 6.455179 6.380495 6.909672 6.752629 6.228427
## Samples
## Tags AM13 AM14 AM15 AM16 AM17 AM18
## ENSG00000000419.11 3.9023164 4.399168 3.486962 4.470723 5.588370 4.473664
## ENSG00000000457.12 3.3251477 3.293144 4.108798 3.850196 4.967327 3.461391
## ENSG00000000460.15 0.7633171 2.694568 2.090097 1.241851 5.376511 2.195301
## ENSG00000000938.11 3.9543712 2.339015 3.568093 2.955596 2.142035 1.488141
## ENSG00000000971.14 5.3985272 6.585173 7.317586 7.314800 7.561215 6.028835
## Samples
## Tags AM19 AM20 AM21 AM22 AM23 AM24
## ENSG00000000419.11 4.352583 4.930791 4.226809 3.277115 4.499455 3.827720
## ENSG00000000457.12 4.313110 3.345263 3.723848 3.665410 3.606009 3.827720
## ENSG00000000460.15 2.505142 3.453087 3.173682 1.956268 1.435497 2.721857
## ENSG00000000938.11 3.968857 2.614504 3.506520 2.070498 2.114229 3.385417
## ENSG00000000971.14 8.377229 3.438850 3.667091 3.281435 4.543026 5.783646
## Samples
## Tags AM25 AM26 AM27 AM28 AM29 AM30
## ENSG00000000419.11 4.869377 4.422552 4.646597 4.459307 4.404475 4.240246
## ENSG00000000457.12 3.981862 3.155072 3.379161 3.465049 3.593761 4.002149
## ENSG00000000460.15 4.596437 2.110038 2.158674 0.955741 1.297751 1.891594
## ENSG00000000938.11 1.740042 1.814263 2.660194 2.759986 4.069191 3.288552
## ENSG00000000971.14 4.621594 5.194601 4.650156 6.130290 6.221523 7.422736
## 19785 more rows ...
##
## $weights
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## [1,] 1.7775929 2.190901 2.399772 2.385197 2.306094 2.182424 2.251606 2.330592
## [2,] 1.0921558 1.509753 2.007940 1.950598 1.725890 1.495805 1.609378 1.789362
## [3,] 0.9954565 1.363210 1.869810 1.805426 1.570520 1.350242 1.456584 1.636338
## [4,] 0.8223657 1.068164 1.497354 1.434004 1.226174 1.058972 1.137204 1.279822
## [5,] 1.7311103 2.159774 2.390392 2.371628 2.283766 2.149484 2.222772 2.313252
## [,9] [,10] [,11] [,12] [,13] [,14] [,15] [,16]
## [1,] 2.182094 2.252716 1.9362735 2.300949 2.103092 2.209285 2.0851674 2.1145106
## [2,] 1.495251 1.611870 1.2163096 1.714762 1.388775 1.539395 1.3656556 1.4036104
## [3,] 1.349738 1.458905 1.1015187 1.559691 1.252855 1.391468 1.2323011 1.2664189
## [4,] 1.058614 1.138943 0.8902618 1.217458 0.990671 1.088341 0.9767985 0.9998024
## [5,] 2.149083 2.224234 1.8935531 2.278683 2.066476 2.181026 2.0456817 2.0801077
## [,17] [,18] [,19] [,20] [,21] [,22] [,23] [,24]
## [1,] 2.185107 2.284074 2.320022 2.260118 2.256503 2.369253 2.224941 2.293091
## [2,] 1.500310 1.678670 1.762179 1.628501 1.620392 1.900028 1.564961 1.697877
## [3,] 1.354336 1.524615 1.607106 1.474453 1.466841 1.752781 1.415244 1.543273
## [4,] 1.061875 1.189243 1.255637 1.150594 1.144891 1.382590 1.106195 1.204249
## [5,] 2.152740 2.262012 2.300721 2.233988 2.229222 2.355798 2.196432 2.270920
## [,25] [,26] [,27] [,28] [,29] [,30] [,31] [,32]
## [1,] 2.1023822 2.304818 2.0972916 2.129795 2.379386 2.276131 2.200385 2.220361
## [2,] 1.3878565 1.723126 1.3811305 1.423620 1.932022 1.661899 1.862835 1.893350
## [3,] 1.2520156 1.567830 1.2460006 1.284655 1.786062 1.508338 1.106013 1.129416
## [4,] 0.9901051 1.224008 0.9860493 1.012319 1.415079 1.176158 1.422516 1.453918
## [5,] 2.0656300 2.282506 2.0595599 2.096475 2.365858 2.254165 2.432641 2.433940
## [,33] [,34] [,35] [,36] [,37] [,38] [,39]
## [1,] 1.9623649 1.9200455 2.334845 1.9276568 2.354684 2.133796 1.9190132
## [2,] 1.5510789 1.5064321 2.100963 1.5144315 2.138772 1.766479 1.5053482
## [3,] 0.9221757 0.9006179 1.324730 0.9044915 1.373003 1.040723 0.9000927
## [4,] 1.1563765 1.1230547 1.695495 1.1290249 1.750438 1.331687 1.1222458
## [5,] 2.3949397 2.3836668 2.420851 2.3860783 2.414080 2.427899 2.3833390
## [,40] [,41] [,42] [,43] [,44] [,45] [,46]
## [1,] 2.117552 1.9755240 2.048561 2.0120713 1.9829149 2.113210 2.0522777
## [2,] 1.744213 1.5650458 1.653918 1.6094237 1.5729078 1.737804 1.6583134
## [3,] 1.026482 0.9288891 0.974946 0.9508696 0.9326618 1.022694 0.9774105
## [4,] 1.311537 1.1668008 1.236117 1.2003724 1.1726688 1.306184 1.2397866
## [5,] 2.425298 2.3977048 2.415157 2.4063154 2.3992495 2.424599 2.4156774
## [,47] [,48] [,49] [,50] [,51] [,52] [,53]
## [1,] 2.190565 1.8362746 2.150462 2.116097 1.9247497 1.4529972 1.759134
## [2,] 1.846449 1.4198829 1.788473 1.742064 1.5113746 1.1015297 1.346054
## [3,] 1.094637 0.8619027 1.055447 1.025212 0.9030118 0.7279228 0.830010
## [4,] 1.407264 1.0623090 1.352560 1.309742 1.1267434 0.8604265 1.012054
## [5,] 2.431997 2.3580684 2.429663 2.425064 2.3851583 2.1723431 2.329128
## [,54] [,55] [,56] [,57] [,58] [,59] [,60]
## [1,] 1.7229362 2.281186 2.0142445 1.7265181 1.7714544 1.9894368 2.255054
## [2,] 1.3144743 1.997106 1.6122051 1.3175220 1.3579566 1.5805876 1.946586
## [3,] 0.8169787 1.216243 0.9522975 0.8182420 0.8350345 0.9363428 1.172435
## [4,] 0.9916849 1.566217 1.2024870 0.9936341 1.0200571 1.1784550 1.511324
## [5,] 2.3160791 2.431013 2.4068464 2.3173514 2.3339706 2.4007503 2.432541
## 19785 more rows ...
##
## $design
## Asian White
## 1 0 1
## 2 0 1
## 3 0 1
## 4 0 1
## 5 0 1
## 55 more rows ...
vfit <- lmFit(v, design)
vfit <- contrasts.fit(vfit, contrasts=contr.matrix)
efit <- eBayes(vfit)
plotSA(efit, main="Final model: Mean-variance trend")
# Examining the number of DE genes
summary(decideTests(efit, p.value = 0.05))# significance: p.value=0.05
## RaceVS
## Down 2616
## NotSig 14857
## Up 2317
tfit <- treat(vfit, lfc=log2(1.8))# To get more differentially expressed genes via using log2(1.8) instead of log2(2); log2(1.8) is still acceptable and relatively reliable compared with log2(1.5) or log2(1.2).
dt <- decideTests(tfit, p.value=0.05)
summary(dt)
## RaceVS
## Down 15
## NotSig 19763
## Up 12
tfit <- treat(vfit, lfc=log2(1.8))
dt <- decideTests(tfit, p.value = 0.05)
summary(dt)
## RaceVS
## Down 15
## NotSig 19763
## Up 12
Considering p-value, we choose top5 identified genes for further study, i.e.KRTAP12-3,TANC1,RPL32P1,MAN1 B1-DT and SEMA4D. KRTAP12-3 and TANC1 are upregulated, while others are downregulated.
From data on UniProt, I hold the idea that SEMA4D is the most related to bladder cancer because it is a cell surface receptor that plays a role in cell-cell signaling. Whatsmore, it can induce endothelial cell migration via activating PTK2B/PYK2 pathway and other kinase-AKT pathways.
MAN1 protein is equivalently vital because it functions as a specific repressor of TGF-beta, which is highly related to cancer.
TANC1 is a scaffold component related to the nervous system. There is limited information about RPL32P1, but we know that this gene encodes a ribosomal protein, which is highly related to protein translation and metabolism.
KRTAP12-3 is not so important as the other two genes. This is because its function is more related to hair keratin only.
# Examining individual DE genes from top to bottom
RaceVS <- topTreat(tfit, coef=1, n=Inf)
head(RaceVS)
## ENSEMBL SYMBOL TXCHROM logFC AveExpr
## ENSG00000207425.1 ENSG00000205439 KRTAP12-3 chr21 3.872097 -2.7708182
## ENSG00000115414.17 ENSG00000115183 TANC1 chr2 3.241314 7.4604693
## ENSG00000226278.1 ENSG00000224796 RPL32P1 <NA> -5.094902 -1.4026288
## ENSG00000271579.1 ENSG00000268996 MAN1B1-DT chr9 -4.215592 -1.7453452
## ENSG00000221947.6 ENSG00000217769 <NA> <NA> 2.775228 -0.6783404
## ENSG00000188573.7 ENSG00000187764 SEMA4D chr9 -3.294317 -1.3883600
## t P.Value adj.P.Val
## ENSG00000207425.1 5.701528 1.779808e-07 0.003522240
## ENSG00000115414.17 5.349273 6.809986e-07 0.006738481
## ENSG00000226278.1 -5.040873 2.157664e-06 0.008956744
## ENSG00000271579.1 -5.034672 2.206950e-06 0.008956744
## ENSG00000221947.6 5.027882 2.262947e-06 0.008956744
## ENSG00000188573.7 -4.745246 6.363804e-06 0.020989947
visualization
# plotMD function
plotMD(tfit, column=1, status=dt[,1], main=colnames(tfit)[1],
xlim=c(-8,13))
#glMDPlot(tfit, coef=1, status=dt, main=colnames(tfit)[1],
#side.main="ENSEMBL", counts=lcpm_male_only, groups=group_male_only, launch=FALSE)
HeatMap
# get heatmap of log-CPM values
library(gplots)
##
## Attaching package: 'gplots'
## The following object is masked from 'package:IRanges':
##
## space
## The following object is masked from 'package:S4Vectors':
##
## space
## The following object is masked from 'package:stats':
##
## lowess
RaceVS.topgenes <- RaceVS$ENSEMBL[1:100]
i <- which(v$genes$ENSEMBL %in% RaceVS.topgenes)
mycol <- colorpanel(1000,"blue","white","red")
heatmap.2(lcpm_male_only[i,], scale="row",
labRow=v$genes$SYMBOL[i], labCol=group_male_only,
col=mycol, trace="none", density.info="none",
lhei=NULL, dendrogram="column")
load('/Users/dongzeyuan/Desktop/zeyuan-510final/human_c2_v5p2.rdata')
#load(system.file("extdata", "human_c2_v5p2.rda", package = "RNAseq123"))
idx <- ids2indices(Hs.c2,id=rownames(v$genes))
cam.RaceVS <- camera(v,idx,design,contrast=contr.matrix[,1])
head(cam.RaceVS,20)# look for appropriate hallmark gene sets
## NGenes Direction
## KEGG_ARACHIDONIC_ACID_METABOLISM 42 Up
## BIOCARTA_TCRA_PATHWAY 10 Up
## WANG_BARRETTS_ESOPHAGUS_AND_ESOPHAGUS_CANCER_UP 16 Up
## STEIN_ESRRA_TARGETS_RESPONSIVE_TO_ESTROGEN_DN 24 Up
## CHIARADONNA_NEOPLASTIC_TRANSFORMATION_KRAS_CDC25_DN 36 Up
## KOBAYASHI_RESPONSE_TO_ROMIDEPSIN 9 Up
## KEGG_ETHER_LIPID_METABOLISM 18 Up
## BIOCARTA_IL17_PATHWAY 14 Up
## ODONNELL_TARGETS_OF_MYC_AND_TFRC_DN 29 Up
## REACTOME_PI_METABOLISM 33 Up
## NIKOLSKY_BREAST_CANCER_7P15_AMPLICON 11 Up
## VALK_AML_WITH_FLT3_ITD 25 Up
## REACTOME_UNWINDING_OF_DNA 7 Up
## HONRADO_BREAST_CANCER_BRCA1_VS_BRCA2 17 Up
## LEIN_LOCALIZED_TO_DISTAL_AND_PROXIMAL_DENDRITES 6 Up
## LIN_SILENCED_BY_TUMOR_MICROENVIRONMENT 59 Up
## SARTIPY_BLUNTED_BY_INSULIN_RESISTANCE_UP 9 Up
## PACHER_TARGETS_OF_IGF1_AND_IGF2_UP 27 Up
## REACTOME_ACYL_CHAIN_REMODELLING_OF_PS 7 Up
## WHITFIELD_CELL_CYCLE_LITERATURE 37 Up
## PValue FDR
## KEGG_ARACHIDONIC_ACID_METABOLISM 3.764156e-05 0.1655476
## BIOCARTA_TCRA_PATHWAY 1.705186e-04 0.2575530
## WANG_BARRETTS_ESOPHAGUS_AND_ESOPHAGUS_CANCER_UP 2.102091e-04 0.2575530
## STEIN_ESRRA_TARGETS_RESPONSIVE_TO_ESTROGEN_DN 2.342456e-04 0.2575530
## CHIARADONNA_NEOPLASTIC_TRANSFORMATION_KRAS_CDC25_DN 5.157754e-04 0.3921561
## KOBAYASHI_RESPONSE_TO_ROMIDEPSIN 6.874951e-04 0.3921561
## KEGG_ETHER_LIPID_METABOLISM 7.560312e-04 0.3921561
## BIOCARTA_IL17_PATHWAY 8.461410e-04 0.3921561
## ODONNELL_TARGETS_OF_MYC_AND_TFRC_DN 8.794413e-04 0.3921561
## REACTOME_PI_METABOLISM 9.905638e-04 0.3921561
## NIKOLSKY_BREAST_CANCER_7P15_AMPLICON 1.101098e-03 0.3921561
## VALK_AML_WITH_FLT3_ITD 1.233378e-03 0.3921561
## REACTOME_UNWINDING_OF_DNA 1.280672e-03 0.3921561
## HONRADO_BREAST_CANCER_BRCA1_VS_BRCA2 1.398224e-03 0.3921561
## LEIN_LOCALIZED_TO_DISTAL_AND_PROXIMAL_DENDRITES 1.451895e-03 0.3921561
## LIN_SILENCED_BY_TUMOR_MICROENVIRONMENT 1.490335e-03 0.3921561
## SARTIPY_BLUNTED_BY_INSULIN_RESISTANCE_UP 1.729055e-03 0.3921561
## PACHER_TARGETS_OF_IGF1_AND_IGF2_UP 1.830680e-03 0.3921561
## REACTOME_ACYL_CHAIN_REMODELLING_OF_PS 1.852089e-03 0.3921561
## WHITFIELD_CELL_CYCLE_LITERATURE 1.991238e-03 0.3921561
For upregulation, I choose PACHER_TARGETS_OF_IGF1_AND_IGF2_UP because some genes are also related to growth factors, as mentioned before. For downregulation, I choose CHIARADONNA_NEOPLASTIC_TRANSFORMATION_KRAS_CDC25_DN because KRAS is a common oncogene.
# use camera function to perform a competitive test to assess whether the genes in a given set are highly ranked in terms of differential expression relative to genes that are not in the set.
barcodeplot(efit$t[,1], index=idx$PACHER_TARGETS_OF_IGF1_AND_IGF2_UP,
index2=idx$CHIARADONNA_NEOPLASTIC_TRANSFORMATION_KRAS_CDC25_DN, main="RaceVS")