2021/11/09 Zeyuan Dong
setwd("/Users/dongzeyuan/Desktop/TRGN_lab/")
counts <- read.table("ProstCa_030921_2.txt")
#library(TxDb.Hsapiens.UCSC.hg19.knownGene)
#txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
#seqlevels(txd
library(stringr)
setwd("/Users/dongzeyuan/Desktop/TRGN_lab/")
library(Homo.sapiens)
## 载入需要的程辑包:AnnotationDbi
## 载入需要的程辑包:stats4
## 载入需要的程辑包:BiocGenerics
## 载入需要的程辑包:parallel
##
## 载入程辑包:'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
## 载入需要的程辑包:Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
## 载入需要的程辑包:IRanges
## 载入需要的程辑包:S4Vectors
##
## 载入程辑包:'S4Vectors'
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## 载入需要的程辑包:OrganismDbi
## 载入需要的程辑包:GenomicFeatures
## 载入需要的程辑包:GenomeInfoDb
## 载入需要的程辑包:GenomicRanges
## 载入需要的程辑包:GO.db
##
## 载入需要的程辑包:org.Hs.eg.db
##
## 载入需要的程辑包:TxDb.Hsapiens.UCSC.hg19.knownGene
geneid <- rownames(counts)
gene <- str_match(geneid, "(\\w*).*")
geneid <- gene[,2]
geneid2<-geneid
geneid2=data.frame(geneid2)
counts<-cbind(counts,geneid2)
#Convert geneid
setwd("/Users/dongzeyuan/Desktop/TRGN_lab/")
genes <- select(Homo.sapiens, keys=geneid,
columns=c("SYMBOL","TXCHROM"),
keytype="ENSEMBL")
## 'select()' returned many:many mapping between keys and columns
dim(genes)
## [1] 60216 3
#Get rid of duplicate genes
genes <- genes[!duplicated(genes$ENSEMBL),]
counts<-counts[!duplicated(counts$geneid2),]
row.names(counts)<-counts$geneid2
library(GenomicAlignments)
## 载入需要的程辑包:SummarizedExperiment
## 载入需要的程辑包:MatrixGenerics
## 载入需要的程辑包:matrixStats
##
## 载入程辑包:'matrixStats'
## The following objects are masked from 'package:Biobase':
##
## anyMissing, rowMedians
##
## 载入程辑包:'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
##
## colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
## colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
## colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
## colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
## colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
## colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
## colWeightedMeans, colWeightedMedians, colWeightedSds,
## colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
## rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
## rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
## rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
## rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
## rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
## rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
## rowWeightedSds, rowWeightedVars
## The following object is masked from 'package:Biobase':
##
## rowMedians
## 载入需要的程辑包:Biostrings
## 载入需要的程辑包:XVector
##
## 载入程辑包:'Biostrings'
## The following object is masked from 'package:base':
##
## strsplit
## 载入需要的程辑包:Rsamtools
counts<-cbind(counts,genes$SYMBOL)
counts_4<-na.omit(counts)
counts_4<-counts_4[!duplicated(counts_4$`genes$SYMBOL`),]
row.names(counts_4)<-counts_4$`genes$SYMBOL`
counts_ready<-counts_4[,c(1,2,3,4,5,6)] #use counts_ready
counts<-counts_ready
counts_filtered<-counts[1:6]
coldata=data.frame(row.names=c("34C","35C","36C","34T","35T","36T"),
Type=rep(c("Normal","Tumor"),each=3),
Gleason=rep(c("2","1","2"),2),
Treatment=rep(c("1","1","2"),2))
coldata
## Type Gleason Treatment
## 34C Normal 2 1
## 35C Normal 1 1
## 36C Normal 2 2
## 34T Tumor 2 1
## 35T Tumor 1 1
## 36T Tumor 2 2
coldata$Type<-factor(coldata$Type)
coldata$Gleason<-factor(coldata$Gleason)
coldata$Treatment<-factor(coldata$Treatment)
#Differential expression analysis
library(DESeq2)
dds <- DESeqDataSetFromMatrix(countData = counts_filtered,
colData = coldata,
design = ~ Type )
#Pre-Filtering
keep <- rowSums(counts(dds)) >= 10
dds <- dds[keep,]
dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
res <- results(dds)
#Alternative shrinkage estimators# To look for other choice
#install.packages("ashr")
resultsNames(dds)
## [1] "Intercept" "Type_Tumor_vs_Normal"
resNorm <- lfcShrink(dds, coef="Type_Tumor_vs_Normal", type="normal")
## using 'normal' for LFC shrinkage, the Normal prior from Love et al (2014).
##
## Note that type='apeglm' and type='ashr' have shown to have less bias than type='normal'.
## See ?lfcShrink for more details on shrinkage type, and the DESeq2 vignette.
## Reference: https://doi.org/10.1093/bioinformatics/bty895
resAsh <- lfcShrink(dds, coef="Type_Tumor_vs_Normal", type="ashr")
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
## Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
## https://doi.org/10.1093/biostatistics/kxw041
resLFC <- lfcShrink(dds, coef="Type_Tumor_vs_Normal", type="apeglm")
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
## Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
## sequence count data: removing the noise and preserving large differences.
## Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
par(mfrow=c(1,3), mar=c(4,4,2,1))
xlim <- c(1,1e5); ylim <- c(-3,3)
plotMA(resLFC, xlim=xlim, ylim=ylim, main="apeglm")
plotMA(resNorm, xlim=xlim, ylim=ylim, main="normal")
plotMA(resAsh, xlim=xlim, ylim=ylim, main="ashr")
#TypeTreatment_Tumor_vs_Normal 34 35 36T vs 34 35 36N
ddsTN<-dds
resTypeTrX_TN<-lfcShrink(dds, coef="Type_Tumor_vs_Normal", type="apeglm")
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
## Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
## sequence count data: removing the noise and preserving large differences.
## Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
resTypeTrX_TN
## log2 fold change (MAP): Type Tumor vs Normal
## Wald test p-value: Type Tumor vs Normal
## DataFrame with 29912 rows and 5 columns
## baseMean log2FoldChange lfcSE pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric>
## TSPAN6 1107.8741 0.001487557 0.0684490 0.920390 0.998905
## TNMD 32.8382 0.003521716 0.0700459 0.300846 0.971182
## DPM1 473.0177 0.000797669 0.0695532 0.932049 0.999246
## SCYL3 1171.0906 -0.004260059 0.0692136 0.710194 0.994762
## C1orf112 321.5382 -0.004050157 0.0694818 0.666490 0.994762
## ... ... ... ... ... ...
## BMP8B-AS1 10.53852 -0.000217798 0.0699759 0.867141 0.996308
## H2AL1SP 4.26468 -0.000759203 0.0699901 0.435829 NA
## NIPBL-DT 1361.89229 -0.005912317 0.0695784 0.562948 0.988494
## CERNA2 29.84232 -0.000177364 0.0699416 0.936720 0.999246
## LOC100996886 32.40953 0.002115149 0.0700345 NA NA
summary(resTypeTrX_TN)
##
## out of 29912 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up) : 65, 0.22%
## LFC < 0 (down) : 8, 0.027%
## outliers [1] : 1970, 6.6%
## low counts [2] : 2320, 7.8%
## (mean count < 5)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
resTypeTrX_TN_Ordered<-resTypeTrX_TN[order(resTypeTrX_TN$padj),]
head(resTypeTrX_TN_Ordered)
## log2 fold change (MAP): Type Tumor vs Normal
## Wald test p-value: Type Tumor vs Normal
## DataFrame with 6 rows and 5 columns
## baseMean log2FoldChange lfcSE pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric>
## CCL26 14.0100 0.00105147 0.0700005 1.05242e-07 0.000245138
## BEND4 1569.3562 1.73385161 0.3347639 9.57519e-09 0.000245138
## RNU6-744P 15.8184 0.00105403 0.0700006 9.45000e-08 0.000245138
## RPSAP23 14.0100 0.00105147 0.0700005 1.05242e-07 0.000245138
## MBD3L2 14.4220 0.00105226 0.0700006 9.97044e-08 0.000245138
## PLSCR5 14.4220 0.00105226 0.0700006 9.97044e-08 0.000245138
set.seed(1)
library(ComplexHeatmap)
## 载入需要的程辑包:grid
##
## 载入程辑包:'grid'
## The following object is masked from 'package:Biostrings':
##
## pattern
## ========================================
## ComplexHeatmap version 2.9.0
## Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
## Github page: https://github.com/jokergoo/ComplexHeatmap
## Documentation: http://jokergoo.github.io/ComplexHeatmap-reference
##
## If you use it in published research, please cite:
## Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional
## genomic data. Bioinformatics 2016.
##
## The new InteractiveComplexHeatmap package can directly export static
## complex heatmaps into an interactive Shiny app with zero effort. Have a try!
##
## This message can be suppressed by:
## suppressPackageStartupMessages(library(ComplexHeatmap))
## ========================================
library("pheatmap")
##
## 载入程辑包:'pheatmap'
## The following object is masked from 'package:ComplexHeatmap':
##
## pheatmap
select_genes_resTypeTrX_TN_Ordered<-rownames(resTypeTrX_TN_Ordered)
select_genes_resTypeTrX_TN_Ordered<-select_genes_resTypeTrX_TN_Ordered[1:30]
vst_TypeTrX_TN<-vst(ddsTN,blind = FALSE)
df_resTypeTrX_TN <- as.data.frame(colData(vst_TypeTrX_TN)["Type"])
pheatmap(assay(vst_TypeTrX_TN)[select_genes_resTypeTrX_TN_Ordered,], cluster_rows=TRUE, show_rownames=TRUE,
cluster_cols=TRUE, annotation_col=df_resTypeTrX_TN, fontsize=8, main = "34,35,36N vs 34,35,36T ",name="vst(FPKM)")
###Pre vs Post (Control N vs T)
ddsPrePost_ctlNT<-dds
design(ddsPrePost_ctlNT)<-~Type + Treatment
ddsPrePost_ctlNT<-DESeq(ddsPrePost_ctlNT)
## using pre-existing size factors
## estimating dispersions
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
resultsNames(ddsPrePost_ctlNT)
## [1] "Intercept" "Type_Tumor_vs_Normal" "Treatment_2_vs_1"
resPrePost_ctlNT<-lfcShrink(ddsPrePost_ctlNT, coef="Treatment_2_vs_1", type="apeglm")
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
## Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
## sequence count data: removing the noise and preserving large differences.
## Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
resPrePost_ctlNT
## log2 fold change (MAP): Treatment 2 vs 1
## Wald test p-value: Treatment 2 vs 1
## DataFrame with 29912 rows and 5 columns
## baseMean log2FoldChange lfcSE pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric>
## TSPAN6 1107.8741 0.128615 0.257440 0.5490895 0.744443
## TNMD 32.8382 -0.103872 0.455843 0.3613623 0.588085
## DPM1 473.0177 0.357673 0.447453 0.1726233 0.385318
## SCYL3 1171.0906 0.467124 0.318886 0.0634366 0.213619
## C1orf112 321.5382 0.160553 0.347815 0.4967210 0.705751
## ... ... ... ... ... ...
## BMP8B-AS1 10.53852 -0.0388657 0.458081 0.0972491 NA
## H2AL1SP 4.26468 0.0417802 0.456595 0.4353598 NA
## NIPBL-DT 1361.89229 0.4762370 0.370302 0.0779702 0.240383
## CERNA2 29.84232 0.0837081 0.454357 0.4235501 0.644285
## LOC100996886 32.40953 -0.0675131 0.462369 0.0395255 0.162237
summary(resPrePost_ctlNT)
##
## out of 29912 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up) : 1403, 4.7%
## LFC < 0 (down) : 2469, 8.3%
## outliers [1] : 0, 0%
## low counts [2] : 6959, 23%
## (mean count < 20)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
resPrePost_ctlNT_Ordered<-resTypeTrX_TN[order(resPrePost_ctlNT$padj),]
head(resPrePost_ctlNT_Ordered)
## log2 fold change (MAP): Type Tumor vs Normal
## Wald test p-value: Type Tumor vs Normal
## DataFrame with 6 rows and 5 columns
## baseMean log2FoldChange lfcSE pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric>
## POTEJ 992.145 0.002199421 0.0699965 0.3926990 0.985154
## PCA3 2190.298 0.001554564 0.0699760 0.5244327 0.987454
## RN7SL2 448990.716 0.000510975 0.0695987 0.9551007 0.999246
## TMPRSS2 20680.141 0.003064803 0.0700129 0.3768504 0.985154
## RBFOX3 1635.347 -0.011391204 0.0711377 0.0587171 0.786905
## PRRX1 4103.483 -0.007193370 0.0701364 0.3167789 0.976521
#第二张热图pre vs post
set.seed(88)
select_genes_resPrePost_ctlNT_Ordered<-rownames(resPrePost_ctlNT_Ordered)
select_genes_resPrePost_ctlNT_Ordered<-select_genes_resPrePost_ctlNT_Ordered[1:30]
vst_PrePost_ctlNT<-vst(ddsPrePost_ctlNT,blind = FALSE)
df_resPrePost_ctlNT <- as.data.frame(colData(vst_PrePost_ctlNT)[,c("Type","Treatment")])
pheatmap(assay(vst_PrePost_ctlNT)[select_genes_resPrePost_ctlNT_Ordered,], cluster_rows=TRUE, show_rownames=TRUE,
cluster_cols=TRUE, annotation_col=df_resPrePost_ctlNT,fontsize=8, main = "Pre vs Post,Control NT",name="vst(FPKM)")
#8 vs 9
ddsGleason89_ctlNT<-dds
design(ddsGleason89_ctlNT)<-~Type + Gleason
ddsGleason89_ctlNT<-DESeq(ddsGleason89_ctlNT)
## using pre-existing size factors
## estimating dispersions
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
resultsNames(ddsGleason89_ctlNT)
## [1] "Intercept" "Type_Tumor_vs_Normal" "Gleason_2_vs_1"
resGleason89_ctlNT<-lfcShrink(ddsGleason89_ctlNT, coef="Gleason_2_vs_1" , type="apeglm")
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
## Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
## sequence count data: removing the noise and preserving large differences.
## Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
resGleason89_ctlNT
## log2 fold change (MAP): Gleason 2 vs 1
## Wald test p-value: Gleason 2 vs 1
## DataFrame with 29912 rows and 5 columns
## baseMean log2FoldChange lfcSE pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric>
## TSPAN6 1107.8741 -0.00439921 0.225794 0.979033 0.990679
## TNMD 32.8382 -0.06649215 0.307228 0.291380 0.601304
## DPM1 473.0177 0.12596094 0.311472 0.249735 0.562810
## SCYL3 1171.0906 0.24873175 0.315059 0.148813 0.459664
## C1orf112 321.5382 0.04780934 0.268725 0.717452 0.876212
## ... ... ... ... ... ...
## BMP8B-AS1 10.53852 -0.00321189 0.303914 0.6496241 NA
## H2AL1SP 4.26468 0.00639818 0.303738 0.7527682 NA
## NIPBL-DT 1361.89229 0.38123392 0.415404 0.0511271 0.300842
## CERNA2 29.84232 -0.00541761 0.301312 0.8741141 0.946936
## LOC100996886 32.40953 -0.01547409 0.304536 0.0411074 0.277866
summary(resGleason89_ctlNT)
##
## out of 29912 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up) : 314, 1%
## LFC < 0 (down) : 437, 1.5%
## outliers [1] : 0, 0%
## low counts [2] : 7538, 25%
## (mean count < 23)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
resGleason89_ctlNT_Ordered<-resGleason89_ctlNT[order(resGleason89_ctlNT$padj),]
head(resGleason89_ctlNT_Ordered)
## log2 fold change (MAP): Gleason 2 vs 1
## Wald test p-value: Gleason 2 vs 1
## DataFrame with 6 rows and 5 columns
## baseMean log2FoldChange lfcSE pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric>
## ANPEP 4877.91 -5.60853 0.492568 1.63436e-33 3.65672e-29
## NCAPD3 10906.07 -3.60354 0.316366 2.99826e-31 3.35415e-27
## DHCR24 5570.05 -3.25469 0.357290 5.19738e-21 2.93052e-17
## SCHLAP1 1260.44 5.60918 0.656282 5.23916e-21 2.93052e-17
## IGHA2 3746.03 -4.58665 0.613142 1.50390e-17 6.72963e-14
## BANK1 1142.58 -2.84737 0.364722 3.61285e-16 1.34723e-12
#第三张热图
set.seed(88)
library("pheatmap")
select_genes_resGleason89_ctlNT_Ordered<-rownames(resGleason89_ctlNT_Ordered)
select_genes_resGleason89_ctlNT_Ordered<-select_genes_resGleason89_ctlNT_Ordered[1:30]
vst_Gleason89_ctlNT<-vst(ddsGleason89_ctlNT,blind = FALSE)
df_resGleason89_ctlNT <- as.data.frame(colData(vst_Gleason89_ctlNT)[,c("Type","Gleason")])
pheatmap(assay(vst_Gleason89_ctlNT)[select_genes_resGleason89_ctlNT_Ordered,], cluster_rows=TRUE, show_rownames=TRUE,
cluster_cols=TRUE, annotation_col=df_resGleason89_ctlNT,fontsize=8, main = "Gleason8 vs Gleason9,Control NT",name="vst(FPKM)")