FinalDongCapStone-过程尝试.knit

2021/11/09 Zeyuan Dong

# Switch to the project directory and read the raw count table from it.
# NOTE(review): the absolute, user-specific path makes this script runnable
# only on the original machine; consider a project-relative path.
setwd("/Users/dongzeyuan/Desktop/TRGN_lab/")
counts <- read.table(file = "ProstCa_030921_2.txt")
#library(TxDb.Hsapiens.UCSC.hg19.knownGene)
#txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
#seqlevels(txd

# stringr supplies str_match(), used below to parse the gene identifiers.
library(stringr)
# Same directory as the first setwd() call in this file.
setwd("/Users/dongzeyuan/Desktop/TRGN_lab/")
# Homo.sapiens is the annotation object queried by select() further down.
library(Homo.sapiens)

## 载入需要的程辑包：AnnotationDbi

## 载入需要的程辑包：stats4

## 载入需要的程辑包：BiocGenerics

## 载入需要的程辑包：parallel

## 
## 载入程辑包：'BiocGenerics'

## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB

## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs

## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min

## 载入需要的程辑包：Biobase

## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.

## 载入需要的程辑包：IRanges

## 载入需要的程辑包：S4Vectors

## 
## 载入程辑包：'S4Vectors'

## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname

## 载入需要的程辑包：OrganismDbi

## 载入需要的程辑包：GenomicFeatures

## 载入需要的程辑包：GenomeInfoDb

## 载入需要的程辑包：GenomicRanges

## 载入需要的程辑包：GO.db

##

## 载入需要的程辑包：org.Hs.eg.db

##

## 载入需要的程辑包：TxDb.Hsapiens.UCSC.hg19.knownGene

# Derive one identifier per row from the row names of the count table.
# The regex keeps the leading run of word characters and discards whatever
# follows it; column 2 of the str_match() result is that first capture group.
gene <- str_match(rownames(counts), "(\\w*).*")
geneid <- gene[, 2]

# Store the parsed identifier as an extra `geneid2` column of the count table.
geneid2 <- data.frame(geneid2 = geneid)
counts <- cbind(counts, geneid2)

# Convert gene ids: look up SYMBOL and TXCHROM for every parsed identifier,
# using the identifiers as ENSEMBL keys.
setwd("/Users/dongzeyuan/Desktop/TRGN_lab/")
genes <- select(
  Homo.sapiens,
  keys = geneid,
  columns = c("SYMBOL", "TXCHROM"),
  keytype = "ENSEMBL"
)

## 'select()' returned many:many mapping between keys and columns

# Report the size of the annotation table returned by select().
dim(genes)

## [1] 60216     3

# Get rid of duplicate genes: keep the first row per ENSEMBL id in the
# annotation table and the first row per parsed id in the count table, then
# use the parsed id as the row name of the count table.
genes <- genes[!duplicated(genes[["ENSEMBL"]]), ]
counts <- counts[!duplicated(counts[["geneid2"]]), ]
row.names(counts) <- counts[["geneid2"]]

library(GenomicAlignments)

## 载入需要的程辑包：SummarizedExperiment

## 载入需要的程辑包：MatrixGenerics

## 载入需要的程辑包：matrixStats

## 
## 载入程辑包：'matrixStats'

## The following objects are masked from 'package:Biobase':
## 
##     anyMissing, rowMedians

## 
## 载入程辑包：'MatrixGenerics'

## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars

## The following object is masked from 'package:Biobase':
## 
##     rowMedians

## 载入需要的程辑包：Biostrings

## 载入需要的程辑包：XVector

## 
## 载入程辑包：'Biostrings'

## The following object is masked from 'package:base':
## 
##     strsplit

## 载入需要的程辑包：Rsamtools

# Attach a gene symbol to every row of the count table.
# The symbol is looked up by identifier (counts$geneid2 against genes$ENSEMBL)
# instead of pairing the two tables by row position, so the result stays
# correct even if the tables ever differ in row order or row count.
symbol_for_row <- genes$SYMBOL[match(counts$geneid2, genes$ENSEMBL)]
# The column keeps the name "genes$SYMBOL" that the positional cbind() produced.
counts <- cbind(counts, `genes$SYMBOL` = symbol_for_row)

# Drop rows containing any NA (this includes rows without a symbol), keep the
# first row per symbol, and use the symbol as the row name.
counts_4 <- na.omit(counts)
counts_4 <- counts_4[!duplicated(counts_4$`genes$SYMBOL`), ]
row.names(counts_4) <- counts_4$`genes$SYMBOL`

# Keep only the first six columns; these are what is later handed to DESeq2 as
# countData alongside the six-row sample table.
counts_ready <- counts_4[, 1:6] # use counts_ready
counts <- counts_ready

counts_filtered <- counts[1:6]

# Sample annotation table: one row per sample, row names are the sample ids.
# Each column is spelled out for the three "C" samples and repeated for the
# three "T" samples.
# NOTE(review): later headings in this file read "8 vs 9" for Gleason and
# "Pre vs Post" for Treatment; what the codes "1"/"2" stand for is only
# implied by those headings — confirm.
coldata <- data.frame(
  Type      = rep(c("Normal", "Tumor"), each = 3),
  Gleason   = rep(c("2", "1", "2"), times = 2),
  Treatment = rep(c("1", "1", "2"), times = 2),
  row.names = c("34C", "35C", "36C", "34T", "35T", "36T")
)

coldata

##       Type Gleason Treatment
## 34C Normal       2         1
## 35C Normal       1         1
## 36C Normal       2         2
## 34T  Tumor       2         1
## 35T  Tumor       1         1
## 36T  Tumor       2         2

# Turn every annotation column (Type, Gleason, Treatment) into a factor.
coldata[] <- lapply(coldata, factor)

# Differential expression analysis

library(DESeq2)

# Build the DESeq2 data set from the count table and the sample table, with
# sample Type as the only term of the design.
dds <- DESeqDataSetFromMatrix(
  countData = counts_filtered,
  colData   = coldata,
  design    = ~ Type
)

# Pre-filtering: keep only genes whose counts, summed over all samples,
# reach at least 10.
keep <- rowSums(counts(dds)) >= 10
dds <- dds[keep, ]

# Run the DESeq pipeline; the steps it reports are recorded below.
dds <- DESeq(dds)

## estimating size factors

## estimating dispersions

## gene-wise dispersion estimates

## mean-dispersion relationship

## final dispersion estimates

## fitting model and testing

# Unshrunken results table for the fitted model.
res <- results(dds)

# Alternative shrinkage estimators: the same coefficient is shrunk with each
# of the three available types so that they can be compared.

#install.packages("ashr")
resultsNames(dds)

## [1] "Intercept"            "Type_Tumor_vs_Normal"

resNorm <- lfcShrink(
  dds,
  coef = "Type_Tumor_vs_Normal",
  type = "normal"
)

## using 'normal' for LFC shrinkage, the Normal prior from Love et al (2014).
## 
## Note that type='apeglm' and type='ashr' have shown to have less bias than type='normal'.
## See ?lfcShrink for more details on shrinkage type, and the DESeq2 vignette.
## Reference: https://doi.org/10.1093/bioinformatics/bty895

resAsh <- lfcShrink(
  dds,
  coef = "Type_Tumor_vs_Normal",
  type = "ashr"
)

## using 'ashr' for LFC shrinkage. If used in published research, please cite:
##     Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
##     https://doi.org/10.1093/biostatistics/kxw041

resLFC <- lfcShrink(
  dds,
  coef = "Type_Tumor_vs_Normal",
  type = "apeglm"
)

## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895

# MA plots of the three shrinkage estimators side by side on shared axes.
# par() returns the previous settings; they are restored after the third plot
# so the 1x3 layout and the margins do not leak into later base-graphics plots.
old_par <- par(mfrow = c(1, 3), mar = c(4, 4, 2, 1))
xlim <- c(1, 1e5); ylim <- c(-3, 3)
plotMA(resLFC, xlim = xlim, ylim = ylim, main = "apeglm")
plotMA(resNorm, xlim = xlim, ylim = ylim, main = "normal")
plotMA(resAsh, xlim = xlim, ylim = ylim, main = "ashr")
par(old_par)

# Type: Tumor vs Normal (34, 35, 36T vs 34, 35, 36N)

# Keep a copy of the Type-only data set under its own name and shrink the
# Tumor-vs-Normal coefficient with apeglm.
ddsTN <- dds
resTypeTrX_TN <- lfcShrink(
  ddsTN,
  coef = "Type_Tumor_vs_Normal",
  type = "apeglm"
)

## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895

resTypeTrX_TN

## log2 fold change (MAP): Type Tumor vs Normal 
## Wald test p-value: Type Tumor vs Normal 
## DataFrame with 29912 rows and 5 columns
##                baseMean log2FoldChange     lfcSE    pvalue      padj
##               <numeric>      <numeric> <numeric> <numeric> <numeric>
## TSPAN6        1107.8741    0.001487557 0.0684490  0.920390  0.998905
## TNMD            32.8382    0.003521716 0.0700459  0.300846  0.971182
## DPM1           473.0177    0.000797669 0.0695532  0.932049  0.999246
## SCYL3         1171.0906   -0.004260059 0.0692136  0.710194  0.994762
## C1orf112       321.5382   -0.004050157 0.0694818  0.666490  0.994762
## ...                 ...            ...       ...       ...       ...
## BMP8B-AS1      10.53852   -0.000217798 0.0699759  0.867141  0.996308
## H2AL1SP         4.26468   -0.000759203 0.0699901  0.435829        NA
## NIPBL-DT     1361.89229   -0.005912317 0.0695784  0.562948  0.988494
## CERNA2         29.84232   -0.000177364 0.0699416  0.936720  0.999246
## LOC100996886   32.40953    0.002115149 0.0700345        NA        NA

# Summary counts for the Tumor-vs-Normal results.
summary(resTypeTrX_TN)

## 
## out of 29912 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)       : 65, 0.22%
## LFC < 0 (down)     : 8, 0.027%
## outliers [1]       : 1970, 6.6%
## low counts [2]     : 2320, 7.8%
## (mean count < 5)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

# Sort the table by ascending adjusted p-value (order() puts NA last) and show
# the first six rows.
resTypeTrX_TN_Ordered <- resTypeTrX_TN[order(resTypeTrX_TN[["padj"]]), ]
head(resTypeTrX_TN_Ordered, n = 6)

## log2 fold change (MAP): Type Tumor vs Normal 
## Wald test p-value: Type Tumor vs Normal 
## DataFrame with 6 rows and 5 columns
##            baseMean log2FoldChange     lfcSE      pvalue        padj
##           <numeric>      <numeric> <numeric>   <numeric>   <numeric>
## CCL26       14.0100     0.00105147 0.0700005 1.05242e-07 0.000245138
## BEND4     1569.3562     1.73385161 0.3347639 9.57519e-09 0.000245138
## RNU6-744P   15.8184     0.00105403 0.0700006 9.45000e-08 0.000245138
## RPSAP23     14.0100     0.00105147 0.0700005 1.05242e-07 0.000245138
## MBD3L2      14.4220     0.00105226 0.0700006 9.97044e-08 0.000245138
## PLSCR5      14.4220     0.00105226 0.0700006 9.97044e-08 0.000245138

# Fix the random seed, then attach ComplexHeatmap.
# NOTE(review): the heatmap calls visible in this file do not pass kmeans_k or
# any other explicitly random option, so the seed may be unnecessary — confirm.
set.seed(1)
library(ComplexHeatmap)

## 载入需要的程辑包：grid

## 
## 载入程辑包：'grid'

## The following object is masked from 'package:Biostrings':
## 
##     pattern

## ========================================
## ComplexHeatmap version 2.9.0
## Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
## Github page: https://github.com/jokergoo/ComplexHeatmap
## Documentation: http://jokergoo.github.io/ComplexHeatmap-reference
## 
## If you use it in published research, please cite:
## Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional 
##   genomic data. Bioinformatics 2016.
## 
## The new InteractiveComplexHeatmap package can directly export static 
## complex heatmaps into an interactive Shiny app with zero effort. Have a try!
## 
## This message can be suppressed by:
##   suppressPackageStartupMessages(library(ComplexHeatmap))
## ========================================

library("pheatmap")

## 
## 载入程辑包：'pheatmap'

## The following object is masked from 'package:ComplexHeatmap':
## 
##     pheatmap

# First heatmap: the 30 top-ranked genes of the Tumor-vs-Normal table
# (ranked by adjusted p-value), drawn from the variance-stabilised data.
select_genes_resTypeTrX_TN_Ordered <- rownames(resTypeTrX_TN_Ordered)[seq_len(30)]

vst_TypeTrX_TN <- vst(ddsTN, blind = FALSE)

# Column annotation: the Type of every sample.
df_resTypeTrX_TN <- as.data.frame(colData(vst_TypeTrX_TN)["Type"])

# NOTE(review): the matrix plotted is the vst() output, so the "vst(FPKM)"
# label may be misleading. `name` is an argument of ComplexHeatmap::pheatmap;
# pheatmap was attached afterwards and masks it — confirm the label is used.
pheatmap(
  assay(vst_TypeTrX_TN)[select_genes_resTypeTrX_TN_Ordered, ],
  cluster_rows = TRUE,
  show_rownames = TRUE,
  cluster_cols = TRUE,
  annotation_col = df_resTypeTrX_TN,
  fontsize = 8,
  main = "34,35,36N vs 34,35,36T ",
  name = "vst(FPKM)"
)

### Pre vs Post (Control N vs T)

# Start from the Type-only data set, add Treatment to the design and refit.
ddsPrePost_ctlNT <- dds

design(ddsPrePost_ctlNT) <- ~ Type + Treatment

ddsPrePost_ctlNT <- DESeq(ddsPrePost_ctlNT)

## using pre-existing size factors

## estimating dispersions

## found already estimated dispersions, replacing these

## gene-wise dispersion estimates

## mean-dispersion relationship

## final dispersion estimates

## fitting model and testing

# Coefficients available in the Type + Treatment model.
resultsNames(ddsPrePost_ctlNT)

## [1] "Intercept"            "Type_Tumor_vs_Normal" "Treatment_2_vs_1"

# Shrink the Treatment (2 vs 1) coefficient with apeglm.
resPrePost_ctlNT <- lfcShrink(
  ddsPrePost_ctlNT,
  coef = "Treatment_2_vs_1",
  type = "apeglm"
)

## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895

resPrePost_ctlNT

## log2 fold change (MAP): Treatment 2 vs 1 
## Wald test p-value: Treatment 2 vs 1 
## DataFrame with 29912 rows and 5 columns
##                baseMean log2FoldChange     lfcSE    pvalue      padj
##               <numeric>      <numeric> <numeric> <numeric> <numeric>
## TSPAN6        1107.8741       0.128615  0.257440 0.5490895  0.744443
## TNMD            32.8382      -0.103872  0.455843 0.3613623  0.588085
## DPM1           473.0177       0.357673  0.447453 0.1726233  0.385318
## SCYL3         1171.0906       0.467124  0.318886 0.0634366  0.213619
## C1orf112       321.5382       0.160553  0.347815 0.4967210  0.705751
## ...                 ...            ...       ...       ...       ...
## BMP8B-AS1      10.53852     -0.0388657  0.458081 0.0972491        NA
## H2AL1SP         4.26468      0.0417802  0.456595 0.4353598        NA
## NIPBL-DT     1361.89229      0.4762370  0.370302 0.0779702  0.240383
## CERNA2         29.84232      0.0837081  0.454357 0.4235501  0.644285
## LOC100996886   32.40953     -0.0675131  0.462369 0.0395255  0.162237

# Summary counts for the Treatment (2 vs 1) results.
summary(resPrePost_ctlNT)

## 
## out of 29912 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)       : 1403, 4.7%
## LFC < 0 (down)     : 2469, 8.3%
## outliers [1]       : 0, 0%
## low counts [2]     : 6959, 23%
## (mean count < 20)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

# Sort the Treatment results by ascending adjusted p-value.
# Fix: the rows are taken from resPrePost_ctlNT itself. The previous code
# indexed resTypeTrX_TN (the Tumor-vs-Normal table) with this ordering, so the
# "ordered" table carried Type statistics instead of Treatment statistics.
# The console output recorded right after this statement was produced by the
# old code (its header reads "Type Tumor vs Normal") and should be regenerated.
resPrePost_ctlNT_Ordered <- resPrePost_ctlNT[order(resPrePost_ctlNT$padj), ]
head(resPrePost_ctlNT_Ordered)

## log2 fold change (MAP): Type Tumor vs Normal 
## Wald test p-value: Type Tumor vs Normal 
## DataFrame with 6 rows and 5 columns
##           baseMean log2FoldChange     lfcSE    pvalue      padj
##          <numeric>      <numeric> <numeric> <numeric> <numeric>
## POTEJ      992.145    0.002199421 0.0699965 0.3926990  0.985154
## PCA3      2190.298    0.001554564 0.0699760 0.5244327  0.987454
## RN7SL2  448990.716    0.000510975 0.0695987 0.9551007  0.999246
## TMPRSS2  20680.141    0.003064803 0.0700129 0.3768504  0.985154
## RBFOX3    1635.347   -0.011391204 0.0711377 0.0587171  0.786905
## PRRX1     4103.483   -0.007193370 0.0701364 0.3167789  0.976521

# Second heatmap: pre vs post
set.seed(88)

# The 30 top-ranked genes of the ordered Pre/Post table, drawn from the
# variance-stabilised data of the Type + Treatment model.
select_genes_resPrePost_ctlNT_Ordered <- rownames(resPrePost_ctlNT_Ordered)[seq_len(30)]

vst_PrePost_ctlNT <- vst(ddsPrePost_ctlNT, blind = FALSE)

# Column annotation: Type and Treatment of every sample.
df_resPrePost_ctlNT <- as.data.frame(
  colData(vst_PrePost_ctlNT)[, c("Type", "Treatment")]
)

# NOTE(review): the matrix plotted is the vst() output, so the "vst(FPKM)"
# label may be misleading. `name` is an argument of ComplexHeatmap::pheatmap;
# pheatmap was attached afterwards and masks it — confirm the label is used.
pheatmap(
  assay(vst_PrePost_ctlNT)[select_genes_resPrePost_ctlNT_Ordered, ],
  cluster_rows = TRUE,
  show_rownames = TRUE,
  cluster_cols = TRUE,
  annotation_col = df_resPrePost_ctlNT,
  fontsize = 8,
  main = "Pre vs Post,Control NT",
  name = "vst(FPKM)"
)

# 8 vs 9

# Start from the Type-only data set, add Gleason to the design and refit.
ddsGleason89_ctlNT <- dds

design(ddsGleason89_ctlNT) <- ~ Type + Gleason

ddsGleason89_ctlNT <- DESeq(ddsGleason89_ctlNT)

## using pre-existing size factors

## estimating dispersions

## found already estimated dispersions, replacing these

## gene-wise dispersion estimates

## mean-dispersion relationship

## final dispersion estimates

## fitting model and testing

# Coefficients available in the Type + Gleason model.
resultsNames(ddsGleason89_ctlNT)

## [1] "Intercept"            "Type_Tumor_vs_Normal" "Gleason_2_vs_1"

# Shrink the Gleason (2 vs 1) coefficient with apeglm.
resGleason89_ctlNT <- lfcShrink(
  ddsGleason89_ctlNT,
  coef = "Gleason_2_vs_1",
  type = "apeglm"
)

## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895

resGleason89_ctlNT

## log2 fold change (MAP): Gleason 2 vs 1 
## Wald test p-value: Gleason 2 vs 1 
## DataFrame with 29912 rows and 5 columns
##                baseMean log2FoldChange     lfcSE    pvalue      padj
##               <numeric>      <numeric> <numeric> <numeric> <numeric>
## TSPAN6        1107.8741    -0.00439921  0.225794  0.979033  0.990679
## TNMD            32.8382    -0.06649215  0.307228  0.291380  0.601304
## DPM1           473.0177     0.12596094  0.311472  0.249735  0.562810
## SCYL3         1171.0906     0.24873175  0.315059  0.148813  0.459664
## C1orf112       321.5382     0.04780934  0.268725  0.717452  0.876212
## ...                 ...            ...       ...       ...       ...
## BMP8B-AS1      10.53852    -0.00321189  0.303914 0.6496241        NA
## H2AL1SP         4.26468     0.00639818  0.303738 0.7527682        NA
## NIPBL-DT     1361.89229     0.38123392  0.415404 0.0511271  0.300842
## CERNA2         29.84232    -0.00541761  0.301312 0.8741141  0.946936
## LOC100996886   32.40953    -0.01547409  0.304536 0.0411074  0.277866

# Summary counts for the Gleason (2 vs 1) results.
summary(resGleason89_ctlNT)

## 
## out of 29912 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)       : 314, 1%
## LFC < 0 (down)     : 437, 1.5%
## outliers [1]       : 0, 0%
## low counts [2]     : 7538, 25%
## (mean count < 23)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

# Sort the table by ascending adjusted p-value (order() puts NA last) and show
# the first six rows.
resGleason89_ctlNT_Ordered <- resGleason89_ctlNT[order(resGleason89_ctlNT[["padj"]]), ]
head(resGleason89_ctlNT_Ordered, n = 6)

## log2 fold change (MAP): Gleason 2 vs 1 
## Wald test p-value: Gleason 2 vs 1 
## DataFrame with 6 rows and 5 columns
##          baseMean log2FoldChange     lfcSE      pvalue        padj
##         <numeric>      <numeric> <numeric>   <numeric>   <numeric>
## ANPEP     4877.91       -5.60853  0.492568 1.63436e-33 3.65672e-29
## NCAPD3   10906.07       -3.60354  0.316366 2.99826e-31 3.35415e-27
## DHCR24    5570.05       -3.25469  0.357290 5.19738e-21 2.93052e-17
## SCHLAP1   1260.44        5.60918  0.656282 5.23916e-21 2.93052e-17
## IGHA2     3746.03       -4.58665  0.613142 1.50390e-17 6.72963e-14
## BANK1     1142.58       -2.84737  0.364722 3.61285e-16 1.34723e-12

# Third heatmap
set.seed(88)
library("pheatmap")

# The 30 top-ranked genes of the ordered Gleason table, drawn from the
# variance-stabilised data of the Type + Gleason model.
select_genes_resGleason89_ctlNT_Ordered <- rownames(resGleason89_ctlNT_Ordered)[seq_len(30)]

vst_Gleason89_ctlNT <- vst(ddsGleason89_ctlNT, blind = FALSE)

# Column annotation: Type and Gleason of every sample.
df_resGleason89_ctlNT <- as.data.frame(
  colData(vst_Gleason89_ctlNT)[, c("Type", "Gleason")]
)

# NOTE(review): the matrix plotted is the vst() output, so the "vst(FPKM)"
# label may be misleading. `name` is an argument of ComplexHeatmap::pheatmap;
# pheatmap was attached afterwards and masks it — confirm the label is used.
pheatmap(
  assay(vst_Gleason89_ctlNT)[select_genes_resGleason89_ctlNT_Ordered, ],
  cluster_rows = TRUE,
  show_rownames = TRUE,
  cluster_cols = TRUE,
  annotation_col = df_resGleason89_ctlNT,
  fontsize = 8,
  main = "Gleason8 vs Gleason9,Control NT",
  name = "vst(FPKM)"
)