最终版代码test
2021/08/04 最后完整版 可以有,应该赢,别等了 Fight on forever
#设置工作路径,读取源文件 安装BiocManager,安装TxDb.Hsapiens.UCSC.hg19.knownGene,安装stringr和Homo.sapiens
setwd("/Users/dongxiaotai/Desktop/TRGN_lab/final/")
counts <- read.table("ProstCa_030921_2.txt")
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
## Loading required package: GenomicFeatures
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: GenomeInfoDb
## Loading required package: GenomicRanges
## Loading required package: AnnotationDbi
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
seqlevels(txdb)
## [1] "chr1" "chr2" "chr3"
## [4] "chr4" "chr5" "chr6"
## [7] "chr7" "chr8" "chr9"
## [10] "chr10" "chr11" "chr12"
## [13] "chr13" "chr14" "chr15"
## [16] "chr16" "chr17" "chr18"
## [19] "chr19" "chr20" "chr21"
## [22] "chr22" "chrX" "chrY"
## [25] "chrM" "chr1_gl000191_random" "chr1_gl000192_random"
## [28] "chr4_ctg9_hap1" "chr4_gl000193_random" "chr4_gl000194_random"
## [31] "chr6_apd_hap1" "chr6_cox_hap2" "chr6_dbb_hap3"
## [34] "chr6_mann_hap4" "chr6_mcf_hap5" "chr6_qbl_hap6"
## [37] "chr6_ssto_hap7" "chr7_gl000195_random" "chr8_gl000196_random"
## [40] "chr8_gl000197_random" "chr9_gl000198_random" "chr9_gl000199_random"
## [43] "chr9_gl000200_random" "chr9_gl000201_random" "chr11_gl000202_random"
## [46] "chr17_ctg5_hap1" "chr17_gl000203_random" "chr17_gl000204_random"
## [49] "chr17_gl000205_random" "chr17_gl000206_random" "chr18_gl000207_random"
## [52] "chr19_gl000208_random" "chr19_gl000209_random" "chr21_gl000210_random"
## [55] "chrUn_gl000211" "chrUn_gl000212" "chrUn_gl000213"
## [58] "chrUn_gl000214" "chrUn_gl000215" "chrUn_gl000216"
## [61] "chrUn_gl000217" "chrUn_gl000218" "chrUn_gl000219"
## [64] "chrUn_gl000220" "chrUn_gl000221" "chrUn_gl000222"
## [67] "chrUn_gl000223" "chrUn_gl000224" "chrUn_gl000225"
## [70] "chrUn_gl000226" "chrUn_gl000227" "chrUn_gl000228"
## [73] "chrUn_gl000229" "chrUn_gl000230" "chrUn_gl000231"
## [76] "chrUn_gl000232" "chrUn_gl000233" "chrUn_gl000234"
## [79] "chrUn_gl000235" "chrUn_gl000236" "chrUn_gl000237"
## [82] "chrUn_gl000238" "chrUn_gl000239" "chrUn_gl000240"
## [85] "chrUn_gl000241" "chrUn_gl000242" "chrUn_gl000243"
## [88] "chrUn_gl000244" "chrUn_gl000245" "chrUn_gl000246"
## [91] "chrUn_gl000247" "chrUn_gl000248" "chrUn_gl000249"
library(stringr)
setwd("/Users/dongxiaotai/Desktop/TRGN_lab/final/")
library(Homo.sapiens)
## Loading required package: OrganismDbi
## Loading required package: GO.db
##
## Loading required package: org.Hs.eg.db
##
geneid <- rownames(counts)
gene <- str_match(geneid, "(\\w*).*")#准备去除小数点
geneid <- gene[,2]#去除后面两个小数点
geneid2<-geneid#准备将去除小数点后面的ENS名字对应到基因
geneid2=data.frame(geneid2)
counts<-cbind(counts,geneid2)
#Convert geneid
setwd("/Users/dongxiaotai/Desktop/TRGN_lab/final/")
genes <- select(Homo.sapiens, keys=geneid,
columns=c("SYMBOL","TXCHROM"),
keytype="ENSEMBL")
## 'select()' returned many:many mapping between keys and columns
dim(genes)#看他的情况大概
## [1] 60216 3
#Get rid of duplicate genes
genes <- genes[!duplicated(genes$ENSEMBL),]
counts<-counts[!duplicated(counts$geneid2),]
row.names(counts)<-counts$geneid2
#把countsready重新变化,变成更容易操作的矩阵,列命变化,Group and build a matrix that formally processes the data
library(GenomicAlignments)
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
##
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:Biobase':
##
## anyMissing, rowMedians
##
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
##
## colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
## colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
## colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
## colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
## colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
## colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
## colWeightedMeans, colWeightedMedians, colWeightedSds,
## colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
## rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
## rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
## rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
## rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
## rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
## rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
## rowWeightedSds, rowWeightedVars
## The following object is masked from 'package:Biobase':
##
## rowMedians
## Loading required package: Biostrings
## Loading required package: XVector
##
## Attaching package: 'Biostrings'
## The following object is masked from 'package:base':
##
## strsplit
## Loading required package: Rsamtools
counts<-cbind(counts,genes$SYMBOL)
counts_4<-na.omit(counts)
counts_4<-counts_4[!duplicated(counts_4$`genes$SYMBOL`),]
row.names(counts_4)<-counts_4$`genes$SYMBOL`
counts_ready<-counts_4[,c(1,2,3,4,5,6)] #use counts_ready
counts<-counts_ready
counts_filtered<-counts[1:6]
coldata=data.frame(row.names=c("34C","35C","36C","34T","35T","36T"),
Type=rep(c("Control","Tumor"),each=3),
Gleason=rep(c("2","1","2"),2),
Treatment=rep(c("1","1","2"),2))
#到这一步结束,我们已经拥有了清晰的列命的counts,并且已经将ENS去除了小数点,对应好了基因名字。这个矩阵现在叫counts_filtered
coldata#构建完整的矩阵列表
## Type Gleason Treatment
## 34C Control 2 1
## 35C Control 1 1
## 36C Control 2 2
## 34T Tumor 2 1
## 35T Tumor 1 1
## 36T Tumor 2 2
#添加老师的notes,89代表什么,
coldata$Type<-factor(coldata$Type)
coldata$Gleason<-factor(coldata$Gleason)
coldata$Treatment<-factor(coldata$Treatment)
#Differential expression analysis
library(DESeq2)
#if (!requireNamespace("BiocManager", quiet = TRUE))
#install.packages("BiocManager")
#BiocManager::install("SummarizedExperiment")
dds <- DESeqDataSetFromMatrix(countData = counts_filtered,
colData = coldata,
design = ~ Type )
#Pre-Filtering
keep <- rowSums(counts(dds)) >= 10
dds <- dds[keep,]
#这一行加起来,和是10的话 就可以留下来,因为出现0就不用的话,就都删掉了。只要这一行有一个不是0,那么0也是一种信息
#虽然在运行 DESeq2 函数之前没有必要预过滤低计数基因,但有两个原因使预过滤有用:通过删除很少读取的行,我们减少了dds数据对象的内存大小,我们提高了 DESeq2 中转换和测试函数的速度。在这里,我们执行最小预过滤以仅保留至少有 10 次读取的行。请注意,通过对结果函数中归一化计数的均值进行独立过滤,可以自动应用更严格的过滤以增加功效。
dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
res <- results(dds)
#res01 <- results(dds,alpha = 0.1)
#res005 <- results(dds,alpha = 0.05)
#write.csv(as.data.frame(res01),
#file="/Users/dongxiaotai/Desktop/res01.csv")
#write.csv(as.data.frame(res005),
#file="/Users/dongxiaotai/Desktop/res005.csv")
#Alternative shrinkage estimators# To look for other choice
#install.packages("ashr")
resultsNames(dds)
## [1] "Intercept" "Type_Tumor_vs_Control"
resNorm <- lfcShrink(dds, coef="Type_Tumor_vs_Control", type="normal")
## using 'normal' for LFC shrinkage, the Normal prior from Love et al (2014).
##
## Note that type='apeglm' and type='ashr' have shown to have less bias than type='normal'.
## See ?lfcShrink for more details on shrinkage type, and the DESeq2 vignette.
## Reference: https://doi.org/10.1093/bioinformatics/bty895
resAsh <- lfcShrink(dds, coef="Type_Tumor_vs_Control", type="ashr")
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
## Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
## https://doi.org/10.1093/biostatistics/kxw041
resLFC <- lfcShrink(dds, coef="Type_Tumor_vs_Control", type="apeglm")
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
## Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
## sequence count data: removing the noise and preserving large differences.
## Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
par(mfrow=c(1,3), mar=c(4,4,2,1))
xlim <- c(1,1e5); ylim <- c(-3,3)
plotMA(resLFC, xlim=xlim, ylim=ylim, main="apeglm")
plotMA(resNorm, xlim=xlim, ylim=ylim, main="normal")
plotMA(resAsh, xlim=xlim, ylim=ylim, main="ashr")
#分开噪声点,后面运用 apeglm
#TypeTreatment_Tumor_vs_Normal 34 35 36T vs 34 35 36N
ddsTN<-dds
#resTypeTrX_TN<-lfcShrink(dds, coef="Type_Tumor_vs_Normal", type="apeglm")
resTypeTrX_TN<-results(ddsTN,contrast=c("Type","Tumor","Control"),alpha=0.05)
resTypeTrX_TN
## log2 fold change (MLE): Type Tumor vs Control
## Wald test p-value: Type Tumor vs Control
## DataFrame with 29912 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
## TSPAN6 1107.8741 0.0325524 0.325711 0.0999427 0.920390 0.998486
## TNMD 32.8382 1.4031560 1.356203 1.0346212 0.300846 0.941348
## DPM1 473.0177 0.0528333 0.619625 0.0852666 0.932049 0.999438
## SCYL3 1171.0906 -0.1550594 0.417280 -0.3715957 0.710194 0.992431
## C1orf112 321.5382 -0.2140606 0.496694 -0.4309708 0.666490 0.991134
## ... ... ... ... ... ... ...
## BMP8B-AS1 10.53852 -0.582843 3.484002 -0.1672914 0.867141 0.996036
## H2AL1SP 4.26468 -2.997713 3.846888 -0.7792566 0.435829 0.964254
## NIPBL-DT 1361.89229 -0.269032 0.465077 -0.5784681 0.562948 0.979952
## CERNA2 29.84232 -0.150351 1.893759 -0.0793926 0.936720 0.999731
## LOC100996886 32.40953 5.663064 2.655310 2.1327323 NA NA
summary(resTypeTrX_TN)
##
## out of 29912 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 37, 0.12%
## LFC < 0 (down) : 5, 0.017%
## outliers [1] : 1970, 6.6%
## low counts [2] : 0, 0%
## (mean count < 1)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
#N作为标准,得出上调和下降
resTypeTrX_TN_Ordered<-resTypeTrX_TN[order(resTypeTrX_TN$padj),]
head(resTypeTrX_TN_Ordered)
## log2 fold change (MLE): Type Tumor vs Control
## Wald test p-value: Type Tumor vs Control
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
## CCL26 14.0100 20.79336 3.91041 5.31743 1.05242e-07 0.000267334
## BEND4 1569.3562 1.85266 0.32287 5.73809 9.57519e-09 0.000267334
## RNU6-744P 15.8184 20.86765 3.91000 5.33699 9.45000e-08 0.000267334
## RPSAP23 14.0100 20.79336 3.91041 5.31743 1.05242e-07 0.000267334
## MBD3L2 14.4220 20.83125 3.91031 5.32726 9.97044e-08 0.000267334
## PLSCR5 14.4220 20.83125 3.91031 5.32726 9.97044e-08 0.000267334
#通过调整后的p值将,基因重新排序
#第一张热图 T vs N
set.seed(1)
#if (!requireNamespace("BiocManager", quiet = TRUE))
#install.packages("BiocManager")
#BiocManager::install("ComplexHeatmap")
library(ComplexHeatmap)
## Loading required package: grid
##
## Attaching package: 'grid'
## The following object is masked from 'package:Biostrings':
##
## pattern
## ========================================
## ComplexHeatmap version 2.10.0
## Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
## Github page: https://github.com/jokergoo/ComplexHeatmap
## Documentation: http://jokergoo.github.io/ComplexHeatmap-reference
##
## If you use it in published research, please cite:
## Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional
## genomic data. Bioinformatics 2016.
##
## The new InteractiveComplexHeatmap package can directly export static
## complex heatmaps into an interactive Shiny app with zero effort. Have a try!
##
## This message can be suppressed by:
## suppressPackageStartupMessages(library(ComplexHeatmap))
## ========================================
library("pheatmap")
##
## Attaching package: 'pheatmap'
## The following object is masked from 'package:ComplexHeatmap':
##
## pheatmap
select_genes_resTypeTrX_TN_Ordered<-rownames(resTypeTrX_TN_Ordered)
select_genes_resTypeTrX_TN_Ordered<-select_genes_resTypeTrX_TN_Ordered[1:42]
write.csv(as.data.frame(resTypeTrX_TN_Ordered),
file="/Users/dongxiaotai/Desktop/resTypeTrX_TN_Ordered.csv")
write.csv(as.data.frame(resTypeTrX_TN_Ordered[select_genes_resTypeTrX_TN_Ordered,]),
file="/Users/dongxiaotai/Desktop/select_genes_resTypeTrX_TN_Ordered.csv")
vst_TypeTrX_TN<-vst(ddsTN,blind = FALSE)
df_resTypeTrX_TN <- as.data.frame(colData(vst_TypeTrX_TN)["Type"])
pheatmap(assay(vst_TypeTrX_TN)[select_genes_resTypeTrX_TN_Ordered,], cluster_rows=TRUE, show_rownames=TRUE,
cluster_cols=TRUE, annotation_col=df_resTypeTrX_TN, fontsize=8, main = "34,35,36C vs 34,35,36T ",name="vst(FPKM)")
#8 vs 9
ddsGleason89_ctlNT<-dds
design(ddsGleason89_ctlNT)<-~Type + Gleason
ddsGleason89_ctlNT<-DESeq(ddsGleason89_ctlNT)
## using pre-existing size factors
## estimating dispersions
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
resultsNames(ddsGleason89_ctlNT)
## [1] "Intercept" "Type_Tumor_vs_Control" "Gleason_2_vs_1"
resGleason89_ctlNT<-results(ddsGleason89_ctlNT,contrast=c("Gleason","2","1"),alpha=0.05)
resGleason89_ctlNT
## log2 fold change (MLE): Gleason 2 vs 1
## Wald test p-value: Gleason 2 vs 1
## DataFrame with 29912 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
## TSPAN6 1107.8741 -0.00885828 0.337065 -0.0262806 0.979033 0.990579
## TNMD 32.8382 -1.43220889 1.357417 -1.0550987 0.291380 0.600705
## DPM1 473.0177 0.73435841 0.638022 1.1509930 0.249735 0.562239
## SCYL3 1171.0906 0.56378587 0.390504 1.4437383 0.148813 0.459201
## C1orf112 321.5382 0.19070630 0.527008 0.3618662 0.717452 0.876603
## ... ... ... ... ... ... ...
## BMP8B-AS1 10.53852 -1.792776 3.946374 -0.454284 0.6496241 NA
## H2AL1SP 4.26468 1.296251 4.115195 0.314991 0.7527682 NA
## NIPBL-DT 1361.89229 0.808984 0.414776 1.950412 0.0511271 0.301514
## CERNA2 29.84232 -0.333639 2.105841 -0.158435 0.8741141 0.947013
## LOC100996886 32.40953 -5.702918 2.792202 -2.042445 0.0411074 0.277870
summary(resGleason89_ctlNT)
##
## out of 29912 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 139, 0.46%
## LFC < 0 (down) : 194, 0.65%
## outliers [1] : 0, 0%
## low counts [2] : 8119, 27%
## (mean count < 26)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
resGleason89_ctlNT_Ordered<-resGleason89_ctlNT[order(resGleason89_ctlNT$padj),]
head(resGleason89_ctlNT_Ordered)
## log2 fold change (MLE): Gleason 2 vs 1
## Wald test p-value: Gleason 2 vs 1
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
## ANPEP 4877.91 -5.70051 0.472519 -12.06410 1.63436e-33 3.56176e-29
## NCAPD3 10906.07 -3.66525 0.315231 -11.62718 2.99826e-31 3.26705e-27
## DHCR24 5570.05 -3.34382 0.355532 -9.40511 5.19738e-21 2.85443e-17
## SCHLAP1 1260.44 5.75391 0.611840 9.40427 5.23916e-21 2.85443e-17
## IGHA2 3746.03 -4.75000 0.557064 -8.52685 1.50390e-17 6.55488e-14
## BANK1 1142.58 -2.93922 0.360601 -8.15088 3.61285e-16 1.31225e-12
#第2张热图
set.seed(88)
library("pheatmap")
select_genes_resGleason89_ctlNT_Ordered<-rownames(resGleason89_ctlNT_Ordered)
select_genes_resGleason89_ctlNT_Ordered<-select_genes_resGleason89_ctlNT_Ordered[1:30]
vst_Gleason89_ctlNT<-vst(ddsGleason89_ctlNT,blind = FALSE)
df_resGleason89_ctlNT <- as.data.frame(colData(vst_Gleason89_ctlNT)[,c("Type","Gleason")])
pheatmap(assay(vst_Gleason89_ctlNT)[select_genes_resGleason89_ctlNT_Ordered,], cluster_rows=TRUE, show_rownames=TRUE,
cluster_cols=TRUE, annotation_col=df_resGleason89_ctlNT,fontsize=8, main = "Gleason8 vs Gleason9,Control CT ",name="vst(FPKM)")
###Pre vs Post (Control N vs T)
ddsPrePost_ctlNT<-dds
design(ddsPrePost_ctlNT)<-~Type + Treatment
ddsPrePost_ctlNT<-DESeq(ddsPrePost_ctlNT)
## using pre-existing size factors
## estimating dispersions
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
resultsNames(ddsPrePost_ctlNT)
## [1] "Intercept" "Type_Tumor_vs_Control" "Treatment_2_vs_1"
resPrePost_ctlNT<-results(ddsPrePost_ctlNT, contrast=c("Treatment","2","1"), alpha=0.05)
resPrePost_ctlNT
## log2 fold change (MLE): Treatment 2 vs 1
## Wald test p-value: Treatment 2 vs 1
## DataFrame with 29912 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
## TSPAN6 1107.8741 0.183342 0.306017 0.599125 0.5490895 0.743449
## TNMD 32.8382 -1.263376 1.384109 -0.912772 0.3613623 0.586899
## DPM1 473.0177 0.805517 0.590631 1.363823 0.1726233 0.384804
## SCYL3 1171.0906 0.634129 0.341642 1.856119 0.0634366 0.211961
## C1orf112 321.5382 0.333883 0.491251 0.679658 0.4967210 0.704565
## ... ... ... ... ... ... ...
## BMP8B-AS1 10.53852 -6.094999 3.675364 -1.658339 0.0972491 NA
## H2AL1SP 4.26468 3.174072 4.069047 0.780053 0.4353598 NA
## NIPBL-DT 1361.89229 0.712923 0.404476 1.762587 0.0779702 0.239104
## CERNA2 29.84232 1.580403 1.974819 0.800277 0.4235501 0.642771
## LOC100996886 32.40953 -5.740451 2.788422 -2.058673 0.0395255 0.160207
summary(resPrePost_ctlNT)
##
## out of 29912 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 865, 2.9%
## LFC < 0 (down) : 1545, 5.2%
## outliers [1] : 0, 0%
## low counts [2] : 7538, 25%
## (mean count < 23)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
resPrePost_ctlNT_Ordered<-resTypeTrX_TN[order(resPrePost_ctlNT$padj),]
head(resPrePost_ctlNT_Ordered)
## log2 fold change (MLE): Type Tumor vs Control
## Wald test p-value: Type Tumor vs Control
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
## POTEJ 992.145 1.4888086 1.741840 0.8547330 0.3926990 0.960734
## PCA3 2190.298 1.2327145 1.936624 0.6365275 0.5244327 0.976677
## RN7SL2 448990.716 0.0370742 0.658479 0.0563027 0.9551007 0.999731
## TMPRSS2 20680.141 1.2188144 1.379195 0.8837145 0.3768504 0.959836
## RBFOX3 1635.347 -1.4278862 0.755373 -1.8903059 0.0587171 0.858157
## PRRX1 4103.483 -0.6384444 0.637743 -1.0010990 0.3167789 0.942193
#第3张热图pre vs post
set.seed(88)
select_genes_resPrePost_ctlNT_Ordered<-rownames(resPrePost_ctlNT_Ordered)
select_genes_resPrePost_ctlNT_Ordered<-select_genes_resPrePost_ctlNT_Ordered[1:30]
vst_PrePost_ctlNT<-vst(ddsPrePost_ctlNT,blind = FALSE)
df_resPrePost_ctlNT <- as.data.frame(colData(vst_PrePost_ctlNT)[,c("Type","Treatment")])
pheatmap(assay(vst_PrePost_ctlNT)[select_genes_resPrePost_ctlNT_Ordered,], cluster_rows=TRUE, show_rownames=TRUE,
cluster_cols=TRUE, annotation_col=df_resPrePost_ctlNT,fontsize=8, main = "Pre vs Post,Control CT",name="vst(FPKM)")
#Three_Tumors_Only
counts_t<-counts[,c(4:6)]#countst指的t,only
coldata_t<-coldata[c(4:6),]
#library(DESeq2)
dds_t <- DESeqDataSetFromMatrix(countData = counts_t,
colData = coldata_t,
design = ~Treatment )
#Pre-Filtering
keep <- rowSums(counts(dds_t)) >= 10
dds_t <- dds_t[keep,]
#dds+res
dds_t<-DESeq(dds_t)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
#res_t<-lfcShrink(dds_t, )
res_t<-results(dds_t, contrast=c("Treatment","2","1"), alpha=0.05)
summary(res_t)
##
## out of 28561 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 422, 1.5%
## LFC < 0 (down) : 293, 1%
## outliers [1] : 0, 0%
## low counts [2] : 9414, 33%
## (mean count < 59)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
#resTypeTrX_TN<-lfcShrink(dds, coef="Type_Tumor_vs_Normal", type="apeglm")
#resTypeTrX_TN<-results(ddsTN,contrast=c("Type","Tumor","Normal"),alpha=0.05)
#resTypeTrX_TN
#summary(resTypeTrX_TN)
#res_t<-results(dds_t, contrast=c("Treatment","2","1"), alpha=0.05)
#res_t
#summary(res_t)
#res_t<-lfcShrink(dds_t, )
#res_t<-lfcShrink(dds_t, coef = "Treatment_2_vs_1", type = "apeglm")
#summary(res_t)
#testing_heatmap_only tumor
res_t_Ordered<-res_t[order(res_t$padj),]
head(res_t_Ordered)
## log2 fold change (MLE): Treatment 2 vs 1
## Wald test p-value: Treatment 2 vs 1
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
## RN7SL147P 2675.79 8.82928 0.677177 13.0384 7.40301e-39 1.41745e-34
## RNA5-8SP4 1811.92 8.94547 0.723286 12.3678 3.90213e-35 3.73570e-31
## ANKRD35 18097.44 6.34913 0.550263 11.5383 8.45397e-31 5.39561e-27
## RNA5SP215 5706.88 5.94593 0.570019 10.4311 1.78790e-25 8.55821e-22
## RNA5SP324 2411.34 6.75583 0.650013 10.3934 2.65769e-25 1.01774e-21
## RNU2-61P 7818.33 5.58140 0.538177 10.3709 3.36231e-25 1.07297e-21
set.seed(88)
library("pheatmap")
select_genes_res_t_Ordered<-rownames(res_t_Ordered)
select_genes_res_t_Ordered<-select_genes_res_t_Ordered[1:30]
vst_t<-vst(dds_t,blind = FALSE)
df_res_t <- as.data.frame(colData(vst_t)["Treatment"])
pheatmap(assay(vst_t)[select_genes_res_t_Ordered,], cluster_rows=TRUE, show_rownames=TRUE,
cluster_cols=TRUE, annotation_col=df_res_t, fontsize=8, main = "Tumors:Pre vs Post-Treatment",name="vst(FPKM)")
#Normal_Only
counts_n<-counts[,c(1:3)]
coldata_n<-coldata[c(1:3),]
library(DESeq2)
dds_n <- DESeqDataSetFromMatrix(countData = counts_n,
colData = coldata_n,
design = ~ Treatment )
#Pre-Filtering
keep <- rowSums(counts(dds_n)) >= 10
dds_n <- dds_n[keep,]
#dds+res
dds_n<-DESeq(dds_n)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
#res_t<-lfcShrink(dds_t, )
res_n<-results(dds_n, contrast=c("Treatment","2","1"), alpha=0.05)
summary(res_n)
##
## out of 26382 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 84, 0.32%
## LFC < 0 (down) : 211, 0.8%
## outliers [1] : 0, 0%
## low counts [2] : 8184, 31%
## (mean count < 40)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
#testing_heatmap_only normal
res_n_Ordered<-res_n[order(res_n$padj),]
head(res_n_Ordered)
## log2 fold change (MLE): Treatment 2 vs 1
## Wald test p-value: Treatment 2 vs 1
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
## RNA5SP141 60053.281 -3.97976 0.372212 -10.69218 1.10743e-26 2.01530e-22
## PCA3 1478.407 -6.71460 0.696384 -9.64210 5.30905e-22 4.83071e-18
## TMPRSS2 14016.335 -3.21323 0.368393 -8.72228 2.72656e-18 1.42527e-14
## RN7SL2 499314.108 -2.70774 0.311001 -8.70654 3.13280e-18 1.42527e-14
## SPON2 9362.003 -2.75835 0.332613 -8.29298 1.10442e-16 4.01964e-13
## POTEJ 586.631 -6.22581 0.753419 -8.26340 1.41576e-16 4.29401e-13
#library("pheatmap")
set.seed(3)
library("pheatmap")
select_genes_res_n_Ordered<-rownames(res_n_Ordered)
select_genes_res_n_Ordered<-select_genes_res_n_Ordered[1:30]
vst_n<-vst(dds_n,blind = FALSE)
df_res_n <- as.data.frame(colData(vst_n)["Treatment"])
pheatmap(assay(vst_n)[select_genes_res_n_Ordered,], cluster_rows=TRUE, show_rownames=TRUE,
cluster_cols=TRUE, annotation_col=df_res_n, fontsize=8, main = "Normal:Pre vs Post-Treatment",name="vst(FPKM)")
###34vs36;pre vs post
counts_3436_trx<-counts[,c(1,3,4,6)]
coldata_3436_trx<-coldata[c(1,3,4,6),]
#library(DESeq2)
dds_3436_trx <- DESeqDataSetFromMatrix(countData = counts_3436_trx,
colData = coldata_3436_trx,
design = ~ Type + Treatment )
#Pre-Filtering
keep <- rowSums(counts(dds_3436_trx)) >= 10
dds_3436_trx <- dds_3436_trx[keep,]
#dds+res
dds_3436_trx<-DESeq(dds_3436_trx)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
res_3436_trx<-results(dds_3436_trx, contrast=c("Treatment","2","1"), alpha=0.05)
summary(res_3436_trx)
##
## out of 27475 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 787, 2.9%
## LFC < 0 (down) : 630, 2.3%
## outliers [1] : 0, 0%
## low counts [2] : 6392, 23%
## (mean count < 15)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
res_3436_trx_Ordered<-res_3436_trx[order(res_3436_trx$padj),]
head(res_3436_trx_Ordered)
## log2 fold change (MLE): Treatment 2 vs 1
## Wald test p-value: Treatment 2 vs 1
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
## COL2A1 6964.468 6.22930 0.353518 17.6209 1.70325e-69 3.59097e-65
## POTEI 3138.401 -5.20207 0.304108 -17.1060 1.33959e-65 1.41212e-61
## TGM4 1909.905 5.42208 0.349629 15.5081 3.05833e-54 2.14929e-50
## POTEJ 1023.558 -6.36934 0.450505 -14.1382 2.20716e-45 1.16334e-41
## PCA3 598.939 -5.54619 0.487710 -11.3719 5.77243e-30 2.43400e-26
## RNA5SP207 702.878 5.48560 0.490732 11.1784 5.20187e-29 1.82785e-25
#library("pheatmap")
set.seed(70)
select_genes_res_3436_trx_Ordered<-rownames(res_3436_trx_Ordered)
select_genes_res_3436_trx_Ordered<-select_genes_res_3436_trx_Ordered[1:30]
vst_3436_trx<-vst(dds_3436_trx,blind = FALSE)
df_res_3436_trx <- as.data.frame(colData(vst_3436_trx)["Treatment"])
pheatmap(assay(vst_3436_trx)[select_genes_res_3436_trx_Ordered,], cluster_rows=TRUE, show_rownames=TRUE, cluster_cols=TRUE, annotation_col=df_res_3436_trx, fontsize=8, main = "34&36 contrasting Pre and Post-Treatment",name="vst(FPKM)")
3536
counts_3536_trx<-counts[,c(2,3,5,6)]
coldata_3536_trx<-coldata[c(2,3,5,6),]
#library(DESeq2)
dds_3536_trx <- DESeqDataSetFromMatrix(countData = counts_3536_trx,
colData = coldata_3536_trx,
design = ~ Type + Treatment )
#Pre-Filtering
keep <- rowSums(counts(dds_3536_trx)) >= 10
dds_3536_trx <- dds_3536_trx[keep,]
#dds+res
dds_3536_trx<-DESeq(dds_3536_trx)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
res_3536_trx<-results(dds_3536_trx, contrast=c("Treatment","2","1"), alpha=0.05)
summary(res_3536_trx)
##
## out of 28772 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 671, 2.3%
## LFC < 0 (down) : 716, 2.5%
## outliers [1] : 0, 0%
## low counts [2] : 7252, 25%
## (mean count < 30)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
res_3536_trx_Ordered<-res_3536_trx[order(res_3536_trx$padj),]
head(res_3536_trx_Ordered)
## log2 fold change (MLE): Treatment 2 vs 1
## Wald test p-value: Treatment 2 vs 1
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
## RN7SL147P 2510.74 8.81406 0.536080 16.4417 9.62320e-61 2.07091e-56
## PCA3 2458.68 -7.55776 0.486495 -15.5351 2.00700e-54 2.15953e-50
## ACSM1 2751.65 -5.94770 0.428096 -13.8934 6.94895e-44 4.98472e-40
## ANPEP 6546.93 -5.94850 0.443400 -13.4157 4.89490e-41 2.63346e-37
## CTRB1 2441.57 6.73694 0.537736 12.5283 5.22459e-36 2.24866e-32
## CHRNA2 1290.91 -6.27367 0.546778 -11.4739 1.78460e-30 6.40078e-27
set.seed(1)
library(ComplexHeatmap)
#library("pheatmap")
select_genes_res_3536_trx_Ordered<-rownames(res_3536_trx_Ordered)
select_genes_res_3536_trx_Ordered<-select_genes_res_3536_trx_Ordered[1:30]
vst_3536_trx<-vst(dds_3536_trx,blind = FALSE)
df_res_3536_trx <- as.data.frame(colData(vst_3536_trx)["Treatment"])
pheatmap(assay(vst_3536_trx)[select_genes_res_3536_trx_Ordered,], cluster_rows=TRUE, show_rownames=TRUE,
cluster_cols=TRUE, annotation_col=df_res_3536_trx,fontsize=8, main = "35&36 contrasting Pre- and Post- Treatment",name="vst(FPKM)")
3436,3536 各选取top100 并寻找share gene
write.csv(as.data.frame(res_3436_trx_Ordered[1:100,]),
file="/Users/dongxiaotai/Desktop/res_3436_trx_Orderedtop100.csv")
write.csv(as.data.frame(res_3536_trx_Ordered[1:100,]),
file="/Users/dongxiaotai/Desktop/res_3536_trx_Orderedtop100.csv")
setwd("/Users/dongxiaotai/Desktop/TRGN_lab/final/")
library(readxl)
shared_genes_343536_trx<-read_excel('/Users/dongxiaotai/Desktop/shared_gene.xlsx',col_names = FALSE)
## New names:
## * `` -> ...1
shared_genes_343536_trx<-shared_genes_343536_trx[,1]
shared_genes_343536_trx<-t(shared_genes_343536_trx)
shared_genes_343536_trx<-shared_genes_343536_trx[1,]
dds_all_trx<-dds
design(dds_all_trx)<-~ Type + Treatment
dds_all_trx<-DESeq(dds_all_trx)
## using pre-existing size factors
## estimating dispersions
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
res_all_trx<-results(dds_all_trx, contrast=c("Treatment","2","1"),alpha=0.05)
set.seed(1)
library(ComplexHeatmap)
#library("pheatmap")
#select_genes_res_3536_trx_Ordered<-rownames(res_3536_trx_Ordered)
#select_genes_res_3536_trx_Ordered<-select_genes_res_3536_trx_Ordered[1:30]
vst_all_trx<-vst(dds_all_trx,blind = FALSE)
df_res_all_trx <- as.data.frame(colData(vst_all_trx)["Treatment"])
pheatmap(assay(vst_all_trx)[shared_genes_343536_trx,], cluster_rows=TRUE, show_rownames=TRUE,
cluster_cols=TRUE, annotation_col=df_res_all_trx,fontsize=8, main = "Shared Genes by 34vs36 & 35vs36 Pre/Post Treatments",name="vst(FPKM)", scale = "row")
vsd <- vst(dds, blind=FALSE)
plotPCA(vsd, intgroup=c("Treatment", "Type"))