在这一节中,我探索了r中的结果,可视化的结果帮助我更清楚地理解数据信息和数据结构。在DESeq2中,plotMA函数显示了由DeseqDataSet中所有样本的标准化计数的平均值引起的log2倍数的变化。在许多研究中,热图通常是一种探索枚举矩阵的直观方法。在DESeq2中,提出了两种变换方法。一种是方差稳定变换(VST),另一种是正则对数变换(Rlog)。由于VST的运行时间比Rlog短,所以我选择VST (FPKM)作为热图中的所有颜色。
#Set the working path, install various required packages,preprocess the COUNTS file
library(stringr)
library(Homo.sapiens)
## Loading required package: AnnotationDbi
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: IRanges
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: OrganismDbi
## Loading required package: GenomicFeatures
## Loading required package: GenomeInfoDb
## Loading required package: GenomicRanges
## Loading required package: GO.db
##
## Loading required package: org.Hs.eg.db
##
## Loading required package: TxDb.Hsapiens.UCSC.hg19.knownGene
setwd("/Users/dongxiaotai/Desktop/TRGN_lab/final/")
counts <- read.table("ProstCa_030921_2.txt")
geneid <- rownames(counts)
gene <- str_match(geneid, "(\\w*).*")#Ready to remove the decimal point
geneid <- gene[,2]#Let's get rid of the next two decimal points
geneid2<-geneid#We're going to map the ENS name after the decimal point to the gene
geneid2=data.frame(geneid2)#Convert to data.frame format
counts<-cbind(counts,geneid2)#Merge counts and geneid2 to counts
#Convert geneid and get rid of duplicate genes
setwd("/Users/dongxiaotai/Desktop/TRGN_lab/final/")
genes <- select(Homo.sapiens, keys=geneid,
columns=c("SYMBOL","TXCHROM"),
keytype="ENSEMBL")
## 'select()' returned many:many mapping between keys and columns
dim(genes)
## [1] 60216 3
genes <- genes[!duplicated(genes$ENSEMBL),]
counts<-counts[!duplicated(counts$geneid2),]
row.names(counts)<-counts$geneid2
#Change CountsReady back to a more manageable matrix. Change the column names of the matrix, group and build a matrix that formally processes the data
library(GenomicAlignments)
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
##
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:Biobase':
##
## anyMissing, rowMedians
##
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
##
## colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
## colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
## colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
## colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
## colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
## colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
## colWeightedMeans, colWeightedMedians, colWeightedSds,
## colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
## rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
## rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
## rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
## rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
## rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
## rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
## rowWeightedSds, rowWeightedVars
## The following object is masked from 'package:Biobase':
##
## rowMedians
## Loading required package: Biostrings
## Loading required package: XVector
##
## Attaching package: 'Biostrings'
## The following object is masked from 'package:base':
##
## strsplit
## Loading required package: Rsamtools
counts<-cbind(counts,genes$SYMBOL)
counts_4<-na.omit(counts)
counts_4<-counts_4[!duplicated(counts_4$`genes$SYMBOL`),]
row.names(counts_4)<-counts_4$`genes$SYMBOL`
counts_ready<-counts_4[,c(1,2,3,4,5,6)] #use counts_ready
counts<-counts_ready
#By the end of this step, we have a clear counts and have removed the decimal point for ENS to correspond to the gene name. This matrix is now called counts_filtered
#Build the complete matrix list,add sample information.I added the information before and after treatment and the Gleason score to the sample information. In “Counts” after reprocessing, 3 of the 6 data sets are C1 and the other 3 are T1. In ID, “C” stands for Control and “T” stands for Tumor.
counts_filtered<-counts[1:6]
coldata=data.frame(row.names=c("34C","35C","36C","34T","35T","36T"),
Type=rep(c("Control","Tumor"),each=3),
Gleason=rep(c("9","8","9"),2),
Treatment=rep(c("Pre","Pre","Post"),2))
#Display the table
coldata
## Type Gleason Treatment
## 34C Control 9 Pre
## 35C Control 8 Pre
## 36C Control 9 Post
## 34T Tumor 9 Pre
## 35T Tumor 8 Pre
## 36T Tumor 9 Post
coldata$Type<-factor(coldata$Type)
coldata$Gleason<-factor(coldata$Gleason)
coldata$Treatment<-factor(coldata$Treatment)
#Differential expression analysis
library(DESeq2)
#if (!requireNamespace("BiocManager", quiet = TRUE))
#install.packages("BiocManager")
#BiocManager::install("SummarizedExperiment")
dds <- DESeqDataSetFromMatrix(countData = counts_filtered,
colData = coldata,
design = ~ Type )
#Pre-Filtering If this row adds up, if it adds up to a 10 you can keep it, because if you don’t use a 0, you get rid of it. As long as one of these rows is not 0, then 0 is also information #Prefiltering has two important effects: we reduce the memory size of DDS data objects by removing rows that are rarely read; We have improved the speed of converting and testing functions in DESeq2. In this step, we perform minimum prefiltering to preserve only rows that have at least 10 reads.
keep <- rowSums(counts(dds)) >= 10
dds <- dds[keep,]
dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
res <- results(dds)
#TypeTreatment_Tumor_vs_Normal 34 35 36T vs 34 35 36C
ddsTN<-dds
resTypeTrX_TN<-results(ddsTN,contrast=c("Type","Tumor","Control"),alpha=0.05)
resTypeTrX_TN
## log2 fold change (MLE): Type Tumor vs Control
## Wald test p-value: Type Tumor vs Control
## DataFrame with 29912 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
## TSPAN6 1107.8741 0.0325524 0.325711 0.0999427 0.920390 0.998486
## TNMD 32.8382 1.4031560 1.356203 1.0346212 0.300846 0.941348
## DPM1 473.0177 0.0528333 0.619625 0.0852666 0.932049 0.999438
## SCYL3 1171.0906 -0.1550594 0.417280 -0.3715957 0.710194 0.992431
## C1orf112 321.5382 -0.2140606 0.496694 -0.4309708 0.666490 0.991134
## ... ... ... ... ... ... ...
## BMP8B-AS1 10.53852 -0.582843 3.484002 -0.1672914 0.867141 0.996036
## H2AL1SP 4.26468 -2.997713 3.846888 -0.7792566 0.435829 0.964254
## NIPBL-DT 1361.89229 -0.269032 0.465077 -0.5784681 0.562948 0.979952
## CERNA2 29.84232 -0.150351 1.893759 -0.0793926 0.936720 0.999731
## LOC100996886 32.40953 5.663064 2.655310 2.1327323 NA NA
summary(resTypeTrX_TN)
##
## out of 29912 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 37, 0.12%
## LFC < 0 (down) : 5, 0.017%
## outliers [1] : 1970, 6.6%
## low counts [2] : 0, 0%
## (mean count < 1)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
resTypeTrX_TN_Ordered<-resTypeTrX_TN[order(resTypeTrX_TN$padj),]
head(resTypeTrX_TN_Ordered)
## log2 fold change (MLE): Type Tumor vs Control
## Wald test p-value: Type Tumor vs Control
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
## CCL26 14.0100 20.79336 3.91041 5.31743 1.05242e-07 0.000267334
## BEND4 1569.3562 1.85266 0.32287 5.73809 9.57519e-09 0.000267334
## RNU6-744P 15.8184 20.86765 3.91000 5.33699 9.45000e-08 0.000267334
## RPSAP23 14.0100 20.79336 3.91041 5.31743 1.05242e-07 0.000267334
## MBD3L2 14.4220 20.83125 3.91031 5.32726 9.97044e-08 0.000267334
## PLSCR5 14.4220 20.83125 3.91031 5.32726 9.97044e-08 0.000267334
#Reorder the genes by the adjusted p value
set.seed(1)
#if (!requireNamespace("BiocManager", quiet = TRUE))
#install.packages("BiocManager")
#BiocManager::install("ComplexHeatmap")
library(ComplexHeatmap)
## Loading required package: grid
##
## Attaching package: 'grid'
## The following object is masked from 'package:Biostrings':
##
## pattern
## ========================================
## ComplexHeatmap version 2.10.0
## Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
## Github page: https://github.com/jokergoo/ComplexHeatmap
## Documentation: http://jokergoo.github.io/ComplexHeatmap-reference
##
## If you use it in published research, please cite:
## Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional
## genomic data. Bioinformatics 2016.
##
## The new InteractiveComplexHeatmap package can directly export static
## complex heatmaps into an interactive Shiny app with zero effort. Have a try!
##
## This message can be suppressed by:
## suppressPackageStartupMessages(library(ComplexHeatmap))
## ========================================
library("pheatmap")
##
## Attaching package: 'pheatmap'
## The following object is masked from 'package:ComplexHeatmap':
##
## pheatmap
select_genes_resTypeTrX_TN_Ordered<-rownames(resTypeTrX_TN_Ordered)
select_genes_resTypeTrX_TN_Ordered<-select_genes_resTypeTrX_TN_Ordered[1:42]
write.csv(as.data.frame(resTypeTrX_TN_Ordered),
file="/Users/dongxiaotai/Desktop/resTypeTrX_TN_Ordered.csv")
write.csv(as.data.frame(resTypeTrX_TN_Ordered[select_genes_resTypeTrX_TN_Ordered,]),
file="/Users/dongxiaotai/Desktop/select_genes_resTypeTrX_TN_Ordered.csv")
vst_TypeTrX_TN<-vst(ddsTN,blind = FALSE)
df_resTypeTrX_TN <- as.data.frame(colData(vst_TypeTrX_TN)["Type"])
ComplexHeatmap::pheatmap(assay(vst_TypeTrX_TN)[select_genes_resTypeTrX_TN_Ordered,], cluster_rows=TRUE, show_rownames=TRUE,cluster_cols=TRUE, annotation_col=df_resTypeTrX_TN, fontsize=6.5, main = "34,35,36C vs 34,35,36T ",name="vst(FPKM)")
# As you can see from the heat map, the control sample and the tumor sample are clustered separately. In the control group, gene expression was very similar.
#Gleason8 vs Gleason9 (control CT)
ddsGleason89_ctlNT<-dds
design(ddsGleason89_ctlNT)<-~Type + Gleason
ddsGleason89_ctlNT<-DESeq(ddsGleason89_ctlNT)
## using pre-existing size factors
## estimating dispersions
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
resultsNames(ddsGleason89_ctlNT)
## [1] "Intercept" "Type_Tumor_vs_Control" "Gleason_9_vs_8"
resGleason89_ctlNT<-results(ddsGleason89_ctlNT,contrast=c("Gleason","9","8"),alpha=0.05)
resGleason89_ctlNT
## log2 fold change (MLE): Gleason 9 vs 8
## Wald test p-value: Gleason 9 vs 8
## DataFrame with 29912 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
## TSPAN6 1107.8741 -0.00885828 0.337065 -0.0262806 0.979033 0.990579
## TNMD 32.8382 -1.43220889 1.357417 -1.0550987 0.291380 0.600705
## DPM1 473.0177 0.73435841 0.638022 1.1509930 0.249735 0.562239
## SCYL3 1171.0906 0.56378587 0.390504 1.4437383 0.148813 0.459201
## C1orf112 321.5382 0.19070630 0.527008 0.3618662 0.717452 0.876603
## ... ... ... ... ... ... ...
## BMP8B-AS1 10.53852 -1.792776 3.946374 -0.454284 0.6496241 NA
## H2AL1SP 4.26468 1.296251 4.115195 0.314991 0.7527682 NA
## NIPBL-DT 1361.89229 0.808984 0.414776 1.950412 0.0511271 0.301514
## CERNA2 29.84232 -0.333639 2.105841 -0.158435 0.8741141 0.947013
## LOC100996886 32.40953 -5.702918 2.792202 -2.042445 0.0411074 0.277870
summary(resGleason89_ctlNT)
##
## out of 29912 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 139, 0.46%
## LFC < 0 (down) : 194, 0.65%
## outliers [1] : 0, 0%
## low counts [2] : 8119, 27%
## (mean count < 26)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
In “Gleason8 vs Gleason9,Control CT”, the results table showed 139 genes up-regulated and 194 down-regulated.
resGleason89_ctlNT_Ordered<-resGleason89_ctlNT[order(resGleason89_ctlNT$padj),]
head(resGleason89_ctlNT_Ordered)
## log2 fold change (MLE): Gleason 9 vs 8
## Wald test p-value: Gleason 9 vs 8
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
## ANPEP 4877.91 -5.70051 0.472519 -12.06410 1.63436e-33 3.56176e-29
## NCAPD3 10906.07 -3.66525 0.315231 -11.62718 2.99826e-31 3.26705e-27
## DHCR24 5570.05 -3.34382 0.355532 -9.40511 5.19738e-21 2.85443e-17
## SCHLAP1 1260.44 5.75391 0.611840 9.40427 5.23916e-21 2.85443e-17
## IGHA2 3746.03 -4.75000 0.557064 -8.52685 1.50390e-17 6.55488e-14
## BANK1 1142.58 -2.93922 0.360601 -8.15088 3.61285e-16 1.31225e-12
#第2张热图
set.seed(88)
library("pheatmap")
select_genes_resGleason89_ctlNT_Ordered<-rownames(resGleason89_ctlNT_Ordered)
select_genes_resGleason89_ctlNT_Ordered<-select_genes_resGleason89_ctlNT_Ordered[1:30]
vst_Gleason89_ctlNT<-vst(ddsGleason89_ctlNT,blind = FALSE)
df_resGleason89_ctlNT <- as.data.frame(colData(vst_Gleason89_ctlNT)[,c("Type","Gleason")])
ComplexHeatmap::pheatmap(assay(vst_Gleason89_ctlNT)[select_genes_resGleason89_ctlNT_Ordered,], cluster_rows=TRUE, show_rownames=TRUE,
cluster_cols=TRUE, annotation_col=df_resGleason89_ctlNT,fontsize=8, main = "Gleason8 vs Gleason9,Control CT ",name="vst(FPKM)")
In the heat map, Gleason8 and Gleason9 are clustered separately when we control for the effect of sample type. In Gleason9, 34C and 36C are clustered together, 34T and 36T are clustered together.
###Pre vs Post (Control C vs T)
ddsPrePost_ctlNT<-dds
design(ddsPrePost_ctlNT)<-~Type + Treatment
ddsPrePost_ctlNT<-DESeq(ddsPrePost_ctlNT)
## using pre-existing size factors
## estimating dispersions
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
resultsNames(ddsPrePost_ctlNT)
## [1] "Intercept" "Type_Tumor_vs_Control" "Treatment_Pre_vs_Post"
resPrePost_ctlNT<-results(ddsPrePost_ctlNT, contrast=c("Treatment","Post","Pre"), alpha=0.05)
resPrePost_ctlNT
## log2 fold change (MLE): Treatment Post vs Pre
## Wald test p-value: Treatment Post vs Pre
## DataFrame with 29912 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
## TSPAN6 1107.8741 0.183342 0.306017 0.599122 0.5490916 0.743446
## TNMD 32.8382 -1.263384 1.384108 -0.912779 0.3613590 0.586894
## DPM1 473.0177 0.805513 0.590631 1.363817 0.1726250 0.384807
## SCYL3 1171.0906 0.634128 0.341642 1.856115 0.0634371 0.211958
## C1orf112 321.5382 0.333881 0.491251 0.679654 0.4967236 0.704558
## ... ... ... ... ... ... ...
## BMP8B-AS1 10.53852 -6.094994 3.675343 -1.658347 0.0972474 NA
## H2AL1SP 4.26468 3.174002 4.069027 0.780040 0.4353676 NA
## NIPBL-DT 1361.89229 0.712922 0.404476 1.762583 0.0779709 0.239106
## CERNA2 29.84232 1.580382 1.974816 0.800268 0.4235555 0.642765
## LOC100996886 32.40953 -5.740423 2.788411 -2.058672 0.0395257 0.160208
summary(resPrePost_ctlNT)
##
## out of 29912 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 866, 2.9%
## LFC < 0 (down) : 1545, 5.2%
## outliers [1] : 0, 0%
## low counts [2] : 7538, 25%
## (mean count < 23)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
resPrePost_ctlNT_Ordered<-resTypeTrX_TN[order(resPrePost_ctlNT$padj),]
head(resPrePost_ctlNT_Ordered)
## log2 fold change (MLE): Type Tumor vs Control
## Wald test p-value: Type Tumor vs Control
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
## POTEJ 992.145 1.4888086 1.741840 0.8547330 0.3926990 0.960734
## PCA3 2190.298 1.2327145 1.936624 0.6365275 0.5244327 0.976677
## RN7SL2 448990.716 0.0370742 0.658479 0.0563027 0.9551007 0.999731
## TMPRSS2 20680.141 1.2188144 1.379195 0.8837145 0.3768504 0.959836
## RBFOX3 1635.347 -1.4278862 0.755373 -1.8903059 0.0587171 0.858157
## PRRX1 4103.483 -0.6384444 0.637743 -1.0010990 0.3167789 0.942193
#第3张热图pre vs post
set.seed(88)
select_genes_resPrePost_ctlNT_Ordered<-rownames(resPrePost_ctlNT_Ordered)
select_genes_resPrePost_ctlNT_Ordered<-select_genes_resPrePost_ctlNT_Ordered[1:30]
vst_PrePost_ctlNT<-vst(ddsPrePost_ctlNT,blind = FALSE)
df_resPrePost_ctlNT <- as.data.frame(colData(vst_PrePost_ctlNT)[,c("Type","Treatment")])
ComplexHeatmap::pheatmap(assay(vst_PrePost_ctlNT)[select_genes_resPrePost_ctlNT_Ordered,], cluster_rows=TRUE, show_rownames=TRUE,
cluster_cols=TRUE, annotation_col=df_resPrePost_ctlNT,fontsize=8, main = "Pre vs Post,Control CT",name="vst(FPKM)")
#In the heat map, the pre - and post-treatment samples were clustered separately when we controlled for the effect of sample type. Gene expression in tumor samples was significantly different before and after treatment.
Three_Tumors_Only
counts_t<-counts[,c(4:6)]#counts_t——tumor only
coldata_t<-coldata[c(4:6),]
#library(DESeq2)
dds_t <- DESeqDataSetFromMatrix(countData = counts_t,
colData = coldata_t,
design = ~Treatment )
#Pre-Filtering
keep <- rowSums(counts(dds_t)) >= 10
dds_t <- dds_t[keep,]
#dds+res
dds_t<-DESeq(dds_t)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
res_t<-results(dds_t, contrast=c("Treatment","Post","Pre"), alpha=0.05)
summary(res_t)
##
## out of 28561 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 422, 1.5%
## LFC < 0 (down) : 293, 1%
## outliers [1] : 0, 0%
## low counts [2] : 9414, 33%
## (mean count < 59)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
#testing_heatmap_only tumor
res_t_Ordered<-res_t[order(res_t$padj),]
head(res_t_Ordered)
## log2 fold change (MLE): Treatment Post vs Pre
## Wald test p-value: Treatment Post vs Pre
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
## RN7SL147P 2675.79 8.82928 0.677177 13.0384 7.40348e-39 1.41754e-34
## RNA5-8SP4 1811.92 8.94547 0.723285 12.3678 3.90236e-35 3.73593e-31
## ANKRD35 18097.44 6.34912 0.550263 11.5383 8.45459e-31 5.39600e-27
## RNA5SP215 5706.88 5.94593 0.570019 10.4311 1.78800e-25 8.55873e-22
## RNA5SP324 2411.34 6.75583 0.650013 10.3934 2.65784e-25 1.01779e-21
## RNU2-61P 7818.33 5.58139 0.538177 10.3709 3.36251e-25 1.07303e-21
set.seed(88)
library("pheatmap")
select_genes_res_t_Ordered<-rownames(res_t_Ordered)
select_genes_res_t_Ordered<-select_genes_res_t_Ordered[1:30]
vst_t<-vst(dds_t,blind = FALSE)
df_res_t <- as.data.frame(colData(vst_t)["Treatment"])
ComplexHeatmap::pheatmap(assay(vst_t)[select_genes_res_t_Ordered,], cluster_rows=TRUE, show_rownames=TRUE,
cluster_cols=TRUE, annotation_col=df_res_t, fontsize=8, main = "Tumors:Pre vs Post-Treatment",name="vst(FPKM)")
#Normal_Only
counts_n<-counts[,c(1:3)]
coldata_n<-coldata[c(1:3),]
library(DESeq2)
dds_n <- DESeqDataSetFromMatrix(countData = counts_n,
colData = coldata_n,
design = ~ Treatment )
#Pre-Filtering
keep <- rowSums(counts(dds_n)) >= 10
dds_n <- dds_n[keep,]
#dds+res
dds_n<-DESeq(dds_n)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
#res_t<-lfcShrink(dds_t, )
res_n<-results(dds_n, contrast=c("Treatment","Post","Pre"), alpha=0.05)
summary(res_n)
##
## out of 26382 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 84, 0.32%
## LFC < 0 (down) : 211, 0.8%
## outliers [1] : 0, 0%
## low counts [2] : 8184, 31%
## (mean count < 40)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
#testing_heatmap_only normal
res_n_Ordered<-res_n[order(res_n$padj),]
head(res_n_Ordered)
## log2 fold change (MLE): Treatment Post vs Pre
## Wald test p-value: Treatment Post vs Pre
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
## RNA5SP141 60053.281 -3.97976 0.372212 -10.69218 1.10736e-26 2.01517e-22
## PCA3 1478.407 -6.71461 0.696384 -9.64211 5.30878e-22 4.83046e-18
## TMPRSS2 14016.335 -3.21323 0.368393 -8.72228 2.72644e-18 1.42520e-14
## RN7SL2 499314.108 -2.70774 0.311001 -8.70655 3.13264e-18 1.42520e-14
## SPON2 9362.003 -2.75836 0.332613 -8.29299 1.10438e-16 4.01950e-13
## POTEJ 586.631 -6.22581 0.753419 -8.26341 1.41571e-16 4.29384e-13
#library("pheatmap")
set.seed(3)
library("pheatmap")
select_genes_res_n_Ordered<-rownames(res_n_Ordered)
select_genes_res_n_Ordered<-select_genes_res_n_Ordered[1:30]
vst_n<-vst(dds_n,blind = FALSE)
df_res_n <- as.data.frame(colData(vst_n)["Treatment"])
ComplexHeatmap::pheatmap(assay(vst_n)[select_genes_res_n_Ordered,], cluster_rows=TRUE, show_rownames=TRUE,
cluster_cols=TRUE, annotation_col=df_res_n, fontsize=8, main = "Control:Pre vs Post-Treatment",name="vst(FPKM)")
###34vs36;pre vs post
counts_3436_trx<-counts[,c(1,3,4,6)]
coldata_3436_trx<-coldata[c(1,3,4,6),]
#library(DESeq2)
dds_3436_trx <- DESeqDataSetFromMatrix(countData = counts_3436_trx,
colData = coldata_3436_trx,
design = ~ Type + Treatment )
#Pre-Filtering
keep <- rowSums(counts(dds_3436_trx)) >= 10
dds_3436_trx <- dds_3436_trx[keep,]
#dds+res
dds_3436_trx<-DESeq(dds_3436_trx)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
res_3436_trx<-results(dds_3436_trx, contrast=c("Treatment","Post","Pre"), alpha=0.05)
summary(res_3436_trx)
##
## out of 27475 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 787, 2.9%
## LFC < 0 (down) : 630, 2.3%
## outliers [1] : 0, 0%
## low counts [2] : 6392, 23%
## (mean count < 15)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
res_3436_trx_Ordered<-res_3436_trx[order(res_3436_trx$padj),]
head(res_3436_trx_Ordered)
## log2 fold change (MLE): Treatment Post vs Pre
## Wald test p-value: Treatment Post vs Pre
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
## COL2A1 6964.468 6.22930 0.353518 17.6209 1.70318e-69 3.59081e-65
## POTEI 3138.401 -5.20207 0.304108 -17.1060 1.33953e-65 1.41207e-61
## TGM4 1909.905 5.42208 0.349629 15.5081 3.05849e-54 2.14941e-50
## POTEJ 1023.558 -6.36935 0.450505 -14.1382 2.20709e-45 1.16330e-41
## PCA3 598.939 -5.54619 0.487710 -11.3719 5.77212e-30 2.43387e-26
## RNA5SP207 702.878 5.48559 0.490732 11.1784 5.20184e-29 1.82784e-25
#library("pheatmap")
set.seed(70)
select_genes_res_3436_trx_Ordered<-rownames(res_3436_trx_Ordered)
select_genes_res_3436_trx_Ordered<-select_genes_res_3436_trx_Ordered[1:30]
vst_3436_trx<-vst(dds_3436_trx,blind = FALSE)
df_res_3436_trx <- as.data.frame(colData(vst_3436_trx)["Treatment"])
ComplexHeatmap::pheatmap(assay(vst_3436_trx)[select_genes_res_3436_trx_Ordered,], cluster_rows=TRUE, show_rownames=TRUE, cluster_cols=TRUE, annotation_col=df_res_3436_trx, fontsize=8, main = "34&36 contrasting Pre and Post-Treatment",name="vst(FPKM)")
3536
counts_3536_trx<-counts[,c(2,3,5,6)]
coldata_3536_trx<-coldata[c(2,3,5,6),]
#library(DESeq2)
dds_3536_trx <- DESeqDataSetFromMatrix(countData = counts_3536_trx,
colData = coldata_3536_trx,
design = ~ Type + Treatment )
#Pre-Filtering
keep <- rowSums(counts(dds_3536_trx)) >= 10
dds_3536_trx <- dds_3536_trx[keep,]
#dds+res
dds_3536_trx<-DESeq(dds_3536_trx)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
res_3536_trx<-results(dds_3536_trx, contrast=c("Treatment","Post","Pre"), alpha=0.05)
summary(res_3536_trx)
##
## out of 28772 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 671, 2.3%
## LFC < 0 (down) : 716, 2.5%
## outliers [1] : 0, 0%
## low counts [2] : 7252, 25%
## (mean count < 30)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
res_3536_trx_Ordered<-res_3536_trx[order(res_3536_trx$padj),]
head(res_3536_trx_Ordered)
## log2 fold change (MLE): Treatment Post vs Pre
## Wald test p-value: Treatment Post vs Pre
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
## RN7SL147P 2510.74 8.81406 0.536080 16.4417 9.62063e-61 2.07036e-56
## PCA3 2458.68 -7.55776 0.486495 -15.5351 2.00710e-54 2.15964e-50
## ACSM1 2751.65 -5.94770 0.428096 -13.8934 6.94842e-44 4.98433e-40
## ANPEP 6546.93 -5.94850 0.443400 -13.4157 4.89493e-41 2.63347e-37
## CTRB1 2441.57 6.73694 0.537736 12.5283 5.22524e-36 2.24895e-32
## CHRNA2 1290.91 -6.27367 0.546778 -11.4739 1.78455e-30 6.40057e-27
set.seed(1)
library(ComplexHeatmap)
#library("pheatmap")
select_genes_res_3536_trx_Ordered<-rownames(res_3536_trx_Ordered)
select_genes_res_3536_trx_Ordered<-select_genes_res_3536_trx_Ordered[1:30]
vst_3536_trx<-vst(dds_3536_trx,blind = FALSE)
df_res_3536_trx <- as.data.frame(colData(vst_3536_trx)["Treatment"])
ComplexHeatmap::pheatmap(assay(vst_3536_trx)[select_genes_res_3536_trx_Ordered,], cluster_rows=TRUE, show_rownames=TRUE,
cluster_cols=TRUE, annotation_col=df_res_3536_trx,fontsize=8, main = "35&36 contrasting Pre- and Post- Treatment",name="vst(FPKM)")
3436,3536 各选取top100 并寻找share gene, 经过比对top100和top200 首先top100有多少个,共有基因少了很多个个 top200 有多少个,并且最大的p-value是多少,具有意义
#write.csv(as.data.frame(res_3436_trx_Ordered[1:100,]),
#file="/Users/dongxiaotai/Desktop/res_3436_trx_Orderedtop100.csv")
#write.csv(as.data.frame(res_3536_trx_Ordered[1:100,]),
#file="/Users/dongxiaotai/Desktop/res_3536_trx_Orderedtop100.csv")
write.csv(as.data.frame(res_3436_trx_Ordered[1:200,]),
file="/Users/dongxiaotai/Desktop/res_3436_trx_Orderedtop200.csv")
write.csv(as.data.frame(res_3536_trx_Ordered[1:200,]),
file="/Users/dongxiaotai/Desktop/res_3536_trx_Orderedtop200.csv")
setwd("/Users/dongxiaotai/Desktop/TRGN_lab/final/")
library(readxl)
shared_genes_343536top200_trx<-read_excel('/Users/dongxiaotai/Desktop/sharedgenetop200.xlsx',col_names = FALSE)
## New names:
## * `` -> ...1
shared_genes_343536top200_trx<-shared_genes_343536top200_trx[,1]
shared_genes_343536top200_trx<-t(shared_genes_343536top200_trx)
shared_genes_343536top200_trx<-shared_genes_343536top200_trx[1,]
dds_all_top200trx<-dds
design(dds_all_top200trx)<-~ Type + Treatment
dds_all_top200trx<-DESeq(dds_all_top200trx)
## using pre-existing size factors
## estimating dispersions
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
res_all_top200trx<-results(dds_all_top200trx, contrast=c("Treatment","Post","Pre"),alpha=0.05)
set.seed(1)
library(ComplexHeatmap)
library("pheatmap")
#select_genes_res_343536top200_trx_Ordered<-rownames(res_343536top200_trx_Ordered)
#select_genes_res_343536top200_trx_Ordered<-select_genes_res_343536top200_trx_Ordered[1:30]
vst_all_top200trx<-vst(dds_all_top200trx,blind = FALSE)
df_res_all_top200trx <- as.data.frame(colData(vst_all_top200trx)["Treatment"])
ComplexHeatmap::pheatmap(assay(vst_all_top200trx)[shared_genes_343536top200_trx,], cluster_rows=TRUE, show_rownames=TRUE,
cluster_cols=TRUE, annotation_col=df_res_all_top200trx,fontsize=6.5, main = "Top200 Shared Genes by 34vs36 & 35vs36 Pre/Post Treatments",display_numbers=TRUE,name ="vst(FPKM)")