在这一节中,我探索了r中的结果,可视化的结果帮助我更清楚地理解数据信息和数据结构。在DESeq2中,plotMA函数显示了由DeseqDataSet中所有样本的标准化计数的平均值引起的log2倍数的变化。在许多研究中,热图通常是一种探索枚举矩阵的直观方法。在DESeq2中,提出了两种变换方法。一种是方差稳定变换(VST),另一种是正则对数变换(Rlog)。由于VST的运行时间比Rlog短,所以我选择VST (FPKM)作为热图中的所有颜色。

#Set the working path, install various required packages,preprocess the COUNTS file

library(stringr)
library(Homo.sapiens)
## Loading required package: AnnotationDbi
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: IRanges
## Loading required package: S4Vectors
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: OrganismDbi
## Loading required package: GenomicFeatures
## Loading required package: GenomeInfoDb
## Loading required package: GenomicRanges
## Loading required package: GO.db
## 
## Loading required package: org.Hs.eg.db
## 
## Loading required package: TxDb.Hsapiens.UCSC.hg19.knownGene
setwd("/Users/dongxiaotai/Desktop/TRGN_lab/final/")
counts <- read.table("ProstCa_030921_2.txt")
geneid <- rownames(counts) 
gene <- str_match(geneid, "(\\w*).*")#Ready to remove the decimal point
geneid <- gene[,2]#Let's get rid of the next two decimal points
geneid2<-geneid#We're going to map the ENS name after the decimal point to the gene
geneid2=data.frame(geneid2)#Convert to data.frame format
counts<-cbind(counts,geneid2)#Merge counts and geneid2 to counts

#Convert geneid and get rid of duplicate genes

setwd("/Users/dongxiaotai/Desktop/TRGN_lab/final/")
genes <- select(Homo.sapiens, keys=geneid,
                columns=c("SYMBOL","TXCHROM"),
                keytype="ENSEMBL")
## 'select()' returned many:many mapping between keys and columns
dim(genes)
## [1] 60216     3
genes <- genes[!duplicated(genes$ENSEMBL),]
counts<-counts[!duplicated(counts$geneid2),]
row.names(counts)<-counts$geneid2

#Change CountsReady back to a more manageable matrix. Change the column names of the matrix, group and build a matrix that formally processes the data

library(GenomicAlignments)
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
## 
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:Biobase':
## 
##     anyMissing, rowMedians
## 
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars
## The following object is masked from 'package:Biobase':
## 
##     rowMedians
## Loading required package: Biostrings
## Loading required package: XVector
## 
## Attaching package: 'Biostrings'
## The following object is masked from 'package:base':
## 
##     strsplit
## Loading required package: Rsamtools
counts<-cbind(counts,genes$SYMBOL)
counts_4<-na.omit(counts)
counts_4<-counts_4[!duplicated(counts_4$`genes$SYMBOL`),]
row.names(counts_4)<-counts_4$`genes$SYMBOL`
counts_ready<-counts_4[,c(1,2,3,4,5,6)] #use counts_ready
counts<-counts_ready

#By the end of this step, we have a clear counts and have removed the decimal point for ENS to correspond to the gene name. This matrix is now called counts_filtered

#Build the complete matrix list,add sample information.I added the information before and after treatment and the Gleason score to the sample information. In “Counts” after reprocessing, 3 of the 6 data sets are C1 and the other 3 are T1. In ID, “C” stands for Control and “T” stands for Tumor.

counts_filtered<-counts[1:6] 
coldata=data.frame(row.names=c("34C","35C","36C","34T","35T","36T"),
                   Type=rep(c("Control","Tumor"),each=3),
                   Gleason=rep(c("9","8","9"),2),
                   Treatment=rep(c("Pre","Pre","Post"),2))

#Display the table

coldata
##        Type Gleason Treatment
## 34C Control       9       Pre
## 35C Control       8       Pre
## 36C Control       9      Post
## 34T   Tumor       9       Pre
## 35T   Tumor       8       Pre
## 36T   Tumor       9      Post
coldata$Type<-factor(coldata$Type)
coldata$Gleason<-factor(coldata$Gleason)
coldata$Treatment<-factor(coldata$Treatment)

#Differential expression analysis

library(DESeq2)
#if (!requireNamespace("BiocManager", quiet = TRUE))
    #install.packages("BiocManager")
#BiocManager::install("SummarizedExperiment")
dds <- DESeqDataSetFromMatrix(countData = counts_filtered,
                              colData = coldata,
                              design = ~ Type )

#Pre-Filtering If this row adds up, if it adds up to a 10 you can keep it, because if you don’t use a 0, you get rid of it. As long as one of these rows is not 0, then 0 is also information #Prefiltering has two important effects: we reduce the memory size of DDS data objects by removing rows that are rarely read; We have improved the speed of converting and testing functions in DESeq2. In this step, we perform minimum prefiltering to preserve only rows that have at least 10 reads.

keep <- rowSums(counts(dds)) >= 10
dds <- dds[keep,]
dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
res <- results(dds)

#TypeTreatment_Tumor_vs_Normal 34 35 36T vs 34 35 36C

ddsTN<-dds
resTypeTrX_TN<-results(ddsTN,contrast=c("Type","Tumor","Control"),alpha=0.05)
resTypeTrX_TN
## log2 fold change (MLE): Type Tumor vs Control 
## Wald test p-value: Type Tumor vs Control 
## DataFrame with 29912 rows and 6 columns
##                baseMean log2FoldChange     lfcSE       stat    pvalue      padj
##               <numeric>      <numeric> <numeric>  <numeric> <numeric> <numeric>
## TSPAN6        1107.8741      0.0325524  0.325711  0.0999427  0.920390  0.998486
## TNMD            32.8382      1.4031560  1.356203  1.0346212  0.300846  0.941348
## DPM1           473.0177      0.0528333  0.619625  0.0852666  0.932049  0.999438
## SCYL3         1171.0906     -0.1550594  0.417280 -0.3715957  0.710194  0.992431
## C1orf112       321.5382     -0.2140606  0.496694 -0.4309708  0.666490  0.991134
## ...                 ...            ...       ...        ...       ...       ...
## BMP8B-AS1      10.53852      -0.582843  3.484002 -0.1672914  0.867141  0.996036
## H2AL1SP         4.26468      -2.997713  3.846888 -0.7792566  0.435829  0.964254
## NIPBL-DT     1361.89229      -0.269032  0.465077 -0.5784681  0.562948  0.979952
## CERNA2         29.84232      -0.150351  1.893759 -0.0793926  0.936720  0.999731
## LOC100996886   32.40953       5.663064  2.655310  2.1327323        NA        NA
summary(resTypeTrX_TN)
## 
## out of 29912 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 37, 0.12%
## LFC < 0 (down)     : 5, 0.017%
## outliers [1]       : 1970, 6.6%
## low counts [2]     : 0, 0%
## (mean count < 1)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

In “34 35 36T vs 34 35 36C”, 37 up-regulated genes and 5 down-regulated genes were displayed under the condition of p-value<0.05.

resTypeTrX_TN_Ordered<-resTypeTrX_TN[order(resTypeTrX_TN$padj),]
head(resTypeTrX_TN_Ordered)
## log2 fold change (MLE): Type Tumor vs Control 
## Wald test p-value: Type Tumor vs Control 
## DataFrame with 6 rows and 6 columns
##            baseMean log2FoldChange     lfcSE      stat      pvalue        padj
##           <numeric>      <numeric> <numeric> <numeric>   <numeric>   <numeric>
## CCL26       14.0100       20.79336   3.91041   5.31743 1.05242e-07 0.000267334
## BEND4     1569.3562        1.85266   0.32287   5.73809 9.57519e-09 0.000267334
## RNU6-744P   15.8184       20.86765   3.91000   5.33699 9.45000e-08 0.000267334
## RPSAP23     14.0100       20.79336   3.91041   5.31743 1.05242e-07 0.000267334
## MBD3L2      14.4220       20.83125   3.91031   5.32726 9.97044e-08 0.000267334
## PLSCR5      14.4220       20.83125   3.91031   5.32726 9.97044e-08 0.000267334
#Reorder the genes by the adjusted p value
set.seed(1)
#if (!requireNamespace("BiocManager", quiet = TRUE))
    #install.packages("BiocManager")
#BiocManager::install("ComplexHeatmap")
library(ComplexHeatmap)
## Loading required package: grid
## 
## Attaching package: 'grid'
## The following object is masked from 'package:Biostrings':
## 
##     pattern
## ========================================
## ComplexHeatmap version 2.10.0
## Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
## Github page: https://github.com/jokergoo/ComplexHeatmap
## Documentation: http://jokergoo.github.io/ComplexHeatmap-reference
## 
## If you use it in published research, please cite:
## Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional 
##   genomic data. Bioinformatics 2016.
## 
## The new InteractiveComplexHeatmap package can directly export static 
## complex heatmaps into an interactive Shiny app with zero effort. Have a try!
## 
## This message can be suppressed by:
##   suppressPackageStartupMessages(library(ComplexHeatmap))
## ========================================
library("pheatmap")
## 
## Attaching package: 'pheatmap'
## The following object is masked from 'package:ComplexHeatmap':
## 
##     pheatmap
select_genes_resTypeTrX_TN_Ordered<-rownames(resTypeTrX_TN_Ordered)
select_genes_resTypeTrX_TN_Ordered<-select_genes_resTypeTrX_TN_Ordered[1:42]
write.csv(as.data.frame(resTypeTrX_TN_Ordered), 
          file="/Users/dongxiaotai/Desktop/resTypeTrX_TN_Ordered.csv")
write.csv(as.data.frame(resTypeTrX_TN_Ordered[select_genes_resTypeTrX_TN_Ordered,]), 
          file="/Users/dongxiaotai/Desktop/select_genes_resTypeTrX_TN_Ordered.csv")
vst_TypeTrX_TN<-vst(ddsTN,blind = FALSE)
df_resTypeTrX_TN <- as.data.frame(colData(vst_TypeTrX_TN)["Type"])
ComplexHeatmap::pheatmap(assay(vst_TypeTrX_TN)[select_genes_resTypeTrX_TN_Ordered,], cluster_rows=TRUE, show_rownames=TRUE,cluster_cols=TRUE, annotation_col=df_resTypeTrX_TN, fontsize=6.5, main = "34,35,36C vs 34,35,36T ",name="vst(FPKM)")

# As you can see from the heat map, the control sample and the tumor sample are clustered separately. In the control group, gene expression was very similar.

#Gleason8 vs Gleason9 (control CT)

ddsGleason89_ctlNT<-dds
design(ddsGleason89_ctlNT)<-~Type + Gleason

ddsGleason89_ctlNT<-DESeq(ddsGleason89_ctlNT)
## using pre-existing size factors
## estimating dispersions
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
resultsNames(ddsGleason89_ctlNT)
## [1] "Intercept"             "Type_Tumor_vs_Control" "Gleason_9_vs_8"
resGleason89_ctlNT<-results(ddsGleason89_ctlNT,contrast=c("Gleason","9","8"),alpha=0.05)  
resGleason89_ctlNT
## log2 fold change (MLE): Gleason 9 vs 8 
## Wald test p-value: Gleason 9 vs 8 
## DataFrame with 29912 rows and 6 columns
##                baseMean log2FoldChange     lfcSE       stat    pvalue      padj
##               <numeric>      <numeric> <numeric>  <numeric> <numeric> <numeric>
## TSPAN6        1107.8741    -0.00885828  0.337065 -0.0262806  0.979033  0.990579
## TNMD            32.8382    -1.43220889  1.357417 -1.0550987  0.291380  0.600705
## DPM1           473.0177     0.73435841  0.638022  1.1509930  0.249735  0.562239
## SCYL3         1171.0906     0.56378587  0.390504  1.4437383  0.148813  0.459201
## C1orf112       321.5382     0.19070630  0.527008  0.3618662  0.717452  0.876603
## ...                 ...            ...       ...        ...       ...       ...
## BMP8B-AS1      10.53852      -1.792776  3.946374  -0.454284 0.6496241        NA
## H2AL1SP         4.26468       1.296251  4.115195   0.314991 0.7527682        NA
## NIPBL-DT     1361.89229       0.808984  0.414776   1.950412 0.0511271  0.301514
## CERNA2         29.84232      -0.333639  2.105841  -0.158435 0.8741141  0.947013
## LOC100996886   32.40953      -5.702918  2.792202  -2.042445 0.0411074  0.277870
summary(resGleason89_ctlNT)
## 
## out of 29912 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 139, 0.46%
## LFC < 0 (down)     : 194, 0.65%
## outliers [1]       : 0, 0%
## low counts [2]     : 8119, 27%
## (mean count < 26)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

In “Gleason8 vs Gleason9,Control CT”, the results table showed 139 genes up-regulated and 194 down-regulated.

resGleason89_ctlNT_Ordered<-resGleason89_ctlNT[order(resGleason89_ctlNT$padj),]
head(resGleason89_ctlNT_Ordered)
## log2 fold change (MLE): Gleason 9 vs 8 
## Wald test p-value: Gleason 9 vs 8 
## DataFrame with 6 rows and 6 columns
##          baseMean log2FoldChange     lfcSE      stat      pvalue        padj
##         <numeric>      <numeric> <numeric> <numeric>   <numeric>   <numeric>
## ANPEP     4877.91       -5.70051  0.472519 -12.06410 1.63436e-33 3.56176e-29
## NCAPD3   10906.07       -3.66525  0.315231 -11.62718 2.99826e-31 3.26705e-27
## DHCR24    5570.05       -3.34382  0.355532  -9.40511 5.19738e-21 2.85443e-17
## SCHLAP1   1260.44        5.75391  0.611840   9.40427 5.23916e-21 2.85443e-17
## IGHA2     3746.03       -4.75000  0.557064  -8.52685 1.50390e-17 6.55488e-14
## BANK1     1142.58       -2.93922  0.360601  -8.15088 3.61285e-16 1.31225e-12
#第2张热图
set.seed(88)
library("pheatmap")
select_genes_resGleason89_ctlNT_Ordered<-rownames(resGleason89_ctlNT_Ordered)
select_genes_resGleason89_ctlNT_Ordered<-select_genes_resGleason89_ctlNT_Ordered[1:30]
vst_Gleason89_ctlNT<-vst(ddsGleason89_ctlNT,blind = FALSE)
df_resGleason89_ctlNT <- as.data.frame(colData(vst_Gleason89_ctlNT)[,c("Type","Gleason")])
ComplexHeatmap::pheatmap(assay(vst_Gleason89_ctlNT)[select_genes_resGleason89_ctlNT_Ordered,], cluster_rows=TRUE, show_rownames=TRUE,
         cluster_cols=TRUE, annotation_col=df_resGleason89_ctlNT,fontsize=8, main = "Gleason8 vs Gleason9,Control CT ",name="vst(FPKM)")

In the heat map, Gleason8 and Gleason9 are clustered separately when we control for the effect of sample type. In Gleason9, 34C and 36C are clustered together, 34T and 36T are clustered together.

###Pre vs Post (Control C vs T)

ddsPrePost_ctlNT<-dds

design(ddsPrePost_ctlNT)<-~Type + Treatment

ddsPrePost_ctlNT<-DESeq(ddsPrePost_ctlNT)
## using pre-existing size factors
## estimating dispersions
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
resultsNames(ddsPrePost_ctlNT)
## [1] "Intercept"             "Type_Tumor_vs_Control" "Treatment_Pre_vs_Post"
resPrePost_ctlNT<-results(ddsPrePost_ctlNT, contrast=c("Treatment","Post","Pre"), alpha=0.05)    
resPrePost_ctlNT
## log2 fold change (MLE): Treatment Post vs Pre 
## Wald test p-value: Treatment Post vs Pre 
## DataFrame with 29912 rows and 6 columns
##                baseMean log2FoldChange     lfcSE      stat    pvalue      padj
##               <numeric>      <numeric> <numeric> <numeric> <numeric> <numeric>
## TSPAN6        1107.8741       0.183342  0.306017  0.599122 0.5490916  0.743446
## TNMD            32.8382      -1.263384  1.384108 -0.912779 0.3613590  0.586894
## DPM1           473.0177       0.805513  0.590631  1.363817 0.1726250  0.384807
## SCYL3         1171.0906       0.634128  0.341642  1.856115 0.0634371  0.211958
## C1orf112       321.5382       0.333881  0.491251  0.679654 0.4967236  0.704558
## ...                 ...            ...       ...       ...       ...       ...
## BMP8B-AS1      10.53852      -6.094994  3.675343 -1.658347 0.0972474        NA
## H2AL1SP         4.26468       3.174002  4.069027  0.780040 0.4353676        NA
## NIPBL-DT     1361.89229       0.712922  0.404476  1.762583 0.0779709  0.239106
## CERNA2         29.84232       1.580382  1.974816  0.800268 0.4235555  0.642765
## LOC100996886   32.40953      -5.740423  2.788411 -2.058672 0.0395257  0.160208
summary(resPrePost_ctlNT)
## 
## out of 29912 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 866, 2.9%
## LFC < 0 (down)     : 1545, 5.2%
## outliers [1]       : 0, 0%
## low counts [2]     : 7538, 25%
## (mean count < 23)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

In the “Pre vs Post,Control CT”, the results table showed 866 genes were up-regulated and 1545 genes were down-regulated.

resPrePost_ctlNT_Ordered<-resTypeTrX_TN[order(resPrePost_ctlNT$padj),]
head(resPrePost_ctlNT_Ordered)
## log2 fold change (MLE): Type Tumor vs Control 
## Wald test p-value: Type Tumor vs Control 
## DataFrame with 6 rows and 6 columns
##           baseMean log2FoldChange     lfcSE       stat    pvalue      padj
##          <numeric>      <numeric> <numeric>  <numeric> <numeric> <numeric>
## POTEJ      992.145      1.4888086  1.741840  0.8547330 0.3926990  0.960734
## PCA3      2190.298      1.2327145  1.936624  0.6365275 0.5244327  0.976677
## RN7SL2  448990.716      0.0370742  0.658479  0.0563027 0.9551007  0.999731
## TMPRSS2  20680.141      1.2188144  1.379195  0.8837145 0.3768504  0.959836
## RBFOX3    1635.347     -1.4278862  0.755373 -1.8903059 0.0587171  0.858157
## PRRX1     4103.483     -0.6384444  0.637743 -1.0010990 0.3167789  0.942193
#第3张热图pre vs post
set.seed(88)
select_genes_resPrePost_ctlNT_Ordered<-rownames(resPrePost_ctlNT_Ordered)
select_genes_resPrePost_ctlNT_Ordered<-select_genes_resPrePost_ctlNT_Ordered[1:30]
vst_PrePost_ctlNT<-vst(ddsPrePost_ctlNT,blind = FALSE)
df_resPrePost_ctlNT <- as.data.frame(colData(vst_PrePost_ctlNT)[,c("Type","Treatment")])
ComplexHeatmap::pheatmap(assay(vst_PrePost_ctlNT)[select_genes_resPrePost_ctlNT_Ordered,], cluster_rows=TRUE, show_rownames=TRUE,
         cluster_cols=TRUE, annotation_col=df_resPrePost_ctlNT,fontsize=8, main = "Pre vs Post,Control CT",name="vst(FPKM)")

#In the heat map, the pre - and post-treatment samples were clustered separately when we controlled for the effect of sample type. Gene expression in tumor samples was significantly different before and after treatment.

Three_Tumors_Only

counts_t<-counts[,c(4:6)]#counts_t——tumor only
coldata_t<-coldata[c(4:6),]
#library(DESeq2)
dds_t <- DESeqDataSetFromMatrix(countData = counts_t,
                              colData = coldata_t,
                              design = ~Treatment )

#Pre-Filtering

keep <- rowSums(counts(dds_t)) >= 10
dds_t <- dds_t[keep,]

#dds+res

dds_t<-DESeq(dds_t)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
res_t<-results(dds_t, contrast=c("Treatment","Post","Pre"), alpha=0.05)
summary(res_t)
## 
## out of 28561 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 422, 1.5%
## LFC < 0 (down)     : 293, 1%
## outliers [1]       : 0, 0%
## low counts [2]     : 9414, 33%
## (mean count < 59)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

#testing_heatmap_only tumor

res_t_Ordered<-res_t[order(res_t$padj),]
head(res_t_Ordered)
## log2 fold change (MLE): Treatment Post vs Pre 
## Wald test p-value: Treatment Post vs Pre 
## DataFrame with 6 rows and 6 columns
##            baseMean log2FoldChange     lfcSE      stat      pvalue        padj
##           <numeric>      <numeric> <numeric> <numeric>   <numeric>   <numeric>
## RN7SL147P   2675.79        8.82928  0.677177   13.0384 7.40348e-39 1.41754e-34
## RNA5-8SP4   1811.92        8.94547  0.723285   12.3678 3.90236e-35 3.73593e-31
## ANKRD35    18097.44        6.34912  0.550263   11.5383 8.45459e-31 5.39600e-27
## RNA5SP215   5706.88        5.94593  0.570019   10.4311 1.78800e-25 8.55873e-22
## RNA5SP324   2411.34        6.75583  0.650013   10.3934 2.65784e-25 1.01779e-21
## RNU2-61P    7818.33        5.58139  0.538177   10.3709 3.36251e-25 1.07303e-21
set.seed(88)

library("pheatmap")
select_genes_res_t_Ordered<-rownames(res_t_Ordered)
select_genes_res_t_Ordered<-select_genes_res_t_Ordered[1:30]
vst_t<-vst(dds_t,blind = FALSE)
df_res_t <- as.data.frame(colData(vst_t)["Treatment"])
ComplexHeatmap::pheatmap(assay(vst_t)[select_genes_res_t_Ordered,], cluster_rows=TRUE, show_rownames=TRUE,
         cluster_cols=TRUE, annotation_col=df_res_t, fontsize=8, main = "Tumors:Pre vs Post-Treatment",name="vst(FPKM)")

#Normal_Only

counts_n<-counts[,c(1:3)]
coldata_n<-coldata[c(1:3),]
library(DESeq2)
dds_n <- DESeqDataSetFromMatrix(countData = counts_n,
                              colData = coldata_n,
                              design = ~ Treatment )

#Pre-Filtering

keep <- rowSums(counts(dds_n)) >= 10
dds_n <- dds_n[keep,]

#dds+res

dds_n<-DESeq(dds_n)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
#res_t<-lfcShrink(dds_t, )
res_n<-results(dds_n, contrast=c("Treatment","Post","Pre"), alpha=0.05)

summary(res_n)
## 
## out of 26382 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 84, 0.32%
## LFC < 0 (down)     : 211, 0.8%
## outliers [1]       : 0, 0%
## low counts [2]     : 8184, 31%
## (mean count < 40)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

#testing_heatmap_only normal

res_n_Ordered<-res_n[order(res_n$padj),]
head(res_n_Ordered)
## log2 fold change (MLE): Treatment Post vs Pre 
## Wald test p-value: Treatment Post vs Pre 
## DataFrame with 6 rows and 6 columns
##             baseMean log2FoldChange     lfcSE      stat      pvalue        padj
##            <numeric>      <numeric> <numeric> <numeric>   <numeric>   <numeric>
## RNA5SP141  60053.281       -3.97976  0.372212 -10.69218 1.10736e-26 2.01517e-22
## PCA3        1478.407       -6.71461  0.696384  -9.64211 5.30878e-22 4.83046e-18
## TMPRSS2    14016.335       -3.21323  0.368393  -8.72228 2.72644e-18 1.42520e-14
## RN7SL2    499314.108       -2.70774  0.311001  -8.70655 3.13264e-18 1.42520e-14
## SPON2       9362.003       -2.75836  0.332613  -8.29299 1.10438e-16 4.01950e-13
## POTEJ        586.631       -6.22581  0.753419  -8.26341 1.41571e-16 4.29384e-13
#library("pheatmap")
set.seed(3)
library("pheatmap")
select_genes_res_n_Ordered<-rownames(res_n_Ordered)
select_genes_res_n_Ordered<-select_genes_res_n_Ordered[1:30]
vst_n<-vst(dds_n,blind = FALSE)
df_res_n <- as.data.frame(colData(vst_n)["Treatment"])
ComplexHeatmap::pheatmap(assay(vst_n)[select_genes_res_n_Ordered,], cluster_rows=TRUE, show_rownames=TRUE,
         cluster_cols=TRUE, annotation_col=df_res_n, fontsize=8, main = "Control:Pre vs Post-Treatment",name="vst(FPKM)")

###34vs36;pre vs post

counts_3436_trx<-counts[,c(1,3,4,6)]
coldata_3436_trx<-coldata[c(1,3,4,6),]
#library(DESeq2)
dds_3436_trx <- DESeqDataSetFromMatrix(countData = counts_3436_trx,
                              colData = coldata_3436_trx,
                              design = ~ Type + Treatment )

#Pre-Filtering

keep <- rowSums(counts(dds_3436_trx)) >= 10
dds_3436_trx <- dds_3436_trx[keep,]

#dds+res

dds_3436_trx<-DESeq(dds_3436_trx)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
res_3436_trx<-results(dds_3436_trx, contrast=c("Treatment","Post","Pre"), alpha=0.05)
summary(res_3436_trx)
## 
## out of 27475 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 787, 2.9%
## LFC < 0 (down)     : 630, 2.3%
## outliers [1]       : 0, 0%
## low counts [2]     : 6392, 23%
## (mean count < 15)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
res_3436_trx_Ordered<-res_3436_trx[order(res_3436_trx$padj),]
head(res_3436_trx_Ordered)
## log2 fold change (MLE): Treatment Post vs Pre 
## Wald test p-value: Treatment Post vs Pre 
## DataFrame with 6 rows and 6 columns
##            baseMean log2FoldChange     lfcSE      stat      pvalue        padj
##           <numeric>      <numeric> <numeric> <numeric>   <numeric>   <numeric>
## COL2A1     6964.468        6.22930  0.353518   17.6209 1.70318e-69 3.59081e-65
## POTEI      3138.401       -5.20207  0.304108  -17.1060 1.33953e-65 1.41207e-61
## TGM4       1909.905        5.42208  0.349629   15.5081 3.05849e-54 2.14941e-50
## POTEJ      1023.558       -6.36935  0.450505  -14.1382 2.20709e-45 1.16330e-41
## PCA3        598.939       -5.54619  0.487710  -11.3719 5.77212e-30 2.43387e-26
## RNA5SP207   702.878        5.48559  0.490732   11.1784 5.20184e-29 1.82784e-25
#library("pheatmap")
set.seed(70)
select_genes_res_3436_trx_Ordered<-rownames(res_3436_trx_Ordered)
select_genes_res_3436_trx_Ordered<-select_genes_res_3436_trx_Ordered[1:30]
vst_3436_trx<-vst(dds_3436_trx,blind = FALSE)
df_res_3436_trx <- as.data.frame(colData(vst_3436_trx)["Treatment"])
ComplexHeatmap::pheatmap(assay(vst_3436_trx)[select_genes_res_3436_trx_Ordered,], cluster_rows=TRUE, show_rownames=TRUE, cluster_cols=TRUE, annotation_col=df_res_3436_trx, fontsize=8, main = "34&36 contrasting Pre and Post-Treatment",name="vst(FPKM)")

3536

counts_3536_trx<-counts[,c(2,3,5,6)]
coldata_3536_trx<-coldata[c(2,3,5,6),]
#library(DESeq2)
dds_3536_trx <- DESeqDataSetFromMatrix(countData = counts_3536_trx,
                              colData = coldata_3536_trx,
                              design = ~ Type + Treatment )

#Pre-Filtering

keep <- rowSums(counts(dds_3536_trx)) >= 10
dds_3536_trx <- dds_3536_trx[keep,]

#dds+res

dds_3536_trx<-DESeq(dds_3536_trx)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
res_3536_trx<-results(dds_3536_trx, contrast=c("Treatment","Post","Pre"), alpha=0.05)
summary(res_3536_trx)
## 
## out of 28772 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 671, 2.3%
## LFC < 0 (down)     : 716, 2.5%
## outliers [1]       : 0, 0%
## low counts [2]     : 7252, 25%
## (mean count < 30)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
res_3536_trx_Ordered<-res_3536_trx[order(res_3536_trx$padj),]
head(res_3536_trx_Ordered)
## log2 fold change (MLE): Treatment Post vs Pre 
## Wald test p-value: Treatment Post vs Pre 
## DataFrame with 6 rows and 6 columns
##            baseMean log2FoldChange     lfcSE      stat      pvalue        padj
##           <numeric>      <numeric> <numeric> <numeric>   <numeric>   <numeric>
## RN7SL147P   2510.74        8.81406  0.536080   16.4417 9.62063e-61 2.07036e-56
## PCA3        2458.68       -7.55776  0.486495  -15.5351 2.00710e-54 2.15964e-50
## ACSM1       2751.65       -5.94770  0.428096  -13.8934 6.94842e-44 4.98433e-40
## ANPEP       6546.93       -5.94850  0.443400  -13.4157 4.89493e-41 2.63347e-37
## CTRB1       2441.57        6.73694  0.537736   12.5283 5.22524e-36 2.24895e-32
## CHRNA2      1290.91       -6.27367  0.546778  -11.4739 1.78455e-30 6.40057e-27
set.seed(1)
library(ComplexHeatmap)
#library("pheatmap")
select_genes_res_3536_trx_Ordered<-rownames(res_3536_trx_Ordered)
select_genes_res_3536_trx_Ordered<-select_genes_res_3536_trx_Ordered[1:30]
vst_3536_trx<-vst(dds_3536_trx,blind = FALSE)
df_res_3536_trx <- as.data.frame(colData(vst_3536_trx)["Treatment"])
ComplexHeatmap::pheatmap(assay(vst_3536_trx)[select_genes_res_3536_trx_Ordered,], cluster_rows=TRUE, show_rownames=TRUE,
         cluster_cols=TRUE, annotation_col=df_res_3536_trx,fontsize=8, main = "35&36 contrasting Pre- and Post- Treatment",name="vst(FPKM)")

3436,3536 各选取top100 并寻找share gene, 经过比对top100和top200 首先top100有多少个,共有基因少了很多个个 top200 有多少个,并且最大的p-value是多少,具有意义

#write.csv(as.data.frame(res_3436_trx_Ordered[1:100,]), 
          #file="/Users/dongxiaotai/Desktop/res_3436_trx_Orderedtop100.csv")
#write.csv(as.data.frame(res_3536_trx_Ordered[1:100,]), 
          #file="/Users/dongxiaotai/Desktop/res_3536_trx_Orderedtop100.csv")
write.csv(as.data.frame(res_3436_trx_Ordered[1:200,]), 
          file="/Users/dongxiaotai/Desktop/res_3436_trx_Orderedtop200.csv")
write.csv(as.data.frame(res_3536_trx_Ordered[1:200,]), 
          file="/Users/dongxiaotai/Desktop/res_3536_trx_Orderedtop200.csv")
setwd("/Users/dongxiaotai/Desktop/TRGN_lab/final/")
library(readxl)
shared_genes_343536top200_trx<-read_excel('/Users/dongxiaotai/Desktop/sharedgenetop200.xlsx',col_names = FALSE)
## New names:
## * `` -> ...1
shared_genes_343536top200_trx<-shared_genes_343536top200_trx[,1]
shared_genes_343536top200_trx<-t(shared_genes_343536top200_trx)
shared_genes_343536top200_trx<-shared_genes_343536top200_trx[1,]
dds_all_top200trx<-dds
design(dds_all_top200trx)<-~ Type + Treatment
dds_all_top200trx<-DESeq(dds_all_top200trx)
## using pre-existing size factors
## estimating dispersions
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
res_all_top200trx<-results(dds_all_top200trx, contrast=c("Treatment","Post","Pre"),alpha=0.05)
set.seed(1)

library(ComplexHeatmap)
library("pheatmap")

#select_genes_res_343536top200_trx_Ordered<-rownames(res_343536top200_trx_Ordered)
#select_genes_res_343536top200_trx_Ordered<-select_genes_res_343536top200_trx_Ordered[1:30]
vst_all_top200trx<-vst(dds_all_top200trx,blind = FALSE)
df_res_all_top200trx <- as.data.frame(colData(vst_all_top200trx)["Treatment"])
ComplexHeatmap::pheatmap(assay(vst_all_top200trx)[shared_genes_343536top200_trx,], cluster_rows=TRUE, show_rownames=TRUE,
         cluster_cols=TRUE, annotation_col=df_res_all_top200trx,fontsize=6.5, main = "Top200 Shared Genes by 34vs36 & 35vs36 Pre/Post Treatments",display_numbers=TRUE,name ="vst(FPKM)")