FinalDongCapStone-copy-copy.knit

最终版代码test

2021/08/04 最后完整版可以有，应该赢，别等了 Fight on forever

#设置工作路径，读取源文件安装BiocManager，安装TxDb.Hsapiens.UCSC.hg19.knownGene，安装stringr和Homo.sapiens

setwd("/Users/dongxiaotai/Desktop/TRGN_lab/final/")
counts <- read.table("ProstCa_030921_2.txt")
library(TxDb.Hsapiens.UCSC.hg19.knownGene)

## Loading required package: GenomicFeatures

## Loading required package: BiocGenerics

## Loading required package: parallel

## 
## Attaching package: 'BiocGenerics'

## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB

## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs

## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min

## Loading required package: S4Vectors

## Loading required package: stats4

## 
## Attaching package: 'S4Vectors'

## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname

## Loading required package: IRanges

## Loading required package: GenomeInfoDb

## Loading required package: GenomicRanges

## Loading required package: AnnotationDbi

## Loading required package: Biobase

## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.

txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
seqlevels(txdb)

##  [1] "chr1"                  "chr2"                  "chr3"                 
##  [4] "chr4"                  "chr5"                  "chr6"                 
##  [7] "chr7"                  "chr8"                  "chr9"                 
## [10] "chr10"                 "chr11"                 "chr12"                
## [13] "chr13"                 "chr14"                 "chr15"                
## [16] "chr16"                 "chr17"                 "chr18"                
## [19] "chr19"                 "chr20"                 "chr21"                
## [22] "chr22"                 "chrX"                  "chrY"                 
## [25] "chrM"                  "chr1_gl000191_random"  "chr1_gl000192_random" 
## [28] "chr4_ctg9_hap1"        "chr4_gl000193_random"  "chr4_gl000194_random" 
## [31] "chr6_apd_hap1"         "chr6_cox_hap2"         "chr6_dbb_hap3"        
## [34] "chr6_mann_hap4"        "chr6_mcf_hap5"         "chr6_qbl_hap6"        
## [37] "chr6_ssto_hap7"        "chr7_gl000195_random"  "chr8_gl000196_random" 
## [40] "chr8_gl000197_random"  "chr9_gl000198_random"  "chr9_gl000199_random" 
## [43] "chr9_gl000200_random"  "chr9_gl000201_random"  "chr11_gl000202_random"
## [46] "chr17_ctg5_hap1"       "chr17_gl000203_random" "chr17_gl000204_random"
## [49] "chr17_gl000205_random" "chr17_gl000206_random" "chr18_gl000207_random"
## [52] "chr19_gl000208_random" "chr19_gl000209_random" "chr21_gl000210_random"
## [55] "chrUn_gl000211"        "chrUn_gl000212"        "chrUn_gl000213"       
## [58] "chrUn_gl000214"        "chrUn_gl000215"        "chrUn_gl000216"       
## [61] "chrUn_gl000217"        "chrUn_gl000218"        "chrUn_gl000219"       
## [64] "chrUn_gl000220"        "chrUn_gl000221"        "chrUn_gl000222"       
## [67] "chrUn_gl000223"        "chrUn_gl000224"        "chrUn_gl000225"       
## [70] "chrUn_gl000226"        "chrUn_gl000227"        "chrUn_gl000228"       
## [73] "chrUn_gl000229"        "chrUn_gl000230"        "chrUn_gl000231"       
## [76] "chrUn_gl000232"        "chrUn_gl000233"        "chrUn_gl000234"       
## [79] "chrUn_gl000235"        "chrUn_gl000236"        "chrUn_gl000237"       
## [82] "chrUn_gl000238"        "chrUn_gl000239"        "chrUn_gl000240"       
## [85] "chrUn_gl000241"        "chrUn_gl000242"        "chrUn_gl000243"       
## [88] "chrUn_gl000244"        "chrUn_gl000245"        "chrUn_gl000246"       
## [91] "chrUn_gl000247"        "chrUn_gl000248"        "chrUn_gl000249"

library(stringr)
setwd("/Users/dongxiaotai/Desktop/TRGN_lab/final/")
library(Homo.sapiens)

## Loading required package: OrganismDbi

## Loading required package: GO.db

##

## Loading required package: org.Hs.eg.db

##

geneid <- rownames(counts) 
gene <- str_match(geneid, "(\\w*).*")#准备去除小数点
geneid <- gene[,2]#去除后面两个小数点
geneid2<-geneid#准备将去除小数点后面的ENS名字对应到基因
geneid2=data.frame(geneid2)
counts<-cbind(counts,geneid2)

#Convert geneid
setwd("/Users/dongxiaotai/Desktop/TRGN_lab/final/")
genes <- select(Homo.sapiens, keys=geneid,
                columns=c("SYMBOL","TXCHROM"),
                keytype="ENSEMBL")

## 'select()' returned many:many mapping between keys and columns

dim(genes)#看他的情况大概

## [1] 60216     3

#Get rid of duplicate genes
genes <- genes[!duplicated(genes$ENSEMBL),]
counts<-counts[!duplicated(counts$geneid2),]
row.names(counts)<-counts$geneid2

#把countsready重新变化，变成更容易操作的矩阵，列命变化，Group and build a matrix that formally processes the data

library(GenomicAlignments)

## Loading required package: SummarizedExperiment

## Loading required package: MatrixGenerics

## Loading required package: matrixStats

## 
## Attaching package: 'matrixStats'

## The following objects are masked from 'package:Biobase':
## 
##     anyMissing, rowMedians

## 
## Attaching package: 'MatrixGenerics'

## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars

## The following object is masked from 'package:Biobase':
## 
##     rowMedians

## Loading required package: Biostrings

## Loading required package: XVector

## 
## Attaching package: 'Biostrings'

## The following object is masked from 'package:base':
## 
##     strsplit

## Loading required package: Rsamtools

counts<-cbind(counts,genes$SYMBOL)
counts_4<-na.omit(counts)
counts_4<-counts_4[!duplicated(counts_4$`genes$SYMBOL`),]
row.names(counts_4)<-counts_4$`genes$SYMBOL`
counts_ready<-counts_4[,c(1,2,3,4,5,6)] #use counts_ready
counts<-counts_ready

counts_filtered<-counts[1:6]
coldata=data.frame(row.names=c("34C","35C","36C","34T","35T","36T"),
                   Type=rep(c("Control","Tumor"),each=3),
                   Gleason=rep(c("2","1","2"),2),
                   Treatment=rep(c("1","1","2"),2))
#到这一步结束，我们已经拥有了清晰的列命的counts，并且已经将ENS去除了小数点，对应好了基因名字。这个矩阵现在叫counts_filtered

coldata#构建完整的矩阵列表

##        Type Gleason Treatment
## 34C Control       2         1
## 35C Control       1         1
## 36C Control       2         2
## 34T   Tumor       2         1
## 35T   Tumor       1         1
## 36T   Tumor       2         2

#添加老师的notes，89代表什么，
coldata$Type<-factor(coldata$Type)
coldata$Gleason<-factor(coldata$Gleason)
coldata$Treatment<-factor(coldata$Treatment)

#Differential expression analysis

library(DESeq2)
#if (!requireNamespace("BiocManager", quiet = TRUE))
    #install.packages("BiocManager")

#BiocManager::install("SummarizedExperiment")
dds <- DESeqDataSetFromMatrix(countData = counts_filtered,
                              colData = coldata,
                              design = ~ Type )

#Pre-Filtering

keep <- rowSums(counts(dds)) >= 10
dds <- dds[keep,]
#这一行加起来，和是10的话 就可以留下来，因为出现0就不用的话，就都删掉了。只要这一行有一个不是0，那么0也是一种信息

#虽然在运行 DESeq2 函数之前没有必要预过滤低计数基因，但有两个原因使预过滤有用：通过删除很少读取的行，我们减少了dds数据对象的内存大小，我们提高了 DESeq2 中转换和测试函数的速度。在这里，我们执行最小预过滤以仅保留至少有 10 次读取的行。请注意，通过对结果函数中归一化计数的均值进行独立过滤，可以自动应用更严格的过滤以增加功效。

dds <- DESeq(dds)

## estimating size factors

## estimating dispersions

## gene-wise dispersion estimates

## mean-dispersion relationship

## final dispersion estimates

## fitting model and testing

res <- results(dds)
#res01 <- results(dds,alpha = 0.1)
#res005 <- results(dds,alpha = 0.05)
#write.csv(as.data.frame(res01), 
          #file="/Users/dongxiaotai/Desktop/res01.csv")
#write.csv(as.data.frame(res005), 
          #file="/Users/dongxiaotai/Desktop/res005.csv")

#Alternative shrinkage estimators# To look for other choice

#install.packages("ashr")
resultsNames(dds)

## [1] "Intercept"             "Type_Tumor_vs_Control"

resNorm <- lfcShrink(dds, coef="Type_Tumor_vs_Control", type="normal")

## using 'normal' for LFC shrinkage, the Normal prior from Love et al (2014).
## 
## Note that type='apeglm' and type='ashr' have shown to have less bias than type='normal'.
## See ?lfcShrink for more details on shrinkage type, and the DESeq2 vignette.
## Reference: https://doi.org/10.1093/bioinformatics/bty895

resAsh <- lfcShrink(dds, coef="Type_Tumor_vs_Control", type="ashr")

## using 'ashr' for LFC shrinkage. If used in published research, please cite:
##     Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
##     https://doi.org/10.1093/biostatistics/kxw041

resLFC <- lfcShrink(dds, coef="Type_Tumor_vs_Control", type="apeglm")

## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895

par(mfrow=c(1,3), mar=c(4,4,2,1))
xlim <- c(1,1e5); ylim <- c(-3,3)
plotMA(resLFC, xlim=xlim, ylim=ylim, main="apeglm")
plotMA(resNorm, xlim=xlim, ylim=ylim, main="normal")
plotMA(resAsh, xlim=xlim, ylim=ylim, main="ashr")

#分开噪声点，后面运用 apeglm

#TypeTreatment_Tumor_vs_Normal 34 35 36T vs 34 35 36N

ddsTN<-dds
#resTypeTrX_TN<-lfcShrink(dds, coef="Type_Tumor_vs_Normal", type="apeglm")
resTypeTrX_TN<-results(ddsTN,contrast=c("Type","Tumor","Control"),alpha=0.05)
resTypeTrX_TN

## log2 fold change (MLE): Type Tumor vs Control 
## Wald test p-value: Type Tumor vs Control 
## DataFrame with 29912 rows and 6 columns
##                baseMean log2FoldChange     lfcSE       stat    pvalue      padj
##               <numeric>      <numeric> <numeric>  <numeric> <numeric> <numeric>
## TSPAN6        1107.8741      0.0325524  0.325711  0.0999427  0.920390  0.998486
## TNMD            32.8382      1.4031560  1.356203  1.0346212  0.300846  0.941348
## DPM1           473.0177      0.0528333  0.619625  0.0852666  0.932049  0.999438
## SCYL3         1171.0906     -0.1550594  0.417280 -0.3715957  0.710194  0.992431
## C1orf112       321.5382     -0.2140606  0.496694 -0.4309708  0.666490  0.991134
## ...                 ...            ...       ...        ...       ...       ...
## BMP8B-AS1      10.53852      -0.582843  3.484002 -0.1672914  0.867141  0.996036
## H2AL1SP         4.26468      -2.997713  3.846888 -0.7792566  0.435829  0.964254
## NIPBL-DT     1361.89229      -0.269032  0.465077 -0.5784681  0.562948  0.979952
## CERNA2         29.84232      -0.150351  1.893759 -0.0793926  0.936720  0.999731
## LOC100996886   32.40953       5.663064  2.655310  2.1327323        NA        NA

summary(resTypeTrX_TN)

## 
## out of 29912 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 37, 0.12%
## LFC < 0 (down)     : 5, 0.017%
## outliers [1]       : 1970, 6.6%
## low counts [2]     : 0, 0%
## (mean count < 1)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

#N作为标准，得出上调和下降

resTypeTrX_TN_Ordered<-resTypeTrX_TN[order(resTypeTrX_TN$padj),]
head(resTypeTrX_TN_Ordered)

## log2 fold change (MLE): Type Tumor vs Control 
## Wald test p-value: Type Tumor vs Control 
## DataFrame with 6 rows and 6 columns
##            baseMean log2FoldChange     lfcSE      stat      pvalue        padj
##           <numeric>      <numeric> <numeric> <numeric>   <numeric>   <numeric>
## CCL26       14.0100       20.79336   3.91041   5.31743 1.05242e-07 0.000267334
## BEND4     1569.3562        1.85266   0.32287   5.73809 9.57519e-09 0.000267334
## RNU6-744P   15.8184       20.86765   3.91000   5.33699 9.45000e-08 0.000267334
## RPSAP23     14.0100       20.79336   3.91041   5.31743 1.05242e-07 0.000267334
## MBD3L2      14.4220       20.83125   3.91031   5.32726 9.97044e-08 0.000267334
## PLSCR5      14.4220       20.83125   3.91031   5.32726 9.97044e-08 0.000267334

#通过调整后的p值将，基因重新排序

#第一张热图 T vs N
set.seed(1)
#if (!requireNamespace("BiocManager", quiet = TRUE))
    #install.packages("BiocManager")
#BiocManager::install("ComplexHeatmap")
library(ComplexHeatmap)

## Loading required package: grid

## 
## Attaching package: 'grid'

## The following object is masked from 'package:Biostrings':
## 
##     pattern

## ========================================
## ComplexHeatmap version 2.10.0
## Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
## Github page: https://github.com/jokergoo/ComplexHeatmap
## Documentation: http://jokergoo.github.io/ComplexHeatmap-reference
## 
## If you use it in published research, please cite:
## Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional 
##   genomic data. Bioinformatics 2016.
## 
## The new InteractiveComplexHeatmap package can directly export static 
## complex heatmaps into an interactive Shiny app with zero effort. Have a try!
## 
## This message can be suppressed by:
##   suppressPackageStartupMessages(library(ComplexHeatmap))
## ========================================

library("pheatmap")

## 
## Attaching package: 'pheatmap'

## The following object is masked from 'package:ComplexHeatmap':
## 
##     pheatmap

select_genes_resTypeTrX_TN_Ordered<-rownames(resTypeTrX_TN_Ordered)
select_genes_resTypeTrX_TN_Ordered<-select_genes_resTypeTrX_TN_Ordered[1:42]
write.csv(as.data.frame(resTypeTrX_TN_Ordered), 
          file="/Users/dongxiaotai/Desktop/resTypeTrX_TN_Ordered.csv")
write.csv(as.data.frame(resTypeTrX_TN_Ordered[select_genes_resTypeTrX_TN_Ordered,]), 
          file="/Users/dongxiaotai/Desktop/select_genes_resTypeTrX_TN_Ordered.csv")
vst_TypeTrX_TN<-vst(ddsTN,blind = FALSE)
df_resTypeTrX_TN <- as.data.frame(colData(vst_TypeTrX_TN)["Type"])
pheatmap(assay(vst_TypeTrX_TN)[select_genes_resTypeTrX_TN_Ordered,], cluster_rows=TRUE, show_rownames=TRUE,
         cluster_cols=TRUE, annotation_col=df_resTypeTrX_TN, fontsize=8, main = "34,35,36C vs 34,35,36T ",name="vst(FPKM)")

#8 vs 9

ddsGleason89_ctlNT<-dds
design(ddsGleason89_ctlNT)<-~Type + Gleason

ddsGleason89_ctlNT<-DESeq(ddsGleason89_ctlNT)

## using pre-existing size factors

## estimating dispersions

## found already estimated dispersions, replacing these

## gene-wise dispersion estimates

## mean-dispersion relationship

## final dispersion estimates

## fitting model and testing

resultsNames(ddsGleason89_ctlNT)

## [1] "Intercept"             "Type_Tumor_vs_Control" "Gleason_2_vs_1"

resGleason89_ctlNT<-results(ddsGleason89_ctlNT,contrast=c("Gleason","2","1"),alpha=0.05)  
resGleason89_ctlNT

## log2 fold change (MLE): Gleason 2 vs 1 
## Wald test p-value: Gleason 2 vs 1 
## DataFrame with 29912 rows and 6 columns
##                baseMean log2FoldChange     lfcSE       stat    pvalue      padj
##               <numeric>      <numeric> <numeric>  <numeric> <numeric> <numeric>
## TSPAN6        1107.8741    -0.00885828  0.337065 -0.0262806  0.979033  0.990579
## TNMD            32.8382    -1.43220889  1.357417 -1.0550987  0.291380  0.600705
## DPM1           473.0177     0.73435841  0.638022  1.1509930  0.249735  0.562239
## SCYL3         1171.0906     0.56378587  0.390504  1.4437383  0.148813  0.459201
## C1orf112       321.5382     0.19070630  0.527008  0.3618662  0.717452  0.876603
## ...                 ...            ...       ...        ...       ...       ...
## BMP8B-AS1      10.53852      -1.792776  3.946374  -0.454284 0.6496241        NA
## H2AL1SP         4.26468       1.296251  4.115195   0.314991 0.7527682        NA
## NIPBL-DT     1361.89229       0.808984  0.414776   1.950412 0.0511271  0.301514
## CERNA2         29.84232      -0.333639  2.105841  -0.158435 0.8741141  0.947013
## LOC100996886   32.40953      -5.702918  2.792202  -2.042445 0.0411074  0.277870

summary(resGleason89_ctlNT)

## 
## out of 29912 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 139, 0.46%
## LFC < 0 (down)     : 194, 0.65%
## outliers [1]       : 0, 0%
## low counts [2]     : 8119, 27%
## (mean count < 26)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

resGleason89_ctlNT_Ordered<-resGleason89_ctlNT[order(resGleason89_ctlNT$padj),]
head(resGleason89_ctlNT_Ordered)

## log2 fold change (MLE): Gleason 2 vs 1 
## Wald test p-value: Gleason 2 vs 1 
## DataFrame with 6 rows and 6 columns
##          baseMean log2FoldChange     lfcSE      stat      pvalue        padj
##         <numeric>      <numeric> <numeric> <numeric>   <numeric>   <numeric>
## ANPEP     4877.91       -5.70051  0.472519 -12.06410 1.63436e-33 3.56176e-29
## NCAPD3   10906.07       -3.66525  0.315231 -11.62718 2.99826e-31 3.26705e-27
## DHCR24    5570.05       -3.34382  0.355532  -9.40511 5.19738e-21 2.85443e-17
## SCHLAP1   1260.44        5.75391  0.611840   9.40427 5.23916e-21 2.85443e-17
## IGHA2     3746.03       -4.75000  0.557064  -8.52685 1.50390e-17 6.55488e-14
## BANK1     1142.58       -2.93922  0.360601  -8.15088 3.61285e-16 1.31225e-12

#第2张热图
set.seed(88)
library("pheatmap")
select_genes_resGleason89_ctlNT_Ordered<-rownames(resGleason89_ctlNT_Ordered)
select_genes_resGleason89_ctlNT_Ordered<-select_genes_resGleason89_ctlNT_Ordered[1:30]
vst_Gleason89_ctlNT<-vst(ddsGleason89_ctlNT,blind = FALSE)
df_resGleason89_ctlNT <- as.data.frame(colData(vst_Gleason89_ctlNT)[,c("Type","Gleason")])
pheatmap(assay(vst_Gleason89_ctlNT)[select_genes_resGleason89_ctlNT_Ordered,], cluster_rows=TRUE, show_rownames=TRUE,
         cluster_cols=TRUE, annotation_col=df_resGleason89_ctlNT,fontsize=8, main = "Gleason8 vs Gleason9,Control CT ",name="vst(FPKM)")

###Pre vs Post (Control N vs T)

ddsPrePost_ctlNT<-dds

design(ddsPrePost_ctlNT)<-~Type + Treatment

ddsPrePost_ctlNT<-DESeq(ddsPrePost_ctlNT)

## using pre-existing size factors

## estimating dispersions

## found already estimated dispersions, replacing these

## gene-wise dispersion estimates

## mean-dispersion relationship

## final dispersion estimates

## fitting model and testing

resultsNames(ddsPrePost_ctlNT)

## [1] "Intercept"             "Type_Tumor_vs_Control" "Treatment_2_vs_1"

resPrePost_ctlNT<-results(ddsPrePost_ctlNT, contrast=c("Treatment","2","1"), alpha=0.05)    
resPrePost_ctlNT

## log2 fold change (MLE): Treatment 2 vs 1 
## Wald test p-value: Treatment 2 vs 1 
## DataFrame with 29912 rows and 6 columns
##                baseMean log2FoldChange     lfcSE      stat    pvalue      padj
##               <numeric>      <numeric> <numeric> <numeric> <numeric> <numeric>
## TSPAN6        1107.8741       0.183342  0.306017  0.599125 0.5490895  0.743449
## TNMD            32.8382      -1.263376  1.384109 -0.912772 0.3613623  0.586899
## DPM1           473.0177       0.805517  0.590631  1.363823 0.1726233  0.384804
## SCYL3         1171.0906       0.634129  0.341642  1.856119 0.0634366  0.211961
## C1orf112       321.5382       0.333883  0.491251  0.679658 0.4967210  0.704565
## ...                 ...            ...       ...       ...       ...       ...
## BMP8B-AS1      10.53852      -6.094999  3.675364 -1.658339 0.0972491        NA
## H2AL1SP         4.26468       3.174072  4.069047  0.780053 0.4353598        NA
## NIPBL-DT     1361.89229       0.712923  0.404476  1.762587 0.0779702  0.239104
## CERNA2         29.84232       1.580403  1.974819  0.800277 0.4235501  0.642771
## LOC100996886   32.40953      -5.740451  2.788422 -2.058673 0.0395255  0.160207

summary(resPrePost_ctlNT)

## 
## out of 29912 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 865, 2.9%
## LFC < 0 (down)     : 1545, 5.2%
## outliers [1]       : 0, 0%
## low counts [2]     : 7538, 25%
## (mean count < 23)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

resPrePost_ctlNT_Ordered<-resTypeTrX_TN[order(resPrePost_ctlNT$padj),]
head(resPrePost_ctlNT_Ordered)

## log2 fold change (MLE): Type Tumor vs Control 
## Wald test p-value: Type Tumor vs Control 
## DataFrame with 6 rows and 6 columns
##           baseMean log2FoldChange     lfcSE       stat    pvalue      padj
##          <numeric>      <numeric> <numeric>  <numeric> <numeric> <numeric>
## POTEJ      992.145      1.4888086  1.741840  0.8547330 0.3926990  0.960734
## PCA3      2190.298      1.2327145  1.936624  0.6365275 0.5244327  0.976677
## RN7SL2  448990.716      0.0370742  0.658479  0.0563027 0.9551007  0.999731
## TMPRSS2  20680.141      1.2188144  1.379195  0.8837145 0.3768504  0.959836
## RBFOX3    1635.347     -1.4278862  0.755373 -1.8903059 0.0587171  0.858157
## PRRX1     4103.483     -0.6384444  0.637743 -1.0010990 0.3167789  0.942193

#第3张热图pre vs post
set.seed(88)
select_genes_resPrePost_ctlNT_Ordered<-rownames(resPrePost_ctlNT_Ordered)
select_genes_resPrePost_ctlNT_Ordered<-select_genes_resPrePost_ctlNT_Ordered[1:30]
vst_PrePost_ctlNT<-vst(ddsPrePost_ctlNT,blind = FALSE)
df_resPrePost_ctlNT <- as.data.frame(colData(vst_PrePost_ctlNT)[,c("Type","Treatment")])
pheatmap(assay(vst_PrePost_ctlNT)[select_genes_resPrePost_ctlNT_Ordered,], cluster_rows=TRUE, show_rownames=TRUE,
         cluster_cols=TRUE, annotation_col=df_resPrePost_ctlNT,fontsize=8, main = "Pre vs Post,Control CT",name="vst(FPKM)")

#Three_Tumors_Only

counts_t<-counts[,c(4:6)]#countst指的t，only
coldata_t<-coldata[c(4:6),]

#library(DESeq2)
dds_t <- DESeqDataSetFromMatrix(countData = counts_t,
                              colData = coldata_t,
                              design = ~Treatment )

#Pre-Filtering

keep <- rowSums(counts(dds_t)) >= 10
dds_t <- dds_t[keep,]

#dds+res

dds_t<-DESeq(dds_t)

## estimating size factors

## estimating dispersions

## gene-wise dispersion estimates

## mean-dispersion relationship

## final dispersion estimates

## fitting model and testing

#res_t<-lfcShrink(dds_t, )
res_t<-results(dds_t, contrast=c("Treatment","2","1"), alpha=0.05)

summary(res_t)

## 
## out of 28561 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 422, 1.5%
## LFC < 0 (down)     : 293, 1%
## outliers [1]       : 0, 0%
## low counts [2]     : 9414, 33%
## (mean count < 59)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

#resTypeTrX_TN<-lfcShrink(dds, coef="Type_Tumor_vs_Normal", type="apeglm")
#resTypeTrX_TN<-results(ddsTN,contrast=c("Type","Tumor","Normal"),alpha=0.05)
#resTypeTrX_TN
#summary(resTypeTrX_TN)
#res_t<-results(dds_t, contrast=c("Treatment","2","1"), alpha=0.05) 
#res_t
#summary(res_t)
#res_t<-lfcShrink(dds_t, )
#res_t<-lfcShrink(dds_t, coef = "Treatment_2_vs_1", type = "apeglm")
#summary(res_t)

#testing_heatmap_only tumor

res_t_Ordered<-res_t[order(res_t$padj),]
head(res_t_Ordered)

## log2 fold change (MLE): Treatment 2 vs 1 
## Wald test p-value: Treatment 2 vs 1 
## DataFrame with 6 rows and 6 columns
##            baseMean log2FoldChange     lfcSE      stat      pvalue        padj
##           <numeric>      <numeric> <numeric> <numeric>   <numeric>   <numeric>
## RN7SL147P   2675.79        8.82928  0.677177   13.0384 7.40301e-39 1.41745e-34
## RNA5-8SP4   1811.92        8.94547  0.723286   12.3678 3.90213e-35 3.73570e-31
## ANKRD35    18097.44        6.34913  0.550263   11.5383 8.45397e-31 5.39561e-27
## RNA5SP215   5706.88        5.94593  0.570019   10.4311 1.78790e-25 8.55821e-22
## RNA5SP324   2411.34        6.75583  0.650013   10.3934 2.65769e-25 1.01774e-21
## RNU2-61P    7818.33        5.58140  0.538177   10.3709 3.36231e-25 1.07297e-21

set.seed(88)

library("pheatmap")
select_genes_res_t_Ordered<-rownames(res_t_Ordered)
select_genes_res_t_Ordered<-select_genes_res_t_Ordered[1:30]
vst_t<-vst(dds_t,blind = FALSE)
df_res_t <- as.data.frame(colData(vst_t)["Treatment"])
pheatmap(assay(vst_t)[select_genes_res_t_Ordered,], cluster_rows=TRUE, show_rownames=TRUE,
         cluster_cols=TRUE, annotation_col=df_res_t, fontsize=8, main = "Tumors:Pre vs Post-Treatment",name="vst(FPKM)")

#Normal_Only

counts_n<-counts[,c(1:3)]
coldata_n<-coldata[c(1:3),]

library(DESeq2)
dds_n <- DESeqDataSetFromMatrix(countData = counts_n,
                              colData = coldata_n,
                              design = ~ Treatment )

#Pre-Filtering

keep <- rowSums(counts(dds_n)) >= 10
dds_n <- dds_n[keep,]

#dds+res

dds_n<-DESeq(dds_n)

## estimating size factors

## estimating dispersions

## gene-wise dispersion estimates

## mean-dispersion relationship

## final dispersion estimates

## fitting model and testing

#res_t<-lfcShrink(dds_t, )
res_n<-results(dds_n, contrast=c("Treatment","2","1"), alpha=0.05)

summary(res_n)

## 
## out of 26382 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 84, 0.32%
## LFC < 0 (down)     : 211, 0.8%
## outliers [1]       : 0, 0%
## low counts [2]     : 8184, 31%
## (mean count < 40)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

#testing_heatmap_only normal

res_n_Ordered<-res_n[order(res_n$padj),]
head(res_n_Ordered)

## log2 fold change (MLE): Treatment 2 vs 1 
## Wald test p-value: Treatment 2 vs 1 
## DataFrame with 6 rows and 6 columns
##             baseMean log2FoldChange     lfcSE      stat      pvalue        padj
##            <numeric>      <numeric> <numeric> <numeric>   <numeric>   <numeric>
## RNA5SP141  60053.281       -3.97976  0.372212 -10.69218 1.10743e-26 2.01530e-22
## PCA3        1478.407       -6.71460  0.696384  -9.64210 5.30905e-22 4.83071e-18
## TMPRSS2    14016.335       -3.21323  0.368393  -8.72228 2.72656e-18 1.42527e-14
## RN7SL2    499314.108       -2.70774  0.311001  -8.70654 3.13280e-18 1.42527e-14
## SPON2       9362.003       -2.75835  0.332613  -8.29298 1.10442e-16 4.01964e-13
## POTEJ        586.631       -6.22581  0.753419  -8.26340 1.41576e-16 4.29401e-13

#library("pheatmap")
set.seed(3)
library("pheatmap")
select_genes_res_n_Ordered<-rownames(res_n_Ordered)
select_genes_res_n_Ordered<-select_genes_res_n_Ordered[1:30]
vst_n<-vst(dds_n,blind = FALSE)
df_res_n <- as.data.frame(colData(vst_n)["Treatment"])
pheatmap(assay(vst_n)[select_genes_res_n_Ordered,], cluster_rows=TRUE, show_rownames=TRUE,
         cluster_cols=TRUE, annotation_col=df_res_n, fontsize=8, main = "Normal:Pre vs Post-Treatment",name="vst(FPKM)")

###34vs36;pre vs post

counts_3436_trx<-counts[,c(1,3,4,6)]
coldata_3436_trx<-coldata[c(1,3,4,6),]

#library(DESeq2)
dds_3436_trx <- DESeqDataSetFromMatrix(countData = counts_3436_trx,
                              colData = coldata_3436_trx,
                              design = ~ Type + Treatment )

#Pre-Filtering

keep <- rowSums(counts(dds_3436_trx)) >= 10
dds_3436_trx <- dds_3436_trx[keep,]

#dds+res

dds_3436_trx<-DESeq(dds_3436_trx)

## estimating size factors

## estimating dispersions

## gene-wise dispersion estimates

## mean-dispersion relationship

## final dispersion estimates

## fitting model and testing

res_3436_trx<-results(dds_3436_trx, contrast=c("Treatment","2","1"), alpha=0.05)
summary(res_3436_trx)

## 
## out of 27475 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 787, 2.9%
## LFC < 0 (down)     : 630, 2.3%
## outliers [1]       : 0, 0%
## low counts [2]     : 6392, 23%
## (mean count < 15)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

res_3436_trx_Ordered<-res_3436_trx[order(res_3436_trx$padj),]
head(res_3436_trx_Ordered)

## log2 fold change (MLE): Treatment 2 vs 1 
## Wald test p-value: Treatment 2 vs 1 
## DataFrame with 6 rows and 6 columns
##            baseMean log2FoldChange     lfcSE      stat      pvalue        padj
##           <numeric>      <numeric> <numeric> <numeric>   <numeric>   <numeric>
## COL2A1     6964.468        6.22930  0.353518   17.6209 1.70325e-69 3.59097e-65
## POTEI      3138.401       -5.20207  0.304108  -17.1060 1.33959e-65 1.41212e-61
## TGM4       1909.905        5.42208  0.349629   15.5081 3.05833e-54 2.14929e-50
## POTEJ      1023.558       -6.36934  0.450505  -14.1382 2.20716e-45 1.16334e-41
## PCA3        598.939       -5.54619  0.487710  -11.3719 5.77243e-30 2.43400e-26
## RNA5SP207   702.878        5.48560  0.490732   11.1784 5.20187e-29 1.82785e-25

#library("pheatmap")
set.seed(70)
select_genes_res_3436_trx_Ordered<-rownames(res_3436_trx_Ordered)
select_genes_res_3436_trx_Ordered<-select_genes_res_3436_trx_Ordered[1:30]
vst_3436_trx<-vst(dds_3436_trx,blind = FALSE)
df_res_3436_trx <- as.data.frame(colData(vst_3436_trx)["Treatment"])
pheatmap(assay(vst_3436_trx)[select_genes_res_3436_trx_Ordered,], cluster_rows=TRUE, show_rownames=TRUE, cluster_cols=TRUE, annotation_col=df_res_3436_trx, fontsize=8, main = "34&36 contrasting Pre and Post-Treatment",name="vst(FPKM)")

3536

counts_3536_trx<-counts[,c(2,3,5,6)]
coldata_3536_trx<-coldata[c(2,3,5,6),]

#library(DESeq2)
dds_3536_trx <- DESeqDataSetFromMatrix(countData = counts_3536_trx,
                              colData = coldata_3536_trx,
                              design = ~ Type + Treatment )

#Pre-Filtering

keep <- rowSums(counts(dds_3536_trx)) >= 10
dds_3536_trx <- dds_3536_trx[keep,]

#dds+res

dds_3536_trx<-DESeq(dds_3536_trx)

## estimating size factors

## estimating dispersions

## gene-wise dispersion estimates

## mean-dispersion relationship

## final dispersion estimates

## fitting model and testing

res_3536_trx<-results(dds_3536_trx, contrast=c("Treatment","2","1"), alpha=0.05)
summary(res_3536_trx)

## 
## out of 28772 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 671, 2.3%
## LFC < 0 (down)     : 716, 2.5%
## outliers [1]       : 0, 0%
## low counts [2]     : 7252, 25%
## (mean count < 30)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

res_3536_trx_Ordered<-res_3536_trx[order(res_3536_trx$padj),]
head(res_3536_trx_Ordered)

## log2 fold change (MLE): Treatment 2 vs 1 
## Wald test p-value: Treatment 2 vs 1 
## DataFrame with 6 rows and 6 columns
##            baseMean log2FoldChange     lfcSE      stat      pvalue        padj
##           <numeric>      <numeric> <numeric> <numeric>   <numeric>   <numeric>
## RN7SL147P   2510.74        8.81406  0.536080   16.4417 9.62320e-61 2.07091e-56
## PCA3        2458.68       -7.55776  0.486495  -15.5351 2.00700e-54 2.15953e-50
## ACSM1       2751.65       -5.94770  0.428096  -13.8934 6.94895e-44 4.98472e-40
## ANPEP       6546.93       -5.94850  0.443400  -13.4157 4.89490e-41 2.63346e-37
## CTRB1       2441.57        6.73694  0.537736   12.5283 5.22459e-36 2.24866e-32
## CHRNA2      1290.91       -6.27367  0.546778  -11.4739 1.78460e-30 6.40078e-27

set.seed(1)
library(ComplexHeatmap)
#library("pheatmap")
select_genes_res_3536_trx_Ordered<-rownames(res_3536_trx_Ordered)
select_genes_res_3536_trx_Ordered<-select_genes_res_3536_trx_Ordered[1:30]
vst_3536_trx<-vst(dds_3536_trx,blind = FALSE)
df_res_3536_trx <- as.data.frame(colData(vst_3536_trx)["Treatment"])
pheatmap(assay(vst_3536_trx)[select_genes_res_3536_trx_Ordered,], cluster_rows=TRUE, show_rownames=TRUE,
         cluster_cols=TRUE, annotation_col=df_res_3536_trx,fontsize=8, main = "35&36 contrasting Pre- and Post- Treatment",name="vst(FPKM)")

3436,3536 各选取top100 并寻找share gene

write.csv(as.data.frame(res_3436_trx_Ordered[1:100,]), 
          file="/Users/dongxiaotai/Desktop/res_3436_trx_Orderedtop100.csv")
write.csv(as.data.frame(res_3536_trx_Ordered[1:100,]), 
          file="/Users/dongxiaotai/Desktop/res_3536_trx_Orderedtop100.csv")

setwd("/Users/dongxiaotai/Desktop/TRGN_lab/final/")
library(readxl)
shared_genes_343536_trx<-read_excel('/Users/dongxiaotai/Desktop/shared_gene.xlsx',col_names = FALSE)

## New names:
## * `` -> ...1

shared_genes_343536_trx<-shared_genes_343536_trx[,1]
shared_genes_343536_trx<-t(shared_genes_343536_trx)
shared_genes_343536_trx<-shared_genes_343536_trx[1,]

dds_all_trx<-dds
design(dds_all_trx)<-~ Type + Treatment
dds_all_trx<-DESeq(dds_all_trx)

## using pre-existing size factors

## estimating dispersions

## found already estimated dispersions, replacing these

## gene-wise dispersion estimates

## mean-dispersion relationship

## final dispersion estimates

## fitting model and testing

res_all_trx<-results(dds_all_trx, contrast=c("Treatment","2","1"),alpha=0.05)

set.seed(1)
library(ComplexHeatmap)
#library("pheatmap")
#select_genes_res_3536_trx_Ordered<-rownames(res_3536_trx_Ordered)
#select_genes_res_3536_trx_Ordered<-select_genes_res_3536_trx_Ordered[1:30]
vst_all_trx<-vst(dds_all_trx,blind = FALSE)
df_res_all_trx <- as.data.frame(colData(vst_all_trx)["Treatment"])
pheatmap(assay(vst_all_trx)[shared_genes_343536_trx,], cluster_rows=TRUE, show_rownames=TRUE,
         cluster_cols=TRUE, annotation_col=df_res_all_trx,fontsize=8, main = "Shared Genes by 34vs36 & 35vs36 Pre/Post Treatments",name="vst(FPKM)", scale = "row")

vsd <- vst(dds, blind=FALSE)
plotPCA(vsd, intgroup=c("Treatment", "Type"))