Set the working path, install various required packages,preprocess the COUNTS file
library(stringr)
library(Homo.sapiens)
## Loading required package: AnnotationDbi
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: IRanges
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: OrganismDbi
## Loading required package: GenomicFeatures
## Loading required package: GenomeInfoDb
## Loading required package: GenomicRanges
## Loading required package: GO.db
##
## Loading required package: org.Hs.eg.db
##
## Loading required package: TxDb.Hsapiens.UCSC.hg19.knownGene
setwd("/Users/dongxiaotai/Desktop/TRGN_lab/final/")
counts <- read.table("ProstCa_030921_2.txt")
geneid <- rownames(counts)
gene <- str_match(geneid, "(\\w*).*")#Ready to remove the decimal point
geneid <- gene[,2]#Let's get rid of the next two decimal points
geneid2<-geneid#We're going to map the ENS name after the decimal point to the gene
geneid2=data.frame(geneid2)#Convert to data.frame format
counts<-cbind(counts,geneid2)#Merge counts and geneid2 to counts
Convert geneid and get rid of duplicate genes
setwd("/Users/dongxiaotai/Desktop/TRGN_lab/final/")
genes <- select(Homo.sapiens, keys=geneid,
columns=c("SYMBOL","TXCHROM"),
keytype="ENSEMBL")
## 'select()' returned many:many mapping between keys and columns
dim(genes)
## [1] 60216 3
genes <- genes[!duplicated(genes$ENSEMBL),]
counts<-counts[!duplicated(counts$geneid2),]
row.names(counts)<-counts$geneid2
Change CountsReady back to a more manageable matrix. Change the column names of the matrix, group and build a matrix that formally processes the data
library(GenomicAlignments)
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
##
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:Biobase':
##
## anyMissing, rowMedians
##
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
##
## colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
## colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
## colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
## colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
## colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
## colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
## colWeightedMeans, colWeightedMedians, colWeightedSds,
## colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
## rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
## rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
## rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
## rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
## rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
## rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
## rowWeightedSds, rowWeightedVars
## The following object is masked from 'package:Biobase':
##
## rowMedians
## Loading required package: Biostrings
## Loading required package: XVector
##
## Attaching package: 'Biostrings'
## The following object is masked from 'package:base':
##
## strsplit
## Loading required package: Rsamtools
counts<-cbind(counts,genes$SYMBOL)
counts_4<-na.omit(counts)
counts_4<-counts_4[!duplicated(counts_4$`genes$SYMBOL`),]
row.names(counts_4)<-counts_4$`genes$SYMBOL`
counts_ready<-counts_4[,c(1,2,3,4,5,6)] #use counts_ready
counts<-counts_ready
By the end of this step, we have a clear counts and have removed the decimal point for ENS to correspond to the gene name. This matrix is now called counts_filtered.
Build the complete matrix list,add sample information.I added the information before and after treatment and the Gleason score to the sample information. In “Counts” after reprocessing, 3 of the 6 data sets are C1 and the other 3 are T1. In ID, “C” stands for Control and “T” stands for Tumor.
counts_filtered<-counts[1:6]
coldata=data.frame(row.names=c("34C","35C","36C","34T","35T","36T"),
Type=rep(c("Control","Tumor"),each=3),
Gleason=rep(c("9","8","9"),2),
Treatment=rep(c("Pre","Pre","Post"),2))
Display the table
coldata
## Type Gleason Treatment
## 34C Control 9 Pre
## 35C Control 8 Pre
## 36C Control 9 Post
## 34T Tumor 9 Pre
## 35T Tumor 8 Pre
## 36T Tumor 9 Post
coldata$Type<-factor(coldata$Type)
coldata$Gleason<-factor(coldata$Gleason)
coldata$Treatment<-factor(coldata$Treatment)
Differential expression analysis
library(DESeq2)
#if (!requireNamespace("BiocManager", quiet = TRUE))
#install.packages("BiocManager")
#BiocManager::install("SummarizedExperiment")
dds <- DESeqDataSetFromMatrix(countData = counts_filtered,
colData = coldata,
design = ~ Type )
Pre-Filtering If this row adds up, if it adds up to a 10 you can keep it, because if you don’t use a 0, you get rid of it. As long as one of these rows is not 0, then 0 is also information. Prefiltering has two important effects: we reduce the memory size of DDS data objects by removing rows that are rarely read; We have improved the speed of converting and testing functions in DESeq2. In this step, we perform minimum prefiltering to preserve only rows that have at least 10 reads.
keep <- rowSums(counts(dds)) >= 10
dds <- dds[keep,]
dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
res <- results(dds)
TypeTreatment_Tumor_vs_Normal 34 35 36T vs 34 35 36C
ddsTN<-dds
resTypeTrX_TN<-results(ddsTN,contrast=c("Type","Tumor","Control"),alpha=0.05)
resTypeTrX_TN
## log2 fold change (MLE): Type Tumor vs Control
## Wald test p-value: Type Tumor vs Control
## DataFrame with 29912 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
## TSPAN6 1107.8741 0.0325524 0.325711 0.0999427 0.920390 0.998486
## TNMD 32.8382 1.4031560 1.356203 1.0346212 0.300846 0.941348
## DPM1 473.0177 0.0528333 0.619625 0.0852666 0.932049 0.999438
## SCYL3 1171.0906 -0.1550594 0.417280 -0.3715957 0.710194 0.992431
## C1orf112 321.5382 -0.2140606 0.496694 -0.4309708 0.666490 0.991134
## ... ... ... ... ... ... ...
## BMP8B-AS1 10.53852 -0.582843 3.484002 -0.1672914 0.867141 0.996036
## H2AL1SP 4.26468 -2.997713 3.846888 -0.7792566 0.435829 0.964254
## NIPBL-DT 1361.89229 -0.269032 0.465077 -0.5784681 0.562948 0.979952
## CERNA2 29.84232 -0.150351 1.893759 -0.0793926 0.936720 0.999731
## LOC100996886 32.40953 5.663064 2.655310 2.1327323 NA NA
summary(resTypeTrX_TN)
##
## out of 29912 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 37, 0.12%
## LFC < 0 (down) : 5, 0.017%
## outliers [1] : 1970, 6.6%
## low counts [2] : 0, 0%
## (mean count < 1)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
In “34 35 36T vs 34 35 36C”, 37 up-regulated genes and 5 down-regulated genes were displayed under the condition of p-value<0.05.
resTypeTrX_TN_Ordered<-resTypeTrX_TN[order(resTypeTrX_TN$padj),]
head(resTypeTrX_TN_Ordered)
## log2 fold change (MLE): Type Tumor vs Control
## Wald test p-value: Type Tumor vs Control
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
## CCL26 14.0100 20.79336 3.91041 5.31743 1.05242e-07 0.000267334
## BEND4 1569.3562 1.85266 0.32287 5.73809 9.57519e-09 0.000267334
## RNU6-744P 15.8184 20.86765 3.91000 5.33699 9.45000e-08 0.000267334
## RPSAP23 14.0100 20.79336 3.91041 5.31743 1.05242e-07 0.000267334
## MBD3L2 14.4220 20.83125 3.91031 5.32726 9.97044e-08 0.000267334
## PLSCR5 14.4220 20.83125 3.91031 5.32726 9.97044e-08 0.000267334
#Reorder the genes by the adjusted p value
set.seed(1)
#if (!requireNamespace("BiocManager", quiet = TRUE))
#install.packages("BiocManager")
#BiocManager::install("ComplexHeatmap")
library(ComplexHeatmap)
## Loading required package: grid
##
## Attaching package: 'grid'
## The following object is masked from 'package:Biostrings':
##
## pattern
## ========================================
## ComplexHeatmap version 2.10.0
## Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
## Github page: https://github.com/jokergoo/ComplexHeatmap
## Documentation: http://jokergoo.github.io/ComplexHeatmap-reference
##
## If you use it in published research, please cite:
## Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional
## genomic data. Bioinformatics 2016.
##
## The new InteractiveComplexHeatmap package can directly export static
## complex heatmaps into an interactive Shiny app with zero effort. Have a try!
##
## This message can be suppressed by:
## suppressPackageStartupMessages(library(ComplexHeatmap))
## ========================================
library("pheatmap")
##
## Attaching package: 'pheatmap'
## The following object is masked from 'package:ComplexHeatmap':
##
## pheatmap
select_genes_resTypeTrX_TN_Ordered<-rownames(resTypeTrX_TN_Ordered)
select_genes_resTypeTrX_TN_Ordered<-select_genes_resTypeTrX_TN_Ordered[1:42]
write.csv(as.data.frame(resTypeTrX_TN_Ordered),
file="/Users/dongxiaotai/Desktop/resTypeTrX_TN_Ordered.csv")
write.csv(as.data.frame(resTypeTrX_TN_Ordered[select_genes_resTypeTrX_TN_Ordered,]),
file="/Users/dongxiaotai/Desktop/select_genes_resTypeTrX_TN_Ordered.csv")
vst_TypeTrX_TN<-vst(ddsTN,blind = FALSE)
df_resTypeTrX_TN <- as.data.frame(colData(vst_TypeTrX_TN)["Type"])
ComplexHeatmap::pheatmap(assay(vst_TypeTrX_TN)[select_genes_resTypeTrX_TN_Ordered,], cluster_rows=TRUE, show_rownames=TRUE,cluster_cols=TRUE, annotation_col=df_resTypeTrX_TN, fontsize=6.5, main = "34,35,36C vs 34,35,36T ",name="vst(FPKM)")
As you can see from the heat map, the control sample and the tumor sample are clustered separately. In the control group, gene expression was very similar.
Gleason8 vs Gleason9 (control CT)
ddsGleason89_ctlNT<-dds
design(ddsGleason89_ctlNT)<-~Type + Gleason
ddsGleason89_ctlNT<-DESeq(ddsGleason89_ctlNT)
## using pre-existing size factors
## estimating dispersions
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
resultsNames(ddsGleason89_ctlNT)
## [1] "Intercept" "Type_Tumor_vs_Control" "Gleason_9_vs_8"
resGleason89_ctlNT<-results(ddsGleason89_ctlNT,contrast=c("Gleason","9","8"),alpha=0.05)
resGleason89_ctlNT
## log2 fold change (MLE): Gleason 9 vs 8
## Wald test p-value: Gleason 9 vs 8
## DataFrame with 29912 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
## TSPAN6 1107.8741 -0.00885828 0.337065 -0.0262806 0.979033 0.990579
## TNMD 32.8382 -1.43220889 1.357417 -1.0550987 0.291380 0.600705
## DPM1 473.0177 0.73435841 0.638022 1.1509930 0.249735 0.562239
## SCYL3 1171.0906 0.56378587 0.390504 1.4437383 0.148813 0.459201
## C1orf112 321.5382 0.19070630 0.527008 0.3618662 0.717452 0.876603
## ... ... ... ... ... ... ...
## BMP8B-AS1 10.53852 -1.792776 3.946374 -0.454284 0.6496241 NA
## H2AL1SP 4.26468 1.296251 4.115195 0.314991 0.7527682 NA
## NIPBL-DT 1361.89229 0.808984 0.414776 1.950412 0.0511271 0.301514
## CERNA2 29.84232 -0.333639 2.105841 -0.158435 0.8741141 0.947013
## LOC100996886 32.40953 -5.702918 2.792202 -2.042445 0.0411074 0.277870
summary(resGleason89_ctlNT)
##
## out of 29912 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 139, 0.46%
## LFC < 0 (down) : 194, 0.65%
## outliers [1] : 0, 0%
## low counts [2] : 8119, 27%
## (mean count < 26)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
In “Gleason8 vs Gleason9,Control CT”, the results table showed 139 genes up-regulated and 194 down-regulated.
resGleason89_ctlNT_Ordered<-resGleason89_ctlNT[order(resGleason89_ctlNT$padj),]
head(resGleason89_ctlNT_Ordered)
## log2 fold change (MLE): Gleason 9 vs 8
## Wald test p-value: Gleason 9 vs 8
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
## ANPEP 4877.91 -5.70051 0.472519 -12.06410 1.63436e-33 3.56176e-29
## NCAPD3 10906.07 -3.66525 0.315231 -11.62718 2.99826e-31 3.26705e-27
## DHCR24 5570.05 -3.34382 0.355532 -9.40511 5.19738e-21 2.85443e-17
## SCHLAP1 1260.44 5.75391 0.611840 9.40427 5.23916e-21 2.85443e-17
## IGHA2 3746.03 -4.75000 0.557064 -8.52685 1.50390e-17 6.55488e-14
## BANK1 1142.58 -2.93922 0.360601 -8.15088 3.61285e-16 1.31225e-12
set.seed(88)
library("pheatmap")
select_genes_resGleason89_ctlNT_Ordered<-rownames(resGleason89_ctlNT_Ordered)
select_genes_resGleason89_ctlNT_Ordered<-select_genes_resGleason89_ctlNT_Ordered[1:30]
vst_Gleason89_ctlNT<-vst(ddsGleason89_ctlNT,blind = FALSE)
df_resGleason89_ctlNT <- as.data.frame(colData(vst_Gleason89_ctlNT)[,c("Type","Gleason")])
ComplexHeatmap::pheatmap(assay(vst_Gleason89_ctlNT)[select_genes_resGleason89_ctlNT_Ordered,], cluster_rows=TRUE, show_rownames=TRUE,
cluster_cols=TRUE, annotation_col=df_resGleason89_ctlNT,fontsize=8, main = "Gleason8 vs Gleason9,Control CT ",name="vst(FPKM)")
In the heat map, Gleason8 and Gleason9 are clustered separately when we control for the effect of sample type. In Gleason9, 34C and 36C are clustered together, 34T and 36T are clustered together.
Pre vs Post (Control CT)
ddsPrePost_ctlNT<-dds
design(ddsPrePost_ctlNT)<-~Type + Treatment
ddsPrePost_ctlNT<-DESeq(ddsPrePost_ctlNT)
## using pre-existing size factors
## estimating dispersions
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
resultsNames(ddsPrePost_ctlNT)
## [1] "Intercept" "Type_Tumor_vs_Control" "Treatment_Pre_vs_Post"
resPrePost_ctlNT<-results(ddsPrePost_ctlNT, contrast=c("Treatment","Post","Pre"), alpha=0.05)
resPrePost_ctlNT
## log2 fold change (MLE): Treatment Post vs Pre
## Wald test p-value: Treatment Post vs Pre
## DataFrame with 29912 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
## TSPAN6 1107.8741 0.183342 0.306017 0.599122 0.5490916 0.743446
## TNMD 32.8382 -1.263384 1.384108 -0.912779 0.3613590 0.586894
## DPM1 473.0177 0.805513 0.590631 1.363817 0.1726250 0.384807
## SCYL3 1171.0906 0.634128 0.341642 1.856115 0.0634371 0.211958
## C1orf112 321.5382 0.333881 0.491251 0.679654 0.4967236 0.704558
## ... ... ... ... ... ... ...
## BMP8B-AS1 10.53852 -6.094994 3.675343 -1.658347 0.0972474 NA
## H2AL1SP 4.26468 3.174002 4.069027 0.780040 0.4353676 NA
## NIPBL-DT 1361.89229 0.712922 0.404476 1.762583 0.0779709 0.239106
## CERNA2 29.84232 1.580382 1.974816 0.800268 0.4235555 0.642765
## LOC100996886 32.40953 -5.740423 2.788411 -2.058672 0.0395257 0.160208
summary(resPrePost_ctlNT)
##
## out of 29912 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 866, 2.9%
## LFC < 0 (down) : 1545, 5.2%
## outliers [1] : 0, 0%
## low counts [2] : 7538, 25%
## (mean count < 23)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
In the “Pre vs Post,Control CT”, the results table showed 866 genes were up-regulated and 1545 genes were down-regulated.
resPrePost_ctlNT_Ordered<-resTypeTrX_TN[order(resPrePost_ctlNT$padj),]
head(resPrePost_ctlNT_Ordered)
## log2 fold change (MLE): Type Tumor vs Control
## Wald test p-value: Type Tumor vs Control
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
## POTEJ 992.145 1.4888086 1.741840 0.8547330 0.3926990 0.960734
## PCA3 2190.298 1.2327145 1.936624 0.6365275 0.5244327 0.976677
## RN7SL2 448990.716 0.0370742 0.658479 0.0563027 0.9551007 0.999731
## TMPRSS2 20680.141 1.2188144 1.379195 0.8837145 0.3768504 0.959836
## RBFOX3 1635.347 -1.4278862 0.755373 -1.8903059 0.0587171 0.858157
## PRRX1 4103.483 -0.6384444 0.637743 -1.0010990 0.3167789 0.942193
set.seed(88)
select_genes_resPrePost_ctlNT_Ordered<-rownames(resPrePost_ctlNT_Ordered)
select_genes_resPrePost_ctlNT_Ordered<-select_genes_resPrePost_ctlNT_Ordered[1:30]
vst_PrePost_ctlNT<-vst(ddsPrePost_ctlNT,blind = FALSE)
df_resPrePost_ctlNT <- as.data.frame(colData(vst_PrePost_ctlNT)[,c("Type","Treatment")])
ComplexHeatmap::pheatmap(assay(vst_PrePost_ctlNT)[select_genes_resPrePost_ctlNT_Ordered,], cluster_rows=TRUE, show_rownames=TRUE,
cluster_cols=TRUE, annotation_col=df_resPrePost_ctlNT,fontsize=8, main = "Pre vs Post,Control CT",name="vst(FPKM)")
In the heat map, the pre - and post-treatment samples were clustered separately when we controlled for the effect of sample type. Gene expression in tumor samples was significantly different before and after treatment.
Three_Tumors_Only
counts_t<-counts[,c(4:6)]#counts_t——tumor only
coldata_t<-coldata[c(4:6),]
#library(DESeq2)
dds_t <- DESeqDataSetFromMatrix(countData = counts_t,
colData = coldata_t,
design = ~Treatment )
#Pre-Filtering
keep <- rowSums(counts(dds_t)) >= 10
dds_t <- dds_t[keep,]
#dds+res
dds_t<-DESeq(dds_t)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
res_t<-results(dds_t, contrast=c("Treatment","Post","Pre"), alpha=0.05)
summary(res_t)
##
## out of 28561 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 422, 1.5%
## LFC < 0 (down) : 293, 1%
## outliers [1] : 0, 0%
## low counts [2] : 9414, 33%
## (mean count < 59)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
testing_heatmap_only tumor
res_t_Ordered<-res_t[order(res_t$padj),]
head(res_t_Ordered)
## log2 fold change (MLE): Treatment Post vs Pre
## Wald test p-value: Treatment Post vs Pre
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
## RN7SL147P 2675.79 8.82928 0.677177 13.0384 7.40348e-39 1.41754e-34
## RNA5-8SP4 1811.92 8.94547 0.723285 12.3678 3.90236e-35 3.73593e-31
## ANKRD35 18097.44 6.34912 0.550263 11.5383 8.45459e-31 5.39600e-27
## RNA5SP215 5706.88 5.94593 0.570019 10.4311 1.78800e-25 8.55873e-22
## RNA5SP324 2411.34 6.75583 0.650013 10.3934 2.65784e-25 1.01779e-21
## RNU2-61P 7818.33 5.58139 0.538177 10.3709 3.36251e-25 1.07303e-21
set.seed(88)
library("pheatmap")
select_genes_res_t_Ordered<-rownames(res_t_Ordered)
select_genes_res_t_Ordered<-select_genes_res_t_Ordered[1:30]
vst_t<-vst(dds_t,blind = FALSE)
df_res_t <- as.data.frame(colData(vst_t)["Treatment"])
ComplexHeatmap::pheatmap(assay(vst_t)[select_genes_res_t_Ordered,], cluster_rows=TRUE, show_rownames=TRUE,
cluster_cols=TRUE, annotation_col=df_res_t, fontsize=8, main = "Tumors:Pre vs Post-Treatment",name="vst(FPKM)")
Normal_Only
counts_n<-counts[,c(1:3)]
coldata_n<-coldata[c(1:3),]
library(DESeq2)
dds_n <- DESeqDataSetFromMatrix(countData = counts_n,
colData = coldata_n,
design = ~ Treatment )
#Pre-Filtering
keep <- rowSums(counts(dds_n)) >= 10
dds_n <- dds_n[keep,]
#dds+res
dds_n<-DESeq(dds_n)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
#res_t<-lfcShrink(dds_t, )
res_n<-results(dds_n, contrast=c("Treatment","Post","Pre"), alpha=0.05)
summary(res_n)
##
## out of 26382 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 84, 0.32%
## LFC < 0 (down) : 211, 0.8%
## outliers [1] : 0, 0%
## low counts [2] : 8184, 31%
## (mean count < 40)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
#testing_heatmap_only normal
res_n_Ordered<-res_n[order(res_n$padj),]
head(res_n_Ordered)
## log2 fold change (MLE): Treatment Post vs Pre
## Wald test p-value: Treatment Post vs Pre
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
## RNA5SP141 60053.281 -3.97976 0.372212 -10.69218 1.10736e-26 2.01517e-22
## PCA3 1478.407 -6.71461 0.696384 -9.64211 5.30878e-22 4.83046e-18
## TMPRSS2 14016.335 -3.21323 0.368393 -8.72228 2.72644e-18 1.42520e-14
## RN7SL2 499314.108 -2.70774 0.311001 -8.70655 3.13264e-18 1.42520e-14
## SPON2 9362.003 -2.75836 0.332613 -8.29299 1.10438e-16 4.01950e-13
## POTEJ 586.631 -6.22581 0.753419 -8.26341 1.41571e-16 4.29384e-13
#library("pheatmap")
set.seed(3)
library("pheatmap")
select_genes_res_n_Ordered<-rownames(res_n_Ordered)
select_genes_res_n_Ordered<-select_genes_res_n_Ordered[1:30]
vst_n<-vst(dds_n,blind = FALSE)
df_res_n <- as.data.frame(colData(vst_n)["Treatment"])
ComplexHeatmap::pheatmap(assay(vst_n)[select_genes_res_n_Ordered,], cluster_rows=TRUE, show_rownames=TRUE,
cluster_cols=TRUE, annotation_col=df_res_n, fontsize=8, main = "Control:Pre vs Post-Treatment",name="vst(FPKM)")
34vs36;Pre vs Post-,Control CT
In this study, the focus is to explore the differential gene expression before and after treatment in prostate cancer patients. The patient-1(Pre-treatment: 34C, 34T; Post-treatment: 36C; 36T; Gleason-9) has received treatment, while patient-2(Pre-treatment: 35C, 35T; Gleason-8) has not received therapy yet. Therefore, 34CT&36CT contrasting pre-treatment and post-treatment differential gene expression(DGE) analysis was conducted to check the differential expression affected by the therapy. Additionally, 35CT&36CT contrasting pre- and post-treatment DGE analysis was also done because of the similarity between 34T and 35T, for comparison and supplement.
counts_3436_trx<-counts[,c(1,3,4,6)]
coldata_3436_trx<-coldata[c(1,3,4,6),]
#library(DESeq2)
dds_3436_trx <- DESeqDataSetFromMatrix(countData = counts_3436_trx,
colData = coldata_3436_trx,
design = ~ Type + Treatment )
#Pre-Filtering
keep <- rowSums(counts(dds_3436_trx)) >= 10
dds_3436_trx <- dds_3436_trx[keep,]
#dds+res
dds_3436_trx<-DESeq(dds_3436_trx)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
res_3436_trx<-results(dds_3436_trx, contrast=c("Treatment","Post","Pre"), alpha=0.05)
summary(res_3436_trx)
##
## out of 27475 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 787, 2.9%
## LFC < 0 (down) : 630, 2.3%
## outliers [1] : 0, 0%
## low counts [2] : 6392, 23%
## (mean count < 15)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
res_3436_trx_Ordered<-res_3436_trx[order(res_3436_trx$padj),]
head(res_3436_trx_Ordered)
## log2 fold change (MLE): Treatment Post vs Pre
## Wald test p-value: Treatment Post vs Pre
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
## COL2A1 6964.468 6.22930 0.353518 17.6209 1.70318e-69 3.59081e-65
## POTEI 3138.401 -5.20207 0.304108 -17.1060 1.33953e-65 1.41207e-61
## TGM4 1909.905 5.42208 0.349629 15.5081 3.05849e-54 2.14941e-50
## POTEJ 1023.558 -6.36935 0.450505 -14.1382 2.20709e-45 1.16330e-41
## PCA3 598.939 -5.54619 0.487710 -11.3719 5.77212e-30 2.43387e-26
## RNA5SP207 702.878 5.48559 0.490732 11.1784 5.20184e-29 1.82784e-25
#library("pheatmap")
set.seed(70)
select_genes_res_3436_trx_Ordered<-rownames(res_3436_trx_Ordered)
select_genes_res_3436_trx_Ordered<-select_genes_res_3436_trx_Ordered[1:30]
vst_3436_trx<-vst(dds_3436_trx,blind = FALSE)
df_res_3436_trx <- as.data.frame(colData(vst_3436_trx)["Treatment"])
ComplexHeatmap::pheatmap(assay(vst_3436_trx)[select_genes_res_3436_trx_Ordered,], cluster_rows=TRUE, show_rownames=TRUE, cluster_cols=TRUE, annotation_col=df_res_3436_trx, fontsize=8, main = "34&36 contrasting Pre and Post-Treatment",name="vst(FPKM)")
35vs36;Pre vs Post-,Control CT
counts_3536_trx<-counts[,c(2,3,5,6)]
coldata_3536_trx<-coldata[c(2,3,5,6),]
#library(DESeq2)
dds_3536_trx <- DESeqDataSetFromMatrix(countData = counts_3536_trx,
colData = coldata_3536_trx,
design = ~ Type + Treatment )
#Pre-Filtering
keep <- rowSums(counts(dds_3536_trx)) >= 10
dds_3536_trx <- dds_3536_trx[keep,]
#dds+res
dds_3536_trx<-DESeq(dds_3536_trx)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
res_3536_trx<-results(dds_3536_trx, contrast=c("Treatment","Post","Pre"), alpha=0.05)
summary(res_3536_trx)
##
## out of 28772 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 671, 2.3%
## LFC < 0 (down) : 716, 2.5%
## outliers [1] : 0, 0%
## low counts [2] : 7252, 25%
## (mean count < 30)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
res_3536_trx_Ordered<-res_3536_trx[order(res_3536_trx$padj),]
head(res_3536_trx_Ordered)
## log2 fold change (MLE): Treatment Post vs Pre
## Wald test p-value: Treatment Post vs Pre
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
## RN7SL147P 2510.74 8.81406 0.536080 16.4417 9.62063e-61 2.07036e-56
## PCA3 2458.68 -7.55776 0.486495 -15.5351 2.00710e-54 2.15964e-50
## ACSM1 2751.65 -5.94770 0.428096 -13.8934 6.94842e-44 4.98433e-40
## ANPEP 6546.93 -5.94850 0.443400 -13.4157 4.89493e-41 2.63347e-37
## CTRB1 2441.57 6.73694 0.537736 12.5283 5.22524e-36 2.24895e-32
## CHRNA2 1290.91 -6.27367 0.546778 -11.4739 1.78455e-30 6.40057e-27
set.seed(1)
library(ComplexHeatmap)
#library("pheatmap")
select_genes_res_3536_trx_Ordered<-rownames(res_3536_trx_Ordered)
select_genes_res_3536_trx_Ordered<-select_genes_res_3536_trx_Ordered[1:30]
vst_3536_trx<-vst(dds_3536_trx,blind = FALSE)
df_res_3536_trx <- as.data.frame(colData(vst_3536_trx)["Treatment"])
ComplexHeatmap::pheatmap(assay(vst_3536_trx)[select_genes_res_3536_trx_Ordered,], cluster_rows=TRUE, show_rownames=TRUE,
cluster_cols=TRUE, annotation_col=df_res_3536_trx,fontsize=8, main = "35&36 contrasting Pre- and Post- Treatment",name="vst(FPKM)")
Top200 genes of 34CT&36CT(contrasting pre- and post-treatment) and 35CT&36CT(contrasting pre- and post-treatment) by adjusted p-value were obtained respectively. Then, a list of 48 shared genes from those 2 Top200 genes was generated, and subsequently analyzed by Ingenuity Pathway Analysis (IPA). After testing, only 19 candidate genes were shared in the TOP100, and 48 candidate genes were shared in the TOP200. The maximum p-value of shared candidate genes in TOP200 was less than 10 to the power of minus 5, which was meaningful. Therefore, genes shared in TOP200 were selected and analyzed. After this step, I imported the selected 48 shared genes into IPA and performed simple path analysis for IPA. I submitted only genetic symbols without numbers/data/statistics. I exported the IPA results in Excel and selected the affected path that I was interested in. I made protein-protein interaction prediction through STRING.
#write.csv(as.data.frame(res_3436_trx_Ordered[1:100,]),
#file="/Users/dongxiaotai/Desktop/res_3436_trx_Orderedtop100.csv")
#write.csv(as.data.frame(res_3536_trx_Ordered[1:100,]),
#file="/Users/dongxiaotai/Desktop/res_3536_trx_Orderedtop100.csv")
write.csv(as.data.frame(res_3436_trx_Ordered[1:200,]),
file="/Users/dongxiaotai/Desktop/res_3436_trx_Orderedtop200.csv")
write.csv(as.data.frame(res_3536_trx_Ordered[1:200,]),
file="/Users/dongxiaotai/Desktop/res_3536_trx_Orderedtop200.csv")
setwd("/Users/dongxiaotai/Desktop/TRGN_lab/final/")
library(readxl)
shared_genes_343536top200_trx<-read_excel('/Users/dongxiaotai/Desktop/sharedgenetop200.xlsx',col_names = FALSE)
## New names:
## * `` -> ...1
shared_genes_343536top200_trx<-shared_genes_343536top200_trx[,1]
shared_genes_343536top200_trx<-t(shared_genes_343536top200_trx)
shared_genes_343536top200_trx<-shared_genes_343536top200_trx[1,]
dds_all_top200trx<-dds
design(dds_all_top200trx)<-~ Type + Treatment
dds_all_top200trx<-DESeq(dds_all_top200trx)
## using pre-existing size factors
## estimating dispersions
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
res_all_top200trx<-results(dds_all_top200trx, contrast=c("Treatment","Post","Pre"),alpha=0.05)
set.seed(1)
library(ComplexHeatmap)
library("pheatmap")
#select_genes_res_343536top200_trx_Ordered<-rownames(res_343536top200_trx_Ordered)
#select_genes_res_343536top200_trx_Ordered<-select_genes_res_343536top200_trx_Ordered[1:30]
vst_all_top200trx<-vst(dds_all_top200trx,blind = FALSE)
df_res_all_top200trx <- as.data.frame(colData(vst_all_top200trx)["Treatment"])
ComplexHeatmap::pheatmap(assay(vst_all_top200trx)[shared_genes_343536top200_trx,], cluster_rows=TRUE, show_rownames=TRUE,
cluster_cols=TRUE, annotation_col=df_res_all_top200trx,fontsize=6.5, main = "Top200 Shared Genes by 34vs36 & 35vs36 Pre/Post Treatments",display_numbers=TRUE,name ="vst(FPKM)")
sessionInfo()
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] grid parallel stats4 stats graphics grDevices utils
## [8] datasets methods base
##
## other attached packages:
## [1] readxl_1.3.1
## [2] pheatmap_1.0.12
## [3] ComplexHeatmap_2.10.0
## [4] DESeq2_1.32.0
## [5] GenomicAlignments_1.28.0
## [6] Rsamtools_2.8.0
## [7] Biostrings_2.60.0
## [8] XVector_0.32.0
## [9] SummarizedExperiment_1.22.0
## [10] MatrixGenerics_1.4.0
## [11] matrixStats_0.58.0
## [12] Homo.sapiens_1.3.1
## [13] TxDb.Hsapiens.UCSC.hg19.knownGene_3.2.2
## [14] org.Hs.eg.db_3.13.0
## [15] GO.db_3.13.0
## [16] OrganismDbi_1.34.0
## [17] GenomicFeatures_1.44.0
## [18] GenomicRanges_1.44.0
## [19] GenomeInfoDb_1.28.0
## [20] AnnotationDbi_1.54.0
## [21] IRanges_2.26.0
## [22] S4Vectors_0.30.0
## [23] Biobase_2.52.0
## [24] BiocGenerics_0.38.0
## [25] stringr_1.4.0
##
## loaded via a namespace (and not attached):
## [1] bitops_1.0-7 bit64_4.0.5 doParallel_1.0.16
## [4] filelock_1.0.2 RColorBrewer_1.1-2 progress_1.2.2
## [7] httr_1.4.2 tools_4.1.0 utf8_1.2.1
## [10] R6_2.5.0 colorspace_2.0-1 DBI_1.1.1
## [13] GetoptLong_1.0.5 tidyselect_1.1.1 prettyunits_1.1.1
## [16] bit_4.0.4 curl_4.3.1 compiler_4.1.0
## [19] graph_1.70.0 DelayedArray_0.18.0 rtracklayer_1.52.0
## [22] scales_1.1.1 genefilter_1.74.0 RBGL_1.68.0
## [25] rappdirs_0.3.3 digest_0.6.27 rmarkdown_2.11
## [28] pkgconfig_2.0.3 htmltools_0.5.1.1 highr_0.9
## [31] dbplyr_2.1.1 fastmap_1.1.0 GlobalOptions_0.1.2
## [34] rlang_0.4.11 RSQLite_2.2.7 shape_1.4.6
## [37] jquerylib_0.1.4 BiocIO_1.2.0 generics_0.1.0
## [40] BiocParallel_1.26.0 dplyr_1.0.6 RCurl_1.98-1.3
## [43] magrittr_2.0.1 GenomeInfoDbData_1.2.6 Matrix_1.3-3
## [46] munsell_0.5.0 Rcpp_1.0.6 fansi_0.5.0
## [49] lifecycle_1.0.0 stringi_1.6.2 yaml_2.2.1
## [52] zlibbioc_1.38.0 BiocFileCache_2.0.0 blob_1.2.1
## [55] crayon_1.4.1 lattice_0.20-44 splines_4.1.0
## [58] annotate_1.70.0 circlize_0.4.13 hms_1.1.0
## [61] KEGGREST_1.32.0 locfit_1.5-9.4 knitr_1.33
## [64] pillar_1.6.1 rjson_0.2.20 codetools_0.2-18
## [67] geneplotter_1.70.0 biomaRt_2.48.0 XML_3.99-0.6
## [70] glue_1.4.2 evaluate_0.14 BiocManager_1.30.15
## [73] foreach_1.5.1 png_0.1-7 vctrs_0.3.8
## [76] cellranger_1.1.0 gtable_0.3.0 purrr_0.3.4
## [79] clue_0.3-59 assertthat_0.2.1 cachem_1.0.5
## [82] ggplot2_3.3.3 xfun_0.23 xtable_1.8-4
## [85] restfulr_0.0.13 survival_3.2-11 tibble_3.1.2
## [88] iterators_1.0.13 memoise_2.0.0 cluster_2.1.2
## [91] ellipsis_0.3.2