#chooseCRANmirror()
#install.packages("BiocManager")
#library(BiocManager)
setwd("/Users/dongzeyuan/Desktop/TRGN_lab/")
counts <- read.table("ProstCa_030921_2.txt")
#if (!requireNamespace("BiocManager", quietly = TRUE))
#install.packages("BiocManager")
#BiocManager::install("TxDb.Hsapiens.UCSC.hg19.knownGene")
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
## 载入需要的程辑包:GenomicFeatures
## 载入需要的程辑包:BiocGenerics
## 载入需要的程辑包:parallel
##
## 载入程辑包:'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
## 载入需要的程辑包:S4Vectors
## 载入需要的程辑包:stats4
##
## 载入程辑包:'S4Vectors'
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## 载入需要的程辑包:IRanges
## 载入需要的程辑包:GenomeInfoDb
## 载入需要的程辑包:GenomicRanges
## 载入需要的程辑包:AnnotationDbi
## 载入需要的程辑包:Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
seqlevels(txdb)
## [1] "chr1" "chr2" "chr3"
## [4] "chr4" "chr5" "chr6"
## [7] "chr7" "chr8" "chr9"
## [10] "chr10" "chr11" "chr12"
## [13] "chr13" "chr14" "chr15"
## [16] "chr16" "chr17" "chr18"
## [19] "chr19" "chr20" "chr21"
## [22] "chr22" "chrX" "chrY"
## [25] "chrM" "chr1_gl000191_random" "chr1_gl000192_random"
## [28] "chr4_ctg9_hap1" "chr4_gl000193_random" "chr4_gl000194_random"
## [31] "chr6_apd_hap1" "chr6_cox_hap2" "chr6_dbb_hap3"
## [34] "chr6_mann_hap4" "chr6_mcf_hap5" "chr6_qbl_hap6"
## [37] "chr6_ssto_hap7" "chr7_gl000195_random" "chr8_gl000196_random"
## [40] "chr8_gl000197_random" "chr9_gl000198_random" "chr9_gl000199_random"
## [43] "chr9_gl000200_random" "chr9_gl000201_random" "chr11_gl000202_random"
## [46] "chr17_ctg5_hap1" "chr17_gl000203_random" "chr17_gl000204_random"
## [49] "chr17_gl000205_random" "chr17_gl000206_random" "chr18_gl000207_random"
## [52] "chr19_gl000208_random" "chr19_gl000209_random" "chr21_gl000210_random"
## [55] "chrUn_gl000211" "chrUn_gl000212" "chrUn_gl000213"
## [58] "chrUn_gl000214" "chrUn_gl000215" "chrUn_gl000216"
## [61] "chrUn_gl000217" "chrUn_gl000218" "chrUn_gl000219"
## [64] "chrUn_gl000220" "chrUn_gl000221" "chrUn_gl000222"
## [67] "chrUn_gl000223" "chrUn_gl000224" "chrUn_gl000225"
## [70] "chrUn_gl000226" "chrUn_gl000227" "chrUn_gl000228"
## [73] "chrUn_gl000229" "chrUn_gl000230" "chrUn_gl000231"
## [76] "chrUn_gl000232" "chrUn_gl000233" "chrUn_gl000234"
## [79] "chrUn_gl000235" "chrUn_gl000236" "chrUn_gl000237"
## [82] "chrUn_gl000238" "chrUn_gl000239" "chrUn_gl000240"
## [85] "chrUn_gl000241" "chrUn_gl000242" "chrUn_gl000243"
## [88] "chrUn_gl000244" "chrUn_gl000245" "chrUn_gl000246"
## [91] "chrUn_gl000247" "chrUn_gl000248" "chrUn_gl000249"
library(stringr)
setwd("/Users/dongzeyuan/Desktop/TRGN_lab/")
#if (!requireNamespace("BiocManager", quietly = TRUE))
#install.packages("BiocManager")
#BiocManager::install("Homo.sapiens")
library(Homo.sapiens)
## 载入需要的程辑包:OrganismDbi
## 载入需要的程辑包:GO.db
##
## 载入需要的程辑包:org.Hs.eg.db
##
geneid <- rownames(counts)
gene <- str_match(geneid, "(\\w*).*")
geneid <- gene[,2]
geneid2<-geneid
geneid2=data.frame(geneid2)
counts<-cbind(counts,geneid2)
genes <- select(Homo.sapiens, keys=geneid,
columns=c("SYMBOL","TXCHROM"),
keytype="ENSEMBL")
## 'select()' returned many:many mapping between keys and columns
#dim(genes)
genes <- genes[!duplicated(genes$ENSEMBL),]
counts<-counts[!duplicated(counts$geneid2),]
#counts_2<-counts[which(counts$geneid2 %in% genes$ENSEMBL),]
row.names(counts)<-counts$geneid2
seqlevels(txdb) <- c("chr14")
genelist <- genes(txdb)
txlist <- transcripts(txdb)
exonlist <- exons(txdb)
library(GenomicAlignments)
## 载入需要的程辑包:SummarizedExperiment
## 载入需要的程辑包:MatrixGenerics
## 载入需要的程辑包:matrixStats
##
## 载入程辑包:'matrixStats'
## The following objects are masked from 'package:Biobase':
##
## anyMissing, rowMedians
##
## 载入程辑包:'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
##
## colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
## colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
## colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
## colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
## colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
## colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
## colWeightedMeans, colWeightedMedians, colWeightedSds,
## colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
## rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
## rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
## rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
## rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
## rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
## rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
## rowWeightedSds, rowWeightedVars
## The following object is masked from 'package:Biobase':
##
## rowMedians
## 载入需要的程辑包:Biostrings
## 载入需要的程辑包:XVector
##
## 载入程辑包:'Biostrings'
## The following object is masked from 'package:base':
##
## strsplit
## 载入需要的程辑包:Rsamtools
counts<-cbind(counts,genes$SYMBOL)
counts_4<-na.omit(counts)
counts_4<-counts_4[!duplicated(counts_4$`genes$SYMBOL`),]
row.names(counts_4)<-counts_4$`genes$SYMBOL`
counts_ready<-counts_4[,c(1,2,3,4,5,6)] #use counts_ready
counts<-counts_ready
counts_filtered<-counts[1:6]
colnames(counts_filtered)<-c("C1_1","C1_2","C1_3","T1_1","T1_2","T1_3")
coldata=data.frame(row.names=c("C1_1","C1_2","C1_3","T1_1","T1_2","T1_3"),
condition=rep(c("C1","T1"),each=3),
treatment=rep(c("C1","T1"),each=3))
coldata$condition<-factor(coldata$condition)
coldata$treatment<-factor(coldata$treatment)
#if (!requireNamespace("BiocManager", quietly = TRUE))
#install.packages("BiocManager")
#BiocManager::install("DESeq2")
library(DESeq2)
dds <- DESeqDataSetFromMatrix(countData = counts_filtered,
colData = coldata,
design = ~ condition)
keep <- rowSums(counts(dds)) >= 10
dds <- dds[keep,]
dds$condition <- factor(dds$condition, levels = c("C1","T1"))
dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
res <- results(dds)
res
## log2 fold change (MLE): condition T1 vs C1
## Wald test p-value: condition T1 vs C1
## DataFrame with 29912 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
## TSPAN6 1107.8741 0.0325524 0.325711 0.0999427 0.920390 0.998905
## TNMD 32.8382 1.4031560 1.356203 1.0346212 0.300846 0.971182
## DPM1 473.0177 0.0528333 0.619625 0.0852666 0.932049 0.999246
## SCYL3 1171.0906 -0.1550594 0.417280 -0.3715957 0.710194 0.994762
## C1orf112 321.5382 -0.2140606 0.496694 -0.4309708 0.666490 0.994762
## ... ... ... ... ... ... ...
## BMP8B-AS1 10.53852 -0.582843 3.484002 -0.1672914 0.867141 0.996308
## H2AL1SP 4.26468 -2.997713 3.846888 -0.7792566 0.435829 NA
## NIPBL-DT 1361.89229 -0.269032 0.465077 -0.5784681 0.562948 0.988494
## CERNA2 29.84232 -0.150351 1.893759 -0.0793926 0.936720 0.999246
## LOC100996886 32.40953 5.663064 2.655310 2.1327323 NA NA
res <- results(dds, contrast=c("condition","T1","C1"))
resultsNames(dds)
## [1] "Intercept" "condition_T1_vs_C1"
#if (!requireNamespace("BiocManager", quietly = TRUE))
#install.packages("BiocManager")
#BiocManager::install("apeglm")
library(apeglm)
#install.packages("emdbook")
resLFC <- lfcShrink(dds, coef="condition_T1_vs_C1", type="apeglm")
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
## Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
## sequence count data: removing the noise and preserving large differences.
## Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
resLFC
## log2 fold change (MAP): condition T1 vs C1
## Wald test p-value: condition T1 vs C1
## DataFrame with 29912 rows and 5 columns
## baseMean log2FoldChange lfcSE pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric>
## TSPAN6 1107.8741 0.001487557 0.0684490 0.920390 0.998905
## TNMD 32.8382 0.003521716 0.0700459 0.300846 0.971182
## DPM1 473.0177 0.000797669 0.0695532 0.932049 0.999246
## SCYL3 1171.0906 -0.004260059 0.0692136 0.710194 0.994762
## C1orf112 321.5382 -0.004050157 0.0694818 0.666490 0.994762
## ... ... ... ... ... ...
## BMP8B-AS1 10.53852 -0.000217798 0.0699759 0.867141 0.996308
## H2AL1SP 4.26468 -0.000759203 0.0699901 0.435829 NA
## NIPBL-DT 1361.89229 -0.005912317 0.0695784 0.562948 0.988494
## CERNA2 29.84232 -0.000177364 0.0699416 0.936720 0.999246
## LOC100996886 32.40953 0.002115149 0.0700345 NA NA
resOrdered <- res[order(res$pvalue),]
summary(res)
##
## out of 29912 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up) : 65, 0.22%
## LFC < 0 (down) : 8, 0.027%
## outliers [1] : 1970, 6.6%
## low counts [2] : 2320, 7.8%
## (mean count < 5)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
sum(res$padj < 0.1, na.rm=TRUE)
## [1] 73
res05 <- results(dds, alpha=0.05) #Pvalue cutoff 0.05
summary(res05)
##
## out of 29912 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 37, 0.12%
## LFC < 0 (down) : 5, 0.017%
## outliers [1] : 1970, 6.6%
## low counts [2] : 0, 0%
## (mean count < 1)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
sum(res05$padj < 0.05, na.rm=TRUE)
## [1] 42
plotMA(res, ylim=c(-2,2))
plotMA(resLFC, ylim=c(-2,2)) #With Log2 fold changes
### Alternative shrinkage estimators# To look for other choice
#install.packages("ashr")
resultsNames(dds)
## [1] "Intercept" "condition_T1_vs_C1"
resNorm <- lfcShrink(dds, coef=2, type="normal")
## using 'normal' for LFC shrinkage, the Normal prior from Love et al (2014).
##
## Note that type='apeglm' and type='ashr' have shown to have less bias than type='normal'.
## See ?lfcShrink for more details on shrinkage type, and the DESeq2 vignette.
## Reference: https://doi.org/10.1093/bioinformatics/bty895
resAsh <- lfcShrink(dds, coef=2, type="ashr")
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
## Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
## https://doi.org/10.1093/biostatistics/kxw041
par(mfrow=c(1,3), mar=c(4,4,2,1))
xlim <- c(1,1e5); ylim <- c(-3,3)
plotMA(resLFC, xlim=xlim, ylim=ylim, main="apeglm")
plotMA(resNorm, xlim=xlim, ylim=ylim, main="normal")
plotMA(resAsh, xlim=xlim, ylim=ylim, main="ashr")
plotCounts(dds, gene=which.min(res$padj), intgroup="condition")
vsd <- vst(dds, blind=FALSE)
rld <- rlog(dds, blind=FALSE)
head(assay(vsd), 3)
## C1_1 C1_2 C1_3 T1_1 T1_2 T1_3
## TSPAN6 10.486297 10.707607 10.736549 10.639462 10.616665 10.753322
## TNMD 8.507890 8.503406 8.453807 8.562585 8.990775 8.436141
## DPM1 9.641772 9.944028 10.009209 9.943590 9.213472 10.309385
#if (!requireNamespace("BiocManager", quietly = TRUE))
#install.packages("BiocManager")
#BiocManager::install("vsn")
#install.packages("hexbin")
# This will give log2(n + 1)
ntd <- normTransform(dds)
library("vsn")
meanSdPlot(assay(ntd))
### VarianceStabilizingTransformation
meanSdPlot(assay(vsd))
meanSdPlot(assay(rld))
#if (!requireNamespace("BiocManager", quietly = TRUE))
#install.packages("BiocManager")
#BiocManager::install("heatmaps")
#install.packages("pheatmap")
library("pheatmap")
select <- order(rowMeans(counts(dds,normalized=TRUE)),
decreasing=TRUE)[1:40]
df <- as.data.frame(colData(dds)[,c("condition","treatment")])
pheatmap(assay(ntd)[select,], cluster_rows=FALSE, show_rownames=FALSE,
cluster_cols=FALSE, annotation_col=df)
pheatmap(assay(vsd)[select,], cluster_rows=FALSE, show_rownames=TRUE,
cluster_cols=FALSE, annotation_col=df)
sampleDists <- dist(t(assay(vsd)))
library("RColorBrewer")
sampleDistMatrix <- as.matrix(sampleDists)
rownames(sampleDistMatrix) <- paste(vsd$condition, vsd$type, sep="-")
colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix,
clustering_distance_rows=sampleDists,
clustering_distance_cols=sampleDists,
col=colors)
down <- subset(res05, log2FoldChange < 0)
up <- subset(res05, log2FoldChange > 0)
down_matrix<-data.frame(down)
down_matrix_in_order<-down_matrix[order(down_matrix$padj),]
head(down_matrix_in_order,10)
## baseMean log2FoldChange lfcSE stat pvalue
## HSPB6 6923.2898 -1.730502 0.3386476 -5.110037 3.220960e-07
## TPSAB1 742.1138 -1.877489 0.4057341 -4.627387 3.703082e-06
## LDB3 3353.0830 -1.353514 0.3008800 -4.498519 6.842845e-06
## RNA5SP321 590.0383 -3.644505 0.8591623 -4.241928 2.216078e-05
## RBPMS 9681.6183 -1.176493 0.2831198 -4.155459 3.246347e-05
## MYOCD 3079.7012 -1.396420 0.3698119 -3.776029 1.593484e-04
## PDLIM7 7936.5805 -1.139134 0.3096481 -3.678804 2.343305e-04
## ACOT11 3715.9475 -1.063013 0.2921382 -3.638732 2.739834e-04
## SORBS1 22282.7956 -1.116812 0.3082320 -3.623284 2.908866e-04
## SMTN 12921.3072 -1.067069 0.2981782 -3.578627 3.454036e-04
## padj
## HSPB6 0.0006923083
## TPSAB1 0.0054458691
## LDB3 0.0076481115
## RNA5SP321 0.0206405467
## RBPMS 0.0283466996
## MYOCD 0.0824539325
## PDLIM7 0.0992069926
## ACOT11 0.1088382321
## SORBS1 0.1093975285
## SMTN 0.1209400312
up_matrix<-data.frame(up)
up_matrix_in_order<-up_matrix[order(up_matrix$padj),]
head(up_matrix_in_order,10)
## baseMean log2FoldChange lfcSE stat pvalue
## CCL26 14.00997 20.793362 3.9104145 5.317432 1.052422e-07
## BEND4 1569.35620 1.852659 0.3228705 5.738087 9.575185e-09
## RNU6-744P 15.81842 20.867645 3.9099994 5.336994 9.449995e-08
## RPSAP23 14.00997 20.793362 3.9104145 5.317432 1.052422e-07
## MBD3L2 14.42202 20.831246 3.9103101 5.327262 9.970435e-08
## PLSCR5 14.42202 20.831246 3.9103101 5.327262 9.970435e-08
## LINC00383 14.42202 20.831246 3.9103101 5.327262 9.970435e-08
## RNU6-731P 14.00997 20.793362 3.9104145 5.317432 1.052422e-07
## PPP1R1AP1 14.21599 20.815209 3.9103616 5.323091 1.020189e-07
## TP53TG3GP 14.42202 20.831246 3.9103101 5.327262 9.970435e-08
## padj
## CCL26 0.0002673344
## BEND4 0.0002673344
## RNU6-744P 0.0002673344
## RPSAP23 0.0002673344
## MBD3L2 0.0002673344
## PLSCR5 0.0002673344
## LINC00383 0.0002673344
## RNU6-731P 0.0002673344
## PPP1R1AP1 0.0002673344
## TP53TG3GP 0.0002673344