zeyuan-5/31-code

Set up and Library

#chooseCRANmirror()
#install.packages("BiocManager")
#library(BiocManager)
setwd("/Users/dongzeyuan/Desktop/TRGN_lab/")
counts <- read.table("ProstCa_030921_2.txt")
#if (!requireNamespace("BiocManager", quietly = TRUE))
    #install.packages("BiocManager")

#BiocManager::install("TxDb.Hsapiens.UCSC.hg19.knownGene")
library(TxDb.Hsapiens.UCSC.hg19.knownGene)

## 载入需要的程辑包：GenomicFeatures

## 载入需要的程辑包：BiocGenerics

## 载入需要的程辑包：parallel

## 
## 载入程辑包：'BiocGenerics'

## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB

## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs

## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min

## 载入需要的程辑包：S4Vectors

## 载入需要的程辑包：stats4

## 
## 载入程辑包：'S4Vectors'

## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname

## 载入需要的程辑包：IRanges

## 载入需要的程辑包：GenomeInfoDb

## 载入需要的程辑包：GenomicRanges

## 载入需要的程辑包：AnnotationDbi

## 载入需要的程辑包：Biobase

## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.

txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
seqlevels(txdb)

##  [1] "chr1"                  "chr2"                  "chr3"                 
##  [4] "chr4"                  "chr5"                  "chr6"                 
##  [7] "chr7"                  "chr8"                  "chr9"                 
## [10] "chr10"                 "chr11"                 "chr12"                
## [13] "chr13"                 "chr14"                 "chr15"                
## [16] "chr16"                 "chr17"                 "chr18"                
## [19] "chr19"                 "chr20"                 "chr21"                
## [22] "chr22"                 "chrX"                  "chrY"                 
## [25] "chrM"                  "chr1_gl000191_random"  "chr1_gl000192_random" 
## [28] "chr4_ctg9_hap1"        "chr4_gl000193_random"  "chr4_gl000194_random" 
## [31] "chr6_apd_hap1"         "chr6_cox_hap2"         "chr6_dbb_hap3"        
## [34] "chr6_mann_hap4"        "chr6_mcf_hap5"         "chr6_qbl_hap6"        
## [37] "chr6_ssto_hap7"        "chr7_gl000195_random"  "chr8_gl000196_random" 
## [40] "chr8_gl000197_random"  "chr9_gl000198_random"  "chr9_gl000199_random" 
## [43] "chr9_gl000200_random"  "chr9_gl000201_random"  "chr11_gl000202_random"
## [46] "chr17_ctg5_hap1"       "chr17_gl000203_random" "chr17_gl000204_random"
## [49] "chr17_gl000205_random" "chr17_gl000206_random" "chr18_gl000207_random"
## [52] "chr19_gl000208_random" "chr19_gl000209_random" "chr21_gl000210_random"
## [55] "chrUn_gl000211"        "chrUn_gl000212"        "chrUn_gl000213"       
## [58] "chrUn_gl000214"        "chrUn_gl000215"        "chrUn_gl000216"       
## [61] "chrUn_gl000217"        "chrUn_gl000218"        "chrUn_gl000219"       
## [64] "chrUn_gl000220"        "chrUn_gl000221"        "chrUn_gl000222"       
## [67] "chrUn_gl000223"        "chrUn_gl000224"        "chrUn_gl000225"       
## [70] "chrUn_gl000226"        "chrUn_gl000227"        "chrUn_gl000228"       
## [73] "chrUn_gl000229"        "chrUn_gl000230"        "chrUn_gl000231"       
## [76] "chrUn_gl000232"        "chrUn_gl000233"        "chrUn_gl000234"       
## [79] "chrUn_gl000235"        "chrUn_gl000236"        "chrUn_gl000237"       
## [82] "chrUn_gl000238"        "chrUn_gl000239"        "chrUn_gl000240"       
## [85] "chrUn_gl000241"        "chrUn_gl000242"        "chrUn_gl000243"       
## [88] "chrUn_gl000244"        "chrUn_gl000245"        "chrUn_gl000246"       
## [91] "chrUn_gl000247"        "chrUn_gl000248"        "chrUn_gl000249"

Install

library(stringr)
setwd("/Users/dongzeyuan/Desktop/TRGN_lab/")
#if (!requireNamespace("BiocManager", quietly = TRUE))
    #install.packages("BiocManager")

#BiocManager::install("Homo.sapiens")
library(Homo.sapiens)

## 载入需要的程辑包：OrganismDbi

## 载入需要的程辑包：GO.db

##

## 载入需要的程辑包：org.Hs.eg.db

##

Convert geneid

geneid <- rownames(counts) 
gene <- str_match(geneid, "(\\w*).*")
geneid <- gene[,2]
geneid2<-geneid
geneid2=data.frame(geneid2)
counts<-cbind(counts,geneid2)

genes <- select(Homo.sapiens, keys=geneid,
                columns=c("SYMBOL","TXCHROM"),
                keytype="ENSEMBL")

## 'select()' returned many:many mapping between keys and columns

Get rid of duplicate genes

#dim(genes)
genes <- genes[!duplicated(genes$ENSEMBL),]
counts<-counts[!duplicated(counts$geneid2),]
#counts_2<-counts[which(counts$geneid2 %in% genes$ENSEMBL),]

row.names(counts)<-counts$geneid2

seqlevels(txdb) <- c("chr14")
genelist <- genes(txdb)
txlist <- transcripts(txdb)
exonlist <- exons(txdb)

library(GenomicAlignments)

## 载入需要的程辑包：SummarizedExperiment

## 载入需要的程辑包：MatrixGenerics

## 载入需要的程辑包：matrixStats

## 
## 载入程辑包：'matrixStats'

## The following objects are masked from 'package:Biobase':
## 
##     anyMissing, rowMedians

## 
## 载入程辑包：'MatrixGenerics'

## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars

## The following object is masked from 'package:Biobase':
## 
##     rowMedians

## 载入需要的程辑包：Biostrings

## 载入需要的程辑包：XVector

## 
## 载入程辑包：'Biostrings'

## The following object is masked from 'package:base':
## 
##     strsplit

## 载入需要的程辑包：Rsamtools

Render the results of the conversion in counts and filter out duplicate genes

counts<-cbind(counts,genes$SYMBOL)

counts_4<-na.omit(counts)
counts_4<-counts_4[!duplicated(counts_4$`genes$SYMBOL`),]
row.names(counts_4)<-counts_4$`genes$SYMBOL`
counts_ready<-counts_4[,c(1,2,3,4,5,6)] #use counts_ready
counts<-counts_ready

Group and build a matrix that formally processes the data

counts_filtered<-counts[1:6]
colnames(counts_filtered)<-c("C1_1","C1_2","C1_3","T1_1","T1_2","T1_3")
coldata=data.frame(row.names=c("C1_1","C1_2","C1_3","T1_1","T1_2","T1_3"),
                   condition=rep(c("C1","T1"),each=3),
                   treatment=rep(c("C1","T1"),each=3))
coldata$condition<-factor(coldata$condition)
coldata$treatment<-factor(coldata$treatment)

Install Deseq2

#if (!requireNamespace("BiocManager", quietly = TRUE))
    #install.packages("BiocManager")

#BiocManager::install("DESeq2")
library(DESeq2)
dds <- DESeqDataSetFromMatrix(countData = counts_filtered,
                              colData = coldata,
                              design = ~ condition)

Pre-Filtering

keep <- rowSums(counts(dds)) >= 10
dds <- dds[keep,]

Note on Factor Levels

dds$condition <- factor(dds$condition, levels = c("C1","T1"))

Differential expression analysis

dds <- DESeq(dds)

## estimating size factors

## estimating dispersions

## gene-wise dispersion estimates

## mean-dispersion relationship

## final dispersion estimates

## fitting model and testing

res <- results(dds)
res

## log2 fold change (MLE): condition T1 vs C1 
## Wald test p-value: condition T1 vs C1 
## DataFrame with 29912 rows and 6 columns
##                baseMean log2FoldChange     lfcSE       stat    pvalue      padj
##               <numeric>      <numeric> <numeric>  <numeric> <numeric> <numeric>
## TSPAN6        1107.8741      0.0325524  0.325711  0.0999427  0.920390  0.998905
## TNMD            32.8382      1.4031560  1.356203  1.0346212  0.300846  0.971182
## DPM1           473.0177      0.0528333  0.619625  0.0852666  0.932049  0.999246
## SCYL3         1171.0906     -0.1550594  0.417280 -0.3715957  0.710194  0.994762
## C1orf112       321.5382     -0.2140606  0.496694 -0.4309708  0.666490  0.994762
## ...                 ...            ...       ...        ...       ...       ...
## BMP8B-AS1      10.53852      -0.582843  3.484002 -0.1672914  0.867141  0.996308
## H2AL1SP         4.26468      -2.997713  3.846888 -0.7792566  0.435829        NA
## NIPBL-DT     1361.89229      -0.269032  0.465077 -0.5784681  0.562948  0.988494
## CERNA2         29.84232      -0.150351  1.893759 -0.0793926  0.936720  0.999246
## LOC100996886   32.40953       5.663064  2.655310  2.1327323        NA        NA

res <- results(dds, contrast=c("condition","T1","C1"))

resultsNames(dds)

## [1] "Intercept"          "condition_T1_vs_C1"

#if (!requireNamespace("BiocManager", quietly = TRUE))
    #install.packages("BiocManager")

#BiocManager::install("apeglm")
library(apeglm)

#install.packages("emdbook")
resLFC <- lfcShrink(dds, coef="condition_T1_vs_C1", type="apeglm")

## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895

resLFC

## log2 fold change (MAP): condition T1 vs C1 
## Wald test p-value: condition T1 vs C1 
## DataFrame with 29912 rows and 5 columns
##                baseMean log2FoldChange     lfcSE    pvalue      padj
##               <numeric>      <numeric> <numeric> <numeric> <numeric>
## TSPAN6        1107.8741    0.001487557 0.0684490  0.920390  0.998905
## TNMD            32.8382    0.003521716 0.0700459  0.300846  0.971182
## DPM1           473.0177    0.000797669 0.0695532  0.932049  0.999246
## SCYL3         1171.0906   -0.004260059 0.0692136  0.710194  0.994762
## C1orf112       321.5382   -0.004050157 0.0694818  0.666490  0.994762
## ...                 ...            ...       ...       ...       ...
## BMP8B-AS1      10.53852   -0.000217798 0.0699759  0.867141  0.996308
## H2AL1SP         4.26468   -0.000759203 0.0699901  0.435829        NA
## NIPBL-DT     1361.89229   -0.005912317 0.0695784  0.562948  0.988494
## CERNA2         29.84232   -0.000177364 0.0699416  0.936720  0.999246
## LOC100996886   32.40953    0.002115149 0.0700345        NA        NA

p-values and adjusted p-values

resOrdered <- res[order(res$pvalue),]

summary(res)

## 
## out of 29912 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)       : 65, 0.22%
## LFC < 0 (down)     : 8, 0.027%
## outliers [1]       : 1970, 6.6%
## low counts [2]     : 2320, 7.8%
## (mean count < 5)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

sum(res$padj < 0.1, na.rm=TRUE)

## [1] 73

res05 <- results(dds, alpha=0.05) #Pvalue cutoff 0.05
summary(res05)

## 
## out of 29912 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 37, 0.12%
## LFC < 0 (down)     : 5, 0.017%
## outliers [1]       : 1970, 6.6%
## low counts [2]     : 0, 0%
## (mean count < 1)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

sum(res05$padj < 0.05, na.rm=TRUE)

## [1] 42

MA-Plot

plotMA(res, ylim=c(-2,2))

plotMA(resLFC, ylim=c(-2,2)) #With Log2 fold changes

### Alternative shrinkage estimators# To look for other choice

#install.packages("ashr")
resultsNames(dds)

## [1] "Intercept"          "condition_T1_vs_C1"

resNorm <- lfcShrink(dds, coef=2, type="normal")

## using 'normal' for LFC shrinkage, the Normal prior from Love et al (2014).
## 
## Note that type='apeglm' and type='ashr' have shown to have less bias than type='normal'.
## See ?lfcShrink for more details on shrinkage type, and the DESeq2 vignette.
## Reference: https://doi.org/10.1093/bioinformatics/bty895

resAsh <- lfcShrink(dds, coef=2, type="ashr")

## using 'ashr' for LFC shrinkage. If used in published research, please cite:
##     Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
##     https://doi.org/10.1093/biostatistics/kxw041

par(mfrow=c(1,3), mar=c(4,4,2,1))
xlim <- c(1,1e5); ylim <- c(-3,3)
plotMA(resLFC, xlim=xlim, ylim=ylim, main="apeglm")
plotMA(resNorm, xlim=xlim, ylim=ylim, main="normal")
plotMA(resAsh, xlim=xlim, ylim=ylim, main="ashr")

Plot Counts

plotCounts(dds, gene=which.min(res$padj), intgroup="condition")

Extracting transformed values

vsd <- vst(dds, blind=FALSE)
rld <- rlog(dds, blind=FALSE)
head(assay(vsd), 3)

##             C1_1      C1_2      C1_3      T1_1      T1_2      T1_3
## TSPAN6 10.486297 10.707607 10.736549 10.639462 10.616665 10.753322
## TNMD    8.507890  8.503406  8.453807  8.562585  8.990775  8.436141
## DPM1    9.641772  9.944028 10.009209  9.943590  9.213472 10.309385

Effects of transformations on the variance #Compare two conditions

#if (!requireNamespace("BiocManager", quietly = TRUE))
    #install.packages("BiocManager")

#BiocManager::install("vsn")
#install.packages("hexbin")
# This will give log2(n + 1)
ntd <- normTransform(dds)

library("vsn")
meanSdPlot(assay(ntd))

### VarianceStabilizingTransformation

meanSdPlot(assay(vsd))

This function transforms the count data to the log2 scale in a way which minimizes differences between samples for rows with small counts, and which normalizes with respect to library size.

meanSdPlot(assay(rld))

Heatmap of the count matrix

#if (!requireNamespace("BiocManager", quietly = TRUE))
    #install.packages("BiocManager")

#BiocManager::install("heatmaps")
#install.packages("pheatmap")
library("pheatmap")
select <- order(rowMeans(counts(dds,normalized=TRUE)),
                decreasing=TRUE)[1:40]
df <- as.data.frame(colData(dds)[,c("condition","treatment")])
pheatmap(assay(ntd)[select,], cluster_rows=FALSE, show_rownames=FALSE,
         cluster_cols=FALSE, annotation_col=df)

pheatmap(assay(vsd)[select,], cluster_rows=FALSE, show_rownames=TRUE,
         cluster_cols=FALSE, annotation_col=df)

Heatmap of the sample-to-sample distances

sampleDists <- dist(t(assay(vsd)))

library("RColorBrewer")
sampleDistMatrix <- as.matrix(sampleDists)
rownames(sampleDistMatrix) <- paste(vsd$condition, vsd$type, sep="-")
colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix,
         clustering_distance_rows=sampleDists,
         clustering_distance_cols=sampleDists,
         col=colors)

Get down and up genes from Res05

down <- subset(res05, log2FoldChange < 0)

up <- subset(res05, log2FoldChange > 0)

Get Top10 Gene(down)

down_matrix<-data.frame(down)
down_matrix_in_order<-down_matrix[order(down_matrix$padj),]
head(down_matrix_in_order,10)

##             baseMean log2FoldChange     lfcSE      stat       pvalue
## HSPB6      6923.2898      -1.730502 0.3386476 -5.110037 3.220960e-07
## TPSAB1      742.1138      -1.877489 0.4057341 -4.627387 3.703082e-06
## LDB3       3353.0830      -1.353514 0.3008800 -4.498519 6.842845e-06
## RNA5SP321   590.0383      -3.644505 0.8591623 -4.241928 2.216078e-05
## RBPMS      9681.6183      -1.176493 0.2831198 -4.155459 3.246347e-05
## MYOCD      3079.7012      -1.396420 0.3698119 -3.776029 1.593484e-04
## PDLIM7     7936.5805      -1.139134 0.3096481 -3.678804 2.343305e-04
## ACOT11     3715.9475      -1.063013 0.2921382 -3.638732 2.739834e-04
## SORBS1    22282.7956      -1.116812 0.3082320 -3.623284 2.908866e-04
## SMTN      12921.3072      -1.067069 0.2981782 -3.578627 3.454036e-04
##                   padj
## HSPB6     0.0006923083
## TPSAB1    0.0054458691
## LDB3      0.0076481115
## RNA5SP321 0.0206405467
## RBPMS     0.0283466996
## MYOCD     0.0824539325
## PDLIM7    0.0992069926
## ACOT11    0.1088382321
## SORBS1    0.1093975285
## SMTN      0.1209400312

Get Top10 Gene(up)

up_matrix<-data.frame(up)
up_matrix_in_order<-up_matrix[order(up_matrix$padj),]
head(up_matrix_in_order,10)

##             baseMean log2FoldChange     lfcSE     stat       pvalue
## CCL26       14.00997      20.793362 3.9104145 5.317432 1.052422e-07
## BEND4     1569.35620       1.852659 0.3228705 5.738087 9.575185e-09
## RNU6-744P   15.81842      20.867645 3.9099994 5.336994 9.449995e-08
## RPSAP23     14.00997      20.793362 3.9104145 5.317432 1.052422e-07
## MBD3L2      14.42202      20.831246 3.9103101 5.327262 9.970435e-08
## PLSCR5      14.42202      20.831246 3.9103101 5.327262 9.970435e-08
## LINC00383   14.42202      20.831246 3.9103101 5.327262 9.970435e-08
## RNU6-731P   14.00997      20.793362 3.9104145 5.317432 1.052422e-07
## PPP1R1AP1   14.21599      20.815209 3.9103616 5.323091 1.020189e-07
## TP53TG3GP   14.42202      20.831246 3.9103101 5.327262 9.970435e-08
##                   padj
## CCL26     0.0002673344
## BEND4     0.0002673344
## RNU6-744P 0.0002673344
## RPSAP23   0.0002673344
## MBD3L2    0.0002673344
## PLSCR5    0.0002673344
## LINC00383 0.0002673344
## RNU6-731P 0.0002673344
## PPP1R1AP1 0.0002673344
## TP53TG3GP 0.0002673344

zeyuan-5/31-code

Zeyuan Dong

5/31/2021

Set up and Library

Install

Convert geneid

Get rid of duplicate genes

Render the results of the conversion in counts and filter out duplicate genes

Group and build a matrix that formally processes the data

Install Deseq2

Pre-Filtering

Note on Factor Levels

Differential expression analysis

p-values and adjusted p-values

MA-Plot

Plot Counts

Extracting transformed values

Effects of transformations on the variance #Compare two conditions

This function transforms the count data to the log2 scale in a way which minimizes differences between samples for rows with small counts, and which normalizes with respect to library size.

Heatmap of the count matrix

Heatmap of the sample-to-sample distances

Get down and up genes from Res05

Get Top10 Gene(down)

Get Top10 Gene(up)