#if (!requireNamespace("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
# install.packages("apeglm")
# BiocManager::install("BiocParallel")
# BiocManager::install("IHW")
library(apeglm)
library(BiocParallel)
## Warning: package 'BiocParallel' was built under R version 4.1.2
library(IHW)
directory <- "/Users/gaohuizi/Desktop/510-HFP/twosmallgroup"
setwd(directory)
getwd()
## [1] "/Users/gaohuizi/Desktop/510-HFP/twosmallgroup"
sampleFiles <- grep("SK",list.files(directory),value=TRUE)
#View("sampleFiles")
sampleCondition <- sub("(.*SK).*","\\1",sampleFiles)
sampleTable <- data.frame(sampleName = sampleFiles,
fileName = sampleFiles,
condition = sampleCondition)
sampleTable$condition <- factor(sampleTable$condition)
#View("sampleCondition")
library("DESeq2")
## Loading required package: S4Vectors
## Warning: package 'S4Vectors' was built under R version 4.1.2
## Loading required package: stats4
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: GenomicRanges
## Warning: package 'GenomicRanges' was built under R version 4.1.2
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
##
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
##
## colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
## colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
## colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
## colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
## colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
## colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
## colWeightedMeans, colWeightedMedians, colWeightedSds,
## colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
## rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
## rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
## rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
## rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
## rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
## rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
## rowWeightedSds, rowWeightedVars
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
##
## rowMedians
## The following objects are masked from 'package:matrixStats':
##
## anyMissing, rowMedians
ddsHTSeq <- DESeqDataSetFromHTSeqCount(sampleTable = sampleTable,
directory = directory,
design= ~ condition)
ddsHTSeq
## class: DESeqDataSet
## dim: 60483 60
## metadata(1): version
## assays(1): counts
## rownames(60483): ENSG00000000003.13 ENSG00000000005.5 ...
## ENSGR0000280767.1 ENSGR0000281849.1
## rowData names(0):
## colnames(60): NSK1 NSK10 ... SK8 SK9
## colData names(1): condition
ddsHTSeq$condition <- factor(ddsHTSeq$condition, levels = c("NSK","SK"))
ddsHTSeq$condition
## [1] NSK NSK NSK NSK NSK NSK NSK NSK NSK NSK NSK NSK NSK NSK NSK NSK NSK NSK NSK
## [20] NSK NSK NSK NSK NSK NSK NSK NSK NSK NSK NSK SK SK SK SK SK SK SK SK
## [39] SK SK SK SK SK SK SK SK SK SK SK SK SK SK SK SK SK SK SK
## [58] SK SK SK
## Levels: NSK SK
ddsHTSeq <- DESeq(ddsHTSeq)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 8687 genes
## -- DESeq argument 'minReplicatesForReplace' = 7
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
res <- results(ddsHTSeq)
res
## log2 fold change (MLE): condition SK vs NSK
## Wald test p-value: condition SK vs NSK
## DataFrame with 60483 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSG00000000003.13 3125.04995 -0.101553 0.203158 -0.499871 0.6171660
## ENSG00000000005.5 3.02716 0.107599 0.550764 0.195363 0.8451088
## ENSG00000000419.11 1483.19336 -0.242811 0.127203 -1.908843 0.0562824
## ENSG00000000457.12 890.18941 0.155545 0.127797 1.217130 0.2235548
## ENSG00000000460.15 441.31424 0.161581 0.247103 0.653902 0.5131749
## ... ... ... ... ... ...
## ENSGR0000275287.3 0 NA NA NA NA
## ENSGR0000276543.3 0 NA NA NA NA
## ENSGR0000277120.3 0 NA NA NA NA
## ENSGR0000280767.1 0 NA NA NA NA
## ENSGR0000281849.1 0 NA NA NA NA
## padj
## <numeric>
## ENSG00000000003.13 0.852917
## ENSG00000000005.5 0.947795
## ENSG00000000419.11 0.328078
## ENSG00000000457.12 0.583603
## ENSG00000000460.15 0.799798
## ... ...
## ENSGR0000275287.3 NA
## ENSGR0000276543.3 NA
## ENSGR0000277120.3 NA
## ENSGR0000280767.1 NA
## ENSGR0000281849.1 NA
res <- results(ddsHTSeq, contrast=c("condition","SK","NSK"))
res
## log2 fold change (MLE): condition SK vs NSK
## Wald test p-value: condition SK vs NSK
## DataFrame with 60483 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSG00000000003.13 3125.04995 -0.101553 0.203158 -0.499871 0.6171660
## ENSG00000000005.5 3.02716 0.107599 0.550764 0.195363 0.8451088
## ENSG00000000419.11 1483.19336 -0.242811 0.127203 -1.908843 0.0562824
## ENSG00000000457.12 890.18941 0.155545 0.127797 1.217130 0.2235548
## ENSG00000000460.15 441.31424 0.161581 0.247103 0.653902 0.5131749
## ... ... ... ... ... ...
## ENSGR0000275287.3 0 NA NA NA NA
## ENSGR0000276543.3 0 NA NA NA NA
## ENSGR0000277120.3 0 NA NA NA NA
## ENSGR0000280767.1 0 NA NA NA NA
## ENSGR0000281849.1 0 NA NA NA NA
## padj
## <numeric>
## ENSG00000000003.13 0.852917
## ENSG00000000005.5 0.947795
## ENSG00000000419.11 0.328078
## ENSG00000000457.12 0.583603
## ENSG00000000460.15 0.799798
## ... ...
## ENSGR0000275287.3 NA
## ENSGR0000276543.3 NA
## ENSGR0000277120.3 NA
## ENSGR0000280767.1 NA
## ENSGR0000281849.1 NA
resultsNames(ddsHTSeq)
## [1] "Intercept" "condition_SK_vs_NSK"
resLFC <- lfcShrink(ddsHTSeq, coef="condition_SK_vs_NSK", type="apeglm")
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
## Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
## sequence count data: removing the noise and preserving large differences.
## Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
resLFC
## log2 fold change (MAP): condition SK vs NSK
## Wald test p-value: condition SK vs NSK
## DataFrame with 60483 rows and 5 columns
## baseMean log2FoldChange lfcSE pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSG00000000003.13 3125.04995 -7.92388e-06 0.00144267 0.6171660 0.852917
## ENSG00000000005.5 3.02716 -6.01038e-06 0.00144270 0.8451088 0.947795
## ENSG00000000419.11 1483.19336 -7.02132e-05 0.00144346 0.0562824 0.328078
## ENSG00000000457.12 890.18941 1.41207e-05 0.00144264 0.2235548 0.583603
## ENSG00000000460.15 441.31424 2.83565e-06 0.00144267 0.5131749 0.799798
## ... ... ... ... ... ...
## ENSGR0000275287.3 0 NA NA NA NA
## ENSGR0000276543.3 0 NA NA NA NA
## ENSGR0000277120.3 0 NA NA NA NA
## ENSGR0000280767.1 0 NA NA NA NA
## ENSGR0000281849.1 0 NA NA NA NA
library("BiocParallel")
register(MulticoreParam(4))
resOrdered <- res[order(res$pvalue),]
summary(res)
##
## out of 55653 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up) : 1339, 2.4%
## LFC < 0 (down) : 322, 0.58%
## outliers [1] : 0, 0%
## low counts [2] : 19393, 35%
## (mean count < 1)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
sum(res$padj < 0.1, na.rm=TRUE)
## [1] 1661
res05 <- results(ddsHTSeq, alpha=0.05)
summary(res05)
##
## out of 55653 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 760, 1.4%
## LFC < 0 (down) : 175, 0.31%
## outliers [1] : 0, 0%
## low counts [2] : 20467, 37%
## (mean count < 1)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
sum(res05$padj < 0.05, na.rm=TRUE)
## [1] 935
library("IHW")
resIHW <- results(ddsHTSeq, filterFun=ihw)
summary(resIHW)
##
## out of 55653 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up) : 1696, 3%
## LFC < 0 (down) : 329, 0.59%
## outliers [1] : 0, 0%
## [1] see 'cooksCutoff' argument of ?results
## see metadata(res)$ihwResult on hypothesis weighting
sum(resIHW$padj < 0.1, na.rm=TRUE)
## [1] 2025
metadata(resIHW)$ihwResult
## ihwResult object with 60483 hypothesis tests
## Nominal FDR control level: 0.1
## Split into 37 bins, based on an ordinal covariate
plotMA(res, ylim=c(-2,2))

plotMA(resLFC, ylim=c(-2,2))

#idx <- identify(res$baseMean, res$log2FoldChange)
#rownames(res)[idx]
resultsNames(ddsHTSeq)
## [1] "Intercept" "condition_SK_vs_NSK"
resNorm <- lfcShrink(ddsHTSeq, coef=2, type="normal")
## using 'normal' for LFC shrinkage, the Normal prior from Love et al (2014).
##
## Note that type='apeglm' and type='ashr' have shown to have less bias than type='normal'.
## See ?lfcShrink for more details on shrinkage type, and the DESeq2 vignette.
## Reference: https://doi.org/10.1093/bioinformatics/bty895
resAsh <- lfcShrink(ddsHTSeq, coef=2, type="ashr")
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
## Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
## https://doi.org/10.1093/biostatistics/kxw041
par(mfrow=c(1,3), mar=c(4,4,2,1))
xlim <- c(1,1e5); ylim <- c(-3,3)
plotMA(resLFC, xlim=xlim, ylim=ylim, main="apeglm")
plotMA(resNorm, xlim=xlim, ylim=ylim, main="normal")
plotMA(resAsh, xlim=xlim, ylim=ylim, main="ashr")

plotCounts(ddsHTSeq, gene=which.min(res$padj), intgroup="condition")

d <- plotCounts(ddsHTSeq, gene=which.min(res$padj), intgroup="condition",
returnData=TRUE)
library("ggplot2")
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:IHW':
##
## alpha
ggplot(d, aes(x=condition, y=count)) +
geom_point(position=position_jitter(w=0.1,h=0)) +
scale_y_log10(breaks=c(25,100,400))

mcols(res)$description
## [1] "mean of normalized counts for all samples"
## [2] "log2 fold change (MLE): condition SK vs NSK"
## [3] "standard error: condition SK vs NSK"
## [4] "Wald statistic: condition SK vs NSK"
## [5] "Wald test p-value: condition SK vs NSK"
## [6] "BH adjusted p-values"
write.csv(as.data.frame(resOrdered),
file="condition_treated_results.csv")
resSig <- subset(resOrdered, padj < 0.1)
resSig
## log2 fold change (MLE): condition SK vs NSK
## Wald test p-value: condition SK vs NSK
## DataFrame with 1661 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSG00000075043.16 116.9112 -5.20579 0.741800 -7.01778 2.25419e-12
## ENSG00000198028.3 24.7702 -5.45361 0.797955 -6.83449 8.22998e-12
## ENSG00000198883.10 52.0476 4.60650 0.689186 6.68397 2.32549e-11
## ENSG00000202538.1 780.1032 5.94446 0.911730 6.51997 7.03198e-11
## ENSG00000108244.15 680.8237 -4.27175 0.655880 -6.51300 7.36639e-11
## ... ... ... ... ... ...
## ENSG00000275833.1 4.78618 1.152818 0.406263 2.83762 0.00454519
## ENSG00000112992.15 3674.25885 0.476019 0.167779 2.83718 0.00455146
## ENSG00000241048.1 2.23282 3.208083 1.130780 2.83705 0.00455322
## ENSG00000204021.3 14.69839 2.299613 0.810568 2.83704 0.00455340
## ENSG00000259055.1 1.88827 3.144356 1.108699 2.83608 0.00456715
## padj
## <numeric>
## ENSG00000075043.16 8.18723e-08
## ENSG00000198028.3 1.49456e-07
## ENSG00000198883.10 2.81540e-07
## ENSG00000202538.1 4.62892e-07
## ENSG00000108244.15 4.62892e-07
## ... ...
## ENSG00000275833.1 0.0996263
## ENSG00000112992.15 0.0996263
## ENSG00000241048.1 0.0996263
## ENSG00000204021.3 0.0996263
## ENSG00000259055.1 0.0998668
colData(ddsHTSeq)
## DataFrame with 60 rows and 3 columns
## condition sizeFactor replaceable
## <factor> <numeric> <logical>
## NSK1 NSK 1.505694 TRUE
## NSK10 NSK 0.630384 TRUE
## NSK11 NSK 0.979879 TRUE
## NSK12 NSK 1.177331 TRUE
## NSK13 NSK 0.614824 TRUE
## ... ... ... ...
## SK5 SK 0.979791 TRUE
## SK6 SK 0.898952 TRUE
## SK7 SK 0.786680 TRUE
## SK8 SK 1.257792 TRUE
## SK9 SK 1.609233 TRUE
#ddsMF <- ddsHTSeq
#levels(ddsMF$type)
vsd <- vst(ddsHTSeq, blind=FALSE)
head(assay(vsd), 3)
## NSK1 NSK10 NSK11 NSK12 NSK13 NSK14
## ENSG00000000003.13 10.629839 10.246565 10.653592 11.490291 9.931235 12.307884
## ENSG00000000005.5 4.055727 4.055727 4.558601 4.381082 4.505090 4.055727
## ENSG00000000419.11 11.041604 10.244680 11.091010 11.084157 10.616650 11.720351
## NSK15 NSK16 NSK17 NSK18 NSK19 NSK2
## ENSG00000000003.13 10.814780 12.20348 12.294173 12.660084 10.911129 12.985426
## ENSG00000000005.5 4.553865 8.08755 5.078974 4.513144 4.055727 5.171051
## ENSG00000000419.11 10.764180 10.32809 10.505190 10.454354 10.509104 10.827314
## NSK20 NSK21 NSK22 NSK23 NSK24 NSK25
## ENSG00000000003.13 11.433397 10.383498 11.410823 11.499521 11.739228 11.918269
## ENSG00000000005.5 4.055727 5.388742 4.831131 4.345698 4.055727 4.438132
## ENSG00000000419.11 10.244262 10.082618 10.266306 10.633186 10.127650 11.055699
## NSK26 NSK27 NSK28 NSK29 NSK3 NSK30
## ENSG00000000003.13 12.541064 10.248761 10.659039 11.279465 10.787505 12.008173
## ENSG00000000005.5 4.505203 5.066085 4.983873 4.492204 5.055465 4.481925
## ENSG00000000419.11 10.978442 10.240340 10.653378 10.328885 10.412738 10.727329
## NSK4 NSK5 NSK6 NSK7 NSK8 NSK9
## ENSG00000000003.13 11.628547 12.203363 12.146297 11.096884 11.496832 12.284463
## ENSG00000000005.5 4.055727 4.418644 4.769275 4.308518 4.055727 4.055727
## ENSG00000000419.11 9.530335 11.012188 10.676569 9.901513 11.412967 10.796537
## SK1 SK10 SK11 SK12 SK13 SK14
## ENSG00000000003.13 10.465428 12.67751 10.860296 11.350796 12.487648 11.643013
## ENSG00000000005.5 5.590728 4.62645 4.055727 4.564486 4.055727 4.767747
## ENSG00000000419.11 10.207398 10.21258 10.343061 9.546005 11.315880 9.869542
## SK15 SK16 SK17 SK18 SK19 SK2
## ENSG00000000003.13 10.981093 11.897893 10.191228 11.627883 9.979860 11.658049
## ENSG00000000005.5 4.327786 4.055727 4.055727 4.899873 4.424726 4.055727
## ENSG00000000419.11 10.996255 10.771323 9.388993 10.631706 10.060417 9.652692
## SK20 SK21 SK22 SK23 SK24 SK25
## ENSG00000000003.13 10.191619 11.890599 11.701638 11.215111 11.788175 11.061951
## ENSG00000000005.5 4.691715 4.847086 4.316406 4.347482 4.055727 4.055727
## ENSG00000000419.11 10.129510 10.924587 10.267262 10.227988 11.002766 10.388950
## SK26 SK27 SK28 SK29 SK3 SK30
## ENSG00000000003.13 11.241665 10.915631 11.718055 11.051335 11.762035 11.023705
## ENSG00000000005.5 4.486288 4.055727 4.055727 4.055727 4.762194 5.512894
## ENSG00000000419.11 9.721033 10.960501 10.463742 10.688946 10.753419 9.923269
## SK4 SK5 SK6 SK7 SK8 SK9
## ENSG00000000003.13 10.443595 10.213536 11.451066 12.290541 12.895637 12.527255
## ENSG00000000005.5 4.614486 4.985174 4.580513 5.156962 4.370546 4.674567
## ENSG00000000419.11 10.467602 10.020022 10.346851 10.597663 10.998154 10.095902
BiocManager::install("vsn")
## Bioconductor version 3.14 (BiocManager 1.30.16), R 4.1.1 (2021-08-10)
## Warning: package(s) not installed when version(s) same as current; use `force = TRUE` to
## re-install: 'vsn'
## Old packages: 'Matrix'
ntd <- normTransform(ddsHTSeq)
library("vsn")
meanSdPlot(assay(ntd))

meanSdPlot(assay(vsd))

#install.packages("pheatmap")
#install.packages("plyr")
#library("plyr")
#install.packages("stringr")
library("pheatmap")
library("stringr")
select <- order(rowMeans(counts(ddsHTSeq,normalized=TRUE)),
decreasing=TRUE)[1:20]
df <- as.data.frame(colData(ddsHTSeq)[,c("condition")])
colnames(assay(ntd)) <- str_sub(colnames(assay(ntd)), 1, 21)
rownames(df) <- colnames(assay(ntd))
pheatmap(assay(ntd)[select,],cluster_cols = F,cluster_rows = F,cellwidth = 2,cellheight = 8,fontsize_row = 9,fontsize_col = 9, annotation_col=df)

colnames(assay(vsd)) <- str_sub(colnames(assay(vsd)), 1, 21)
rownames(df) <- colnames(assay(vsd))
pheatmap(assay(vsd)[select,],cluster_cols = F,cluster_rows = F,cellwidth = 2,cellheight = 8,fontsize_row = 9,fontsize_col = 9, annotation_col=df)

sampleDists <- dist(t(assay(vsd)))
library("RColorBrewer")
sampleDistMatrix <- as.matrix(sampleDists)
rownames(sampleDistMatrix) <- paste(vsd$condition, vsd$type, sep="-")
colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix,
clustering_distance_rows=sampleDists,
clustering_distance_cols=sampleDists,
fontsize_row = 5,
col=colors)

plotPCA(vsd, intgroup=c("condition"))
