milestone_510.knit

#if (!requireNamespace("BiocManager", quietly = TRUE))
#     install.packages("BiocManager")
#     install.packages("apeglm")
#     BiocManager::install("BiocParallel")
#     BiocManager::install("IHW")

library(apeglm)
library(BiocParallel)

## Warning: package 'BiocParallel' was built under R version 4.1.2

library(IHW)

directory <- "/Users/gaohuizi/Desktop/510-HFP/twosmallgroup"

setwd(directory)
getwd()

## [1] "/Users/gaohuizi/Desktop/510-HFP/twosmallgroup"

sampleFiles <- grep("SK",list.files(directory),value=TRUE)

#View("sampleFiles")

sampleCondition <- sub("(.*SK).*","\\1",sampleFiles)

sampleTable <- data.frame(sampleName = sampleFiles,
                          fileName = sampleFiles,
                          condition = sampleCondition)
sampleTable$condition <- factor(sampleTable$condition)

#View("sampleCondition")

library("DESeq2")

## Loading required package: S4Vectors

## Warning: package 'S4Vectors' was built under R version 4.1.2

## Loading required package: stats4

## Loading required package: BiocGenerics

## 
## Attaching package: 'BiocGenerics'

## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs

## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min

## 
## Attaching package: 'S4Vectors'

## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname

## Loading required package: IRanges

## Loading required package: GenomicRanges

## Warning: package 'GenomicRanges' was built under R version 4.1.2

## Loading required package: GenomeInfoDb

## Loading required package: SummarizedExperiment

## Loading required package: MatrixGenerics

## Loading required package: matrixStats

## 
## Attaching package: 'MatrixGenerics'

## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars

## Loading required package: Biobase

## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.

## 
## Attaching package: 'Biobase'

## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians

## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians

ddsHTSeq <- DESeqDataSetFromHTSeqCount(sampleTable = sampleTable,
                                       directory = directory,
                                       design= ~ condition)
ddsHTSeq

## class: DESeqDataSet 
## dim: 60483 60 
## metadata(1): version
## assays(1): counts
## rownames(60483): ENSG00000000003.13 ENSG00000000005.5 ...
##   ENSGR0000280767.1 ENSGR0000281849.1
## rowData names(0):
## colnames(60): NSK1 NSK10 ... SK8 SK9
## colData names(1): condition

ddsHTSeq$condition <- factor(ddsHTSeq$condition, levels = c("NSK","SK"))
ddsHTSeq$condition

##  [1] NSK NSK NSK NSK NSK NSK NSK NSK NSK NSK NSK NSK NSK NSK NSK NSK NSK NSK NSK
## [20] NSK NSK NSK NSK NSK NSK NSK NSK NSK NSK NSK SK  SK  SK  SK  SK  SK  SK  SK 
## [39] SK  SK  SK  SK  SK  SK  SK  SK  SK  SK  SK  SK  SK  SK  SK  SK  SK  SK  SK 
## [58] SK  SK  SK 
## Levels: NSK SK

ddsHTSeq <- DESeq(ddsHTSeq)

## estimating size factors

## estimating dispersions

## gene-wise dispersion estimates

## mean-dispersion relationship

## final dispersion estimates

## fitting model and testing

## -- replacing outliers and refitting for 8687 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)

## estimating dispersions

## fitting model and testing

res <- results(ddsHTSeq)
res

## log2 fold change (MLE): condition SK vs NSK 
## Wald test p-value: condition SK vs NSK 
## DataFrame with 60483 rows and 6 columns
##                      baseMean log2FoldChange     lfcSE      stat    pvalue
##                     <numeric>      <numeric> <numeric> <numeric> <numeric>
## ENSG00000000003.13 3125.04995      -0.101553  0.203158 -0.499871 0.6171660
## ENSG00000000005.5     3.02716       0.107599  0.550764  0.195363 0.8451088
## ENSG00000000419.11 1483.19336      -0.242811  0.127203 -1.908843 0.0562824
## ENSG00000000457.12  890.18941       0.155545  0.127797  1.217130 0.2235548
## ENSG00000000460.15  441.31424       0.161581  0.247103  0.653902 0.5131749
## ...                       ...            ...       ...       ...       ...
## ENSGR0000275287.3           0             NA        NA        NA        NA
## ENSGR0000276543.3           0             NA        NA        NA        NA
## ENSGR0000277120.3           0             NA        NA        NA        NA
## ENSGR0000280767.1           0             NA        NA        NA        NA
## ENSGR0000281849.1           0             NA        NA        NA        NA
##                         padj
##                    <numeric>
## ENSG00000000003.13  0.852917
## ENSG00000000005.5   0.947795
## ENSG00000000419.11  0.328078
## ENSG00000000457.12  0.583603
## ENSG00000000460.15  0.799798
## ...                      ...
## ENSGR0000275287.3         NA
## ENSGR0000276543.3         NA
## ENSGR0000277120.3         NA
## ENSGR0000280767.1         NA
## ENSGR0000281849.1         NA

res <- results(ddsHTSeq, contrast=c("condition","SK","NSK"))
res

## log2 fold change (MLE): condition SK vs NSK 
## Wald test p-value: condition SK vs NSK 
## DataFrame with 60483 rows and 6 columns
##                      baseMean log2FoldChange     lfcSE      stat    pvalue
##                     <numeric>      <numeric> <numeric> <numeric> <numeric>
## ENSG00000000003.13 3125.04995      -0.101553  0.203158 -0.499871 0.6171660
## ENSG00000000005.5     3.02716       0.107599  0.550764  0.195363 0.8451088
## ENSG00000000419.11 1483.19336      -0.242811  0.127203 -1.908843 0.0562824
## ENSG00000000457.12  890.18941       0.155545  0.127797  1.217130 0.2235548
## ENSG00000000460.15  441.31424       0.161581  0.247103  0.653902 0.5131749
## ...                       ...            ...       ...       ...       ...
## ENSGR0000275287.3           0             NA        NA        NA        NA
## ENSGR0000276543.3           0             NA        NA        NA        NA
## ENSGR0000277120.3           0             NA        NA        NA        NA
## ENSGR0000280767.1           0             NA        NA        NA        NA
## ENSGR0000281849.1           0             NA        NA        NA        NA
##                         padj
##                    <numeric>
## ENSG00000000003.13  0.852917
## ENSG00000000005.5   0.947795
## ENSG00000000419.11  0.328078
## ENSG00000000457.12  0.583603
## ENSG00000000460.15  0.799798
## ...                      ...
## ENSGR0000275287.3         NA
## ENSGR0000276543.3         NA
## ENSGR0000277120.3         NA
## ENSGR0000280767.1         NA
## ENSGR0000281849.1         NA

resultsNames(ddsHTSeq)

## [1] "Intercept"           "condition_SK_vs_NSK"

resLFC <- lfcShrink(ddsHTSeq, coef="condition_SK_vs_NSK", type="apeglm")

## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895

resLFC

## log2 fold change (MAP): condition SK vs NSK 
## Wald test p-value: condition SK vs NSK 
## DataFrame with 60483 rows and 5 columns
##                      baseMean log2FoldChange      lfcSE    pvalue      padj
##                     <numeric>      <numeric>  <numeric> <numeric> <numeric>
## ENSG00000000003.13 3125.04995   -7.92388e-06 0.00144267 0.6171660  0.852917
## ENSG00000000005.5     3.02716   -6.01038e-06 0.00144270 0.8451088  0.947795
## ENSG00000000419.11 1483.19336   -7.02132e-05 0.00144346 0.0562824  0.328078
## ENSG00000000457.12  890.18941    1.41207e-05 0.00144264 0.2235548  0.583603
## ENSG00000000460.15  441.31424    2.83565e-06 0.00144267 0.5131749  0.799798
## ...                       ...            ...        ...       ...       ...
## ENSGR0000275287.3           0             NA         NA        NA        NA
## ENSGR0000276543.3           0             NA         NA        NA        NA
## ENSGR0000277120.3           0             NA         NA        NA        NA
## ENSGR0000280767.1           0             NA         NA        NA        NA
## ENSGR0000281849.1           0             NA         NA        NA        NA

library("BiocParallel")
register(MulticoreParam(4))

resOrdered <- res[order(res$pvalue),]

summary(res)

## 
## out of 55653 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)       : 1339, 2.4%
## LFC < 0 (down)     : 322, 0.58%
## outliers [1]       : 0, 0%
## low counts [2]     : 19393, 35%
## (mean count < 1)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

sum(res$padj < 0.1, na.rm=TRUE)

## [1] 1661

res05 <- results(ddsHTSeq, alpha=0.05)
summary(res05)

## 
## out of 55653 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 760, 1.4%
## LFC < 0 (down)     : 175, 0.31%
## outliers [1]       : 0, 0%
## low counts [2]     : 20467, 37%
## (mean count < 1)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

sum(res05$padj < 0.05, na.rm=TRUE)

## [1] 935

library("IHW")
resIHW <- results(ddsHTSeq, filterFun=ihw)
summary(resIHW)

## 
## out of 55653 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)       : 1696, 3%
## LFC < 0 (down)     : 329, 0.59%
## outliers [1]       : 0, 0%
## [1] see 'cooksCutoff' argument of ?results
## see metadata(res)$ihwResult on hypothesis weighting

sum(resIHW$padj < 0.1, na.rm=TRUE)

## [1] 2025

metadata(resIHW)$ihwResult

## ihwResult object with 60483 hypothesis tests 
## Nominal FDR control level: 0.1 
## Split into 37 bins, based on an ordinal covariate

plotMA(res, ylim=c(-2,2))

plotMA(resLFC, ylim=c(-2,2))

#idx <- identify(res$baseMean, res$log2FoldChange)
#rownames(res)[idx]

resultsNames(ddsHTSeq)

## [1] "Intercept"           "condition_SK_vs_NSK"

resNorm <- lfcShrink(ddsHTSeq, coef=2, type="normal")

## using 'normal' for LFC shrinkage, the Normal prior from Love et al (2014).
## 
## Note that type='apeglm' and type='ashr' have shown to have less bias than type='normal'.
## See ?lfcShrink for more details on shrinkage type, and the DESeq2 vignette.
## Reference: https://doi.org/10.1093/bioinformatics/bty895

resAsh <- lfcShrink(ddsHTSeq, coef=2, type="ashr")

## using 'ashr' for LFC shrinkage. If used in published research, please cite:
##     Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
##     https://doi.org/10.1093/biostatistics/kxw041

par(mfrow=c(1,3), mar=c(4,4,2,1))
xlim <- c(1,1e5); ylim <- c(-3,3)
plotMA(resLFC, xlim=xlim, ylim=ylim, main="apeglm")
plotMA(resNorm, xlim=xlim, ylim=ylim, main="normal")
plotMA(resAsh, xlim=xlim, ylim=ylim, main="ashr")

plotCounts(ddsHTSeq, gene=which.min(res$padj), intgroup="condition")

d <- plotCounts(ddsHTSeq, gene=which.min(res$padj), intgroup="condition", 
                returnData=TRUE)
library("ggplot2")

## 
## Attaching package: 'ggplot2'

## The following object is masked from 'package:IHW':
## 
##     alpha

ggplot(d, aes(x=condition, y=count)) + 
  geom_point(position=position_jitter(w=0.1,h=0)) + 
  scale_y_log10(breaks=c(25,100,400))

mcols(res)$description

## [1] "mean of normalized counts for all samples"  
## [2] "log2 fold change (MLE): condition SK vs NSK"
## [3] "standard error: condition SK vs NSK"        
## [4] "Wald statistic: condition SK vs NSK"        
## [5] "Wald test p-value: condition SK vs NSK"     
## [6] "BH adjusted p-values"

write.csv(as.data.frame(resOrdered), 
          file="condition_treated_results.csv")

resSig <- subset(resOrdered, padj < 0.1)
resSig

## log2 fold change (MLE): condition SK vs NSK 
## Wald test p-value: condition SK vs NSK 
## DataFrame with 1661 rows and 6 columns
##                      baseMean log2FoldChange     lfcSE      stat      pvalue
##                     <numeric>      <numeric> <numeric> <numeric>   <numeric>
## ENSG00000075043.16   116.9112       -5.20579  0.741800  -7.01778 2.25419e-12
## ENSG00000198028.3     24.7702       -5.45361  0.797955  -6.83449 8.22998e-12
## ENSG00000198883.10    52.0476        4.60650  0.689186   6.68397 2.32549e-11
## ENSG00000202538.1    780.1032        5.94446  0.911730   6.51997 7.03198e-11
## ENSG00000108244.15   680.8237       -4.27175  0.655880  -6.51300 7.36639e-11
## ...                       ...            ...       ...       ...         ...
## ENSG00000275833.1     4.78618       1.152818  0.406263   2.83762  0.00454519
## ENSG00000112992.15 3674.25885       0.476019  0.167779   2.83718  0.00455146
## ENSG00000241048.1     2.23282       3.208083  1.130780   2.83705  0.00455322
## ENSG00000204021.3    14.69839       2.299613  0.810568   2.83704  0.00455340
## ENSG00000259055.1     1.88827       3.144356  1.108699   2.83608  0.00456715
##                           padj
##                      <numeric>
## ENSG00000075043.16 8.18723e-08
## ENSG00000198028.3  1.49456e-07
## ENSG00000198883.10 2.81540e-07
## ENSG00000202538.1  4.62892e-07
## ENSG00000108244.15 4.62892e-07
## ...                        ...
## ENSG00000275833.1    0.0996263
## ENSG00000112992.15   0.0996263
## ENSG00000241048.1    0.0996263
## ENSG00000204021.3    0.0996263
## ENSG00000259055.1    0.0998668

colData(ddsHTSeq)

## DataFrame with 60 rows and 3 columns
##       condition sizeFactor replaceable
##        <factor>  <numeric>   <logical>
## NSK1        NSK   1.505694        TRUE
## NSK10       NSK   0.630384        TRUE
## NSK11       NSK   0.979879        TRUE
## NSK12       NSK   1.177331        TRUE
## NSK13       NSK   0.614824        TRUE
## ...         ...        ...         ...
## SK5          SK   0.979791        TRUE
## SK6          SK   0.898952        TRUE
## SK7          SK   0.786680        TRUE
## SK8          SK   1.257792        TRUE
## SK9          SK   1.609233        TRUE

#ddsMF <- ddsHTSeq

#levels(ddsMF$type)

vsd <- vst(ddsHTSeq, blind=FALSE)
head(assay(vsd), 3)

##                         NSK1     NSK10     NSK11     NSK12     NSK13     NSK14
## ENSG00000000003.13 10.629839 10.246565 10.653592 11.490291  9.931235 12.307884
## ENSG00000000005.5   4.055727  4.055727  4.558601  4.381082  4.505090  4.055727
## ENSG00000000419.11 11.041604 10.244680 11.091010 11.084157 10.616650 11.720351
##                        NSK15    NSK16     NSK17     NSK18     NSK19      NSK2
## ENSG00000000003.13 10.814780 12.20348 12.294173 12.660084 10.911129 12.985426
## ENSG00000000005.5   4.553865  8.08755  5.078974  4.513144  4.055727  5.171051
## ENSG00000000419.11 10.764180 10.32809 10.505190 10.454354 10.509104 10.827314
##                        NSK20     NSK21     NSK22     NSK23     NSK24     NSK25
## ENSG00000000003.13 11.433397 10.383498 11.410823 11.499521 11.739228 11.918269
## ENSG00000000005.5   4.055727  5.388742  4.831131  4.345698  4.055727  4.438132
## ENSG00000000419.11 10.244262 10.082618 10.266306 10.633186 10.127650 11.055699
##                        NSK26     NSK27     NSK28     NSK29      NSK3     NSK30
## ENSG00000000003.13 12.541064 10.248761 10.659039 11.279465 10.787505 12.008173
## ENSG00000000005.5   4.505203  5.066085  4.983873  4.492204  5.055465  4.481925
## ENSG00000000419.11 10.978442 10.240340 10.653378 10.328885 10.412738 10.727329
##                         NSK4      NSK5      NSK6      NSK7      NSK8      NSK9
## ENSG00000000003.13 11.628547 12.203363 12.146297 11.096884 11.496832 12.284463
## ENSG00000000005.5   4.055727  4.418644  4.769275  4.308518  4.055727  4.055727
## ENSG00000000419.11  9.530335 11.012188 10.676569  9.901513 11.412967 10.796537
##                          SK1     SK10      SK11      SK12      SK13      SK14
## ENSG00000000003.13 10.465428 12.67751 10.860296 11.350796 12.487648 11.643013
## ENSG00000000005.5   5.590728  4.62645  4.055727  4.564486  4.055727  4.767747
## ENSG00000000419.11 10.207398 10.21258 10.343061  9.546005 11.315880  9.869542
##                         SK15      SK16      SK17      SK18      SK19       SK2
## ENSG00000000003.13 10.981093 11.897893 10.191228 11.627883  9.979860 11.658049
## ENSG00000000005.5   4.327786  4.055727  4.055727  4.899873  4.424726  4.055727
## ENSG00000000419.11 10.996255 10.771323  9.388993 10.631706 10.060417  9.652692
##                         SK20      SK21      SK22      SK23      SK24      SK25
## ENSG00000000003.13 10.191619 11.890599 11.701638 11.215111 11.788175 11.061951
## ENSG00000000005.5   4.691715  4.847086  4.316406  4.347482  4.055727  4.055727
## ENSG00000000419.11 10.129510 10.924587 10.267262 10.227988 11.002766 10.388950
##                         SK26      SK27      SK28      SK29       SK3      SK30
## ENSG00000000003.13 11.241665 10.915631 11.718055 11.051335 11.762035 11.023705
## ENSG00000000005.5   4.486288  4.055727  4.055727  4.055727  4.762194  5.512894
## ENSG00000000419.11  9.721033 10.960501 10.463742 10.688946 10.753419  9.923269
##                          SK4       SK5       SK6       SK7       SK8       SK9
## ENSG00000000003.13 10.443595 10.213536 11.451066 12.290541 12.895637 12.527255
## ENSG00000000005.5   4.614486  4.985174  4.580513  5.156962  4.370546  4.674567
## ENSG00000000419.11 10.467602 10.020022 10.346851 10.597663 10.998154 10.095902

BiocManager::install("vsn")

## Bioconductor version 3.14 (BiocManager 1.30.16), R 4.1.1 (2021-08-10)

## Warning: package(s) not installed when version(s) same as current; use `force = TRUE` to
##   re-install: 'vsn'

## Old packages: 'Matrix'

ntd <- normTransform(ddsHTSeq)
library("vsn")
meanSdPlot(assay(ntd))

meanSdPlot(assay(vsd))

#install.packages("pheatmap")

#install.packages("plyr")
#library("plyr")

#install.packages("stringr")

library("pheatmap")
library("stringr")
select <- order(rowMeans(counts(ddsHTSeq,normalized=TRUE)),
                decreasing=TRUE)[1:20]
df <- as.data.frame(colData(ddsHTSeq)[,c("condition")])
colnames(assay(ntd)) <- str_sub(colnames(assay(ntd)), 1, 21)
rownames(df) <- colnames(assay(ntd))
pheatmap(assay(ntd)[select,],cluster_cols = F,cluster_rows = F,cellwidth = 2,cellheight = 8,fontsize_row = 9,fontsize_col = 9, annotation_col=df)

colnames(assay(vsd)) <- str_sub(colnames(assay(vsd)), 1, 21)
rownames(df) <- colnames(assay(vsd))
pheatmap(assay(vsd)[select,],cluster_cols = F,cluster_rows = F,cellwidth = 2,cellheight = 8,fontsize_row = 9,fontsize_col = 9, annotation_col=df)

sampleDists <- dist(t(assay(vsd)))

library("RColorBrewer")
sampleDistMatrix <- as.matrix(sampleDists)
rownames(sampleDistMatrix) <- paste(vsd$condition, vsd$type, sep="-")
colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix,
         clustering_distance_rows=sampleDists,
         clustering_distance_cols=sampleDists,
         fontsize_row = 5,
         col=colors)

plotPCA(vsd, intgroup=c("condition"))