# TestingMay23_Dong

# One-time setup, kept commented out (run manually on a fresh machine):
#   chooseCRANmirror()
#   install.packages("BiocManager")
#   library(BiocManager)
#   if (!requireNamespace("BiocManager", quietly = TRUE))
#       install.packages("BiocManager")
#   BiocManager::install("TxDb.Hsapiens.UCSC.hg19.knownGene")

# Relative file names below are resolved against this project folder.
setwd(dir = "/Users/dongzeyuan/Desktop/TRGN_lab/")

# Raw count table, read with read.table() defaults.
counts <- read.table(file = "ProstCa_030921_2.txt")

# Annotation package that provides the TxDb object used further down.
library(TxDb.Hsapiens.UCSC.hg19.knownGene)

## 载入需要的程辑包：GenomicFeatures

## 载入需要的程辑包：BiocGenerics

## 载入需要的程辑包：parallel

## 
## 载入程辑包：'BiocGenerics'

## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB

## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs

## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min

## 载入需要的程辑包：S4Vectors

## 载入需要的程辑包：stats4

## 
## 载入程辑包：'S4Vectors'

## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname

## 载入需要的程辑包：IRanges

## 载入需要的程辑包：GenomeInfoDb

## 载入需要的程辑包：GenomicRanges

## 载入需要的程辑包：AnnotationDbi

## 载入需要的程辑包：Biobase

## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.

# Short alias for the annotation object exported by the TxDb package.
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
# Print every sequence name currently active in txdb.
seqlevels(x = txdb)

##  [1] "chr1"                  "chr2"                  "chr3"                 
##  [4] "chr4"                  "chr5"                  "chr6"                 
##  [7] "chr7"                  "chr8"                  "chr9"                 
## [10] "chr10"                 "chr11"                 "chr12"                
## [13] "chr13"                 "chr14"                 "chr15"                
## [16] "chr16"                 "chr17"                 "chr18"                
## [19] "chr19"                 "chr20"                 "chr21"                
## [22] "chr22"                 "chrX"                  "chrY"                 
## [25] "chrM"                  "chr1_gl000191_random"  "chr1_gl000192_random" 
## [28] "chr4_ctg9_hap1"        "chr4_gl000193_random"  "chr4_gl000194_random" 
## [31] "chr6_apd_hap1"         "chr6_cox_hap2"         "chr6_dbb_hap3"        
## [34] "chr6_mann_hap4"        "chr6_mcf_hap5"         "chr6_qbl_hap6"        
## [37] "chr6_ssto_hap7"        "chr7_gl000195_random"  "chr8_gl000196_random" 
## [40] "chr8_gl000197_random"  "chr9_gl000198_random"  "chr9_gl000199_random" 
## [43] "chr9_gl000200_random"  "chr9_gl000201_random"  "chr11_gl000202_random"
## [46] "chr17_ctg5_hap1"       "chr17_gl000203_random" "chr17_gl000204_random"
## [49] "chr17_gl000205_random" "chr17_gl000206_random" "chr18_gl000207_random"
## [52] "chr19_gl000208_random" "chr19_gl000209_random" "chr21_gl000210_random"
## [55] "chrUn_gl000211"        "chrUn_gl000212"        "chrUn_gl000213"       
## [58] "chrUn_gl000214"        "chrUn_gl000215"        "chrUn_gl000216"       
## [61] "chrUn_gl000217"        "chrUn_gl000218"        "chrUn_gl000219"       
## [64] "chrUn_gl000220"        "chrUn_gl000221"        "chrUn_gl000222"       
## [67] "chrUn_gl000223"        "chrUn_gl000224"        "chrUn_gl000225"       
## [70] "chrUn_gl000226"        "chrUn_gl000227"        "chrUn_gl000228"       
## [73] "chrUn_gl000229"        "chrUn_gl000230"        "chrUn_gl000231"       
## [76] "chrUn_gl000232"        "chrUn_gl000233"        "chrUn_gl000234"       
## [79] "chrUn_gl000235"        "chrUn_gl000236"        "chrUn_gl000237"       
## [82] "chrUn_gl000238"        "chrUn_gl000239"        "chrUn_gl000240"       
## [85] "chrUn_gl000241"        "chrUn_gl000242"        "chrUn_gl000243"       
## [88] "chrUn_gl000244"        "chrUn_gl000245"        "chrUn_gl000246"       
## [91] "chrUn_gl000247"        "chrUn_gl000248"        "chrUn_gl000249"

library(stringr)
setwd("/Users/dongzeyuan/Desktop/TRGN_lab/")

# Install the Homo.sapiens annotation package only when it is missing.
# Calling BiocManager::install() unconditionally contacts the repositories on
# every run and merely warns that the package is already installed.
if (!requireNamespace("Homo.sapiens", quietly = TRUE)) {
  if (!requireNamespace("BiocManager", quietly = TRUE)) {
    install.packages("BiocManager")
  }
  BiocManager::install("Homo.sapiens")
}

## Bioconductor version 3.14 (BiocManager 1.30.15), R 4.1.0 (2021-05-18)

## Warning: package(s) not installed when version(s) same as current; use `force = TRUE` to
##   re-install: 'Homo.sapiens'

## Old packages: 'annotate', 'AnnotationDbi', 'Biobase', 'BiocFileCache',
##   'BiocGenerics', 'BiocIO', 'BiocParallel', 'biomaRt', 'Biostrings',
##   'DelayedArray', 'DESeq2', 'genefilter', 'geneplotter', 'GenomeInfoDb',
##   'GenomicAlignments', 'GenomicFeatures', 'GenomicRanges', 'graph', 'IRanges',
##   'KEGGREST', 'MatrixGenerics', 'OrganismDbi', 'RBGL', 'Rhtslib', 'Rsamtools',
##   'rtracklayer', 'S4Vectors', 'SummarizedExperiment', 'XVector', 'zlibbioc'

library(Homo.sapiens)

## 载入需要的程辑包：OrganismDbi

## 载入需要的程辑包：GO.db

##

## 载入需要的程辑包：org.Hs.eg.db

##

# The row names of the count table carry the gene identifiers. Capture group 1
# of the pattern keeps only the leading run of word characters of each name
# (presumably dropping a ".version" suffix -- confirm against the input file).
gene <- str_match(rownames(counts), "(\\w*).*")
geneid <- gene[, 2]

# Store the cleaned identifier as an extra column next to the counts.
geneid2 <- data.frame(geneid2 = geneid)
counts <- cbind(counts, geneid2)

# Look up gene symbol and transcript chromosome for every Ensembl key.
genes <- select(
  Homo.sapiens,
  keys = geneid,
  keytype = "ENSEMBL",
  columns = c("SYMBOL", "TXCHROM")
)

## 'select()' returned many:many mapping between keys and columns

#dim(genes)

# select() returned a many:many mapping, so keep only the first annotation
# row per Ensembl ID ...
genes <- genes[which(!duplicated(genes$ENSEMBL)), ]
# ... and only the first count row per cleaned identifier.
counts <- counts[which(!duplicated(counts$geneid2)), ]
#counts_2<-counts[which(counts$geneid2 %in% genes$ENSEMBL),]

# Index the count table by the cleaned identifier.
row.names(counts) <- counts[["geneid2"]]

# Restrict the annotation to chromosome 14, then extract its exons,
# transcripts and genes.
seqlevels(txdb) <- "chr14"
exonlist <- exons(txdb)
txlist <- transcripts(txdb)
genelist <- genes(txdb)

#genelist

#txlist

#exonlist

library(GenomicAlignments)

## 载入需要的程辑包：SummarizedExperiment

## 载入需要的程辑包：MatrixGenerics

## 载入需要的程辑包：matrixStats

## 
## 载入程辑包：'matrixStats'

## The following objects are masked from 'package:Biobase':
## 
##     anyMissing, rowMedians

## 
## 载入程辑包：'MatrixGenerics'

## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars

## The following object is masked from 'package:Biobase':
## 
##     rowMedians

## 载入需要的程辑包：Biostrings

## 载入需要的程辑包：XVector

## 
## 载入程辑包：'Biostrings'

## The following object is masked from 'package:base':
## 
##     strsplit

## 载入需要的程辑包：Rsamtools

#genes<-genes[-c(21),]
#counts<-counts[-c(21),]
# Attach the gene symbol by Ensembl ID rather than by row position. A
# positional cbind() is only correct while `genes` and `counts` have the same
# number of rows in the same order; match() stays correct when the annotation
# lookup drops or reorders keys (unmatched IDs get NA and are removed below).
# The column keeps the name cbind() used to generate, "genes$SYMBOL".
counts[["genes$SYMBOL"]] <- genes$SYMBOL[match(counts$geneid2, genes$ENSEMBL)]
#row.names(counts)<-genes$SYMBOL
#counts <- data.frame(row.names=genelist$gene_id)
#counts_3<-counts[rowSums(is.na(counts)) == 0,]

# Drop rows containing any NA (including genes without a symbol), keep the
# first row per symbol, and index the table by symbol.
counts_4 <- na.omit(counts)
counts_4 <- counts_4[!duplicated(counts_4[["genes$SYMBOL"]]), ]
row.names(counts_4) <- counts_4[["genes$SYMBOL"]]

# The first six columns are the sample counts; use counts_ready downstream.
counts_ready <- counts_4[, 1:6]
counts <- counts_ready

# Count matrix for DESeq2: six sample columns named by group and replicate.
counts_filtered <- counts[, seq_len(6)]
colnames(counts_filtered) <- c("C1_1", "C1_2", "C1_3", "T1_1", "T1_2", "T1_3")

# Sample sheet: one row per sample. `condition` and `treatment` carry the same
# two-level grouping (three C1 samples followed by three T1 samples).
coldata <- data.frame(
  condition = factor(rep(c("C1", "T1"), each = 3)),
  treatment = factor(rep(c("C1", "T1"), each = 3)),
  row.names = colnames(counts_filtered)
)

#design=data.frame(row.names=c("C1_1","C1_2","C1_3","T1_1","T1_2","T1_3"),
                   #group=rep(c("C1","T1"),each=3),
                   #treatment=rep(c("C1","T1"),each=3))

#if (!requireNamespace("BiocManager", quietly = TRUE))
    #install.packages("BiocManager")

#BiocManager::install("DESeq2")
library(DESeq2)

# Bundle counts and sample sheet; the model has `condition` as its only term.
dds <- DESeqDataSetFromMatrix(
  design = ~ condition,
  colData = coldata,
  countData = counts_filtered
)

# Pre-filtering: keep genes whose counts sum to at least 10 over all samples.
keep <- rowSums(counts(dds)) >= 10
dds <- dds[keep, ]

# Note on factor levels: list "C1" first so that it is the reference level.
dds$condition <- factor(dds$condition, levels = c("C1", "T1"))

# Differential expression analysis
dds <- DESeq(object = dds)

## estimating size factors

## estimating dispersions

## gene-wise dispersion estimates

## mean-dispersion relationship

## final dispersion estimates

## fitting model and testing

# Results table for the default comparison; printed for inspection.
res <- results(object = dds)
res

## log2 fold change (MLE): condition T1 vs C1 
## Wald test p-value: condition T1 vs C1 
## DataFrame with 29912 rows and 6 columns
##                baseMean log2FoldChange     lfcSE       stat    pvalue      padj
##               <numeric>      <numeric> <numeric>  <numeric> <numeric> <numeric>
## TSPAN6        1107.8741      0.0325524  0.325711  0.0999427  0.920390  0.998905
## TNMD            32.8382      1.4031560  1.356203  1.0346212  0.300846  0.971182
## DPM1           473.0177      0.0528333  0.619625  0.0852666  0.932049  0.999246
## SCYL3         1171.0906     -0.1550594  0.417280 -0.3715957  0.710194  0.994762
## C1orf112       321.5382     -0.2140606  0.496694 -0.4309708  0.666490  0.994762
## ...                 ...            ...       ...        ...       ...       ...
## BMP8B-AS1      10.53852      -0.582843  3.484002 -0.1672914  0.867141  0.996308
## H2AL1SP         4.26468      -2.997713  3.846888 -0.7792566  0.435829        NA
## NIPBL-DT     1361.89229      -0.269032  0.465077 -0.5784681  0.562948  0.988494
## CERNA2         29.84232      -0.150351  1.893759 -0.0793926  0.936720  0.999246
## LOC100996886   32.40953       5.663064  2.655310  2.1327323        NA        NA

# Same comparison with the contrast spelled out: T1 (numerator) vs C1.
res <- results(dds, contrast = c("condition", "T1", "C1"))

# Log fold change shrinkage for visualization and ranking

# To see the name of the coefficient to pass to lfcShrink():
resultsNames(dds)

## [1] "Intercept"          "condition_T1_vs_C1"

#if (!requireNamespace("BiocManager", quietly = TRUE))
    #install.packages("BiocManager")

#BiocManager::install("apeglm")
library(apeglm)

#install.packages("emdbook")
# Shrink the T1-vs-C1 log fold changes with the apeglm estimator.
resLFC <- lfcShrink(dds, type = "apeglm", coef = "condition_T1_vs_C1")

## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895

resLFC

## log2 fold change (MAP): condition T1 vs C1 
## Wald test p-value: condition T1 vs C1 
## DataFrame with 29912 rows and 5 columns
##                baseMean log2FoldChange     lfcSE    pvalue      padj
##               <numeric>      <numeric> <numeric> <numeric> <numeric>
## TSPAN6        1107.8741    0.001487557 0.0684490  0.920390  0.998905
## TNMD            32.8382    0.003521716 0.0700459  0.300846  0.971182
## DPM1           473.0177    0.000797669 0.0695532  0.932049  0.999246
## SCYL3         1171.0906   -0.004260059 0.0692136  0.710194  0.994762
## C1orf112       321.5382   -0.004050157 0.0694818  0.666490  0.994762
## ...                 ...            ...       ...       ...       ...
## BMP8B-AS1      10.53852   -0.000217798 0.0699759  0.867141  0.996308
## H2AL1SP         4.26468   -0.000759203 0.0699901  0.435829        NA
## NIPBL-DT     1361.89229   -0.005912317 0.0695784  0.562948  0.988494
## CERNA2         29.84232   -0.000177364 0.0699416  0.936720  0.999246
## LOC100996886   32.40953    0.002115149 0.0700345        NA        NA

# p-values and adjusted p-values

# Results sorted by raw p-value, smallest first.
resOrdered <- res[order(res$pvalue), ]

summary(res)

## 
## out of 29912 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)       : 65, 0.22%
## LFC < 0 (down)     : 8, 0.027%
## outliers [1]       : 1970, 6.6%
## low counts [2]     : 2320, 7.8%
## (mean count < 5)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

# How many adjusted p-values are below 0.1 (NA values are not counted)?
length(which(res$padj < 0.1))

## [1] 73

# Recompute the results with alpha = 0.05; summary() then reports genes at
# adjusted p-value < 0.05.
res05 <- results(dds, alpha = 0.05)
summary(res05)

## 
## out of 29912 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 37, 0.12%
## LFC < 0 (down)     : 5, 0.017%
## outliers [1]       : 1970, 6.6%
## low counts [2]     : 0, 0%
## (mean count < 1)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

# Same count at the 0.05 threshold.
length(which(res05$padj < 0.05))

## [1] 42

# MA-plot

# Unshrunken estimates, y axis limited to [-2, 2].
plotMA(res, ylim = c(-2, 2))

# Same plot with the apeglm-shrunken log2 fold changes.
plotMA(resLFC, ylim = c(-2, 2))

#idx <- identify(res$baseMean, res$log2FoldChange)
#rownames(res)[idx]

# Alternative shrinkage estimators: look at the other available choices.

#install.packages("ashr")
resultsNames(dds)

## [1] "Intercept"          "condition_T1_vs_C1"

# Shrink coefficient 2 (the second entry of resultsNames(dds)) with the
# "normal" estimator.
resNorm <- lfcShrink(dds, type = "normal", coef = 2)

## using 'normal' for LFC shrinkage, the Normal prior from Love et al (2014).
## 
## Note that type='apeglm' and type='ashr' have shown to have less bias than type='normal'.
## See ?lfcShrink for more details on shrinkage type, and the DESeq2 vignette.
## Reference: https://doi.org/10.1093/bioinformatics/bty895

# Same coefficient with the "ashr" estimator.
resAsh <- lfcShrink(dds, type = "ashr", coef = 2)

## using 'ashr' for LFC shrinkage. If used in published research, please cite:
##     Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
##     https://doi.org/10.1093/biostatistics/kxw041

# Compare the three shrinkage estimators side by side on shared axes.
# par() returns the previous settings; keep them so that the 1x3 layout and
# the custom margins do not leak into later base-graphics plots.
old_par <- par(mfrow = c(1, 3), mar = c(4, 4, 2, 1))
xlim <- c(1, 1e5); ylim <- c(-3, 3)
plotMA(resLFC, xlim = xlim, ylim = ylim, main = "apeglm")
plotMA(resNorm, xlim = xlim, ylim = ylim, main = "normal")
plotMA(resAsh, xlim = xlim, ylim = ylim, main = "ashr")
par(old_par)

# Plot counts for the gene with the smallest adjusted p-value, by condition.
plotCounts(dds, intgroup = "condition", gene = which.min(res$padj))

# Extracting transformed values: variance-stabilizing (vst) and regularized
# log (rlog) transformations, both with blind = FALSE.
vsd <- vst(object = dds, blind = FALSE)
rld <- rlog(object = dds, blind = FALSE)
head(assay(vsd), n = 3)

##             C1_1      C1_2      C1_3      T1_1      T1_2      T1_3
## TSPAN6 10.486297 10.707607 10.736549 10.639462 10.616665 10.753322
## TNMD    8.507890  8.503406  8.453807  8.562585  8.990775  8.436141
## DPM1    9.641772  9.944028 10.009209  9.943590  9.213472 10.309385

# Effects of transformations on the variance: compare mean-vs-SD plots for
# the three transformations.

#if (!requireNamespace("BiocManager", quietly = TRUE))
    #install.packages("BiocManager")

#BiocManager::install("vsn")

#install.packages("hexbin")

library("vsn")

# This will give log2(n + 1)
ntd <- normTransform(object = dds)
meanSdPlot(assay(ntd))

#varianceStabilizingTransformation()

meanSdPlot(assay(vsd))

# rlog: this function transforms the count data to the log2 scale in a way
# which minimizes differences between samples for rows with small counts, and
# which normalizes with respect to library size.

meanSdPlot(assay(rld))

# Heatmap of the count matrix

#if (!requireNamespace("BiocManager", quietly = TRUE))
    #install.packages("BiocManager")

#BiocManager::install("heatmaps")
#install.packages("pheatmap")
library("pheatmap")

# Row indices of the (up to) 40 genes with the highest mean normalized count.
# head() is used instead of `[1:40]` so that no NA indices are produced when
# fewer than 40 genes are left after filtering.
select <- head(
  order(rowMeans(counts(dds, normalized = TRUE)), decreasing = TRUE),
  40
)

# Column annotation: condition and treatment of every sample.
df <- as.data.frame(colData(dds)[, c("condition", "treatment")])

# The same gene set under the three transformations; no clustering, so rows
# and columns keep their order.
pheatmap(assay(ntd)[select, ], cluster_rows = FALSE, show_rownames = FALSE,
         cluster_cols = FALSE, annotation_col = df)

pheatmap(assay(vsd)[select, ], cluster_rows = FALSE, show_rownames = TRUE,
         cluster_cols = FALSE, annotation_col = df)

pheatmap(assay(rld)[select, ], cluster_rows = FALSE, show_rownames = TRUE,
         cluster_cols = FALSE, annotation_col = df)

# Heatmap of the sample-to-sample distances

# Distances between samples, computed on the transposed vst matrix.
sampleDists <- dist(t(assay(vsd)))

library("RColorBrewer")
sampleDistMatrix <- as.matrix(sampleDists)
# Label rows with the two columns that exist in colData (condition and
# treatment). The sample sheet has no `type` column, so `vsd$type` was NULL
# and the second half of every label was missing.
rownames(sampleDistMatrix) <- paste(vsd$condition, vsd$treatment, sep = "-")
colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette(rev(brewer.pal(9, "Blues")))(255)
# Pass the precomputed distances so the clustering uses them directly.
# `color` is spelled out in full instead of relying on partial matching of
# `col`.
pheatmap(sampleDistMatrix,
         clustering_distance_rows = sampleDists,
         clustering_distance_cols = sampleDists,
         color = colors)

# Names (and log2(n + 1) values) of the first five genes shown in the heatmap.
head(assay(ntd)[select, ], n = 5)

##            C1_1     C1_2     C1_3     T1_1     T1_2     T1_3
## MALAT1 21.05841 21.18160 21.80432 21.70229 20.72645 21.90110
## RN7SL2 18.98204 19.45015 16.54463 19.29107 19.17892 16.98007
## MYH11  19.40984 19.20996 18.15997 17.96271 17.85943 18.21345
## NEAT1  17.54565 17.58597 18.01506 17.73164 17.67095 17.72571
## FLNA   18.39341 18.20658 17.07089 17.09578 16.89797 16.94960

# Get down- and up-regulated genes from res05.
# A gene only counts as down/up when it is significant at the 5% level used
# for res05. Filtering on the sign of log2FoldChange alone returned every gene
# with a negative/positive estimate, including rows with padj close to 1.
# Rows whose padj is NA are excluded.
sig05 <- !is.na(res05$padj) & res05$padj < 0.05
down <- res05[sig05 & res05$log2FoldChange < 0, ]
up <- res05[sig05 & res05$log2FoldChange > 0, ]

# Order each set by effect size, strongest change first, so head() shows the
# most strongly regulated genes.
down <- down[order(down$log2FoldChange), ]
up <- up[order(up$log2FoldChange, decreasing = TRUE), ]

head(down)

# (Printed table not recorded here; re-run to see the filtered, ordered rows.)