knitr::opts_chunk$set(
    echo = TRUE,#show code and output
    message = FALSE,
    warning = FALSE,
    engine.path="/Users/zero/anaconda3/bin/python"
)

Load packages:

library(tidyverse)
library(rlang)
library(rvest)
library(GDCRNATools)
source("./scripts/get_group.R")

read data and clean name

star_count <- read_delim("data/star_count.txt", 
    "\t", escape_double = FALSE, trim_ws = TRUE)
star_count <- set_names(star_count,~ str_replace(.x, "_gene",""))

countmatrix <-
  star_count %>%
  dplyr::select(group$Sample_ID) %>% as.matrix()

id <- str_sub(colnames(countmatrix),nchar("18-R-1"),nchar("18-R-1")+2) %>% 
  str_c("IgA", "-", .)
group$id <- id 

rownames(countmatrix) <- star_count$gene_name
colnames(countmatrix)  <- id

cluster with rlog

#Experiment design
dds <- DESeq2::DESeqDataSetFromMatrix(countData = countmatrix, colData = group, design = ~group)

#filter
nrow(dds)
## [1] 25359
dds <- dds[rowSums(DESeq2::counts(dds)) > 1, ]
nrow(dds)
## [1] 22205
#normalization
library(DESeq2)
#rld
load("data/IgA_rlog_rld.Rdata")
vsd <- vst(dds, blind = FALSE)
dds <- estimateSizeFactors(dds)

df <- bind_rows(
  as_data_frame(log2(counts(dds, normalized=TRUE)[, 1:2]+1)) %>%
         mutate(transformation = "log2(x + 1)"),
  as_data_frame(assay(rld)[, 1:2]) %>% mutate(transformation = "rlog"),
  as_data_frame(assay(vsd)[, 1:2]) %>% mutate(transformation = "vst"))
  
colnames(df)[1:2] <- c("x", "y")  

ggplot(df, aes(x = x, y = y)) + geom_hex(bins = 80) +
  coord_fixed() + facet_grid( . ~ transformation) 

sampleDists <- dist(t(assay(rld)))
library("pheatmap")
library("RColorBrewer")
sampleDistMatrix <- as.matrix( sampleDists )
rownames(sampleDistMatrix) <- colnames(dds) 
colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
 pheatmap(sampleDistMatrix,
          clustering_method = "complete",#complete, average, single
         clustering_distance_rows = sampleDists,
         clustering_distance_cols = sampleDists,
         col = colors)

cluster with GDC normalization

cut tree

hc.complete <- hclust(sampleDists, method = "complete")
hc.average <- hclust(sampleDists, method = "average")
hc.single <- hclust(sampleDists, method = "single")
par(mfrow=c(1,1))
plot(hc.complete)
abline(h=65,col="red")

cutree(hc.complete,9) 
## IgA-181 IgA-161 IgA-185 IgA-160 IgA-162 IgA-147 IgA-179 IgA-158 IgA-178 
##       1       2       1       2       3       2       1       4       1 
## IgA-167 IgA-149 IgA-153 IgA-175 IgA-152 IgA-163 IgA-169 IgA-187 IgA-184 
##       1       4       4       1       2       5       6       6       1 
## IgA-165 IgA-183 IgA-173 IgA-182 IgA-172 IgA-143 IgA-155 IgA-164 IgA-176 
##       6       5       6       1       1       4       4       7       5 
## IgA-171 IgA-159 IgA-168 IgA-170 IgA-150 IgA-142 IgA-148 IgA-157 IgA-144 
##       5       4       8       5       2       4       2       4       2 
## IgA-138 IgA-186 IgA-166 IgA-174 IgA-154 IgA-177 IgA-180 IgA-146 IgA-156 
##       2       1       1       1       2       1       6       4       2 
## IgA-140 IgA-145 
##       9       2
cutree(hc.complete,9)  %>% 
  table()
## .
##  1  2  3  4  5  6  7  8  9 
## 13 11  1  9  5  5  1  1  1

get cluster subgroup and create group

cutree(hc.complete,9) 
## IgA-181 IgA-161 IgA-185 IgA-160 IgA-162 IgA-147 IgA-179 IgA-158 IgA-178 
##       1       2       1       2       3       2       1       4       1 
## IgA-167 IgA-149 IgA-153 IgA-175 IgA-152 IgA-163 IgA-169 IgA-187 IgA-184 
##       1       4       4       1       2       5       6       6       1 
## IgA-165 IgA-183 IgA-173 IgA-182 IgA-172 IgA-143 IgA-155 IgA-164 IgA-176 
##       6       5       6       1       1       4       4       7       5 
## IgA-171 IgA-159 IgA-168 IgA-170 IgA-150 IgA-142 IgA-148 IgA-157 IgA-144 
##       5       4       8       5       2       4       2       4       2 
## IgA-138 IgA-186 IgA-166 IgA-174 IgA-154 IgA-177 IgA-180 IgA-146 IgA-156 
##       2       1       1       1       2       1       6       4       2 
## IgA-140 IgA-145 
##       9       2
cutree(hc.complete,9)  %>% 
  table()
## .
##  1  2  3  4  5  6  7  8  9 
## 13 11  1  9  5  5  1  1  1
group$tree <- as.character(cutree(hc.complete,9))
#Experiment design
dds <- DESeq2::DESeqDataSetFromMatrix(countData = countmatrix, colData = group, design = ~tree)

#filter
nrow(dds)
## [1] 25359
dds <- dds[rowSums(DESeq2::counts(dds)) > 1, ]
nrow(dds)
## [1] 22205

DEG group name

table(group$tree)
## 
##  1  2  3  4  5  6  7  8  9 
## 13 11  1  9  5  5  1  1  1
#2-11-A
#1-13-B
#4-9-C
group$tree[which(group$tree=="2")] <- "A"
group$tree[which(group$tree=="1")] <- "B"
group$tree[which(group$tree=="4")] <- "C"

DEG B-vs-A

dds

group_BvsA <-
  group %>%
  filter(tree %in% c("A", "B")) 

countmatix_BvsA <- countmatrix[,which(colnames(countmatrix) %in% group_BvsA$id)]


#experiment design

dds <- DESeqDataSetFromMatrix(countData = countmatix_BvsA, colData = group_BvsA, 
            design = ~tree)
comparison <- "B-vs-A"# 处理组比上对照组
dds$tree <- factor(dds$tree, levels = rev(strsplit(comparison, "-vs-", fixed = TRUE)[[1]]))

nrow(dds)
## [1] 25359
dds <- dds[rowSums(DESeq2::counts(dds)) > 1, ]
nrow(dds)
## [1] 21767

DESeq

n.cores <- 4
library(BiocParallel)
register(MulticoreParam(n.cores))
dds <- DESeq(dds, parallel = TRUE)
library(dplyr)
res.05 <- results(dds, contrast=c("tree","B","A"), alpha = 0.05)
summary(res.05)
## 
## out of 21766 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)     : 5662, 26% 
## LFC < 0 (down)   : 5824, 27% 
## outliers [1]     : 0, 0% 
## low counts [2]   : 2111, 9.7% 
## (mean count < 0)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
res <- results(dds)

annotation

library("AnnotationDbi")
library(org.Hs.eg.db)
columns(org.Hs.eg.db)
##  [1] "ACCNUM"       "ALIAS"        "ENSEMBL"      "ENSEMBLPROT" 
##  [5] "ENSEMBLTRANS" "ENTREZID"     "ENZYME"       "EVIDENCE"    
##  [9] "EVIDENCEALL"  "GENENAME"     "GO"           "GOALL"       
## [13] "IPI"          "MAP"          "OMIM"         "ONTOLOGY"    
## [17] "ONTOLOGYALL"  "PATH"         "PFAM"         "PMID"        
## [21] "PROSITE"      "REFSEQ"       "SYMBOL"       "UCSCKG"      
## [25] "UNIGENE"      "UNIPROT"
#tolower() toupper()
res.05$symbol <- rownames(res.05)
res.05$ensembl <- mapIds(org.Hs.eg.db,
                         keys=row.names(res.05),
                         column="ENSEMBL",
                         keytype="SYMBOL",
                         multiVals="first")
res.05$entrezid <- mapIds(org.Hs.eg.db,
                          keys=row.names(res.05),
                          column="ENTREZID",
                          keytype="SYMBOL",
                          multiVals="first")

report

library(dplyr)
dir.create("./table/DEG_report/IgA_BvsA",recursive = TRUE)
DEG <- as.data.frame(res.05)

DEG %<>% 
  dplyr::mutate(abs_log2FC = abs(log2FoldChange)) %>% 
  dplyr::mutate(FoldChange = 2^(log2FoldChange))  %>% 
  dplyr::mutate(trend = if_else(log2FoldChange <= 0,"down","up"))
write.csv(DEG, file="./table/DEG_report/IgA_BvsA/results_DEG.csv")
library("ReportingTools")
report <- dplyr::select(DEG, symbol,abs_log2FC,trend,FoldChange,log2FoldChange,pvalue, padj, entrezid) %>% 
          dplyr::arrange(abs_log2FC)
htmlRep <- HTMLReport(shortName="DEG_list", title="ZERO report",
                      reportDirectory="./table/DEG_report/IgA_BvsA/")
publish(report, htmlRep)
url <- finish(htmlRep)

enrichr combine DEG 500: padj < 0.01 |log2FC| > 1

library(dplyr)
report %>% 
  dplyr::filter(padj <= 0.01) %>% 
  dplyr::filter(abs_log2FC >= 1) %>% 
  dplyr::arrange(abs_log2FC) %>% 
  .[1:500,] %>% 
  .[["trend"]] %>% 
  table()
## .
## down   up 
##  191  309

all DEG

library(dplyr)
report %>% 
  dplyr::filter(padj <= 0.01) %>% 
  dplyr::filter(abs_log2FC >= 1) %>% 
  dplyr::arrange(abs_log2FC) %>% 
  dplyr::select(symbol) %>% 
  .[[1]] %>% 
  clipr::write_clip()

front 500

report %>% 
  dplyr::filter(padj <= 0.01) %>% 
  dplyr::filter(abs_log2FC >= 1) %>% 
  dplyr::arrange(abs_log2FC) %>% 
  dplyr::select(symbol) %>% 
  .[[1]] %>% .[1:500] %>% 
  clipr::write_clip()

boxplot some gene

topGene="ACE"
intgroup <- c("tree")
geneCounts <- plotCounts(dds, gene = topGene, intgroup = intgroup,
                         returnData = TRUE)
library(ggpubr)
library(plotly)
my_comparisons <- c("A", "B")
zero_boxplot <- function(dot_df){
  p <-  ggboxplot(dot_df, x = "tree", y = "count",
                color = "tree", palette =c("#00AFBB", "#FC4E07"),
                add = "jitter")
  p$labels$x <- dot_df$symbol[1]
  p$labels$y <- "Normalized Count"

  p + stat_compare_means(comparisons = my_comparisons)+ # Add pairwise comparisons p-value
  stat_compare_means(label.y = max(dot_df$count)+1)                   # Add global p-value

}

zero_boxplot(geneCounts)

DEG C-vs-A

dds

group_CvsA <-
  group %>%
  filter(tree %in% c("A", "C")) 

countmatix_CvsA <- countmatrix[,which(colnames(countmatrix) %in% group_CvsA$id)]


#experiment design

dds <- DESeqDataSetFromMatrix(countData = countmatix_CvsA, colData = group_CvsA, 
            design = ~tree)
comparison <- "C-vs-A"# 处理组比上对照组
dds$tree <- factor(dds$tree, levels = rev(strsplit(comparison, "-vs-", fixed = TRUE)[[1]]))

nrow(dds)
## [1] 25359
dds <- dds[rowSums(DESeq2::counts(dds)) > 1, ]
nrow(dds)
## [1] 21582

DESeq

n.cores <- 4
library(BiocParallel)
register(MulticoreParam(n.cores))
dds <- DESeq(dds, parallel = TRUE)
library(dplyr)
res.05 <- results(dds, contrast=c("tree","C","A"), alpha = 0.05)
summary(res.05)
## 
## out of 21581 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)     : 1855, 8.6% 
## LFC < 0 (down)   : 1605, 7.4% 
## outliers [1]     : 0, 0% 
## low counts [2]   : 3349, 16% 
## (mean count < 1)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
res <- results(dds)

annotation

library("AnnotationDbi")
library(org.Hs.eg.db)
columns(org.Hs.eg.db)
##  [1] "ACCNUM"       "ALIAS"        "ENSEMBL"      "ENSEMBLPROT" 
##  [5] "ENSEMBLTRANS" "ENTREZID"     "ENZYME"       "EVIDENCE"    
##  [9] "EVIDENCEALL"  "GENENAME"     "GO"           "GOALL"       
## [13] "IPI"          "MAP"          "OMIM"         "ONTOLOGY"    
## [17] "ONTOLOGYALL"  "PATH"         "PFAM"         "PMID"        
## [21] "PROSITE"      "REFSEQ"       "SYMBOL"       "UCSCKG"      
## [25] "UNIGENE"      "UNIPROT"
#tolower() toupper()
res.05$symbol <- rownames(res.05)
res.05$ensembl <- mapIds(org.Hs.eg.db,
                         keys=row.names(res.05),
                         column="ENSEMBL",
                         keytype="SYMBOL",
                         multiVals="first")
res.05$entrezid <- mapIds(org.Hs.eg.db,
                          keys=row.names(res.05),
                          column="ENTREZID",
                          keytype="SYMBOL",
                          multiVals="first")

report

library(dplyr)
dir.create("./table/DEG_report/IgA_CvsA",recursive = TRUE)
DEG <- as.data.frame(res.05)

DEG %<>% 
  dplyr::mutate(abs_log2FC = abs(log2FoldChange)) %>% 
  dplyr::mutate(FoldChange = 2^(log2FoldChange))  %>% 
  dplyr::mutate(trend = if_else(log2FoldChange <= 0,"down","up"))
write.csv(DEG, file="./table/DEG_report/IgA_CvsA/results_DEG.csv")
library("ReportingTools")
report <- dplyr::select(DEG, symbol,abs_log2FC,trend,FoldChange,log2FoldChange,pvalue, padj, entrezid) %>% 
          dplyr::arrange(abs_log2FC)
htmlRep <- HTMLReport(shortName="DEG_list", title="ZERO report",
                      reportDirectory="./table/DEG_report/IgA_CvsA/")
publish(report, htmlRep)
url <- finish(htmlRep)

enrichr combine DEG 500: padj < 0.01 |log2FC| > 1

library(dplyr)
report %>% 
  dplyr::filter(padj <= 0.01) %>% 
  dplyr::filter(abs_log2FC >= 1) %>% 
  .[["trend"]] %>% 
  table()
## .
## down   up 
##  311  429

all DEG

library(dplyr)
report %>% 
  dplyr::filter(padj <= 0.01) %>% 
  dplyr::filter(abs_log2FC >= 1) %>% 
  dplyr::arrange(abs_log2FC) %>% 
  dplyr::select(symbol) %>% 
  .[[1]] %>% 
  clipr::write_clip()

front 500

report %>% 
  dplyr::filter(padj <= 0.01) %>% 
  dplyr::filter(abs_log2FC >= 1) %>% 
  dplyr::arrange(abs_log2FC) %>% 
  dplyr::select(symbol) %>% 
  .[[1]] %>% .[1:500] %>% 
  clipr::write_clip()

boxplot some gene

topGene="ACE"
intgroup <- c("tree")
geneCounts <- plotCounts(dds, gene = topGene, intgroup = intgroup,
                         returnData = TRUE)
library(ggpubr)
library(plotly)
my_comparisons <- c("A", "C")
zero_boxplot <- function(dot_df){
  p <-  ggboxplot(dot_df, x = "tree", y = "count",
                color = "tree", palette =c("#00AFBB", "#FC4E07"),
                add = "jitter")
  p$labels$x <- dot_df$symbol[1]
  p$labels$y <- "Normalized Count"

  p + stat_compare_means(comparisons = my_comparisons)+ # Add pairwise comparisons p-value
  stat_compare_means(label.y = max(dot_df$count)+1)                   # Add global p-value

}

zero_boxplot(geneCounts)

DEG B-vs-C

dds

group_BvsC <-
  group %>%
  filter(tree %in% c("C", "B")) 

countmatix_BvsC <- countmatrix[,which(colnames(countmatrix) %in% group_BvsC$id)]


#experiment design

dds <- DESeqDataSetFromMatrix(countData = countmatix_BvsC, colData = group_BvsC, 
            design = ~tree)
comparison <- "B-vs-C"# 处理组比上对照组
dds$tree <- factor(dds$tree, levels = rev(strsplit(comparison, "-vs-", fixed = TRUE)[[1]]))

nrow(dds)
## [1] 25359
dds <- dds[rowSums(DESeq2::counts(dds)) > 1, ]
nrow(dds)
## [1] 21677

DESeq

n.cores <- 4
library(BiocParallel)
register(MulticoreParam(n.cores))
dds <- DESeq(dds, parallel = TRUE)
library(dplyr)
res.05 <- results(dds, contrast=c("tree","B","C"), alpha = 0.05)
summary(res.05)
## 
## out of 21676 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)     : 4423, 20% 
## LFC < 0 (down)   : 4507, 21% 
## outliers [1]     : 0, 0% 
## low counts [2]   : 2943, 14% 
## (mean count < 1)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
res <- results(dds)

annotation

library("AnnotationDbi")
library(org.Hs.eg.db)
columns(org.Hs.eg.db)
##  [1] "ACCNUM"       "ALIAS"        "ENSEMBL"      "ENSEMBLPROT" 
##  [5] "ENSEMBLTRANS" "ENTREZID"     "ENZYME"       "EVIDENCE"    
##  [9] "EVIDENCEALL"  "GENENAME"     "GO"           "GOALL"       
## [13] "IPI"          "MAP"          "OMIM"         "ONTOLOGY"    
## [17] "ONTOLOGYALL"  "PATH"         "PFAM"         "PMID"        
## [21] "PROSITE"      "REFSEQ"       "SYMBOL"       "UCSCKG"      
## [25] "UNIGENE"      "UNIPROT"
#tolower() toupper()
res.05$symbol <- rownames(res.05)
res.05$ensembl <- mapIds(org.Hs.eg.db,
                         keys=row.names(res.05),
                         column="ENSEMBL",
                         keytype="SYMBOL",
                         multiVals="first")
res.05$entrezid <- mapIds(org.Hs.eg.db,
                          keys=row.names(res.05),
                          column="ENTREZID",
                          keytype="SYMBOL",
                          multiVals="first")

report

library(dplyr)
dir.create("./table/DEG_report/IgA_BvsC",recursive = TRUE)
DEG <- as.data.frame(res.05)

DEG %<>% 
  dplyr::mutate(abs_log2FC = abs(log2FoldChange)) %>% 
  dplyr::mutate(FoldChange = 2^(log2FoldChange))  %>% 
  dplyr::mutate(trend = if_else(log2FoldChange <= 0,"down","up"))
write.csv(DEG, file="./table/DEG_report/IgA_BvsC/results_DEG.csv")
library("ReportingTools")
report <- dplyr::select(DEG, symbol,abs_log2FC,trend,FoldChange,log2FoldChange,pvalue, padj, entrezid) %>% 
          dplyr::arrange(abs_log2FC)
htmlRep <- HTMLReport(shortName="DEG_list", title="ZERO report",
                      reportDirectory="./table/DEG_report/IgA_BvsC/")
publish(report, htmlRep)
url <- finish(htmlRep)

enrichr combine DEG 500: padj < 0.01 |log2FC| > 1

library(dplyr)
report %>% 
  dplyr::filter(padj <= 0.01) %>% 
  dplyr::filter(abs_log2FC >= 1) %>% 
  #dplyr::arrange(abs_log2FC) %>% 
  #.[1:500,] %>% 
  .[["trend"]] %>% 
  table()
## .
## down   up 
##   83  773

all DEG

library(dplyr)
report %>% 
  dplyr::filter(padj <= 0.01) %>% 
  dplyr::filter(abs_log2FC >= 1) %>% 
  dplyr::arrange(abs_log2FC) %>% 
  dplyr::select(symbol) %>% 
  .[[1]] %>% 
  clipr::write_clip()

front 500

report %>% 
  dplyr::filter(padj <= 0.01) %>% 
  dplyr::filter(abs_log2FC >= 1) %>% 
  dplyr::arrange(abs_log2FC) %>% 
  dplyr::select(symbol) %>% 
  .[[1]] %>% .[1:500] %>% 
  clipr::write_clip()

boxplot some gene

topGene="ACE"
intgroup <- c("tree")
geneCounts <- plotCounts(dds, gene = topGene, intgroup = intgroup,
                         returnData = TRUE)
library(ggpubr)
library(plotly)
my_comparisons <- c("C", "B")
zero_boxplot <- function(dot_df){
  p <-  ggboxplot(dot_df, x = "tree", y = "count",
                color = "tree", palette =c("#00AFBB", "#FC4E07"),
                add = "jitter")
  p$labels$x <- dot_df$symbol[1]
  p$labels$y <- "Normalized Count"

  p + stat_compare_means(comparisons = my_comparisons)+ # Add pairwise comparisons p-value
  stat_compare_means(label.y = max(dot_df$count)+1)                   # Add global p-value

}

zero_boxplot(geneCounts)