knitr::opts_chunk$set(
echo = TRUE,#show code and output
message = FALSE,
warning = FALSE,
engine.path="/Users/zero/anaconda3/bin/python"
)
Load packages:
library(tidyverse)
library(rlang)
library(rvest)
library(GDCRNATools)
source("./scripts/get_group.R")
read data and clean name
star_count <- read_delim("data/star_count.txt",
"\t", escape_double = FALSE, trim_ws = TRUE)
star_count <- set_names(star_count,~ str_replace(.x, "_gene",""))
countmatrix <-
star_count %>%
dplyr::select(group$Sample_ID) %>% as.matrix()
id <- str_sub(colnames(countmatrix),nchar("18-R-1"),nchar("18-R-1")+2) %>%
str_c("IgA", "-", .)
group$id <- id
rownames(countmatrix) <- star_count$gene_name
colnames(countmatrix) <- id
cluster with rlog
#Experiment design
dds <- DESeq2::DESeqDataSetFromMatrix(countData = countmatrix, colData = group, design = ~group)
#filter
nrow(dds)
## [1] 25359
dds <- dds[rowSums(DESeq2::counts(dds)) > 1, ]
nrow(dds)
## [1] 22205
#normalization
library(DESeq2)
#rld
load("data/IgA_rlog_rld.Rdata")
vsd <- vst(dds, blind = FALSE)
dds <- estimateSizeFactors(dds)
df <- bind_rows(
as_data_frame(log2(counts(dds, normalized=TRUE)[, 1:2]+1)) %>%
mutate(transformation = "log2(x + 1)"),
as_data_frame(assay(rld)[, 1:2]) %>% mutate(transformation = "rlog"),
as_data_frame(assay(vsd)[, 1:2]) %>% mutate(transformation = "vst"))
colnames(df)[1:2] <- c("x", "y")
ggplot(df, aes(x = x, y = y)) + geom_hex(bins = 80) +
coord_fixed() + facet_grid( . ~ transformation)

sampleDists <- dist(t(assay(rld)))
library("pheatmap")
library("RColorBrewer")
sampleDistMatrix <- as.matrix( sampleDists )
rownames(sampleDistMatrix) <- colnames(dds)
colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix,
clustering_method = "complete",#complete, average, single
clustering_distance_rows = sampleDists,
clustering_distance_cols = sampleDists,
col = colors)

cluster with GDC normalization
cut tree
hc.complete <- hclust(sampleDists, method = "complete")
hc.average <- hclust(sampleDists, method = "average")
hc.single <- hclust(sampleDists, method = "single")
par(mfrow=c(1,1))
plot(hc.complete)
abline(h=65,col="red")

cutree(hc.complete,9)
## IgA-181 IgA-161 IgA-185 IgA-160 IgA-162 IgA-147 IgA-179 IgA-158 IgA-178
## 1 2 1 2 3 2 1 4 1
## IgA-167 IgA-149 IgA-153 IgA-175 IgA-152 IgA-163 IgA-169 IgA-187 IgA-184
## 1 4 4 1 2 5 6 6 1
## IgA-165 IgA-183 IgA-173 IgA-182 IgA-172 IgA-143 IgA-155 IgA-164 IgA-176
## 6 5 6 1 1 4 4 7 5
## IgA-171 IgA-159 IgA-168 IgA-170 IgA-150 IgA-142 IgA-148 IgA-157 IgA-144
## 5 4 8 5 2 4 2 4 2
## IgA-138 IgA-186 IgA-166 IgA-174 IgA-154 IgA-177 IgA-180 IgA-146 IgA-156
## 2 1 1 1 2 1 6 4 2
## IgA-140 IgA-145
## 9 2
cutree(hc.complete,9) %>%
table()
## .
## 1 2 3 4 5 6 7 8 9
## 13 11 1 9 5 5 1 1 1
get cluster subgroup and create group
cutree(hc.complete,9)
## IgA-181 IgA-161 IgA-185 IgA-160 IgA-162 IgA-147 IgA-179 IgA-158 IgA-178
## 1 2 1 2 3 2 1 4 1
## IgA-167 IgA-149 IgA-153 IgA-175 IgA-152 IgA-163 IgA-169 IgA-187 IgA-184
## 1 4 4 1 2 5 6 6 1
## IgA-165 IgA-183 IgA-173 IgA-182 IgA-172 IgA-143 IgA-155 IgA-164 IgA-176
## 6 5 6 1 1 4 4 7 5
## IgA-171 IgA-159 IgA-168 IgA-170 IgA-150 IgA-142 IgA-148 IgA-157 IgA-144
## 5 4 8 5 2 4 2 4 2
## IgA-138 IgA-186 IgA-166 IgA-174 IgA-154 IgA-177 IgA-180 IgA-146 IgA-156
## 2 1 1 1 2 1 6 4 2
## IgA-140 IgA-145
## 9 2
cutree(hc.complete,9) %>%
table()
## .
## 1 2 3 4 5 6 7 8 9
## 13 11 1 9 5 5 1 1 1
group$tree <- as.character(cutree(hc.complete,9))
#Experiment design
dds <- DESeq2::DESeqDataSetFromMatrix(countData = countmatrix, colData = group, design = ~tree)
#filter
nrow(dds)
## [1] 25359
dds <- dds[rowSums(DESeq2::counts(dds)) > 1, ]
nrow(dds)
## [1] 22205
DEG group name
table(group$tree)
##
## 1 2 3 4 5 6 7 8 9
## 13 11 1 9 5 5 1 1 1
#2-11-A
#1-13-B
#4-9-C
group$tree[which(group$tree=="2")] <- "A"
group$tree[which(group$tree=="1")] <- "B"
group$tree[which(group$tree=="4")] <- "C"
DEG B-vs-A
dds
group_BvsA <-
group %>%
filter(tree %in% c("A", "B"))
countmatix_BvsA <- countmatrix[,which(colnames(countmatrix) %in% group_BvsA$id)]
#experiment design
dds <- DESeqDataSetFromMatrix(countData = countmatix_BvsA, colData = group_BvsA,
design = ~tree)
comparison <- "B-vs-A"# 处理组比上对照组
dds$tree <- factor(dds$tree, levels = rev(strsplit(comparison, "-vs-", fixed = TRUE)[[1]]))
nrow(dds)
## [1] 25359
dds <- dds[rowSums(DESeq2::counts(dds)) > 1, ]
nrow(dds)
## [1] 21767
DESeq
n.cores <- 4
library(BiocParallel)
register(MulticoreParam(n.cores))
dds <- DESeq(dds, parallel = TRUE)
library(dplyr)
res.05 <- results(dds, contrast=c("tree","B","A"), alpha = 0.05)
summary(res.05)
##
## out of 21766 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 5662, 26%
## LFC < 0 (down) : 5824, 27%
## outliers [1] : 0, 0%
## low counts [2] : 2111, 9.7%
## (mean count < 0)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
res <- results(dds)
annotation
library("AnnotationDbi")
library(org.Hs.eg.db)
columns(org.Hs.eg.db)
## [1] "ACCNUM" "ALIAS" "ENSEMBL" "ENSEMBLPROT"
## [5] "ENSEMBLTRANS" "ENTREZID" "ENZYME" "EVIDENCE"
## [9] "EVIDENCEALL" "GENENAME" "GO" "GOALL"
## [13] "IPI" "MAP" "OMIM" "ONTOLOGY"
## [17] "ONTOLOGYALL" "PATH" "PFAM" "PMID"
## [21] "PROSITE" "REFSEQ" "SYMBOL" "UCSCKG"
## [25] "UNIGENE" "UNIPROT"
#tolower() toupper()
res.05$symbol <- rownames(res.05)
res.05$ensembl <- mapIds(org.Hs.eg.db,
keys=row.names(res.05),
column="ENSEMBL",
keytype="SYMBOL",
multiVals="first")
res.05$entrezid <- mapIds(org.Hs.eg.db,
keys=row.names(res.05),
column="ENTREZID",
keytype="SYMBOL",
multiVals="first")
report
library(dplyr)
dir.create("./table/DEG_report/IgA_BvsA",recursive = TRUE)
DEG <- as.data.frame(res.05)
DEG %<>%
dplyr::mutate(abs_log2FC = abs(log2FoldChange)) %>%
dplyr::mutate(FoldChange = 2^(log2FoldChange)) %>%
dplyr::mutate(trend = if_else(log2FoldChange <= 0,"down","up"))
write.csv(DEG, file="./table/DEG_report/IgA_BvsA/results_DEG.csv")
library("ReportingTools")
report <- dplyr::select(DEG, symbol,abs_log2FC,trend,FoldChange,log2FoldChange,pvalue, padj, entrezid) %>%
dplyr::arrange(abs_log2FC)
htmlRep <- HTMLReport(shortName="DEG_list", title="ZERO report",
reportDirectory="./table/DEG_report/IgA_BvsA/")
publish(report, htmlRep)
url <- finish(htmlRep)
enrichr combine DEG 500: padj < 0.01 |log2FC| > 1
library(dplyr)
report %>%
dplyr::filter(padj <= 0.01) %>%
dplyr::filter(abs_log2FC >= 1) %>%
dplyr::arrange(abs_log2FC) %>%
.[1:500,] %>%
.[["trend"]] %>%
table()
## .
## down up
## 191 309
all DEG
library(dplyr)
report %>%
dplyr::filter(padj <= 0.01) %>%
dplyr::filter(abs_log2FC >= 1) %>%
dplyr::arrange(abs_log2FC) %>%
dplyr::select(symbol) %>%
.[[1]] %>%
clipr::write_clip()
front 500
report %>%
dplyr::filter(padj <= 0.01) %>%
dplyr::filter(abs_log2FC >= 1) %>%
dplyr::arrange(abs_log2FC) %>%
dplyr::select(symbol) %>%
.[[1]] %>% .[1:500] %>%
clipr::write_clip()
boxplot some gene
topGene="ACE"
intgroup <- c("tree")
geneCounts <- plotCounts(dds, gene = topGene, intgroup = intgroup,
returnData = TRUE)
library(ggpubr)
library(plotly)
my_comparisons <- c("A", "B")
zero_boxplot <- function(dot_df){
p <- ggboxplot(dot_df, x = "tree", y = "count",
color = "tree", palette =c("#00AFBB", "#FC4E07"),
add = "jitter")
p$labels$x <- dot_df$symbol[1]
p$labels$y <- "Normalized Count"
p + stat_compare_means(comparisons = my_comparisons)+ # Add pairwise comparisons p-value
stat_compare_means(label.y = max(dot_df$count)+1) # Add global p-value
}
zero_boxplot(geneCounts)

DEG C-vs-A
dds
group_CvsA <-
group %>%
filter(tree %in% c("A", "C"))
countmatix_CvsA <- countmatrix[,which(colnames(countmatrix) %in% group_CvsA$id)]
#experiment design
dds <- DESeqDataSetFromMatrix(countData = countmatix_CvsA, colData = group_CvsA,
design = ~tree)
comparison <- "C-vs-A"# 处理组比上对照组
dds$tree <- factor(dds$tree, levels = rev(strsplit(comparison, "-vs-", fixed = TRUE)[[1]]))
nrow(dds)
## [1] 25359
dds <- dds[rowSums(DESeq2::counts(dds)) > 1, ]
nrow(dds)
## [1] 21582
DESeq
n.cores <- 4
library(BiocParallel)
register(MulticoreParam(n.cores))
dds <- DESeq(dds, parallel = TRUE)
library(dplyr)
res.05 <- results(dds, contrast=c("tree","C","A"), alpha = 0.05)
summary(res.05)
##
## out of 21581 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 1855, 8.6%
## LFC < 0 (down) : 1605, 7.4%
## outliers [1] : 0, 0%
## low counts [2] : 3349, 16%
## (mean count < 1)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
res <- results(dds)
annotation
library("AnnotationDbi")
library(org.Hs.eg.db)
columns(org.Hs.eg.db)
## [1] "ACCNUM" "ALIAS" "ENSEMBL" "ENSEMBLPROT"
## [5] "ENSEMBLTRANS" "ENTREZID" "ENZYME" "EVIDENCE"
## [9] "EVIDENCEALL" "GENENAME" "GO" "GOALL"
## [13] "IPI" "MAP" "OMIM" "ONTOLOGY"
## [17] "ONTOLOGYALL" "PATH" "PFAM" "PMID"
## [21] "PROSITE" "REFSEQ" "SYMBOL" "UCSCKG"
## [25] "UNIGENE" "UNIPROT"
#tolower() toupper()
res.05$symbol <- rownames(res.05)
res.05$ensembl <- mapIds(org.Hs.eg.db,
keys=row.names(res.05),
column="ENSEMBL",
keytype="SYMBOL",
multiVals="first")
res.05$entrezid <- mapIds(org.Hs.eg.db,
keys=row.names(res.05),
column="ENTREZID",
keytype="SYMBOL",
multiVals="first")
report
library(dplyr)
dir.create("./table/DEG_report/IgA_CvsA",recursive = TRUE)
DEG <- as.data.frame(res.05)
DEG %<>%
dplyr::mutate(abs_log2FC = abs(log2FoldChange)) %>%
dplyr::mutate(FoldChange = 2^(log2FoldChange)) %>%
dplyr::mutate(trend = if_else(log2FoldChange <= 0,"down","up"))
write.csv(DEG, file="./table/DEG_report/IgA_CvsA/results_DEG.csv")
library("ReportingTools")
report <- dplyr::select(DEG, symbol,abs_log2FC,trend,FoldChange,log2FoldChange,pvalue, padj, entrezid) %>%
dplyr::arrange(abs_log2FC)
htmlRep <- HTMLReport(shortName="DEG_list", title="ZERO report",
reportDirectory="./table/DEG_report/IgA_CvsA/")
publish(report, htmlRep)
url <- finish(htmlRep)
enrichr combine DEG 500: padj < 0.01 |log2FC| > 1
library(dplyr)
report %>%
dplyr::filter(padj <= 0.01) %>%
dplyr::filter(abs_log2FC >= 1) %>%
.[["trend"]] %>%
table()
## .
## down up
## 311 429
all DEG
library(dplyr)
report %>%
dplyr::filter(padj <= 0.01) %>%
dplyr::filter(abs_log2FC >= 1) %>%
dplyr::arrange(abs_log2FC) %>%
dplyr::select(symbol) %>%
.[[1]] %>%
clipr::write_clip()
front 500
report %>%
dplyr::filter(padj <= 0.01) %>%
dplyr::filter(abs_log2FC >= 1) %>%
dplyr::arrange(abs_log2FC) %>%
dplyr::select(symbol) %>%
.[[1]] %>% .[1:500] %>%
clipr::write_clip()
boxplot some gene
topGene="ACE"
intgroup <- c("tree")
geneCounts <- plotCounts(dds, gene = topGene, intgroup = intgroup,
returnData = TRUE)
library(ggpubr)
library(plotly)
my_comparisons <- c("A", "C")
zero_boxplot <- function(dot_df){
p <- ggboxplot(dot_df, x = "tree", y = "count",
color = "tree", palette =c("#00AFBB", "#FC4E07"),
add = "jitter")
p$labels$x <- dot_df$symbol[1]
p$labels$y <- "Normalized Count"
p + stat_compare_means(comparisons = my_comparisons)+ # Add pairwise comparisons p-value
stat_compare_means(label.y = max(dot_df$count)+1) # Add global p-value
}
zero_boxplot(geneCounts)

DEG B-vs-C
dds
group_BvsC <-
group %>%
filter(tree %in% c("C", "B"))
countmatix_BvsC <- countmatrix[,which(colnames(countmatrix) %in% group_BvsC$id)]
#experiment design
dds <- DESeqDataSetFromMatrix(countData = countmatix_BvsC, colData = group_BvsC,
design = ~tree)
comparison <- "B-vs-C"# 处理组比上对照组
dds$tree <- factor(dds$tree, levels = rev(strsplit(comparison, "-vs-", fixed = TRUE)[[1]]))
nrow(dds)
## [1] 25359
dds <- dds[rowSums(DESeq2::counts(dds)) > 1, ]
nrow(dds)
## [1] 21677
DESeq
n.cores <- 4
library(BiocParallel)
register(MulticoreParam(n.cores))
dds <- DESeq(dds, parallel = TRUE)
library(dplyr)
res.05 <- results(dds, contrast=c("tree","B","C"), alpha = 0.05)
summary(res.05)
##
## out of 21676 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 4423, 20%
## LFC < 0 (down) : 4507, 21%
## outliers [1] : 0, 0%
## low counts [2] : 2943, 14%
## (mean count < 1)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
res <- results(dds)
annotation
library("AnnotationDbi")
library(org.Hs.eg.db)
columns(org.Hs.eg.db)
## [1] "ACCNUM" "ALIAS" "ENSEMBL" "ENSEMBLPROT"
## [5] "ENSEMBLTRANS" "ENTREZID" "ENZYME" "EVIDENCE"
## [9] "EVIDENCEALL" "GENENAME" "GO" "GOALL"
## [13] "IPI" "MAP" "OMIM" "ONTOLOGY"
## [17] "ONTOLOGYALL" "PATH" "PFAM" "PMID"
## [21] "PROSITE" "REFSEQ" "SYMBOL" "UCSCKG"
## [25] "UNIGENE" "UNIPROT"
#tolower() toupper()
res.05$symbol <- rownames(res.05)
res.05$ensembl <- mapIds(org.Hs.eg.db,
keys=row.names(res.05),
column="ENSEMBL",
keytype="SYMBOL",
multiVals="first")
res.05$entrezid <- mapIds(org.Hs.eg.db,
keys=row.names(res.05),
column="ENTREZID",
keytype="SYMBOL",
multiVals="first")
report
library(dplyr)
dir.create("./table/DEG_report/IgA_BvsC",recursive = TRUE)
DEG <- as.data.frame(res.05)
DEG %<>%
dplyr::mutate(abs_log2FC = abs(log2FoldChange)) %>%
dplyr::mutate(FoldChange = 2^(log2FoldChange)) %>%
dplyr::mutate(trend = if_else(log2FoldChange <= 0,"down","up"))
write.csv(DEG, file="./table/DEG_report/IgA_BvsC/results_DEG.csv")
library("ReportingTools")
report <- dplyr::select(DEG, symbol,abs_log2FC,trend,FoldChange,log2FoldChange,pvalue, padj, entrezid) %>%
dplyr::arrange(abs_log2FC)
htmlRep <- HTMLReport(shortName="DEG_list", title="ZERO report",
reportDirectory="./table/DEG_report/IgA_BvsC/")
publish(report, htmlRep)
url <- finish(htmlRep)
enrichr combine DEG 500: padj < 0.01 |log2FC| > 1
library(dplyr)
report %>%
dplyr::filter(padj <= 0.01) %>%
dplyr::filter(abs_log2FC >= 1) %>%
#dplyr::arrange(abs_log2FC) %>%
#.[1:500,] %>%
.[["trend"]] %>%
table()
## .
## down up
## 83 773
all DEG
library(dplyr)
report %>%
dplyr::filter(padj <= 0.01) %>%
dplyr::filter(abs_log2FC >= 1) %>%
dplyr::arrange(abs_log2FC) %>%
dplyr::select(symbol) %>%
.[[1]] %>%
clipr::write_clip()
front 500
report %>%
dplyr::filter(padj <= 0.01) %>%
dplyr::filter(abs_log2FC >= 1) %>%
dplyr::arrange(abs_log2FC) %>%
dplyr::select(symbol) %>%
.[[1]] %>% .[1:500] %>%
clipr::write_clip()
boxplot some gene
topGene="ACE"
intgroup <- c("tree")
geneCounts <- plotCounts(dds, gene = topGene, intgroup = intgroup,
returnData = TRUE)
library(ggpubr)
library(plotly)
my_comparisons <- c("C", "B")
zero_boxplot <- function(dot_df){
p <- ggboxplot(dot_df, x = "tree", y = "count",
color = "tree", palette =c("#00AFBB", "#FC4E07"),
add = "jitter")
p$labels$x <- dot_df$symbol[1]
p$labels$y <- "Normalized Count"
p + stat_compare_means(comparisons = my_comparisons)+ # Add pairwise comparisons p-value
stat_compare_means(label.y = max(dot_df$count)+1) # Add global p-value
}
zero_boxplot(geneCounts)
