# IF 31.373
# 本研究利用小鼠肠道内皮淋巴细胞(IELs)探讨了GLP-1受体激动剂exendin-4对免疫细胞功能的潜在影响。
library(limma)
library(Glimma)
library(edgeR)
library(Mus.musculus)
library(DT)
# 读取基因表达计数数据
counts <- read.table("GSE202453_count_table.csv.gz", sep=",", row.names = 1, header=T)
head(counts) # 查看计数数据的前几行
## X11_Ex4 X17_PBS X18_Ex4 X19_PBS X3_PBS X4_Ex4 X5_PBS X6_Ex4
## Zc3h14 3024 1717 941 1517 2091 1762 2601 2416
## Troap 47 40 30 17 71 70 20 103
## Clca2 0 4 0 0 0 3 6 8
## 1810013L24Rik 2659 1695 1399 1755 1999 1819 1814 1981
## Srsf7 11196 6475 4189 6864 5515 7741 7815 8631
## Fignl2 15 32 71 40 44 30 27 48
# 读取样本信息
sample_info <- read.table("GSE202453_series_matrix.txt", sep="\t", header=F, skip = 25, fill=T)
sn <- sample_info[sample_info$V1=="!Sample_description",] # 样本描述信息
sp <- sample_info[sample_info$V1=="!Sample_characteristics_ch1",][2,] # 样本特征信息
# 构建样本信息数据框
sample_info <- data.frame(SampleID=paste0("X", unlist(sn)[-1]), group=sub("treatment: ","", unlist(sp)[-1]))
rownames(sample_info) <- sample_info$SampleID
# 匹配计数数据和样本信息
sample_info <- sample_info[match(colnames(counts), rownames(sample_info)),]
sample_info$group <- as.factor(sub("Anti-CD3 + ", "", sample_info$group, fixed = T)) # 转换为因子
sample_info # 查看样本信息
## SampleID group
## X11_Ex4 X11_Ex4 Ex4
## X17_PBS X17_PBS PBS
## X18_Ex4 X18_Ex4 Ex4
## X19_PBS X19_PBS PBS
## X3_PBS X3_PBS PBS
## X4_Ex4 X4_Ex4 Ex4
## X5_PBS X5_PBS PBS
## X6_Ex4 X6_Ex4 Ex4
# 创建 DGEList 对象,并计算归一化因子
dge_list <- DGEList(counts = counts, samples = sample_info)
dge_list$samples$norm.factors <- calcNormFactors(counts, method="TMM") # 使用 TMM 方法计算归一化因子
class(dge_list) # 查看对象的类
## [1] "DGEList"
## attr(,"package")
## [1] "edgeR"
# 检查具有零计数的基因数量
table(rowSums(dge_list$counts == 0) == 8) # 3778 个基因在所有样本中计数为零
##
## FALSE TRUE
## 21452 3778
# 过滤低表达基因
keep.exprs <- filterByExpr(dge_list, design = NULL, group = NULL, lib.size = NULL,
min.count = 10, min.total.count = 15)
# 创建非特异性过滤后的 DGEList 对象
dge_ns_filter <- dge_list[keep.exprs, keep.lib.sizes = F]
dge_ns_filter$samples$norm.factors <- calcNormFactors(dge_ns_filter$counts, method="TMM")
dim(dge_list) # 原始基因数量:25230
## [1] 25230 8
dim(dge_ns_filter) # 过滤后的基因数量:14747
## [1] 14747 8
# 计算 log-CPM 值以便进行比较
lcpm_original <- cpm(dge_list, log=TRUE, prior.count=2)
# 计算 log-CPM 的切割点
L <- mean(dge_list$samples$lib.size) * 1e-6
M <- median(dge_list$samples$lib.size) * 1e-6
lcpm.cutoff <- log2(10/M + 2/L)
library(RColorBrewer)
nsamples <- ncol(dge_list) # 样本数量
col <- brewer.pal(nsamples, "Paired") # 设置颜色
# 绘制原始数据的表达分布
plot(density(lcpm_original[,1]), col=col[1], lwd=2, ylim=c(0,0.26), las=2, main="", xlab="")
title(main="A. 原始数据", xlab="Log-cpm")
abline(v=lcpm.cutoff, lty=3) # 切割线
for (i in 2:nsamples) {
den <- density(lcpm_original[,i])
lines(den$x, den$y, col=col[i], lwd=2) # 添加其他样本的密度曲线
}
legend("topright", dge_list$samples$SampleID, text.col=col, bty="n")
# 过滤后的数据表达分布
lcpm_filter <- cpm(dge_ns_filter, log=TRUE)
plot(density(lcpm_filter[,1]), col=col[1], lwd=2, ylim=c(0,0.26), las=2, main="", xlab="")
title(main="B. 过滤后的数据", xlab="Log-cpm")
abline(v=lcpm.cutoff, lty=3)
for (i in 2:nsamples) {
den <- density(lcpm_filter[,i])
lines(den$x, den$y, col=col[i], lwd=2)
}
legend("topright", dge_ns_filter$samples$SampleID, text.col=col, bty="n")
# 盒状图展示表达分布
par(mfrow=c(1,2))
boxplot(lcpm_original, las=2, col=col, main="原始数据盒状图")
boxplot(lcpm_filter, las=2, col=col, main="过滤后数据盒状图")
dev.off()
## null device
## 1
# 设置组别的颜色
col.group <- dge_ns_filter$samples$group
library(RColorBrewer)
levels(col.group) <- brewer.pal(nlevels(col.group), "Set1")[1:2]
col.group <- as.character(col.group)
# plotMDS(lcpm_filter, labels=dge_ns_filter$samples$SampleID, col=col.group)
glimmaMDS(lcpm_filter, labels=dge_ns_filter$samples$SampleID, col=col.group, groups=dge_ns_filter$samples$group)
# 设置线性模型设计矩阵
design <- model.matrix(~ 0 + group, data=dge_ns_filter$samples)
colnames(design) <- levels(dge_ns_filter$samples$group) # 设置列名
# 应用 voom 转换并绘制均值-方差图
v <- voom(dge_ns_filter, design, plot=TRUE)
v # 查看 voom 转换后的对象
## An object of class "EList"
## $targets
## group lib.size norm.factors SampleID
## X11_Ex4 Ex4 28985310 0.9107021 X11_Ex4
## X17_PBS PBS 20164836 1.0287602 X17_PBS
## X18_Ex4 Ex4 14141039 1.0580820 X18_Ex4
## X19_PBS PBS 17768302 0.9734807 X19_PBS
## X3_PBS PBS 20774746 1.0674535 X3_PBS
## X4_Ex4 Ex4 22577548 1.0273882 X4_Ex4
## X5_PBS PBS 24099004 0.9516797 X5_PBS
## X6_Ex4 Ex4 26567449 0.9928610 X6_Ex4
##
## $E
## X11_Ex4 X17_PBS X18_Ex4 X19_PBS X3_PBS X4_Ex4
## Zc3h14 6.7052309 6.4123245 6.057001 6.41624681 6.653563 6.2865906
## Troap 0.7126055 1.0060801 1.108921 -0.02195097 1.783112 1.6427344
## 1810013L24Rik 6.5196892 6.3937252 6.628879 6.62643229 6.588665 6.3325094
## Srsf7 8.5935101 8.3270060 8.210746 8.59370493 8.052517 8.4215804
## Fignl2 -0.9030538 0.6885979 2.338055 1.18861601 1.098974 0.4339204
## X5_PBS X6_Ex4
## Zc3h14 6.7542263 6.507115
## Troap -0.2333497 1.961899
## 1810013L24Rik 6.2344547 6.220789
## Srsf7 8.3412208 8.343808
## Fignl2 0.1904580 0.868325
## 14742 more rows ...
##
## $weights
## [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## [1,] 23.217482 22.545601 20.3358678 22.100985 22.634645 22.531022 23.051353
## [2,] 2.634676 1.280504 1.4549682 1.147845 1.314249 2.155419 1.496397
## [3,] 23.269191 22.306345 20.4900599 21.811156 22.412031 22.606499 22.876849
## [4,] 23.902041 23.966672 23.9486807 23.969402 23.965214 23.951802 23.948281
## [5,] 1.805551 1.409078 0.9750576 1.261871 1.446212 1.457423 1.644921
## [,8]
## [1,] 23.000867
## [2,] 2.456207
## [3,] 23.063493
## [4,] 23.922035
## [5,] 1.678333
## 14742 more rows ...
##
## $design
## Ex4 PBS
## X11_Ex4 1 0
## X17_PBS 0 1
## X18_Ex4 1 0
## X19_PBS 0 1
## X3_PBS 0 1
## X4_Ex4 1 0
## X5_PBS 0 1
## X6_Ex4 1 0
## attr(,"assign")
## [1] 1 1
## attr(,"contrasts")
## attr(,"contrasts")$group
## [1] "contr.treatment"
v$targets等同于x$samples,而v$E中所储存的表达值类似于进行了log-CPM转换后的counts
# 拟合线性模型
efit <- lmFit(v, design)
efit <- contrasts.fit(efit, contrasts=makeContrasts(Ex4 - PBS, levels = design))
efit <- eBayes(efit) # 应用经验贝叶斯方法
plotSA(efit)
summary(decideTests(efit)) # 查看差异基因数量
## Ex4 - PBS
## Down 0
## NotSig 14746
## Up 1
说明:图中展示了每个基因的均值(x轴)与方差(y轴)的关系,左侧图表示在使用 voom 方法之前,这些基因的均值和方差之间的相关性,而右侧图则显示了应用 voom 权重后,这种趋势是如何被消除的。
左侧图使用 voom 函数绘制,该函数为经过 log-CPM 转换的数据拟合线性模型并提取残差方差。然后,对残差方差取四次方根(或者对标准差取平方根),并相对于每个基因的平均表达进行绘图。均值是通过将平均计数加上 2 后进行 log2 转换计算得到的。
右侧图则使用 plotSA 函数绘制了 log2 残差标准差与 log-CPM 均值的关系。在这两幅图中,每个黑点代表一个基因。左侧图中的红色曲线展示了用于计算 voom 权重的均值-方差趋势的估计结果,而右侧图中,由经验贝叶斯算法得到的平均 log2 残差标准差通过水平蓝线标出。
# 决定哪些基因在实验组和对照组之间差异表达
dt <- decideTests(efit, adjust.method="none", p.value=0.01, lfc=1)
summary(dt)
## Ex4 - PBS
## Down 24
## NotSig 14671
## Up 52
# 提取差异表达的基因
de <- which(dt[,1]!=0)
de <- rownames(v$E)[de]
# 获取所有差异基因的结果
re <- topTable(efit, number = Inf)
result <- re[rownames(re) %in% de,]
datatable(result)
# 使用 Glimma 包绘制交互式的差异表达图
# coef=1 指定要比较的组,status 用于指示差异表达的状态
# counts 是 log2 CPM 转换后的计数数据,groups 指定样本组信息
glimmaMD(efit, coef=1, status=dt[,1], main=colnames(efit)[1],
counts=lcpm_filter, groups= dge_ns_filter$samples$group)
# 使用 ggplot2 绘制火山图
library(ggplot2)
# 创建火山图,x 轴为 Log2 Fold Change,y 轴为 -Log10 P-value
ggplot(re, aes(x=logFC, y=-log10(P.Value))) +
geom_point(aes(color=abs(logFC) > 1 & re$P.Value < 0.01), alpha=0.5) +
scale_color_manual(values=c("grey", "red")) +
labs(title="差异表达火山图",
x="Log2 Fold Change",
y="-Log10 P-value") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5)) +
geom_vline(xintercept = c(-1, 1), linetype="dashed", color="blue") +
geom_hline(yintercept = -log10(0.01), linetype="dashed", color="blue") +
guides(color=guide_legend(title="显著差异表达基因"))
# 使用 gplots 包绘制热图
library(gplots)
# 设置颜色面板,从蓝色到红色
mycol <- colorpanel(1000,"blue","white","red")
# 绘制热图,行按基因进行缩放,列按样本分组显示
heatmap.2(lcpm_filter[de,], scale="row",
labRow=de, labCol=dge_ns_filter$samples$group,
col=mycol, trace="none", main="基因表达热图")
save.image("env.RData")
# 使用 goana 进行 GO 注释
head(keys(Mus.musculus, keytype = "SYMBOL"), 10)
## [1] "Pzp" "Aanat" "Aatk" "Abca1" "Abca4" "Abca2" "Abcb7" "Abcg1" "Abi1"
## [10] "Abl1"
de_entrez_ids <- mapIds(Mus.musculus,
keys = rownames(result),
column = "ENTREZID",
keytype = "SYMBOL",
multiVals = "first")
de_entrez_ids <- na.omit(de_entrez_ids)
go_annotation <- goana(as.vector(de_entrez_ids), species="Mm")
datatable(topGO(go_annotation, ontology = "BP", number = 10))