Wong CK, Yusta B, Koehler JA, Baggio LL et al. Divergent roles for the gut intraepithelial lymphocyte GLP-1R in control of metabolism, microbiota, and T cell-induced inflammation. Cell Metab 2022 Oct 4;34(10):1514-1531.e7. PMID: 36027914

数据准备

# IF 31.373
# 本研究利用小鼠肠道内皮淋巴细胞（IELs）探讨了GLP-1受体激动剂exendin-4对免疫细胞功能的潜在影响。
library(limma)
library(Glimma)
library(edgeR)
library(Mus.musculus)
library(DT)

# 读取基因表达计数数据
counts <- read.table("GSE202453_count_table.csv.gz", sep=",", row.names = 1, header=T)
head(counts) # 查看计数数据的前几行

##               X11_Ex4 X17_PBS X18_Ex4 X19_PBS X3_PBS X4_Ex4 X5_PBS X6_Ex4
## Zc3h14           3024    1717     941    1517   2091   1762   2601   2416
## Troap              47      40      30      17     71     70     20    103
## Clca2               0       4       0       0      0      3      6      8
## 1810013L24Rik    2659    1695    1399    1755   1999   1819   1814   1981
## Srsf7           11196    6475    4189    6864   5515   7741   7815   8631
## Fignl2             15      32      71      40     44     30     27     48

# 读取样本信息
sample_info <- read.table("GSE202453_series_matrix.txt", sep="\t", header=F, skip = 25, fill=T)
sn <- sample_info[sample_info$V1=="!Sample_description",] # 样本描述信息
sp <- sample_info[sample_info$V1=="!Sample_characteristics_ch1",][2,] # 样本特征信息

# 构建样本信息数据框
sample_info <- data.frame(SampleID=paste0("X", unlist(sn)[-1]), group=sub("treatment: ","", unlist(sp)[-1]))
rownames(sample_info) <- sample_info$SampleID

# 匹配计数数据和样本信息
sample_info <- sample_info[match(colnames(counts), rownames(sample_info)),]
sample_info$group <- as.factor(sub("Anti-CD3 + ", "", sample_info$group, fixed = T)) # 转换为因子
sample_info # 查看样本信息

##         SampleID group
## X11_Ex4  X11_Ex4   Ex4
## X17_PBS  X17_PBS   PBS
## X18_Ex4  X18_Ex4   Ex4
## X19_PBS  X19_PBS   PBS
## X3_PBS    X3_PBS   PBS
## X4_Ex4    X4_Ex4   Ex4
## X5_PBS    X5_PBS   PBS
## X6_Ex4    X6_Ex4   Ex4

低表达基因非特异性过滤

# 创建 DGEList 对象，并计算归一化因子
dge_list <- DGEList(counts = counts, samples = sample_info)
dge_list$samples$norm.factors <- calcNormFactors(counts, method="TMM") # 使用 TMM 方法计算归一化因子
class(dge_list) # 查看对象的类

## [1] "DGEList"
## attr(,"package")
## [1] "edgeR"

# 检查具有零计数的基因数量
table(rowSums(dge_list$counts == 0) == 8) # 3778 个基因在所有样本中计数为零

## 
## FALSE  TRUE 
## 21452  3778

# 过滤低表达基因
keep.exprs <- filterByExpr(dge_list, design = NULL, group = NULL, lib.size = NULL,
                  min.count = 10, min.total.count = 15)

# 创建非特异性过滤后的 DGEList 对象
dge_ns_filter <- dge_list[keep.exprs, keep.lib.sizes = F] 
dge_ns_filter$samples$norm.factors <- calcNormFactors(dge_ns_filter$counts, method="TMM")
dim(dge_list) # 原始基因数量：25230

## [1] 25230     8

dim(dge_ns_filter) # 过滤后的基因数量：14747

## [1] 14747     8

比较基因非特异性过滤前后的样本表达分布

# 计算 log-CPM 值以便进行比较
lcpm_original <- cpm(dge_list, log=TRUE, prior.count=2)

# 计算 log-CPM 的切割点
L <- mean(dge_list$samples$lib.size) * 1e-6
M <- median(dge_list$samples$lib.size) * 1e-6
lcpm.cutoff <- log2(10/M + 2/L)

library(RColorBrewer)
nsamples <- ncol(dge_list) # 样本数量
col <- brewer.pal(nsamples, "Paired") # 设置颜色

# 绘制原始数据的表达分布
plot(density(lcpm_original[,1]), col=col[1], lwd=2, ylim=c(0,0.26), las=2, main="", xlab="")
title(main="A. 原始数据", xlab="Log-cpm")
abline(v=lcpm.cutoff, lty=3) # 切割线
for (i in 2:nsamples) {
  den <- density(lcpm_original[,i])
  lines(den$x, den$y, col=col[i], lwd=2) # 添加其他样本的密度曲线
}
legend("topright", dge_list$samples$SampleID, text.col=col, bty="n")

# 过滤后的数据表达分布
lcpm_filter <- cpm(dge_ns_filter, log=TRUE)
plot(density(lcpm_filter[,1]), col=col[1], lwd=2, ylim=c(0,0.26), las=2, main="", xlab="")
title(main="B. 过滤后的数据", xlab="Log-cpm")
abline(v=lcpm.cutoff, lty=3)
for (i in 2:nsamples) {
  den <- density(lcpm_filter[,i])
  lines(den$x, den$y, col=col[i], lwd=2)
}
legend("topright", dge_ns_filter$samples$SampleID, text.col=col, bty="n")

# 盒状图展示表达分布
par(mfrow=c(1,2))
boxplot(lcpm_original, las=2, col=col, main="原始数据盒状图")
boxplot(lcpm_filter, las=2, col=col, main="过滤后数据盒状图")

dev.off()

## null device 
##           1

交互式的数据降维可视化

# 设置组别的颜色
col.group <- dge_ns_filter$samples$group
library(RColorBrewer)
levels(col.group) <-  brewer.pal(nlevels(col.group), "Set1")[1:2]
col.group <- as.character(col.group)
# plotMDS(lcpm_filter, labels=dge_ns_filter$samples$SampleID, col=col.group)
glimmaMDS(lcpm_filter, labels=dge_ns_filter$samples$SampleID, col=col.group, groups=dge_ns_filter$samples$group)

差异表达分析

# 设置线性模型设计矩阵
design <- model.matrix(~ 0 + group, data=dge_ns_filter$samples)
colnames(design) <- levels(dge_ns_filter$samples$group) # 设置列名

# 应用 voom 转换并绘制均值-方差图
v <- voom(dge_ns_filter, design, plot=TRUE)

v # 查看 voom 转换后的对象

## An object of class "EList"
## $targets
##         group lib.size norm.factors SampleID
## X11_Ex4   Ex4 28985310    0.9107021  X11_Ex4
## X17_PBS   PBS 20164836    1.0287602  X17_PBS
## X18_Ex4   Ex4 14141039    1.0580820  X18_Ex4
## X19_PBS   PBS 17768302    0.9734807  X19_PBS
## X3_PBS    PBS 20774746    1.0674535   X3_PBS
## X4_Ex4    Ex4 22577548    1.0273882   X4_Ex4
## X5_PBS    PBS 24099004    0.9516797   X5_PBS
## X6_Ex4    Ex4 26567449    0.9928610   X6_Ex4
## 
## $E
##                  X11_Ex4   X17_PBS  X18_Ex4     X19_PBS   X3_PBS    X4_Ex4
## Zc3h14         6.7052309 6.4123245 6.057001  6.41624681 6.653563 6.2865906
## Troap          0.7126055 1.0060801 1.108921 -0.02195097 1.783112 1.6427344
## 1810013L24Rik  6.5196892 6.3937252 6.628879  6.62643229 6.588665 6.3325094
## Srsf7          8.5935101 8.3270060 8.210746  8.59370493 8.052517 8.4215804
## Fignl2        -0.9030538 0.6885979 2.338055  1.18861601 1.098974 0.4339204
##                   X5_PBS   X6_Ex4
## Zc3h14         6.7542263 6.507115
## Troap         -0.2333497 1.961899
## 1810013L24Rik  6.2344547 6.220789
## Srsf7          8.3412208 8.343808
## Fignl2         0.1904580 0.868325
## 14742 more rows ...
## 
## $weights
##           [,1]      [,2]       [,3]      [,4]      [,5]      [,6]      [,7]
## [1,] 23.217482 22.545601 20.3358678 22.100985 22.634645 22.531022 23.051353
## [2,]  2.634676  1.280504  1.4549682  1.147845  1.314249  2.155419  1.496397
## [3,] 23.269191 22.306345 20.4900599 21.811156 22.412031 22.606499 22.876849
## [4,] 23.902041 23.966672 23.9486807 23.969402 23.965214 23.951802 23.948281
## [5,]  1.805551  1.409078  0.9750576  1.261871  1.446212  1.457423  1.644921
##           [,8]
## [1,] 23.000867
## [2,]  2.456207
## [3,] 23.063493
## [4,] 23.922035
## [5,]  1.678333
## 14742 more rows ...
## 
## $design
##         Ex4 PBS
## X11_Ex4   1   0
## X17_PBS   0   1
## X18_Ex4   1   0
## X19_PBS   0   1
## X3_PBS    0   1
## X4_Ex4    1   0
## X5_PBS    0   1
## X6_Ex4    1   0
## attr(,"assign")
## [1] 1 1
## attr(,"contrasts")
## attr(,"contrasts")$group
## [1] "contr.treatment"

v$targets等同于x$samples，而v$E中所储存的表达值类似于进行了log-CPM转换后的counts

# 拟合线性模型
efit <- lmFit(v, design)
efit <- contrasts.fit(efit, contrasts=makeContrasts(Ex4 - PBS, levels = design))
efit <- eBayes(efit)　# 应用经验贝叶斯方法

plotSA(efit)

summary(decideTests(efit)) # 查看差异基因数量

##        Ex4 - PBS
## Down           0
## NotSig     14746
## Up             1

说明：图中展示了每个基因的均值（x轴）与方差（y轴）的关系，左侧图表示在使用 voom 方法之前，这些基因的均值和方差之间的相关性，而右侧图则显示了应用 voom 权重后，这种趋势是如何被消除的。

左侧图使用 voom 函数绘制，该函数为经过 log-CPM 转换的数据拟合线性模型并提取残差方差。然后，对残差方差取四次方根（或者对标准差取平方根），并相对于每个基因的平均表达进行绘图。均值是通过将平均计数加上 2 后进行 log2 转换计算得到的。

右侧图则使用 plotSA 函数绘制了 log2 残差标准差与 log-CPM 均值的关系。在这两幅图中，每个黑点代表一个基因。左侧图中的红色曲线展示了用于计算 voom 权重的均值-方差趋势的估计结果，而右侧图中，由经验贝叶斯算法得到的平均 log2 残差标准差通过水平蓝线标出。

# 决定哪些基因在实验组和对照组之间差异表达
dt <- decideTests(efit, adjust.method="none", p.value=0.01, lfc=1)
summary(dt)

##        Ex4 - PBS
## Down          24
## NotSig     14671
## Up            52

# 提取差异表达的基因
de <- which(dt[,1]!=0)
de <- rownames(v$E)[de]

# 获取所有差异基因的结果
re <- topTable(efit, number = Inf)
result <- re[rownames(re) %in% de,]
datatable(result)

差异表达可以视化

# 使用 Glimma 包绘制交互式的差异表达图
# coef=1 指定要比较的组，status 用于指示差异表达的状态
# counts 是 log2 CPM 转换后的计数数据，groups 指定样本组信息
glimmaMD(efit, coef=1, status=dt[,1], main=colnames(efit)[1], 
          counts=lcpm_filter, groups= dge_ns_filter$samples$group)

火山图

# 使用 ggplot2 绘制火山图
library(ggplot2)

# 创建火山图，x 轴为 Log2 Fold Change，y 轴为 -Log10 P-value
ggplot(re, aes(x=logFC, y=-log10(P.Value))) +
    geom_point(aes(color=abs(logFC) > 1 & re$P.Value < 0.01), alpha=0.5) +
    scale_color_manual(values=c("grey", "red")) +
    labs(title="差异表达火山图",
         x="Log2 Fold Change",
         y="-Log10 P-value") +
    theme_minimal() +
    theme(plot.title = element_text(hjust = 0.5)) +
    geom_vline(xintercept = c(-1, 1), linetype="dashed", color="blue") +
    geom_hline(yintercept = -log10(0.01), linetype="dashed", color="blue") +
    guides(color=guide_legend(title="显著差异表达基因"))

热图

# 使用 gplots 包绘制热图
library(gplots)

# 设置颜色面板，从蓝色到红色
mycol <- colorpanel(1000,"blue","white","red")

# 绘制热图，行按基因进行缩放，列按样本分组显示
heatmap.2(lcpm_filter[de,], scale="row",
   labRow=de, labCol=dge_ns_filter$samples$group, 
   col=mycol, trace="none", main="基因表达热图")

save.image("env.RData")

GO annotation

# 使用 goana 进行 GO 注释
head(keys(Mus.musculus, keytype = "SYMBOL"), 10)

##  [1] "Pzp"   "Aanat" "Aatk"  "Abca1" "Abca4" "Abca2" "Abcb7" "Abcg1" "Abi1" 
## [10] "Abl1"

de_entrez_ids <- mapIds(Mus.musculus,
                     keys = rownames(result),
                     column = "ENTREZID",
                     keytype = "SYMBOL",
                     multiVals = "first")
de_entrez_ids <- na.omit(de_entrez_ids)
go_annotation <- goana(as.vector(de_entrez_ids), species="Mm")
datatable(topGO(go_annotation, ontology = "BP", number = 10))

Differential gene analysis using the Limma package

wangzg, wangzg@pumc.edu.cn

2024-11-05