选择BCR/ABL和ALL1/AF4的两类ALL
suppressMessages(library(ALL))
data(ALL)
datatable(pData(ALL))
types <- c("ALL1/AF4", "BCR/ABL")
# 包括B-cell和T-cell tumor我们仅分析B-cell部分
bcell <- grep("^B",as.character(ALL$BT))
ALL_af4bcr <- ALL[,intersect(bcell,which(ALL$mol.biol %in% types))]
ALL_af4bcr$mol.biol <- factor(ALL_af4bcr$mol.biol)
ALL_af4bcr
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 12625 features, 47 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: 01005 03002 ... 84004 (47 total)
## varLabels: cod diagnosis ... date last seen (21 total)
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## pubMedIds: 14684422 16243790
## Annotation: hgu95av2
table(ALL_af4bcr$mol.biol)
##
## ALL1/AF4 BCR/ABL
## 10 37
染色体定位过表示分析
#非特异性过滤
qrange <- function(x) diff(quantile(x,c(0.1,0.9)))
library(genefilter)
filt_af4bcr <- nsFilter(ALL_af4bcr,require.entrez=TRUE,require.GOBP=TRUE,var.func=qrange,var.cutoff=0.5)
ALLfilt_af4bcr <- filt_af4bcr$eset
# t检验
rt <- rowttests(ALLfilt_af4bcr,"mol.biol")
names(rt)
## [1] "statistic" "dm" "p.value"
datatable(head(rt,30))
# 选择p value最小的400个且p-value<0.05的基因
sel <- intersect(order(rt$p.value)[1:400], which(rt$p.value < 0.01))
ALLsub <- ALLfilt_af4bcr[sel,]
library(hgu95av2.db)
EGsub <- as.character(hgu95av2ENTREZID[featureNames(ALLsub)]) # 选择出的基因的Entrez Gene ID
table(table(EGsub)) # 映射到Entrez gene ID的Probe数目是唯一的
##
## 1
## 400
# Plot CD44 gene expression
syms <- as.character(hgu95av2SYMBOL[featureNames(ALLsub)])
whFeat <- names(which(syms=="CD44"))
ordSample <- order(ALLsub$mol.biol)
CD44 <- ALLsub[whFeat, ordSample]
plot(as.vector(exprs(CD44)),main=whFeat,col=c("sienna","tomato")[CD44$mol.biol], pch=c(15,16)[CD44$mol.biol], ylab="expression")
legend("topright", legend = levels(CD44$mol.biol), pch=c(15,16),col=c("sienna","tomato"))

# 映射到各染色体基因的数目
z <- toTable(hgu95av2CHR[featureNames(ALLsub)])
chrtab <- table(z$chromosome)
chrtab
##
## 1 10 11 12 13 14 15 16 17 18 19 2 20 21 22 3 4 5 6 7 8 9 X Y
## 42 23 22 18 9 18 6 14 16 5 12 24 11 6 12 21 14 13 45 22 13 21 13 1
barplot(chrtab[order(names(chrtab))])

ps_chr <- toTable(hgu95av2CHR)
ps_eg <- toTable(hgu95av2ENTREZID)
chr <- merge(ps_chr,ps_eg)
head(chr)
## probe_id chromosome gene_id
## 1 100_g_at 14 5875
## 2 1000_at 16 5595
## 3 1001_at 1 7075
## 4 1002_f_at 10 1557
## 5 1003_s_at 11 643
## 6 1004_at 11 643
table(table(chr$gene_id))
##
## 1 2 3 4 5 6 7 8
## 6716 1450 451 106 27 16 6 4
chr <- chr[!duplicated(chr$gene_id),]
isdiff <- chr$gene_id %in% EGsub
tab <- table(isdiff,chr$chromosome)
tab
##
## isdiff 1 10 11 12 13 14 15 16 17 18 19 2 20 21 22 3 4 5
## FALSE 878 304 495 474 150 268 241 353 503 124 529 538 219 82 234 455 315 379
## TRUE 42 23 22 18 9 18 6 14 16 5 12 24 11 6 12 21 14 13
##
## isdiff 6 7 8 9 X Y
## FALSE 474 387 295 305 357 17
## TRUE 45 22 13 21 13 0
fisher.test(tab,simulate.p.value=TRUE)
##
## Fisher's Exact Test for Count Data with simulated p-value (based on
## 2000 replicates)
##
## data: tab
## p-value = 0.003498
## alternative hypothesis: two.sided
library(gtools)
# 使用一个循环来逐个染色体进行 Fisher 检验
for (chrom in mixedsort(unique(chr$chromosome))) {
# 针对每个染色体提取数据
chrom_data <- tab[,chrom] # 对当前染色体提取isdiff的频数表
# 检查数据是否符合 Fisher 检验的条件
if (min(chrom_data) > 0) { # 如果某些类别频数为零,Fisher检验可能会失败
chrom_matrix <- matrix(0, nrow = 2, ncol = 2, dimnames = list(c("TRUE", "FALSE"), c(chrom, paste0("Not",chrom))))
chrom_matrix[, chrom] <- rev(chrom_data)
chrom_matrix[, 2] <- rev(rowSums(tab[, colnames(tab) != chrom]))
result <- fisher.test(chrom_matrix, simulate.p.value = TRUE, alternative = "greater")
print(paste("Chromosome", chrom, "Fisher p-value:", round(result$p.value,2), ifelse(result$p.value < 0.05, "Significant *", "")))
print(chrom_matrix)
} else {
print(paste("Chromosome", chrom, "has zero count for one or more categories. Skipping Fisher test."))
}
}
## [1] "Chromosome 1 Fisher p-value: 0.52 "
## 1 Not1
## TRUE 42 358
## FALSE 878 7498
## [1] "Chromosome 2 Fisher p-value: 0.66 "
## 2 Not2
## TRUE 24 376
## FALSE 538 7838
## [1] "Chromosome 3 Fisher p-value: 0.6 "
## 3 Not3
## TRUE 21 379
## FALSE 455 7921
## [1] "Chromosome 4 Fisher p-value: 0.64 "
## 4 Not4
## TRUE 14 386
## FALSE 315 8061
## [1] "Chromosome 5 Fisher p-value: 0.91 "
## 5 Not5
## TRUE 13 387
## FALSE 379 7997
## [1] "Chromosome 6 Fisher p-value: 0 Significant *"
## 6 Not6
## TRUE 45 355
## FALSE 474 7902
## [1] "Chromosome 7 Fisher p-value: 0.24 "
## 7 Not7
## TRUE 22 378
## FALSE 387 7989
## [1] "Chromosome 8 Fisher p-value: 0.65 "
## 8 Not8
## TRUE 13 387
## FALSE 295 8081
## [1] "Chromosome 9 Fisher p-value: 0.07 "
## 9 Not9
## TRUE 21 379
## FALSE 305 8071
## [1] "Chromosome 10 Fisher p-value: 0.03 Significant *"
## 10 Not10
## TRUE 23 377
## FALSE 304 8072
## [1] "Chromosome 11 Fisher p-value: 0.66 "
## 11 Not11
## TRUE 22 378
## FALSE 495 7881
## [1] "Chromosome 12 Fisher p-value: 0.87 "
## 12 Not12
## TRUE 18 382
## FALSE 474 7902
## [1] "Chromosome 13 Fisher p-value: 0.3 "
## 13 Not13
## TRUE 9 391
## FALSE 150 8226
## [1] "Chromosome 14 Fisher p-value: 0.1 "
## 14 Not14
## TRUE 18 382
## FALSE 268 8108
## [1] "Chromosome 15 Fisher p-value: 0.97 "
## 15 Not15
## TRUE 6 394
## FALSE 241 8135
## [1] "Chromosome 16 Fisher p-value: 0.79 "
## 16 Not16
## TRUE 14 386
## FALSE 353 8023
## [1] "Chromosome 17 Fisher p-value: 0.97 "
## 17 Not17
## TRUE 16 384
## FALSE 503 7873
## [1] "Chromosome 18 Fisher p-value: 0.71 "
## 18 Not18
## TRUE 5 395
## FALSE 124 8252
## [1] "Chromosome 19 Fisher p-value: 1 "
## 19 Not19
## TRUE 12 388
## FALSE 529 7847
## [1] "Chromosome 20 Fisher p-value: 0.48 "
## 20 Not20
## TRUE 11 389
## FALSE 219 8157
## [1] "Chromosome 21 Fisher p-value: 0.21 "
## 21 Not21
## TRUE 6 394
## FALSE 82 8294
## [1] "Chromosome 22 Fisher p-value: 0.45 "
## 22 Not22
## TRUE 12 388
## FALSE 234 8142
## [1] "Chromosome X Fisher p-value: 0.87 "
## X NotX
## TRUE 13 387
## FALSE 357 8019
## [1] "Chromosome Y has zero count for one or more categories. Skipping Fisher test."
使用超几何检验GO富集分析
data(ALL)
bcell <- grep("^B",as.character(ALL$BT))
types <- c("NEG","BCR/ABL")
moltyp <- which(as.character(ALL$mol.biol) %in% types)
ALL_bcrneg <- ALL[,intersect(bcell,moltyp)]
ALL_bcrneg$mol.biol <- factor(ALL_bcrneg$mol.biol)
varCut <- 0.5
filt_bcrneg <- nsFilter(ALL_bcrneg,require.entrez = T, require.GOBP = T, remove.dupEntrez = T,var.func=IQR, var.cutoff = varCut, feature.exclude = "^AFFX")
ALLfilt_bcrneg <- filt_bcrneg$eset
ttestCutoff <- 0.05
ttests <- rowttests(ALLfilt_bcrneg,"mol.biol")
smPV <- ttests$p.value<ttestCutoff
pvalueFiltered <- ALLfilt_bcrneg[smPV,]
selectedEntrezIds <- unlist(mget(featureNames(pvalueFiltered), hgu95av2ENTREZID))
# 一般推荐使用Non-specific filter之后的基因作为Unverse
entrezUniverse <- unique(unlist(mget(featureNames(ALLfilt_bcrneg), hgu95av2ENTREZID)))
library(Category)
library(GOstats)
library(org.Hs.eg.db)
library(GO.db)
params <- new("GOHyperGParams",geneIds=as.character(selectedEntrezIds),universeGeneIds=as.character(entrezUniverse),annotation="hgu95av2.db",ontology="BP",pvalueCutoff=0.0001,testDirection="over")
hgOver <- hyperGTest(params)
hgOver
## Gene to GO BP test for over-representation
## 7175 GO BP ids tested (46 have p < 1e-04)
## Selected gene set size: 718
## Gene universe size: 4163
## Annotation package: hgu95av2
datatable(summary(hgOver))