选择BCR/ABL和ALL1/AF4的两类ALL

suppressMessages(library(ALL))
data(ALL)
datatable(pData(ALL))

types <- c("ALL1/AF4", "BCR/ABL")

# 包括B-cell和T-cell tumor我们仅分析B-cell部分
bcell <- grep("^B",as.character(ALL$BT))
ALL_af4bcr <- ALL[,intersect(bcell,which(ALL$mol.biol %in% types))]
ALL_af4bcr$mol.biol <- factor(ALL_af4bcr$mol.biol)
ALL_af4bcr

## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 12625 features, 47 samples 
##   element names: exprs 
## protocolData: none
## phenoData
##   sampleNames: 01005 03002 ... 84004 (47 total)
##   varLabels: cod diagnosis ... date last seen (21 total)
##   varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
##   pubMedIds: 14684422 16243790 
## Annotation: hgu95av2

table(ALL_af4bcr$mol.biol)

## 
## ALL1/AF4  BCR/ABL 
##       10       37

染色体定位过表示分析

#非特异性过滤
qrange <- function(x) diff(quantile(x,c(0.1,0.9)))
library(genefilter)
filt_af4bcr <- nsFilter(ALL_af4bcr,require.entrez=TRUE,require.GOBP=TRUE,var.func=qrange,var.cutoff=0.5)
ALLfilt_af4bcr <- filt_af4bcr$eset

# t检验
rt <- rowttests(ALLfilt_af4bcr,"mol.biol")
names(rt)

## [1] "statistic" "dm"        "p.value"

datatable(head(rt,30))

# 选择p value最小的400个且p-value<0.05的基因
sel <- intersect(order(rt$p.value)[1:400], which(rt$p.value < 0.01))
ALLsub <- ALLfilt_af4bcr[sel,]

library(hgu95av2.db)
EGsub <- as.character(hgu95av2ENTREZID[featureNames(ALLsub)]) # 选择出的基因的Entrez Gene ID
table(table(EGsub)) # 映射到Entrez gene ID的Probe数目是唯一的

## 
##   1 
## 400

# Plot CD44 gene expression
syms <- as.character(hgu95av2SYMBOL[featureNames(ALLsub)])
whFeat <- names(which(syms=="CD44"))
ordSample <- order(ALLsub$mol.biol)
CD44 <- ALLsub[whFeat, ordSample]
plot(as.vector(exprs(CD44)),main=whFeat,col=c("sienna","tomato")[CD44$mol.biol], pch=c(15,16)[CD44$mol.biol], ylab="expression")
legend("topright", legend = levels(CD44$mol.biol), pch=c(15,16),col=c("sienna","tomato"))

# 映射到各染色体基因的数目
z <- toTable(hgu95av2CHR[featureNames(ALLsub)])
chrtab <- table(z$chromosome)
chrtab

## 
##  1 10 11 12 13 14 15 16 17 18 19  2 20 21 22  3  4  5  6  7  8  9  X  Y 
## 42 23 22 18  9 18  6 14 16  5 12 24 11  6 12 21 14 13 45 22 13 21 13  1

barplot(chrtab[order(names(chrtab))])

ps_chr <- toTable(hgu95av2CHR)
ps_eg <- toTable(hgu95av2ENTREZID)
chr <- merge(ps_chr,ps_eg)
head(chr)

##    probe_id chromosome gene_id
## 1  100_g_at         14    5875
## 2   1000_at         16    5595
## 3   1001_at          1    7075
## 4 1002_f_at         10    1557
## 5 1003_s_at         11     643
## 6   1004_at         11     643

table(table(chr$gene_id))

## 
##    1    2    3    4    5    6    7    8 
## 6716 1450  451  106   27   16    6    4

chr <- chr[!duplicated(chr$gene_id),]

isdiff <- chr$gene_id %in% EGsub
tab <- table(isdiff,chr$chromosome)
tab

##        
## isdiff    1  10  11  12  13  14  15  16  17  18  19   2  20  21  22   3   4   5
##   FALSE 878 304 495 474 150 268 241 353 503 124 529 538 219  82 234 455 315 379
##   TRUE   42  23  22  18   9  18   6  14  16   5  12  24  11   6  12  21  14  13
##        
## isdiff    6   7   8   9   X   Y
##   FALSE 474 387 295 305 357  17
##   TRUE   45  22  13  21  13   0

fisher.test(tab,simulate.p.value=TRUE)

## 
##  Fisher's Exact Test for Count Data with simulated p-value (based on
##  2000 replicates)
## 
## data:  tab
## p-value = 0.003498
## alternative hypothesis: two.sided

library(gtools)
# 使用一个循环来逐个染色体进行 Fisher 检验
for (chrom in mixedsort(unique(chr$chromosome))) {
  # 针对每个染色体提取数据
  chrom_data <- tab[,chrom] # 对当前染色体提取isdiff的频数表
  
  # 检查数据是否符合 Fisher 检验的条件
  if (min(chrom_data) > 0) {  # 如果某些类别频数为零，Fisher检验可能会失败
    chrom_matrix <- matrix(0, nrow = 2, ncol = 2, dimnames = list(c("TRUE", "FALSE"), c(chrom, paste0("Not",chrom))))
    chrom_matrix[, chrom] <- rev(chrom_data)
    chrom_matrix[, 2] <- rev(rowSums(tab[, colnames(tab) != chrom]))

    result <- fisher.test(chrom_matrix, simulate.p.value = TRUE, alternative = "greater")
    
    
    print(paste("Chromosome", chrom, "Fisher p-value:", round(result$p.value,2), ifelse(result$p.value < 0.05, "Significant *", "")))
    print(chrom_matrix) 
  } else {
    print(paste("Chromosome", chrom, "has zero count for one or more categories. Skipping Fisher test."))
  }
}

## [1] "Chromosome 1 Fisher p-value: 0.52 "
##         1 Not1
## TRUE   42  358
## FALSE 878 7498
## [1] "Chromosome 2 Fisher p-value: 0.66 "
##         2 Not2
## TRUE   24  376
## FALSE 538 7838
## [1] "Chromosome 3 Fisher p-value: 0.6 "
##         3 Not3
## TRUE   21  379
## FALSE 455 7921
## [1] "Chromosome 4 Fisher p-value: 0.64 "
##         4 Not4
## TRUE   14  386
## FALSE 315 8061
## [1] "Chromosome 5 Fisher p-value: 0.91 "
##         5 Not5
## TRUE   13  387
## FALSE 379 7997
## [1] "Chromosome 6 Fisher p-value: 0 Significant *"
##         6 Not6
## TRUE   45  355
## FALSE 474 7902
## [1] "Chromosome 7 Fisher p-value: 0.24 "
##         7 Not7
## TRUE   22  378
## FALSE 387 7989
## [1] "Chromosome 8 Fisher p-value: 0.65 "
##         8 Not8
## TRUE   13  387
## FALSE 295 8081
## [1] "Chromosome 9 Fisher p-value: 0.07 "
##         9 Not9
## TRUE   21  379
## FALSE 305 8071
## [1] "Chromosome 10 Fisher p-value: 0.03 Significant *"
##        10 Not10
## TRUE   23   377
## FALSE 304  8072
## [1] "Chromosome 11 Fisher p-value: 0.66 "
##        11 Not11
## TRUE   22   378
## FALSE 495  7881
## [1] "Chromosome 12 Fisher p-value: 0.87 "
##        12 Not12
## TRUE   18   382
## FALSE 474  7902
## [1] "Chromosome 13 Fisher p-value: 0.3 "
##        13 Not13
## TRUE    9   391
## FALSE 150  8226
## [1] "Chromosome 14 Fisher p-value: 0.1 "
##        14 Not14
## TRUE   18   382
## FALSE 268  8108
## [1] "Chromosome 15 Fisher p-value: 0.97 "
##        15 Not15
## TRUE    6   394
## FALSE 241  8135
## [1] "Chromosome 16 Fisher p-value: 0.79 "
##        16 Not16
## TRUE   14   386
## FALSE 353  8023
## [1] "Chromosome 17 Fisher p-value: 0.97 "
##        17 Not17
## TRUE   16   384
## FALSE 503  7873
## [1] "Chromosome 18 Fisher p-value: 0.71 "
##        18 Not18
## TRUE    5   395
## FALSE 124  8252
## [1] "Chromosome 19 Fisher p-value: 1 "
##        19 Not19
## TRUE   12   388
## FALSE 529  7847
## [1] "Chromosome 20 Fisher p-value: 0.48 "
##        20 Not20
## TRUE   11   389
## FALSE 219  8157
## [1] "Chromosome 21 Fisher p-value: 0.21 "
##       21 Not21
## TRUE   6   394
## FALSE 82  8294
## [1] "Chromosome 22 Fisher p-value: 0.45 "
##        22 Not22
## TRUE   12   388
## FALSE 234  8142
## [1] "Chromosome X Fisher p-value: 0.87 "
##         X NotX
## TRUE   13  387
## FALSE 357 8019
## [1] "Chromosome Y has zero count for one or more categories. Skipping Fisher test."

使用超几何检验GO富集分析

data(ALL)
bcell <- grep("^B",as.character(ALL$BT))
types <- c("NEG","BCR/ABL")
moltyp <- which(as.character(ALL$mol.biol) %in% types)
ALL_bcrneg <- ALL[,intersect(bcell,moltyp)]
ALL_bcrneg$mol.biol <- factor(ALL_bcrneg$mol.biol)
varCut <- 0.5
filt_bcrneg <- nsFilter(ALL_bcrneg,require.entrez = T, require.GOBP = T, remove.dupEntrez = T,var.func=IQR, var.cutoff = varCut, feature.exclude = "^AFFX")
ALLfilt_bcrneg <- filt_bcrneg$eset

ttestCutoff <- 0.05
ttests <- rowttests(ALLfilt_bcrneg,"mol.biol")
smPV <- ttests$p.value<ttestCutoff
pvalueFiltered <- ALLfilt_bcrneg[smPV,]
selectedEntrezIds <- unlist(mget(featureNames(pvalueFiltered), hgu95av2ENTREZID))
# 一般推荐使用Non-specific filter之后的基因作为Unverse
entrezUniverse <- unique(unlist(mget(featureNames(ALLfilt_bcrneg), hgu95av2ENTREZID)))

library(Category)
library(GOstats)
library(org.Hs.eg.db)
library(GO.db)

params <- new("GOHyperGParams",geneIds=as.character(selectedEntrezIds),universeGeneIds=as.character(entrezUniverse),annotation="hgu95av2.db",ontology="BP",pvalueCutoff=0.0001,testDirection="over")
hgOver <- hyperGTest(params)
hgOver

## Gene to GO BP  test for over-representation 
## 7175 GO BP ids tested (46 have p < 1e-04)
## Selected gene set size: 718 
##     Gene universe size: 4163 
##     Annotation package: hgu95av2

datatable(summary(hgOver))

Fisher and Hypergeometric testing

Wang Zhigang, wangzg.pumc@gmail.com

2024-11-08

选择BCR/ABL和ALL1/AF4的两类ALL

染色体定位过表示分析

使用超几何检验GO富集分析