# Load the data
data(ALL)
# Get the number of samples and features
nsamples <- dim(ALL)[2]
nfeatures <- dim(ALL)[1]
# Get the phenotypes
pheno.dat <- pData(ALL)
# Get the average age
mu.age <- pheno.dat$age %>% mean(na.rm=T)
Ther are 128 samples and 12625 features in this data set and the average age is 32.4.
sub.idx <- intersect(grep('^B',pheno.dat$BT),which(pheno.dat$mol.biol %in% c('BCR/ABL','NEG')))
eset <- ALL[,sub.idx]
table(eset$mol)
##
## ALL1/AF4 BCR/ABL E2A/PBX1 NEG NUP-98 p15/p16
## 0 37 0 42 0 0
nsamps <- table(eset$mol) %>% sum
There are 79 sample numbers belonging to these groups.
f1 <- pOverA(0.25,log2(100))
f2 <- function(x) { IQR(x) > 0.5 }
ff <- filterfun(f1,f2)
selected <- genefilter(eset,ff)
nprobes.passed <- sum(selected)
esetSub <- eset[selected,]
After filtering, we have that 2391 probes passed.
tt <- rowttests(esetSub,'mol.biol')
q6.pval <- sum(tt$p.value<0.05)
We have that 451 features have nominal p-values less than 5% from the t-test.
pa <- p.adjust(tt$p.value,method='BH')
q7.pval <- sum(pa < 0.05)
We have that 108 features have a p-value of less than 5% after adjusting for the false discovery rate.
design <- cbind(1,diff=as.numeric(esetSub$mol.biol=='BCR/ABL'))
fit <- lmFit(esetSub,design)
fit2 <- eBayes(fit)
limma_pa <- topTable(fit2,number=length(fit2$F),coef='diff',adjust.method='fdr')
q8.pval <- sum(limma_pa$adj.P.Val<0.05)
We see that 112 features that are significant at an FDR of 5%.
As the figure shows below, the relationship between the limma and two-sample t-test results are linear in relationship. Indeed a quick regression shows that regression one on the other has an almost perfect R-squared.
limma.genes <- limma_pa %>% rownames
classical.genes <- tt %>% rownames
comp.dat <- tibble(limma=limma_pa$adj.P.Val,classical=pa[match(limma.genes,classical.genes)])
ggplot(comp.dat,aes(x=limma,y=classical)) +
geom_point() + labs(x='limma t-test',y='Two-sample t-test')
lm(limma~-1+classical,data=comp.dat) %>% summary
##
## Call:
## lm(formula = limma ~ -1 + classical, data = comp.dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.027757 -0.005191 -0.000605 0.003383 0.042834
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## classical 1.0002925 0.0002621 3817 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.00823 on 2390 degrees of freedom
## Multiple R-squared: 0.9998, Adjusted R-squared: 0.9998
## F-statistic: 1.457e+07 on 1 and 2390 DF, p-value: < 2.2e-16
diff <- order(pa)[1:10]
genesymbols <- mget(featureNames(esetSub)[diff],hgu95av2SYMBOL)
genesymbols %>% unlist
## 1636_g_at 39730_at 1635_at 40202_at 37027_at 39837_s_at
## "ABL1" "ABL1" "ABL1" "KLF9" "AHNAK" "ZNF467"
## 33774_at 36591_at 40480_s_at 37014_at
## "CASP8" "TUBA4A" "FYN" "MX1"
We see that the top associated gene is ABL1 which is affected by the translocation of characterizing the BCR/ABL samples.