Questions 1,2, and 3

# Load the data
data(ALL) 
# Get the number of samples and features
nsamples <- dim(ALL)[2]
nfeatures <- dim(ALL)[1]
# Get the phenotypes
pheno.dat <- pData(ALL) 
# Get the average age
mu.age <- pheno.dat$age %>% mean(na.rm=T)

Ther are 128 samples and 12625 features in this data set and the average age is 32.4.

Question 4

sub.idx <- intersect(grep('^B',pheno.dat$BT),which(pheno.dat$mol.biol %in% c('BCR/ABL','NEG')))
eset <- ALL[,sub.idx]
table(eset$mol)
## 
## ALL1/AF4  BCR/ABL E2A/PBX1      NEG   NUP-98  p15/p16 
##        0       37        0       42        0        0
nsamps <- table(eset$mol) %>% sum

There are 79 sample numbers belonging to these groups.

Question 5

f1 <- pOverA(0.25,log2(100))
f2 <- function(x) { IQR(x) > 0.5 }
ff <- filterfun(f1,f2)
selected <- genefilter(eset,ff)
nprobes.passed <- sum(selected)
esetSub <- eset[selected,]

After filtering, we have that 2391 probes passed.

Question 6

tt <- rowttests(esetSub,'mol.biol')
q6.pval <- sum(tt$p.value<0.05)

We have that 451 features have nominal p-values less than 5% from the t-test.

Question 7

pa <- p.adjust(tt$p.value,method='BH')
q7.pval <- sum(pa < 0.05)

We have that 108 features have a p-value of less than 5% after adjusting for the false discovery rate.

Question 8

design <- cbind(1,diff=as.numeric(esetSub$mol.biol=='BCR/ABL'))
fit <- lmFit(esetSub,design)
fit2 <- eBayes(fit)
limma_pa <- topTable(fit2,number=length(fit2$F),coef='diff',adjust.method='fdr')
q8.pval <-  sum(limma_pa$adj.P.Val<0.05)

We see that 112 features that are significant at an FDR of 5%.

Question 9

As the figure shows below, the relationship between the limma and two-sample t-test results are linear in relationship. Indeed a quick regression shows that regression one on the other has an almost perfect R-squared.

limma.genes <- limma_pa %>% rownames
classical.genes <- tt %>% rownames
comp.dat <- tibble(limma=limma_pa$adj.P.Val,classical=pa[match(limma.genes,classical.genes)])
ggplot(comp.dat,aes(x=limma,y=classical)) + 
  geom_point() + labs(x='limma t-test',y='Two-sample t-test')

lm(limma~-1+classical,data=comp.dat) %>% summary
## 
## Call:
## lm(formula = limma ~ -1 + classical, data = comp.dat)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.027757 -0.005191 -0.000605  0.003383  0.042834 
## 
## Coefficients:
##            Estimate Std. Error t value Pr(>|t|)    
## classical 1.0002925  0.0002621    3817   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.00823 on 2390 degrees of freedom
## Multiple R-squared:  0.9998, Adjusted R-squared:  0.9998 
## F-statistic: 1.457e+07 on 1 and 2390 DF,  p-value: < 2.2e-16

Question 10

diff <- order(pa)[1:10]
genesymbols <- mget(featureNames(esetSub)[diff],hgu95av2SYMBOL)
genesymbols %>% unlist
##  1636_g_at   39730_at    1635_at   40202_at   37027_at 39837_s_at 
##     "ABL1"     "ABL1"     "ABL1"     "KLF9"    "AHNAK"   "ZNF467" 
##   33774_at   36591_at 40480_s_at   37014_at 
##    "CASP8"   "TUBA4A"      "FYN"      "MX1"

We see that the top associated gene is ABL1 which is affected by the translocation of characterizing the BCR/ABL samples.