Affymterix CEL files

We start by reading in the sample information table. This is usually created by the person who performed the experiment.

The raw data files for this lab are in the rawdata repository, available here:

https://github.com/genomicsclass/rawdata

Click Download ZIP in order to download all the files, unzip this file which should result in a rawdata-master folder.

First we save the initial working directory, so we can return to it.

wd <- getwd()

Now we can start reading in the files:

datadir <- "C:/Users/Prasanth/Documents/R/rawdata-master"
basedir <- paste0(datadir, "/celfiles")
setwd(basedir)
library(affy)

## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## 
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## 
## The following object is masked from 'package:stats':
## 
##     xtabs
## 
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, as.vector, cbind,
##     colnames, do.call, duplicated, eval, evalq, Filter, Find, get,
##     intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rep.int, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unlist, unsplit
## 
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.

tab <- read.delim("sampleinfo.txt",check.names=FALSE,as.is=TRUE)
rownames(tab) <- tab$filenames
tab

##                                       filenames 37777_at 684_at 1597_at
## 1521a99hpp_av06.CEL.gz   1521a99hpp_av06.CEL.gz     0.00   0.25     0.5
## 1532a99hpp_av04.CEL.gz   1532a99hpp_av04.CEL.gz     0.00   0.25     0.5
## 2353a99hpp_av08.CEL.gz   2353a99hpp_av08.CEL.gz     0.00   0.25     0.5
## 1521b99hpp_av06.CEL.gz   1521b99hpp_av06.CEL.gz     0.25   0.50     1.0
## 1532b99hpp_av04.CEL.gz   1532b99hpp_av04.CEL.gz     0.25   0.50     1.0
## 2353b99hpp_av08r.CEL.gz 2353b99hpp_av08r.CEL.gz     0.25   0.50     1.0
##                         38734_at 39058_at 36311_at 36889_at 1024_at
## 1521a99hpp_av06.CEL.gz         1        2        4        8      16
## 1532a99hpp_av04.CEL.gz         1        2        4        8      16
## 2353a99hpp_av08.CEL.gz         1        2        4        8      16
## 1521b99hpp_av06.CEL.gz         2        4        8       16      32
## 1532b99hpp_av04.CEL.gz         2        4        8       16      32
## 2353b99hpp_av08r.CEL.gz        2        4        8       16      32
##                         36202_at 36085_at 40322_at 407_at 1091_at 1708_at
## 1521a99hpp_av06.CEL.gz        32       64      128   0.00     512    1024
## 1532a99hpp_av04.CEL.gz        32       64      128   0.00     512    1024
## 2353a99hpp_av08.CEL.gz        32       64      128   0.00     512    1024
## 1521b99hpp_av06.CEL.gz        64      128      256   0.25    1024       0
## 1532b99hpp_av04.CEL.gz        64      128      256   0.25    1024       0
## 2353b99hpp_av08r.CEL.gz       64      128      256   0.25    1024       0
##                         33818_at 546_at
## 1521a99hpp_av06.CEL.gz       256     32
## 1532a99hpp_av04.CEL.gz       256     32
## 2353a99hpp_av08.CEL.gz       256     32
## 1521b99hpp_av06.CEL.gz       512     64
## 1532b99hpp_av04.CEL.gz       512     64
## 2353b99hpp_av08r.CEL.gz      512     64

fns <- list.celfiles()
fns

## [1] "1521a99hpp_av06.CEL.gz"  "1521b99hpp_av06.CEL.gz" 
## [3] "1532a99hpp_av04.CEL.gz"  "1532b99hpp_av04.CEL.gz" 
## [5] "2353a99hpp_av08.CEL.gz"  "2353b99hpp_av08r.CEL.gz"

fns %in% tab[,1] ##check

## [1] TRUE TRUE TRUE TRUE TRUE TRUE

ab <- ReadAffy(phenoData=tab)

This creates an AffyBatch object which object contains the information you need.

dim(pm(ab))

## Creating a generic function for 'nchar' from package 'base' in package 'S4Vectors'

## [1] 201807      6

dim(pData(ab))

## [1]  6 17

annotation(ab)

## [1] "hgu95a"

Note, this object You can then preprocess RMA

e <- rma(ab)

## Background correcting
## Normalizing
## Calculating Expression

dim(exprs(e))

## [1] 12626     6

dim(pData(e))

## [1]  6 17

dim(fData(e))

## [1] 12626     0

Now we go back to the previous working directory.

setwd(wd)

If you are not interested in probe level data you could can use this function

setwd(basedir)
ejust <- justRMA(filenames=tab[,1],phenoData=tab)
dim(ejust)

## Features  Samples 
##    12626        6

———————————————————————————————————-

Agilent data

library(limma)

## 
## Attaching package: 'limma'
## 
## The following object is masked from 'package:BiocGenerics':
## 
##     plotMA

library(rafalib)
basedir <- paste0(datadir, "/agilent")
setwd(basedir)
targets <- readTargets("TargetBeta7.txt")
# targets <- readTargets(file.path(basedir,"TargetBeta7.txt")) 
# this would combine both the two steps above into one. 
RG <- read.maimages(targets$FileName, source="genepix")

## Read 6Hs.195.1.gpr 
## Read 6Hs.168.gpr 
## Read 6Hs.166.gpr 
## Read 6Hs.187.1.gpr 
## Read 6Hs.194.gpr 
## Read 6Hs.243.1.gpr

MA <- MA.RG(RG,bc.method="none")
# The MA.RG function normalizes the data from the RG file as rate log ratio (M) and average of log is (A).  
# one advantage is that it will give a nice MA plot for the data. 
mypar(1,1)
plot(MA$A[,1], MA$M[,1]) # for the first sample.

imageplot(MA$M[,2], RG$printer, zlim=c(-3,3))

dev.off()

## null device 
##           1

Now we go back to the previous working directory.

setwd(wd)

————————————————————————————————————

oligo

We can also use oligo to read affy arrays

detach("package:affy")
library(oligo)

## Loading required package: oligoClasses
## Welcome to oligoClasses version 1.30.0
## Loading required package: Biostrings
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: IRanges
## Loading required package: XVector
## ===========================================================================
## Welcome to oligo version 1.32.0
## ===========================================================================
## 
## Attaching package: 'oligo'
## 
## The following object is masked from 'package:limma':
## 
##     backgroundCorrect

library(pd.hg.u95a)

## Loading required package: RSQLite
## Loading required package: DBI

basedir <- paste0(datadir,"/celfiles")
setwd(basedir)
tab <- read.delim("sampleinfo.txt",check.names=FALSE,as.is=TRUE)
fns <- list.celfiles(listGzipped=TRUE)
fns %in% tab[,1] ##check

## [1] TRUE TRUE TRUE TRUE TRUE TRUE

pd <- as(tab, "AnnotatedDataFrame")
efs <- read.celfiles(filenames=tab[,1],phenoData=pd,sampleNames=sampleNames(pd))

## Platform design info loaded.

## Reading in : 1521a99hpp_av06.CEL.gz
## Reading in : 1532a99hpp_av04.CEL.gz
## Reading in : 2353a99hpp_av08.CEL.gz
## Reading in : 1521b99hpp_av06.CEL.gz
## Reading in : 1532b99hpp_av04.CEL.gz
## Reading in : 2353b99hpp_av08r.CEL.gz

## Warning in read.celfiles(filenames = tab[, 1], phenoData = pd, sampleNames
## = sampleNames(pd)): 'channel' automatically added to varMetadata in
## phenoData.

e <- rma(efs)

## Background correcting
## Normalizing
## Calculating Expression

dim(pData(e))

## [1]  6 17

dim(exprs(e))

## [1] 12626     6

dim(fData(e))

## [1] 12626     0

setwd(wd)

——————————————————————————————————

EXERCISE

We start by reading in the sample information table. This is usually created by the person ho performed the experiment. The raw files for this lab are in the rawdata repository, available here.

You will also need to install the following library, which includes informaiton about the microarray platform we are working with here.

library(BiocInstaller)

## Bioconductor version 3.1 (BiocInstaller 1.18.4), ?biocLite for help

library(hgu95acdf)

Click Download ZIP in order to download all the files, unzip this file which should result in a rawdata-master folder.

Question 3.4.1

Once you download the raw data, locate the sampleinfo.txt file and read it using the funtion read.delim.

Based on the information on this table at what level was gene 36311_at spiked in for file 1521a99hpp_av06.CEL.gz.

# a file has been created as "tab" earlier reading sampleinfo.txt file. 
tab["1521a99hpp_av06.CEL.gz","36311_at"]

## [1] NA

Question 3.4.2

Now locate the celfiles and read them using the function ReadAffy, call the object ab.

Extract the feature IDs using the probeNames function. How many features are associated with gene, 36311_at?

library(affy)

## 
## Attaching package: 'affy'
## 
## The following objects are masked from 'package:oligo':
## 
##     intensity, MAplot, mm, mm<-, mmindex, pm, pm<-, pmindex,
##     probeNames, rma
## 
## The following object is masked from 'package:oligoClasses':
## 
##     list.celfiles

basedir <- paste0(datadir,"/celfiles")
setwd(basedir)
fns = list.celfiles()
fns

## [1] "1521a99hpp_av06.CEL.gz"  "1521b99hpp_av06.CEL.gz" 
## [3] "1532a99hpp_av04.CEL.gz"  "1532b99hpp_av04.CEL.gz" 
## [5] "2353a99hpp_av08.CEL.gz"  "2353b99hpp_av08r.CEL.gz"

all(fns %in% tab[,1])##check

## [1] TRUE

ab = ReadAffy(phenoData=tab)

## Warning: Mismatched phenoData and celfile names!
## 
## Please note that the row.names of your phenoData object should be identical to what you get from list.celfiles()!
## Otherwise you are responsible for ensuring that the ordering of your phenoData object conforms to the ordering of the celfiles as they are read into the AffyBatch!
## If not, errors may result from using the phenoData for subsetting or creating linear models, etc.

sum( probeNames(ab)=="36311_at")

## [1] 16

Note the differene between the number of genes and the number of probes

length( featureNames(ab))

## [1] 12626

length( probeNames(ab))

## [1] 201807

This is from an Affymetrix array and we explained in the videos that several probes are used for each gene.

Question 3.4.3

Use the pm function to create a matrix of the probe level intensities. Extract the rows associated with 36085_at. This is one of the spiked in genes and we can see their intended concentrations in pData(ab). Call the resulting matrix mat.

Use the function matplot, with the type="l" option, to plot the observed intensities in mat for samples 1532a99hpp_av04.CEL.gz and 1532b99hpp_av04.CEL.gz against their intended concentrations. Then plot the log-ratios (log base 2) comparing arrays 1532a99hpp_av04.CEL.gz and 1532b99hpp_av04.CEL.gz for each probe.

pid = "36085_at"
##which columns should we use?
ind = which(pData(ab)[,1]%in%c("1532a99hpp_av04.CEL.gz","1532b99hpp_av04.CEL.gz"))

##extract the correct rows
mat = pm(ab) [ probeNames(ab)==pid, ind] 

##what are the intended conc
conc = pData(ab)[ind, pid]

##make the plots
library(rafalib)
mypar(1,1)
matplot(conc, t(mat), log="y", type="l")

##now compute log fold changesa
lfc = log2(mat[,2]/mat[,1])
stripchart(lfc,vertical=TRUE,ylim=c(-0.5,1.5))
abline(h=log2(conc[2]/conc[1])) #intended log fold
abline(h=0)

Question 3.4.4

Run the function rma on the ab object created above with the default parameters. Note that the returned values are in the log-scale. Take a close look at the intended concentration in pData to note there are two groups of triplicates. Note you can simply do this.

e = rma(ab)

## Background correcting
## Normalizing
## Calculating Expression

exprs(e)[1:3,]

##                 1        2        3        4        5        6
## 100_g_at 7.458726 7.320466 7.321118 7.333904 7.203733 7.486129
## 1000_at  6.270813 6.046439 6.276017 6.052164 6.037026 6.262098
## 1001_at  4.345569 4.330692 4.546023 4.368368 4.132321 4.374769

exprs(ab)[1:3,]

##        1      2      3    4      5    6
## 1  282.0  217.0  236.0  169  188.0  148
## 2 9314.8 8718.8 8727.3 7695 6488.8 5405
## 3  253.0  262.0  213.0  190  170.0  137

g = factor(pData(ab)[,2])
# any column other than the first will do

Use the rowttests function in the genefilter package to compute t-test for each gene comparing these two groups.

What is the p-value for gene 36085_at?

library(genefilter)

## 
## Attaching package: 'genefilter'
## 
## The following object is masked from 'package:base':
## 
##     anyNA

myttest = rowttests(exprs(e), g)
myttest["36085_at","p.value"]

## [1] 0.3095655

Now that if we are not interested in the raw probe-level data we can go straight from file to expression values with:

setwd(basedir)
ejust = justRMA(filenames=tab[,1],phenoData=tab)

## Warning: Mismatched phenoData and celfile names!
## 
## Please note that the row.names of your phenoData object should be identical to what you get from list.celfiles()!
## Otherwise you are responsible for ensuring that the ordering of your phenoData object conforms to the ordering of the celfiles as they are read into the AffyBatch!
## If not, errors may result from using the phenoData for subsetting or creating linear models, etc.

se

## standardGeneric for "se" defined from package "oligo"
## 
## function (object) 
## standardGeneric("se")
## <environment: 0x00000000198fe968>
## Methods may be defined for arguments: object
## Use  showMethods("se")  for currently available ones.

Question 3.4.5

To get a general sense of the sensitivity and specificity of the technology, make a boxplot of the effect sizes (the dm column of the object returns by rowttests) for the spiked-in genes and the rest:

boxplot(myttest$dm)

# more clearly

g = factor(pData(e)[,2])
tt = rowttests(exprs(e),g)
lfc = -tt$dm

sig = colnames(pData(ab))[-1]
boxplot( split(lfc, rownames(tt)%in%sig))

##close up 
boxplot( split(lfc, rownames(tt)%in%sig),ylim=c(-1,1))

————————————————————————————–

Agilent Data Exercise

Question 3.4.6

The raw data files here also include two color arrays.

Read in the data using these commands:

library(limma)
datadir = "C:/Users/Prasanth/Documents/R/rawdata-master"
basedir = file.path(datadir, "/agilent")
setwd(basedir)
targets = readTargets("TargetBeta7.txt")
RG = read.maimages(targets$FileName, source="genepix")

## Read 6Hs.195.1.gpr 
## Read 6Hs.168.gpr 
## Read 6Hs.166.gpr 
## Read 6Hs.187.1.gpr 
## Read 6Hs.194.gpr 
## Read 6Hs.243.1.gpr

What is the log (base 2) ratio of the red to green channel for gene with ID “H200015482” on sample 6Hs.168 (hint: RG is just a list and RG$genes has information about genes)

i = which(RG$genes$ID=="H200015482")
j = which(rownames(RG$targets)=="6Hs.168")
log2(RG$R[i,j]/RG$G[i,j])

##   6Hs.168 
## 0.6456647

Note, you can also do this:

MA = MA.RG(RG,bc.method="none")
MA$M[i,j]

##   6Hs.168 
## 0.6456647

Now we go back to the previous working directory.

setwd(wd)

Reading in microarray data

Affymterix CEL files

———————————————————————————————————-

Agilent data

————————————————————————————————————

oligo

——————————————————————————————————

EXERCISE

Question 3.4.1

Question 3.4.2

Question 3.4.3

Question 3.4.4

Question 3.4.5

————————————————————————————–

Agilent Data Exercise

Question 3.4.6