We start by reading in the sample information table. This is usually created by the person who performed the experiment.
The raw data files for this lab are in the rawdata repository, available here:
https://github.com/genomicsclass/rawdata
Click Download ZIP in order to download all the files, unzip this file which should result in a rawdata-master folder.
First we save the initial working directory, so we can return to it.
wd <- getwd()
Now we can start reading in the files:
datadir <- "C:/Users/Prasanth/Documents/R/rawdata-master"
basedir <- paste0(datadir, "/celfiles")
setwd(basedir)
library(affy)
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
##
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
##
## The following object is masked from 'package:stats':
##
## xtabs
##
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, as.vector, cbind,
## colnames, do.call, duplicated, eval, evalq, Filter, Find, get,
## intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rep.int, rownames, sapply, setdiff, sort,
## table, tapply, union, unique, unlist, unsplit
##
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
tab <- read.delim("sampleinfo.txt",check.names=FALSE,as.is=TRUE)
rownames(tab) <- tab$filenames
tab
## filenames 37777_at 684_at 1597_at
## 1521a99hpp_av06.CEL.gz 1521a99hpp_av06.CEL.gz 0.00 0.25 0.5
## 1532a99hpp_av04.CEL.gz 1532a99hpp_av04.CEL.gz 0.00 0.25 0.5
## 2353a99hpp_av08.CEL.gz 2353a99hpp_av08.CEL.gz 0.00 0.25 0.5
## 1521b99hpp_av06.CEL.gz 1521b99hpp_av06.CEL.gz 0.25 0.50 1.0
## 1532b99hpp_av04.CEL.gz 1532b99hpp_av04.CEL.gz 0.25 0.50 1.0
## 2353b99hpp_av08r.CEL.gz 2353b99hpp_av08r.CEL.gz 0.25 0.50 1.0
## 38734_at 39058_at 36311_at 36889_at 1024_at
## 1521a99hpp_av06.CEL.gz 1 2 4 8 16
## 1532a99hpp_av04.CEL.gz 1 2 4 8 16
## 2353a99hpp_av08.CEL.gz 1 2 4 8 16
## 1521b99hpp_av06.CEL.gz 2 4 8 16 32
## 1532b99hpp_av04.CEL.gz 2 4 8 16 32
## 2353b99hpp_av08r.CEL.gz 2 4 8 16 32
## 36202_at 36085_at 40322_at 407_at 1091_at 1708_at
## 1521a99hpp_av06.CEL.gz 32 64 128 0.00 512 1024
## 1532a99hpp_av04.CEL.gz 32 64 128 0.00 512 1024
## 2353a99hpp_av08.CEL.gz 32 64 128 0.00 512 1024
## 1521b99hpp_av06.CEL.gz 64 128 256 0.25 1024 0
## 1532b99hpp_av04.CEL.gz 64 128 256 0.25 1024 0
## 2353b99hpp_av08r.CEL.gz 64 128 256 0.25 1024 0
## 33818_at 546_at
## 1521a99hpp_av06.CEL.gz 256 32
## 1532a99hpp_av04.CEL.gz 256 32
## 2353a99hpp_av08.CEL.gz 256 32
## 1521b99hpp_av06.CEL.gz 512 64
## 1532b99hpp_av04.CEL.gz 512 64
## 2353b99hpp_av08r.CEL.gz 512 64
fns <- list.celfiles()
fns
## [1] "1521a99hpp_av06.CEL.gz" "1521b99hpp_av06.CEL.gz"
## [3] "1532a99hpp_av04.CEL.gz" "1532b99hpp_av04.CEL.gz"
## [5] "2353a99hpp_av08.CEL.gz" "2353b99hpp_av08r.CEL.gz"
fns %in% tab[,1] ##check
## [1] TRUE TRUE TRUE TRUE TRUE TRUE
ab <- ReadAffy(phenoData=tab)
This creates an AffyBatch object which object contains the information you need.
dim(pm(ab))
## Creating a generic function for 'nchar' from package 'base' in package 'S4Vectors'
## [1] 201807 6
dim(pData(ab))
## [1] 6 17
annotation(ab)
## [1] "hgu95a"
Note, this object You can then preprocess RMA
e <- rma(ab)
## Background correcting
## Normalizing
## Calculating Expression
dim(exprs(e))
## [1] 12626 6
dim(pData(e))
## [1] 6 17
dim(fData(e))
## [1] 12626 0
Now we go back to the previous working directory.
setwd(wd)
If you are not interested in probe level data you could can use this function
setwd(basedir)
ejust <- justRMA(filenames=tab[,1],phenoData=tab)
dim(ejust)
## Features Samples
## 12626 6
library(limma)
##
## Attaching package: 'limma'
##
## The following object is masked from 'package:BiocGenerics':
##
## plotMA
library(rafalib)
basedir <- paste0(datadir, "/agilent")
setwd(basedir)
targets <- readTargets("TargetBeta7.txt")
# targets <- readTargets(file.path(basedir,"TargetBeta7.txt"))
# this would combine both the two steps above into one.
RG <- read.maimages(targets$FileName, source="genepix")
## Read 6Hs.195.1.gpr
## Read 6Hs.168.gpr
## Read 6Hs.166.gpr
## Read 6Hs.187.1.gpr
## Read 6Hs.194.gpr
## Read 6Hs.243.1.gpr
MA <- MA.RG(RG,bc.method="none")
# The MA.RG function normalizes the data from the RG file as rate log ratio (M) and average of log is (A).
# one advantage is that it will give a nice MA plot for the data.
mypar(1,1)
plot(MA$A[,1], MA$M[,1]) # for the first sample.
imageplot(MA$M[,2], RG$printer, zlim=c(-3,3))
dev.off()
## null device
## 1
Now we go back to the previous working directory.
setwd(wd)
We can also use oligo to read affy arrays
detach("package:affy")
library(oligo)
## Loading required package: oligoClasses
## Welcome to oligoClasses version 1.30.0
## Loading required package: Biostrings
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: IRanges
## Loading required package: XVector
## ===========================================================================
## Welcome to oligo version 1.32.0
## ===========================================================================
##
## Attaching package: 'oligo'
##
## The following object is masked from 'package:limma':
##
## backgroundCorrect
library(pd.hg.u95a)
## Loading required package: RSQLite
## Loading required package: DBI
basedir <- paste0(datadir,"/celfiles")
setwd(basedir)
tab <- read.delim("sampleinfo.txt",check.names=FALSE,as.is=TRUE)
fns <- list.celfiles(listGzipped=TRUE)
fns %in% tab[,1] ##check
## [1] TRUE TRUE TRUE TRUE TRUE TRUE
pd <- as(tab, "AnnotatedDataFrame")
efs <- read.celfiles(filenames=tab[,1],phenoData=pd,sampleNames=sampleNames(pd))
## Platform design info loaded.
## Reading in : 1521a99hpp_av06.CEL.gz
## Reading in : 1532a99hpp_av04.CEL.gz
## Reading in : 2353a99hpp_av08.CEL.gz
## Reading in : 1521b99hpp_av06.CEL.gz
## Reading in : 1532b99hpp_av04.CEL.gz
## Reading in : 2353b99hpp_av08r.CEL.gz
## Warning in read.celfiles(filenames = tab[, 1], phenoData = pd, sampleNames
## = sampleNames(pd)): 'channel' automatically added to varMetadata in
## phenoData.
e <- rma(efs)
## Background correcting
## Normalizing
## Calculating Expression
dim(pData(e))
## [1] 6 17
dim(exprs(e))
## [1] 12626 6
dim(fData(e))
## [1] 12626 0
setwd(wd)
We start by reading in the sample information table. This is usually created by the person ho performed the experiment. The raw files for this lab are in the rawdata repository, available here.
You will also need to install the following library, which includes informaiton about the microarray platform we are working with here.
library(BiocInstaller)
## Bioconductor version 3.1 (BiocInstaller 1.18.4), ?biocLite for help
library(hgu95acdf)
Click Download ZIP in order to download all the files, unzip this file which should result in a rawdata-master folder.
Once you download the raw data, locate the sampleinfo.txt file and read it using the funtion read.delim.
Based on the information on this table at what level was gene 36311_at spiked in for file 1521a99hpp_av06.CEL.gz.
# a file has been created as "tab" earlier reading sampleinfo.txt file.
tab["1521a99hpp_av06.CEL.gz","36311_at"]
## [1] NA
Now locate the celfiles and read them using the function ReadAffy, call the object ab.
Extract the feature IDs using the probeNames function. How many features are associated with gene, 36311_at?
library(affy)
##
## Attaching package: 'affy'
##
## The following objects are masked from 'package:oligo':
##
## intensity, MAplot, mm, mm<-, mmindex, pm, pm<-, pmindex,
## probeNames, rma
##
## The following object is masked from 'package:oligoClasses':
##
## list.celfiles
basedir <- paste0(datadir,"/celfiles")
setwd(basedir)
fns = list.celfiles()
fns
## [1] "1521a99hpp_av06.CEL.gz" "1521b99hpp_av06.CEL.gz"
## [3] "1532a99hpp_av04.CEL.gz" "1532b99hpp_av04.CEL.gz"
## [5] "2353a99hpp_av08.CEL.gz" "2353b99hpp_av08r.CEL.gz"
all(fns %in% tab[,1])##check
## [1] TRUE
ab = ReadAffy(phenoData=tab)
## Warning: Mismatched phenoData and celfile names!
##
## Please note that the row.names of your phenoData object should be identical to what you get from list.celfiles()!
## Otherwise you are responsible for ensuring that the ordering of your phenoData object conforms to the ordering of the celfiles as they are read into the AffyBatch!
## If not, errors may result from using the phenoData for subsetting or creating linear models, etc.
sum( probeNames(ab)=="36311_at")
## [1] 16
Note the differene between the number of genes and the number of probes
length( featureNames(ab))
## [1] 12626
length( probeNames(ab))
## [1] 201807
This is from an Affymetrix array and we explained in the videos that several probes are used for each gene.
Use the pm function to create a matrix of the probe level intensities. Extract the rows associated with 36085_at. This is one of the spiked in genes and we can see their intended concentrations in pData(ab). Call the resulting matrix mat.
Use the function matplot, with the type="l" option, to plot the observed intensities in mat for samples 1532a99hpp_av04.CEL.gz and 1532b99hpp_av04.CEL.gz against their intended concentrations. Then plot the log-ratios (log base 2) comparing arrays 1532a99hpp_av04.CEL.gz and 1532b99hpp_av04.CEL.gz for each probe.
pid = "36085_at"
##which columns should we use?
ind = which(pData(ab)[,1]%in%c("1532a99hpp_av04.CEL.gz","1532b99hpp_av04.CEL.gz"))
##extract the correct rows
mat = pm(ab) [ probeNames(ab)==pid, ind]
##what are the intended conc
conc = pData(ab)[ind, pid]
##make the plots
library(rafalib)
mypar(1,1)
matplot(conc, t(mat), log="y", type="l")
##now compute log fold changesa
lfc = log2(mat[,2]/mat[,1])
stripchart(lfc,vertical=TRUE,ylim=c(-0.5,1.5))
abline(h=log2(conc[2]/conc[1])) #intended log fold
abline(h=0)
Run the function rma on the ab object created above with the default parameters. Note that the returned values are in the log-scale. Take a close look at the intended concentration in pData to note there are two groups of triplicates. Note you can simply do this.
e = rma(ab)
## Background correcting
## Normalizing
## Calculating Expression
exprs(e)[1:3,]
## 1 2 3 4 5 6
## 100_g_at 7.458726 7.320466 7.321118 7.333904 7.203733 7.486129
## 1000_at 6.270813 6.046439 6.276017 6.052164 6.037026 6.262098
## 1001_at 4.345569 4.330692 4.546023 4.368368 4.132321 4.374769
exprs(ab)[1:3,]
## 1 2 3 4 5 6
## 1 282.0 217.0 236.0 169 188.0 148
## 2 9314.8 8718.8 8727.3 7695 6488.8 5405
## 3 253.0 262.0 213.0 190 170.0 137
g = factor(pData(ab)[,2])
# any column other than the first will do
Use the rowttests function in the genefilter package to compute t-test for each gene comparing these two groups.
What is the p-value for gene 36085_at?
library(genefilter)
##
## Attaching package: 'genefilter'
##
## The following object is masked from 'package:base':
##
## anyNA
myttest = rowttests(exprs(e), g)
myttest["36085_at","p.value"]
## [1] 0.3095655
Now that if we are not interested in the raw probe-level data we can go straight from file to expression values with:
setwd(basedir)
ejust = justRMA(filenames=tab[,1],phenoData=tab)
## Warning: Mismatched phenoData and celfile names!
##
## Please note that the row.names of your phenoData object should be identical to what you get from list.celfiles()!
## Otherwise you are responsible for ensuring that the ordering of your phenoData object conforms to the ordering of the celfiles as they are read into the AffyBatch!
## If not, errors may result from using the phenoData for subsetting or creating linear models, etc.
se
## standardGeneric for "se" defined from package "oligo"
##
## function (object)
## standardGeneric("se")
## <environment: 0x00000000198fe968>
## Methods may be defined for arguments: object
## Use showMethods("se") for currently available ones.
To get a general sense of the sensitivity and specificity of the technology, make a boxplot of the effect sizes (the dm column of the object returns by rowttests) for the spiked-in genes and the rest:
boxplot(myttest$dm)
# more clearly
g = factor(pData(e)[,2])
tt = rowttests(exprs(e),g)
lfc = -tt$dm
sig = colnames(pData(ab))[-1]
boxplot( split(lfc, rownames(tt)%in%sig))
##close up
boxplot( split(lfc, rownames(tt)%in%sig),ylim=c(-1,1))
The raw data files here also include two color arrays.
Read in the data using these commands:
library(limma)
datadir = "C:/Users/Prasanth/Documents/R/rawdata-master"
basedir = file.path(datadir, "/agilent")
setwd(basedir)
targets = readTargets("TargetBeta7.txt")
RG = read.maimages(targets$FileName, source="genepix")
## Read 6Hs.195.1.gpr
## Read 6Hs.168.gpr
## Read 6Hs.166.gpr
## Read 6Hs.187.1.gpr
## Read 6Hs.194.gpr
## Read 6Hs.243.1.gpr
What is the log (base 2) ratio of the red to green channel for gene with ID “H200015482” on sample 6Hs.168 (hint: RG is just a list and RG$genes has information about genes)
i = which(RG$genes$ID=="H200015482")
j = which(rownames(RG$targets)=="6Hs.168")
log2(RG$R[i,j]/RG$G[i,j])
## 6Hs.168
## 0.6456647
Note, you can also do this:
MA = MA.RG(RG,bc.method="none")
MA$M[i,j]
## 6Hs.168
## 0.6456647
Now we go back to the previous working directory.
setwd(wd)