Introduction to Analysis of Microarray data

Check installation of packages ….

installpkg <- function(pkg) {
    if (!require(pkg, character.only = T)) {
        source("http://bioconductor.org/biocLite.R")
        biocLite(pkg)
    } else {
        require(pkg, character.only = T)
    }
}
installpkg("golubEsets")
installpkg("GEOquery")
installpkg("affy")
library("pheatmap")

Read Annotation file

Download GPL file with annotations , put it in the current directory, and load it and

gpl96 <- getGEO(filename = "GPL96.annot")
Meta(gpl96)$title
## NULL
colnames(Table(gpl96))
##  [1] "ID"                    "Gene title"           
##  [3] "Gene symbol"           "Gene ID"              
##  [5] "UniGene title"         "UniGene symbol"       
##  [7] "UniGene ID"            "Nucleotide Title"     
##  [9] "GI"                    "GenBank Accession"    
## [11] "Platform_CLONEID"      "Platform_ORF"         
## [13] "Platform_SPOTID"       "Chromosome location"  
## [15] "Chromosome annotation" "GO:Function"          
## [17] "GO:Process"            "GO:Component"         
## [19] "GO:Function ID"        "GO:Process ID"        
## [21] "GO:Component ID"
Table(gpl96)[1:10, 1:4]
##           ID                                            Gene title
## 1  1007_s_at           discoidin domain receptor tyrosine kinase 1
## 2    1053_at           replication factor C (activator 1) 2, 40kDa
## 3     117_at                  heat shock 70kDa protein 6 (HSP70B')
## 4     121_at                                          paired box 8
## 5  1255_g_at               guanylate cyclase activator 1A (retina)
## 6    1294_at           ubiquitin-like modifier activating enzyme 7
## 7    1316_at                       thyroid hormone receptor, alpha
## 8    1320_at    protein tyrosine phosphatase, non-receptor type 21
## 9  1405_i_at                        chemokine (C-C motif) ligand 5
## 10   1431_at cytochrome P450, family 2, subfamily E, polypeptide 1
##    Gene symbol Gene ID
## 1         DDR1     780
## 2         RFC2    5982
## 3        HSPA6    3310
## 4         PAX8    7849
## 5       GUCA1A    2978
## 6         UBA7    7318
## 7         THRA    7067
## 8       PTPN21   11099
## 9         CCL5    6352
## 10      CYP2E1    1571
Table(gpl96)[1:10, c("ID", "Gene title")]
##           ID                                            Gene title
## 1  1007_s_at           discoidin domain receptor tyrosine kinase 1
## 2    1053_at           replication factor C (activator 1) 2, 40kDa
## 3     117_at                  heat shock 70kDa protein 6 (HSP70B')
## 4     121_at                                          paired box 8
## 5  1255_g_at               guanylate cyclase activator 1A (retina)
## 6    1294_at           ubiquitin-like modifier activating enzyme 7
## 7    1316_at                       thyroid hormone receptor, alpha
## 8    1320_at    protein tyrosine phosphatase, non-receptor type 21
## 9  1405_i_at                        chemokine (C-C motif) ligand 5
## 10   1431_at cytochrome P450, family 2, subfamily E, polypeptide 1
Table(gpl96)[1:10, c("ID", "Gene title", "Gene symbol", "Gene ID", "GenBank Accession")]
##           ID                                            Gene title
## 1  1007_s_at           discoidin domain receptor tyrosine kinase 1
## 2    1053_at           replication factor C (activator 1) 2, 40kDa
## 3     117_at                  heat shock 70kDa protein 6 (HSP70B')
## 4     121_at                                          paired box 8
## 5  1255_g_at               guanylate cyclase activator 1A (retina)
## 6    1294_at           ubiquitin-like modifier activating enzyme 7
## 7    1316_at                       thyroid hormone receptor, alpha
## 8    1320_at    protein tyrosine phosphatase, non-receptor type 21
## 9  1405_i_at                        chemokine (C-C motif) ligand 5
## 10   1431_at cytochrome P450, family 2, subfamily E, polypeptide 1
##    Gene symbol Gene ID GenBank Accession
## 1         DDR1     780            U48705
## 2         RFC2    5982            M87338
## 3        HSPA6    3310            X51757
## 4         PAX8    7849            X69699
## 5       GUCA1A    2978            L36861
## 6         UBA7    7318            L13852
## 7         THRA    7067            X55005
## 8       PTPN21   11099            X79510
## 9         CCL5    6352            M21121
## 10      CYP2E1    1571            J02843
IDs <- attr(dataTable(gpl96), "table")[, c("ID", "Gene symbol")]  #create a table of ID and gene symbols

Read Data

Reading the microarray data and skip 69 lines as data matrix starts from 70th line.


# This is the GSE7765 data you can use GEOQuery to retrieve the data.

x <- read.table("GSE7765-GPL96_series_matrix.txt", skip = 69, header = TRUE, 
    sep = "\t", row.names = 1, fill = TRUE)

Remove the last line from the matrix

x <- x[-22284, ]
dim(x)
## [1] 22283     6
names(x)
## [1] "GSM188013" "GSM188014" "GSM188016" "GSM188018" "GSM188020" "GSM188022"
# summarizing the the columns of data.
round(apply(x, 2, summary))
##         GSM188013 GSM188014 GSM188016 GSM188018 GSM188020 GSM188022
## Min.            1         1         2         2         1         1
## 1st Qu.       327       317       192       233       172       169
## Median       1150      1100       774       844       692       686
## Mean         3460      3450      3360      3500      3440      3540
## 3rd Qu.      3050      3050      2970      2880      2850      2860
## Max.       115000    113000     95900    115000    104000    119000

Change to Gene names

Replace the rownames with the corresponding gene symbols.

x <- as.matrix(x)
rows <- rownames(x)
rows[3200]
## [1] "203673_at"
IDs[which(IDs[, 1] == rows[3200]), 2]
## [1] TG
## 13211 Levels: ABCF1 ARF1 ARF3 ATP6V0B BAD C1D CANX CAPNS1 CBX3 CCL5 ... SNHG17
symb <- rep(0, length(rows))
for (i in 1:length(rows)) {
    symb[i] <- as.character(IDs[which(IDs[, 1] == rows[i]), 2])
}
rownames(x) <- symb
rownames(x)[500]
## [1] "TSPAN3"
logX <- log2(x)
round(apply(x, 2, summary), 3)
##         GSM188013 GSM188014 GSM188016 GSM188018 GSM188020 GSM188022
## Min.     7.35e-01  8.87e-01      2.01  2.45e+00  5.47e-01  8.06e-01
## 1st Qu.  3.27e+02  3.17e+02    192.00  2.33e+02  1.72e+02  1.69e+02
## Median   1.15e+03  1.10e+03    774.00  8.44e+02  6.92e+02  6.86e+02
## Mean     3.46e+03  3.45e+03   3360.00  3.50e+03  3.44e+03  3.54e+03
## 3rd Qu.  3.05e+03  3.05e+03   2970.00  2.88e+03  2.85e+03  2.86e+03
## Max.     1.15e+05  1.13e+05  95900.00  1.15e+05  1.04e+05  1.19e+05
round(apply(logX, 2, summary), 3)
##         GSM188013 GSM188014 GSM188016 GSM188018 GSM188020 GSM188022
## Min.       -0.443    -0.173      1.01      1.29    -0.871    -0.311
## 1st Qu.     8.350     8.310      7.58      7.86     7.430     7.400
## Median     10.200    10.100      9.60      9.72     9.430     9.420
## Mean        9.990     9.960      9.52      9.68     9.420     9.400
## 3rd Qu.    11.600    11.600     11.50     11.50    11.500    11.500
## Max.       16.800    16.800     16.50     16.80    16.700    16.900

Plot histograms

hist(x[, 1])

plot of chunk unnamed-chunk-6

hist(M.1 <- logX[, 1])

plot of chunk unnamed-chunk-6

hist(M.1, main = "Expression values", xlab = "Expression", ylab = "# of spots", 
    col = 4)

plot of chunk unnamed-chunk-6

MAPlot

opt <- par(mfcol = c(2, 1))
plot(X1 <- x[, 1], Y1 <- x[, 4])
d <- data.frame(X1, Y1)
abline(0, 1, col = "yellow")
A.1 <- log2(X1) - log2(Y1)
M.1 <- (log2(X1) + log2(Y1)) * 0.5
plot(M.1, A.1, col = densCols(A.1, M.1), pch = 20, cex = 0.5)
## KernSmooth 2.23 loaded
## Copyright M. P. Wand 1997-2009
abline(h = 0, col = "yellow")

plot of chunk unnamed-chunk-7


# Using ma.plot function from library(affy)
ma.plot(rowMeans(log2(d)), log2(X1) - log2(Y1), cex = 0.5, pch = 20, col = densCols(A.1, 
    M.1))
par(opt)

plot of chunk unnamed-chunk-7

Boxplots

Boxplots for the data of normal and log transformed.

opt <- par(mfcol = c(1, 2))
boxplot(x, col = c(2, 3, 2, 3, 2, 3), main = "Expression values for 3 control and 3 treated patients", 
    xlab = "Slides", ylab = "Expression", las = 2, cex.axis = 0.7)
boxplot(logX, col = c(2, 3, 2, 3, 2, 3), main = "Expression values for 3 control and 3 treated patients", 
    xlab = "Slides", ylab = "Expression", las = 2, cex.axis = 0.7)
abline(0, 0, col = "black")

plot of chunk unnamed-chunk-8

par(opt)

Setting Labels

Labels for Normal and Disease for 6 experiments

logX.cl <- c(0, 1, 0, 1, 0, 1)

Function to perfrom t-test

ttest = function(x) {
    tt = t.test(x[logX.cl == 0], x[logX.cl == 1])
    return(c(tt$statistic, tt$p.value))
}

Perform t-test of tranformed data

ans = apply(logX, 1, ttest)
ts <- ans[1, ]
pvals <- ans[2, ]

Check Normality

qqnorm(ts)
qqline(ts)

plot of chunk unnamed-chunk-12

How many genes with pvalues ?

## pvalues are given to retrieve the genes which pass the p values
for (i in c(0.01, 0.05, 0.001, 1e-04, 1e-05, 1e-06, 1e-07)) print(paste("genes with p-values smaller than", 
    i, length(which(pvals < i))))
## [1] "genes with p-values smaller than 0.01 82"
## [1] "genes with p-values smaller than 0.05 424"
## [1] "genes with p-values smaller than 0.001 7"
## [1] "genes with p-values smaller than 1e-04 0"
## [1] "genes with p-values smaller than 1e-05 0"
## [1] "genes with p-values smaller than 1e-06 0"
## [1] "genes with p-values smaller than 1e-07 0"

#Plot heatmap of differentially expressed genes: Genes are differentially expressed if its p-valueis under a given threshold, which must be smaller than the usual 0.05 or 0.01 due to multiplicity of tests

a <- data.frame(which(pvals < 0.01))
d <- a[, 1]
data <- as.matrix(x = x[d, ])
heatmap(data, col = topo.colors(100), cexRow = 0.5)

plot of chunk unnamed-chunk-14

## using Pheatmap drows = dist(data, method = 'minkowski') dcols =
## dist(t(data), method = 'minkowski') pheatmap(data,
## clustering_distance_rows = drows, clustering_distance_cols = dcols)