Check installation of packages ….
installpkg <- function(pkg) {
if (!require(pkg, character.only = T)) {
source("http://bioconductor.org/biocLite.R")
biocLite(pkg)
} else {
require(pkg, character.only = T)
}
}
installpkg("golubEsets")
installpkg("GEOquery")
installpkg("affy")
library("pheatmap")
Download GPL file with annotations , put it in the current directory, and load it and
gpl96 <- getGEO(filename = "GPL96.annot")
Meta(gpl96)$title
## NULL
colnames(Table(gpl96))
## [1] "ID" "Gene title"
## [3] "Gene symbol" "Gene ID"
## [5] "UniGene title" "UniGene symbol"
## [7] "UniGene ID" "Nucleotide Title"
## [9] "GI" "GenBank Accession"
## [11] "Platform_CLONEID" "Platform_ORF"
## [13] "Platform_SPOTID" "Chromosome location"
## [15] "Chromosome annotation" "GO:Function"
## [17] "GO:Process" "GO:Component"
## [19] "GO:Function ID" "GO:Process ID"
## [21] "GO:Component ID"
Table(gpl96)[1:10, 1:4]
## ID Gene title
## 1 1007_s_at discoidin domain receptor tyrosine kinase 1
## 2 1053_at replication factor C (activator 1) 2, 40kDa
## 3 117_at heat shock 70kDa protein 6 (HSP70B')
## 4 121_at paired box 8
## 5 1255_g_at guanylate cyclase activator 1A (retina)
## 6 1294_at ubiquitin-like modifier activating enzyme 7
## 7 1316_at thyroid hormone receptor, alpha
## 8 1320_at protein tyrosine phosphatase, non-receptor type 21
## 9 1405_i_at chemokine (C-C motif) ligand 5
## 10 1431_at cytochrome P450, family 2, subfamily E, polypeptide 1
## Gene symbol Gene ID
## 1 DDR1 780
## 2 RFC2 5982
## 3 HSPA6 3310
## 4 PAX8 7849
## 5 GUCA1A 2978
## 6 UBA7 7318
## 7 THRA 7067
## 8 PTPN21 11099
## 9 CCL5 6352
## 10 CYP2E1 1571
Table(gpl96)[1:10, c("ID", "Gene title")]
## ID Gene title
## 1 1007_s_at discoidin domain receptor tyrosine kinase 1
## 2 1053_at replication factor C (activator 1) 2, 40kDa
## 3 117_at heat shock 70kDa protein 6 (HSP70B')
## 4 121_at paired box 8
## 5 1255_g_at guanylate cyclase activator 1A (retina)
## 6 1294_at ubiquitin-like modifier activating enzyme 7
## 7 1316_at thyroid hormone receptor, alpha
## 8 1320_at protein tyrosine phosphatase, non-receptor type 21
## 9 1405_i_at chemokine (C-C motif) ligand 5
## 10 1431_at cytochrome P450, family 2, subfamily E, polypeptide 1
Table(gpl96)[1:10, c("ID", "Gene title", "Gene symbol", "Gene ID", "GenBank Accession")]
## ID Gene title
## 1 1007_s_at discoidin domain receptor tyrosine kinase 1
## 2 1053_at replication factor C (activator 1) 2, 40kDa
## 3 117_at heat shock 70kDa protein 6 (HSP70B')
## 4 121_at paired box 8
## 5 1255_g_at guanylate cyclase activator 1A (retina)
## 6 1294_at ubiquitin-like modifier activating enzyme 7
## 7 1316_at thyroid hormone receptor, alpha
## 8 1320_at protein tyrosine phosphatase, non-receptor type 21
## 9 1405_i_at chemokine (C-C motif) ligand 5
## 10 1431_at cytochrome P450, family 2, subfamily E, polypeptide 1
## Gene symbol Gene ID GenBank Accession
## 1 DDR1 780 U48705
## 2 RFC2 5982 M87338
## 3 HSPA6 3310 X51757
## 4 PAX8 7849 X69699
## 5 GUCA1A 2978 L36861
## 6 UBA7 7318 L13852
## 7 THRA 7067 X55005
## 8 PTPN21 11099 X79510
## 9 CCL5 6352 M21121
## 10 CYP2E1 1571 J02843
IDs <- attr(dataTable(gpl96), "table")[, c("ID", "Gene symbol")] #create a table of ID and gene symbols
Reading the microarray data and skip 69 lines as data matrix starts from 70th line.
# This is the GSE7765 data you can use GEOQuery to retrieve the data.
x <- read.table("GSE7765-GPL96_series_matrix.txt", skip = 69, header = TRUE,
sep = "\t", row.names = 1, fill = TRUE)
Remove the last line from the matrix
x <- x[-22284, ]
dim(x)
## [1] 22283 6
names(x)
## [1] "GSM188013" "GSM188014" "GSM188016" "GSM188018" "GSM188020" "GSM188022"
# summarizing the the columns of data.
round(apply(x, 2, summary))
## GSM188013 GSM188014 GSM188016 GSM188018 GSM188020 GSM188022
## Min. 1 1 2 2 1 1
## 1st Qu. 327 317 192 233 172 169
## Median 1150 1100 774 844 692 686
## Mean 3460 3450 3360 3500 3440 3540
## 3rd Qu. 3050 3050 2970 2880 2850 2860
## Max. 115000 113000 95900 115000 104000 119000
Replace the rownames with the corresponding gene symbols.
x <- as.matrix(x)
rows <- rownames(x)
rows[3200]
## [1] "203673_at"
IDs[which(IDs[, 1] == rows[3200]), 2]
## [1] TG
## 13211 Levels: ABCF1 ARF1 ARF3 ATP6V0B BAD C1D CANX CAPNS1 CBX3 CCL5 ... SNHG17
symb <- rep(0, length(rows))
for (i in 1:length(rows)) {
symb[i] <- as.character(IDs[which(IDs[, 1] == rows[i]), 2])
}
rownames(x) <- symb
rownames(x)[500]
## [1] "TSPAN3"
logX <- log2(x)
round(apply(x, 2, summary), 3)
## GSM188013 GSM188014 GSM188016 GSM188018 GSM188020 GSM188022
## Min. 7.35e-01 8.87e-01 2.01 2.45e+00 5.47e-01 8.06e-01
## 1st Qu. 3.27e+02 3.17e+02 192.00 2.33e+02 1.72e+02 1.69e+02
## Median 1.15e+03 1.10e+03 774.00 8.44e+02 6.92e+02 6.86e+02
## Mean 3.46e+03 3.45e+03 3360.00 3.50e+03 3.44e+03 3.54e+03
## 3rd Qu. 3.05e+03 3.05e+03 2970.00 2.88e+03 2.85e+03 2.86e+03
## Max. 1.15e+05 1.13e+05 95900.00 1.15e+05 1.04e+05 1.19e+05
round(apply(logX, 2, summary), 3)
## GSM188013 GSM188014 GSM188016 GSM188018 GSM188020 GSM188022
## Min. -0.443 -0.173 1.01 1.29 -0.871 -0.311
## 1st Qu. 8.350 8.310 7.58 7.86 7.430 7.400
## Median 10.200 10.100 9.60 9.72 9.430 9.420
## Mean 9.990 9.960 9.52 9.68 9.420 9.400
## 3rd Qu. 11.600 11.600 11.50 11.50 11.500 11.500
## Max. 16.800 16.800 16.50 16.80 16.700 16.900
hist(x[, 1])
hist(M.1 <- logX[, 1])
hist(M.1, main = "Expression values", xlab = "Expression", ylab = "# of spots",
col = 4)
opt <- par(mfcol = c(2, 1))
plot(X1 <- x[, 1], Y1 <- x[, 4])
d <- data.frame(X1, Y1)
abline(0, 1, col = "yellow")
A.1 <- log2(X1) - log2(Y1)
M.1 <- (log2(X1) + log2(Y1)) * 0.5
plot(M.1, A.1, col = densCols(A.1, M.1), pch = 20, cex = 0.5)
## KernSmooth 2.23 loaded
## Copyright M. P. Wand 1997-2009
abline(h = 0, col = "yellow")
# Using ma.plot function from library(affy)
ma.plot(rowMeans(log2(d)), log2(X1) - log2(Y1), cex = 0.5, pch = 20, col = densCols(A.1,
M.1))
par(opt)
Boxplots for the data of normal and log transformed.
opt <- par(mfcol = c(1, 2))
boxplot(x, col = c(2, 3, 2, 3, 2, 3), main = "Expression values for 3 control and 3 treated patients",
xlab = "Slides", ylab = "Expression", las = 2, cex.axis = 0.7)
boxplot(logX, col = c(2, 3, 2, 3, 2, 3), main = "Expression values for 3 control and 3 treated patients",
xlab = "Slides", ylab = "Expression", las = 2, cex.axis = 0.7)
abline(0, 0, col = "black")
par(opt)
Labels for Normal and Disease for 6 experiments
logX.cl <- c(0, 1, 0, 1, 0, 1)
ttest = function(x) {
tt = t.test(x[logX.cl == 0], x[logX.cl == 1])
return(c(tt$statistic, tt$p.value))
}
ans = apply(logX, 1, ttest)
ts <- ans[1, ]
pvals <- ans[2, ]
qqnorm(ts)
qqline(ts)
## pvalues are given to retrieve the genes which pass the p values
for (i in c(0.01, 0.05, 0.001, 1e-04, 1e-05, 1e-06, 1e-07)) print(paste("genes with p-values smaller than",
i, length(which(pvals < i))))
## [1] "genes with p-values smaller than 0.01 82"
## [1] "genes with p-values smaller than 0.05 424"
## [1] "genes with p-values smaller than 0.001 7"
## [1] "genes with p-values smaller than 1e-04 0"
## [1] "genes with p-values smaller than 1e-05 0"
## [1] "genes with p-values smaller than 1e-06 0"
## [1] "genes with p-values smaller than 1e-07 0"
#Plot heatmap of differentially expressed genes: Genes are differentially expressed if its p-valueis under a given threshold, which must be smaller than the usual 0.05 or 0.01 due to multiplicity of tests
a <- data.frame(which(pvals < 0.01))
d <- a[, 1]
data <- as.matrix(x = x[d, ])
heatmap(data, col = topo.colors(100), cexRow = 0.5)
## using Pheatmap drows = dist(data, method = 'minkowski') dcols =
## dist(t(data), method = 'minkowski') pheatmap(data,
## clustering_distance_rows = drows, clustering_distance_cols = dcols)