## Loading required package: Biobase
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
##
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
##
## The following object is masked from 'package:stats':
##
## xtabs
##
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, as.vector, cbind,
## colnames, duplicated, eval, evalq, Filter, Find, get,
## intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rep.int, rownames, sapply, setdiff, sort,
## table, tapply, union, unique, unlist
##
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
data(leukemiasEset)
leuData <- exprs(leukemiasEset)
summary(leukemiasEset$LeukemiaType)
## ALL AML CLL CML NoL
## 12 12 12 12 12
phenotypeData <- pData(leukemiasEset)
length(colnames(leuData))
## [1] 60
summary(leukemiasEset$LeukemiaType)
## ALL AML CLL CML NoL
## 12 12 12 12 12
length(rownames(leuData))
## [1] 20172
ENSG00000157851 is an Ensembl indentifier
Extract the phenotype data and expression matrix from “leukemiasEset”. Answer the following questions by manipulating the expression matrix and phenotype data
hist(leuData, breaks = 100)
###Draw vertical lines on the histogram to indicate the median 25% quantile, 75% quantile and mean (use legend and include the plot in your report).
quanData <- quantile(leuData)
med <- quanData[3]
q25 <- quanData[2]
q75 <- quanData[4]
me <- mean(leuData)
hist(leuData, breaks = 100)
abline(v = med, col = "red")
abline(v = q25, col = "green")
abline(v = q75, col = "blue")
abline(v = me, col = "yellow")
legend("topright", col = c("red", "green", "blue", "yellow"), legend = c("median",
"25 quantile", "75 quantile", "mean"), pch = 20)
t <- melt(leuData)
m <- ggplot(t, aes(x = value))
m <- m + geom_histogram(binwidth = 0.1, aes(fill = ..count..)) + scale_fill_gradient("Count",
low = "yellow", high = "blue")
thresh <- data.frame(boundary = c("median", "25 quantile", "75 quantile", "mean"),
vals = c(med, q25, q75, me))
m + geom_vline(data = thresh, aes(xintercept = vals, linetype = boundary, colour = boundary),
size = 2, show_guide = TRUE)
qqnorm(leuData)
This works fine but it takes forever
You may noticed that the data distribution is not a strictly normal distribution. But this is based on the expression of all genes, we can still assume that the expression follows normal distribution on single gene level.
boxplot(leuData, horizontal = T, las = 2, col = leukemiasEset$LeukemiaType)
legend("topright", col = unique(as.numeric(leukemiasEset$LeukemiaType)), legend = unique(leukemiasEset$LeukemiaType),
pch = 20)
Perform hierarchical clustering to cluster patients, using distance measurement: 1. Spearman correlation 2. Euclidean distance and agglomeration methods: 1. ward 2. complete
deuc <- dist(t(leuData), method = "euclidean")
hc.euc.ward <- hclust(deuc, method = "ward")
hc.euc.complete <- hclust(deuc, method = "complete")
source("ftp://129.187.44.58/share/chen/R_Functions/plothclust.R")
source(“ftp://129.187.44.58/share/chen/R_Functions/plothclust.R”)
plothclust(hc.euc.ward, col = leukemiasEset$LeukemiaType)
title(main = "Euclidean distance with 'ward' clustering")
legend("topright", legend = unique(leukemiasEset$LeukemiaType), pch = 15, col = unique(as.numeric(leukemiasEset$LeukemiaType)))
plothclust(hc.euc.complete, col = leukemiasEset$LeukemiaType)
title(main = "euclidean distance with 'complete' clustering")
legend("topright", legend = unique(leukemiasEset$LeukemiaType), pch = 15, col = unique(as.numeric(leukemiasEset$LeukemiaType)))
dsp <- 1 - cor(leuData, method = "spearman")
dsp.ward <- hclust(as.dist(dsp), method = "ward")
dsp.complete <- hclust(as.dist(dsp), method = "complete")
plothclust(dsp.ward, col = leukemiasEset$LeukemiaType)
title(main = "spearman distance with 'ward' clustering")
legend("topright", legend = unique(leukemiasEset$LeukemiaType), pch = 15, col = unique(as.numeric(leukemiasEset$LeukemiaType)))
plothclust(dsp.complete, col = leukemiasEset$LeukemiaType)
title(main = "spearman distance with 'complete' clustering")
legend("topright", legend = unique(leukemiasEset$LeukemiaType), pch = 15, col = unique(as.numeric(leukemiasEset$LeukemiaType)))
A perfect clustering should show 5 perfectly seperated cluster. Ward clustering based on an euclidean distance works best for us. The reason is, that ward clustering tends to create clusters with same size. (perfect for uns because we have groups with same sizes.)
Complete clustering is not so useful because it tends to create smaller groups (the distance of all elements between to clusters should be maximum).
Now let’s characterize the transcriptome between different types of leukemia or normal tissues.
expr_scale <- scale(t(leuData), center = TRUE, scale = TRUE)
pca <- prcomp(expr_scale, center = FALSE, scale. = FALSE)
(pca$sdev^2)[1:2]/sum(pca$sdev^2)
## [1] 0.2144 0.1287
sum((pca$sdev^2)[1:2]/sum(pca$sdev^2))
## [1] 0.3432
plot(pca$x[, 1:2], col = leukemiasEset$LeukemiaType, pch = 20)
legend("topleft", legend = unique(leukemiasEset$LeukemiaType), pch = 15, col = unique(as.numeric(leukemiasEset$LeukemiaType)))
It shows that the first principal component is not very useful to seperate the cancer types, but the second principal component can seperate the different cancer types.
All (Acute lymphoblastic leukemia) und CLL (chronic lymphoid leukemia) are placed very near to each other, but still can be seperated by a straigt line. This suits to their same biological origin.
CML (Chronic myelogenous leukemia) and NoL (Non-leukemia and healthy bone marrow) are also placed very near to each other.
You already see that the ALL an CLL tissues are distinguishable from our dataset. At this step, let’s only consider genes that are changing significantly between ALL vs. CLL.
#### annotation stuff
colnames(leuData) <- paste(leukemiasEset$LeukemiaType, sampleNames(leukemiasEset),
sep = "_")
t <- colnames(leuData)
### find position
pos_all <- grep("ALL", t)
pos_cll <- grep("CLL", t)
#### subsetting
df_all_cll <- leuData[, c(pos_all, pos_cll)]
factorlvl <- gl(2, 12, labels = c("ALL", "CLL"))
pval <- rowttests(df_all_cll, factorlvl)
fdr <- p.adjust(pval$p.value, method = "fdr")
cal_fold_change <- function(x) {
apply(x[, 1:12], 1, median)/apply(x[, 13:24], 1, median)
}
fold_change = cal_fold_change(df_all_cll)
plot(y = -log10(fdr), x = log2(fold_change))
df <- data.frame(y = -log10(fdr), x = log2(fold_change))
df$significant = as.factor(abs(df$x) > log2(2) & df$y > -log10(0.05))
g = ggplot(data = df, aes(x = x, y = y, colour = significant)) + geom_point(alpha = 0.4,
size = 1.75)
g = g + xlab("log2 fold change") + ylab("-log10 p-value") + ggtitle(" min fold 2/0.5 and significant (blue) vs rest p = 0.05 (red)")
# dd_text = df[which((abs(df$x) > log2(1.5) & df$y > -log10(0.05))),] g +
# geom_text(data=dd_text, aes(x=x, y=y, label=name), colour='blue')
g
A volcano plot shows for every gene if the differential expression is significant (y - axis; a higher value means a smaller p-value) and how big the differential expression is (x - axis; the value 1 means it is two times higher expressed / a value of -1 means it is 50% expressed)
table(fdr < 0.01)
##
## FALSE TRUE
## 17848 2324
2324 genes are differentially expressed
## KernSmooth 2.23 loaded
## Copyright M. P. Wand 1997-2009
##
## Attaching package: 'gplots'
##
## The following object is masked from 'package:stats':
##
## lowess
heatmat <- df_all_cll[fdr < 0.01, ]
heatmap.2(heatmat, trace = "none", ColSideColors = as.character(as.numeric(factorlvl)))
After selecting just the differentially expressed genes the heatmap shows that the patients are perfectly matched according their cancer types. It confirms the results from the p-value calculation.
In this step, we will have a functional annotation about these differentially expressed genes via enrichment analysis (Fisher’s exact test). You already know how to create a two way contingency table, which is the basis of Fisher’s exact test. Many programs can do enrichment analysis. Here, you do not need to write your own commands to create a contingency table. This task could be done by function loaded by:
source("ftp://129.187.44.58/share/chen/R_Functions/GOFisher.R")
After execute the command, there should be a function in your workspace called GOFisher(). For more details about the function, read the supplements at the end of this document. Now, you have the function to perform the enrichment analysis. There are three pieces of important information required in the analysis: 1. Genes differentially expressed 2. GO term or other gene set information 3. Background information
load("/Users/tobias/Dropbox/Uni/Master/HT_Course/Report/GOterms.RData")
After loading, you should have an object named GOterms containing a list of GO terms.
length(GOterms)
## [1] 229
The last important piece of information we need is background. Because the data we are working with have been generated on a Microarray platform (HGU133PLUS2) and our data have been mapped to unique ensembl gene IDs, all possible genes can be detected are predefined by the array. Therefore, we can use the total number of unique ensembl IDs on that platform as the background, that is, n=20899. To do Fisher’s exact on one GO term you can do:
result_fisher <- lapply(GOterms, GOFisher, diff.genes = rownames(df_all_cll[fdr <
0.01, ]), background = 20899)
extract_p <- function(x) {
x$p.value
}
p_val_fisher <- lapply(result_fisher, extract_p)
fdr_fisher <- p.adjust(p_val_fisher, method = "fdr")
interestingGO <- names(which(fdr_fisher < 0.01))
print(interestingGO)
## [1] "GO:0000086" "GO:0000278" "GO:0000287" "GO:0003682" "GO:0004713"
## [6] "GO:0005515" "GO:0005524" "GO:0005543" "GO:0005634" "GO:0005654"
## [11] "GO:0005737" "GO:0005764" "GO:0005794" "GO:0005813" "GO:0005819"
## [16] "GO:0005829" "GO:0005874" "GO:0006260" "GO:0006281" "GO:0006310"
## [21] "GO:0006468" "GO:0006915" "GO:0007049" "GO:0007067" "GO:0007264"
## [26] "GO:0007596" "GO:0008283" "GO:0009897" "GO:0019901" "GO:0035556"
## [31] "GO:0042981" "GO:0043065" "GO:0043123" "GO:0044281" "GO:0051056"
## [36] "GO:0051301" "GO:0097190"
You already found a list of GO terms of interest from enrichment analysis, but you won’t know anything from the IDs. We need to know the name and description of those GO terms. You can retrieve the information from BioMart:
Hint: You can use the biomart/database “ensembl” and dataset “hsapiens_gene_ensembl”. This can be defined by:
ensembl_hsap <- useMart(biomart = "ensembl", dataset = "hsapiens_gene_ensembl")
listFilters(ensembl_hsap)[grep("go", listFilters(ensembl_hsap)$name), ]
## name description
## 43 with_go_id with GO Term Accession(s)
## 88 with_ox_goslim_goa with GOSlim GOA(s)
## 100 with_go_go with GO ID(s)
## 128 go_id GO Term Accession(s) [e.g. GO:0005515]
## 129 goslim_goa_accession GOSlim GOA Accessions(s) [e.g. GO:0005623]
## 199 go_evidence_code GO Evidence code
## 200 go_parent_term Parent term accession
## 201 go_parent_name Parent term name
## 235 with_homolog_ggor Orthologous Gorilla Genes
la <- listAttributes(ensembl_hsap)
la[grep("definition", la$name), ]
## name description
## 30 definition_1006 GO Term Definition
final_result <- getBM(attributes = c("go_id", "definition_1006", "name_1006"),
filters = "go_id", values = interestingGO, mart = ensembl_hsap)
final_result
## go_id
## 1 GO:0000086
## 2 GO:0000278
## 3 GO:0000287
## 4 GO:0003682
## 5 GO:0004713
## 6 GO:0005515
## 7 GO:0005524
## 8 GO:0005543
## 9 GO:0005634
## 10 GO:0005654
## 11 GO:0005737
## 12 GO:0005764
## 13 GO:0005794
## 14 GO:0005813
## 15 GO:0005819
## 16 GO:0005829
## 17 GO:0005874
## 18 GO:0006260
## 19 GO:0006281
## 20 GO:0006310
## 21 GO:0006468
## 22 GO:0006915
## 23 GO:0007049
## 24 GO:0007067
## 25 GO:0007264
## 26 GO:0007596
## 27 GO:0008283
## 28 GO:0009897
## 29 GO:0019901
## 30 GO:0035556
## 31 GO:0042981
## 32 GO:0043065
## 33 GO:0043123
## 34 GO:0044281
## 35 GO:0051056
## 36 GO:0051301
## 37 GO:0097190
## definition_1006
## 1 The mitotic cell cycle transition by which a cell in G2 commits to M phase. The process begins when the kinase activity of M cyclin/CDK complex reaches a threshold high enough for the cell cycle to proceed. This is accomplished by activating a positive feedback loop that results in the accumulation of unphosphorylated and active M cyclin/CDK complex. [GOC:mtg_cell_cycle]
## 2 Progression through the phases of the mitotic cell cycle, the most common eukaryotic cell cycle, which canonically comprises four successive phases called G1, S, G2, and M and includes replication of the genome and the subsequent segregation of chromosomes into daughter cells. In some variant cell cycles nuclear replication or nuclear division may not be followed by cell division, or G1 and G2 phases may be absent. [GOC:mah, ISBN:0815316194, Reactome:69278]
## 3 Interacting selectively and non-covalently with magnesium (Mg) ions. [GOC:ai]
## 4 Interacting selectively and non-covalently with chromatin, the network of fibers of DNA, protein, and sometimes RNA, that make up the chromosomes of the eukaryotic nucleus during interphase. [GOC:jl, ISBN:0198506732, PMID:20404130]
## 5 Catalysis of the reaction: ATP + a protein tyrosine = ADP + protein tyrosine phosphate. [EC:2.7.10]
## 6 Interacting selectively and non-covalently with any protein or protein complex (a complex of two or more proteins that may include other nonprotein molecules). [GOC:go_curators]
## 7 Interacting selectively and non-covalently with ATP, adenosine 5'-triphosphate, a universally important coenzyme and enzyme regulator. [ISBN:0198506732]
## 8 Interacting selectively and non-covalently with phospholipids, a class of lipids containing phosphoric acid as a mono- or diester. [ISBN:0198506732]
## 9 A membrane-bounded organelle of eukaryotic cells in which chromosomes are housed and replicated. In most cells, the nucleus contains all of the cell's chromosomes except the organellar chromosomes, and is the site of RNA synthesis and processing. In some species, or in specialized cell types, RNA metabolism or DNA replication may be absent. [GOC:go_curators]
## 10 That part of the nuclear content other than the chromosomes or the nucleolus. [GOC:ma, ISBN:0124325653]
## 11 All of the contents of a cell excluding the plasma membrane and nucleus, but including other subcellular structures. [ISBN:0198547684]
## 12 A small lytic vacuole that has cell cycle-independent morphology and is found in most animal cells and that contains a variety of hydrolases, most of which have their maximal activities in the pH range 5-6. The contained enzymes display latency if properly isolated. About 40 different lysosomal hydrolases are known and lysosomes have a great variety of morphologies and functions. [GOC:mah, ISBN:0198506732]
## 13 A compound membranous cytoplasmic organelle of eukaryotic cells, consisting of flattened, ribosome-free vesicles arranged in a more or less regular stack. The Golgi apparatus differs from the endoplasmic reticulum in often having slightly thicker membranes, appearing in sections as a characteristic shallow semicircle so that the convex side (cis or entry face) abuts the endoplasmic reticulum, secretory vesicles emerging from the concave side (trans or exit face). In vertebrate cells there is usually one such organelle, while in invertebrates and plants, where they are known usually as dictyosomes, there may be several scattered in the cytoplasm. The Golgi apparatus processes proteins produced on the ribosomes of the rough endoplasmic reticulum; such processing includes modification of the core oligosaccharides of glycoproteins, and the sorting and packaging of proteins for transport to a variety of cellular locations. Three different regions of the Golgi are now recognized both in terms of structure and function: cis, in the vicinity of the cis face, trans, in the vicinity of the trans face, and medial, lying between the cis and trans regions. [ISBN:0198506732]
## 14 A structure comprised of a core structure (in most organisms, a pair of centrioles) and peripheral material from which a microtubule-based structure, such as a spindle apparatus, is organized. Centrosomes occur close to the nucleus during interphase in many eukaryotic cells, though in animal cells it changes continually during the cell-division cycle. [GOC:mah, ISBN:0198547684]
## 15 The array of microtubules and associated molecules that forms between opposite poles of a eukaryotic cell during mitosis or meiosis and serves to move the duplicated chromosomes apart. [ISBN:0198547684]
## 16 The part of the cytoplasm that does not contain organelles but which does contain other particulate matter, such as protein complexes. [GOC:hgd, GOC:jl]
## 17 Any of the long, generally straight, hollow tubes of internal diameter 12-15 nm and external diameter 24 nm found in a wide variety of eukaryotic cells; each consists (usually) of 13 protofilaments of polymeric tubulin, staggered in such a manner that the tubulin monomers are arranged in a helical pattern on the microtubular surface, and with the alpha/beta axes of the tubulin subunits parallel to the long axis of the tubule; exist in equilibrium with pool of tubulin monomers and can be rapidly assembled or disassembled in response to physiological stimuli; concerned with force generation, e.g. in the spindle. [ISBN:0879693568]
## 18 The cellular metabolic process in which a cell duplicates one or more molecules of DNA. DNA replication begins when specific sequences, known as origins of replication, are recognized and bound by initiation proteins, and ends when the original DNA molecule has been completely duplicated and the copies topologically separated. The unit of replication usually corresponds to the genome of the cell, an organelle, or a virus. The template for replication can either be an existing DNA molecule or RNA. [GOC:mah]
## 19 The process of restoring DNA after damage. Genomes are subject to damage by chemical and physical agents in the environment (e.g. UV and ionizing radiations, chemical mutagens, fungal and bacterial toxins, etc.) and by free radicals or alkylating agents endogenously generated in metabolism. DNA is also damaged because of errors during its replication. A variety of different DNA repair pathways have been reported that include direct reversal, base excision repair, nucleotide excision repair, photoreactivation, bypass, double-strand break repair pathway, and mismatch repair pathway. [PMID:11563486]
## 20 Any process in which a new genotype is formed by reassortment of genes resulting in gene combinations different from those that were present in the parents. In eukaryotes genetic recombination can occur by chromosome assortment, intrachromosomal recombination, or nonreciprocal interchromosomal recombination. Intrachromosomal recombination occurs by crossing over. In bacteria it may occur by genetic transformation, conjugation, transduction, or F-duction. [ISBN:0198506732]
## 21 The process of introducing a phosphate group on to a protein. [GOC:hb]
## 22 A programmed cell death process which begins when a cell receives an internal (e.g. DNA damage) or external signal (e.g. an extracellular death ligand), and proceeds through a series of biochemical events (signaling pathways) which typically lead to rounding-up of the cell, retraction of pseudopodes, reduction of cellular volume (pyknosis), chromatin condensation, nuclear fragmentation (karyorrhexis), plasma membrane blebbing and fragmentation of the cell into apoptotic bodies. The process ends when the cell has died. The process is divided into a signaling pathway phase, and an execution phase, which is triggered by the former. [GOC:dhl, GOC:ecd, GOC:go_curators, GOC:mtg_apoptosis, GOC:tb, ISBN:0198506732, PMID:18846107, PMID:21494263]
## 23 The progression of biochemical and morphological phases and events that occur in a cell during successive cell replication or nuclear replication events. Canonically, the cell cycle comprises the replication and segregation of genetic material followed by the division of the cell, but in endocycles or syncytial cells nuclear replication or nuclear division may not be followed by cell division. [GOC:go_curators, GOC:mtg_cell_cycle]
## 24 A cell cycle process comprising the steps by which the nucleus of a eukaryotic cell divides; the process involves condensation of chromosomal DNA into a highly compacted form. Canonically, mitosis produces two daughter nuclei whose chromosome complement is identical to that of the mother cell. [GOC:dph, GOC:ma, GOC:mah, ISBN:0198547684]
## 25 Any series of molecular signals in which a small monomeric GTPase relays one or more of the signals. [GOC:mah]
## 26 The sequential process in which the multiple coagulation factors of the blood interact, ultimately resulting in the formation of an insoluble fibrin clot; it may be divided into three stages: stage 1, the formation of intrinsic and extrinsic prothrombin converting principle; stage 2, the formation of thrombin; stage 3, the formation of stable fibrin polymers. [http://www.graylab.ac.uk/omd/, ISBN:0198506732]
## 27 The multiplication or reproduction of cells, resulting in the expansion of a cell population. [GOC:mah, GOC:mb]
## 28 The side (leaflet) of the plasma membrane that is opposite to the side that faces the cytoplasm. [GOC:tb]
## 29 Interacting selectively and non-covalently with a protein kinase, any enzyme that catalyzes the transfer of a phosphate group, usually from ATP, to a protein substrate. [GOC:jl]
## 30 The process in which a signal is passed on to downstream components within the cell, which become activated themselves to further propagate the signal and finally trigger a change in the function or state of the cell. [GOC:bf, GOC:jl, GOC:signaling, ISBN:3527303782]
## 31 Any process that modulates the occurrence or rate of cell death by apoptotic process. [GOC:jl, GOC:mtg_apoptosis]
## 32 Any process that activates or increases the frequency, rate or extent of cell death by apoptotic process. [GOC:jl, GOC:mtg_apoptosis]
## 33 Any process that activates or increases the frequency, rate or extent of I-kappaB kinase/NF-kappaB signaling. [GOC:jl]
## 34 The chemical reactions and pathways involving small molecules, any low molecular weight, monomeric, non-encoded molecule. [GOC:curators, GOC:pde, GOC:vw]
## 35 Any process that modulates the frequency, rate or extent of small GTPase mediated signal transduction. [GOC:go_curators]
## 36 The process resulting in the physical partitioning and separation of a cell into daughter cells. [GOC:go_curators]
## 37 A series of molecular signals which triggers the apoptotic death of a cell. The pathway starts with reception of a signal, and ends when the execution phase of apoptosis is triggered. [GOC:mtg_apoptosis]
## name_1006
## 1 G2/M transition of mitotic cell cycle
## 2 mitotic cell cycle
## 3 magnesium ion binding
## 4 chromatin binding
## 5 protein tyrosine kinase activity
## 6 protein binding
## 7 ATP binding
## 8 phospholipid binding
## 9 nucleus
## 10 nucleoplasm
## 11 cytoplasm
## 12 lysosome
## 13 Golgi apparatus
## 14 centrosome
## 15 spindle
## 16 cytosol
## 17 microtubule
## 18 DNA replication
## 19 DNA repair
## 20 DNA recombination
## 21 protein phosphorylation
## 22 apoptotic process
## 23 cell cycle
## 24 mitosis
## 25 small GTPase mediated signal transduction
## 26 blood coagulation
## 27 cell proliferation
## 28 external side of plasma membrane
## 29 protein kinase binding
## 30 intracellular signal transduction
## 31 regulation of apoptotic process
## 32 positive regulation of apoptotic process
## 33 positive regulation of I-kappaB kinase/NF-kappaB signaling
## 34 small molecule metabolic process
## 35 regulation of small GTPase mediated signal transduction
## 36 cell division
## 37 apoptotic signaling pathway