Load required packages

library(httr)
library(jsonlite)
library(Biobase)
library(GEOquery)
library(GeoDE)
library(limma)
library(ggplot2)
BASE_URL <- 'http://amp.pharm.mssm.edu/CREEDS/'

1. Search signatures using text

The /search endpoint handles the search of the meta-data associated with gene expression signatures. You can use a query term of interest such as ‘breast cancer’ to perform search using HTTP GET request.

The GET function perform HTTP GET request and return a response object. The httr::content method retrieve the content within the response object. Unfortunately the content method is masked by Biobase::content, we will have to use add namespace when using it.

r <- GET(paste0(BASE_URL, 'search'), query=list(q ='breast cancer'))
response <- fromJSON(httr::content(r, 'text'))
print(r$status_code)
## [1] 200

The response is an R data.frame, have a look at the search result

head(response)
##       do_id            cell_type
## 1 DOID:1612   Mammary Epithelium
## 2 DOID:1612 Mammary Gland Tissue
## 3 DOID:1612        Mammary gland
## 4 DOID:1612      Epithelial Cell
## 5 DOID:1612 Mammary Gland Tissue
## 6 DOID:1612 Mammary Gland Tissue
##                                                                                                                                                                                                                                                                                                                                                                                                         pert_ids
## 1                                                                                                                                                                                                                                               GSM1928, GSM1929, GSM1930, GSM1931, GSM1934, GSM1936, GSM1937, GSM1938, GSM1939, GSM1940, GSM1941, GSM1942, GSM1943, GSM1944, GSM1945, GSM1946, GSM1947, GSM1948
## 2 GSM85473, GSM85474, GSM85475, GSM85476, GSM85477, GSM85478, GSM85479, GSM85480, GSM85481, GSM85482, GSM85483, GSM85484, GSM85485, GSM85486, GSM85487, GSM85488, GSM85489, GSM85490, GSM85491, GSM85492, GSM85493, GSM85494, GSM85495, GSM85496, GSM85497, GSM85498, GSM85499, GSM85500, GSM85501, GSM85502, GSM85503, GSM85504, GSM85505, GSM85506, GSM85507, GSM85508, GSM85509, GSM85510, GSM85511, GSM85512
## 3                                                                                                                                                                                                                                                                                                                                           GSM48236, GSM48237, GSM48238, GSM48239, GSM48240, GSM48241, GSM48243
## 4                                                                                                                                                                                                                                                                                                                                 GSM38897, GSM38898, GSM38899, GSM38900, GSM38901, GSM38902, GSM38903, GSM38904
## 5                                                                                                                         GSM22365, GSM22366, GSM22367, GSM22368, GSM22370, GSM22371, GSM22372, GSM22373, GSM22375, GSM22376, GSM22377, GSM22378, GSM22379, GSM22380, GSM22383, GSM22386, GSM22389, GSM22391, GSM22395, GSM22396, GSM22398, GSM22399, GSM22402, GSM22407, GSM22411, GSM22412, GSM22415, GSM22416
## 6                                                                                                                                                                 GSM33169, GSM33170, GSM33171, GSM33172, GSM33173, GSM33174, GSM33176, GSM33177, GSM33178, GSM33179, GSM33180, GSM33181, GSM33183, GSM33184, GSM33185, GSM33186, GSM33187, GSM33188, GSM33189, GSM33190, GSM33191, GSM33192, GSM33193, GSM33194
##   umls_cui     curator platform
## 1 C0006142 Joel.Dudley   GPL170
## 2 C0006142 Joel.Dudley   GPL570
## 3 C0006142 Joel.Dudley    GPL81
## 4 C0006142 Joel.Dudley  GPL1794
## 5 C0006142 Joel.Dudley  GPL1223
## 6 C0006142 Joel.Dudley   GPL341
##                                                                                                                                                                                                                                                                                                                         ctrl_ids
## 1                                                                                                                                                                                                                                                         GSM1932, GSM1933, GSM1935, GSM1949, GSM1950, GSM1951, GSM1952, GSM1953
## 2                                                                                                                                                                                                                                                           GSM85513, GSM85514, GSM85515, GSM85516, GSM85517, GSM85518, GSM85519
## 3                                                                                                                                                                                                                                                                                                   GSM48242, GSM48244, GSM48245
## 4                                                                                                                                                                                                                                                                                                             GSM38895, GSM38896
## 5 GSM22369, GSM22374, GSM22381, GSM22382, GSM22384, GSM22385, GSM22387, GSM22388, GSM22390, GSM22392, GSM22393, GSM22394, GSM22397, GSM22400, GSM22401, GSM22403, GSM22404, GSM22405, GSM22406, GSM22408, GSM22409, GSM22410, GSM22413, GSM22414, GSM22417, GSM22418, GSM22419, GSM22420, GSM22421, GSM22422, GSM22423, GSM22424
## 6                                                                                                                                                                                                                   GSM33158, GSM33159, GSM33160, GSM33161, GSM33162, GSM33163, GSM33164, GSM33165, GSM33166, GSM33167, GSM33168
##    disease_name  geo_id organism    id
## 1 breast cancer   GSE53    human dz:11
## 2 breast cancer GSE3744    human dz:24
## 3 breast cancer GSE2528    mouse dz:28
## 4 breast cancer GSE2155    human dz:39
## 5 breast cancer GSE1378    human dz:52
## 6 breast cancer GSE1872      rat dz:63

2. Query the signature database using up/down gene sets to find similar/opposite signatures

Signatures can also be queried against using a pair of up/down gene lists from the /search endpoint using HTTP POST request. Here we generate vectors of up and down genes to query against the CREEDS database:

## Get some up and down gene lists 
up_genes <- c('KIAA0907','KDM5A','CDC25A','EGR1','GADD45B','RELB','TERF2IP','SMNDC1','TICAM1','NFKB2','RGS2','NCOA3','ICAM1','TEX10','CNOT4','ARID4B','CLPX','CHIC2','CXCL2','FBXO11','MTF2','CDK2','DNTTIP2','GADD45A','GOLT1B','POLR2K','NFKBIE','GABPB1','ECD','PHKG2','RAD9A','NET1','KIAA0753','EZH2','NRAS','ATP6V0B','CDK7','CCNH','SENP6','TIPARP','FOS','ARPP19','TFAP2A','KDM5B','NPC1','TP53BP2','NUSAP1')
dn_genes <- c('SCCPDH','KIF20A','FZD7','USP22','PIP4K2B','CRYZ','GNB5','EIF4EBP1','PHGDH','RRAGA','SLC25A46','RPA1','HADH','DAG1','RPIA','P4HA2','MACF1','TMEM97','MPZL1','PSMG1','PLK1','SLC37A4','GLRX','CBR3','PRSS23','NUDCD3','CDC20','KIAA0528','NIPSNAP1','TRAM2','STUB1','DERA','MTHFD2','BLVRA','IARS2','LIPA','PGM1','CNDP2','BNIP3','CTSL1','CDC25B','HSPA8','EPRS','PAX8','SACM1L','HOXA5','TLE1','PYGL','TUBB6','LOXL1')
payload = list(
  up_genes = up_genes,
  dn_genes = dn_genes,
  direction= 'similar'
  )

POST the input gene sets to the API and load the response to R data.frame

r <- POST(paste0(BASE_URL, 'search'), body=payload, encode='json')
print(r$status_code)
## [1] 200
response <- fromJSON(httr::content(r, 'text'))

Have a look at the search result

head(response)
##   signed_jaccard        id                                            name
## 1        0.00289 gene:2008     Met, http://www.ncbi.nlm.nih.gov/gene/17295
## 2        0.00251   gene:84   Lmx1b, http://www.ncbi.nlm.nih.gov/gene/16917
## 3        0.00207 gene:2314 Arhgap5, http://www.ncbi.nlm.nih.gov/gene/11855
## 4        0.00178 gene:2916  Pde10a, http://www.ncbi.nlm.nih.gov/gene/23984
## 5        0.00162  gene:601 MIR140, http://www.ncbi.nlm.nih.gov/gene/406932
## 6        0.00155 gene:2369    Vax2, http://www.ncbi.nlm.nih.gov/gene/24113
##     geo_id
## 1  GSE8747
## 2 GSE10516
## 3 GSE53388
## 4 GSE40377
## 5 GSE13590
## 6 GSE19626

3. Retrieve a signature using id

r <- GET(paste0(BASE_URL, 'api'), query=list(id= response[1, 'id']))
sig <- fromJSON(httr::content(r, 'text'))

sig is a list with the following keys:

names(sig)
##  [1] "cell_type"      "pert_ids"       "hs_gene_symbol" "curator"       
##  [5] "geo_id"         "platform"       "ctrl_ids"       "down_genes"    
##  [9] "up_genes"       "pert_type"      "organism"       "id"            
## [13] "mm_gene_symbol"

4. Retrieve a gene expression dataset from GEO and perform some analyses

4.0. Prepare data

Look at the meta-data required to retrieve dataset from GEO

cat('GEO id:', sig$geo_id)
## GEO id: GSE8747
cat('Control GSMs:', sig$ctrl_ids)
## Control GSMs: GSM217200 GSM217202 GSM217204 GSM217206 GSM217208
cat('Perturbation GSMs:', sig$pert_ids)
## Perturbation GSMs: GSM217210 GSM217214 GSM217215 GSM217217

Retrieve the GEO series using GEOquery::getGEO function, which return a list of ExpressionSet instance. The ExpressionSet object contains a matrix with rows being probes and columns being samples, and can be accessed by the exprs method.

eset <- getGEO(sig$geo_id, GSEMatrix=TRUE, AnnotGPL=TRUE)
## ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE8nnn/GSE8747/matrix/
## Found 1 file(s)
## GSE8747_series_matrix.txt.gz
## File stored at:
## /var/folders/rw/0yzf11tn1fnb16wk6ynskvy80000gn/T//RtmpbYWo7b/GPL4200.annot.gz
eset <- eset[[1]]
print(dim(eset))
## Features  Samples 
##    36607       38

Subset the columns of the eset with ctrl_ids and pert_ids.

eset <- eset[, c(sig$ctrl_ids, sig$pert_ids)]
print(dim(eset))
## Features  Samples 
##    36607        9

4.1. Explanatory analyses

Visualize the gene expression values across samples using density plot

expr_df <- as.data.frame(exprs(eset))
dfs <- stack(expr_df)
colnames(dfs) <- c('Gene_expression', 'Sample')
ggplot(dfs, aes(x=Gene_expression)) + geom_density(aes(group=Sample, colour=Sample))

Visualize using PCA plot

# Make a list labeling the whether the samples are controls or perturbations
n_ctrls <- length(sig$ctrl_ids)
n_perts <- length(sig$pert_ids)
sample_classes <- factor(c(rep('CTRL', n_ctrls), rep('PERT', n_perts)))
# also make a integer version of sample_classes using 1 for 'CTRL' and 2 for 'PERT'
sample_classes_i <- factor(c(rep(1, n_ctrls), rep(2, n_perts)))
# Perform PCA on the expression transposed expression data.frame (samples x genes)
pca <- prcomp(t(exprs(eset)), scale=T)
# Get the coordinates
pca_coords <- data.frame(sample_classes, pca$x[,1:3])
qplot(x=PC1, y=PC2, data=pca_coords, colour=sample_classes)

4.2. Perform differential expression analysis

4.2.1. Use the Characteristic Direcion (CD) using the chdirAnalysis in the GeoDE package.

expr_df <- as.data.frame(exprs(eset))
expr_df <- cbind(rownames(expr_df), expr_df)
chdir_result <- chdirAnalysis(expr_df, sample_classes_i, gamma=0.5)

4.2.2. Use Limma

First construct design matrix

design <- model.matrix(~sample_classes + 0, eset)
colnames(design) <- levels(sample_classes)
print(design)
##           CTRL PERT
## GSM217200    1    0
## GSM217202    1    0
## GSM217204    1    0
## GSM217206    1    0
## GSM217208    1    0
## GSM217210    0    1
## GSM217214    0    1
## GSM217215    0    1
## GSM217217    0    1
## attr(,"assign")
## [1] 1 1
## attr(,"contrasts")
## attr(,"contrasts")$sample_classes
## [1] "contr.treatment"

Fit limma model and make volcano plot

fit <- lmFit(eset, design)
cont.matrix <- makeContrasts(PERT-CTRL, levels=design)
fit2 <- contrasts.fit(fit, cont.matrix)
fit2 <- eBayes(fit2)
volcanoplot(fit2, highlight=5)

Examine top DEGs

topDEGs <- topTable(fit2, adjust="fdr")
# Only display probe ID and Gene.symbol
topDEGs <- topDEGs[,c('ID','Gene.symbol', 'logFC', 'AveExpr', 't', 'P.Value', 'adj.P.Val', 'B')]
print(topDEGs)
##                  ID Gene.symbol    logFC    AveExpr         t      P.Value
## 1790188_1 1790188_1             -1.65425 -0.1372222 -7.106741 1.677393e-05
## 1790158_1 1790158_1     Zfp36l1  1.25815  0.2467778  6.806793 2.509765e-05
## 1776352_1 1776352_1    Il1rapl2 -1.62545  0.1017778 -6.785714 2.582957e-05
## 1791593_1 1791593_1       H2-Bl  1.40535  0.5790000  6.520541 3.726582e-05
## 1771356_1 1771356_1             -1.42305 -0.1266667 -6.498438 3.843803e-05
## 1781422_1 1781422_1     Cdk2ap2  1.07055  0.4500000  6.356961 4.693754e-05
## 1784866_1 1784866_1       Pebp1  1.12140  0.8950000  6.313769 4.991590e-05
## 1774258_1 1774258_1       Pla1a  1.01600  0.1565556  6.241480 5.536080e-05
## 1771918_1 1771918_1       Folr1 -1.72255  0.1002222 -6.178985 6.057894e-05
## 1770043_1 1770043_1       Abca3  1.60805 -0.3431111  6.166194 6.171018e-05
##           adj.P.Val        B
## 1790188_1 0.1852948 2.794815
## 1790158_1 0.1852948 2.493036
## 1776352_1 0.1852948 2.471284
## 1791593_1 0.1852948 2.191373
## 1771356_1 0.1852948 2.167511
## 1781422_1 0.1852948 2.012818
## 1784866_1 0.1852948 1.964911
## 1774258_1 0.1852948 1.884011
## 1771918_1 0.1852948 1.813343
## 1770043_1 0.1852948 1.798796

Built with

sessionInfo()
## R version 3.1.3 (2015-03-09)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: OS X 10.10.2 (Yosemite)
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] parallel  stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] ggplot2_2.1.0       limma_3.22.7        GeoDE_1.0.1        
##  [4] MASS_7.3-44         Matrix_1.2-2        GEOquery_2.32.0    
##  [7] Biobase_2.26.0      BiocGenerics_0.12.1 jsonlite_0.9.17    
## [10] httr_1.0.0         
## 
## loaded via a namespace (and not attached):
##  [1] bitops_1.0-6     colorspace_1.2-6 curl_0.9.3       digest_0.6.9    
##  [5] evaluate_0.8     formatR_1.3      grid_3.1.3       gtable_0.2.0    
##  [9] htmltools_0.2.6  knitr_1.12.3     labeling_0.3     lattice_0.20-33 
## [13] magrittr_1.5     munsell_0.4.3    plyr_1.8.3       R6_2.1.1        
## [17] Rcpp_0.12.3      RCurl_1.95-4.7   rmarkdown_0.9.5  rstudioapi_0.3.1
## [21] scales_0.4.0     stringi_1.0-1    stringr_1.0.0    tools_3.1.3     
## [25] XML_3.98-1.3     yaml_2.1.13