Generating Oncogene Signatures Using Public Databases

The goal of this analysis is to combine information that is available in public databases in order to generate oncogene signatures for as many cancer cell lines as is possible. I will first download and process the functional genetic data that is available from the Project Achilles database and the COLT cancer database.

On 11/12/15 I downloaded the file named ‘Achilles_QC_v2.4.3.rnai.Gs.gct’ from http://www.broadinstitute.org/achilles/datasets/5/download (note that a username and password are required to access this page). This file contains ATARiS gene level scores for 216 cell lines passing quality control (p=0.05). The file was placed in C:/Users/stg43/R_Programming/CancerGenomicsDataSets

Reading in the Project Achilles dataset

setwd("C:/Users/stg43/R_Programming/CancerGenomicsDataSets")
Achilles <- read.table("Achilles_QC_v2.4.3.rnai.Gs.gct", skip=2, sep="\t", header=T)

Important thing to note here is that there are only 5711 rows(genes) in this dataset. The rest of th genes in these ‘genome-wide’ screens didn’t pass the Ataris filter

Nevertheless, extracting hits for each cell line from Achilles data

phen<-Achilles[,3:218]
hits_logicals<-sapply(phen, function(x) x<(-1))
hits_logicals<-as.data.frame(hits_logicals)

Achilles_Hits<-vector(mode="list", 216)
  for (i in 1:216){
    Achilles_Hits[[i]]<-Achilles[hits_logicals[[i]],2]
  }
cellLineNames<-names(Achilles)
names(Achilles_Hits)<-cellLineNames[3:218]

AchillesCount<-sapply(Achilles_Hits, function(x) length(x))
summary(AchillesCount)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   247.0   423.8   523.0   543.0   633.0   943.0

library(ggplot2)
AchillesCount = as.data.frame(AchillesCount)
ggplot(data = AchillesCount, aes(x=AchillesCount)) + geom_histogram()

## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

On 11/12/15 I downloaded the file named ‘All-GARP-pval.txt’ from ‘http://dpsc.ccbr.utoronto.ca/cancer/download.html’ (again note that a username and password are required for access). This file contains gene level GARP scores for 72 cell lines screened by the COLT cancer initiative.

Reading in the COLT dataset

setwd("C:/Users/stg43/R_Programming/CancerGenomicsDataSets")
COLT <- read.table("All-GARP-pval.txt", sep="\t", header=T)

Notice here that we’ve got 16028 rows (genes) with data for each cell line. This is what I consider to be genome-scale screening data

Extracting the gene symbols that correspond to hits in each cell line in the COLT database (pval<.1)

pvals_only<-COLT[,7:78]
hits_logicals<-sapply(pvals_only,function(x) x<.1)
hits_logicals<-as.data.frame(hits_logicals)

COLT_Hits<-vector(mode="list", 72)
  for (i in 1:72){
    COLT_Hits[[i]]<-COLT[hits_logicals[[i]],2]
  }
cellLineNames<-names(COLT)
names(COLT_Hits)<-cellLineNames[7:78]

Looking at how many hits there are for each cell line across each data set

COLTCount<-sapply(COLT_Hits, function(x) length(x))
summary(COLTCount)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1223    1468    1574    1565    1644    2009

COLTCount = as.data.frame(COLTCount)
ggplot(data = COLTCount, aes(x=COLTCount)) + geom_histogram()

## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

Of course, format for cell line names between Achilles and COLT datasets are completely different

names(COLT)[7:12]

## [1] "BT.20"  "BT.474" "BT.549" "CAL.51" "CAMA.1" "EFM.19"

names(Achilles)[7:12]

## [1] "A172_CENTRAL_NERVOUS_SYSTEM" "A204_SOFT_TISSUE"           
## [3] "A2058_SKIN"                  "A549_LUNG"                  
## [5] "A673_BONE"                   "ACHN_KIDNEY"

Extracting cell line names from the Achilles data set in a format that allows them to be compared to COLT

x<-strsplit(names(Achilles), "_")

AchillesNames<-vector(mode="character", length=length(names(Achilles)))
for (i in 1:length(names(Achilles))) {
  AchillesNames[i]<-x[[i]][1]
}
AchillesNames<-AchillesNames[3:218]

Extracting cell line names from the COLT data set in a format that allows them to be compared to Achilles

COLTNames<-gsub(".", "", names(COLT), fixed=T)
COLTNames<-COLTNames[7:78]

change the names of the lists containing hits for each cell line such that Achilles and COLT are in the same format

names(Achilles_Hits)<-AchillesNames
names(COLT_Hits)<-COLTNames

determining which cell lines have been screened by both Achilles and COLT

AchillesNames<-as.data.frame(AchillesNames)
COLTNames<-as.data.frame(COLTNames)

names(AchillesNames)<-"Gene Symbol"
names(COLTNames)<-"Gene Symbol"

screenedByBoth<-merge(AchillesNames, COLTNames, by="Gene Symbol")
screenedByBoth<-screenedByBoth[,]
screenedByBoth<-as.character(screenedByBoth)
print(screenedByBoth)

##  [1] "BT20"     "BT474"    "CAL51"    "CFPAC1"   "EFM19"    "HCC1187" 
##  [7] "HCC1395"  "HCC1954"  "HPAC"     "HPAFII"   "KP4"      "MCF7"    
## [13] "MDAMB453" "OV90"     "SKOV3"    "SU8686"

checking overlap of hits for cell lines that were screened by both

for (i in 1:length(screenedByBoth)){
  Achilles<-Achilles_Hits[which(names(Achilles_Hits)==screenedByBoth[i])]
  COLT<-COLT_Hits[which(names(COLT_Hits)==screenedByBoth[i])]
  Achilles<-unique(as.data.frame(Achilles))
  COLT<-unique(as.data.frame(COLT))
  names(Achilles)<-"Gene Symbol"
  names(COLT)<-"Gene Symbol"
  inCommon<-merge(unique(Achilles), unique(COLT), by="Gene Symbol")
  
  library(VennDiagram)
  AchillesOnly<-dim(Achilles)[1]
  COLTOnly<-dim(COLT)[1]
  AchillesName<-paste("Achilles",screenedByBoth[i], sep="-")
  COLTName<-paste("COLT",screenedByBoth[i], sep="-")
  
  venn.plot<-draw.pairwise.venn(AchillesOnly, COLTOnly, dim(inCommon)[1], c(AchillesName, COLTName), fill=c("red", "blue"), alpha=c(.2, .2), cex=c(1.5,1.5,1.5), cat.pos=c(-180,180), cat.fontface = rep("bold", 2), cat.cex = rep(1.5, 1.5))
  grid.draw(venn.plot)
  grid.newpage()
}

## Loading required package: grid

There is signifcant overlap in most if not all cases but in all casess the majority of hits are unique. For each cell line that was screened by both initiatives we will pool the hits from both sources

combinedHits<- vector('list', length=length(screenedByBoth))

for (i in 1:length(screenedByBoth)){
  Achilles<-Achilles_Hits[which(names(Achilles_Hits)==screenedByBoth[i])]
  Achilles<-unique(as.data.frame(Achilles))
  COLT<-COLT_Hits[which(names(COLT_Hits)==screenedByBoth[i])]
  COLT<-unique(as.data.frame(COLT))
      combinedHits[i]<-unique(rbind(Achilles, COLT)) 
}

names(combinedHits)<-screenedByBoth

For Achilles_Hits and COLT_Hits we need to find out which elements contain data for cell lines that were screened by both. We then need to remove them. We’ll add them back later.

toRemoveAch<-integer(length=length(screenedByBoth))
toRemoveCOLT<-integer(length=length(screenedByBoth))

for (i in 1:length(screenedByBoth)){
toRemoveAch[i]<-which(names(Achilles_Hits)==screenedByBoth[i])
toRemoveCOLT[i]<-which(names(COLT_Hits)==screenedByBoth[i])
}

Achilles_Hits<-Achilles_Hits[-(toRemoveAch)]
COLT_Hits<-COLT_Hits[-(toRemoveCOLT)]

We’ll now put together the Achilles and COLT hits and then add in the data for cell lines that were screened by both

allFunctionalGeneticData<-append(Achilles_Hits, COLT_Hits)
allFunctionalGeneticData<-append(allFunctionalGeneticData, combinedHits)

AllCount<-sapply(allFunctionalGeneticData, function(x) length(x))
summary(AllCount)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   247.0   458.8   603.5   836.1  1311.0  2278.0

AllCount = as.data.frame(AllCount)
ggplot(data = AllCount, aes(x=AllCount)) + geom_histogram()

## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

summary(AllCount)

##     AllCount     
##  Min.   : 247.0  
##  1st Qu.: 458.8  
##  Median : 603.5  
##  Mean   : 836.1  
##  3rd Qu.:1311.0  
##  Max.   :2278.0

Next we’ll read in the copy number data for all of the cancer cell line encyclopedia cell lines. This data was extracted from the TCGA database using a seperate R script

setwd("C:/Users/stg43/R_Programming/CCLE_Genomic_Data_Sets")
Copy_Number <- read.csv("CCLE_All_Copy_Number_Data_TCGA.csv", header=T)

We want cell lines as columns instead of rows so we’ll transform the data

t_Copy_Number <- t(Copy_Number[,-1])
t_Copy_Number <- as.data.frame(t_Copy_Number)
names(t_Copy_Number) <- as.character(Copy_Number$X)
t_Copy_Number$Gene.Symbol <- row.names(t_Copy_Number)
t_Copy_Number <- t_Copy_Number[,c(996,1:995)]

Extracting the genes for each cell line that have a high-level amplification

Amp_Genes <- sapply(t_Copy_Number[,2:996], function(x) x==2)
Amp_Genes <- as.data.frame(Amp_Genes)

Amp_Genes_List<-vector(mode="list", 995)
  for (i in 1:995){
    Amp_Genes_List[[i]]<-t_Copy_Number[Amp_Genes[[i]],1]
  }
cellLineNames<-names(t_Copy_Number)
names(Amp_Genes_List)<-cellLineNames[2:996]

x <- sapply(Amp_Genes_List, length)
x <- as.data.frame(x)
plot(x)

ggplot(x, aes(x=x)) + geom_histogram()

## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

summary(x)

##        x         
##  Min.   :   0.0  
##  1st Qu.: 116.0  
##  Median : 346.0  
##  Mean   : 559.9  
##  3rd Qu.: 800.0  
##  Max.   :4179.0

Less_Than_1 <- which(x==0)
length(Less_Than_1)

## [1] 5

Less_Than_50 <- which(x < 50)
length(Less_Than_50)

## [1] 115

Less_Than_100 <- which(x < 100)
length(Less_Than_100)

## [1] 214

Greater_Than_1000 <- which(x > 1000)
length(Greater_Than_1000)

## [1] 182

Greater_Than_2000 <- which(x > 2000)
length(Greater_Than_2000)

## [1] 34

Now let’s look at how often each gene is amplified

geneAmpCount<-colSums(Copy_Number==2, na.rm=T)
ampCountGenesDF<-data.frame(names(Copy_Number), geneAmpCount)
ampCountGenesDF<-ampCountGenesDF[order(ampCountGenesDF$geneAmpCount, decreasing=T),]
head(ampCountGenesDF)

##           names.Copy_Number. geneAmpCount
## CFHR3                  CFHR3          729
## CFHR1                  CFHR1          727
## ADAM6                  ADAM6          645
## KIAA0125            KIAA0125          635
## LINC00226          LINC00226          587
## OR2T10                OR2T10          526

Now let’s intersect amplified gene symbols with functional hits for cell lines that have data for both

In_Both <- intersect(names(Amp_Genes_List), names(allFunctionalGeneticData))

Amp_Hit_List <- vector(mode="list", 213)

for (i in 1:length(In_Both)){
  Amp_Hit_List[[i]] <- intersect(Amp_Genes_List[[i]], allFunctionalGeneticData[[i]])
}
names(Amp_Hit_List) <- In_Both

Generating Oncogene Signatures Using Public Databases

Stephen Guest

Thursday, May 14, 2015