The goal of this analysis is to combine information that is available in public databases in order to generate oncogene signatures for as many cancer cell lines as is possible. I will first download and process the functional genetic data that is available from the Project Achilles database and the COLT cancer database.
On 11/12/15 I downloaded the file named ‘Achilles_QC_v2.4.3.rnai.Gs.gct’ from http://www.broadinstitute.org/achilles/datasets/5/download (note that a username and password are required to access this page). This file contains ATARiS gene level scores for 216 cell lines passing quality control (p=0.05). The file was placed in C:/Users/stg43/R_Programming/CancerGenomicsDataSets
Reading in the Project Achilles dataset
setwd("C:/Users/stg43/R_Programming/CancerGenomicsDataSets")
Achilles <- read.table("Achilles_QC_v2.4.3.rnai.Gs.gct", skip=2, sep="\t", header=T)
Important thing to note here is that there are only 5711 rows(genes) in this dataset. The rest of th genes in these ‘genome-wide’ screens didn’t pass the Ataris filter
Nevertheless, extracting hits for each cell line from Achilles data
phen<-Achilles[,3:218]
hits_logicals<-sapply(phen, function(x) x<(-1))
hits_logicals<-as.data.frame(hits_logicals)
Achilles_Hits<-vector(mode="list", 216)
for (i in 1:216){
Achilles_Hits[[i]]<-Achilles[hits_logicals[[i]],2]
}
cellLineNames<-names(Achilles)
names(Achilles_Hits)<-cellLineNames[3:218]
AchillesCount<-sapply(Achilles_Hits, function(x) length(x))
summary(AchillesCount)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 247.0 423.8 523.0 543.0 633.0 943.0
library(ggplot2)
AchillesCount = as.data.frame(AchillesCount)
ggplot(data = AchillesCount, aes(x=AchillesCount)) + geom_histogram()
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
On 11/12/15 I downloaded the file named ‘All-GARP-pval.txt’ from ‘http://dpsc.ccbr.utoronto.ca/cancer/download.html’ (again note that a username and password are required for access). This file contains gene level GARP scores for 72 cell lines screened by the COLT cancer initiative.
Reading in the COLT dataset
setwd("C:/Users/stg43/R_Programming/CancerGenomicsDataSets")
COLT <- read.table("All-GARP-pval.txt", sep="\t", header=T)
Notice here that we’ve got 16028 rows (genes) with data for each cell line. This is what I consider to be genome-scale screening data
Extracting the gene symbols that correspond to hits in each cell line in the COLT database (pval<.1)
pvals_only<-COLT[,7:78]
hits_logicals<-sapply(pvals_only,function(x) x<.1)
hits_logicals<-as.data.frame(hits_logicals)
COLT_Hits<-vector(mode="list", 72)
for (i in 1:72){
COLT_Hits[[i]]<-COLT[hits_logicals[[i]],2]
}
cellLineNames<-names(COLT)
names(COLT_Hits)<-cellLineNames[7:78]
Looking at how many hits there are for each cell line across each data set
COLTCount<-sapply(COLT_Hits, function(x) length(x))
summary(COLTCount)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1223 1468 1574 1565 1644 2009
COLTCount = as.data.frame(COLTCount)
ggplot(data = COLTCount, aes(x=COLTCount)) + geom_histogram()
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
Of course, format for cell line names between Achilles and COLT datasets are completely different
names(COLT)[7:12]
## [1] "BT.20" "BT.474" "BT.549" "CAL.51" "CAMA.1" "EFM.19"
names(Achilles)[7:12]
## [1] "A172_CENTRAL_NERVOUS_SYSTEM" "A204_SOFT_TISSUE"
## [3] "A2058_SKIN" "A549_LUNG"
## [5] "A673_BONE" "ACHN_KIDNEY"
Extracting cell line names from the Achilles data set in a format that allows them to be compared to COLT
x<-strsplit(names(Achilles), "_")
AchillesNames<-vector(mode="character", length=length(names(Achilles)))
for (i in 1:length(names(Achilles))) {
AchillesNames[i]<-x[[i]][1]
}
AchillesNames<-AchillesNames[3:218]
Extracting cell line names from the COLT data set in a format that allows them to be compared to Achilles
COLTNames<-gsub(".", "", names(COLT), fixed=T)
COLTNames<-COLTNames[7:78]
change the names of the lists containing hits for each cell line such that Achilles and COLT are in the same format
names(Achilles_Hits)<-AchillesNames
names(COLT_Hits)<-COLTNames
determining which cell lines have been screened by both Achilles and COLT
AchillesNames<-as.data.frame(AchillesNames)
COLTNames<-as.data.frame(COLTNames)
names(AchillesNames)<-"Gene Symbol"
names(COLTNames)<-"Gene Symbol"
screenedByBoth<-merge(AchillesNames, COLTNames, by="Gene Symbol")
screenedByBoth<-screenedByBoth[,]
screenedByBoth<-as.character(screenedByBoth)
print(screenedByBoth)
## [1] "BT20" "BT474" "CAL51" "CFPAC1" "EFM19" "HCC1187"
## [7] "HCC1395" "HCC1954" "HPAC" "HPAFII" "KP4" "MCF7"
## [13] "MDAMB453" "OV90" "SKOV3" "SU8686"
checking overlap of hits for cell lines that were screened by both
for (i in 1:length(screenedByBoth)){
Achilles<-Achilles_Hits[which(names(Achilles_Hits)==screenedByBoth[i])]
COLT<-COLT_Hits[which(names(COLT_Hits)==screenedByBoth[i])]
Achilles<-unique(as.data.frame(Achilles))
COLT<-unique(as.data.frame(COLT))
names(Achilles)<-"Gene Symbol"
names(COLT)<-"Gene Symbol"
inCommon<-merge(unique(Achilles), unique(COLT), by="Gene Symbol")
library(VennDiagram)
AchillesOnly<-dim(Achilles)[1]
COLTOnly<-dim(COLT)[1]
AchillesName<-paste("Achilles",screenedByBoth[i], sep="-")
COLTName<-paste("COLT",screenedByBoth[i], sep="-")
venn.plot<-draw.pairwise.venn(AchillesOnly, COLTOnly, dim(inCommon)[1], c(AchillesName, COLTName), fill=c("red", "blue"), alpha=c(.2, .2), cex=c(1.5,1.5,1.5), cat.pos=c(-180,180), cat.fontface = rep("bold", 2), cat.cex = rep(1.5, 1.5))
grid.draw(venn.plot)
grid.newpage()
}
## Loading required package: grid
There is signifcant overlap in most if not all cases but in all casess the majority of hits are unique. For each cell line that was screened by both initiatives we will pool the hits from both sources
combinedHits<- vector('list', length=length(screenedByBoth))
for (i in 1:length(screenedByBoth)){
Achilles<-Achilles_Hits[which(names(Achilles_Hits)==screenedByBoth[i])]
Achilles<-unique(as.data.frame(Achilles))
COLT<-COLT_Hits[which(names(COLT_Hits)==screenedByBoth[i])]
COLT<-unique(as.data.frame(COLT))
combinedHits[i]<-unique(rbind(Achilles, COLT))
}
names(combinedHits)<-screenedByBoth
For Achilles_Hits and COLT_Hits we need to find out which elements contain data for cell lines that were screened by both. We then need to remove them. We’ll add them back later.
toRemoveAch<-integer(length=length(screenedByBoth))
toRemoveCOLT<-integer(length=length(screenedByBoth))
for (i in 1:length(screenedByBoth)){
toRemoveAch[i]<-which(names(Achilles_Hits)==screenedByBoth[i])
toRemoveCOLT[i]<-which(names(COLT_Hits)==screenedByBoth[i])
}
Achilles_Hits<-Achilles_Hits[-(toRemoveAch)]
COLT_Hits<-COLT_Hits[-(toRemoveCOLT)]
We’ll now put together the Achilles and COLT hits and then add in the data for cell lines that were screened by both
allFunctionalGeneticData<-append(Achilles_Hits, COLT_Hits)
allFunctionalGeneticData<-append(allFunctionalGeneticData, combinedHits)
AllCount<-sapply(allFunctionalGeneticData, function(x) length(x))
summary(AllCount)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 247.0 458.8 603.5 836.1 1311.0 2278.0
AllCount = as.data.frame(AllCount)
ggplot(data = AllCount, aes(x=AllCount)) + geom_histogram()
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
summary(AllCount)
## AllCount
## Min. : 247.0
## 1st Qu.: 458.8
## Median : 603.5
## Mean : 836.1
## 3rd Qu.:1311.0
## Max. :2278.0
Next we’ll read in the copy number data for all of the cancer cell line encyclopedia cell lines. This data was extracted from the TCGA database using a seperate R script
setwd("C:/Users/stg43/R_Programming/CCLE_Genomic_Data_Sets")
Copy_Number <- read.csv("CCLE_All_Copy_Number_Data_TCGA.csv", header=T)
We want cell lines as columns instead of rows so we’ll transform the data
t_Copy_Number <- t(Copy_Number[,-1])
t_Copy_Number <- as.data.frame(t_Copy_Number)
names(t_Copy_Number) <- as.character(Copy_Number$X)
t_Copy_Number$Gene.Symbol <- row.names(t_Copy_Number)
t_Copy_Number <- t_Copy_Number[,c(996,1:995)]
Extracting the genes for each cell line that have a high-level amplification
Amp_Genes <- sapply(t_Copy_Number[,2:996], function(x) x==2)
Amp_Genes <- as.data.frame(Amp_Genes)
Amp_Genes_List<-vector(mode="list", 995)
for (i in 1:995){
Amp_Genes_List[[i]]<-t_Copy_Number[Amp_Genes[[i]],1]
}
cellLineNames<-names(t_Copy_Number)
names(Amp_Genes_List)<-cellLineNames[2:996]
x <- sapply(Amp_Genes_List, length)
x <- as.data.frame(x)
plot(x)
ggplot(x, aes(x=x)) + geom_histogram()
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
summary(x)
## x
## Min. : 0.0
## 1st Qu.: 116.0
## Median : 346.0
## Mean : 559.9
## 3rd Qu.: 800.0
## Max. :4179.0
Less_Than_1 <- which(x==0)
length(Less_Than_1)
## [1] 5
Less_Than_50 <- which(x < 50)
length(Less_Than_50)
## [1] 115
Less_Than_100 <- which(x < 100)
length(Less_Than_100)
## [1] 214
Greater_Than_1000 <- which(x > 1000)
length(Greater_Than_1000)
## [1] 182
Greater_Than_2000 <- which(x > 2000)
length(Greater_Than_2000)
## [1] 34
Now let’s look at how often each gene is amplified
geneAmpCount<-colSums(Copy_Number==2, na.rm=T)
ampCountGenesDF<-data.frame(names(Copy_Number), geneAmpCount)
ampCountGenesDF<-ampCountGenesDF[order(ampCountGenesDF$geneAmpCount, decreasing=T),]
head(ampCountGenesDF)
## names.Copy_Number. geneAmpCount
## CFHR3 CFHR3 729
## CFHR1 CFHR1 727
## ADAM6 ADAM6 645
## KIAA0125 KIAA0125 635
## LINC00226 LINC00226 587
## OR2T10 OR2T10 526
Now let’s intersect amplified gene symbols with functional hits for cell lines that have data for both
In_Both <- intersect(names(Amp_Genes_List), names(allFunctionalGeneticData))
Amp_Hit_List <- vector(mode="list", 213)
for (i in 1:length(In_Both)){
Amp_Hit_List[[i]] <- intersect(Amp_Genes_List[[i]], allFunctionalGeneticData[[i]])
}
names(Amp_Hit_List) <- In_Both