We are interested in identifying relevant oncological disease cohorts for their solid tumor immunotherapy. Using mRNA expression data from TCGA, we want to identify the disease type with the highest median co-expression of PD-1 and PD-L1.
Please provide 4 deliverables:
Scatterplot comparing median RSEM normalized expression per cohort of PD-1 vs PD-L1
Table (tsv) of median RSEM normalized expression per cohort of PD-1 and PD-L1 sorted by descending combined expression.
One paragraph summary of your conclusions from these results.
R-code used to generate these results.
library(FirebrowseR)
cohorts = Metadata.Cohorts(format = "csv") # Download all available cohorts
immuno.Genes = c("PDCD1", "CD274") #The official gene symbol of PD-1 and PD-L1
slide.size = 100 #number of tcga barcodes for Gene expression extract. FirebrowseR always return error if extract expression data for the whole cohort. Therefore I break it into several slides of small number of samples.
extract_Exp <- function(tmp.Pats, diff.Exp.Genes, cohort_name){ #Given sample barcode, gene id, and cohort, return the gene expression matrix
all.Found = F
page.Counter = 1
mRNA.Exp = list()
page.Size = 2000
while(all.Found == F){
mRNA.Exp[[page.Counter]] = Samples.mRNASeq(format = "csv",
gene = diff.Exp.Genes,
cohort = cohort_name,
tcga_participant_barcode =
tmp.Pats$tcga_participant_barcode,
page_size = page.Size,
page = page.Counter)
if(nrow(mRNA.Exp[[page.Counter]]) < page.Size)
all.Found = T
else
page.Counter = page.Counter + 1
}
return(mRNA.Exp)
}
cohorts = cohorts[-10,] #remove FPPP
cohorts = cohorts[-17,] #remove LAML
nCohorts = nrow(cohorts)
for (i in 1:nCohorts){ #loop all the cohorts
cancer.Type=cohorts[[1]][i]
#find the tcga barcode for the samples in the specific cancer cohorts
all.Received = F
page.Counter = 1
page.size = 150
cancer.Pats = list()
while(all.Received == F){
cancer.Pats[[page.Counter]] = Samples.Clinical(format = "csv",
cohort = cancer.Type,
page_size = page.size,
page = page.Counter)
if(page.Counter > 1)
colnames(cancer.Pats[[page.Counter]]) = colnames(cancer.Pats[[page.Counter-1]])
if(nrow(cancer.Pats[[page.Counter]]) < page.size){
all.Received = T
} else{
page.Counter = page.Counter + 1
}
}
cancer.Pats = do.call(rbind, cancer.Pats)
nslide = floor(nrow(cancer.Pats) / slide.size) #find the number of slides based on sample size for each slide
if(exists(deparse(substitute(immuno.Genes.Exp)))){ #remove the immuno.Genes.Exp from last round
rm(immuno.Genes.Exp)
}
if(nslide > 0){
for (j in 1:nslide){
tmp.Pats = cancer.Pats[((j-1)*slide.size+1):(j*slide.size),]
mRNA.Exp = extract_Exp(tmp.Pats, immuno.Genes, cancer.Type)
if (j==1){
immuno.Genes.Exp = mRNA.Exp
}else{
immuno.Genes.Exp = append(immuno.Genes.Exp,mRNA.Exp)
}
}
}
if(nrow(cancer.Pats)>nslide*slide.size){
tmp.Pats=cancer.Pats[(nslide*slide.size+1):nrow(cancer.Pats),]
mRNA.Exp = extract_Exp(tmp.Pats, immuno.Genes, cancer.Type)
if(exists(deparse(substitute(immuno.Genes.Exp)))){
immuno.Genes.Exp = append(immuno.Genes.Exp,mRNA.Exp)
}else{
immuno.Genes.Exp = mRNA.Exp
}
}
immuno.Genes.Exp = do.call(rbind,immuno.Genes.Exp)
assign(cancer.Type,immuno.Genes.Exp)
}
#Calculate the median exp value for PD-1 and PD-L1 for each cohort
PD1_exp = matrix(0,nrow=nCohorts,ncol=1)
PDL1_exp = matrix(0,nrow=nCohorts,ncol=1)
for(i in 1:nCohorts){
mRNA.exp = get(cohorts[[1]][i])
PD1_exp[i] = median(as.numeric(mRNA.exp$expression_log2[which(mRNA.exp$gene == "PDCD1" & mRNA.exp$sample_type=="TP")]),na.rm=T)
PDL1_exp[i] = median(as.numeric(mRNA.exp$expression_log2[which(mRNA.exp$gene == "CD274" & mRNA.exp$sample_type=="TP")]),na.rm=T)
}
plot(PD1_exp,PDL1_exp,main = "Median expression of PD-1 and PD-L1 from 36 TCGA cohorts")
lines(PDL1_exp,PDL1_exp,type = 'l',col="red")
text(PD1_exp,PDL1_exp,labels = cohorts[[1]],pos=1, offset=0.2,cex=0.5)
immuno_exp = cbind(PD1_exp,PDL1_exp)
row.names(immuno_exp)=cohorts[[1]]
colnames(immuno_exp)=c("PD-1","PD-L1")
#sort by the mean expression of PD1 and PDL1
tmp=apply(immuno_exp,1,mean)
tmpx = sort.int(tmp,index.return = T,decreasing = T)
immuno_exp = immuno_exp[tmpx$ix,]
write.table(immuno_exp, file="C:/Users/wwhla/Downloads/job/data scientist/interview/rancho/PD1_PDL1_median_exp_solid_tumor_TCGA_cohorts.tsv", quote=FALSE, sep='\t', col.names = T,row.names=T)
#Calculate the correlation coefficient
cor(immuno_exp[,1],immuno_exp[,2])
## [1] 0.5019677
cor(immuno_exp[,1],immuno_exp[,2],method = "spearman")
## [1] 0.5477477
cor.test(immuno_exp[,1],immuno_exp[,2])
##
## Pearson's product-moment correlation
##
## data: immuno_exp[, 1] and immuno_exp[, 2]
## t = 3.3842, df = 34, p-value = 0.001813
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.2076816 0.7129309
## sample estimates:
## cor
## 0.5019677
We extract the median expression of PD-1 and PD-L1 of solid tumor from 36 TCGA cohorts (“FPPP” and “LAML” are removed). The scatter plot indicates that the expression of PD-L1 and PD-1 are highly correlated. We further checked the correlation and correlation test which also indicate the significant high correlation between PD-1 and PD-L1. According to the scatter plot, THYM shows the highest median co-expression of PD-1 and PD-L1.