immune checkpoint inhibitor therapy key genes PD-1 and PD-L1 expressoin across TCGA cancer cohorts

Target Expression Analysis by Disease Type

We are interested in identifying relevant oncological disease cohorts for their solid tumor immunotherapy. Using mRNA expression data from TCGA, we want to identify the disease type with the highest median co-expression of PD-1 and PD-L1.

Please provide 4 deliverables:

Scatterplot comparing median RSEM normalized expression per cohort of PD-1 vs PD-L1
Table (tsv) of median RSEM normalized expression per cohort of PD-1 and PD-L1 sorted by descending combined expression.
One paragraph summary of your conclusions from these results.
R-code used to generate these results.

library(FirebrowseR)
cohorts = Metadata.Cohorts(format = "csv") # Download all available cohorts



immuno.Genes = c("PDCD1", "CD274") #The official gene symbol of PD-1 and PD-L1




slide.size = 100 #number of tcga barcodes for Gene expression extract. FirebrowseR always return error if extract expression data for the whole cohort. Therefore I break it into several slides of small number of samples. 


extract_Exp <- function(tmp.Pats, diff.Exp.Genes, cohort_name){ #Given sample barcode, gene id, and cohort, return the gene expression matrix
  all.Found = F
  page.Counter = 1
  mRNA.Exp = list()
  page.Size = 2000 
  while(all.Found == F){
    mRNA.Exp[[page.Counter]] = Samples.mRNASeq(format = "csv",
                                             gene = diff.Exp.Genes,
                                             cohort = cohort_name,
                                             tcga_participant_barcode =
                                               tmp.Pats$tcga_participant_barcode,
                                             page_size = page.Size,
                                             page = page.Counter)
    if(nrow(mRNA.Exp[[page.Counter]]) < page.Size)
      all.Found = T
    else
      page.Counter = page.Counter + 1
  }
  return(mRNA.Exp)
}

cohorts = cohorts[-10,] #remove FPPP
cohorts = cohorts[-17,] #remove LAML
nCohorts = nrow(cohorts)
for (i in 1:nCohorts){  #loop all the cohorts
  cancer.Type=cohorts[[1]][i]
  #find the tcga barcode for the samples in the specific cancer cohorts
  all.Received = F
  page.Counter = 1
  page.size = 150
  cancer.Pats = list()
  while(all.Received == F){
    cancer.Pats[[page.Counter]] = Samples.Clinical(format = "csv",
                                               cohort = cancer.Type,
                                               page_size = page.size,
                                               page = page.Counter)
    if(page.Counter > 1)
      colnames(cancer.Pats[[page.Counter]]) = colnames(cancer.Pats[[page.Counter-1]])

    if(nrow(cancer.Pats[[page.Counter]]) < page.size){
      all.Received = T
    } else{
      page.Counter = page.Counter + 1
    }
  }
  cancer.Pats = do.call(rbind, cancer.Pats)
  
  nslide = floor(nrow(cancer.Pats) / slide.size) #find the number of slides based on sample size for each slide
  if(exists(deparse(substitute(immuno.Genes.Exp)))){ #remove the immuno.Genes.Exp from last round
    rm(immuno.Genes.Exp)
  }
  if(nslide > 0){
    for (j in 1:nslide){
      tmp.Pats = cancer.Pats[((j-1)*slide.size+1):(j*slide.size),]
      mRNA.Exp = extract_Exp(tmp.Pats, immuno.Genes, cancer.Type)
      if (j==1){
        immuno.Genes.Exp = mRNA.Exp
      }else{
        immuno.Genes.Exp = append(immuno.Genes.Exp,mRNA.Exp)
      }
    }
  }
  if(nrow(cancer.Pats)>nslide*slide.size){
    tmp.Pats=cancer.Pats[(nslide*slide.size+1):nrow(cancer.Pats),]
    mRNA.Exp = extract_Exp(tmp.Pats, immuno.Genes, cancer.Type)
    if(exists(deparse(substitute(immuno.Genes.Exp)))){
      immuno.Genes.Exp = append(immuno.Genes.Exp,mRNA.Exp)
    }else{
      immuno.Genes.Exp = mRNA.Exp
    }
  }
  immuno.Genes.Exp = do.call(rbind,immuno.Genes.Exp)
  assign(cancer.Type,immuno.Genes.Exp)
}
#Calculate the median exp value for PD-1 and PD-L1 for each cohort
PD1_exp = matrix(0,nrow=nCohorts,ncol=1)
PDL1_exp = matrix(0,nrow=nCohorts,ncol=1)
for(i in 1:nCohorts){
  mRNA.exp = get(cohorts[[1]][i])
  PD1_exp[i] = median(as.numeric(mRNA.exp$expression_log2[which(mRNA.exp$gene == "PDCD1" & mRNA.exp$sample_type=="TP")]),na.rm=T)
  PDL1_exp[i] = median(as.numeric(mRNA.exp$expression_log2[which(mRNA.exp$gene == "CD274" & mRNA.exp$sample_type=="TP")]),na.rm=T)
}

plot(PD1_exp,PDL1_exp,main = "Median expression of PD-1 and PD-L1 from 36 TCGA cohorts")
lines(PDL1_exp,PDL1_exp,type = 'l',col="red")
text(PD1_exp,PDL1_exp,labels = cohorts[[1]],pos=1, offset=0.2,cex=0.5)

immuno_exp = cbind(PD1_exp,PDL1_exp)
row.names(immuno_exp)=cohorts[[1]]
colnames(immuno_exp)=c("PD-1","PD-L1")
#sort by the mean expression of PD1 and PDL1
tmp=apply(immuno_exp,1,mean)
tmpx = sort.int(tmp,index.return = T,decreasing = T)
immuno_exp = immuno_exp[tmpx$ix,]
write.table(immuno_exp, file="C:/Users/wwhla/Downloads/job/data scientist/interview/rancho/PD1_PDL1_median_exp_solid_tumor_TCGA_cohorts.tsv", quote=FALSE, sep='\t', col.names = T,row.names=T)

#Calculate the correlation coefficient
cor(immuno_exp[,1],immuno_exp[,2])

## [1] 0.5019677

cor(immuno_exp[,1],immuno_exp[,2],method = "spearman")

## [1] 0.5477477

cor.test(immuno_exp[,1],immuno_exp[,2])

## 
##  Pearson's product-moment correlation
## 
## data:  immuno_exp[, 1] and immuno_exp[, 2]
## t = 3.3842, df = 34, p-value = 0.001813
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.2076816 0.7129309
## sample estimates:
##       cor 
## 0.5019677

immune checkpoint inhibitor therapy key genes PD-1 and PD-L1 expressoin across TCGA cancer cohorts

Wenhui Wang

2023-03-07

Target Expression Analysis by Disease Type

Conclusion