We found that the positivity rate diverges the most between Colon normal tissue and Primary/Metastatic cancer tissue, using 10TPM as the cutoff (as shown below). This shiny notebook plots the expression value (TPM in log2 scale) distribution for visualization purpose. Positivity rate of top gene pair expression

  1. Retrieve the expression data for Colon normal, primary, liver.metastatic, lung.metastatic cancer samples from GEO
types = c("primary", "metastasis.liver", "metastasis.lung")

gsms_metastasis.lung <- "XXXXXXXXXXXXXXXXXXXXX1XX1X0X0XXXXXXXX0XX0XX1XXXXX0XX0XX0X0X0XX0X01X0X0X1X0XXX0X0X0XXXXXXXXX0XX1XXXX1XX0XX1XXXXXX1XXXXXXXXXXXXX1XXXXXXXXXXXXXXXXXXXX1XXXX11X1XX1X1XXXXX1XXX111XXXXXXXX0XXXXX00XXXXXXXXXXXXXXXX0XXX0XXXXXXXXXX0XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX0XXX0XX0XX0XXXXXXX0X0XXX0XXXXXXXX0XXXX0XXXX0X0XXXXX0XXX0XXXX0X0X0XXXXXXXX0XX0XXXXX0XXXXXXXX0XXX0XX0X0XXXXX0XXX0XXXXX0X0XXX00XXXXXXXXXXXXXX"

gsms_metastasis.liver <- "X1111111X1111111XX1XXXXXXX0X0XXXXXXXX0XX0XXXXXXXX0XX0XX0X0X0XX0X0XX0X0XXX0XXX0X0X0XXXXXXXXX0XXXXXXXXXX0XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX1XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX11XXX1X01XXXX00XXX1X1XX11XXX1XX011X0XXXXXXXXXX0XX1XXXX1XXXXXXXXXXX1X11111XXXXX0XXX0XX0XX0XXXXXXX0X0X1X0XX1X11XX0XX1X0XXXX0X0XXXXX0XXX0XXXX0X0X0XXXXXXXX0XX0XXX1X0X1XXXXXX0XXX0XX0X0XX1XX0X1X0X1XXX0X0X1X00X1XXXXXXXXXXXX"

gsms_primary <- "XXXXXXXX1XXXXXXXXXX1XXXXX101011X1X1X1011011X1X11101X01X0101011010X1010XX101X101010X1X11X111011X1111XX101XX1111X1X111X1111111XXX1X11111111X1XX1X1X11X1111XX1X1XX1X1111XX111XXX1XXX11XX0X111X00111X1XX1XXX11XXX0XX10111X1X11XX011X1XXXX1X111X1XXXXXXXXXXX11111011101101101111X11010XX10XXX1XX11011X101111010111110X110111101010111XX1X10110XXXX10XX11111101X10X10101XX110XX10XX1X1010XX100XXXXXXXXXXXXXX"

# load series and platform data from GEO
gset <- getGEO("GSE41258", GSEMatrix =TRUE, AnnotGPL=FALSE)
if (length(gset) > 1) idx <- grep("GPL96", attr(gset, "names")) else idx <- 1
gset <- gset[[idx]]

# make proper column names to match toptable 
fvarLabels(gset) <- make.names(fvarLabels(gset))


for (type in types){
  sml <- c()
  for (i in 1:nchar(get(paste0("gsms_",type)))) { sml[i] <- substr(get(paste0("gsms_",type)),i,i) }
  
  # eliminate samples marked as "X"
  sel <- which(sml != "X")
  sml <- sml[sel]
  #assign(paste0("gset_",type), gset[ ,sel])
  temp <- gset[ ,sel]
  
  # log2 transform
  ex <- exprs(temp)
  qx <- as.numeric(quantile(ex, c(0., 0.25, 0.5, 0.75, 0.99, 1.0), na.rm=T))
  LogC <- (qx[5] > 100) ||
            (qx[6]-qx[1] > 50 && qx[2] > 0) ||
            (qx[2] > 0 && qx[2] < 1 && qx[4] > 1 && qx[4] < 2)
  
  if (LogC) { ex[which(ex <= 0)] <- NaN
    exprs(temp) <- log2(ex) }
  
  sml <- paste("G", sml, sep="")    # set group names
  fl <- as.factor(sml)
  
  tmp_grp <- gsub("G1",paste(type,"Tumor", sep = "."),sml)
  assign(paste("groupLabel",type,sep = "_"), gsub("G0","Normal",tmp_grp))
  
  temp$description <- fl
  #temp$group <- groupLabel

  assign(paste0("gset_",type), temp)
}

# extract the TPM median values from GTEx normals
gtex_exp <- read.table(file = file.path(getwd(),"GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct"),  header = T, sep = "\t")
  1. Draw histogram and density plots for the top pairs of genes in multiple sample types
drawExpHist <- function (x) { 
    # declare an empty dataframe
    merged.df <- data.frame()
    merged.groupLabel <- c()
    # for a gene pair, subset and combine expression DF for normal, primary, met.liver, met.lung
    for (type in types) {
      mat <- exprs(get(paste0("gset_",type)))[fData(get(paste0("gset_",type)))$Gene.Symbol %in% x,]
      # median out all the probe sets in one genes and pad with zeros for not expressed genes
      probe_IDs <- row.names(mat)
      mat <- as.data.frame(mat)
      pheno <- fData(get(paste0("gset_",type)))
      mat$row.names <- pheno[pheno$Gene.Symbol %in% x,]$Gene.Symbol
      df_genes_list <- as.data.frame(mat %>% group_by(row.names) %>% 
                                       summarise_all(list(~median(.)))
                                     ) %>% column_to_rownames(var = "row.names")

      groupLabel <- get(paste("groupLabel",type,sep = "_"))
      # Convert to TPM space from intensity space
      for (idx in 1:length(x)) { 
        # array expression is in log2 scale, so TPM from GTEX needs to be transformed to log2
        # minus used instead of divide, due to the log scale
        factor <- apply(df_genes_list[x[idx],groupLabel=="Normal"],1,median) -
        log2(apply(gtex_exp[gtex_exp$Description==x[idx],
                                    grepl("Colon",names(gtex_exp), ignore.case = T)], 1, median))
    
        df_genes_list[x[idx],] <- df_genes_list[x[idx],] - factor
      }
      
      
      # Merge both dataframe and sample grouplabel
      if (dim(merged.df)[1] == 0){
          merged.df <- df_genes_list
          merged.groupLabel <- groupLabel
        }else {
          merged.groupLabel <- c(merged.groupLabel,groupLabel[!names(df_genes_list) %in% names(merged.df)])
          merged.df <- cbind(merged.df, df_genes_list[,!names(df_genes_list) %in% names(merged.df)])
        }
      # convert value from log2 scale to normal scale in TPM
      #merged.df <- 2^merged.df
    }
    
    # melt the dataframe
    df.plot <- data.frame(t(merged.df),group=merged.groupLabel) %>%
      mutate(group=factor(group, levels=c("Normal","primary.Tumor",
                                          "metastasis.liver.Tumor",
                                          "metastasis.lung.Tumor"))) %>%
      mutate(sampleID=names(merged.df)) %>%
      melt(variable.name= "gene",
           value.name = "expression",
           id.vars = c("sampleID","group"))

    # Draw histogram by cancer types
    theme_set(
      theme_classic() + 
      theme(legend.position = "top")
    )
    
    gene1 <- x[1]
    gene2 <- x[2]
  
    p_hist_g1 <- ggplot(df.plot[df.plot$gene == gene1,], aes(x = expression)) + 
      geom_histogram(aes(color=group, fill=group),
                     position = "identity", bins = 50, alpha = 0.4) +
      xlab("Expression, log2(TPM)") + 
      ggtitle(paste0("Histogram of expression for the gene ",gene1)) + 
      theme(plot.title = element_text(face = "bold", hjust = 0.5, size = 15)) +
    scale_color_manual(values = c("#0000FF", "#FF0000","#A0A0A0","#FF9933")) +
    scale_fill_manual(values = c("#0000FF", "#FF0000","#A0A0A0","#FF9933")) 

    p_density_g1 <- ggplot(df.plot[df.plot$gene == gene1,], aes(x = expression)) + 
      geom_density(aes(color=group, fill=group), alpha=.2) +  
      xlab("Expression, log2(TPM)") + 
      ggtitle(paste0("Kernel Density Estimation of expression for the gene ",gene1)) + 
      theme(plot.title = element_text(face = "bold", hjust = 0.5, size = 15)) +
    scale_color_manual(values = c("#0000FF", "#FF0000","#A0A0A0","#FF9933")) +
    scale_fill_manual(values = c("#0000FF", "#FF0000","#A0A0A0","#FF9933")) 
    
    p_hist_g2 <- ggplot(df.plot[df.plot$gene == gene2,], aes(x = expression)) + 
      geom_histogram(aes(color=group, fill=group),
                     position = "identity", bins = 50, alpha = 0.4) +
      xlab("Expression, log2(TPM)") + 
      ggtitle(paste0("Histogram of expression for the gene ",gene2)) + 
      theme(plot.title = element_text(face = "bold", hjust = 0.5, size = 15)) +
    scale_color_manual(values = c("#0000FF", "#FF0000","#A0A0A0","#FF9933")) +
    scale_fill_manual(values = c("#0000FF", "#FF0000","#A0A0A0","#FF9933")) 

    p_density_g2 <- ggplot(df.plot[df.plot$gene == gene2,], aes(x = expression)) + 
      geom_density(aes(color=group, fill=group), alpha=.2) +  
      xlab("Expression, log2(TPM)") + 
      ggtitle(paste0("Kernel Density Estimation of expression for the gene ",gene2)) + 
      theme(plot.title = element_text(face = "bold", hjust = 0.5, size = 15)) +
    scale_color_manual(values = c("#0000FF", "#FF0000","#A0A0A0","#FF9933")) +
    scale_fill_manual(values = c("#0000FF", "#FF0000","#A0A0A0","#FF9933")) 
    
    plots <- ggarrange(p_hist_g1,p_hist_g2,p_density_g1,p_density_g2, ncol = 2, nrow = 2) + 
      theme(panel.border = element_rect(colour = "black", fill = NA, size = 2))
    return(plots)
}


df_pairs <- read.table(file = file.path(getwd(),"gene_pairs_for_exp.txt"), header = F, sep = "\t", quote = "", stringsAsFactors = F)

library(pander)
for(i in 1:dim(df_pairs)[1]){
  genes_list <- c(df_pairs[i,1],df_pairs[i,2])
  #cat("  \n###",genes_list[1],"_vs_",genes_list[2])
  pander::pandoc.header(paste(genes_list, collapse = "_vs_"), level = 3)
  print(drawExpHist(genes_list))
  #cat("  \n")
  pander::pandoc.p('')
  pander::pandoc.p('**************************')
}

CDH11_vs_MET


CDH11_vs_MMP14


CDH11_vs_RNF43


CDH11_vs_SLC7A5


CDH3_vs_DPEP1


CDH3_vs_LGR5


CDH3_vs_MET


CDH3_vs_RNF43


CDH3_vs_SLC7A5


DPEP1_vs_LGR5


DPEP1_vs_MET


DPEP1_vs_RNF43


DPEP1_vs_SLC7A5


LGR5_vs_SLC7A5


MET_vs_NOTCH3


MET_vs_RNF43


MET_vs_SLC7A5


MET_vs_SLCO4A1


RNF43_vs_SLC7A5


RNF43_vs_SLCO4A1


#drawExpHist(c("CDH3","LGR5"))