Machine learning for classification of colorectal cancer using the TCGA and GTEx data.

The Cancer Genome Atlas (TCGA) https://cancergenome.nih.gov/ and Genotype-Tissue Expression (GTEx) https://www.gtexportal.org/home/ projects provide publicly accessible RNA sequencing data for cancer and normal samples, a huge goldmine for cancer research. The UCSC TOIL project http://xena.ucsc.edu/ has recomputed all TCGA and GTEx raw expression data with one standard pipeline to make them more comparable and to minimize differences between the sources.

1. Download the TOIL TCGA/GTEx cohort RNA-seq RSEM gene expression data into the current working directory and extract the colon data.
https://toil.xenahubs.net/download/gtex_gene_expected_count.gz
https://toil.xenahubs.net/download/gtex_RSEM_gene_tpm.gz https://toil.xenahubs.net/download/tcga_RSEM_gene_tpm.gz https://toil.xenahubs.net/download/tcga_gene_expected_count.gz https://toil.xenahubs.net/download/TCGA_GTEX_category.txt.gz https://toil.xenahubs.net/download/TcgaTargetGTEX_phenotype.txt.gz https://toil.xenahubs.net/download/TCGA_survival_data.gz

2. Use an Ubuntu shell in the Windows Subsystem for Linux to extract the names of all TOIL-recomputed TCGA and GTEx samples.

# Move to the directory that holds the uncompressed Xena downloads.
cd /mnt/c/Users/woodhaha/Desktop/XENA/data

# grep keeps the lines that mention a TCGA barcode (presumably only the matrix
# header line - verify); xargs -n1 then prints every whitespace-separated token
# of those lines on its own line.
grep TCGA tcga_gene_expected_count | xargs -n1 > all_tcga_samples.txt

# Same extraction for the GTEx matrix.
grep GTEX gtex_gene_expected_count | xargs -n1 > all_gtex_samples.txt

3. Use a Python script to extract the colon data from TCGA and GTEx.

import pandas as pd

import os

def read_sample_ids(list_path):
    """Return the sample IDs stored one per line in ``list_path``.

    Surrounding whitespace is stripped and blank lines are skipped, so a
    trailing empty line cannot end up as an empty column label.
    """
    with open(list_path) as handle:
        return [line.strip() for line in handle if line.strip()]


def extract_linear_scale(matrix_path, samples, pseudocount):
    """Load a tab-separated expression matrix and undo its log2 transform.

    Parameters
    ----------
    matrix_path : str
        Tab-separated matrix whose first column is used as the row index and
        whose remaining column labels are sample IDs.
    samples : list of str
        Column labels to keep, in the order they should appear in the result.
    pseudocount : float
        Constant that was added before taking log2; the returned values are
        ``2 ** x - pseudocount``.

    Returns
    -------
    pandas.DataFrame
        The selected columns on the linear scale.

    Raises
    ------
    KeyError
        If a requested sample is not a column of the matrix.
    """
    matrix = pd.read_csv(matrix_path, sep="\t", index_col=0)
    # A float base keeps the result floating point even for integer input.
    return 2.0 ** matrix[samples] - pseudocount


if __name__ == "__main__":
    path = "C:/Users/woodhaha/Desktop/xena/data/"
    os.chdir(path)

    # GTEx colon samples.
    colon_gtex_samples = read_sample_ids("colon_gtex.txt")

    # Reverse the Xena normalisation log2(x + 1) to get expected counts.
    gtex_colon_expected_count = extract_linear_scale(
        path + "gtex_gene_expected_count", colon_gtex_samples, 1
    )
    gtex_colon_expected_count.to_csv("gtex_colon_expected_count.csv")

    # The TPM matrices are back-transformed with a pseudocount of 0.001.
    gtex_colon_RSEM_tpm = extract_linear_scale(
        path + "gtex_RSEM_gene_tpm", colon_gtex_samples, 0.001
    )
    gtex_colon_RSEM_tpm.to_csv("gtex_colon_RSEM_tpm.csv")

    # TCGA colon samples.
    colon_tcga_samples = read_sample_ids("colon_tcga.txt")

    tcga_colon_expected_count = extract_linear_scale(
        path + "tcga_gene_expected_count", colon_tcga_samples, 1
    )
    tcga_colon_expected_count.to_csv("tcga_colon_expected_count.csv")

    tcga_colon_RSEM_tpm = extract_linear_scale(
        path + "tcga_RSEM_gene_tpm", colon_tcga_samples, 0.001
    )
    tcga_colon_RSEM_tpm.to_csv("tcga_colon_RSEM_tpm.csv")

When you click the R Tools | Publish | Preview button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

# All files read and written below are resolved relative to this directory.
setwd("C:/Users/woodhaha/Desktop/XENA/data")

# Phenotype table of the combined cohort (tab-separated, first row is the header).
TcgaTargetGTEX <- read.table(
  "TcgaTargetGTEX_phenotype.txt",
  sep = "\t",
  header = TRUE
)

# 1. GTEx colon: rows whose study is "GTEX" and whose primary site is "Colon".
is_gtex_colon <- TcgaTargetGTEX$study == "GTEX" &
  TcgaTargetGTEX$primary_site == "Colon"
gtex_colon <- dplyr::filter(TcgaTargetGTEX, is_gtex_colon)

# Report how many phenotype rows were kept.
cat("total gtex_colon samples", nrow(gtex_colon))

## total gtex_colon samples 308

# Sample names extracted from the GTEx expression matrix (column `sample`).
all_gtex <- read.table(
  "all_gtex_samples.txt",
  sep = "\t",
  header = TRUE
)

# Tabulate which matrix samples are annotated as GTEx colon
# (307 samples in the Toil recompute data set).
table(is.element(all_gtex$sample, gtex_colon$sample))

## 
## FALSE  TRUE 
##  7539   307

# Keep the matrix sample names that are annotated as GTEx colon and write them
# one per line, without header, row names or quotes.
keep_gtex <- all_gtex$sample %in% gtex_colon$sample
DAT1 <- as.character(all_gtex$sample[keep_gtex])
write.table(
  DAT1,
  file = "colon_gtex.txt",
  quote = FALSE,
  row.names = FALSE,
  col.names = FALSE
)
# 2. TCGA colon: rows whose study is "TCGA" and whose primary site is "Colon"
# (331 colon samples).
is_tcga_colon <- TcgaTargetGTEX$study == "TCGA" &
  TcgaTargetGTEX$primary_site == "Colon"
tcga_colon <- dplyr::filter(TcgaTargetGTEX, is_tcga_colon)

# Sample names extracted from the TCGA expression matrix (column `sample`).
all_tcga <- read.table(
  "all_tcga_samples.txt",
  sep = "\t",
  header = TRUE
)

# Tabulate which matrix samples are annotated as TCGA colon: 351 entries in the
# Toil recompute data set (330 distinct names plus 21 repeats).
table(is.element(all_tcga$sample, tcga_colon$sample))

## 
## FALSE  TRUE 
## 10479   351

# Among the TCGA colon matches, count first occurrences (FALSE) and repeats (TRUE).
table(
  duplicated(
    all_tcga$sample[is.element(all_tcga$sample, tcga_colon$sample)]
  )
)

## 
## FALSE  TRUE 
##   330    21

# List the TCGA colon sample names that occur more than once among the matches.
# duplicated() returns positions within the matched subset, so the subset itself
# has to be indexed with them; indexing the full all_tcga$sample vector with
# those positions returns unrelated samples.
# NOTE(review): the rendered output shown after this chunk was produced with the
# full-vector indexing and must be regenerated by re-knitting the document.
colon_tcga_ids <- as.character(
  all_tcga$sample[all_tcga$sample %in% tcga_colon$sample]
)
table(colon_tcga_ids[duplicated(colon_tcga_ids)])

## 
## TCGA-05-4425-01 TCGA-05-4434-01 TCGA-21-1076-01 TCGA-86-8280-01 
##               1               1               1               1 
## TCGA-91-6840-01 TCGA-A3-3380-01 TCGA-A5-AB3J-01 TCGA-BH-A0B5-11 
##               1               1               1               1 
## TCGA-BH-A0HP-01 TCGA-DK-A3IQ-01 TCGA-DK-A3IT-01 TCGA-EL-A3CW-01 
##               1               1               1               1 
## TCGA-EW-A1J6-01 TCGA-HC-A631-01 TCGA-HP-A5N0-01 TCGA-MP-A4T7-01 
##               1               1               1               1 
## TCGA-MQ-A4LJ-01 TCGA-NJ-A55O-01 TCGA-OL-A6VO-01 TCGA-UY-A8OB-01 
##               1               1               1               1 
## TCGA-ZB-A96D-01 
##               1

# Write the TCGA colon sample names one per line, without header, row names or
# quotes. The matches contain repeated names (see the duplicated() tabulation
# above), and a repeated name would make the downstream column selection pick
# the same column twice, so each name is written only once.
DAT2 <- unique(
  as.character(all_tcga$sample[all_tcga$sample %in% tcga_colon$sample])
)
write.table(
  DAT2,
  "colon_tcga.txt",
  row.names = FALSE,
  col.names = FALSE,
  quote = FALSE
)

You can also embed plots, for example:

# Scatter plot of the built-in `cars` data set (template example).
graphics::plot(datasets::cars)

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.