TCGA-COAD Data Analysis

Author

Akschya Sivacoumar

LOAD LIBRARIES

R libraries, also known as packages or extensions, are collections of functions, data sets, and other resources that provide additional capabilities and tools for performing specific tasks in R programming. These libraries can be easily installed and loaded into your R environment to access the functions and data they provide.

For bioinformatics analysis, several R libraries are commonly used. Here are some key libraries frequently used in RNA-seq analysis which are loaded for this pipeline:

library(dplyr)
library(readr)
library(maftools)
library(TCGAbiolinks)
library(survminer)
library(survival)
library(SummarizedExperiment)
library(tidyverse)
library(DESeq2)
library(devtools)
library(EnhancedVolcano)
library(pheatmap)
library(limma)
library(ggplot2)
library(VennDiagram)

Path to base directory

Code
base_path <- "C:/Users/aksch/Downloads/Exercise/Exercise"

DATA CURATION

Data curation ensures that the data used for subsequent analysis is accurate, consistent, and ready for statistical and visualization procedures. Each step in the curation process helps in addressing data quality issues and preparing the dataset for meaningful analysis and interpretation.

Code
# Data import
# Load clinical data
clinical_df <- read_tsv(file.path(base_path, "clinical", "clinical.tsv"))
exposure_df <- read_tsv(file.path(base_path, "clinical", "exposure.tsv"))

# Load miRNA data
# Path to miRNA directory
path <- file.path(base_path, "miRNA")

# List all folders in the directory
folders <- list.files(path, full.names = TRUE)

# Initialize an empty list to store data frames
df_list <- list()
case_ids <- character(0)

# Loop through each folder
for (folder in folders) {
  
  # Construct the full path to the annotations.txt file within the folder
  annotation_path <- file.path(folder, "annotations.txt")
  
  # Check if annotations.txt file exists
  if (file.exists(annotation_path)) {
    
    # Read the annotations.txt file to get the entity_id
    annotations <- read_tsv(annotation_path)
    entity_id <- annotations$entity_id[1]
    
    # Append entity_id to the list
    case_ids <- c(case_ids, entity_id)
    
    # Find the miRNA expression data file 
    mirna_files <- list.files(folder, pattern = "mirbase21.mirnas.quantification", full.names = TRUE)
      
    # Read the miRNA expression data file
    mirna_data <- read_tsv(mirna_files)
      
    # Rename the columns (except for miRNA_ID) 
    colnames(mirna_data)[-1] <- paste(colnames(mirna_data)[-1], entity_id, sep = "_")
      
    # Append the data frame to the list
    df_list[[length(df_list) + 1]] <- mirna_data
    
  }
}

# Combine all data frames 
dataAssy.miR <- Reduce(function(x, y) merge(x, y, by = "miRNA_ID", all = TRUE), df_list)
rownames(dataAssy.miR) <- dataAssy.miR$miRNA_ID

# using read_count's data 
read_countData <-  colnames(dataAssy.miR)[grep("count", colnames(dataAssy.miR))]
dataAssy.miR <- dataAssy.miR[,read_countData]
colnames(dataAssy.miR) <- gsub("read_count_","", colnames(dataAssy.miR))


# Initialize empty lists for early and late stage case_ids
early_stage_ids <- character(0)
late_stage_ids <- character(0)

# Define early and late stage criteria based on tumor_stage
early_stage <- c("stage i", "stage ia", "stage ii", "stage iia", "stage iib", "stage iic", 
                 "stage iii", "stage iiia", "stage iiib", "stage iiic")
late_stage <- c("stage iv", "stage iva", "stage ivb")

Curate clinical data

# Create a new column in clinical_df for stage_group
clinical_df$stage_group <- ifelse(clinical_df$tumor_stage %in% early_stage, "Early", "Late")

# Clean the data
clinical_df$days_to_death <- as.numeric(clinical_df$days_to_death)
clinical_df$days_to_last_follow_up <- as.numeric(clinical_df$days_to_last_follow_up)
rows_to_remove <- which(is.na(clinical_df$days_to_death) & is.na(clinical_df$days_to_last_follow_up))
cleaned_clinical_df <- clinical_df[-rows_to_remove, ]

# Change certain values the way they are encoded
cleaned_clinical_df$deceased <- ifelse(cleaned_clinical_df$vital_status == "Alive", FALSE, TRUE)

Curate miRNA data

Function for df processing

Code
# Function to combine duplicated columns by taking the mean
combine_duplicated_columns <- function(df) {
  # Get the original column names without suffixes
  original_colnames <- gsub("\\.x$|\\.y$", "", colnames(df))
  
  # Identify duplicated columns
  duplicated_colnames <- unique(original_colnames[duplicated(original_colnames)])
  
  # For each duplicated column, compute the mean and replace the original columns
  for (col in duplicated_colnames) {
    col_x <- paste0(col, ".x")
    col_y <- paste0(col, ".y")
    
    # Ensure columns are numeric before computing mean
    df[[col_x]] <- as.numeric(df[[col_x]])
    df[[col_y]] <- as.numeric(df[[col_y]])
    
    # Check if both columns exist
    if (col_x %in% colnames(df) && col_y %in% colnames(df)) {
      df[[col]] <- rowMeans(df[, c(col_x, col_y)], na.rm = TRUE)
      
      # Remove the original columns with suffixes
      df <- df %>% select(-all_of(c(col_x, col_y)))
    }
  }
  
  return(df)
}
# Combine duplicated columns in dataAssy.miR
dataAssy.miR <- combine_duplicated_columns(dataAssy.miR)

# Get only the case IDs present in clinical data
subset_case_ids <- intersect(case_ids, clinical_df$case_id)
subset_case_ids <- unique(subset_case_ids)
df_tumor_stage <- clinical_df[clinical_df$case_id %in% subset_case_ids, c("case_id", "tumor_stage")]


# Iterate through df_tumor_stage and categorize case_ids
for (i in df_tumor_stage$case_id) {

  tumor_stage <- df_tumor_stage$tumor_stage[df_tumor_stage$case_id == i]
  
  if (tumor_stage %in% early_stage) {
    early_stage_ids <- c(early_stage_ids, i)
  } else if (tumor_stage %in% late_stage) {
    late_stage_ids <- c(late_stage_ids, i)
  } else {
   subset_case_ids <- subset_case_ids[-which(subset_case_ids == i)]
  }
}

# Final curated miRNA expression data
dataAssy.miR <- dataAssy.miR[,subset_case_ids]

DIFFERENTIAL EXPRESSION ANALYSIS OF

Differential Expression Analysis of miRNA data

Differential Expression Analysis (DEA) is a critical step in miRNA data analysis, aiming to pinpoint miRNAs that exhibit significant expression differences between different experimental conditions. In the provided R script, DEA is performed using the TCGAanalyze_DEA function from the TCGAbiolinks package. This function compares expression levels of miRNAs between defined groups, such as early and late cancer stages. It utilizes statistical methods like the exact test to assess significance, with parameters including false discovery rate (FDR) cutoff and fold change (logFC) threshold. By automating data filtering, normalization, and statistical testing, TCGAanalyze_DEA facilitates the identification of miRNAs that are potentially involved in cancer progression, offering insights into biological mechanisms and therapeutic targets.

Visualisation

Visualizing Differential Expression Analysis (DEA) results through various plots such as Volcano plots, MA plots, PCA plots, and heatmaps is crucial for interpreting and communicating findings in miRNA research. Volcano plots provide a concise overview by plotting log fold change (logFC) against statistical significance (e.g., FDR-adjusted p-values), highlighting miRNAs that are both highly significant and exhibit substantial expression changes between experimental conditions, such as early and late cancer stages. MA plots complement this by showing the relationship between miRNA abundance (M) and average intensity (A), aiding in the identification of differentially expressed miRNAs based on biological relevance. PCA plots visualize the overall variation in miRNA expression patterns across samples, enabling insights into sample clustering and potential batch effects. Finally, heatmaps offer a comprehensive view of expression profiles for top differentially expressed miRNAs, facilitating the identification of expression patterns and sample grouping based on miRNA expression levels. Together, these visualizations enhance understanding of miRNA regulation in cancer progression, aiding in the discovery of biomarkers and therapeutic targets.

Volcano plot

Code
EnhancedVolcano(dataDEGs,
                lab = rownames(dataDEGs),
                x = 'logFC',
                y = 'FDR',
                pCutoff = 0.05,
                FCcutoff = 1,
                title = 'Volcano Plot of DEGs',
                subtitle = 'Early vs Late Stage',
                legendPosition = 'right')

Heatmap

Code
# Select the top 50 DEGs based on adjusted p-value
top_degs <- dataDEGs[order(dataDEGs$FDR), ][1:50, ]

# Extract the expression data for the top 50 DEGs
top_degs_expr_early <- early_data[rownames(top_degs), ]
top_degs_expr_late <- late_data[rownames(top_degs), ]

# Extract expression data for the top 50 DEGs
top_degs_expr <-cbind(early_data[rownames(top_degs), ], late_data[rownames(top_degs), ])

# Create an annotation for columns
annotation_col <- data.frame(
  Stage = rep(c("Early", "Late"), c(ncol(top_degs_expr_early), ncol(top_degs_expr_late)))
)
rownames(annotation_col) <- colnames(top_degs_expr)

# Check for and handle NA, NaN, and Inf values
top_degs_expr_clean <- top_degs_expr[complete.cases(top_degs_expr), ]

# Define column order: first early stages, then late stages
col_order <- c(colnames(top_degs_expr_early), colnames(top_degs_expr_late))

# Heatmap
pheatmap(
  top_degs_expr_clean[, col_order],
  scale = 'row',
  annotation_col = annotation_col,
  cluster_rows = FALSE,
  cluster_cols = FALSE,
  show_rownames = FALSE,
  show_colnames = FALSE,
  main = 'Heatmap of Top 50 DEGs'
)

PCA plot

Code
pca <- prcomp(t(cbind(early_data, late_data)))
pca_df <- as.data.frame(pca$x)
pca_df$Stage <- rep(c('Early', 'Late'), c(ncol(early_data), ncol(late_data)))

ggplot(pca_df, aes(PC1, PC2, color = Stage)) +
  geom_point(size = 3) +
  labs(title = "PCA Plot", x = "Principal Component 1", y = "Principal Component 2") +
  theme_minimal()

MA plot

Code
plotMA(dataDEGs, ylim = c(-5, 5), main = "MA Plot of DEGs")

SURVIVAL ANALYSIS

Code
# Create an "overall_survival" variable

cleaned_clinical_df$overall_survival <- ifelse(cleaned_clinical_df$vital_status == "Alive", cleaned_clinical_df$days_to_last_follow_up,
                                               cleaned_clinical_df$days_to_death) 
# Fit the survival curves 
fit <- survfit(Surv(overall_survival, deceased) ~ stage_group, data = cleaned_clinical_df)

Plot the survival curves

ggsurvplot(fit,
           data = cleaned_clinical_df,
           pval = TRUE,
           risk.table = TRUE,
           main = "Kaplan-Meier Overall Survival Curves for Early vs Late Stage",
           xlab = "Time since diagnosis (days)",
           ylab = "Probability of survival")

Perform log-rank test

Code
fit2 <- survdiff(Surv(overall_survival, deceased) ~ stage_group, data = cleaned_clinical_df) 
fit2
Call:
survdiff(formula = Surv(overall_survival, deceased) ~ stage_group, 
    data = cleaned_clinical_df)

n=101, 340 observations deleted due to missingness.

                   N Observed Expected (O-E)^2/E (O-E)^2/V
stage_group=Early 65       65     74.5      1.20      4.83
stage_group=Late  36       36     26.5      3.37      4.83

 Chisq= 4.8  on 1 degrees of freedom, p= 0.03 
Code
##### Survival analysis using TCGAbiolink
## Did not work due to bugs in the function

# TCGAanalyze_survival(
# data = cleaned_clinical_df,
# clusterCol = c("stage_group", "gender"),
# main = "Kaplan-Meier Overall Survival Curves",
# height = 10,
# width = 10,
# ylab = "Probability of survival",
# xlab = "Time since diagnosis (days)",
# filename = "survival.pdf",
# color = NULL,
# dpi = 300,
# legend.position = "inside",
# legend.title.position = "top",
# legend.ncols = 1,
# add.legend = TRUE,
# print.value = TRUE,
# add.points = TRUE
# )

ADDITIONAL ANALYSIS

To gain a comprehensive understanding of the dataset without specific hypotheses, I would explore the following analyses:

Correlation Analysis

Investigate correlations between miRNA expression levels and clinical parameters such as patient age, gender, and survival status to identify potential associations.

Network Analysis

Construct protein-protein interaction (PPI) networks of differentially expressed miRNAs or genes to uncover functional relationships and pathways implicated in COAD.

Dimensionality Reduction

Apply techniques like principal component analysis (PCA) or t-distributed stochastic neighbor embedding (t-SNE) to visualize high-dimensional data and identify clusters or patterns within the dataset.

Enrichment Analysis

Perform pathway enrichment analysis (e.g., GO analysis, KEGG pathways) to elucidate biological processes and pathways enriched with differentially expressed genes, providing insights into molecular mechanisms underlying COAD progression.

These analyses aim to provide a comprehensive exploration of the TCGA-COAD dataset, offering insights into miRNA regulation, clinical outcomes, and potential therapeutic targets in colon adenocarcinoma.