BINF_Cancer_dataset

What are we working with ?

We are working with the dataset: Breast Cancer: METABRIC cohort (Nature 2012; Nature Communications 2016)

Download the dataset via the provided link.

Define Environment Path

getwd()

## [1] "C:/Users/Lenovo/Documents/NAAMI"

dir.data <- file.path("C:/Users/Lenovo/Documents/NAAMI/brca_metabric")

Defining File paths

file.patient_data <- file.path(dir.data,"data_clinical_patient.txt")
file.sample_data <- file.path(dir.data,"data_clinical_sample.txt")
file.patient_metadata <- file.path(dir.data,"meta_clinical_patient.txt")
file.sample_metadata <- file.path(dir.data,"meta_clinical_sample.txt")
file.mutation_data <- file.path(dir.data,"data_mutations.txt")
file.mutation_metadata <- file.path(dir.data,"meta_mutations.txt")

Load the data.

patient_data <-read.table(file=file.patient_data,sep="\t",header=TRUE,
 stringsAsFactors=FALSE)

patient_metadata <- read.table(file=file.patient_metadata,sep="\t",header=TRUE,
 stringsAsFactors=FALSE)

sample_data <- read.table(file=file.sample_data,sep="\t",header=TRUE,
 stringsAsFactors=FALSE)

sample_metadata <- read.table(file=file.sample_metadata,sep="\t",header=TRUE,
 stringsAsFactors=FALSE)

mutation_data <- read.table(file=file.mutation_data,sep="\t",header=TRUE,
 stringsAsFactors=FALSE)

mutation_metadata <- read.table(file=file.mutation_metadata,sep="\t",header=TRUE,
 stringsAsFactors=FALSE)

Sneak peak into the data

head(patient_data)

##   PATIENT_ID LYMPH_NODES_EXAMINED_POSITIVE   NPI CELLULARITY CHEMOTHERAPY
## 1    MB-0000                            10 6.044                       NO
## 2    MB-0002                             0 4.020        High           NO
## 3    MB-0005                             1 4.030        High          YES
## 4    MB-0006                             3 4.050    Moderate          YES
## 5    MB-0008                             8 6.080        High          YES
## 6    MB-0010                             0 4.062    Moderate           NO
##   COHORT  ER_IHC HER2_SNP6 HORMONE_THERAPY INFERRED_MENOPAUSAL_STATE    SEX
## 1      1 Positve   NEUTRAL             YES                      Post Female
## 2      1 Positve   NEUTRAL             YES                       Pre Female
## 3      1 Positve   NEUTRAL             YES                       Pre Female
## 4      1 Positve   NEUTRAL             YES                       Pre Female
## 5      1 Positve   NEUTRAL             YES                      Post Female
## 6      1 Positve   NEUTRAL             YES                      Post Female
##   INTCLUST AGE_AT_DIAGNOSIS OS_MONTHS  OS_STATUS CLAUDIN_SUBTYPE
## 1     4ER+            75.65 140.50000   0:LIVING     claudin-low
## 2     4ER+            43.19  84.63333   0:LIVING            LumA
## 3        3            48.87 163.70000 1:DECEASED            LumB
## 4        9            47.68 164.93333   0:LIVING            LumB
## 5        9            76.97  41.36667 1:DECEASED            LumB
## 6        7            78.77   7.80000 1:DECEASED            LumB
##               THREEGENE    VITAL_STATUS LATERALITY RADIO_THERAPY
## 1             ER-/HER2-          Living      Right           YES
## 2 ER+/HER2- High Prolif          Living      Right           YES
## 3                       Died of Disease      Right            NO
## 4                                Living      Right           YES
## 5 ER+/HER2- High Prolif Died of Disease      Right           YES
## 6 ER+/HER2- High Prolif Died of Disease       Left           YES
##   HISTOLOGICAL_SUBTYPE    BREAST_SURGERY RFS_MONTHS     RFS_STATUS
## 1           Ductal/NST        MASTECTOMY 140.500000 0:Not Recurred
## 2           Ductal/NST BREAST CONSERVING  84.633333 0:Not Recurred
## 3           Ductal/NST        MASTECTOMY 153.300000     1:Recurred
## 4                Mixed        MASTECTOMY 164.933333 0:Not Recurred
## 5                Mixed        MASTECTOMY  18.800000     1:Recurred
## 6           Ductal/NST        MASTECTOMY   2.933333     1:Recurred

head(sample_data)

##   PATIENT_ID SAMPLE_ID   CANCER_TYPE                      CANCER_TYPE_DETAILED
## 1    MB-0000   MB-0000 Breast Cancer          Breast Invasive Ductal Carcinoma
## 2    MB-0002   MB-0002 Breast Cancer          Breast Invasive Ductal Carcinoma
## 3    MB-0005   MB-0005 Breast Cancer          Breast Invasive Ductal Carcinoma
## 4    MB-0006   MB-0006 Breast Cancer Breast Mixed Ductal and Lobular Carcinoma
## 5    MB-0008   MB-0008 Breast Cancer Breast Mixed Ductal and Lobular Carcinoma
## 6    MB-0010   MB-0010 Breast Cancer          Breast Invasive Ductal Carcinoma
##   ER_STATUS HER2_STATUS GRADE ONCOTREE_CODE PR_STATUS SAMPLE_TYPE TUMOR_SIZE
## 1  Positive    Negative     3           IDC  Negative     Primary         22
## 2  Positive    Negative     3           IDC  Positive     Primary         10
## 3  Positive    Negative     2           IDC  Positive     Primary         15
## 4  Positive    Negative     2          MDLC  Positive     Primary         25
## 5  Positive    Negative     3          MDLC  Positive     Primary         40
## 6  Positive    Negative     3           IDC  Positive     Primary         31
##   TUMOR_STAGE TMB_NONSYNONYMOUS
## 1           2          0.000000
## 2           1          2.615035
## 3           2          2.615035
## 4           2          1.307518
## 5           2          2.615035
## 6           4          5.230071

#For a better and organized output use the View() function in your console as:
View(head(patient_data))

Patient Cohort:

Distribution of patients by cancer stage, cancer subtype, age group at diagnosis, or any other relevant clinical parameters.

colnames(sample_data)

##  [1] "PATIENT_ID"           "SAMPLE_ID"            "CANCER_TYPE"         
##  [4] "CANCER_TYPE_DETAILED" "ER_STATUS"            "HER2_STATUS"         
##  [7] "GRADE"                "ONCOTREE_CODE"        "PR_STATUS"           
## [10] "SAMPLE_TYPE"          "TUMOR_SIZE"           "TUMOR_STAGE"         
## [13] "TMB_NONSYNONYMOUS"

colnames(patient_data)

##  [1] "PATIENT_ID"                    "LYMPH_NODES_EXAMINED_POSITIVE"
##  [3] "NPI"                           "CELLULARITY"                  
##  [5] "CHEMOTHERAPY"                  "COHORT"                       
##  [7] "ER_IHC"                        "HER2_SNP6"                    
##  [9] "HORMONE_THERAPY"               "INFERRED_MENOPAUSAL_STATE"    
## [11] "SEX"                           "INTCLUST"                     
## [13] "AGE_AT_DIAGNOSIS"              "OS_MONTHS"                    
## [15] "OS_STATUS"                     "CLAUDIN_SUBTYPE"              
## [17] "THREEGENE"                     "VITAL_STATUS"                 
## [19] "LATERALITY"                    "RADIO_THERAPY"                
## [21] "HISTOLOGICAL_SUBTYPE"          "BREAST_SURGERY"               
## [23] "RFS_MONTHS"                    "RFS_STATUS"

Cancer Type

# As Cancer type is a categorical data.
# Create a table to store the frequency of the cancer types.

CancerType_count_table <- table(sample_data$CANCER_TYPE_DETAILED)
CancerType_count_table

## 
##                                    Breast 
##                                        21 
##                       Breast Angiosarcoma 
##                                         2 
##          Breast Invasive Ductal Carcinoma 
##                                      1865 
##         Breast Invasive Lobular Carcinoma 
##                                       192 
##  Breast Invasive Mixed Mucinous Carcinoma 
##                                        25 
## Breast Mixed Ductal and Lobular Carcinoma 
##                                       269 
##                 Invasive Breast Carcinoma 
##                                       133 
##                 Metaplastic Breast Cancer 
##                                         2

bp <- barplot(CancerType_count_table,
         col="blue",
         main="Breast Cancer Type Distribution",
         ylab="Number of patients",
         ylim=c(0,max(CancerType_count_table)* 1.1),
         las = 2,
        cex.names=0.3)

 text(
     x = bp,
     y = CancerType_count_table,
     labels = CancerType_count_table,
     pos = 3,                        # Position above the bar
     cex = 0.8                      # Font size for labels
 )

Cancer/Tumor Stage

tumor_stage_count <- table(sample_data$TUMOR_STAGE)
tumor_bp <- barplot(tumor_stage_count,
        col='red',
        main="Tumor Stage Distribution",
        xlab="Tumor Stage",
        ylab='Patient Frequency',
        ylim=c(0,max(tumor_stage_count)*1.2))
 text(
     x = tumor_bp,
     y = tumor_stage_count,
     labels = tumor_stage_count,
     pos = 3,                        # Position above the bar
     cex = 0.8                      # Font size for labels
 )

Age Group At Diagnosis:

As age is continuous and include decimal values histogram would be an excellent choice for visualizing the distribution of age at diagnosis.

# Remove NAs and set up age bins
age_data <- na.omit(patient_data$AGE_AT_DIAGNOSIS)
min_age <- floor(min(age_data))
max_age <- ceiling(max(age_data))
age_bins <- seq(min_age, max_age + 5, by = 5)  # It Ensures all data are included

# Create the histogram and suppress default x-axis
hist_obj <- hist(
  age_data,
  breaks = age_bins,
  col = "dodgerblue",
  border = "white",
  main = "Age at Diagnosis",
  xlab = "Age at Diagnosis (years)",
  ylab = "Number of Patients",
  xaxt = "n"
)

# Add custom x-axis ticks at the bin edges (start/end of each bar)
axis(1, at = hist_obj$breaks, labels = FALSE)

# Add the labels at the bin edges, slanted at 45 degrees and right-aligned
text(
  x = hist_obj$breaks,
  y = par("usr")[3] - 0.03 * max(hist_obj$counts),
  labels = hist_obj$breaks,
  srt = 45,
  adj = 1,
  xpd = TRUE,
  cex = 0.8
)


# Show number of NAs 
num_na <- sum(is.na(patient_data$AGE_AT_DIAGNOSIS))
if (num_na > 0) {
  mtext(paste("NA:", num_na), side = 3, adj = 1, line = -1, cex = 0.8, col = "gray40")
}

Somatic Mutational Landscape

1.What are the most frequently mutated genes?

mutation_frequency_table <- table(mutation_data$Hugo_Symbol)


# Sort the table in decreasing order
mutation_frequency_sorted <- sort(mutation_frequency_table, decreasing = TRUE)

# Display the top 10 most frequently mutated genes
head(mutation_frequency_sorted, 10)

## 
## PIK3CA   TP53 AHNAK2  MUC16  SYNE1  KMT2C MAP3K1  AHNAK  GATA3 DNAH11 
##    670    552    513    405    294    220    218    202    188    186

2. What are the most common types of somatic mutations (e.g., missense, nonsense, frameshift)?

somatic_mutation_frequency_count <- table(mutation_data$Variant_Classification)
sorted.somatic_mutation_frequency_count <- sort(somatic_mutation_frequency_count, decreasing = TRUE)

head(sorted.somatic_mutation_frequency_count)

## 
## Missense_Mutation            Silent Nonsense_Mutation   Frame_Shift_Del 
##              6204              2501               522               515 
##   Frame_Shift_Ins       Splice_Site 
##               299               237

3. What is are mutational frequency of the Cancer Gene Census (CGC) genes (list top-20 most frequently mutated CGC genes)

(CGC: download link)

file.CGC <- file.path("C:/Users/Lenovo/Documents/NAAMI/Cancer_Gene_Census_allFri_Jun_20_03_26_10_2025.tsv"
)
CGC_data <- read.delim(file.CGC, header = TRUE, sep = "\t", stringsAsFactors = FALSE)

# From CGC Extract the column containing gene symbols 
CGC_gene <- unique(CGC_data$Gene.Symbol)

# Filter your mutation data to keep only mutations in CGC genes
mutation_CGC <- mutation_data[mutation_data$Hugo_Symbol %in% CGC_gene,]

# Count the number of mutations for each CGC gene
mutation_counts <- table(mutation_CGC$Hugo_Symbol)
sorted.mutation_counts <- sort(mutation_counts, decreasing = TRUE)
top20_genes <- sorted.mutation_counts[1:20]
View(top20_genes)

#Visualizing top20_genes
bp <- barplot(
  top20_genes,
  las = 2,              # Rotate gene names for readability
  col = "pink",
  ylim=c(0,max(top20_genes)*1.2),
  main = "Top 20 Mutated Genes",
  ylab = "Mutation Count",
  cex.names = 1
)

# Add value labels over the bars
text(
  x = bp,
  y = top20_genes,
  labels = top20_genes,
  pos = 3,            # Position text above bars
  cex = 0.8,
  col = "black"
)

4 What is the overall mutation burden (tumor mutational burden/TMB) across samples, and how does it vary? Consider only SNVs for this analysis.

Tumor mutation burden is the total number of mutations found within a defined region genome. [ The total number of mutations found within the DNA of cancer cells.]
Single Nucleotide Variants (SNVs) are the most common type of genetic variation where a single nucleotide (A, T, C, or G) in a DNA sequence is altered.
View(table(mutation_data$Variant_Classification))

Variant Classification from mutation_data table
Only Missense_Mutation, Nonsense_Mutation, Silent, and Nonstop_Mutation from this list are explicitly categorized as SNVs
Other mutation variants in the list (eg: Frame_Shift_Del, Intron, Splice_Site, In_Frame_Del, etc.) involve insertions, deletions, or changes affecting one or more than one nucleotide or non-coding regions, and thus they are not always SNVs.
From ‘mutation_data’ the Genome Build (NCBI_Build) for the samples is: GRCh37
For GRCh37, the genome size is 3.1 Billion Base Pairs (3100 Mb) and its coding region size is typically around 30 to 33 Mb.
The standard practice to calculate TMB is : the number of mutations per megabase (Mb) of coding region, not per the full genome size.

 #---- 4. Tumor Mutational Burden (TMB) ----

# Define SNV mutation types 
snv_types <- c("Missense_Mutation", "Nonsense_Mutation", "Silent",  
               "Nonstop_Mutation")

# Filter mutation_data for SNVs only
snv_data <- mutation_data[mutation_data$Variant_Classification %in% snv_types, ]

# Calculate TMB per sample
# Assume coding region size = 30 Mb
coding_region_mb <- 30

tmb_per_sample <- as.data.frame(table(snv_data$Tumor_Sample_Barcode))
colnames(tmb_per_sample) <- c("Sample", "SNV_Count")
tmb_per_sample$TMB <- tmb_per_sample$SNV_Count / coding_region_mb

# Visualize TMB distribution
hist(tmb_per_sample$TMB, 
     breaks = 30, 
     col = "skyblue", 
     main = "Tumor Mutational Burden (TMB) Across Samples",
     xlab = "TMB (Mutations per Mb)",
     ylab = "Number of Samples")

Copy Number Landscape

file.data_cna <- file.path(dir.data,"data_cna.txt")
CNA_data <-read.table(file=file.data_cna,sep="\t",header=TRUE,
                          stringsAsFactors=FALSE)

file.cna_metadata <- file.path(dir.data,"meta_cna.txt")
CNA_metadata <- read.table(file=file.cna_metadata,sep="\t",header=TRUE                                ,stringsAsFactors=FALSE)

cna_data <-read.table(file=file.data_cna,sep="\t",header=TRUE,
                          stringsAsFactors=FALSE)

CNV and CNA nearly have the same meaning.
In your CNA gene matrix, each row is a gene, and each column (except the first two) is a sample.
The values show CNV status per gene in each sample (-2, -1: deletions; 0: neutral; 1, 2: amplifications).

Visualize the Copy Number Variation (CNV) data across the entire genome for the tumor cohort.

# 1. Assume cna_data is your loaded CNV data frame with columns:
# Hugo_Symbol, Entrez_Gene_Id, Sample1, Sample2, ..., SampleN

library(pheatmap)

# 2. Remove extra columns, set rownames
mat <- as.matrix(cna_data[, -(1:2)])
rownames(mat) <- make.unique(cna_data$Hugo_Symbol)


# 3. (Optional but recommended) Order genes by chromosome for genome-style display
# If you have chromosome info, order by it. If not, just proceed.

# 4. (Optional) Subset to most variable genes for better readability, e.g., top 30
gene_sd <- apply(mat, 1, sd)
top_genes <- order(gene_sd, decreasing=TRUE)[1:40]
mat_top <- mat[top_genes, ]

# 5. Heatmap plot
pheatmap(
  mat_top,
  color = colorRampPalette(c("blue", "white", "red"))(50),
  cluster_rows = T, # set TRUE if you want clustering
  cluster_cols = FALSE, # set TRUE if you want clustering
  show_rownames = TRUE,
  show_colnames = FALSE,
  cex = 0.8,
  main = "Genome-wide Copy Number Variation (Top 40 Genes)"
)

Which genes or chromosomal regions are most frequently amplified or deleted in the cancer cohort?

cnv_values <- as.matrix(cna_data[, -(1:2)])
rownames(cnv_values) <- cna_data$Hugo_Symbol

ampl_count <- rowSums(cnv_values > 0, na.rm=TRUE)
del_count  <- rowSums(cnv_values < 0, na.rm=TRUE)

freq_table <- data.frame(
     Gene = rownames(cnv_values),
     Amplified_Samples = ampl_count,
     Deleted_Samples   = del_count)

#Sort and select the top 100 most frequently altered genes

# For amplifications:
top100_ampl <- freq_table[order(-freq_table$Amplified_Samples), ][1:100, ]
top30_ampl <- freq_table[order(-freq_table$Amplified_Samples), ][1:30, ]

# For deletions:
top100_del <- freq_table[order(-freq_table$Deleted_Samples), ][1:100, ]
top30_del <- freq_table[order(-freq_table$Deleted_Samples), ][1:30, ]

# Plot for Amplification

library(ggplot2)
ggplot(top30_ampl, aes(x = reorder(Gene, Amplified_Samples), y = Amplified_Samples)) +
  geom_bar(stat = "identity", fill = "blue") +
  coord_flip() +
  labs(title="Top 30 Most Frequently Amplified Genes",
       x="Gene", y="Number of Samples Amplified")

# Plot for Deletion

library(ggplot2)
ggplot(top30_del, aes(x = reorder(Gene, Deleted_Samples), y = Deleted_Samples)) +
  geom_bar(stat = "identity", fill = "red") +
  coord_flip() +
  labs(title="Top 30 Most Frequently Deleted Genes",
       x="Gene", y="Number of Samples ")

4. CNV within the CGC genes ?

#filter Extract only those rows (genes) in the CNV(CNA) matrix that are present in the CGC gene list

cna_cgc <- cna_data[cna_data$Hugo_Symbol %in% CGC_gene, ]

# Remove non-numeric columns
cna_matrix <- as.matrix(cna_cgc[, -(1:2)])  # Example: skip Hugo_Symbol & Entrez_Gene_Id
rownames(cna_matrix) <- cna_cgc$Hugo_Symbol

# Count amplifications and deletions per gene
ampl_count <- rowSums(cna_matrix > 0, na.rm = TRUE)
del_count  <- rowSums(cna_matrix < 0, na.rm = TRUE)

# Combine results
result_table <- data.frame(
  Gene = rownames(cna_matrix),
  Amplified_Samples = ampl_count,
  Deleted_Samples = del_count
)

top_ampl <- result_table[order(-result_table$Amplified_Samples), ][1:20, ]
top_del  <- result_table[order(-result_table$Deleted_Samples), ][1:20, ]

library(ggplot2)
ggplot(top_ampl, aes(x = reorder(Gene, Amplified_Samples), y = Amplified_Samples)) +
  geom_bar(stat = "identity", fill = "red") +
  coord_flip() +
  labs(title = "Top 20 Most Frequently Amplified CGC Genes",
       x = "Gene", y = "Number of Samples (Amplified)")

ggplot(top_del, aes(x = reorder(Gene, Deleted_Samples), y = Deleted_Samples)) +
  geom_bar(stat = "identity", fill = "blue") +
  coord_flip() +
  labs(title = "Top 20 Most Frequently Deleted CGC Genes",
       x = "Gene", y = "Number of Samples (Deleted)")

Are there CNV hotspots that recur across many samples?

= A CNV hotspot is a genomic region that is amplified or deleted in a large number of samples

# Use your processed CNV matrix, eg:
cnv_matrix <- as.matrix(cna_data[,-c(1,2)])  # Remove non-numeric columns
rownames(cnv_matrix) <- make.unique(cna_data$Hugo_Symbol)

amp_freq <- rowSums(cnv_matrix > 0, na.rm=TRUE)
del_freq <- rowSums(cnv_matrix < 0, na.rm=TRUE)

hotspot_table <- data.frame(
  Gene = rownames(cnv_matrix),
  Amplified_Samples = amp_freq,
  Deleted_Samples   = del_freq
)

top_amplified <- hotspot_table[order(-hotspot_table$Amplified_Samples), ][1:20, ]
top_deleted   <- hotspot_table[order(-hotspot_table$Deleted_Samples), ][1:20, ]

library(ggplot2)

# Amplification hotspots
ggplot(top_amplified, aes(x=reorder(Gene, Amplified_Samples), y=Amplified_Samples)) +
  geom_bar(stat="identity", fill="tomato") + coord_flip() +
  labs(title = "Top 20 Amplification Hotspots across the Cohort",
       x = "Gene", y = "Number of Samples (Amplified)")

# Deletion hotspots
ggplot(top_deleted, aes(x=reorder(Gene, Deleted_Samples), y=Deleted_Samples)) +
  geom_bar(stat="identity", fill="steelblue") + coord_flip() +
  labs(title = "Top 20 Deletion Hotspots across the Cohort",
       x = "Gene", y = "Number of Samples (Deleted)")

BINF_Cancer_dataset_analysis

GROUP A, BI-02,06,08,09

2025-07-02