What are we working with ?

We are working with the dataset: Breast Cancer: METABRIC cohort (Nature 2012; Nature Communications 2016)

Download the dataset via the provided link.

Define Environment Path

getwd()
## [1] "C:/Users/Lenovo/Documents/NAAMI"
dir.data <- file.path("C:/Users/Lenovo/Documents/NAAMI/brca_metabric")

Defining File paths

file.patient_data <- file.path(dir.data,"data_clinical_patient.txt")
file.sample_data <- file.path(dir.data,"data_clinical_sample.txt")
file.patient_metadata <- file.path(dir.data,"meta_clinical_patient.txt")
file.sample_metadata <- file.path(dir.data,"meta_clinical_sample.txt")
file.mutation_data <- file.path(dir.data,"data_mutations.txt")
file.mutation_metadata <- file.path(dir.data,"meta_mutations.txt")

Load the data.

patient_data <-read.table(file=file.patient_data,sep="\t",header=TRUE,
 stringsAsFactors=FALSE)

patient_metadata <- read.table(file=file.patient_metadata,sep="\t",header=TRUE,
 stringsAsFactors=FALSE)

sample_data <- read.table(file=file.sample_data,sep="\t",header=TRUE,
 stringsAsFactors=FALSE)

sample_metadata <- read.table(file=file.sample_metadata,sep="\t",header=TRUE,
 stringsAsFactors=FALSE)

mutation_data <- read.table(file=file.mutation_data,sep="\t",header=TRUE,
 stringsAsFactors=FALSE)

mutation_metadata <- read.table(file=file.mutation_metadata,sep="\t",header=TRUE,
 stringsAsFactors=FALSE)

Sneak peak into the data

head(patient_data)
##   PATIENT_ID LYMPH_NODES_EXAMINED_POSITIVE   NPI CELLULARITY CHEMOTHERAPY
## 1    MB-0000                            10 6.044                       NO
## 2    MB-0002                             0 4.020        High           NO
## 3    MB-0005                             1 4.030        High          YES
## 4    MB-0006                             3 4.050    Moderate          YES
## 5    MB-0008                             8 6.080        High          YES
## 6    MB-0010                             0 4.062    Moderate           NO
##   COHORT  ER_IHC HER2_SNP6 HORMONE_THERAPY INFERRED_MENOPAUSAL_STATE    SEX
## 1      1 Positve   NEUTRAL             YES                      Post Female
## 2      1 Positve   NEUTRAL             YES                       Pre Female
## 3      1 Positve   NEUTRAL             YES                       Pre Female
## 4      1 Positve   NEUTRAL             YES                       Pre Female
## 5      1 Positve   NEUTRAL             YES                      Post Female
## 6      1 Positve   NEUTRAL             YES                      Post Female
##   INTCLUST AGE_AT_DIAGNOSIS OS_MONTHS  OS_STATUS CLAUDIN_SUBTYPE
## 1     4ER+            75.65 140.50000   0:LIVING     claudin-low
## 2     4ER+            43.19  84.63333   0:LIVING            LumA
## 3        3            48.87 163.70000 1:DECEASED            LumB
## 4        9            47.68 164.93333   0:LIVING            LumB
## 5        9            76.97  41.36667 1:DECEASED            LumB
## 6        7            78.77   7.80000 1:DECEASED            LumB
##               THREEGENE    VITAL_STATUS LATERALITY RADIO_THERAPY
## 1             ER-/HER2-          Living      Right           YES
## 2 ER+/HER2- High Prolif          Living      Right           YES
## 3                       Died of Disease      Right            NO
## 4                                Living      Right           YES
## 5 ER+/HER2- High Prolif Died of Disease      Right           YES
## 6 ER+/HER2- High Prolif Died of Disease       Left           YES
##   HISTOLOGICAL_SUBTYPE    BREAST_SURGERY RFS_MONTHS     RFS_STATUS
## 1           Ductal/NST        MASTECTOMY 140.500000 0:Not Recurred
## 2           Ductal/NST BREAST CONSERVING  84.633333 0:Not Recurred
## 3           Ductal/NST        MASTECTOMY 153.300000     1:Recurred
## 4                Mixed        MASTECTOMY 164.933333 0:Not Recurred
## 5                Mixed        MASTECTOMY  18.800000     1:Recurred
## 6           Ductal/NST        MASTECTOMY   2.933333     1:Recurred
head(sample_data)
##   PATIENT_ID SAMPLE_ID   CANCER_TYPE                      CANCER_TYPE_DETAILED
## 1    MB-0000   MB-0000 Breast Cancer          Breast Invasive Ductal Carcinoma
## 2    MB-0002   MB-0002 Breast Cancer          Breast Invasive Ductal Carcinoma
## 3    MB-0005   MB-0005 Breast Cancer          Breast Invasive Ductal Carcinoma
## 4    MB-0006   MB-0006 Breast Cancer Breast Mixed Ductal and Lobular Carcinoma
## 5    MB-0008   MB-0008 Breast Cancer Breast Mixed Ductal and Lobular Carcinoma
## 6    MB-0010   MB-0010 Breast Cancer          Breast Invasive Ductal Carcinoma
##   ER_STATUS HER2_STATUS GRADE ONCOTREE_CODE PR_STATUS SAMPLE_TYPE TUMOR_SIZE
## 1  Positive    Negative     3           IDC  Negative     Primary         22
## 2  Positive    Negative     3           IDC  Positive     Primary         10
## 3  Positive    Negative     2           IDC  Positive     Primary         15
## 4  Positive    Negative     2          MDLC  Positive     Primary         25
## 5  Positive    Negative     3          MDLC  Positive     Primary         40
## 6  Positive    Negative     3           IDC  Positive     Primary         31
##   TUMOR_STAGE TMB_NONSYNONYMOUS
## 1           2          0.000000
## 2           1          2.615035
## 3           2          2.615035
## 4           2          1.307518
## 5           2          2.615035
## 6           4          5.230071
#For a better and organized output use the View() function in your console as:
View(head(patient_data))

Patient Cohort:

Distribution of patients by cancer stage, cancer subtype, age group at diagnosis, or any other relevant clinical parameters.

colnames(sample_data)
##  [1] "PATIENT_ID"           "SAMPLE_ID"            "CANCER_TYPE"         
##  [4] "CANCER_TYPE_DETAILED" "ER_STATUS"            "HER2_STATUS"         
##  [7] "GRADE"                "ONCOTREE_CODE"        "PR_STATUS"           
## [10] "SAMPLE_TYPE"          "TUMOR_SIZE"           "TUMOR_STAGE"         
## [13] "TMB_NONSYNONYMOUS"
colnames(patient_data)
##  [1] "PATIENT_ID"                    "LYMPH_NODES_EXAMINED_POSITIVE"
##  [3] "NPI"                           "CELLULARITY"                  
##  [5] "CHEMOTHERAPY"                  "COHORT"                       
##  [7] "ER_IHC"                        "HER2_SNP6"                    
##  [9] "HORMONE_THERAPY"               "INFERRED_MENOPAUSAL_STATE"    
## [11] "SEX"                           "INTCLUST"                     
## [13] "AGE_AT_DIAGNOSIS"              "OS_MONTHS"                    
## [15] "OS_STATUS"                     "CLAUDIN_SUBTYPE"              
## [17] "THREEGENE"                     "VITAL_STATUS"                 
## [19] "LATERALITY"                    "RADIO_THERAPY"                
## [21] "HISTOLOGICAL_SUBTYPE"          "BREAST_SURGERY"               
## [23] "RFS_MONTHS"                    "RFS_STATUS"
Cancer Type
# As Cancer type is a categorical data.
# Create a table to store the frequency of the cancer types.

CancerType_count_table <- table(sample_data$CANCER_TYPE_DETAILED)
CancerType_count_table
## 
##                                    Breast 
##                                        21 
##                       Breast Angiosarcoma 
##                                         2 
##          Breast Invasive Ductal Carcinoma 
##                                      1865 
##         Breast Invasive Lobular Carcinoma 
##                                       192 
##  Breast Invasive Mixed Mucinous Carcinoma 
##                                        25 
## Breast Mixed Ductal and Lobular Carcinoma 
##                                       269 
##                 Invasive Breast Carcinoma 
##                                       133 
##                 Metaplastic Breast Cancer 
##                                         2
bp <- barplot(CancerType_count_table,
         col="blue",
         main="Breast Cancer Type Distribution",
         ylab="Number of patients",
         ylim=c(0,max(CancerType_count_table)* 1.1),
         las = 2,
        cex.names=0.3)

 text(
     x = bp,
     y = CancerType_count_table,
     labels = CancerType_count_table,
     pos = 3,                        # Position above the bar
     cex = 0.8                      # Font size for labels
 )

Cancer/Tumor Stage

tumor_stage_count <- table(sample_data$TUMOR_STAGE)
tumor_bp <- barplot(tumor_stage_count,
        col='red',
        main="Tumor Stage Distribution",
        xlab="Tumor Stage",
        ylab='Patient Frequency',
        ylim=c(0,max(tumor_stage_count)*1.2))
 text(
     x = tumor_bp,
     y = tumor_stage_count,
     labels = tumor_stage_count,
     pos = 3,                        # Position above the bar
     cex = 0.8                      # Font size for labels
 )

Age Group At Diagnosis:

As age is continuous and include decimal values histogram would be an excellent choice for visualizing the distribution of age at diagnosis.

# Remove NAs and set up age bins
age_data <- na.omit(patient_data$AGE_AT_DIAGNOSIS)
min_age <- floor(min(age_data))
max_age <- ceiling(max(age_data))
age_bins <- seq(min_age, max_age + 5, by = 5)  # It Ensures all data are included

# Create the histogram and suppress default x-axis
hist_obj <- hist(
  age_data,
  breaks = age_bins,
  col = "dodgerblue",
  border = "white",
  main = "Age at Diagnosis",
  xlab = "Age at Diagnosis (years)",
  ylab = "Number of Patients",
  xaxt = "n"
)

# Add custom x-axis ticks at the bin edges (start/end of each bar)
axis(1, at = hist_obj$breaks, labels = FALSE)

# Add the labels at the bin edges, slanted at 45 degrees and right-aligned
text(
  x = hist_obj$breaks,
  y = par("usr")[3] - 0.03 * max(hist_obj$counts),
  labels = hist_obj$breaks,
  srt = 45,
  adj = 1,
  xpd = TRUE,
  cex = 0.8
)


# Show number of NAs 
num_na <- sum(is.na(patient_data$AGE_AT_DIAGNOSIS))
if (num_na > 0) {
  mtext(paste("NA:", num_na), side = 3, adj = 1, line = -1, cex = 0.8, col = "gray40")
}

Somatic Mutational Landscape

1.What are the most frequently mutated genes?
mutation_frequency_table <- table(mutation_data$Hugo_Symbol)


# Sort the table in decreasing order
mutation_frequency_sorted <- sort(mutation_frequency_table, decreasing = TRUE)

# Display the top 10 most frequently mutated genes
head(mutation_frequency_sorted, 10)
## 
## PIK3CA   TP53 AHNAK2  MUC16  SYNE1  KMT2C MAP3K1  AHNAK  GATA3 DNAH11 
##    670    552    513    405    294    220    218    202    188    186
2. What are the most common types of somatic mutations (e.g., missense, nonsense, frameshift)?
somatic_mutation_frequency_count <- table(mutation_data$Variant_Classification)
sorted.somatic_mutation_frequency_count <- sort(somatic_mutation_frequency_count, decreasing = TRUE)

head(sorted.somatic_mutation_frequency_count)
## 
## Missense_Mutation            Silent Nonsense_Mutation   Frame_Shift_Del 
##              6204              2501               522               515 
##   Frame_Shift_Ins       Splice_Site 
##               299               237
3. What is are mutational frequency of the Cancer Gene Census (CGC) genes (list top-20 most frequently mutated CGC genes)
Copy Number Landscape
file.data_cna <- file.path(dir.data,"data_cna.txt")
CNA_data <-read.table(file=file.data_cna,sep="\t",header=TRUE,
                          stringsAsFactors=FALSE)

file.cna_metadata <- file.path(dir.data,"meta_cna.txt")
CNA_metadata <- read.table(file=file.cna_metadata,sep="\t",header=TRUE                                ,stringsAsFactors=FALSE)

cna_data <-read.table(file=file.data_cna,sep="\t",header=TRUE,
                          stringsAsFactors=FALSE)
  • CNV and CNA nearly have the same meaning.

  • In your CNA gene matrix, each row is a gene, and each column (except the first two) is a sample.

  • The values show CNV status per gene in each sample (-2, -1: deletions; 0: neutral; 1, 2: amplifications).

  1. Visualize the Copy Number Variation (CNV) data across the entire genome for the tumor cohort.

    # 1. Assume cna_data is your loaded CNV data frame with columns:
    # Hugo_Symbol, Entrez_Gene_Id, Sample1, Sample2, ..., SampleN
    
    library(pheatmap)
    
    # 2. Remove extra columns, set rownames
    mat <- as.matrix(cna_data[, -(1:2)])
    rownames(mat) <- make.unique(cna_data$Hugo_Symbol)
    
    
    # 3. (Optional but recommended) Order genes by chromosome for genome-style display
    # If you have chromosome info, order by it. If not, just proceed.
    
    # 4. (Optional) Subset to most variable genes for better readability, e.g., top 30
    gene_sd <- apply(mat, 1, sd)
    top_genes <- order(gene_sd, decreasing=TRUE)[1:40]
    mat_top <- mat[top_genes, ]
    
    # 5. Heatmap plot
    pheatmap(
      mat_top,
      color = colorRampPalette(c("blue", "white", "red"))(50),
      cluster_rows = T, # set TRUE if you want clustering
      cluster_cols = FALSE, # set TRUE if you want clustering
      show_rownames = TRUE,
      show_colnames = FALSE,
      cex = 0.8,
      main = "Genome-wide Copy Number Variation (Top 40 Genes)"
    )

  2. Which genes or chromosomal regions are most frequently amplified or deleted in the cancer cohort?

cnv_values <- as.matrix(cna_data[, -(1:2)])
rownames(cnv_values) <- cna_data$Hugo_Symbol

ampl_count <- rowSums(cnv_values > 0, na.rm=TRUE)
del_count  <- rowSums(cnv_values < 0, na.rm=TRUE)
freq_table <- data.frame(
     Gene = rownames(cnv_values),
     Amplified_Samples = ampl_count,
     Deleted_Samples   = del_count)

#Sort and select the top 100 most frequently altered genes

# For amplifications:
top100_ampl <- freq_table[order(-freq_table$Amplified_Samples), ][1:100, ]
top30_ampl <- freq_table[order(-freq_table$Amplified_Samples), ][1:30, ]

# For deletions:
top100_del <- freq_table[order(-freq_table$Deleted_Samples), ][1:100, ]
top30_del <- freq_table[order(-freq_table$Deleted_Samples), ][1:30, ]
# Plot for Amplification

library(ggplot2)
ggplot(top30_ampl, aes(x = reorder(Gene, Amplified_Samples), y = Amplified_Samples)) +
  geom_bar(stat = "identity", fill = "blue") +
  coord_flip() +
  labs(title="Top 30 Most Frequently Amplified Genes",
       x="Gene", y="Number of Samples Amplified")

# Plot for Deletion

library(ggplot2)
ggplot(top30_del, aes(x = reorder(Gene, Deleted_Samples), y = Deleted_Samples)) +
  geom_bar(stat = "identity", fill = "red") +
  coord_flip() +
  labs(title="Top 30 Most Frequently Deleted Genes",
       x="Gene", y="Number of Samples ")

4. CNV within the CGC genes ?

#filter Extract only those rows (genes) in the CNV(CNA) matrix that are present in the CGC gene list

cna_cgc <- cna_data[cna_data$Hugo_Symbol %in% CGC_gene, ]
# Remove non-numeric columns
cna_matrix <- as.matrix(cna_cgc[, -(1:2)])  # Example: skip Hugo_Symbol & Entrez_Gene_Id
rownames(cna_matrix) <- cna_cgc$Hugo_Symbol

# Count amplifications and deletions per gene
ampl_count <- rowSums(cna_matrix > 0, na.rm = TRUE)
del_count  <- rowSums(cna_matrix < 0, na.rm = TRUE)

# Combine results
result_table <- data.frame(
  Gene = rownames(cna_matrix),
  Amplified_Samples = ampl_count,
  Deleted_Samples = del_count
)
top_ampl <- result_table[order(-result_table$Amplified_Samples), ][1:20, ]
top_del  <- result_table[order(-result_table$Deleted_Samples), ][1:20, ]
library(ggplot2)
ggplot(top_ampl, aes(x = reorder(Gene, Amplified_Samples), y = Amplified_Samples)) +
  geom_bar(stat = "identity", fill = "red") +
  coord_flip() +
  labs(title = "Top 20 Most Frequently Amplified CGC Genes",
       x = "Gene", y = "Number of Samples (Amplified)")

ggplot(top_del, aes(x = reorder(Gene, Deleted_Samples), y = Deleted_Samples)) +
  geom_bar(stat = "identity", fill = "blue") +
  coord_flip() +
  labs(title = "Top 20 Most Frequently Deleted CGC Genes",
       x = "Gene", y = "Number of Samples (Deleted)")

  1. Are there CNV hotspots that recur across many samples?

    = A CNV hotspot is a genomic region that is amplified or deleted in a large number of samples 

    # Use your processed CNV matrix, eg:
    cnv_matrix <- as.matrix(cna_data[,-c(1,2)])  # Remove non-numeric columns
    rownames(cnv_matrix) <- make.unique(cna_data$Hugo_Symbol)
    amp_freq <- rowSums(cnv_matrix > 0, na.rm=TRUE)
    del_freq <- rowSums(cnv_matrix < 0, na.rm=TRUE)
    
    hotspot_table <- data.frame(
      Gene = rownames(cnv_matrix),
      Amplified_Samples = amp_freq,
      Deleted_Samples   = del_freq
    )
top_amplified <- hotspot_table[order(-hotspot_table$Amplified_Samples), ][1:20, ]
top_deleted   <- hotspot_table[order(-hotspot_table$Deleted_Samples), ][1:20, ]
library(ggplot2)

# Amplification hotspots
ggplot(top_amplified, aes(x=reorder(Gene, Amplified_Samples), y=Amplified_Samples)) +
  geom_bar(stat="identity", fill="tomato") + coord_flip() +
  labs(title = "Top 20 Amplification Hotspots across the Cohort",
       x = "Gene", y = "Number of Samples (Amplified)")

# Deletion hotspots
ggplot(top_deleted, aes(x=reorder(Gene, Deleted_Samples), y=Deleted_Samples)) +
  geom_bar(stat="identity", fill="steelblue") + coord_flip() +
  labs(title = "Top 20 Deletion Hotspots across the Cohort",
       x = "Gene", y = "Number of Samples (Deleted)")