What are we working with ?
We are working with the dataset: Breast Cancer: METABRIC cohort (Nature 2012; Nature Communications 2016)
Download the dataset via the provided link.
getwd()
## [1] "C:/Users/Lenovo/Documents/NAAMI"
dir.data <- file.path("C:/Users/Lenovo/Documents/NAAMI/brca_metabric")
file.patient_data <- file.path(dir.data,"data_clinical_patient.txt")
file.sample_data <- file.path(dir.data,"data_clinical_sample.txt")
file.patient_metadata <- file.path(dir.data,"meta_clinical_patient.txt")
file.sample_metadata <- file.path(dir.data,"meta_clinical_sample.txt")
file.mutation_data <- file.path(dir.data,"data_mutations.txt")
file.mutation_metadata <- file.path(dir.data,"meta_mutations.txt")
patient_data <-read.table(file=file.patient_data,sep="\t",header=TRUE,
stringsAsFactors=FALSE)
patient_metadata <- read.table(file=file.patient_metadata,sep="\t",header=TRUE,
stringsAsFactors=FALSE)
sample_data <- read.table(file=file.sample_data,sep="\t",header=TRUE,
stringsAsFactors=FALSE)
sample_metadata <- read.table(file=file.sample_metadata,sep="\t",header=TRUE,
stringsAsFactors=FALSE)
mutation_data <- read.table(file=file.mutation_data,sep="\t",header=TRUE,
stringsAsFactors=FALSE)
mutation_metadata <- read.table(file=file.mutation_metadata,sep="\t",header=TRUE,
stringsAsFactors=FALSE)
head(patient_data)
## PATIENT_ID LYMPH_NODES_EXAMINED_POSITIVE NPI CELLULARITY CHEMOTHERAPY
## 1 MB-0000 10 6.044 NO
## 2 MB-0002 0 4.020 High NO
## 3 MB-0005 1 4.030 High YES
## 4 MB-0006 3 4.050 Moderate YES
## 5 MB-0008 8 6.080 High YES
## 6 MB-0010 0 4.062 Moderate NO
## COHORT ER_IHC HER2_SNP6 HORMONE_THERAPY INFERRED_MENOPAUSAL_STATE SEX
## 1 1 Positve NEUTRAL YES Post Female
## 2 1 Positve NEUTRAL YES Pre Female
## 3 1 Positve NEUTRAL YES Pre Female
## 4 1 Positve NEUTRAL YES Pre Female
## 5 1 Positve NEUTRAL YES Post Female
## 6 1 Positve NEUTRAL YES Post Female
## INTCLUST AGE_AT_DIAGNOSIS OS_MONTHS OS_STATUS CLAUDIN_SUBTYPE
## 1 4ER+ 75.65 140.50000 0:LIVING claudin-low
## 2 4ER+ 43.19 84.63333 0:LIVING LumA
## 3 3 48.87 163.70000 1:DECEASED LumB
## 4 9 47.68 164.93333 0:LIVING LumB
## 5 9 76.97 41.36667 1:DECEASED LumB
## 6 7 78.77 7.80000 1:DECEASED LumB
## THREEGENE VITAL_STATUS LATERALITY RADIO_THERAPY
## 1 ER-/HER2- Living Right YES
## 2 ER+/HER2- High Prolif Living Right YES
## 3 Died of Disease Right NO
## 4 Living Right YES
## 5 ER+/HER2- High Prolif Died of Disease Right YES
## 6 ER+/HER2- High Prolif Died of Disease Left YES
## HISTOLOGICAL_SUBTYPE BREAST_SURGERY RFS_MONTHS RFS_STATUS
## 1 Ductal/NST MASTECTOMY 140.500000 0:Not Recurred
## 2 Ductal/NST BREAST CONSERVING 84.633333 0:Not Recurred
## 3 Ductal/NST MASTECTOMY 153.300000 1:Recurred
## 4 Mixed MASTECTOMY 164.933333 0:Not Recurred
## 5 Mixed MASTECTOMY 18.800000 1:Recurred
## 6 Ductal/NST MASTECTOMY 2.933333 1:Recurred
head(sample_data)
## PATIENT_ID SAMPLE_ID CANCER_TYPE CANCER_TYPE_DETAILED
## 1 MB-0000 MB-0000 Breast Cancer Breast Invasive Ductal Carcinoma
## 2 MB-0002 MB-0002 Breast Cancer Breast Invasive Ductal Carcinoma
## 3 MB-0005 MB-0005 Breast Cancer Breast Invasive Ductal Carcinoma
## 4 MB-0006 MB-0006 Breast Cancer Breast Mixed Ductal and Lobular Carcinoma
## 5 MB-0008 MB-0008 Breast Cancer Breast Mixed Ductal and Lobular Carcinoma
## 6 MB-0010 MB-0010 Breast Cancer Breast Invasive Ductal Carcinoma
## ER_STATUS HER2_STATUS GRADE ONCOTREE_CODE PR_STATUS SAMPLE_TYPE TUMOR_SIZE
## 1 Positive Negative 3 IDC Negative Primary 22
## 2 Positive Negative 3 IDC Positive Primary 10
## 3 Positive Negative 2 IDC Positive Primary 15
## 4 Positive Negative 2 MDLC Positive Primary 25
## 5 Positive Negative 3 MDLC Positive Primary 40
## 6 Positive Negative 3 IDC Positive Primary 31
## TUMOR_STAGE TMB_NONSYNONYMOUS
## 1 2 0.000000
## 2 1 2.615035
## 3 2 2.615035
## 4 2 1.307518
## 5 2 2.615035
## 6 4 5.230071
#For a better and organized output use the View() function in your console as:
View(head(patient_data))
Distribution of patients by cancer stage, cancer subtype, age group at diagnosis, or any other relevant clinical parameters.
colnames(sample_data)
## [1] "PATIENT_ID" "SAMPLE_ID" "CANCER_TYPE"
## [4] "CANCER_TYPE_DETAILED" "ER_STATUS" "HER2_STATUS"
## [7] "GRADE" "ONCOTREE_CODE" "PR_STATUS"
## [10] "SAMPLE_TYPE" "TUMOR_SIZE" "TUMOR_STAGE"
## [13] "TMB_NONSYNONYMOUS"
colnames(patient_data)
## [1] "PATIENT_ID" "LYMPH_NODES_EXAMINED_POSITIVE"
## [3] "NPI" "CELLULARITY"
## [5] "CHEMOTHERAPY" "COHORT"
## [7] "ER_IHC" "HER2_SNP6"
## [9] "HORMONE_THERAPY" "INFERRED_MENOPAUSAL_STATE"
## [11] "SEX" "INTCLUST"
## [13] "AGE_AT_DIAGNOSIS" "OS_MONTHS"
## [15] "OS_STATUS" "CLAUDIN_SUBTYPE"
## [17] "THREEGENE" "VITAL_STATUS"
## [19] "LATERALITY" "RADIO_THERAPY"
## [21] "HISTOLOGICAL_SUBTYPE" "BREAST_SURGERY"
## [23] "RFS_MONTHS" "RFS_STATUS"
# As Cancer type is a categorical data.
# Create a table to store the frequency of the cancer types.
CancerType_count_table <- table(sample_data$CANCER_TYPE_DETAILED)
CancerType_count_table
##
## Breast
## 21
## Breast Angiosarcoma
## 2
## Breast Invasive Ductal Carcinoma
## 1865
## Breast Invasive Lobular Carcinoma
## 192
## Breast Invasive Mixed Mucinous Carcinoma
## 25
## Breast Mixed Ductal and Lobular Carcinoma
## 269
## Invasive Breast Carcinoma
## 133
## Metaplastic Breast Cancer
## 2
bp <- barplot(CancerType_count_table,
col="blue",
main="Breast Cancer Type Distribution",
ylab="Number of patients",
ylim=c(0,max(CancerType_count_table)* 1.1),
las = 2,
cex.names=0.3)
text(
x = bp,
y = CancerType_count_table,
labels = CancerType_count_table,
pos = 3, # Position above the bar
cex = 0.8 # Font size for labels
)
Cancer/Tumor Stage
tumor_stage_count <- table(sample_data$TUMOR_STAGE)
tumor_bp <- barplot(tumor_stage_count,
col='red',
main="Tumor Stage Distribution",
xlab="Tumor Stage",
ylab='Patient Frequency',
ylim=c(0,max(tumor_stage_count)*1.2))
text(
x = tumor_bp,
y = tumor_stage_count,
labels = tumor_stage_count,
pos = 3, # Position above the bar
cex = 0.8 # Font size for labels
)
Age Group At Diagnosis:
As age is continuous and include decimal values histogram would be an excellent choice for visualizing the distribution of age at diagnosis.
# Remove NAs and set up age bins
age_data <- na.omit(patient_data$AGE_AT_DIAGNOSIS)
min_age <- floor(min(age_data))
max_age <- ceiling(max(age_data))
age_bins <- seq(min_age, max_age + 5, by = 5) # It Ensures all data are included
# Create the histogram and suppress default x-axis
hist_obj <- hist(
age_data,
breaks = age_bins,
col = "dodgerblue",
border = "white",
main = "Age at Diagnosis",
xlab = "Age at Diagnosis (years)",
ylab = "Number of Patients",
xaxt = "n"
)
# Add custom x-axis ticks at the bin edges (start/end of each bar)
axis(1, at = hist_obj$breaks, labels = FALSE)
# Add the labels at the bin edges, slanted at 45 degrees and right-aligned
text(
x = hist_obj$breaks,
y = par("usr")[3] - 0.03 * max(hist_obj$counts),
labels = hist_obj$breaks,
srt = 45,
adj = 1,
xpd = TRUE,
cex = 0.8
)
# Show number of NAs
num_na <- sum(is.na(patient_data$AGE_AT_DIAGNOSIS))
if (num_na > 0) {
mtext(paste("NA:", num_na), side = 3, adj = 1, line = -1, cex = 0.8, col = "gray40")
}
mutation_frequency_table <- table(mutation_data$Hugo_Symbol)
# Sort the table in decreasing order
mutation_frequency_sorted <- sort(mutation_frequency_table, decreasing = TRUE)
# Display the top 10 most frequently mutated genes
head(mutation_frequency_sorted, 10)
##
## PIK3CA TP53 AHNAK2 MUC16 SYNE1 KMT2C MAP3K1 AHNAK GATA3 DNAH11
## 670 552 513 405 294 220 218 202 188 186
somatic_mutation_frequency_count <- table(mutation_data$Variant_Classification)
sorted.somatic_mutation_frequency_count <- sort(somatic_mutation_frequency_count, decreasing = TRUE)
head(sorted.somatic_mutation_frequency_count)
##
## Missense_Mutation Silent Nonsense_Mutation Frame_Shift_Del
## 6204 2501 522 515
## Frame_Shift_Ins Splice_Site
## 299 237
file.CGC <- file.path("C:/Users/Lenovo/Documents/NAAMI/Cancer_Gene_Census_allFri_Jun_20_03_26_10_2025.tsv"
)
CGC_data <- read.delim(file.CGC, header = TRUE, sep = "\t", stringsAsFactors = FALSE)
# From CGC Extract the column containing gene symbols
CGC_gene <- unique(CGC_data$Gene.Symbol)
# Filter your mutation data to keep only mutations in CGC genes
mutation_CGC <- mutation_data[mutation_data$Hugo_Symbol %in% CGC_gene,]
# Count the number of mutations for each CGC gene
mutation_counts <- table(mutation_CGC$Hugo_Symbol)
sorted.mutation_counts <- sort(mutation_counts, decreasing = TRUE)
top20_genes <- sorted.mutation_counts[1:20]
View(top20_genes)
#Visualizing top20_genes
bp <- barplot(
top20_genes,
las = 2, # Rotate gene names for readability
col = "pink",
ylim=c(0,max(top20_genes)*1.2),
main = "Top 20 Mutated Genes",
ylab = "Mutation Count",
cex.names = 1
)
# Add value labels over the bars
text(
x = bp,
y = top20_genes,
labels = top20_genes,
pos = 3, # Position text above bars
cex = 0.8,
col = "black"
)
4 What is the overall mutation burden (tumor mutational burden/TMB) across samples, and how does it vary? Consider only SNVs for this analysis.
Tumor mutation burden is the total number of mutations found within a defined region genome. [ The total number of mutations found within the DNA of cancer cells.]
Single Nucleotide Variants (SNVs) are the most common type of genetic variation where a single nucleotide (A, T, C, or G) in a DNA sequence is altered.
View(table(mutation_data$Variant_Classification))
Only Missense_Mutation, Nonsense_Mutation, Silent, and Nonstop_Mutation from this list are explicitly categorized as SNVs
Other mutation variants in the list (eg: Frame_Shift_Del, Intron, Splice_Site, In_Frame_Del, etc.) involve insertions, deletions, or changes affecting one or more than one nucleotide or non-coding regions, and thus they are not always SNVs.
From ‘mutation_data’ the Genome Build (NCBI_Build) for the samples is: GRCh37
For GRCh37, the genome size is 3.1 Billion Base Pairs (3100 Mb) and its coding region size is typically around 30 to 33 Mb.
The standard practice to calculate TMB is : the number of mutations per megabase (Mb) of coding region, not per the full genome size.
#---- 4. Tumor Mutational Burden (TMB) ----
# Define SNV mutation types
snv_types <- c("Missense_Mutation", "Nonsense_Mutation", "Silent",
"Nonstop_Mutation")
# Filter mutation_data for SNVs only
snv_data <- mutation_data[mutation_data$Variant_Classification %in% snv_types, ]
# Calculate TMB per sample
# Assume coding region size = 30 Mb
coding_region_mb <- 30
tmb_per_sample <- as.data.frame(table(snv_data$Tumor_Sample_Barcode))
colnames(tmb_per_sample) <- c("Sample", "SNV_Count")
tmb_per_sample$TMB <- tmb_per_sample$SNV_Count / coding_region_mb
# Visualize TMB distribution
hist(tmb_per_sample$TMB,
breaks = 30,
col = "skyblue",
main = "Tumor Mutational Burden (TMB) Across Samples",
xlab = "TMB (Mutations per Mb)",
ylab = "Number of Samples")
file.data_cna <- file.path(dir.data,"data_cna.txt")
CNA_data <-read.table(file=file.data_cna,sep="\t",header=TRUE,
stringsAsFactors=FALSE)
file.cna_metadata <- file.path(dir.data,"meta_cna.txt")
CNA_metadata <- read.table(file=file.cna_metadata,sep="\t",header=TRUE ,stringsAsFactors=FALSE)
cna_data <-read.table(file=file.data_cna,sep="\t",header=TRUE,
stringsAsFactors=FALSE)
CNV and CNA nearly have the same meaning.
In your CNA gene matrix, each row is a gene, and each column (except the first two) is a sample.
The values show CNV status per gene in each sample (-2, -1: deletions; 0: neutral; 1, 2: amplifications).
Visualize the Copy Number Variation (CNV) data across the entire genome for the tumor cohort.
# 1. Assume cna_data is your loaded CNV data frame with columns:
# Hugo_Symbol, Entrez_Gene_Id, Sample1, Sample2, ..., SampleN
library(pheatmap)
# 2. Remove extra columns, set rownames
mat <- as.matrix(cna_data[, -(1:2)])
rownames(mat) <- make.unique(cna_data$Hugo_Symbol)
# 3. (Optional but recommended) Order genes by chromosome for genome-style display
# If you have chromosome info, order by it. If not, just proceed.
# 4. (Optional) Subset to most variable genes for better readability, e.g., top 30
gene_sd <- apply(mat, 1, sd)
top_genes <- order(gene_sd, decreasing=TRUE)[1:40]
mat_top <- mat[top_genes, ]
# 5. Heatmap plot
pheatmap(
mat_top,
color = colorRampPalette(c("blue", "white", "red"))(50),
cluster_rows = T, # set TRUE if you want clustering
cluster_cols = FALSE, # set TRUE if you want clustering
show_rownames = TRUE,
show_colnames = FALSE,
cex = 0.8,
main = "Genome-wide Copy Number Variation (Top 40 Genes)"
)
Which genes or chromosomal regions are most frequently amplified or deleted in the cancer cohort?
cnv_values <- as.matrix(cna_data[, -(1:2)])
rownames(cnv_values) <- cna_data$Hugo_Symbol
ampl_count <- rowSums(cnv_values > 0, na.rm=TRUE)
del_count <- rowSums(cnv_values < 0, na.rm=TRUE)
freq_table <- data.frame(
Gene = rownames(cnv_values),
Amplified_Samples = ampl_count,
Deleted_Samples = del_count)
#Sort and select the top 100 most frequently altered genes
# For amplifications:
top100_ampl <- freq_table[order(-freq_table$Amplified_Samples), ][1:100, ]
top30_ampl <- freq_table[order(-freq_table$Amplified_Samples), ][1:30, ]
# For deletions:
top100_del <- freq_table[order(-freq_table$Deleted_Samples), ][1:100, ]
top30_del <- freq_table[order(-freq_table$Deleted_Samples), ][1:30, ]
# Plot for Amplification
library(ggplot2)
ggplot(top30_ampl, aes(x = reorder(Gene, Amplified_Samples), y = Amplified_Samples)) +
geom_bar(stat = "identity", fill = "blue") +
coord_flip() +
labs(title="Top 30 Most Frequently Amplified Genes",
x="Gene", y="Number of Samples Amplified")
# Plot for Deletion
library(ggplot2)
ggplot(top30_del, aes(x = reorder(Gene, Deleted_Samples), y = Deleted_Samples)) +
geom_bar(stat = "identity", fill = "red") +
coord_flip() +
labs(title="Top 30 Most Frequently Deleted Genes",
x="Gene", y="Number of Samples ")
4. CNV within the CGC genes ?
#filter Extract only those rows (genes) in the CNV(CNA) matrix that are present in the CGC gene list
cna_cgc <- cna_data[cna_data$Hugo_Symbol %in% CGC_gene, ]
# Remove non-numeric columns
cna_matrix <- as.matrix(cna_cgc[, -(1:2)]) # Example: skip Hugo_Symbol & Entrez_Gene_Id
rownames(cna_matrix) <- cna_cgc$Hugo_Symbol
# Count amplifications and deletions per gene
ampl_count <- rowSums(cna_matrix > 0, na.rm = TRUE)
del_count <- rowSums(cna_matrix < 0, na.rm = TRUE)
# Combine results
result_table <- data.frame(
Gene = rownames(cna_matrix),
Amplified_Samples = ampl_count,
Deleted_Samples = del_count
)
top_ampl <- result_table[order(-result_table$Amplified_Samples), ][1:20, ]
top_del <- result_table[order(-result_table$Deleted_Samples), ][1:20, ]
library(ggplot2)
ggplot(top_ampl, aes(x = reorder(Gene, Amplified_Samples), y = Amplified_Samples)) +
geom_bar(stat = "identity", fill = "red") +
coord_flip() +
labs(title = "Top 20 Most Frequently Amplified CGC Genes",
x = "Gene", y = "Number of Samples (Amplified)")
ggplot(top_del, aes(x = reorder(Gene, Deleted_Samples), y = Deleted_Samples)) +
geom_bar(stat = "identity", fill = "blue") +
coord_flip() +
labs(title = "Top 20 Most Frequently Deleted CGC Genes",
x = "Gene", y = "Number of Samples (Deleted)")
Are there CNV hotspots that recur across many samples?
= A CNV hotspot is a genomic region that is amplified or deleted in a large number of samples
# Use your processed CNV matrix, eg:
cnv_matrix <- as.matrix(cna_data[,-c(1,2)]) # Remove non-numeric columns
rownames(cnv_matrix) <- make.unique(cna_data$Hugo_Symbol)
amp_freq <- rowSums(cnv_matrix > 0, na.rm=TRUE)
del_freq <- rowSums(cnv_matrix < 0, na.rm=TRUE)
hotspot_table <- data.frame(
Gene = rownames(cnv_matrix),
Amplified_Samples = amp_freq,
Deleted_Samples = del_freq
)top_amplified <- hotspot_table[order(-hotspot_table$Amplified_Samples), ][1:20, ]
top_deleted <- hotspot_table[order(-hotspot_table$Deleted_Samples), ][1:20, ]
library(ggplot2)
# Amplification hotspots
ggplot(top_amplified, aes(x=reorder(Gene, Amplified_Samples), y=Amplified_Samples)) +
geom_bar(stat="identity", fill="tomato") + coord_flip() +
labs(title = "Top 20 Amplification Hotspots across the Cohort",
x = "Gene", y = "Number of Samples (Amplified)")
# Deletion hotspots
ggplot(top_deleted, aes(x=reorder(Gene, Deleted_Samples), y=Deleted_Samples)) +
geom_bar(stat="identity", fill="steelblue") + coord_flip() +
labs(title = "Top 20 Deletion Hotspots across the Cohort",
x = "Gene", y = "Number of Samples (Deleted)")