Setting Environment and File Paths
# Show the working directory; the data paths below are resolved against it.
getwd()
## [1] "C:/Users/acer/Desktop/Project/NAAMII_Prostate_Cancer_Analysis"
# Build the data directory from the working directory instead of a hard-coded
# absolute user path, so the analysis can run from any copy of the project.
# With the working directory shown above this yields the same path as before.
dir.data <- file.path(getwd(), "prad_mskcc")
# Patient- and sample-level clinical tables.
file.patient_dat <- file.path(dir.data, "data_clinical_patient.txt")
file.sample_dat <- file.path(dir.data, "data_clinical_sample.txt")
# Meta files accompanying the clinical tables.
file.patient_meta <- file.path(dir.data, "meta_clinical_patient.txt")
file.sample_meta <- file.path(dir.data, "meta_clinical_sample.txt")
# Mutation table and its meta file.
file.mutation_dat <- file.path(dir.data, "data_mutations.txt")
file.mutation_meta <- file.path(dir.data, "meta_mutations.txt")
Loading the data
# Patient-level clinical table, parsed as tab-separated text with a header
# row; character columns are kept as character rather than factors.
patient_data <- read.table(
  file = file.patient_dat,
  sep = "\t",
  header = TRUE,
  stringsAsFactors = FALSE
)
# Meta file for the patient table, parsed with the same settings.
patient_metadata <- read.table(
  file = file.patient_meta,
  sep = "\t",
  header = TRUE,
  stringsAsFactors = FALSE
)
## Warning in read.table(file = file.patient_meta, sep = "\t", header = TRUE, :
## incomplete final line found by readTableHeader on
## 'C:/Users/acer/Desktop/Project/NAAMII_Prostate_Cancer_Analysis/prad_mskcc/meta_clinical_patient.txt'
# Sample-level clinical table, parsed as tab-separated text with a header
# row; character columns are kept as character rather than factors.
sample_data <- read.table(
  file = file.sample_dat,
  sep = "\t",
  header = TRUE,
  stringsAsFactors = FALSE
)
# Meta file for the sample table, parsed with the same settings.
sample_metadata <- read.table(
  file = file.sample_meta,
  sep = "\t",
  header = TRUE,
  stringsAsFactors = FALSE
)
## Warning in read.table(file = file.sample_meta, sep = "\t", header = TRUE, :
## incomplete final line found by readTableHeader on
## 'C:/Users/acer/Desktop/Project/NAAMII_Prostate_Cancer_Analysis/prad_mskcc/meta_clinical_sample.txt'
# Mutation table (tab-separated; the Hugo_Symbol and Variant_Classification
# columns used later suggest a MAF-style file).
# quote = "" turns off quote handling: read.table()'s default quote set
# includes the apostrophe, so a value such as "3'UTR" would otherwise open a
# quoted string and silently merge the fields/rows that follow it.
mutation_data <- read.table(
  file = file.mutation_dat,
  sep = "\t",
  header = TRUE,
  quote = "",
  stringsAsFactors = FALSE
)
# Meta file for the mutation table, read exactly as before.
# NOTE(review): cBioPortal-style meta files are usually "key: value" lines
# rather than a tab-separated table -- confirm this parse gives the intended
# structure.
mutation_metadata <- read.table(
  file = file.mutation_meta,
  sep = "\t",
  header = TRUE,
  stringsAsFactors = FALSE
)
Patient Cohort:
# List the sample-level columns available for the cohort summary.
names(sample_data)
## [1] "SAMPLE_ID" "PATIENT_ID" "SAMPLE_CLASS"
## [4] "SAMPLE_TYPE" "GLEASON_SCORE_1" "GLEASON_SCORE_2"
## [7] "GLEASON_SCORE" "ERG_FUSION_ACGH" "ERG_FUSION_GEX"
## [10] "SEQUENCED" "COMPLETE_DATA" "CANCER_TYPE"
## [13] "CANCER_TYPE_DETAILED" "ONCOTREE_CODE" "SEQUENCING"
## [16] "SOMATIC_STATUS" "TMB_NONSYNONYMOUS"
# Count patients (not sample rows) per detailed cancer subtype.
# sample_data carries both SAMPLE_ID and PATIENT_ID, so tabulating its rows
# directly counts samples; a patient with several samples would be counted
# more than once even though the axis is labelled "Number of Patients".
# De-duplicating on (PATIENT_ID, subtype) counts each patient once per subtype.
patient_subtypes <- unique(
  sample_data[, c("PATIENT_ID", "CANCER_TYPE_DETAILED")]
)
cancertype_tab <- table(patient_subtypes$CANCER_TYPE_DETAILED)
# barplot() returns the bar midpoints, reused below as label x-positions.
bp_cancer_subtype <- barplot(
  cancertype_tab,
  col = 'purple',
  main = 'Prostate Cancer Type',
  ylab = 'Number of Patients',
  # 10% headroom so the count label above the tallest bar stays in the plot.
  ylim = c(0, max(cancertype_tab) * 1.1)
)
# Print each count just above its bar.
text(
  x = bp_cancer_subtype,
  y = cancertype_tab,
  labels = cancertype_tab,
  pos = 3,   # Position above the bar
  cex = 0.8  # Font size for labels
)
Tumor stage: two different tumor stagings are present, clinical and pathological.
# Tabulate the clinical T stage values found in patient_data.
clin_tumor_stage_count <- table(patient_data[["CLIN_T_STAGE"]])
# barplot() returns the bar midpoints, reused below as label x-positions.
bp_clin_tumor_stage <- barplot(
  clin_tumor_stage_count,
  main = "Clinical Tumor Stage Distribution",
  xlab = "Tumor Stage",
  ylab = "Patient Frequency",
  col = "yellow",
  # 20% headroom so the count label above the tallest bar stays in the plot.
  ylim = c(0, 1.2 * max(clin_tumor_stage_count))
)
# Print each count just above its bar (pos = 3), in a slightly smaller font.
text(
  bp_clin_tumor_stage,
  clin_tumor_stage_count,
  labels = clin_tumor_stage_count,
  pos = 3,
  cex = 0.8
)
# Tabulate the pathological T stage values found in patient_data.
path_tumor_stage_count <- table(patient_data[["PATH_T_STAGE"]])
# barplot() returns the bar midpoints, reused below as label x-positions.
bp_path_tumor_stage <- barplot(
  path_tumor_stage_count,
  main = "Pathological Tumor Stage Distribution",
  xlab = "Tumor Stage",
  ylab = "Patient Frequency",
  col = "cyan",
  # 10% headroom so the count label above the tallest bar stays in the plot.
  ylim = c(0, 1.1 * max(path_tumor_stage_count))
)
# Print each count just above its bar (pos = 3), in a slightly smaller font.
text(
  bp_path_tumor_stage,
  path_tumor_stage_count,
  labels = path_tumor_stage_count,
  pos = 3,
  cex = 0.8
)
# (It would be possible to make a comparative bar plot of clinical and pathological stages.)
Age group at diagnosis
# List the patient-level columns to look for an age-at-diagnosis field.
names(patient_data)
## [1] "PATIENT_ID" "CLIN_T_STAGE" "PATH_T_STAGE"
## [4] "COPY_NUMBER_CLUSTER" "OS_MONTHS" "OS_STATUS"
## [7] "DFS_MONTHS" "DFS_STATUS" "SEX"
# List the sample-level columns to look for an age-at-diagnosis field.
names(sample_data)
## [1] "SAMPLE_ID" "PATIENT_ID" "SAMPLE_CLASS"
## [4] "SAMPLE_TYPE" "GLEASON_SCORE_1" "GLEASON_SCORE_2"
## [7] "GLEASON_SCORE" "ERG_FUSION_ACGH" "ERG_FUSION_GEX"
## [10] "SEQUENCED" "COMPLETE_DATA" "CANCER_TYPE"
## [13] "CANCER_TYPE_DETAILED" "ONCOTREE_CODE" "SEQUENCING"
## [16] "SOMATIC_STATUS" "TMB_NONSYNONYMOUS"
# Age-at-diagnosis data is not present in this cohort.
Somatic Mutation Landscape:
Most frequently mutated genes:
# Tally mutation records per gene symbol.
# NOTE(review): this counts rows of mutation_data, so a gene with several
# mutation records in one sample is counted once per record -- confirm
# whether per-sample counts are intended.
mutation_frequency_table <- table(mutation_data[["Hugo_Symbol"]])
# Order genes from the most to the fewest records.
mutation_frequency_sorted <- sort(
  mutation_frequency_table,
  decreasing = TRUE
)
# Show the ten genes with the most records.
head(mutation_frequency_sorted, n = 10)
##
## PTPRF AR AXL NCOR2 EP300 FLT4 APC CDH1 JAK1 PIK3CG
## 10 7 7 7 6 6 5 5 5 5
# Most common types of mutation: tally records per variant classification.
somatic_mutation_frequency_count <- table(
  mutation_data[["Variant_Classification"]]
)
# Order the classifications from the most to the fewest records.
sorted.somatic_mutation_frequency_count <- sort(
  somatic_mutation_frequency_count,
  decreasing = TRUE
)
# Show the leading classifications (head() defaults to the first six).
head(sorted.somatic_mutation_frequency_count)
##
## Missense_Mutation Nonsense_Mutation Frame_Shift_Del Splice_Region
## 242 18 15 9
## Splice_Site Intron
## 6 5
#Mutational frequency of the Cancer Gene Census (CGC) genes (list top-20 most frequently mutated CGC genes)