Setting Environment and File Paths

# Resolve the study data folder from the current working directory rather than
# a machine-specific absolute path, so the analysis runs on any checkout as
# long as it is started from the project root (the directory echoed below).
getwd()
## [1] "C:/Users/acer/Desktop/Project/NAAMII_Prostate_Cancer_Analysis"
dir.data <- file.path(getwd(), "prad_mskcc")

# Clinical tables: one at patient level, one at sample level.
file.patient_dat <- file.path(dir.data, "data_clinical_patient.txt")
file.sample_dat <- file.path(dir.data, "data_clinical_sample.txt")

# Metadata files that accompany the clinical tables.
file.patient_meta <- file.path(dir.data, "meta_clinical_patient.txt")
file.sample_meta <- file.path(dir.data, "meta_clinical_sample.txt")

# Mutation table and its metadata file.
file.mutation_dat <- file.path(dir.data, "data_mutations.txt")
file.mutation_meta <- file.path(dir.data, "meta_mutations.txt")

Loading the data

# Read the tab-delimited study tables into data frames.
#
# For the three data tables quoting is disabled (quote = ""): read.table's
# default quote set contains both ' and ", so a single apostrophe inside a
# field would otherwise be treated as the start of a quoted string and swallow
# the following fields/rows. The default comment.char = "#" is kept, so lines
# starting with '#' are still skipped.
# NOTE(review): assumes the data files do not rely on quoted fields -- confirm.
patient_data <- read.table(file = file.patient_dat, sep = "\t", header = TRUE,
                           quote = "", stringsAsFactors = FALSE)

# NOTE(review): the meta_* files are read with the first line taken as the
# header; confirm this matches their actual layout before relying on the
# resulting columns. The call is left unchanged so downstream code sees the
# same object.
patient_metadata <- read.table(file=file.patient_meta,sep="\t",header=TRUE,
                               stringsAsFactors=FALSE)
## Warning in read.table(file = file.patient_meta, sep = "\t", header = TRUE, :
## incomplete final line found by readTableHeader on
## 'C:/Users/acer/Desktop/Project/NAAMII_Prostate_Cancer_Analysis/prad_mskcc/meta_clinical_patient.txt'
sample_data <- read.table(file = file.sample_dat, sep = "\t", header = TRUE,
                          quote = "", stringsAsFactors = FALSE)

sample_metadata <- read.table(file=file.sample_meta,sep="\t",header=TRUE,
                              stringsAsFactors=FALSE)
## Warning in read.table(file = file.sample_meta, sep = "\t", header = TRUE, :
## incomplete final line found by readTableHeader on
## 'C:/Users/acer/Desktop/Project/NAAMII_Prostate_Cancer_Analysis/prad_mskcc/meta_clinical_sample.txt'
mutation_data <- read.table(file = file.mutation_dat, sep = "\t", header = TRUE,
                            quote = "", stringsAsFactors = FALSE)

mutation_metadata <- read.table(file=file.mutation_meta,sep="\t",header=TRUE,
                                stringsAsFactors=FALSE)

Patient Cohort:

# Show which sample-level columns are available.
names(sample_data)
##  [1] "SAMPLE_ID"            "PATIENT_ID"           "SAMPLE_CLASS"        
##  [4] "SAMPLE_TYPE"          "GLEASON_SCORE_1"      "GLEASON_SCORE_2"     
##  [7] "GLEASON_SCORE"        "ERG_FUSION_ACGH"      "ERG_FUSION_GEX"      
## [10] "SEQUENCED"            "COMPLETE_DATA"        "CANCER_TYPE"         
## [13] "CANCER_TYPE_DETAILED" "ONCOTREE_CODE"        "SEQUENCING"          
## [16] "SOMATIC_STATUS"       "TMB_NONSYNONYMOUS"
# Tally rows of the sample table per detailed cancer type.
cancertype_tab <- table(sample_data[["CANCER_TYPE_DETAILED"]])

# Draw the tallies as bars; the y-axis is extended 10% beyond the tallest bar
# so the count labels added below fit inside the plot region.
bp_cancer_subtype <- barplot(
  height = cancertype_tab,
  main = 'Prostate Cancer Type',
  ylab = 'Number of Patients',
  col = 'purple',
  ylim = c(0, 1.1 * max(cancertype_tab))
)

# barplot() returned the bar midpoints; use them as x positions to print each
# count just above its bar.
text(
  x = bp_cancer_subtype,
  y = cancertype_tab,
  labels = cancertype_tab,
  cex = 0.8, # label font size
  pos = 3    # 3 = above the given coordinate
)

Tumor stage: two different tumor stage annotations are present, clinical and pathological

# Tally patient rows per clinical T stage.
clin_tumor_stage_count <- table(patient_data[["CLIN_T_STAGE"]])

# Bar chart of the tallies, with 20% headroom on the y-axis for the labels.
bp_clin_tumor_stage <- barplot(
  height = clin_tumor_stage_count,
  main = "Clinical Tumor Stage Distribution",
  xlab = "Tumor Stage",
  ylab = 'Patient Frequency',
  col = 'yellow',
  ylim = c(0, 1.2 * max(clin_tumor_stage_count))
)

# Print each count above its bar, using the bar midpoints returned by barplot().
text(
  x = bp_clin_tumor_stage,
  y = clin_tumor_stage_count,
  labels = clin_tumor_stage_count,
  cex = 0.8, # label font size
  pos = 3    # 3 = above the given coordinate
)

# Tally patient rows per pathological T stage.
path_tumor_stage_count <- table(patient_data[["PATH_T_STAGE"]])

# Bar chart of the tallies, with 10% headroom on the y-axis for the labels.
bp_path_tumor_stage <- barplot(
  height = path_tumor_stage_count,
  main = "Pathological Tumor Stage Distribution",
  xlab = "Tumor Stage",
  ylab = 'Patient Frequency',
  col = 'cyan',
  ylim = c(0, 1.1 * max(path_tumor_stage_count))
)

# Print each count above its bar, using the bar midpoints returned by barplot().
text(
  x = bp_path_tumor_stage,
  y = path_tumor_stage_count,
  labels = path_tumor_stage_count,
  cex = 0.8, # label font size
  pos = 3    # 3 = above the given coordinate
)

#(Possible to make a comparative bar plot of clinical and pathological stages)

Age group at diagnosis

# Inspect the columns of both clinical tables to look for an age field.
names(patient_data)
## [1] "PATIENT_ID"          "CLIN_T_STAGE"        "PATH_T_STAGE"       
## [4] "COPY_NUMBER_CLUSTER" "OS_MONTHS"           "OS_STATUS"          
## [7] "DFS_MONTHS"          "DFS_STATUS"          "SEX"
names(sample_data)
##  [1] "SAMPLE_ID"            "PATIENT_ID"           "SAMPLE_CLASS"        
##  [4] "SAMPLE_TYPE"          "GLEASON_SCORE_1"      "GLEASON_SCORE_2"     
##  [7] "GLEASON_SCORE"        "ERG_FUSION_ACGH"      "ERG_FUSION_GEX"      
## [10] "SEQUENCED"            "COMPLETE_DATA"        "CANCER_TYPE"         
## [13] "CANCER_TYPE_DETAILED" "ONCOTREE_CODE"        "SEQUENCING"          
## [16] "SOMATIC_STATUS"       "TMB_NONSYNONYMOUS"
# Age at diagnosis is not present in this cohort (no such column in either table).

Somatic Mutation Landscape:

Most frequently mutated genes:

# Count how many rows of the mutation table carry each gene symbol.
mutation_frequency_table <- table(mutation_data[["Hugo_Symbol"]])

# Order the genes from most to fewest mutation rows.
mutation_frequency_sorted <- sort(
  mutation_frequency_table,
  decreasing = TRUE
)

# Show the ten genes with the most mutation rows.
head(mutation_frequency_sorted, n = 10L)
## 
##  PTPRF     AR    AXL  NCOR2  EP300   FLT4    APC   CDH1   JAK1 PIK3CG 
##     10      7      7      7      6      6      5      5      5      5
# Most common types of mutation: count mutation rows per variant
# classification, then order from most to least frequent.
somatic_mutation_frequency_count <- table(
  mutation_data[["Variant_Classification"]]
)
sorted.somatic_mutation_frequency_count <- sort(
  somatic_mutation_frequency_count,
  decreasing = TRUE
)

# Show the six most frequent classifications (head() defaults to six entries).
head(sorted.somatic_mutation_frequency_count, n = 6L)
## 
## Missense_Mutation Nonsense_Mutation   Frame_Shift_Del     Splice_Region 
##               242                18                15                 9 
##       Splice_Site            Intron 
##                 6                 5
# Mutational frequency of the Cancer Gene Census (CGC) genes
# (list top-20 most frequently mutated CGC genes)