projects <- getGDCprojects()
projects[,c("id", "project_id", "released", "tumor")]
## id project_id released tumor
## 1 CTSP-DLBCL1 CTSP-DLBCL1 TRUE DLBCL1
## 2 TCGA-BRCA TCGA-BRCA TRUE BRCA
## 3 TCGA-LUAD TCGA-LUAD TRUE LUAD
## 4 CPTAC-3 CPTAC-3 TRUE 3
## 5 APOLLO-LUAD APOLLO-LUAD TRUE LUAD
## 6 MATCH-B MATCH-B TRUE B
## 7 CMI-ASC CMI-ASC TRUE ASC
## 8 MATCH-C1 MATCH-C1 TRUE C1
## 9 BEATAML1.0-CRENOLANIB BEATAML1.0-CRENOLANIB TRUE CRENOLANIB
## 10 CDDP_EAGLE-1 CDDP_EAGLE-1 TRUE 1
## 11 CMI-MPC CMI-MPC TRUE MPC
## 12 MATCH-S1 MATCH-S1 TRUE S1
## 13 MATCH-W MATCH-W TRUE W
## 14 MATCH-Z1D MATCH-Z1D TRUE Z1D
## 15 MATCH-Z1A MATCH-Z1A TRUE Z1A
## 16 MATCH-Y MATCH-Y TRUE Y
## 17 MATCH-U MATCH-U TRUE U
## 18 MATCH-Z1B MATCH-Z1B TRUE Z1B
## 19 MATCH-S2 MATCH-S2 TRUE S2
## 20 FM-AD FM-AD TRUE AD
## 21 VAREPOP-APOLLO VAREPOP-APOLLO TRUE APOLLO
## 22 MATCH-I MATCH-I TRUE I
## 23 MATCH-P MATCH-P TRUE P
## 24 MATCH-R MATCH-R TRUE R
## 25 MATCH-N MATCH-N TRUE N
## 26 MATCH-Q MATCH-Q TRUE Q
## 27 MATCH-H MATCH-H TRUE H
## 28 CMI-MBC CMI-MBC TRUE MBC
## 29 MATCH-Z1I MATCH-Z1I TRUE Z1I
## 30 BEATAML1.0-COHORT BEATAML1.0-COHORT TRUE COHORT
## 31 OHSU-CNL OHSU-CNL TRUE CNL
## 32 ORGANOID-PANCREATIC ORGANOID-PANCREATIC TRUE PANCREATIC
## 33 NCICCR-DLBCL NCICCR-DLBCL TRUE DLBCL
## 34 CPTAC-2 CPTAC-2 TRUE 2
## 35 TRIO-CRU TRIO-CRU TRUE CRU
## 36 MMRF-COMMPASS MMRF-COMMPASS TRUE COMMPASS
## 37 WCDT-MCRPC WCDT-MCRPC TRUE MCRPC
## 38 MP2PRT-WT MP2PRT-WT TRUE WT
## 39 MP2PRT-ALL MP2PRT-ALL TRUE ALL
## 40 REBC-THYR REBC-THYR TRUE THYR
## 41 APOLLO-OV APOLLO-OV TRUE OV
## 42 CCDI-MCI CCDI-MCI TRUE MCI
## 43 TCGA-DLBC TCGA-DLBC TRUE DLBC
## 44 TCGA-COAD TCGA-COAD TRUE COAD
## 45 TCGA-CESC TCGA-CESC TRUE CESC
## 46 TCGA-CHOL TCGA-CHOL TRUE CHOL
## 47 TCGA-ESCA TCGA-ESCA TRUE ESCA
## 48 TCGA-LIHC TCGA-LIHC TRUE LIHC
## 49 TCGA-MESO TCGA-MESO TRUE MESO
## 50 TCGA-KIRP TCGA-KIRP TRUE KIRP
## 51 TCGA-LAML TCGA-LAML TRUE LAML
## 52 TCGA-PCPG TCGA-PCPG TRUE PCPG
## 53 TCGA-HNSC TCGA-HNSC TRUE HNSC
## 54 CGCI-HTMCP-DLBCL CGCI-HTMCP-DLBCL TRUE HTMCP
## 55 TCGA-READ TCGA-READ TRUE READ
## 56 TCGA-PAAD TCGA-PAAD TRUE PAAD
## 57 TCGA-UCS TCGA-UCS TRUE UCS
## 58 TCGA-KIRC TCGA-KIRC TRUE KIRC
## 59 TCGA-GBM TCGA-GBM TRUE GBM
## 60 TCGA-KICH TCGA-KICH TRUE KICH
## 61 EXCEPTIONAL_RESPONDERS-ER EXCEPTIONAL_RESPONDERS-ER TRUE ER
## 62 CGCI-HTMCP-LC CGCI-HTMCP-LC TRUE HTMCP
## 63 TARGET-OS TARGET-OS TRUE OS
## 64 TARGET-ALL-P3 TARGET-ALL-P3 TRUE ALL
## 65 CGCI-BLGSP CGCI-BLGSP TRUE BLGSP
## 66 TCGA-THYM TCGA-THYM TRUE THYM
## 67 TCGA-UVM TCGA-UVM TRUE UVM
## 68 TARGET-RT TARGET-RT TRUE RT
## 69 TARGET-CCSK TARGET-CCSK TRUE CCSK
## 70 TARGET-NBL TARGET-NBL TRUE NBL
## 71 TCGA-SKCM TCGA-SKCM TRUE SKCM
## 72 TCGA-THCA TCGA-THCA TRUE THCA
## 73 TCGA-STAD TCGA-STAD TRUE STAD
## 74 TARGET-ALL-P2 TARGET-ALL-P2 TRUE ALL
## 75 TCGA-ACC TCGA-ACC TRUE ACC
## 76 TARGET-ALL-P1 TARGET-ALL-P1 TRUE ALL
## 77 TARGET-WT TARGET-WT TRUE WT
## 78 TCGA-LGG TCGA-LGG TRUE LGG
## 79 HCMI-CMDC HCMI-CMDC TRUE CMDC
## 80 TCGA-SARC TCGA-SARC TRUE SARC
## 81 CGCI-HTMCP-CC CGCI-HTMCP-CC TRUE HTMCP
## 82 TCGA-OV TCGA-OV TRUE OV
## 83 TCGA-BLCA TCGA-BLCA TRUE BLCA
## 84 TCGA-UCEC TCGA-UCEC TRUE UCEC
## 85 TCGA-PRAD TCGA-PRAD TRUE PRAD
## 86 TARGET-AML TARGET-AML TRUE AML
## 87 TCGA-TGCT TCGA-TGCT TRUE TGCT
## 88 TCGA-LUSC TCGA-LUSC TRUE LUSC
packageVersion("TCGAbiolinks")
## [1] '2.38.0'
# added 4 lines of code to original chunk to filter TCGA projects only (33 total)
tcga_projects <- projects[grepl("^TCGA-", projects$project_id), ]
id <- tcga_projects$project_id
smpls <- list()
for(i in 1:length(id)){
temp <- NULL
query_Target <- NULL
tryCatch({
query_Target <- suppressMessages(GDCquery(project = id[i],
data.category = "Transcriptome Profiling",
data.type = "Gene Expression Quantification",
workflow.type = "STAR - Counts"))
},
error = function(e) NULL # line added for silent error handling
)
if(!is.null(query_Target)){
samplesDown_Target <- getResults(query_Target)
if("sample_type" %in% colnames(samplesDown_Target)) { # line added to check for missing sample_type
temp[[1]] <- table(samplesDown_Target$sample_type)
} else {
temp[[1]] <- NA
}
names(temp) <- id[i]
} else {
temp[[1]] <- NA
names(temp) <- id[i]
}
smpls <- c(smpls, temp)
}
smpls
## $`TCGA-BRCA`
##
## Metastatic Primary Tumor Solid Tissue Normal
## 7 1111 113
##
## $`TCGA-LUAD`
##
## Primary Tumor Recurrent Tumor Solid Tissue Normal
## 540 2 59
##
## $`TCGA-DLBC`
##
## Primary Tumor
## 48
##
## $`TCGA-COAD`
##
## Metastatic Primary Tumor Recurrent Tumor Solid Tissue Normal
## 1 481 1 41
##
## $`TCGA-CESC`
##
## Metastatic Primary Tumor Solid Tissue Normal
## 2 304 3
##
## $`TCGA-CHOL`
##
## Primary Tumor Solid Tissue Normal
## 35 9
##
## $`TCGA-ESCA`
##
## Metastatic Primary Tumor Solid Tissue Normal
## 1 184 13
##
## $`TCGA-LIHC`
##
## Primary Tumor Recurrent Tumor Solid Tissue Normal
## 371 3 50
##
## $`TCGA-MESO`
##
## Primary Tumor
## 87
##
## $`TCGA-KIRP`
##
## Additional - New Primary Primary Tumor Solid Tissue Normal
## 1 290 32
##
## $`TCGA-LAML`
##
## Primary Blood Derived Cancer - Peripheral Blood
## 151
##
## $`TCGA-PCPG`
##
## Additional - New Primary Metastatic Primary Tumor
## 3 2 179
## Solid Tissue Normal
## 3
##
## $`TCGA-HNSC`
##
## Metastatic Primary Tumor Solid Tissue Normal
## 2 520 44
##
## $`TCGA-READ`
##
## Primary Tumor Recurrent Tumor Solid Tissue Normal
## 166 1 10
##
## $`TCGA-PAAD`
##
## Metastatic Primary Tumor Solid Tissue Normal
## 1 178 4
##
## $`TCGA-UCS`
##
## Primary Tumor
## 57
##
## $`TCGA-KIRC`
##
## Additional - New Primary Primary Tumor Solid Tissue Normal
## 1 541 72
##
## $`TCGA-GBM`
##
## Primary Tumor Recurrent Tumor Solid Tissue Normal
## 372 14 5
##
## $`TCGA-KICH`
##
## Primary Tumor Solid Tissue Normal
## 66 25
##
## $`TCGA-THYM`
##
## Primary Tumor Solid Tissue Normal
## 120 2
##
## $`TCGA-UVM`
##
## Primary Tumor
## 80
##
## $`TCGA-SKCM`
##
## Additional Metastatic Metastatic Primary Tumor
## 1 368 103
## Solid Tissue Normal
## 1
##
## $`TCGA-THCA`
##
## Metastatic Primary Tumor Solid Tissue Normal
## 8 505 59
##
## $`TCGA-STAD`
##
## Primary Tumor Solid Tissue Normal
## 412 36
##
## $`TCGA-ACC`
##
## Primary Tumor
## 79
##
## $`TCGA-LGG`
##
## Primary Tumor Recurrent Tumor
## 516 18
##
## $`TCGA-SARC`
##
## Metastatic Primary Tumor Recurrent Tumor Solid Tissue Normal
## 1 259 3 2
##
## $`TCGA-OV`
##
## Primary Tumor Recurrent Tumor
## 426 8
##
## $`TCGA-BLCA`
##
## Primary Tumor Solid Tissue Normal
## 412 19
##
## $`TCGA-UCEC`
##
## Primary Tumor Recurrent Tumor Solid Tissue Normal
## 553 1 35
##
## $`TCGA-PRAD`
##
## Metastatic Primary Tumor Solid Tissue Normal
## 1 501 52
##
## $`TCGA-TGCT`
##
## Additional - New Primary Primary Tumor
## 6 150
##
## $`TCGA-LUSC`
##
## Primary Tumor Solid Tissue Normal
## 511 51
smpls[[1]]
##
## Metastatic Primary Tumor Solid Tissue Normal
## 7 1111 113
posIDs <- c("TCGA-DLBC","TCGA-LUAD","TCGA-COAD","TCGA-BRCA")
as_tibble(projects[projects$id %in% posIDs, ])
## # A tibble: 4 × 10
## id primary_site dbgap_accession_number project_id disease_type name
## <chr> <list> <chr> <chr> <list> <chr>
## 1 TCGA-BRCA <chr [1]> <NA> TCGA-BRCA <chr [9]> Breast …
## 2 TCGA-LUAD <chr [1]> <NA> TCGA-LUAD <chr [4]> Lung Ad…
## 3 TCGA-DLBC <chr [14]> <NA> TCGA-DLBC <chr [2]> Lymphoi…
## 4 TCGA-COAD <chr [2]> <NA> TCGA-COAD <chr [4]> Colon A…
## # ℹ 4 more variables: releasable <lgl>, state <chr>, released <lgl>,
## # tumor <chr>
smpls[names(smpls) %in% posIDs[3:4]]
## $`TCGA-BRCA`
##
## Metastatic Primary Tumor Solid Tissue Normal
## 7 1111 113
##
## $`TCGA-COAD`
##
## Metastatic Primary Tumor Recurrent Tumor Solid Tissue Normal
## 1 481 1 41
TCGAbiolinks:::getProjectSummary("TCGA-BRCA")
## $file_count
## [1] 70776
##
## $data_categories
## file_count case_count data_category
## 1 21134 1098 Simple Nucleotide Variation
## 2 9282 1098 Sequencing Reads
## 3 5317 1098 Biospecimen
## 4 2288 1098 Clinical
## 5 14346 1098 Copy Number Variation
## 6 4876 1097 Transcriptome Profiling
## 7 3714 1097 DNA Methylation
## 8 919 881 Proteome Profiling
## 9 3128 927 Somatic Structural Variation
## 10 5772 1098 Structural Variation
##
## $case_count
## [1] 1098
##
## $file_size
## [1] 6.249966e+14
#### Downloading and prepare TARGET CASE ####
TargetSamples <- GDCquery(project = "TCGA-BRCA",
data.category = "Transcriptome Profiling",
data.type = "Gene Expression Quantification",
workflow.type = "STAR - Counts")
## --------------------------------------
## o GDCquery: Searching in GDC database
## --------------------------------------
## Genome of reference: hg38
## --------------------------------------------
## oo Accessing GDC. This might take a while...
## --------------------------------------------
## ooo Project: TCGA-BRCA
## --------------------
## oo Filtering results
## --------------------
## ooo By data.type
## ooo By workflow.type
## ----------------
## oo Checking data
## ----------------
## ooo Checking if there are duplicated cases
## ooo Checking if there are results for the query
## -------------------
## o Preparing output
## -------------------
#### obtain case information ####
CaseInfo <- getResults(TargetSamples)#, cols = c("cases"))
as_tibble(head(CaseInfo))
## # A tibble: 6 × 30
## id data_format cases access file_name submitter_id data_category type
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 9dc09c86-… TSV TCGA… open d1f1743c… da44e611-ff… Transcriptom… gene…
## 2 95668f0b-… TSV TCGA… open 6365a756… 06cd79cd-aa… Transcriptom… gene…
## 3 461fda5d-… TSV TCGA… open 30285113… dbf87563-f6… Transcriptom… gene…
## 4 30ff778c-… TSV TCGA… open 5167da8c… 7c7193d8-75… Transcriptom… gene…
## 5 427a04c9-… TSV TCGA… open fead73ce… 9248dd2f-37… Transcriptom… gene…
## 6 0682b5b9-… TSV TCGA… open d5066dc8… f123e0b6-4c… Transcriptom… gene…
## # ℹ 22 more variables: platform <chr>, file_size <int>, created_datetime <chr>,
## # md5sum <chr>, updated_datetime <chr>, file_id <chr>, data_type <chr>,
## # state <chr>, experimental_strategy <chr>, version <chr>,
## # data_release <chr>, project <chr>, analysis_id <chr>, analysis_state <chr>,
## # analysis_submitter_id <chr>, analysis_workflow_link <chr>,
## # analysis_workflow_type <chr>, analysis_workflow_version <chr>,
## # sample_type <chr>, is_ffpe <lgl>, cases.submitter_id <chr>, …
data <- GDCprepare(TargetSamples)
## | | 0% | |0.4424779% ~9 s remaining | |0.8849558% ~7 s remaining | |1.327434% ~7 s remaining | |1.769912% ~7 s remaining |= |2.212389% ~6 s remaining |= |2.654867% ~6 s remaining |= |3.097345% ~6 s remaining |= |3.539823% ~6 s remaining |== |3.982301% ~6 s remaining |== |4.424779% ~6 s remaining |== |4.867257% ~6 s remaining |== |5.309735% ~6 s remaining |== |5.752212% ~6 s remaining |=== |6.19469% ~6 s remaining |=== |6.637168% ~6 s remaining |=== |7.079646% ~6 s remaining |=== |7.522124% ~6 s remaining |==== |7.964602% ~6 s remaining |==== |8.40708% ~6 s remaining |==== |8.849558% ~6 s remaining |==== |9.292035% ~6 s remaining |===== |9.734513% ~6 s remaining |===== |10.17699% ~6 s remaining |===== |10.61947% ~6 s remaining |===== |11.06195% ~6 s remaining |===== |11.50442% ~6 s remaining |====== |11.9469% ~6 s remaining |====== |12.38938% ~6 s remaining |====== |12.83186% ~6 s remaining |====== |13.27434% ~6 s remaining |======= |13.71681% ~6 s remaining |======= |14.15929% ~6 s remaining |======= |14.60177% ~9 s remaining |======= |15.04425% ~9 s remaining |======== |15.48673% ~9 s remaining |======== |15.9292% ~9 s remaining |======== |16.37168% ~8 s remaining |======== |16.81416% ~8 s remaining |======== |17.25664% ~8 s remaining |========= |17.69912% ~8 s remaining |========= |18.14159% ~8 s remaining |========= |18.58407% ~8 s remaining |========= |19.02655% ~8 s remaining |========== |19.46903% ~8 s remaining |========== |19.9115% ~7 s remaining |========== |20.35398% ~7 s remaining |========== |20.79646% ~7 s remaining |=========== |21.23894% ~7 s remaining |=========== |21.68142% ~7 s remaining |=========== |22.12389% ~9 s remaining |=========== |22.56637% ~9 s remaining |=========== |23.00885% ~9 s remaining |============ |23.45133% ~9 s remaining |============ |23.89381% ~8 s remaining |============ |24.33628% ~8 s remaining |============ |24.77876% ~8 s remaining |============= |25.22124% ~8 s remaining |============= |25.66372% ~8 s remaining |============= |26.10619% ~8 s remaining |============= |26.54867% ~8 s remaining |============== |26.99115% ~8 s remaining |============== |27.43363% ~8 s remaining |============== |27.87611% ~8 s remaining |============== |28.31858% ~7 s remaining |============== |28.76106% ~7 s remaining |=============== |29.20354% ~7 s remaining |=============== |29.64602% ~7 s remaining |=============== |30.0885% ~7 s remaining |=============== |30.53097% ~7 s remaining |================ |30.97345% ~7 s remaining |================ |31.41593% ~8 s remaining |================ |31.85841% ~8 s remaining |================ |32.30088% ~8 s remaining |================= |32.74336% ~8 s remaining |================= |33.18584% ~8 s remaining |================= |33.62832% ~8 s remaining |================= |34.0708% ~7 s remaining |================= |34.51327% ~7 s remaining |================== |34.95575% ~7 s remaining |================== |35.39823% ~7 s remaining |================== |35.84071% ~7 s remaining |================== |36.28319% ~7 s remaining |=================== |36.72566% ~7 s remaining |=================== |37.16814% ~7 s remaining |=================== |37.61062% ~7 s remaining |=================== |38.0531% ~7 s remaining |==================== |38.49558% ~7 s remaining |==================== |38.93805% ~6 s remaining |==================== |39.38053% ~6 s remaining |==================== |39.82301% ~6 s remaining |==================== |40.26549% ~6 s remaining |===================== |40.70796% ~6 s remaining |===================== |41.15044% ~6 s remaining |===================== |41.59292% ~6 s remaining |===================== |42.0354% ~6 s remaining |====================== |42.47788% ~7 s remaining |====================== |42.92035% ~7 s remaining |====================== |43.36283% ~7 s remaining |====================== |43.80531% ~6 s remaining |======================= |44.24779% ~6 s remaining |======================= |44.69027% ~6 s remaining |======================= |45.13274% ~6 s remaining |======================= |45.57522% ~6 s remaining |======================= |46.0177% ~6 s remaining |======================== |46.46018% ~6 s remaining |======================== |46.90265% ~6 s remaining |======================== |47.34513% ~6 s remaining |======================== |47.78761% ~6 s remaining |========================= |48.23009% ~6 s remaining |========================= |48.67257% ~6 s remaining |========================= |49.11504% ~6 s remaining |========================= |49.55752% ~5 s remaining |========================== | 50% ~5 s remaining |========================== |50.44248% ~5 s remaining |========================== |50.88496% ~5 s remaining |========================== |51.32743% ~5 s remaining |========================== |51.76991% ~5 s remaining |=========================== |52.21239% ~5 s remaining |=========================== |52.65487% ~5 s remaining |=========================== |53.09735% ~5 s remaining |=========================== |53.53982% ~5 s remaining |============================ |53.9823% ~5 s remaining |============================ |54.42478% ~5 s remaining |============================ |54.86726% ~5 s remaining |============================ |55.30973% ~5 s remaining |============================ |55.75221% ~5 s remaining |============================= |56.19469% ~5 s remaining |============================= |56.63717% ~5 s remaining |============================= |57.07965% ~5 s remaining |============================= |57.52212% ~5 s remaining |============================== |57.9646% ~5 s remaining |============================== |58.40708% ~5 s remaining |============================== |58.84956% ~5 s remaining |============================== |59.29204% ~5 s remaining |=============================== |59.73451% ~5 s remaining |=============================== |60.17699% ~4 s remaining |=============================== |60.61947% ~4 s remaining |=============================== |61.06195% ~4 s remaining |=============================== |61.50442% ~4 s remaining |================================ |61.9469% ~4 s remaining |================================ |62.38938% ~4 s remaining |================================ |62.83186% ~4 s remaining |================================ |63.27434% ~4 s remaining |================================= |63.71681% ~4 s remaining |================================= |64.15929% ~4 s remaining |================================= |64.60177% ~4 s remaining |================================= |65.04425% ~4 s remaining |================================== |65.48673% ~4 s remaining |================================== |65.9292% ~4 s remaining |================================== |66.37168% ~4 s remaining |================================== |66.81416% ~4 s remaining |================================== |67.25664% ~3 s remaining |=================================== |67.69912% ~3 s remaining |=================================== |68.14159% ~3 s remaining |=================================== |68.58407% ~3 s remaining |=================================== |69.02655% ~3 s remaining |==================================== |69.46903% ~3 s remaining |==================================== |69.9115% ~3 s remaining |==================================== |70.35398% ~3 s remaining |==================================== |70.79646% ~3 s remaining |===================================== |71.23894% ~3 s remaining |===================================== |71.68142% ~3 s remaining |===================================== |72.12389% ~3 s remaining |===================================== |72.56637% ~3 s remaining |===================================== |73.00885% ~3 s remaining |====================================== |73.45133% ~3 s remaining |====================================== |73.89381% ~3 s remaining |====================================== |74.33628% ~3 s remaining |====================================== |74.77876% ~3 s remaining |======================================= |75.22124% ~3 s remaining |======================================= |75.66372% ~3 s remaining |======================================= |76.10619% ~3 s remaining |======================================= |76.54867% ~3 s remaining |======================================== |76.99115% ~3 s remaining |======================================== |77.43363% ~2 s remaining |======================================== |77.87611% ~2 s remaining |======================================== |78.31858% ~2 s remaining |======================================== |78.76106% ~2 s remaining |========================================= |79.20354% ~2 s remaining |========================================= |79.64602% ~2 s remaining |========================================= |80.0885% ~2 s remaining |========================================= |80.53097% ~2 s remaining |========================================== |80.97345% ~2 s remaining |========================================== |81.41593% ~2 s remaining |========================================== |81.85841% ~2 s remaining |========================================== |82.30088% ~2 s remaining |=========================================== |82.74336% ~2 s remaining |=========================================== |83.18584% ~2 s remaining |=========================================== |83.62832% ~2 s remaining |=========================================== |84.0708% ~2 s remaining |=========================================== |84.51327% ~2 s remaining |============================================ |84.95575% ~2 s remaining |============================================ |85.39823% ~2 s remaining |============================================ |85.84071% ~1 s remaining |============================================ |86.28319% ~1 s remaining |============================================= |86.72566% ~1 s remaining |============================================= |87.16814% ~1 s remaining |============================================= |87.61062% ~1 s remaining |============================================= |88.0531% ~1 s remaining |============================================== |88.49558% ~1 s remaining |============================================== |88.93805% ~1 s remaining |============================================== |89.38053% ~1 s remaining |============================================== |89.82301% ~1 s remaining |============================================== |90.26549% ~1 s remaining |=============================================== |90.70796% ~1 s remaining |=============================================== |91.15044% ~1 s remaining |=============================================== |91.59292% ~1 s remaining |=============================================== |92.0354% ~1 s remaining |================================================ |92.47788% ~1 s remaining |================================================ |92.92035% ~1 s remaining |================================================ |93.36283% ~1 s remaining |================================================ |93.80531% ~1 s remaining |================================================= |94.24779% ~1 s remaining |================================================= |94.69027% ~1 s remaining |================================================= |95.13274% ~1 s remaining |================================================= |95.57522% ~0 s remaining |================================================= |96.0177% ~0 s remaining |================================================== |96.46018% ~0 s remaining |================================================== |96.90265% ~0 s remaining |================================================== |97.34513% ~0 s remaining |================================================== |97.78761% ~0 s remaining |=================================================== |98.23009% ~0 s remaining |=================================================== |98.67257% ~0 s remaining |=================================================== |99.11504% ~0 s remaining |=================================================== |99.55752% ~0 s remaining |====================================================|100% ~0 s remaining |====================================================|100% Completed after 11 s
## Starting to add information to samples
## => Add clinical information to samples
## => Adding TCGA molecular information from marker papers
## => Information will have prefix 'paper_'
## brca subtype information from:doi.org/10.1016/j.ccell.2018.03.014
## Available assays in SummarizedExperiment :
## => unstranded
## => stranded_first
## => stranded_second
## => tpm_unstrand
## => fpkm_unstrand
## => fpkm_uq_unstrand
assays(data)
## List of length 6
## names(6): unstranded stranded_first ... fpkm_unstrand fpkm_uq_unstrand
as_tibble(colData(data))
## # A tibble: 226 × 93
## barcode patient sample shortLetterCode definition sample_submitter_id
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 TCGA-EW-A2FS-0… TCGA-E… TCGA-… TP Primary s… TCGA-EW-A2FS-01A
## 2 TCGA-OL-A6VR-0… TCGA-O… TCGA-… TP Primary s… TCGA-OL-A6VR-01A
## 3 TCGA-E9-A226-0… TCGA-E… TCGA-… TP Primary s… TCGA-E9-A226-01A
## 4 TCGA-A8-A08H-0… TCGA-A… TCGA-… TP Primary s… TCGA-A8-A08H-01A
## 5 TCGA-D8-A27H-0… TCGA-D… TCGA-… TP Primary s… TCGA-D8-A27H-01A
## 6 TCGA-D8-A3Z6-0… TCGA-D… TCGA-… TP Primary s… TCGA-D8-A3Z6-01A
## 7 TCGA-B6-A1KN-0… TCGA-B… TCGA-… TP Primary s… TCGA-B6-A1KN-01A
## 8 TCGA-BH-A0DL-0… TCGA-B… TCGA-… TP Primary s… TCGA-BH-A0DL-01A
## 9 TCGA-A8-A09X-0… TCGA-A… TCGA-… TP Primary s… TCGA-A8-A09X-01A
## 10 TCGA-BH-A2L8-0… TCGA-B… TCGA-… TP Primary s… TCGA-BH-A2L8-01A
## # ℹ 216 more rows
## # ℹ 87 more variables: tumor_descriptor <chr>, sample_id <chr>,
## # pathology_report_uuid <chr>, submitter_id <chr>, sample_type <chr>,
## # specimen_type <chr>, days_to_collection <int>, state <chr>,
## # initial_weight <dbl>, tissue_type <chr>, preservation_method <chr>,
## # synchronous_malignancy <chr>, ajcc_pathologic_stage <chr>,
## # days_to_diagnosis <int>, laterality <chr>, treatments <list>, …
as_tibble(rowData(data))
## # A tibble: 60,660 × 10
## source type score phase gene_id gene_type gene_name level hgnc_id
## <fct> <fct> <dbl> <int> <chr> <chr> <chr> <chr> <chr>
## 1 HAVANA gene NA NA ENSG00000000003.15 protein_… TSPAN6 2 HGNC:1…
## 2 HAVANA gene NA NA ENSG00000000005.6 protein_… TNMD 2 HGNC:1…
## 3 HAVANA gene NA NA ENSG00000000419.13 protein_… DPM1 2 HGNC:3…
## 4 HAVANA gene NA NA ENSG00000000457.14 protein_… SCYL3 2 HGNC:1…
## 5 HAVANA gene NA NA ENSG00000000460.17 protein_… C1orf112 2 HGNC:2…
## 6 HAVANA gene NA NA ENSG00000000938.13 protein_… FGR 2 HGNC:3…
## 7 HAVANA gene NA NA ENSG00000000971.16 protein_… CFH 1 HGNC:4…
## 8 HAVANA gene NA NA ENSG00000001036.14 protein_… FUCA2 2 HGNC:4…
## 9 HAVANA gene NA NA ENSG00000001084.13 protein_… GCLC 1 HGNC:4…
## 10 HAVANA gene NA NA ENSG00000001167.14 protein_… NFYA 2 HGNC:7…
## # ℹ 60,650 more rows
## # ℹ 1 more variable: havana_gene <chr>
table(rowData(data)$gene_type)
##
## IG_C_gene IG_C_pseudogene
## 14 9
## IG_D_gene IG_J_gene
## 37 18
## IG_J_pseudogene IG_pseudogene
## 3 1
## IG_V_gene IG_V_pseudogene
## 145 187
## lncRNA miRNA
## 16901 1881
## misc_RNA Mt_rRNA
## 2212 2
## Mt_tRNA polymorphic_pseudogene
## 22 48
## processed_pseudogene protein_coding
## 10167 19962
## pseudogene ribozyme
## 18 8
## rRNA rRNA_pseudogene
## 47 497
## scaRNA scRNA
## 49 1
## snoRNA snRNA
## 943 1901
## sRNA TEC
## 5 1057
## TR_C_gene TR_D_gene
## 6 4
## TR_J_gene TR_J_pseudogene
## 79 4
## TR_V_gene TR_V_pseudogene
## 106 33
## transcribed_processed_pseudogene transcribed_unitary_pseudogene
## 500 138
## transcribed_unprocessed_pseudogene translated_processed_pseudogene
## 939 2
## translated_unprocessed_pseudogene unitary_pseudogene
## 1 98
## unprocessed_pseudogene vault_RNA
## 2614 1
SECoding <- data[rowData(data)$gene_type == "protein_coding", ]
#### The following function will return the data from specified slots in the summarizedExperiment object ####
dataPrep_Coding <- TCGAanalyze_Preprocessing(
object = SECoding,
cor.cut = 0.6,
datatype = "fpkm_unstrand"
)
## Number of outliers: 0
boxplot(dataPrep_Coding, outline = FALSE)
dataNorm_Coding <- TCGAanalyze_Normalization(
tabDF = dataPrep_Coding,
geneInfo = geneInfoHT,
method = "geneLength"
)
## I Need about 55 seconds for this Complete Normalization Upper Quantile [Processing 80k elements /s]
## Step 1 of 4: newSeqExpressionSet ...
## Step 2 of 4: withinLaneNormalization ...
## Step 3 of 4: betweenLaneNormalization ...
## Step 4 of 4: exprs ...
dataFilt_Coding <- TCGAanalyze_Filtering(
tabDF = dataPrep_Coding,
method = "quantile",
qnt.cut = 0.25
)
boxplot(dataNorm_Coding, outline = FALSE)
DEGsCoding <- TCGAanalyze_DEA(mat1 = dataFilt_Coding[,dataNormal_Target],
mat2 = dataFilt_Coding[,dataPrimary_Target],
pipeline="limma",
Cond1type = "Normal",
Cond2type = "Tumor",
fdr.cut = 0.01 ,
logFC.cut = 1,
method = "glmLRT", ClinicalDF = data.frame())
## Batch correction skipped since no factors provided
## ----------------------- DEA -------------------------------
## o 113 samples in Cond1type Normal
## o 113 samples in Cond2type Tumor
## o 14971 features as miRNA or genes
## This may take some minutes...
## ----------------------- END DEA -------------------------------
head(DEGsCoding)
## logFC AveExpr t P.Value adj.P.Val
## ENSG00000187824.9 -3.617888 2.806241 -29.66605 1.111120e-79 1.663458e-75
## ENSG00000136158.12 -17.360418 13.400130 -25.30245 1.021027e-67 7.642895e-64
## ENSG00000148053.17 -19.251799 12.448263 -24.58882 1.159118e-65 5.784384e-62
## ENSG00000132561.14 -25.387601 18.288332 -23.61005 8.466446e-63 3.168779e-59
## ENSG00000177098.9 -7.558010 5.750185 -23.38276 3.981548e-62 1.192155e-58
## ENSG00000154065.17 -3.069429 2.404372 -23.02268 4.686785e-61 1.169431e-57
## B
## ENSG00000187824.9 170.8503
## ENSG00000136158.12 143.6198
## ENSG00000148053.17 138.9340
## ENSG00000132561.14 132.4016
## ENSG00000177098.9 130.8673
## ENSG00000154065.17 128.4233
Why TCGA-BRCA Was Chosen for This Project
When choosing which cancer type to model, there are several possible motivations. One could start from a biological or clinical perspective. For example, focusing on a cancer with high mortality where earlier detection could have a major impact on patient outcomes. From a personal perspective, one might choose a cancer type that is familiar or meaningful. However, in practice, a successful machine-learning analysis also depends heavily on the availability and quality of data. Many TCGA projects include very few normal (control) samples because healthy tissue is difficult to obtain, which makes reliable classification modeling difficult. To ensure that the downstream models would be properly trained and evaluated, I used a data-driven approach to first assess the number of tumor and normal RNA-seq samples available within each TCGA project. This involved scanning all TCGA projects for “Transcriptome Profiling” datasets processed with the same pipeline (STAR-Counts) and summarizing the distribution of sample types.
Based on this data audit, four candidate projects, CPTAC-3, REBC-THYR, TCGA-COAD, and TCGA-BRCA—had enough normal tissue samples to support a balanced classification task. Among these, TCGA-BRCA (breast invasive carcinoma) had by far the largest number of paired tumor and solid normal tissue samples. This makes the dataset statistically powerful, more robust for cross-validation, and well-suited for building predictive models without risking overfitting due to limited controls. Additionally, the TCGA-BRCA project provides a comprehensive set of RNA-seq samples processed using consistent methods, along with extensive clinical annotations, enabling both biological interpretation and machine-learning-focused analysis.
To start the analysis, I used the TCGAbiolinks package to access all TCGA projects and then filtered them so I was only looking at studies that begin with “TCGA-”. From this list, I focused on TCGA-BRCA, which is the breast cancer dataset required for this assignment. I checked the project information and confirmed that the dataset includes RNA-seq gene expression files processed using the STAR pipeline.
Next, I pulled all available sample barcodes for TCGA-BRCA and separated them into tumor (TP) and normal (NT) types. To create a balanced dataset, I selected the first 113 tumor samples and 113 normal samples, giving a total of 226 samples. I then downloaded only these barcodes, which ensures that the analysis follows the assignment instructions and that the two outcome groups are equally sized.
After downloading the data, I used GDCprepare() to turn everything into a SummarizedExperiment object. This object stores the expression counts, sample information (like age and sample type), and gene annotations. I inspected both the sample metadata and the gene table to understand what variables were available.
Since TCGA includes many types of genes, I kept only protein-coding genes, because these are usually the most relevant for disease biology and classification tasks. From there, I ran the recommended preprocessing function in TCGAbiolinks (TCGAanalyze_Preprocessing), which checks for low-quality samples based on sample–sample correlations and keeps only samples that behave normally compared to the rest. I used a correlation cutoff of 0.6, as shown in the example code.
I then applied normalization using TCGAanalyze_Normalization, which adjusts the data for things like gene length and library size so that values are more comparable across samples. After that, I filtered out the lowest-expressed 25% of genes using quantile filtering. These low-expression genes usually add noise and don’t help with downstream modeling, so removing them keeps the dataset cleaner and more meaningful.
Finally, I ran the differential expression analysis using the filtering results. This compared tumor and normal samples using the limma pipeline, and I set strong cutoffs (FDR < 0.01 and |log2FC| > 1) so that only clearly changed genes were included. These genes can be interpreted later as possible biomarkers or pathways involved in breast cancer. Overall, these preprocessing steps make sure the dataset is clean, balanced, normalized, and ready for both biological interpretation and model building.
meta <- as.data.frame(colData(data))
df <- meta[, c("age_at_diagnosis",
"days_to_death",
"days_to_last_follow_up")]
corrplot(cor(df, use = "complete.obs"),
method = "color",
type = "upper",
tl.col = "black",
addCoef.col = "black")
# gene–age correlations
age <- meta$age_at_diagnosis
keep <- !is.na(age)
expr_mat <- dataFilt_Coding[, keep] # genes x samples
age <- age[keep]
cors <- apply(expr_mat, 1, function(gene_expr) {
cor(gene_expr, age, use = "complete.obs")
})
# visualize distribution of gene–age correlations
hist(cors,
breaks = 50,
main = "Distribution of Gene–Age Correlations",
xlab = "Correlation with Age",
col = "steelblue")
# list top 10 age-associated genes
top_genes <- sort(cors, decreasing = TRUE)[1:10]
top_genes
## ENSG00000162604.12 ENSG00000129048.7 ENSG00000206538.9 ENSG00000168758.11
## 0.2145226 0.2110849 0.1993478 0.1950629
## ENSG00000153707.17 ENSG00000140285.12 ENSG00000243244.7 ENSG00000143322.21
## 0.1933563 0.1931205 0.1928156 0.1870714
## ENSG00000165124.18 ENSG00000164929.17
## 0.1852641 0.1835406
I extracted continuous metadata attributes from colData(data) and computed the correlation matrix. Age_at_diagnosis shows only weak negative correlation with days_to_death and days_to_last_follow_up (r ≈ –0.18), while days_to_death and days_to_last_follow_up are perfectly correlated due to how survival is encoded in TCGA (patients have either one or the other, but never both). Overall, no strong metadata–metadata correlations were present, and none appear to be major confounders.
Gene–metadata correlations were evaluated by computing Pearson correlations between each protein-coding gene and age_at_diagnosis. The distribution is centered near 0, with most correlations between –0.1 and +0.1, and only a small number approaching ±0.2. This indicates that no large subset of genes is strongly age-associated and age is unlikely to confound downstream expression modeling. Gene–gene correlation structure is explored through variance and PCA in later sections.
gene_var <- apply(dataFilt_Coding, 1, var)
# sort genes by variance
high_var_genes <- sort(gene_var, decreasing = TRUE)[1:10]
low_var_genes <- sort(gene_var, decreasing = FALSE)[1:10]
Top 10 Highest-Variance Genes
top10_high_var <- tibble(
Gene = names(high_var_genes),
Variance = as.numeric(high_var_genes)
)
Top 10 Lowest-Variance Genes
top10_low_var <- tibble(
Gene = names(low_var_genes),
Variance = as.numeric(low_var_genes)
)
top10_high_var
## # A tibble: 10 × 2
## Gene Variance
## <chr> <dbl>
## 1 ENSG00000198886.2 22980088.
## 2 ENSG00000198938.2 20343714.
## 3 ENSG00000198712.1 13639727.
## 4 ENSG00000198804.2 12815116.
## 5 ENSG00000198888.2 12045789.
## 6 ENSG00000198840.2 11963556.
## 7 ENSG00000198727.2 11690664.
## 8 ENSG00000198763.3 10305571.
## 9 ENSG00000198899.2 8589456.
## 10 ENSG00000198695.2 5662473.
top10_low_var
## # A tibble: 10 × 2
## Gene Variance
## <chr> <dbl>
## 1 ENSG00000243725.7 0.00820
## 2 ENSG00000206559.8 0.0117
## 3 ENSG00000042429.12 0.0120
## 4 ENSG00000258315.5 0.0124
## 5 ENSG00000214827.11 0.0127
## 6 ENSG00000159110.20 0.0146
## 7 ENSG00000164037.17 0.0148
## 8 ENSG00000109911.19 0.0150
## 9 ENSG00000110801.14 0.0159
## 10 ENSG00000162959.13 0.0179
# visualize the distribution of gene-wise variance
hist(log10(gene_var + 1),
breaks = 50,
main = "Distribution of Gene-Wise Variance (log10)",
xlab = "log10(Variance)",
col = "darkorange")
# top N genes for downstream modeling
N <- 500
top_genes_var <- names(sort(gene_var, decreasing = TRUE))[1:N]
length(top_genes_var)
## [1] 500
# example gene–gene correlation among top 50 variable genes
sub_mat <- dataFilt_Coding[top_genes_var[1:50], ]
gene_cor <- cor(t(sub_mat))
hist(gene_cor[upper.tri(gene_cor)],
breaks = 50,
main = "Distribution of Gene–Gene Correlations (Top 50 Var Genes)",
xlab = "Correlation")
The variance of each gene was computed using the filtered, normalized expression matrix. Genes with the highest variance represent features that change substantially across samples, while genes with very low variance show nearly constant expression and contribute little information for distinguishing tumor from normal tissue.
The distribution of gene-wise variance is right-skewed, with most genes showing low to moderate variability and a small number of genes exhibiting high variability. The top 10 high-variance genes likely reflect strong biological heterogeneity in TCGA-BRCA, such as proliferation-associated genes or immune-related genes that differ substantially between tumors and normal tissues. Conversely, the lowest-variance genes are housekeeping-like genes that remain stable across conditions and do not contribute meaningfully to classification.
Exploring variance is important for model building because low-variance genes add noise, increase computational load, and may reduce model performance. High-variance genes capture the dominant structure in the data and are more likely to carry predictive signal. For the modeling step, I selected the top 500 most variable genes, which is a common dimensionality-reduction step that retains biological signal while removing uninformative features.
# transpose gene matrix for PCA: samples x genes
expr_for_pca <- t(dataFilt_Coding)
# run PCA
pca_res <- prcomp(expr_for_pca, scale. = TRUE)
# variance explained
pca_var <- pca_res$sdev^2
pca_var_exp <- pca_var / sum(pca_var)
plot(pca_var_exp[1:20],
type = "b",
pch = 19,
xlab = "Principal Component",
ylab = "Proportion of Variance Explained",
main = "Scree Plot of First 20 Principal Components")
meta <- as.data.frame(colData(data))
group <- factor(ifelse(meta$shortLetterCode == "TP", "Tumor", "Normal"))
pca_df <- data.frame(
PC1 = pca_res$x[,1],
PC2 = pca_res$x[,2],
Group = group
)
plot(pca_df$PC1, pca_df$PC2,
col = ifelse(pca_df$Group == "Tumor", "firebrick", "steelblue"),
pch = 19,
xlab = paste0("PC1 (", round(pca_var_exp[1] * 100, 1), "%)"),
ylab = paste0("PC2 (", round(pca_var_exp[2] * 100, 1), "%)"),
main = "PCA of TCGA-BRCA (Tumor vs Normal)")
legend("topright",
legend = c("Tumor", "Normal"),
col = c("firebrick", "steelblue"),
pch = 19)
age <- meta$age_at_diagnosis
plot(pca_res$x[,1], age,
pch = 19,
col = ifelse(group == "Tumor", "firebrick", "steelblue"),
xlab = paste0("PC1 (", round(pca_var_exp[1] * 100, 1), "%)"),
ylab = "Age at Diagnosis",
main = "PC1 vs Age Colored by Group")
legend("topright",
legend = c("Tumor", "Normal"),
col = c("firebrick", "steelblue"),
pch = 19)
PCA was performed on the filtered, normalized expression matrix using gene-wise standardized values. The scree plot shows that PC1 explains ~15% of the variance and PC2 explains ~10%, with the variance contribution dropping sharply after the first few components. This indicates that a small number of principal components capture much of the global structure in the expression data, while later components mainly represent noise.
When projecting samples onto the first two PCs, a clear separation appears between tumor and normal samples. Tumor samples cluster primarily on the right side of PC1 while normal samples cluster toward the left, demonstrating that the dominant source of variation in TCGA-BRCA expression data is tumor vs. normal biology, not noise or batch effects. This validates that the dataset contains strong biological signal that is suitable for downstream classification models.
To explore potential confounding variables, PC1 was plotted against age_at_diagnosis. No visible linear or nonlinear trend was present, indicating that age does not strongly drive any of the principal components, consistent with the weak gene–age correlations observed earlier. This suggests that age is unlikely to act as a confounder in downstream modeling of tumor vs. normal status.
The top 500 most variable genes (identified in Section 3.2) were used as predictors. These high-variance genes capture the strongest biological signals in the dataset and reduce dimensionality prior to modeling. Each row represents a patient sample, and each column represents a gene expression feature.
The outcome variable was defined using the TCGA sample type, where “TP” indicates Tumor and “NT” indicates Normal.
To ensure a fair comparison across models, a single 70/30 train–test split was used for all algorithms. The same train_idx, X_train, X_test, y_train, and y_test objects were reused for every model. A random seed was set for reproducibility.
X <- t(dataFilt_Coding[top_genes_var, ])
y <- factor(ifelse(meta$shortLetterCode == "TP", "Tumor", "Normal"))
set.seed(123)
train_idx <- sample(seq_len(nrow(X)), size = 0.7 * nrow(X))
X_train <- X[train_idx, ]
X_test <- X[-train_idx, ]
y_train <- y[train_idx]
y_test <- y[-train_idx]
For the logistic regression model, I used a regularized version because the dataset has a large number of genes. This type of model lets you adjust two main settings: alpha, which controls whether the model acts more like ridge, lasso, or a mix of both; and lambda, which controls the strength of the regularization. I tested several alpha values (0, 0.5, and 1) and tuned lambda using cross-validation. The best model was chosen based on the lowest cross-validated error.
train_df <- as.data.frame(X_train)
train_df$Outcome <- y_train
test_df <- as.data.frame(X_test)
test_df$Outcome <- y_test
train_df$Outcome <- factor(train_df$Outcome,
levels = c("Normal", "Tumor"))
test_df$Outcome <- factor(test_df$Outcome,
levels = c("Normal", "Tumor"))
alpha_grid <- c(0, 0.5, 1)
lambda_grid <- 10 ^ seq(-3, 1, length.out = 20)
tune_grid <- expand.grid(
alpha = alpha_grid,
lambda = lambda_grid
)
set.seed(123)
ctrl <- trainControl(
method = "cv",
number = 5,
classProbs = TRUE,
savePredictions = "final"
)
logit_fit <- train(
Outcome ~ .,
data = train_df,
method = "glmnet",
trControl = ctrl,
tuneGrid = tune_grid,
metric = "Accuracy"
)
logit_pred_class <- predict(logit_fit, newdata = test_df)
logit_pred_prob <- predict(logit_fit, newdata = test_df, type = "prob")
cm_logit <- confusionMatrix(logit_pred_class, test_df$Outcome)
cm_logit
## Confusion Matrix and Statistics
##
## Reference
## Prediction Normal Tumor
## Normal 35 0
## Tumor 1 32
##
## Accuracy : 0.9853
## 95% CI : (0.9208, 0.9996)
## No Information Rate : 0.5294
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9705
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9722
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 0.9697
## Prevalence : 0.5294
## Detection Rate : 0.5147
## Detection Prevalence : 0.5147
## Balanced Accuracy : 0.9861
##
## 'Positive' Class : Normal
##
Each value was tested using 5-fold cross-validation, and the k that gave the lowest average classification error was selected.
#scaling
train_means <- apply(X_train, 2, mean)
train_sds <- apply(X_train, 2, sd)
train_sds[train_sds == 0] <- 1
X_train_scaled <- scale(X_train, center = train_means, scale = train_sds)
X_test_scaled <- scale(X_test, center = train_means, scale = train_sds)
knn_train <- as.data.frame(X_train_scaled)
knn_test <- as.data.frame(X_test_scaled)
knn_train$Outcome <- y_train
knn_test$Outcome <- y_test
knn_train$Outcome <- factor(knn_train$Outcome,
levels = c("Normal", "Tumor"))
knn_test$Outcome <- factor(knn_test$Outcome,
levels = c("Normal", "Tumor"))
k_values <- seq(1, 25, by = 2)
knn_grid <- expand.grid(k = k_values)
set.seed(123)
knn_ctrl <- trainControl(
method = "cv",
number = 5,
classProbs = TRUE,
savePredictions = "final"
)
knn_fit <- train(
Outcome ~ .,
data = knn_train,
method = "knn",
trControl = knn_ctrl,
tuneGrid = knn_grid,
metric = "Accuracy"
)
knn_pred_class <- predict(knn_fit, newdata = knn_test)
knn_pred_prob <- predict(knn_fit, newdata = knn_test, type = "prob")
cm_knn <- confusionMatrix(knn_pred_class, knn_test$Outcome)
cm_knn
## Confusion Matrix and Statistics
##
## Reference
## Prediction Normal Tumor
## Normal 36 6
## Tumor 0 26
##
## Accuracy : 0.9118
## 95% CI : (0.8178, 0.9669)
## No Information Rate : 0.5294
## P-Value [Acc > NIR] : 9.964e-12
##
## Kappa : 0.8211
##
## Mcnemar's Test P-Value : 0.04123
##
## Sensitivity : 1.0000
## Specificity : 0.8125
## Pos Pred Value : 0.8571
## Neg Pred Value : 1.0000
## Prevalence : 0.5294
## Detection Rate : 0.5294
## Detection Prevalence : 0.6176
## Balanced Accuracy : 0.9062
##
## 'Positive' Class : Normal
##
For the Random Forest model, I focused on two main settings: mtry, which is how many genes the model looks at each time it makes a split, and ntree, which is the number of trees in the forest. In practice, I used a standard, widely recommended configuration for classification problems in high-dimensional data: ntree was set to 500 trees, and mtry was set to approximately the square root of the number of genes (mtry ≈ √p). This provides a strong baseline without overfitting and is consistent with common defaults in the literature.
p <- ncol(X_train)
# simple, robust RF settings
fallback_mtry <- floor(sqrt(p))
fallback_ntree <- 500
set.seed(123)
rf_final <- randomForest(
x = X_train,
y = y_train,
ntree = fallback_ntree,
mtry = fallback_mtry,
importance = TRUE
)
# evaluate on test data
rf_pred_class <- predict(rf_final, newdata = X_test)
cm_rf <- confusionMatrix(rf_pred_class, y_test)
cm_rf
## Confusion Matrix and Statistics
##
## Reference
## Prediction Normal Tumor
## Normal 34 1
## Tumor 2 31
##
## Accuracy : 0.9559
## 95% CI : (0.8764, 0.9908)
## No Information Rate : 0.5294
## P-Value [Acc > NIR] : 6.122e-15
##
## Kappa : 0.9116
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9444
## Specificity : 0.9688
## Pos Pred Value : 0.9714
## Neg Pred Value : 0.9394
## Prevalence : 0.5294
## Detection Rate : 0.5000
## Detection Prevalence : 0.5147
## Balanced Accuracy : 0.9566
##
## 'Positive' Class : Normal
##
For the Neural Network model, I tested several different sizes for the hidden layer (3, 5, and 7 neurons) and experimented with learning-rate settings. Each setup was evaluated using cross-validation, and the model that produced the best classification accuracy was selected as the final version.
nn_train <- as.data.frame(X_train_scaled)
nn_test <- as.data.frame(X_test_scaled)
nn_train$Outcome <- y_train
nn_test$Outcome <- y_test
nn_train$Outcome <- factor(nn_train$Outcome,
levels = c("Normal", "Tumor"))
nn_test$Outcome <- factor(nn_test$Outcome,
levels = c("Normal", "Tumor"))
# had to add smaller grid to reduce failures
nn_grid <- expand.grid(
size = c(3, 5),
decay = c(0, 1e-4, 1e-3)
)
set.seed(123)
nn_ctrl <- trainControl(
method = "cv",
number = 5,
classProbs = TRUE,
savePredictions = "final"
)
nn_fit <- train(
Outcome ~ .,
data = nn_train,
method = "nnet",
trControl = nn_ctrl,
tuneGrid = nn_grid,
metric = "Accuracy",
trace = FALSE,
maxit = 200,
MaxNWts = 5000
)
nn_fit$bestTune
## size decay
## 3 3 0.001
nn_fit
## Neural Network
##
## 158 samples
## 500 predictors
## 2 classes: 'Normal', 'Tumor'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 127, 126, 126, 127, 126
## Resampling results across tuning parameters:
##
## size decay Accuracy Kappa
## 3 0e+00 0.9810484 0.9621102
## 3 1e-04 0.9872984 0.9746102
## 3 1e-03 0.9872984 0.9746102
## 5 0e+00 0.9808468 0.9617739
## 5 1e-04 0.9872984 0.9746102
## 5 1e-03 0.9872984 0.9746102
##
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were size = 3 and decay = 0.001.
nn_pred_class <- predict(nn_fit, newdata = nn_test)
nn_pred_prob <- predict(nn_fit, newdata = nn_test, type = "prob")
cm_nn <- confusionMatrix(nn_pred_class, nn_test$Outcome)
cm_nn
## Confusion Matrix and Statistics
##
## Reference
## Prediction Normal Tumor
## Normal 35 1
## Tumor 1 31
##
## Accuracy : 0.9706
## 95% CI : (0.8978, 0.9964)
## No Information Rate : 0.5294
## P-Value [Acc > NIR] : 3.075e-16
##
## Kappa : 0.941
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9722
## Specificity : 0.9688
## Pos Pred Value : 0.9722
## Neg Pred Value : 0.9688
## Prevalence : 0.5294
## Detection Rate : 0.5147
## Detection Prevalence : 0.5294
## Balanced Accuracy : 0.9705
##
## 'Positive' Class : Normal
##
cm_logit$overall["Accuracy"]
## Accuracy
## 0.9852941
cm_knn$overall["Accuracy"]
## Accuracy
## 0.9117647
cm_rf$overall["Accuracy"]
## Accuracy
## 0.9558824
cm_nn$overall["Accuracy"]
## Accuracy
## 0.9705882
Overall, all four models performed well on the held-out test set. Regularized logistic regression achieved the highest accuracy (~0.99), followed by the neural network (~0.97), random forest (~0.96), and kNN (~0.91). The strong performance of logistic regression suggests that, after dimensionality reduction to the top 500 most variable genes, a relatively simple linear decision boundary is already sufficient to separate tumor and normal samples. kNN likely struggled more because it relies directly on distances in a still high-dimensional space, which can be noisy even after variance filtering. The random forest and neural network models performed slightly worse than logistic regression, possibly because they are more flexible and at risk of mild overfitting on a dataset of this size.
Across all models, hyperparameters were tuned using 5-fold cross-validation on the training set. For logistic regression, I tuned alpha (ridge, elastic net, lasso) and lambda and chose the combination with the best cross-validated accuracy. For kNN, I tried odd k values from 1 to 25 and kept the k with the lowest error. For random forest, I used a standard setting of 500 trees and mtry ≈ √p, which is commonly recommended for classification. For the neural network, I tested small hidden layers (3–5 units) and several decay values; the best setting was selected based on cross-validated accuracy, while keeping the model small enough to avoid weight explosion errors in nnet.
In this assignment, I used TCGA-BRCA RNA-seq data to build a tumor vs. normal classifier based on protein-coding gene expression. After downloading and preparing a balanced set of 113 tumor and 113 normal samples, I applied standard preprocessing steps in TCGAbiolinks, including quality control, normalization, and removal of low-expression genes. This produced a filtered expression matrix that was suitable for both exploratory analysis and predictive modeling.
Using the clinical metadata, I first checked for potential confounding effects. Correlations among age_at_diagnosis, days_to_death, and days_to_last_follow_up were generally weak, except for the expected perfect relationship between the two survival-related variables. Gene–age correlations were also centered near zero, with only a small number of genes showing modest associations with age. PCA on the filtered expression matrix showed clear separation between tumor and normal samples along the first principal component, while age did not appear to drive any major axis of variation. Together, these results suggest that the dominant signal in this dataset reflects tumor biology rather than obvious confounding by age or follow-up time.
For modeling, I focused on the top 500 most variable genes, which is a common way to reduce dimensionality and emphasize features that carry strong biological signal. Four classification algorithms were trained and tuned on a shared 70/30 train–test split: regularized logistic regression, k-nearest neighbors, random forest, and a small neural network. All models achieved high accuracy on the held-out test set, but regularized logistic regression performed best overall (~0.99 accuracy), followed by the neural network (~0.97), random forest (~0.96), and kNN (~0.91). This pattern suggests that, once the data are well-preprocessed and reduced to informative genes, a relatively simple linear model is already sufficient to separate tumor from normal samples in TCGA-BRCA.
There are several limitations to keep in mind. The analysis was performed on a single cohort with one random train–test split, and I did not evaluate performance on an external dataset, so the generalizability of these models is not fully tested. I also restricted the features to 500 high-variance genes and did not explore other feature-selection strategies, pathway-level summaries, or integration of clinical variables into the models. Despite these limitations, the workflow demonstrates a complete pipeline from raw TCGA data to exploratory analysis and predictive modeling, and it shows that carefully preprocessed RNA-seq data can support highly accurate tumor vs. normal classification using standard machine-learning methods. Future work could extend this approach to subtype prediction, survival modeling, or cross-cohort validation to better understand how robust these models are in real-world settings.