The objective of this work is to associate mammography features and radiomic models with BRCA patient recurrence, and to describe which aspects of the tumor phenotype are associated with patient survival.
73 patients were followed over 10 years. Breast cancer recurrence status was recorded, and the baseline mammograms were analyzed with radiomics methods, yielding more than a thousand features. The result is a 73 x 1091 data frame, where 73 is the number of observations and 1091 the number of features.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(readr)
library(FRESA.CAD)
## Loading required package: Rcpp
## Loading required package: stringr
## Loading required package: miscTools
## Loading required package: Hmisc
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
## Loading required package: pROC
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
# Import the dataset and convert variables
setwd("/home/eider/Documents/Exploring_Learning_Algorithms_4_CADx/Breast_Cancer_SanJose")
BRCASanJose <- read_csv("../datasets/breast-cancer/BRCASanJose.csv",
                        col_types = cols(recurrence = col_factor()))
# change Protocol from character to numeric for further calculations
# unique(BRCASanJose$Protocol)
BRCASanJose$Protocol <- 1 * (BRCASanJose$Protocol == "Combo")
# remove the unnecessary features (note: Protocol is encoded above but dropped again here)
BRCASanJose$timeToEvent <- NULL
BRCASanJose$Protocol <- NULL
BRCASanJose$Onco2 <- NULL
BRCASanJose$PAM50 <- NULL
BRCASanJose$CanceryType <- NULL
BRCASanJose$Oncotype <- NULL
BRCASanJose$AvGRISK <- NULL
BRCASanJose$Type <- NULL
BRCASanJose <- data.frame(BRCASanJose)
#str(BRCASanJose)
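Before modeling, it is worth confirming the dimensions of the cleaned data frame and the balance of the outcome. A minimal sanity-check sketch (not part of the original script):
# sanity checks: dimensions after cleanup, outcome balance, missing values
dim(BRCASanJose)              # observations x remaining features
table(BRCASanJose$recurrence) # recurrence class balance
sum(is.na(BRCASanJose))       # the radiomic features should be complete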
set.seed(42)
BRCASanJose_F <- BRCASanJose # keep a copy with the factor-coded outcome
BRCASanJose$recurrence <- as.numeric(BRCASanJose$recurrence) # numeric-coded outcome
# data for further calculations
theData <- BRCASanJose
theOutcome <- "recurrence"
reps <- 20
fraction <- 0.75
# split the dataset (75% train / 25% test)
trainSet <- sample(nrow(BRCASanJose_F), floor(fraction * nrow(BRCASanJose_F)))
# FACTOR PARTITION
BRCASanJoseTrain <- BRCASanJose_F[trainSet, ]
BRCASanJoseTest <- BRCASanJose_F[-trainSet, ]
# NUMERIC PARTITION
BRCASanJoseNTrain <- BRCASanJose[trainSet, ]
BRCASanJoseNTest <- BRCASanJose[-trainSet, ]
library(caret)
##
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
##
## cluster
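With only 73 cases, the plain random split above can leave the rarer recurrence class under-represented in one partition. A stratified alternative using caret's createDataPartition, sketched here but not used in the analysis (stratTrain/stratTest are hypothetical names):
# stratified split: preserves the recurrence ratio in both partitions
set.seed(42)
inTrain <- createDataPartition(BRCASanJose_F$recurrence,
                               p = fraction, list = FALSE)
stratTrain <- BRCASanJose_F[inTrain, ]  # hypothetical alternative to BRCASanJoseTrain
stratTest  <- BRCASanJose_F[-inTrain, ]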
# repeated 5-fold cross-validation (3 repeats) for hyperparameter tuning
tunningctrl <- trainControl(
  method = "repeatedcv",
  number = 5,
  repeats = 3,
  savePredictions = "all"
)
# same scheme, but with class probabilities for ROC-based tuning
classprobs_tunningctrl <- trainControl(
  summaryFunction = twoClassSummary,
  method = "repeatedcv",
  number = 5,
  repeats = 3,
  classProbs = TRUE
)
noTuningControl <- trainControl(method = "none")
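classprobs_tunningctrl is defined but never used below. If it were, note that classProbs = TRUE requires outcome levels that are valid R variable names; a factor coded "0"/"1" must be relabeled first. A sketch of its intended use (trainROC and svmPolyROC_fit are hypothetical names, and the model choice is illustrative):
# work on a copy so the original training partition is untouched
trainROC <- BRCASanJoseTrain
levels(trainROC$recurrence) <- make.names(levels(trainROC$recurrence))
# ROC-driven tuning with the probability-aware control
svmPolyROC_fit <- train(recurrence ~ ., trainROC,
                        method = "svmPoly",
                        metric = "ROC",
                        preProc = c("center", "scale"),
                        trControl = classprobs_tunningctrl)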
We grouped these models from high interpretability (and low flexibility) to high flexibility (and low interpretability). Note that a strictly fair comparison via resamples() requires every model to be fit on the same training partition, ideally with shared resampling indices (e.g., the seeds argument of trainControl).
set.seed(42)
class(BRCASanJoseTrain$recurrence)
## [1] "factor"
# Linear Discriminant Analysis
lda2_fit <- train(recurrence ~ ., BRCASanJoseTrain,
                  method = "lda2",
                  preProc = c("center", "scale"),
                  trControl = tunningctrl
                  )
## Warning in lda.default(x, grouping, ...): variables are collinear
## (identical warning repeated 16 times: once per resample and once for the final fit)
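The collinearity warnings are expected: with roughly a thousand radiomic features and only ~54 training rows, many predictors are linearly dependent. One possible remedy, sketched below rather than taken from the original analysis, is to project onto principal components before LDA via caret's built-in "pca" preprocessing:
# hypothetical variant: PCA preprocessing removes the linear dependence
lda2_pca_fit <- train(recurrence ~ ., BRCASanJoseTrain,
                      method = "lda2",
                      preProc = c("center", "scale", "pca"),
                      trControl = tunningctrl)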
#The lasso
# fails to run: as.numeric(as.character(...)) cannot coerce a whole data
# frame, and rqlasso (quantile-regression lasso) expects a numeric outcome
#lasso_fit <- train(recurrence ~ ., BRCASanJoseNTrain,
#                   method = "rqlasso",
#                   preProc = c("center", "scale"),
#                   trControl = tunningctrl
#                   )
#
## Mixture Discriminant Analysis
# also fails: with 1091 features and only ~54 training rows the
# within-class covariance estimate is singular
#mda_fit <- train(recurrence ~ ., BRCASanJoseTrain,
#                 method = "mda",
#                 preProc = c("center", "scale"),
#                 trControl = tunningctrl
#                 )
#mda_fit$results
#
#mda_out <- mda::mda(recurrence ~ ., data = BRCASanJoseTrain)
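For a classification lasso that does handle p >> n, caret's glmnet method is the usual substitute for the failed rqlasso attempt. The fit below is a sketch, not part of the original comparison, and the lambda grid is an illustrative choice:
# penalized logistic regression (alpha = 1 gives the lasso penalty)
glmnet_fit <- train(recurrence ~ ., BRCASanJoseTrain,
                    method = "glmnet",
                    preProc = c("center", "scale"),
                    trControl = tunningctrl,
                    tuneGrid = expand.grid(alpha = 1,
                                           lambda = 10^seq(-3, 0, length.out = 10)))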
# Random Forest (ranger implementation)
ranger_fit <- train(recurrence ~ ., BRCASanJoseTrain,
                    method = "ranger",
                    preProc = c("center", "scale"),
                    trControl = tunningctrl
                    )
# Random Forest (randomForest implementation)
rf_fit <- train(recurrence ~ ., BRCASanJoseTrain,
                method = "rf",
                preProc = c("center", "scale"),
                trControl = tunningctrl
                )
# Support Vector Machines with Polynomial Kernel
svmPoly_fit <- train(recurrence ~ ., BRCASanJoseTrain,
                     method = "svmPoly",
                     preProc = c("center", "scale"),
                     trControl = tunningctrl
                     )
# Relevance Vector Machines with Linear Kernel
# fails to run
#rvmLinear_fit <- train(recurrence ~ ., BRCASanJoseNTrain,
#                       method = "rvmLinear",
#                       preProc = c("center", "scale"),
#                       trControl = tunningctrl
#                       )
#
# eXtreme Gradient Boosting
xgbTree_fit <- train(recurrence ~ ., BRCASanJoseTrain,
                     method = "xgbTree",
                     preProc = c("center", "scale"),
                     trControl = tunningctrl
                     )
# stochastic gradient boosting (fit on the training partition, like the
# other models, so the resampled comparison below stays consistent)
gbm_fit <- train(recurrence ~ ., BRCASanJoseTrain,
                 method = "gbm",
                 trControl = tunningctrl,
                 preProc = c("center", "scale"),
                 verbose = FALSE)
#gbm_fit$results$Accuracy
# failed to run
#nodeHarvest_fit <- train(recurrence ~ ., BRCASanJose_F,
#                         method = "nodeHarvest",
#                         trControl = tunningctrl,
#                         preProc = c("center", "scale"),
#                         verbose = FALSE)
#
# fails to run as-is: with 1091 inputs the network exceeds nnet's default
# weight limit (MaxNWts); a feature filter is needed first (see the sketch
# after this block)
#avNNet_fit <- train(recurrence ~ ., BRCASanJose_F,
#                    method = "avNNet",
#                    trControl = tunningctrl,
#                    preProc = c("center", "scale"),
#                    trace = FALSE
#                    )
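A simple univariate filter would make the network feasible. In the sketch below, the Wilcoxon criterion, the cutoff of 30 features, and the names filteredTrain/avNNet_filt_fit are illustrative choices, not from the original analysis:
# rank features by Wilcoxon test p-value against the recurrence factor
featNames <- setdiff(names(BRCASanJoseTrain), "recurrence")
pvals <- sapply(featNames, function(f)
  wilcox.test(BRCASanJoseTrain[[f]] ~ BRCASanJoseTrain$recurrence)$p.value)
topFeats <- names(sort(pvals))[1:30]  # illustrative cutoff
filteredTrain <- BRCASanJoseTrain[, c(topFeats, "recurrence")]
avNNet_filt_fit <- train(recurrence ~ ., filteredTrain,
                         method = "avNNet",
                         trControl = tunningctrl,
                         preProc = c("center", "scale"),
                         trace = FALSE)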
model_list <- list(gbm = gbm_fit,
                   xgbTree = xgbTree_fit,
                   svmPoly = svmPoly_fit,
                   rf = rf_fit,
                   ranger = ranger_fit,
                   lda2 = lda2_fit
                   )
# collect the resampled (cross-validated) accuracies and compare the models
resamps <- resamples(model_list)
#pander::pander(summary(resamps))
#bwplot(resamps, metric = "Accuracy")
dotplot(resamps, metric = "Accuracy")
#densityplot(resamps, metric = "Accuracy")
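The held-out partition created earlier is never scored in this section. A minimal sketch of how one of the fitted models could be evaluated on it (rf_fit is an arbitrary pick; any model in model_list would do):
# score the untouched test partition and tabulate the confusion matrix
testPred <- predict(rf_fit, newdata = BRCASanJoseTest)
confusionMatrix(testPred, BRCASanJoseTest$recurrence)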