Does mammography predict recurrence in women with breast cancer?

The objective of this work is to associate mammography features and radiomic models with breast cancer (BRCA) recurrence, and to describe which aspects of the tumor phenotype are associated with patient survival.

Dataset

73 patients were followed for 10 years. Breast cancer recurrence status was recorded, and the baseline mammograms were analyzed with radiomics methods, yielding more than a thousand features. This gives us a 73 x 1091 data frame, where 73 is the number of observations and 1091 is the number of features.

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(readr)
library(FRESA.CAD)
## Loading required package: Rcpp
## Loading required package: stringr
## Loading required package: miscTools
## Loading required package: Hmisc
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, units
## Loading required package: pROC
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
# Import the dataset and convert variables

setwd("/home/eider/Documents/Exploring_Learning_Algorithms_4_CADx/Breast_Cancer_SanJose")
BRCASanJose <- read_csv("../datasets/breast-cancer/BRCASanJose.csv", 
                    col_types = cols(recurrence = col_factor()))

# Change the character variable to numeric for further calculations
# unique(BRCASanJose$Protocol)
BRCASanJose$Protocol <- 1* (BRCASanJose$Protocol == "Combo")
# Remove columns not needed for this analysis (note that Protocol is dropped despite the conversion above)
BRCASanJose$timeToEvent <- NULL
BRCASanJose$Protocol <- NULL
BRCASanJose$Onco2 <- NULL
BRCASanJose$PAM50 <- NULL
BRCASanJose$CanceryType <- NULL
BRCASanJose$Oncotype <- NULL
BRCASanJose$AvGRISK <- NULL
BRCASanJose$Type <- NULL
BRCASanJose <- data.frame(BRCASanJose)
#str(BRCASanJose)
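
Before partitioning, a quick sanity check (a minimal sketch) confirms the number of observations and remaining features after cleanup, and shows the outcome distribution:

# Rows/columns after cleanup and the recurrence class counts
dim(BRCASanJose)
table(BRCASanJose$recurrence)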

Data partitioning

set.seed(42)
BRCASanJose_F <- BRCASanJose # keep a copy with the factor outcome
BRCASanJose$recurrence <- as.numeric(BRCASanJose$recurrence) # numeric outcome for models that require it

# Objects used in later calculations
theData <- BRCASanJose
theOutcome <- "recurrence"
reps <- 20
fraction <- 0.75

# Split the dataset: 75% training, 25% test
trainSet <- sample(nrow(BRCASanJose_F), fraction * nrow(BRCASanJose_F))
# Factor-outcome partition
BRCASanJoseTrain <- BRCASanJose_F[trainSet,]
BRCASanJoseTest <- BRCASanJose_F[-trainSet,]
# Numeric-outcome partition
BRCASanJoseNTrain <- BRCASanJose[trainSet,]
BRCASanJoseNTest <- BRCASanJose[-trainSet,]
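
With only 73 patients, a random split can leave very few recurrence events in either partition, so it is worth checking the outcome distribution in both (a minimal sanity check):

# Outcome counts in each partition
table(BRCASanJoseTrain$recurrence)
table(BRCASanJoseTest$recurrence)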

Caret training control

library(caret)
## 
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
## 
##     cluster
tunningctrl <- trainControl(
  method = "repeatedcv", 
  number = 5,
  repeats = 3,
  savePredictions = "all"
)

classprobs_tunningctrl <- trainControl(
  summaryFunction = twoClassSummary,
  method = "repeatedcv", 
  number = 5,
  repeats = 3,
  classProbs = TRUE
)

noTuningControl <- trainControl(method = "none")
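
Note that classprobs_tunningctrl uses twoClassSummary, which requires class probabilities and outcome factor levels that are valid R variable names. If the recurrence levels are raw codes such as 0/1, a common fix is the following (a hedged sketch, not part of the original pipeline):

# twoClassSummary needs syntactically valid class names (e.g., "X0", "X1")
levels(BRCASanJoseTrain$recurrence) <- make.names(levels(BRCASanJoseTrain$recurrence))
levels(BRCASanJoseTest$recurrence) <- make.names(levels(BRCASanJoseTest$recurrence))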

The most commonly used models in CADx

We grouped these models from high interpretability (and low flexibility) to high flexibility (and low interpretability).

Lasso and linear methods

set.seed(42)

# Confirm the outcome is a factor (caret then treats this as classification)
class(BRCASanJoseTrain$recurrence)
## [1] "factor"
#Linear Discriminant Analysis
lda2_fit <- train(recurrence ~ .,BRCASanJoseTrain, 
             method = "lda2",
             preProc = c("center", "scale"),
             trControl = tunningctrl
             )
## Warning in lda.default(x, grouping, ...): variables are collinear
## (the same warning is emitted for every cross-validation resample)
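
The collinearity warnings are expected: radiomic feature sets typically contain many near-duplicate measurements. A hedged remedy (a sketch using caret's findCorrelation, not part of the original analysis, assuming all remaining predictors are numeric) is to drop highly correlated predictors before fitting LDA:

# Drop one of each pair of predictors correlated above 0.95
predictorCols <- setdiff(names(BRCASanJoseTrain), "recurrence")
highCorr <- findCorrelation(cor(BRCASanJoseTrain[, predictorCols]), cutoff = 0.95)
if (length(highCorr) > 0) predictorCols <- predictorCols[-highCorr]
reducedTrain <- BRCASanJoseTrain[, c(predictorCols, "recurrence")]
# lda2 could then be refit on reducedTrain without the warnings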
# Fails to run as written: rqlasso expects a numeric outcome, and
# as.numeric(as.character(...)) cannot coerce a whole data frame.
# The lasso
#lasso_fit <- train(recurrence ~ ., BRCASanJoseNTrain, 
#             method = "rqlasso",
#             preProc = c("center", "scale"),
#             trControl = tunningctrl
#             )
#
# Mixture Discriminant Analysis (commented out; did not run on this data)
#mda_fit <- train(recurrence ~ ., BRCASanJoseTrain, 
#             method = "mda",
#             preProc = c("center", "scale"),
#             trControl = tunningctrl
#             )
#mda_fit$results
#
#mda_out <- mda(recurrence ~ ., data = BRCASanJoseTrain)
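
A lasso that does run on a factor outcome is penalized logistic regression through caret's glmnet method (a hedged sketch; the lambda grid below is an assumption rather than a tuned value from this analysis, and it requires the glmnet package):

# Lasso-penalized logistic regression: alpha = 1 gives the pure lasso
lasso_fit <- train(recurrence ~ ., BRCASanJoseTrain,
             method = "glmnet",
             preProc = c("center", "scale"),
             tuneGrid = expand.grid(alpha = 1,
                                    lambda = 10^seq(-4, 0, length.out = 20)),
             trControl = tunningctrl
             )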

Trees

ranger_fit <- train(recurrence ~ .,BRCASanJoseTrain, 
             method = "ranger",
             preProc = c("center", "scale"),
             trControl = tunningctrl
             )


rf_fit <- train(recurrence ~ .,BRCASanJoseTrain, 
             method = "rf",
             preProc = c("center", "scale"),
             trControl = tunningctrl
             )
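
Beyond accuracy, the random forest fit offers a quick look at which radiomic features drive its predictions; a minimal sketch using caret's variable importance:

# Plot the 20 most important features according to the random forest
plot(varImp(rf_fit), top = 20)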

SVM

#Support Vector Machines with Polynomial Kernel

svmPoly_fit <- train(recurrence ~ .,BRCASanJoseTrain, 
             method = "svmPoly",
             preProc = c("center", "scale"),
             trControl = tunningctrl
             )
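
The classprobs_tunningctrl defined earlier is not used by any of the fits above. A hedged sketch of how it would pair with an ROC-optimized SVM (assuming the outcome levels are valid R names, as noted earlier):

# SVM tuned on ROC AUC instead of accuracy
svmROC_fit <- train(recurrence ~ ., BRCASanJoseTrain,
             method = "svmPoly",
             preProc = c("center", "scale"),
             metric = "ROC",
             trControl = classprobs_tunningctrl
             )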
# Relevance Vector Machines with Linear Kernel

# Fails to run: kernlab's rvm supports regression only, so it cannot
# model the recurrence classification task directly.
#rvmLinear_fit <- train(recurrence ~ ., BRCASanJoseNTrain, 
#             method = "rvmLinear",
#             preProc = c("center", "scale"),
#             trControl = tunningctrl
#             )
#

Bagging and boosting

# eXtreme Gradient Boosting
xgbTree_fit <- train(recurrence ~ .,BRCASanJoseTrain, 
             method = "xgbTree",
             preProc = c("center", "scale"),
             trControl = tunningctrl
             )
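
xgbTree tunes several hyperparameters at once (tree depth, learning rate, number of boosting rounds, and more); the combination caret selected can be inspected directly:

# Hyperparameter combination chosen by the resampling procedure
xgbTree_fit$bestTune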


# Stochastic Gradient Boosting (fit on the training partition, consistent with the other models)
gbm_fit <- train(recurrence ~ ., BRCASanJoseTrain, 
             method = "gbm",  
             trControl = tunningctrl,
             preProc = c("center", "scale"),
             verbose = FALSE)

#gbm_fit$results$Accuracy

#failed to run 
#nodeHarvest_fit <- train(recurrence ~ .,BRCASanJose_F, 
#             method = "nodeHarvest",  
#             trControl = tunningctrl,
#             preProc = c("center", "scale"),
#             verbose = FALSE)
#

Neural nets

# Failed to run as written: with 1000+ features the nnet cannot fit;
# a feature filter is needed first (see the sketch below).
#avNNet_fit <- train(recurrence ~ ., BRCASanJose_F, 
#             method = "avNNet",
#             trControl = tunningctrl,
#             preProc = c("center", "scale"),
#             trace = FALSE,
#             )
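
A hedged sketch of one way to make the averaged neural network feasible (not the original author's pipeline): drop near-zero-variance columns, then let caret's PCA pre-processing compress the remaining features before avNNet trains:

# Filter near-constant predictors, then compress with PCA inside train()
predCols <- setdiff(names(BRCASanJoseTrain), "recurrence")
nzv <- nearZeroVar(BRCASanJoseTrain[, predCols])
if (length(nzv) > 0) predCols <- predCols[-nzv]
filteredTrain <- BRCASanJoseTrain[, c(predCols, "recurrence")]
avNNet_fit <- train(recurrence ~ ., filteredTrain,
             method = "avNNet",
             trControl = tunningctrl,
             preProc = c("center", "scale", "pca"),
             trace = FALSE)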

Comparing accuracy across the caret models

model_list <- list(gbm = gbm_fit, 
     xgbTree = xgbTree_fit,
     svmPoly=svmPoly_fit,
     rf=rf_fit,
     ranger=ranger_fit,
     lda2= lda2_fit
     )

resamps <- resamples(model_list)
#pander::pander(summary(resamps))
#bwplot(resamps, metric = "Accuracy")
dotplot(resamps, metric = "Accuracy")

#densityplot(resamps, metric = "Accuracy")
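
The resampling comparison above reflects performance on the training partition only. As a final hedged check (a sketch; the choice of rf_fit here is illustrative, not the study's selected model), the held-out 25% can be scored with caret's confusionMatrix:

# Evaluate one fitted model on the untouched test partition
testPred <- predict(rf_fit, newdata = BRCASanJoseTest)
confusionMatrix(testPred, BRCASanJoseTest$recurrence)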