The dataset used for this analysis is the Wisconsin Breast Cancer Database, available from the UCI Machine Learning Repository.
This breast cancer database was obtained from the University of Wisconsin Hospitals, Madison, from Dr. William H. Wolberg, and describes 699 clinical cases presenting malignant or benign tumors.
Several attributes were measured on a 1-10 scale for these patients between January 1989 and November 1991.
# load libraries
library(tidyverse)
library(knitr)
library(kableExtra)
library(ggthemr)
# define default theme for plots
ggthemr("dust")According to the help file associated to the database, some missing values are present in the data, denoted by “?”.
We can also add column names when reading the file.
# read the data, using snake_case column names that match the rest of the analysis
cancer <- read_csv("breast-cancer-wisconsin.data",
                   col_names = c("sample_code_number", "clump_thickness",
                                 "uniformity_of_cell_size", "uniformity_of_cell_shape",
                                 "marginal_adhesion", "single_epithelial_cell_size",
                                 "bare_nuclei", "bland_chromatin",
                                 "normal_nucleoli", "mitoses", "class"),
                   col_types = "fnnnnnnnnnf",
                   na = "?")
glimpse(cancer)
The sample_code_number variable is an identifier and is not relevant for a machine learning approach.
However, we have 699 patients, but only 645 unique identifiers.
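A minimal sketch of this check, assuming dplyr’s n_distinct():
# number of unique patient identifiers
n_distinct(cancer$sample_code_number)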
## [1] 645
Which observations share the same identifier?
# extract multiple observations with the same identifier
multi_obs <- cancer %>%
count(sample_code_number) %>%
arrange(desc(n)) %>%
filter(n > 1)
multi_obs %>%
head(10) %>%
kable() %>%
kable_styling(bootstrap_options = "striped", full_width = F, position = "left")
| sample_code_number | n |
|---|---|
| 1182404 | 6 |
| 1276091 | 5 |
| 1198641 | 3 |
| 1017023 | 2 |
| 1033078 | 2 |
| 1070935 | 2 |
| 1100524 | 2 |
| 1105524 | 2 |
| 1115293 | 2 |
| 1116116 | 2 |
Some observations occur up to 6 times under the same identifier!
Since we don’t have any information about the exact time of evaluation, we can assume that some patients were followed over several months and were evaluated several times during the campaign.
A more concerning issue, however, is that some observations (8 in total) are exact duplicates in the database, which could bias our analysis.
# redundant observations
duplicated_obs <- cancer %>%
filter(duplicated(.))
# check duplicated observations
cancer %>%
filter(sample_code_number %in% duplicated_obs$sample_code_number) %>%
arrange(sample_code_number) %>%
kable() %>%
kable_styling(bootstrap_options = "striped", full_width = F, position = "left") %>%
scroll_box(width = "800px")
| sample_code_number | clump_thickness | uniformity_of_cell_size | uniformity_of_cell_shape | marginal_adhesion | single_epithelial_cell_size | bare_nuclei | bland_chromatin | normal_nucleoli | mitoses | class |
|---|---|---|---|---|---|---|---|---|---|---|
| 1100524 | 6 | 10 | 10 | 2 | 8 | 10 | 7 | 3 | 3 | 4 |
| 1100524 | 6 | 10 | 10 | 2 | 8 | 10 | 7 | 3 | 3 | 4 |
| 1116116 | 9 | 10 | 10 | 1 | 10 | 8 | 3 | 3 | 1 | 4 |
| 1116116 | 9 | 10 | 10 | 1 | 10 | 8 | 3 | 3 | 1 | 4 |
| 1198641 | 3 | 1 | 1 | 1 | 2 | 1 | 3 | 1 | 1 | 2 |
| 1198641 | 3 | 1 | 1 | 1 | 2 | 1 | 3 | 1 | 1 | 2 |
| 1198641 | 10 | 10 | 6 | 3 | 3 | 10 | 4 | 3 | 2 | 4 |
| 1218860 | 1 | 1 | 1 | 1 | 1 | 1 | 3 | 1 | 1 | 2 |
| 1218860 | 1 | 1 | 1 | 1 | 1 | 1 | 3 | 1 | 1 | 2 |
| 320675 | 3 | 3 | 5 | 2 | 3 | 10 | 7 | 1 | 1 | 4 |
| 320675 | 3 | 3 | 5 | 2 | 3 | 10 | 7 | 1 | 1 | 4 |
| 704097 | 1 | 1 | 1 | 1 | 1 | 1 | 2 | 1 | 1 | 2 |
| 704097 | 1 | 1 | 1 | 1 | 1 | 1 | 2 | 1 | 1 | 2 |
| 1321942 | 5 | 1 | 1 | 1 | 2 | 1 | 3 | 1 | 1 | 2 |
| 1321942 | 5 | 1 | 1 | 1 | 2 | 1 | 3 | 1 | 1 | 2 |
| 466906 | 1 | 1 | 1 | 1 | 2 | 1 | 1 | 1 | 1 | 2 |
| 466906 | 1 | 1 | 1 | 1 | 2 | 1 | 1 | 1 | 1 | 2 |
Since we don’t know why these rows are duplicated, and they could bias the analysis, we will remove them.
# remove redundant observations and identifier column
cancer <- cancer %>%
filter(!duplicated(.)) %>%
select(-sample_code_number)
glimpse(cancer)
## Observations: 691
## Variables: 10
## $ clump_thickness <dbl> 5, 5, 3, 6, 4, 8, 1, 2, 2, 4, 1, 2, …
## $ uniformity_of_cell_size <dbl> 1, 4, 1, 8, 1, 10, 1, 1, 1, 2, 1, 1,…
## $ uniformity_of_cell_shape <dbl> 1, 4, 1, 8, 1, 10, 1, 2, 1, 1, 1, 1,…
## $ marginal_adhesion <dbl> 1, 5, 1, 1, 3, 8, 1, 1, 1, 1, 1, 1, …
## $ single_epithelial_cell_size <dbl> 2, 7, 2, 3, 2, 7, 2, 2, 2, 2, 1, 2, …
## $ bare_nuclei <dbl> 1, 10, 2, 4, 1, 10, 10, 1, 1, 1, 1, …
## $ bland_chromatin <dbl> 3, 3, 3, 3, 3, 9, 3, 3, 1, 2, 3, 2, …
## $ normal_nucleoli <dbl> 1, 2, 1, 7, 1, 7, 1, 1, 1, 1, 1, 1, …
## $ mitoses <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, 1, …
## $ class <fct> 2, 2, 2, 2, 2, 4, 2, 2, 2, 2, 2, 2, …
The class variable is our variable of interest, and can be recoded as a factor using appropriate labels.
# recode 'class' variable
cancer <- cancer %>%
mutate(class = as.factor(class),
class = fct_recode(class, "benign" = "2", "malignant" = "4"))
# Plot distribution
ggplot(cancer, aes(x = class, fill = class)) +
geom_bar(show.legend = FALSE) +
geom_label(aes(label = paste(..count.., " (", round(prop.table(..count..) * 100, 2), "%)")),
stat = "count",
position = position_stack(vjust = 0.75),
show.legend = FALSE) +
labs(title = "Tumor type distribution", x = "Class", y = "Frequency")Even though the data is clearly imbalanced, we wont’ treat this issue in this analysis.
## clump_thickness uniformity_of_cell_size uniformity_of_cell_shape
## Min. : 1.000 Min. : 1.00 Min. : 1.000
## 1st Qu.: 2.000 1st Qu.: 1.00 1st Qu.: 1.000
## Median : 4.000 Median : 1.00 Median : 1.000
## Mean : 4.427 Mean : 3.13 Mean : 3.201
## 3rd Qu.: 6.000 3rd Qu.: 5.00 3rd Qu.: 5.000
## Max. :10.000 Max. :10.00 Max. :10.000
##
## marginal_adhesion single_epithelial_cell_size bare_nuclei
## Min. : 1.000 Min. : 1.000 Min. : 1.000
## 1st Qu.: 1.000 1st Qu.: 2.000 1st Qu.: 1.000
## Median : 1.000 Median : 2.000 Median : 1.000
## Mean : 2.825 Mean : 3.211 Mean : 3.538
## 3rd Qu.: 4.000 3rd Qu.: 4.000 3rd Qu.: 6.000
## Max. :10.000 Max. :10.000 Max. :10.000
## NA's :16
## bland_chromatin normal_nucleoli mitoses class
## Min. : 1.000 Min. : 1.000 Min. : 1.000 benign :453
## 1st Qu.: 2.000 1st Qu.: 1.000 1st Qu.: 1.000 malignant:238
## Median : 3.000 Median : 1.000 Median : 1.000
## Mean : 3.436 Mean : 2.883 Mean : 1.593
## 3rd Qu.: 5.000 3rd Qu.: 4.000 3rd Qu.: 1.000
## Max. :10.000 Max. :10.000 Max. :10.000
##
We have 16 missing values in the bare_nuclei variable. How do they relate to the response variable?
p1 <- cancer %>%
filter(is.na(bare_nuclei)) %>%
ggplot(aes(x = class, fill = class)) +
geom_bar(show.legend = FALSE) +
labs(title = "Tumor type distribution", x = "Class",
y = "Frequency", subtitle = "(missing values)")
p2 <- ggplot(cancer, aes(x = bare_nuclei, color = class)) +
geom_density(show.legend = FALSE) +
labs(title = "'Bare Nuclei' distribution \n per tumor class")
gridExtra::grid.arrange(p1, p2, ncol = 2)
## Warning: Removed 16 rows containing non-finite values (stat_density).
We can see that the bare_nuclei distribution is very different for each class value (benign or malignant), so a basic mean imputation may not be a good solution.
These missing values can be imputed and tracked, but the process is always tricky.
An easy implementation of imputation is provided by the mice package.
library(mice)
# run multiple imputation (m = 8 imputed datasets);
# the identifier column was already removed, so all remaining variables are used;
# set the seed for reproducibility
cancer_impute <- mice(data = cancer, m = 8, seed = 42, printFlag = FALSE)
# have a look at the imputed values across the 8 imputations
cancer_impute$imp$bare_nuclei
## 1 2 3 4 5 6 7 8
## 24 10 3 10 10 7 10 8 10
## 41 5 7 2 5 5 5 7 5
## 140 1 1 1 1 1 1 1 1
## 146 1 1 1 1 1 5 1 2
## 159 1 1 1 1 2 2 1 1
## 165 1 1 1 1 1 4 1 1
## 235 1 5 5 1 1 1 1 1
## 249 1 1 1 1 8 1 1 1
## 271 1 1 8 1 1 2 2 1
## 288 6 10 3 6 3 10 9 1
## 290 1 1 5 1 1 1 1 1
## 293 1 1 3 1 1 1 3 1
## 311 8 2 2 1 8 1 7 1
## 317 1 1 1 1 8 1 1 2
## 406 1 1 1 1 1 1 1 1
## 611 1 1 1 1 1 1 1 3
# extract one of the imputed datasets at random, and add a tracking column (based on row names)
set.seed(42)
cancer_complete <- mice::complete(data = cancer_impute, action = sample(x = 1:cancer_impute$m, size = 1))
cancer_complete$bare_nuclei_miss <- ifelse(rownames(cancer_complete) %in% rownames(cancer_impute$imp$bare_nuclei), 1, 0)
The new cancer_complete dataset doesn’t have any missing values now.
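A minimal sketch of the check producing the output below:
# count remaining missing values
sum(is.na(cancer_complete))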
## [1] 0
Imputation, by construction, can introduce spurious patterns into an analysis.
Since we only have 16 missing values (2.32% of the observations), another solution is simply to delete the observations containing missing data.
This is the solution we keep for the rest of the analysis.
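The deletion step itself isn’t shown; a minimal sketch, assuming the incomplete rows are simply dropped from the original data and the result overwrites cancer_complete (the object used in the rest of the analysis):
# keep only complete cases for the rest of the analysis
cancer_complete <- cancer %>%
  drop_na()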
A quick look at the other variables’ distributions.
cancer_complete %>%
gather(key = "variable", value = "value", clump_thickness:mitoses) %>%
ggplot(aes(x = value, fill = class, color = class)) +
geom_density(alpha = 0.3) +
facet_wrap(~variable, scales = "free_y")
Some variables, such as bare_nuclei or clump_thickness, seem to create a clear separation between benign and malignant cases.
Since we have a classification problem, we can use the following algorithms: logistic regression and random forest.
Before building these models, we need to create training and testing sets to assess the models’ performance.
To fine-tune model parameters, a validation set is normally needed in addition to the training and testing sets.
Here, we won’t need a validation dataset: the random forest provides an out-of-bag (OOB) error estimate, and its convenient tuning wrapper tuneRF() uses it to tune the model automatically, so no explicit validation split is required.
# create indices for a 80/20 split
set.seed(31415)
idx <- sample(x = 1:2, size = nrow(cancer_complete), replace = TRUE, prob = c(0.8, 0.2))
# create train/test sets
cancer_train <- cancer_complete[idx == 1, ]
cancer_test <- cancer_complete[idx == 2, ]
Let’s check the class distribution in both new sets.
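A minimal sketch of the check producing the proportions below:
# class proportions in the training and testing sets
prop.table(table(cancer_train$class))
prop.table(table(cancer_test$class))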
##
## benign malignant
## 0.6534091 0.3465909
##
## benign malignant
## 0.6394558 0.3605442
Since all variables are on the same 1-10 scale, we don’t need to scale the training variables.
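The fitting code isn’t shown, but the call appears in the summary output below; a minimal sketch:
# fit a logistic regression on the training set
model_glm <- glm(class ~ ., family = "binomial", data = cancer_train)
summary(model_glm)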
##
## Call:
## glm(formula = class ~ ., family = "binomial", data = cancer_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.4016 -0.1252 -0.0643 0.0280 2.5307
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -10.07249 1.30683 -7.708 1.28e-14 ***
## clump_thickness 0.56222 0.15897 3.537 0.000405 ***
## uniformity_of_cell_size 0.01942 0.21580 0.090 0.928304
## uniformity_of_cell_shape 0.24103 0.24071 1.001 0.316657
## marginal_adhesion 0.30872 0.12484 2.473 0.013401 *
## single_epithelial_cell_size 0.16143 0.16399 0.984 0.324908
## bare_nuclei 0.33035 0.09378 3.523 0.000427 ***
## bland_chromatin 0.44280 0.17722 2.499 0.012467 *
## normal_nucleoli 0.19318 0.11598 1.666 0.095800 .
## mitoses 0.58425 0.38337 1.524 0.127509
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 681.448 on 527 degrees of freedom
## Residual deviance: 91.238 on 518 degrees of freedom
## AIC: 111.24
##
## Number of Fisher Scoring iterations: 8
As seen in the graphs, clump_thickness and bare_nuclei appear to be significant variables, with higher values of these predictors leading to a higher probability of a malignant tumor.
Random forest has a convenient wrapper function to tune its parameters.
We will tune the number of variables tried at each split, mtry, using the tuneRF() function; the number of trees will then be estimated ‘by hand’.
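The tuning call isn’t shown; a minimal sketch consistent with the search trace below (the ntreeTry, stepFactor, and improve values are assumptions):
library(randomForest)
# tune 'mtry' around its default value using the OOB error estimate
set.seed(42)
tuneRF(x = cancer_train[, 1:9], y = cancer_train$class,
       ntreeTry = 500, stepFactor = 2, improve = 0.05)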
## mtry = 3 OOB error = 2.84%
## Searching left ...
## mtry = 2 OOB error = 3.22%
## -0.1333333 0.05
## Searching right ...
## mtry = 6 OOB error = 3.98%
## -0.4 0.05
We can see that the classification error is minimized when mtry is equal to 3.
Let’s build a model using this optimized value of mtry, and visualize the Out-Of-Bag error versus the number of trees.
set.seed(42)
model_rf <- randomForest(class ~ ., data = cancer_train, mtry = 3, ntree = 2000)
plot(model_rf)
We can see that beyond roughly 700 trees, there is no significant improvement in the classification error. We will pick this value as the optimized one.
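The prediction step later uses a model called rf_opt, whose construction isn’t shown; a minimal sketch using the tuned values (mtry = 3, ntree = 700):
# build the optimized random forest
set.seed(42)
rf_opt <- randomForest(class ~ ., data = cancer_train, mtry = 3, ntree = 700)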
Plot the variable importance.
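A minimal sketch, assuming the randomForest built-in importance plot:
# plot variable importance for the optimized model
varImpPlot(rf_opt)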
Contrary to the logistic regression model, uniformity_of_cell_size and uniformity_of_cell_shape seem to be the most relevant variables in the random forest model.
Let’s predict values on unseen data (the testing set) with each of the two models.
# logistic regression
pred_logis <- predict(model_glm, newdata = cancer_test[, 1:9], type = "response")
pred_logis <- ifelse(pred_logis < 0.5, "benign", "malignant")
# optimized RF
pred_rf <- predict(rf_opt, newdata = cancer_test[, 1:9])
# build a dataframe with the accuracy of each model
# (assuming accuracy() comes from the Metrics package)
library(Metrics)
accs_df <- tibble(model = c("logis_reg", "rf_opt"),
                  metric = c(accuracy(actual = cancer_test$class,
                                      predicted = pred_logis),
                             accuracy(actual = cancer_test$class,
                                      predicted = pred_rf)))
ggplot(accs_df, aes(x = fct_reorder(model, metric), y = metric)) +
geom_col(width = 0.6) +
geom_text(aes(label = scales::percent(metric)),
position = position_stack(vjust = 0.94)) +
coord_flip() +
labs(title = "Model's accuracy", x = "Model", y = "Accuracy")It seems that in our case, both models perform similarly.
As a complement, we can try to perform the modeling using the H2O platform.
A very quick implementation can be done using “Automatic Machine Learning” via the h2o.automl() function.
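The startup log below comes from initializing the cluster; a minimal sketch:
library(h2o)
# start (or connect to) a local H2O cluster
h2o.init()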
##
## H2O is not running yet, starting it now...
##
## Note: In case of errors look at the following log files:
## /tmp/RtmpZyGgNG/h2o_alex_started_from_r.out
## /tmp/RtmpZyGgNG/h2o_alex_started_from_r.err
##
##
## Starting H2O JVM and connecting: .. Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 4 seconds 892 milliseconds
## H2O cluster timezone: Europe/Paris
## H2O data parsing timezone: UTC
## H2O cluster version: 3.22.1.1
## H2O cluster version age: 4 months and 14 days !!!
## H2O cluster name: H2O_started_from_R_alex_uml253
## H2O cluster total nodes: 1
## H2O cluster total memory: 3.56 GB
## H2O cluster total cores: 2
## H2O cluster allowed cores: 2
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## H2O API Extensions: XGBoost, Algos, AutoML, Core V3, Core V4
## R Version: R version 3.6.0 (2019-04-26)
## [1] 0
We will use the clean cancer_complete dataset.
# Import data
cancer_h2o <- as.h2o(cancer_complete)
# Define response and predictors
y <- "class"
x <- setdiff(names(cancer_h2o), y)
# classification problem => need to convert response as a factor variable
cancer_h2o[, y] <- as.factor(cancer_h2o[, y])
As in the previous approach, we have to build training, validation, and testing datasets.
split_df <- h2o.splitFrame(data = cancer_h2o, ratios = c(0.7, 0.15), seed = 42)
h2o_train <- split_df[[1]]
h2o_valid <- split_df[[2]]
h2o_test <- split_df[[3]]
AutoML only needs a dataset, a target variable, and a time or model-count limit for training.
automl_model <- h2o.automl(x = x,
y = y,
training_frame = h2o_train,
validation_frame = h2o_valid,
max_runtime_secs = 300,
sort_metric = "AUC" ,
seed = 42)By extracting the created leaderboard, we can see that the best model for our data was built using a Gradient Boosting Machine algorithm.
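A minimal sketch of how the leaderboard below can be displayed:
# inspect the AutoML leaderboard
print(automl_model@leaderboard)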
## model_id auc logloss
## 1 GBM_grid_1_AutoML_20190512_214820_model_11 0.9960819 0.10748830
## 2 GLM_grid_1_AutoML_20190512_214820_model_1 0.9960429 0.07734271
## 3 GBM_grid_1_AutoML_20190512_214820_model_17 0.9957115 0.07023189
## 4 GBM_grid_1_AutoML_20190512_214820_model_20 0.9954776 0.07604255
## 5 GBM_grid_1_AutoML_20190512_214820_model_15 0.9953996 0.08017143
## 6 GBM_grid_1_AutoML_20190512_214820_model_18 0.9952827 0.10242014
## mean_per_class_error rmse mse
## 1 0.02461988 0.1527318 0.02332700
## 2 0.02210526 0.1450744 0.02104659
## 3 0.01918129 0.1367856 0.01871029
## 4 0.02125731 0.1467313 0.02153008
## 5 0.02669591 0.1515410 0.02296468
## 6 0.02000000 0.1512859 0.02288742
##
## [51 rows x 6 columns]
# get the best model
model_ids <- as.data.frame(automl_model@leaderboard)$model_id
best_automl <- h2o.getModel(model_ids[1])
Let’s compare the performance of this model on the previously unseen test set.
# import previous testing data as h2o data
h2o_test <- as.h2o(cancer_test)
# predictions
h2o_predict <- h2o.predict(best_automl, newdata = h2o_test[, 1:9])
# add the accuracy of the automl model to the accuracy dataframe
# (as.vector() converts the H2OFrame columns to plain R vectors for accuracy())
accs_df <- bind_rows(accs_df,
                     tibble(model = "best_automl",
                            metric = accuracy(actual = as.vector(h2o_test$class),
                                              predicted = as.vector(h2o_predict$predict))))
ggplot(accs_df, aes(x = fct_reorder(model, metric), y = metric)) +
geom_col(width = 0.6) +
geom_text(aes(label = scales::percent(metric)),
position = position_stack(vjust = 0.94)) +
coord_flip() +
labs(title = "Model's accuracy", x = "Model", y = "Accuracy")The GBM model created with automl() has a perfect scoring of 100% on the same unseen previous data !!!
## [1] TRUE